From c3e69b2cdfc80e99d12193244a4d15041f9ceb95 Mon Sep 17 00:00:00 2001 From: yhampe Date: Wed, 15 Nov 2023 10:44:47 +0100 Subject: [PATCH] * fixed bug with incorrect empty cell count by adding threshhold to cell.contains --- .../processor/LayoutParsingPipeline.java | 20 +-- .../processor/model/graph/nodes/Table.java | 2 +- .../processor/model/table/TablePageBlock.java | 2 +- .../services/TableExtractionService.java | 33 ++++- .../server/graph/ViewerDocumentTest.java | 84 ++++++++++- .../PdfSegmentationServiceTest.java | 31 ++-- .../services/RulingCleaningServiceTest.java | 136 +++++++++++++++++- 7 files changed, 270 insertions(+), 38 deletions(-) diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java index 5b82f93..a611a52 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java @@ -190,14 +190,14 @@ public class LayoutParsingPipeline { PDRectangle cropbox = pdPage.getCropBox(); CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(pdfTableCells.get(pageNumber), stripper.getRulings(), - 1, - 1); + stripper.getMinCharWidth(), + stripper.getMaxCharHeight()); List spreedSheetArea = tableExtractionService.getSpreadSheetArea(cleanRulings, layoutParsingType); - Map newValues = calculateMinCharWidthAndMaxCharHeightInsideTable(stripper,spreedSheetArea,10f,1f); + Map newValues = calculateMinCharWidthAndMaxCharHeightInsideTable(stripper,spreedSheetArea); - cleanRulings = rulingCleaningService.getCleanRulings(pdfTableCells.get(pageNumber), stripper.getRulings(), newValues.get("minCharWidth"), newValues.get("maxCharHeight")); + cleanRulings = rulingCleaningService.getCleanRulings(pdfTableCells.get(pageNumber), stripper.getRulings(), newValues.get("minCharWidth"), newValues.get("minCharHeigth")); ClassificationPage classificationPage = switch (layoutParsingType) { @@ -255,17 +255,17 @@ public class LayoutParsingPipeline { * @return Map with both values */ - private Map calculateMinCharWidthAndMaxCharHeightInsideTable(PDFLinesTextStripper stripper, List spreedSheetArea, float initialMinCharWidth, float initialMaxCharHeight) { + private Map calculateMinCharWidthAndMaxCharHeightInsideTable(PDFLinesTextStripper stripper, List spreedSheetArea) { - float newMinCharWidth = initialMinCharWidth; - float newMaxCharHeight = initialMaxCharHeight; + float newMinCharWidth = 10; + float newMinCharHeight = 30; Map result = new HashMap<>(); for(var textPositionSequence: stripper.getTextPositionSequences() ) { for(var redTextPosition: textPositionSequence.getTextPositions()) { for(var area: spreedSheetArea) { if(area.contains(redTextPosition.getPosition()[0], redTextPosition.getPosition()[1], redTextPosition.getPosition()[2], redTextPosition.getPosition()[3])) { - if(redTextPosition.getHeightDir() > newMaxCharHeight) { - newMaxCharHeight = redTextPosition.getHeightDir(); + if(redTextPosition.getHeightDir() < newMinCharHeight) { + newMinCharHeight = redTextPosition.getHeightDir(); } if(redTextPosition.getWidthDirAdj() < newMinCharWidth) { newMinCharWidth = redTextPosition.getWidthDirAdj(); @@ -275,7 +275,7 @@ public class LayoutParsingPipeline { } } result.put("minCharWidth",newMinCharWidth); - result.put("maxCharHeight",newMaxCharHeight); + result.put("minCharHeigth",newMinCharHeight); return result; } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Table.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Table.java index d3c2d66..fb6e7b7 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Table.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Table.java @@ -35,6 +35,7 @@ public class Table implements SemanticNode { int numberOfRows; int numberOfCols; int firstpage; + int emptyCells; TextBlock textBlock; @Builder.Default @@ -208,7 +209,6 @@ public class Table implements SemanticNode { return IntStream.range(0, numberOfCols).boxed().map(col -> getCell(row, col)); } - /** * Streams all TableCells row-wise and filters them with header == true. * diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/TablePageBlock.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/TablePageBlock.java index 0ecf5d3..0c8a025 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/TablePageBlock.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/TablePageBlock.java @@ -252,7 +252,7 @@ public class TablePageBlock extends AbstractPageBlock { if (prevY != null && prevX != null) { var cell = new Cell(new Point2D.Float(prevX, prevY), new Point2D.Float(x, y)); - var intersectionCell = cells.stream().filter(c -> cell.intersects(c) && cell.overlapRatio(c) > 0.1f).findFirst(); + var intersectionCell = cells.stream().filter(c -> cell.intersects(c)).findFirst(); intersectionCell.ifPresent(value -> cell.getTextBlocks().addAll(value.getTextBlocks())); if (cell.hasMinimumSize()) { row.add(cell); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TableExtractionService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TableExtractionService.java index 1d486c9..284fd79 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TableExtractionService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TableExtractionService.java @@ -1,6 +1,9 @@ package com.knecon.fforesight.service.layoutparser.processor.services; import java.awt.geom.Point2D; +import java.io.FileOutputStream; +import java.io.IOException; +import java.nio.file.Path; import java.util.ArrayList; import java.util.Comparator; import java.util.HashMap; @@ -10,6 +13,8 @@ import java.util.List; import java.util.Map; import java.util.Set; +import org.apache.pdfbox.Loader; +import org.springframework.core.io.ClassPathResource; import org.springframework.stereotype.Service; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; @@ -66,6 +71,20 @@ public class TableExtractionService { }; + public boolean contains(Cell cell, double x, double y, double w, double h) { + if (cell.isEmpty() || w <= 0 || h <= 0) { + return false; + } + double x0 = cell.getX(); + double y0 = cell.getY(); + return (x >= x0-2 && + y >= y0-2 && + (x + w) <= x0 + cell.getWidth()+2 && + (y + h) <= y0 + cell.getHeight()+2); + } + + + /** * Finds tables on a page and moves textblocks into cells of the found tables. * Note: This algorithm uses Pdf Coordinate System where {0,0} rotated with the page rotation. @@ -84,13 +103,12 @@ public class TableExtractionService { List cells = findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical(), layoutParsingType); - List toBeRemoved = new ArrayList<>(); for (AbstractPageBlock abstractPageBlock : page.getTextBlocks()) { TextPageBlock textBlock = (TextPageBlock) abstractPageBlock; for (Cell cell : cells) { - if (cell.hasMinimumSize() && cell.intersects(textBlock.getPdfMinX(), + if (cell.hasMinimumSize() && contains(cell, textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getPdfMaxX() - textBlock.getPdfMinX(), textBlock.getPdfMaxY() - textBlock.getPdfMinY())) { @@ -104,7 +122,7 @@ public class TableExtractionService { cells = new ArrayList<>(new HashSet<>(cells)); DoubleComparisons.sort(cells, Rectangle.ILL_DEFINED_ORDER); - List spreadsheetAreas = findSpreadsheetsFromCells(cells).stream().filter(r -> r.getWidth() > 0f && r.getHeight() > 0f).toList(); + List spreadsheetAreas = findSpreadsheetsFromCells(cells); List tables = new ArrayList<>(); for (Rectangle area : spreadsheetAreas) { @@ -131,9 +149,18 @@ public class TableExtractionService { if (position != -1) { page.getTextBlocks().add(position, table); } + + String tmpFileName = "C:/Users/YANNIK~1/AppData/Local/Temp/page1.tables.html"; + try (FileOutputStream fileOutputStream = new FileOutputStream(Path.of(tmpFileName).toFile())) { + fileOutputStream.write(table.getTextAsHtml().getBytes()); + } + catch (IOException e) { + throw new RuntimeException(e); + } } page.getTextBlocks().removeAll(toBeRemoved); + } public List getSpreadSheetArea(CleanRulings cleanRulings, LayoutParsingType layoutParsingType) { diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java index 8875e01..af0de8f 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java @@ -2,14 +2,31 @@ package com.knecon.fforesight.service.layoutparser.server.graph; import java.io.FileOutputStream; import java.nio.file.Path; +import java.util.List; import org.apache.pdfbox.Loader; +import org.apache.pdfbox.pdmodel.PDDocument; import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; +import org.springframework.beans.factory.annotation.Autowired; import org.springframework.core.io.ClassPathResource; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table; +import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; +import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock; +import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse; +import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse; +import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService; +import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService; +import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory; +import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper; +import com.knecon.fforesight.service.layoutparser.processor.services.mapper.PropertiesMapper; import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService; import com.knecon.fforesight.service.layoutparser.processor.services.visualization.ViewerDocumentService; import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest; @@ -18,19 +35,80 @@ import lombok.SneakyThrows; public class ViewerDocumentTest extends BuildDocumentTest { + @Autowired + private SectionsBuilderService sectionsBuilderService; + + @Autowired + private RedactManagerClassificationService redactManagerClassificationService; + @Test - @Disabled @SneakyThrows public void testViewerDocument() { + String fileName = "files/2Tables.pdf"; + String tmpFileName = "C:/Users/YANNIK~1/AppData/Local/Temp/2Tables.lines.pdf"; LayoutGridService layoutGridService = new LayoutGridService(); ViewerDocumentService viewerDocumentService = new ViewerDocumentService(layoutGridService); - String fileName = "files/bdr/notMergedParagraphs.pdf"; Document document = buildGraph(fileName, LayoutParsingType.TAAS); - String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf"; try (var pdDocument = Loader.loadPDF(new ClassPathResource(fileName).getFile()); var out = new FileOutputStream(tmpFileName)) { viewerDocumentService.createViewerDocument(pdDocument, document, out, true); } } + @Test + @SneakyThrows + public void testTableViewerDocument() { + + String fileName = "C:\\Users\\YannikHampe\\repos\\layout-parser\\layoutparser-service\\layoutparser-service-server\\src\\test\\resources\\files\\SinglePages\\VV-931175_Page1.pdf"; + String tmpFileName = "C:/Users/YANNIK~1/AppData/Local/Temp/page1.lines.pdf"; + Document documentGraph = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER, + Loader.loadPDF(Path.of(fileName).toFile()), + new ImageServiceResponse(), + new TableServiceResponse())); + LayoutGridService layoutGridService = new LayoutGridService(); + ViewerDocumentService viewerDocumentService = new ViewerDocumentService(layoutGridService); + try (var pdDocument = Loader.loadPDF(Path.of(fileName).toFile()); var out = new FileOutputStream(tmpFileName)) { + viewerDocumentService.createViewerDocument(pdDocument, documentGraph, out, true); + } + //durch rows + DocumentData documentData = DocumentDataMapper.toDocumentData(documentGraph); + int emptyCellCount = 0; + List listStructure2 = documentData.getDocumentStructure() + .streamAllEntries() + .filter(entryData -> entryData.getType().equals(NodeType.TABLE)) + .map(DocumentStructure.EntryData::getProperties) + .map(properties -> { + var builder = Table.builder(); + PropertiesMapper.parseTableProperties(properties, builder); + return builder.build(); + }).toList(); + for(int i = 0; i < listStructure2.size(); i++) { + emptyCellCount = ((Table) listStructure2.get(i)).getEmptyCells(); + } + + System.out.println("Empty cells "+emptyCellCount); + + ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(Path.of(fileName).toFile())); + TablePageBlock table = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(0); + int emptyCellsFoundFound = table.getRows().stream().flatMap(List::stream).toList().stream().filter(f -> f.toString().equals("")).toList().size(); + for (List row : table.getRows()) { + System.out.println(row.toString()); + } + System.out.println("Actual number of empty rows: "+emptyCellsFoundFound); + } + + public ClassificationDocument buildClassificationDocument(PDDocument originDocument) { + + ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER, + originDocument, + new ImageServiceResponse(), + new TableServiceResponse()); + + redactManagerClassificationService.classifyDocument(classificationDocument); + + sectionsBuilderService.buildSections(classificationDocument); + + return classificationDocument; + } + } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java index be893a2..0ca82b5 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java @@ -29,6 +29,8 @@ import java.util.stream.Collectors; import static org.assertj.core.api.Assertions.assertThat; +import javax.sound.midi.SysexMessage; + public class PdfSegmentationServiceTest extends AbstractTest { @Autowired @@ -52,7 +54,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { public ClassificationDocument buildClassificationDocument(PDDocument originDocument) { - ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE, + ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER, originDocument, new ImageServiceResponse(), new TableServiceResponse()); @@ -166,8 +168,8 @@ public class PdfSegmentationServiceTest extends AbstractTest { validateTable(document, 0, 1, 1, 0, 0); validateTable(document, 1, 2, 2, 0, 0); - validateTable(document, 2, 7, 20, 0, 140); - validateTable(document, 3, 8, 31, 0, 170); + validateTable(document, 2, 7, 20, 0, 0); + validateTable(document, 3, 8, 31, 0, 0); } @@ -181,7 +183,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { validateTableSize(document, 1); - validateTable(document, 0, 8, 8, 0, 2); + validateTable(document, 0, 8, 8, 0, 0); List> values = Arrays.asList(Arrays.asList("Annex point Reference within DAR/RAR", "Author, date", @@ -191,18 +193,18 @@ public class PdfSegmentationServiceTest extends AbstractTest { "Method meets analytical validation criteria", "Remarks (in case validation criteria are not met)", "Acceptability of the method"), - Arrays.asList("", + Arrays.asList("Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", + "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", - "", "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies"), Arrays.asList("CA 7.1.2.1.1 DAR (2009)", "Evans P.G. 2001 TMJ4569B, VV-323245", "Azoxystrobin Laboratory Degradation Study in Three Soil Types, Sampled from Holland and the United Kingdom", - "Method: RAM 269 Johnson R.I., Tummon O.J., Earl M. 1995 RJ1864B, VV-377731 Johnson R.I., Tummon O.J., Earl M. 1998 RAM 269/02, VV-124072 Johnson R.I., Tummon O.J., Earl M. 2000 RAM 269/03, VV-123986 Validation: Robinson N.J. 2001 TMJ4617B, VV-895845 in a Trial Carried", + "Method: RAM 269 Johnson R.I., Tummon O.J., Earl M. 1995 RJ1864B, VV-377731 Johnson R.I., Tummon O.J., Earl M. 1998 RAM 269/02, VV-124072 Johnson R.I., Tummon O.J., Earl M. 2000 RAM 269/03, VV-123986 Validation: Robinson N.J. 2001 TMJ4617B, VV-895845", "LC-MS/MS LOQ: 0.01 mg/kg (R401553 (SYN50165 7), R402173 (SYN501114 )) or 0.02 mg/kg (azoxystrobin, R230310, R234886) Working range: 0.02-1.0 or 0.01-0.5 mg/kg (depending on analyte) Other supporting quantificati on methods: HPLC-UV GC-MSD", "Y", "N/A", @@ -239,8 +241,8 @@ public class PdfSegmentationServiceTest extends AbstractTest { validateTableSize(document, 2); - validateTable(document, 0, 5, 5, 0, 23); - validateTable(document, 1, 11, 9, 0, 36); + validateTable(document, 0, 5, 5, 0, 0); + validateTable(document, 1, 11, 9, 0, 0); } @@ -328,7 +330,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile())); validateTableSize(document, 1); - validateTable(document, 0, 10, 6, 0, 1); + validateTable(document, 0, 10, 6, 0, 0); } @@ -450,8 +452,8 @@ public class PdfSegmentationServiceTest extends AbstractTest { validateTableSize(document, 2); - validateTable(document, 0, 6, 8, 0, 2); - validateTable(document, 1, 6, 8, 0, 1); + validateTable(document, 0, 6, 8, 0, 0); + validateTable(document, 1, 6, 8, 0, 0); } @@ -465,7 +467,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { validateTableSize(document, 1); - validateTable(document, 0, 9, 5, 2, 0); + validateTable(document, 0, 9, 5, 0, 0); } @@ -490,6 +492,9 @@ public class PdfSegmentationServiceTest extends AbstractTest { List> rows = table.getRows(); int emptyCellsFoundFound = rows.stream().flatMap(List::stream).toList().stream().filter(f -> f.toString().equals("")).toList().size(); + for (List row : table.getRows()) { + row.forEach(r -> System.out.println(r.toString())); + } assertThat(emptyCellsFoundFound).isEqualTo(emptyCellsCountCorrect + emptyCellsCountIncorrect); assertThat(table.getColCount()).isEqualTo(colCount); diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingCleaningServiceTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingCleaningServiceTest.java index cceec48..b1353d5 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingCleaningServiceTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingCleaningServiceTest.java @@ -1,37 +1,159 @@ package com.knecon.fforesight.service.layoutparser.server.services; +import java.io.File; +import java.io.FileOutputStream; +import java.nio.file.Files; import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.Arrays; import java.util.Collections; import java.util.LinkedList; import java.util.List; -import org.junit.jupiter.api.Test; +import javax.print.Doc; +import org.apache.pdfbox.Loader; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.junit.jupiter.api.Test; +import org.springframework.core.io.ClassPathResource; + +import com.fasterxml.jackson.databind.ObjectMapper; +import com.iqser.red.commons.jackson.ObjectMapperFactory; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType; +import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; import com.knecon.fforesight.service.layoutparser.processor.model.PageContents; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table; import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; +import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse; +import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor; import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService; +import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory; +import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper; +import com.knecon.fforesight.service.layoutparser.processor.services.mapper.PropertiesMapper; +import com.knecon.fforesight.service.layoutparser.processor.services.parsing.PDFLinesTextStripper; +import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService; +import com.knecon.fforesight.service.layoutparser.processor.services.visualization.ViewerDocumentService; +import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest; import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw; import lombok.SneakyThrows; -public class RulingCleaningServiceTest { +public class RulingCleaningServiceTest extends BuildDocumentTest { @Test // @Disabled @SneakyThrows public void textRulingExtraction() { - String fileName = "files/211.pdf"; - String lineFileName = "/tmp/" + Path.of(fileName).getFileName().toString() + "_LINES.pdf"; + String fileName = "/files/102 S-Metolachlor_RAR_02_Volume_2_2018-09-06.pdf"; + String lineFileName = "C:/Users/YANNIK~1/AppData/Local/Temp/102 S-Metolachlor_RAR_02_Volume_2_2018-09-06.after.pdf"; List pageContents = PageContentExtractor.getSortedPageContents(fileName); - PdfDraw.drawLinesPerPage(fileName, pageContents.stream().map(PageContents::getRulings).toList(), lineFileName); - RulingCleaningService rulingCleaningService = new RulingCleaningService(); List cleanRulingsPerPage = new LinkedList<>(); + writeJsons(Path.of(fileName)); for (PageContents pageContent : pageContents) { - cleanRulingsPerPage.add(rulingCleaningService.getCleanRulings(Collections.emptyList(), pageContent.getRulings(), 8, 20)); + cleanRulingsPerPage.add(rulingCleaningService.getCleanRulings(Collections.emptyList(), pageContent.getRulings(), 8, 1)); + } + PdfDraw.drawLinesPerPage(fileName, pageContents.stream().map(PageContents::getRulings).toList(), lineFileName); + + + } + + @Test + @SneakyThrows + public void testTableExtractionSingle() { + String filename ="C:\\Users\\YannikHampe\\repos\\layout-parser\\layoutparser-service\\layoutparser-service-server\\src\\test\\resources\\files\\SinglePages\\24 - SYN549522 - Acute Oral Toxicity - Rats_Page17.pdf"; + writeJsons(Path.of(filename)); + + } + + @Test + @SneakyThrows + public void testTableExtraction() { + + + LayoutGridService layoutGridService = new LayoutGridService(); + ViewerDocumentService viewerDocumentService = new ViewerDocumentService(layoutGridService); + + ClassPathResource resource = new ClassPathResource("files"); + List pdfFileNames = Files.walk(resource.getFile().toPath()) + .filter(path -> path.getFileName().toString().endsWith(".pdf")) + .map(Path::toAbsolutePath) + .map(Path::toString) + .toList(); + + for (int i = 0; i < pdfFileNames.size(); i++) { + writeJsons(Path.of(pdfFileNames.get(i))); } } + @SneakyThrows + private void writeJsons(Path filename) { + + Document documentGraphBefore = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER, + Loader.loadPDF(filename.toFile()), + new ImageServiceResponse(), + new TableServiceResponse())); + Document documentGraphAfter = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER, + Loader.loadPDF(filename.toFile()), + new ImageServiceResponse(), + new TableServiceResponse())); + DocumentData documentDataBefore = DocumentDataMapper.toDocumentData(documentGraphBefore); + DocumentData documentDataAfter = DocumentDataMapper.toDocumentData(documentGraphAfter); + if(!compareStructures(documentDataBefore.getDocumentStructure(), documentDataAfter.getDocumentStructure(), filename.getFileName().toString())) { + String tmpFileNameBefore = "C:/Users/YANNIK~1/AppData/Local/Temp/before."+filename.getFileName().toString();; + System.out.println(tmpFileNameBefore); + try (PDDocument pdDocument = Loader.loadPDF(filename.toFile())) { + PdfDraw.drawDocumentGraph(pdDocument, documentGraphBefore); + pdDocument.save(tmpFileNameBefore); + } + String tmpFileNameAfter = "C:/Users/YANNIK~1/AppData/Local/Temp/after."+filename.getFileName().toString();; + System.out.println(tmpFileNameAfter); + try (PDDocument pdDocument = Loader.loadPDF(filename.toFile())) { + PdfDraw.drawDocumentGraph(pdDocument, documentGraphAfter); + pdDocument.save(tmpFileNameAfter); + + } + } + } + @SneakyThrows + private boolean compareStructures(DocumentStructure structure1, DocumentStructure structure2, String pdfName) { + + + List listStructure1 = structure1 + .streamAllEntries() + .filter(entryData -> entryData.getType().equals(NodeType.TABLE)) + .map(DocumentStructure.EntryData::getProperties) + .map(properties -> { + var builder = Table.builder(); + PropertiesMapper.parseTableProperties(properties, builder); + return builder.build(); + }).toList(); + + List listStructure2 = structure2 + .streamAllEntries() + .filter(entryData -> entryData.getType().equals(NodeType.TABLE)) + .map(DocumentStructure.EntryData::getProperties) + .map(properties -> { + var builder = Table.builder(); + PropertiesMapper.parseTableProperties(properties, builder); + return builder.build(); + }).toList(); + + + for(int i = 0; i < listStructure1.size(); i++) { + Table tableNode1 = (Table) listStructure1.get(i); + Table tableNode2 = (Table) listStructure2.get(i); + if(tableNode1.getNumberOfRows() != tableNode2.getNumberOfRows() || tableNode1.getNumberOfCols() != tableNode2.getNumberOfCols()) { + return false; + } + } + return true; + } + }