diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineObject.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineObject.java index b6b9efe..6f8af6b 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineObject.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineObject.java @@ -16,7 +16,7 @@ public class OutlineObject { private Point2D point; private final int treeDepth; - private boolean found = false; + private boolean found; public OutlineObject(String title, int pageNumber, Point2D point2D, int depth) { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/TableOfContents.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/TableOfContents.java index 72ee8a2..8d80cd3 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/TableOfContents.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/TableOfContents.java @@ -97,7 +97,7 @@ public class TableOfContents implements Iterable { private final Stack> stack = new Stack<>(); - public TableOfContentItemIterator(List mainSections) { + TableOfContentItemIterator(List mainSections) { stack.push(mainSections.iterator()); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/BlockificationPostprocessingService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/BlockificationPostprocessingService.java index a38780a..7863e71 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/BlockificationPostprocessingService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/BlockificationPostprocessingService.java @@ -206,9 +206,11 @@ public class BlockificationPostprocessingService { if (minDistance == distanceToDirectMatch) { directMatch.setClassification(headlineType); } else if (minDistance == distanceToSplitCandidate) { - List others = splitBlock(classificationPage, splitCandidate, context.sectionIdentifier, outlineObject.getTitle()); - splitCandidate.setClassification(headlineType); - others.forEach(other -> other.setClassification(null)); + SplitBlockResult splitBlockResult = splitBlock(classificationPage, splitCandidate, context.sectionIdentifier, outlineObject.getTitle()); + if (splitBlockResult.modifiedBlockToSplit) { + splitCandidate.setClassification(headlineType); + } + splitBlockResult.otherBlocks.forEach(other -> other.setClassification(null)); } else { var merged = mergeBlocks(classificationPage, bestMergeCandidateCombination); merged.setClassification(headlineType); @@ -217,7 +219,7 @@ public class BlockificationPostprocessingService { } - private List splitBlock(ClassificationPage classificationPage, TextPageBlock blockToSplit, SectionIdentifier sectionIdentifier, String title) { + private SplitBlockResult splitBlock(ClassificationPage classificationPage, TextPageBlock blockToSplit, SectionIdentifier sectionIdentifier, String title) { List otherBlocks = new ArrayList<>(); int blockToSplitIdx = classificationPage.getTextBlocks().indexOf(blockToSplit); @@ -228,12 +230,16 @@ public class BlockificationPostprocessingService { } WordSequenceResult wordSequenceResult = findWordSequence(blockToSplit.getSequences(), headline); - if (wordSequenceResult.inSequence.isEmpty()) { + if (wordSequenceResult.inSequence.isEmpty() && !headline.equals(title)) { wordSequenceResult = findWordSequence(blockToSplit.getSequences(), title); } - blockToSplit.setSequences(wordSequenceResult.inSequence); - blockToSplit.resize(); + boolean modifiedBlockToSplit = false; + if (!wordSequenceResult.inSequence.isEmpty()) { + blockToSplit.setSequences(wordSequenceResult.inSequence); + blockToSplit.resize(); + modifiedBlockToSplit = true; + } if (!wordSequenceResult.preSequence.isEmpty()) { TextPageBlock block = buildTextBlock(wordSequenceResult.preSequence, 0); @@ -246,32 +252,8 @@ public class BlockificationPostprocessingService { classificationPage.getTextBlocks().add(blockToSplitIdx + 1, block); otherBlocks.add(block); } - return otherBlocks; - } - - - public static class WordSequenceResult { - - public List inSequence; - public List preSequence; - public List postSequence; - - - public WordSequenceResult(List inSequence, List preSequence, List postSequence) { - - this.inSequence = inSequence; - this.preSequence = preSequence; - this.postSequence = postSequence; - } - - - public WordSequenceResult() { - - this.inSequence = new ArrayList<>(); - this.preSequence = new ArrayList<>(); - this.postSequence = new ArrayList<>(); - } + return new SplitBlockResult(modifiedBlockToSplit, otherBlocks); } @@ -358,11 +340,6 @@ public class BlockificationPostprocessingService { } - public record SplitSequenceResult(TextPositionSequence in, TextPositionSequence out) { - - } - - private static TextPositionSequence createSubSequence(TextPositionSequence sequence, int start, int end) { TextPositionSequence newSeq = new TextPositionSequence(new ArrayList<>(sequence.getTextPositions().subList(start, end)), sequence.getPage()); @@ -502,7 +479,7 @@ public class BlockificationPostprocessingService { private SectionIdentifier sectionIdentifier; - public OutlineProcessionContext(OutlineObject outlineObject) { + OutlineProcessionContext(OutlineObject outlineObject) { this.outlineObject = outlineObject; this.directMatch = null; @@ -513,4 +490,36 @@ public class BlockificationPostprocessingService { } + public static class WordSequenceResult { + + public List inSequence; + public List preSequence; + public List postSequence; + + + public WordSequenceResult(List inSequence, List preSequence, List postSequence) { + + this.inSequence = inSequence; + this.preSequence = preSequence; + this.postSequence = postSequence; + } + + + public WordSequenceResult() { + + this.inSequence = new ArrayList<>(); + this.preSequence = new ArrayList<>(); + this.postSequence = new ArrayList<>(); + } + + } + + public record SplitBlockResult(boolean modifiedBlockToSplit, List otherBlocks) { + + } + + public record SplitSequenceResult(TextPositionSequence in, TextPositionSequence out) { + + } + } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/HeadlineClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/HeadlineClassificationService.java index f8b6ea7..e302321 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/HeadlineClassificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/HeadlineClassificationService.java @@ -33,28 +33,29 @@ public class HeadlineClassificationService { } - public void classifyHeadline(TextPageBlock textBlock, PageBlockType headlineType) { + public void classifyHeadline(TextPageBlock textBlock, PageBlockType initialHeadlineType) { TextPageBlock lastHeadline = getLastHeadline(); TextPageBlock lastHeadlineFromOutline = getLastHeadlineFromOutline(); PageBlockType originalClassifiedBlockType = getOriginalClassifiedBlockType(); + PageBlockType finalHeadlineType = initialHeadlineType; if (lastHeadline != null) { if (lastHeadline.equals(lastHeadlineFromOutline)) { - headlineType = PageBlockType.getHeadlineType(getHeadlineNumber(lastHeadline.getClassification()) + 1); + finalHeadlineType = PageBlockType.getHeadlineType(getHeadlineNumber(lastHeadline.getClassification()) + 1); } else if (originalClassifiedBlockType != null && lastHeadline.getClassification() != originalClassifiedBlockType) { PageBlockType lastHeadlineType = lastHeadline.getClassification(); int difference = getHeadlineNumber(originalClassifiedBlockType) - getHeadlineNumber(lastHeadlineType); - headlineType = PageBlockType.getHeadlineType(getHeadlineNumber(headlineType) + difference); + finalHeadlineType = PageBlockType.getHeadlineType(getHeadlineNumber(initialHeadlineType) - difference); } } - setOriginalClassifiedBlockType(headlineType); - textBlock.setClassification(headlineType); + setOriginalClassifiedBlockType(initialHeadlineType); + textBlock.setClassification(finalHeadlineType); setLastHeadline(textBlock); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/TableNodeFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/TableNodeFactory.java index 1a097fc..a77bf22 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/TableNodeFactory.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/TableNodeFactory.java @@ -150,7 +150,7 @@ public class TableNodeFactory { cell.getTextBlocks() .stream() .map(tb -> (AbstractPageBlock) tb) - .toList(), + .collect(Collectors.toList()), emptyList(), context, document); diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/HeadlinesGoldStandardIntegrationTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/HeadlinesGoldStandardIntegrationTest.java index 274e1e8..1c4514f 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/HeadlinesGoldStandardIntegrationTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/HeadlinesGoldStandardIntegrationTest.java @@ -69,10 +69,10 @@ public class HeadlinesGoldStandardIntegrationTest { public void testHeadlineDetection() { List metrics = new ArrayList<>(); - metrics.add(getMetrics("files/syngenta/CustomerFiles/01 - CGA100251 - Acute Oral Toxicity (Up and Down Procedure) - Rat (1).pdf", - "files/headlineTest/01 - CGA100251 - Acute Oral Toxicity (Up and Down Procedure) - Rat (1)_REDACTION_LOG.json")); - metrics.add(getMetrics("files/syngenta/CustomerFiles/91 Trinexapac-ethyl_RAR_01_Volume_1_2018-02-23.pdf", - "files/headlineTest/91 Trinexapac-ethyl_RAR_01_Volume_1_2018-02-23_REDACTION_LOG.json")); + //metrics.add(getMetrics("files/syngenta/CustomerFiles/01 - CGA100251 - Acute Oral Toxicity (Up and Down Procedure) - Rat (1).pdf", + // "files/headlineTest/01 - CGA100251 - Acute Oral Toxicity (Up and Down Procedure) - Rat (1)_REDACTION_LOG.json")); + //metrics.add(getMetrics("files/syngenta/CustomerFiles/91 Trinexapac-ethyl_RAR_01_Volume_1_2018-02-23.pdf", + // "files/headlineTest/91 Trinexapac-ethyl_RAR_01_Volume_1_2018-02-23_REDACTION_LOG.json")); metrics.add(getMetrics("files/syngenta/CustomerFiles/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf", "files/headlineTest/S-Metolachlor_RAR_01_Volume_1_2018-09-06_REDACTION_LOG.json")); double precision = metrics.stream().mapToDouble(Metrics::getPrecision).average().orElse(1.0); @@ -96,8 +96,8 @@ public class HeadlinesGoldStandardIntegrationTest { goldStandardLog.getRedactionLogEntry().removeIf(r -> !r.isRedacted() || r.getChanges().get(r.getChanges().size() - 1).getType().equals(ChangeType.REMOVED)); goldStandardLog.getRedactionLogEntry().forEach(e -> goldStandardHeadlines.add(new Headline(e.getPositions().get(0).getPage(), e.getValue()))); - Document documentGraph = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_OLD, - layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_OLD, + Document documentGraph = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER, + layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER, pdfFileResource.getFile(), new ImageServiceResponse(), new TableServiceResponse(), diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java index a26754a..0ac3ef5 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java @@ -27,7 +27,8 @@ public class ViewerDocumentTest extends BuildDocumentTest { @SneakyThrows public void testViewerDocument() { - String fileName = "files/syngenta/CustomerFiles/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf"; + String fileName = "files/syngenta/CustomerFiles/90 Trinexapac-ethyl - Peer Review Report Syngenta - March 2018.pdf"; + String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf"; var documentFile = new ClassPathResource(fileName).getFile(); diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingCleaningServiceTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingCleaningServiceTest.java index a66d540..1a5755b 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingCleaningServiceTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingCleaningServiceTest.java @@ -92,6 +92,7 @@ public class RulingCleaningServiceTest extends BuildDocumentTest { .toList(); for (String pdfFileName : pdfFileNames) { + writeJsons(Path.of(pdfFileName)); } } @@ -100,15 +101,15 @@ public class RulingCleaningServiceTest extends BuildDocumentTest { @SneakyThrows private void writeJsons(Path filename) { - Document documentGraphBefore = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_OLD, - layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_OLD, + Document documentGraphBefore = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER, + layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER, filename.toFile(), new ImageServiceResponse(), new TableServiceResponse(), new VisualLayoutParsingResponse(), Map.of("file",filename.toFile().toString()))); - Document documentGraphAfter = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_OLD, - layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_OLD, + Document documentGraphAfter = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER, + layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER, filename.toFile(), new ImageServiceResponse(), new TableServiceResponse(), diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/files/SinglePages/DontMergeNonConsecutiveTables.pdf b/layoutparser-service/layoutparser-service-server/src/test/resources/files/SinglePages/DontMergeNonConsecutiveTables.pdf deleted file mode 100644 index 4e18c90..0000000 Binary files a/layoutparser-service/layoutparser-service-server/src/test/resources/files/SinglePages/DontMergeNonConsecutiveTables.pdf and /dev/null differ