diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java index b021e37..dfa0537 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java @@ -47,6 +47,7 @@ import com.knecon.fforesight.service.layoutparser.processor.services.TableExtrac import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocstrumBlockificationService; import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocuMineBlockificationService; import com.knecon.fforesight.service.layoutparser.processor.services.blockification.RedactManagerBlockificationService; +import com.knecon.fforesight.service.layoutparser.processor.services.classification.ClarifyndClassificationService; import com.knecon.fforesight.service.layoutparser.processor.services.classification.DocuMineClassificationService; import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService; import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory; @@ -88,6 +89,7 @@ public class LayoutParsingPipeline { LayoutGridService layoutGridService; ObservationRegistry observationRegistry; VisualLayoutParsingAdapter visualLayoutParsingAdapter; + ClarifyndClassificationService clarifyndClassificationService; public LayoutParsingFinishedEvent parseLayoutAndSaveFilesToStorage(LayoutParsingRequest layoutParsingRequest) throws IOException { @@ -291,7 +293,7 @@ public class LayoutParsingPipeline { if 
(layoutParsingType == LayoutParsingType.REDACT_MANAGER) { docstrumBlockificationService.combineBlocks(classificationPage); } else if (layoutParsingType == LayoutParsingType.CLARIFYND) { - docstrumBlockificationService.mergeIntersectingBlocks(classificationPage.getTextBlocks()); + docstrumBlockificationService.mergeIntersectingBlocks(classificationPage.getTextBlocks(), 0, 6.5f); } buildPageStatistics(classificationPage); @@ -306,9 +308,10 @@ public class LayoutParsingPipeline { bodyTextFrameService.setBodyTextFrames(classificationDocument, layoutParsingType); log.info("Classify TextBlocks for {}", identifier); switch (layoutParsingType) { - case REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_OLD, CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> + case REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_OLD, CLARIFYND_PARAGRAPH_DEBUG -> redactManagerClassificationService.classifyDocument(classificationDocument); case DOCUMINE -> docuMineClassificationService.classifyDocument(classificationDocument); + case CLARIFYND -> clarifyndClassificationService.classifyDocument(classificationDocument); } log.info("Building Sections for {}", identifier); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java index fc560a2..23e598a 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java @@ -80,7 +80,7 @@ public class 
DocstrumBlockificationService { public void combineBlocks(ClassificationPage page) { - mergeIntersectingBlocks(page.getTextBlocks()); + mergeIntersectingBlocks(page.getTextBlocks(), 0, 0); TextPageBlock previous = new TextPageBlock(); ListIterator itty = page.getTextBlocks().listIterator(); @@ -123,7 +123,7 @@ public class DocstrumBlockificationService { previous = current; } - mergeIntersectingBlocks(page.getTextBlocks()); + mergeIntersectingBlocks(page.getTextBlocks(), 0, 0); } @@ -203,7 +203,7 @@ public class DocstrumBlockificationService { } - public void mergeIntersectingBlocks(List blocks) { + public void mergeIntersectingBlocks(List blocks, float xThreshold, float yThreshold) { ListIterator itty = blocks.listIterator(); Set toRemove = new HashSet<>(); @@ -237,7 +237,7 @@ public class DocstrumBlockificationService { continue; } - if (current.getDir() == inner.getDir() && current.almostIntersects(inner, 0, 0)) { + if (current.getDir() == inner.getDir() && current.almostIntersects(inner, yThreshold, xThreshold)) {/* NOTE(review): the thresholds are forwarded as (yThreshold, xThreshold) although mergeIntersectingBlocks declares them as (xThreshold, yThreshold); confirm that almostIntersects expects the y threshold first. */ current.getSequences().addAll(inner.getSequences()); QuickSort.sort(current.getSequences(), new TextPositionSequenceComparator()); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/ClarifyndClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/ClarifyndClassificationService.java new file mode 100644 index 0000000..e3520c7 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/ClarifyndClassificationService.java @@ -0,0 +1,114 @@ +package com.knecon.fforesight.service.layoutparser.processor.services.classification; + +import java.util.List; +import java.util.regex.Pattern; + +import org.springframework.stereotype.Service; + 
+import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument; +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; +import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils; +import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils; + +import lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; +/** Assigns a PageBlockType to the text blocks of a ClassificationDocument; LayoutParsingPipeline calls it for LayoutParsingType.CLARIFYND. Blocks matched by the header or footer rules are classified as PARAGRAPH here. */ +@Slf4j +@Service +@RequiredArgsConstructor +public class ClarifyndClassificationService { +/** Classifies every text block on every page of the document in place. The font sizes returned by getHighterThanMostPopular() are fetched once and serve as the headline size candidates for all pages. @param document document whose page blocks receive a classification */ + public void classifyDocument(ClassificationDocument document) { + + List headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular(); + + log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue()); + + for (ClassificationPage page : document.getPages()) { + classifyPage(page, document, headlineFontSizes); + } + } + +/** Classifies each TextPageBlock of the page; blocks of any other AbstractPageBlock type are left untouched. @param page page whose text blocks are visited @param document owning document, source of the font statistics @param headlineFontSizes headline size candidates computed in classifyDocument */ + private void classifyPage(ClassificationPage page, ClassificationDocument document, List headlineFontSizes) { + + for (AbstractPageBlock textBlock : page.getTextBlocks()) { + if (textBlock instanceof TextPageBlock) { + classifyBlock((TextPageBlock) textBlock, page, document, headlineFontSizes); + } + } + } + +/** Sets the classification of a single block. After the initial guard the conditions form one else-if chain, so the first matching rule wins and their order matters. Since the and-operator binds tighter than the or-operator, a marked content match alone is enough for the first two rules. @param textBlock block to classify @param page page containing the block @param document owning document, source of the font statistics @param headlineFontSizes headline size candidates; an equal entry at position i (counting from 1) yields PageBlockType.getHeadlineType(i) */ + private void classifyBlock(TextPageBlock textBlock, ClassificationPage page, ClassificationDocument document, List headlineFontSizes) { + + var bodyTextFrame = page.getBodyTextFrame(); + + if (document.getFontSizeCounter().getMostPopular() == null) {/* getMostPopular() returned null, so no font size comparison is possible: fall back to PARAGRAPH. */ + textBlock.setClassification(PageBlockType.PARAGRAPH); + return; + } + if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) || PositionUtils.isOverBodyTextFrame(bodyTextFrame, + textBlock, + page.getRotation()) && 
(document.getFontSizeCounter().getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter() + .getMostPopular())) {/* HEADER marked content, or a block over the body text frame whose highest font size does not exceed the most popular one. NOTE(review): the getMostPopular() == null test in this condition appears unreachable after the early return above. */ + textBlock.setClassification(PageBlockType.PARAGRAPH); + + } else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER) || PositionUtils.isUnderBodyTextFrame(bodyTextFrame, + textBlock, + page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter() + .getMostPopular())) {/* Same rule for FOOTER marked content and blocks under the body text frame; the null test appears unreachable here as well. */ + textBlock.setClassification(PageBlockType.PARAGRAPH); + } else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, + document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks() + .size() == 1)) {/* Page 1 only: the height difference to the most popular text height exceeds 2.5 and the highest font size exceeds the most popular one, or the block is the only text block on the page. A block consisting solely of digits gets no classification in this branch. */ + if (!Pattern.matches("[0-9]+", textBlock.toString())) { + textBlock.setClassification(PageBlockType.TITLE); + } + } else if (textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter() + .getMostPopular() && PositionUtils.getApproxLineCount(textBlock) < 4.9 && (textBlock.getMostPopularWordStyle().equals("bold") || !document.getFontStyleCounter() + .getCountPerValue() + .containsKey("bold") && textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular() + 1) && textBlock.getSequences() + .get(0) + .getTextPositions() + .get(0) + .getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) { +/* Headline by font size: the level is the position (counting from 1) of the entry in headlineFontSizes that equals the most popular word font size of the block. Without an equal entry nothing is set here. */ + for (int i = 1; i <= headlineFontSizes.size(); i++) { + if (textBlock.getMostPopularWordFontSize() == headlineFontSizes.get(i - 1)) { + textBlock.setClassification(PageBlockType.getHeadlineType(i)); + document.setHeadlines(true); + } + } + } else if (!textBlock.getText().startsWith("Figure ") && PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordStyle() + .equals("bold") && 
!document.getFontStyleCounter().getMostPopular().equals("bold") && PositionUtils.getApproxLineCount(textBlock) < 2.9 && textBlock.getSequences() + .get(0) + .getTextPositions() + .get(0) + .getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) {/* Bold block of fewer than about 2.9 lines inside the body text frame, in a document whose most popular style is not bold: headline level one past the size based levels. */ + textBlock.setClassification(PageBlockType.getHeadlineType(headlineFontSizes.size() + 1)); + document.setHeadlines(true); + } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter() + .getMostPopular() && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold")) {/* Bold text at the most popular font size in a document whose most popular style is not bold. */ + textBlock.setClassification(PageBlockType.PARAGRAPH_BOLD); + } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFont() + .equals(document.getFontCounter().getMostPopular()) && textBlock.getMostPopularWordStyle() + .equals(document.getFontStyleCounter().getMostPopular()) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()) {/* Font, style and size all equal the most popular values of the document. */ + textBlock.setClassification(PageBlockType.PARAGRAPH); + } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter() + .getMostPopular() && textBlock.getMostPopularWordStyle().equals("italic") && !document.getFontStyleCounter() + .getMostPopular() + .equals("italic") && PositionUtils.getApproxLineCount(textBlock) < 2.9) {/* Italic block of fewer than about 2.9 lines at the most popular font size. */ + textBlock.setClassification(PageBlockType.PARAGRAPH_ITALIC); + } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)) {/* Inside the body text frame but matched by none of the rules above. */ + textBlock.setClassification(PageBlockType.PARAGRAPH_UNKNOWN); + } else {/* Not inside the body text frame and not matched earlier. */ + textBlock.setClassification(PageBlockType.PARAGRAPH); + } + } + +} + diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java 
b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java index 3153952..9adf903 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java @@ -27,7 +27,7 @@ public class ViewerDocumentTest extends BuildDocumentTest { @SneakyThrows public void testViewerDocument() { - String fileName = "files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf"; + String fileName = "files/bdr/notMergedParagraphs.pdf"; String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf"; var documentFile = new ClassPathResource(fileName).getFile(); @@ -35,7 +35,7 @@ public class ViewerDocumentTest extends BuildDocumentTest { LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService); long start = System.currentTimeMillis(); - Document document = buildGraph(fileName, LayoutParsingType.REDACT_MANAGER); + Document document = buildGraph(fileName, LayoutParsingType.CLARIFYND); layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true); System.out.printf("Total time: %.2fs%n", ((float) (System.currentTimeMillis() - start)) / 1000); } diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/files/knecon_extracted_msg_WG_ BAP-BFB, Windtests.pdf b/layoutparser-service/layoutparser-service-server/src/test/resources/files/knecon_extracted_msg_WG_ BAP-BFB, Windtests.pdf new file mode 100644 index 0000000..e42b4cd Binary files /dev/null and b/layoutparser-service/layoutparser-service-server/src/test/resources/files/knecon_extracted_msg_WG_ BAP-BFB, Windtests.pdf differ