diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationDocument.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationDocument.java index 1044b14..e6ef1ad 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationDocument.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationDocument.java @@ -3,7 +3,6 @@ package com.knecon.fforesight.service.layoutparser.processor.model; import java.util.ArrayList; import java.util.List; -import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject; import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObjectTree; import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContents; import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter; @@ -18,7 +17,6 @@ public class ClassificationDocument { private List pages = new ArrayList<>(); private List sections = new ArrayList<>(); - //private Map> sectionsMap = new HashMap<>(); private List headers = new ArrayList<>(); private List footers = new ArrayList<>(); private List unclassifiedTexts = new ArrayList<>(); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Section.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Section.java index 953af03..5aed41d 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Section.java 
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Section.java @@ -1,10 +1,8 @@ package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes; import java.awt.geom.Rectangle2D; -import java.util.Arrays; import java.util.HashSet; import java.util.List; -import java.util.Locale; import java.util.Map; import java.util.Set; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/SuperSection.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/SuperSection.java index 2876c96..e3fe66c 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/SuperSection.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/SuperSection.java @@ -1,16 +1,5 @@ package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes; -import java.awt.geom.Rectangle2D; -import java.util.List; -import java.util.Map; -import java.util.Set; - -import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine; -import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree; -import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity; -import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock; - -import lombok.Builder; import lombok.Data; import lombok.EqualsAndHashCode; import lombok.experimental.SuperBuilder; @@ -20,14 +9,4 @@ import lombok.experimental.SuperBuilder; @EqualsAndHashCode(callSuper = true) public class SuperSection extends Section { - public SuperSection(Set engines, - List treeId, - 
TextBlock textBlock, - DocumentTree documentTree, - Set entities, - Map bBoxCache) { - - super(engines, treeId, textBlock, documentTree, entities, bBoxCache); - } - } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineObject.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineObject.java index 5e9bf6b..b6b9efe 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineObject.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineObject.java @@ -4,7 +4,6 @@ import java.awt.geom.Point2D; import lombok.AllArgsConstructor; import lombok.Data; -import lombok.NoArgsConstructor; import lombok.RequiredArgsConstructor; @Data diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineObjectTreeNode.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineObjectTreeNode.java index 7753030..f5cfd49 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineObjectTreeNode.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineObjectTreeNode.java @@ -1,12 +1,9 @@ package com.knecon.fforesight.service.layoutparser.processor.model.outline; -import java.awt.geom.Point2D; import java.util.ArrayList; import java.util.List; import lombok.Data; -import lombok.EqualsAndHashCode; -import lombok.Getter; @Data public class 
OutlineObjectTreeNode { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/ClarifyndClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/ClarifyndClassificationService.java index e3520c7..b0622e3 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/ClarifyndClassificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/ClarifyndClassificationService.java @@ -21,12 +21,16 @@ import lombok.extern.slf4j.Slf4j; @RequiredArgsConstructor public class ClarifyndClassificationService { + private final HeadlineClassificationService headlineClassificationService; + public void classifyDocument(ClassificationDocument document) { List headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular(); log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue()); + headlineClassificationService.resetContext(); + for (ClassificationPage page : document.getPages()) { classifyPage(page, document, headlineFontSizes); } @@ -47,6 +51,10 @@ public class ClarifyndClassificationService { var bodyTextFrame = page.getBodyTextFrame(); + if (textBlock.getClassification() != null && textBlock.getClassification().isHeadline()) { + headlineClassificationService.setLastHeadlineFromOutline(textBlock); + return; + } if (document.getFontSizeCounter().getMostPopular() == null) { textBlock.setClassification(PageBlockType.PARAGRAPH); return; @@ -79,7 +87,8 @@ public class ClarifyndClassificationService { for (int i = 1; i <= headlineFontSizes.size(); i++) { if (textBlock.getMostPopularWordFontSize() == 
headlineFontSizes.get(i - 1)) { - textBlock.setClassification(PageBlockType.getHeadlineType(i)); + PageBlockType headlineType = PageBlockType.getHeadlineType(i); + headlineClassificationService.classifyHeadline(textBlock, headlineType); document.setHeadlines(true); } } @@ -89,7 +98,8 @@ public class ClarifyndClassificationService { .getTextPositions() .get(0) .getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) { - textBlock.setClassification(PageBlockType.getHeadlineType(headlineFontSizes.size() + 1)); + PageBlockType headlineType = PageBlockType.getHeadlineType(headlineFontSizes.size() + 1); + headlineClassificationService.classifyHeadline(textBlock, headlineType); document.setHeadlines(true); } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter() .getMostPopular() && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold")) { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java index a3cbe19..812c5dd 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java @@ -6,6 +6,7 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils; + import org.springframework.stereotype.Service; import 
com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; @@ -23,6 +24,7 @@ import lombok.extern.slf4j.Slf4j; @RequiredArgsConstructor public class DocuMineClassificationService { + private final HeadlineClassificationService headlineClassificationService; private static final Pattern pattern = Pattern.compile("^(\\d{1,2}\\.){1,3}\\d{1,2}\\.?\\s[0-9A-Za-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE); private static final Pattern pattern2 = Pattern.compile("\\p{L}{3,}", Pattern.CASE_INSENSITIVE); private static final Pattern pattern3 = Pattern.compile("^(\\d{1,1}\\.){1,3}\\d{1,2}\\.?\\s[a-z]{1,2}\\/[a-z]{1,2}.*"); @@ -34,6 +36,8 @@ public class DocuMineClassificationService { log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue()); + headlineClassificationService.resetContext(); + for (ClassificationPage page : document.getPages()) { classifyPage(page, document, headlineFontSizes); } @@ -59,7 +63,8 @@ public class DocuMineClassificationService { Matcher matcher2 = pattern2.matcher(textBlock.toString()); Matcher matcher3 = pattern3.matcher(textBlock.toString()); - if(textBlock.getClassification() != null && textBlock.getClassification().isHeadline()) { + if (textBlock.getClassification() != null && textBlock.getClassification().isHeadline()) { + headlineClassificationService.setLastHeadlineFromOutline(textBlock); return; } if (document.getFontSizeCounter().getMostPopular() == null) { @@ -67,46 +72,57 @@ public class DocuMineClassificationService { return; } if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) - || PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter() - .getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular()) - ) { + || PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && 
(document.getFontSizeCounter().getMostPopular() == null + || textBlock.getHighestFontSize() <= document.getFontSizeCounter() + .getMostPopular())) { textBlock.setClassification(PageBlockType.HEADER); } else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER) - || PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter() - .getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular()) - ) { + || PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null + || textBlock.getHighestFontSize() <= document.getFontSizeCounter() + .getMostPopular())) { textBlock.setClassification(PageBlockType.FOOTER); - } else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, - document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks() - .size() == 1)) { + } else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter().getMostPopular()) > 2.5 + && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks().size() == 1)) { if (!Pattern.matches("[0-9]+", textBlock.toString())) { textBlock.setClassification(PageBlockType.TITLE); } - } else if (textBlock.getText().length() > 5 && (textBlock.getMostPopularWordHeight() > document.getTextHeightCounter() - .getMostPopular() || textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular()) && PositionUtils.getApproxLineCount(textBlock) < 5.9 + } else if (textBlock.getText().length() > 5 + && (textBlock.getMostPopularWordHeight() > document.getTextHeightCounter().getMostPopular() + || 
textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular()) + && PositionUtils.getApproxLineCount(textBlock) < 5.9 - && (textBlock.getMostPopularWordStyle().contains("bold") && Character.isDigit(textBlock.toString().charAt(0)) && !matcher2.matches() && !textBlock.toString() - .contains(":") || textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT)) && !matcher2.matches() && !textBlock.toString() - .contains(":") || textBlock.toString().startsWith("APPENDIX") || textBlock.toString().startsWith("FIGURE") || textBlock.toString() - .startsWith("TABLE")) && !textBlock.toString().endsWith(":") && matcher2.find()) { - textBlock.setClassification(PageBlockType.getHeadlineType(1)); + && (textBlock.getMostPopularWordStyle().contains("bold") && Character.isDigit(textBlock.toString().charAt(0)) && !matcher2.matches() && !textBlock.toString() + .contains(":") + || textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT)) && !matcher2.matches() && !textBlock.toString().contains(":") + || textBlock.toString().startsWith("APPENDIX") + || textBlock.toString().startsWith("FIGURE") + || textBlock.toString().startsWith("TABLE")) + && !textBlock.toString().endsWith(":") + && matcher2.find()) { + PageBlockType headlineType = PageBlockType.getHeadlineType(1); + headlineClassificationService.classifyHeadline(textBlock, headlineType); document.setHeadlines(true); } else if (matcher.find() && PositionUtils.getApproxLineCount(textBlock) < 2.9 && matcher2.find() && !matcher3.matches()) { - textBlock.setClassification(PageBlockType.getHeadlineType(2)); + PageBlockType headlineType = PageBlockType.getHeadlineType(2); + headlineClassificationService.classifyHeadline(textBlock, headlineType); document.setHeadlines(true); - } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter() - .getMostPopular() && 
textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold")) { + } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) + && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular() + && textBlock.getMostPopularWordStyle().equals("bold") + && !document.getFontStyleCounter().getMostPopular().equals("bold")) { textBlock.setClassification(PageBlockType.PARAGRAPH_BOLD); - } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFont() - .equals(document.getFontCounter().getMostPopular()) && textBlock.getMostPopularWordStyle() - .equals(document.getFontStyleCounter().getMostPopular()) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()) { + } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) + && textBlock.getMostPopularWordFont().equals(document.getFontCounter().getMostPopular()) + && textBlock.getMostPopularWordStyle().equals(document.getFontStyleCounter().getMostPopular()) + && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()) { textBlock.setClassification(PageBlockType.PARAGRAPH); - } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter() - .getMostPopular() && textBlock.getMostPopularWordStyle().equals("italic") && !document.getFontStyleCounter() - .getMostPopular() - .equals("italic") && PositionUtils.getApproxLineCount(textBlock) < 2.9) { + } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) + && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular() + && textBlock.getMostPopularWordStyle().equals("italic") + && !document.getFontStyleCounter().getMostPopular().equals("italic") + && PositionUtils.getApproxLineCount(textBlock) < 2.9) { 
textBlock.setClassification(PageBlockType.PARAGRAPH_ITALIC); } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)) { textBlock.setClassification(PageBlockType.PARAGRAPH_UNKNOWN);
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/HeadlineClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/HeadlineClassificationService.java
new file mode 100644
index 0000000..f8b6ea7
--- /dev/null
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/HeadlineClassificationService.java
@@ -0,0 +1,125 @@
+package com.knecon.fforesight.service.layoutparser.processor.services.classification;
+
+import static com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType.getHeadlineNumber;
+
+import org.springframework.stereotype.Service;
+
+import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
+import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
+
+import lombok.Getter;
+import lombok.Setter;
+
+/**
+ * Remembers the headline seen most recently while a document is classified and derives the
+ * type of the next detected headline from it.
+ *
+ * <p>The bean is injected into several classification services, so its state is kept per
+ * thread: documents classified concurrently on different threads cannot overwrite each
+ * other's headline context. Call {@link #resetContext()} before classifying a document and
+ * classify one document on one thread.
+ */
+@Service
+public class HeadlineClassificationService {
+
+    private final ThreadLocal<HeadlineContext> context = ThreadLocal.withInitial(HeadlineContext::new);
+
+
+    /** Drops the headline state the calling thread kept from a previously classified document. */
+    public void resetContext() {
+
+        context.remove();
+    }
+
+
+    public TextPageBlock getLastHeadline() {
+
+        return context.get().getLastHeadline();
+    }
+
+
+    public void setLastHeadline(TextPageBlock lastHeadline) {
+
+        context.get().setLastHeadline(lastHeadline);
+    }
+
+
+    public PageBlockType getOriginalClassifiedBlockType() {
+
+        return context.get().getOriginalClassifiedBlockType();
+    }
+
+
+    public void setOriginalClassifiedBlockType(PageBlockType originalClassifiedBlockType) {
+
+        context.get().setOriginalClassifiedBlockType(originalClassifiedBlockType);
+    }
+
+
+    public TextPageBlock getLastHeadlineFromOutline() {
+
+        return context.get().getLastHeadlineFromOutline();
+    }
+
+
+    /**
+     * Stores a block that reached classification already marked as a headline and makes it the
+     * last headline as well.
+     *
+     * @param lastHeadlineFromOutline the pre-classified headline block, or {@code null} to clear
+     */
+    public void setLastHeadlineFromOutline(TextPageBlock lastHeadlineFromOutline) {
+
+        HeadlineContext current = context.get();
+        current.setLastHeadlineFromOutline(lastHeadlineFromOutline);
+        current.setLastHeadline(lastHeadlineFromOutline);
+    }
+
+
+    /**
+     * Classifies {@code textBlock} as a headline and remembers it as the last headline.
+     *
+     * <p>The proposed type is adjusted relative to the previous headline: directly after a
+     * pre-classified headline the block gets that headline's number plus one; if the previous
+     * headline's classification no longer equals the type recorded for it, the proposed
+     * headline number is offset by (recorded number - current number).
+     *
+     * @param textBlock    the block to classify
+     * @param headlineType the headline type proposed by the caller
+     */
+    public void classifyHeadline(TextPageBlock textBlock, PageBlockType headlineType) {
+
+        HeadlineContext current = context.get();
+        TextPageBlock previousHeadline = current.getLastHeadline();
+        PageBlockType recordedType = current.getOriginalClassifiedBlockType();
+        PageBlockType resolvedType = headlineType;
+
+        if (previousHeadline != null) {
+            PageBlockType previousType = previousHeadline.getClassification();
+
+            if (previousHeadline.equals(current.getLastHeadlineFromOutline())) {
+                resolvedType = PageBlockType.getHeadlineType(getHeadlineNumber(previousType) + 1);
+            } else if (recordedType != null && previousType != recordedType) {
+                int difference = getHeadlineNumber(recordedType) - getHeadlineNumber(previousType);
+                resolvedType = PageBlockType.getHeadlineType(getHeadlineNumber(headlineType) + difference);
+            }
+        }
+
+        current.setOriginalClassifiedBlockType(resolvedType);
+        textBlock.setClassification(resolvedType);
+        current.setLastHeadline(textBlock);
+    }
+
+
+    /** Headline tracking state of a single classification run. */
+    @Getter
+    @Setter
+    private static final class HeadlineContext {
+
+        private TextPageBlock lastHeadline;
+        private PageBlockType originalClassifiedBlockType;
+        private TextPageBlock lastHeadlineFromOutline;
+
+    }
+
+}
\ No newline at end of file
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/RedactManagerClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/RedactManagerClassificationService.java index 62c9eef..ff532b5 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/RedactManagerClassificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/RedactManagerClassificationService.java @@ -1,7 +1,5 @@ package com.knecon.fforesight.service.layoutparser.processor.services.classification; -import static com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType.getHeadlineNumber; - import java.util.List; import java.util.regex.Pattern; @@ -16,7 +14,6 @@ import
com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageB import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils; import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils; -import lombok.Data; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; @@ -25,6 +22,8 @@ import lombok.extern.slf4j.Slf4j; @RequiredArgsConstructor public class RedactManagerClassificationService { + private final HeadlineClassificationService headlineClassificationService; + public void classifyDocument(ClassificationDocument document) { @@ -32,33 +31,30 @@ public class RedactManagerClassificationService { log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue()); - HeadLineClassificationContext headLineClassificationContext = new HeadLineClassificationContext(); + headlineClassificationService.resetContext(); + for (ClassificationPage page : document.getPages()) { - classifyPage(page, document, headlineFontSizes, headLineClassificationContext); + classifyPage(page, document, headlineFontSizes); } } - private void classifyPage(ClassificationPage page, ClassificationDocument document, List headlineFontSizes, HeadLineClassificationContext headLineClassificationContext) { + private void classifyPage(ClassificationPage page, ClassificationDocument document, List headlineFontSizes) { for (AbstractPageBlock textBlock : page.getTextBlocks()) { if (textBlock instanceof TextPageBlock) { - classifyBlock((TextPageBlock) textBlock, page, document, headlineFontSizes, headLineClassificationContext); + classifyBlock((TextPageBlock) textBlock, page, document, headlineFontSizes); } } } - private void classifyBlock(TextPageBlock textBlock, - ClassificationPage page, - ClassificationDocument document, - List headlineFontSizes, - HeadLineClassificationContext headLineClassificationContext) { + private void classifyBlock(TextPageBlock textBlock, ClassificationPage page, 
ClassificationDocument document, List headlineFontSizes) { var bodyTextFrame = page.getBodyTextFrame(); if (textBlock.getClassification() != null && textBlock.getClassification().isHeadline()) { - headLineClassificationContext.setLastHeadlineFromOutline(textBlock); + headlineClassificationService.setLastHeadlineFromOutline(textBlock); return; } if (document.getFontSizeCounter().getMostPopular() == null) { @@ -72,7 +68,8 @@ public class RedactManagerClassificationService { .anyMatch(graphic -> graphic.getPosition().intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight()))) { textBlock.setClassification(PageBlockType.PARAGRAPH); return; - } if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) + } + if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) || PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter() .getMostPopular())) { @@ -100,7 +97,7 @@ public class RedactManagerClassificationService { for (int i = 1; i <= headlineFontSizes.size(); i++) { if (textBlock.getMostPopularWordFontSize() == headlineFontSizes.get(i - 1)) { PageBlockType headlineType = PageBlockType.getHeadlineType(i); - classifyHeadline(textBlock, headLineClassificationContext, headlineType); + headlineClassificationService.classifyHeadline(textBlock, headlineType); document.setHeadlines(true); } } @@ -113,7 +110,7 @@ public class RedactManagerClassificationService { .get(0).getTextPositions() .get(0).getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) { PageBlockType headlineType = PageBlockType.getHeadlineType(headlineFontSizes.size() + 1); - classifyHeadline(textBlock, headLineClassificationContext, headlineType); + headlineClassificationService.classifyHeadline(textBlock, 
headlineType); document.setHeadlines(true); } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular() @@ -138,55 +135,4 @@ public class RedactManagerClassificationService { } } - - private static void classifyHeadline(TextPageBlock textBlock, HeadLineClassificationContext headLineClassificationContext, PageBlockType headlineType) { - - TextPageBlock lastHeadline = headLineClassificationContext.getLastHeadline(); - TextPageBlock lastHeadlineFromOutline = headLineClassificationContext.getLastHeadlineFromOutline(); - PageBlockType originalClassifiedBlockType = headLineClassificationContext.getOriginalClassifiedBlockType(); - - if (lastHeadline != null) { - - if (lastHeadline.equals(lastHeadlineFromOutline)) { - - headlineType = getNextType(lastHeadline.getClassification()); - - } else if (originalClassifiedBlockType != null && lastHeadline.getClassification() != originalClassifiedBlockType) { - - PageBlockType lastHeadlineType = lastHeadline.getClassification(); - int difference = getHeadlineNumber(originalClassifiedBlockType) - getHeadlineNumber(lastHeadlineType); - headlineType = PageBlockType.getHeadlineType(getHeadlineNumber(headlineType) + difference); - } - } - - headLineClassificationContext.setOriginalClassifiedBlockType(headlineType); - textBlock.setClassification(headlineType); - headLineClassificationContext.setLastHeadline(textBlock); - } - - - private static PageBlockType getNextType(PageBlockType pageBlockType) { - - return PageBlockType.getHeadlineType(getHeadlineNumber(pageBlockType) + 1); - } - - - - - @Data - static class HeadLineClassificationContext { - - TextPageBlock lastHeadline; - PageBlockType originalClassifiedBlockType; - TextPageBlock lastHeadlineFromOutline; - - - public void setLastHeadlineFromOutline(TextPageBlock lastHeadlineFromOutline) { - - this.lastHeadlineFromOutline = lastHeadlineFromOutline; - 
this.setLastHeadline(lastHeadlineFromOutline); - } - - } - } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java index b1ad145..3be5959 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java @@ -7,7 +7,6 @@ import static java.util.stream.Collectors.toList; import java.awt.geom.Rectangle2D; import java.util.ArrayList; import java.util.HashMap; -import java.util.HashSet; import java.util.LinkedList; import java.util.List; import java.util.Map; @@ -16,7 +15,6 @@ import java.util.Optional; import java.util.Set; import java.util.stream.Collectors; -import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument; diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/OutlineProcessingTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/OutlineProcessingTest.java deleted file mode 100644 index 33ceaba..0000000 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/OutlineProcessingTest.java +++ 
/dev/null @@ -1,25 +0,0 @@ -package com.knecon.fforesight.service.layoutparser.server.graph; - -import org.junit.jupiter.api.Test; -import org.springframework.beans.factory.annotation.Autowired; - -import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineExtractorService; -import com.knecon.fforesight.service.layoutparser.processor.services.blockification.BlockificationPostprocessingService; -import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest; - -import lombok.SneakyThrows; - -public class OutlineProcessingTest extends BuildDocumentTest { - - @Autowired - OutlineExtractorService outlineExtractorService; - @Autowired - BlockificationPostprocessingService blockificationPostprocessingService; - - @Test - @SneakyThrows - public void test() { - - } - -} diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java index a8ab674..a26754a 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java @@ -1,12 +1,8 @@ package com.knecon.fforesight.service.layoutparser.server.graph; import java.io.File; -import java.nio.file.Files; import java.nio.file.Path; -import java.util.ArrayList; -import java.util.List; import java.util.Map; -import java.util.stream.Stream; import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; @@ -27,80 +23,11 @@ import lombok.SneakyThrows; public class ViewerDocumentTest extends BuildDocumentTest { - @Test - @SneakyThrows - @Disabled - public void testViewerDocuments() { - - String directory = 
"files/syngenta_190_deduplicated/"; - Path dirPath = new ClassPathResource(directory).getFile().toPath(); - - // Ensure the directory exists and is accessible - if (!Files.exists(dirPath) || !Files.isDirectory(dirPath)) { - throw new IllegalArgumentException("The specified path must be a directory and it must exist."); - } - - ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null); - LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService); - - // Use try-with-resources to ensure the stream is closed after use - try (Stream paths = Files.walk(dirPath)) { - paths.filter(Files::isRegularFile) - .filter(path -> path.toString().endsWith(".pdf")) // Filter to process only PDF files - .forEach(path -> processFile(path, layoutGridService)); - } - } - - - private void processFile(Path filePath, LayoutGridService layoutGridService) { - - try { - File documentFile = filePath.toFile(); - String tmpFileName = "/tmp/" + filePath.getFileName().toString() + "_VIEWER.pdf"; - - long start = System.currentTimeMillis(); - var classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER, - documentFile, - new ImageServiceResponse(), - new TableServiceResponse(), - new VisualLayoutParsingResponse(), - Map.of("file", filePath.getFileName().toFile().toString())); - Document document = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER, classificationDocument); - - if (classificationDocument.getOutlineObjectTree().getRootNodes().size() > 1) { - layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true); - System.out.printf("Processed %s in %.2fs%n", filePath, ((float) (System.currentTimeMillis() - start)) / 1000); - } - } catch (Exception exception) - { - System.out.println(exception); - } - } - - @Test @SneakyThrows public void testViewerDocument() { - //String fileName = "files/documine/21_TiltPlus_MutacaoGenicaEmCelulasBacterianas.pdf";//fail here - - //String 
fileName = "files/documine/Study Document 1 - Acute Eye IrritationCorrosion - Rabbits.pdf"; - //String fileName = "files/documine/Study Document 3 - Acute Eye IrritationCorrosion - Rabbits.pdf"; - //String fileName = "files/documine/VV-547521_Irritação_Ocular_in_Vivo.pdf"; - //String fileName = "files/documine/SOLICITA_VICTRATO-GOLD-II_Item 21_Mutacao_Genica.pdf"; - //String fileName = "files/new/UTT-Books-53.pdf"; - //String fileName = "files/documine/A21924A - Acute Oral Toxicity - Rats.pdf"; - //String fileName = "files/documine/A16361B - Acute Dermal Irritation Toxicity Study in Rabbits.pdf"; - //String fileName = "files/documine/ITEM 20_Sensibilização cutânea.pdf"; - //String fileName = "files/documine/VV-547523_LLNA.pdf"; - //String fileName = "files/new/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf"; - //String fileName = "files/syngenta_190_deduplicated/1 Abamectin_prr.pdf"; - //String fileName = "files/new/abschlussarbeiten-template-institut-fur-informatik-padagogische-hochschule-karlsruhe.pdf"; - //String fileName = "files/new/kaust-official-thesis-template.pdf"; - //String fileName = "files/new/$100m Offers.pdf"; - String fileName = "files/new/18-Curacron_ToxicidadeOcularInVitro.pdf"; - //String fileName = "files/new/mistitled_outlines_example.pdf"; - //String fileName = "files/bdr/Plenarprotokoll 1 (keine Druchsache!) 
(1) 1.pdf"; + String fileName = "files/syngenta/CustomerFiles/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf"; String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf"; var documentFile = new ClassPathResource(fileName).getFile(); @@ -108,39 +35,12 @@ public class ViewerDocumentTest extends BuildDocumentTest { LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService); long start = System.currentTimeMillis(); - Document document = buildGraph(fileName, LayoutParsingType.REDACT_MANAGER); + Document document = buildGraph(fileName, LayoutParsingType.DOCUMINE); layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true); System.out.printf("Total time: %.2fs%n", ((float) (System.currentTimeMillis() - start)) / 1000); } - @Test - @SneakyThrows - public void testViewerDocumentWithImages() { - - String fileName = "files/new/UTT-Books-53.pdf"; - Path path = Path.of(fileName); - String tmpFileName = "/tmp/" + path.getFileName() + "_VIEWER.pdf"; - String imageFileName = "files/images/test_outlines.IMAGE_INFO.json"; - - var mapper = ObjectMapperFactory.create(); - var imageServiceResponse = mapper.readValue(new ClassPathResource(imageFileName).getInputStream(), ImageServiceResponse.class); - var documentFile = new ClassPathResource(fileName).getFile(); - - var classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER, - documentFile, - imageServiceResponse, - new TableServiceResponse(), - new VisualLayoutParsingResponse(), - Map.of("file", path.getFileName().toFile().toString())); - ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null); - LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService); - Document document = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER, classificationDocument); - - layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true); - } - - @Test @Disabled 
@SneakyThrows @@ -148,19 +48,18 @@ public class ViewerDocumentTest extends BuildDocumentTest { String fileName = "files/cv_tables/brokenTablesOnOcr_ocred.pdf"; String tableFileName = "files/cv_tables/brokenTablesOnOcr_ocred.TABLES.json"; - Path path = Path.of(fileName); - String tmpFileName = "/tmp/" + path.getFileName() + "_VIEWER.pdf"; + String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf"; var mapper = ObjectMapperFactory.create(); var tableResponse = mapper.readValue(new ClassPathResource(tableFileName).getInputStream(), TableServiceResponse.class); var documentFile = new ClassPathResource(fileName).getFile(); var classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE, - documentFile, - new ImageServiceResponse(), - tableResponse, - new VisualLayoutParsingResponse(), - Map.of("file", path.getFileName().toFile().toString())); + documentFile, + new ImageServiceResponse(), + tableResponse, + new VisualLayoutParsingResponse(), + Map.of("file", Path.of(fileName).getFileName().toFile().toString())); ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null); LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService); Document document = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.DOCUMINE, classificationDocument); diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/files/new/UTT-Books-53.pdf b/layoutparser-service/layoutparser-service-server/src/test/resources/files/new/UTT-Books-53.pdf deleted file mode 100644 index 1626c3f..0000000 Binary files a/layoutparser-service/layoutparser-service-server/src/test/resources/files/new/UTT-Books-53.pdf and /dev/null differ