diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java index d903200..79d5cce 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java @@ -10,12 +10,11 @@ import java.util.HashMap; import java.util.List; import java.util.Map; -import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentConverter; +import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.common.PDRectangle; import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent; -import org.apache.pdfbox.text.PDFMarkedContentExtractor; import org.springframework.stereotype.Service; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent; @@ -167,9 +166,12 @@ public class LayoutParsingPipeline { stripper.getMaxCharHeight()); ClassificationPage classificationPage = switch (layoutParsingType) { - case REDACT_MANAGER -> redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical()); - case TAAS -> taasBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical()); - case DOCUMINE -> docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical()); + case REDACT_MANAGER -> + redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical()); + case TAAS -> + taasBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical()); + case DOCUMINE -> + docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical()); }; classificationPage.setCleanRulings(cleanRulings); classificationPage.setRotation(rotation); @@ -209,10 +211,10 @@ public class LayoutParsingPipeline { } - private Map convertMarkedContents(List pdMarkedContents){ + private Map convertMarkedContents(List pdMarkedContents) { Map markedContentBboxes = new HashMap<>(); - markedContentBboxes.put("Header", MarkedContentConverter.getMarkedContentBboxPerLine(pdMarkedContents, "Header")); - markedContentBboxes.put("Footer", MarkedContentConverter.getMarkedContentBboxPerLine(pdMarkedContents, "Footer")); + markedContentBboxes.put(MarkedContentUtils.HEADER, MarkedContentUtils.getMarkedContentBboxPerLine(pdMarkedContents, MarkedContentUtils.HEADER)); + markedContentBboxes.put(MarkedContentUtils.FOOTER, MarkedContentUtils.getMarkedContentBboxPerLine(pdMarkedContents, MarkedContentUtils.FOOTER)); return markedContentBboxes; } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java index b9c816a..285efd4 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java @@ -73,6 +73,7 @@ public class TextPageBlock extends AbstractPageBlock { return sequences.get(0).getPageWidth(); } + public static TextPageBlock merge(List textBlocksToMerge) { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/BodyTextFrameService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/BodyTextFrameService.java index a23e0ba..cb4db7b 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/BodyTextFrameService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/BodyTextFrameService.java @@ -2,6 +2,7 @@ package com.knecon.fforesight.service.layoutparser.processor.services; import java.util.List; +import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils; import org.springframework.stereotype.Service; import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point; @@ -156,8 +157,8 @@ public class BodyTextFrameService { continue; } - if (page.getMarkedContentBboxPerType().get("Header") != null && page.getMarkedContentBboxPerType().get("Header").intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight()) - || page.getMarkedContentBboxPerType().get("Footer") != null && page.getMarkedContentBboxPerType().get("Footer").intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight())) { + if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) + || MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER)) { continue; } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java index 8d7c5d7..d622fc8 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java @@ -5,6 +5,7 @@ import java.util.Locale; import java.util.regex.Matcher; import java.util.regex.Pattern; +import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils; import org.springframework.stereotype.Service; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; @@ -62,13 +63,13 @@ public class DocuMineClassificationService { textBlock.setClassification(PageBlockType.OTHER); return; } - if (page.getMarkedContentBboxPerType().get("Header") != null && page.getMarkedContentBboxPerType().get("Header").intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight()) + if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) || PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter() .getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular()) ) { textBlock.setClassification(PageBlockType.HEADER); - } else if (page.getMarkedContentBboxPerType().get("Footer") != null && page.getMarkedContentBboxPerType().get("Footer").intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight()) + } else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER) || PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter() .getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular()) ) { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/RedactManagerClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/RedactManagerClassificationService.java index e89b225..3e90c57 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/RedactManagerClassificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/RedactManagerClassificationService.java @@ -3,6 +3,7 @@ package com.knecon.fforesight.service.layoutparser.processor.services.classifica import java.util.List; import java.util.regex.Pattern; +import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils; import org.springframework.stereotype.Service; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; @@ -51,12 +52,12 @@ public class RedactManagerClassificationService { textBlock.setClassification(PageBlockType.OTHER); return; } - if (page.getMarkedContentBboxPerType().get("Header") != null && page.getMarkedContentBboxPerType().get("Header").intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight()) + if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) || PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter() .getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) { textBlock.setClassification(PageBlockType.HEADER); - } else if (page.getMarkedContentBboxPerType().get("Footer") != null && page.getMarkedContentBboxPerType().get("Footer").intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight()) + } else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER) || PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter() .getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) { textBlock.setClassification(PageBlockType.FOOTER); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/TaasClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/TaasClassificationService.java index 5eee9a3..7a91be1 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/TaasClassificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/TaasClassificationService.java @@ -3,6 +3,7 @@ package com.knecon.fforesight.service.layoutparser.processor.services.classifica import java.util.List; import java.util.regex.Pattern; +import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils; import org.springframework.stereotype.Service; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; @@ -56,10 +57,10 @@ public class TaasClassificationService { textBlock.setClassification(PageBlockType.OTHER); return; } - if (page.getMarkedContentBboxPerType().get("Header") != null && page.getMarkedContentBboxPerType().get("Header").intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight()) + if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) || PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation())) { textBlock.setClassification(PageBlockType.HEADER); - } else if (page.getMarkedContentBboxPerType().get("Footer") != null && page.getMarkedContentBboxPerType().get("Footer").intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight()) + } else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER) || PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation())) { textBlock.setClassification(PageBlockType.FOOTER); } else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFLinesTextStripper.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFLinesTextStripper.java index 1872f68..6bdbba3 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFLinesTextStripper.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFLinesTextStripper.java @@ -10,7 +10,6 @@ import lombok.extern.slf4j.Slf4j; import org.apache.pdfbox.contentstream.operator.Operator; import org.apache.pdfbox.contentstream.operator.OperatorName; import org.apache.pdfbox.contentstream.operator.color.*; -import org.apache.pdfbox.contentstream.operator.markedcontent.BeginMarkedContentSequence; import org.apache.pdfbox.contentstream.operator.markedcontent.BeginMarkedContentSequenceWithProperties; import org.apache.pdfbox.contentstream.operator.markedcontent.EndMarkedContentSequence; import org.apache.pdfbox.contentstream.operator.state.*; @@ -76,14 +75,12 @@ public class PDFLinesTextStripper extends PDFTextStripper { addOperator(new BeginMarkedContentSequenceWithProperties()); - addOperator(new BeginMarkedContentSequence()); +// addOperator(new BeginMarkedContentSequence()); addOperator(new EndMarkedContentSequence()); } - - @Override protected void processOperator(Operator operator, List arguments) throws IOException { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/MarkedContentConverter.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/MarkedContentUtils.java similarity index 76% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/MarkedContentConverter.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/MarkedContentUtils.java index ad34f43..49d46f3 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/MarkedContentConverter.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/MarkedContentUtils.java @@ -1,5 +1,6 @@ package com.knecon.fforesight.service.layoutparser.processor.utils; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import lombok.experimental.UtilityClass; import org.apache.pdfbox.cos.COSName; @@ -9,11 +10,14 @@ import org.apache.pdfbox.text.TextPosition; import java.awt.geom.Rectangle2D; import java.util.Collection; import java.util.List; +import java.util.Map; import java.util.stream.Collectors; @UtilityClass -public class MarkedContentConverter { +public class MarkedContentUtils { + public static final String HEADER = "Header"; + public static final String FOOTER = "Footer"; public Rectangle2D getMarkedContentBboxPerLine(List markedContents, String subtype) { @@ -42,4 +46,10 @@ public class MarkedContentConverter { .getRectangle()) .map(t -> new Rectangle2D.Float(t.getTopLeft().getX(), t.getTopLeft().getY() - Math.abs(t.getHeight()), t.getWidth(), Math.abs(t.getHeight()))).collect(Collectors.toList())); } + + + public boolean intersects(TextPageBlock textBlock, Map markedContentBboxPerType, String type) { + return markedContentBboxPerType.get(type) != null && markedContentBboxPerType.get(type).intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight()); + } + } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/BuildDocumentGraphTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/BuildDocumentGraphTest.java index d3f1a5d..092a530 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/BuildDocumentGraphTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/BuildDocumentGraphTest.java @@ -40,7 +40,7 @@ public class BuildDocumentGraphTest extends AbstractTest { @SneakyThrows protected Document buildGraph(String filename) { - return buildGraph(filename, LayoutParsingType.DOCUMINE); + return buildGraph(filename, LayoutParsingType.REDACT_MANAGER); }