diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java index e31c993..79d5cce 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java @@ -2,15 +2,19 @@ package com.knecon.fforesight.service.layoutparser.processor; import static java.lang.String.format; +import java.awt.geom.Rectangle2D; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.util.ArrayList; +import java.util.HashMap; import java.util.List; import java.util.Map; +import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.common.PDRectangle; +import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent; import org.springframework.stereotype.Service; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent; @@ -162,9 +166,12 @@ public class LayoutParsingPipeline { stripper.getMaxCharHeight()); ClassificationPage classificationPage = switch (layoutParsingType) { - case REDACT_MANAGER -> redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical()); - case TAAS -> taasBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical()); - case DOCUMINE -> docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), 
cleanRulings.getHorizontal(), cleanRulings.getVertical()); + case REDACT_MANAGER -> + redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical()); + case TAAS -> + taasBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical()); + case DOCUMINE -> + docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical()); }; classificationPage.setCleanRulings(cleanRulings); classificationPage.setRotation(rotation); @@ -173,6 +180,9 @@ public class LayoutParsingPipeline { classificationPage.setPageWidth(cropbox.getWidth()); classificationPage.setPageHeight(cropbox.getHeight()); + // MarkedContent needs to be converted at this point, otherwise it leads to GC Problems in Pdfbox. + classificationPage.setMarkedContentBboxPerType(convertMarkedContents(stripper.getMarkedContents())); + // If images is ocr needs to be calculated before textBlocks are moved into tables, otherwise findOcr algorithm needs to be adopted. 
if (pdfImages != null && pdfImages.containsKey(pageNumber)) { classificationPage.setImages(pdfImages.get(pageNumber)); @@ -201,6 +211,19 @@ public class LayoutParsingPipeline { } + /** + * Reduces the marked-content sequences of one page to a bounding box per artifact subtype. + * + * @param pdMarkedContents marked-content sequences collected by the stripper for the page + * @return map with the keys {@link MarkedContentUtils#HEADER} and {@link MarkedContentUtils#FOOTER}; a value is null when no box was computed for that subtype + */ + private Map<String, Rectangle2D> convertMarkedContents(List<PDMarkedContent> pdMarkedContents) { + Map<String, Rectangle2D> markedContentBboxes = new HashMap<>(); + markedContentBboxes.put(MarkedContentUtils.HEADER, MarkedContentUtils.getMarkedContentBboxPerLine(pdMarkedContents, MarkedContentUtils.HEADER)); + markedContentBboxes.put(MarkedContentUtils.FOOTER, MarkedContentUtils.getMarkedContentBboxPerLine(pdMarkedContents, MarkedContentUtils.FOOTER)); + return markedContentBboxes; + } + private void increaseDocumentStatistics(ClassificationPage classificationPage, ClassificationDocument document) { if (!classificationPage.isLandscape()) { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationPage.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationPage.java index cc0a420..798612e 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationPage.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationPage.java @@ -1,7 +1,10 @@ package com.knecon.fforesight.service.layoutparser.processor.model; +import java.awt.geom.Rectangle2D; import java.util.ArrayList; +import java.util.HashMap; import java.util.List; +import java.util.Map; import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle; import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage; @@ -11,6 +14,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFre import lombok.Data; import lombok.NonNull; import
lombok.RequiredArgsConstructor; +import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent; @Data @RequiredArgsConstructor @@ -38,4 +42,6 @@ public class ClassificationPage { CleanRulings cleanRulings; + private Map markedContentBboxPerType = new HashMap<>(); + } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java index b9c816a..285efd4 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java @@ -73,6 +73,7 @@ public class TextPageBlock extends AbstractPageBlock { return sequences.get(0).getPageWidth(); } + public static TextPageBlock merge(List textBlocksToMerge) { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/BodyTextFrameService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/BodyTextFrameService.java index 436df0c..cb4db7b 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/BodyTextFrameService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/BodyTextFrameService.java @@ -2,6 +2,7 @@ package com.knecon.fforesight.service.layoutparser.processor.services; import java.util.List; +import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils; import 
org.springframework.stereotype.Service; import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point; @@ -20,7 +21,6 @@ import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils; public class BodyTextFrameService { - public void setBodyTextFrames(ClassificationDocument classificationDocument, LayoutParsingType layoutParsingType) { Rectangle bodyTextFrame = calculateBodyTextFrame(classificationDocument.getPages(), classificationDocument.getFontSizeCounter(), false, layoutParsingType); @@ -157,6 +157,11 @@ public class BodyTextFrameService { continue; } + if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) + || MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER)) { + continue; + } + float approxLineCount = PositionUtils.getApproxLineCount(textBlock); if (layoutParsingType.equals(LayoutParsingType.DOCUMINE) && approxLineCount < approximateHeaderLineCount && textBlock.getMaxY() >= page.getPageHeight() - (page.getPageHeight() / 10) || !layoutParsingType.equals( LayoutParsingType.DOCUMINE) && approxLineCount < approximateHeaderLineCount) { @@ -233,4 +238,4 @@ public class BodyTextFrameService { } -} +} \ No newline at end of file diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java index f26f2d2..d622fc8 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java +++ 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java @@ -5,6 +5,7 @@ import java.util.Locale; import java.util.regex.Matcher; import java.util.regex.Pattern; +import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils; import org.springframework.stereotype.Service; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; @@ -62,12 +63,16 @@ public class DocuMineClassificationService { textBlock.setClassification(PageBlockType.OTHER); return; } - if (PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter() - .getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) { + if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) + || PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter() + .getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular()) + ) { textBlock.setClassification(PageBlockType.HEADER); - } else if (PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter() - .getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) { + } else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER) + || PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter() + .getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular()) + ) { textBlock.setClassification(PageBlockType.FOOTER); } else if (page.getPageNumber() == 1 && 
(PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks() @@ -107,4 +112,4 @@ public class DocuMineClassificationService { } } -} +} \ No newline at end of file diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/RedactManagerClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/RedactManagerClassificationService.java index 2be8f03..3e90c57 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/RedactManagerClassificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/RedactManagerClassificationService.java @@ -3,6 +3,7 @@ package com.knecon.fforesight.service.layoutparser.processor.services.classifica import java.util.List; import java.util.regex.Pattern; +import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils; import org.springframework.stereotype.Service; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; @@ -51,11 +52,13 @@ public class RedactManagerClassificationService { textBlock.setClassification(PageBlockType.OTHER); return; } - if (PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter() + if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) + || PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter() .getMostPopular() 
== null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) { textBlock.setClassification(PageBlockType.HEADER); - } else if (PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter() + } else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER) + || PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter() .getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) { textBlock.setClassification(PageBlockType.FOOTER); } else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/TaasClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/TaasClassificationService.java index c177efd..7a91be1 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/TaasClassificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/TaasClassificationService.java @@ -3,6 +3,7 @@ package com.knecon.fforesight.service.layoutparser.processor.services.classifica import java.util.List; import java.util.regex.Pattern; +import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils; import org.springframework.stereotype.Service; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; @@ -56,9 +57,11 @@ public class TaasClassificationService { 
textBlock.setClassification(PageBlockType.OTHER); return; } - if (PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation())) { + if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) + || PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation())) { textBlock.setClassification(PageBlockType.HEADER); - } else if (PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation())) { + } else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER) + || PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation())) { textBlock.setClassification(PageBlockType.FOOTER); } else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks() diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFLinesTextStripper.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFLinesTextStripper.java index 839075d..6bdbba3 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFLinesTextStripper.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFLinesTextStripper.java @@ -1,33 +1,18 @@ package com.knecon.fforesight.service.layoutparser.processor.services.parsing; -import java.awt.color.CMMException; -import java.awt.geom.Point2D; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Arrays; -import 
java.util.List; - +import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; +import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; +import lombok.Getter; +import lombok.Setter; +import lombok.SneakyThrows; +import lombok.extern.slf4j.Slf4j; import org.apache.pdfbox.contentstream.operator.Operator; import org.apache.pdfbox.contentstream.operator.OperatorName; -import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingColor; -import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingColorN; -import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingColorSpace; -import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingDeviceCMYKColor; -import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingDeviceGrayColor; -import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingDeviceRGBColor; -import org.apache.pdfbox.contentstream.operator.color.SetStrokingColor; -import org.apache.pdfbox.contentstream.operator.color.SetStrokingColorN; -import org.apache.pdfbox.contentstream.operator.color.SetStrokingColorSpace; -import org.apache.pdfbox.contentstream.operator.color.SetStrokingDeviceCMYKColor; -import org.apache.pdfbox.contentstream.operator.color.SetStrokingDeviceGrayColor; -import org.apache.pdfbox.contentstream.operator.color.SetStrokingDeviceRGBColor; -import org.apache.pdfbox.contentstream.operator.state.SetFlatness; -import org.apache.pdfbox.contentstream.operator.state.SetLineCapStyle; -import org.apache.pdfbox.contentstream.operator.state.SetLineDashPattern; -import org.apache.pdfbox.contentstream.operator.state.SetLineJoinStyle; -import org.apache.pdfbox.contentstream.operator.state.SetLineMiterLimit; -import org.apache.pdfbox.contentstream.operator.state.SetLineWidth; -import org.apache.pdfbox.contentstream.operator.state.SetRenderingIntent; +import 
org.apache.pdfbox.contentstream.operator.color.*; +import org.apache.pdfbox.contentstream.operator.markedcontent.BeginMarkedContentSequenceWithProperties; +import org.apache.pdfbox.contentstream.operator.markedcontent.EndMarkedContentSequence; +import org.apache.pdfbox.contentstream.operator.state.*; import org.apache.pdfbox.contentstream.operator.text.SetFontAndSize; import org.apache.pdfbox.cos.COSBase; import org.apache.pdfbox.cos.COSNumber; @@ -36,14 +21,11 @@ import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.graphics.color.PDColor; import org.apache.pdfbox.text.TextPosition; -import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; -import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition; -import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; - -import lombok.Getter; -import lombok.Setter; -import lombok.SneakyThrows; -import lombok.extern.slf4j.Slf4j; +import java.awt.color.CMMException; +import java.awt.geom.Point2D; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; @Getter @Slf4j @@ -59,6 +41,7 @@ public class PDFLinesTextStripper extends PDFTextStripper { private int minCharHeight; private int maxCharHeight; + private float path_x; private float path_y; @@ -89,6 +72,12 @@ public class PDFLinesTextStripper extends PDFTextStripper { this.addOperator(new SetNonStrokingColorN()); this.addOperator(new SetFontAndSize()); this.addOperator(new SetLineWidth()); + + + addOperator(new BeginMarkedContentSequenceWithProperties()); +// addOperator(new BeginMarkedContentSequence()); + addOperator(new EndMarkedContentSequence()); + } @@ -99,6 +88,7 @@ public class PDFLinesTextStripper extends PDFTextStripper { //move switch (operation) { + case OperatorName.MOVE_TO: if (arguments.size() == 2) { Point2D.Float pos = transformPosition(floatValue(arguments.get(0)), floatValue(arguments.get(1))); @@ -349,3 +339,4 @@ public class 
PDFLinesTextStripper extends PDFTextStripper { } + diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFTextStripper.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFTextStripper.java index 237dc0c..46c0578 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFTextStripper.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFTextStripper.java @@ -25,27 +25,20 @@ import java.io.StringWriter; import java.io.Writer; import java.text.Bidi; import java.text.Normalizer; -import java.util.ArrayList; -import java.util.Collections; -import java.util.Comparator; -import java.util.HashMap; -import java.util.Iterator; -import java.util.LinkedList; -import java.util.List; -import java.util.Map; -import java.util.SortedMap; -import java.util.SortedSet; -import java.util.StringTokenizer; -import java.util.TreeMap; -import java.util.TreeSet; +import java.util.*; import java.util.regex.Pattern; +import lombok.Getter; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import org.apache.pdfbox.cos.COSDictionary; +import org.apache.pdfbox.cos.COSName; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.PDPageTree; import org.apache.pdfbox.pdmodel.common.PDRectangle; +import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent; +import org.apache.pdfbox.pdmodel.graphics.PDXObject; import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem; import org.apache.pdfbox.pdmodel.interactive.pagenavigation.PDThreadBead; import 
org.apache.pdfbox.text.TextPosition; @@ -63,6 +56,10 @@ public class PDFTextStripper extends LegacyPDFStreamEngine { private static float defaultDropThreshold = 2.5f; private static final boolean useCustomQuickSort; + @Getter + protected final List<PDMarkedContent> markedContents = new ArrayList<>(); + protected final Deque<PDMarkedContent> currentMarkedContents = new ArrayDeque<>(); + private static final Log LOG = LogFactory.getLog(PDFTextStripper.class); // enable the ability to set the default indent/drop thresholds @@ -196,6 +193,61 @@ public class PDFTextStripper extends LegacyPDFStreamEngine { } + + /** + * Opens a marked-content sequence: it is recorded as a top-level sequence when none is open, + * otherwise it is attached to the innermost open sequence; it then becomes the innermost open one. + * + * @param tag tag of the marked-content sequence + * @param properties property dictionary of the marked-content sequence + */ + @Override + public void beginMarkedContentSequence(COSName tag, COSDictionary properties) + { + PDMarkedContent markedContent = PDMarkedContent.create(tag, properties); + /* ArrayDeque cannot hold null, so a null peek() means that no sequence is open. */ + PDMarkedContent parent = this.currentMarkedContents.peek(); + if (parent == null) + { + this.markedContents.add(markedContent); + } + else + { + parent.addMarkedContent(markedContent); + } + this.currentMarkedContents.push(markedContent); + } + + /** + * Closes the innermost open marked-content sequence; an end operator without an open sequence is ignored. + * NOTE(review): PDFLinesTextStripper registers the BDC operator but leaves the BMC operator commented out, + * so an EMC that closes a BMC sequence would pop an entry it never pushed - confirm this is intended. + */ + @Override + public void endMarkedContentSequence() + { + if (!this.currentMarkedContents.isEmpty()) + { + this.currentMarkedContents.pop(); + } + } + + + /** + * Adds the given XObject to the innermost open marked-content sequence, if there is one. + * NOTE(review): no caller of this method is visible in this change - confirm that an operator invokes it. + * + * @param xobject the XObject to record + */ + public void xobject(PDXObject xobject) + { + if (!this.currentMarkedContents.isEmpty()) + { + this.currentMarkedContents.peek().addXObject(xobject); + } + } + + + /** * This will return the text of a document. See writeText.
* NOTE: The document must not be encrypted when coming into this method. @@ -877,7 +913,12 @@ public class PDFTextStripper extends LegacyPDFStreamEngine { textList.add(text); } } + if (!this.currentMarkedContents.isEmpty()) + { + this.currentMarkedContents.peek().addText(text); + } } + } @@ -2103,6 +2144,7 @@ public class PDFTextStripper extends LegacyPDFStreamEngine { this.isHangingIndent = true; } + } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/MarkedContentUtils.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/MarkedContentUtils.java new file mode 100644 index 0000000..49d46f3 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/MarkedContentUtils.java @@ -0,0 +1,79 @@ +package com.knecon.fforesight.service.layoutparser.processor.utils; + +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; +import lombok.experimental.UtilityClass; +import org.apache.pdfbox.cos.COSName; +import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent; +import org.apache.pdfbox.text.TextPosition; + +import java.awt.geom.Rectangle2D; +import java.util.Collection; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +/** + * Helpers for locating header/footer artifacts in PDF marked content and testing text blocks against them. + */ +@UtilityClass +public class MarkedContentUtils { + + public static final String HEADER = "Header"; + public static final String FOOTER = "Footer"; + + /** + * Computes one bounding box around all non-blank glyphs of the marked-content sequences that are + * tagged {@code Artifact} and whose {@code Subtype} property is a name equal to {@code subtype}. + * + * @param markedContents marked-content sequences of a page, may be null + * @param subtype artifact subtype to select, e.g. {@link #HEADER} or {@link #FOOTER} + * @return the enclosing box, or null when the list is null or no matching glyph exists + */ + public Rectangle2D getMarkedContentBboxPerLine(List<PDMarkedContent> markedContents, String subtype) { + + if (markedContents == null) { + return null; + } + + /* Glyphs are grouped by their exact y coordinate; each group becomes one TextPositionSequence below. */ + var markedContentByYPosition = markedContents.stream() + .filter(m -> "Artifact".equals(m.getTag())) + .filter(m -> m.getProperties() != null) + .filter(m -> m.getProperties().getItem("Subtype") instanceof COSName name && name.getName().equals(subtype)) + .map(PDMarkedContent::getContents).flatMap(Collection::stream) + .filter(t -> t instanceof TextPosition) + .map(t -> (TextPosition) t) + .filter(t -> !" ".equals(t.getUnicode())) + .collect(Collectors.groupingBy(TextPosition::getY)); + + if (markedContentByYPosition.isEmpty()) { + return null; + } + + return RectangleTransformations.rectangle2DBBox(markedContentByYPosition.values().stream() + .map(textPositions -> new TextPositionSequence(textPositions.stream() + .toList(), 0, true) + .getRectangle()) + .map(t -> new Rectangle2D.Float(t.getTopLeft().getX(), t.getTopLeft().getY() - Math.abs(t.getHeight()), t.getWidth(), Math.abs(t.getHeight()))).collect(Collectors.toList())); + } + + + /** + * Tests whether a text block overlaps the marked-content box stored for the given type. + * + * @param textBlock block whose getPdfMinX/getPdfMinY/getWidth/getHeight rectangle is tested + * @param markedContentBboxPerType boxes keyed by type; may be null or hold no box for the type + * @param type key to look up, e.g. {@link #HEADER} + * @return true only if a box exists for the type and it intersects the block + */ + public boolean intersects(TextPageBlock textBlock, Map<String, Rectangle2D> markedContentBboxPerType, String type) { + + if (markedContentBboxPerType == null) { + return false; + } + Rectangle2D bbox = markedContentBboxPerType.get(type); + return bbox != null && bbox.intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight()); + } + +}
diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphVisualizationTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphVisualizationTest.java index aa2351d..0994643 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphVisualizationTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphVisualizationTest.java @@ -26,7 +26,7 @@ public class DocumentGraphVisualizationTest extends BuildDocumentGraphTest { // @Disabled public void visualizeMetolachlor() { - String filename =
"files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf"; + String filename = "files/SOLICITA_VICTRATO-GOLD-II_Item 21_Mutacao_Genica (4).pdf"; visualizePdf(filename); } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java index ad363d9..f150ae2 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java @@ -4,6 +4,7 @@ import java.io.FileOutputStream; import java.nio.file.Path; import org.apache.pdfbox.Loader; +import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; import org.springframework.core.io.ClassPathResource; @@ -17,13 +18,14 @@ import lombok.SneakyThrows; public class ViewerDocumentTest extends BuildDocumentGraphTest { @Test + @Disabled @SneakyThrows public void testViewerDocument() { LayoutGridService layoutGridService = new LayoutGridService(); ViewerDocumentService viewerDocumentService = new ViewerDocumentService(layoutGridService); - String fileName = "files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf"; - Document document = buildGraph(fileName, LayoutParsingType.REDACT_MANAGER); + String fileName = "files/new/VV-511309_OCR.pdf"; + Document document = buildGraph(fileName, LayoutParsingType.DOCUMINE); String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf"; try (var pdDocument = Loader.loadPDF(new ClassPathResource(fileName).getInputStream()); var out = new FileOutputStream(tmpFileName)) { viewerDocumentService.createViewerDocument(pdDocument, document, out); diff --git 
a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java index 93fde51..0eafbd4 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java @@ -1,24 +1,5 @@ package com.knecon.fforesight.service.layoutparser.server.segmentation; -import static org.assertj.core.api.Assertions.assertThat; - -import java.awt.geom.Rectangle2D; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.HashMap; -import java.util.List; -import java.util.Locale; -import java.util.Map; -import java.util.stream.Collectors; - -import org.apache.pdfbox.Loader; -import org.apache.pdfbox.pdmodel.PDDocument; -import org.junit.jupiter.api.Test; -import org.springframework.beans.factory.annotation.Autowired; -import org.springframework.core.io.ClassPathResource; - import com.fasterxml.jackson.databind.ObjectMapper; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline; @@ -34,8 +15,19 @@ import com.knecon.fforesight.service.layoutparser.processor.python_api.model.tab import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService; import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService; import com.knecon.fforesight.service.layoutparser.server.utils.AbstractTest; - import lombok.SneakyThrows; 
+import org.apache.pdfbox.Loader; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.junit.jupiter.api.Test; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.core.io.ClassPathResource; + +import java.awt.geom.Rectangle2D; +import java.io.IOException; +import java.util.*; +import java.util.stream.Collectors; + +import static org.assertj.core.api.Assertions.assertThat; public class PdfSegmentationServiceTest extends AbstractTest { @@ -60,7 +52,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { public ClassificationDocument buildClassificationDocument(PDDocument originDocument) { - ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER, + ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE, originDocument, new ImageServiceResponse(), new TableServiceResponse()); diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/PageContentExtractorTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/PageContentExtractorTest.java index 1ea53f3..cbaa195 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/PageContentExtractorTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/PageContentExtractorTest.java @@ -17,11 +17,10 @@ import lombok.SneakyThrows; class PageContentExtractorTest { @Test - @Disabled @SneakyThrows public void testTextPositionSequenceExtraction() { - String fileName = "files/invisible_tables/test-two-pages_ocred.pdf"; + String fileName = "files/SOLICITA_VICTRATO-GOLD-II_Item 21_Mutacao_Genica (4).pdf"; var tmpFileName = 
Path.of("/tmp/").resolve(Path.of(fileName).getFileName() + "_TEXT_POSITION_SEQUENCES.pdf").toString(); List textPositionPerPage = PageContentExtractor.getSortedPageContents(fileName); diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/files/102 S-Metolachlor_RAR_02_Volume_2_2018-09-06.pdf b/layoutparser-service/layoutparser-service-server/src/test/resources/files/102 S-Metolachlor_RAR_02_Volume_2_2018-09-06.pdf new file mode 100644 index 0000000..a6c58ea Binary files /dev/null and b/layoutparser-service/layoutparser-service-server/src/test/resources/files/102 S-Metolachlor_RAR_02_Volume_2_2018-09-06.pdf differ diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/files/SOLICITA_VICTRATO-GOLD-II_Item 21_Mutacao_Genica (4).pdf b/layoutparser-service/layoutparser-service-server/src/test/resources/files/SOLICITA_VICTRATO-GOLD-II_Item 21_Mutacao_Genica (4).pdf new file mode 100644 index 0000000..c2852e8 Binary files /dev/null and b/layoutparser-service/layoutparser-service-server/src/test/resources/files/SOLICITA_VICTRATO-GOLD-II_Item 21_Mutacao_Genica (4).pdf differ