diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java index e31c993..e1ce8ac 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java @@ -11,6 +11,7 @@ import java.util.Map; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.common.PDRectangle; +import org.apache.pdfbox.text.PDFMarkedContentExtractor; import org.springframework.stereotype.Service; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent; @@ -172,6 +173,7 @@ public class LayoutParsingPipeline { classificationPage.setPageNumber(pageNumber); classificationPage.setPageWidth(cropbox.getWidth()); classificationPage.setPageHeight(cropbox.getHeight()); + classificationPage.setMarkedContents(stripper.getMarkedContents()); // If images is ocr needs to be calculated before textBlocks are moved into tables, otherwise findOcr algorithm needs to be adopted. 
if (pdfImages != null && pdfImages.containsKey(pageNumber)) { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationPage.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationPage.java index cc0a420..8a57127 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationPage.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationPage.java @@ -11,6 +11,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFre import lombok.Data; import lombok.NonNull; import lombok.RequiredArgsConstructor; +import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent; @Data @RequiredArgsConstructor @@ -38,4 +39,6 @@ public class ClassificationPage { CleanRulings cleanRulings; + /* Marked-content sequences handed over from the text stripper; stays null until a caller sets it. */ private List<PDMarkedContent> markedContents; + } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/BodyTextFrameService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/BodyTextFrameService.java index 436df0c..76f77ea 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/BodyTextFrameService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/BodyTextFrameService.java @@ -1,7 +1,18 @@ package com.knecon.fforesight.service.layoutparser.processor.services; +import java.awt.geom.Rectangle2D; +import java.util.ArrayList; +import java.util.Collection; import 
java.util.List; +import java.util.Optional; +import java.util.stream.Collectors; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; +import org.apache.pdfbox.cos.COSArray; +import org.apache.pdfbox.cos.COSName; +import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent; +import org.apache.pdfbox.text.TextPosition; import org.springframework.stereotype.Service; import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point; @@ -20,17 +31,80 @@ import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils; public class BodyTextFrameService { - public void setBodyTextFrames(ClassificationDocument classificationDocument, LayoutParsingType layoutParsingType) { Rectangle bodyTextFrame = calculateBodyTextFrame(classificationDocument.getPages(), classificationDocument.getFontSizeCounter(), false, layoutParsingType); Rectangle landscapeBodyTextFrame = calculateBodyTextFrame(classificationDocument.getPages(), classificationDocument.getFontSizeCounter(), true, layoutParsingType); for (ClassificationPage page : classificationDocument.getPages()) { - setBodyTextFrameAdjustedToPage(page, bodyTextFrame, landscapeBodyTextFrame); + + var markupHeaderBBoxes = getMarkupRectangleByTextPositions(page, "Header"); + var markupFooterBBoxes = getMarkupRectangleByTextPositions(page, "Footer"); + + if(markupHeaderBBoxes.isEmpty() && markupFooterBBoxes.isEmpty()){ + setBodyTextFrameAdjustedToPage(page, bodyTextFrame, landscapeBodyTextFrame); + } else { + + double minHeader = page.getPageHeight(); + if(!markupHeaderBBoxes.isEmpty()) { + minHeader = RectangleTransformations.rectangle2DBBox(markupHeaderBBoxes).getMinY(); + } + + double maxFooter = 0; + if(!markupFooterBBoxes.isEmpty()) { + maxFooter = RectangleTransformations.rectangle2DBBox(markupFooterBBoxes).getMaxY(); + } + + var btf = new 
Rectangle(new Point(0f, (float)maxFooter), page.getPageWidth(), ((float)minHeader) - ((float)maxFooter), page.getPageNumber()); + page.setBodyTextFrame(btf); + } } } + private List<Rectangle2D> getMarkupRectangle(ClassificationPage page, String subtype) { + if (page.getMarkedContents() == null) { + return new ArrayList<>(); + } + /* Null-safe read of each matching Artifact's BBox: sequences without a tag, a property dictionary, a name-valued Subtype or a BBox array of at least four entries are skipped instead of throwing. */ + return page.getMarkedContents().stream() + .filter(m -> "Artifact".equals(m.getTag())) + .map(PDMarkedContent::getProperties) + .filter(p -> p != null && p.getCOSName(COSName.SUBTYPE) != null) + .filter(p -> p.getCOSName(COSName.SUBTYPE).getName().equals(subtype)) + .map(p -> p.getCOSArray(COSName.BBOX)) + .filter(a -> a != null && a.size() >= 4) + .map(COSArray::toFloatArray) + .map(f -> new Rectangle2D.Float(f[0], f[1], f[2] - f[0], f[3] - f[1])) + .collect(Collectors.toList()); + } + + /** Returns one rectangle per distinct text-position Y found directly inside the page's Artifact marked content whose Subtype equals {@code subtype}; non-text contents (nested sequences, XObjects) are ignored; empty when nothing matches. */ + private List<Rectangle2D> getMarkupRectangleByTextPositions(ClassificationPage page, String subtype) { + if (page.getMarkedContents() == null) { + return new ArrayList<>(); + } + + var markedContent = page.getMarkedContents().stream() + .filter(m -> "Artifact".equals(m.getTag())) + .filter(m -> m.getProperties() != null && m.getProperties().getCOSName(COSName.SUBTYPE) != null) + .filter(m -> m.getProperties().getCOSName(COSName.SUBTYPE).getName().equals(subtype)) + .map(m -> m.getContents()).flatMap(Collection::stream) + .filter(TextPosition.class::isInstance).map(TextPosition.class::cast) + .collect(Collectors.groupingBy(t -> t.getY())); + + if (markedContent.isEmpty()) { + return new ArrayList<>(); + } + + return markedContent.entrySet().stream() + .map(e -> new TextPositionSequence(e.getValue().stream() + .toList(), page.getPageNumber(), true) + .getRectangle()) + .map(t -> new Rectangle2D.Float(t.getTopLeft().getX(), t.getTopLeft().getY() - Math.abs(t.getHeight()), t.getWidth(), Math.abs(t.getHeight()))).collect(Collectors.toList()); + } + + + /* private Rectangle calculateBodyTextFrameByRulings(List pages) { diff --git 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java index f26f2d2..3bbe841 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java @@ -62,12 +62,10 @@ public class DocuMineClassificationService { textBlock.setClassification(PageBlockType.OTHER); return; } - if (PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter() - .getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) { + if (PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation())) { textBlock.setClassification(PageBlockType.HEADER); - } else if (PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter() - .getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) { + } else if (PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation())) { textBlock.setClassification(PageBlockType.FOOTER); } else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks() diff --git 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFLinesTextStripper.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFLinesTextStripper.java index 839075d..1872f68 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFLinesTextStripper.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFLinesTextStripper.java @@ -1,33 +1,19 @@ package com.knecon.fforesight.service.layoutparser.processor.services.parsing; -import java.awt.color.CMMException; -import java.awt.geom.Point2D; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; - +import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; +import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; +import lombok.Getter; +import lombok.Setter; +import lombok.SneakyThrows; +import lombok.extern.slf4j.Slf4j; import org.apache.pdfbox.contentstream.operator.Operator; import org.apache.pdfbox.contentstream.operator.OperatorName; -import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingColor; -import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingColorN; -import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingColorSpace; -import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingDeviceCMYKColor; -import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingDeviceGrayColor; -import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingDeviceRGBColor; -import 
org.apache.pdfbox.contentstream.operator.color.SetStrokingColor; -import org.apache.pdfbox.contentstream.operator.color.SetStrokingColorN; -import org.apache.pdfbox.contentstream.operator.color.SetStrokingColorSpace; -import org.apache.pdfbox.contentstream.operator.color.SetStrokingDeviceCMYKColor; -import org.apache.pdfbox.contentstream.operator.color.SetStrokingDeviceGrayColor; -import org.apache.pdfbox.contentstream.operator.color.SetStrokingDeviceRGBColor; -import org.apache.pdfbox.contentstream.operator.state.SetFlatness; -import org.apache.pdfbox.contentstream.operator.state.SetLineCapStyle; -import org.apache.pdfbox.contentstream.operator.state.SetLineDashPattern; -import org.apache.pdfbox.contentstream.operator.state.SetLineJoinStyle; -import org.apache.pdfbox.contentstream.operator.state.SetLineMiterLimit; -import org.apache.pdfbox.contentstream.operator.state.SetLineWidth; -import org.apache.pdfbox.contentstream.operator.state.SetRenderingIntent; +import org.apache.pdfbox.contentstream.operator.color.*; +import org.apache.pdfbox.contentstream.operator.markedcontent.BeginMarkedContentSequence; +import org.apache.pdfbox.contentstream.operator.markedcontent.BeginMarkedContentSequenceWithProperties; +import org.apache.pdfbox.contentstream.operator.markedcontent.EndMarkedContentSequence; +import org.apache.pdfbox.contentstream.operator.state.*; import org.apache.pdfbox.contentstream.operator.text.SetFontAndSize; import org.apache.pdfbox.cos.COSBase; import org.apache.pdfbox.cos.COSNumber; @@ -36,14 +22,11 @@ import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.graphics.color.PDColor; import org.apache.pdfbox.text.TextPosition; -import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; -import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition; -import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; - -import lombok.Getter; -import lombok.Setter; 
-import lombok.SneakyThrows; -import lombok.extern.slf4j.Slf4j; +import java.awt.color.CMMException; +import java.awt.geom.Point2D; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; @Getter @Slf4j @@ -59,6 +42,7 @@ public class PDFLinesTextStripper extends PDFTextStripper { private int minCharHeight; private int maxCharHeight; + private float path_x; private float path_y; @@ -89,9 +73,17 @@ public class PDFLinesTextStripper extends PDFTextStripper { this.addOperator(new SetNonStrokingColorN()); this.addOperator(new SetFontAndSize()); this.addOperator(new SetLineWidth()); + + + addOperator(new BeginMarkedContentSequenceWithProperties()); + addOperator(new BeginMarkedContentSequence()); + addOperator(new EndMarkedContentSequence()); + } + + @Override protected void processOperator(Operator operator, List arguments) throws IOException { @@ -99,6 +91,7 @@ public class PDFLinesTextStripper extends PDFTextStripper { //move switch (operation) { + case OperatorName.MOVE_TO: if (arguments.size() == 2) { Point2D.Float pos = transformPosition(floatValue(arguments.get(0)), floatValue(arguments.get(1))); @@ -349,3 +342,4 @@ public class PDFLinesTextStripper extends PDFTextStripper { } + diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFTextStripper.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFTextStripper.java index 237dc0c..46c0578 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFTextStripper.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFTextStripper.java @@ -25,27 +25,20 @@ import java.io.StringWriter; import java.io.Writer; import 
java.text.Bidi; import java.text.Normalizer; -import java.util.ArrayList; -import java.util.Collections; -import java.util.Comparator; -import java.util.HashMap; -import java.util.Iterator; -import java.util.LinkedList; -import java.util.List; -import java.util.Map; -import java.util.SortedMap; -import java.util.SortedSet; -import java.util.StringTokenizer; -import java.util.TreeMap; -import java.util.TreeSet; +import java.util.*; import java.util.regex.Pattern; +import lombok.Getter; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import org.apache.pdfbox.cos.COSDictionary; +import org.apache.pdfbox.cos.COSName; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.PDPageTree; import org.apache.pdfbox.pdmodel.common.PDRectangle; +import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent; +import org.apache.pdfbox.pdmodel.graphics.PDXObject; import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem; import org.apache.pdfbox.pdmodel.interactive.pagenavigation.PDThreadBead; import org.apache.pdfbox.text.TextPosition; @@ -63,6 +56,10 @@ public class PDFTextStripper extends LegacyPDFStreamEngine { private static float defaultDropThreshold = 2.5f; private static final boolean useCustomQuickSort; + @Getter + protected final List<PDMarkedContent> markedContents = new ArrayList<>(); /* sequences opened while no other sequence was open, in encounter order */ + protected final Deque<PDMarkedContent> currentMarkedContents = new ArrayDeque<>(); /* stack of currently open sequences, innermost on top */ + private static final Log LOG = LogFactory.getLog(PDFTextStripper.class); // enable the ability to set the default indent/drop thresholds @@ -196,6 +193,45 @@ public class PDFTextStripper extends LegacyPDFStreamEngine { } + @Override + public void beginMarkedContentSequence(COSName tag, COSDictionary properties) + { + PDMarkedContent markedContent = PDMarkedContent.create(tag, properties); + if (this.currentMarkedContents.isEmpty()) + { + this.markedContents.add(markedContent); + } + else + { + 
PDMarkedContent currentMarkedContent = + this.currentMarkedContents.peek(); + if (currentMarkedContent != null) + { + currentMarkedContent.addMarkedContent(markedContent); + } + } + this.currentMarkedContents.push(markedContent); + } + + @Override + public void endMarkedContentSequence() + { + if (!this.currentMarkedContents.isEmpty()) + { + this.currentMarkedContents.pop(); + } + } + + + public void xobject(PDXObject xobject) + { + if (!this.currentMarkedContents.isEmpty()) + { + this.currentMarkedContents.peek().addXObject(xobject); + } + } + + /** * This will return the text of a document. See writeText.
* NOTE: The document must not be encrypted when coming into this method. @@ -877,7 +913,12 @@ public class PDFTextStripper extends LegacyPDFStreamEngine { textList.add(text); } } + if (!this.currentMarkedContents.isEmpty()) + { + this.currentMarkedContents.peek().addText(text); + } } + } @@ -2103,6 +2144,7 @@ public class PDFTextStripper extends LegacyPDFStreamEngine { this.isHangingIndent = true; } + } } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/BuildDocumentGraphTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/BuildDocumentGraphTest.java index 092a530..d3f1a5d 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/BuildDocumentGraphTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/BuildDocumentGraphTest.java @@ -40,7 +40,7 @@ public class BuildDocumentGraphTest extends AbstractTest { @SneakyThrows protected Document buildGraph(String filename) { - return buildGraph(filename, LayoutParsingType.REDACT_MANAGER); + return buildGraph(filename, LayoutParsingType.DOCUMINE); } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphVisualizationTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphVisualizationTest.java index aa2351d..0994643 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphVisualizationTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphVisualizationTest.java @@ 
-26,7 +26,7 @@ public class DocumentGraphVisualizationTest extends BuildDocumentGraphTest { // @Disabled public void visualizeMetolachlor() { - String filename = "files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf"; + String filename = "files/SOLICITA_VICTRATO-GOLD-II_Item 21_Mutacao_Genica (4).pdf"; visualizePdf(filename); } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java index ad363d9..dc75ece 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java @@ -22,8 +22,8 @@ public class ViewerDocumentTest extends BuildDocumentGraphTest { LayoutGridService layoutGridService = new LayoutGridService(); ViewerDocumentService viewerDocumentService = new ViewerDocumentService(layoutGridService); - String fileName = "files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf"; - Document document = buildGraph(fileName, LayoutParsingType.REDACT_MANAGER); + String fileName = "files/BASF/2013-1110704.pdf"; + Document document = buildGraph(fileName, LayoutParsingType.DOCUMINE); String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf"; try (var pdDocument = Loader.loadPDF(new ClassPathResource(fileName).getInputStream()); var out = new FileOutputStream(tmpFileName)) { viewerDocumentService.createViewerDocument(pdDocument, document, out); diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java 
b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java index 93fde51..d795e5a 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java @@ -4,17 +4,17 @@ import static org.assertj.core.api.Assertions.assertThat; import java.awt.geom.Rectangle2D; import java.io.IOException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.HashMap; -import java.util.List; -import java.util.Locale; -import java.util.Map; +import java.util.*; import java.util.stream.Collectors; +import java.util.stream.Stream; +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; +import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw; import org.apache.pdfbox.Loader; +import org.apache.pdfbox.cos.COSArray; +import org.apache.pdfbox.cos.COSName; import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent; import org.junit.jupiter.api.Test; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.core.io.ClassPathResource; @@ -60,7 +60,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { public ClassificationDocument buildClassificationDocument(PDDocument originDocument) { - ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER, + ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE, originDocument, new ImageServiceResponse(), new TableServiceResponse()); @@ -492,6 +492,46 @@ public 
class PdfSegmentationServiceTest extends AbstractTest { } + @Test + public void testDoc30Page5dfgfdg() throws IOException { + + ClassPathResource pdfFileResource = new ClassPathResource("files/SOLICITA_VICTRATO-GOLD-II_Item 21_Mutacao_Genica (4).pdf"); + + ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getInputStream())); + +// validateTableSize(document, 1); +// +// validateTable(document, 0, 3, 5, 0, 0); + + + var rects = document.getPages().stream().map(page -> Stream.concat(getMarkupRectangle(page, "Footer").stream(), + (getMarkupRectangle(page, "Header").stream())).toList() + ).toList(); + +// var markupHeaderBBoxes = getMarkupRectangle(page, "Header"); +// var markupFooterBBoxes = getMarkupRectangle(page, "Footer"); + + PdfDraw.drawRectanglesPerPage("files/SOLICITA_VICTRATO-GOLD-II_Item 21_Mutacao_Genica (4).pdf", rects, "/tmp/test.pdf"); + } + + /** Returns the BBox rectangle of every Artifact marked content on the page whose Subtype equals {@code subtype}; sequences lacking a tag, properties, a name-valued Subtype or a BBox array of at least four entries are skipped. */ + private List<Rectangle2D> getMarkupRectangle(ClassificationPage page, String subtype) { + if (page.getMarkedContents() == null) { + return new ArrayList<>(); + } + return page.getMarkedContents().stream() + .filter(m -> "Artifact".equals(m.getTag())) + .map(PDMarkedContent::getProperties) + .filter(p -> p != null && p.getCOSName(COSName.SUBTYPE) != null && p.getCOSName(COSName.SUBTYPE).getName().equals(subtype)) + .map(p -> p.getCOSArray(COSName.BBOX)) + .filter(a -> a != null && a.size() >= 4) + .map(COSArray::toFloatArray) + .map(f -> new Rectangle2D.Float(f[0], f[1], f[2] -f[0], f[3] - f[1])) + .collect(Collectors.toList()); + } + + + private void validateTable(ClassificationDocument document, int tableIndex, int colCount, int rowCount, int emptyCellsCountCorrect, int emptyCellsCountIncorrect) { TablePageBlock table = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(tableIndex); diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/PageContentExtractorTest.java 
b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/PageContentExtractorTest.java index 1ea53f3..cbaa195 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/PageContentExtractorTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/PageContentExtractorTest.java @@ -17,11 +17,10 @@ import lombok.SneakyThrows; class PageContentExtractorTest { @Test - @Disabled @SneakyThrows public void testTextPositionSequenceExtraction() { - String fileName = "files/invisible_tables/test-two-pages_ocred.pdf"; + String fileName = "files/SOLICITA_VICTRATO-GOLD-II_Item 21_Mutacao_Genica (4).pdf"; var tmpFileName = Path.of("/tmp/").resolve(Path.of(fileName).getFileName() + "_TEXT_POSITION_SEQUENCES.pdf").toString(); List textPositionPerPage = PageContentExtractor.getSortedPageContents(fileName); diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/files/102 S-Metolachlor_RAR_02_Volume_2_2018-09-06.pdf b/layoutparser-service/layoutparser-service-server/src/test/resources/files/102 S-Metolachlor_RAR_02_Volume_2_2018-09-06.pdf new file mode 100644 index 0000000..a6c58ea Binary files /dev/null and b/layoutparser-service/layoutparser-service-server/src/test/resources/files/102 S-Metolachlor_RAR_02_Volume_2_2018-09-06.pdf differ diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/files/SOLICITA_VICTRATO-GOLD-II_Item 21_Mutacao_Genica (4).pdf b/layoutparser-service/layoutparser-service-server/src/test/resources/files/SOLICITA_VICTRATO-GOLD-II_Item 21_Mutacao_Genica (4).pdf new file mode 100644 index 0000000..c2852e8 Binary files /dev/null and b/layoutparser-service/layoutparser-service-server/src/test/resources/files/SOLICITA_VICTRATO-GOLD-II_Item 21_Mutacao_Genica (4).pdf differ