diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java index e1ce8ac..d903200 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java @@ -2,15 +2,19 @@ package com.knecon.fforesight.service.layoutparser.processor; import static java.lang.String.format; +import java.awt.geom.Rectangle2D; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.util.ArrayList; +import java.util.HashMap; import java.util.List; import java.util.Map; +import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentConverter; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.common.PDRectangle; +import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent; import org.apache.pdfbox.text.PDFMarkedContentExtractor; import org.springframework.stereotype.Service; @@ -173,7 +177,9 @@ public class LayoutParsingPipeline { classificationPage.setPageNumber(pageNumber); classificationPage.setPageWidth(cropbox.getWidth()); classificationPage.setPageHeight(cropbox.getHeight()); - classificationPage.setMarkedContents(stripper.getMarkedContents()); + + // MarkedContent needs to be converted at this point, otherwise it leads to GC Problems in Pdfbox. + classificationPage.setMarkedContentBboxPerType(convertMarkedContents(stripper.getMarkedContents())); // If images is ocr needs to be calculated before textBlocks are moved into tables, otherwise findOcr algorithm needs to be adopted. if (pdfImages != null && pdfImages.containsKey(pageNumber)) { @@ -203,6 +209,13 @@ public class LayoutParsingPipeline { } + private Map convertMarkedContents(List pdMarkedContents){ + Map markedContentBboxes = new HashMap<>(); + markedContentBboxes.put("Header", MarkedContentConverter.getMarkedContentBboxPerLine(pdMarkedContents, "Header")); + markedContentBboxes.put("Footer", MarkedContentConverter.getMarkedContentBboxPerLine(pdMarkedContents, "Footer")); + return markedContentBboxes; + } + private void increaseDocumentStatistics(ClassificationPage classificationPage, ClassificationDocument document) { if (!classificationPage.isLandscape()) { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationPage.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationPage.java index 8a57127..798612e 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationPage.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationPage.java @@ -1,7 +1,10 @@ package com.knecon.fforesight.service.layoutparser.processor.model; +import java.awt.geom.Rectangle2D; import java.util.ArrayList; +import java.util.HashMap; import java.util.List; +import java.util.Map; import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle; import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage; @@ -39,6 +42,6 @@ public class ClassificationPage { CleanRulings cleanRulings; - private List markedContents; + private Map markedContentBboxPerType = new HashMap<>(); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/BodyTextFrameService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/BodyTextFrameService.java index 76f77ea..a23e0ba 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/BodyTextFrameService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/BodyTextFrameService.java @@ -1,18 +1,7 @@ package com.knecon.fforesight.service.layoutparser.processor.services; -import java.awt.geom.Rectangle2D; -import java.util.ArrayList; -import java.util.Collection; import java.util.List; -import java.util.Optional; -import java.util.stream.Collectors; -import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; -import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; -import org.apache.pdfbox.cos.COSArray; -import org.apache.pdfbox.cos.COSName; -import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent; -import org.apache.pdfbox.text.TextPosition; import org.springframework.stereotype.Service; import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point; @@ -36,75 +25,11 @@ public class BodyTextFrameService { Rectangle bodyTextFrame = calculateBodyTextFrame(classificationDocument.getPages(), classificationDocument.getFontSizeCounter(), false, layoutParsingType); Rectangle landscapeBodyTextFrame = calculateBodyTextFrame(classificationDocument.getPages(), classificationDocument.getFontSizeCounter(), true, layoutParsingType); for (ClassificationPage page : classificationDocument.getPages()) { - - var markupHeaderBBoxes = getMarkupRectangleByTextPositions(page, "Header"); - var markupFooterBBoxes = getMarkupRectangleByTextPositions(page, "Footer"); - - if(markupHeaderBBoxes.isEmpty() && markupFooterBBoxes.isEmpty()){ - setBodyTextFrameAdjustedToPage(page, bodyTextFrame, landscapeBodyTextFrame); - } else { - - double minHeader = page.getPageHeight(); - if(!markupHeaderBBoxes.isEmpty()) { - minHeader = RectangleTransformations.rectangle2DBBox(markupHeaderBBoxes).getMinY(); - } - - double maxFooter = 0; - if(!markupFooterBBoxes.isEmpty()) { - maxFooter = RectangleTransformations.rectangle2DBBox(markupFooterBBoxes).getMaxY(); - } - - var btf = new Rectangle(new Point(0f, (float)maxFooter), page.getPageWidth(), ((float)minHeader) - ((float)maxFooter), page.getPageNumber()); - page.setBodyTextFrame(btf); - } + setBodyTextFrameAdjustedToPage(page, bodyTextFrame, landscapeBodyTextFrame); } } - private List getMarkupRectangle(ClassificationPage page, String subtype) { - if (page.getMarkedContents() == null) { - return new ArrayList<>(); - } - - return page.getMarkedContents().stream() - .filter(m -> m.getTag().equals("Artifact")) - .map(PDMarkedContent::getProperties) - .filter(p -> p.getItem("Subtype") != null) - .filter(p -> ((COSName) p.getItem("Subtype")).getName().equals(subtype)) - .filter(c -> c.getItem("BBox") != null) - .map(c -> c.getItem("BBox")) - .map(c -> ((COSArray) c).toFloatArray()) - .map(f -> new Rectangle2D.Float(f[0], f[1], f[2] - f[0], f[3] - f[1])) - .collect(Collectors.toList()); - } - - - private List getMarkupRectangleByTextPositions(ClassificationPage page, String subtype) { - if (page.getMarkedContents() == null) { - return new ArrayList<>(); - } - - var markedContent = page.getMarkedContents().stream() - .filter(m -> m.getTag().equals("Artifact")) - .filter(m -> m.getProperties().getItem("Subtype") != null) - .filter(m -> ((COSName) m.getProperties().getItem("Subtype")).getName().equals(subtype)) - .map(m -> m.getContents()).flatMap(Collection::stream) - .map(t -> (TextPosition) t) - .collect(Collectors.groupingBy(t -> t.getY())); - - if (markedContent.isEmpty()) { - return new ArrayList<>(); - } - - return markedContent.entrySet().stream() - .map(e -> new TextPositionSequence(e.getValue().stream() - .toList(), page.getPageNumber(), true) - .getRectangle()) - .map(t -> new Rectangle2D.Float(t.getTopLeft().getX(), t.getTopLeft().getY() - Math.abs(t.getHeight()), t.getWidth(), Math.abs(t.getHeight()))).collect(Collectors.toList()); - } - - - /* private Rectangle calculateBodyTextFrameByRulings(List pages) { @@ -231,6 +156,11 @@ public class BodyTextFrameService { continue; } + if (page.getMarkedContentBboxPerType().get("Header") != null && page.getMarkedContentBboxPerType().get("Header").intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight()) + || page.getMarkedContentBboxPerType().get("Footer") != null && page.getMarkedContentBboxPerType().get("Footer").intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight())) { + continue; + } + float approxLineCount = PositionUtils.getApproxLineCount(textBlock); if (layoutParsingType.equals(LayoutParsingType.DOCUMINE) && approxLineCount < approximateHeaderLineCount && textBlock.getMaxY() >= page.getPageHeight() - (page.getPageHeight() / 10) || !layoutParsingType.equals( LayoutParsingType.DOCUMINE) && approxLineCount < approximateHeaderLineCount) { @@ -307,4 +237,4 @@ public class BodyTextFrameService { } -} +} \ No newline at end of file diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java index 3bbe841..8d7c5d7 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java @@ -62,10 +62,16 @@ public class DocuMineClassificationService { textBlock.setClassification(PageBlockType.OTHER); return; } - if (PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation())) { + if (page.getMarkedContentBboxPerType().get("Header") != null && page.getMarkedContentBboxPerType().get("Header").intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight()) + || PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter() + .getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular()) + ) { textBlock.setClassification(PageBlockType.HEADER); - } else if (PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation())) { + } else if (page.getMarkedContentBboxPerType().get("Footer") != null && page.getMarkedContentBboxPerType().get("Footer").intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight()) + || PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter() + .getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular()) + ) { textBlock.setClassification(PageBlockType.FOOTER); } else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks() @@ -105,4 +111,4 @@ public class DocuMineClassificationService { } } -} +} \ No newline at end of file diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/RedactManagerClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/RedactManagerClassificationService.java index 2be8f03..e89b225 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/RedactManagerClassificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/RedactManagerClassificationService.java @@ -51,11 +51,13 @@ public class RedactManagerClassificationService { textBlock.setClassification(PageBlockType.OTHER); return; } - if (PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter() + if (page.getMarkedContentBboxPerType().get("Header") != null && page.getMarkedContentBboxPerType().get("Header").intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight()) + || PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter() .getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) { textBlock.setClassification(PageBlockType.HEADER); - } else if (PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter() + } else if (page.getMarkedContentBboxPerType().get("Footer") != null && page.getMarkedContentBboxPerType().get("Footer").intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight()) + || PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter() .getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) { textBlock.setClassification(PageBlockType.FOOTER); } else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/TaasClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/TaasClassificationService.java index c177efd..5eee9a3 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/TaasClassificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/TaasClassificationService.java @@ -56,9 +56,11 @@ public class TaasClassificationService { textBlock.setClassification(PageBlockType.OTHER); return; } - if (PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation())) { + if (page.getMarkedContentBboxPerType().get("Header") != null && page.getMarkedContentBboxPerType().get("Header").intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight()) + || PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation())) { textBlock.setClassification(PageBlockType.HEADER); - } else if (PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation())) { + } else if (page.getMarkedContentBboxPerType().get("Footer") != null && page.getMarkedContentBboxPerType().get("Footer").intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight()) + || PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation())) { textBlock.setClassification(PageBlockType.FOOTER); } else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks() diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/MarkedContentConverter.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/MarkedContentConverter.java new file mode 100644 index 0000000..ad34f43 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/MarkedContentConverter.java @@ -0,0 +1,45 @@ +package com.knecon.fforesight.service.layoutparser.processor.utils; + +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; +import lombok.experimental.UtilityClass; +import org.apache.pdfbox.cos.COSName; +import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent; +import org.apache.pdfbox.text.TextPosition; + +import java.awt.geom.Rectangle2D; +import java.util.Collection; +import java.util.List; +import java.util.stream.Collectors; + +@UtilityClass +public class MarkedContentConverter { + + + public Rectangle2D getMarkedContentBboxPerLine(List markedContents, String subtype) { + + if (markedContents == null) { + return null; + } + + var markedContentByYPosition = markedContents.stream() + .filter(m -> m.getTag().equals("Artifact")) + .filter(m -> m.getProperties() != null) + .filter(m -> m.getProperties().getItem("Subtype") != null) + .filter(m -> ((COSName) m.getProperties().getItem("Subtype")).getName().equals(subtype)) + .map(PDMarkedContent::getContents).flatMap(Collection::stream) + .filter(t -> t instanceof TextPosition) + .map(t -> (TextPosition) t) + .filter(t -> !t.getUnicode().equals(" ")) + .collect(Collectors.groupingBy(TextPosition::getY)); + + if (markedContentByYPosition.isEmpty()) { + return null; + } + + return RectangleTransformations.rectangle2DBBox(markedContentByYPosition.values().stream() + .map(textPositions -> new TextPositionSequence(textPositions.stream() + .toList(), 0, true) + .getRectangle()) + .map(t -> new Rectangle2D.Float(t.getTopLeft().getX(), t.getTopLeft().getY() - Math.abs(t.getHeight()), t.getWidth(), Math.abs(t.getHeight()))).collect(Collectors.toList())); + } +} diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java index dc75ece..f150ae2 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java @@ -4,6 +4,7 @@ import java.io.FileOutputStream; import java.nio.file.Path; import org.apache.pdfbox.Loader; +import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; import org.springframework.core.io.ClassPathResource; @@ -17,12 +18,13 @@ import lombok.SneakyThrows; public class ViewerDocumentTest extends BuildDocumentGraphTest { @Test + @Disabled @SneakyThrows public void testViewerDocument() { LayoutGridService layoutGridService = new LayoutGridService(); ViewerDocumentService viewerDocumentService = new ViewerDocumentService(layoutGridService); - String fileName = "files/BASF/2013-1110704.pdf"; + String fileName = "files/new/VV-511309_OCR.pdf"; Document document = buildGraph(fileName, LayoutParsingType.DOCUMINE); String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf"; try (var pdDocument = Loader.loadPDF(new ClassPathResource(fileName).getInputStream()); var out = new FileOutputStream(tmpFileName)) { diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java index d795e5a..0eafbd4 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java @@ -1,24 +1,5 @@ package com.knecon.fforesight.service.layoutparser.server.segmentation; -import static org.assertj.core.api.Assertions.assertThat; - -import java.awt.geom.Rectangle2D; -import java.io.IOException; -import java.util.*; -import java.util.stream.Collectors; -import java.util.stream.Stream; - -import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; -import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw; -import org.apache.pdfbox.Loader; -import org.apache.pdfbox.cos.COSArray; -import org.apache.pdfbox.cos.COSName; -import org.apache.pdfbox.pdmodel.PDDocument; -import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent; -import org.junit.jupiter.api.Test; -import org.springframework.beans.factory.annotation.Autowired; -import org.springframework.core.io.ClassPathResource; - import com.fasterxml.jackson.databind.ObjectMapper; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline; @@ -34,8 +15,19 @@ import com.knecon.fforesight.service.layoutparser.processor.python_api.model.tab import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService; import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService; import com.knecon.fforesight.service.layoutparser.server.utils.AbstractTest; - import lombok.SneakyThrows; +import org.apache.pdfbox.Loader; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.junit.jupiter.api.Test; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.core.io.ClassPathResource; + +import java.awt.geom.Rectangle2D; +import java.io.IOException; +import java.util.*; +import java.util.stream.Collectors; + +import static org.assertj.core.api.Assertions.assertThat; public class PdfSegmentationServiceTest extends AbstractTest { @@ -492,46 +484,6 @@ public class PdfSegmentationServiceTest extends AbstractTest { } - @Test - public void testDoc30Page5dfgfdg() throws IOException { - - ClassPathResource pdfFileResource = new ClassPathResource("files/SOLICITA_VICTRATO-GOLD-II_Item 21_Mutacao_Genica (4).pdf"); - - ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getInputStream())); - -// validateTableSize(document, 1); -// -// validateTable(document, 0, 3, 5, 0, 0); - - - var rects = document.getPages().stream().map(page -> Stream.concat(getMarkupRectangle(page, "Footer").stream(), - (getMarkupRectangle(page, "Header").stream())).toList() - ).toList(); - -// var markupHeaderBBoxes = getMarkupRectangle(page, "Header"); -// var markupFooterBBoxes = getMarkupRectangle(page, "Footer"); - - PdfDraw.drawRectanglesPerPage("files/SOLICITA_VICTRATO-GOLD-II_Item 21_Mutacao_Genica (4).pdf", rects, "/tmp/test.pdf"); - } - - - private List getMarkupRectangle(ClassificationPage page, String subtype) { - if (page.getMarkedContents() == null) { - return new ArrayList<>(); - } - - return page.getMarkedContents().stream() - .filter(m -> m.getTag().equals("Artifact")) - .map(PDMarkedContent::getProperties) - .filter(p -> p.getItem("Subtype") != null && ((COSName) p.getItem("Subtype")).getName().equals(subtype)) - .map(c -> c.getItem("BBox")) - .map(c -> ((COSArray) c).toFloatArray()) - .map(f -> new Rectangle2D.Float(f[0], f[1], f[2] -f[0], f[3] - f[1])) - .collect(Collectors.toList()); - } - - - private void validateTable(ClassificationDocument document, int tableIndex, int colCount, int rowCount, int emptyCellsCountCorrect, int emptyCellsCountIncorrect) { TablePageBlock table = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(tableIndex);