RED-7461: First working iteration of header and footer improvement

This commit is contained in:
deiflaender 2023-08-18 14:04:22 +02:00
parent 880914a167
commit 60615ec5d8
13 changed files with 219 additions and 67 deletions

View File

@ -11,6 +11,7 @@ import java.util.Map;
import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle; import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.text.PDFMarkedContentExtractor;
import org.springframework.stereotype.Service; import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent;
@ -172,6 +173,7 @@ public class LayoutParsingPipeline {
classificationPage.setPageNumber(pageNumber); classificationPage.setPageNumber(pageNumber);
classificationPage.setPageWidth(cropbox.getWidth()); classificationPage.setPageWidth(cropbox.getWidth());
classificationPage.setPageHeight(cropbox.getHeight()); classificationPage.setPageHeight(cropbox.getHeight());
classificationPage.setMarkedContents(stripper.getMarkedContents());
// If images is ocr needs to be calculated before textBlocks are moved into tables, otherwise findOcr algorithm needs to be adopted. // If images is ocr needs to be calculated before textBlocks are moved into tables, otherwise findOcr algorithm needs to be adopted.
if (pdfImages != null && pdfImages.containsKey(pageNumber)) { if (pdfImages != null && pdfImages.containsKey(pageNumber)) {

View File

@ -11,6 +11,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFre
import lombok.Data; import lombok.Data;
import lombok.NonNull; import lombok.NonNull;
import lombok.RequiredArgsConstructor; import lombok.RequiredArgsConstructor;
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
@Data @Data
@RequiredArgsConstructor @RequiredArgsConstructor
@ -38,4 +39,6 @@ public class ClassificationPage {
CleanRulings cleanRulings; CleanRulings cleanRulings;
private List<PDMarkedContent> markedContents;
} }

View File

@ -1,7 +1,18 @@
package com.knecon.fforesight.service.layoutparser.processor.services; package com.knecon.fforesight.service.layoutparser.processor.services;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List; import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
import org.apache.pdfbox.text.TextPosition;
import org.springframework.stereotype.Service; import org.springframework.stereotype.Service;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point; import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point;
@ -20,17 +31,80 @@ import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
public class BodyTextFrameService { public class BodyTextFrameService {
public void setBodyTextFrames(ClassificationDocument classificationDocument, LayoutParsingType layoutParsingType) { public void setBodyTextFrames(ClassificationDocument classificationDocument, LayoutParsingType layoutParsingType) {
Rectangle bodyTextFrame = calculateBodyTextFrame(classificationDocument.getPages(), classificationDocument.getFontSizeCounter(), false, layoutParsingType); Rectangle bodyTextFrame = calculateBodyTextFrame(classificationDocument.getPages(), classificationDocument.getFontSizeCounter(), false, layoutParsingType);
Rectangle landscapeBodyTextFrame = calculateBodyTextFrame(classificationDocument.getPages(), classificationDocument.getFontSizeCounter(), true, layoutParsingType); Rectangle landscapeBodyTextFrame = calculateBodyTextFrame(classificationDocument.getPages(), classificationDocument.getFontSizeCounter(), true, layoutParsingType);
for (ClassificationPage page : classificationDocument.getPages()) { for (ClassificationPage page : classificationDocument.getPages()) {
setBodyTextFrameAdjustedToPage(page, bodyTextFrame, landscapeBodyTextFrame);
var markupHeaderBBoxes = getMarkupRectangleByTextPositions(page, "Header");
var markupFooterBBoxes = getMarkupRectangleByTextPositions(page, "Footer");
if(markupHeaderBBoxes.isEmpty() && markupFooterBBoxes.isEmpty()){
setBodyTextFrameAdjustedToPage(page, bodyTextFrame, landscapeBodyTextFrame);
} else {
double minHeader = page.getPageHeight();
if(!markupHeaderBBoxes.isEmpty()) {
minHeader = RectangleTransformations.rectangle2DBBox(markupHeaderBBoxes).getMinY();
}
double maxFooter = 0;
if(!markupFooterBBoxes.isEmpty()) {
maxFooter = RectangleTransformations.rectangle2DBBox(markupFooterBBoxes).getMaxY();
}
var btf = new Rectangle(new Point(0f, (float)maxFooter), page.getPageWidth(), ((float)minHeader) - ((float)maxFooter), page.getPageNumber());
page.setBodyTextFrame(btf);
}
} }
} }
/**
 * Collects the bounding boxes declared on a page's "Artifact" marked-content sequences
 * whose "Subtype" property equals the requested value (e.g. "Header" or "Footer").
 * <p>
 * Each rectangle is built from the sequence's "BBox" array, read as
 * {@code [x0, y0, x1, y1]} and converted to {@code (x, y, width, height)}.
 * Marked content without a property dictionary, with a non-name "Subtype", or with a
 * missing or too short "BBox" is skipped instead of raising an exception.
 *
 * @param page    the page whose marked contents are inspected
 * @param subtype the artifact subtype to match
 * @return the matching bounding boxes; an empty list when the page has no marked contents
 */
private List<Rectangle2D> getMarkupRectangle(ClassificationPage page, String subtype) {

    if (page.getMarkedContents() == null) {
        return new ArrayList<>();
    }

    return page.getMarkedContents().stream()
            // constant-first equals: the tag of a marked content may be null
            .filter(m -> "Artifact".equals(m.getTag()))
            .map(PDMarkedContent::getProperties)
            // sequences opened without a property list have no dictionary at all
            .filter(p -> p != null)
            // check the type before casting; a malformed PDF may store something else here
            .filter(p -> p.getItem("Subtype") instanceof COSName)
            .filter(p -> ((COSName) p.getItem("Subtype")).getName().equals(subtype))
            .map(p -> p.getItem("BBox"))
            .filter(b -> b instanceof COSArray)
            .map(b -> ((COSArray) b).toFloatArray())
            // all four corner coordinates are required below
            .filter(f -> f.length >= 4)
            .map(f -> new Rectangle2D.Float(f[0], f[1], f[2] - f[0], f[3] - f[1]))
            .collect(Collectors.toList());
}
/**
 * Derives header/footer rectangles from the text actually placed inside a page's
 * "Artifact" marked-content sequences whose "Subtype" property equals the requested
 * value (e.g. "Header" or "Footer").
 * <p>
 * The text positions of all matching sequences are grouped by their Y coordinate; each
 * group is turned into a {@link TextPositionSequence} and its rectangle is converted
 * into a {@link Rectangle2D} spanning the group.
 * Marked content without a property dictionary or with a non-name "Subtype" is skipped,
 * and only {@link TextPosition} entries of the contents are considered.
 *
 * @param page    the page whose marked contents are inspected
 * @param subtype the artifact subtype to match
 * @return one rectangle per distinct Y coordinate; an empty list when nothing matches
 */
private List<Rectangle2D> getMarkupRectangleByTextPositions(ClassificationPage page, String subtype) {

    if (page.getMarkedContents() == null) {
        return new ArrayList<>();
    }

    var textPositionsByY = page.getMarkedContents().stream()
            // constant-first equals: the tag of a marked content may be null
            .filter(m -> "Artifact".equals(m.getTag()))
            // sequences opened without a property list have no dictionary at all
            .filter(m -> m.getProperties() != null)
            // check the type before casting; a malformed PDF may store something else here
            .filter(m -> m.getProperties().getItem("Subtype") instanceof COSName)
            .filter(m -> ((COSName) m.getProperties().getItem("Subtype")).getName().equals(subtype))
            .map(m -> m.getContents())
            .flatMap(Collection::stream)
            // contents may also hold nested marked content and XObjects, not only text
            .filter(c -> c instanceof TextPosition)
            .map(c -> (TextPosition) c)
            .collect(Collectors.groupingBy(t -> t.getY()));

    // An empty grouping simply yields an empty list.
    return textPositionsByY.entrySet().stream()
            .map(e -> new TextPositionSequence(e.getValue().stream()
                    .toList(), page.getPageNumber(), true)
                    .getRectangle())
            .map(t -> new Rectangle2D.Float(t.getTopLeft().getX(), t.getTopLeft().getY() - Math.abs(t.getHeight()), t.getWidth(), Math.abs(t.getHeight())))
            .collect(Collectors.toList());
}
/* /*
private Rectangle calculateBodyTextFrameByRulings(List<ClassificationPage> pages) { private Rectangle calculateBodyTextFrameByRulings(List<ClassificationPage> pages) {

View File

@ -62,12 +62,10 @@ public class DocuMineClassificationService {
textBlock.setClassification(PageBlockType.OTHER); textBlock.setClassification(PageBlockType.OTHER);
return; return;
} }
if (PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter() if (PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation())) {
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) {
textBlock.setClassification(PageBlockType.HEADER); textBlock.setClassification(PageBlockType.HEADER);
} else if (PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter() } else if (PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation())) {
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) {
textBlock.setClassification(PageBlockType.FOOTER); textBlock.setClassification(PageBlockType.FOOTER);
} else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, } else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock,
document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks() document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks()

View File

@ -1,33 +1,19 @@
package com.knecon.fforesight.service.layoutparser.processor.services.parsing; package com.knecon.fforesight.service.layoutparser.processor.services.parsing;
import java.awt.color.CMMException; import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import java.awt.geom.Point2D; import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
import java.io.IOException; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import java.util.ArrayList; import lombok.Getter;
import java.util.Arrays; import lombok.Setter;
import java.util.List; import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
import org.apache.pdfbox.contentstream.operator.Operator; import org.apache.pdfbox.contentstream.operator.Operator;
import org.apache.pdfbox.contentstream.operator.OperatorName; import org.apache.pdfbox.contentstream.operator.OperatorName;
import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingColor; import org.apache.pdfbox.contentstream.operator.color.*;
import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingColorN; import org.apache.pdfbox.contentstream.operator.markedcontent.BeginMarkedContentSequence;
import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingColorSpace; import org.apache.pdfbox.contentstream.operator.markedcontent.BeginMarkedContentSequenceWithProperties;
import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingDeviceCMYKColor; import org.apache.pdfbox.contentstream.operator.markedcontent.EndMarkedContentSequence;
import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingDeviceGrayColor; import org.apache.pdfbox.contentstream.operator.state.*;
import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingDeviceRGBColor;
import org.apache.pdfbox.contentstream.operator.color.SetStrokingColor;
import org.apache.pdfbox.contentstream.operator.color.SetStrokingColorN;
import org.apache.pdfbox.contentstream.operator.color.SetStrokingColorSpace;
import org.apache.pdfbox.contentstream.operator.color.SetStrokingDeviceCMYKColor;
import org.apache.pdfbox.contentstream.operator.color.SetStrokingDeviceGrayColor;
import org.apache.pdfbox.contentstream.operator.color.SetStrokingDeviceRGBColor;
import org.apache.pdfbox.contentstream.operator.state.SetFlatness;
import org.apache.pdfbox.contentstream.operator.state.SetLineCapStyle;
import org.apache.pdfbox.contentstream.operator.state.SetLineDashPattern;
import org.apache.pdfbox.contentstream.operator.state.SetLineJoinStyle;
import org.apache.pdfbox.contentstream.operator.state.SetLineMiterLimit;
import org.apache.pdfbox.contentstream.operator.state.SetLineWidth;
import org.apache.pdfbox.contentstream.operator.state.SetRenderingIntent;
import org.apache.pdfbox.contentstream.operator.text.SetFontAndSize; import org.apache.pdfbox.contentstream.operator.text.SetFontAndSize;
import org.apache.pdfbox.cos.COSBase; import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSNumber; import org.apache.pdfbox.cos.COSNumber;
@ -36,14 +22,11 @@ import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.graphics.color.PDColor; import org.apache.pdfbox.pdmodel.graphics.color.PDColor;
import org.apache.pdfbox.text.TextPosition; import org.apache.pdfbox.text.TextPosition;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; import java.awt.color.CMMException;
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition; import java.awt.geom.Point2D;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import java.io.IOException;
import java.util.ArrayList;
import lombok.Getter; import java.util.List;
import lombok.Setter;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
@Getter @Getter
@Slf4j @Slf4j
@ -59,6 +42,7 @@ public class PDFLinesTextStripper extends PDFTextStripper {
private int minCharHeight; private int minCharHeight;
private int maxCharHeight; private int maxCharHeight;
private float path_x; private float path_x;
private float path_y; private float path_y;
@ -89,9 +73,17 @@ public class PDFLinesTextStripper extends PDFTextStripper {
this.addOperator(new SetNonStrokingColorN()); this.addOperator(new SetNonStrokingColorN());
this.addOperator(new SetFontAndSize()); this.addOperator(new SetFontAndSize());
this.addOperator(new SetLineWidth()); this.addOperator(new SetLineWidth());
addOperator(new BeginMarkedContentSequenceWithProperties());
addOperator(new BeginMarkedContentSequence());
addOperator(new EndMarkedContentSequence());
} }
@Override @Override
protected void processOperator(Operator operator, List<COSBase> arguments) throws IOException { protected void processOperator(Operator operator, List<COSBase> arguments) throws IOException {
@ -99,6 +91,7 @@ public class PDFLinesTextStripper extends PDFTextStripper {
//move //move
switch (operation) { switch (operation) {
case OperatorName.MOVE_TO: case OperatorName.MOVE_TO:
if (arguments.size() == 2) { if (arguments.size() == 2) {
Point2D.Float pos = transformPosition(floatValue(arguments.get(0)), floatValue(arguments.get(1))); Point2D.Float pos = transformPosition(floatValue(arguments.get(0)), floatValue(arguments.get(1)));
@ -349,3 +342,4 @@ public class PDFLinesTextStripper extends PDFTextStripper {
} }

View File

@ -25,27 +25,20 @@ import java.io.StringWriter;
import java.io.Writer; import java.io.Writer;
import java.text.Bidi; import java.text.Bidi;
import java.text.Normalizer; import java.text.Normalizer;
import java.util.ArrayList; import java.util.*;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.SortedMap;
import java.util.SortedSet;
import java.util.StringTokenizer;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import lombok.Getter;
import org.apache.commons.logging.Log; import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory; import org.apache.commons.logging.LogFactory;
import org.apache.pdfbox.cos.COSDictionary;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageTree; import org.apache.pdfbox.pdmodel.PDPageTree;
import org.apache.pdfbox.pdmodel.common.PDRectangle; import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem; import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
import org.apache.pdfbox.pdmodel.interactive.pagenavigation.PDThreadBead; import org.apache.pdfbox.pdmodel.interactive.pagenavigation.PDThreadBead;
import org.apache.pdfbox.text.TextPosition; import org.apache.pdfbox.text.TextPosition;
@ -63,6 +56,10 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
private static float defaultDropThreshold = 2.5f; private static float defaultDropThreshold = 2.5f;
private static final boolean useCustomQuickSort; private static final boolean useCustomQuickSort;
@Getter
protected final List<PDMarkedContent> markedContents = new ArrayList<>();
protected final Deque<PDMarkedContent> currentMarkedContents = new ArrayDeque<>();
private static final Log LOG = LogFactory.getLog(PDFTextStripper.class); private static final Log LOG = LogFactory.getLog(PDFTextStripper.class);
// enable the ability to set the default indent/drop thresholds // enable the ability to set the default indent/drop thresholds
@ -196,6 +193,45 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
} }
/**
 * Opens a new marked-content sequence for the given tag and properties.
 * <p>
 * When no sequence is currently open the new one is recorded as a top-level entry of
 * {@code markedContents}; otherwise it is attached as a child of the innermost open
 * sequence. In both cases it becomes the innermost open sequence afterwards.
 *
 * @param tag        the tag of the marked-content sequence
 * @param properties the property dictionary passed along with the tag
 */
public void beginMarkedContentSequence(COSName tag, COSDictionary properties)
{
    PDMarkedContent opened = PDMarkedContent.create(tag, properties);
    // peek() yields null exactly when the stack of open sequences is empty
    PDMarkedContent enclosing = this.currentMarkedContents.peek();
    if (enclosing == null)
    {
        this.markedContents.add(opened);
    }
    else
    {
        enclosing.addMarkedContent(opened);
    }
    this.currentMarkedContents.push(opened);
}
/**
 * Closes the innermost open marked-content sequence, if there is one.
 * An end marker without a matching open sequence is ignored.
 */
@Override
public void endMarkedContentSequence()
{
    // pollFirst() removes the head like pop(), but is a no-op on an empty deque
    this.currentMarkedContents.pollFirst();
}
/**
 * Attaches an XObject to the innermost open marked-content sequence.
 * Nothing happens when no sequence is currently open.
 *
 * @param xobject the XObject to record
 */
public void xobject(PDXObject xobject)
{
    // peek() yields null exactly when no sequence is open
    PDMarkedContent innermost = this.currentMarkedContents.peek();
    if (innermost != null)
    {
        innermost.addXObject(xobject);
    }
}
/** /**
* This will return the text of a document. See writeText. <br> * This will return the text of a document. See writeText. <br>
* NOTE: The document must not be encrypted when coming into this method. * NOTE: The document must not be encrypted when coming into this method.
@ -877,7 +913,12 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
textList.add(text); textList.add(text);
} }
} }
if (!this.currentMarkedContents.isEmpty())
{
this.currentMarkedContents.peek().addText(text);
}
} }
} }
@ -2103,6 +2144,7 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
this.isHangingIndent = true; this.isHangingIndent = true;
} }
} }
} }

View File

@ -40,7 +40,7 @@ public class BuildDocumentGraphTest extends AbstractTest {
@SneakyThrows @SneakyThrows
protected Document buildGraph(String filename) { protected Document buildGraph(String filename) {
return buildGraph(filename, LayoutParsingType.REDACT_MANAGER); return buildGraph(filename, LayoutParsingType.DOCUMINE);
} }

View File

@ -26,7 +26,7 @@ public class DocumentGraphVisualizationTest extends BuildDocumentGraphTest {
// @Disabled // @Disabled
public void visualizeMetolachlor() { public void visualizeMetolachlor() {
String filename = "files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf"; String filename = "files/SOLICITA_VICTRATO-GOLD-II_Item 21_Mutacao_Genica (4).pdf";
visualizePdf(filename); visualizePdf(filename);
} }

View File

@ -22,8 +22,8 @@ public class ViewerDocumentTest extends BuildDocumentGraphTest {
LayoutGridService layoutGridService = new LayoutGridService(); LayoutGridService layoutGridService = new LayoutGridService();
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(layoutGridService); ViewerDocumentService viewerDocumentService = new ViewerDocumentService(layoutGridService);
String fileName = "files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf"; String fileName = "files/BASF/2013-1110704.pdf";
Document document = buildGraph(fileName, LayoutParsingType.REDACT_MANAGER); Document document = buildGraph(fileName, LayoutParsingType.DOCUMINE);
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf"; String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
try (var pdDocument = Loader.loadPDF(new ClassPathResource(fileName).getInputStream()); var out = new FileOutputStream(tmpFileName)) { try (var pdDocument = Loader.loadPDF(new ClassPathResource(fileName).getInputStream()); var out = new FileOutputStream(tmpFileName)) {
viewerDocumentService.createViewerDocument(pdDocument, document, out); viewerDocumentService.createViewerDocument(pdDocument, document, out);

View File

@ -4,17 +4,17 @@ import static org.assertj.core.api.Assertions.assertThat;
import java.awt.geom.Rectangle2D; import java.awt.geom.Rectangle2D;
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList; import java.util.*;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import java.util.stream.Stream;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
import org.apache.pdfbox.Loader; import org.apache.pdfbox.Loader;
import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.core.io.ClassPathResource; import org.springframework.core.io.ClassPathResource;
@ -60,7 +60,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
public ClassificationDocument buildClassificationDocument(PDDocument originDocument) { public ClassificationDocument buildClassificationDocument(PDDocument originDocument) {
ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER, ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE,
originDocument, originDocument,
new ImageServiceResponse(), new ImageServiceResponse(),
new TableServiceResponse()); new TableServiceResponse());
@ -492,6 +492,46 @@ public class PdfSegmentationServiceTest extends AbstractTest {
} }
/**
 * Visual debugging aid: parses the sample document, collects the "Footer" and "Header"
 * artifact bounding boxes of every page and draws them into {@code /tmp/test.pdf}.
 * The test makes no assertions; the result is meant to be inspected manually.
 */
@Test
public void testDoc30Page5dfgfdg() throws IOException {

    String fileName = "files/SOLICITA_VICTRATO-GOLD-II_Item 21_Mutacao_Genica (4).pdf";
    ClassPathResource pdfFileResource = new ClassPathResource(fileName);

    // Close the loaded PDF once the rectangles are extracted and drawn.
    try (PDDocument pdDocument = Loader.loadPDF(pdfFileResource.getInputStream())) {
        ClassificationDocument document = buildClassificationDocument(pdDocument);

        var rects = document.getPages().stream()
                .map(page -> Stream.concat(getMarkupRectangle(page, "Footer").stream(), getMarkupRectangle(page, "Header").stream()).toList())
                .toList();

        PdfDraw.drawRectanglesPerPage(fileName, rects, "/tmp/test.pdf");
    }
}
/**
 * Collects the bounding boxes declared on a page's "Artifact" marked-content sequences
 * whose "Subtype" property equals the requested value (e.g. "Header" or "Footer").
 * <p>
 * Each rectangle is built from the sequence's "BBox" array, read as
 * {@code [x0, y0, x1, y1]} and converted to {@code (x, y, width, height)}.
 * Marked content without a property dictionary, with a non-name "Subtype", or with a
 * missing or too short "BBox" is skipped instead of raising an exception.
 *
 * @param page    the page whose marked contents are inspected
 * @param subtype the artifact subtype to match
 * @return the matching bounding boxes; an empty list when the page has no marked contents
 */
private List<Rectangle2D> getMarkupRectangle(ClassificationPage page, String subtype) {

    if (page.getMarkedContents() == null) {
        return new ArrayList<>();
    }

    return page.getMarkedContents().stream()
            // constant-first equals: the tag of a marked content may be null
            .filter(m -> "Artifact".equals(m.getTag()))
            .map(PDMarkedContent::getProperties)
            // sequences opened without a property list have no dictionary at all
            .filter(p -> p != null)
            .filter(p -> p.getItem("Subtype") instanceof COSName && ((COSName) p.getItem("Subtype")).getName().equals(subtype))
            .map(p -> p.getItem("BBox"))
            // artifacts are not required to carry a BBox; skip those that do not
            .filter(b -> b instanceof COSArray)
            .map(b -> ((COSArray) b).toFloatArray())
            // all four corner coordinates are required below
            .filter(f -> f.length >= 4)
            .map(f -> new Rectangle2D.Float(f[0], f[1], f[2] - f[0], f[3] - f[1]))
            .collect(Collectors.toList());
}
private void validateTable(ClassificationDocument document, int tableIndex, int colCount, int rowCount, int emptyCellsCountCorrect, int emptyCellsCountIncorrect) { private void validateTable(ClassificationDocument document, int tableIndex, int colCount, int rowCount, int emptyCellsCountCorrect, int emptyCellsCountIncorrect) {
TablePageBlock table = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(tableIndex); TablePageBlock table = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(tableIndex);

View File

@ -17,11 +17,10 @@ import lombok.SneakyThrows;
class PageContentExtractorTest { class PageContentExtractorTest {
@Test @Test
@Disabled
@SneakyThrows @SneakyThrows
public void testTextPositionSequenceExtraction() { public void testTextPositionSequenceExtraction() {
String fileName = "files/invisible_tables/test-two-pages_ocred.pdf"; String fileName = "files/SOLICITA_VICTRATO-GOLD-II_Item 21_Mutacao_Genica (4).pdf";
var tmpFileName = Path.of("/tmp/").resolve(Path.of(fileName).getFileName() + "_TEXT_POSITION_SEQUENCES.pdf").toString(); var tmpFileName = Path.of("/tmp/").resolve(Path.of(fileName).getFileName() + "_TEXT_POSITION_SEQUENCES.pdf").toString();
List<PageContents> textPositionPerPage = PageContentExtractor.getSortedPageContents(fileName); List<PageContents> textPositionPerPage = PageContentExtractor.getSortedPageContents(fileName);