diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java index 9063fb2..686b383 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java @@ -30,6 +30,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageB import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.CvTableParsingAdapter; import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.ImageServiceResponseAdapter; +import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.TableExtractorResponseAdapter; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableCells; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorResponse; @@ -64,6 +65,7 @@ public class LayoutParsingPipeline { private final ImageServiceResponseAdapter imageServiceResponseAdapter; private final CvTableParsingAdapter cvTableParsingAdapter; private final LayoutParsingStorageService layoutParsingStorageService; + private final TableExtractorResponseAdapter tableExtractorResponseAdapter; private final SectionsBuilderService sectionsBuilderService; private final TaasClassificationService taasClassificationService; private final RedactManagerClassificationService 
redactManagerClassificationService; @@ -89,7 +91,7 @@ public class LayoutParsingPipeline { } TableExtractorResponse tableExtractorResponse = new TableExtractorResponse(); - if(layoutParsingRequest.tableExtractorFileId().isPresent()) { + if (layoutParsingRequest.tableExtractorFileId().isPresent()) { tableExtractorResponse = layoutParsingStorageService.getExtractedTableFile(layoutParsingRequest.tableExtractorFileId().get()); } @@ -98,7 +100,11 @@ public class LayoutParsingPipeline { tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId().get()); } - ClassificationDocument classificationDocument = parseLayout(layoutParsingRequest.layoutParsingType(), originDocument, imageServiceResponse, tableServiceResponse, tableExtractorResponse); + ClassificationDocument classificationDocument = parseLayout(layoutParsingRequest.layoutParsingType(), + originDocument, + imageServiceResponse, + tableServiceResponse, + tableExtractorResponse); Document documentGraph = DocumentGraphFactory.buildDocumentGraph(classificationDocument); int numberOfPages = originDocument.getNumberOfPages(); @@ -106,9 +112,9 @@ public class LayoutParsingPipeline { layoutParsingStorageService.storeDocumentData(layoutParsingRequest, DocumentDataMapper.toDocumentData(documentGraph)); layoutParsingStorageService.storeSimplifiedText(layoutParsingRequest, simplifiedSectionTextService.toSimplifiedText(documentGraph)); + Map> extractedTableCells = tableExtractorResponseAdapter.buildExtractedTablesPerPage(tableExtractorResponse); try (var out = new ByteArrayOutputStream()) { - viewerDocumentService.createViewerDocument(originDocument, documentGraph, out, false); - viewerDocumentService.drawExtractedTables(originDocument,documentGraph,out,tableExtractorResponse.getExtractedTableData()); + viewerDocumentService.createViewerDocument(originDocument, documentGraph, out,extractedTableCells ,false); layoutParsingStorageService.storeViewerDocument(layoutParsingRequest, 
out); } @@ -164,12 +170,12 @@ public class LayoutParsingPipeline { public ClassificationDocument parseLayout(LayoutParsingType layoutParsingType, PDDocument originDocument, ImageServiceResponse imageServiceResponse, - TableServiceResponse tableServiceResponse, TableExtractorResponse tableExtractorResponse) { + TableServiceResponse tableServiceResponse, + TableExtractorResponse tableExtractorResponse) { Map> pdfTableCells = cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse); Map> pdfImages = imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse); - //Hier muss ich die table cells einlesen - + Map> extractedTableCells = tableExtractorResponseAdapter.buildExtractedTablesPerPage(tableExtractorResponse); ClassificationDocument classificationDocument = new ClassificationDocument(); List classificationPages = new ArrayList<>(); @@ -252,9 +258,9 @@ public class LayoutParsingPipeline { private void increaseDocumentStatistics(ClassificationPage classificationPage, ClassificationDocument document) { - if (!classificationPage.isLandscape()) { - document.getFontSizeCounter().addAll(classificationPage.getFontSizeCounter().getCountPerValue()); - } + if (!classificationPage.isLandscape()) { + document.getFontSizeCounter().addAll(classificationPage.getFontSizeCounter().getCountPerValue()); + } document.getFontCounter().addAll(classificationPage.getFontCounter().getCountPerValue()); document.getTextHeightCounter().addAll(classificationPage.getTextHeightCounter().getCountPerValue()); document.getFontStyleCounter().addAll(classificationPage.getFontStyleCounter().getCountPerValue()); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingStorageService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingStorageService.java index 27b33b9..7a713a0 100644 --- 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingStorageService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingStorageService.java @@ -23,7 +23,6 @@ import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.Si import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.ResearchDocumentData; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse; -import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorData; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorResponse; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse; import com.knecon.fforesight.tenantcommons.TenantContext; @@ -64,9 +63,11 @@ public class LayoutParsingStorageService { } } + public TableExtractorResponse getExtractedTableFile(String storageId) throws IOException { + try (InputStream inputStream = getObject(storageId)) { - TableExtractorResponse tableExtractorResponse = objectMapper.readValue(inputStream,TableExtractorResponse.class); + TableExtractorResponse tableExtractorResponse = objectMapper.readValue(inputStream, TableExtractorResponse.class); inputStream.close(); return tableExtractorResponse; } @@ -93,7 +94,6 @@ public class LayoutParsingStorageService { } - public void storeResearchDocumentData(LayoutParsingRequest layoutParsingRequest, ResearchDocumentData researchDocumentData) { storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.researchDocumentStorageId(), researchDocumentData); diff --git 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationPage.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationPage.java index a654636..25c49f2 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationPage.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationPage.java @@ -14,7 +14,6 @@ import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFre import lombok.Data; import lombok.NonNull; import lombok.RequiredArgsConstructor; -import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent; @Data @RequiredArgsConstructor diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/PageContents.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/PageContents.java index f9d00ef..5f48353 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/PageContents.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/PageContents.java @@ -19,4 +19,5 @@ public class PageContents { Rectangle2D cropBox; Rectangle2D mediaBox; List rulings; + } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/Boundary.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/Boundary.java index 
28de929..da0a88b 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/Boundary.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/Boundary.java @@ -108,11 +108,13 @@ public class Boundary implements Comparable { return splitBoundaries; } + public IntStream intStream() { return IntStream.range(start, end); } + public static Boundary merge(Collection boundaries) { int minStart = boundaries.stream().mapToInt(Boundary::start).min().orElseThrow(IllegalArgumentException::new); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Document.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Document.java index 0df92c2..e8e43d13 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Document.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Document.java @@ -105,6 +105,7 @@ public class Document implements GenericSemanticNode { return streamAllSubNodes().collect(Collectors.groupingBy(SemanticNode::getType, Collectors.counting())); } + @Override public String toString() { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Table.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Table.java index d62b4cf..7855db0 100644 --- 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Table.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Table.java @@ -207,6 +207,7 @@ public class Table implements SemanticNode { return IntStream.range(0, numberOfCols).boxed().map(col -> getCell(row, col)); } + /** * Streams all TableCells row-wise and filters them with header == true. * diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/textblock/AtomicTextBlock.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/textblock/AtomicTextBlock.java index 33d9427..ecb0679 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/textblock/AtomicTextBlock.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/textblock/AtomicTextBlock.java @@ -109,10 +109,7 @@ public class AtomicTextBlock implements TextBlock { } - public static AtomicTextBlock fromAtomicTextBlockData(DocumentTextData documentTextData, - DocumentPositionData documentPositionData, - SemanticNode parent, - Page page) { + public static AtomicTextBlock fromAtomicTextBlockData(DocumentTextData documentTextData, DocumentPositionData documentPositionData, SemanticNode parent, Page page) { return AtomicTextBlock.builder() .id(documentTextData.getId()) diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/TablePageBlock.java 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/TablePageBlock.java index 1295424..b874f2d 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/TablePageBlock.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/TablePageBlock.java @@ -1,14 +1,12 @@ package com.knecon.fforesight.service.layoutparser.processor.model.table; import java.awt.geom.Point2D; -import java.awt.geom.Rectangle2D; import java.util.ArrayList; import java.util.Collections; import java.util.HashSet; import java.util.List; import java.util.Set; import java.util.TreeMap; -import java.util.stream.Collectors; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType; @@ -50,6 +48,7 @@ public class TablePageBlock extends AbstractPageBlock { return getColCount() == 0 || getRowCount() == 0; } + public List> getRows() { if (rows == null) { @@ -276,21 +275,17 @@ public class TablePageBlock extends AbstractPageBlock { } - public boolean intersects(Cell cell1, Cell cell2) { + if (cell1.getHeight() <= 0 || cell2.getHeight() <= 0) { return false; } double x0 = cell1.getX() + 2; double y0 = cell1.getY() + 2; - return (cell2.x + cell2.width > x0 && - cell2.y + cell2.height > y0 && - cell2.x < x0 + cell1.getWidth() -2 && - cell2.y < y0 + cell1.getHeight() -2); + return (cell2.x + cell2.width > x0 && cell2.y + cell2.height > y0 && cell2.x < x0 + cell1.getWidth() - 2 && cell2.y < y0 + cell1.getHeight() - 2); } - @Override public String getText() { @@ -328,8 +323,6 @@ public class TablePageBlock extends AbstractPageBlock { } - - public String getTextAsHtml() { StringBuilder sb = new StringBuilder(); diff --git 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/SearchableText.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/SearchableText.java index c0ef4e3..d793e73 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/SearchableText.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/SearchableText.java @@ -2,6 +2,7 @@ package com.knecon.fforesight.service.layoutparser.processor.model.text; import java.util.ArrayList; import java.util.List; + import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities; import lombok.Getter; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java index 0442af6..ac2ae98 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java @@ -73,7 +73,7 @@ public class TextPageBlock extends AbstractPageBlock { return sequences.get(0).getPageWidth(); } - + public static TextPageBlock merge(List textBlocksToMerge) { @@ -82,6 +82,7 @@ public class TextPageBlock extends AbstractPageBlock { return fromTextPositionSequences(sequences); } + public static TextPageBlock fromTextPositionSequences(List wordBlockList) { TextPageBlock textBlock = null; @@ -133,7 +134,6 @@ public class 
TextPageBlock extends AbstractPageBlock { } - /** * Returns the minX value in pdf coordinate system. * Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation. diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPositionSequence.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPositionSequence.java index 7b776dc..b21076f 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPositionSequence.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPositionSequence.java @@ -234,6 +234,7 @@ public class TextPositionSequence implements CharSequence { @JsonIgnore @JsonAttribute(ignore = true) public String getFontStyle() { + if (textPositions.get(0).getFontName() == null) { return "standard"; } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/adapter/ImageServiceResponseAdapter.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/adapter/ImageServiceResponseAdapter.java index 55c345c..5b1a61d 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/adapter/ImageServiceResponseAdapter.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/adapter/ImageServiceResponseAdapter.java @@ -9,10 +9,10 @@ import java.util.Map; import org.springframework.stereotype.Service; -import 
com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; -import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType; +import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage; +import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse; import lombok.RequiredArgsConstructor; @@ -20,8 +20,7 @@ import lombok.RequiredArgsConstructor; @RequiredArgsConstructor public class ImageServiceResponseAdapter { - - public Map> buildClassifiedImagesPerPage(ImageServiceResponse imageServiceResponse ) { + public Map> buildClassifiedImagesPerPage(ImageServiceResponse imageServiceResponse) { Map> images = new HashMap<>(); imageServiceResponse.getData().forEach(imageMetadata -> { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/adapter/TableExtractorResponseAdapter.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/adapter/TableExtractorResponseAdapter.java new file mode 100644 index 0000000..e6c74d2 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/adapter/TableExtractorResponseAdapter.java @@ -0,0 +1,67 @@ +package com.knecon.fforesight.service.layoutparser.processor.python_api.adapter; + +import java.awt.geom.Rectangle2D; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Locale; +import java.util.Map; + +import org.springframework.stereotype.Service; + +import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType; 
+import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
+import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
+import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.ExtractedTable;
+import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.ExtractedTableData;
+import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableCells;
+import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableData;
+import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorResponse;
+
+import lombok.RequiredArgsConstructor;
+
+@Service
+@RequiredArgsConstructor
+public class TableExtractorResponseAdapter {
+
+    /** Groups the cells built by {@link #convertTableCells} by the page number of each response entry; a response without data yields an empty map. */
+    public Map<Integer, List<TableCells>> buildExtractedTablesPerPage(TableExtractorResponse tableExtractorResponse) {
+
+        Map<Integer, List<TableCells>> tableCells = new HashMap<>();
+        if (tableExtractorResponse.getData() == null) {
+            return tableCells; // a default-constructed response (no table extractor file was supplied) has no data list
+        }
+        tableExtractorResponse.getData().forEach(tableData -> tableCells.computeIfAbsent(tableData.getPage_number(), page -> new ArrayList<>())
+                .addAll(convertTableCells(tableData.getTables())));
+        return tableCells;
+    }
+
+    /**
+     * Emits, per extracted table, one cell for the table bbox followed by one cell per entry of its objects; bbox indices 0-3 are used as x0, y0, x1, y1.
+     */
+    public List<TableCells> convertTableCells(List<ExtractedTable> tableObjects) {
+
+        List<TableCells> parsedTableCells = new ArrayList<>();
+        tableObjects.forEach(t -> {
+            TableCells tableCells = new TableCells();
+            tableCells.setX0(t.getTable().getBbox().get(0));
+            tableCells.setX1(t.getTable().getBbox().get(2));
+            tableCells.setY0(t.getTable().getBbox().get(1));
+            tableCells.setY1(t.getTable().getBbox().get(3));
+            tableCells.setWidth(tableCells.getX1() - tableCells.getX0());
+            tableCells.setHeight(tableCells.getY1() - tableCells.getY0());
+            parsedTableCells.add(tableCells);
+            t.getObjects().forEach(o -> {
+                TableCells objectCell = new TableCells(); // filled from the object's own bbox, not the enclosing table's (assumes object entries expose getBbox() like the table entry - confirm)
+                objectCell.setX0(o.getBbox().get(0));
+                objectCell.setX1(o.getBbox().get(2));
+                objectCell.setY0(o.getBbox().get(1));
+                objectCell.setY1(o.getBbox().get(3));
+                objectCell.setWidth(objectCell.getX1() - objectCell.getX0());
+                objectCell.setHeight(objectCell.getY1() - objectCell.getY0());
+                parsedTableCells.add(objectCell);
+            });
+        });
+        return parsedTableCells;
+    }
+}
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/model/table/ExtractedTable.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/model/table/ExtractedTable.java index df5838a..cd83007 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/model/table/ExtractedTable.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/model/table/ExtractedTable.java @@ -1,5 +1,7 @@ package com.knecon.fforesight.service.layoutparser.processor.python_api.model.table; +import java.util.List; + import lombok.AllArgsConstructor; import lombok.Builder; import lombok.Data; @@ -10,8 +12,9 @@ import lombok.NoArgsConstructor; @NoArgsConstructor @AllArgsConstructor public class ExtractedTable { - private boolean rotated; - private ExtractedTableData extractedTableValue; + private boolean rotated; + private ExtractedTableData table; + private List objects; } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/model/table/ExtractedTableData.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/model/table/ExtractedTableData.java index f3756a3..76b9343 100644 --- 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/model/table/ExtractedTableData.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/model/table/ExtractedTableData.java @@ -12,8 +12,9 @@ import lombok.NoArgsConstructor; @NoArgsConstructor @AllArgsConstructor public class ExtractedTableData { + private String label; private float score; - private List boundingBox; + private List bbox; } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/model/table/TableExtractorData.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/model/table/TableExtractorData.java index 6c937fe..096df86 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/model/table/TableExtractorData.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/model/table/TableExtractorData.java @@ -13,13 +13,12 @@ import lombok.NoArgsConstructor; @AllArgsConstructor public class TableExtractorData { - private int pageNumber; - private int pageRotation; - private int imageHeigth; - private int imageWidth; - private float pdfHeight; - private float pdfWidth; + private int page_number; + private int page_rotation; + private int image_heigth; + private int image_width; + private float pdf_height; + private float pdf_width; private int dpi; private List tables; - private List objects; } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/model/table/TableExtractorResponse.java 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/model/table/TableExtractorResponse.java index 335afa4..9baa449 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/model/table/TableExtractorResponse.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/model/table/TableExtractorResponse.java @@ -11,12 +11,13 @@ import lombok.NoArgsConstructor; @Builder @NoArgsConstructor @AllArgsConstructor -public class TableExtractorResponse { private String dossierId; +public class TableExtractorResponse { + + private String dossierId; private String fileId; private String targetFileExtension; private String responseFileExtension; private String X_TENANT_ID; - private List extractedTableData; - + private List data; } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/BodyTextFrameService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/BodyTextFrameService.java index 5942443..63ee3c3 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/BodyTextFrameService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/BodyTextFrameService.java @@ -25,6 +25,7 @@ public class BodyTextFrameService { private static final float RULING_HEIGHT_THRESHOLD = 0.15f; // multiplied with page height. Header/Footer Rulings must be within that border of the page. private static final float RULING_WIDTH_THRESHOLD = 0.75f; // multiplied with page width. Header/Footer Rulings must be at least that wide. 
+ public void setBodyTextFrames(ClassificationDocument classificationDocument, LayoutParsingType layoutParsingType) { Rectangle bodyTextFrame = calculateBodyTextFrame(classificationDocument.getPages(), classificationDocument.getFontSizeCounter(), false, layoutParsingType); @@ -155,8 +156,9 @@ public class BodyTextFrameService { continue; } - if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) - || MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER)) { + if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) || MarkedContentUtils.intersects(textBlock, + page.getMarkedContentBboxPerType(), + MarkedContentUtils.FOOTER)) { continue; } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/DividingColumnDetectionService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/DividingColumnDetectionService.java index 7c062bc..f3e70b2 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/DividingColumnDetectionService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/DividingColumnDetectionService.java @@ -22,7 +22,6 @@ public class DividingColumnDetectionService { public List detectColumns(PageContents pageContents) { - if (pageContents.getSortedTextPositionSequences().size() < 2) { return List.of(pageContents.getCropBox()); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/GapDetectionService.java 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/GapDetectionService.java index ac7db1d..5200b52 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/GapDetectionService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/GapDetectionService.java @@ -72,11 +72,13 @@ public class GapDetectionService { return mirrorY(RectangleTransformations.toRectangle2D(textPosition.getRectangle())); } + private static Rectangle2D mirrorY(Rectangle2D rectangle2D) { return new Rectangle2D.Double(rectangle2D.getX(), Math.min(rectangle2D.getMinY(), rectangle2D.getMaxY()), rectangle2D.getWidth(), Math.abs(rectangle2D.getHeight())); } + private static void addGapToLine(Rectangle2D currentTextPosition, Rectangle2D previousTextPosition, XGapsContext context) { context.gapsInCurrentLine.add(new Rectangle2D.Double(previousTextPosition.getMaxX(), diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/GapsAcrossLinesService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/GapsAcrossLinesService.java index 94bcce2..6f80979 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/GapsAcrossLinesService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/GapsAcrossLinesService.java @@ -6,7 +6,6 @@ import java.util.LinkedList; import java.util.List; import java.util.Queue; import java.util.stream.Stream; -import com.iqser.red.commons.jackson.ObjectMapperFactory; import 
com.knecon.fforesight.service.layoutparser.processor.model.GapInformation; @@ -51,7 +50,9 @@ public class GapsAcrossLinesService { } return columnFactory.outputGaps.stream() - .filter(gapAcrossLines -> columnFactory.outputGaps.stream().filter(gapAcrossLines::intersectsX).noneMatch(gapAcrossLines1 -> gapAcrossLines1.lineCount > gapAcrossLines.lineCount)) + .filter(gapAcrossLines -> columnFactory.outputGaps.stream() + .filter(gapAcrossLines::intersectsX) + .noneMatch(gapAcrossLines1 -> gapAcrossLines1.lineCount > gapAcrossLines.lineCount)) .filter(gapAcrossLines -> Math.abs(gapAcrossLines.rectangle2D.getMinX() - mainBodyTextFrame.getMinX()) > DISTANCE_TO_BORDER_THRESHOLD) .filter(gapAcrossLines -> Math.abs(gapAcrossLines.rectangle2D.getMaxX() - mainBodyTextFrame.getMaxX()) > DISTANCE_TO_BORDER_THRESHOLD) .map(GapAcrossLines::getRectangle2D) diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/LineDetectionService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/LineDetectionService.java index 8b14767..bf1faf5 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/LineDetectionService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/LineDetectionService.java @@ -6,8 +6,8 @@ import java.util.List; import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation; import com.knecon.fforesight.service.layoutparser.processor.model.LineInformation; -import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; +import 
com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; import lombok.AllArgsConstructor; import lombok.Getter; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/MainBodyTextFrameExtractionService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/MainBodyTextFrameExtractionService.java index 0cac3ee..6dd9928 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/MainBodyTextFrameExtractionService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/MainBodyTextFrameExtractionService.java @@ -16,8 +16,7 @@ public class MainBodyTextFrameExtractionService { public Rectangle2D calculateMainBodyTextFrame(LineInformation lineInformation) { - Rectangle2D mainBodyTextFrame = lineInformation.getLineBBox().stream() - .collect(RectangleTransformations.collectBBox()); + Rectangle2D mainBodyTextFrame = lineInformation.getLineBBox().stream().collect(RectangleTransformations.collectBBox()); return RectangleTransformations.pad(mainBodyTextFrame, mainBodyTextFrame.getWidth() * TEXT_FRAME_PAD_WIDTH, mainBodyTextFrame.getHeight() * TEXT_FRAME_PAD_HEIGHT); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/PageContentExtractor.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/PageContentExtractor.java index dde3b94..ff5fb01 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/PageContentExtractor.java +++ 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/PageContentExtractor.java @@ -52,7 +52,7 @@ public class PageContentExtractor { stripper.getRulings())); } } - + return textPositionSequencesPerPage; } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/SimplifiedSectionTextService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/SimplifiedSectionTextService.java index eb1e824..867b64d 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/SimplifiedSectionTextService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/SimplifiedSectionTextService.java @@ -5,9 +5,9 @@ import java.util.List; import org.springframework.stereotype.Service; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.SimplifiedSectionText; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.SimplifiedText; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section; -import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.SimplifiedText; @Service public class SimplifiedSectionTextService { @@ -23,4 +23,5 @@ public class SimplifiedSectionTextService { return SimplifiedSectionText.builder().sectionNumber(section.getTreeId().get(0)).text(section.getTextBlock().getSearchText()).build(); } + } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/TaasBlockificationService.java 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/TaasBlockificationService.java index 287d2ba..ea2f909 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/TaasBlockificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/TaasBlockificationService.java @@ -1,9 +1,20 @@ package com.knecon.fforesight.service.layoutparser.processor.services.blockification; - // TODO: figure out, why this fails the build // import static com.knecon.fforesight.service.layoutparser.processor.services.factory.SearchTextWithTextPositionFactory.HEIGHT_PADDING; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; +import java.util.Set; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import java.util.stream.Stream; + +import org.springframework.stereotype.Service; + import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; import com.knecon.fforesight.service.layoutparser.processor.model.Orientation; @@ -11,12 +22,6 @@ import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil; -import org.springframework.stereotype.Service; - -import java.util.*; -import java.util.regex.Matcher; -import java.util.regex.Pattern; -import java.util.stream.Stream; @Service @SuppressWarnings("all") @@ -83,13 
+88,13 @@ public class TaasBlockificationService { continue; } - Matcher listIdentifierPattern = listIdentifier.matcher(currentTextBlock.getText()); boolean isListIdentifier = listIdentifierPattern.find(); boolean yGap = Math.abs(currentTextBlock.getPdfMaxY() - previousTextBlock.getPdfMinY()) < previousTextBlock.getMostPopularWordHeight() * Y_GAP_SPLIT_HEIGHT_MODIFIER; - boolean sameFont = previousTextBlock.getMostPopularWordFont().equals(currentTextBlock.getMostPopularWordFont()) && previousTextBlock.getMostPopularWordFontSize() == currentTextBlock.getMostPopularWordFontSize(); + boolean sameFont = previousTextBlock.getMostPopularWordFont() + .equals(currentTextBlock.getMostPopularWordFont()) && previousTextBlock.getMostPopularWordFontSize() == currentTextBlock.getMostPopularWordFontSize(); // boolean yGap = previousTextBlock != null && currentTextBlock.getMinYDirAdj() - maxY > Math.min(word.getHeight(), prev.getHeight()) * Y_GAP_SPLIT_HEIGHT_MODIFIER; boolean alignsXRight = Math.abs(currentTextBlock.getPdfMaxX() - previousTextBlock.getPdfMaxX()) < X_ALIGNMENT_THRESHOLD; @@ -119,8 +124,9 @@ public class TaasBlockificationService { } alreadyMerged.add(textPageBlock); textBlocksToMerge.add(Stream.concat(Stream.of(textPageBlock), - textPageBlocks.stream().filter(textPageBlock2 -> textPageBlock.almostIntersects(textPageBlock2, INTERSECTS_Y_THRESHOLD, 0) && !alreadyMerged.contains(textPageBlock2)).peek(alreadyMerged::add)) - .toList()); + textPageBlocks.stream() + .filter(textPageBlock2 -> textPageBlock.almostIntersects(textPageBlock2, INTERSECTS_Y_THRESHOLD, 0) && !alreadyMerged.contains(textPageBlock2)) + .peek(alreadyMerged::add)).toList()); } return textBlocksToMerge.stream().map(TextPageBlock::merge).toList(); } @@ -163,8 +169,7 @@ public class TaasBlockificationService { while (itty.hasNext()) { TextPageBlock block = (TextPageBlock) itty.next(); - if (previous != null && previous.getOrientation().equals(Orientation.LEFT) && 
block.getOrientation().equals(Orientation.LEFT) && equalsWithThreshold( - block.getMaxY(), + if (previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation().equals(Orientation.LEFT) && equalsWithThreshold(block.getMaxY(), previous.getMaxY()) || previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation() .equals(Orientation.RIGHT) && equalsWithThreshold(block.getMaxY(), previous.getMaxY())) { previous.add(block); @@ -189,7 +194,6 @@ public class TaasBlockificationService { TextPositionSequence prev = null; // TODO: make static final constant - boolean wasSplitted = false; Float splitX1 = null; for (TextPositionSequence word : textPositions) { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java index d622fc8..0df4dbe 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java @@ -5,7 +5,6 @@ import java.util.Locale; import java.util.regex.Matcher; import java.util.regex.Pattern; -import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils; import org.springframework.stereotype.Service; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; @@ -13,6 +12,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification import 
com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils; import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils; import lombok.RequiredArgsConstructor; @@ -63,16 +63,16 @@ public class DocuMineClassificationService { textBlock.setClassification(PageBlockType.OTHER); return; } - if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) - || PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter() - .getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular()) - ) { + if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) || PositionUtils.isOverBodyTextFrame(bodyTextFrame, + textBlock, + page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter() + .getMostPopular())) { textBlock.setClassification(PageBlockType.HEADER); - } else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER) - || PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter() - .getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular()) - ) { + } else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER) || PositionUtils.isUnderBodyTextFrame(bodyTextFrame, + textBlock, + page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null || textBlock.getHighestFontSize() <= 
document.getFontSizeCounter() + .getMostPopular())) { textBlock.setClassification(PageBlockType.FOOTER); } else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks() diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/RedactManagerClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/RedactManagerClassificationService.java index 3e90c57..9ea2c95 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/RedactManagerClassificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/RedactManagerClassificationService.java @@ -3,7 +3,6 @@ package com.knecon.fforesight.service.layoutparser.processor.services.classifica import java.util.List; import java.util.regex.Pattern; -import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils; import org.springframework.stereotype.Service; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; @@ -11,6 +10,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils; import 
com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils; import lombok.RequiredArgsConstructor; @@ -21,7 +21,6 @@ import lombok.extern.slf4j.Slf4j; @RequiredArgsConstructor public class RedactManagerClassificationService { - public void classifyDocument(ClassificationDocument document) { List headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular(); @@ -52,14 +51,16 @@ public class RedactManagerClassificationService { textBlock.setClassification(PageBlockType.OTHER); return; } - if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) - || PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter() - .getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) { + if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) || PositionUtils.isOverBodyTextFrame(bodyTextFrame, + textBlock, + page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter() + .getMostPopular())) { textBlock.setClassification(PageBlockType.HEADER); - } else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER) - || PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter() - .getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) { + } else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER) || PositionUtils.isUnderBodyTextFrame(bodyTextFrame, + textBlock, + page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter() + .getMostPopular())) { 
textBlock.setClassification(PageBlockType.FOOTER); } else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks() diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/TaasClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/TaasClassificationService.java index 7a91be1..78a5339 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/TaasClassificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/TaasClassificationService.java @@ -3,7 +3,6 @@ package com.knecon.fforesight.service.layoutparser.processor.services.classifica import java.util.List; import java.util.regex.Pattern; -import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils; import org.springframework.stereotype.Service; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; @@ -12,6 +11,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.services.BodyTextFrameService; +import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils; import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils; import 
lombok.RequiredArgsConstructor; @@ -27,7 +27,6 @@ public class TaasClassificationService { public void classifyDocument(ClassificationDocument document) { - List headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular(); log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue()); @@ -57,11 +56,13 @@ public class TaasClassificationService { textBlock.setClassification(PageBlockType.OTHER); return; } - if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) - || PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation())) { + if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) || PositionUtils.isOverBodyTextFrame(bodyTextFrame, + textBlock, + page.getRotation())) { textBlock.setClassification(PageBlockType.HEADER); - } else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER) - || PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation())) { + } else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER) || PositionUtils.isUnderBodyTextFrame(bodyTextFrame, + textBlock, + page.getRotation())) { textBlock.setClassification(PageBlockType.FOOTER); } else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks() diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java index 
6bde310..a9061c2 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java @@ -18,8 +18,6 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationFooter; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationHeader; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; -import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage; -import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Footer; @@ -31,6 +29,8 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Pa import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Paragraph; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section; import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.utils.IdBuilder; import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionOperations; diff --git 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SearchTextWithTextPositionFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SearchTextWithTextPositionFactory.java index 7ebc737..6d1dace 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SearchTextWithTextPositionFactory.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SearchTextWithTextPositionFactory.java @@ -8,10 +8,10 @@ import java.util.List; import java.util.Locale; import java.util.Objects; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary; import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; -import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary; import lombok.experimental.UtilityClass; @@ -110,6 +110,7 @@ public class SearchTextWithTextPositionFactory { return context.stringIdx - context.lastHyphenIdx < MAX_HYPHEN_LINEBREAK_DISTANCE; } + private static List mergeToBoundaries(List integers) { if (integers.isEmpty()) { @@ -125,8 +126,9 @@ public class SearchTextWithTextPositionFactory { } end = current + 1; } - if (boundaries.isEmpty()) + if (boundaries.isEmpty()) { boundaries.add(new Boundary(start, end)); + } return boundaries; } @@ -138,6 +140,7 @@ public class SearchTextWithTextPositionFactory { } } + private boolean isLineBreak(RedTextPosition currentTextPosition, RedTextPosition previousTextPosition) { return 
Objects.equals(currentTextPosition.getUnicode(), "\n") || isDeltaYLargerThanTextHeight(currentTextPosition, previousTextPosition); @@ -163,17 +166,7 @@ public class SearchTextWithTextPositionFactory { private boolean isHyphen(String unicodeCharacter) { - return Objects.equals(unicodeCharacter, "-") || // - Objects.equals(unicodeCharacter, "~") || // - Objects.equals(unicodeCharacter, "‐") || // - Objects.equals(unicodeCharacter, "‒") || // - Objects.equals(unicodeCharacter, "⁻") || // - Objects.equals(unicodeCharacter, "−") || // - Objects.equals(unicodeCharacter, "﹣") || // - Objects.equals(unicodeCharacter, "゠") || // - Objects.equals(unicodeCharacter, "⁓") || // - Objects.equals(unicodeCharacter, "‑") || // - Objects.equals(unicodeCharacter, "\u00AD"); + return false; } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SectionNodeFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SectionNodeFactory.java index 7bd82e2..4face6a 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SectionNodeFactory.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SectionNodeFactory.java @@ -11,12 +11,12 @@ import java.util.Map; import java.util.Set; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; -import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage; -import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock; -import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.GenericSemanticNode; 
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section; +import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage; +import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.utils.TableMergingUtility; import lombok.experimental.UtilityClass; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/TableNodeFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/TableNodeFactory.java index c00edd1..8d8a660 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/TableNodeFactory.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/TableNodeFactory.java @@ -8,15 +8,15 @@ import java.util.Set; import java.util.stream.Collectors; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; -import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; -import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock; -import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.GenericSemanticNode; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode; import 
com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell; import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; +import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionOperations; import lombok.experimental.UtilityClass; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/TextBlockFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/TextBlockFactory.java index 6a0268c..9247248 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/TextBlockFactory.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/TextBlockFactory.java @@ -2,10 +2,10 @@ package com.knecon.fforesight.service.layoutparser.processor.services.factory; import java.util.List; -import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode; import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import lombok.AccessLevel; import lombok.experimental.FieldDefaults; diff 
--git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/mapper/DocumentGraphMapper.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/mapper/DocumentGraphMapper.java index c51f9ec..13fa8e9 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/mapper/DocumentGraphMapper.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/mapper/DocumentGraphMapper.java @@ -7,11 +7,11 @@ import java.util.List; import java.util.Map; import java.util.NoSuchElementException; -import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData; -import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData; -import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPage; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData; import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Footer; diff --git 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/mapper/PropertiesMapper.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/mapper/PropertiesMapper.java index 329bd40..f6c66cb 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/mapper/PropertiesMapper.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/mapper/PropertiesMapper.java @@ -1,7 +1,6 @@ package com.knecon.fforesight.service.layoutparser.processor.services.mapper; import java.awt.geom.Rectangle2D; -import java.util.Collections; import java.util.HashMap; import java.util.Locale; import java.util.Map; @@ -9,7 +8,6 @@ import java.util.Map; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Image; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType; -import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFLinesTextStripper.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFLinesTextStripper.java index 09a8eb2..1a04cba 100644 --- 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFLinesTextStripper.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFLinesTextStripper.java @@ -329,6 +329,7 @@ public class PDFLinesTextStripper extends PDFTextStripper { .getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()) < maximumGapSize; } + @Override public String getText(PDDocument doc) throws IOException { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFTextStripper.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFTextStripper.java index 1ca5b43..b3bed93 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFTextStripper.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFTextStripper.java @@ -25,10 +25,23 @@ import java.io.StringWriter; import java.io.Writer; import java.text.Bidi; import java.text.Normalizer; -import java.util.*; +import java.util.ArrayDeque; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.Deque; +import java.util.HashMap; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.SortedMap; +import java.util.SortedSet; +import java.util.StringTokenizer; +import java.util.TreeMap; +import java.util.TreeSet; import java.util.regex.Pattern; -import lombok.Getter; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import 
org.apache.pdfbox.cos.COSDictionary; @@ -46,6 +59,8 @@ import org.apache.pdfbox.text.TextPositionComparator; import com.knecon.fforesight.service.layoutparser.processor.utils.QuickSort; +import lombok.Getter; + /** * This is just a copy except i only adjusted lines 594-607 cause this is a bug in Pdfbox. * see S416.pdf @@ -194,40 +209,33 @@ public class PDFTextStripper extends LegacyPDFStreamEngine { } + public void beginMarkedContentSequence(COSName tag, COSDictionary properties) { - public void beginMarkedContentSequence(COSName tag, COSDictionary properties) - { PDMarkedContent markedContent = PDMarkedContent.create(tag, properties); - if (this.currentMarkedContents.isEmpty()) - { + if (this.currentMarkedContents.isEmpty()) { this.markedContents.add(markedContent); - } - else - { - PDMarkedContent currentMarkedContent = - this.currentMarkedContents.peek(); - if (currentMarkedContent != null) - { + } else { + PDMarkedContent currentMarkedContent = this.currentMarkedContents.peek(); + if (currentMarkedContent != null) { currentMarkedContent.addMarkedContent(markedContent); } } this.currentMarkedContents.push(markedContent); } + @Override - public void endMarkedContentSequence() - { - if (!this.currentMarkedContents.isEmpty()) - { + public void endMarkedContentSequence() { + + if (!this.currentMarkedContents.isEmpty()) { this.currentMarkedContents.pop(); } } - public void xobject(PDXObject xobject) - { - if (!this.currentMarkedContents.isEmpty()) - { + public void xobject(PDXObject xobject) { + + if (!this.currentMarkedContents.isEmpty()) { this.currentMarkedContents.peek().addXObject(xobject); } } @@ -635,7 +643,6 @@ public class PDFTextStripper extends LegacyPDFStreamEngine { var normalized = normalize(line); // normalized.stream().filter(l -> System.out.println(l.getText().contains("Plenarprotokoll 20/24")).findFirst().isPresent() - lastLineStartPosition = handleLineSeparation(current, lastPosition, lastLineStartPosition, maxHeightForLine); writeLine(normalized, 
current.isParagraphStart); line.clear(); @@ -914,8 +921,7 @@ public class PDFTextStripper extends LegacyPDFStreamEngine { textList.add(text); } } - if (!this.currentMarkedContents.isEmpty()) - { + if (!this.currentMarkedContents.isEmpty()) { this.currentMarkedContents.peek().addText(text); } } @@ -2102,7 +2108,9 @@ public class PDFTextStripper extends LegacyPDFStreamEngine { return endParagraphWritten; } - public void setEndParagraphWritten(){ + + public void setEndParagraphWritten() { + endParagraphWritten = true; } @@ -2145,7 +2153,6 @@ public class PDFTextStripper extends LegacyPDFStreamEngine { this.isHangingIndent = true; } - } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/ViewerDocumentService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/ViewerDocumentService.java index aa178bb..61d0525 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/ViewerDocumentService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/ViewerDocumentService.java @@ -3,20 +3,14 @@ package com.knecon.fforesight.service.layoutparser.processor.services.visualizat import java.awt.Color; import java.awt.geom.AffineTransform; import java.awt.geom.Rectangle2D; -import java.io.File; -import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStream; -import java.nio.file.Files; -import java.nio.file.Path; import java.util.ArrayList; import java.util.HashSet; import java.util.List; +import java.util.Map; import java.util.Set; -import javax.print.Doc; - -import org.apache.pdfbox.Loader; import org.apache.pdfbox.cos.COSDictionary; import 
org.apache.pdfbox.cos.COSName; import org.apache.pdfbox.pdmodel.PDDocument; @@ -33,8 +27,6 @@ import org.apache.pdfbox.pdmodel.graphics.state.PDExtendedGraphicsState; import org.apache.pdfbox.util.Matrix; import org.springframework.stereotype.Service; -import com.fasterxml.jackson.databind.JsonNode; -import com.fasterxml.jackson.databind.ObjectMapper; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; import com.knecon.fforesight.service.layoutparser.processor.model.table.Rectangle; import com.knecon.fforesight.service.layoutparser.processor.model.visualization.ColoredLine; @@ -45,8 +37,8 @@ import com.knecon.fforesight.service.layoutparser.processor.model.visualization. import com.knecon.fforesight.service.layoutparser.processor.model.visualization.VisualizationsOnPage; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.ExtractedTable; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.ExtractedTableData; +import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableCells; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorData; -import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorResponse; import com.knecon.fforesight.service.layoutparser.processor.utils.PdfVisualisationUtility; import lombok.RequiredArgsConstructor; @@ -58,55 +50,16 @@ import lombok.extern.slf4j.Slf4j; @RequiredArgsConstructor public class ViewerDocumentService { - private static final String LAYER_NAME = "Layout grid"; private static final int FONT_SIZE = 10; public static final float LINE_WIDTH = 1f; private final LayoutGridService layoutGridService; - @SneakyThrows - public void drawExtractedTables(PDDocument pdDocument, Document document, OutputStream outputStream, List tableExtractorData) { - - for (TableExtractorData tableExtractorDatum : tableExtractorData) { - int 
pageNumber = tableExtractorDatum.getPageNumber(); - List tableRectangles = new ArrayList<>(); - List objectRectangles = new ArrayList<>(); - for (ExtractedTable table : tableExtractorDatum.getTables()) { - List boundingBox = table.getExtractedTableValue().getBoundingBox(); - float x0 = boundingBox.get(0); - float x1 = boundingBox.get(2); - float y0 = boundingBox.get(1); - float y1 = boundingBox.get(3); - Rectangle2D tableRectangle = new Rectangle(y0, x0, x1 - x0, y1 - y0); - tableRectangles.add(tableRectangle); - } - for (ExtractedTableData object : tableExtractorDatum.getObjects()) { - List boundingBox = object.getBoundingBox(); - float x0 = boundingBox.get(0); - float x1 = boundingBox.get(2); - float y0 = boundingBox.get(1); - float y1 = boundingBox.get(3); - Rectangle2D objectRectangle = new Rectangle(y0, x0, x1 - x0, y1 - y0); - objectRectangles.add(objectRectangle); - } - PdfVisualisationUtility.drawRectangle2DList(pdDocument, - pageNumber, - tableRectangles, - PdfVisualisationUtility.Options.builder().strokeColor(Color.PINK).strokeWidth(1).stroke(true).build()); - PdfVisualisationUtility.drawRectangle2DList(pdDocument, - pageNumber, - objectRectangles, - PdfVisualisationUtility.Options.builder().strokeColor(Color.CYAN).strokeWidth(1).stroke(true).build()); - } - pdDocument.save(outputStream); - - - } @SneakyThrows - public void createViewerDocument(PDDocument pdDocument, Document document, OutputStream outputStream, boolean layerVisibilityDefaultValue) { + public void createViewerDocument(PDDocument pdDocument, Document document, OutputStream outputStream, Map> extractedTableCells, boolean layerVisibilityDefaultValue) { LayoutGrid layoutGrid = layoutGridService.createLayoutGrid(document); // PDDocument.save() is very slow, since it actually traverses the entire pdf and writes a new one. 
@@ -155,6 +108,11 @@ public class ViewerDocumentService { contentStream.addRect((float) r.getX(), (float) r.getY(), (float) r.getWidth(), (float) r.getHeight()); contentStream.fill(); } + for(TableCells tableCells: extractedTableCells.get(pageNumber)) { + contentStream.setStrokingColor(Color.CYAN); + contentStream.addRect((float) tableCells.getX0(), (float) tableCells.getY0(), (float) tableCells.getWidth(), (float) tableCells.getHeight()); + contentStream.stroke(); + } for (PlacedText placedText : visualizationsOnPage.getPlacedTexts()) { contentStream.setFont(font, FONT_SIZE); contentStream.beginText(); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/MarkedContentUtils.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/MarkedContentUtils.java index 799ac99..045a266 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/MarkedContentUtils.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/MarkedContentUtils.java @@ -1,12 +1,5 @@ package com.knecon.fforesight.service.layoutparser.processor.utils; -import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; -import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; -import lombok.experimental.UtilityClass; -import org.apache.pdfbox.cos.COSName; -import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent; -import org.apache.pdfbox.text.TextPosition; - import java.awt.geom.Rectangle2D; import java.util.Collection; import java.util.Collections; @@ -14,12 +7,22 @@ import java.util.List; import java.util.Map; import java.util.stream.Collectors; +import org.apache.pdfbox.cos.COSName; +import 
org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent; +import org.apache.pdfbox.text.TextPosition; + +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; + +import lombok.experimental.UtilityClass; + @UtilityClass public class MarkedContentUtils { public static final String HEADER = "Header"; public static final String FOOTER = "Footer"; + public List getMarkedContentBboxPerLine(List markedContents, String subtype) { if (markedContents == null) { @@ -31,7 +34,8 @@ public class MarkedContentUtils { .filter(m -> m.getProperties() != null) .filter(m -> m.getProperties().getItem("Subtype") != null) .filter(m -> ((COSName) m.getProperties().getItem("Subtype")).getName().equals(subtype)) - .map(PDMarkedContent::getContents).flatMap(Collection::stream) + .map(PDMarkedContent::getContents) + .flatMap(Collection::stream) .filter(t -> t instanceof TextPosition) .map(t -> (TextPosition) t) .filter(t -> !t.getUnicode().equals(" ")) @@ -41,16 +45,19 @@ public class MarkedContentUtils { return Collections.emptyList(); } - return markedContentByYPosition.values().stream() - .map(textPositions -> new TextPositionSequence(textPositions.stream() - .toList(), 0, true) - .getRectangle()) - .map(t -> new Rectangle2D.Float(t.getTopLeft().getX(), t.getTopLeft().getY() - Math.abs(t.getHeight()), t.getWidth(), Math.abs(t.getHeight()))).collect(Collectors.toList()); + return markedContentByYPosition.values() + .stream() + .map(textPositions -> new TextPositionSequence(textPositions.stream().toList(), 0, true).getRectangle()) + .map(t -> new Rectangle2D.Float(t.getTopLeft().getX(), t.getTopLeft().getY() - Math.abs(t.getHeight()), t.getWidth(), Math.abs(t.getHeight()))) + .collect(Collectors.toList()); } public boolean intersects(TextPageBlock textBlock, Map> markedContentBboxPerType, String type) { - return markedContentBboxPerType.get(type) 
!= null && markedContentBboxPerType.get(type).stream().anyMatch(rectangle -> rectangle.intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight())); + + return markedContentBboxPerType.get(type) != null && markedContentBboxPerType.get(type) + .stream() + .anyMatch(rectangle -> rectangle.intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight())); } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/PositionUtils.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/PositionUtils.java index 48b720d..3aecb92 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/PositionUtils.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/PositionUtils.java @@ -19,10 +19,9 @@ public final class PositionUtils { double threshold = textBlock.getMostPopularWordHeight() * 3; - if (textBlock.getPdfMinX() + threshold > btf.getTopLeft().getX() - && textBlock.getPdfMaxX() - threshold < btf.getTopLeft().getX() + btf.getWidth() - && textBlock.getPdfMinY() + threshold > btf.getTopLeft().getY() - && textBlock.getPdfMaxY() - threshold < btf.getTopLeft().getY() + btf.getHeight()) { + if (textBlock.getPdfMinX() + threshold > btf.getTopLeft().getX() && textBlock.getPdfMaxX() - threshold < btf.getTopLeft() + .getX() + btf.getWidth() && textBlock.getPdfMinY() + threshold > btf.getTopLeft().getY() && textBlock.getPdfMaxY() - threshold < btf.getTopLeft() + .getY() + btf.getHeight()) { return true; } else { return false; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/RectangleTransformations.java 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/RectangleTransformations.java index 1a49607..f0ea28f 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/RectangleTransformations.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/RectangleTransformations.java @@ -41,11 +41,14 @@ public class RectangleTransformations { return atomicTextBlocks.stream().flatMap(atomicTextBlock -> atomicTextBlock.getPositions().stream()).collect(new Rectangle2DBBoxCollector()); } + + public static Collector collectBBox() { return new Rectangle2DBBoxCollector(); } + public static PDRectangle toPDRectangleBBox(List rectangles) { Rectangle2D rectangle2D = RectangleTransformations.rectangleBBox(rectangles); @@ -70,6 +73,7 @@ public class RectangleTransformations { return format("%f,%f,%f,%f", rectangle2D.getX(), rectangle2D.getY(), rectangle2D.getWidth(), rectangle2D.getHeight()); } + public static Rectangle2D rectangleBBox(List rectangles) { return rectangles.stream().map(RectangleTransformations::toRectangle2D).collect(new Rectangle2DBBoxCollector()); @@ -84,6 +88,7 @@ public class RectangleTransformations { -redactionLogRectangle.getHeight()); } + public static Rectangle2D toRectangle2D(PDRectangle rectangle) { return new Rectangle2D.Double(rectangle.getLowerLeftX(), rectangle.getLowerLeftY(), rectangle.getWidth(), rectangle.getHeight()); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionOperations.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionOperations.java index 53e8c29..9927685 100644 --- 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionOperations.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionOperations.java @@ -3,7 +3,6 @@ package com.knecon.fforesight.service.layoutparser.processor.utils; import java.util.List; import java.util.stream.Collectors; - import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionSequenceComparator.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionSequenceComparator.java index 40dce07..1e122c6 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionSequenceComparator.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionSequenceComparator.java @@ -28,15 +28,13 @@ import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPosit * * @author Ben Litchfield */ -public class TextPositionSequenceComparator implements Comparator -{ +public class TextPositionSequenceComparator implements Comparator { + @Override - public int compare(TextPositionSequence pos1, TextPositionSequence pos2) - { + public int compare(TextPositionSequence pos1, TextPositionSequence pos2) { // only compare text that is in the same direction int cmp1 = Float.compare(pos1.getDir().getDegrees(), pos2.getDir().getDegrees()); - if (cmp1 != 0) - { + if (cmp1 != 0) { return cmp1; } @@ -54,19 +52,13 @@ public class 
TextPositionSequenceComparator implements Comparator= pos1YTop && pos2YBottom <= pos1YBottom || - pos1YBottom >= pos2YTop && pos1YBottom <= pos2YBottom) - { + if (yDifference < .1 || pos2YBottom >= pos1YTop && pos2YBottom <= pos1YBottom || pos1YBottom >= pos2YTop && pos1YBottom <= pos2YBottom) { return Float.compare(x1, x2); - } - else if (pos1YBottom < pos2YBottom) - { + } else if (pos1YBottom < pos2YBottom) { return -1; - } - else - { + } else { return 1; } } + } diff --git a/layoutparser-service/layoutparser-service-server/src/main/java/com/knecon/fforesight/service/layoutparser/server/Application.java b/layoutparser-service/layoutparser-service-server/src/main/java/com/knecon/fforesight/service/layoutparser/server/Application.java index 75cfabd..5fdc714 100644 --- a/layoutparser-service/layoutparser-service-server/src/main/java/com/knecon/fforesight/service/layoutparser/server/Application.java +++ b/layoutparser-service/layoutparser-service-server/src/main/java/com/knecon/fforesight/service/layoutparser/server/Application.java @@ -14,7 +14,7 @@ import com.knecon.fforesight.service.layoutparser.server.queue.MessagingConfigur import com.knecon.fforesight.tenantcommons.MultiTenancyAutoConfiguration; @ImportAutoConfiguration({MultiTenancyAutoConfiguration.class}) -@Import({MetricsConfiguration.class, StorageAutoConfiguration.class, LayoutParsingServiceProcessorConfiguration.class, MessagingConfiguration.class}) +@Import({MetricsConfiguration.class, StorageAutoConfiguration.class, LayoutParsingServiceProcessorConfiguration.class, MessagingConfiguration.class}) @SpringBootApplication(exclude = {SecurityAutoConfiguration.class, ManagementWebSecurityAutoConfiguration.class}) public class Application { diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java 
b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java index b8336d4..7e0d9e1 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java @@ -7,6 +7,7 @@ import java.awt.geom.Rectangle2D; import java.awt.geom.RectangularShape; import java.io.File; import java.io.FileOutputStream; +import java.io.InputStream; import java.nio.file.Files; import java.nio.file.Path; import java.util.ArrayList; @@ -40,6 +41,8 @@ import com.knecon.fforesight.service.layoutparser.processor.model.table.Rectangl import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse; +import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.ExtractedTable; +import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableCells; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorResponse; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.services.GapDetectionService; @@ -66,6 +69,9 @@ public class ViewerDocumentTest extends BuildDocumentTest { @Autowired private RedactManagerClassificationService redactManagerClassificationService; + @Autowired + private ObjectMapper objectMapper; + @Test @SneakyThrows public void testViewerDocument() { @@ -76,60 +82,63 @@ public class ViewerDocumentTest extends BuildDocumentTest { ViewerDocumentService 
viewerDocumentService = new ViewerDocumentService(layoutGridService); Document document = buildGraph(fileName, LayoutParsingType.TAAS); try (var pdDocument = Loader.loadPDF(new ClassPathResource(fileName).getFile()); var out = new FileOutputStream(tmpFileName)) { - viewerDocumentService.createViewerDocument(pdDocument, document, out, true); + viewerDocumentService.createViewerDocument(pdDocument, document, out, null,true); } } @Test @SneakyThrows - @Disabled public void testLayoutParsingServiceResults() { - String tableSourceFileName ="C:\\Users\\YannikHampe\\Downloads\\3875a78f1db6ff94b05e38446e65ba9a.EXTRACTED_TABLES.json\\3875a78f1db6ff94b05e38446e65ba9a.EXTRACTED_TABLES.json"; + String tableSourceFileName ="C:\\Users\\YannikHampe\\Downloads\\b28d9a22b674906813f12b86dda33202.EXTRACTED_TABLES.json\\b28d9a22b674906813f12b86dda33202.EXTRACTED_TABLES.json"; Path pdfFileResource = Path.of("C:\\Users\\YannikHampe\\Downloads\\2009-1048395_50pages_tables.pdf"); String tmpFileName = "C:/Users/YANNIK~1/AppData/Local/Temp/rectangles."+pdfFileResource.getFileName(); - ObjectMapper objectMapper = new ObjectMapper(); PDDocument pdDocument = Loader.loadPDF(pdfFileResource.toFile()); - JsonNode jsonNode = objectMapper.readTree(new String(Files.readAllBytes(new File(tableSourceFileName).toPath()))); - JsonNode dataNode = jsonNode.get("data"); - dataNode.forEach(node -> { - List rectangles = new ArrayList<>(); - int pageNumber = node.get("page_number").asInt()+1; - JsonNode tables = node.get("tables"); - tables.forEach(entry -> { - JsonNode table = entry.get("table"); - //table bounding box - if(Float.valueOf(String.valueOf(table.get("score"))) < 0.99) { - return; - } - JsonNode tableBox = table.get("bbox"); - float x0 = Float.valueOf(tableBox.get(0).toString()); - float x1 = Float.valueOf(tableBox.get(2).toString()); - float y0 = Float.valueOf(tableBox.get(1).toString()); - float y1 = Float.valueOf(tableBox.get(3).toString()); - Rectangle2D rectangle2D = new Rectangle(y0, x0, x1 
- x0, y1 - y0); - rectangles.add(rectangle2D); - //columns and rows - JsonNode rowsAndColumns = entry.get("objects"); - rowsAndColumns.forEach(rowOrColumn -> { - JsonNode bbox = rowOrColumn.get("bbox"); - float rx0 = Float.valueOf(bbox.get(0).toString()); - float rx1 = Float.valueOf(bbox.get(2).toString()); - float ry0 = Float.valueOf(bbox.get(1).toString()); - float ry1 = Float.valueOf(bbox.get(3).toString()); - Rectangle2D rowOrColumnRectangle = new Rectangle(ry0, rx0, rx1 - rx0, ry1 - ry0); - rectangles.add(rowOrColumnRectangle); - }); + try (InputStream inputStream = Files.newInputStream(Path.of(tableSourceFileName))) { + TableExtractorResponse tableExtractorResponse = objectMapper.readValue(inputStream, TableExtractorResponse.class); + tableExtractorResponse.getData().forEach(data -> { + List tableCells = convertTableCells(data.getTables()); }); - PdfVisualisationUtility.drawRectangle2DList(pdDocument, pageNumber, rectangles, PdfVisualisationUtility.Options.builder().strokeColor(Color.GREEN).strokeWidth(2).stroke(true).build()); - - }); + inputStream.close(); + } try (var out = new FileOutputStream(tmpFileName)) { pdDocument.save(out); + pdDocument.close(); } } + public List convertTableCells(List tableObjects) { + + List parsedTableCells = new ArrayList<>(); + + tableObjects.stream().forEach(t -> { + System.out.println(t.getTable().getLabel()); + TableCells tableCells = new TableCells(); + tableCells.setX0(t.getTable().getBbox().get(0)); + tableCells.setX1(t.getTable().getBbox().get(2)); + tableCells.setY0(t.getTable().getBbox().get(1)); + tableCells.setY1(t.getTable().getBbox().get(3)); + tableCells.setWidth(tableCells.getX1()- tableCells.getX0()); + tableCells.setHeight(tableCells.getY1()- tableCells.getY0()); + parsedTableCells.add(tableCells); + t.getObjects().forEach(o -> { + System.out.println(o.getLabel()); + TableCells objectCell = new TableCells(); + objectCell.setX0(t.getTable().getBbox().get(0)); + objectCell.setX1(t.getTable().getBbox().get(2)); + 
objectCell.setY0(t.getTable().getBbox().get(1)); + objectCell.setY1(t.getTable().getBbox().get(3)); + objectCell.setWidth(objectCell.getX1()- objectCell.getX0()); + objectCell.setHeight(objectCell.getY1()- objectCell.getY0()); + parsedTableCells.add(objectCell); + }); + }); + + return parsedTableCells; + + } + public ClassificationDocument buildClassificationDocument(PDDocument originDocument) { ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,