diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..b634d85 --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +*.pdf filter=lfs diff=lfs merge=lfs -text diff --git a/.gitmodules b/.gitmodules index f569cd2..4b7e4d7 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,8 +1,8 @@ [submodule "layoutparser-service/layoutparser-service-server/src/test/resources/files/basf"] path = layoutparser-service/layoutparser-service-server/src/test/resources/files/basf - url = https://gitlab.knecon.com/fforesight/documents/basf.git + url = ssh://git@git.knecon.com:22222/fforesight/documents/basf.git update = merge [submodule "layoutparser-service/layoutparser-service-server/src/test/resources/files/syngenta"] path = layoutparser-service/layoutparser-service-server/src/test/resources/files/syngenta - url = https://gitlab.knecon.com/fforesight/documents/syngenta.git + url = ssh://git@git.knecon.com:22222/fforesight/documents/syngenta.git update = merge diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/queue/LayoutParsingType.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/queue/LayoutParsingType.java index 9d066c1..b6567a9 100644 --- a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/queue/LayoutParsingType.java +++ b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/queue/LayoutParsingType.java @@ -5,6 +5,7 @@ public enum LayoutParsingType { REDACT_MANAGER_OLD, REDACT_MANAGER_PARAGRAPH_DEBUG, DOCUMINE, + DOCUMINE_OLD, CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java index 4b390ac..abab9c3 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java @@ -45,6 +45,7 @@ import com.knecon.fforesight.service.layoutparser.processor.services.RulingClean import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService; import com.knecon.fforesight.service.layoutparser.processor.services.SimplifiedSectionTextService; import com.knecon.fforesight.service.layoutparser.processor.services.TableExtractionService; +import com.knecon.fforesight.service.layoutparser.processor.services.TextRulingsClassifier; import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocstrumBlockificationService; import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocuMineBlockificationService; import com.knecon.fforesight.service.layoutparser.processor.services.blockification.RedactManagerBlockificationService; @@ -52,12 +53,14 @@ import com.knecon.fforesight.service.layoutparser.processor.services.classificat import com.knecon.fforesight.service.layoutparser.processor.services.classification.DocuMineClassificationService; import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService; import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory; +import com.knecon.fforesight.service.layoutparser.processor.services.graphics.Box; import com.knecon.fforesight.service.layoutparser.processor.services.graphics.GraphicExtractorService; import 
com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper; import com.knecon.fforesight.service.layoutparser.processor.services.mapper.TaasDocumentDataMapper; import com.knecon.fforesight.service.layoutparser.processor.services.parsing.PDFLinesTextStripper; import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService; import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils; +import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation; import io.micrometer.observation.Observation; import io.micrometer.observation.ObservationRegistry; @@ -119,11 +122,11 @@ public class LayoutParsingPipeline { } ClassificationDocument classificationDocument = parseLayout(layoutParsingRequest.layoutParsingType(), - originFile, - imageServiceResponse, - tableServiceResponse, - visualLayoutParsingResponse, - layoutParsingRequest.identifier()); + originFile, + imageServiceResponse, + tableServiceResponse, + visualLayoutParsingResponse, + layoutParsingRequest.identifier()); log.info("Building document graph for {}", layoutParsingRequest.identifier()); @@ -131,7 +134,7 @@ public class LayoutParsingPipeline { log.info("Creating viewer document for {}", layoutParsingRequest.identifier()); - layoutGridService.addLayoutGrid(viewerDocumentFile, documentGraph, viewerDocumentFile, false); + layoutGridService.addLayoutGrid(viewerDocumentFile, documentGraph, viewerDocumentFile, false, layoutParsingRequest.visualLayoutParsingFileId().isPresent()); log.info("Storing resulting files for {}", layoutParsingRequest.identifier()); @@ -155,25 +158,25 @@ public class LayoutParsingPipeline { .numberOfPages(documentGraph.getNumberOfPages()) .duration(System.currentTimeMillis() - start) .message(format(""" - Layout parsing has finished in %.02f s. 
- identifiers: %s - %s - Files have been saved with Ids: - Structure: %s - Text: %s - Positions: %s - PageData: %s - Simplified Text: %s - Viewer Doc: %s""", - ((float) (System.currentTimeMillis() - start)) / 1000, - layoutParsingRequest.identifier(), - buildSemanticNodeCountMessage(documentGraph.getNumberOfPages(), documentGraph.buildSemanticNodeCounts()), - layoutParsingRequest.structureFileStorageId(), - layoutParsingRequest.textBlockFileStorageId(), - layoutParsingRequest.positionBlockFileStorageId(), - layoutParsingRequest.pageFileStorageId(), - layoutParsingRequest.simplifiedTextStorageId(), - layoutParsingRequest.viewerDocumentStorageId())) + Layout parsing has finished in %.02f s. + identifiers: %s + %s + Files have been saved with Ids: + Structure: %s + Text: %s + Positions: %s + PageData: %s + Simplified Text: %s + Viewer Doc: %s""", + ((float) (System.currentTimeMillis() - start)) / 1000, + layoutParsingRequest.identifier(), + buildSemanticNodeCountMessage(documentGraph.getNumberOfPages(), documentGraph.buildSemanticNodeCounts()), + layoutParsingRequest.structureFileStorageId(), + layoutParsingRequest.textBlockFileStorageId(), + layoutParsingRequest.positionBlockFileStorageId(), + layoutParsingRequest.pageFileStorageId(), + layoutParsingRequest.simplifiedTextStorageId(), + layoutParsingRequest.viewerDocumentStorageId())) .build(); } @@ -194,14 +197,14 @@ public class LayoutParsingPipeline { private String buildSemanticNodeCountMessage(int numberOfPages, Map semanticNodeCounts) { return String.format("%d pages with %d sections, %d headlines, %d paragraphs, %d tables with %d cells, %d headers, and %d footers parsed", - numberOfPages, - semanticNodeCounts.get(NodeType.SECTION) == null ? 0 : semanticNodeCounts.get(NodeType.SECTION), - semanticNodeCounts.get(NodeType.HEADLINE) == null ? 0 : semanticNodeCounts.get(NodeType.HEADLINE), - semanticNodeCounts.get(NodeType.PARAGRAPH) == null ? 
0 : semanticNodeCounts.get(NodeType.PARAGRAPH), - semanticNodeCounts.get(NodeType.TABLE) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE), - semanticNodeCounts.get(NodeType.TABLE_CELL) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE_CELL), - semanticNodeCounts.get(NodeType.HEADER) == null ? 0 : semanticNodeCounts.get(NodeType.HEADER), - semanticNodeCounts.get(NodeType.FOOTER) == null ? 0 : semanticNodeCounts.get(NodeType.FOOTER)); + numberOfPages, + semanticNodeCounts.get(NodeType.SECTION) == null ? 0 : semanticNodeCounts.get(NodeType.SECTION), + semanticNodeCounts.get(NodeType.HEADLINE) == null ? 0 : semanticNodeCounts.get(NodeType.HEADLINE), + semanticNodeCounts.get(NodeType.PARAGRAPH) == null ? 0 : semanticNodeCounts.get(NodeType.PARAGRAPH), + semanticNodeCounts.get(NodeType.TABLE) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE), + semanticNodeCounts.get(NodeType.TABLE_CELL) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE_CELL), + semanticNodeCounts.get(NodeType.HEADER) == null ? 0 : semanticNodeCounts.get(NodeType.HEADER), + semanticNodeCounts.get(NodeType.FOOTER) == null ? 
0 : semanticNodeCounts.get(NodeType.FOOTER)); } @@ -220,6 +223,9 @@ public class LayoutParsingPipeline { Map> pdfImages = imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse); Map> signatures = visualLayoutParsingAdapter.buildExtractedSignaturesPerPage(visualLayoutParsingResponse); ClassificationDocument classificationDocument = new ClassificationDocument(); + + classificationDocument.getVisualizations().setActive(identifier.containsKey("debug")); + List classificationPages = new ArrayList<>(); long pageCount = originDocument.getNumberOfPages(); @@ -244,10 +250,12 @@ public class LayoutParsingPipeline { stripper.setStartPage(pageNumber); stripper.setEndPage(pageNumber); stripper.setPdpage(pdPage); - if (layoutParsingType.equals(LayoutParsingType.DOCUMINE)) { + if (layoutParsingType.equals(LayoutParsingType.DOCUMINE_OLD)) { stripper.setSortByPosition(true); } stripper.getText(originDocument); + List words = stripper.getTextPositionSequences(); + classificationDocument.getVisualizations().addTextVisualizations(words, pageNumber); PDRectangle pdr = pdPage.getMediaBox(); @@ -255,28 +263,29 @@ public class LayoutParsingPipeline { boolean isLandscape = pdr.getWidth() > pdr.getHeight() && (rotation == 0 || rotation == 180) || pdr.getHeight() > pdr.getWidth() && (rotation == 90 || rotation == 270); PDRectangle cropbox = pdPage.getCropBox(); - CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(pdfTableCells.get(pageNumber), stripper.getRulings()); + classificationDocument.getVisualizations().addRulingVisualization(stripper.getRulings(), pageNumber); + CleanRulings cleanRulings = rulingCleaningService.deduplicateAndStraightenRulings(pdfTableCells.get(pageNumber), stripper.getRulings()); - List emptyTableCells = TableExtractionService.findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical()); + List emptyTableCells = TableExtractionService.findCells(cleanRulings.getHorizontals(), cleanRulings.getVerticals(), 
PageInformation.fromPDPage(pageNumber, pdPage)); + classificationDocument.getVisualizations().addCellVisualizations(emptyTableCells, pageNumber); - var graphics = graphicExtractorService.extractPathElementGraphics(originDocument, - pdPage, - pageNumber, - cleanRulings, - stripper.getTextPositionSequences(), - emptyTableCells, - false); + TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(words, cleanRulings); + + List graphics = graphicExtractorService.extractPathElementGraphics(originDocument, pdPage, pageNumber, cleanRulings, stripper.getTextPositionSequences(), false); pdfImages.computeIfAbsent(pageNumber, x -> new ArrayList<>()) .addAll(graphics.stream() - .map(g -> new ClassifiedImage(new Rectangle2D.Double(g.x1, g.y1, g.width(), g.height()), ImageType.GRAPHIC, false, stripper.getPageNumber())) - .toList()); + .map(g -> new ClassifiedImage(new Rectangle2D.Double(g.x1, g.y1, g.width(), g.height()), ImageType.GRAPHIC, false, stripper.getPageNumber())) + .toList()); ClassificationPage classificationPage = switch (layoutParsingType) { - case REDACT_MANAGER_OLD -> redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells); - case DOCUMINE -> docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical()); - case REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, true); - case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, false); + case REDACT_MANAGER_OLD -> + redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings, classificationDocument.getVisualizations()); + case DOCUMINE_OLD -> docuMineBlockificationService.blockify(words, cleanRulings); + case DOCUMINE, REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG -> + 
docstrumBlockificationService.blockify(words, cleanRulings, true, classificationDocument.getVisualizations(), layoutParsingType); + case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> + docstrumBlockificationService.blockify(words, cleanRulings, false, classificationDocument.getVisualizations(), layoutParsingType); }; classificationPage.setCleanRulings(cleanRulings); @@ -286,17 +295,18 @@ public class LayoutParsingPipeline { classificationPage.setPageWidth(cropbox.getWidth()); classificationPage.setPageHeight(cropbox.getHeight()); + classificationDocument.getVisualizations().addMarkedContentVisualizations(stripper.getMarkedContents(), pageNumber); // MarkedContent needs to be converted at this point, otherwise it leads to GC Problems in Pdfbox. classificationPage.setMarkedContentBboxPerType(convertMarkedContents(stripper.getMarkedContents())); // If images is ocr needs to be calculated before textBlocks are moved into tables, otherwise findOcr algorithm needs to be adopted. - if (pdfImages != null && pdfImages.containsKey(pageNumber)) { + if (pdfImages.containsKey(pageNumber)) { classificationPage.setImages(pdfImages.get(pageNumber)); imageServiceResponseAdapter.findOcr(classificationPage); } if (signatures.containsKey(pageNumber)) { - if (classificationPage.getImages() == null || classificationPage.getImages().size() == 0) { + if (classificationPage.getImages() == null || classificationPage.getImages().isEmpty()) { classificationPage.setImages(signatures.get(pageNumber)); } else { classificationPage.getImages().addAll(signatures.get(pageNumber)); @@ -305,12 +315,6 @@ public class LayoutParsingPipeline { tableExtractionService.extractTables(emptyTableCells, classificationPage); - if (layoutParsingType == LayoutParsingType.REDACT_MANAGER) { - docstrumBlockificationService.combineBlocks(classificationPage); - } else if (layoutParsingType == LayoutParsingType.CLARIFYND) { - docstrumBlockificationService.mergeIntersectingBlocks(classificationPage.getTextBlocks(), 0, 6.5f); - 
} - buildPageStatistics(classificationPage); increaseDocumentStatistics(classificationPage, classificationDocument); @@ -321,11 +325,14 @@ public class LayoutParsingPipeline { log.info("Calculating BodyTextFrame for {}", identifier); bodyTextFrameService.setBodyTextFrames(classificationDocument, layoutParsingType); + for (ClassificationPage page : classificationDocument.getPages()) { + classificationDocument.getVisualizations().addCleanRulingVisualization(page.getCleanRulings(), page.getPageNumber()); + } log.info("Classify TextBlocks for {}", identifier); switch (layoutParsingType) { case REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_OLD, CLARIFYND_PARAGRAPH_DEBUG -> redactManagerClassificationService.classifyDocument(classificationDocument); - case DOCUMINE -> docuMineClassificationService.classifyDocument(classificationDocument); + case DOCUMINE_OLD, DOCUMINE -> docuMineClassificationService.classifyDocument(classificationDocument); case CLARIFYND -> clarifyndClassificationService.classifyDocument(classificationDocument); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/DocstrumSegmentationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/DocstrumSegmentationService.java index eb62ce2..2b095a4 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/DocstrumSegmentationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/DocstrumSegmentationService.java @@ -7,14 +7,18 @@ import java.util.stream.Collectors; import org.springframework.stereotype.Service; import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Character; +import 
com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Line; import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone; import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.LineBuilderService; import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.NearestNeighbourService; import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.ReadingOrderService; import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.SpacingService; import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.ZoneBuilderService; +import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; +import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations; import lombok.RequiredArgsConstructor; @@ -29,31 +33,37 @@ public class DocstrumSegmentationService { private final ReadingOrderService readingOrderService; - public List segmentPage(List textPositions, boolean xyOrder) { + public List segmentPage(List textPositions, boolean xyOrder, CleanRulings usedRulings, LayoutparsingVisualizations visualizations) { List zones = new ArrayList<>(); - zones.addAll(computeZones(textPositions, TextDirection.ZERO)); - zones.addAll(computeZones(textPositions, TextDirection.QUARTER_CIRCLE)); - zones.addAll(computeZones(textPositions, TextDirection.HALF_CIRCLE)); - zones.addAll(computeZones(textPositions, TextDirection.THREE_QUARTER_CIRCLE)); + zones.addAll(computeZones(textPositions, usedRulings, visualizations, TextDirection.ZERO)); + zones.addAll(computeZones(textPositions, usedRulings, visualizations, TextDirection.QUARTER_CIRCLE)); + 
zones.addAll(computeZones(textPositions, usedRulings, visualizations, TextDirection.HALF_CIRCLE)); + zones.addAll(computeZones(textPositions, usedRulings, visualizations, TextDirection.THREE_QUARTER_CIRCLE)); return readingOrderService.resolve(zones, xyOrder); } - private List computeZones(List textPositions, TextDirection direction) { + private List computeZones(List textPositions, CleanRulings rulings, LayoutparsingVisualizations visualizations, TextDirection direction) { - var positions = textPositions.stream().filter(t -> t.getDir() == direction).map(TextPositionSequence::getTextPositions).flatMap(List::stream).toList(); + List positions = textPositions.stream() + .filter(t -> t.getDir() == direction) + .map(TextPositionSequence::getTextPositions) + .flatMap(List::stream) + .toList(); - var characters = positions.stream().map(Character::new).collect(Collectors.toList()); + List characters = positions.stream() + .map(Character::new) + .collect(Collectors.toList()); nearestNeighbourService.findNearestNeighbors(characters); - var characterSpacing = spacingService.computeCharacterSpacing(characters); - var lineSpacing = Math.min(spacingService.computeLineSpacing(characters), 20); + double characterSpacing = spacingService.computeCharacterSpacing(characters); + double lineSpacing = Math.min(spacingService.computeLineSpacing(characters), 20); - var lines = lineBuilderService.buildLines(characters, characterSpacing, lineSpacing); - return zoneBuilderService.buildZones(lines, characterSpacing, lineSpacing); + List lines = lineBuilderService.buildLines(characters, characterSpacing, lineSpacing, rulings); + return zoneBuilderService.buildZones(lines, characterSpacing, lineSpacing, rulings); } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/BoundingBox.java 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/BoundingBox.java index 8e6785c..9f79eed 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/BoundingBox.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/BoundingBox.java @@ -1,13 +1,27 @@ package com.knecon.fforesight.service.layoutparser.processor.docstrum.model; import java.awt.geom.Rectangle2D; +import java.util.Comparator; +import java.util.List; + +import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; import lombok.Data; @Data public abstract class BoundingBox { - private Rectangle2D bBox; + // Java coordinate system: (0, 0) is always upper left, x is increasing left to right and y is increasing from top to bottom. + // should be used when determining reading order or other tasks which require coordinates in a harmonized system. + protected Rectangle2D bBox; // I would not trust this coordinate when comparing rulings and text, due to the text positions being slightly off. + + // PDF coordinate system: depends on page rotation, (0, 0) is lower left corner, x is increasing left to right and y from bottom to top. + // This rotates completely in 90 degree steps with page rotation. + // Needs to be used when writing to a PDF. + // Also, these are definitely correct and should be used whenever possible. 
+ protected Rectangle2D bBoxInitialUserSpace; + + protected static final float VERTICAL_COMPARISON_THRESHOLD = 0.4f; public double getX() { @@ -22,6 +36,42 @@ public abstract class BoundingBox { } + public double getMinX() { + + return bBox.getMinX(); + } + + + public double getMinY() { + + return bBox.getMinY(); + } + + + public double getPdfMinX() { + + return bBoxInitialUserSpace.getMinX(); + } + + + public double getPdfMaxX() { + + return bBoxInitialUserSpace.getMaxX(); + } + + + public double getPdfMinY() { + + return bBoxInitialUserSpace.getMinY(); + } + + + public double getPdfMaxY() { + + return bBoxInitialUserSpace.getMaxY(); + } + + public double getWidth() { return bBox.getWidth(); @@ -34,21 +84,102 @@ public abstract class BoundingBox { } + public double getMaxX() { + + return bBox.getMaxX(); + } + + + public double getMaxY() { + + return bBox.getMaxY(); + } + + public double getArea() { return (bBox.getHeight() * bBox.getWidth()); } - public boolean contains(Rectangle2D contained, double tolerance) { + public boolean contains(BoundingBox contained) { - return bBox.getX() <= contained.getX() + tolerance && bBox.getY() <= contained.getY() + tolerance && bBox.getX() + bBox.getWidth() >= contained.getX() + contained.getWidth() - tolerance && bBox.getY() + bBox.getHeight() >= contained.getY() + contained.getHeight() - tolerance; + return contains(contained, 0); + } + + + public boolean contains(BoundingBox contained, double tolerance) { + + return getPdfMinX() <= contained.getPdfMinX() + tolerance + && getPdfMinY() <= contained.getPdfMinY() + tolerance + && getPdfMaxX() >= contained.getPdfMaxX() - tolerance + && getPdfMaxY() >= contained.getPdfMaxY() - tolerance; + } + + + public boolean intersects(BoundingBox other) { + + return this.intersectsX(other) && this.intersectsY(other); + } + + + public boolean intersects(BoundingBox other, float yThreshold, float xThreshold) { + + return this.intersectsX(other, xThreshold) && this.intersectsY(other, yThreshold); 
} public boolean intersectsY(BoundingBox other) { - return this.getBBox().getMinY() <= other.getBBox().getMaxY() && this.getBBox().getMaxY() >= other.getBBox().getMinY(); + return this.getPdfMinY() <= other.getPdfMaxY() && this.getPdfMaxY() >= other.getPdfMinY(); } + + public boolean intersectsY(BoundingBox other, float threshold) { + + return this.getPdfMinY() - threshold <= other.getPdfMaxY() && this.getPdfMaxY() + threshold >= other.getPdfMinY(); + } + + + public boolean intersectsX(BoundingBox other) { + + return this.getPdfMinX() <= other.getPdfMaxX() && this.getPdfMaxX() >= other.getPdfMinX(); // all bounds read from bBoxInitialUserSpace, matching intersectsY + } + + + public boolean intersectsX(BoundingBox other, float threshold) { + + return this.getPdfMinX() - threshold <= other.getPdfMaxX() && this.getPdfMaxX() + threshold >= other.getPdfMinX(); // all bounds read from bBoxInitialUserSpace, matching intersectsY + } + + + public void setToBBoxOfComponents(List components) { + + this.bBox = components.stream() + .map(BoundingBox::getBBox) + .collect(RectangleTransformations.collectBBox()); + this.bBoxInitialUserSpace = components.stream() + .map(BoundingBox::getBBoxInitialUserSpace) + .collect(RectangleTransformations.collectBBox()); + } + + + public double verticalOverlap(BoundingBox other) { + + return Math.max(0, Math.min(this.getPdfMaxY(), other.getPdfMaxY()) - Math.max(this.getPdfMinY(), other.getPdfMinY())); + } + + + public static final Comparator ILL_DEFINED_ORDER = (o1, o2) -> { + + if (o1.equals(o2)) { + return 0; + } + if (o1.verticalOverlap(o2) > VERTICAL_COMPARISON_THRESHOLD * ((o1.getHeight() + o2.getHeight()) / 2)) { + return Double.compare(o1.getPdfMinX(), o2.getPdfMinX()); + } else { + return Double.compare(o1.getPdfMaxY(), o2.getPdfMaxY()); + } + }; + } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/Character.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/Character.java index b4e2616..772f1b2 
100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/Character.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/Character.java @@ -27,8 +27,8 @@ public class Character { public Character(RedTextPosition chunk) { - this.x = chunk.getXDirAdj() + chunk.getWidthDirAdj() / 2; - this.y = chunk.getYDirAdj() + chunk.getHeightDir() / 2; + this.x = chunk.getBBoxDirAdj().getCenterX(); + this.y = chunk.getBBoxDirAdj().getCenterY(); this.textPosition = chunk; } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/Line.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/Line.java index fa85249..525d148 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/Line.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/Line.java @@ -1,10 +1,10 @@ package com.knecon.fforesight.service.layoutparser.processor.docstrum.model; -import java.awt.geom.Rectangle2D; import java.util.ArrayList; import java.util.Arrays; import java.util.List; +import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.FastAtan2; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import lombok.Data; @@ -72,7 +72,7 @@ public class Line extends BoundingBox { public double getAngle() { - return Math.atan2(y1 - y0, x1 - x0); + return FastAtan2.fastAtan2(y1 - y0, x1 - x0); } @@ -84,7 +84,9 @@ public class Line extends BoundingBox { private double computeHeight() { - return 
characters.stream().map(Character::getHeight).reduce(0d, Double::sum) / characters.size(); + return characters.stream() + .map(Character::getHeight) + .reduce(0d, Double::sum) / characters.size(); } @@ -116,7 +118,7 @@ public class Line extends BoundingBox { double ym = (y0 + y1) / 2; double yn = (other.y0 + other.y1) / 2; - return Math.abs(ym - yn) / Math.sqrt(1); + return Math.abs(ym - yn); } @@ -141,21 +143,9 @@ public class Line extends BoundingBox { private void buildBBox() { - double minX = Double.POSITIVE_INFINITY; - double minY = Double.POSITIVE_INFINITY; - double maxX = Double.NEGATIVE_INFINITY; - double maxY = Double.NEGATIVE_INFINITY; - - for (Character character : characters) { - - minX = Math.min(minX, character.getTextPosition().getXDirAdj()); - minY = Math.min(minY, character.getTextPosition().getYDirAdj()); - maxX = Math.max(maxX, character.getTextPosition().getXDirAdj() + character.getTextPosition().getWidthDirAdj()); - maxY = Math.max(maxY, character.getTextPosition().getYDirAdj() + character.getTextPosition().getHeightDir()); - - } - - this.setBBox(new Rectangle2D.Double(minX, minY, maxX - minX, maxY - minY)); + this.setToBBoxOfComponents(characters.stream() + .map(Character::getTextPosition) + .toList()); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/Zone.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/Zone.java index 85facd2..f1c61c5 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/Zone.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/Zone.java @@ -1,9 +1,10 @@ package com.knecon.fforesight.service.layoutparser.processor.docstrum.model; -import 
java.awt.geom.Rectangle2D; import java.util.Comparator; import java.util.List; +import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; + import lombok.Data; @Data @@ -15,29 +16,9 @@ public class Zone extends BoundingBox { @SuppressWarnings("PMD.ConstructorCallsOverridableMethod") public Zone(List lines) { - lines.sort(Comparator.comparingDouble(Line::getY)); + lines.sort(Comparator.comparingDouble(Line::getY0)); this.lines = lines; - buildBBox(); - } - - - public void buildBBox() { - - double minX = Double.POSITIVE_INFINITY; - double minY = Double.POSITIVE_INFINITY; - double maxX = Double.NEGATIVE_INFINITY; - double maxY = Double.NEGATIVE_INFINITY; - - for (Line line : lines) { - - minX = Math.min(minX, line.getX()); - minY = Math.min(minY, line.getY()); - maxX = Math.max(maxX, line.getX() + line.getWidth()); - maxY = Math.max(maxY, line.getY() + line.getHeight()); - - } - - this.setBBox(new Rectangle2D.Double(minX, minY, maxX - minX, maxY - minY)); + setToBBoxOfComponents(lines); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/service/LineBuilderService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/service/LineBuilderService.java index 66536a5..03c932b 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/service/LineBuilderService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/service/LineBuilderService.java @@ -1,6 +1,5 @@ package com.knecon.fforesight.service.layoutparser.processor.docstrum.service; -import java.util.ArrayList; import java.util.Comparator; import java.util.HashSet; import java.util.List; @@ -11,43 +10,49 @@ import 
com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Angle import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Character; import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Line; import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.UnionFind; +import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; @Service public class LineBuilderService { private static final double CHARACTER_SPACING_DISTANCE_MULTIPLIER = 3.5; - private static final double MAX_VERTICAL_CHARACTER_DISTANCE = 0.67; + private static final double LINE_SPACING_THRESHOLD_MULTIPLIER = 0.67; private static final double ANGLE_TOLERANCE = Math.PI / 6; - public List buildLines(List characters, double characterSpacing, double lineSpacing) { + public List buildLines(List characters, double characterSpacing, double lineSpacing, CleanRulings rulings) { double maxHorizontalDistance = characterSpacing * CHARACTER_SPACING_DISTANCE_MULTIPLIER; - double maxVerticalDistance = lineSpacing * MAX_VERTICAL_CHARACTER_DISTANCE; + double maxVerticalDistance = lineSpacing * LINE_SPACING_THRESHOLD_MULTIPLIER; UnionFind unionFind = new UnionFind<>(new HashSet<>(characters)); - AngleFilter filter = new AngleFilter(-ANGLE_TOLERANCE, ANGLE_TOLERANCE); + AngleFilter angleFilter = new AngleFilter(-ANGLE_TOLERANCE, ANGLE_TOLERANCE); characters.forEach(character -> { - character.getNeighbors().forEach(neighbor -> { - double x = neighbor.getHorizontalDistance() / maxHorizontalDistance; - double y = neighbor.getVerticalDistance() / maxVerticalDistance; - if (character.getTextPosition().getDir() == neighbor.getCharacter().getTextPosition().getDir() && filter.matches(neighbor) && Math.pow(x, 2) + Math.pow(y, - 2) <= 1) { - unionFind.union(character, neighbor.getCharacter()); - } - }); + character.getNeighbors() + .forEach(neighbor -> { + double normalizedHorizontalDistance = neighbor.getHorizontalDistance() / 
maxHorizontalDistance; + double normalizedVerticalDistance = neighbor.getVerticalDistance() / maxVerticalDistance; + + if (character.getTextPosition().getDir() != neighbor.getCharacter().getTextPosition().getDir() // + || !angleFilter.matches(neighbor) // + || Math.pow(normalizedHorizontalDistance, 2) + Math.pow(normalizedVerticalDistance, 2) > 1 // + || rulings.lineBetween(character.getTextPosition(), neighbor.getCharacter().getTextPosition())) { + return; + } + + unionFind.union(character, neighbor.getCharacter()); + }); }); - List lines = new ArrayList<>(); - unionFind.getGroups().forEach(group -> { - List lineCharacters = new ArrayList<>(group); - lineCharacters.sort(Comparator.comparingDouble(Character::getX)); - lines.add(new Line(lineCharacters, characterSpacing)); - }); - - return lines; + return unionFind.getGroups() + .stream() + .map(lineCharacters -> lineCharacters.stream() + .sorted(Comparator.comparingDouble(Character::getX)) + .toList()) + .map(lineCharacters -> new Line(lineCharacters, characterSpacing)) + .toList(); } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/service/ReadingOrderService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/service/ReadingOrderService.java index 590a091..6d1a741 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/service/ReadingOrderService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/service/ReadingOrderService.java @@ -39,7 +39,10 @@ public class ReadingOrderService { } } - if (histogram.values().stream().mapToInt(Integer::intValue).average().orElse(1) < MULTI_COLUMN_DETECTION_THRESHOLD) { + if (histogram.values() + .stream() + 
.mapToInt(Integer::intValue).average() + .orElse(1) < MULTI_COLUMN_DETECTION_THRESHOLD) { return resolveSingleColumnReadingOrder(zones); } else { @@ -52,7 +55,7 @@ public class ReadingOrderService { private static List resolveSingleColumnReadingOrder(List zones) { zones.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)) - .thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))); + .thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))); return zones; } @@ -90,14 +93,14 @@ public class ReadingOrderService { } leftOf.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)) - .thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))); + .thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))); rightOf.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)) - .thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))); + .thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))); middle.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)) - .thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))); - + .thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))); +/* List leftNotIntersecting = new ArrayList<>(); for (Zone leftZone : leftOf) { boolean intersects = false; @@ -139,7 +142,7 @@ public class ReadingOrderService { middle.addAll(leftNotIntersecting); middle.addAll(rightNotIntersecting); - +*/ List sortedZones = new ArrayList<>(); sortedZones.addAll(leftOf); sortedZones.addAll(rightOf); diff --git 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/service/ZoneBuilderService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/service/ZoneBuilderService.java index 4520163..ec1871c 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/service/ZoneBuilderService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/service/ZoneBuilderService.java @@ -5,6 +5,7 @@ import java.util.Comparator; import java.util.HashSet; import java.util.List; import java.util.Set; +import java.util.stream.Collectors; import org.springframework.stereotype.Service; @@ -12,6 +13,7 @@ import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Chara import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Line; import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.UnionFind; import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone; +import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; @Service public class ZoneBuilderService { @@ -29,12 +31,10 @@ public class ZoneBuilderService { private static final double ANGLE_TOLERANCE = Math.PI / 6; - private static final int MAX_ZONES = 300; - private static final double MAX_VERTICAL_MERGE_DISTANCE = 0.5; - public List buildZones(List lines, double characterSpacing, double lineSpacing) { + public List buildZones(List lines, double characterSpacing, double lineSpacing, CleanRulings rulings) { double minHorizontalDistance = characterSpacing * MIN_HORIZONTAL_DISTANCE_MULTIPLIER; double maxVerticalDistance = lineSpacing * MAX_VERTICAL_DISTANCE_MULTIPLIER; @@ -45,38 +45,39 @@ public class ZoneBuilderService { double 
meanHeight = calculateMeanHeight(lines); - lines.forEach(outerLine -> // - lines.forEach(innerLine -> { + lines.forEach(outerLine -> { + lines.forEach(innerLine -> { - double scale = Math.min(outerLine.getHeight(), innerLine.getHeight()) / meanHeight; - scale = Math.max(MIN_LINE_SIZE_SCALE, Math.min(scale, MAX_LINE_SIZE_SCALE)); + if (innerLine == outerLine // + || unionFind.inSameSet(outerLine, innerLine)// + || outerLine.angularDifference(innerLine) > ANGLE_TOLERANCE) { + return; + } - if (!unionFind.inSameSet(outerLine, innerLine) && outerLine.angularDifference(innerLine) <= ANGLE_TOLERANCE) { + double scale = Math.min(outerLine.getHeight(), innerLine.getHeight()) / meanHeight; + scale = Math.max(MIN_LINE_SIZE_SCALE, Math.min(scale, MAX_LINE_SIZE_SCALE)); - double horizontalDistance = outerLine.horizontalDistance(innerLine) / scale; - double verticalDistance = outerLine.verticalDistance(innerLine) / scale; + double horizontalDistance = outerLine.horizontalDistance(innerLine) / scale; + double verticalDistance = outerLine.verticalDistance(innerLine) / scale; - if (minHorizontalDistance <= horizontalDistance && verticalDistance <= maxVerticalDistance // - || minHorizontalMergeDistance <= horizontalDistance && verticalDistance <= maxVerticalMergeDistance) { - unionFind.union(outerLine, innerLine); - } - } - })); + if ((!(minHorizontalDistance <= horizontalDistance) || !(verticalDistance <= maxVerticalDistance)) // + && (!(minHorizontalMergeDistance <= horizontalDistance) || !(verticalDistance <= maxVerticalMergeDistance))) { + return; + } - List zones = new ArrayList<>(); - unionFind.getGroups().forEach(group -> { - zones.add(mergeLinesInZone(new ArrayList<>(group), characterSpacing, lineSpacing)); + if (rulings.lineBetween(outerLine, innerLine)) { + return; + } + + unionFind.union(outerLine, innerLine); + + }); }); - if (zones.size() > MAX_ZONES) { - List oneZoneLines = new ArrayList<>(); - for (Zone zone : zones) { - oneZoneLines.addAll(zone.getLines()); - } - 
return List.of(mergeLinesInZone(oneZoneLines, characterSpacing, lineSpacing)); - } - - return zones; + return unionFind.getGroups() + .stream() + .map(group -> mergeLinesInZone(new ArrayList<>(group), characterSpacing, lineSpacing)) + .toList(); } @@ -103,35 +104,40 @@ public class ZoneBuilderService { UnionFind unionFind = new UnionFind<>(new HashSet<>(lines)); lines.forEach(outer -> { - lines.forEach(inner -> { - if (inner != outer) { + if (inner == outer) { + return; + } - double horizontalDistance = outer.horizontalDistance(inner); - double verticalDistance = outer.verticalDistance(inner); + double horizontalDistance = outer.horizontalDistance(inner); + double verticalDistance = outer.verticalDistance(inner); - if (horizontalDistance <= maxHorizontalDistance && minVerticalDistance <= verticalDistance && verticalDistance <= maxVerticalDistance) { - unionFind.union(outer, inner); - } else if (minVerticalDistance <= verticalDistance && verticalDistance <= maxVerticalDistance && Math.abs(horizontalDistance - Math.min(outer.getLength(), - inner.getLength())) < 0.1) { - boolean characterOverlap = false; - int overlappingCount = 0; - for (Character outerCharacter : outer.getCharacters()) { - for (Character innerCharacter : inner.getCharacters()) { - double characterOverlapDistance = outerCharacter.overlappingDistance(innerCharacter); - if (characterOverlapDistance > 2) { - characterOverlap = true; - } - if (characterOverlapDistance > 0) { - overlappingCount++; - } + if (horizontalDistance <= maxHorizontalDistance && minVerticalDistance <= verticalDistance && verticalDistance <= maxVerticalDistance) { + + unionFind.union(outer, inner); + + } else if (minVerticalDistance <= verticalDistance + && verticalDistance <= maxVerticalDistance + && Math.abs(horizontalDistance - Math.min(outer.getLength(), inner.getLength())) < 0.1) { + + boolean characterOverlap = false; + int overlappingCount = 0; + for (Character outerCharacter : outer.getCharacters()) { + for (Character 
innerCharacter : inner.getCharacters()) { + double characterOverlapDistance = outerCharacter.overlappingDistance(innerCharacter); + if (characterOverlapDistance > 2) { + characterOverlap = true; + } + if (characterOverlapDistance > 0) { + overlappingCount++; } } - if (!characterOverlap && overlappingCount <= 2) { - unionFind.union(outer, inner); - } + } + if (!characterOverlap && overlappingCount <= 2) { + unionFind.union(outer, inner); } } + }); }); @@ -146,7 +152,9 @@ public class ZoneBuilderService { outputZone.add(new Line(characters, characterSpacing)); } - return new Zone(outputZone); + return new Zone(outputZone.stream() + .sorted(Comparator.comparing(Line::getY0)) + .collect(Collectors.toList())); } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/AbstractPageBlock.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/AbstractPageBlock.java index 1f01f2f..c805244 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/AbstractPageBlock.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/AbstractPageBlock.java @@ -1,7 +1,10 @@ package com.knecon.fforesight.service.layoutparser.processor.model; +import java.awt.geom.Rectangle2D; + import com.fasterxml.jackson.annotation.JsonIgnore; import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle; +import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import lombok.AllArgsConstructor; @@ -13,16 +16,8 @@ import lombok.NoArgsConstructor; @AllArgsConstructor @NoArgsConstructor @EqualsAndHashCode(callSuper = true) -public abstract class 
AbstractPageBlock extends Rectangle { +public abstract class AbstractPageBlock extends BoundingBox { - @JsonIgnore - protected float minX; - @JsonIgnore - protected float maxX; - @JsonIgnore - protected float minY; - @JsonIgnore - protected float maxY; @JsonIgnore protected PageBlockType classification; @JsonIgnore @@ -41,63 +36,6 @@ public abstract class AbstractPageBlock extends Rectangle { } - public boolean containsBlock(TextPageBlock other) { - - return this.minX <= other.getPdfMinX() && this.maxX >= other.getPdfMaxX() && this.minY >= other.getPdfMinY() && this.maxY <= other.getPdfMaxY(); - } - - - public boolean contains(AbstractPageBlock other) { - - return this.minX <= other.minX && this.maxX >= other.maxX && this.minY >= other.minY && this.maxY <= other.maxY; - } - - - public boolean contains(Rectangle other) { - - return page == other.getPage() && this.minX <= other.getTopLeft().getX() && this.maxX >= other.getTopLeft().getX() + other.getWidth() && this.minY <= other.getTopLeft() - .getY() && this.maxY >= other.getTopLeft().getY() + other.getHeight(); - } - - - @JsonIgnore - public float getHeight() { - - return maxY - minY; - } - - - @JsonIgnore - public float getWidth() { - - return maxX - minX; - } - - - public boolean intersectsY(AbstractPageBlock apb) { - - return this.minY <= apb.getMaxY() && this.maxY >= apb.getMinY(); - } - - - public boolean almostIntersects(AbstractPageBlock apb, float yThreshold, float xThreshold) { - - return this.almostIntersectsX(apb, xThreshold) && this.almostIntersectsY(apb, yThreshold); - } - - - private boolean almostIntersectsY(AbstractPageBlock apb, float threshold) { - - return this.minY - threshold <= apb.getMaxY() && this.maxY + threshold >= apb.getMinY(); - } - - - private boolean almostIntersectsX(AbstractPageBlock apb, float threshold) { - - return this.minX - threshold <= apb.getMaxX() && this.maxX + threshold >= apb.getMinX(); - } - - public abstract boolean isEmpty(); } diff --git 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationDocument.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationDocument.java index b3565ae..4f3f339 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationDocument.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationDocument.java @@ -5,6 +5,7 @@ import java.util.List; import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter; import com.knecon.fforesight.service.layoutparser.processor.model.text.UnclassifiedText; +import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations; import lombok.Data; import lombok.NoArgsConstructor; @@ -22,6 +23,7 @@ public class ClassificationDocument { private FloatFrequencyCounter fontSizeCounter = new FloatFrequencyCounter(); private StringFrequencyCounter fontCounter = new StringFrequencyCounter(); private StringFrequencyCounter fontStyleCounter = new StringFrequencyCounter(); + private LayoutparsingVisualizations visualizations = new LayoutparsingVisualizations(); private boolean headlines; private long rulesVersion; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Document.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Document.java index e7b5f82..9a9d9cc 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Document.java +++ 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Document.java @@ -12,6 +12,7 @@ import java.util.stream.Stream; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType; +import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations; import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree; import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity; import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock; @@ -40,6 +41,8 @@ public class Document implements GenericSemanticNode { @Builder.Default Set entities = new HashSet<>(); + LayoutparsingVisualizations visualizations; + @Override public NodeType getType() { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/Cell.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/Cell.java index 2f0de29..8da9b97 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/Cell.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/Cell.java @@ -1,11 +1,13 @@ package com.knecon.fforesight.service.layoutparser.processor.model.table; +import java.awt.geom.AffineTransform; import java.awt.geom.Point2D; import java.awt.geom.Rectangle2D; import java.util.ArrayList; import java.util.Iterator; import java.util.List; +import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox; import 
com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities; @@ -18,7 +20,7 @@ import lombok.NoArgsConstructor; @Data @EqualsAndHashCode(callSuper = true) @NoArgsConstructor -public class Cell extends Rectangle { +public class Cell extends BoundingBox { private List textBlocks = new ArrayList<>(); @@ -33,13 +35,24 @@ public class Cell extends Rectangle { public Cell(Point2D topLeft, Point2D bottomRight) { - super((float) topLeft.getY(), (float) topLeft.getX(), (float) (bottomRight.getX() - topLeft.getX()), (float) (bottomRight.getY() - topLeft.getY())); + this.bBoxInitialUserSpace = new Rectangle2D.Double(topLeft.getX(), topLeft.getY(), (bottomRight.getX() - topLeft.getX()), (bottomRight.getY() - topLeft.getY())); + this.bBox = bBoxInitialUserSpace; } - public Cell(Rectangle2D r) { + public Cell(Rectangle2D bBoxInitialUserSpace, AffineTransform initialUserSpaceToJava) { - super((float) r.getY(), (float) r.getX(), (float) r.getWidth(), (float) r.getHeight()); + this.bBoxInitialUserSpace = bBoxInitialUserSpace; + this.bBox = initialUserSpaceToJava.createTransformedShape(bBoxInitialUserSpace).getBounds2D(); + } + + + public static Cell copy(Cell cell) { + + Cell copy = new Cell(); + copy.bBoxInitialUserSpace = cell.bBoxInitialUserSpace; + copy.bBox = cell.bBox; + return copy; } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/CleanRulings.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/CleanRulings.java index 735d7a5..a4b97cc 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/CleanRulings.java 
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/CleanRulings.java @@ -1,15 +1,206 @@ package com.knecon.fforesight.service.layoutparser.processor.model.table; +import java.awt.geom.Line2D; +import java.awt.geom.Point2D; +import java.awt.geom.Rectangle2D; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.Comparator; +import java.util.LinkedList; import java.util.List; +import java.util.stream.Stream; -import lombok.Builder; -import lombok.Data; +import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox; -@Data -@Builder +import lombok.Getter; + +@Getter public class CleanRulings { - List horizontal; - List vertical; + List horizontals; // unmodifiable sorted by Y list + List verticals; // unmodifiable sorted by X list + + + public CleanRulings(List horizontals, List verticals) { + + this.horizontals = horizontals.stream() + .peek(Ruling::assertHorizontal) + .sorted(Comparator.comparing(Line2D.Float::getY1)) + .toList(); + this.verticals = verticals.stream() + .peek(Ruling::assertVertical) + .sorted(Comparator.comparing(Line2D.Float::getX1)) + .toList(); + } + + + public CleanRulings getTableLines() { + + return new CleanRulings(horizontals.stream() + .filter(ruling -> ruling.getClassification().equals(Ruling.Classification.TABLE_LINE)) + .toList(), + verticals.stream() + .filter(ruling -> ruling.getClassification().equals(Ruling.Classification.TABLE_LINE)) + .toList()); + } + + + public CleanRulings withoutTextRulings() { + + return new CleanRulings(horizontals.stream() + .filter(ruling -> !(ruling.getClassification().equals(Ruling.Classification.UNDERLINE) || ruling.getClassification() + .equals(Ruling.Classification.STRIKETROUGH))) + .toList(), + verticals.stream() + .filter(ruling -> !(ruling.getClassification().equals(Ruling.Classification.UNDERLINE) || ruling.getClassification() + 
.equals(Ruling.Classification.STRIKETROUGH))) + .toList()); + } + + + public List buildAll() { + + ArrayList rulings = new ArrayList<>(horizontals.size() + verticals.size()); + rulings.addAll(horizontals); + rulings.addAll(verticals); + return rulings; + } + + + public boolean lineBetween(BoundingBox a, BoundingBox b) { + + return lineBetween(a.getBBoxInitialUserSpace(), b.getBBoxInitialUserSpace()); + } + + + public boolean lineBetween(Rectangle2D a, Rectangle2D b) { + + return lineBetween(new Point2D.Double(a.getCenterX(), a.getCenterY()), new Point2D.Double(b.getCenterX(), b.getCenterY())); + } + + + public boolean lineBetween(Point2D p1, Point2D p2) { + + Ruling ruling = new Ruling(p1, p2); + + if (ruling.isHorizontal()) { + return getVerticalsInXInterval(ruling.x1, ruling.x2).stream() + .anyMatch(vertical -> vertical.intersectsLine(ruling)); + + } + + if (ruling.isVertical()) { + return getHorizontalsInYInterval(ruling.y1, ruling.y2).stream() + .anyMatch(horizontal -> horizontal.intersectsLine(ruling)); + + } + + return Stream.of(getVerticalsInXInterval(ruling.x1, ruling.x2), getHorizontalsInYInterval(ruling.y1, ruling.y2)) + .flatMap(Collection::stream) + .anyMatch(other -> other.intersectsLine(ruling)); + } + + + public List getHorizontalsInYInterval(float y1, float y2) { + + float startY = Math.min(y1, y2); + float endY = Math.max(y1, y2); + + if (horizontals.isEmpty() || Float.isNaN(startY) || Float.isNaN(endY)) { + return Collections.emptyList(); + } + + int firstGreaterThanIdx = findFirstHorizontalRulingIdxAbove(startY); + + if (firstGreaterThanIdx == -1) { + return Collections.emptyList(); + } + + List result = new LinkedList<>(); + for (int i = firstGreaterThanIdx; i < horizontals.size(); i++) { + Ruling horizontal = horizontals.get(i); + if (horizontal.y1 > endY) { + break; + } + result.add(horizontal); + } + return result; + } + + + private int findFirstHorizontalRulingIdxAbove(float y) { + + int low = 0; + int high = horizontals.size() - 1; + + 
while (low <= high) { + int mid = low + (high - low) / 2; + Line2D.Float midLine = horizontals.get(mid); + float midY = midLine.y1; + + if (midY == y) { + return mid; + } else if (midY > y) { + high = mid - 1; + } else { + low = mid + 1; + } + } + + // Return the index of the first element greater than y or -1 if not found + return horizontals.size() > low && horizontals.get(low).y1 > y ? low : -1; + } + + + public List getVerticalsInXInterval(float x1, float x2) { + + float startX = Math.min(x1, x2); + float endX = Math.max(x1, x2); + + if (verticals.isEmpty() || Float.isNaN(startX) || Float.isNaN(endX)) { + return Collections.emptyList(); + } + + int firstGreaterThanIdx = findFirstVerticalRulingIdxRightOf(startX); + + if (firstGreaterThanIdx == -1) { + return Collections.emptyList(); + } + + List result = new LinkedList<>(); + for (int i = firstGreaterThanIdx; i < verticals.size(); i++) { + Ruling horizontal = verticals.get(i); + if (horizontal.x1 > endX) { + break; + } + result.add(horizontal); + } + return result; + } + + + private int findFirstVerticalRulingIdxRightOf(float x) { + + int low = 0; + int high = verticals.size() - 1; + + while (low <= high) { + int mid = low + (high - low) / 2; + Line2D.Float midLine = verticals.get(mid); + float midX = midLine.x1; + + if (midX == x) { + return mid; + } else if (midX > x) { + high = mid - 1; + } else { + low = mid + 1; + } + } + + // Return the index of the first element greater than y or -1 if not found + return verticals.size() > low && verticals.get(low).x1 > x ? 
low : -1; + } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/Rectangle.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/Rectangle.java deleted file mode 100644 index c357ab7..0000000 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/Rectangle.java +++ /dev/null @@ -1,218 +0,0 @@ -package com.knecon.fforesight.service.layoutparser.processor.model.table; - -import java.awt.geom.Point2D; -import java.awt.geom.Rectangle2D; -import java.util.Comparator; -import java.util.List; - -@SuppressWarnings("all") -public class Rectangle extends Rectangle2D.Float { - - protected static final float VERTICAL_COMPARISON_THRESHOLD = 0.4f; - /** - * Ill-defined comparator, from when Rectangle was Comparable. - *

- * see https://github.com/tabulapdf/tabula-java/issues/116 - * - * @deprecated with no replacement - */ - @Deprecated - public static final Comparator ILL_DEFINED_ORDER = new Comparator() { - @Override - public int compare(Rectangle o1, Rectangle o2) { - - if (o1.equals(o2)) { - return 0; - } - if (o1.verticalOverlap(o2) > VERTICAL_COMPARISON_THRESHOLD) { - return o1.isLtrDominant() == -1 && o2.isLtrDominant() == -1 ? -java.lang.Double.compare(o1.getX(), o2.getX()) : java.lang.Double.compare(o1.getX(), o2.getX()); - } else { - return java.lang.Float.compare(o1.getBottom(), o2.getBottom()); - } - } - }; - - - public Rectangle() { - - super(); - } - - - public Rectangle(float top, float left, float width, float height) { - - super(); - this.setRect(left, top, width, height); - } - - - /** - * @param rectangles - * @return minimum bounding box that contains all the rectangles - */ - public static Rectangle boundingBoxOf(List rectangles) { - - float minx = java.lang.Float.MAX_VALUE; - float miny = java.lang.Float.MAX_VALUE; - float maxx = java.lang.Float.MIN_VALUE; - float maxy = java.lang.Float.MIN_VALUE; - - for (Rectangle r : rectangles) { - minx = (float) Math.min(r.getMinX(), minx); - miny = (float) Math.min(r.getMinY(), miny); - maxx = (float) Math.max(r.getMaxX(), maxx); - maxy = (float) Math.max(r.getMaxY(), maxy); - } - return new Rectangle(miny, minx, maxx - minx, maxy - miny); - } - - - public int compareTo(Rectangle other) { - - return ILL_DEFINED_ORDER.compare(this, other); - } - - - // I'm bad at Java and need this for fancy sorting in - // technology.tabula.TextChunk. 
- public int isLtrDominant() { - - return 0; - } - - - public float getArea() { - - return this.width * this.height; - } - - - public float verticalOverlap(Rectangle other) { - - return Math.max(0, Math.min(this.getBottom(), other.getBottom()) - Math.max(this.getTop(), other.getTop())); - } - - - public boolean verticallyOverlaps(Rectangle other) { - - return verticalOverlap(other) > 0; - } - - - public float horizontalOverlap(Rectangle other) { - - return Math.max(0, Math.min(this.getRight(), other.getRight()) - Math.max(this.getLeft(), other.getLeft())); - } - - - public boolean horizontallyOverlaps(Rectangle other) { - - return horizontalOverlap(other) > 0; - } - - - public float verticalOverlapRatio(Rectangle other) { - - float rv = 0, delta = Math.min(this.getBottom() - this.getTop(), other.getBottom() - other.getTop()); - - if (other.getTop() <= this.getTop() && this.getTop() <= other.getBottom() && other.getBottom() <= this.getBottom()) { - rv = (other.getBottom() - this.getTop()) / delta; - } else if (this.getTop() <= other.getTop() && other.getTop() <= this.getBottom() && this.getBottom() <= other.getBottom()) { - rv = (this.getBottom() - other.getTop()) / delta; - } else if (this.getTop() <= other.getTop() && other.getTop() <= other.getBottom() && other.getBottom() <= this.getBottom()) { - rv = (other.getBottom() - other.getTop()) / delta; - } else if (other.getTop() <= this.getTop() && this.getTop() <= this.getBottom() && this.getBottom() <= other.getBottom()) { - rv = (this.getBottom() - this.getTop()) / delta; - } - - return rv; - - } - - - public float overlapRatio(Rectangle other) { - - double intersectionWidth = Math.max(0, Math.min(this.getRight(), other.getRight()) - Math.max(this.getLeft(), other.getLeft())); - double intersectionHeight = Math.max(0, Math.min(this.getBottom(), other.getBottom()) - Math.max(this.getTop(), other.getTop())); - double intersectionArea = Math.max(0, intersectionWidth * intersectionHeight); - double unionArea = 
this.getArea() + other.getArea() - intersectionArea; - - return (float) (intersectionArea / unionArea); - } - - - public Rectangle merge(Rectangle other) { - - this.setRect(this.createUnion(other)); - return this; - } - - - public float getTop() { - - return (float) this.getMinY(); - } - - - public void setTop(float top) { - - float deltaHeight = top - this.y; - this.setRect(this.x, top, this.width, this.height - deltaHeight); - } - - - public float getRight() { - - return (float) this.getMaxX(); - } - - - public void setRight(float right) { - - this.setRect(this.x, this.y, right - this.x, this.height); - } - - - public float getLeft() { - - return (float) this.getMinX(); - } - - - public void setLeft(float left) { - - float deltaWidth = left - this.x; - this.setRect(left, this.y, this.width - deltaWidth, this.height); - } - - - public float getBottom() { - - return (float) this.getMaxY(); - } - - - public void setBottom(float bottom) { - - this.setRect(this.x, this.y, this.width, bottom - this.y); - } - - - public Point2D[] getPoints() { - - return new Point2D[]{new Point2D.Float(this.getLeft(), this.getTop()), new Point2D.Float(this.getRight(), this.getTop()), new Point2D.Float(this.getRight(), - this.getBottom()), new Point2D.Float(this.getLeft(), this.getBottom())}; - } - - - @Override - public String toString() { - - StringBuilder sb = new StringBuilder(); - String s = super.toString(); - sb.append(s.substring(0, s.length() - 1)); - sb.append(String.format(",bottom=%f,right=%f]", this.getBottom(), this.getRight())); - return sb.toString(); - } - -} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/Ruling.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/Ruling.java index 7586258..e910ff1 100644 --- 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/Ruling.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/Ruling.java @@ -4,16 +4,14 @@ import java.awt.geom.Line2D; import java.awt.geom.Point2D; import java.awt.geom.Rectangle2D; import java.util.ArrayList; -import java.util.Collections; -import java.util.Comparator; import java.util.Formatter; import java.util.List; -import java.util.Map; -import java.util.TreeMap; import com.knecon.fforesight.service.layoutparser.processor.utils.CohenSutherlandClipping; import com.knecon.fforesight.service.layoutparser.processor.utils.DoubleComparisons; +import lombok.Getter; +import lombok.Setter; import lombok.extern.slf4j.Slf4j; @Slf4j @@ -23,10 +21,24 @@ public class Ruling extends Line2D.Float { public static final int PERPENDICULAR_UNIT_EXPAND_AMOUNT = 2; public static final int COLINEAR_OR_PARALLEL_UNIT_EXPAND_AMOUNT = 2; + public enum Classification { + TABLE_LINE, + UNDERLINE, + STRIKETROUGH, + HEADER_SEPARATOR, + FOOTER_SEPARATOR, + OTHER + } + + @Getter + @Setter + private Classification classification; + public Ruling(Point2D p1, Point2D p2) { super(p1, p2); + this.classification = Classification.OTHER; } @@ -60,126 +72,32 @@ public class Ruling extends Line2D.Float { } - // log(n) implementation of find_intersections - // based on http://people.csail.mit.edu/indyk/6.838-old/handouts/lec2.pdf - public static Map findIntersections(List horizontals, List verticals) { - - class SortObject { - - protected SOType type; - protected float position; - protected Ruling ruling; - - - public SortObject(SOType type, float position, Ruling ruling) { - - this.type = type; - this.position = position; - this.ruling = ruling; - } + public void assertHorizontal() { + if (isHorizontal()) { + return; } - - List sos = new ArrayList<>(); - - TreeMap tree = new 
TreeMap<>(new Comparator() { - @Override - public int compare(Ruling o1, Ruling o2) { - - return java.lang.Double.compare(o1.getTop(), o2.getTop()); - } - }); - - TreeMap rv = new TreeMap<>(new Comparator() { - @Override - public int compare(Point2D o1, Point2D o2) { - - if (o1.getY() > o2.getY()) { - return 1; - } - if (o1.getY() < o2.getY()) { - return -1; - } - if (o1.getX() > o2.getX()) { - return 1; - } - if (o1.getX() < o2.getX()) { - return -1; - } - return 0; - } - }); - - for (Ruling h : horizontals) { - sos.add(new SortObject(SOType.HLEFT, h.getLeft() - PERPENDICULAR_UNIT_EXPAND_AMOUNT, h)); - sos.add(new SortObject(SOType.HRIGHT, h.getRight() + PERPENDICULAR_UNIT_EXPAND_AMOUNT, h)); - } - - for (Ruling v : verticals) { - sos.add(new SortObject(SOType.VERTICAL, v.getLeft(), v)); - } - - Collections.sort(sos, new Comparator() { - @Override - public int compare(SortObject a, SortObject b) { - - int rv; - if (DoubleComparisons.feq(a.position, b.position)) { - if (a.type == SOType.VERTICAL && b.type == SOType.HLEFT) { - rv = 1; - } else if (a.type == SOType.VERTICAL && b.type == SOType.HRIGHT) { - rv = -1; - } else if (a.type == SOType.HLEFT && b.type == SOType.VERTICAL) { - rv = -1; - } else if (a.type == SOType.HRIGHT && b.type == SOType.VERTICAL) { - rv = 1; - } else { - rv = java.lang.Double.compare(a.position, b.position); - } - } else { - return java.lang.Double.compare(a.position, b.position); - } - return rv; - } - }); - - for (SortObject so : sos) { - switch (so.type) { - case VERTICAL: - for (Map.Entry h : tree.entrySet()) { - try { - Point2D i = h.getKey().intersectionPoint(so.ruling); - if (i == null) { - continue; - } - rv.put(i, new Ruling[]{h.getKey().expand(PERPENDICULAR_UNIT_EXPAND_AMOUNT), so.ruling.expand(PERPENDICULAR_UNIT_EXPAND_AMOUNT)}); - } catch (UnsupportedOperationException e) { - log.info("Some line are oblique, ignoring..."); - continue; - } - } - break; - case HRIGHT: - tree.remove(so.ruling); - break; - case HLEFT: - 
tree.put(so.ruling, true); - break; - } - } - - return rv; + throw new IllegalArgumentException("Ruling " + this + " is not horizontal"); } - public boolean vertical() { + public void assertVertical() { + + if (isVertical()) { + return; + } + throw new IllegalArgumentException("Ruling " + this + " is not vertical"); + } + + + public boolean isVertical() { return this.length() > 0 && DoubleComparisons.feq(this.x1, this.x2); //diff < ORIENTATION_CHECK_THRESHOLD; } - public boolean horizontal() { + public boolean isHorizontal() { return this.length() > 0 && DoubleComparisons.feq(this.y1, this.y2); //diff < ORIENTATION_CHECK_THRESHOLD; } @@ -188,36 +106,36 @@ public class Ruling extends Line2D.Float { // these are used to have a single collapse method (in page, currently) - public boolean oblique() { + public boolean isOblique() { - return !(this.vertical() || this.horizontal()); + return !(this.isVertical() || this.isHorizontal()); } public float getPosition() { - if (this.oblique()) { + if (this.isOblique()) { throw new UnsupportedOperationException(); } - return this.vertical() ? this.getLeft() : this.getTop(); + return this.isVertical() ? this.getLeft() : this.getTop(); } public float getStart() { - if (this.oblique()) { + if (this.isOblique()) { throw new UnsupportedOperationException(); } - return this.vertical() ? this.getTop() : this.getLeft(); + return this.isVertical() ? this.getTop() : this.getLeft(); } public void setStart(float v) { - if (this.oblique()) { + if (this.isOblique()) { throw new UnsupportedOperationException(); } - if (this.vertical()) { + if (this.isVertical()) { this.setTop(v); } else { this.setLeft(v); @@ -227,19 +145,19 @@ public class Ruling extends Line2D.Float { public float getEnd() { - if (this.oblique()) { + if (this.isOblique()) { throw new UnsupportedOperationException(); } - return this.vertical() ? this.getBottom() : this.getRight(); + return this.isVertical() ? 
this.getBottom() : this.getRight(); } public void setEnd(float v) { - if (this.oblique()) { + if (this.isOblique()) { throw new UnsupportedOperationException(); } - if (this.vertical()) { + if (this.isVertical()) { this.setBottom(v); } else { this.setRight(v); @@ -249,10 +167,10 @@ public class Ruling extends Line2D.Float { public void setStartEnd(float start, float end) { - if (this.oblique()) { + if (this.isOblique()) { throw new UnsupportedOperationException(); } - if (this.vertical()) { + if (this.isVertical()) { this.setTop(start); this.setBottom(end); } else { @@ -264,7 +182,7 @@ public class Ruling extends Line2D.Float { public boolean perpendicularTo(Ruling other) { - return this.vertical() == other.horizontal(); + return this.isVertical() == other.isHorizontal(); } @@ -318,30 +236,6 @@ public class Ruling extends Line2D.Float { } - public Point2D intersectionPoint(Ruling other) { - - Ruling this_l = this.expand(PERPENDICULAR_UNIT_EXPAND_AMOUNT); - Ruling other_l = other.expand(PERPENDICULAR_UNIT_EXPAND_AMOUNT); - Ruling horizontal, vertical; - - if (!this_l.intersectsLine(other_l)) { - return null; - } - - if (this_l.horizontal() && other_l.vertical()) { - horizontal = this_l; - vertical = other_l; - } else if (this_l.vertical() && other_l.horizontal()) { - vertical = this_l; - horizontal = other_l; - } else { - log.warn("lines must be orthogonal, vertical and horizontal"); - return null; - } - return new Point2D.Float(vertical.getLeft(), horizontal.getTop()); - } - - @Override public boolean equals(Object other) { @@ -451,16 +345,9 @@ public class Ruling extends Line2D.Float { final float TOLERANCE = 1; return Math.abs(ruling.getX1() - x1) < TOLERANCE &&// - Math.abs(ruling.getY1() - y1) < TOLERANCE &&// - Math.abs(ruling.getX2() - x2) < TOLERANCE &&// - Math.abs(ruling.getY2() - y2) < TOLERANCE; - } - - - private enum SOType { - VERTICAL, - HRIGHT, - HLEFT + Math.abs(ruling.getY1() - y1) < TOLERANCE &&// + Math.abs(ruling.getX2() - x2) < TOLERANCE &&// + 
Math.abs(ruling.getY2() - y2) < TOLERANCE; } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/TablePageBlock.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/TablePageBlock.java index 020dca6..8093280 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/TablePageBlock.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/TablePageBlock.java @@ -36,14 +36,11 @@ public class TablePageBlock extends AbstractPageBlock { private List cells; - public TablePageBlock(List cells, Rectangle area, int rotation) { + public TablePageBlock(List cells, int rotation) { + setToBBoxOfComponents(cells); this.cells = cells; addCells(cells); - minX = area.getLeft(); - minY = area.getBottom(); - maxX = area.getRight(); - maxY = area.getTop(); classification = PageBlockType.TABLE; this.rotation = rotation; } @@ -230,15 +227,15 @@ public class TablePageBlock extends AbstractPageBlock { return new ArrayList<>(); } - Set uniqueX = new HashSet<>(); - Set uniqueY = new HashSet<>(); + Set uniqueX = new HashSet<>(); + Set uniqueY = new HashSet<>(); cells.stream() .filter(c -> !c.getTextBlocks().isEmpty() || c.getHeight() > 3 && c.getWidth() > 3) .forEach(c -> { - uniqueX.add(c.getLeft()); - uniqueX.add(c.getRight()); - uniqueY.add(c.getBottom()); - uniqueY.add(c.getTop()); + uniqueX.add(c.getPdfMinX()); + uniqueX.add(c.getPdfMaxX()); + uniqueY.add(c.getPdfMinY()); + uniqueY.add(c.getPdfMaxY()); }); var sortedUniqueX = uniqueX.stream() @@ -250,22 +247,24 @@ public class TablePageBlock extends AbstractPageBlock { List> rowsOfCells = new ArrayList<>(); - Float prevY = null; + Double prevY = null; - for (Float y : sortedUniqueY) { + for 
(Double y : sortedUniqueY) { List row = new ArrayList<>(); - Float prevX = null; - for (Float x : sortedUniqueX) { + Double prevX = null; + for (Double x : sortedUniqueX) { if (prevY != null && prevX != null) { - var cellFromGridStructure = new Cell(new Point2D.Float(prevX, prevY), new Point2D.Float(x, y)); + var cellFromGridStructure = new Cell(new Point2D.Double(prevX, prevY), new Point2D.Double(x, y)); if (cellFromGridStructure.hasMinimumSize()) { cells.stream() - .map(originalCell -> new CellWithIntersection(originalCell, RectangleTransformations.calculateIntersectedArea(cellFromGridStructure, originalCell))) + .map(originalCell -> new CellWithIntersection(originalCell, + RectangleTransformations.calculateIntersectedArea(cellFromGridStructure.getBBoxInitialUserSpace(), + originalCell.getBBoxInitialUserSpace()))) .filter(cellWithIntersection -> cellWithIntersection.intersectedArea > 0) .filter(cellWithIntersection -> cellWithIntersection.originalCell.getArea() > cellWithIntersection.intersectedArea * CELL_AREA_CONTAINED_THRESHOLD) .max(Comparator.comparing(CellWithIntersection::intersectedArea)) diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/RedTextPosition.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/RedTextPosition.java index e4a4212..710d7eb 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/RedTextPosition.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/RedTextPosition.java @@ -1,8 +1,12 @@ package com.knecon.fforesight.service.layoutparser.processor.model.text; +import java.awt.geom.AffineTransform; +import java.awt.geom.Rectangle2D; + import org.apache.pdfbox.text.TextPosition; import 
com.fasterxml.jackson.annotation.JsonIgnore; +import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox; import lombok.AllArgsConstructor; import lombok.Builder; @@ -14,9 +18,11 @@ import lombok.SneakyThrows; @Builder @NoArgsConstructor @AllArgsConstructor -public class RedTextPosition { +public class RedTextPosition extends BoundingBox { - private float[] position; + public final static int HEIGHT_PADDING = 2; + + private Rectangle2D.Float bBoxDirAdj; // adjusted to text rotation @JsonIgnore private int rotation; @@ -58,43 +64,71 @@ public class RedTextPosition { pos.setFontSizeInPt(textPosition.getFontSizeInPt()); pos.setFontName(textPosition.getFont().getName()); - var position = new float[4]; + //TODO: There is a mismatch in the java coords of the text and the rulings, + // I guess if we start with the initial user space positions and transform them the same way we do the rulings it would work. + pos.setBBox(new Rectangle2D.Float(textPosition.getX(), textPosition.getY(), textPosition.getWidthDirAdj(), textPosition.getHeight())); - position[0] = textPosition.getXDirAdj(); - position[1] = textPosition.getYDirAdj(); - position[2] = textPosition.getWidthDirAdj(); - position[3] = textPosition.getHeightDir(); + float textHeight = textPosition.getHeight() + HEIGHT_PADDING; + Rectangle2D.Float dirAdjPosition = new Rectangle2D.Float(textPosition.getXDirAdj(), + textPosition.getYDirAdj() - textHeight, + textPosition.getWidthDirAdj(), + textHeight + HEIGHT_PADDING); + pos.setBBoxDirAdj(dirAdjPosition); + + AffineTransform affineTransform = getRotationMatrix(TextDirection.fromDegrees(textPosition.getDir()), textPosition.getPageWidth(), textPosition.getPageHeight()); + Rectangle2D initialUserSpacePositionRect = affineTransform.createTransformedShape(dirAdjPosition).getBounds2D(); + + pos.setBBoxInitialUserSpace(initialUserSpacePositionRect); // These are definitely correct - pos.setPosition(position); return pos; } + + private static 
AffineTransform getRotationMatrix(TextDirection textDirection, float pageWidth, float pageHeight) { + + AffineTransform transform = new AffineTransform(); + + if (textDirection == TextDirection.ZERO || textDirection == TextDirection.HALF_CIRCLE) { + transform.rotate(textDirection.getRadians(), pageWidth / 2f, pageHeight / 2f); + transform.translate(0f, pageHeight); + } else if (textDirection == TextDirection.QUARTER_CIRCLE) { + transform.rotate(textDirection.getRadians(), pageWidth / 2f, pageWidth / 2f); + transform.translate(0f, pageWidth); + } else { + transform.rotate(textDirection.getRadians(), pageHeight / 2f, pageHeight / 2f); + transform.translate(0f, pageWidth); + } + transform.scale(1., -1.); + return transform; + } + + @JsonIgnore public float getXDirAdj() { - return position[0]; + return this.bBoxDirAdj.x; } @JsonIgnore public float getYDirAdj() { - return position[1]; + return this.bBoxDirAdj.y; } @JsonIgnore public float getWidthDirAdj() { - return position[2]; + return this.bBoxDirAdj.width; } @JsonIgnore public float getHeightDir() { - return position[3]; + return this.bBoxDirAdj.height; } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java index 6323205..b7be4e1 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java @@ -1,16 +1,13 @@ package com.knecon.fforesight.service.layoutparser.processor.model.text; -import static java.util.stream.Collectors.toSet; - +import java.awt.geom.Rectangle2D; import java.util.ArrayList; -import 
java.util.Comparator; import java.util.List; import com.fasterxml.jackson.annotation.JsonIgnore; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter; import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType; -import com.knecon.fforesight.service.layoutparser.processor.utils.DoubleComparisons; import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities; import lombok.AllArgsConstructor; @@ -29,34 +26,31 @@ public class TextPageBlock extends AbstractPageBlock { @Builder.Default private List sequences = new ArrayList<>(); - @JsonIgnore - private int rotation; - - @JsonIgnore private String mostPopularWordFont; - @JsonIgnore private String mostPopularWordStyle; - @JsonIgnore private float mostPopularWordFontSize; - @JsonIgnore private float mostPopularWordHeight; - @JsonIgnore private float mostPopularWordSpaceWidth; - @JsonIgnore private float highestFontSize; - @JsonIgnore private PageBlockType classification; - @JsonIgnore private boolean toDuplicate; + public TextPageBlock(List sequences) { + + this.sequences = sequences; + calculateFrequencyCounters(); + calculateBBox(); + } + + @JsonIgnore public TextDirection getDir() { @@ -64,31 +58,40 @@ public class TextPageBlock extends AbstractPageBlock { } - @JsonIgnore - private float getPageHeight() { + private void calculateBBox() { - return sequences.get(0).getPageHeight(); - } - - - @JsonIgnore - private float getPageWidth() { - - return sequences.get(0).getPageWidth(); + if (sequences == null) { + this.bBox = new Rectangle2D.Double(); + this.bBoxInitialUserSpace = new Rectangle2D.Double(); + return; + } + setToBBoxOfComponents(sequences); } public static TextPageBlock merge(List textBlocksToMerge) { - List sequences = textBlocksToMerge.stream().map(TextPageBlock::getSequences).flatMap(java.util.Collection::stream).toList(); + if 
(textBlocksToMerge.isEmpty()) { + throw new IllegalArgumentException("Need to provide at least one TextPageBlock."); + } + if (textBlocksToMerge.stream() + .map(AbstractPageBlock::getPage) + .distinct() + .count() != 1) { + throw new IllegalArgumentException("Cannot merge textBlocks on different pages."); + } + + List sequences = textBlocksToMerge.stream() + .map(TextPageBlock::getSequences) + .flatMap(java.util.Collection::stream) + .toList(); sequences = new ArrayList<>(sequences); - return fromTextPositionSequences(sequences); + + return new TextPageBlock(sequences); } - public static TextPageBlock fromTextPositionSequences(List wordBlockList) { - - TextPageBlock textBlock = null; + private void calculateFrequencyCounters() { FloatFrequencyCounter lineHeightFrequencyCounter = new FloatFrequencyCounter(); FloatFrequencyCounter fontSizeFrequencyCounter = new FloatFrequencyCounter(); @@ -96,7 +99,7 @@ public class TextPageBlock extends AbstractPageBlock { StringFrequencyCounter fontFrequencyCounter = new StringFrequencyCounter(); StringFrequencyCounter styleFrequencyCounter = new StringFrequencyCounter(); - for (TextPositionSequence wordBlock : wordBlockList) { + for (TextPositionSequence wordBlock : sequences) { lineHeightFrequencyCounter.add(wordBlock.getTextHeight()); fontSizeFrequencyCounter.add(wordBlock.getFontSize()); @@ -104,160 +107,23 @@ public class TextPageBlock extends AbstractPageBlock { fontFrequencyCounter.add(wordBlock.getFont()); styleFrequencyCounter.add(wordBlock.getFontStyle()); - if (textBlock == null) { - textBlock = new TextPageBlock(wordBlock.getMinXDirAdj(), - wordBlock.getMaxXDirAdj(), - wordBlock.getMinYDirAdj(), - wordBlock.getMaxYDirAdj(), - wordBlockList, - wordBlock.getRotation()); - } else { - TextPageBlock spatialEntity = textBlock.union(wordBlock); - textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight()); - } } - if (textBlock != null) { - 
textBlock.setMostPopularWordFont(fontFrequencyCounter.getMostPopular()); - textBlock.setMostPopularWordStyle(styleFrequencyCounter.getMostPopular()); - textBlock.setMostPopularWordFontSize(fontSizeFrequencyCounter.getMostPopular()); - textBlock.setMostPopularWordHeight(lineHeightFrequencyCounter.getMostPopular()); - textBlock.setMostPopularWordSpaceWidth(spaceFrequencyCounter.getMostPopular()); - textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest()); - } - - if (textBlock != null && textBlock.getSequences() != null && textBlock.getSequences() - .stream() - .map(t -> DoubleComparisons.round(t.getMinYDirAdj(), 3)) - .collect(toSet()) - .size() == 1) { - textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj)); - } - return textBlock; - } - - - /** - * Returns the minX value in pdf coordinate system. - * Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation. - * 0 -> LowerLeft - * 90 -> UpperLeft - * 180 -> UpperRight - * 270 -> LowerRight - * - * @return the minX value in pdf coordinate system - */ - @JsonIgnore - public float getPdfMinX() { - - if (getDir().getDegrees() == 90) { - return minY; - } else if (getDir().getDegrees() == 180) { - return getPageWidth() - maxX; - - } else if (getDir().getDegrees() == 270) { - - return getPageWidth() - maxY; - } else { - return minX; - } - } - - - /** - * Returns the maxX value in pdf coordinate system. - * Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation. 
- * 0 -> LowerLeft - * 90 -> UpperLeft - * 180 -> UpperRight - * 270 -> LowerRight - * - * @return the maxX value in pdf coordinate system - */ - @JsonIgnore - public float getPdfMaxX() { - - if (getDir().getDegrees() == 90) { - return maxY; - } else if (getDir().getDegrees() == 180) { - return getPageWidth() - minX; - } else if (getDir().getDegrees() == 270) { - return getPageWidth() - minY; - - } else { - return maxX; - } - } - - - /** - * Returns the minY value in pdf coordinate system. - * Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation. - * 0 -> LowerLeft - * 90 -> UpperLeft - * 180 -> UpperRight - * 270 -> LowerRight - * - * @return the minY value in pdf coordinate system - */ - @JsonIgnore - public float getPdfMinY() { - - if (getDir().getDegrees() == 90) { - return minX; - } else if (getDir().getDegrees() == 180) { - return maxY; - - } else if (getDir().getDegrees() == 270) { - return getPageHeight() - maxX; - - } else { - return getPageHeight() - maxY; - } - } - - - /** - * Returns the maxY value in pdf coordinate system. - * Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation. 
- * 0 -> LowerLeft - * 90 -> UpperLeft - * 180 -> UpperRight - * 270 -> LowerRight - * - * @return the maxY value in pdf coordinate system - */ - @JsonIgnore - public float getPdfMaxY() { - - if (getDir().getDegrees() == 90) { - return maxX; - } else if (getDir().getDegrees() == 180) { - - return minY; - } else if (getDir().getDegrees() == 270) { - return getPageHeight() - minX; - } else { - return getPageHeight() - minY; - } - } - - - public TextPageBlock(float minX, float maxX, float minY, float maxY, List sequences, int rotation) { - - this.minX = minX; - this.maxX = maxX; - this.minY = minY; - this.maxY = maxY; - this.sequences = sequences; - this.rotation = rotation; + setMostPopularWordFont(fontFrequencyCounter.getMostPopular()); + setMostPopularWordStyle(styleFrequencyCounter.getMostPopular()); + setMostPopularWordFontSize(fontSizeFrequencyCounter.getMostPopular()); + setMostPopularWordHeight(lineHeightFrequencyCounter.getMostPopular()); + setMostPopularWordSpaceWidth(spaceFrequencyCounter.getMostPopular()); + setHighestFontSize(fontSizeFrequencyCounter.getHighest()); } public TextPageBlock union(TextPositionSequence r) { TextPageBlock union = this.copy(); - union.add(r); + union.getSequences().add(r); + calculateFrequencyCounters(); + calculateBBox(); return union; } @@ -265,64 +131,32 @@ public class TextPageBlock extends AbstractPageBlock { public TextPageBlock union(TextPageBlock r) { TextPageBlock union = this.copy(); - union.add(r); + union.getSequences().addAll(r.getSequences()); + calculateFrequencyCounters(); + calculateBBox(); return union; } public void add(TextPageBlock r) { - if (r.getMinX() < minX) { - minX = r.getMinX(); - } - if (r.getMaxX() > maxX) { - maxX = r.getMaxX(); - } - if (r.getMinY() < minY) { - minY = r.getMinY(); - } - if (r.getMaxY() > maxY) { - maxY = r.getMaxY(); - } sequences.addAll(r.getSequences()); + calculateFrequencyCounters(); + calculateBBox(); } public void add(TextPositionSequence r) { - if (r.getMinXDirAdj() < minX) 
{ - minX = r.getMinXDirAdj(); - } - if (r.getMaxXDirAdj() > maxX) { - maxX = r.getMaxXDirAdj(); - } - if (r.getMinYDirAdj() < minY) { - minY = r.getMinYDirAdj(); - } - if (r.getMaxYDirAdj() > maxY) { - maxY = r.getMaxYDirAdj(); - } + sequences.add(r); + calculateFrequencyCounters(); + calculateBBox(); } public TextPageBlock copy() { - return new TextPageBlock(minX, maxX, minY, maxY, sequences, rotation); - } - - - public void resize(float x1, float y1, float width, float height) { - - set(x1, y1, x1 + width, y1 + height); - } - - - public void set(float x1, float y1, float x2, float y2) { - - this.minX = Math.min(x1, x2); - this.maxX = Math.max(x1, x2); - this.minY = Math.min(y1, y2); - this.maxY = Math.max(y1, y2); + return new TextPageBlock(new ArrayList<>(sequences)); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPositionSequence.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPositionSequence.java index de03144..4df3cdb 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPositionSequence.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPositionSequence.java @@ -1,7 +1,6 @@ package com.knecon.fforesight.service.layoutparser.processor.model.text; -import java.awt.geom.AffineTransform; -import java.awt.geom.Point2D; +import java.awt.geom.Rectangle2D; import java.util.ArrayList; import java.util.List; import java.util.Locale; @@ -9,15 +8,14 @@ import java.util.stream.Collectors; import org.apache.pdfbox.text.TextPosition; -import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point; -import 
com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle; +import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox; +import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; import lombok.AllArgsConstructor; import lombok.Builder; import lombok.Data; import lombok.EqualsAndHashCode; import lombok.NoArgsConstructor; -import lombok.SneakyThrows; import lombok.extern.slf4j.Slf4j; @Slf4j @@ -25,8 +23,8 @@ import lombok.extern.slf4j.Slf4j; @Builder @NoArgsConstructor @AllArgsConstructor -@EqualsAndHashCode(onlyExplicitlyIncluded = true) -public class TextPositionSequence implements CharSequence { +@EqualsAndHashCode(onlyExplicitlyIncluded = true, callSuper = false) +public class TextPositionSequence extends BoundingBox implements CharSequence { public static final int HEIGHT_PADDING = 2; @@ -36,29 +34,38 @@ public class TextPositionSequence implements CharSequence { @EqualsAndHashCode.Include private List textPositions = new ArrayList<>(); + private Rectangle2D bBoxDirAdj; @EqualsAndHashCode.Include private TextDirection dir; private int rotation; private float pageHeight; private float pageWidth; private boolean isParagraphStart; + private boolean strikethrough; + private boolean underline; - public TextPositionSequence(int page) { + public TextPositionSequence(List textPositions, int pageNumber, boolean isParagraphStart) { - this.page = page; - } - - - public TextPositionSequence(List textPositions, int page, boolean isParagraphStart) { - - this.textPositions = textPositions.stream().map(RedTextPosition::fromTextPosition).collect(Collectors.toList()); - this.page = page; + this.textPositions = textPositions.stream() + .map(RedTextPosition::fromTextPosition) + .collect(Collectors.toList()); + this.page = pageNumber; this.dir = TextDirection.fromDegrees(textPositions.get(0).getDir()); this.rotation = textPositions.get(0).getRotation(); this.pageHeight = 
textPositions.get(0).getPageHeight(); this.pageWidth = textPositions.get(0).getPageWidth(); this.isParagraphStart = isParagraphStart; + calculateBBox(); + } + + + private void calculateBBox() { + + this.bBoxDirAdj = textPositions.stream() + .map(RedTextPosition::getBBoxDirAdj) + .collect(RectangleTransformations.collectBBox()); + setToBBoxOfComponents(getTextPositions()); } @@ -70,6 +77,7 @@ public class TextPositionSequence implements CharSequence { this.rotation = textPositions.get(0).getRotation(); this.pageHeight = textPositions.get(0).getPageHeight(); this.pageWidth = textPositions.get(0).getPageWidth(); + calculateBBox(); } @@ -107,7 +115,7 @@ public class TextPositionSequence implements CharSequence { textPositionSequence.rotation = rotation; textPositionSequence.pageHeight = pageHeight; textPositionSequence.pageWidth = pageWidth; - + textPositionSequence.setToBBoxOfComponents(getTextPositions()); return textPositionSequence; } @@ -137,18 +145,18 @@ public class TextPositionSequence implements CharSequence { this.rotation = textPositionSequence.getRotation(); this.pageHeight = textPositionSequence.getPageHeight(); this.pageWidth = textPositionSequence.getPageWidth(); + calculateBBox(); } public void add(TextPosition textPosition) { this.textPositions.add(RedTextPosition.fromTextPosition(textPosition)); - this.dir = TextDirection.fromDegrees(textPositions.get(0).getDir()); this.rotation = textPositions.get(0).getRotation(); this.pageHeight = textPositions.get(0).getPageHeight(); this.pageWidth = textPositions.get(0).getPageWidth(); - + calculateBBox(); } @@ -220,18 +228,6 @@ public class TextPositionSequence implements CharSequence { } - public float getHeight() { - - return getMaxYDirAdj() - getMinYDirAdj(); - } - - - public float getWidth() { - - return getMaxXDirAdj() - getMinXDirAdj(); - } - - public String getFont() { if (textPositions.get(0).getFontName() == null) { @@ -271,54 +267,5 @@ public class TextPositionSequence implements CharSequence { return 
textPositions.get(0).getWidthOfSpace(); } - - /** - * This returns the bounding box of the word in Pdf Coordinate System where {0,0} rotated with the page rotation. - * 0 -> LowerLeft - * 90 -> UpperLeft - * 180 -> UpperRight - * 270 -> LowerRight - * - * @return bounding box of the word in Pdf Coordinate System - */ - - @SneakyThrows - public Rectangle getRectangle() { - - log.debug("Page: '{}', Word: '{}', Rotation: '{}', textRotation {}", page, this, rotation, dir); - - float textHeight = getTextHeight(); - - RedTextPosition firstTextPos = textPositions.get(0); - RedTextPosition lastTextPos = textPositions.get(textPositions.size() - 1); - - Point2D bottomLeft = new Point2D.Double(firstTextPos.getXDirAdj(), firstTextPos.getYDirAdj() - HEIGHT_PADDING); - Point2D topRight = new Point2D.Double(lastTextPos.getXDirAdj() + lastTextPos.getWidthDirAdj(), lastTextPos.getYDirAdj() + textHeight + HEIGHT_PADDING); - - AffineTransform transform = new AffineTransform(); - if (dir == TextDirection.ZERO || dir == TextDirection.HALF_CIRCLE) { - transform.rotate(dir.getRadians(), pageWidth / 2f, pageHeight / 2f); - transform.translate(0f, pageHeight + textHeight); - transform.scale(1., -1.); - } else if (dir == TextDirection.QUARTER_CIRCLE) { - transform.rotate(dir.getRadians(), pageWidth / 2f, pageWidth / 2f); - transform.translate(0f, pageWidth + textHeight); - transform.scale(1., -1.); - } else { - transform.rotate(dir.getRadians(), pageHeight / 2f, pageHeight / 2f); - transform.translate(0f, pageWidth + textHeight); - transform.scale(1., -1.); - } - - bottomLeft = transform.transform(bottomLeft, null); - topRight = transform.transform(topRight, null); - - return new Rectangle( // - new Point((float) bottomLeft.getX(), (float) bottomLeft.getY()), - (float) (topRight.getX() - bottomLeft.getX()), - (float) (topRight.getY() - bottomLeft.getY()), - page); - } - } diff --git 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/adapter/ImageServiceResponseAdapter.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/adapter/ImageServiceResponseAdapter.java index 5b1a61d..968bfbd 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/adapter/ImageServiceResponseAdapter.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/adapter/ImageServiceResponseAdapter.java @@ -9,6 +9,7 @@ import java.util.Map; import org.springframework.stereotype.Service; +import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType; import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage; @@ -54,11 +55,12 @@ public class ImageServiceResponseAdapter { classificationPage.getImages().forEach(image -> { if (image.getImageType().equals(ImageType.OTHER)) { - classificationPage.getTextBlocks().forEach(textblock -> { - if (image.getPosition().contains(textblock.getMinX(), textblock.getMinY(), textblock.getWidth(), textblock.getHeight())) { + for (AbstractPageBlock textblock : classificationPage.getTextBlocks()) { + if (image.getPosition().contains(textblock.getBBoxInitialUserSpace())) { image.setImageType(ImageType.OCR); + return; } - }); + } } }); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/BodyTextFrameService.java 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/BodyTextFrameService.java index 3cd09a8..c2c33dd 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/BodyTextFrameService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/BodyTextFrameService.java @@ -31,8 +31,9 @@ public class BodyTextFrameService { Rectangle bodyTextFrame = calculateBodyTextFrame(classificationDocument.getPages(), classificationDocument.getFontSizeCounter(), false, layoutParsingType); Rectangle landscapeBodyTextFrame = calculateBodyTextFrame(classificationDocument.getPages(), classificationDocument.getFontSizeCounter(), true, layoutParsingType); for (ClassificationPage page : classificationDocument.getPages()) { -// var updatedBodyTextFrame = getBodyTextFrameFromRulings(page, bodyTextFrame, landscapeBodyTextFrame); + var updatedBodyTextFrame = getBodyTextFrameFromRulings(page, bodyTextFrame, landscapeBodyTextFrame); setBodyTextFrameAdjustedToPage(page, bodyTextFrame, landscapeBodyTextFrame); + classificationDocument.getVisualizations().addMainBodyVisualization(page.isLandscape() ? 
landscapeBodyTextFrame : bodyTextFrame, page.getPageNumber()); } } @@ -58,24 +59,26 @@ public class BodyTextFrameService { private List getPotentialFooterRulings(ClassificationPage page) { - return page.getCleanRulings() - .getHorizontal() + return page.getCleanRulings().getHorizontals() .stream() + .filter(ruling -> ruling.getClassification().equals(Ruling.Classification.OTHER)) .filter(ruling -> ruling.getY1() < page.getPageHeight() * RULING_HEIGHT_THRESHOLD) .filter(ruling -> ruling.getWidth() > RULING_WIDTH_THRESHOLD * page.getPageWidth()) .sorted(Comparator.comparingDouble(Ruling::getTop)) + .peek(ruling -> ruling.setClassification(Ruling.Classification.FOOTER_SEPARATOR)) .toList(); } private List getPotentialHeaderRulings(ClassificationPage page) { - return page.getCleanRulings() - .getHorizontal() + return page.getCleanRulings().getHorizontals() .stream() + .filter(ruling -> ruling.getClassification().equals(Ruling.Classification.OTHER)) .filter(ruling -> ruling.getY1() > page.getPageHeight() * (1 - RULING_HEIGHT_THRESHOLD)) .filter(ruling -> ruling.getWidth() > RULING_WIDTH_THRESHOLD * page.getPageWidth()) .sorted(Comparator.comparingDouble(Ruling::getBottom).reversed()) + .peek(ruling -> ruling.setClassification(Ruling.Classification.HEADER_SEPARATOR)) .toList(); } @@ -99,16 +102,16 @@ public class BodyTextFrameService { if (page.getPageWidth() > page.getPageHeight() && page.getRotation() == 270) { textFrame = new Rectangle(new Point(textFrame.getTopLeft().getY(), page.getPageHeight() - textFrame.getTopLeft().getX() - textFrame.getWidth()), - textFrame.getHeight(), - textFrame.getWidth(), - 0); + textFrame.getHeight(), + textFrame.getWidth(), + 0); } else if (page.getPageWidth() > page.getPageHeight() && page.getRotation() != 0) { textFrame = new Rectangle(new Point(textFrame.getTopLeft().getY(), textFrame.getTopLeft().getX()), textFrame.getHeight(), textFrame.getWidth(), page.getPageNumber()); } else if (page.getRotation() == 180) { textFrame = new 
Rectangle(new Point(textFrame.getTopLeft().getX(), page.getPageHeight() - textFrame.getTopLeft().getY() - textFrame.getHeight()), - textFrame.getWidth(), - textFrame.getHeight(), - 0); + textFrame.getWidth(), + textFrame.getHeight(), + 0); } page.setBodyTextFrame(textFrame); } @@ -152,14 +155,17 @@ public class BodyTextFrameService { } if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) || MarkedContentUtils.intersects(textBlock, - page.getMarkedContentBboxPerType(), - MarkedContentUtils.FOOTER)) { + page.getMarkedContentBboxPerType(), + MarkedContentUtils.FOOTER)) { continue; } - float approxLineCount = PositionUtils.getApproxLineCount(textBlock); - if (layoutParsingType.equals(LayoutParsingType.DOCUMINE) && approxLineCount < approximateHeaderLineCount && textBlock.getMaxY() >= page.getPageHeight() - (page.getPageHeight() / 10) || !layoutParsingType.equals( - LayoutParsingType.DOCUMINE) && approxLineCount < approximateHeaderLineCount) { + double approxLineCount = PositionUtils.getApproxLineCount(textBlock); + if (layoutParsingType.equals(LayoutParsingType.DOCUMINE_OLD) // + && approxLineCount < approximateHeaderLineCount // + && textBlock.getMaxY() >= page.getPageHeight() - (page.getPageHeight() / 10)// + || !layoutParsingType.equals(LayoutParsingType.DOCUMINE_OLD) // + && approxLineCount < approximateHeaderLineCount) { continue; } @@ -185,10 +191,10 @@ public class BodyTextFrameService { } } } - return new Rectangle(new Point(expansionsRectangle.minX, expansionsRectangle.minY), - expansionsRectangle.maxX - expansionsRectangle.minX, - expansionsRectangle.maxY - expansionsRectangle.minY, - 0); + return new Rectangle(new Point((float) expansionsRectangle.minX, (float) expansionsRectangle.minY), + (float) (expansionsRectangle.maxX - expansionsRectangle.minX), + (float) (expansionsRectangle.maxY - expansionsRectangle.minY), + 0); } @@ -226,10 +232,10 @@ public class BodyTextFrameService { private class 
BodyTextFrameExpansionsRectangle { - float minX = 10000; - float maxX = -100; - float minY = 10000; - float maxY = -100; + double minX = 10000; + double maxX = -100; + double minY = 10000; + double maxY = -100; } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/GapDetectionService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/GapDetectionService.java index ac7db1d..ae3eac1 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/GapDetectionService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/GapDetectionService.java @@ -44,9 +44,9 @@ public class GapDetectionService { if (yDifference > avgTextPositionHeight * Y_GAP_FACTOR) { yGapContext.addGap(mainBodyTextFrame.getMinX(), - previousTextPositionBBox.getMaxY(), - mainBodyTextFrame.getWidth(), - -(previousTextPositionBBox.getMaxY() - currentTextPositionBBox.getMinY())); + previousTextPositionBBox.getMaxY(), + mainBodyTextFrame.getWidth(), + -(previousTextPositionBBox.getMaxY() - currentTextPositionBBox.getMinY())); } if (yDifference > avgTextPositionHeight * NEW_LINE_FACTOR) { @@ -69,32 +69,37 @@ public class GapDetectionService { private static Rectangle2D toRectangle2D(TextPositionSequence textPosition) { - return mirrorY(RectangleTransformations.toRectangle2D(textPosition.getRectangle())); + return mirrorY(textPosition.getBBox()); } + private static Rectangle2D mirrorY(Rectangle2D rectangle2D) { return new Rectangle2D.Double(rectangle2D.getX(), Math.min(rectangle2D.getMinY(), rectangle2D.getMaxY()), rectangle2D.getWidth(), Math.abs(rectangle2D.getHeight())); } + private static void addGapToLine(Rectangle2D currentTextPosition, Rectangle2D previousTextPosition, 
XGapsContext context) { context.gapsInCurrentLine.add(new Rectangle2D.Double(previousTextPosition.getMaxX(), - previousTextPosition.getMinY(), - currentTextPosition.getMinX() - previousTextPosition.getMaxX(), - (previousTextPosition.getHeight() + currentTextPosition.getHeight()) / 2)); + previousTextPosition.getMinY(), + currentTextPosition.getMinX() - previousTextPosition.getMaxX(), + (previousTextPosition.getHeight() + currentTextPosition.getHeight()) / 2)); } private static void assertAllTextPositionsHaveSameDir(List textPositionSequences) { - assert textPositionSequences.stream().map(TextPositionSequence::getDir).allMatch(a -> a.equals(textPositionSequences.get(0).getDir())); + assert textPositionSequences.stream() + .map(TextPositionSequence::getDir) + .allMatch(a -> a.equals(textPositionSequences.get(0).getDir())); } private static double getAvgTextPositionHeight(List textPositionSequences) { - return textPositionSequences.stream().mapToDouble(TextPositionSequence::getHeight).average().orElseThrow(); + return textPositionSequences.stream() + .mapToDouble(TextPositionSequence::getHeight).average().orElseThrow(); } @@ -142,9 +147,9 @@ public class GapDetectionService { public void addGapToRightEdgeOfMainBody(Rectangle2D textPosition) { Rectangle2D leftGap = new Rectangle2D.Double(textPosition.getMaxX(), - textPosition.getMinY(), - mainBodyTextFrame.getMaxX() - textPosition.getMaxX(), - textPosition.getHeight()); + textPosition.getMinY(), + mainBodyTextFrame.getMaxX() - textPosition.getMaxX(), + textPosition.getHeight()); gapsInCurrentLine.add(leftGap); } @@ -152,9 +157,9 @@ public class GapDetectionService { public void addGapFromLeftEdgeOfMainBody(Rectangle2D textPosition) { Rectangle2D leftGap = new Rectangle2D.Double(mainBodyTextFrame.getMinX(), - textPosition.getMinY(), - textPosition.getMinX() - mainBodyTextFrame.getMinX(), - textPosition.getHeight()); + textPosition.getMinY(), + textPosition.getMinX() - mainBodyTextFrame.getMinX(), + 
textPosition.getHeight()); gapsInCurrentLine.add(leftGap); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/LineDetectionService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/LineDetectionService.java index 8b14767..ddebaef 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/LineDetectionService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/LineDetectionService.java @@ -180,7 +180,7 @@ public class LineDetectionService { private Rectangle2D textPositionBBox(List textPositionSequences) { - return RectangleTransformations.rectangleBBox(textPositionSequences.stream().map(TextPositionSequence::getRectangle).toList()); + return RectangleTransformations.rectangle2DBBox(textPositionSequences.stream().map(TextPositionSequence::getBBox).toList()); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/RulingCleaningService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/RulingCleaningService.java index c51c90b..ffe07f8 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/RulingCleaningService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/RulingCleaningService.java @@ -3,6 +3,7 @@ package com.knecon.fforesight.service.layoutparser.processor.services; import static 
com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators.X_FIRST_RULING_COMPARATOR; import java.awt.geom.Point2D; +import java.awt.geom.Rectangle2D; import java.util.ArrayList; import java.util.HashMap; import java.util.List; @@ -12,9 +13,9 @@ import java.util.stream.Collectors; import org.springframework.stereotype.Service; import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; -import com.knecon.fforesight.service.layoutparser.processor.model.table.Rectangle; import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableCells; +import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; import com.knecon.fforesight.service.layoutparser.processor.utils.UnionFind; import lombok.RequiredArgsConstructor; @@ -31,7 +32,7 @@ public class RulingCleaningService { private static final float THRESHOLD_Y_HORIZONTAL = 3; - public CleanRulings getCleanRulings(List tableCells, List rulings) { + public CleanRulings deduplicateAndStraightenRulings(List tableCells, List rulings) { Rulings verticalAndHorizontalRulingLines; @@ -45,43 +46,43 @@ public class RulingCleaningService { verticalAndHorizontalRulingLines.horizontalLines.sort(X_FIRST_RULING_COMPARATOR); verticalAndHorizontalRulingLines = cleanRulings(verticalAndHorizontalRulingLines); - return CleanRulings.builder().vertical(verticalAndHorizontalRulingLines.verticalLines()).horizontal(verticalAndHorizontalRulingLines.horizontalLines()).build(); + return new CleanRulings(verticalAndHorizontalRulingLines.horizontalLines(), verticalAndHorizontalRulingLines.verticalLines()); } private Rulings cleanRulings(Rulings rulings) { - List> groupedOverlappingVerticalRectangles = groupOverlappingRectangles(rulings.verticalLines.stream() - .map(RulingCleaningService::getOverlapRectangle) - .distinct() - .toList()); - List cleanedVerticalRulings = 
groupedOverlappingVerticalRectangles.stream() - .map(rectList -> getXCenteredRuling(Rectangle.boundingBoxOf(rectList))) - .toList(); - - List> groupedOverlappingHorizontalRectangles = groupOverlappingRectangles(rulings.horizontalLines.stream() + List> groupedOverlappingVerticalRectangles = groupOverlappingRectangles(rulings.verticalLines.stream() .map(RulingCleaningService::getOverlapRectangle) .distinct() .toList()); + List cleanedVerticalRulings = groupedOverlappingVerticalRectangles.stream() + .map(rectList -> getXCenteredRuling(RectangleTransformations.rectangle2DBBox(rectList))) + .toList(); + + List> groupedOverlappingHorizontalRectangles = groupOverlappingRectangles(rulings.horizontalLines.stream() + .map(RulingCleaningService::getOverlapRectangle) + .distinct() + .toList()); List cleanedHorizontalRulings = groupedOverlappingHorizontalRectangles.stream() - .map(rectList -> getYCenteredRuling(Rectangle.boundingBoxOf(rectList))) + .map(rectList -> getYCenteredRuling(RectangleTransformations.rectangle2DBBox(rectList))) .collect(Collectors.toList()); return new Rulings(cleanedVerticalRulings, cleanedHorizontalRulings); } - private List> groupOverlappingRectangles(List rectangles) { + private List> groupOverlappingRectangles(List rectangles) { - UnionFind unionFind = new UnionFind<>(); + UnionFind unionFind = new UnionFind<>(); for (int i = 0; i < rectangles.size(); i++) { for (int j = i + 1; j < rectangles.size(); j++) { - Rectangle rectangle1 = rectangles.get(i); - Rectangle rectangle2 = rectangles.get(j); + Rectangle2D rectangle1 = rectangles.get(i); + Rectangle2D rectangle2 = rectangles.get(j); // we can stop early when we are too far off because of x-y-sorting - if(rectangle1.getRight() < rectangle2.getLeft() && rectangle1.getBottom() < rectangle2.getTop()) { + if (rectangle1.getMaxX() < rectangle2.getMinX() && rectangle1.getMaxY() < rectangle2.getMinY()) { break; } @@ -91,66 +92,66 @@ public class RulingCleaningService { } } - Map> groups = new HashMap<>(); 
- for (Rectangle rectangle : rectangles) { - Rectangle root = unionFind.find(rectangle); + Map> groups = new HashMap<>(); + for (Rectangle2D rectangle : rectangles) { + Rectangle2D root = unionFind.find(rectangle); groups.computeIfAbsent(root, k -> new ArrayList<>()).add(rectangle); } return new ArrayList<>(groups.values()); } - private static Rectangle getOverlapRectangle(Ruling ruling) { + private static Rectangle2D getOverlapRectangle(Ruling ruling) { - float top; - float left; + float y; + float x; float w; float h; if (ruling.x1 < ruling.x2) { - left = ruling.x1; + x = ruling.x1; w = ruling.x2 - ruling.x1; } else { - left = ruling.x2; + x = ruling.x2; w = ruling.x1 - ruling.x2; } if (ruling.y1 < ruling.y2) { - top = ruling.y1; + y = ruling.y1; h = ruling.y2 - ruling.y1; } else { - top = ruling.y2; + y = ruling.y2; h = ruling.y1 - ruling.y2; } - if (ruling.horizontal()) { - return new Rectangle(top - THRESHOLD_Y_HORIZONTAL, left - THRESHOLD_X_HORIZONTAL, w + 2 * THRESHOLD_X_HORIZONTAL, h + 2 * THRESHOLD_Y_HORIZONTAL); + if (ruling.isHorizontal()) { + return new Rectangle2D.Double(x - THRESHOLD_X_HORIZONTAL, y - THRESHOLD_Y_HORIZONTAL, w + 2 * THRESHOLD_X_HORIZONTAL, h + 2 * THRESHOLD_Y_HORIZONTAL); } else { - return new Rectangle(top - THRESHOLD_Y_VERTICAL, left - THRESHOLD_X_VERTICAL, w + 2 * THRESHOLD_X_VERTICAL, h + 2 * THRESHOLD_Y_VERTICAL); + return new Rectangle2D.Double(x - THRESHOLD_X_VERTICAL, y - THRESHOLD_Y_VERTICAL, w + 2 * THRESHOLD_X_VERTICAL, h + 2 * THRESHOLD_Y_VERTICAL); } } - public static Ruling getXCenteredRuling(Rectangle rectangle) { + public static Ruling getXCenteredRuling(Rectangle2D rectangle) { - float x = (float) rectangle.getCenterX(); - float y1 = rectangle.getTop(); - float y2 = rectangle.getBottom(); + double x = rectangle.getCenterX(); + double y1 = rectangle.getMinY(); + double y2 = rectangle.getMaxY(); - Point2D point1 = new Point2D.Float(x, y1 + THRESHOLD_Y_VERTICAL); - Point2D point2 = new Point2D.Float(x, y2 - 
THRESHOLD_Y_VERTICAL); + Point2D point1 = new Point2D.Double(x, y1 + THRESHOLD_Y_VERTICAL); + Point2D point2 = new Point2D.Double(x, y2 - THRESHOLD_Y_VERTICAL); return new Ruling(point1, point2); } - public static Ruling getYCenteredRuling(Rectangle rectangle) { + public static Ruling getYCenteredRuling(Rectangle2D rectangle) { - float x1 = rectangle.getLeft(); - float x2 = rectangle.getRight(); - float y = (float) rectangle.getCenterY(); + double x1 = rectangle.getX(); + double x2 = rectangle.getMaxX(); + double y = rectangle.getCenterY(); - Point2D point1 = new Point2D.Float(x1 + THRESHOLD_X_HORIZONTAL, y); - Point2D point2 = new Point2D.Float(x2 - THRESHOLD_X_HORIZONTAL, y); + Point2D point1 = new Point2D.Double(x1 + THRESHOLD_X_HORIZONTAL, y); + Point2D point2 = new Point2D.Double(x2 - THRESHOLD_X_HORIZONTAL, y); return new Ruling(point1, point2); } @@ -160,14 +161,14 @@ public class RulingCleaningService { List vrs = new ArrayList<>(); for (Ruling vr : rulings) { - if (vr.vertical()) { + if (vr.isVertical()) { vrs.add(vr); } } List hrs = new ArrayList<>(); for (Ruling hr : rulings) { - if (hr.horizontal()) { + if (hr.isHorizontal()) { hrs.add(hr); } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/SectionsBuilderService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/SectionsBuilderService.java index fec1b29..8f22568 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/SectionsBuilderService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/SectionsBuilderService.java @@ -71,7 +71,8 @@ public class SectionsBuilderService { chunkBlockList.add(chunkBlock); chunkWords = new ArrayList<>(); if 
(!chunkBlock.getTables().isEmpty()) { - previousTable = chunkBlock.getTables().get(chunkBlock.getTables().size() - 1); + previousTable = chunkBlock.getTables() + .get(chunkBlock.getTables().size() - 1); } } if (current instanceof TablePageBlock table) { @@ -106,11 +107,12 @@ public class SectionsBuilderService { List sections = new ArrayList<>(); for (var page : document.getPages()) { - page.getTextBlocks().forEach(block -> { - block.setPage(page.getPageNumber()); - var section = buildTextBlock(List.of(block), Strings.EMPTY); - sections.add(section); - }); + page.getTextBlocks() + .forEach(block -> { + block.setPage(page.getPageNumber()); + var section = buildTextBlock(List.of(block), Strings.EMPTY); + sections.add(section); + }); } document.setSections(sections); } @@ -155,10 +157,10 @@ public class SectionsBuilderService { } } for (ClassificationSection section : sectionsOnPage) { - Float xMin = null; - Float yMin = null; - Float xMax = null; - Float yMax = null; + Double xMin = null; + Double yMin = null; + Double xMax = null; + Double yMax = null; for (AbstractPageBlock abs : section.getPageBlocks()) { if (abs.getPage() != page.getPageNumber()) { @@ -202,8 +204,14 @@ public class SectionsBuilderService { log.debug("Image position x: {}, y: {}", image.getPosition().getX(), image.getPosition().getY()); log.debug("Paragraph position xMin: {}, xMax: {}, yMin: {}, yMax: {}", xMin, xMax, yMin, yMax); - if (xMin != null && xMax != null && yMin != null && yMax != null && image.getPosition().getX() >= xMin && image.getPosition() - .getX() <= xMax && image.getPosition().getY() >= yMin && image.getPosition().getY() <= yMax) { + if (xMin != null + && xMax != null + && yMin != null + && yMax != null + && image.getPosition().getX() >= xMin + && image.getPosition().getX() <= xMax + && image.getPosition().getY() >= yMin + && image.getPosition().getY() <= yMax) { section.getImages().add(image); image.setAppendedToSection(true); break; @@ -226,17 +234,26 @@ public class 
SectionsBuilderService { List previousTableNonHeaderRow = getRowWithNonHeaderCells(previousTable); List tableNonHeaderRow = getRowWithNonHeaderCells(currentTable); // Allow merging of tables if header row is separated from first logical non-header row - if (previousTableNonHeaderRow.isEmpty() && previousTable.getRowCount() == 1 && previousTable.getRows().get(0).size() == tableNonHeaderRow.size()) { - previousTableNonHeaderRow = previousTable.getRows().get(0).stream().map(cell -> { - Cell fakeCell = new Cell(cell.getPoints()[0], cell.getPoints()[2]); - fakeCell.setHeaderCells(Collections.singletonList(cell)); - return fakeCell; - }).collect(Collectors.toList()); + if (previousTableNonHeaderRow.isEmpty() + && previousTable.getRowCount() == 1 + && previousTable.getRows() + .get(0).size() == tableNonHeaderRow.size()) { + previousTableNonHeaderRow = previousTable.getRows() + .get(0) + .stream() + .map(cell -> { + Cell fakeCell = Cell.copy(cell); + fakeCell.setHeaderCells(Collections.singletonList(cell)); + return fakeCell; + }) + .collect(Collectors.toList()); } if (previousTableNonHeaderRow.size() == tableNonHeaderRow.size()) { for (int i = currentTable.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table - List row = currentTable.getRows().get(i); - if (row.size() == tableNonHeaderRow.size() && row.stream().allMatch(cell -> cell.getHeaderCells().isEmpty())) { + List row = currentTable.getRows() + .get(i); + if (row.size() == tableNonHeaderRow.size() && row.stream() + .allMatch(cell -> cell.getHeaderCells().isEmpty())) { for (int j = 0; j < row.size(); j++) { row.get(j).setHeaderCells(previousTableNonHeaderRow.get(j).getHeaderCells()); } @@ -279,7 +296,11 @@ public class SectionsBuilderService { private boolean hasInvalidHeaderInformation(TablePageBlock table) { - return table.getRows().stream().flatMap(row -> row.stream().filter(cell -> !cell.getHeaderCells().isEmpty())).findAny().isEmpty(); + return table.getRows() + .stream() + 
.flatMap(row -> row.stream() + .filter(cell -> !cell.getHeaderCells().isEmpty())) + .findAny().isEmpty(); } @@ -287,7 +308,8 @@ public class SectionsBuilderService { private List getRowWithNonHeaderCells(TablePageBlock table) { for (int i = table.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table - List row = table.getRows().get(i); + List row = table.getRows() + .get(i); if (row.size() == 1) { continue; } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TableExtractionService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TableExtractionService.java index 4af2a04..b28a80b 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TableExtractionService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TableExtractionService.java @@ -3,6 +3,8 @@ package com.knecon.fforesight.service.layoutparser.processor.services; import static com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators.CELL_SIZE_COMPARATOR; import static com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators.RECTANGLE_SIZE_COMPARATOR; +import java.awt.geom.AffineTransform; +import java.awt.geom.Rectangle2D; import java.util.ArrayList; import java.util.HashSet; import java.util.List; @@ -11,22 +13,26 @@ import java.util.stream.Collectors; import org.springframework.stereotype.Service; +import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; import 
com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; -import com.knecon.fforesight.service.layoutparser.processor.model.table.Rectangle; import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms; import com.knecon.fforesight.service.layoutparser.processor.utils.DoubleComparisons; +import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation; import com.knecon.fforesight.service.layoutparser.processor.utils.RectangularIntersectionFinder; import com.knecon.fforesight.service.layoutparser.processor.utils.SpreadsheetFinder; +import lombok.SneakyThrows; + @Service public class TableExtractionService { private static final int MAX_TABLE_CONTAINED_CELLS_WITH_TEXT = 1; - private static final int TEXT_BLOCK_CONTAINMENT_TOLERANCE = 2; private static final double TABLE_UNIFORMITY_THRESHOLD = 0.7; @@ -59,29 +65,31 @@ public class TableExtractionService { } } - var cells = new ArrayList<>(new HashSet<>(emptyCells)); - DoubleComparisons.sort(cells, Rectangle.ILL_DEFINED_ORDER); + List cells = new ArrayList<>(new HashSet<>(emptyCells)); + DoubleComparisons.sort(cells, BoundingBox.ILL_DEFINED_ORDER); - List spreadsheetAreas = SpreadsheetFinder.findSpreadsheetsFromCells(cells); + List spreadsheetAreas = SpreadsheetFinder.findSpreadsheetsFromCells(cells); // sort spreadsheetAreas by size (height * width) ascending so that cells are placed in the smallest tables first // this way no cell duplication occurs when tables are contained in other tables and only the most inner table contains the cells spreadsheetAreas.sort(RECTANGLE_SIZE_COMPARATOR); List tables = new ArrayList<>(); - 
for (Rectangle area : spreadsheetAreas) { + for (Rectangle2D area : spreadsheetAreas) { List containedCells = new ArrayList<>(); for (Cell c : cells) { - if (c.hasMinimumSize() && area.contains(c)) { + if (c.hasMinimumSize() && area.contains(c.getBBoxInitialUserSpace())) { containedCells.add(c); } } - var containedCellsWithText = containedCells.stream().filter(cell -> !cell.getTextBlocks().isEmpty()).toList(); + var containedCellsWithText = containedCells.stream() + .filter(cell -> !cell.getTextBlocks().isEmpty()) + .toList(); // verify if table would contain fewer cells with text than the threshold allows if (containedCellsWithText.size() >= MAX_TABLE_CONTAINED_CELLS_WITH_TEXT && checkIfTableCellsAreUniform(containedCells)) { - tables.add(new TablePageBlock(containedCells, area, page.getRotation())); + tables.add(new TablePageBlock(containedCells, page.getRotation())); cells.removeAll(containedCells); } } @@ -90,14 +98,18 @@ public class TableExtractionService { int position = -1; for (AbstractPageBlock pageBlock : page.getTextBlocks()) { - if (pageBlock instanceof TextPageBlock ? table.containsBlock((TextPageBlock) pageBlock) : table.contains(pageBlock) && position == -1) { + if (pageBlock instanceof TextPageBlock ? 
table.contains(pageBlock) : table.contains(pageBlock) && position == -1) { position = page.getTextBlocks().indexOf(pageBlock); } } if (position != -1) { page.getTextBlocks().add(position, table); - var toBeRemoved = table.getCells().stream().map(Cell::getTextBlocks).flatMap(List::stream).toList(); + var toBeRemoved = table.getCells() + .stream() + .map(Cell::getTextBlocks) + .flatMap(List::stream) + .toList(); // remove text blocks from the page that were also added with the table (from its contained cells) page.getTextBlocks().removeAll(toBeRemoved); } @@ -112,7 +124,7 @@ public class TableExtractionService { } Map> cellsGroupedByRoundedWidth = containedCells.stream() - .map(Rectangle::getWidth) + .map(BoundingBox::getWidth) .map(size -> Math.round(size / 10.0) * 10) .collect(Collectors.groupingBy(Long::longValue)); @@ -122,22 +134,26 @@ public class TableExtractionService { private boolean doesCellContainTextBlock(Cell cell, TextPageBlock textBlock) { - double x = textBlock.getPdfMinX(); - double y = textBlock.getPdfMinY(); - double w = textBlock.getPdfMaxX() - textBlock.getPdfMinX(); - double h = textBlock.getPdfMaxY() - textBlock.getPdfMinY(); - if (cell.isEmpty() || w <= 0 || h <= 0) { - return false; - } - double x0 = cell.getX(); - double y0 = cell.getY(); - return (x >= x0 - TEXT_BLOCK_CONTAINMENT_TOLERANCE && y >= y0 - TEXT_BLOCK_CONTAINMENT_TOLERANCE && (x + w) <= x0 + cell.getWidth() + 2 * TEXT_BLOCK_CONTAINMENT_TOLERANCE && (y + h) <= y0 + cell.getHeight() + 2 * TEXT_BLOCK_CONTAINMENT_TOLERANCE); + return cell.contains(textBlock, RedTextPosition.HEIGHT_PADDING); } - public static List findCells(List horizontalRulingLines, List verticalRulingLines) { + @SneakyThrows + public static List findCells(List horizontalRulingLines, List verticalRulingLines, PageInformation pageInformation) { - return RectangularIntersectionFinder.find(horizontalRulingLines, verticalRulingLines).stream().map(Cell::new).collect(Collectors.toList()); + AffineTransform 
affineTransform = CoordinateTransforms.calculateInitialUserSpaceCoordsToImageCoords(pageInformation, 1); + /* + switch (pageInformation.rotationDegrees()) { + case 90 -> affineTransform.translate(RedTextPosition.HEIGHT_PADDING, 0); //although this is wrong, our text coordinates are wrong as well + case 180 -> affineTransform.translate(0, RedTextPosition.HEIGHT_PADDING); + case 270 -> affineTransform.translate(-RedTextPosition.HEIGHT_PADDING, 0); + default -> affineTransform.translate(0, -RedTextPosition.HEIGHT_PADDING); + } + */ + return RectangularIntersectionFinder.find(horizontalRulingLines, verticalRulingLines) + .stream() + .map(rect -> new Cell(rect, affineTransform)) + .collect(Collectors.toList()); } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TextRulingsClassifier.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TextRulingsClassifier.java new file mode 100644 index 0000000..7fc2d40 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TextRulingsClassifier.java @@ -0,0 +1,99 @@ +package com.knecon.fforesight.service.layoutparser.processor.services; + +import java.util.List; + +import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; +import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; + +import lombok.experimental.UtilityClass; + +@UtilityClass +public class TextRulingsClassifier { + + private final static double STRIKETHROUGH_ZONE = 0.5; // multiplied with text height, determines height of intersection interval for strikethrough lines. 
+ private final static double UNDERLINE_ZONE = 0.2; // multiplied with text height, determines height of intersection interval of underline lines. + private final static double TEXT_BBOX_THRESHOLD_FACTOR = 0.15; // multiplied with text width then subtracted from word width. If ruling covers this width, it is considered as strikethrough/underline. + + + public static void classifyUnderlinedAndStrikethroughText(List words, CleanRulings cleanRulings) { + + for (TextPositionSequence word : words) { + if (word.getDir().equals(TextDirection.ZERO) || word.getDir().equals(TextDirection.HALF_CIRCLE)) { + handleHorizontalText(cleanRulings, word); + } else { + handleVerticalText(cleanRulings, word); + } + } + } + + + private static void handleVerticalText(CleanRulings cleanRulings, TextPositionSequence word) { + + float lowerY = (float) (word.getBBoxInitialUserSpace().getMinY() + TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth()); + float upperY = (float) (word.getBBoxInitialUserSpace().getMaxY() - TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth()); + + float strikethroughCenterX = (float) word.getBBoxInitialUserSpace().getCenterX(); + float strikethroughBoxHeight = (float) ((word.getBBoxDirAdj().getHeight() * STRIKETHROUGH_ZONE) / 2); + + float underlineCenterX = (float) (word.getDir().equals(TextDirection.QUARTER_CIRCLE) ? 
word.getBBoxInitialUserSpace().getMaxX() : word.getBBoxInitialUserSpace().getMinX()); + float underlineBoxHeight = (float) ((word.getBBoxDirAdj().getHeight() * UNDERLINE_ZONE) / 2); + + float leftX = Math.min(underlineCenterX - underlineBoxHeight, strikethroughCenterX - strikethroughBoxHeight); + float rightX = Math.max(underlineCenterX + underlineBoxHeight, strikethroughCenterX + strikethroughBoxHeight); + + List rulingsIntersectingWord = cleanRulings.getVerticalsInXInterval(leftX, rightX) + .stream() + .filter(ruling -> ruling.getClassification().equals(Ruling.Classification.OTHER)) + .filter(ruling -> ruling.y1 <= lowerY && upperY <= ruling.y2) + .toList(); + + for (Ruling ruling : rulingsIntersectingWord) { + if (strikethroughCenterX - strikethroughBoxHeight < ruling.x1 && ruling.x1 < strikethroughCenterX + strikethroughBoxHeight) { + ruling.setClassification(Ruling.Classification.STRIKETROUGH); + word.setStrikethrough(true); + } + + if (underlineCenterX - underlineBoxHeight < ruling.x1 && ruling.x1 < underlineCenterX + underlineBoxHeight) { + ruling.setClassification(Ruling.Classification.UNDERLINE); + word.setUnderline(true); + } + } + } + + + private static void handleHorizontalText(CleanRulings cleanRulings, TextPositionSequence word) { + + float leftX = (float) (word.getBBoxInitialUserSpace().getMinX() + TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth()); + float rightX = (float) (word.getBBoxInitialUserSpace().getMaxX() - TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth()); + + float strikethroughCenterY = (float) word.getBBoxInitialUserSpace().getCenterY(); + float strikethroughBoxHeight = (float) ((word.getBBoxDirAdj().getHeight() * STRIKETHROUGH_ZONE) / 2); + + float underlineCenterY = (float) (word.getDir().equals(TextDirection.ZERO) ? 
word.getBBoxInitialUserSpace().getMinY() : word.getBBoxInitialUserSpace().getMaxY()); + float underlineBoxHeight = (float) ((word.getBBoxDirAdj().getHeight() * UNDERLINE_ZONE) / 2); + + float lowerY = Math.min(underlineCenterY - underlineBoxHeight, strikethroughCenterY - strikethroughBoxHeight); + float upperY = Math.max(underlineCenterY + underlineBoxHeight, strikethroughCenterY + strikethroughBoxHeight); + + List rulingsIntersectingWord = cleanRulings.getHorizontalsInYInterval(lowerY, upperY) + .stream() + .filter(ruling -> ruling.getClassification().equals(Ruling.Classification.OTHER)) + .filter(ruling -> ruling.x1 <= leftX && rightX <= ruling.x2) + .toList(); + + for (Ruling ruling : rulingsIntersectingWord) { + if (strikethroughCenterY - strikethroughBoxHeight < ruling.y1 && ruling.y1 < strikethroughCenterY + strikethroughBoxHeight) { + ruling.setClassification(Ruling.Classification.STRIKETROUGH); + word.setStrikethrough(true); + } + + if (underlineCenterY - underlineBoxHeight < ruling.y1 && ruling.y1 < underlineCenterY + underlineBoxHeight) { + ruling.setClassification(Ruling.Classification.UNDERLINE); + word.setUnderline(true); + } + } + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java index 9c087a1..4290da9 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java @@ -1,7 +1,5 @@ package 
com.knecon.fforesight.service.layoutparser.processor.services.blockification; -import static java.util.stream.Collectors.toSet; - import java.util.ArrayList; import java.util.Comparator; import java.util.List; @@ -9,21 +7,17 @@ import java.util.ListIterator; import org.springframework.stereotype.Service; +import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; import com.knecon.fforesight.service.layoutparser.processor.docstrum.DocstrumSegmentationService; import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone; import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.DoubleUtils; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; -import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter; -import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; -import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock; -import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; -import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; -import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil; +import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations; import lombok.RequiredArgsConstructor; @@ -37,48 +31,76 @@ public class DocstrumBlockificationService { static final float THRESHOLD = 1f; - public ClassificationPage 
blockify(List textPositions, List cells, boolean xyOrder) { + public ClassificationPage blockify(List textPositions, + CleanRulings rulings, + boolean xyOrder, + LayoutparsingVisualizations visualizations, + LayoutParsingType layoutParsingType) { - CleanRulings usedRulings = RectangleTransformations.extractRulings(cells); + CleanRulings usedRulings = rulings.withoutTextRulings(); - var zones = docstrumSegmentationService.segmentPage(textPositions, xyOrder); - var pageBlocks = toAbstractPageBlocks(zones, usedRulings.getHorizontal(), usedRulings.getVertical(), xyOrder); + var zones = docstrumSegmentationService.segmentPage(textPositions, xyOrder, usedRulings, visualizations); + + if (!textPositions.isEmpty()) { + visualizations.addZoneVisualizations(zones, textPositions.get(0).getPage()); + visualizations.addLineVisualizationsFromZones(zones, textPositions.get(0).getPage()); + visualizations.addCharactersWithNeighbours(zones, textPositions.get(0).getPage()); + } + + var pageBlocks = toAbstractPageBlocks(zones, xyOrder, usedRulings); + + if (xyOrder) { + sortPageBlocksXThenY(pageBlocks); + } var classificationPage = new ClassificationPage(pageBlocks); + classificationPage.setCleanRulings(rulings); - mergeIntersectingBlocks(classificationPage.getTextBlocks(), 0, 0); + mergeIntersectingBlocks(classificationPage, usedRulings, 0, 0); + + if (layoutParsingType == LayoutParsingType.DOCUMINE || layoutParsingType == LayoutParsingType.REDACT_MANAGER) { + combineBlocks(classificationPage); + } + + if (layoutParsingType == LayoutParsingType.CLARIFYND) { + mergeIntersectingBlocks(classificationPage, usedRulings, 0, 6.5f); + } return classificationPage; } - private List toAbstractPageBlocks(List zones, List horizontalRulings, List verticalRulings, boolean xyOrder) { + private static void sortPageBlocksXThenY(List pageBlocks) { + + pageBlocks.sort(Comparator.comparing(AbstractPageBlock::getMinY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)) + 
.thenComparing(AbstractPageBlock::getMinX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))); + pageBlocks.sort(new Comparator() { + @Override + public int compare(AbstractPageBlock o1, AbstractPageBlock o2) { + + return Math.abs(o1.getMinY() - o2.getMinY()) < 5 && o1.getMinX() < o2.getMinX() == true ? -1 : 0; + } + }); + } + + + private List toAbstractPageBlocks(List zones, boolean xyOrder, CleanRulings usedRulings) { List abstractPageBlocks = new ArrayList<>(); zones.forEach(zone -> { List textPositionSequences = new ArrayList<>(); - zone.getLines().forEach(line -> { - line.getWords().forEach(word -> { - textPositionSequences.add(new TextPositionSequence(word.getTextPositions(), word.getPage())); - }); - }); + zone.getLines() + .forEach(line -> { + line.getWords() + .forEach(word -> { + textPositionSequences.add(new TextPositionSequence(word.getTextPositions(), word.getPage())); + }); + }); - abstractPageBlocks.addAll(splitZonesAtRulings(textPositionSequences, horizontalRulings, verticalRulings)); + abstractPageBlocks.add(buildTextBlock(textPositionSequences, 0)); }); - if (xyOrder) { - abstractPageBlocks.sort(Comparator.comparing(AbstractPageBlock::getMinY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)) - .thenComparing(AbstractPageBlock::getMinX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))); - abstractPageBlocks.sort(new Comparator() { - @Override - public int compare(AbstractPageBlock o1, AbstractPageBlock o2) { - - return Math.abs(o1.getMinY() - o2.getMinY()) < 5 && o1.getMinX() < o2.getMinX() == true ? 
-1 : 0; - } - }); - } - return abstractPageBlocks; } @@ -87,6 +109,7 @@ public class DocstrumBlockificationService { TextPageBlock previous = new TextPageBlock(); ListIterator itty = page.getTextBlocks().listIterator(); + CleanRulings usedRulings = page.getCleanRulings().withoutTextRulings(); while (itty.hasNext()) { AbstractPageBlock block = itty.next(); @@ -98,7 +121,7 @@ public class DocstrumBlockificationService { if (previous != null && !previous.getSequences().isEmpty()) { - if (current.getDir() != previous.getDir()) { + if (current.getDir() != previous.getDir() || usedRulings.lineBetween(current, previous)) { previous = current; continue; } @@ -108,7 +131,7 @@ public class DocstrumBlockificationService { continue; } - if (previous.almostIntersects(current, 0, 0)) { + if (previous.intersects(current)) { previous = combineBlocksAndResetIterator(previous, current, itty, previous.isToDuplicate()); continue; } @@ -127,15 +150,15 @@ public class DocstrumBlockificationService { previous = current; } - mergeIntersectingBlocks(page.getTextBlocks(), 0, 6.5f); + mergeIntersectingBlocks(page, usedRulings, 0, 6.5f); } private boolean isOnlyIntersectingYAndOnelineOrPrevoiusTwoLines(TextPageBlock previous, TextPageBlock current, ClassificationPage page) { return current.intersectsY(previous) // - && (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1 || previous.getNumberOfLines() == 2 && current.getNumberOfLines() == 1) // - && numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, page.getTextBlocks()) <= 0; + && (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1 || previous.getNumberOfLines() == 2 && current.getNumberOfLines() == 1) // + && numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, page.getTextBlocks()) <= 0; } @@ -144,16 +167,16 @@ public class DocstrumBlockificationService { ClassificationPage page) { return (Math.abs(previous.getMaxY() - current.getMaxY()) < THRESHOLD || 
Math.abs(previous.getMinY() - current.getMinY()) < THRESHOLD) // - && (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1 || previous.getNumberOfLines() >= 2 && current.getNumberOfLines() == 1) // - && !hasBetween(current, previous, page.getTextBlocks()) && numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, page.getTextBlocks()) <= 4; + && (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1 || previous.getNumberOfLines() >= 2 && current.getNumberOfLines() == 1) // + && !hasBetween(current, previous, page.getTextBlocks()) && numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, page.getTextBlocks()) <= 4; } private boolean areTheOnlyTwoBlocksOnHeightsWithBothMoreThanTwoLines(TextPageBlock previous, TextPageBlock current, ClassificationPage page) { return previous.getNumberOfLines() >= 2 && current.getNumberOfLines() >= 2 // - && previous.intersectsY(current) // - && numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, page.getTextBlocks()) == 0; + && previous.intersectsY(current) // + && numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, page.getTextBlocks()) == 0; } @@ -208,12 +231,13 @@ public class DocstrumBlockificationService { } - public void mergeIntersectingBlocks(List blocks, float xThreshold, float yThreshold) { + public void mergeIntersectingBlocks(ClassificationPage page, CleanRulings usedRulings, float xThreshold, float yThreshold) { + var blocks = page.getTextBlocks(); ListIterator itty = blocks.listIterator(); while (itty.hasNext()) { AbstractPageBlock block = itty.next(); - if(block == null){ + if (block == null) { continue; } if (block instanceof TablePageBlock) { @@ -224,7 +248,7 @@ public class DocstrumBlockificationService { for (int i = 0; i < blocks.size(); i++) { - if(blocks.get(i) == null){ + if (blocks.get(i) == null) { continue; } if (blocks.get(i) == current) { @@ -236,7 +260,11 @@ public class DocstrumBlockificationService { 
TextPageBlock inner = (TextPageBlock) blocks.get(i); - if (current.getDir() == inner.getDir() && current.almostIntersects(inner, yThreshold, xThreshold)) { + if (usedRulings.lineBetween(current, blocks.get(i))) { + continue; + } + + if (current.getDir() == inner.getDir() && current.intersects(inner, yThreshold, xThreshold)) { boolean toDuplicate = current.isToDuplicate() || inner.isToDuplicate(); current.getSequences().addAll(inner.getSequences()); @@ -249,181 +277,17 @@ public class DocstrumBlockificationService { } } var blocksIterator = blocks.iterator(); - while(blocksIterator.hasNext()){ - if(blocksIterator.next() == null){ + while (blocksIterator.hasNext()) { + if (blocksIterator.next() == null) { blocksIterator.remove(); } } } - public List splitZonesAtRulings(List textPositions, List horizontalRulingLines, List verticalRulingLines) { - - int indexOnPage = 0; - List chunkWords = new ArrayList<>(); - List chunkBlockList = new ArrayList<>(); - - float minX = 1000, maxX = 0, minY = 1000, maxY = 0; - TextPositionSequence prev = null; - - for (TextPositionSequence word : textPositions) { - - boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, horizontalRulingLines, verticalRulingLines); - boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir()); - - if (prev != null && (splitByDir || isSplitByRuling)) { - - TextPageBlock cb1 = buildTextBlock(chunkWords, indexOnPage); - indexOnPage++; - - chunkBlockList.add(cb1); - chunkWords = new ArrayList<>(); - - minX = 1000; - maxX = 0; - minY = 1000; - maxY = 0; - prev = null; - } - - chunkWords.add(word); - - prev = word; - if (word.getMinXDirAdj() < minX) { - minX = word.getMinXDirAdj(); - } - if (word.getMaxXDirAdj() > maxX) { - maxX = word.getMaxXDirAdj(); - } - if (word.getMinYDirAdj() < minY) { - minY = word.getMinYDirAdj(); - } - if (word.getMaxYDirAdj() > maxY) { - maxY = word.getMaxYDirAdj(); - } - } - - TextPageBlock cb1 = buildTextBlock(chunkWords, indexOnPage); - if (cb1 != 
null) { - chunkBlockList.add(cb1); - } - - return chunkBlockList; - } - - - private boolean equalsWithThreshold(float f1, float f2) { - - return Math.abs(f1 - f2) < THRESHOLD; - } - - private TextPageBlock buildTextBlock(List wordBlockList, int indexOnPage) { - TextPageBlock textBlock = null; - - FloatFrequencyCounter lineHeightFrequencyCounter = new FloatFrequencyCounter(); - FloatFrequencyCounter fontSizeFrequencyCounter = new FloatFrequencyCounter(); - FloatFrequencyCounter spaceFrequencyCounter = new FloatFrequencyCounter(); - StringFrequencyCounter fontFrequencyCounter = new StringFrequencyCounter(); - StringFrequencyCounter styleFrequencyCounter = new StringFrequencyCounter(); - - for (TextPositionSequence wordBlock : wordBlockList) { - - lineHeightFrequencyCounter.add(wordBlock.getTextHeight()); - fontSizeFrequencyCounter.add(wordBlock.getFontSize()); - spaceFrequencyCounter.add(wordBlock.getSpaceWidth()); - fontFrequencyCounter.add(wordBlock.getFont()); - styleFrequencyCounter.add(wordBlock.getFontStyle()); - - if (textBlock == null) { - textBlock = new TextPageBlock(wordBlock.getMinXDirAdj(), - wordBlock.getMaxXDirAdj(), - wordBlock.getMinYDirAdj(), - wordBlock.getMaxYDirAdj(), - wordBlockList, - wordBlock.getRotation()); - } else { - TextPageBlock spatialEntity = textBlock.union(wordBlock); - textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight()); - } - } - - if (textBlock != null) { - textBlock.setMostPopularWordFont(fontFrequencyCounter.getMostPopular()); - textBlock.setMostPopularWordStyle(styleFrequencyCounter.getMostPopular()); - textBlock.setMostPopularWordFontSize(fontSizeFrequencyCounter.getMostPopular()); - textBlock.setMostPopularWordHeight(lineHeightFrequencyCounter.getMostPopular()); - textBlock.setMostPopularWordSpaceWidth(spaceFrequencyCounter.getMostPopular()); - textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest()); - } - - if (textBlock != null && 
textBlock.getSequences() != null && textBlock.getSequences().stream().map(t -> round(t.getMinYDirAdj(), 3)).collect(toSet()).size() == 1) { - textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj)); - } - return textBlock; - } - - - private boolean isSplitByRuling(float minX, - float minY, - float maxX, - float maxY, - TextPositionSequence word, - List horizontalRulingLines, - List verticalRulingLines) { - - return isSplitByRuling(maxX, - minY, - word.getMinXDirAdj(), - word.getMinYDirAdj(), - verticalRulingLines, - word.getDir().getDegrees(), - word.getPageWidth(), - word.getPageHeight()) // - || isSplitByRuling(minX, - minY, - word.getMinXDirAdj(), - word.getMaxYDirAdj(), - horizontalRulingLines, - word.getDir().getDegrees(), - word.getPageWidth(), - word.getPageHeight()) // - || isSplitByRuling(maxX, - minY, - word.getMinXDirAdj(), - word.getMinYDirAdj(), - horizontalRulingLines, - word.getDir().getDegrees(), - word.getPageWidth(), - word.getPageHeight()) // - || isSplitByRuling(minX, - minY, - word.getMinXDirAdj(), - word.getMaxYDirAdj(), - verticalRulingLines, - word.getDir().getDegrees(), - word.getPageWidth(), - word.getPageHeight()); - } - - - private boolean isSplitByRuling(float previousX2, float previousY1, float currentX1, float currentY1, List rulingLines, float dir, float pageWidth, float pageHeight) { - - for (Ruling ruling : rulingLines) { - var line = RulingTextDirAdjustUtil.convertToDirAdj(ruling, dir, pageWidth, pageHeight); - if (line.intersectsLine(previousX2, previousY1, currentX1, currentY1)) { - return true; - } - } - return false; - } - - - private double round(float value, int decimalPoints) { - - var d = Math.pow(10, decimalPoints); - return Math.round(value * d) / d; + return new TextPageBlock(wordBlockList); } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocuMineBlockificationService.java 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocuMineBlockificationService.java index ca72723..9c8def4 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocuMineBlockificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocuMineBlockificationService.java @@ -15,11 +15,10 @@ import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBl import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter; import com.knecon.fforesight.service.layoutparser.processor.model.Orientation; -import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; +import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; -import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil; @Service public class DocuMineBlockificationService { @@ -34,15 +33,16 @@ public class DocuMineBlockificationService { * This method must use text direction adjusted postions (DirAdj). Where {0,0} is on the upper left. Never try to change this! * Rulings (Table lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling. * - * @param textPositions The words of a page. - * @param horizontalRulingLines Horizontal table lines. - * @param verticalRulingLines Vertical table lines. 
+ * @param textPositions The textPositions of a page. + * @param cleanRulings All rulings on a page * @return Page object that contains the Textblock and text statistics. */ - public ClassificationPage blockify(List textPositions, List horizontalRulingLines, List verticalRulingLines) { + public ClassificationPage blockify(List textPositions, CleanRulings cleanRulings) { List chunkWords = new ArrayList<>(); - List chunkBlockList1 = new ArrayList<>(); + List textPageBlocks = new ArrayList<>(); + + CleanRulings usedRulings = cleanRulings.withoutTextRulings(); float minX = 1000; float maxX = 0; @@ -59,23 +59,26 @@ public class DocuMineBlockificationService { boolean splitByX = prev != null && maxX + 50 < word.getMinXDirAdj() && prev.getMinYDirAdj() == word.getMinYDirAdj(); boolean negativeXGap = prev != null && word.getMinXDirAdj() - minX < -5; boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj(); - boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, horizontalRulingLines, verticalRulingLines); + boolean isSplitByRuling = prev != null && usedRulings.lineBetween(prev, word); boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir()); boolean splitByOtherFontAndOtherY = prev != null && Math.abs(prev.getMaxYDirAdj() - word.getMaxYDirAdj()) > word.getTextHeight() * 0.2 && (word.getFontStyle() - .contains("bold") && !prev.getFontStyle().contains("bold") || prev.getFontStyle().contains("bold") && !word.getFontStyle().contains("bold")); + .contains("bold") + && !prev.getFontStyle() + .contains("bold") || prev.getFontStyle().contains("bold") && !word.getFontStyle().contains("bold")); - Matcher matcher = pattern.matcher(chunkWords.stream().collect(Collectors.joining(" ")).toString()); + Matcher matcher = pattern.matcher(chunkWords.stream() + .collect(Collectors.joining(" ")).toString()); boolean startsOnSameX = Math.abs(minX - word.getMinXDirAdj()) < 5 && 
matcher.matches(); if (prev != null && (lineSeparation || startFromTop || splitByDir || isSplitByRuling || splitByOtherFontAndOtherY || negativeXGap || startsOnSameX)) { Orientation prevOrientation = null; - if (!chunkBlockList1.isEmpty()) { - prevOrientation = chunkBlockList1.get(chunkBlockList1.size() - 1).getOrientation(); + if (!textPageBlocks.isEmpty()) { + prevOrientation = textPageBlocks.get(textPageBlocks.size() - 1).getOrientation(); } - TextPageBlock cb1 = buildTextBlock(chunkWords); - chunkBlockList1.add(cb1); + TextPageBlock cb1 = new TextPageBlock(chunkWords); + textPageBlocks.add(cb1); chunkWords = new ArrayList<>(); if (splitByX && !isSplitByRuling) { @@ -86,7 +89,11 @@ public class DocuMineBlockificationService { wasSplitted = false; cb1.setOrientation(Orientation.RIGHT); splitX1 = null; - } else if (prevOrientation != null && prevOrientation.equals(Orientation.RIGHT) && (lineSeparation || !startFromTop || !splitByX || !newLineAfterSplit || !isSplitByRuling)) { + } else if (prevOrientation != null && prevOrientation.equals(Orientation.RIGHT) && (lineSeparation + || !startFromTop + || !splitByX + || !newLineAfterSplit + || !isSplitByRuling)) { cb1.setOrientation(Orientation.LEFT); } @@ -114,128 +121,12 @@ public class DocuMineBlockificationService { } } - TextPageBlock cb1 = buildTextBlock(chunkWords); - if (cb1 != null) { - chunkBlockList1.add(cb1); - } + textPageBlocks.add(new TextPageBlock(chunkWords)); - return new ClassificationPage(chunkBlockList1); + return new ClassificationPage(textPageBlocks); } - private boolean equalsWithThreshold(float f1, float f2) { - - return Math.abs(f1 - f2) < THRESHOLD; - } - - - private TextPageBlock buildTextBlock(List wordBlockList) { - - TextPageBlock textBlock = null; - - FloatFrequencyCounter lineHeightFrequencyCounter = new FloatFrequencyCounter(); - FloatFrequencyCounter fontSizeFrequencyCounter = new FloatFrequencyCounter(); - FloatFrequencyCounter spaceFrequencyCounter = new FloatFrequencyCounter(); - 
StringFrequencyCounter fontFrequencyCounter = new StringFrequencyCounter(); - StringFrequencyCounter styleFrequencyCounter = new StringFrequencyCounter(); - - for (TextPositionSequence wordBlock : wordBlockList) { - - lineHeightFrequencyCounter.add(wordBlock.getTextHeight()); - fontSizeFrequencyCounter.add(wordBlock.getFontSize()); - spaceFrequencyCounter.add(wordBlock.getSpaceWidth()); - fontFrequencyCounter.add(wordBlock.getFont()); - styleFrequencyCounter.add(wordBlock.getFontStyle()); - - if (textBlock == null) { - textBlock = new TextPageBlock(wordBlock.getMinXDirAdj(), - wordBlock.getMaxXDirAdj(), - wordBlock.getMinYDirAdj(), - wordBlock.getMaxYDirAdj(), - wordBlockList, - wordBlock.getRotation()); - } else { - TextPageBlock spatialEntity = textBlock.union(wordBlock); - textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight()); - } - } - - if (textBlock != null) { - textBlock.setMostPopularWordFont(fontFrequencyCounter.getMostPopular()); - textBlock.setMostPopularWordStyle(styleFrequencyCounter.getMostPopular()); - textBlock.setMostPopularWordFontSize(fontSizeFrequencyCounter.getMostPopular()); - textBlock.setMostPopularWordHeight(lineHeightFrequencyCounter.getMostPopular()); - textBlock.setMostPopularWordSpaceWidth(spaceFrequencyCounter.getMostPopular()); - textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest()); - } - - if (textBlock != null && textBlock.getSequences() != null && textBlock.getSequences().stream().map(t -> round(t.getMinYDirAdj(), 3)).collect(toSet()).size() == 1) { - textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj)); - } - return textBlock; - } - - - private boolean isSplitByRuling(float minX, - float minY, - float maxX, - float maxY, - TextPositionSequence word, - List horizontalRulingLines, - List verticalRulingLines) { - - return isSplitByRuling(maxX, - minY, - word.getMinXDirAdj(), - word.getMinYDirAdj(), - 
verticalRulingLines, - word.getDir().getDegrees(), - word.getPageWidth(), - word.getPageHeight()) // - || isSplitByRuling(minX, - minY, - word.getMinXDirAdj(), - word.getMaxYDirAdj(), - horizontalRulingLines, - word.getDir().getDegrees(), - word.getPageWidth(), - word.getPageHeight()) // - || isSplitByRuling(maxX, - minY, - word.getMinXDirAdj(), - word.getMinYDirAdj(), - horizontalRulingLines, - word.getDir().getDegrees(), - word.getPageWidth(), - word.getPageHeight()) // - || isSplitByRuling(minX, - minY, - word.getMinXDirAdj(), - word.getMaxYDirAdj(), - verticalRulingLines, - word.getDir().getDegrees(), - word.getPageWidth(), - word.getPageHeight()); // - } - - - private boolean isSplitByRuling(float previousX2, float previousY1, float currentX1, float currentY1, List rulingLines, float dir, float pageWidth, float pageHeight) { - - for (Ruling ruling : rulingLines) { - var line = RulingTextDirAdjustUtil.convertToDirAdj(ruling, dir, pageWidth, pageHeight); - if (line.intersectsLine(previousX2, previousY1, currentX1, currentY1)) { - return true; - } - } - return false; - } - - - private double round(float value, int decimalPoints) { - - var d = Math.pow(10, decimalPoints); - return Math.round(value * d) / d; - } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/RedactManagerBlockificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/RedactManagerBlockificationService.java index 9addf27..a2c7085 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/RedactManagerBlockificationService.java +++ 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/RedactManagerBlockificationService.java @@ -13,14 +13,11 @@ import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBl import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter; import com.knecon.fforesight.service.layoutparser.processor.model.Orientation; -import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; -import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; -import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; -import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil; +import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations; @SuppressWarnings("all") @Service @@ -34,12 +31,13 @@ public class RedactManagerBlockificationService { * This method must use text direction adjusted postions (DirAdj). Where {0,0} is on the upper left. Never try to change this! * Rulings (TablePageBlock lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling. * - * @param textPositions The words of a page. + * @param textPositions The words of a page. + * @param visualizations * @return Page object that contains the Textblock and text statistics. 
*/ - public ClassificationPage blockify(List textPositions, List cells) { + public ClassificationPage blockify(List textPositions, CleanRulings cleanRulings, LayoutparsingVisualizations visualizations) { - CleanRulings usedRulings = RectangleTransformations.extractRulings(cells); + CleanRulings usedRulings = cleanRulings.withoutTextRulings(); int indexOnPage = 0; List chunkWords = new ArrayList<>(); @@ -57,7 +55,7 @@ public class RedactManagerBlockificationService { boolean splitByX = prev != null && maxX + 50 < word.getMinXDirAdj() && prev.getMinYDirAdj() == word.getMinYDirAdj(); boolean xIsBeforeFirstX = prev != null && word.getMinXDirAdj() < minX; boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj(); - boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, usedRulings.getHorizontal(), usedRulings.getVertical()); + boolean isSplitByRuling = prev != null && usedRulings.lineBetween(prev, word); boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir()); if (prev != null && (lineSeparation || startFromTop || splitByX || splitByDir || isSplitByRuling)) { @@ -67,7 +65,7 @@ public class RedactManagerBlockificationService { prevOrientation = chunkBlockList.get(chunkBlockList.size() - 1).getOrientation(); } - TextPageBlock cb1 = buildTextBlock(chunkWords, indexOnPage); + TextPageBlock cb1 = new TextPageBlock(chunkWords); indexOnPage++; chunkBlockList.add(cb1); @@ -81,7 +79,11 @@ public class RedactManagerBlockificationService { wasSplitted = false; cb1.setOrientation(Orientation.RIGHT); splitX1 = null; - } else if (prevOrientation != null && prevOrientation.equals(Orientation.RIGHT) && (lineSeparation || !startFromTop || !splitByX || !newLineAfterSplit || !isSplitByRuling)) { + } else if (prevOrientation != null && prevOrientation.equals(Orientation.RIGHT) && (lineSeparation + || !startFromTop + || !splitByX + || !newLineAfterSplit + || !isSplitByRuling)) 
{ cb1.setOrientation(Orientation.LEFT); } @@ -109,8 +111,8 @@ public class RedactManagerBlockificationService { } } - TextPageBlock cb1 = buildTextBlock(chunkWords, indexOnPage); - if (cb1 != null) { + if (!chunkWords.isEmpty()) { + TextPageBlock cb1 = new TextPageBlock(chunkWords); chunkBlockList.add(cb1); } @@ -150,8 +152,11 @@ public class RedactManagerBlockificationService { TextPageBlock block = (TextPageBlock) itty.next(); if (previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation().equals(Orientation.LEFT) && equalsWithThreshold(block.getMaxY(), - previous.getMaxY()) || previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation() - .equals(Orientation.RIGHT) && equalsWithThreshold(block.getMaxY(), previous.getMaxY())) { + previous.getMaxY()) + || previous != null + && previous.getOrientation().equals(Orientation.LEFT) + && block.getOrientation().equals(Orientation.RIGHT) + && equalsWithThreshold(block.getMaxY(), previous.getMaxY())) { previous.add(block); itty.remove(); continue; @@ -159,123 +164,19 @@ public class RedactManagerBlockificationService { previous = block; } + if (!textPositions.isEmpty()) { + visualizations.addTextBlockVisualizations(chunkBlockList.stream() + .map(tb -> (TextPageBlock) tb) + .toList(), textPositions.get(0).getPage()); + } return new ClassificationPage(chunkBlockList); } - private boolean equalsWithThreshold(float f1, float f2) { + private boolean equalsWithThreshold(double f1, double f2) { return Math.abs(f1 - f2) < THRESHOLD; } - - private TextPageBlock buildTextBlock(List wordBlockList, int indexOnPage) { - - TextPageBlock textBlock = null; - - FloatFrequencyCounter lineHeightFrequencyCounter = new FloatFrequencyCounter(); - FloatFrequencyCounter fontSizeFrequencyCounter = new FloatFrequencyCounter(); - FloatFrequencyCounter spaceFrequencyCounter = new FloatFrequencyCounter(); - StringFrequencyCounter fontFrequencyCounter = new 
StringFrequencyCounter(); - StringFrequencyCounter styleFrequencyCounter = new StringFrequencyCounter(); - - for (TextPositionSequence wordBlock : wordBlockList) { - - lineHeightFrequencyCounter.add(wordBlock.getTextHeight()); - fontSizeFrequencyCounter.add(wordBlock.getFontSize()); - spaceFrequencyCounter.add(wordBlock.getSpaceWidth()); - fontFrequencyCounter.add(wordBlock.getFont()); - styleFrequencyCounter.add(wordBlock.getFontStyle()); - - if (textBlock == null) { - textBlock = new TextPageBlock(wordBlock.getMinXDirAdj(), - wordBlock.getMaxXDirAdj(), - wordBlock.getMinYDirAdj(), - wordBlock.getMaxYDirAdj(), - wordBlockList, - wordBlock.getRotation()); - } else { - TextPageBlock spatialEntity = textBlock.union(wordBlock); - textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight()); - } - } - - if (textBlock != null) { - textBlock.setMostPopularWordFont(fontFrequencyCounter.getMostPopular()); - textBlock.setMostPopularWordStyle(styleFrequencyCounter.getMostPopular()); - textBlock.setMostPopularWordFontSize(fontSizeFrequencyCounter.getMostPopular()); - textBlock.setMostPopularWordHeight(lineHeightFrequencyCounter.getMostPopular()); - textBlock.setMostPopularWordSpaceWidth(spaceFrequencyCounter.getMostPopular()); - textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest()); - } - - if (textBlock != null && textBlock.getSequences() != null && textBlock.getSequences().stream().map(t -> round(t.getMinYDirAdj(), 3)).collect(toSet()).size() == 1) { - textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj)); - } - return textBlock; - } - - - private boolean isSplitByRuling(float minX, - float minY, - float maxX, - float maxY, - TextPositionSequence word, - List horizontalRulingLines, - List verticalRulingLines) { - - return isSplitByRuling(maxX, - minY, - word.getMinXDirAdj(), - word.getMinYDirAdj(), - verticalRulingLines, - word.getDir().getDegrees(), - 
word.getPageWidth(), - word.getPageHeight()) // - || isSplitByRuling(minX, - minY, - word.getMinXDirAdj(), - word.getMaxYDirAdj(), - horizontalRulingLines, - word.getDir().getDegrees(), - word.getPageWidth(), - word.getPageHeight()) // - || isSplitByRuling(maxX, - minY, - word.getMinXDirAdj(), - word.getMinYDirAdj(), - horizontalRulingLines, - word.getDir().getDegrees(), - word.getPageWidth(), - word.getPageHeight()) // - || isSplitByRuling(minX, - minY, - word.getMinXDirAdj(), - word.getMaxYDirAdj(), - verticalRulingLines, - word.getDir().getDegrees(), - word.getPageWidth(), - word.getPageHeight()); - } - - - private boolean isSplitByRuling(float previousX2, float previousY1, float currentX1, float currentY1, List rulingLines, float dir, float pageWidth, float pageHeight) { - - for (Ruling ruling : rulingLines) { - var line = RulingTextDirAdjustUtil.convertToDirAdj(ruling, dir, pageWidth, pageHeight); - if (line.intersectsLine(previousX2, previousY1, currentX1, currentY1)) { - return true; - } - } - return false; - } - - - private double round(float value, int decimalPoints) { - - var d = Math.pow(10, decimalPoints); - return Math.round(value * d) / d; - } - } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java index d622fc8..d0ee204 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java @@ -5,7 +5,6 @@ import java.util.Locale; import 
java.util.regex.Matcher; import java.util.regex.Pattern; -import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils; import org.springframework.stereotype.Service; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; @@ -13,6 +12,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils; import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils; import lombok.RequiredArgsConstructor; @@ -49,7 +49,6 @@ public class DocuMineClassificationService { } } - private void classifyBlock(TextPageBlock textBlock, ClassificationPage page, ClassificationDocument document, List headlineFontSizes) { log.debug("headlineFontSizes: {}", headlineFontSizes); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java index 1481776..36ee3eb 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java @@ -7,7 +7,6 @@ import static java.util.stream.Collectors.toList; import java.awt.geom.Rectangle2D; import java.util.ArrayList; import java.util.HashMap; -import java.util.HashSet; import 
java.util.LinkedList; import java.util.List; import java.util.Map; @@ -15,7 +14,6 @@ import java.util.NoSuchElementException; import java.util.Set; import java.util.stream.Collectors; -import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument; @@ -52,6 +50,9 @@ public class DocumentGraphFactory { public Document buildDocumentGraph(LayoutParsingType layoutParsingType, ClassificationDocument document) { Document documentGraph = new Document(); + + documentGraph.setVisualizations(document.getVisualizations()); + Context context = new Context(documentGraph); document.getPages() @@ -79,20 +80,21 @@ public class DocumentGraphFactory { } - public void addParagraphOrHeadline(GenericSemanticNode parentNode, TextPageBlock originalTextBlock, Context context, List textBlocksToMerge) { + public void addParagraphOrHeadline(GenericSemanticNode parentNode, + TextPageBlock originalTextBlock, + Context context, + List textBlocksToMerge, + LayoutParsingType layoutParsingType) { Page page = context.getPage(originalTextBlock.getPage()); GenericSemanticNode node; if (originalTextBlock.isHeadline()) { - node = Headline.builder().documentTree(context.getDocumentTree()) - .build(); - } else if (originalTextBlock.isToDuplicate()) { - node = DuplicatedParagraph.builder().documentTree(context.getDocumentTree()) - .build(); + node = Headline.builder().documentTree(context.getDocumentTree()).build(); + } else if (originalTextBlock.isToDuplicate() && layoutParsingType.equals(LayoutParsingType.REDACT_MANAGER)) { + node = DuplicatedParagraph.builder().documentTree(context.getDocumentTree()).build(); } else { - node = Paragraph.builder().documentTree(context.getDocumentTree()) - .build(); + node = 
Paragraph.builder().documentTree(context.getDocumentTree()).build(); } page.getMainBody().add(node); @@ -178,8 +180,7 @@ public class DocumentGraphFactory { private void addFooter(List textBlocks, Context context) { Page page = context.getPage(textBlocks.get(0).getPage()); - Footer footer = Footer.builder().documentTree(context.getDocumentTree()) - .build(); + Footer footer = Footer.builder().documentTree(context.getDocumentTree()).build(); AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeTextPositionSequence(textBlocks), footer, context, @@ -194,8 +195,7 @@ public class DocumentGraphFactory { public void addHeader(List textBlocks, Context context) { Page page = context.getPage(textBlocks.get(0).getPage()); - Header header = Header.builder().documentTree(context.getDocumentTree()) - .build(); + Header header = Header.builder().documentTree(context.getDocumentTree()).build(); AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeTextPositionSequence(textBlocks), header, 0, page); List tocId = context.getDocumentTree().createNewMainEntryAndReturnId(header); header.setTreeId(tocId); @@ -207,8 +207,7 @@ public class DocumentGraphFactory { private void addEmptyFooter(int pageIndex, Context context) { Page page = context.getPage(pageIndex); - Footer footer = Footer.builder().documentTree(context.getDocumentTree()) - .build(); + Footer footer = Footer.builder().documentTree(context.getDocumentTree()).build(); AtomicTextBlock textBlock = context.textBlockFactory.emptyTextBlock(footer, context, page); List tocId = context.getDocumentTree().createNewMainEntryAndReturnId(footer); footer.setTreeId(tocId); @@ -220,8 +219,7 @@ public class DocumentGraphFactory { private void addEmptyHeader(int pageIndex, Context context) { Page page = context.getPage(pageIndex); - Header header = Header.builder().documentTree(context.getDocumentTree()) - .build(); + Header header = 
Header.builder().documentTree(context.getDocumentTree()).build(); AtomicTextBlock textBlock = context.textBlockFactory.emptyTextBlock(header, 0, page); List tocId = context.getDocumentTree().createNewMainEntryAndReturnId(header); header.setTreeId(tocId); @@ -275,8 +273,7 @@ public class DocumentGraphFactory { return pages.keySet() .stream() .filter(page -> page.getNumber() == pageIndex) - .findFirst() - .orElseThrow(() -> new NoSuchElementException(format("ClassificationPage with number %d not found", pageIndex))); + .findFirst().orElseThrow(() -> new NoSuchElementException(format("ClassificationPage with number %d not found", pageIndex))); } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SearchTextWithTextPositionFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SearchTextWithTextPositionFactory.java index c10cbee..0d9fd8f 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SearchTextWithTextPositionFactory.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SearchTextWithTextPositionFactory.java @@ -2,6 +2,7 @@ package com.knecon.fforesight.service.layoutparser.processor.services.factory; import java.awt.geom.AffineTransform; import java.awt.geom.Rectangle2D; +import java.util.Collection; import java.util.Collections; import java.util.LinkedList; import java.util.List; @@ -29,19 +30,22 @@ public class SearchTextWithTextPositionFactory { public SearchTextWithTextPositionDto buildSearchTextToTextPositionDto(List sequences) { - if (sequences.isEmpty() || sequences.stream().allMatch(sequence -> sequence.getTextPositions().isEmpty())) { + if (sequences.isEmpty() || 
sequences.stream() + .allMatch(sequence -> sequence.getTextPositions().isEmpty())) { return SearchTextWithTextPositionDto.empty(); } Context context = new Context(); - RedTextPosition currentTextPosition = sequences.get(0).getTextPositions().get(0); - RedTextPosition previousTextPosition = RedTextPosition.builder().unicode(" ").position(currentTextPosition.getPosition()).build(); + RedTextPosition currentTextPosition = sequences.get(0).getTextPositions() + .get(0); + RedTextPosition previousTextPosition = RedTextPosition.builder().unicode(" ").bBoxDirAdj(currentTextPosition.getBBoxDirAdj()).build(); for (TextPositionSequence word : sequences) { for (int i = 0; i < word.getTextPositions().size(); ++i) { - currentTextPosition = word.getTextPositions().get(i); + currentTextPosition = word.getTextPositions() + .get(i); if (isLineBreak(currentTextPosition, previousTextPosition)) { removeHyphenLinebreaks(context); context.lineBreaksStringIdx.add(context.stringIdx); @@ -57,18 +61,21 @@ public class SearchTextWithTextPositionFactory { ++context.positionIdx; } - previousTextPosition = RedTextPosition.builder().unicode(" ").position(previousTextPosition.getPosition()).build(); + previousTextPosition = RedTextPosition.builder().unicode(" ").bBoxDirAdj(previousTextPosition.getBBoxDirAdj()).build(); context.stringBuilder.append(" "); context.stringIdxToPositionIdx.add(context.positionIdx); ++context.stringIdx; } - assert context.stringBuilder.length() == context.stringIdxToPositionIdx.size(); List positions = sequences.stream() - .flatMap(sequence -> sequence.getTextPositions().stream().map(textPosition -> mapRedTextPositionToInitialUserSpace(textPosition, sequence))) + .map(TextPositionSequence::getTextPositions) + .flatMap(Collection::stream) + .map(RedTextPosition::getBBoxInitialUserSpace) .toList(); + assert context.stringBuilder.length() == context.stringIdxToPositionIdx.size(); + return SearchTextWithTextPositionDto.builder() .searchText(context.stringBuilder.toString()) 
.lineBreaks(context.lineBreaksStringIdx) @@ -153,7 +160,7 @@ public class SearchTextWithTextPositionFactory { return false; } - float deltaY = Math.abs(currentPosition.getYDirAdj() - previousPosition.getYDirAdj()); + double deltaY = Math.abs(currentPosition.getYDirAdj() - previousPosition.getYDirAdj()); return deltaY >= currentPosition.getHeightDir(); } @@ -167,16 +174,16 @@ public class SearchTextWithTextPositionFactory { private boolean isHyphen(String unicodeCharacter) { return Objects.equals(unicodeCharacter, "-") || // - Objects.equals(unicodeCharacter, "~") || // - Objects.equals(unicodeCharacter, "‐") || // - Objects.equals(unicodeCharacter, "‒") || // - Objects.equals(unicodeCharacter, "⁻") || // - Objects.equals(unicodeCharacter, "−") || // - Objects.equals(unicodeCharacter, "﹣") || // - Objects.equals(unicodeCharacter, "゠") || // - Objects.equals(unicodeCharacter, "⁓") || // - Objects.equals(unicodeCharacter, "‑") || // - Objects.equals(unicodeCharacter, "\u00AD"); + Objects.equals(unicodeCharacter, "~") || // + Objects.equals(unicodeCharacter, "‐") || // + Objects.equals(unicodeCharacter, "‒") || // + Objects.equals(unicodeCharacter, "⁻") || // + Objects.equals(unicodeCharacter, "−") || // + Objects.equals(unicodeCharacter, "﹣") || // + Objects.equals(unicodeCharacter, "゠") || // + Objects.equals(unicodeCharacter, "⁓") || // + Objects.equals(unicodeCharacter, "‑") || // + Objects.equals(unicodeCharacter, "\u00AD"); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SectionNodeFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SectionNodeFactory.java index f4b26eb..cca8558 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SectionNodeFactory.java +++ 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SectionNodeFactory.java @@ -49,8 +49,7 @@ public class SectionNodeFactory { Map> blocksPerPage = pageBlocks.stream() .collect(groupingBy(AbstractPageBlock::getPage)); - Section section = Section.builder().documentTree(context.getDocumentTree()) - .build(); + Section section = Section.builder().documentTree(context.getDocumentTree()).build(); context.getSections().add(section); blocksPerPage.keySet() @@ -121,12 +120,12 @@ public class SectionNodeFactory { case REDACT_MANAGER, CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> { alreadyMerged.add(abstractPageBlock); remainingBlocks.remove(abstractPageBlock); - DocumentGraphFactory.addParagraphOrHeadline(section, (TextPageBlock) abstractPageBlock, context, new ArrayList<>()); + DocumentGraphFactory.addParagraphOrHeadline(section, (TextPageBlock) abstractPageBlock, context, new ArrayList<>(), layoutParsingType); } default -> { List textBlocks = findTextBlocksWithSameClassificationAndAlignsY((TextPageBlock) abstractPageBlock, remainingBlocks); alreadyMerged.addAll(textBlocks); - DocumentGraphFactory.addParagraphOrHeadline(section, (TextPageBlock) abstractPageBlock, context, textBlocks); + DocumentGraphFactory.addParagraphOrHeadline(section, (TextPageBlock) abstractPageBlock, context, textBlocks, layoutParsingType); } } } else if (abstractPageBlock instanceof TablePageBlock tablePageBlock) { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/TableNodeFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/TableNodeFactory.java index f71669c..4b7303f 100644 --- 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/TableNodeFactory.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/TableNodeFactory.java @@ -45,7 +45,10 @@ public class TableNodeFactory { .flatMap(Collection::stream) .toList(); - Table table = Table.builder().documentTree(context.getDocumentTree()).numberOfCols(mergedRows.isEmpty() ? 0 : mergedRows.get(0).size()).numberOfRows(mergedRows.size()) + Table table = Table.builder() + .documentTree(context.getDocumentTree()) + .numberOfCols(mergedRows.isEmpty() ? 0 : mergedRows.get(0).size()) + .numberOfRows(mergedRows.size()) .build(); pages.forEach(page -> addTableToPage(page, parentNode, table)); @@ -128,7 +131,12 @@ public class TableNodeFactory { Page page = context.getPage(cell.getPageNumber()); - TableCell tableCell = TableCell.builder().documentTree(context.getDocumentTree()).row(rowIndex).col(colIndex).header(cell.isHeaderCell()).bBox(cell.getBounds2D()) + TableCell tableCell = TableCell.builder() + .documentTree(context.getDocumentTree()) + .row(rowIndex) + .col(colIndex) + .header(cell.isHeaderCell()) + .bBox(cell.getBBoxInitialUserSpace()) .build(); page.getMainBody().add(tableCell); @@ -159,7 +167,7 @@ public class TableNodeFactory { tableCell.setLeafTextBlock(textBlock); } else { cell.getTextBlocks() - .forEach(tb -> DocumentGraphFactory.addParagraphOrHeadline(tableCell, tb, context, emptyList())); + .forEach(tb -> DocumentGraphFactory.addParagraphOrHeadline(tableCell, tb, context, emptyList(), layoutParsingType)); } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/graphics/FindGraphicsRaster.java 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/graphics/FindGraphicsRaster.java index 1a42abb..c2a2426 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/graphics/FindGraphicsRaster.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/graphics/FindGraphicsRaster.java @@ -13,6 +13,9 @@ import org.apache.pdfbox.rendering.ImageType; import org.apache.pdfbox.rendering.PDFRenderer; import org.springframework.stereotype.Service; +import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms; +import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation; + import lombok.SneakyThrows; @Service @@ -30,7 +33,7 @@ public class FindGraphicsRaster { var renderer = new PDFRenderer(doc); var img = renderer.renderImageWithDPI(pageInformation.number() - 1, DPI, ImageType.GRAY); - var imageCtm = getImageCTM(pageInformation, img.getWidth()); + var imageCtm = CoordinateTransforms.calculateImageCoordsToInitialUserSpaceCoords(pageInformation, CoordinateTransforms.calculateScalingFactor(pageInformation, img.getWidth())); return findCCBoundingBoxes(img, remove, THRESHOLD, DPI / 72, imageCtm); } @@ -131,42 +134,4 @@ public class FindGraphicsRaster { } - public AffineTransform getImageCTM(PageInformation pageInformation, int imageWidth) { - - double scalingFactor = calculateScalingFactor(pageInformation, imageWidth); - AffineTransform imageToCropBoxScaling = new AffineTransform(scalingFactor, 0, 0, scalingFactor, -pageInformation.minX(), -pageInformation.minY()); - - AffineTransform mirrorMatrix = new AffineTransform(1, 0, 0, -1, 0, pageInformation.height()); - - AffineTransform rotationMatrix = switch (pageInformation.rotationDegrees()) { - case 90 -> new AffineTransform(0, 1, -1, 0, 
pageInformation.height(), 0); - case 180 -> new AffineTransform(-1, 0, 0, -1, pageInformation.width(), pageInformation.height()); - case 270 -> new AffineTransform(0, -1, 1, 0, pageInformation.width() - pageInformation.height(), pageInformation.height()); // results from 90 + 180 rotations - default -> new AffineTransform(); - }; - - // matrix multiplication is performed from right to left, so the order is reversed. - // scaling -> mirror -> rotation - AffineTransform resultMatrix = new AffineTransform(); - - resultMatrix.concatenate(rotationMatrix); - resultMatrix.concatenate(mirrorMatrix); - resultMatrix.concatenate(imageToCropBoxScaling); - return resultMatrix; - } - - - private double calculateScalingFactor(PageInformation pageInformation, int imageWidth) { - - // PDFBox always returns page height and width based on rotation - double pageWidth; - if (pageInformation.rotationDegrees() == 90 || pageInformation.rotationDegrees() == 270) { - pageWidth = pageInformation.height(); - } else { - pageWidth = pageInformation.width(); - } - - return pageWidth / imageWidth; - } - } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/graphics/GraphicExtractorService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/graphics/GraphicExtractorService.java index 37a7122..5e1cd2e 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/graphics/GraphicExtractorService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/graphics/GraphicExtractorService.java @@ -1,7 +1,6 @@ package com.knecon.fforesight.service.layoutparser.processor.services.graphics; import java.awt.geom.Rectangle2D; -import java.util.ArrayList; import java.util.List; 
import java.util.stream.Collectors; @@ -9,10 +8,11 @@ import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; import org.springframework.stereotype.Service; -import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; +import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.services.factory.SearchTextWithTextPositionFactory; +import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation; import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; import lombok.RequiredArgsConstructor; @@ -22,6 +22,9 @@ import lombok.SneakyThrows; @RequiredArgsConstructor public class GraphicExtractorService { + private static final int MIN_GRAPHICS_SIDE_LENGTH = 30; + private static final int MIN_GRAPHICS_AREA = 500; + private final GraphicsClusteringService graphicsClusteringService; private final FindGraphicsRaster findGraphicsRaster; @@ -32,33 +35,32 @@ public class GraphicExtractorService { int pageNumber, CleanRulings cleanRulings, List textPositionSequences, - List emptyTableCells, boolean graphicsRaster) { - var characterBBoxes = getCharacterBBoxes(textPositionSequences); - var tableLineBBoxes = getLineBBoxesFromTableCells(emptyTableCells); - var underLineBBoxes = getUnderlineBBoxes(cleanRulings, characterBBoxes); - var strikeThroughBBoxes = getStrikeThroughBBoxes(cleanRulings, characterBBoxes); + List characterBBoxes = getCharacterBBoxes(textPositionSequences); + List classifiedRulingsBoxes = getLineBBoxesOfAllClassifiedRulings(cleanRulings); GraphicBBDetector graphicBBDetector = new GraphicBBDetector(pdPage, true); - var graphicBBoxes = graphicBBDetector.findGraphicBB(); + List graphicBBoxes = 
graphicBBDetector.findGraphicBB(); if (graphicsRaster) { // This should only be used if ocr was performed, it is currently in an early stage and needs to be improved. graphicBBoxes.addAll(findGraphicsRaster.findCCBoundingBoxes(pdDocument, - characterBBoxes.stream().map(box -> new Rectangle2D.Double(box.x1 - 2, box.y1 - 2, box.width() + 4, box.height() + 4)).collect(Collectors.toList()), - PageInformation.fromPDPage(pageNumber, pdPage))); + characterBBoxes.stream() + .map(box -> new Rectangle2D.Double(box.x1 - 2, box.y1 - 2, box.width() + 4, box.height() + 4)) + .collect(Collectors.toList()), + PageInformation.fromPDPage(pageNumber, pdPage))); } - var filteredGraphicBBoxes = graphicBBoxes.stream() - .filter(box -> !box.intersectsAny(tableLineBBoxes, 4)) - .filter(box -> !box.intersectsAny(underLineBBoxes, 4)) - .filter(box -> !box.intersectsAny(strikeThroughBBoxes, 4)) + List filteredGraphicBBoxes = graphicBBoxes.stream() + .filter(box -> !box.intersectsAny(classifiedRulingsBoxes, 4)) .collect(Collectors.toList()); - var clusters = graphicsClusteringService.getClusters(filteredGraphicBBoxes, 14); + List clusters = graphicsClusteringService.getClusters(filteredGraphicBBoxes, 14); - return clusters.stream().filter(box -> box.area() > 500 && box.height() > 50 && box.width() > 50).toList(); + return clusters.stream() + .filter(box -> box.area() > MIN_GRAPHICS_AREA && box.height() > MIN_GRAPHICS_SIDE_LENGTH && box.width() > MIN_GRAPHICS_SIDE_LENGTH) + .toList(); } @@ -74,34 +76,13 @@ public class GraphicExtractorService { } - private List getLineBBoxesFromTableCells(List emptyTableCells) { + private List getLineBBoxesOfAllClassifiedRulings(CleanRulings cleanRulings) { - List expandedTableLines = new ArrayList<>(); - - emptyTableCells.forEach(cell -> { - expandedTableLines.add(new Box(new Rectangle2D.Double(cell.x, cell.y - 1, cell.width, 2))); - expandedTableLines.add(new Box(new Rectangle2D.Double(cell.x, cell.y + cell.height - 1, cell.width, 2))); - 
expandedTableLines.add(new Box(new Rectangle2D.Double(cell.x - 1, cell.y, 2, cell.height))); - expandedTableLines.add(new Box(new Rectangle2D.Double(cell.x + cell.width - 1, cell.y, 2, cell.height))); - }); - - return expandedTableLines; - } - - - private List getUnderlineBBoxes(CleanRulings cleanRulings, List characterBBoxes) { - - return cleanRulings.getHorizontal() + return cleanRulings.buildAll() .stream() + .filter(ruling -> !ruling.getClassification().equals(Ruling.Classification.OTHER)) .map(h -> new Box(h.x1, h.y1, h.x2, h.y2)) - .filter(box -> box.intersectsAnyAndOver(characterBBoxes, 6)) .collect(Collectors.toList()); } - - private List getStrikeThroughBBoxes(CleanRulings cleanRulings, List characterBBoxes) { - - return cleanRulings.getHorizontal().stream().map(h -> new Box(h.x1, h.y1, h.x2, h.y2)).filter(box -> box.intersectsCenter(characterBBoxes, 2)).collect(Collectors.toList()); - } - } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/LegacyPDFStreamEngine.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/LegacyPDFStreamEngine.java index ff2e665..326746d 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/LegacyPDFStreamEngine.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/LegacyPDFStreamEngine.java @@ -82,7 +82,6 @@ public class LegacyPDFStreamEngine extends PDFStreamEngine { private int pageRotation; private PDRectangle pageSize; - private Matrix translateMatrix; private final GlyphList glyphList; private final Map fontHeightMap = new WeakHashMap(); @@ -134,12 +133,6 @@ public class LegacyPDFStreamEngine extends PDFStreamEngine { this.pageRotation = 
page.getRotation(); this.pageSize = page.getCropBox(); - if (pageSize.getLowerLeftX() == 0 && pageSize.getLowerLeftY() == 0) { - translateMatrix = null; - } else { - // translation matrix for cropbox - translateMatrix = Matrix.getTranslateInstance(-pageSize.getLowerLeftX(), -pageSize.getLowerLeftY()); - } super.processPage(page); } @@ -265,62 +258,52 @@ public class LegacyPDFStreamEngine extends PDFStreamEngine { } } - // adjust for cropbox if needed - Matrix translatedTextRenderingMatrix; - if (translateMatrix == null) { - translatedTextRenderingMatrix = textRenderingMatrix; - } else { - translatedTextRenderingMatrix = Matrix.concatenate(translateMatrix, textRenderingMatrix); - nextX -= pageSize.getLowerLeftX(); - nextY -= pageSize.getLowerLeftY(); - } - // This is a hack for unicode letter with 2 chars e.g. RA see unicodeProblem.pdf if (unicodeMapping.length() == 2) { processTextPosition(new TextPosition(pageRotation, - pageSize.getWidth(), - pageSize.getHeight(), - translatedTextRenderingMatrix, - nextX, - nextY, - Math.abs(dyDisplay), - dxDisplay, - Math.abs(spaceWidthDisplay), - Character.toString(unicodeMapping.charAt(0)), - new int[]{code}, - font, - fontSize, - (int) (fontSize * textMatrix.getScalingFactorX()))); + pageSize.getWidth(), + pageSize.getHeight(), + textRenderingMatrix, + nextX, + nextY, + Math.abs(dyDisplay), + dxDisplay, + Math.abs(spaceWidthDisplay), + Character.toString(unicodeMapping.charAt(0)), + new int[]{code}, + font, + fontSize, + (int) (fontSize * textMatrix.getScalingFactorX()))); processTextPosition(new TextPosition(pageRotation, - pageSize.getWidth(), - pageSize.getHeight(), - translatedTextRenderingMatrix, - nextX, - nextY, - Math.abs(dyDisplay), - dxDisplay, - Math.abs(spaceWidthDisplay), - Character.toString(unicodeMapping.charAt(1)), - new int[]{code}, - font, - fontSize, - (int) (fontSize * textMatrix.getScalingFactorX()))); + pageSize.getWidth(), + pageSize.getHeight(), + textRenderingMatrix, + nextX, + nextY, + 
Math.abs(dyDisplay), + dxDisplay, + Math.abs(spaceWidthDisplay), + Character.toString(unicodeMapping.charAt(1)), + new int[]{code}, + font, + fontSize, + (int) (fontSize * textMatrix.getScalingFactorX()))); } else { processTextPosition(new TextPosition(pageRotation, - pageSize.getWidth(), - pageSize.getHeight(), - translatedTextRenderingMatrix, - nextX, - nextY, - Math.abs(dyDisplay), - dxDisplay, - Math.abs(spaceWidthDisplay), - unicodeMapping, - new int[]{code}, - font, - fontSize, - (int) (fontSize * textMatrix.getScalingFactorX()))); + pageSize.getWidth(), + pageSize.getHeight(), + textRenderingMatrix, + nextX, + nextY, + Math.abs(dyDisplay), + dxDisplay, + Math.abs(spaceWidthDisplay), + unicodeMapping, + new int[]{code}, + font, + fontSize, + (int) (fontSize * textMatrix.getScalingFactorX()))); } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFTextStripper.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFTextStripper.java index 1ca5b43..83fafea 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFTextStripper.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFTextStripper.java @@ -1007,7 +1007,7 @@ public class PDFTextStripper extends LegacyPDFStreamEngine { /** * Set the desired word separator for output text. The PDFBox text extraction algorithm will output a space - * character if there is enough space between two words. By default a space character is used. If you need and + * character if there is enough space between two textPositions. By default a space character is used. 
If you need and * accurate count of characters that are found in a PDF document then you might want to set the word separator to * the empty string. * @@ -1703,7 +1703,7 @@ public class PDFTextStripper extends LegacyPDFStreamEngine { /** * Write a list of string containing a whole line of a document. * - * @param line a list with the words of the given line + * @param line a list with the textPositions of the given line * @throws IOException if something went wrong */ private void writeLine(List line, boolean isParagraphEnd) throws IOException { @@ -1744,9 +1744,9 @@ public class PDFTextStripper extends LegacyPDFStreamEngine { /** - * Handles the LTR and RTL direction of the given words. The whole implementation stands and falls with the given - * word. If the word is a full line, the results will be the best. If the word contains of single words or - * characters, the order of the characters in a word or words in a line may wrong, due to RTL and LTR marks and + * Handles the LTR and RTL direction of the given textPositions. The whole implementation stands and falls with the given + * word. If the word is a full line, the results will be the best. If the word contains of single textPositions or + * characters, the order of the characters in a word or textPositions in a line may wrong, due to RTL and LTR marks and * characters! *

* Based on http://www.nesterovsky-bros.com/weblog/2013/07/28/VisualToLogicalConversionInJava.aspx diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/LayoutGridService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/LayoutGridService.java index 9159742..4cdc5bc 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/LayoutGridService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/LayoutGridService.java @@ -65,12 +65,20 @@ public class LayoutGridService { @SneakyThrows @Observed(name = "ViewerDocumentService", contextualName = "create-viewer-document") - public void addLayoutGrid(File originFile, Document document, File destinationFile, boolean layerVisibilityDefaultValue) { + public void addLayoutGrid(File originFile, Document document, File destinationFile, boolean layerVisibilityDefaultValue, boolean writeVisualLayoutParsingGrid) { + List allVisualizations; Visualizations layoutGrid = this.addLayoutGrid(document, layerVisibilityDefaultValue, false); - Visualizations visualLayoutGrid = this.addLayoutGrid(document, layerVisibilityDefaultValue, true); + if (writeVisualLayoutParsingGrid) { + Visualizations visualLayoutGrid = this.addLayoutGrid(document, layerVisibilityDefaultValue, true); + allVisualizations = Stream.concat(Stream.of(layoutGrid, visualLayoutGrid), document.getVisualizations().streamAll()) + .toList(); + } else { + allVisualizations = Stream.concat(Stream.of(layoutGrid), document.getVisualizations().streamAll()) + .toList(); + } - viewerDocumentService.addVisualizationsOnPage(originFile, destinationFile, List.of(layoutGrid, visualLayoutGrid)); + 
viewerDocumentService.addVisualizationsOnPage(originFile, destinationFile, allVisualizations); } @@ -130,7 +138,10 @@ public class LayoutGridService { } for (Page page : table.getPages()) { - Optional optionalFirstRowOnPage = table.streamCol(0).filter(tableCell -> tableCell.isOnPage(page.getNumber())).map(TableCell::getRow).findFirst(); + Optional optionalFirstRowOnPage = table.streamCol(0) + .filter(tableCell -> tableCell.isOnPage(page.getNumber())) + .map(TableCell::getRow) + .findFirst(); if (optionalFirstRowOnPage.isEmpty()) { continue; } @@ -170,14 +181,17 @@ public class LayoutGridService { private static Stream streamBBoxOfCellsOnPage(Stream table, Page page) { - return table.filter(tableCell -> tableCell.isOnPage(page.getNumber())).map(TableCell::getBBox).map(bBoxMap -> bBoxMap.get(page)); + return table.filter(tableCell -> tableCell.isOnPage(page.getNumber())) + .map(TableCell::getBBox) + .map(bBoxMap -> bBoxMap.get(page)); } private void addSection(SemanticNode semanticNode, LayoutGrid layoutGrid, Color color) { Map bBoxMap = semanticNode.getBBox(); - List subSections = semanticNode.streamAllSubNodesOfType(NodeType.SECTION).toList(); + List subSections = semanticNode.streamAllSubNodesOfType(NodeType.SECTION) + .toList(); Page firstPage = semanticNode.getFirstPage(); if (!subSections.isEmpty()) { addPlacedText(firstPage, bBoxMap.get(firstPage), buildTreeIdString(semanticNode), layoutGrid); @@ -196,7 +210,10 @@ public class LayoutGridService { } return; } - List pagesInOrder = bBoxMap.keySet().stream().sorted(Comparator.comparingInt(Page::getNumber)).collect(Collectors.toList()); + List pagesInOrder = bBoxMap.keySet() + .stream() + .sorted(Comparator.comparingInt(Page::getNumber)) + .collect(Collectors.toList()); pagesInOrder.remove(0); addLinesForFirstPageOfSection(semanticNode, color, firstPage, layoutGrid); var lastPage = pagesInOrder.remove(pagesInOrder.size() - 1); @@ -293,7 +310,10 @@ public class LayoutGridService { private String 
buildTreeIdString(SemanticNode semanticNode) { - return semanticNode.getTreeId().stream().map(Object::toString).collect(Collectors.joining(".")); + return semanticNode.getTreeId() + .stream() + .map(Object::toString) + .collect(Collectors.joining(".")); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/CoordinateTransforms.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/CoordinateTransforms.java new file mode 100644 index 0000000..fbd540d --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/CoordinateTransforms.java @@ -0,0 +1,56 @@ +package com.knecon.fforesight.service.layoutparser.processor.utils; + +import java.awt.geom.AffineTransform; + +import lombok.SneakyThrows; +import lombok.experimental.UtilityClass; + +@UtilityClass +public class CoordinateTransforms { + + public AffineTransform calculateImageCoordsToInitialUserSpaceCoords(PageInformation pageInformation, double scalingFactor) { + + AffineTransform imageToCropBoxScaling = new AffineTransform(scalingFactor, 0, 0, scalingFactor, -pageInformation.minX(), -pageInformation.minY()); + + AffineTransform mirrorMatrix = new AffineTransform(1, 0, 0, -1, 0, pageInformation.height()); + + AffineTransform rotationMatrix = switch (pageInformation.rotationDegrees()) { + case 90 -> new AffineTransform(0, 1, -1, 0, pageInformation.height(), 0); + case 180 -> new AffineTransform(-1, 0, 0, -1, pageInformation.width(), pageInformation.height()); + case 270 -> new AffineTransform(0, -1, 1, 0, pageInformation.width() - pageInformation.height(), pageInformation.height()); // results from 90 + 180 rotations + default -> new AffineTransform(); + }; + + // matrix multiplication is performed from right to left, so the order is reversed. 
+ // scaling -> mirror -> rotation + AffineTransform resultMatrix = new AffineTransform(); + + resultMatrix.concatenate(rotationMatrix); + resultMatrix.concatenate(mirrorMatrix); + resultMatrix.concatenate(imageToCropBoxScaling); + return resultMatrix; + } + + + + @SneakyThrows + public AffineTransform calculateInitialUserSpaceCoordsToImageCoords(PageInformation pageInformation, double scalingFactor) { + + return calculateImageCoordsToInitialUserSpaceCoords(pageInformation, scalingFactor).createInverse(); + } + + + public double calculateScalingFactor(PageInformation pageInformation, double imageWidth) { + + // PDFBox always returns page height and width based on rotation + double pageWidth; + if (pageInformation.rotationDegrees() == 90 || pageInformation.rotationDegrees() == 270) { + pageWidth = pageInformation.height(); + } else { + pageWidth = pageInformation.width(); + } + + return pageWidth / imageWidth; + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/GeometricComparators.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/GeometricComparators.java index c21b516..7bd53a2 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/GeometricComparators.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/GeometricComparators.java @@ -1,10 +1,10 @@ package com.knecon.fforesight.service.layoutparser.processor.utils; import java.awt.geom.Point2D; +import java.awt.geom.Rectangle2D; import java.util.Comparator; import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; -import com.knecon.fforesight.service.layoutparser.processor.model.table.Rectangle; import 
com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; public class GeometricComparators { @@ -58,7 +58,7 @@ public class GeometricComparators { return cell1Size.compareTo(cell2Size); }; - public static final Comparator RECTANGLE_SIZE_COMPARATOR = (rect1, rect2) -> { + public static final Comparator RECTANGLE_SIZE_COMPARATOR = (rect1, rect2) -> { Double rect1Size = rect1.getHeight() * rect1.getWidth(); Double rect2Size = rect2.getHeight() * rect2.getWidth(); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/MarkedContentUtils.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/MarkedContentUtils.java index 799ac99..3e87eb4 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/MarkedContentUtils.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/MarkedContentUtils.java @@ -1,12 +1,5 @@ package com.knecon.fforesight.service.layoutparser.processor.utils; -import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; -import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; -import lombok.experimental.UtilityClass; -import org.apache.pdfbox.cos.COSName; -import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent; -import org.apache.pdfbox.text.TextPosition; - import java.awt.geom.Rectangle2D; import java.util.Collection; import java.util.Collections; @@ -14,12 +7,23 @@ import java.util.List; import java.util.Map; import java.util.stream.Collectors; +import org.apache.pdfbox.cos.COSName; +import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent; +import org.apache.pdfbox.text.TextPosition; + +import 
com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; + +import lombok.experimental.UtilityClass; + @UtilityClass public class MarkedContentUtils { public static final String HEADER = "Header"; public static final String FOOTER = "Footer"; + public List getMarkedContentBboxPerLine(List markedContents, String subtype) { if (markedContents == null) { @@ -31,7 +35,8 @@ public class MarkedContentUtils { .filter(m -> m.getProperties() != null) .filter(m -> m.getProperties().getItem("Subtype") != null) .filter(m -> ((COSName) m.getProperties().getItem("Subtype")).getName().equals(subtype)) - .map(PDMarkedContent::getContents).flatMap(Collection::stream) + .map(PDMarkedContent::getContents) + .flatMap(Collection::stream) .filter(t -> t instanceof TextPosition) .map(t -> (TextPosition) t) .filter(t -> !t.getUnicode().equals(" ")) @@ -41,16 +46,77 @@ public class MarkedContentUtils { return Collections.emptyList(); } - return markedContentByYPosition.values().stream() - .map(textPositions -> new TextPositionSequence(textPositions.stream() - .toList(), 0, true) - .getRectangle()) - .map(t -> new Rectangle2D.Float(t.getTopLeft().getX(), t.getTopLeft().getY() - Math.abs(t.getHeight()), t.getWidth(), Math.abs(t.getHeight()))).collect(Collectors.toList()); + return markedContentByYPosition.values() + .stream() + .map(textPositions -> new TextPositionSequence(textPositions, 0, true).getBBoxInitialUserSpace()) + .map(t -> new Rectangle2D.Double(t.getX(), t.getY() - Math.abs(t.getHeight()), t.getWidth(), Math.abs(t.getHeight()))) + .collect(Collectors.toList()); + } + + + public List getMarkedContentPositions(List markedContents) { + + if (markedContents == null) { + return Collections.emptyList(); + } + + return markedContents.stream() + .filter(m -> 
!m.getContents().isEmpty()) + .map(MarkedContentPosition::fromPDMarkedContent) + .toList(); } public boolean intersects(TextPageBlock textBlock, Map> markedContentBboxPerType, String type) { - return markedContentBboxPerType.get(type) != null && markedContentBboxPerType.get(type).stream().anyMatch(rectangle -> rectangle.intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight())); + + return markedContentBboxPerType.get(type) != null && markedContentBboxPerType.get(type) + .stream() + .anyMatch(rectangle -> rectangle.intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight())); + } + + + public record MarkedContentPosition(String type, String subType, List textPositions) { + + public static MarkedContentPosition fromPDMarkedContent(PDMarkedContent markedContent) { + + return new MarkedContentPosition(markedContent.getTag(), parseSubType(markedContent), parseTextPositions(markedContent.getContents())); + } + + + private static List parseTextPositions(List contents) { + + return contents.stream() + .filter(content -> content instanceof TextPosition) + .map(content -> (TextPosition) content) + .filter(content -> !content.getUnicode().equals(" ")) + .map(textPositions -> new TextPositionSequence(List.of(textPositions), 0, true)) + .map(BoundingBox::getBBoxInitialUserSpace) + .collect(Collectors.toList()); + } + + + private static String parseSubType(PDMarkedContent markedContent) { + + if (markedContent == null || markedContent.getProperties() == null || markedContent.getProperties().getItem("Subtype") == null) { + return null; + } + + return ((COSName) markedContent.getProperties().getItem("Subtype")).getName(); + } + + + public String formattedType() { + + if (subType == null || subType.isEmpty()) { + return type; + } + if (type.equals("Artifact")) { + return subType; + } + return String.format("%s-%s", type, subType); + + } + } } diff --git 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/graphics/PageInformation.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/PageInformation.java similarity index 91% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/graphics/PageInformation.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/PageInformation.java index 09c6f8a..0b53d74 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/graphics/PageInformation.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/PageInformation.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.processor.services.graphics; +package com.knecon.fforesight.service.layoutparser.processor.utils; import java.awt.geom.Rectangle2D; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/PositionUtils.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/PositionUtils.java index 48b720d..c22e2bb 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/PositionUtils.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/PositionUtils.java @@ -114,7 +114,7 @@ public final class PositionUtils { } - public Float getApproxLineCount(TextPageBlock textBlock) { + public double 
getApproxLineCount(TextPageBlock textBlock) { return textBlock.getHeight() / textBlock.getMostPopularWordHeight(); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/RectangleTransformations.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/RectangleTransformations.java index 14df80a..189fd2e 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/RectangleTransformations.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/RectangleTransformations.java @@ -52,7 +52,10 @@ public class RectangleTransformations { public static Rectangle2D bBoxUnionAtomicTextBlock(List atomicTextBlocks) { - return atomicTextBlocks.stream().flatMap(atomicTextBlock -> atomicTextBlock.getPositions().stream()).collect(new Rectangle2DBBoxCollector()); + return atomicTextBlocks.stream() + .flatMap(atomicTextBlock -> atomicTextBlock.getPositions() + .stream()) + .collect(new Rectangle2DBBoxCollector()); } @@ -77,7 +80,10 @@ public class RectangleTransformations { public static Rectangle2D atomicTextBlockBBox(List atomicTextBlocks) { - return atomicTextBlocks.stream().flatMap(atomicTextBlock -> atomicTextBlock.getPositions().stream()).collect(new Rectangle2DBBoxCollector()); + return atomicTextBlocks.stream() + .flatMap(atomicTextBlock -> atomicTextBlock.getPositions() + .stream()) + .collect(new Rectangle2DBBoxCollector()); } @@ -89,16 +95,18 @@ public class RectangleTransformations { public static Rectangle2D rectangleBBox(List rectangles) { - return rectangles.stream().map(RectangleTransformations::toRectangle2D).collect(new Rectangle2DBBoxCollector()); + return rectangles.stream() + .map(RectangleTransformations::toRectangle2D) + .collect(new 
Rectangle2DBBoxCollector()); } public static Rectangle2D toRectangle2D(Rectangle redactionLogRectangle) { return new Rectangle2D.Double(redactionLogRectangle.getTopLeft().getX(), - redactionLogRectangle.getTopLeft().getY() + redactionLogRectangle.getHeight(), - redactionLogRectangle.getWidth(), - -redactionLogRectangle.getHeight()); + redactionLogRectangle.getTopLeft().getY() + redactionLogRectangle.getHeight(), + redactionLogRectangle.getWidth(), + -redactionLogRectangle.getHeight()); } @@ -111,15 +119,16 @@ public class RectangleTransformations { public static Rectangle toRedactionLogRectangle(Rectangle2D rectangle2D, int pageNumber) { return new Rectangle(new Point((float) rectangle2D.getMinX(), (float) (rectangle2D.getMinY() + rectangle2D.getHeight())), - (float) rectangle2D.getWidth(), - -(float) rectangle2D.getHeight(), - pageNumber); + (float) rectangle2D.getWidth(), + -(float) rectangle2D.getHeight(), + pageNumber); } public static Rectangle2D rectangle2DBBox(List rectangle2DList) { - return rectangle2DList.stream().collect(new Rectangle2DBBoxCollector()); + return rectangle2DList.stream() + .collect(new Rectangle2DBBoxCollector()); } @@ -134,7 +143,8 @@ public class RectangleTransformations { if (rectangle2DList.isEmpty()) { return Collections.emptyList(); } - double splitThreshold = rectangle2DList.stream().mapToDouble(RectangularShape::getWidth).average().orElse(5) * 5.0; + double splitThreshold = rectangle2DList.stream() + .mapToDouble(RectangularShape::getWidth).average().orElse(5) * 5.0; List> rectangleListsWithGaps = new LinkedList<>(); List rectangleListWithoutGaps = new LinkedList<>(); @@ -171,7 +181,7 @@ public class RectangleTransformations { verticalRulings.add(new Ruling(new Point2D.Float(rectangle.x + rectangle.width, rectangle.y), new Point2D.Float(rectangle.x + rectangle.width, rectangle.y + rectangle.height))); }); - return CleanRulings.builder().vertical(verticalRulings).horizontal(horizontalRulings).build(); + return new 
CleanRulings(verticalRulings, horizontalRulings); } @@ -195,9 +205,9 @@ public class RectangleTransformations { public BinaryOperator combiner() { return (b1, b2) -> new BBox(Math.min(b1.lowerLeftX, b2.lowerLeftX), - Math.min(b1.lowerLeftY, b2.lowerLeftY), - Math.max(b1.upperRightX, b2.upperRightX), - Math.max(b1.upperRightY, b2.upperRightY)); + Math.min(b1.lowerLeftY, b2.lowerLeftY), + Math.max(b1.upperRightX, b2.upperRightX), + Math.max(b1.upperRightY, b2.upperRightY)); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/RectangularIntersectionFinder.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/RectangularIntersectionFinder.java index 3f47b40..60a19b9 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/RectangularIntersectionFinder.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/RectangularIntersectionFinder.java @@ -14,23 +14,24 @@ public class RectangularIntersectionFinder { public static List find(List horizontalRulingLines, List verticalRulingLines) { - // Fix for 211.pdf - for (Ruling r : horizontalRulingLines) { - if (r.getX2() < r.getX1()) { - double a = r.getX2(); - r.x2 = (float) r.getX1(); - r.x1 = (float) a; - } - } +// // Fix for 211.pdf +// for (Ruling r : horizontalRulingLines) { +// if (r.getX2() < r.getX1()) { +// double a = r.getX2(); +// r.x2 = (float) r.getX1(); +// r.x1 = (float) a; +// } +// } List foundRectangles = new ArrayList<>(); - Map intersectionPoints = Ruling.findIntersections(horizontalRulingLines, verticalRulingLines); + Map intersectionPoints = RulingIntersectionFinder.findNaive(horizontalRulingLines, verticalRulingLines); + List intersectionPointsList = new 
ArrayList<>(intersectionPoints.keySet()); intersectionPointsList.sort(Y_FIRST_POINT_COMPARATOR); for (int i = 0; i < intersectionPointsList.size(); i++) { Point2D topLeft = intersectionPointsList.get(i); - Ruling[] hv = intersectionPoints.get(topLeft); + RulingIntersectionFinder.IntersectingRulings intersectingRulingsFromTopLeft = intersectionPoints.get(topLeft); // CrossingPointsDirectlyBelow( topLeft ); List xPoints = new ArrayList<>(); @@ -48,19 +49,24 @@ public class RectangularIntersectionFinder { outer: for (Point2D xPoint : xPoints) { // is there a vertical edge b/w topLeft and xPoint? - if (!hv[1].equals(intersectionPoints.get(xPoint)[1])) { + if (!intersectingRulingsFromTopLeft.vertical().equals(intersectionPoints.get(xPoint).vertical())) { continue; } for (Point2D yPoint : yPoints) { // is there a horizontal edge b/w topLeft and yPoint ? - if (!hv[0].equals(intersectionPoints.get(yPoint)[0])) { + if (!intersectingRulingsFromTopLeft.horizontal().equals(intersectionPoints.get(yPoint).horizontal())) { continue; } Point2D btmRight = new Point2D.Float((float) yPoint.getX(), (float) xPoint.getY()); if (intersectionPoints.containsKey(btmRight) - && intersectionPoints.get(btmRight)[0].equals(intersectionPoints.get(xPoint)[0]) - && intersectionPoints.get(btmRight)[1].equals(intersectionPoints.get(yPoint)[1])) { + && intersectionPoints.get(btmRight).horizontal().equals(intersectionPoints.get(xPoint).horizontal()) + && intersectionPoints.get(btmRight).vertical().equals(intersectionPoints.get(yPoint).vertical())) { + foundRectangles.add(new Rectangle2D.Double(topLeft.getX(), topLeft.getY(), btmRight.getX() - topLeft.getX(), btmRight.getY() - topLeft.getY())); + intersectionPoints.get(topLeft).horizontal().setClassification(Ruling.Classification.TABLE_LINE); + intersectionPoints.get(topLeft).vertical().setClassification(Ruling.Classification.TABLE_LINE); + intersectionPoints.get(btmRight).horizontal().setClassification(Ruling.Classification.TABLE_LINE); + 
intersectionPoints.get(btmRight).vertical().setClassification(Ruling.Classification.TABLE_LINE); break outer; } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/RulingIntersectionFinder.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/RulingIntersectionFinder.java new file mode 100644 index 0000000..e69bcee --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/RulingIntersectionFinder.java @@ -0,0 +1,200 @@ +package com.knecon.fforesight.service.layoutparser.processor.utils; + +import java.awt.geom.Point2D; +import java.util.Collections; +import java.util.Comparator; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.TreeMap; + +import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; + +import lombok.experimental.UtilityClass; +import lombok.extern.slf4j.Slf4j; + +@Slf4j +@UtilityClass +public class RulingIntersectionFinder { + + public static final int PERPENDICULAR_UNIT_EXPAND_AMOUNT = 2; + + public static final Comparator Y_THEN_X_POINT_COMPARATOR = Comparator.comparingDouble(Point2D::getY).thenComparing(Point2D::getX); + + + /** + * Implementation to find line intersection in O(P + n log n), where n is the number of lines and P the numer of intersections. + * based on Segment Intersection by Piotr Indyk + * + * @param horizontals a list of non-overlapping horizontal rulings + * @param verticals a list of non-overlapping vertical rulings + * @return a Map of each found intersection point pointing to the two lines forming the intersection. + */ + /* + * The algorithm assumes there are only horizontal and vertical lines which are unique in their coordinates. (E.g. 
no overlapping horizontal lines exist) + * As a high level overview, the algorithm uses a sweep line advancing from left to right. + * It dynamically updates the horizontal rulings which are intersected by the current sweep line. + * When the sweep line hits a vertical line, it then checks for all intersections with the currently intersected horizontal rulings. + * The trick of the algorithm is using a binary search tree to store the currently intersected horizontal rulings. This way the lookup should be in O(log n). + * This way the initial sorting step has the highest complexity class (O(n log n)) and thus determines the complexity class of the entire algorithm. + * Unfortunately, the implementation here takes a few liberties compared to the original algorithm. The binary search tree is replaced by an ordered Set which is simply looped over. + * Therefore, in this implementation's worst case, where all horizontal lines span the entire sweep, you are essentially performing the naive approach with a bunch of overhead. + * Since we are using this implementation to find table cells, one can expect this worst case to always be the case. + * A simple runtime comparison for a single page with the most lines we can expect (SinglePages/AbsolutelyEnormousTable.pdf with 30 horizontals and 144 verticals) shows this implementation takes roughly 14 ms, whereas the naive approach takes 7 ms. Both are negligible, but the naive approach is two times as fast. + * If we would like to make this faster, we would need a better data structure for 'TreeMap horizontalRulingsInCurrentSweep', where we can query the TreeMap for all horizontal rulings in a given interval in O(log n). 
+ */ + public Map find(List horizontals, List verticals) { + + long start = System.currentTimeMillis(); + List sweepTrajectory = buildSweepTrajectory(horizontals, verticals); + + TreeMap horizontalRulingsInCurrentSweep = new TreeMap<>(Comparator.comparingDouble(Ruling::getTop)); + + TreeMap intersections = new TreeMap<>(Y_THEN_X_POINT_COMPARATOR); + + for (SweepStep step : sweepTrajectory) { + switch (step.type) { + case VERTICAL: // check for intersections with currently intersected horizontal lines + for (Ruling horizontalRuling : horizontalRulingsInCurrentSweep.navigableKeySet()) { + + Optional intersectionPoint = findIntersectionPoint(horizontalRuling, step.ruling); + + if (intersectionPoint.isEmpty()) { + continue; + } + + intersections.put(intersectionPoint.get(), new IntersectingRulings(horizontalRuling, step.ruling)); + } + break; + case HORIZONTAL_ENTRY: // sweep line now intersects this horizontal ruling + horizontalRulingsInCurrentSweep.put(step.ruling, null); + break; + case HORIZONTAL_EXIT: // sweep line no longer intersects this horizontal ruling + horizontalRulingsInCurrentSweep.remove(step.ruling); + break; + } + } + log.debug("Finished building intersections with line sweep in {} ms", System.currentTimeMillis() - start); + + return intersections; + + } + + + /** + * Naive Approach in O(n^2) of finding intersections between lines by iterating over all lines. + * + * @param horizontals a list of non-overlapping horizontal rulings + * @param verticals a list of non-overlapping vertical rulings + * @return a Map of each found intersection point pointing to the two lines forming the intersection. 
+ */ + public Map findNaive(List horizontals, List verticals) { + + long start = System.currentTimeMillis(); + TreeMap intersections = new TreeMap<>(Y_THEN_X_POINT_COMPARATOR); + + for (Ruling horizontal : horizontals) { + for (Ruling vertical : verticals) { + Optional intersectionPoint = findIntersectionPoint(horizontal, vertical); + + if (intersectionPoint.isEmpty()) { + continue; + } + + intersections.put(intersectionPoint.get(), new IntersectingRulings(horizontal, vertical)); + } + } + log.debug("Finished building intersections naively in {} ms", System.currentTimeMillis() - start); + + return intersections; + } + + + private static List buildSweepTrajectory(List horizontals, List verticals) { + + List sweepTrajectory = new LinkedList<>(); + + for (Ruling horizontalRuling : horizontals) { + sweepTrajectory.add(new SweepStep(SweepStep.Type.HORIZONTAL_ENTRY, horizontalRuling.getLeft() - PERPENDICULAR_UNIT_EXPAND_AMOUNT, horizontalRuling)); + sweepTrajectory.add(new SweepStep(SweepStep.Type.HORIZONTAL_EXIT, horizontalRuling.getRight() + PERPENDICULAR_UNIT_EXPAND_AMOUNT, horizontalRuling)); + } + + for (Ruling verticalRuling : verticals) { + sweepTrajectory.add(new SweepStep(SweepStep.Type.VERTICAL, verticalRuling.getLeft(), verticalRuling)); + } + + Collections.sort(sweepTrajectory); + + return sweepTrajectory; + } + + + public Optional findIntersectionPoint(Ruling horizontal, Ruling vertical) { + + if (!horizontal.isHorizontal() || !vertical.isVertical()) { + log.warn("lines must be orthogonal, vertical and horizontal"); + return Optional.empty(); + } + + Ruling expanded_horizontal = horizontal.expand(PERPENDICULAR_UNIT_EXPAND_AMOUNT); + Ruling expanded_vertical = vertical.expand(PERPENDICULAR_UNIT_EXPAND_AMOUNT); + + if (!expanded_horizontal.intersectsLine(expanded_vertical)) { + return Optional.empty(); + } + + return Optional.of(new Point2D.Float(vertical.getLeft(), horizontal.getTop())); + } + + + private class SweepStep implements Comparable { + + protected 
Type type; + protected float y_position; + protected Ruling ruling; + + private enum Type { + VERTICAL, + HORIZONTAL_EXIT, + HORIZONTAL_ENTRY + } + + + SweepStep(Type type, float y_position, Ruling ruling) { + + this.type = type; + this.y_position = y_position; + this.ruling = ruling; + } + + + @Override + public int compareTo(SweepStep other) { + + int rv; + if (DoubleComparisons.feq(y_position, other.y_position)) { + if (type == SweepStep.Type.VERTICAL && other.type == SweepStep.Type.HORIZONTAL_ENTRY) { + rv = 1; + } else if (type == SweepStep.Type.VERTICAL && other.type == SweepStep.Type.HORIZONTAL_EXIT) { + rv = -1; + } else if (type == SweepStep.Type.HORIZONTAL_ENTRY && other.type == SweepStep.Type.VERTICAL) { + rv = -1; + } else if (type == SweepStep.Type.HORIZONTAL_EXIT && other.type == SweepStep.Type.VERTICAL) { + rv = 1; + } else { + rv = Double.compare(y_position, other.y_position); + } + } else { + return Double.compare(y_position, other.y_position); + } + return rv; + } + + } + + public record IntersectingRulings(Ruling horizontal, Ruling vertical) { + + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/SpreadsheetFinder.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/SpreadsheetFinder.java index 660ef3f..b072352 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/SpreadsheetFinder.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/SpreadsheetFinder.java @@ -4,6 +4,7 @@ import static com.knecon.fforesight.service.layoutparser.processor.utils.Geometr import static com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators.Y_FIRST_POINT_COMPARATOR; import java.awt.geom.Point2D; +import 
java.awt.geom.Rectangle2D; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; @@ -11,7 +12,7 @@ import java.util.List; import java.util.Map; import java.util.Set; -import com.knecon.fforesight.service.layoutparser.processor.model.table.Rectangle; +import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; public class SpreadsheetFinder { @@ -19,15 +20,15 @@ public class SpreadsheetFinder { private static final float AREA_TOLERANCE = 0.001f; - public static List findSpreadsheetsFromCells(List cells) { + public static List findSpreadsheetsFromCells(List cells) { // via: http://stackoverflow.com/questions/13746284/merging-multiple-adjacent-rectangles-into-one-polygon - List rectangles = new ArrayList<>(); + List rectangles = new ArrayList<>(); Set pointSet = new HashSet<>(); Map edgesH = new HashMap<>(); Map edgesV = new HashMap<>(); - for (Rectangle cell : cells) { - for (Point2D pt : cell.getPoints()) { + for (Cell cell : cells) { + for (Point2D pt : getPoints(cell.getBBoxInitialUserSpace())) { if (pointSet.contains(pt)) { // shared vertex, remove it pointSet.remove(pt); } else { @@ -116,13 +117,22 @@ public class SpreadsheetFinder { // do not add polygons with too many outer points as they are unlikely to be tables if (poly.size() <= MAX_OUTER_POINT_TOLERANCE) { - rectangles.add(new Rectangle(top - AREA_TOLERANCE, left - AREA_TOLERANCE, right - left + 2 * AREA_TOLERANCE, bottom - top + 2 * AREA_TOLERANCE)); + rectangles.add(new Rectangle2D.Double(left - AREA_TOLERANCE, top - AREA_TOLERANCE, right - left + (2 * AREA_TOLERANCE), bottom - top + (2 * AREA_TOLERANCE))); } } return rectangles; } + public static List getPoints(Rectangle2D rectangle2D) { + + return List.of(new Point2D.Double(rectangle2D.getX(), rectangle2D.getY()), + new Point2D.Double(rectangle2D.getMaxX(), rectangle2D.getY()), + new Point2D.Double(rectangle2D.getMaxX(), rectangle2D.getMaxY()), + new Point2D.Double(rectangle2D.getX(), 
rectangle2D.getMaxY())); + } + + private enum Direction { HORIZONTAL, VERTICAL diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutparsingVisualizations.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutparsingVisualizations.java new file mode 100644 index 0000000..e89ef31 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutparsingVisualizations.java @@ -0,0 +1,310 @@ +package com.knecon.fforesight.service.layoutparser.processor.visualization; + +import java.awt.Color; +import java.awt.geom.Line2D; +import java.awt.geom.Point2D; +import java.awt.geom.Rectangle2D; +import java.util.Collection; +import java.util.List; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.stream.Stream; + +import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent; + +import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle; +import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox; +import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Line; +import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone; +import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; +import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils; +import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; +import 
com.knecon.fforesight.service.viewerdoc.ContentStreams; +import com.knecon.fforesight.service.viewerdoc.model.ColoredLine; +import com.knecon.fforesight.service.viewerdoc.model.ColoredRectangle; +import com.knecon.fforesight.service.viewerdoc.model.PlacedText; +import com.knecon.fforesight.service.viewerdoc.model.Standard14EmbeddableFont; +import com.knecon.fforesight.service.viewerdoc.model.Visualizations; +import com.knecon.fforesight.service.viewerdoc.model.VisualizationsOnPage; + +import lombok.AccessLevel; +import lombok.AllArgsConstructor; +import lombok.Getter; +import lombok.NoArgsConstructor; +import lombok.Setter; +import lombok.experimental.FieldDefaults; + +@Getter +@NoArgsConstructor +@AllArgsConstructor +@FieldDefaults(level = AccessLevel.PRIVATE) +public class LayoutparsingVisualizations { + + static final Standard14EmbeddableFont FONT = Standard14EmbeddableFont.helvetica(); + + static final Color WORDS_COLOR = new Color(68, 84, 147); + static final Color LINES_COLOR = new Color(152, 45, 179); + static final Color ZONES_COLOR = new Color(131, 38, 38); + + static final Color RULINGS_COLOR = new Color(21, 221, 174); + static final Color TABLE_RULINGS_COLOR = new Color(255, 175, 175); + static final Color HEADER_RULING_COLOR = new Color(171, 131, 6); + static final Color FOOTER_RULING_COLOR = new Color(106, 82, 2); + static final Color UNDERLINE_RULING_COLOR = new Color(6, 39, 171); + static final Color STRIKETROUGH_RULING_COLOR = new Color(171, 6, 6); + + static final Color CELLS_COLOR = new Color(31, 214, 27); + + static final Color MAIN_BODY_COLOR = new Color(171, 131, 6); + static final Color MARKED_CONTENT_COLOR = new Color(171, 131, 6); + + static final List ROTATING_CHARACTER_COLOR = List.of(new Color(255, 87, 51), + new Color(255, 195, 0), + new Color(76, 175, 80), + new Color(33, 150, 243), + new Color(155, 89, 182), + new Color(233, 30, 99), + new Color(0, 188, 212), + new Color(121, 85, 72)); + + @Setter + boolean active; + + final 
Visualizations words = Visualizations.builder().layer(ContentStreams.WORDS).build(); + final Visualizations lines = Visualizations.builder().layer(ContentStreams.LINES).build(); + final Visualizations zones = Visualizations.builder().layer(ContentStreams.ZONES).build(); + final Visualizations mainBody = Visualizations.builder().layer(ContentStreams.MAIN_BODY).build(); + final Visualizations clean_rulings = Visualizations.builder().layer(ContentStreams.CLEAN_RULINGS).build(); + final Visualizations rulings = Visualizations.builder().layer(ContentStreams.RULINGS).build(); + final Visualizations cells = Visualizations.builder().layer(ContentStreams.CELLS).build(); + final Visualizations markedContent = Visualizations.builder().layer(ContentStreams.MARKED_CONTENT).build(); + final Visualizations neighbours = Visualizations.builder().layer(ContentStreams.NEIGHBOURS).build(); + final Visualizations characters = Visualizations.builder().layer(ContentStreams.CHARACTERS).build(); + + + public Stream streamAll() { + + if (!active) { + return Stream.empty(); + } + return Stream.of(characters, // + neighbours,// + words, // + lines, // + zones, // + rulings, // + clean_rulings, // + cells, // + mainBody, // + markedContent // + ); + } + + + public void addTextVisualizations(List textPositionSequences, int pageNumber) { + + if (!active) { + return; + } + VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(pageNumber, this.words); + visualizationsOnPage.getColoredRectangles() + .addAll(textPositionSequences.stream() + .map(BoundingBox::getBBoxInitialUserSpace) + .map(rect -> new ColoredRectangle(rect, WORDS_COLOR, 1)) + .toList()); + } + + + public void addCleanRulingVisualization(CleanRulings cleanRulings, int pageNumber) { + + if (!active) { + return; + } + VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(pageNumber, this.clean_rulings); + visualizationsOnPage.getColoredLines() + .addAll(cleanRulings.buildAll() + .stream() + 
.map(ruling -> new ColoredLine(ruling, decideOnRulingColor(ruling), 0.5f)) + .toList()); + } + + public void addRulingVisualization(List rulings, int pageNumber) { + + if (!active) { + return; + } + VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(pageNumber, this.rulings); + visualizationsOnPage.getColoredLines() + .addAll(rulings + .stream() + .map(ruling -> new ColoredLine(ruling, decideOnRulingColor(ruling), 0.5f)) + .toList()); + } + + + private Color decideOnRulingColor(Ruling ruling) { + + return switch (ruling.getClassification()) { + case TABLE_LINE -> TABLE_RULINGS_COLOR; + case HEADER_SEPARATOR -> HEADER_RULING_COLOR; + case FOOTER_SEPARATOR -> FOOTER_RULING_COLOR; + case UNDERLINE -> UNDERLINE_RULING_COLOR; + case STRIKETROUGH -> STRIKETROUGH_RULING_COLOR; + default -> RULINGS_COLOR; + }; + } + + + public void addCellVisualizations(List cells, int pageNumber) { + + if (!active) { + return; + } + VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(pageNumber, this.cells); + visualizationsOnPage.getColoredRectangles() + .addAll(cells.stream() + .map(cell -> new ColoredRectangle(cell.getBBoxInitialUserSpace(), CELLS_COLOR, 1)) + .toList()); + } + + + public void addZoneVisualizations(List zones, int page) { + + if (!active) { + return; + } + + VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(page, this.zones); + visualizationsOnPage.getColoredRectangles() + .addAll(zones.stream() + .map(BoundingBox::getBBoxInitialUserSpace) + .map(zone -> new ColoredRectangle(zone, ZONES_COLOR, 1)) + .toList()); + + } + + + public void addLineVisualizationsFromZones(List zones, int page) { + + addLineVisualizations(zones.stream() + .map(Zone::getLines) + .flatMap(Collection::stream) + .toList(), page); + + } + + + public void addLineVisualizations(List lines, int page) { + + if (!active) { + return; + } + VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(page, 
this.lines); + visualizationsOnPage.getColoredRectangles() + .addAll(lines.stream() + .map(BoundingBox::getBBoxInitialUserSpace) + .map(line -> new ColoredRectangle(line, LINES_COLOR, 0.5f)) + .toList()); + } + + + public void addTextBlockVisualizations(List textPageBlocks, int page) { + + if (!active) { + return; + } + VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(page, zones); + visualizationsOnPage.getColoredRectangles() + .addAll(textPageBlocks.stream() + .map(rect -> new ColoredRectangle(rect.getBBoxInitialUserSpace(), ZONES_COLOR, 1)) + .toList()); + } + + + public void addMainBodyVisualization(Rectangle rectangle, int pageNumber) { + + if (!active) { + return; + } + VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(pageNumber, mainBody); + visualizationsOnPage.getColoredRectangles() + .add(new ColoredRectangle(new Rectangle2D.Double(rectangle.getTopLeft().getX(), rectangle.getTopLeft().getY(), rectangle.getWidth(), rectangle.getHeight()), + MAIN_BODY_COLOR, + 1)); + } + + + public void addMarkedContentVisualizations(List markedContents, int pageNumber) { + + if (!active) { + return; + } + + VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(pageNumber, markedContent); + + List markedContentBBoxMapBySubType = MarkedContentUtils.getMarkedContentPositions(markedContents); + AtomicInteger count = new AtomicInteger(); + markedContentBBoxMapBySubType.forEach(markedContentPosition -> { + var bbox = markedContentPosition.textPositions() + .stream() + .collect(RectangleTransformations.collectBBox()); + String type = markedContentPosition.formattedType() + " " + count.getAndIncrement(); + + float translationAmount = ((FONT.getStringWidth(type) / 100) + 6); + // Pushes the string to the left of the box: calculate string width, divide by font units (1000), multiply with font size (10), add small offset (6). 
+ + visualizationsOnPage.getPlacedTexts() + .add(PlacedText.textFacingUp(type, new Point2D.Double(bbox.getX() - translationAmount, bbox.getY() + bbox.getHeight()), 10, Color.BLACK, FONT)); + + visualizationsOnPage.getColoredRectangles().add(new ColoredRectangle(bbox, MARKED_CONTENT_COLOR, 1)); + } + + ); + + } + + + public void addCharactersWithNeighbours(List zones, int page) { + + if (!active) { + return; + } + + VisualizationsOnPage characterVisualizations = getOrCreateVisualizationsOnPage(page, characters); + VisualizationsOnPage neighbourVisualizations = getOrCreateVisualizationsOnPage(page, neighbours); + + AtomicInteger index = new AtomicInteger(0); + zones.forEach(zone -> zone.getLines() + .stream() + .map(Line::getCharacters) + .flatMap(Collection::stream) + .forEach(character -> { + Color color = ROTATING_CHARACTER_COLOR.get(index.getAndIncrement() % ROTATING_CHARACTER_COLOR.size()); + Rectangle2D charBBox = character.getTextPosition().getBBoxInitialUserSpace(); + characterVisualizations.getColoredRectangles().add(new ColoredRectangle(charBBox, color, 1)); + character.getNeighbors() + .forEach(neighbor -> { + Rectangle2D neighborBBox = neighbor.getCharacter().getTextPosition().getBBoxInitialUserSpace(); + Line2D line = new Line2D.Double(new Point2D.Double(charBBox.getCenterX(), charBBox.getCenterY()), + new Point2D.Double(neighborBBox.getCenterX(), neighborBBox.getCenterY())); + neighbourVisualizations.getColoredLines().add(new ColoredLine(line, color, 1)); + }); + })); + + } + + + private VisualizationsOnPage getOrCreateVisualizationsOnPage(int page, Visualizations visualizations) { + + if (visualizations.getVisualizationsOnPages().containsKey(page - 1)) { + return visualizations.getVisualizationsOnPages() + .get(page - 1); + } + VisualizationsOnPage visualizationsOnPage = VisualizationsOnPage.builder().build(); + visualizations.getVisualizationsOnPages().put(page - 1, visualizationsOnPage); + return visualizationsOnPage; + } + +} diff --git 
a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/HeadlinesGoldStandardIntegrationTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/HeadlinesGoldStandardIntegrationTest.java index 274e1e8..15d0e8d 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/HeadlinesGoldStandardIntegrationTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/HeadlinesGoldStandardIntegrationTest.java @@ -96,8 +96,8 @@ public class HeadlinesGoldStandardIntegrationTest { goldStandardLog.getRedactionLogEntry().removeIf(r -> !r.isRedacted() || r.getChanges().get(r.getChanges().size() - 1).getType().equals(ChangeType.REMOVED)); goldStandardLog.getRedactionLogEntry().forEach(e -> goldStandardHeadlines.add(new Headline(e.getPositions().get(0).getPage(), e.getValue()))); - Document documentGraph = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_OLD, - layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_OLD, + Document documentGraph = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.DOCUMINE, + layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE, pdfFileResource.getFile(), new ImageServiceResponse(), new TableServiceResponse(), diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java index 7fde740..8763e37 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java +++ 
b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java @@ -1,10 +1,20 @@ package com.knecon.fforesight.service.layoutparser.server; +import java.io.File; +import java.nio.file.Files; +import java.nio.file.Path; import java.util.Arrays; +import java.util.Comparator; +import java.util.List; +import java.util.concurrent.atomic.AtomicInteger; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.core.io.ClassPathResource; +import com.iqser.red.storage.commons.utils.FileSystemBackedStorageService; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; @@ -20,28 +30,65 @@ public class LayoutparserEnd2EndTest extends AbstractTest { @Autowired private LayoutParsingPipeline layoutParsingPipeline; - + @Disabled @Test - @SneakyThrows public void testLayoutParserEndToEnd() { - prepareStorage("files/bdr/Wie weiter bei Kristeneinrichtungen.pdf"); - LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER); - LayoutParsingFinishedEvent finishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest); - Arrays.stream(finishedEvent.message().split("\n")) - .forEach(log::info); + String filePath = "files/Minimal Examples/RotateTextWithRulingsTestFile.pdf"; + + runForFile(filePath); + } + + @Test + @Disabled + @SneakyThrows + public void testLayoutParserEndToEndWithFolder() { + + String folder = "/home/kschuettler/Dokumente/TestFiles/large number of prod files"; + List pdfFiles = Files.walk(Path.of(folder)) + .filter(path -> 
path.getFileName().toString().endsWith(".pdf")) + .sorted(Comparator.comparing(Path::getFileName)) + .peek(System.out::println) + .toList(); + + System.out.printf("Found %d pdf files to process %n", pdfFiles.size()); + AtomicInteger count = new AtomicInteger(0); + pdfFiles.stream() + .peek(path -> log.info("{}/{}-{}", count.getAndIncrement(), pdfFiles.size(), path.getFileName())) + .forEach(path -> runForFile(path.toFile().toString())); } - @Test @SneakyThrows - public void testLayoutParserEndToEnd_RED_8747() { + private void runForFile(String filePath) { + + String fileName = Path.of(filePath).getFileName().toString(); + File file; + if (filePath.startsWith("files")) { // from resources + file = new ClassPathResource(filePath).getFile(); + } else { // absolute path + file = new File(filePath); + } + + LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(fileName, LayoutParsingType.REDACT_MANAGER, true); + prepareStorage(layoutParsingRequest, file); - prepareStorage("files/syngenta/CustomerFiles/SinglePages/Page26_fRR A23317A PI0015600 CEU core part B6 - CZ.pdf"); - LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER_OLD); LayoutParsingFinishedEvent finishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest); + Arrays.stream(finishedEvent.message().split("\n")) .forEach(log::info); + + File tmpFile = new File("/tmp/layoutparserEND2END/" + fileName + "_VIEWER.pdf"); + assert tmpFile.getParentFile().exists() || tmpFile.getParentFile().mkdirs(); + + storageService.downloadTo(TENANT_ID, layoutParsingRequest.viewerDocumentStorageId(), tmpFile); + } + + + @AfterEach + public void cleanUpTmp() { + + ((FileSystemBackedStorageService) storageService).clearStorage(); } } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java 
b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java index a26754a..18b10df 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java @@ -23,6 +23,10 @@ import lombok.SneakyThrows; public class ViewerDocumentTest extends BuildDocumentTest { + ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null); + LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService); + + @Test @SneakyThrows public void testViewerDocument() { @@ -31,12 +35,10 @@ public class ViewerDocumentTest extends BuildDocumentTest { String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf"; var documentFile = new ClassPathResource(fileName).getFile(); - ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null); - LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService); long start = System.currentTimeMillis(); - Document document = buildGraph(fileName, LayoutParsingType.DOCUMINE); - layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true); + Document document = buildGraph(fileName, LayoutParsingType.REDACT_MANAGER); + layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true, false); System.out.printf("Total time: %.2fs%n", ((float) (System.currentTimeMillis() - start)) / 1000); } @@ -54,17 +56,17 @@ public class ViewerDocumentTest extends BuildDocumentTest { var tableResponse = mapper.readValue(new ClassPathResource(tableFileName).getInputStream(), TableServiceResponse.class); var documentFile = new ClassPathResource(fileName).getFile(); - var classificationDocument = 
layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE, - documentFile, - new ImageServiceResponse(), - tableResponse, - new VisualLayoutParsingResponse(), - Map.of("file", Path.of(fileName).getFileName().toFile().toString())); + var classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE_OLD, + documentFile, + new ImageServiceResponse(), + tableResponse, + new VisualLayoutParsingResponse(), + Map.of("file", Path.of(fileName).getFileName().toFile().toString())); ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null); LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService); - Document document = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.DOCUMINE, classificationDocument); + Document document = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.DOCUMINE_OLD, classificationDocument); - layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true); + layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true, false); } } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/model/CleanRulingsTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/model/CleanRulingsTest.java new file mode 100644 index 0000000..4081aa6 --- /dev/null +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/model/CleanRulingsTest.java @@ -0,0 +1,118 @@ +package com.knecon.fforesight.service.layoutparser.server.model; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.awt.geom.Point2D; +import java.awt.geom.Rectangle2D; +import java.util.Collections; +import java.util.List; +import 
java.util.stream.IntStream; + +import org.junit.jupiter.api.Test; + +import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; +import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; + +class CleanRulingsTest { + + @Test + public void testLineBetween() { + + List verticals = List.of(new Ruling(new Point2D.Double(10, 0), new Point2D.Double(10, 10))); + List horizontals = List.of(new Ruling(new Point2D.Double(0, 5), new Point2D.Double(10, 5))); + CleanRulings cleanRulings = new CleanRulings(horizontals, verticals); + Rectangle2D a = new Rectangle2D.Double(1, 6, 3, 3); + Rectangle2D b = new Rectangle2D.Double(5, 6, 3, 3); + Rectangle2D c = new Rectangle2D.Double(11, 6, 3, 3); + Rectangle2D d = new Rectangle2D.Double(1, 1, 3, 3); + Rectangle2D e = new Rectangle2D.Double(5, 1, 3, 3); + Rectangle2D f = new Rectangle2D.Double(11, 1, 3, 3); + + assertFalse(cleanRulings.lineBetween(a, a)); + assertFalse(cleanRulings.lineBetween(a, b)); + assertTrue(cleanRulings.lineBetween(a, c)); + assertTrue(cleanRulings.lineBetween(a, d)); + assertTrue(cleanRulings.lineBetween(a, e)); + assertTrue(cleanRulings.lineBetween(a, f)); + } + + + @Test + public void testSingleLineInRange() { + + List horizontals = List.of(new Ruling(new Point2D.Float(0, 1), new Point2D.Float(100, 1))); + List verticals = List.of(new Ruling(new Point2D.Float(1, 0), new Point2D.Float(1, 100))); + + CleanRulings cleanRulings = new CleanRulings(horizontals, verticals); + + assertEquals(0, cleanRulings.getVerticalsInXInterval(-2, -1).size()); + assertEquals(0, cleanRulings.getVerticalsInXInterval(-2, -5).size()); + assertEquals(0, cleanRulings.getVerticalsInXInterval(-2, Float.NaN).size()); + assertEquals(1, cleanRulings.getVerticalsInXInterval(1, 10).size()); + assertEquals(0, cleanRulings.getVerticalsInXInterval(100, 101).size()); + assertEquals(verticals.size(), cleanRulings.getVerticalsInXInterval(Float.NEGATIVE_INFINITY, Float.POSITIVE_INFINITY).size()); + 
assertEquals(1, cleanRulings.getVerticalsInXInterval(1 - 1e-5f, 1 + 1e-5f).size()); + assertEquals(0, cleanRulings.getVerticalsInXInterval(1e-5f, 1 - 1e-5f).size()); + + assertEquals(0, cleanRulings.getHorizontalsInYInterval(-2, -1).size()); + assertEquals(1, cleanRulings.getHorizontalsInYInterval(1, 10).size()); + assertEquals(0, cleanRulings.getHorizontalsInYInterval(100, 1001).size()); + } + + + @Test + public void testLinesInRange() { + + List horizontals = IntStream.range(0, 101).boxed() + .map(y -> new Ruling(new Point2D.Float(0, y), new Point2D.Float(100, y))) + .toList(); + List verticals = IntStream.range(0, 101).boxed() + .map(x -> new Ruling(new Point2D.Float(x, 0), new Point2D.Float(x, 100))) + .toList(); + CleanRulings cleanRulings = new CleanRulings(horizontals, verticals); + + assertEquals(0, cleanRulings.getVerticalsInXInterval(-2, -1).size()); + assertEquals(0, cleanRulings.getVerticalsInXInterval(-2, -5).size()); + assertEquals(0, cleanRulings.getVerticalsInXInterval(-2, Float.NaN).size()); + assertEquals(10, cleanRulings.getVerticalsInXInterval(1, 10).size()); + assertEquals(1, cleanRulings.getVerticalsInXInterval(100, 101).size()); + assertEquals(verticals.size(), cleanRulings.getVerticalsInXInterval(Float.NEGATIVE_INFINITY, Float.POSITIVE_INFINITY).size()); + assertEquals(1, cleanRulings.getVerticalsInXInterval(-1e-5f, 1e-5f).size()); + assertEquals(1, cleanRulings.getVerticalsInXInterval(0, 0).size()); + assertEquals(0, cleanRulings.getVerticalsInXInterval(1e-5f, 1 - 1e-5f).size()); + + assertEquals(0, cleanRulings.getHorizontalsInYInterval(-2, -1).size()); + assertEquals(10, cleanRulings.getHorizontalsInYInterval(1, 10).size()); + assertEquals(1, cleanRulings.getHorizontalsInYInterval(100, 1001).size()); + } + + + @Test + public void testLinesInRangePerformance() { + + List horizontals = IntStream.range(0, (int) 1e6).boxed() + .map(y -> new Ruling(new Point2D.Float(0, y), new Point2D.Float(100, y))) + .toList(); + CleanRulings cleanRulings = 
new CleanRulings(horizontals, Collections.emptyList()); + + float startY = 29; + float endY = 3000; + long start = System.currentTimeMillis(); + var result = cleanRulings.getHorizontalsInYInterval(startY, endY); + long time = System.currentTimeMillis() - start; + + start = System.currentTimeMillis(); + var result2 = cleanRulings.getHorizontals() + .stream() + .filter(ruling -> ruling.getY1() >= startY && ruling.getY1() <= endY) + .toList(); + long time2 = System.currentTimeMillis() - start; + + assertEquals(result, result2); + assertTrue(time < time2); + + } + +} \ No newline at end of file diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/model/RulingTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/model/RulingTest.java new file mode 100644 index 0000000..cc1b5bd --- /dev/null +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/model/RulingTest.java @@ -0,0 +1,62 @@ +package com.knecon.fforesight.service.layoutparser.server.model; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.awt.geom.Point2D; +import java.awt.geom.Rectangle2D; +import java.util.List; + +import org.junit.jupiter.api.Test; + +import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; +import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; + +public class RulingTest { + + @Test + public void testLineBetween() { + + List verticals = List.of(new Ruling(new Point2D.Double(10, 0), new Point2D.Double(10, 10)), new Ruling(new Point2D.Double(5, 0), new Point2D.Double(5, 5))); + List horizontals = List.of(new Ruling(new Point2D.Double(0, 5), new Point2D.Double(10, 5))); + + CleanRulings 
cleanRulings = new CleanRulings(horizontals, verticals); + + Rectangle2D a = new Rectangle2D.Double(1, 6, 3, 3); + Rectangle2D b = new Rectangle2D.Double(5, 6, 3, 3); + Rectangle2D c = new Rectangle2D.Double(11, 6, 3, 3); + Rectangle2D d = new Rectangle2D.Double(1, 1, 3, 3); + Rectangle2D e = new Rectangle2D.Double(5, 1, 3, 3); + Rectangle2D f = new Rectangle2D.Double(11, 1, 3, 3); + + assertFalse(cleanRulings.lineBetween(a, a)); + assertFalse(cleanRulings.lineBetween(a, b)); + assertTrue(cleanRulings.lineBetween(a, c)); + assertTrue(cleanRulings.lineBetween(a, d)); + assertTrue(cleanRulings.lineBetween(a, e)); + assertTrue(cleanRulings.lineBetween(a, f)); + + assertFalse(cleanRulings.lineBetween(d, d)); + assertTrue(cleanRulings.lineBetween(d, b)); + assertTrue(cleanRulings.lineBetween(d, c)); + assertTrue(cleanRulings.lineBetween(d, a)); + assertTrue(cleanRulings.lineBetween(d, e)); + assertTrue(cleanRulings.lineBetween(d, f)); + + assertFalse(cleanRulings.lineBetween(c, c)); + assertTrue(cleanRulings.lineBetween(c, b)); + assertTrue(cleanRulings.lineBetween(c, d)); + assertTrue(cleanRulings.lineBetween(c, a)); + assertTrue(cleanRulings.lineBetween(c, e)); + assertFalse(cleanRulings.lineBetween(c, f)); + + var all = List.of(a, b, c, d, e, f); + for (Rectangle2D r1 : all) { + for (Rectangle2D r2 : all) { + assertEquals(cleanRulings.lineBetween(r1, r2), cleanRulings.lineBetween(r2, r1)); + } + } + } + +} diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java index 4cb70fc..1981530 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java +++ 
b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java @@ -52,28 +52,16 @@ public class PdfSegmentationServiceTest extends AbstractTest { @Autowired private ObjectMapper objectMapper; - @Autowired - private RedactManagerClassificationService redactManagerClassificationService; - - @Autowired - private SectionsBuilderService sectionsBuilderService; - @SneakyThrows public ClassificationDocument buildClassificationDocument(File originDocument, TableServiceResponse tableServiceResponse) { - ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_OLD, - originDocument, - new ImageServiceResponse(), - tableServiceResponse, - new VisualLayoutParsingResponse(), - Map.of("file","document")); - - redactManagerClassificationService.classifyDocument(classificationDocument); - - sectionsBuilderService.buildSections(classificationDocument); - - return classificationDocument; + return layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER, + originDocument, + new ImageServiceResponse(), + tableServiceResponse, + new VisualLayoutParsingResponse(), + Map.of("file", "document")); } @@ -133,7 +121,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { .get(0).getSequences().size()).isEqualTo(8); assertThat(classificationDocument.getHeaders() .get(0).getTextBlocks() - .get(0).toString()).isEqualTo(textToSearch); + .get(0).toString()).contains(textToSearch); Document document = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER, classificationDocument); @@ -143,6 +131,17 @@ public class PdfSegmentationServiceTest extends AbstractTest { } + @Test + @SneakyThrows + public void testTableAndCellRotations() { + String fileName = "files/Minimal Examples/simpleTablesRotated.pdf"; + ClassPathResource pdfFileResource = new ClassPathResource(fileName); + + ClassificationDocument 
classificationDocument = buildClassificationDocument(pdfFileResource.getFile()); + } + + + @Disabled @Test public void testScanRotationBorderIsIgnored() throws IOException { @@ -157,7 +156,11 @@ public class PdfSegmentationServiceTest extends AbstractTest { .flatMap(paragraph -> paragraph.getTables() .stream()) .collect(Collectors.toList())).isNotEmpty(); - var tables = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList(); + var tables = document.getSections() + .stream() + .flatMap(paragraph -> paragraph.getTables() + .stream()) + .toList(); // Quality of the table parsing is not good, because the file is rotated at scanning. // We only asset that the table border is not the page border. @@ -179,12 +182,12 @@ public class PdfSegmentationServiceTest extends AbstractTest { imageServiceResponse.getData() .forEach(imageMetadata -> images.computeIfAbsent(imageMetadata.getPosition().getPageNumber(), x -> new ArrayList<>()) .add(new ClassifiedImage(new Rectangle2D.Double(imageMetadata.getPosition().getX1(), - imageMetadata.getPosition().getY1(), - imageMetadata.getGeometry().getWidth(), - imageMetadata.getGeometry().getHeight()), - ImageType.valueOf(imageMetadata.getClassification().getLabel().toUpperCase(Locale.ROOT)), - imageMetadata.isAlpha(), - imageMetadata.getPosition().getPageNumber()))); + imageMetadata.getPosition().getY1(), + imageMetadata.getGeometry().getWidth(), + imageMetadata.getGeometry().getHeight()), + ImageType.valueOf(imageMetadata.getClassification().getLabel().toUpperCase(Locale.ROOT)), + imageMetadata.isAlpha(), + imageMetadata.getPosition().getPageNumber()))); System.out.println("object"); } @@ -196,11 +199,22 @@ public class PdfSegmentationServiceTest extends AbstractTest { ClassPathResource pdfFileResource = new ClassPathResource("files/syngenta/CustomerFiles/SinglePages/Spanning Cells - Page131_S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf"); ClassificationDocument document = 
buildClassificationDocument(pdfFileResource.getFile()); - assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty(); - TablePageBlock table = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(0); + assertThat(document.getSections() + .stream() + .flatMap(paragraph -> paragraph.getTables() + .stream()) + .collect(Collectors.toList())).isNotEmpty(); + TablePageBlock table = document.getSections() + .stream() + .flatMap(paragraph -> paragraph.getTables() + .stream()) + .toList() + .get(0); assertThat(table.getColCount()).isEqualTo(6); assertThat(table.getRowCount()).isEqualTo(13); - assertThat(table.getRows().stream().mapToInt(List::size).sum()).isEqualTo(6 * 13); + assertThat(table.getRows() + .stream() + .mapToInt(List::size).sum()).isEqualTo(6 * 13); } @@ -373,29 +387,30 @@ public class PdfSegmentationServiceTest extends AbstractTest { validateTable(document, 0, 8, 8, 0, 0); List> values = Arrays.asList(Arrays.asList("Annex point Reference within DAR/RAR", - "Author, date", - "Study title", - "Analytical method Author, date, No.", - "Technique, LOQ of the method, validated working range", - "Method meets analytical validation criteria", - "Remarks (in case validation criteria are not met)", - "Acceptability of the method"), - Arrays.asList("Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", - "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", - "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", - "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", - "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of 
environmental fate studies", - "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", - "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", - "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies"), - Arrays.asList("CA 7.1.2.1.1 DAR (2009)", - "Evans P.G. 2001 TMJ4569B, VV-323245", - "Azoxystrobin Laboratory Degradation Study in Three Soil Types, Sampled from Holland and the United Kingdom", - "Method: RAM 269 Johnson R.I., Tummon O.J., Earl M. 1995 RJ1864B, VV-377731 Johnson R.I., Tummon O.J., Earl M. 1998 RAM 269/02, VV-124072 Johnson R.I., Tummon O.J., Earl M. 2000 RAM 269/03, VV-123986 Validation: Robinson N.J. 2001 TMJ4617B, VV-895845", - "LC-MS/MS LOQ: 0.01 mg/kg (R401553 (SYN50165 7), R402173 (SYN501114 )) or 0.02 mg/kg (azoxystrobin, R230310, R234886) Working range: 0.02-1.0 or 0.01-0.5 mg/kg (depending on analyte) Other supporting quantificati on methods: HPLC-UV GC-MSD", - "Y", - "N/A", - "Y")); + "Author, date", + "Study title", + "Analytical method Author, date, No.", + "Technique, LOQ of the method, validated working range", + "Method meets analytical validation criteria", + "Remarks (in case validation criteria are not met)", + "Acceptability of the method"), + Arrays.asList( + "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", + "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", + "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", + "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", + "Part (a) Methods in soil, water, sediment, air and any additional matrices 
used in support of environmental fate studies", + "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", + "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", + "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies"), + Arrays.asList("CA 7.1.2.1.1 DAR (2009)", + "Evans P.G. 2001 TMJ4569B, VV-323245", + "Azoxystrobin Laboratory Degradation Study in Three Soil Types, Sampled from Holland and the United Kingdom", + "Method: RAM 269 Johnson R.I., Tummon O.J., Earl M. 1995 RJ1864B, VV-377731 Johnson R.I., Tummon O.J., Earl M. 1998 RAM 269/02, VV-124072 Johnson R.I., Tummon O.J., Earl M. 2000 RAM 269/03, VV-123986 Validation: Robinson N.J. 2001 TMJ4617B, VV-895845", + "LC-MS/MS LOQ: 0.01 mg/kg (R401553 (SYN50165 7), R402173 (SYN501114 )) or 0.02 mg/kg (azoxystrobin, R230310, R234886) Working range: 0.02-1.0 or 0.01-0.5 mg/kg (depending on analyte) Other supporting quantificati on methods: HPLC-UV GC-MSD", + "Y", + "N/A", + "Y")); validateTable(document, 0, values); @@ -785,6 +800,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { } + @Test public void testMergedEntities_Page26() throws IOException { @@ -802,7 +818,11 @@ public class PdfSegmentationServiceTest extends AbstractTest { @SneakyThrows private void toHtml(ClassificationDocument document, String filename) { - var tables = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList(); + var tables = document.getSections() + .stream() + .flatMap(paragraph -> paragraph.getTables() + .stream()) + .toList(); StringBuilder sb = new StringBuilder(); int currentPage = 1; @@ -823,9 +843,19 @@ public class PdfSegmentationServiceTest extends AbstractTest { private void validateTable(ClassificationDocument document, int tableIndex, int colCount, int rowCount, int 
emptyCellsCountCorrect, int emptyCellsCountIncorrect) { - TablePageBlock table = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(tableIndex); + TablePageBlock table = document.getSections() + .stream() + .flatMap(paragraph -> paragraph.getTables() + .stream()) + .toList() + .get(tableIndex); List> rows = table.getRows(); - int emptyCellsFoundFound = rows.stream().flatMap(List::stream).toList().stream().filter(f -> f.toString().isEmpty()).toList().size(); + int emptyCellsFoundFound = rows.stream() + .flatMap(List::stream) + .toList() + .stream() + .filter(f -> f.toString().isEmpty()) + .toList().size(); for (List row : table.getRows()) { row.forEach(r -> System.out.println(r.toString())); @@ -840,11 +870,20 @@ public class PdfSegmentationServiceTest extends AbstractTest { private void validateTable(ClassificationDocument document, int tableIndex, List> values) { - TablePageBlock table = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(tableIndex); + TablePageBlock table = document.getSections() + .stream() + .flatMap(paragraph -> paragraph.getTables() + .stream()) + .toList() + .get(tableIndex); List> rows = table.getRows(); - List rowsFlattened = rows.stream().flatMap(List::stream).toList(); - List valuesFlattened = values.stream().flatMap(List::stream).toList(); + List rowsFlattened = rows.stream() + .flatMap(List::stream) + .toList(); + List valuesFlattened = values.stream() + .flatMap(List::stream) + .toList(); for (int i = 0; i < valuesFlattened.size(); i++) { Cell cell = rowsFlattened.get(i); @@ -857,7 +896,11 @@ public class PdfSegmentationServiceTest extends AbstractTest { private void validateTableSize(ClassificationDocument document, int tableSize) { - assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().size()).isEqualTo(tableSize); + assertThat(document.getSections() + .stream() + .flatMap(paragraph -> 
paragraph.getTables() + .stream()) + .toList().size()).isEqualTo(tableSize); } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/InvisibleTableDetectionServiceTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/InvisibleTableDetectionServiceTest.java index f16f7e8..3374b89 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/InvisibleTableDetectionServiceTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/InvisibleTableDetectionServiceTest.java @@ -28,29 +28,30 @@ class InvisibleTableDetectionServiceTest { String fileName = "files/basf/CustomerFiles/invisible_tables_test-two-pages_ocred.pdf"; var tmpFileName = Path.of("/tmp/").resolve(Path.of(fileName).getFileName() + "_TABLE.pdf").toString(); - List pageContents = PageContentExtractor.getSortedPageContents(fileName).stream().map(PageInformationService::build).collect(Collectors.toList()); + List pageContents = PageContentExtractor.getSortedPageContents(fileName) + .stream() + .map(PageInformationService::build) + .collect(Collectors.toList()); int pageNumber = 1; - Rectangle2D tableBBox = pageContents.get(0) - .getPageContents() - .getSortedTextPositionSequences() - .subList(45, 152) + Rectangle2D tableBBox = pageContents.get(0).getPageContents().getSortedTextPositionSequences().subList(45, 152) .stream() - .map(TextPositionSequence::getRectangle) - .map(RectangleTransformations::toRectangle2D) + .map(TextPositionSequence::getBBox) .map(this::mirrorY) .collect(RectangleTransformations.collectBBox()); - List textPositionSequences = pageContents.get(0) - .getPageContents() - .getSortedTextPositionSequences() + List textPositionSequences = 
pageContents.get(0).getPageContents().getSortedTextPositionSequences() .stream() - .filter(textPositionSequence -> tableBBox.contains(mirrorY(RectangleTransformations.toRectangle2D(textPositionSequence.getRectangle())))) + .filter(textPositionSequence -> tableBBox.contains(mirrorY(textPositionSequence.getBBox()))) .toList(); var table = InvisibleTableDetectionService.detectTable(textPositionSequences, tableBBox); - PdfDraw.drawRectanglesPerPage(fileName, List.of(table.stream().flatMap(Collection::stream).toList(), Collections.emptyList()), tmpFileName); + PdfDraw.drawRectanglesPerPage(fileName, + List.of(table.stream() + .flatMap(Collection::stream) + .toList(), Collections.emptyList()), + tmpFileName); } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/PageContentExtractorTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/PageContentExtractorTest.java index 71bdb92..ec7d002 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/PageContentExtractorTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/PageContentExtractorTest.java @@ -29,9 +29,7 @@ class PageContentExtractorTest { textPositionPerPage.stream() .map(t -> t.getSortedTextPositionSequences() .stream() - .map(TextPositionSequence::getRectangle) - .map(RectangleTransformations::toRectangle2D) - //.map(textPositionSequence -> (Rectangle2D) new Rectangle2D.Double(textPositionSequence.getMaxXDirAdj(), textPositionSequence.getMaxYDirAdj(), textPositionSequence.getWidth(), textPositionSequence.getHeight())) + .map(TextPositionSequence::getBBoxInitialUserSpace) .map(List::of) .toList()) .toList(), tmpFileName); diff --git 
a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingCleaningServiceTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingCleaningServiceTest.java index a66d540..631f643 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingCleaningServiceTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingCleaningServiceTest.java @@ -52,8 +52,8 @@ public class RulingCleaningServiceTest extends BuildDocumentTest { RulingCleaningService rulingCleaningService = new RulingCleaningService(); List> rectanglesPerPage = new LinkedList<>(); for (PageContents pageContent : pageContents) { - CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(Collections.emptyList(), pageContent.getRulings()); - List rects = RectangularIntersectionFinder.find(cleanRulings.getHorizontal(), cleanRulings.getVertical()); + CleanRulings cleanRulings = rulingCleaningService.deduplicateAndStraightenRulings(Collections.emptyList(), pageContent.getRulings()); + List rects = RectangularIntersectionFinder.find(cleanRulings.getHorizontals(), cleanRulings.getVerticals()); rectanglesPerPage.add(rects); } @@ -72,15 +72,16 @@ public class RulingCleaningServiceTest extends BuildDocumentTest { RulingCleaningService rulingCleaningService = new RulingCleaningService(); List cleanRulingsPerPage = new LinkedList<>(); for (PageContents pageContent : pageContents) { - cleanRulingsPerPage.add(rulingCleaningService.getCleanRulings(Collections.emptyList(), pageContent.getRulings())); + cleanRulingsPerPage.add(rulingCleaningService.deduplicateAndStraightenRulings(Collections.emptyList(), pageContent.getRulings())); } - var cleanRulings = 
cleanRulingsPerPage.stream().map(CleanRulings::getVertical).collect(Collectors.toList()); + var cleanRulings = cleanRulingsPerPage.stream().map(CleanRulings::getVerticals).collect(Collectors.toList()); PdfDraw.drawLinesPerPage(fileName, cleanRulings, lineFileName); } @Test + @Disabled @SneakyThrows public void testTableExtraction() { @@ -97,6 +98,7 @@ public class RulingCleaningServiceTest extends BuildDocumentTest { } + @SneakyThrows private void writeJsons(Path filename) { diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingsClassifierTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingsClassifierTest.java new file mode 100644 index 0000000..f3fd281 --- /dev/null +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingsClassifierTest.java @@ -0,0 +1,84 @@ +package com.knecon.fforesight.service.layoutparser.server.services; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.util.Collections; +import java.util.List; + +import org.junit.jupiter.api.Test; + +import com.knecon.fforesight.service.layoutparser.processor.model.PageContents; +import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; +import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor; +import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService; +import com.knecon.fforesight.service.layoutparser.processor.services.TextRulingsClassifier; +import 
com.knecon.fforesight.service.layoutparser.processor.utils.RectangularIntersectionFinder; + +import lombok.SneakyThrows; + +public class RulingsClassifierTest { + + @Test + @SneakyThrows + public void textRulingExtractionTest() { + + String fileName = "files/Minimal Examples/RotateTextWithRulingsTestFile.pdf"; + List pageContents = PageContentExtractor.getSortedPageContents(fileName); + RulingCleaningService rulingCleaningService = new RulingCleaningService(); + + for (PageContents pageContent : pageContents) { + CleanRulings cleanRulings = rulingCleaningService.deduplicateAndStraightenRulings(Collections.emptyList(), pageContent.getRulings()); + RectangularIntersectionFinder.find(cleanRulings.getHorizontals(), cleanRulings.getVerticals()); + TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(pageContent.getSortedTextPositionSequences(), cleanRulings); + + assertTrue(pageContent.getSortedTextPositionSequences() + .stream() + .filter(word -> word.toString().equals("Underlined")) + .allMatch(TextPositionSequence::isUnderline)); + assertTrue(pageContent.getSortedTextPositionSequences() + .stream() + .filter(word -> word.toString().equals("Striketrough")) + .allMatch(TextPositionSequence::isStrikethrough)); + + assertEquals(4, + cleanRulings.buildAll() + .stream() + .filter(ruling -> ruling.getClassification().equals(Ruling.Classification.STRIKETROUGH)) + .count()); + assertEquals(4, + cleanRulings.buildAll() + .stream() + .filter(ruling -> ruling.getClassification().equals(Ruling.Classification.UNDERLINE)) + .count()); + assertEquals(0, cleanRulings.withoutTextRulings().buildAll().size()); + } + + } + + + @Test + @SneakyThrows + public void tableRulingExtractionTest() { + + String fileName = "files/SinglePages/AbsolutelyEnormousTable.pdf"; + List pageContents = PageContentExtractor.getSortedPageContents(fileName); + RulingCleaningService rulingCleaningService = new RulingCleaningService(); + + for (PageContents pageContent : pageContents) { + CleanRulings 
cleanRulings = rulingCleaningService.deduplicateAndStraightenRulings(Collections.emptyList(), pageContent.getRulings()); + RectangularIntersectionFinder.find(cleanRulings.getHorizontals(), cleanRulings.getVerticals()); + TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(pageContent.getSortedTextPositionSequences(), cleanRulings); + + assertEquals(30, cleanRulings.getHorizontals().size()); + assertEquals(30, cleanRulings.getTableLines().getHorizontals().size()); + + assertEquals(144, cleanRulings.getVerticals().size()); + assertEquals(144, cleanRulings.getTableLines().getVerticals().size()); + } + + } + +} diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/AbstractTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/AbstractTest.java index c0e2809..199f918 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/AbstractTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/AbstractTest.java @@ -1,6 +1,9 @@ package com.knecon.fforesight.service.layoutparser.server.utils; +import java.io.File; +import java.io.FileInputStream; import java.io.InputStream; +import java.nio.file.Path; import java.util.Map; import java.util.Optional; @@ -102,29 +105,22 @@ public abstract class AbstractTest { } - @SneakyThrows - protected LayoutParsingRequest prepareStorage(InputStream fileInputStream) { - - storageService.storeObject(TenantContext.getTenantId(), ORIGIN_FILE_ID, fileInputStream); - return buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER_OLD); - } - - - protected LayoutParsingRequest buildDefaultLayoutParsingRequest(LayoutParsingType layoutParsingType) { + protected LayoutParsingRequest buildDefaultLayoutParsingRequest(String fileName, 
LayoutParsingType layoutParsingType, boolean debug) { + var identifier = debug ? Map.of("fileId", fileName, "debug", "true") : Map.of("fileId", fileName); return LayoutParsingRequest.builder() - .identifier(Map.of("fileId", "1337")) + .identifier(identifier) .layoutParsingType(layoutParsingType) - .originFileStorageId(ORIGIN_FILE_ID) - .tablesFileStorageId(Optional.of(TABLE_FILE_ID)) - .imagesFileStorageId(Optional.of(IMAGE_FILE_ID)) - .visualLayoutParsingFileId(Optional.of(VISUAL_LAYOUT_FILE)) - .structureFileStorageId(STRUCTURE_FILE_ID) - .textBlockFileStorageId(TEXT_FILE_ID) - .positionBlockFileStorageId(POSITION_FILE_ID) - .pageFileStorageId(PAGES_FILE_ID) - .simplifiedTextStorageId(SIMPLIFIED_ID) - .viewerDocumentStorageId(VIEWER_DOCUMENT_ID) + .originFileStorageId(fileName + ORIGIN_FILE_ID) + .tablesFileStorageId(Optional.of(fileName + TABLE_FILE_ID)) + .imagesFileStorageId(Optional.of(fileName + IMAGE_FILE_ID)) + .visualLayoutParsingFileId(Optional.empty()) + .structureFileStorageId(fileName + STRUCTURE_FILE_ID) + .textBlockFileStorageId(fileName + TEXT_FILE_ID) + .positionBlockFileStorageId(fileName + POSITION_FILE_ID) + .pageFileStorageId(fileName + PAGES_FILE_ID) + .simplifiedTextStorageId(fileName + SIMPLIFIED_ID) + .viewerDocumentStorageId(fileName + VIEWER_DOCUMENT_ID) .build(); } @@ -148,10 +144,28 @@ public abstract class AbstractTest { ClassPathResource imageInfoFileResource = new ClassPathResource(imageInfoFile); ClassPathResource visualLayoutParsingResponseResource = new ClassPathResource(visualLayoutParsingResponseFile); - return prepareStorage(pdfFileResource.getInputStream(), - cvServiceResponseFileResource.getInputStream(), - imageInfoFileResource.getInputStream(), - visualLayoutParsingResponseResource.getInputStream()); + return prepareStorage(Path.of(file).getFileName().toString(), + pdfFileResource.getInputStream(), + cvServiceResponseFileResource.getInputStream(), + imageInfoFileResource.getInputStream(), + 
visualLayoutParsingResponseResource.getInputStream()); + } + + + @SneakyThrows + protected void prepareStorage(LayoutParsingRequest layoutParsingRequest, File file) { + + ClassPathResource cvServiceResponseFileResource = new ClassPathResource("cv_table_parsing_response/empty.json"); + ClassPathResource imageInfoFileResource = new ClassPathResource("image_service_response/empty.json"); + ClassPathResource visualLayoutParsingResponseResource = new ClassPathResource("visual_layout_parsing_response/empty.json"); + + try (var in = new FileInputStream(file)) { + prepareStorage(layoutParsingRequest, + in, + cvServiceResponseFileResource.getInputStream(), + imageInfoFileResource.getInputStream(), + visualLayoutParsingResponseResource.getInputStream()); + } } @@ -162,12 +176,29 @@ public abstract class AbstractTest { storageService.storeObject(TenantContext.getTenantId(), TABLE_FILE_ID, cvServiceResponseFileStream); storageService.storeObject(TenantContext.getTenantId(), ORIGIN_FILE_ID, fileStream); - return buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER_OLD); + return buildDefaultLayoutParsingRequest("test", LayoutParsingType.REDACT_MANAGER_OLD, true); } @SneakyThrows - protected LayoutParsingRequest prepareStorage(InputStream fileStream, + protected void prepareStorage(LayoutParsingRequest layoutParsingRequest, + InputStream fileStream, + InputStream cvServiceResponseFileStream, + InputStream imageInfoStream, + InputStream visualLayoutParsingResponseFileStream) { + + storageService.storeObject(TenantContext.getTenantId(), layoutParsingRequest.imagesFileStorageId().get(), imageInfoStream); + storageService.storeObject(TenantContext.getTenantId(), layoutParsingRequest.tablesFileStorageId().get(), cvServiceResponseFileStream); + storageService.storeObject(TenantContext.getTenantId(), layoutParsingRequest.originFileStorageId(), fileStream); + if (layoutParsingRequest.visualLayoutParsingFileId().isPresent()) { + 
storageService.storeObject(TenantContext.getTenantId(), layoutParsingRequest.visualLayoutParsingFileId().get(), visualLayoutParsingResponseFileStream); + } + } + + + @SneakyThrows + protected LayoutParsingRequest prepareStorage(String fileName, + InputStream fileStream, InputStream cvServiceResponseFileStream, InputStream imageInfoStream, InputStream visualLayoutParsingResponseFileStream) { @@ -177,7 +208,7 @@ public abstract class AbstractTest { storageService.storeObject(TenantContext.getTenantId(), ORIGIN_FILE_ID, fileStream); storageService.storeObject(TenantContext.getTenantId(), VISUAL_LAYOUT_FILE, visualLayoutParsingResponseFileStream); - return buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER_OLD); + return buildDefaultLayoutParsingRequest(fileName, LayoutParsingType.REDACT_MANAGER_OLD, true); } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/BuildDocumentTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/BuildDocumentTest.java index af2717b..cbd6201 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/BuildDocumentTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/BuildDocumentTest.java @@ -1,11 +1,13 @@ package com.knecon.fforesight.service.layoutparser.server.utils; import java.io.File; +import java.nio.file.Path; import java.util.Map; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.core.io.ClassPathResource; +import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline; import 
com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument; @@ -28,11 +30,11 @@ public abstract class BuildDocumentTest extends AbstractTest { File fileResource = new ClassPathResource(filename).getFile(); prepareStorage(filename); return layoutParsingPipeline.parseLayout(layoutParsingType, - fileResource, - layoutParsingStorageService.getImagesFile(IMAGE_FILE_ID), - new TableServiceResponse(), - new VisualLayoutParsingResponse(), - Map.of("file",filename)); + fileResource, + layoutParsingStorageService.getImagesFile(IMAGE_FILE_ID), + new TableServiceResponse(), + new VisualLayoutParsingResponse(), + Map.of("file", filename, "debug", "true")); } @@ -46,13 +48,25 @@ public abstract class BuildDocumentTest extends AbstractTest { @SneakyThrows protected Document buildGraph(String filename, LayoutParsingType layoutParsingType) { - if (filename.equals("files/syngenta/CustomerFiles/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf")) { - prepareStorage(filename, "cv_table_parsing_response/empty.json", "image_service_response/S-Metolachlor_RAR_01_Volume_1_2018-09-06.IMAGE_INFO.json"); + if (!filename.startsWith("files") && filename.startsWith("/")) { + LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(Path.of(filename).getFileName().toString(), LayoutParsingType.REDACT_MANAGER, true); + prepareStorage(layoutParsingRequest, new File(filename)); + return DocumentGraphFactory.buildDocumentGraph(layoutParsingType, + layoutParsingPipeline.parseLayout(layoutParsingType, + new File(filename), + layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId().get()), + new TableServiceResponse(), + new VisualLayoutParsingResponse(), + layoutParsingRequest.identifier())); } else { - prepareStorage(filename); + if (filename.equals("files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf")) { + prepareStorage(filename, "cv_table_parsing_response/empty.json", 
"image_service_response/S-Metolachlor_RAR_01_Volume_1_2018-09-06.IMAGE_INFO.json"); + } else { + prepareStorage(filename); + } + return DocumentGraphFactory.buildDocumentGraph(layoutParsingType, parseLayout(filename, layoutParsingType)); } - return DocumentGraphFactory.buildDocumentGraph(layoutParsingType, parseLayout(filename, layoutParsingType)); } } diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/files/Minimal Examples/RotateTextWithRulingsTestFile.pdf b/layoutparser-service/layoutparser-service-server/src/test/resources/files/Minimal Examples/RotateTextWithRulingsTestFile.pdf new file mode 100644 index 0000000..da05904 Binary files /dev/null and b/layoutparser-service/layoutparser-service-server/src/test/resources/files/Minimal Examples/RotateTextWithRulingsTestFile.pdf differ diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/files/Minimal Examples/simpleTablesRotated.pdf b/layoutparser-service/layoutparser-service-server/src/test/resources/files/Minimal Examples/simpleTablesRotated.pdf new file mode 100644 index 0000000..f6571ef Binary files /dev/null and b/layoutparser-service/layoutparser-service-server/src/test/resources/files/Minimal Examples/simpleTablesRotated.pdf differ diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/files/SinglePages/AbsolutelyEnormousTable.pdf b/layoutparser-service/layoutparser-service-server/src/test/resources/files/SinglePages/AbsolutelyEnormousTable.pdf new file mode 100644 index 0000000..e6d9a07 Binary files /dev/null and b/layoutparser-service/layoutparser-service-server/src/test/resources/files/SinglePages/AbsolutelyEnormousTable.pdf differ diff --git a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/ContentStreams.java b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/ContentStreams.java index 937f75d..560da8a 100644 --- 
a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/ContentStreams.java +++ b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/ContentStreams.java @@ -26,6 +26,26 @@ public class ContentStreams { public static Identifier ESCAPE_END = new Identifier("escape start", COSName.getPDFName("ESCAPE_END"), false); + public static Identifier CLEAN_RULINGS = new Identifier("Cleaned Rulings", COSName.getPDFName("KNECON_CLEAN_RULINGS"), true); + + public static Identifier RULINGS = new Identifier("Rulings", COSName.getPDFName("KNECON_RULINGS"), true); + + public static Identifier WORDS = new Identifier("Words", COSName.getPDFName("KNECON_WORDS"), true); + + public static Identifier ZONES = new Identifier("Text Zones", COSName.getPDFName("KNECON_ZONES"), true); + + public static Identifier LINES = new Identifier("Text Lines", COSName.getPDFName("KNECON_LINES"), true); + + public static Identifier CELLS = new Identifier("Cells", COSName.getPDFName("KNECON_CELLS"), true); + + public static Identifier MAIN_BODY = new Identifier("Main Text Body", COSName.getPDFName("KNECON_MAIN_BODY"), true); + + public static Identifier MARKED_CONTENT = new Identifier("Marked content", COSName.getPDFName("KNECON_MARKED_CONTENT"), true); + + public static Identifier NEIGHBOURS = new Identifier("Neighbours", COSName.getPDFName("KNECON_NEIGHBOURS"), true); + + public static Identifier CHARACTERS = new Identifier("Characters", COSName.getPDFName("KNECON_CHARACTERS"), true); + public static List allContentStreams = List.of(KNECON_LAYOUT, KNECON_VISUAL_PARSING, KNECON_OCR, @@ -33,7 +53,17 @@ public class ContentStreams { KNECON_OCR_TEXT_DEBUG, OTHER, ESCAPE_START, - ESCAPE_END); + ESCAPE_END, + RULINGS, + CLEAN_RULINGS, + WORDS, + ZONES, + LINES, + MAIN_BODY, + MARKED_CONTENT, + NEIGHBOURS, + CHARACTERS, + CELLS); public record Identifier(String name, COSName cosName, boolean optionalContent) { diff --git 
a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/model/Visualizations.java b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/model/Visualizations.java index fb17113..6af80b9 100644 --- a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/model/Visualizations.java +++ b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/model/Visualizations.java @@ -1,5 +1,6 @@ package com.knecon.fforesight.service.viewerdoc.model; +import java.util.LinkedHashMap; import java.util.Map; import com.knecon.fforesight.service.viewerdoc.ContentStreams; @@ -17,7 +18,8 @@ import lombok.experimental.FieldDefaults; public class Visualizations { ContentStreams.Identifier layer; - Map visualizationsOnPages; + @Builder.Default + Map visualizationsOnPages = new LinkedHashMap<>(); boolean layerVisibilityDefaultValue; } diff --git a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/ViewerDocumentService.java b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/ViewerDocumentService.java index 040b81b..04233da 100644 --- a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/ViewerDocumentService.java +++ b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/ViewerDocumentService.java @@ -53,12 +53,6 @@ public class ViewerDocumentService { private final ObservationRegistry registry; - public void addVisualizationsOnPage(File originFile, File destinationFile, Visualizations visualizations) { - - addVisualizationsOnPage(originFile, destinationFile, List.of(visualizations)); - } - - @Observed(name = "ViewerDocumentService", contextualName = "add-visualizations") @SneakyThrows public void 
addVisualizationsOnPage(File originFile, File destinationFile, List visualizations) { @@ -70,9 +64,14 @@ public class ViewerDocumentService { PDDocument pdDocument = openPDDocument(tmpFile.toFile()); - enrichObservation(pdDocument, visualizations.stream().map(Visualizations::getLayer).toList()); + enrichObservation(pdDocument, + visualizations.stream() + .map(Visualizations::getLayer) + .toList()); - Set allLayers = visualizations.stream().map(Visualizations::getLayer).collect(Collectors.toUnmodifiableSet()); + Set allLayers = visualizations.stream() + .map(Visualizations::getLayer) + .collect(Collectors.toUnmodifiableSet()); Map optionalContentGroupMap = addLayersToDocument(visualizations, pdDocument); @@ -229,11 +228,11 @@ public class ViewerDocumentService { Matrix textMatrix; if (placedText.textMatrix().isEmpty()) { textMatrix = new Matrix((float) textDeRotationMatrix.getScaleX(), - (float) textDeRotationMatrix.getShearX(), - (float) textDeRotationMatrix.getShearY(), - (float) textDeRotationMatrix.getScaleY(), - (float) placedText.lineStart().getX(), - (float) placedText.lineStart().getY()); + (float) textDeRotationMatrix.getShearX(), + (float) textDeRotationMatrix.getShearY(), + (float) textDeRotationMatrix.getScaleY(), + (float) placedText.lineStart().getX(), + (float) placedText.lineStart().getY()); } else { textMatrix = placedText.textMatrix().get(); } diff --git a/publish-custom-image.sh b/publish-custom-image.sh index c8c81d1..e2191d7 100755 --- a/publish-custom-image.sh +++ b/publish-custom-image.sh @@ -12,4 +12,4 @@ commit_hash=$(git rev-parse --short=5 HEAD) buildName="${USER}-${branch}-${commit_hash}" gradle bootBuildImage --publishImage -PbuildbootDockerHostNetwork=true -Pversion=$buildName --no-build-cache -echo "nexus.knecon.com:5001/ff/${dir}-service-server:$buildName" +echo "nexus.knecon.com:5001/ff/layoutparser-service-server:$buildName"