diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java index 4b390ac..fd8119c 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java @@ -101,29 +101,33 @@ public class LayoutParsingPipeline { log.info("Starting layout parsing for {}", layoutParsingRequest.identifier()); File originFile = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId()); - File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId()).orElse(originFile); + File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId()) + .orElse(originFile); VisualLayoutParsingResponse visualLayoutParsingResponse = new VisualLayoutParsingResponse(); - if (layoutParsingRequest.visualLayoutParsingFileId().isPresent()) { + if (layoutParsingRequest.visualLayoutParsingFileId() + .isPresent()) { visualLayoutParsingResponse = layoutParsingStorageService.getVisualLayoutParsingFile(layoutParsingRequest.visualLayoutParsingFileId().get()); } ImageServiceResponse imageServiceResponse = new ImageServiceResponse(); - if (layoutParsingRequest.imagesFileStorageId().isPresent()) { + if (layoutParsingRequest.imagesFileStorageId() + .isPresent()) { imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId().get()); } TableServiceResponse tableServiceResponse = new TableServiceResponse(); - if 
(layoutParsingRequest.tablesFileStorageId().isPresent()) { + if (layoutParsingRequest.tablesFileStorageId() + .isPresent()) { tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId().get()); } ClassificationDocument classificationDocument = parseLayout(layoutParsingRequest.layoutParsingType(), - originFile, - imageServiceResponse, - tableServiceResponse, - visualLayoutParsingResponse, - layoutParsingRequest.identifier()); + originFile, + imageServiceResponse, + tableServiceResponse, + visualLayoutParsingResponse, + layoutParsingRequest.identifier()); log.info("Building document graph for {}", layoutParsingRequest.identifier()); @@ -155,25 +159,25 @@ public class LayoutParsingPipeline { .numberOfPages(documentGraph.getNumberOfPages()) .duration(System.currentTimeMillis() - start) .message(format(""" - Layout parsing has finished in %.02f s. - identifiers: %s - %s - Files have been saved with Ids: - Structure: %s - Text: %s - Positions: %s - PageData: %s - Simplified Text: %s - Viewer Doc: %s""", - ((float) (System.currentTimeMillis() - start)) / 1000, - layoutParsingRequest.identifier(), - buildSemanticNodeCountMessage(documentGraph.getNumberOfPages(), documentGraph.buildSemanticNodeCounts()), - layoutParsingRequest.structureFileStorageId(), - layoutParsingRequest.textBlockFileStorageId(), - layoutParsingRequest.positionBlockFileStorageId(), - layoutParsingRequest.pageFileStorageId(), - layoutParsingRequest.simplifiedTextStorageId(), - layoutParsingRequest.viewerDocumentStorageId())) + Layout parsing has finished in %.02f s. 
+ identifiers: %s + %s + Files have been saved with Ids: + Structure: %s + Text: %s + Positions: %s + PageData: %s + Simplified Text: %s + Viewer Doc: %s""", + ((float) (System.currentTimeMillis() - start)) / 1000, + layoutParsingRequest.identifier(), + buildSemanticNodeCountMessage(documentGraph.getNumberOfPages(), documentGraph.buildSemanticNodeCounts()), + layoutParsingRequest.structureFileStorageId(), + layoutParsingRequest.textBlockFileStorageId(), + layoutParsingRequest.positionBlockFileStorageId(), + layoutParsingRequest.pageFileStorageId(), + layoutParsingRequest.simplifiedTextStorageId(), + layoutParsingRequest.viewerDocumentStorageId())) .build(); } @@ -194,14 +198,14 @@ public class LayoutParsingPipeline { private String buildSemanticNodeCountMessage(int numberOfPages, Map semanticNodeCounts) { return String.format("%d pages with %d sections, %d headlines, %d paragraphs, %d tables with %d cells, %d headers, and %d footers parsed", - numberOfPages, - semanticNodeCounts.get(NodeType.SECTION) == null ? 0 : semanticNodeCounts.get(NodeType.SECTION), - semanticNodeCounts.get(NodeType.HEADLINE) == null ? 0 : semanticNodeCounts.get(NodeType.HEADLINE), - semanticNodeCounts.get(NodeType.PARAGRAPH) == null ? 0 : semanticNodeCounts.get(NodeType.PARAGRAPH), - semanticNodeCounts.get(NodeType.TABLE) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE), - semanticNodeCounts.get(NodeType.TABLE_CELL) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE_CELL), - semanticNodeCounts.get(NodeType.HEADER) == null ? 0 : semanticNodeCounts.get(NodeType.HEADER), - semanticNodeCounts.get(NodeType.FOOTER) == null ? 0 : semanticNodeCounts.get(NodeType.FOOTER)); + numberOfPages, + semanticNodeCounts.get(NodeType.SECTION) == null ? 0 : semanticNodeCounts.get(NodeType.SECTION), + semanticNodeCounts.get(NodeType.HEADLINE) == null ? 0 : semanticNodeCounts.get(NodeType.HEADLINE), + semanticNodeCounts.get(NodeType.PARAGRAPH) == null ? 
0 : semanticNodeCounts.get(NodeType.PARAGRAPH), + semanticNodeCounts.get(NodeType.TABLE) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE), + semanticNodeCounts.get(NodeType.TABLE_CELL) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE_CELL), + semanticNodeCounts.get(NodeType.HEADER) == null ? 0 : semanticNodeCounts.get(NodeType.HEADER), + semanticNodeCounts.get(NodeType.FOOTER) == null ? 0 : semanticNodeCounts.get(NodeType.FOOTER)); } @@ -220,6 +224,9 @@ public class LayoutParsingPipeline { Map> pdfImages = imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse); Map> signatures = visualLayoutParsingAdapter.buildExtractedSignaturesPerPage(visualLayoutParsingResponse); ClassificationDocument classificationDocument = new ClassificationDocument(); + + classificationDocument.getVisualizations().setActive(identifier.containsKey("debug")); + List classificationPages = new ArrayList<>(); long pageCount = originDocument.getNumberOfPages(); @@ -249,6 +256,8 @@ public class LayoutParsingPipeline { } stripper.getText(originDocument); + classificationDocument.getVisualizations().addTextVisualizations(stripper.getTextPositionSequences(), pageNumber); + PDRectangle pdr = pdPage.getMediaBox(); int rotation = pdPage.getRotation(); @@ -257,6 +266,8 @@ public class LayoutParsingPipeline { PDRectangle cropbox = pdPage.getCropBox(); CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(pdfTableCells.get(pageNumber), stripper.getRulings()); + classificationDocument.getVisualizations().addCleanRulingVisualization(cleanRulings, pageNumber); + List emptyTableCells = TableExtractionService.findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical()); var graphics = graphicExtractorService.extractPathElementGraphics(originDocument, @@ -272,11 +283,16 @@ public class LayoutParsingPipeline { .map(g -> new ClassifiedImage(new Rectangle2D.Double(g.x1, g.y1, g.width(), g.height()), ImageType.GRAPHIC, false, stripper.getPageNumber())) .toList()); + 
classificationDocument.getVisualizations().addCellVisualizations(emptyTableCells, pageNumber); + ClassificationPage classificationPage = switch (layoutParsingType) { - case REDACT_MANAGER_OLD -> redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells); + case REDACT_MANAGER_OLD -> + redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells); case DOCUMINE -> docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical()); - case REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, true); - case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, false); + case REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG -> + docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, true, classificationDocument.getVisualizations()); + case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> + docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, false, classificationDocument.getVisualizations()); }; classificationPage.setCleanRulings(cleanRulings); @@ -286,8 +302,9 @@ public class LayoutParsingPipeline { classificationPage.setPageWidth(cropbox.getWidth()); classificationPage.setPageHeight(cropbox.getHeight()); + classificationDocument.getVisualizations().addMarkedContentVisualizations(stripper.getMarkedContents(), pageNumber, pdPage); // MarkedContent needs to be converted at this point, otherwise it leads to GC Problems in Pdfbox. 
- classificationPage.setMarkedContentBboxPerType(convertMarkedContents(stripper.getMarkedContents())); + classificationPage.setMarkedContentBboxPerType(convertMarkedContents(stripper.getMarkedContents(), pdPage)); // If images is ocr needs to be calculated before textBlocks are moved into tables, otherwise findOcr algorithm needs to be adopted. if (pdfImages != null && pdfImages.containsKey(pageNumber)) { @@ -361,11 +378,11 @@ public class LayoutParsingPipeline { } - private Map> convertMarkedContents(List pdMarkedContents) { + private Map> convertMarkedContents(List pdMarkedContents, PDPage pdPage) { Map> markedContentBboxes = new HashMap<>(); - markedContentBboxes.put(MarkedContentUtils.HEADER, MarkedContentUtils.getMarkedContentBboxPerLine(pdMarkedContents, MarkedContentUtils.HEADER)); - markedContentBboxes.put(MarkedContentUtils.FOOTER, MarkedContentUtils.getMarkedContentBboxPerLine(pdMarkedContents, MarkedContentUtils.FOOTER)); + markedContentBboxes.put(MarkedContentUtils.HEADER, MarkedContentUtils.getMarkedContentBboxPerLine(pdMarkedContents, MarkedContentUtils.HEADER, pdPage)); + markedContentBboxes.put(MarkedContentUtils.FOOTER, MarkedContentUtils.getMarkedContentBboxPerLine(pdMarkedContents, MarkedContentUtils.FOOTER, pdPage)); return markedContentBboxes; } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/DocstrumSegmentationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/DocstrumSegmentationService.java index eb62ce2..b2e4087 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/DocstrumSegmentationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/DocstrumSegmentationService.java @@ -7,12 
+7,14 @@ import java.util.stream.Collectors; import org.springframework.stereotype.Service; import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Character; +import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Line; import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone; import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.LineBuilderService; import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.NearestNeighbourService; import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.ReadingOrderService; import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.SpacingService; import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.ZoneBuilderService; +import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; @@ -43,16 +45,16 @@ public class DocstrumSegmentationService { private List computeZones(List textPositions, TextDirection direction) { - var positions = textPositions.stream().filter(t -> t.getDir() == direction).map(TextPositionSequence::getTextPositions).flatMap(List::stream).toList(); + List positions = textPositions.stream().filter(t -> t.getDir() == direction).map(TextPositionSequence::getTextPositions).flatMap(List::stream).toList(); - var characters = positions.stream().map(Character::new).collect(Collectors.toList()); + List characters = positions.stream().map(Character::new).collect(Collectors.toList()); nearestNeighbourService.findNearestNeighbors(characters); - var characterSpacing = spacingService.computeCharacterSpacing(characters); - var lineSpacing = Math.min(spacingService.computeLineSpacing(characters), 20); + double characterSpacing = 
spacingService.computeCharacterSpacing(characters); + double lineSpacing = Math.min(spacingService.computeLineSpacing(characters), 20); - var lines = lineBuilderService.buildLines(characters, characterSpacing, lineSpacing); + List lines = lineBuilderService.buildLines(characters, characterSpacing, lineSpacing); return zoneBuilderService.buildZones(lines, characterSpacing, lineSpacing); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/Character.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/Character.java index b4e2616..40bc95c 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/Character.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/Character.java @@ -27,8 +27,8 @@ public class Character { public Character(RedTextPosition chunk) { - this.x = chunk.getXDirAdj() + chunk.getWidthDirAdj() / 2; - this.y = chunk.getYDirAdj() + chunk.getHeightDir() / 2; + this.x = chunk.getDirectionAdjustedPosition().getCenterX(); + this.y = chunk.getDirectionAdjustedPosition().getCenterY(); this.textPosition = chunk; } @@ -82,5 +82,4 @@ public class Character { return FastAtan2.fastAtan2(character.getY() - getY(), character.getX() - getX()); } } - } \ No newline at end of file diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/Line.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/Line.java index fa85249..71fcbb5 100644 --- 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/Line.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/Line.java @@ -1,11 +1,12 @@ package com.knecon.fforesight.service.layoutparser.processor.docstrum.model; -import java.awt.geom.Rectangle2D; import java.util.ArrayList; import java.util.Arrays; import java.util.List; +import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; import lombok.Data; import lombok.EqualsAndHashCode; @@ -84,7 +85,9 @@ public class Line extends BoundingBox { private double computeHeight() { - return characters.stream().map(Character::getHeight).reduce(0d, Double::sum) / characters.size(); + return characters.stream() + .map(Character::getHeight) + .reduce(0d, Double::sum) / characters.size(); } @@ -116,7 +119,7 @@ public class Line extends BoundingBox { double ym = (y0 + y1) / 2; double yn = (other.y0 + other.y1) / 2; - return Math.abs(ym - yn) / Math.sqrt(1); + return Math.abs(ym - yn); } @@ -141,21 +144,10 @@ public class Line extends BoundingBox { private void buildBBox() { - double minX = Double.POSITIVE_INFINITY; - double minY = Double.POSITIVE_INFINITY; - double maxX = Double.NEGATIVE_INFINITY; - double maxY = Double.NEGATIVE_INFINITY; - - for (Character character : characters) { - - minX = Math.min(minX, character.getTextPosition().getXDirAdj()); - minY = Math.min(minY, character.getTextPosition().getYDirAdj()); - maxX = Math.max(maxX, character.getTextPosition().getXDirAdj() + character.getTextPosition().getWidthDirAdj()); - maxY = Math.max(maxY, character.getTextPosition().getYDirAdj() + character.getTextPosition().getHeightDir()); - - } - 
- this.setBBox(new Rectangle2D.Double(minX, minY, maxX - minX, maxY - minY)); + this.setBBox(characters.stream() + .map(Character::getTextPosition) + .map(RedTextPosition::getInitialUserSpacePosition) + .collect(RectangleTransformations.collectBBox())); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/Zone.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/Zone.java index 85facd2..aaf96ac 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/Zone.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/Zone.java @@ -1,9 +1,10 @@ package com.knecon.fforesight.service.layoutparser.processor.docstrum.model; -import java.awt.geom.Rectangle2D; import java.util.Comparator; import java.util.List; +import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; + import lombok.Data; @Data @@ -23,21 +24,9 @@ public class Zone extends BoundingBox { public void buildBBox() { - double minX = Double.POSITIVE_INFINITY; - double minY = Double.POSITIVE_INFINITY; - double maxX = Double.NEGATIVE_INFINITY; - double maxY = Double.NEGATIVE_INFINITY; - - for (Line line : lines) { - - minX = Math.min(minX, line.getX()); - minY = Math.min(minY, line.getY()); - maxX = Math.max(maxX, line.getX() + line.getWidth()); - maxY = Math.max(maxY, line.getY() + line.getHeight()); - - } - - this.setBBox(new Rectangle2D.Double(minX, minY, maxX - minX, maxY - minY)); + this.setBBox(getLines().stream() + .map(BoundingBox::getBBox) + .collect(RectangleTransformations.collectBBox())); } diff --git 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/service/LineBuilderService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/service/LineBuilderService.java index 66536a5..195a0fd 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/service/LineBuilderService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/service/LineBuilderService.java @@ -1,6 +1,5 @@ package com.knecon.fforesight.service.layoutparser.processor.docstrum.service; -import java.util.ArrayList; import java.util.Comparator; import java.util.HashSet; import java.util.List; @@ -30,24 +29,25 @@ public class LineBuilderService { AngleFilter filter = new AngleFilter(-ANGLE_TOLERANCE, ANGLE_TOLERANCE); characters.forEach(character -> { - character.getNeighbors().forEach(neighbor -> { - double x = neighbor.getHorizontalDistance() / maxHorizontalDistance; - double y = neighbor.getVerticalDistance() / maxVerticalDistance; - if (character.getTextPosition().getDir() == neighbor.getCharacter().getTextPosition().getDir() && filter.matches(neighbor) && Math.pow(x, 2) + Math.pow(y, - 2) <= 1) { - unionFind.union(character, neighbor.getCharacter()); - } - }); + character.getNeighbors() + .forEach(neighbor -> { + double normalizedHorizontalDistance = neighbor.getHorizontalDistance() / maxHorizontalDistance; + double normalizedVerticalDistance = neighbor.getVerticalDistance() / maxVerticalDistance; + if (character.getTextPosition().getDir() == neighbor.getCharacter().getTextPosition().getDir() // + && filter.matches(neighbor) // + && Math.pow(normalizedHorizontalDistance, 2) + Math.pow(normalizedVerticalDistance, 2) <= 1) { + unionFind.union(character, neighbor.getCharacter()); + 
} + }); }); - List lines = new ArrayList<>(); - unionFind.getGroups().forEach(group -> { - List lineCharacters = new ArrayList<>(group); - lineCharacters.sort(Comparator.comparingDouble(Character::getX)); - lines.add(new Line(lineCharacters, characterSpacing)); - }); - - return lines; + return unionFind.getGroups() + .stream() + .map(lineCharacters -> lineCharacters.stream() + .sorted(Comparator.comparingDouble(Character::getX)) + .toList()) + .map(lineCharacters -> new Line(lineCharacters, characterSpacing)) + .toList(); } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/service/ZoneBuilderService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/service/ZoneBuilderService.java index 4520163..6acd4cb 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/service/ZoneBuilderService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/service/ZoneBuilderService.java @@ -45,29 +45,35 @@ public class ZoneBuilderService { double meanHeight = calculateMeanHeight(lines); - lines.forEach(outerLine -> // - lines.forEach(innerLine -> { + lines.forEach(outerLine -> { + lines.forEach(innerLine -> { - double scale = Math.min(outerLine.getHeight(), innerLine.getHeight()) / meanHeight; - scale = Math.max(MIN_LINE_SIZE_SCALE, Math.min(scale, MAX_LINE_SIZE_SCALE)); + if (innerLine == outerLine // + || unionFind.inSameSet(outerLine, innerLine)// + || outerLine.angularDifference(innerLine) > ANGLE_TOLERANCE) { + return; + } - if (!unionFind.inSameSet(outerLine, innerLine) && outerLine.angularDifference(innerLine) <= ANGLE_TOLERANCE) { + double scale = Math.min(outerLine.getHeight(), innerLine.getHeight()) / meanHeight; + scale = 
Math.max(MIN_LINE_SIZE_SCALE, Math.min(scale, MAX_LINE_SIZE_SCALE)); - double horizontalDistance = outerLine.horizontalDistance(innerLine) / scale; - double verticalDistance = outerLine.verticalDistance(innerLine) / scale; + double horizontalDistance = outerLine.horizontalDistance(innerLine) / scale; + double verticalDistance = outerLine.verticalDistance(innerLine) / scale; - if (minHorizontalDistance <= horizontalDistance && verticalDistance <= maxVerticalDistance // - || minHorizontalMergeDistance <= horizontalDistance && verticalDistance <= maxVerticalMergeDistance) { - unionFind.union(outerLine, innerLine); - } - } - })); + if (minHorizontalDistance <= horizontalDistance && verticalDistance <= maxVerticalDistance // + || minHorizontalMergeDistance <= horizontalDistance && verticalDistance <= maxVerticalMergeDistance) { - List zones = new ArrayList<>(); - unionFind.getGroups().forEach(group -> { - zones.add(mergeLinesInZone(new ArrayList<>(group), characterSpacing, lineSpacing)); + unionFind.union(outerLine, innerLine); + } + + }); }); + List zones = unionFind.getGroups() + .stream() + .map(group -> mergeLinesInZone(new ArrayList<>(group), characterSpacing, lineSpacing)) + .toList(); + if (zones.size() > MAX_ZONES) { List oneZoneLines = new ArrayList<>(); for (Zone zone : zones) { @@ -103,35 +109,40 @@ public class ZoneBuilderService { UnionFind unionFind = new UnionFind<>(new HashSet<>(lines)); lines.forEach(outer -> { - lines.forEach(inner -> { - if (inner != outer) { + if (inner == outer) { + return; + } - double horizontalDistance = outer.horizontalDistance(inner); - double verticalDistance = outer.verticalDistance(inner); + double horizontalDistance = outer.horizontalDistance(inner); + double verticalDistance = outer.verticalDistance(inner); - if (horizontalDistance <= maxHorizontalDistance && minVerticalDistance <= verticalDistance && verticalDistance <= maxVerticalDistance) { - unionFind.union(outer, inner); - } else if (minVerticalDistance <= 
verticalDistance && verticalDistance <= maxVerticalDistance && Math.abs(horizontalDistance - Math.min(outer.getLength(), - inner.getLength())) < 0.1) { - boolean characterOverlap = false; - int overlappingCount = 0; - for (Character outerCharacter : outer.getCharacters()) { - for (Character innerCharacter : inner.getCharacters()) { - double characterOverlapDistance = outerCharacter.overlappingDistance(innerCharacter); - if (characterOverlapDistance > 2) { - characterOverlap = true; - } - if (characterOverlapDistance > 0) { - overlappingCount++; - } + if (horizontalDistance <= maxHorizontalDistance && minVerticalDistance <= verticalDistance && verticalDistance <= maxVerticalDistance) { + + unionFind.union(outer, inner); + + } else if (minVerticalDistance <= verticalDistance + && verticalDistance <= maxVerticalDistance + && Math.abs(horizontalDistance - Math.min(outer.getLength(), inner.getLength())) < 0.1) { + + boolean characterOverlap = false; + int overlappingCount = 0; + for (Character outerCharacter : outer.getCharacters()) { + for (Character innerCharacter : inner.getCharacters()) { + double characterOverlapDistance = outerCharacter.overlappingDistance(innerCharacter); + if (characterOverlapDistance > 2) { + characterOverlap = true; + } + if (characterOverlapDistance > 0) { + overlappingCount++; } } - if (!characterOverlap && overlappingCount <= 2) { - unionFind.union(outer, inner); - } + } + if (!characterOverlap && overlappingCount <= 2) { + unionFind.union(outer, inner); } } + }); }); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationDocument.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationDocument.java index b3565ae..4f3f339 100644 --- 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationDocument.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationDocument.java @@ -5,6 +5,7 @@ import java.util.List; import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter; import com.knecon.fforesight.service.layoutparser.processor.model.text.UnclassifiedText; +import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations; import lombok.Data; import lombok.NoArgsConstructor; @@ -22,6 +23,7 @@ public class ClassificationDocument { private FloatFrequencyCounter fontSizeCounter = new FloatFrequencyCounter(); private StringFrequencyCounter fontCounter = new StringFrequencyCounter(); private StringFrequencyCounter fontStyleCounter = new StringFrequencyCounter(); + private LayoutparsingVisualizations visualizations = new LayoutparsingVisualizations(); private boolean headlines; private long rulesVersion; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Document.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Document.java index e7b5f82..9a9d9cc 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Document.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Document.java @@ -12,6 +12,7 @@ import java.util.stream.Stream; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine; import 
com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType; +import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations; import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree; import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity; import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock; @@ -40,6 +41,8 @@ public class Document implements GenericSemanticNode { @Builder.Default Set entities = new HashSet<>(); + LayoutparsingVisualizations visualizations; + @Override public NodeType getType() { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/Ruling.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/Ruling.java index 7586258..aa2c8dc 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/Ruling.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/Ruling.java @@ -4,12 +4,8 @@ import java.awt.geom.Line2D; import java.awt.geom.Point2D; import java.awt.geom.Rectangle2D; import java.util.ArrayList; -import java.util.Collections; -import java.util.Comparator; import java.util.Formatter; import java.util.List; -import java.util.Map; -import java.util.TreeMap; import com.knecon.fforesight.service.layoutparser.processor.utils.CohenSutherlandClipping; import com.knecon.fforesight.service.layoutparser.processor.utils.DoubleComparisons; @@ -60,126 +56,13 @@ public class Ruling extends Line2D.Float { } - // log(n) implementation of find_intersections - // based on http://people.csail.mit.edu/indyk/6.838-old/handouts/lec2.pdf - public static Map 
findIntersections(List horizontals, List verticals) { - - class SortObject { - - protected SOType type; - protected float position; - protected Ruling ruling; - - - public SortObject(SOType type, float position, Ruling ruling) { - - this.type = type; - this.position = position; - this.ruling = ruling; - } - - } - - List sos = new ArrayList<>(); - - TreeMap tree = new TreeMap<>(new Comparator() { - @Override - public int compare(Ruling o1, Ruling o2) { - - return java.lang.Double.compare(o1.getTop(), o2.getTop()); - } - }); - - TreeMap rv = new TreeMap<>(new Comparator() { - @Override - public int compare(Point2D o1, Point2D o2) { - - if (o1.getY() > o2.getY()) { - return 1; - } - if (o1.getY() < o2.getY()) { - return -1; - } - if (o1.getX() > o2.getX()) { - return 1; - } - if (o1.getX() < o2.getX()) { - return -1; - } - return 0; - } - }); - - for (Ruling h : horizontals) { - sos.add(new SortObject(SOType.HLEFT, h.getLeft() - PERPENDICULAR_UNIT_EXPAND_AMOUNT, h)); - sos.add(new SortObject(SOType.HRIGHT, h.getRight() + PERPENDICULAR_UNIT_EXPAND_AMOUNT, h)); - } - - for (Ruling v : verticals) { - sos.add(new SortObject(SOType.VERTICAL, v.getLeft(), v)); - } - - Collections.sort(sos, new Comparator() { - @Override - public int compare(SortObject a, SortObject b) { - - int rv; - if (DoubleComparisons.feq(a.position, b.position)) { - if (a.type == SOType.VERTICAL && b.type == SOType.HLEFT) { - rv = 1; - } else if (a.type == SOType.VERTICAL && b.type == SOType.HRIGHT) { - rv = -1; - } else if (a.type == SOType.HLEFT && b.type == SOType.VERTICAL) { - rv = -1; - } else if (a.type == SOType.HRIGHT && b.type == SOType.VERTICAL) { - rv = 1; - } else { - rv = java.lang.Double.compare(a.position, b.position); - } - } else { - return java.lang.Double.compare(a.position, b.position); - } - return rv; - } - }); - - for (SortObject so : sos) { - switch (so.type) { - case VERTICAL: - for (Map.Entry h : tree.entrySet()) { - try { - Point2D i = h.getKey().intersectionPoint(so.ruling); 
- if (i == null) { - continue; - } - rv.put(i, new Ruling[]{h.getKey().expand(PERPENDICULAR_UNIT_EXPAND_AMOUNT), so.ruling.expand(PERPENDICULAR_UNIT_EXPAND_AMOUNT)}); - } catch (UnsupportedOperationException e) { - log.info("Some line are oblique, ignoring..."); - continue; - } - } - break; - case HRIGHT: - tree.remove(so.ruling); - break; - case HLEFT: - tree.put(so.ruling, true); - break; - } - } - - return rv; - - } - - - public boolean vertical() { + public boolean isVertical() { return this.length() > 0 && DoubleComparisons.feq(this.x1, this.x2); //diff < ORIENTATION_CHECK_THRESHOLD; } - public boolean horizontal() { + public boolean isHorizontal() { return this.length() > 0 && DoubleComparisons.feq(this.y1, this.y2); //diff < ORIENTATION_CHECK_THRESHOLD; } @@ -188,36 +71,36 @@ public class Ruling extends Line2D.Float { // these are used to have a single collapse method (in page, currently) - public boolean oblique() { + public boolean isOblique() { - return !(this.vertical() || this.horizontal()); + return !(this.isVertical() || this.isHorizontal()); } public float getPosition() { - if (this.oblique()) { + if (this.isOblique()) { throw new UnsupportedOperationException(); } - return this.vertical() ? this.getLeft() : this.getTop(); + return this.isVertical() ? this.getLeft() : this.getTop(); } public float getStart() { - if (this.oblique()) { + if (this.isOblique()) { throw new UnsupportedOperationException(); } - return this.vertical() ? this.getTop() : this.getLeft(); + return this.isVertical() ? this.getTop() : this.getLeft(); } public void setStart(float v) { - if (this.oblique()) { + if (this.isOblique()) { throw new UnsupportedOperationException(); } - if (this.vertical()) { + if (this.isVertical()) { this.setTop(v); } else { this.setLeft(v); @@ -227,19 +110,19 @@ public class Ruling extends Line2D.Float { public float getEnd() { - if (this.oblique()) { + if (this.isOblique()) { throw new UnsupportedOperationException(); } - return this.vertical() ? 
this.getBottom() : this.getRight(); + return this.isVertical() ? this.getBottom() : this.getRight(); } public void setEnd(float v) { - if (this.oblique()) { + if (this.isOblique()) { throw new UnsupportedOperationException(); } - if (this.vertical()) { + if (this.isVertical()) { this.setBottom(v); } else { this.setRight(v); @@ -249,10 +132,10 @@ public class Ruling extends Line2D.Float { public void setStartEnd(float start, float end) { - if (this.oblique()) { + if (this.isOblique()) { throw new UnsupportedOperationException(); } - if (this.vertical()) { + if (this.isVertical()) { this.setTop(start); this.setBottom(end); } else { @@ -264,7 +147,7 @@ public class Ruling extends Line2D.Float { public boolean perpendicularTo(Ruling other) { - return this.vertical() == other.horizontal(); + return this.isVertical() == other.isHorizontal(); } @@ -318,30 +201,6 @@ public class Ruling extends Line2D.Float { } - public Point2D intersectionPoint(Ruling other) { - - Ruling this_l = this.expand(PERPENDICULAR_UNIT_EXPAND_AMOUNT); - Ruling other_l = other.expand(PERPENDICULAR_UNIT_EXPAND_AMOUNT); - Ruling horizontal, vertical; - - if (!this_l.intersectsLine(other_l)) { - return null; - } - - if (this_l.horizontal() && other_l.vertical()) { - horizontal = this_l; - vertical = other_l; - } else if (this_l.vertical() && other_l.horizontal()) { - vertical = this_l; - horizontal = other_l; - } else { - log.warn("lines must be orthogonal, vertical and horizontal"); - return null; - } - return new Point2D.Float(vertical.getLeft(), horizontal.getTop()); - } - - @Override public boolean equals(Object other) { @@ -451,16 +310,9 @@ public class Ruling extends Line2D.Float { final float TOLERANCE = 1; return Math.abs(ruling.getX1() - x1) < TOLERANCE &&// - Math.abs(ruling.getY1() - y1) < TOLERANCE &&// - Math.abs(ruling.getX2() - x2) < TOLERANCE &&// - Math.abs(ruling.getY2() - y2) < TOLERANCE; - } - - - private enum SOType { - VERTICAL, - HRIGHT, - HLEFT + Math.abs(ruling.getY1() - y1) < 
TOLERANCE &&// + Math.abs(ruling.getX2() - x2) < TOLERANCE &&// + Math.abs(ruling.getY2() - y2) < TOLERANCE; } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/RedTextPosition.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/RedTextPosition.java index e4a4212..b0c5ba7 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/RedTextPosition.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/RedTextPosition.java @@ -1,5 +1,8 @@ package com.knecon.fforesight.service.layoutparser.processor.model.text; +import java.awt.geom.AffineTransform; +import java.awt.geom.Rectangle2D; + import org.apache.pdfbox.text.TextPosition; import com.fasterxml.jackson.annotation.JsonIgnore; @@ -16,7 +19,9 @@ import lombok.SneakyThrows; @AllArgsConstructor public class RedTextPosition { - private float[] position; + private final static int HEIGHT_PADDING = 2; + private Rectangle2D.Float directionAdjustedPosition; + private Rectangle2D initialUserSpacePosition; @JsonIgnore private int rotation; @@ -58,43 +63,65 @@ public class RedTextPosition { pos.setFontSizeInPt(textPosition.getFontSizeInPt()); pos.setFontName(textPosition.getFont().getName()); - var position = new float[4]; + float textHeight = textPosition.getHeight() + HEIGHT_PADDING; + Rectangle2D.Float dirAdjPosition = new Rectangle2D.Float(textPosition.getXDirAdj(), + textPosition.getYDirAdj() - textHeight, + textPosition.getWidthDirAdj(), + textHeight + HEIGHT_PADDING); + pos.setDirectionAdjustedPosition(dirAdjPosition); - position[0] = textPosition.getXDirAdj(); - position[1] = textPosition.getYDirAdj(); - position[2] = textPosition.getWidthDirAdj(); - position[3] = 
textPosition.getHeightDir(); + AffineTransform affineTransform = getRotationMatrix(TextDirection.fromDegrees(textPosition.getDir()), textPosition.getPageWidth(), textPosition.getPageHeight()); + Rectangle2D initialUserSpacePositionRect = affineTransform.createTransformedShape(dirAdjPosition).getBounds2D(); + + pos.setInitialUserSpacePosition(initialUserSpacePositionRect); - pos.setPosition(position); return pos; } + private static AffineTransform getRotationMatrix(TextDirection textDirection, float pageWidth, float pageHeight) { + + AffineTransform transform = new AffineTransform(); + + if (textDirection == TextDirection.ZERO || textDirection == TextDirection.HALF_CIRCLE) { + transform.rotate(textDirection.getRadians(), pageWidth / 2f, pageHeight / 2f); + transform.translate(0f, pageHeight); + } else if (textDirection == TextDirection.QUARTER_CIRCLE) { + transform.rotate(textDirection.getRadians(), pageWidth / 2f, pageWidth / 2f); + transform.translate(0f, pageWidth); + } else { + transform.rotate(textDirection.getRadians(), pageHeight / 2f, pageHeight / 2f); + transform.translate(0f, pageWidth); + } + transform.scale(1., -1.); + return transform; + } + @JsonIgnore public float getXDirAdj() { - return position[0]; + return this.directionAdjustedPosition.x; } @JsonIgnore public float getYDirAdj() { - return position[1]; + return this.directionAdjustedPosition.y; } @JsonIgnore public float getWidthDirAdj() { - return position[2]; + return this.directionAdjustedPosition.width; } @JsonIgnore public float getHeightDir() { - return position[3]; + return this.directionAdjustedPosition.height; } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPositionSequence.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPositionSequence.java index de03144..fccecd5 100644 --- 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPositionSequence.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPositionSequence.java @@ -2,6 +2,7 @@ package com.knecon.fforesight.service.layoutparser.processor.model.text; import java.awt.geom.AffineTransform; import java.awt.geom.Point2D; +import java.awt.geom.Rectangle2D; import java.util.ArrayList; import java.util.List; import java.util.Locale; @@ -11,6 +12,7 @@ import org.apache.pdfbox.text.TextPosition; import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point; import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle; +import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; import lombok.AllArgsConstructor; import lombok.Builder; @@ -44,21 +46,19 @@ public class TextPositionSequence implements CharSequence { private boolean isParagraphStart; - public TextPositionSequence(int page) { + public TextPositionSequence(List textPositions, int pageNumber, boolean isParagraphStart) { - this.page = page; - } - - - public TextPositionSequence(List textPositions, int page, boolean isParagraphStart) { - - this.textPositions = textPositions.stream().map(RedTextPosition::fromTextPosition).collect(Collectors.toList()); - this.page = page; + this.textPositions = textPositions.stream() + .map(RedTextPosition::fromTextPosition) + .collect(Collectors.toList()); + this.page = pageNumber; this.dir = TextDirection.fromDegrees(textPositions.get(0).getDir()); this.rotation = textPositions.get(0).getRotation(); this.pageHeight = textPositions.get(0).getPageHeight(); this.pageWidth = textPositions.get(0).getPageWidth(); this.isParagraphStart = isParagraphStart; + + } @@ -314,10 +314,18 @@ public class TextPositionSequence implements CharSequence { 
topRight = transform.transform(topRight, null); return new Rectangle( // - new Point((float) bottomLeft.getX(), (float) bottomLeft.getY()), - (float) (topRight.getX() - bottomLeft.getX()), - (float) (topRight.getY() - bottomLeft.getY()), - page); + new Point((float) bottomLeft.getX(), (float) bottomLeft.getY()), + (float) (topRight.getX() - bottomLeft.getX()), + (float) (topRight.getY() - bottomLeft.getY()), + page); + } + + + public Rectangle2D getBoundingBox() { + + return getTextPositions().stream() + .map(RedTextPosition::getInitialUserSpacePosition) + .collect(RectangleTransformations.collectBBox()); } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/BodyTextFrameService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/BodyTextFrameService.java index 3cd09a8..7438843 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/BodyTextFrameService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/BodyTextFrameService.java @@ -33,6 +33,7 @@ public class BodyTextFrameService { for (ClassificationPage page : classificationDocument.getPages()) { // var updatedBodyTextFrame = getBodyTextFrameFromRulings(page, bodyTextFrame, landscapeBodyTextFrame); setBodyTextFrameAdjustedToPage(page, bodyTextFrame, landscapeBodyTextFrame); + classificationDocument.getVisualizations().addMainBodyVisualization(page.isLandscape() ? 
landscapeBodyTextFrame : bodyTextFrame, page.getPageNumber()); } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/RulingCleaningService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/RulingCleaningService.java index c51c90b..b57fab1 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/RulingCleaningService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/RulingCleaningService.java @@ -122,7 +122,7 @@ public class RulingCleaningService { h = ruling.y1 - ruling.y2; } - if (ruling.horizontal()) { + if (ruling.isHorizontal()) { return new Rectangle(top - THRESHOLD_Y_HORIZONTAL, left - THRESHOLD_X_HORIZONTAL, w + 2 * THRESHOLD_X_HORIZONTAL, h + 2 * THRESHOLD_Y_HORIZONTAL); } else { return new Rectangle(top - THRESHOLD_Y_VERTICAL, left - THRESHOLD_X_VERTICAL, w + 2 * THRESHOLD_X_VERTICAL, h + 2 * THRESHOLD_Y_VERTICAL); @@ -160,14 +160,14 @@ public class RulingCleaningService { List vrs = new ArrayList<>(); for (Ruling vr : rulings) { - if (vr.vertical()) { + if (vr.isVertical()) { vrs.add(vr); } } List hrs = new ArrayList<>(); for (Ruling hr : rulings) { - if (hr.horizontal()) { + if (hr.isHorizontal()) { hrs.add(hr); } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java index 9c087a1..883f393 100644 --- 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java @@ -15,6 +15,7 @@ import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.Doubl import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter; +import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations; import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; @@ -37,11 +38,18 @@ public class DocstrumBlockificationService { static final float THRESHOLD = 1f; - public ClassificationPage blockify(List textPositions, List cells, boolean xyOrder) { + public ClassificationPage blockify(List textPositions, List cells, boolean xyOrder, LayoutparsingVisualizations visualizations) { CleanRulings usedRulings = RectangleTransformations.extractRulings(cells); var zones = docstrumSegmentationService.segmentPage(textPositions, xyOrder); + + if (!textPositions.isEmpty()) { + visualizations.addZoneVisualizations(zones, textPositions.get(0).getPage()); + visualizations.addLineVisualizations(zones, textPositions.get(0).getPage()); + visualizations.addCharactersWithNeighbours(zones, textPositions.get(0).getPage()); + } + var pageBlocks = toAbstractPageBlocks(zones, usedRulings.getHorizontal(), usedRulings.getVertical(), xyOrder); var classificationPage = new ClassificationPage(pageBlocks); 
@@ -58,18 +66,20 @@ public class DocstrumBlockificationService { zones.forEach(zone -> { List textPositionSequences = new ArrayList<>(); - zone.getLines().forEach(line -> { - line.getWords().forEach(word -> { - textPositionSequences.add(new TextPositionSequence(word.getTextPositions(), word.getPage())); - }); - }); + zone.getLines() + .forEach(line -> { + line.getWords() + .forEach(word -> { + textPositionSequences.add(new TextPositionSequence(word.getTextPositions(), word.getPage())); + }); + }); abstractPageBlocks.addAll(splitZonesAtRulings(textPositionSequences, horizontalRulings, verticalRulings)); }); if (xyOrder) { abstractPageBlocks.sort(Comparator.comparing(AbstractPageBlock::getMinY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)) - .thenComparing(AbstractPageBlock::getMinX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))); + .thenComparing(AbstractPageBlock::getMinX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))); abstractPageBlocks.sort(new Comparator() { @Override public int compare(AbstractPageBlock o1, AbstractPageBlock o2) { @@ -134,8 +144,8 @@ public class DocstrumBlockificationService { private boolean isOnlyIntersectingYAndOnelineOrPrevoiusTwoLines(TextPageBlock previous, TextPageBlock current, ClassificationPage page) { return current.intersectsY(previous) // - && (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1 || previous.getNumberOfLines() == 2 && current.getNumberOfLines() == 1) // - && numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, page.getTextBlocks()) <= 0; + && (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1 || previous.getNumberOfLines() == 2 && current.getNumberOfLines() == 1) // + && numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, page.getTextBlocks()) <= 0; } @@ -144,16 +154,16 @@ public class DocstrumBlockificationService { ClassificationPage page) { return (Math.abs(previous.getMaxY() - current.getMaxY()) < 
THRESHOLD || Math.abs(previous.getMinY() - current.getMinY()) < THRESHOLD) // - && (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1 || previous.getNumberOfLines() >= 2 && current.getNumberOfLines() == 1) // - && !hasBetween(current, previous, page.getTextBlocks()) && numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, page.getTextBlocks()) <= 4; + && (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1 || previous.getNumberOfLines() >= 2 && current.getNumberOfLines() == 1) // + && !hasBetween(current, previous, page.getTextBlocks()) && numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, page.getTextBlocks()) <= 4; } private boolean areTheOnlyTwoBlocksOnHeightsWithBothMoreThanTwoLines(TextPageBlock previous, TextPageBlock current, ClassificationPage page) { return previous.getNumberOfLines() >= 2 && current.getNumberOfLines() >= 2 // - && previous.intersectsY(current) // - && numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, page.getTextBlocks()) == 0; + && previous.intersectsY(current) // + && numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, page.getTextBlocks()) == 0; } @@ -213,7 +223,7 @@ public class DocstrumBlockificationService { ListIterator itty = blocks.listIterator(); while (itty.hasNext()) { AbstractPageBlock block = itty.next(); - if(block == null){ + if (block == null) { continue; } if (block instanceof TablePageBlock) { @@ -224,7 +234,7 @@ public class DocstrumBlockificationService { for (int i = 0; i < blocks.size(); i++) { - if(blocks.get(i) == null){ + if (blocks.get(i) == null) { continue; } if (blocks.get(i) == current) { @@ -249,8 +259,8 @@ public class DocstrumBlockificationService { } } var blocksIterator = blocks.iterator(); - while(blocksIterator.hasNext()){ - if(blocksIterator.next() == null){ + while (blocksIterator.hasNext()) { + if (blocksIterator.next() == null) { blocksIterator.remove(); } } @@ -338,11 +348,11 @@ 
public class DocstrumBlockificationService { if (textBlock == null) { textBlock = new TextPageBlock(wordBlock.getMinXDirAdj(), - wordBlock.getMaxXDirAdj(), - wordBlock.getMinYDirAdj(), - wordBlock.getMaxYDirAdj(), - wordBlockList, - wordBlock.getRotation()); + wordBlock.getMaxXDirAdj(), + wordBlock.getMinYDirAdj(), + wordBlock.getMaxYDirAdj(), + wordBlockList, + wordBlock.getRotation()); } else { TextPageBlock spatialEntity = textBlock.union(wordBlock); textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight()); @@ -358,7 +368,12 @@ public class DocstrumBlockificationService { textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest()); } - if (textBlock != null && textBlock.getSequences() != null && textBlock.getSequences().stream().map(t -> round(t.getMinYDirAdj(), 3)).collect(toSet()).size() == 1) { + if (textBlock != null + && textBlock.getSequences() != null + && textBlock.getSequences() + .stream() + .map(t -> round(t.getMinYDirAdj(), 3)) + .collect(toSet()).size() == 1) { textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj)); } return textBlock; @@ -373,38 +388,34 @@ public class DocstrumBlockificationService { List horizontalRulingLines, List verticalRulingLines) { - return isSplitByRuling(maxX, - minY, - word.getMinXDirAdj(), - word.getMinYDirAdj(), - verticalRulingLines, - word.getDir().getDegrees(), - word.getPageWidth(), - word.getPageHeight()) // - || isSplitByRuling(minX, - minY, - word.getMinXDirAdj(), - word.getMaxYDirAdj(), - horizontalRulingLines, - word.getDir().getDegrees(), - word.getPageWidth(), - word.getPageHeight()) // - || isSplitByRuling(maxX, - minY, - word.getMinXDirAdj(), - word.getMinYDirAdj(), - horizontalRulingLines, - word.getDir().getDegrees(), - word.getPageWidth(), - word.getPageHeight()) // - || isSplitByRuling(minX, - minY, - word.getMinXDirAdj(), - word.getMaxYDirAdj(), - verticalRulingLines, - word.getDir().getDegrees(), - 
word.getPageWidth(), - word.getPageHeight()); + return isSplitByRuling(maxX, minY, word.getMinXDirAdj(), word.getMinYDirAdj(), verticalRulingLines, word.getDir().getDegrees(), word.getPageWidth(), word.getPageHeight()) + // + || isSplitByRuling(minX, + minY, + word.getMinXDirAdj(), + word.getMaxYDirAdj(), + horizontalRulingLines, + word.getDir().getDegrees(), + word.getPageWidth(), + word.getPageHeight()) + // + || isSplitByRuling(maxX, + minY, + word.getMinXDirAdj(), + word.getMinYDirAdj(), + horizontalRulingLines, + word.getDir().getDegrees(), + word.getPageWidth(), + word.getPageHeight()) + // + || isSplitByRuling(minX, + minY, + word.getMinXDirAdj(), + word.getMaxYDirAdj(), + verticalRulingLines, + word.getDir().getDegrees(), + word.getPageWidth(), + word.getPageHeight()); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocuMineBlockificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocuMineBlockificationService.java index ca72723..fd84be5 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocuMineBlockificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocuMineBlockificationService.java @@ -34,7 +34,7 @@ public class DocuMineBlockificationService { * This method must use text direction adjusted postions (DirAdj). Where {0,0} is on the upper left. Never try to change this! * Rulings (Table lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling. * - * @param textPositions The words of a page. + * @param textPositions The textPositions of a page. 
* @param horizontalRulingLines Horizontal table lines. * @param verticalRulingLines Vertical table lines. * @return Page object that contains the Textblock and text statistics. diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java index 1481776..1b6f6ca 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java @@ -7,7 +7,6 @@ import static java.util.stream.Collectors.toList; import java.awt.geom.Rectangle2D; import java.util.ArrayList; import java.util.HashMap; -import java.util.HashSet; import java.util.LinkedList; import java.util.List; import java.util.Map; @@ -15,7 +14,6 @@ import java.util.NoSuchElementException; import java.util.Set; import java.util.stream.Collectors; -import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument; @@ -52,6 +50,9 @@ public class DocumentGraphFactory { public Document buildDocumentGraph(LayoutParsingType layoutParsingType, ClassificationDocument document) { Document documentGraph = new Document(); + + documentGraph.setVisualizations(document.getVisualizations()); + Context context = new Context(documentGraph); document.getPages() @@ -85,14 +86,11 @@ public class 
DocumentGraphFactory { GenericSemanticNode node; if (originalTextBlock.isHeadline()) { - node = Headline.builder().documentTree(context.getDocumentTree()) - .build(); + node = Headline.builder().documentTree(context.getDocumentTree()).build(); } else if (originalTextBlock.isToDuplicate()) { - node = DuplicatedParagraph.builder().documentTree(context.getDocumentTree()) - .build(); + node = DuplicatedParagraph.builder().documentTree(context.getDocumentTree()).build(); } else { - node = Paragraph.builder().documentTree(context.getDocumentTree()) - .build(); + node = Paragraph.builder().documentTree(context.getDocumentTree()).build(); } page.getMainBody().add(node); @@ -178,8 +176,7 @@ public class DocumentGraphFactory { private void addFooter(List textBlocks, Context context) { Page page = context.getPage(textBlocks.get(0).getPage()); - Footer footer = Footer.builder().documentTree(context.getDocumentTree()) - .build(); + Footer footer = Footer.builder().documentTree(context.getDocumentTree()).build(); AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeTextPositionSequence(textBlocks), footer, context, @@ -194,8 +191,7 @@ public class DocumentGraphFactory { public void addHeader(List textBlocks, Context context) { Page page = context.getPage(textBlocks.get(0).getPage()); - Header header = Header.builder().documentTree(context.getDocumentTree()) - .build(); + Header header = Header.builder().documentTree(context.getDocumentTree()).build(); AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeTextPositionSequence(textBlocks), header, 0, page); List tocId = context.getDocumentTree().createNewMainEntryAndReturnId(header); header.setTreeId(tocId); @@ -207,8 +203,7 @@ public class DocumentGraphFactory { private void addEmptyFooter(int pageIndex, Context context) { Page page = context.getPage(pageIndex); - Footer footer = 
Footer.builder().documentTree(context.getDocumentTree()) - .build(); + Footer footer = Footer.builder().documentTree(context.getDocumentTree()).build(); AtomicTextBlock textBlock = context.textBlockFactory.emptyTextBlock(footer, context, page); List tocId = context.getDocumentTree().createNewMainEntryAndReturnId(footer); footer.setTreeId(tocId); @@ -220,8 +215,7 @@ public class DocumentGraphFactory { private void addEmptyHeader(int pageIndex, Context context) { Page page = context.getPage(pageIndex); - Header header = Header.builder().documentTree(context.getDocumentTree()) - .build(); + Header header = Header.builder().documentTree(context.getDocumentTree()).build(); AtomicTextBlock textBlock = context.textBlockFactory.emptyTextBlock(header, 0, page); List tocId = context.getDocumentTree().createNewMainEntryAndReturnId(header); header.setTreeId(tocId); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SearchTextWithTextPositionFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SearchTextWithTextPositionFactory.java index c10cbee..34cea76 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SearchTextWithTextPositionFactory.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SearchTextWithTextPositionFactory.java @@ -29,19 +29,22 @@ public class SearchTextWithTextPositionFactory { public SearchTextWithTextPositionDto buildSearchTextToTextPositionDto(List sequences) { - if (sequences.isEmpty() || sequences.stream().allMatch(sequence -> sequence.getTextPositions().isEmpty())) { + if (sequences.isEmpty() || sequences.stream() + .allMatch(sequence -> 
sequence.getTextPositions().isEmpty())) { return SearchTextWithTextPositionDto.empty(); } Context context = new Context(); - RedTextPosition currentTextPosition = sequences.get(0).getTextPositions().get(0); - RedTextPosition previousTextPosition = RedTextPosition.builder().unicode(" ").position(currentTextPosition.getPosition()).build(); + RedTextPosition currentTextPosition = sequences.get(0).getTextPositions() + .get(0); + RedTextPosition previousTextPosition = RedTextPosition.builder().unicode(" ").directionAdjustedPosition(currentTextPosition.getDirectionAdjustedPosition()).build(); for (TextPositionSequence word : sequences) { for (int i = 0; i < word.getTextPositions().size(); ++i) { - currentTextPosition = word.getTextPositions().get(i); + currentTextPosition = word.getTextPositions() + .get(i); if (isLineBreak(currentTextPosition, previousTextPosition)) { removeHyphenLinebreaks(context); context.lineBreaksStringIdx.add(context.stringIdx); @@ -57,7 +60,7 @@ public class SearchTextWithTextPositionFactory { ++context.positionIdx; } - previousTextPosition = RedTextPosition.builder().unicode(" ").position(previousTextPosition.getPosition()).build(); + previousTextPosition = RedTextPosition.builder().unicode(" ").directionAdjustedPosition(previousTextPosition.getDirectionAdjustedPosition()).build(); context.stringBuilder.append(" "); context.stringIdxToPositionIdx.add(context.positionIdx); ++context.stringIdx; @@ -66,7 +69,7 @@ public class SearchTextWithTextPositionFactory { assert context.stringBuilder.length() == context.stringIdxToPositionIdx.size(); List positions = sequences.stream() - .flatMap(sequence -> sequence.getTextPositions().stream().map(textPosition -> mapRedTextPositionToInitialUserSpace(textPosition, sequence))) + .map(TextPositionSequence::getBoundingBox) .toList(); return SearchTextWithTextPositionDto.builder() @@ -153,7 +156,7 @@ public class SearchTextWithTextPositionFactory { return false; } - float deltaY = 
Math.abs(currentPosition.getYDirAdj() - previousPosition.getYDirAdj()); + double deltaY = Math.abs(currentPosition.getYDirAdj() - previousPosition.getYDirAdj()); return deltaY >= currentPosition.getHeightDir(); } @@ -167,16 +170,16 @@ public class SearchTextWithTextPositionFactory { private boolean isHyphen(String unicodeCharacter) { return Objects.equals(unicodeCharacter, "-") || // - Objects.equals(unicodeCharacter, "~") || // - Objects.equals(unicodeCharacter, "‐") || // - Objects.equals(unicodeCharacter, "‒") || // - Objects.equals(unicodeCharacter, "⁻") || // - Objects.equals(unicodeCharacter, "−") || // - Objects.equals(unicodeCharacter, "﹣") || // - Objects.equals(unicodeCharacter, "゠") || // - Objects.equals(unicodeCharacter, "⁓") || // - Objects.equals(unicodeCharacter, "‑") || // - Objects.equals(unicodeCharacter, "\u00AD"); + Objects.equals(unicodeCharacter, "~") || // + Objects.equals(unicodeCharacter, "‐") || // + Objects.equals(unicodeCharacter, "‒") || // + Objects.equals(unicodeCharacter, "⁻") || // + Objects.equals(unicodeCharacter, "−") || // + Objects.equals(unicodeCharacter, "﹣") || // + Objects.equals(unicodeCharacter, "゠") || // + Objects.equals(unicodeCharacter, "⁓") || // + Objects.equals(unicodeCharacter, "‑") || // + Objects.equals(unicodeCharacter, "\u00AD"); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/LegacyPDFStreamEngine.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/LegacyPDFStreamEngine.java index ff2e665..326746d 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/LegacyPDFStreamEngine.java +++ 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/LegacyPDFStreamEngine.java @@ -82,7 +82,6 @@ public class LegacyPDFStreamEngine extends PDFStreamEngine { private int pageRotation; private PDRectangle pageSize; - private Matrix translateMatrix; private final GlyphList glyphList; private final Map fontHeightMap = new WeakHashMap(); @@ -134,12 +133,6 @@ public class LegacyPDFStreamEngine extends PDFStreamEngine { this.pageRotation = page.getRotation(); this.pageSize = page.getCropBox(); - if (pageSize.getLowerLeftX() == 0 && pageSize.getLowerLeftY() == 0) { - translateMatrix = null; - } else { - // translation matrix for cropbox - translateMatrix = Matrix.getTranslateInstance(-pageSize.getLowerLeftX(), -pageSize.getLowerLeftY()); - } super.processPage(page); } @@ -265,62 +258,52 @@ public class LegacyPDFStreamEngine extends PDFStreamEngine { } } - // adjust for cropbox if needed - Matrix translatedTextRenderingMatrix; - if (translateMatrix == null) { - translatedTextRenderingMatrix = textRenderingMatrix; - } else { - translatedTextRenderingMatrix = Matrix.concatenate(translateMatrix, textRenderingMatrix); - nextX -= pageSize.getLowerLeftX(); - nextY -= pageSize.getLowerLeftY(); - } - // This is a hack for unicode letter with 2 chars e.g. 
RA see unicodeProblem.pdf if (unicodeMapping.length() == 2) { processTextPosition(new TextPosition(pageRotation, - pageSize.getWidth(), - pageSize.getHeight(), - translatedTextRenderingMatrix, - nextX, - nextY, - Math.abs(dyDisplay), - dxDisplay, - Math.abs(spaceWidthDisplay), - Character.toString(unicodeMapping.charAt(0)), - new int[]{code}, - font, - fontSize, - (int) (fontSize * textMatrix.getScalingFactorX()))); + pageSize.getWidth(), + pageSize.getHeight(), + textRenderingMatrix, + nextX, + nextY, + Math.abs(dyDisplay), + dxDisplay, + Math.abs(spaceWidthDisplay), + Character.toString(unicodeMapping.charAt(0)), + new int[]{code}, + font, + fontSize, + (int) (fontSize * textMatrix.getScalingFactorX()))); processTextPosition(new TextPosition(pageRotation, - pageSize.getWidth(), - pageSize.getHeight(), - translatedTextRenderingMatrix, - nextX, - nextY, - Math.abs(dyDisplay), - dxDisplay, - Math.abs(spaceWidthDisplay), - Character.toString(unicodeMapping.charAt(1)), - new int[]{code}, - font, - fontSize, - (int) (fontSize * textMatrix.getScalingFactorX()))); + pageSize.getWidth(), + pageSize.getHeight(), + textRenderingMatrix, + nextX, + nextY, + Math.abs(dyDisplay), + dxDisplay, + Math.abs(spaceWidthDisplay), + Character.toString(unicodeMapping.charAt(1)), + new int[]{code}, + font, + fontSize, + (int) (fontSize * textMatrix.getScalingFactorX()))); } else { processTextPosition(new TextPosition(pageRotation, - pageSize.getWidth(), - pageSize.getHeight(), - translatedTextRenderingMatrix, - nextX, - nextY, - Math.abs(dyDisplay), - dxDisplay, - Math.abs(spaceWidthDisplay), - unicodeMapping, - new int[]{code}, - font, - fontSize, - (int) (fontSize * textMatrix.getScalingFactorX()))); + pageSize.getWidth(), + pageSize.getHeight(), + textRenderingMatrix, + nextX, + nextY, + Math.abs(dyDisplay), + dxDisplay, + Math.abs(spaceWidthDisplay), + unicodeMapping, + new int[]{code}, + font, + fontSize, + (int) (fontSize * textMatrix.getScalingFactorX()))); } } diff --git 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFTextStripper.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFTextStripper.java index 1ca5b43..83fafea 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFTextStripper.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFTextStripper.java @@ -1007,7 +1007,7 @@ public class PDFTextStripper extends LegacyPDFStreamEngine { /** * Set the desired word separator for output text. The PDFBox text extraction algorithm will output a space - * character if there is enough space between two words. By default a space character is used. If you need and + * character if there is enough space between two textPositions. By default a space character is used. If you need and * accurate count of characters that are found in a PDF document then you might want to set the word separator to * the empty string. * @@ -1703,7 +1703,7 @@ public class PDFTextStripper extends LegacyPDFStreamEngine { /** * Write a list of string containing a whole line of a document. * - * @param line a list with the words of the given line + * @param line a list with the textPositions of the given line * @throws IOException if something went wrong */ private void writeLine(List line, boolean isParagraphEnd) throws IOException { @@ -1744,9 +1744,9 @@ public class PDFTextStripper extends LegacyPDFStreamEngine { /** - * Handles the LTR and RTL direction of the given words. The whole implementation stands and falls with the given - * word. If the word is a full line, the results will be the best. 
If the word contains of single words or - * characters, the order of the characters in a word or words in a line may wrong, due to RTL and LTR marks and + * Handles the LTR and RTL direction of the given textPositions. The whole implementation stands and falls with the given + * word. If the word is a full line, the results will be the best. If the word contains of single textPositions or + * characters, the order of the characters in a word or textPositions in a line may wrong, due to RTL and LTR marks and * characters! *

* Based on http://www.nesterovsky-bros.com/weblog/2013/07/28/VisualToLogicalConversionInJava.aspx diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/LayoutGridService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/LayoutGridService.java index 9159742..6e25dcb 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/LayoutGridService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/LayoutGridService.java @@ -70,7 +70,9 @@ public class LayoutGridService { Visualizations layoutGrid = this.addLayoutGrid(document, layerVisibilityDefaultValue, false); Visualizations visualLayoutGrid = this.addLayoutGrid(document, layerVisibilityDefaultValue, true); - viewerDocumentService.addVisualizationsOnPage(originFile, destinationFile, List.of(layoutGrid, visualLayoutGrid)); + List allVisualizations = Stream.concat(Stream.of(layoutGrid, visualLayoutGrid), document.getVisualizations().streamAll()).toList(); + + viewerDocumentService.addVisualizationsOnPage(originFile, destinationFile, allVisualizations); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/MarkedContentUtils.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/MarkedContentUtils.java index 799ac99..86d79d7 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/MarkedContentUtils.java +++ 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/MarkedContentUtils.java @@ -1,12 +1,5 @@ package com.knecon.fforesight.service.layoutparser.processor.utils; -import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; -import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; -import lombok.experimental.UtilityClass; -import org.apache.pdfbox.cos.COSName; -import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent; -import org.apache.pdfbox.text.TextPosition; - import java.awt.geom.Rectangle2D; import java.util.Collection; import java.util.Collections; @@ -14,13 +7,24 @@ import java.util.List; import java.util.Map; import java.util.stream.Collectors; +import org.apache.pdfbox.cos.COSName; +import org.apache.pdfbox.pdmodel.PDPage; +import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent; +import org.apache.pdfbox.text.TextPosition; + +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; + +import lombok.experimental.UtilityClass; + @UtilityClass public class MarkedContentUtils { public static final String HEADER = "Header"; public static final String FOOTER = "Footer"; - public List getMarkedContentBboxPerLine(List markedContents, String subtype) { + + public List getMarkedContentBboxPerLine(List markedContents, String subtype, PDPage pdPage) { if (markedContents == null) { return Collections.emptyList(); @@ -31,7 +35,8 @@ public class MarkedContentUtils { .filter(m -> m.getProperties() != null) .filter(m -> m.getProperties().getItem("Subtype") != null) .filter(m -> ((COSName) m.getProperties().getItem("Subtype")).getName().equals(subtype)) - .map(PDMarkedContent::getContents).flatMap(Collection::stream) + .map(PDMarkedContent::getContents) + 
.flatMap(Collection::stream) .filter(t -> t instanceof TextPosition) .map(t -> (TextPosition) t) .filter(t -> !t.getUnicode().equals(" ")) @@ -41,16 +46,77 @@ public class MarkedContentUtils { return Collections.emptyList(); } - return markedContentByYPosition.values().stream() - .map(textPositions -> new TextPositionSequence(textPositions.stream() - .toList(), 0, true) - .getRectangle()) - .map(t -> new Rectangle2D.Float(t.getTopLeft().getX(), t.getTopLeft().getY() - Math.abs(t.getHeight()), t.getWidth(), Math.abs(t.getHeight()))).collect(Collectors.toList()); + return markedContentByYPosition.values() + .stream() + .map(textPositions -> new TextPositionSequence(textPositions, 0, true).getRectangle()) + .map(t -> new Rectangle2D.Float(t.getTopLeft().getX(), t.getTopLeft().getY() - Math.abs(t.getHeight()), t.getWidth(), Math.abs(t.getHeight()))) + .collect(Collectors.toList()); + } + + + public List getMarkedContentPositions(List markedContents, PDPage pdPage) { + + if (markedContents == null) { + return Collections.emptyList(); + } + + return markedContents.stream() + .filter(m -> !m.getContents().isEmpty()) + .map(markedContent -> MarkedContentPosition.fromPDMarkedContent(markedContent, pdPage)) + .toList(); } public boolean intersects(TextPageBlock textBlock, Map> markedContentBboxPerType, String type) { - return markedContentBboxPerType.get(type) != null && markedContentBboxPerType.get(type).stream().anyMatch(rectangle -> rectangle.intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight())); + + return markedContentBboxPerType.get(type) != null && markedContentBboxPerType.get(type) + .stream() + .anyMatch(rectangle -> rectangle.intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight())); + } + + + public record MarkedContentPosition(String type, String subType, List textPositions) { + + public static MarkedContentPosition fromPDMarkedContent(PDMarkedContent markedContent, 
PDPage pdPage) { + + return new MarkedContentPosition(markedContent.getTag(), parseSubType(markedContent), parseTextPositions(markedContent.getContents(), pdPage)); + } + + + private static List parseTextPositions(List contents, PDPage pdPage) { + + return contents.stream() + .filter(content -> content instanceof TextPosition) + .map(content -> (TextPosition) content) + .filter(content -> !content.getUnicode().equals(" ")) + .map(textPositions -> new TextPositionSequence(List.of(textPositions), 0, true).getRectangle()) + .map(t -> new Rectangle2D.Float(t.getTopLeft().getX(), t.getTopLeft().getY() - Math.abs(t.getHeight()), t.getWidth(), Math.abs(t.getHeight()))) + .collect(Collectors.toList()); + } + + + private static String parseSubType(PDMarkedContent markedContent) { + + if (markedContent == null || markedContent.getProperties() == null || markedContent.getProperties().getItem("Subtype") == null) { + return null; + } + + return ((COSName) markedContent.getProperties().getItem("Subtype")).getName(); + } + + + public String formattedType() { + + if (subType == null || subType.isEmpty()) { + return type; + } + if (type.equals("Artifact")) { + return subType; + } + return String.format("%s-%s", type, subType); + + } + } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/RectangleTransformations.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/RectangleTransformations.java index 14df80a..e61f78b 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/RectangleTransformations.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/RectangleTransformations.java @@ -52,7 +52,10 @@ public class RectangleTransformations { public static 
Rectangle2D bBoxUnionAtomicTextBlock(List atomicTextBlocks) { - return atomicTextBlocks.stream().flatMap(atomicTextBlock -> atomicTextBlock.getPositions().stream()).collect(new Rectangle2DBBoxCollector()); + return atomicTextBlocks.stream() + .flatMap(atomicTextBlock -> atomicTextBlock.getPositions() + .stream()) + .collect(new Rectangle2DBBoxCollector()); } @@ -77,7 +80,10 @@ public class RectangleTransformations { public static Rectangle2D atomicTextBlockBBox(List atomicTextBlocks) { - return atomicTextBlocks.stream().flatMap(atomicTextBlock -> atomicTextBlock.getPositions().stream()).collect(new Rectangle2DBBoxCollector()); + return atomicTextBlocks.stream() + .flatMap(atomicTextBlock -> atomicTextBlock.getPositions() + .stream()) + .collect(new Rectangle2DBBoxCollector()); } @@ -89,16 +95,18 @@ public class RectangleTransformations { public static Rectangle2D rectangleBBox(List rectangles) { - return rectangles.stream().map(RectangleTransformations::toRectangle2D).collect(new Rectangle2DBBoxCollector()); + return rectangles.stream() + .map(RectangleTransformations::toRectangle2D) + .collect(new Rectangle2DBBoxCollector()); } public static Rectangle2D toRectangle2D(Rectangle redactionLogRectangle) { return new Rectangle2D.Double(redactionLogRectangle.getTopLeft().getX(), - redactionLogRectangle.getTopLeft().getY() + redactionLogRectangle.getHeight(), - redactionLogRectangle.getWidth(), - -redactionLogRectangle.getHeight()); + redactionLogRectangle.getTopLeft().getY() + redactionLogRectangle.getHeight(), + redactionLogRectangle.getWidth(), + -redactionLogRectangle.getHeight()); } @@ -111,15 +119,16 @@ public class RectangleTransformations { public static Rectangle toRedactionLogRectangle(Rectangle2D rectangle2D, int pageNumber) { return new Rectangle(new Point((float) rectangle2D.getMinX(), (float) (rectangle2D.getMinY() + rectangle2D.getHeight())), - (float) rectangle2D.getWidth(), - -(float) rectangle2D.getHeight(), - pageNumber); + (float) 
rectangle2D.getWidth(), + -(float) rectangle2D.getHeight(), + pageNumber); } public static Rectangle2D rectangle2DBBox(List rectangle2DList) { - return rectangle2DList.stream().collect(new Rectangle2DBBoxCollector()); + return rectangle2DList.stream() + .collect(new Rectangle2DBBoxCollector()); } @@ -134,7 +143,9 @@ public class RectangleTransformations { if (rectangle2DList.isEmpty()) { return Collections.emptyList(); } - double splitThreshold = rectangle2DList.stream().mapToDouble(RectangularShape::getWidth).average().orElse(5) * 5.0; + double splitThreshold = rectangle2DList.stream() + .mapToDouble(RectangularShape::getWidth).average() + .orElse(5) * 5.0; List> rectangleListsWithGaps = new LinkedList<>(); List rectangleListWithoutGaps = new LinkedList<>(); @@ -195,9 +206,9 @@ public class RectangleTransformations { public BinaryOperator combiner() { return (b1, b2) -> new BBox(Math.min(b1.lowerLeftX, b2.lowerLeftX), - Math.min(b1.lowerLeftY, b2.lowerLeftY), - Math.max(b1.upperRightX, b2.upperRightX), - Math.max(b1.upperRightY, b2.upperRightY)); + Math.min(b1.lowerLeftY, b2.lowerLeftY), + Math.max(b1.upperRightX, b2.upperRightX), + Math.max(b1.upperRightY, b2.upperRightY)); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/RectangularIntersectionFinder.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/RectangularIntersectionFinder.java index 3f47b40..53128a9 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/RectangularIntersectionFinder.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/RectangularIntersectionFinder.java @@ -14,23 +14,24 @@ public class RectangularIntersectionFinder { public static List find(List 
horizontalRulingLines, List verticalRulingLines) { - // Fix for 211.pdf - for (Ruling r : horizontalRulingLines) { - if (r.getX2() < r.getX1()) { - double a = r.getX2(); - r.x2 = (float) r.getX1(); - r.x1 = (float) a; - } - } +// // Fix for 211.pdf +// for (Ruling r : horizontalRulingLines) { +// if (r.getX2() < r.getX1()) { +// double a = r.getX2(); +// r.x2 = (float) r.getX1(); +// r.x1 = (float) a; +// } +// } List foundRectangles = new ArrayList<>(); - Map intersectionPoints = Ruling.findIntersections(horizontalRulingLines, verticalRulingLines); + Map intersectionPoints = RulingIntersectionFinder.findNaive(horizontalRulingLines, verticalRulingLines); + List intersectionPointsList = new ArrayList<>(intersectionPoints.keySet()); intersectionPointsList.sort(Y_FIRST_POINT_COMPARATOR); for (int i = 0; i < intersectionPointsList.size(); i++) { Point2D topLeft = intersectionPointsList.get(i); - Ruling[] hv = intersectionPoints.get(topLeft); + RulingIntersectionFinder.IntersectingRulings intersectingRulingsFromTopLeft = intersectionPoints.get(topLeft); // CrossingPointsDirectlyBelow( topLeft ); List xPoints = new ArrayList<>(); @@ -48,18 +49,19 @@ public class RectangularIntersectionFinder { outer: for (Point2D xPoint : xPoints) { // is there a vertical edge b/w topLeft and xPoint? - if (!hv[1].equals(intersectionPoints.get(xPoint)[1])) { + if (!intersectingRulingsFromTopLeft.vertical().equals(intersectionPoints.get(xPoint).vertical())) { continue; } for (Point2D yPoint : yPoints) { // is there a horizontal edge b/w topLeft and yPoint ? 
- if (!hv[0].equals(intersectionPoints.get(yPoint)[0])) { + if (!intersectingRulingsFromTopLeft.horizontal().equals(intersectionPoints.get(yPoint).horizontal())) { continue; } Point2D btmRight = new Point2D.Float((float) yPoint.getX(), (float) xPoint.getY()); if (intersectionPoints.containsKey(btmRight) - && intersectionPoints.get(btmRight)[0].equals(intersectionPoints.get(xPoint)[0]) - && intersectionPoints.get(btmRight)[1].equals(intersectionPoints.get(yPoint)[1])) { + && intersectionPoints.get(btmRight).horizontal().equals(intersectionPoints.get(xPoint).horizontal()) + && intersectionPoints.get(btmRight).vertical().equals(intersectionPoints.get(yPoint).vertical())) { + foundRectangles.add(new Rectangle2D.Double(topLeft.getX(), topLeft.getY(), btmRight.getX() - topLeft.getX(), btmRight.getY() - topLeft.getY())); break outer; } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/RulingIntersectionFinder.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/RulingIntersectionFinder.java new file mode 100644 index 0000000..70ea31b --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/RulingIntersectionFinder.java @@ -0,0 +1,201 @@ +package com.knecon.fforesight.service.layoutparser.processor.utils; + +import java.awt.geom.Point2D; +import java.util.Collections; +import java.util.Comparator; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.TreeMap; + +import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; + +import lombok.experimental.UtilityClass; +import lombok.extern.slf4j.Slf4j; + +@Slf4j +@UtilityClass +public class RulingIntersectionFinder { + + public static final int PERPENDICULAR_UNIT_EXPAND_AMOUNT = 2; + + 
public static final Comparator Y_THEN_X_POINT_COMPARATOR = Comparator.comparingDouble(Point2D::getY).thenComparing(Point2D::getX); + + + /** + * Implementation to find line intersection in O(P + n log n), where n is the number of lines and P the number of intersections + * based on Segment Intersection by Piotr Indyk + * The algorithm assumes there are only horizontal and vertical lines which are unique in their coordinates. (E.g. no overlapping horizontal lines exist) + * As a high level overview, the algorithm uses a sweep line advancing from left to right. + * It dynamically updates the horizontal rulings which are intersected by the current sweep line. + * When the sweep line hits a vertical line, it then checks for all intersections with the currently intersected horizontal rulings. + * The trick of the algorithm is using a binary search tree to store the currently intersected horizontal rulings. This way the lookup should be in O(log n). + * This way the initial sorting step has the highest complexity class (O(n log n)) and thus determines the complexity class of the entire algorithm. + * + * Unfortunately, the implementation here takes a few liberties compared to the original algorithm. The binary search tree is replaced by an ordered Set which is simply looped over. + * Therefore, in this implementation's worst case, where all horizontal lines span the entire sweep, you are essentially performing the naive approach with a bunch of overhead. + * Since we are using this implementation to find table cells, one can expect this worst case to always be the case. + * + * A simple runtime comparison for a single page with the most lines we can expect (SinglePages/AbsolutelyEnormousTable.pdf with 30 horizontals and 144 verticals) shows this implementation takes roughly 14 ms, whereas the naive approach takes 7 ms. Both are negligible, but the naive approach is two times as fast.
+ * + * If we would like to make this faster, we would need a better data structure for 'TreeMap horizontalRulingsInCurrentSweep', where we can query the TreeMap for all horizontal rulings in a given interval in O(log n). + * + * @param horizontals a list of non-overlapping horizontal rulings + * @param verticals a list of non-overlapping vertical rulings + * @return a Map of each found intersection point pointing to the two lines forming the intersection. + */ + public Map find(List horizontals, List verticals) { + + long start = System.currentTimeMillis(); + List sweepTrajectory = buildSweepTrajectory(horizontals, verticals); + + TreeMap horizontalRulingsInCurrentSweep = new TreeMap<>(Comparator.comparingDouble(Ruling::getTop)); + + TreeMap intersections = new TreeMap<>(Y_THEN_X_POINT_COMPARATOR); + + for (SweepStep step : sweepTrajectory) { + switch (step.type) { + case VERTICAL: // check for intersections with currently intersected horizontal lines + for (Ruling horizontalRuling : horizontalRulingsInCurrentSweep.navigableKeySet()) { + + Optional intersectionPoint = findIntersectionPoint(horizontalRuling, step.ruling); + + if (intersectionPoint.isEmpty()) { + continue; + } + + intersections.put(intersectionPoint.get(), new IntersectingRulings(horizontalRuling, step.ruling)); + } + break; + case HORIZONTAL_ENTRY: // sweep line now intersects this horizontal ruling + horizontalRulingsInCurrentSweep.put(step.ruling, null); + break; + case HORIZONTAL_EXIT: // sweep line no longer intersects this horizontal ruling + horizontalRulingsInCurrentSweep.remove(step.ruling); + break; + } + } + log.debug("Finished building intersections with line sweep in {} ms", System.currentTimeMillis() - start); + + return intersections; + + } + + + /** + * Naive Approach in O(n^2) of finding intersections between lines by iterating over all lines. 
+ * + * @param horizontals a list of non-overlapping horizontal rulings + * @param verticals a list of non-overlapping vertical rulings + * @return a Map of each found intersection point pointing to the two lines forming the intersection. + */ + public Map findNaive(List horizontals, List verticals) { + + long start = System.currentTimeMillis(); + TreeMap intersections = new TreeMap<>(Y_THEN_X_POINT_COMPARATOR); + + for (Ruling horizontal : horizontals) { + for (Ruling vertical : verticals) { + Optional intersectionPoint = findIntersectionPoint(horizontal, vertical); + + if (intersectionPoint.isEmpty()) { + continue; + } + + intersections.put(intersectionPoint.get(), new IntersectingRulings(horizontal, vertical)); + } + } + log.debug("Finished building intersections naively in {} ms", System.currentTimeMillis() - start); + + return intersections; + } + + + private static List buildSweepTrajectory(List horizontals, List verticals) { + + List sweepTrajectory = new LinkedList<>(); + + for (Ruling horizontalRuling : horizontals) { + sweepTrajectory.add(new SweepStep(SweepStep.Type.HORIZONTAL_ENTRY, horizontalRuling.getLeft() - PERPENDICULAR_UNIT_EXPAND_AMOUNT, horizontalRuling)); + sweepTrajectory.add(new SweepStep(SweepStep.Type.HORIZONTAL_EXIT, horizontalRuling.getRight() + PERPENDICULAR_UNIT_EXPAND_AMOUNT, horizontalRuling)); + } + + for (Ruling verticalRuling : verticals) { + sweepTrajectory.add(new SweepStep(SweepStep.Type.VERTICAL, verticalRuling.getLeft(), verticalRuling)); + } + + Collections.sort(sweepTrajectory); + + return sweepTrajectory; + } + + + public Optional findIntersectionPoint(Ruling horizontal, Ruling vertical) { + + if (!horizontal.isHorizontal() || !vertical.isVertical()) { + log.warn("lines must be orthogonal, vertical and horizontal"); + return Optional.empty(); + } + + Ruling expanded_horizontal = horizontal.expand(PERPENDICULAR_UNIT_EXPAND_AMOUNT); + Ruling expanded_vertical = vertical.expand(PERPENDICULAR_UNIT_EXPAND_AMOUNT); + + if 
(!expanded_horizontal.intersectsLine(expanded_vertical)) { + return Optional.empty(); + } + + return Optional.of(new Point2D.Float(vertical.getLeft(), horizontal.getTop())); + } + + + private class SweepStep implements Comparable { + + protected Type type; + protected float y_position; + protected Ruling ruling; + + private enum Type { + VERTICAL, + HORIZONTAL_EXIT, + HORIZONTAL_ENTRY + } + + + public SweepStep(Type type, float y_position, Ruling ruling) { + + this.type = type; + this.y_position = y_position; + this.ruling = ruling; + } + + + @Override + public int compareTo(SweepStep other) { + + int rv; + if (DoubleComparisons.feq(y_position, other.y_position)) { + if (type == SweepStep.Type.VERTICAL && other.type == SweepStep.Type.HORIZONTAL_ENTRY) { + rv = 1; + } else if (type == SweepStep.Type.VERTICAL && other.type == SweepStep.Type.HORIZONTAL_EXIT) { + rv = -1; + } else if (type == SweepStep.Type.HORIZONTAL_ENTRY && other.type == SweepStep.Type.VERTICAL) { + rv = -1; + } else if (type == SweepStep.Type.HORIZONTAL_EXIT && other.type == SweepStep.Type.VERTICAL) { + rv = 1; + } else { + rv = Double.compare(y_position, other.y_position); + } + } else { + return Double.compare(y_position, other.y_position); + } + return rv; + } + + } + + public record IntersectingRulings(Ruling horizontal, Ruling vertical) { + + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutparsingVisualizations.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutparsingVisualizations.java new file mode 100644 index 0000000..77eb8f3 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutparsingVisualizations.java @@ -0,0 +1,252 @@ +package 
com.knecon.fforesight.service.layoutparser.processor.visualization; + +import java.awt.Color; +import java.awt.geom.Line2D; +import java.awt.geom.Point2D; +import java.awt.geom.Rectangle2D; +import java.util.Collection; +import java.util.List; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.stream.Stream; + +import org.apache.pdfbox.pdmodel.PDPage; +import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent; + +import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle; +import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox; +import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Line; +import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone; +import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; +import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils; +import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; +import com.knecon.fforesight.service.viewerdoc.ContentStreams; +import com.knecon.fforesight.service.viewerdoc.model.ColoredLine; +import com.knecon.fforesight.service.viewerdoc.model.ColoredRectangle; +import com.knecon.fforesight.service.viewerdoc.model.PlacedText; +import com.knecon.fforesight.service.viewerdoc.model.Standard14EmbeddableFont; +import com.knecon.fforesight.service.viewerdoc.model.Visualizations; +import com.knecon.fforesight.service.viewerdoc.model.VisualizationsOnPage; + +import lombok.AccessLevel; +import lombok.AllArgsConstructor; +import lombok.Getter; +import lombok.NoArgsConstructor; +import lombok.Setter; +import lombok.experimental.FieldDefaults; + +@Getter +@NoArgsConstructor +@AllArgsConstructor 
+@FieldDefaults(level = AccessLevel.PRIVATE)
+public class LayoutparsingVisualizations {
+
+    // Collects the optional debug overlays of the layout parser, one Visualizations object per
+    // viewer-document layer. Every add* method returns immediately while "active" is false and
+    // otherwise files its result under the key (page number - 1).
+
+    static final Standard14EmbeddableFont FONT = Standard14EmbeddableFont.helvetica();
+
+    // Font size of the marked-content labels; the same value scales the label width computation.
+    static final int LABEL_FONT_SIZE = 10;
+
+    static final Color WORDS_COLOR = new Color(68, 84, 147);
+    static final Color LINES_COLOR = new Color(152, 45, 179);
+    static final Color ZONES_COLOR = new Color(131, 38, 38);
+    static final Color RULINGS_COLOR = new Color(21, 221, 174);
+    static final Color CELLS_COLOR = new Color(31, 214, 27);
+    static final Color MAIN_BODY_COLOR = new Color(171, 131, 6);
+    static final Color MARKED_CONTENT_COLOR = new Color(171, 131, 6);
+
+    // Palette cycled through character by character in addCharactersWithNeighbours.
+    static final List<Color> ROTATING_CHARACTER_COLOR = List.of(new Color(255, 87, 51),
+            new Color(255, 195, 0),
+            new Color(76, 175, 80),
+            new Color(33, 150, 243),
+            new Color(155, 89, 182),
+            new Color(233, 30, 99),
+            new Color(0, 188, 212),
+            new Color(121, 85, 72));
+
+    @Setter
+    boolean active = false;
+
+    final Visualizations words = Visualizations.builder().layer(ContentStreams.WORDS).build();
+    final Visualizations lines = Visualizations.builder().layer(ContentStreams.LINES).build();
+    final Visualizations zones = Visualizations.builder().layer(ContentStreams.ZONES).build();
+    final Visualizations mainBody = Visualizations.builder().layer(ContentStreams.MAIN_BODY).build();
+    final Visualizations rulings = Visualizations.builder().layer(ContentStreams.RULINGS).build();
+    final Visualizations cells = Visualizations.builder().layer(ContentStreams.CELLS).build();
+    final Visualizations markedContent = Visualizations.builder().layer(ContentStreams.MARKED_CONTENT).build();
+    final Visualizations neighbours = Visualizations.builder().layer(ContentStreams.NEIGHBOURS).build();
+    final Visualizations characters = Visualizations.builder().layer(ContentStreams.CHARACTERS).build();
+
+
+    /**
+     * Streams every layer held by this object.
+     *
+     * @return all nine layers, or an empty stream while the visualizations are inactive
+     */
+    public Stream<Visualizations> streamAll() {
+
+        if (!active) {
+            return Stream.empty();
+        }
+        return Stream.of(characters, //
+                neighbours,//
+                words, //
+                lines, //
+                zones, //
+                rulings, //
+                cells, //
+                mainBody, //
+                markedContent //
+        );
+    }
+
+
+    /**
+     * Adds one rectangle per text position sequence to the words layer.
+     *
+     * @param textPositionSequences sequences whose text positions are merged into one bounding box each
+     * @param pageNumber            page number; the result is stored under {@code pageNumber - 1}
+     */
+    public void addTextVisualizations(List<TextPositionSequence> textPositionSequences, int pageNumber) {
+
+        if (!active) {
+            return;
+        }
+        List<ColoredRectangle> list = textPositionSequences.stream()
+                .map(textPositionSequence -> textPositionSequence.getTextPositions()
+                        .stream()
+                        .map(RedTextPosition::getInitialUserSpacePosition)
+                        .collect(RectangleTransformations.collectBBox()))
+                .map(rect -> new ColoredRectangle(rect, WORDS_COLOR, 1))
+                .toList();
+        this.words.getVisualizationsOnPages().put(pageNumber - 1, VisualizationsOnPage.builder().coloredRectangles(list).build());
+    }
+
+
+    /**
+     * Adds the horizontal and vertical rulings as lines to the rulings layer.
+     *
+     * @param cleanRulings rulings of the page
+     * @param pageNumber   page number; the result is stored under {@code pageNumber - 1}
+     */
+    public void addCleanRulingVisualization(CleanRulings cleanRulings, int pageNumber) {
+
+        if (!active) {
+            return;
+        }
+        this.rulings.getVisualizationsOnPages()
+                .put(pageNumber - 1,
+                     VisualizationsOnPage.builder()
+                             .coloredLines(Stream.of(cleanRulings.getHorizontal(), cleanRulings.getVertical())
+                                                   .flatMap(Collection::stream)
+                                                   .map(ruling -> new ColoredLine(ruling, RULINGS_COLOR, 1))
+                                                   .toList())
+                             .build());
+    }
+
+
+    /**
+     * Adds one rectangle per cell to the cells layer.
+     *
+     * @param cells      cell rectangles of the page
+     * @param pageNumber page number; the result is stored under {@code pageNumber - 1}
+     */
+    public void addCellVisualizations(List<? extends Rectangle2D> cells, int pageNumber) {
+
+        if (!active) {
+            return;
+        }
+        this.cells.getVisualizationsOnPages()
+                .put(pageNumber - 1,
+                     VisualizationsOnPage.builder()
+                             .coloredRectangles(cells.stream()
+                                                        .map(cell -> new ColoredRectangle(cell, CELLS_COLOR, 1))
+                                                        .toList())
+                             .build());
+    }
+
+
+    /**
+     * Adds the bounding box of every zone to the zones layer.
+     *
+     * @param zones zones of the page
+     * @param page  page number; the result is stored under {@code page - 1}
+     */
+    public void addZoneVisualizations(List<Zone> zones, int page) {
+
+        if (!active) {
+            return;
+        }
+
+        this.zones.getVisualizationsOnPages()
+                .put(page - 1,
+                     VisualizationsOnPage.builder()
+                             .coloredRectangles(zones.stream()
+                                                        .map(BoundingBox::getBBox)
+                                                        .map(zone -> new ColoredRectangle(zone, ZONES_COLOR, 1))
+                                                        .toList())
+                             .build());
+
+    }
+
+
+    /**
+     * Adds the bounding box of every line of every zone to the lines layer.
+     *
+     * @param zones zones of the page whose lines are drawn
+     * @param page  page number; the result is stored under {@code page - 1}
+     */
+    public void addLineVisualizations(List<Zone> zones, int page) {
+
+        if (!active) {
+            return;
+        }
+        this.lines.getVisualizationsOnPages()
+                .put(page - 1,
+                     VisualizationsOnPage.builder()
+                             .coloredRectangles(zones.stream()
+                                                        .map(Zone::getLines)
+                                                        .flatMap(Collection::stream)
+                                                        .map(BoundingBox::getBBox)
+                                                        .map(line -> new ColoredRectangle(line, LINES_COLOR, 1))
+                                                        .toList())
+                             .build());
+    }
+
+
+    /**
+     * Adds a single rectangle, built from the top-left corner, width and height of {@code rectangle}, to the main body layer.
+     *
+     * @param rectangle  main body area of the page
+     * @param pageNumber page number; the result is stored under {@code pageNumber - 1}
+     */
+    public void addMainBodyVisualization(Rectangle rectangle, int pageNumber) {
+
+        if (!active) {
+            return;
+        }
+        this.mainBody.getVisualizationsOnPages()
+                .put(pageNumber - 1,
+                     VisualizationsOnPage.builder()
+                             .coloredRectangles(List.of(new ColoredRectangle(new Rectangle2D.Double(rectangle.getTopLeft().getX(),
+                                                                                                    rectangle.getTopLeft().getY(),
+                                                                                                    rectangle.getWidth(),
+                                                                                                    rectangle.getHeight()), MAIN_BODY_COLOR, 1)))
+                             .build());
+    }
+
+
+    /**
+     * Adds a rectangle and a type label for every marked content position to the marked content layer.
+     *
+     * @param markedContents marked contents of the page
+     * @param pageNumber     page number; the result is stored under {@code pageNumber - 1}
+     * @param pdPage         page handed to {@code MarkedContentUtils.getMarkedContentPositions}
+     */
+    public void addMarkedContentVisualizations(List<PDMarkedContent> markedContents, int pageNumber, PDPage pdPage) {
+
+        if (!active) {
+            return;
+        }
+        var markedContentPositions = MarkedContentUtils.getMarkedContentPositions(markedContents, pdPage);
+        VisualizationsOnPage visualizationsOnPage = VisualizationsOnPage.builder().build();
+        this.markedContent.getVisualizationsOnPages().put(pageNumber - 1, visualizationsOnPage);
+        markedContentPositions.forEach(markedContentPosition -> {
+
+            var bbox = markedContentPosition.textPositions()
+                    .stream()
+                    .collect(RectangleTransformations.collectBBox());
+            String type = markedContentPosition.formattedType();
+            // Shift the label left by its own width plus a fixed margin so it does not cover the box.
+            // NOTE(review): presumably getStringWidth reports 1/1000 of the font size, as in PDFBox - confirm.
+            float translationAmount = ((FONT.getStringWidth(type) / 1000) * LABEL_FONT_SIZE + (2 * 1) + 4);
+            visualizationsOnPage.getPlacedTexts()
+                    .add(PlacedText.textFacingUp(type, new Point2D.Double(bbox.getX() - translationAmount, bbox.getY() + bbox.getHeight()), LABEL_FONT_SIZE, Color.BLACK, FONT));
+
+            visualizationsOnPage.getColoredRectangles().add(new ColoredRectangle(bbox, MARKED_CONTENT_COLOR, 1));
+        }
+
+        );
+
+    }
+
+
+    /**
+     * Adds a rectangle per character to the characters layer and, for each of its neighbors, a line
+     * between the two box centers to the neighbours layer; both use the character's palette color.
+     *
+     * @param zones zones of the page whose characters are drawn
+     * @param page  page number; the results are stored under {@code page - 1}
+     */
+    public void addCharactersWithNeighbours(List<Zone> zones, int page) {
+
+        if (!active) {
+            return;
+        }
+
+        VisualizationsOnPage neighbourVisualizations = VisualizationsOnPage.builder().build();
+        neighbours.getVisualizationsOnPages().put(page - 1, neighbourVisualizations);
+        VisualizationsOnPage characterVisualizations = VisualizationsOnPage.builder().build();
+        characters.getVisualizationsOnPages().put(page - 1, characterVisualizations);
+
+        // Running character index across all zones; selects the palette entry.
+        AtomicInteger index = new AtomicInteger(0);
+        zones.forEach(zone -> zone.getLines()
+                .stream()
+                .map(Line::getCharacters)
+                .flatMap(Collection::stream)
+                .forEach(character -> {
+                    Color color = ROTATING_CHARACTER_COLOR.get(index.getAndIncrement() % ROTATING_CHARACTER_COLOR.size());
+                    Rectangle2D charBBox = character.getTextPosition().getInitialUserSpacePosition();
+                    characterVisualizations.getColoredRectangles().add(new ColoredRectangle(charBBox, color, 1));
+                    character.getNeighbors()
+                            .forEach(neighbor -> {
+                                Rectangle2D neighborBBox = neighbor.getCharacter().getTextPosition().getInitialUserSpacePosition();
+                                Line2D line = new Line2D.Double(new Point2D.Double(charBBox.getCenterX(), charBBox.getCenterY()),
+                                                                new Point2D.Double(neighborBBox.getCenterX(), neighborBBox.getCenterY()));
+                                neighbourVisualizations.getColoredLines().add(new ColoredLine(line, color, 1));
+                            });
+                }));
+
+    }
+
+}
diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java
index 7fde740..b5771a2 100644
--- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java
+++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java
@@ -1,10 +1,20 @@
 package com.knecon.fforesight.service.layoutparser.server;
 
+import java.io.File;
+import java.nio.file.Files;
+import java.nio.file.Path;
 import java.util.Arrays;
+import java.util.Comparator;
+import java.util.List;
+import java.util.concurrent.atomic.AtomicInteger;
 
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.Disabled;
 import org.junit.jupiter.api.Test;
+import
org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.core.io.ClassPathResource;
 
+import com.iqser.red.storage.commons.utils.FileSystemBackedStorageService;
 import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent;
 import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
 import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
@@ -22,26 +32,72 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
 
 
     @Test
-    @SneakyThrows
     public void testLayoutParserEndToEnd() {
 
-        prepareStorage("files/bdr/Wie weiter bei Kristeneinrichtungen.pdf");
-        LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER);
-        LayoutParsingFinishedEvent finishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest);
-        Arrays.stream(finishedEvent.message().split("\n"))
-                .forEach(log::info);
+        String filePath = "files/bdr/Wie weiter bei Kristeneinrichtungen.pdf";
+
+        runForFile(filePath);
+    }
+
+    @Test
+    @Disabled
+    @SneakyThrows
+    public void testLayoutParserEndToEndWithFolder() {
+
+        String folder = "/home/kschuettler/Dokumente/TestFiles/syn-dm-testfiles-pdftron-ocred";
+        List<Path> pdfFiles;
+        // Files.walk keeps directory handles open until the stream is closed.
+        try (var paths = Files.walk(Path.of(folder))) {
+            pdfFiles = paths.filter(path -> path.getFileName().toString().endsWith(".pdf"))
+                    .sorted(Comparator.comparing(Path::getFileName))
+                    .peek(System.out::println)
+                    .toList();
+        }
+
+        System.out.printf("Found %d pdf files to process %n", pdfFiles.size());
+        AtomicInteger count = new AtomicInteger(0);
+        pdfFiles.stream()
+                .peek(path -> log.info("{}/{}-{}", count.getAndIncrement(), pdfFiles.size(), path.getFileName()))
+                .forEach(path -> runForFile(path.toFile().toString()));
     }
 
 
-    @Test
+    /**
+     * Runs the pipeline for one PDF and downloads the resulting viewer document to /tmp/layout-E2E.
+     *
+     * @param filePath resolved from the classpath when it starts with "files", otherwise used as a file system path
+     */
     @SneakyThrows
-    public void testLayoutParserEndToEnd_RED_8747() {
+    private void runForFile(String filePath) {
+
+        String fileName = Path.of(filePath).getFileName().toString();
+        File file;
+        if (filePath.startsWith("files")) { // from resources
+            file = new ClassPathResource(filePath).getFile();
+        } else { // absolute path
+            file = new File(filePath);
+        }
+
+        LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(fileName, LayoutParsingType.REDACT_MANAGER, true);
+        prepareStorage(layoutParsingRequest, file);
 
-        prepareStorage("files/syngenta/CustomerFiles/SinglePages/Page26_fRR A23317A PI0015600 CEU core part B6 - CZ.pdf");
-        LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER_OLD);
         LayoutParsingFinishedEvent finishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest);
+
         Arrays.stream(finishedEvent.message().split("\n"))
                 .forEach(log::info);
+
+        File tmpFile = new File("/tmp/layout-E2E/" + fileName + "_VIEWER.pdf");
+        // Create the directory outside of an assert: assert statements are skipped when the JVM runs without -ea.
+        Files.createDirectories(tmpFile.toPath().getParent());
+
+        storageService.downloadTo(TENANT_ID, layoutParsingRequest.viewerDocumentStorageId(), tmpFile);
+    }
+
+
+    @AfterEach
+    public void cleanUpTmp() {
+
+        ((FileSystemBackedStorageService) storageService).clearStorage();
     }
 
 }
diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java
index a26754a..5ebb0cc 100644
--- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java
+++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java
@@ -23,6 +23,10 @@ import lombok.SneakyThrows;
 public class ViewerDocumentTest extends BuildDocumentTest {
 
+    ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null);
+    LayoutGridService
layoutGridService = new LayoutGridService(viewerDocumentService); + + @Test @SneakyThrows public void testViewerDocument() { @@ -31,11 +35,9 @@ public class ViewerDocumentTest extends BuildDocumentTest { String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf"; var documentFile = new ClassPathResource(fileName).getFile(); - ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null); - LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService); long start = System.currentTimeMillis(); - Document document = buildGraph(fileName, LayoutParsingType.DOCUMINE); + Document document = buildGraph(fileName, LayoutParsingType.REDACT_MANAGER); layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true); System.out.printf("Total time: %.2fs%n", ((float) (System.currentTimeMillis() - start)) / 1000); } @@ -55,11 +57,11 @@ public class ViewerDocumentTest extends BuildDocumentTest { var documentFile = new ClassPathResource(fileName).getFile(); var classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE, - documentFile, - new ImageServiceResponse(), - tableResponse, - new VisualLayoutParsingResponse(), - Map.of("file", Path.of(fileName).getFileName().toFile().toString())); + documentFile, + new ImageServiceResponse(), + tableResponse, + new VisualLayoutParsingResponse(), + Map.of("file", Path.of(fileName).getFileName().toFile().toString())); ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null); LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService); Document document = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.DOCUMINE, classificationDocument); diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/AbstractTest.java 
b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/AbstractTest.java
index c0e2809..cfd8cb9 100644
--- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/AbstractTest.java
+++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/AbstractTest.java
@@ -1,6 +1,9 @@
 package com.knecon.fforesight.service.layoutparser.server.utils;
 
+import java.io.File;
+import java.io.FileInputStream;
 import java.io.InputStream;
+import java.nio.file.Path;
 import java.util.Map;
 import java.util.Optional;
 
@@ -102,29 +105,27 @@ public abstract class AbstractTest {
     }
 
 
-    @SneakyThrows
-    protected LayoutParsingRequest prepareStorage(InputStream fileInputStream) {
-
-        storageService.storeObject(TenantContext.getTenantId(), ORIGIN_FILE_ID, fileInputStream);
-        return buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER_OLD);
-    }
-
-
-    protected LayoutParsingRequest buildDefaultLayoutParsingRequest(LayoutParsingType layoutParsingType) {
+    /**
+     * Builds a request whose storage IDs are the default IDs prefixed with {@code fileName}.
+     *
+     * @param debug when true, the identifier additionally carries the entry "debug" = "true"
+     */
+    protected LayoutParsingRequest buildDefaultLayoutParsingRequest(String fileName, LayoutParsingType layoutParsingType, boolean debug) {
 
+        var identifier = debug ? Map.of("fileId", fileName, "debug", "true") : Map.of("fileId", fileName);
         return LayoutParsingRequest.builder()
-                .identifier(Map.of("fileId", "1337"))
+                .identifier(identifier)
                 .layoutParsingType(layoutParsingType)
-                .originFileStorageId(ORIGIN_FILE_ID)
-                .tablesFileStorageId(Optional.of(TABLE_FILE_ID))
-                .imagesFileStorageId(Optional.of(IMAGE_FILE_ID))
-                .visualLayoutParsingFileId(Optional.of(VISUAL_LAYOUT_FILE))
-                .structureFileStorageId(STRUCTURE_FILE_ID)
-                .textBlockFileStorageId(TEXT_FILE_ID)
-                .positionBlockFileStorageId(POSITION_FILE_ID)
-                .pageFileStorageId(PAGES_FILE_ID)
-                .simplifiedTextStorageId(SIMPLIFIED_ID)
-                .viewerDocumentStorageId(VIEWER_DOCUMENT_ID)
+                .originFileStorageId(fileName + ORIGIN_FILE_ID)
+                .tablesFileStorageId(Optional.of(fileName + TABLE_FILE_ID))
+                .imagesFileStorageId(Optional.of(fileName + IMAGE_FILE_ID))
+                .visualLayoutParsingFileId(Optional.of(fileName + VISUAL_LAYOUT_FILE))
+                .structureFileStorageId(fileName + STRUCTURE_FILE_ID)
+                .textBlockFileStorageId(fileName + TEXT_FILE_ID)
+                .positionBlockFileStorageId(fileName + POSITION_FILE_ID)
+                .pageFileStorageId(fileName + PAGES_FILE_ID)
+                .simplifiedTextStorageId(fileName + SIMPLIFIED_ID)
+                .viewerDocumentStorageId(fileName + VIEWER_DOCUMENT_ID)
                 .build();
     }
 
@@ -148,10 +149,32 @@ public abstract class AbstractTest {
         ClassPathResource imageInfoFileResource = new ClassPathResource(imageInfoFile);
         ClassPathResource visualLayoutParsingResponseResource = new ClassPathResource(visualLayoutParsingResponseFile);
 
-        return prepareStorage(pdfFileResource.getInputStream(),
-                              cvServiceResponseFileResource.getInputStream(),
-                              imageInfoFileResource.getInputStream(),
-                              visualLayoutParsingResponseResource.getInputStream());
+        return prepareStorage(Path.of(file).getFileName().toString(),
+                              pdfFileResource.getInputStream(),
+                              cvServiceResponseFileResource.getInputStream(),
+                              imageInfoFileResource.getInputStream(),
+                              visualLayoutParsingResponseResource.getInputStream());
+    }
+
+
+    /**
+     * Stores {@code file} as the origin file under the IDs of {@code layoutParsingRequest}, together
+     * with the {@code empty.json} table, image and visual-layout responses from the test resources.
+     */
+    @SneakyThrows
+    protected void prepareStorage(LayoutParsingRequest layoutParsingRequest, File file) {
+
+        ClassPathResource cvServiceResponseFileResource = new ClassPathResource("cv_table_parsing_response/empty.json");
+        ClassPathResource imageInfoFileResource = new ClassPathResource("image_service_response/empty.json");
+        ClassPathResource visualLayoutParsingResponseResource = new ClassPathResource("visual_layout_parsing_response/empty.json");
+
+        // All four streams are opened here, so all four are closed here.
+        try (var in = new FileInputStream(file);
+                var cvServiceResponseStream = cvServiceResponseFileResource.getInputStream();
+                var imageInfoStream = imageInfoFileResource.getInputStream();
+                var visualLayoutParsingResponseStream = visualLayoutParsingResponseResource.getInputStream()) {
+            prepareStorage(layoutParsingRequest, in, cvServiceResponseStream, imageInfoStream, visualLayoutParsingResponseStream);
+        }
     }
 
 
@@ -162,12 +185,30 @@ public abstract class AbstractTest {
         storageService.storeObject(TenantContext.getTenantId(), TABLE_FILE_ID, cvServiceResponseFileStream);
         storageService.storeObject(TenantContext.getTenantId(), ORIGIN_FILE_ID, fileStream);
 
-        return buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER_OLD);
+        // NOTE(review): the streams above are stored under the constant IDs, but the returned request
+        // carries "test"-prefixed IDs - confirm that no caller resolves the files through the request.
+        return buildDefaultLayoutParsingRequest("test", LayoutParsingType.REDACT_MANAGER_OLD, true);
     }
 
 
     @SneakyThrows
-    protected LayoutParsingRequest prepareStorage(InputStream fileStream,
+    protected void prepareStorage(LayoutParsingRequest layoutParsingRequest,
+                                  InputStream fileStream,
+                                  InputStream cvServiceResponseFileStream,
+                                  InputStream imageInfoStream,
+                                  InputStream visualLayoutParsingResponseFileStream) {
+
+        // Each stream is stored under the matching ID of the request; the optional IDs must be present.
+        storageService.storeObject(TenantContext.getTenantId(), layoutParsingRequest.imagesFileStorageId().get(), imageInfoStream);
+        storageService.storeObject(TenantContext.getTenantId(), layoutParsingRequest.tablesFileStorageId().get(), cvServiceResponseFileStream);
+        storageService.storeObject(TenantContext.getTenantId(), layoutParsingRequest.originFileStorageId(), fileStream);
+        storageService.storeObject(TenantContext.getTenantId(), layoutParsingRequest.visualLayoutParsingFileId().get(), visualLayoutParsingResponseFileStream);
+    }
+
+
+    @SneakyThrows
+    protected LayoutParsingRequest prepareStorage(String fileName,
+                                                  InputStream fileStream,
                                                   InputStream cvServiceResponseFileStream,
                                                   InputStream imageInfoStream,
                                                   InputStream visualLayoutParsingResponseFileStream) {
@@ -177,7 +218,8 @@ public abstract class AbstractTest {
         storageService.storeObject(TenantContext.getTenantId(), ORIGIN_FILE_ID, fileStream);
         storageService.storeObject(TenantContext.getTenantId(), VISUAL_LAYOUT_FILE, visualLayoutParsingResponseFileStream);
 
-        return buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER_OLD);
+        // NOTE(review): stored under the constant IDs, while the returned request uses fileName-prefixed IDs - confirm.
+        return buildDefaultLayoutParsingRequest(fileName, LayoutParsingType.REDACT_MANAGER_OLD, true);
     }
 
 
diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/BuildDocumentTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/BuildDocumentTest.java
index af2717b..cbd6201 100644
--- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/BuildDocumentTest.java
+++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/BuildDocumentTest.java
@@ -1,11 +1,13 @@
 package com.knecon.fforesight.service.layoutparser.server.utils;
 
 import java.io.File;
+import java.nio.file.Path;
 import java.util.Map;
 
 import org.springframework.beans.factory.annotation.Autowired;
 import org.springframework.core.io.ClassPathResource;
 
+import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
 import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
 import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline;
 import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
@@ -28,11 +30,11 @@ public abstract class BuildDocumentTest extends AbstractTest {
 
         File fileResource = new
ClassPathResource(filename).getFile();
         prepareStorage(filename);
         return layoutParsingPipeline.parseLayout(layoutParsingType,
-                fileResource,
-                layoutParsingStorageService.getImagesFile(IMAGE_FILE_ID),
-                new TableServiceResponse(),
-                new VisualLayoutParsingResponse(),
-                Map.of("file",filename));
+                                                 fileResource,
+                                                 layoutParsingStorageService.getImagesFile(IMAGE_FILE_ID),
+                                                 new TableServiceResponse(),
+                                                 new VisualLayoutParsingResponse(),
+                                                 Map.of("file", filename, "debug", "true"));
     }
 
 
@@ -46,13 +48,27 @@ public abstract class BuildDocumentTest extends AbstractTest {
     @SneakyThrows
     protected Document buildGraph(String filename, LayoutParsingType layoutParsingType) {
 
-        if (filename.equals("files/syngenta/CustomerFiles/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf")) {
+        // Builds the document graph either for an absolute path or for a test resource.
+        // A name starting with "/" can never start with "files", so one check is enough here.
+        if (filename.startsWith("/")) {
+            File file = new File(filename);
+            LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(Path.of(filename).getFileName().toString(), LayoutParsingType.REDACT_MANAGER, true);
+            prepareStorage(layoutParsingRequest, file);
+            return DocumentGraphFactory.buildDocumentGraph(layoutParsingType,
+                                                           layoutParsingPipeline.parseLayout(layoutParsingType,
+                                                                                             file,
+                                                                                             layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId().get()),
+                                                                                             new TableServiceResponse(),
+                                                                                             new VisualLayoutParsingResponse(),
+                                                                                             layoutParsingRequest.identifier()));
+        }
+
+        if (filename.equals("files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf")) {
             prepareStorage(filename, "cv_table_parsing_response/empty.json", "image_service_response/S-Metolachlor_RAR_01_Volume_1_2018-09-06.IMAGE_INFO.json");
         } else {
             prepareStorage(filename);
         }
         return DocumentGraphFactory.buildDocumentGraph(layoutParsingType, parseLayout(filename, layoutParsingType));
     }
 
 }
diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/files/SinglePages/AbsolutelyEnormousTable.pdf b/layoutparser-service/layoutparser-service-server/src/test/resources/files/SinglePages/AbsolutelyEnormousTable.pdf
new file mode 100644
index 0000000..e6d9a07
Binary files /dev/null and b/layoutparser-service/layoutparser-service-server/src/test/resources/files/SinglePages/AbsolutelyEnormousTable.pdf differ
diff --git a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/ContentStreams.java b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/ContentStreams.java
index 937f75d..d92c039 100644
--- a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/ContentStreams.java
+++ b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/ContentStreams.java
@@ -26,6 +26,24 @@ public class ContentStreams {
 
     public static Identifier ESCAPE_END = new Identifier("escape start", COSName.getPDFName("ESCAPE_END"), false);
+    // NOTE(review): ESCAPE_END is registered under the name "escape start" - looks like a copy-paste slip, confirm.
 
+    public static Identifier RULINGS = new Identifier("Rulings", COSName.getPDFName("KNECON_RULINGS"), true);
+
+    public static Identifier WORDS = new Identifier("Words", COSName.getPDFName("KNECON_WORDS"), true);
+
+    public static Identifier ZONES = new Identifier("Text Zones", COSName.getPDFName("KNECON_ZONES"), true);
+
+    public static Identifier LINES = new Identifier("Text Lines", COSName.getPDFName("KNECON_LINES"), true);
+
+    public static Identifier CELLS = new Identifier("Cells", COSName.getPDFName("KNECON_CELLS"), true);
+
+    public static Identifier MAIN_BODY = new Identifier("Main Text Body", COSName.getPDFName("KNECON_MAIN_BODY"), true);
+
+    public static Identifier MARKED_CONTENT = new Identifier("Marked content",
COSName.getPDFName("KNECON_MARKED_CONTENT"), true); + + public static Identifier NEIGHBOURS = new Identifier("Neighbours", COSName.getPDFName("KNECON_NEIGHBOURS"), true); + public static Identifier CHARACTERS = new Identifier("Characters", COSName.getPDFName("KNECON_CHARACTERS"), true); + public static List allContentStreams = List.of(KNECON_LAYOUT, KNECON_VISUAL_PARSING, KNECON_OCR, @@ -33,7 +50,16 @@ public class ContentStreams { KNECON_OCR_TEXT_DEBUG, OTHER, ESCAPE_START, - ESCAPE_END); + ESCAPE_END, + RULINGS, + WORDS, + ZONES, + LINES, + MAIN_BODY, + MARKED_CONTENT, + NEIGHBOURS, + CHARACTERS, + CELLS); public record Identifier(String name, COSName cosName, boolean optionalContent) { diff --git a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/model/Visualizations.java b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/model/Visualizations.java index fb17113..6af80b9 100644 --- a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/model/Visualizations.java +++ b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/model/Visualizations.java @@ -1,5 +1,6 @@ package com.knecon.fforesight.service.viewerdoc.model; +import java.util.LinkedHashMap; import java.util.Map; import com.knecon.fforesight.service.viewerdoc.ContentStreams; @@ -17,7 +18,8 @@ import lombok.experimental.FieldDefaults; public class Visualizations { ContentStreams.Identifier layer; - Map visualizationsOnPages; + @Builder.Default + Map visualizationsOnPages = new LinkedHashMap<>(); boolean layerVisibilityDefaultValue; } diff --git a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/ViewerDocumentService.java b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/ViewerDocumentService.java index 040b81b..c761c69 
100644 --- a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/ViewerDocumentService.java +++ b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/ViewerDocumentService.java @@ -53,12 +53,6 @@ public class ViewerDocumentService { private final ObservationRegistry registry; - public void addVisualizationsOnPage(File originFile, File destinationFile, Visualizations visualizations) { - - addVisualizationsOnPage(originFile, destinationFile, List.of(visualizations)); - } - - @Observed(name = "ViewerDocumentService", contextualName = "add-visualizations") @SneakyThrows public void addVisualizationsOnPage(File originFile, File destinationFile, List visualizations) { @@ -70,9 +64,14 @@ public class ViewerDocumentService { PDDocument pdDocument = openPDDocument(tmpFile.toFile()); - enrichObservation(pdDocument, visualizations.stream().map(Visualizations::getLayer).toList()); + enrichObservation(pdDocument, + visualizations.stream() + .map(Visualizations::getLayer) + .toList()); - Set allLayers = visualizations.stream().map(Visualizations::getLayer).collect(Collectors.toUnmodifiableSet()); + Set allLayers = visualizations.stream() + .map(Visualizations::getLayer) + .collect(Collectors.toUnmodifiableSet()); Map optionalContentGroupMap = addLayersToDocument(visualizations, pdDocument); @@ -186,7 +185,8 @@ public class ViewerDocumentService { contentStream.setFont(font, placedText.fontSize()); contentStream.beginText(); contentStream.setNonStrokingColor(placedText.color()); - if (placedText.renderingMode().isPresent()) { + if (placedText.renderingMode() + .isPresent()) { contentStream.setRenderingMode(placedText.renderingMode().get()); } else { contentStream.setRenderingMode(RenderingMode.FILL); @@ -229,11 +229,11 @@ public class ViewerDocumentService { Matrix textMatrix; if (placedText.textMatrix().isEmpty()) { textMatrix = new Matrix((float) 
textDeRotationMatrix.getScaleX(), - (float) textDeRotationMatrix.getShearX(), - (float) textDeRotationMatrix.getShearY(), - (float) textDeRotationMatrix.getScaleY(), - (float) placedText.lineStart().getX(), - (float) placedText.lineStart().getY()); + (float) textDeRotationMatrix.getShearX(), + (float) textDeRotationMatrix.getShearY(), + (float) textDeRotationMatrix.getScaleY(), + (float) placedText.lineStart().getX(), + (float) placedText.lineStart().getY()); } else { textMatrix = placedText.textMatrix().get(); }