diff --git a/buildSrc/src/main/kotlin/com.knecon.fforesight.java-conventions.gradle.kts b/buildSrc/src/main/kotlin/com.knecon.fforesight.java-conventions.gradle.kts index 61951b8..8ad6ecb 100644 --- a/buildSrc/src/main/kotlin/com.knecon.fforesight.java-conventions.gradle.kts +++ b/buildSrc/src/main/kotlin/com.knecon.fforesight.java-conventions.gradle.kts @@ -42,6 +42,15 @@ tasks.jacocoTestReport { } allprojects { + + tasks.withType { + options { + this as StandardJavadocDocletOptions + addBooleanOption("Xdoclint:none", true) + addStringOption("Xmaxwarns", "1") + } + } + publishing { publications { create(name) { @@ -64,6 +73,7 @@ java { withJavadocJar() } + repositories { mavenLocal() mavenCentral() diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/redaction/SimplifiedText.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/redaction/SimplifiedText.java index f16d543..0f4bd90 100644 --- a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/redaction/SimplifiedText.java +++ b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/redaction/SimplifiedText.java @@ -19,6 +19,7 @@ public class SimplifiedText { @Schema(description = "Number of pages in the entire document.") private int numberOfPages; @Schema(description = "A List of simplified Sections, which contains almost exclusively text.") + @Builder.Default private List sectionTexts = new ArrayList<>(); } diff --git a/layoutparser-service/layoutparser-service-processor/build.gradle.kts b/layoutparser-service/layoutparser-service-processor/build.gradle.kts index 486a9c9..4889e40 100644 --- a/layoutparser-service/layoutparser-service-processor/build.gradle.kts +++ 
b/layoutparser-service/layoutparser-service-processor/build.gradle.kts @@ -28,4 +28,6 @@ dependencies { implementation("org.tinspin:tinspin-indexes:2.1.3") implementation("org.commonmark:commonmark:0.22.0") implementation("org.commonmark:commonmark-ext-gfm-tables:0.22.0") + implementation("com.pdftron:PDFNet:10.11.0") + } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java index d1b504e..a74156a 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java @@ -69,6 +69,7 @@ import com.knecon.fforesight.service.layoutparser.processor.services.parsing.PDF import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService; import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils; import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation; +import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionOperations; import io.micrometer.observation.Observation; import io.micrometer.observation.ObservationRegistry; @@ -117,14 +118,18 @@ public class LayoutParsingPipeline { log.info("Starting layout parsing for {}", layoutParsingRequest.identifier()); File originFile = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId()); - File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId()).orElse(originFile); + File viewerDocumentFile = 
layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId()) + .orElse(originFile); VisualLayoutParsingResponse visualLayoutParsingResponse = layoutParsingRequest.visualLayoutParsingFileId() - .map(layoutParsingStorageService::getVisualLayoutParsingFile).orElse(new VisualLayoutParsingResponse()); + .map(layoutParsingStorageService::getVisualLayoutParsingFile) + .orElse(new VisualLayoutParsingResponse()); ImageServiceResponse imageServiceResponse = layoutParsingRequest.imagesFileStorageId() - .map(layoutParsingStorageService::getImagesFile).orElse(new ImageServiceResponse()); + .map(layoutParsingStorageService::getImagesFile) + .orElse(new ImageServiceResponse()); TableServiceResponse tableServiceResponse = layoutParsingRequest.tablesFileStorageId() - .map(layoutParsingStorageService::getTablesFile).orElse(new TableServiceResponse()); + .map(layoutParsingStorageService::getTablesFile) + .orElse(new TableServiceResponse()); ClassificationDocument classificationDocument = parseLayout(settings.getLayoutParsingTypeOverride() == null // ? layoutParsingRequest.layoutParsingType() : settings.getLayoutParsingTypeOverride(), @@ -137,8 +142,7 @@ public class LayoutParsingPipeline { log.info("Building document graph for {}", layoutParsingRequest.identifier()); Document documentGraph = observeBuildDocumentGraph(settings.getLayoutParsingTypeOverride() == null // - ? layoutParsingRequest.layoutParsingType() : settings.getLayoutParsingTypeOverride(), - classificationDocument); + ? 
layoutParsingRequest.layoutParsingType() : settings.getLayoutParsingTypeOverride(), classificationDocument); log.info("Creating viewer document for {}", layoutParsingRequest.identifier()); @@ -147,7 +151,7 @@ public class LayoutParsingPipeline { log.info("Storing resulting files for {}", layoutParsingRequest.identifier()); layoutParsingStorageService.storeDocumentData(layoutParsingRequest, DocumentDataMapper.toDocumentData(documentGraph)); - if(layoutParsingRequest.documentMarkdownFileStorageId().isPresent()) { + if (layoutParsingRequest.documentMarkdownFileStorageId().isPresent()) { layoutParsingStorageService.storeMarkdownFile(layoutParsingRequest.documentMarkdownFileStorageId().get(), new MarkdownMapper().toMarkdownContent(documentGraph)); } layoutParsingStorageService.storeSimplifiedText(layoutParsingRequest, simplifiedSectionTextService.toSimplifiedText(documentGraph)); @@ -271,11 +275,11 @@ public class LayoutParsingPipeline { stripper.setStartPage(pageNumber); stripper.setEndPage(pageNumber); stripper.setPdpage(pdPage); - if (layoutParsingType.equals(LayoutParsingType.DOCUMINE_OLD)) { - stripper.setSortByPosition(true); - } stripper.getText(originDocument); List words = stripper.getTextPositionSequences(); + if (layoutParsingType.equals(LayoutParsingType.DOCUMINE_OLD)) { + words = TextPositionOperations.sort(words); + } classificationDocument.getLayoutDebugLayer().addTextVisualizations(words, pageNumber); PDRectangle pdr = pdPage.getMediaBox(); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/DocstrumSegmentationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/DocstrumSegmentationService.java index 55d3f40..6f6024c 100644 --- 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/DocstrumSegmentationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/DocstrumSegmentationService.java @@ -1,6 +1,7 @@ package com.knecon.fforesight.service.layoutparser.processor.docstrum; import java.util.ArrayList; +import java.util.EnumMap; import java.util.List; import java.util.stream.Collectors; @@ -26,6 +27,7 @@ import lombok.RequiredArgsConstructor; @RequiredArgsConstructor public class DocstrumSegmentationService { + public static final double SAME_DIRECTION_THRESHOLD = 0.9; private final NearestNeighbourService nearestNeighbourService; private final SpacingService spacingService; private final LineBuilderService lineBuilderService; @@ -35,13 +37,44 @@ public class DocstrumSegmentationService { public List segmentPage(List textPositions, boolean xyOrder, CleanRulings usedRulings, LayoutDebugLayer visualizations) { - List zones = new ArrayList<>(); - zones.addAll(computeZones(textPositions, usedRulings, visualizations, TextDirection.ZERO)); - zones.addAll(computeZones(textPositions, usedRulings, visualizations, TextDirection.QUARTER_CIRCLE)); - zones.addAll(computeZones(textPositions, usedRulings, visualizations, TextDirection.HALF_CIRCLE)); - zones.addAll(computeZones(textPositions, usedRulings, visualizations, TextDirection.THREE_QUARTER_CIRCLE)); + EnumMap directionCounts = new EnumMap<>(TextDirection.class); - return readingOrderService.resolve(zones, xyOrder); + List newZones = computeZones(textPositions, usedRulings, visualizations, TextDirection.ZERO); + directionCounts.put(TextDirection.ZERO, newZones.size()); + List zones = new ArrayList<>(newZones); + + newZones = computeZones(textPositions, usedRulings, visualizations, TextDirection.QUARTER_CIRCLE); + directionCounts.put(TextDirection.QUARTER_CIRCLE, newZones.size()); + 
zones.addAll(newZones); + + newZones = computeZones(textPositions, usedRulings, visualizations, TextDirection.HALF_CIRCLE); + directionCounts.put(TextDirection.HALF_CIRCLE, newZones.size()); + zones.addAll(newZones); + + newZones = computeZones(textPositions, usedRulings, visualizations, TextDirection.THREE_QUARTER_CIRCLE); + directionCounts.put(TextDirection.THREE_QUARTER_CIRCLE, newZones.size()); + zones.addAll(newZones); + + return readingOrderService.resolve(zones, xyOrder, mostSameDirection(directionCounts)); + } + + + private boolean mostSameDirection(EnumMap directionCounts) { + + int total = directionCounts.values() + .stream() + .mapToInt(i -> i).sum(); + + if ((double) directionCounts.get(TextDirection.ZERO) / total > SAME_DIRECTION_THRESHOLD) { + return true; + } else if ((double) directionCounts.get(TextDirection.QUARTER_CIRCLE) / total > SAME_DIRECTION_THRESHOLD) { + return true; + } else if ((double) directionCounts.get(TextDirection.HALF_CIRCLE) / total > SAME_DIRECTION_THRESHOLD) { + return true; + } else if ((double) directionCounts.get(TextDirection.THREE_QUARTER_CIRCLE) / total > SAME_DIRECTION_THRESHOLD) { + return true; + } + return false; } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/AngleFilter.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/AngleFilter.java index a51051d..95e42c2 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/AngleFilter.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/AngleFilter.java @@ -15,10 +15,16 @@ public class AngleFilter { public boolean matches(Neighbor neighbor) { + return matches(neighbor.getAngle()); + } + + + public boolean 
matches(double angle) { + if (lowerAngle <= upperAngle) { - return lowerAngle <= neighbor.getAngle() && neighbor.getAngle() < upperAngle; + return lowerAngle <= angle && angle < upperAngle; } else { - return lowerAngle <= neighbor.getAngle() || neighbor.getAngle() < upperAngle; + return lowerAngle <= angle || angle < upperAngle; } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/BoundingBox.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/BoundingBox.java index ae2fd62..9efc286 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/BoundingBox.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/BoundingBox.java @@ -7,8 +7,12 @@ import java.util.List; import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; import lombok.Data; +import lombok.NoArgsConstructor; +import lombok.experimental.SuperBuilder; @Data +@SuperBuilder +@NoArgsConstructor public abstract class BoundingBox { // Java coordinate system: (0, 0) is always upper left, x is increasing left to right and y is increasing from top to bottom. @@ -19,7 +23,7 @@ public abstract class BoundingBox { // This rotates completely in 90 degree steps with page rotation. // Needs to be used when writing to a PDF. // Also, these are definitely correct and should be used whenever possible. 
- protected Rectangle2D bBoxInitialUserSpace; + protected Rectangle2D bBoxPdf; protected static final float VERTICAL_COMPARISON_THRESHOLD = 0.4f; @@ -50,25 +54,25 @@ public abstract class BoundingBox { public double getPdfMinX() { - return bBoxInitialUserSpace.getMinX(); + return bBoxPdf.getMinX(); } public double getPdfMaxX() { - return bBoxInitialUserSpace.getMaxX(); + return bBoxPdf.getMaxX(); } public double getPdfMinY() { - return bBoxInitialUserSpace.getMinY(); + return bBoxPdf.getMinY(); } public double getPdfMaxY() { - return bBoxInitialUserSpace.getMaxY(); + return bBoxPdf.getMaxY(); } @@ -129,13 +133,31 @@ public abstract class BoundingBox { } - public boolean intersectsY(BoundingBox other) { + private boolean intersectsX(BoundingBox other, float threshold) { + + return this.getX() - threshold <= other.getMaxX() && this.getMaxX() + threshold >= other.getX(); + } + + + public boolean intersectsPdf(BoundingBox other) { + + return this.intersectsXPdf(other) && this.intersectsYPdf(other); + } + + + public boolean intersectsPdf(BoundingBox other, float yThreshold, float xThreshold) { + + return this.intersectsXPdf(other, xThreshold) && this.intersectsYPdf(other, yThreshold); + } + + + public boolean intersectsYPdf(BoundingBox other) { return this.getPdfMinY() <= other.getPdfMaxY() && this.getPdfMaxY() >= other.getPdfMinY(); } - public boolean intersectsYJava(BoundingBox other) { + public boolean intersectsY(BoundingBox other) { return this.getY() <= other.getMaxY() && this.getMaxY() >= other.getY(); } @@ -143,25 +165,31 @@ public abstract class BoundingBox { public boolean intersectsY(BoundingBox other, float threshold) { + return this.getY() - threshold <= other.getMaxY() && this.getMaxY() + threshold >= other.getY(); + } + + + public boolean intersectsYPdf(BoundingBox other, float threshold) { + return this.getPdfMinY() - threshold <= other.getPdfMaxY() && this.getPdfMaxY() + threshold >= other.getPdfMinY(); } - public boolean intersectsX(BoundingBox other) 
{ + public boolean intersectsXPdf(BoundingBox other) { return this.getPdfMinX() <= other.getPdfMaxX() && this.getPdfMaxX() >= other.getPdfMinX(); } - public boolean intersectsXJava(BoundingBox other) { + public boolean intersectsX(BoundingBox other) { return this.getX() <= other.getMaxX() && this.getMaxX() >= other.getMinX(); } - public boolean intersectsX(BoundingBox other, float threshold) { + public boolean intersectsXPdf(BoundingBox other, float threshold) { - return this.getPdfMinX() - threshold <= other.getMaxX() && this.getMaxX() + threshold >= other.getPdfMinX(); + return this.getPdfMinX() - threshold <= other.getPdfMaxX() && this.getPdfMaxX() + threshold >= other.getPdfMinX(); /* both sides use PDF-space extents, mirroring intersectsYPdf(other, threshold) */ } @@ -170,8 +198,8 @@ public abstract class BoundingBox { this.bBox = components.stream() .map(BoundingBox::getBBox) .collect(RectangleTransformations.collectBBox()); - this.bBoxInitialUserSpace = components.stream() - .map(BoundingBox::getBBoxInitialUserSpace) + this.bBoxPdf = components.stream() + .map(BoundingBox::getBBoxPdf) .collect(RectangleTransformations.collectBBox()); } @@ -229,25 +257,25 @@ public abstract class BoundingBox { public boolean rightOf(BoundingBox other) { - return this.intersectsYJava(other) && other.getMaxX() <= this.getMinX(); + return this.intersectsY(other) && other.getMaxX() <= this.getMinX(); } public boolean leftOf(BoundingBox other) { - return this.intersectsYJava(other) && other.getMinX() >= this.getMaxX(); + return this.intersectsY(other) && other.getMinX() >= this.getMaxX(); } public boolean isAbove(BoundingBox other) { - return this.intersectsXJava(other) && other.getMinY() >= this.getMaxY(); + return this.intersectsX(other) && other.getMinY() >= this.getMaxY(); } public boolean isBelow(BoundingBox other) { - return this.intersectsXJava(other) && this.getMinY() >= other.getMaxY(); + return this.intersectsX(other) && this.getMinY() >= other.getMaxY(); } } diff --git
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/Character.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/Character.java index 772f1b2..d9fd2b5 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/Character.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/Character.java @@ -35,7 +35,7 @@ public class Character { public double getHeight() { - return textPosition.getHeightDir(); + return textPosition.getHeightDirAdj(); } @@ -65,9 +65,9 @@ public class Character { double s = Math.sin(-0); double c = Math.cos(-0); xs[0] = c * x - s * y; - xs[1] = c * (x + textPosition.getWidthDirAdj()) - s * (y + textPosition.getHeightDir()); + xs[1] = c * (x + textPosition.getWidthDirAdj()) - s * (y + textPosition.getHeightDirAdj()); xs[2] = c * other.x - s * other.y; - xs[3] = c * (other.x + other.textPosition.getWidthDirAdj()) - s * (other.y + other.textPosition.getHeightDir()); + xs[3] = c * (other.x + other.textPosition.getWidthDirAdj()) - s * (other.y + other.textPosition.getHeightDirAdj()); boolean overlapping = xs[1] >= xs[2] && xs[3] >= xs[0]; Arrays.sort(xs); return Math.abs(xs[2] - xs[1]) * (overlapping ? 
1 : -1); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/Line.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/Line.java index 525d148..d9d779b 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/Line.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/Line.java @@ -1,18 +1,28 @@ package com.knecon.fforesight.service.layoutparser.processor.docstrum.model; +import static com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence.BOLD; +import static com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence.BOLD_ITALIC; +import static com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence.ITALIC; +import static com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence.STANDARD; + import java.util.ArrayList; import java.util.Arrays; +import java.util.Comparator; +import java.util.EnumMap; import java.util.List; +import java.util.Map; +import java.util.concurrent.atomic.AtomicInteger; import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.FastAtan2; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.model.text.FontStyle; import lombok.Data; import lombok.EqualsAndHashCode; @Data -@EqualsAndHashCode(onlyExplicitlyIncluded = true) -public class Line extends BoundingBox { +@EqualsAndHashCode(onlyExplicitlyIncluded = true, callSuper = false) +public class Line extends TextBoundingBox { private static final double WORD_DISTANCE_MULTIPLIER = 0.18; @@ -28,6 +38,8 @@ public 
class Line extends BoundingBox { private final double height; + private FontStyle fontStyle; + private final List characters; private final List words = new ArrayList<>(); @@ -67,6 +79,29 @@ public class Line extends BoundingBox { height = computeHeight(); computeWords(wordSpacing * WORD_DISTANCE_MULTIPLIER); buildBBox(); + computeFontStyle(); + } + + + private void computeFontStyle() { + + EnumMap fontStyleCounter = new EnumMap<>(FontStyle.class); + for (FontStyle fontStyle : FontStyle.values()) { + fontStyleCounter.put(fontStyle, new AtomicInteger(0)); + } + for (TextPositionSequence word : words) { + switch (word.getFontStyle()) { + case STANDARD -> fontStyleCounter.get(FontStyle.REGULAR).getAndIncrement(); + case BOLD -> fontStyleCounter.get(FontStyle.BOLD).getAndIncrement(); + case ITALIC -> fontStyleCounter.get(FontStyle.ITALIC).getAndIncrement(); + case BOLD_ITALIC -> fontStyleCounter.get(FontStyle.BOLD_ITALIC).getAndIncrement(); + } + } + fontStyle = fontStyleCounter.entrySet() + .stream() + .max(Comparator.comparing(entry -> entry.getValue().get())) + .map(Map.Entry::getKey) + .orElse(FontStyle.REGULAR); } @@ -144,8 +179,8 @@ public class Line extends BoundingBox { private void buildBBox() { this.setToBBoxOfComponents(characters.stream() - .map(Character::getTextPosition) - .toList()); + .map(Character::getTextPosition) + .toList()); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/TextBoundingBox.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/TextBoundingBox.java new file mode 100644 index 0000000..02aa578 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/TextBoundingBox.java @@ -0,0 +1,102 @@ +package 
com.knecon.fforesight.service.layoutparser.processor.docstrum.model; + +import java.awt.geom.Rectangle2D; +import java.util.List; +import java.util.Set; +import java.util.stream.Collectors; + +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection; +import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; + +import lombok.EqualsAndHashCode; +import lombok.Getter; +import lombok.NoArgsConstructor; +import lombok.Setter; +import lombok.experimental.SuperBuilder; + +@Getter +@Setter +@SuperBuilder +@NoArgsConstructor +@EqualsAndHashCode(callSuper = false) +public abstract class TextBoundingBox extends BoundingBox { + + protected Rectangle2D bBoxDirAdj; + + protected TextDirection dir; + + + @Override + public void setToBBoxOfComponents(List components) { + + super.setToBBoxOfComponents(components); + this.bBoxDirAdj = components.stream() + .filter(c -> c instanceof TextBoundingBox) + .map(c -> (TextBoundingBox) c) + .map(TextBoundingBox::getBBoxDirAdj) + .collect(RectangleTransformations.collectBBox()); + + Set textDirections = components.stream() + .filter(c -> c instanceof TextBoundingBox) + .map(c -> (TextBoundingBox) c) + .map(TextBoundingBox::getDir) + .collect(Collectors.toSet()); + + if (textDirections.isEmpty()) { + dir = TextDirection.ZERO; + } else if (textDirections.size() > 1) { + throw new IllegalArgumentException("More than one text direction found"); + } else { + dir = textDirections.iterator().next(); + } + } + + + public double getXDirAdj() { + + return this.bBoxDirAdj.getX(); + } + + + public double getYDirAdj() { + + return this.bBoxDirAdj.getY(); + } + + + public double getWidthDirAdj() { + + return this.bBoxDirAdj.getWidth(); + } + + + public double getHeightDirAdj() { + + return this.bBoxDirAdj.getHeight(); + } + + + public double getMaxXDirAdj() { + + return this.bBoxDirAdj.getMaxX(); + } + + + public double getMaxYDirAdj() { + + return this.bBoxDirAdj.getMaxY(); + } + + + public 
double getCenterYDirAdj() { + + return this.bBoxDirAdj.getCenterY(); + } + + + public double getCenterXDirAdj() { + + return this.bBoxDirAdj.getCenterX(); + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/Zone.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/Zone.java index f1c61c5..cc02fd8 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/Zone.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/Zone.java @@ -6,9 +6,11 @@ import java.util.List; import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; import lombok.Data; +import lombok.EqualsAndHashCode; @Data -public class Zone extends BoundingBox { +@EqualsAndHashCode(callSuper = false) +public class Zone extends TextBoundingBox { private List lines; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/service/ReadingOrderService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/service/ReadingOrderService.java index 6d1a741..2f57594 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/service/ReadingOrderService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/service/ReadingOrderService.java @@ -1,15 +1,17 @@ package com.knecon.fforesight.service.layoutparser.processor.docstrum.service; +import java.awt.geom.Rectangle2D; import java.util.ArrayList; 
import java.util.Comparator; import java.util.HashMap; import java.util.List; import java.util.ListIterator; import java.util.Map; +import java.util.stream.Collectors; import org.springframework.stereotype.Service; -import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox; +import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.TextBoundingBox; import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone; import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.DoubleUtils; @@ -19,21 +21,30 @@ public class ReadingOrderService { private static final double THRESHOLD = 5; public static final double MULTI_COLUMN_DETECTION_THRESHOLD = 1.5; + private static final Comparator COMPARATOR = // + Comparator.comparing(TextBoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)) + .thenComparing(TextBoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)); - public List resolve(List zones, boolean xyReadingOrder) { + private static final Comparator COMPARATOR_DIR_ADJ = // + Comparator.comparing(TextBoundingBox::getYDirAdj, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)) + .thenComparing(TextBoundingBox::getXDirAdj, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)); + + + public List resolve(List zones, boolean xyReadingOrder, boolean useDirAdjCoords) { if (zones.isEmpty() || zones.size() == 1) { return zones; } if (xyReadingOrder) { - return resolveSingleColumnReadingOrder(zones); + return resolveSingleColumnReadingOrder(zones, useDirAdjCoords); } Map histogram = new HashMap<>(); for (Zone zone : zones) { - long minY = Math.round(zone.getBBox().getMinY()); - long maxY = Math.round(zone.getBBox().getMaxY()); + Rectangle2D bbox = useDirAdjCoords ? 
zone.getBBoxDirAdj() : zone.getBBox(); + long minY = Math.round(bbox.getMinY()); + long maxY = Math.round(bbox.getMaxY()); for (long i = minY; i <= maxY; i++) { histogram.put(i, histogram.getOrDefault(i, 0) + 1); } @@ -43,24 +54,32 @@ public class ReadingOrderService { .stream() .mapToInt(Integer::intValue).average() .orElse(1) < MULTI_COLUMN_DETECTION_THRESHOLD) { - return resolveSingleColumnReadingOrder(zones); + return resolveSingleColumnReadingOrder(zones, useDirAdjCoords); } else { - return resolveMultiColumnReadingOder(zones); + return resolveMultiColumnReadingOder(zones, useDirAdjCoords); } } - private static List resolveSingleColumnReadingOrder(List zones) { + private static List resolveSingleColumnReadingOrder(List zones, boolean useDirAdjCoords) { - zones.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)) - .thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))); + if (useDirAdjCoords) { + return zones.stream() + .collect(Collectors.groupingBy(TextBoundingBox::getDir)).values() + .stream() + .flatMap(words -> words.stream() + .sorted(COMPARATOR_DIR_ADJ)) + .toList(); + } + + zones.sort(COMPARATOR); return zones; } - private List resolveMultiColumnReadingOder(List zones) { + private List resolveMultiColumnReadingOder(List zones, boolean useDirAdjCoords) { // Simple reading order resolver for multi column page layout as described here : https://pub.towardsai.net/advanced-rag-02-unveiling-pdf-parsing-b84ae866344e // TODO implement a more fancy reading order resolver see https://github.com/BobLd/DocumentLayoutAnalysis/blob/master/README.md#reading-order @@ -69,11 +88,12 @@ public class ReadingOrderService { double maxX = Double.NEGATIVE_INFINITY; for (Zone zone : zones) { - if (zone.getX() < minX) { - minX = zone.getX(); + Rectangle2D bbox = useDirAdjCoords ? 
zone.getBBoxDirAdj() : zone.getBBox(); + if (bbox.getX() < minX) { + minX = bbox.getX(); /* store the value from the same bbox that was compared, so the non-dir-adjusted path stays in Java coordinates */ } - if (zone.getX() + zone.getWidth() > maxX) { - maxX = zone.getX() + zone.getWidth(); + if (bbox.getMaxX() > maxX) { + maxX = bbox.getMaxX(); } } @@ -82,24 +102,27 @@ public class ReadingOrderService { List leftOf = new ArrayList<>(); List rightOf = new ArrayList<>(); List middle = new ArrayList<>(); + for (Zone zone : zones) { - if (zone.getX() < midLineXCoordinate && zone.getX() + zone.getWidth() < midLineXCoordinate) { + Rectangle2D bbox = useDirAdjCoords ? zone.getBBoxDirAdj() : zone.getBBox(); + if (bbox.getX() < midLineXCoordinate && bbox.getX() + bbox.getWidth() < midLineXCoordinate) { leftOf.add(zone); - } else if (zone.getX() > midLineXCoordinate && zone.getX() + zone.getWidth() > midLineXCoordinate) { + } else if (bbox.getX() > midLineXCoordinate && bbox.getX() + bbox.getWidth() > midLineXCoordinate) { rightOf.add(zone); } else { middle.add(zone); } } - leftOf.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)) - .thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))); - - rightOf.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)) - .thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))); - - middle.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)) - .thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))); + if (useDirAdjCoords) { + leftOf.sort(COMPARATOR_DIR_ADJ); + rightOf.sort(COMPARATOR_DIR_ADJ); + middle.sort(COMPARATOR_DIR_ADJ); + } else { + leftOf.sort(COMPARATOR); + rightOf.sort(COMPARATOR); + middle.sort(COMPARATOR); + } /* List leftNotIntersecting = new ArrayList<>(); for (Zone leftZone : leftOf) { @@ -151,8 +174,9 @@ public class ReadingOrderService { while (itty.hasNext()) { Zone
current = itty.next(); + Rectangle2D bbox = useDirAdjCoords ? current.getBBoxDirAdj() : current.getBBox(); for (int i = 0; i < sortedZones.size(); i++) { - if (current.getY() < sortedZones.get(i).getY()) { + if (bbox.getY() < (useDirAdjCoords ? sortedZones.get(i).getYDirAdj() : sortedZones.get(i).getY())) { /* compare both zones in the same coordinate system */ sortedZones.add(i, current); itty.remove(); break; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/service/ZoneBuilderService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/service/ZoneBuilderService.java index ec1871c..e222e23 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/service/ZoneBuilderService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/service/ZoneBuilderService.java @@ -1,5 +1,7 @@ package com.knecon.fforesight.service.layoutparser.processor.docstrum.service; +import static com.knecon.fforesight.service.layoutparser.processor.model.SectionIdentifier.numericalIdentifierPattern; + import java.util.ArrayList; import java.util.Comparator; import java.util.HashSet; @@ -21,7 +23,7 @@ public class ZoneBuilderService { private static final double MIN_HORIZONTAL_DISTANCE_MULTIPLIER = -0.5; private static final double MAX_VERTICAL_DISTANCE_MULTIPLIER = 1.2; - private static final double MIN_HORIZONTAL_MERGE_DISTANCE_MULTIPLIER = -3.0; + private static final double MIN_HORIZONTAL_MERGE_DISTANCE_MULTIPLIER = -7; private static final double MAX_VERTICAL_MERGE_DISTANCE_MULTIPLIER = 0.5; @@ -38,7 +40,7 @@ public class ZoneBuilderService { double minHorizontalDistance = characterSpacing * MIN_HORIZONTAL_DISTANCE_MULTIPLIER; double maxVerticalDistance = lineSpacing * MAX_VERTICAL_DISTANCE_MULTIPLIER; - double minHorizontalMergeDistance = characterSpacing *
MIN_HORIZONTAL_MERGE_DISTANCE_MULTIPLIER; + double minHorizontalMergeDistance = lineSpacing * MIN_HORIZONTAL_MERGE_DISTANCE_MULTIPLIER; double maxVerticalMergeDistance = lineSpacing * MAX_VERTICAL_MERGE_DISTANCE_MULTIPLIER; UnionFind unionFind = new UnionFind<>(new HashSet<>(lines)); @@ -54,11 +56,26 @@ public class ZoneBuilderService { return; } - double scale = Math.min(outerLine.getHeight(), innerLine.getHeight()) / meanHeight; - scale = Math.max(MIN_LINE_SIZE_SCALE, Math.min(scale, MAX_LINE_SIZE_SCALE)); +// if (!innerLine.getFontStyle().equals(outerLine.getFontStyle()) // +// && !outerLine.intersectsY(innerLine, -2f)) { +// return; +// } - double horizontalDistance = outerLine.horizontalDistance(innerLine) / scale; - double verticalDistance = outerLine.verticalDistance(innerLine) / scale; + double horizontalScale = Math.min(outerLine.getHeightDirAdj(), innerLine.getHeightDirAdj()) / meanHeight; + horizontalScale = Math.max(MIN_LINE_SIZE_SCALE, Math.min(horizontalScale, MAX_LINE_SIZE_SCALE)); + double verticalScale = horizontalScale; + +// if (innerLine.toString().endsWith(":") +// || outerLine.toString().endsWith(":") +// || numericalIdentifierPattern.matcher(innerLine.toString()).matches() +// || numericalIdentifierPattern.matcher(outerLine.toString()).matches()) { +// +// horizontalScale *= 5; +// verticalScale /= 10; +// } + + double horizontalDistance = outerLine.horizontalDistance(innerLine) / horizontalScale; + double verticalDistance = outerLine.verticalDistance(innerLine) / verticalScale; if ((!(minHorizontalDistance <= horizontalDistance) || !(verticalDistance <= maxVerticalDistance)) // && (!(minHorizontalMergeDistance <= horizontalDistance) || !(verticalDistance <= maxVerticalMergeDistance))) { @@ -87,7 +104,7 @@ public class ZoneBuilderService { double weights = 0.0; for (Line line : lines) { double weight = line.getLength(); - meanHeight += line.getHeight() * weight; + meanHeight += line.getHeightDirAdj() * weight; weights += weight; } meanHeight 
/= weights; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/markdown/MarkdownMapper.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/markdown/MarkdownMapper.java index f8239bc..a9c1e1b 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/markdown/MarkdownMapper.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/markdown/MarkdownMapper.java @@ -32,6 +32,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.He import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table; import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.text.FontStyle; public class MarkdownMapper extends AbstractNodeVisitor { @@ -297,12 +298,6 @@ public class MarkdownMapper extends AbstractNodeVisitor { } - enum FontStyle { - REGULAR, - BOLD, - ITALIC, - BOLD_ITALIC; - } record FontStyleChange(boolean enter, FontStyle style) { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationPage.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationPage.java index ef97651..dcd9315 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationPage.java +++ 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationPage.java @@ -18,6 +18,7 @@ import lombok.RequiredArgsConstructor; @Data @RequiredArgsConstructor + public class ClassificationPage { @NonNull @@ -25,7 +26,7 @@ public class ClassificationPage { private List outlineObjects = new ArrayList<>(); - private List headlines = new ArrayList<>(); + private List headlines = new ArrayList<>(); private List images = new ArrayList<>(); @@ -44,7 +45,7 @@ public class ClassificationPage { private float pageWidth; private float pageHeight; - CleanRulings cleanRulings; + private CleanRulings cleanRulings; private Map> markedContentBboxPerType = new HashMap<>(); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/FloatFrequencyCounter.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/FloatFrequencyCounter.java index a3d7917..44fdf43 100755 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/FloatFrequencyCounter.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/FloatFrequencyCounter.java @@ -12,10 +12,10 @@ import lombok.Getter; @Getter public class FloatFrequencyCounter { - Map countPerValue = new HashMap<>(); + Map countPerValue = new HashMap<>(); - public void add(float value) { + public void add(double value) { if (!countPerValue.containsKey(value)) { countPerValue.put(value, 1); @@ -25,9 +25,9 @@ public class FloatFrequencyCounter { } - public void addAll(Map otherCounter) { + public void addAll(Map otherCounter) { - for (Map.Entry entry : otherCounter.entrySet()) { + for (Map.Entry entry : otherCounter.entrySet()) { if 
(countPerValue.containsKey(entry.getKey())) { countPerValue.put(entry.getKey(), countPerValue.get(entry.getKey()) + entry.getValue()); } else { @@ -37,10 +37,10 @@ public class FloatFrequencyCounter { } - public Float getMostPopular() { + public Double getMostPopular() { - Map.Entry mostPopular = null; - for (Map.Entry entry : countPerValue.entrySet()) { + Map.Entry mostPopular = null; + for (Map.Entry entry : countPerValue.entrySet()) { if (mostPopular == null || entry.getValue() >= mostPopular.getValue()) { mostPopular = entry; } @@ -49,11 +49,11 @@ public class FloatFrequencyCounter { } - public List getHighterThanMostPopular() { + public List getHigherThanMostPopular() { - Float mostPopular = getMostPopular(); - List higher = new ArrayList<>(); - for (Float value : countPerValue.keySet()) { + Double mostPopular = getMostPopular(); + List higher = new ArrayList<>(); + for (Double value : countPerValue.keySet()) { if (value > mostPopular) { higher.add(value); } @@ -63,10 +63,10 @@ public class FloatFrequencyCounter { } - public Float getHighest() { + public Double getHighest() { - Float highest = null; - for (Float value : countPerValue.keySet()) { + Double highest = null; + for (Double value : countPerValue.keySet()) { if (highest == null || value > highest) { highest = value; } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/SectionIdentifier.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/SectionIdentifier.java index dbcb2ce..c09d529 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/SectionIdentifier.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/SectionIdentifier.java @@ -15,7 +15,7 @@ import 
lombok.experimental.FieldDefaults; @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) public class SectionIdentifier { - static Pattern numericalIdentifierPattern = Pattern.compile("^[\\s]?(\\d+)[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?"); + public static Pattern numericalIdentifierPattern = Pattern.compile("^[\\s]?(\\d+)[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?"); public enum Format { EMPTY, diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/textblock/AtomicTextBlock.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/textblock/AtomicTextBlock.java index 20b328c..6da9330 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/textblock/AtomicTextBlock.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/textblock/AtomicTextBlock.java @@ -145,10 +145,7 @@ public class AtomicTextBlock implements TextBlock { } - public static AtomicTextBlock fromAtomicTextBlockData(DocumentTextData documentTextData, - DocumentPositionData documentPositionData, - SemanticNode parent, - Page page) { + public static AtomicTextBlock fromAtomicTextBlockData(DocumentTextData documentTextData, DocumentPositionData documentPositionData, SemanticNode parent, Page page) { return AtomicTextBlock.builder() .id(documentTextData.getId()) @@ -156,8 +153,10 @@ public class AtomicTextBlock implements TextBlock { .page(page) .textRange(new TextRange(documentTextData.getStart(), documentTextData.getEnd())) .searchText(documentTextData.getSearchText()) - .lineBreaks(Arrays.stream(documentTextData.getLineBreaks()).boxed().toList()) - 
.stringIdxToPositionIdx(Arrays.stream(documentPositionData.getStringIdxToPositionIdx()).boxed().toList()) + .lineBreaks(Arrays.stream(documentTextData.getLineBreaks()).boxed() + .toList()) + .stringIdxToPositionIdx(Arrays.stream(documentPositionData.getStringIdxToPositionIdx()).boxed() + .toList()) .positions(toRectangle2DList(documentPositionData.getPositions())) .parent(parent) .build(); @@ -166,7 +165,9 @@ public class AtomicTextBlock implements TextBlock { private static List toRectangle2DList(float[][] positions) { - return Arrays.stream(positions).map(floatArr -> (Rectangle2D) new Rectangle2D.Float(floatArr[0], floatArr[1], floatArr[2], floatArr[3])).toList(); + return Arrays.stream(positions) + .map(floatArr -> (Rectangle2D) new Rectangle2D.Float(floatArr[0], floatArr[1], floatArr[2], floatArr[3])) + .toList(); } @@ -176,6 +177,9 @@ public class AtomicTextBlock implements TextBlock { throw new IndexOutOfBoundsException(format("line %d out of range for AtomicTextBlock with %d lines", lineNumber, numberOfLines())); } if (lineNumber == 0) { + if (lineBreaks.isEmpty()) { + return searchText; + } return subSequence(textRange.start(), lineBreaks.get(0) + textRange.start()); } else if (lineNumber == numberOfLines() - 1) { return subSequence(lineBreaks.get(lineBreaks.size() - 1) + textRange.start(), textRange.end()); @@ -195,9 +199,9 @@ public class AtomicTextBlock implements TextBlock { public int getNextLinebreak(int fromIndex) { return lineBreaks.stream()// - .filter(linebreak -> linebreak > fromIndex - textRange.start()) // - .findFirst() // - .orElse(searchText.length()) + textRange.start(); + .filter(linebreak -> linebreak > fromIndex - textRange.start()) // + .findFirst() // + .orElse(searchText.length()) + textRange.start(); } @@ -205,9 +209,9 @@ public class AtomicTextBlock implements TextBlock { public int getPreviousLinebreak(int fromIndex) { return lineBreaks.stream()// - .filter(linebreak -> linebreak <= fromIndex - textRange.start())// - .reduce((a, b) 
-> b)// - .orElse(0) + textRange.start(); + .filter(linebreak -> linebreak <= fromIndex - textRange.start())// + .reduce((a, b) -> b)// + .orElse(0) + textRange.start(); } @@ -255,7 +259,10 @@ public class AtomicTextBlock implements TextBlock { protected List getAllLineBreaksInBoundary(TextRange textRange) { - return getLineBreaks().stream().map(linebreak -> linebreak + this.textRange.start()).filter(textRange::contains).toList(); + return getLineBreaks().stream() + .map(linebreak -> linebreak + this.textRange.start()) + .filter(textRange::contains) + .toList(); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/image/ClassifiedImage.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/image/ClassifiedImage.java index 40931be..9f1556e 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/image/ClassifiedImage.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/image/ClassifiedImage.java @@ -10,8 +10,8 @@ import lombok.NonNull; import lombok.RequiredArgsConstructor; @Data -@RequiredArgsConstructor @AllArgsConstructor +@RequiredArgsConstructor public class ClassifiedImage { @NonNull @@ -20,11 +20,19 @@ public class ClassifiedImage { private ImageType imageType; private boolean sourceByAi; private boolean isAppendedToSection; - @NonNull private boolean hasTransparency; - @NonNull private int page; @NonNull private String representation; + + public ClassifiedImage(@NonNull Rectangle2D position, @NonNull ImageType imageType, boolean hasTransparency, int page, @NonNull String representation) { + + this.position = position; + this.imageType = imageType; + this.hasTransparency = hasTransparency; + this.page = page; + this.representation = 
representation; + } + } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/Cell.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/Cell.java index 8da9b97..80243e4 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/Cell.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/Cell.java @@ -35,14 +35,14 @@ public class Cell extends BoundingBox { public Cell(Point2D topLeft, Point2D bottomRight) { - this.bBoxInitialUserSpace = new Rectangle2D.Double(topLeft.getX(), topLeft.getY(), (bottomRight.getX() - topLeft.getX()), (bottomRight.getY() - topLeft.getY())); - this.bBox = bBoxInitialUserSpace; + this.bBoxPdf = new Rectangle2D.Double(topLeft.getX(), topLeft.getY(), (bottomRight.getX() - topLeft.getX()), (bottomRight.getY() - topLeft.getY())); + this.bBox = bBoxPdf; } public Cell(Rectangle2D bBoxInitialUserSpace, AffineTransform initialUserSpaceToJava) { - this.bBoxInitialUserSpace = bBoxInitialUserSpace; + this.bBoxPdf = bBoxInitialUserSpace; this.bBox = initialUserSpaceToJava.createTransformedShape(bBoxInitialUserSpace).getBounds2D(); } @@ -50,7 +50,7 @@ public class Cell extends BoundingBox { public static Cell copy(Cell cell) { Cell copy = new Cell(); - copy.bBoxInitialUserSpace = cell.bBoxInitialUserSpace; + copy.bBoxPdf = cell.bBoxPdf; copy.bBox = cell.bBox; return copy; } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/CleanRulings.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/CleanRulings.java index a4b97cc..8c698d8 100644 
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/CleanRulings.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/CleanRulings.java @@ -70,7 +70,7 @@ public class CleanRulings { public boolean lineBetween(BoundingBox a, BoundingBox b) { - return lineBetween(a.getBBoxInitialUserSpace(), b.getBBoxInitialUserSpace()); + return lineBetween(a.getBBoxPdf(), b.getBBoxPdf()); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/TablePageBlock.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/TablePageBlock.java index 8093280..6894336 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/TablePageBlock.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/TablePageBlock.java @@ -263,8 +263,8 @@ public class TablePageBlock extends AbstractPageBlock { cells.stream() .map(originalCell -> new CellWithIntersection(originalCell, - RectangleTransformations.calculateIntersectedArea(cellFromGridStructure.getBBoxInitialUserSpace(), - originalCell.getBBoxInitialUserSpace()))) + RectangleTransformations.calculateIntersectedArea(cellFromGridStructure.getBBoxPdf(), + originalCell.getBBoxPdf()))) .filter(cellWithIntersection -> cellWithIntersection.intersectedArea > 0) .filter(cellWithIntersection -> cellWithIntersection.originalCell.getArea() > cellWithIntersection.intersectedArea * CELL_AREA_CONTAINED_THRESHOLD) .max(Comparator.comparing(CellWithIntersection::intersectedArea)) diff --git 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/FontStyle.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/FontStyle.java new file mode 100644 index 0000000..0417055 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/FontStyle.java @@ -0,0 +1,9 @@ +package com.knecon.fforesight.service.layoutparser.processor.model.text; + + +public enum FontStyle { + REGULAR, + BOLD, + ITALIC, + BOLD_ITALIC; +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/RedTextPosition.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/RedTextPosition.java index 8a0bbc5..bdc581f 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/RedTextPosition.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/RedTextPosition.java @@ -5,64 +5,50 @@ import java.awt.geom.Rectangle2D; import org.apache.pdfbox.text.TextPosition; -import com.fasterxml.jackson.annotation.JsonIgnore; -import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox; +import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.TextBoundingBox; +import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.FastAtan2; +import lombok.AccessLevel; import lombok.AllArgsConstructor; -import lombok.Builder; import lombok.Data; +import lombok.EqualsAndHashCode; import lombok.NoArgsConstructor; import lombok.SneakyThrows; +import lombok.experimental.FieldDefaults; +import 
lombok.experimental.SuperBuilder; @Data -@Builder +@SuperBuilder @NoArgsConstructor @AllArgsConstructor -public class RedTextPosition extends BoundingBox { +@EqualsAndHashCode(callSuper = true) +@FieldDefaults(level = AccessLevel.PRIVATE) +public class RedTextPosition extends TextBoundingBox { public final static int HEIGHT_PADDING = 2; - private Rectangle2D.Float bBoxDirAdj; // adjusted to text rotation + String unicode; - @JsonIgnore - private int rotation; + // estimated using the TextMatrix in radians + float exactDir; - @JsonIgnore - private float pageHeight; + float widthOfSpace; - @JsonIgnore - private float pageWidth; + float fontSizeInPt; - private String unicode; - - @JsonIgnore - private float dir; - - // not used in reanalysis - @JsonIgnore - private float widthOfSpace; - - // not used in reanalysis - @JsonIgnore - private float fontSizeInPt; - - // not used in reanalysis - @JsonIgnore - private String fontName; + String fontName; @SneakyThrows public static RedTextPosition fromTextPosition(TextPosition textPosition) { var pos = new RedTextPosition(); - pos.setRotation(textPosition.getRotation()); - pos.setPageHeight(textPosition.getPageHeight()); - pos.setPageWidth(textPosition.getPageWidth()); pos.setUnicode(textPosition.getUnicode()); - pos.setDir(textPosition.getDir()); pos.setWidthOfSpace(textPosition.getWidthOfSpace()); pos.setFontSizeInPt(textPosition.getFontSizeInPt()); pos.setFontName(textPosition.getFont().getName()); + pos.setExactDir((float) FastAtan2.fastAtan2(textPosition.getTextMatrix().getShearY(), textPosition.getTextMatrix().getScaleX())); + pos.setDir(TextDirection.fromDegrees(textPosition.getDir())); //TODO: There is a mismatch in the java coords of the text and the rulings, // I guess if we start with the initial user space positions and transform them the same way we do the rulings it would work. 
@@ -73,18 +59,18 @@ public class RedTextPosition extends BoundingBox { textPosition.getYDirAdj() - textHeight, textPosition.getWidthDirAdj(), textHeight + HEIGHT_PADDING); + pos.setBBoxDirAdj(dirAdjPosition); AffineTransform affineTransform = getRotationMatrix(TextDirection.fromDegrees(textPosition.getDir()), textPosition.getPageWidth(), textPosition.getPageHeight()); Rectangle2D bBoxInitialUserSpace = affineTransform.createTransformedShape(dirAdjPosition).getBounds2D(); - pos.setBBoxInitialUserSpace(bBoxInitialUserSpace); // These are definitely correct + pos.setBBoxPdf(bBoxInitialUserSpace); // These are definitely correct return pos; } - private static AffineTransform getRotationMatrix(TextDirection textDirection, float pageWidth, float pageHeight) { AffineTransform transform = new AffineTransform(); @@ -103,32 +89,4 @@ public class RedTextPosition extends BoundingBox { return transform; } - - @JsonIgnore - public float getXDirAdj() { - - return this.bBoxDirAdj.x; - } - - - @JsonIgnore - public float getYDirAdj() { - - return this.bBoxDirAdj.y; - } - - - @JsonIgnore - public float getWidthDirAdj() { - - return this.bBoxDirAdj.width; - } - - - @JsonIgnore - public float getHeightDir() { - - return this.bBoxDirAdj.height; - } - } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextDirection.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextDirection.java index 8d1fa97..9fdd9be 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextDirection.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextDirection.java @@ -44,4 +44,15 @@ public enum TextDirection { throw new 
IllegalArgumentException(String.format("A value of %f is not supported by TextDirection", degrees)); } + + + public int getRotation() { + + return switch (this) { + case ZERO -> 0; + case QUARTER_CIRCLE -> 1; + case HALF_CIRCLE -> 2; + case THREE_QUARTER_CIRCLE -> 3; + }; + } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java index c7c2ae6..d407fd5 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java @@ -8,6 +8,7 @@ import com.fasterxml.jackson.annotation.JsonIgnore; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter; import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType; +import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities; import lombok.AllArgsConstructor; @@ -26,17 +27,19 @@ public class TextPageBlock extends AbstractPageBlock { @Builder.Default private List sequences = new ArrayList<>(); + private Rectangle2D bBoxDirAdj; + private String mostPopularWordFont; private String mostPopularWordStyle; - private float mostPopularWordFontSize; + private double mostPopularWordFontSize; - private float mostPopularWordHeight; + private double mostPopularWordHeight; - private float mostPopularWordSpaceWidth; + private double mostPopularWordSpaceWidth; - private float 
highestFontSize; + private double highestFontSize; private PageBlockType classification; @@ -51,34 +54,24 @@ public class TextPageBlock extends AbstractPageBlock { } - @JsonIgnore public TextDirection getDir() { return sequences.get(0).getDir(); } - @JsonIgnore - public float getPageHeight() { - - return sequences.get(0).getPageHeight(); - } - - - @JsonIgnore - public float getPageWidth() { - - return sequences.get(0).getPageWidth(); - } - - private void calculateBBox() { if (sequences == null) { this.bBox = new Rectangle2D.Double(); - this.bBoxInitialUserSpace = new Rectangle2D.Double(); + this.bBoxPdf = new Rectangle2D.Double(); + this.bBoxDirAdj = new Rectangle2D.Double(); return; } + this.bBoxDirAdj = sequences.stream() + .map(TextPositionSequence::getBBoxDirAdj) + .collect(RectangleTransformations.collectBBox()); + setToBBoxOfComponents(sequences); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPositionSequence.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPositionSequence.java index 4df3cdb..fa2c797 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPositionSequence.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPositionSequence.java @@ -1,6 +1,7 @@ package com.knecon.fforesight.service.layoutparser.processor.model.text; -import java.awt.geom.Rectangle2D; +import static com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition.HEIGHT_PADDING; + import java.util.ArrayList; import java.util.List; import java.util.Locale; @@ -8,8 +9,7 @@ import java.util.stream.Collectors; import org.apache.pdfbox.text.TextPosition; -import 
com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox; -import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; +import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.TextBoundingBox; import lombok.AllArgsConstructor; import lombok.Builder; @@ -23,23 +23,21 @@ import lombok.extern.slf4j.Slf4j; @Builder @NoArgsConstructor @AllArgsConstructor -@EqualsAndHashCode(onlyExplicitlyIncluded = true, callSuper = false) -public class TextPositionSequence extends BoundingBox implements CharSequence { +@EqualsAndHashCode(onlyExplicitlyIncluded = true, callSuper = true) // needs the bbox to be unique +public class TextPositionSequence extends TextBoundingBox implements CharSequence { - public static final int HEIGHT_PADDING = 2; + public static final String STANDARD = "standard"; + public static final String BOLD_ITALIC = "bold, italic"; + public static final String BOLD = "bold"; + public static final String ITALIC = "italic"; @EqualsAndHashCode.Include private int page; @EqualsAndHashCode.Include + @Builder.Default private List textPositions = new ArrayList<>(); - private Rectangle2D bBoxDirAdj; - @EqualsAndHashCode.Include - private TextDirection dir; - private int rotation; - private float pageHeight; - private float pageWidth; private boolean isParagraphStart; private boolean strikethrough; private boolean underline; @@ -51,10 +49,6 @@ public class TextPositionSequence extends BoundingBox implements CharSequence { .map(RedTextPosition::fromTextPosition) .collect(Collectors.toList()); this.page = pageNumber; - this.dir = TextDirection.fromDegrees(textPositions.get(0).getDir()); - this.rotation = textPositions.get(0).getRotation(); - this.pageHeight = textPositions.get(0).getPageHeight(); - this.pageWidth = textPositions.get(0).getPageWidth(); this.isParagraphStart = isParagraphStart; calculateBBox(); } @@ -62,9 +56,6 @@ public class TextPositionSequence extends BoundingBox implements 
CharSequence { private void calculateBBox() { - this.bBoxDirAdj = textPositions.stream() - .map(RedTextPosition::getBBoxDirAdj) - .collect(RectangleTransformations.collectBBox()); setToBBoxOfComponents(getTextPositions()); } @@ -73,10 +64,6 @@ public class TextPositionSequence extends BoundingBox implements CharSequence { this.textPositions = textPositions; this.page = page; - this.dir = TextDirection.fromDegrees(textPositions.get(0).getDir()); - this.rotation = textPositions.get(0).getRotation(); - this.pageHeight = textPositions.get(0).getPageHeight(); - this.pageWidth = textPositions.get(0).getPageWidth(); calculateBBox(); } @@ -112,9 +99,6 @@ public class TextPositionSequence extends BoundingBox implements CharSequence { textPositionSequence.textPositions = textPositions.subList(start, end); textPositionSequence.page = page; textPositionSequence.dir = dir; - textPositionSequence.rotation = rotation; - textPositionSequence.pageHeight = pageHeight; - textPositionSequence.pageWidth = pageWidth; textPositionSequence.setToBBoxOfComponents(getTextPositions()); return textPositionSequence; } @@ -141,10 +125,6 @@ public class TextPositionSequence extends BoundingBox implements CharSequence { this.textPositions.add(textPosition); this.page = textPositionSequence.getPage(); - this.dir = textPositionSequence.getDir(); - this.rotation = textPositionSequence.getRotation(); - this.pageHeight = textPositionSequence.getPageHeight(); - this.pageWidth = textPositionSequence.getPageWidth(); calculateBBox(); } @@ -152,79 +132,18 @@ public class TextPositionSequence extends BoundingBox implements CharSequence { public void add(TextPosition textPosition) { this.textPositions.add(RedTextPosition.fromTextPosition(textPosition)); - this.dir = TextDirection.fromDegrees(textPositions.get(0).getDir()); - this.rotation = textPositions.get(0).getRotation(); - this.pageHeight = textPositions.get(0).getPageHeight(); - this.pageWidth = textPositions.get(0).getPageWidth(); calculateBBox(); } + 
public double getTextHeightNoPadding() { - /** - * This value is adjusted so that 0,0 is upper left and it is adjusted based on the text direction. - * This method ignores the page rotation but takes the text rotation and adjusts the coordinates to awt. - * - * @return the text direction adjusted minX value - */ - - public float getMinXDirAdj() { - - return textPositions.get(0).getXDirAdj(); - + return textPositions.get(0).getHeightDirAdj(); } - /** - * This value is adjusted so that 0,0 is upper left and it is adjusted based on the text direction. - * This method ignores the page rotation but takes the text rotation and adjusts the coordinates to awt. - * - * @return the text direction adjusted maxX value - */ + public double getTextHeight() { - public float getMaxXDirAdj() { - - return textPositions.get(textPositions.size() - 1).getXDirAdj() + textPositions.get(textPositions.size() - 1).getWidthDirAdj() + HEIGHT_PADDING; - - } - - - /** - * This value is adjusted so that 0,0 is upper left and it is adjusted based on the text direction. - * This method ignores the page rotation but takes the text rotation and adjusts the coordinates to awt. - * - * @return the text direction adjusted minY value. The upper border of the bounding box of the word. - */ - - public float getMinYDirAdj() { - - return textPositions.get(0).getYDirAdj() - getTextHeight(); - - } - - - /** - * This value is adjusted so that 0,0 is upper left and it is adjusted based on the text direction. - * This method ignores the page rotation but takes the text rotation and adjusts the coordinates to awt. - * - * @return the text direction adjusted maxY value. The lower border of the bounding box of the word. 
- */ - - public float getMaxYDirAdj() { - - return textPositions.get(0).getYDirAdj(); - - } - - - public float getTextHeightNoPadding() { - - return textPositions.get(0).getHeightDir(); - } - - - public float getTextHeight() { - - return textPositions.get(0).getHeightDir() + HEIGHT_PADDING; + return textPositions.get(0).getHeightDirAdj() + HEIGHT_PADDING; } @@ -240,18 +159,18 @@ public class TextPositionSequence extends BoundingBox implements CharSequence { public String getFontStyle() { if (textPositions.get(0).getFontName() == null) { - return "standard"; + return STANDARD; } String lowercaseFontName = textPositions.get(0).getFontName().toLowerCase(Locale.ROOT); - if (lowercaseFontName.contains("bold") && lowercaseFontName.contains("italic")) { - return "bold, italic"; - } else if (lowercaseFontName.contains("bold")) { - return "bold"; - } else if (lowercaseFontName.contains("italic")) { - return "italic"; + if (lowercaseFontName.contains(BOLD) && lowercaseFontName.contains(ITALIC)) { + return BOLD_ITALIC; + } else if (lowercaseFontName.contains(BOLD)) { + return BOLD; + } else if (lowercaseFontName.contains(ITALIC)) { + return ITALIC; } else { - return "standard"; + return STANDARD; } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/adapter/ImageServiceResponseAdapter.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/adapter/ImageServiceResponseAdapter.java index 7b6c840..3b03a99 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/adapter/ImageServiceResponseAdapter.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/adapter/ImageServiceResponseAdapter.java @@ -56,7 +56,7 @@ public class 
ImageServiceResponseAdapter { classificationPage.getImages().forEach(image -> { if (image.getImageType().equals(ImageType.OTHER)) { for (AbstractPageBlock textblock : classificationPage.getTextBlocks()) { - if (image.getPosition().contains(textblock.getBBoxInitialUserSpace())) { + if (image.getPosition().contains(textblock.getBBoxPdf())) { image.setImageType(ImageType.OCR); return; } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/model/image/Classification.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/model/image/Classification.java index 00b8cd0..220019a 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/model/image/Classification.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/model/image/Classification.java @@ -14,6 +14,7 @@ import lombok.NoArgsConstructor; @AllArgsConstructor public class Classification { + @Builder.Default private Map probabilities = new HashMap<>(); private String label; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/model/image/ImageServiceResponse.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/model/image/ImageServiceResponse.java index 0a7f176..d6cc437 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/model/image/ImageServiceResponse.java +++ 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/model/image/ImageServiceResponse.java @@ -22,8 +22,10 @@ public class ImageServiceResponse { @JsonProperty(value = "imageMetadata") @JsonAlias("data") + @Builder.Default private List data = new ArrayList<>(); + @Builder.Default private List dataCV = new ArrayList<>(); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/model/table/TableData.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/model/table/TableData.java index be1d492..99dc0df 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/model/table/TableData.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/model/table/TableData.java @@ -15,6 +15,7 @@ import lombok.NoArgsConstructor; public class TableData { private PageInfo pageInfo; + @Builder.Default private List tableCells = new ArrayList<>(); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/model/table/TableServiceResponse.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/model/table/TableServiceResponse.java index fbfea32..f98e35e 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/model/table/TableServiceResponse.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/model/table/TableServiceResponse.java @@ -19,7 +19,7 
@@ public class TableServiceResponse { private String operation; private String targetFileExtension; private String responseFileExtension; - + @Builder.Default private List data = new ArrayList<>(); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/GapDetectionService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/GapDetectionService.java index ae3eac1..54fd973 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/GapDetectionService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/GapDetectionService.java @@ -6,7 +6,6 @@ import java.util.List; import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; -import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; import lombok.AllArgsConstructor; import lombok.experimental.UtilityClass; @@ -38,7 +37,7 @@ public class GapDetectionService { for (TextPositionSequence currentTextPosition : sortedTextPositionSequences.subList(1, sortedTextPositionSequences.size())) { double yDifference = Math.abs(currentTextPosition.getMaxYDirAdj() - previousTextPosition.getMaxYDirAdj()); - double xGap = Math.abs(previousTextPosition.getMaxXDirAdj() - currentTextPosition.getMinXDirAdj()); + double xGap = Math.abs(previousTextPosition.getMaxXDirAdj() - currentTextPosition.getXDirAdj()); Rectangle2D previousTextPositionBBox = toRectangle2D(previousTextPosition); Rectangle2D currentTextPositionBBox = toRectangle2D(currentTextPosition); diff --git 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/LineDetectionService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/LineDetectionService.java index ddebaef..6ad8d9f 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/LineDetectionService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/LineDetectionService.java @@ -71,7 +71,7 @@ public class LineDetectionService { private static boolean isXGap(TextPositionSequence currentTextPosition, TextPositionSequence previousTextPosition, double avgTextPositionHeight) { - return Math.abs(previousTextPosition.getMaxXDirAdj() - currentTextPosition.getMinXDirAdj()) > (avgTextPositionHeight * X_GAP_FACTOR); + return Math.abs(previousTextPosition.getMaxXDirAdj() - currentTextPosition.getXDirAdj()) > (avgTextPositionHeight * X_GAP_FACTOR); } @@ -83,7 +83,7 @@ public class LineDetectionService { private static boolean isNewLine(TextPositionSequence currentTextPosition, TextPositionSequence previousTextPosition, double avgTextPositionHeight) { - return Math.abs(previousTextPosition.getMinYDirAdj() - currentTextPosition.getMinYDirAdj()) > avgTextPositionHeight; + return Math.abs(previousTextPosition.getYDirAdj() - currentTextPosition.getYDirAdj()) > avgTextPositionHeight; } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TableExtractionService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TableExtractionService.java index b28a80b..a055bf9 100644 --- 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TableExtractionService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TableExtractionService.java @@ -78,7 +78,7 @@ public class TableExtractionService { List containedCells = new ArrayList<>(); for (Cell c : cells) { - if (c.hasMinimumSize() && area.contains(c.getBBoxInitialUserSpace())) { + if (c.hasMinimumSize() && area.contains(c.getBBoxPdf())) { containedCells.add(c); } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TextRulingsClassifier.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TextRulingsClassifier.java index 7fc2d40..f809833 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TextRulingsClassifier.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TextRulingsClassifier.java @@ -31,13 +31,13 @@ public class TextRulingsClassifier { private static void handleVerticalText(CleanRulings cleanRulings, TextPositionSequence word) { - float lowerY = (float) (word.getBBoxInitialUserSpace().getMinY() + TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth()); - float upperY = (float) (word.getBBoxInitialUserSpace().getMaxY() - TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth()); + float lowerY = (float) (word.getBBoxPdf().getMinY() + TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth()); + float upperY = (float) (word.getBBoxPdf().getMaxY() - TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth()); - float strikethroughCenterX = (float) word.getBBoxInitialUserSpace().getCenterX(); + float strikethroughCenterX = (float) 
word.getBBoxPdf().getCenterX(); float strikethroughBoxHeight = (float) ((word.getBBoxDirAdj().getHeight() * STRIKETHROUGH_ZONE) / 2); - float underlineCenterX = (float) (word.getDir().equals(TextDirection.QUARTER_CIRCLE) ? word.getBBoxInitialUserSpace().getMaxX() : word.getBBoxInitialUserSpace().getMinX()); + float underlineCenterX = (float) (word.getDir().equals(TextDirection.QUARTER_CIRCLE) ? word.getBBoxPdf().getMaxX() : word.getBBoxPdf().getMinX()); float underlineBoxHeight = (float) ((word.getBBoxDirAdj().getHeight() * UNDERLINE_ZONE) / 2); float leftX = Math.min(underlineCenterX - underlineBoxHeight, strikethroughCenterX - strikethroughBoxHeight); @@ -65,13 +65,13 @@ public class TextRulingsClassifier { private static void handleHorizontalText(CleanRulings cleanRulings, TextPositionSequence word) { - float leftX = (float) (word.getBBoxInitialUserSpace().getMinX() + TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth()); - float rightX = (float) (word.getBBoxInitialUserSpace().getMaxX() - TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth()); + float leftX = (float) (word.getBBoxPdf().getMinX() + TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth()); + float rightX = (float) (word.getBBoxPdf().getMaxX() - TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth()); - float strikethroughCenterY = (float) word.getBBoxInitialUserSpace().getCenterY(); + float strikethroughCenterY = (float) word.getBBoxPdf().getCenterY(); float strikethroughBoxHeight = (float) ((word.getBBoxDirAdj().getHeight() * STRIKETHROUGH_ZONE) / 2); - float underlineCenterY = (float) (word.getDir().equals(TextDirection.ZERO) ? word.getBBoxInitialUserSpace().getMinY() : word.getBBoxInitialUserSpace().getMaxY()); + float underlineCenterY = (float) (word.getDir().equals(TextDirection.ZERO) ? 
word.getBBoxPdf().getMinY() : word.getBBoxPdf().getMaxY()); float underlineBoxHeight = (float) ((word.getBBoxDirAdj().getHeight() * UNDERLINE_ZONE) / 2); float lowerY = Math.min(underlineCenterY - underlineBoxHeight, strikethroughCenterY - strikethroughBoxHeight); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/BlockificationPostprocessingService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/BlockificationPostprocessingService.java index 1f13bee..0878584 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/BlockificationPostprocessingService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/BlockificationPostprocessingService.java @@ -2,12 +2,10 @@ package com.knecon.fforesight.service.layoutparser.processor.services.blockifica import static com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocstrumBlockificationService.buildTextBlock; -import java.awt.geom.Rectangle2D; import java.util.ArrayList; import java.util.List; import java.util.ListIterator; import java.util.Locale; -import java.util.function.Function; import org.apache.commons.lang3.StringUtils; import org.springframework.stereotype.Service; @@ -19,8 +17,6 @@ import com.knecon.fforesight.service.layoutparser.processor.model.SectionIdentif import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; -import 
com.knecon.fforesight.service.layoutparser.processor.services.factory.SearchTextWithTextPositionFactory; -import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; import lombok.Data; @@ -29,14 +25,6 @@ public class BlockificationPostprocessingService { private static final float BLOCK_TO_OUTLINE_DISTANCE_THRESHOLD = 5.0f; - private static final Function blockToBoundingBox = (abstractPageBlock) -> abstractPageBlock.getSequences() - .stream() - .map(textPositionSequence -> textPositionSequence.getTextPositions() - .stream() - .map(tp -> SearchTextWithTextPositionFactory.mapRedTextPositionToInitialUserSpace(tp, textPositionSequence)) - .collect(RectangleTransformations.collectBBox())) - .collect(RectangleTransformations.collectBBox()); - public OutlineObject sanitizeOutlineBlocks(ClassificationPage classificationPage, OutlineObject notFoundOutlineObject) { @@ -63,13 +51,13 @@ public class BlockificationPostprocessingService { } if (!contextsOverlap(notFoundOutlineObjectProcessionContext, firstOutlineObjectProcessionContext)) { - notFoundOutlineObject.setFound(selectMatch(classificationPage, notFoundOutlineObjectProcessionContext)); + notFoundOutlineObject.setFound(selectMatch(classificationPage, notFoundOutlineObjectProcessionContext, pageHeight)); } if (firstOutlineObject != null) { // re-create the context for the updated blocks firstOutlineObjectProcessionContext = new OutlineProcessionContext(firstOutlineObject); processTextBlocks(getTextPageBlocks(classificationPage), pageHeight, firstOutlineObjectProcessionContext); - firstOutlineObject.setFound(selectMatch(classificationPage, firstOutlineObjectProcessionContext)); + firstOutlineObject.setFound(selectMatch(classificationPage, firstOutlineObjectProcessionContext, pageHeight)); } } @@ -77,7 +65,7 @@ public class BlockificationPostprocessingService { outlineObjectListIterator.forEachRemaining(outlineObject -> { OutlineProcessionContext outlineObjectProcessionContext = new 
OutlineProcessionContext(outlineObject); processTextBlocks(getTextPageBlocks(classificationPage), pageHeight, outlineObjectProcessionContext); - outlineObject.setFound(selectMatch(classificationPage, outlineObjectProcessionContext)); + outlineObject.setFound(selectMatch(classificationPage, outlineObjectProcessionContext, pageHeight)); }); if (!outlineObjects.isEmpty()) { @@ -160,7 +148,7 @@ public class BlockificationPostprocessingService { } - private boolean selectMatch(ClassificationPage classificationPage, OutlineProcessionContext context) { + private boolean selectMatch(ClassificationPage classificationPage, OutlineProcessionContext context, float pageHeight) { OutlineObject outlineObject = context.outlineObject; TextPageBlock directMatch = context.directMatch; @@ -168,8 +156,8 @@ public class BlockificationPostprocessingService { TextPageBlock splitCandidate = context.splitCandidate; PageBlockType headlineType = PageBlockType.getHeadlineType(outlineObject.getTreeDepth()); - double distanceToDirectMatch = directMatch != null ? calculateDistance(outlineObject, directMatch) : Double.MAX_VALUE; - double distanceToSplitCandidate = splitCandidate != null ? calculateDistance(outlineObject, splitCandidate) : Double.MAX_VALUE; + double distanceToDirectMatch = directMatch != null ? calculateDistance(outlineObject, directMatch, pageHeight) : Double.MAX_VALUE; + double distanceToSplitCandidate = splitCandidate != null ? 
calculateDistance(outlineObject, splitCandidate, pageHeight) : Double.MAX_VALUE; double distanceToBestMergeCandidates = Double.MAX_VALUE; List bestMergeCandidateCombination = new ArrayList<>(); @@ -189,7 +177,7 @@ public class BlockificationPostprocessingService { for (List combination : combinations) { double averageDistance = combination.stream() - .map(block -> calculateDistance(outlineObject, block)) + .map(block -> calculateDistance(outlineObject, block, pageHeight)) .mapToDouble(Double::doubleValue).average() .orElse(Double.MAX_VALUE); if (distanceToBestMergeCandidates > averageDistance) { @@ -273,7 +261,7 @@ public class BlockificationPostprocessingService { List postSequence = new ArrayList<>(); StringBuilder currentSequence = new StringBuilder(); - if (target.isBlank()){ + if (target.isBlank()) { return new WordSequenceResult(); } @@ -418,10 +406,10 @@ public class BlockificationPostprocessingService { } - private double calculateDistance(OutlineObject outlineObject, TextPageBlock pageBlock) { + private double calculateDistance(OutlineObject outlineObject, TextPageBlock pageBlock, float pageHeight) { double deltaX = outlineObject.getPoint().getX() - pageBlock.getMinX(); - double deltaY = pageBlock.getPageHeight() - outlineObject.getPoint().getY() - pageBlock.getMinY(); + double deltaY = pageHeight - outlineObject.getPoint().getY() - pageBlock.getMinY(); return Math.sqrt(deltaX * deltaX + deltaY * deltaY); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java index 6248251..b96b8ff 100644 --- 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java @@ -1,7 +1,6 @@ package com.knecon.fforesight.service.layoutparser.processor.services.blockification; import java.util.ArrayList; -import java.util.Comparator; import java.util.List; import java.util.ListIterator; @@ -10,7 +9,6 @@ import org.springframework.stereotype.Service; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; import com.knecon.fforesight.service.layoutparser.processor.docstrum.DocstrumSegmentationService; import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone; -import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.DoubleUtils; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; @@ -40,7 +38,7 @@ public class DocstrumBlockificationService { CleanRulings usedRulings = rulings.withoutTextRulings(); - var zones = docstrumSegmentationService.segmentPage(textPositions, xyOrder, usedRulings, visualizations); + List zones = docstrumSegmentationService.segmentPage(textPositions, xyOrder, usedRulings, visualizations); if (!textPositions.isEmpty()) { visualizations.addZoneVisualizations(zones, textPositions.get(0).getPage()); @@ -48,11 +46,7 @@ public class DocstrumBlockificationService { visualizations.addCharactersWithNeighbours(zones, textPositions.get(0).getPage()); } - var pageBlocks = toAbstractPageBlocks(zones, xyOrder, usedRulings); - - if (xyOrder) { - sortPageBlocksXThenY(pageBlocks); - } + var pageBlocks = 
toAbstractPageBlocks(zones); var classificationPage = new ClassificationPage(pageBlocks); classificationPage.setCleanRulings(rulings); @@ -73,21 +67,7 @@ public class DocstrumBlockificationService { } - private static void sortPageBlocksXThenY(List pageBlocks) { - - pageBlocks.sort(Comparator.comparing(AbstractPageBlock::getMinY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)) - .thenComparing(AbstractPageBlock::getMinX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))); - pageBlocks.sort(new Comparator() { - @Override - public int compare(AbstractPageBlock o1, AbstractPageBlock o2) { - - return Math.abs(o1.getMinY() - o2.getMinY()) < 5 && o1.getMinX() < o2.getMinX() == true ? -1 : 0; - } - }); - } - - - private List toAbstractPageBlocks(List zones, boolean xyOrder, CleanRulings usedRulings) { + private List toAbstractPageBlocks(List zones) { List abstractPageBlocks = new ArrayList<>(); zones.forEach(zone -> { @@ -296,6 +276,10 @@ public class DocstrumBlockificationService { continue; } +// if (!current.getMostPopularWordStyle().equals(inner.getMostPopularWordStyle())) { +// continue; +// } + if (current.getDir() == inner.getDir() && current.intersects(inner, yThreshold, xThreshold)) { boolean toDuplicate = current.isToDuplicate() || inner.isToDuplicate(); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocuMineBlockificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocuMineBlockificationService.java index 9c8def4..6a23fc8 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocuMineBlockificationService.java +++ 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocuMineBlockificationService.java @@ -1,9 +1,6 @@ package com.knecon.fforesight.service.layoutparser.processor.services.blockification; -import static java.util.stream.Collectors.toSet; - import java.util.ArrayList; -import java.util.Comparator; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -13,10 +10,8 @@ import org.springframework.stereotype.Service; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; -import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter; import com.knecon.fforesight.service.layoutparser.processor.model.Orientation; import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; -import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; @@ -44,31 +39,30 @@ public class DocuMineBlockificationService { CleanRulings usedRulings = cleanRulings.withoutTextRulings(); - float minX = 1000; - float maxX = 0; - float minY = 1000; - float maxY = 0; + double minX = 1000; + double maxX = 0; + double minY = 1000; + double maxY = 0; TextPositionSequence prev = null; boolean wasSplitted = false; - Float splitX1 = null; + Double splitX1 = null; for (TextPositionSequence word : textPositions) { - boolean lineSeparation = prev != null && word.getMinYDirAdj() - maxY > Math.min(word.getHeight(), prev.getHeight()) * 1.25; - boolean startFromTop = prev != null && word.getMinYDirAdj() < prev.getMinYDirAdj() - prev.getTextHeight(); - boolean splitByX = prev != null && maxX + 50 < word.getMinXDirAdj() 
&& prev.getMinYDirAdj() == word.getMinYDirAdj(); - boolean negativeXGap = prev != null && word.getMinXDirAdj() - minX < -5; - boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj(); + boolean lineSeparation = prev != null && word.getYDirAdj() - prev.getMaxYDirAdj() > Math.min(word.getHeight(), prev.getHeight()) * 1.25; + boolean startFromTop = prev != null && word.getYDirAdj() < prev.getYDirAdj() - prev.getTextHeight(); + boolean splitByX = prev != null && maxX + 50 < word.getXDirAdj() && prev.getYDirAdj() == word.getYDirAdj(); + boolean negativeXGap = prev != null && word.getXDirAdj() - minX < -5; + boolean newLineAfterSplit = prev != null && word.getYDirAdj() != prev.getYDirAdj() && wasSplitted && splitX1 != word.getXDirAdj(); boolean isSplitByRuling = prev != null && usedRulings.lineBetween(prev, word); boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir()); - boolean splitByOtherFontAndOtherY = prev != null && Math.abs(prev.getMaxYDirAdj() - word.getMaxYDirAdj()) > word.getTextHeight() * 0.2 && (word.getFontStyle() - .contains("bold") - && !prev.getFontStyle() - .contains("bold") || prev.getFontStyle().contains("bold") && !word.getFontStyle().contains("bold")); + boolean splitByOtherFontAndOtherY = prev != null && Math.abs(prev.getMaxYDirAdj() - word.getMaxYDirAdj()) > word.getTextHeight() * 0.2 // + && (word.getFontStyle().contains("bold") && !prev.getFontStyle().contains("bold") // + || prev.getFontStyle().contains("bold") && !word.getFontStyle().contains("bold")); Matcher matcher = pattern.matcher(chunkWords.stream() .collect(Collectors.joining(" ")).toString()); - boolean startsOnSameX = Math.abs(minX - word.getMinXDirAdj()) < 5 && matcher.matches(); + boolean startsOnSameX = Math.abs(minX - word.getXDirAdj()) < 5 && matcher.matches(); if (prev != null && (lineSeparation || startFromTop || splitByDir || isSplitByRuling || splitByOtherFontAndOtherY || 
negativeXGap || startsOnSameX)) { @@ -84,7 +78,7 @@ public class DocuMineBlockificationService { if (splitByX && !isSplitByRuling) { wasSplitted = true; cb1.setOrientation(Orientation.LEFT); - splitX1 = word.getMinXDirAdj(); + splitX1 = word.getXDirAdj(); } else if (newLineAfterSplit && !isSplitByRuling) { wasSplitted = false; cb1.setOrientation(Orientation.RIGHT); @@ -107,14 +101,14 @@ public class DocuMineBlockificationService { chunkWords.add(word); prev = word; - if (word.getMinXDirAdj() < minX) { - minX = word.getMinXDirAdj(); + if (word.getXDirAdj() < minX) { + minX = word.getXDirAdj(); } if (word.getMaxXDirAdj() > maxX) { maxX = word.getMaxXDirAdj(); } - if (word.getMinYDirAdj() < minY) { - minY = word.getMinYDirAdj(); + if (word.getYDirAdj() < minY) { + minY = word.getYDirAdj(); } if (word.getMaxYDirAdj() > maxY) { maxY = word.getMaxYDirAdj(); @@ -126,7 +120,5 @@ public class DocuMineBlockificationService { return new ClassificationPage(textPageBlocks); } - - } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/RedactManagerBlockificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/RedactManagerBlockificationService.java index 35d4edc..5394b22 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/RedactManagerBlockificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/RedactManagerBlockificationService.java @@ -38,18 +38,18 @@ public class RedactManagerBlockificationService { List chunkWords = new ArrayList<>(); List chunkBlockList = new ArrayList<>(); - float minX = 1000, maxX = 0, minY = 1000, maxY = 0; + double minX = 1000, 
maxX = 0, minY = 1000, maxY = 0; TextPositionSequence prev = null; boolean wasSplitted = false; - Float splitX1 = null; + Double splitX1 = null; for (TextPositionSequence word : textPositions) { - boolean lineSeparation = word.getMinYDirAdj() - maxY > word.getHeight() * 1.25; - boolean startFromTop = prev != null && word.getMinYDirAdj() < prev.getMinYDirAdj() - prev.getTextHeight(); - boolean splitByX = prev != null && maxX + 50 < word.getMinXDirAdj() && prev.getMinYDirAdj() == word.getMinYDirAdj(); - boolean xIsBeforeFirstX = prev != null && word.getMinXDirAdj() < minX; - boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj(); + boolean lineSeparation = word.getYDirAdj() - maxY > word.getHeight() * 1.25; + boolean startFromTop = prev != null && word.getYDirAdj() < prev.getYDirAdj() - prev.getTextHeight(); + boolean splitByX = prev != null && maxX + 50 < word.getXDirAdj() && prev.getYDirAdj() == word.getYDirAdj(); + boolean xIsBeforeFirstX = prev != null && word.getXDirAdj() < minX; + boolean newLineAfterSplit = prev != null && word.getYDirAdj() != prev.getYDirAdj() && wasSplitted && splitX1 != word.getXDirAdj(); boolean isSplitByRuling = prev != null && usedRulings.lineBetween(prev, word); boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir()); @@ -69,7 +69,7 @@ public class RedactManagerBlockificationService { if (splitByX && !isSplitByRuling) { wasSplitted = true; cb1.setOrientation(Orientation.LEFT); - splitX1 = word.getMinXDirAdj(); + splitX1 = word.getXDirAdj(); } else if (newLineAfterSplit && !isSplitByRuling) { wasSplitted = false; cb1.setOrientation(Orientation.RIGHT); @@ -92,14 +92,14 @@ public class RedactManagerBlockificationService { chunkWords.add(word); prev = word; - if (word.getMinXDirAdj() < minX) { - minX = word.getMinXDirAdj(); + if (word.getXDirAdj() < minX) { + minX = word.getXDirAdj(); } if (word.getMaxXDirAdj() > maxX) { maxX = 
word.getMaxXDirAdj(); } - if (word.getMinYDirAdj() < minY) { - minY = word.getMinYDirAdj(); + if (word.getYDirAdj() < minY) { + minY = word.getYDirAdj(); } if (word.getMaxYDirAdj() > maxY) { maxY = word.getMaxYDirAdj(); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/ClarifyndClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/ClarifyndClassificationService.java index 8b98b65..cf2be14 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/ClarifyndClassificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/ClarifyndClassificationService.java @@ -23,7 +23,7 @@ public class ClarifyndClassificationService { public void classifyDocument(ClassificationDocument document) { - List headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular(); + List headlineFontSizes = document.getFontSizeCounter().getHigherThanMostPopular(); log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue()); @@ -35,7 +35,7 @@ public class ClarifyndClassificationService { } - private void classifyPage(HeadlineClassificationService headlineClassificationService, ClassificationPage page, ClassificationDocument document, List headlineFontSizes) { + private void classifyPage(HeadlineClassificationService headlineClassificationService, ClassificationPage page, ClassificationDocument document, List headlineFontSizes) { for (AbstractPageBlock textBlock : page.getTextBlocks()) { if (textBlock instanceof TextPageBlock) { @@ -45,7 +45,7 @@ public class ClarifyndClassificationService { } - private void 
classifyBlock(HeadlineClassificationService headlineClassificationService, TextPageBlock textBlock, ClassificationPage page, ClassificationDocument document, List headlineFontSizes) { + private void classifyBlock(HeadlineClassificationService headlineClassificationService, TextPageBlock textBlock, ClassificationPage page, ClassificationDocument document, List headlineFontSizes) { var bodyTextFrame = page.getBodyTextFrame(); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java index ea019bd..3554372 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java @@ -31,7 +31,7 @@ public class DocuMineClassificationService { public void classifyDocument(ClassificationDocument document) { - List headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular(); + List headlineFontSizes = document.getFontSizeCounter().getHigherThanMostPopular(); log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue()); @@ -46,7 +46,7 @@ public class DocuMineClassificationService { private void classifyPage(HeadlineClassificationService headlineClassificationService, ClassificationPage page, ClassificationDocument document, - List headlineFontSizes) { + List headlineFontSizes) { for (AbstractPageBlock textBlock : page.getTextBlocks()) { if (textBlock instanceof TextPageBlock) { @@ -60,7 +60,7 @@ public class 
DocuMineClassificationService { TextPageBlock textBlock, ClassificationPage page, ClassificationDocument document, - List headlineFontSizes) { + List headlineFontSizes) { log.debug("headlineFontSizes: {}", headlineFontSizes); var bodyTextFrame = page.getBodyTextFrame(); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/RedactManagerClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/RedactManagerClassificationService.java index 93f966c..205002a 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/RedactManagerClassificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/RedactManagerClassificationService.java @@ -25,7 +25,7 @@ public class RedactManagerClassificationService { public void classifyDocument(ClassificationDocument document) { - List headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular(); + List headlineFontSizes = document.getFontSizeCounter().getHigherThanMostPopular(); log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue()); @@ -37,7 +37,7 @@ public class RedactManagerClassificationService { } - private void classifyPage(HeadlineClassificationService headlineClassificationService, ClassificationPage page, ClassificationDocument document, List headlineFontSizes) { + private void classifyPage(HeadlineClassificationService headlineClassificationService, ClassificationPage page, ClassificationDocument document, List headlineFontSizes) { for (AbstractPageBlock textBlock : page.getTextBlocks()) { if (textBlock instanceof TextPageBlock) { @@ -47,7 +47,7 @@ 
public class RedactManagerClassificationService { } - private void classifyBlock(HeadlineClassificationService headlineClassificationService, TextPageBlock textBlock, ClassificationPage page, ClassificationDocument document, List headlineFontSizes) { + private void classifyBlock(HeadlineClassificationService headlineClassificationService, TextPageBlock textBlock, ClassificationPage page, ClassificationDocument document, List headlineFontSizes) { var bodyTextFrame = page.getBodyTextFrame(); @@ -56,7 +56,7 @@ public class RedactManagerClassificationService { return; } if (document.getFontSizeCounter().getMostPopular() == null) { - textBlock.setClassification(PageBlockType.OTHER); + textBlock.setClassification(PageBlockType.PARAGRAPH); return; } @@ -129,7 +129,7 @@ public class RedactManagerClassificationService { } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)) { textBlock.setClassification(PageBlockType.PARAGRAPH_UNKNOWN); } else { - textBlock.setClassification(PageBlockType.OTHER); + textBlock.setClassification(PageBlockType.PARAGRAPH); } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java index a6dd95e..113d55a 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java @@ -111,10 +111,10 @@ public class DocumentGraphFactory { textBlocks.add(originalTextBlock); textBlocks.addAll(textBlocksToMerge); - AtomicTextBlock textBlock = 
context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(textBlocks), node, context, page); + AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock2(TextPositionOperations.mergeAndSort(textBlocks), node, context, page); if (node instanceof DuplicatedParagraph duplicatedParagraph) { - AtomicTextBlock unsortedTextBlock = context.textBlockFactory.buildAtomicTextBlock(textBlocks.stream() + AtomicTextBlock unsortedTextBlock = context.textBlockFactory.buildAtomicTextBlock2(textBlocks.stream() .flatMap(tb -> tb.getSequences() .stream()) .collect(Collectors.toList()), node, context, page); @@ -191,7 +191,7 @@ public class DocumentGraphFactory { Page page = context.getPage(textBlocks.get(0).getPage()); Footer footer = Footer.builder().documentTree(context.getDocumentTree()).build(); - AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeTextPositionSequence(textBlocks), footer, context, page); + AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock2(TextPositionOperations.merge(textBlocks), footer, context, page); List tocId = context.getDocumentTree().createNewMainEntryAndReturnId(footer); footer.setTreeId(tocId); footer.setLeafTextBlock(textBlock); @@ -203,7 +203,7 @@ public class DocumentGraphFactory { Page page = context.getPage(textBlocks.get(0).getPage()); Header header = Header.builder().documentTree(context.getDocumentTree()).build(); - AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeTextPositionSequence(textBlocks), header, 0, page); + AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.merge(textBlocks), header, 0, page); List tocId = context.getDocumentTree().createNewMainEntryAndReturnId(header); header.setTreeId(tocId); header.setLeafTextBlock(textBlock); diff --git 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SearchTextWithTextPositionFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SearchTextWithTextPositionFactory.java index 9fc0381..3a76c39 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SearchTextWithTextPositionFactory.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SearchTextWithTextPositionFactory.java @@ -1,6 +1,7 @@ package com.knecon.fforesight.service.layoutparser.processor.services.factory; -import java.awt.geom.AffineTransform; +import static com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition.HEIGHT_PADDING; + import java.awt.geom.Rectangle2D; import java.util.Collection; import java.util.Collections; @@ -11,7 +12,6 @@ import java.util.Objects; import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange; import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition; -import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import lombok.experimental.UtilityClass; @@ -19,14 +19,13 @@ import lombok.experimental.UtilityClass; @UtilityClass public class SearchTextWithTextPositionFactory { - public final int HEIGHT_PADDING = 2; // when checking for a hyphen linebreak, we need to check after a linebreak if the last hyphen was less than three symbols away. // We detect a linebreak as either a "\n" character or if two adjacent symbol's position differ in y-coordinates by at least one character height. 
// If there is a hyphen linebreak, the hyphen will be 1 position in front of a "\n" or 2 positions in front of the character which has a lower y-coordinate // This is why, we need to initialize this to < -2, otherwise, if the very first symbol is a \n we would detect a hyphen linebreak that isn't there. // Also, Integer.MIN_VALUE is a bad idea due to potential overflow during arithmetic operations. This is why the default should be -3. public final int MAX_HYPHEN_LINEBREAK_DISTANCE = 3; - public static final double LINEBREAK_DELTA_TOLERANCE = 1.05; + public static final double LINEBREAK_DELTA_TOLERANCE = 1.5; public SearchTextWithTextPositionDto buildSearchTextToTextPositionDto(List sequences) { @@ -38,15 +37,13 @@ public class SearchTextWithTextPositionFactory { Context context = new Context(); - RedTextPosition currentTextPosition = sequences.get(0).getTextPositions() - .get(0); + RedTextPosition currentTextPosition = sequences.get(0).getTextPositions().get(0); RedTextPosition previousTextPosition = RedTextPosition.builder().unicode(" ").bBoxDirAdj(currentTextPosition.getBBoxDirAdj()).build(); for (TextPositionSequence word : sequences) { for (int i = 0; i < word.getTextPositions().size(); ++i) { - currentTextPosition = word.getTextPositions() - .get(i); + currentTextPosition = word.getTextPositions().get(i); if (isLineBreak(currentTextPosition, previousTextPosition)) { removeHyphenLinebreaks(context); context.lineBreaksStringIdx.add(context.stringIdx); @@ -68,11 +65,10 @@ public class SearchTextWithTextPositionFactory { ++context.stringIdx; } - List positions = sequences.stream() .map(TextPositionSequence::getTextPositions) .flatMap(Collection::stream) - .map(RedTextPosition::getBBoxInitialUserSpace) + .map(RedTextPosition::getBBoxPdf) .toList(); assert context.stringBuilder.length() == context.stringIdxToPositionIdx.size(); @@ -161,8 +157,8 @@ public class SearchTextWithTextPositionFactory { return false; } - double deltaY = 
Math.abs(currentPosition.getYDirAdj() - previousPosition.getYDirAdj()) * LINEBREAK_DELTA_TOLERANCE; - return deltaY >= currentPosition.getHeightDir() || deltaY >= previousPosition.getHeightDir(); + double deltaY = (Math.abs(currentPosition.getYDirAdj() - previousPosition.getYDirAdj()) * LINEBREAK_DELTA_TOLERANCE) + (2 * HEIGHT_PADDING); + return deltaY >= currentPosition.getHeightDirAdj() || deltaY >= previousPosition.getHeightDirAdj(); } @@ -188,32 +184,6 @@ public class SearchTextWithTextPositionFactory { } - public Rectangle2D mapRedTextPositionToInitialUserSpace(RedTextPosition textPosition, TextPositionSequence sequence) { - - float textHeight = sequence.getTextHeight() + HEIGHT_PADDING; - Rectangle2D rectangle2D = new Rectangle2D.Double(textPosition.getXDirAdj(), - textPosition.getYDirAdj() - textHeight, - textPosition.getWidthDirAdj(), - textHeight + HEIGHT_PADDING); - - AffineTransform transform = new AffineTransform(); - - if (sequence.getDir() == TextDirection.ZERO || sequence.getDir() == TextDirection.HALF_CIRCLE) { - transform.rotate(sequence.getDir().getRadians(), sequence.getPageWidth() / 2f, sequence.getPageHeight() / 2f); - transform.translate(0f, sequence.getPageHeight()); - } else if (sequence.getDir() == TextDirection.QUARTER_CIRCLE) { - transform.rotate(sequence.getDir().getRadians(), sequence.getPageWidth() / 2f, sequence.getPageWidth() / 2f); - transform.translate(0f, sequence.getPageWidth()); - } else { - transform.rotate(sequence.getDir().getRadians(), sequence.getPageHeight() / 2f, sequence.getPageHeight() / 2f); - transform.translate(0f, sequence.getPageWidth()); - } - transform.scale(1., -1.); - - return transform.createTransformedShape(rectangle2D).getBounds2D(); - } - - private class Context { List stringIdxToPositionIdx = new LinkedList<>(); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SectionNodeFactory.java 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SectionNodeFactory.java index 195d114..507f8af 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SectionNodeFactory.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SectionNodeFactory.java @@ -234,7 +234,7 @@ public class SectionNodeFactory { .filter(abstractTextContainer -> !abstractTextContainer.equals(atc)) .filter(abstractTextContainer -> abstractTextContainer.getPage() == atc.getPage()) .filter(abstractTextContainer -> abstractTextContainer instanceof TextPageBlock) - .filter(abstractTextContainer -> abstractTextContainer.intersectsY(atc)) + .filter(abstractTextContainer -> abstractTextContainer.intersectsYPdf(atc)) .map(abstractTextContainer -> (TextPageBlock) abstractTextContainer) .filter(abstractTextContainer -> abstractTextContainer.getDir() == atc.getDir()) .filter(abstractTextContainer -> !abstractTextContainer.isToDuplicate()) diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/TableNodeFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/TableNodeFactory.java index b78e53b..1060a68 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/TableNodeFactory.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/TableNodeFactory.java @@ -136,7 +136,7 @@ public class TableNodeFactory { .row(rowIndex) .col(colIndex) .header(cell.isHeaderCell()) - 
.bBox(cell.getBBoxInitialUserSpace()) + .bBox(cell.getBBoxPdf()) .build(); page.getMainBody().add(tableCell); @@ -148,7 +148,7 @@ public class TableNodeFactory { tableCell.setLeafTextBlock(context.getTextBlockFactory().emptyTextBlock(tableNode, context, page)); } else if (cell.getTextBlocks().size() == 1) { textBlock = context.getTextBlockFactory() - .buildAtomicTextBlock(cell.getTextBlocks() + .buildAtomicTextBlock2(cell.getTextBlocks() .get(0).getSequences(), tableCell, context, page); tableCell.setLeafTextBlock(textBlock); } else if (firstTextBlockIsHeadline(cell)) { @@ -163,8 +163,8 @@ public class TableNodeFactory { context, document); } else if (cellAreaIsSmallerThanPageAreaTimesThreshold(cell, page)) { - List sequences = TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(cell.getTextBlocks()); - textBlock = context.getTextBlockFactory().buildAtomicTextBlock(sequences, tableCell, context, page); + List sequences = TextPositionOperations.mergeAndSort(cell.getTextBlocks()); + textBlock = context.getTextBlockFactory().buildAtomicTextBlock2(sequences, tableCell, context, page); tableCell.setLeafTextBlock(textBlock); } else { cell.getTextBlocks() diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/TextBlockFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/TextBlockFactory.java index 6a0268c..5357fce 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/TextBlockFactory.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/TextBlockFactory.java @@ -17,7 +17,7 @@ public class TextBlockFactory { long textBlockIdx; - public AtomicTextBlock buildAtomicTextBlock(List sequences, 
SemanticNode parent, DocumentGraphFactory.Context context, Page page) { + public AtomicTextBlock buildAtomicTextBlock2(List sequences, SemanticNode parent, DocumentGraphFactory.Context context, Page page) { Integer numberOnPage = context.getAndIncrementTextBlockNumberOnPage(page); return buildAtomicTextBlock(sequences, parent, numberOnPage, page); @@ -32,27 +32,27 @@ public class TextBlockFactory { long idx = textBlockIdx; textBlockIdx++; String orientation; - int textDirection; + int textRotation; if (sequences.isEmpty()) { orientation = null; - textDirection = 0; + textRotation = 0; } else { orientation = sequences.get(0).getDir().toString(); - textDirection = sequences.get(0).getRotation(); + textRotation = sequences.get(0).getDir().getRotation(); } return AtomicTextBlock.fromSearchTextWithTextPosition(searchTextWithTextPositionDto.getSearchText(), - searchTextWithTextPositionDto.getLineBreaks(), - searchTextWithTextPositionDto.getBoldTextBoundaries(), - searchTextWithTextPositionDto.getItalicTextBoundaries(), - searchTextWithTextPositionDto.getPositions(), - searchTextWithTextPositionDto.getStringIdxToPositionIdx(), - idx, - parent, - numberOnPage, - page, - offset, - orientation, - textDirection); + searchTextWithTextPositionDto.getLineBreaks(), + searchTextWithTextPositionDto.getBoldTextBoundaries(), + searchTextWithTextPositionDto.getItalicTextBoundaries(), + searchTextWithTextPositionDto.getPositions(), + searchTextWithTextPositionDto.getStringIdxToPositionIdx(), + idx, + parent, + numberOnPage, + page, + offset, + orientation, + textRotation); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/graphics/GraphicExtractorService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/graphics/GraphicExtractorService.java index 5e1cd2e..42c483a 100644 --- 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/graphics/GraphicExtractorService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/graphics/GraphicExtractorService.java @@ -8,12 +8,11 @@ import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; import org.springframework.stereotype.Service; +import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox; import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; -import com.knecon.fforesight.service.layoutparser.processor.services.factory.SearchTextWithTextPositionFactory; import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation; -import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; import lombok.RequiredArgsConstructor; import lombok.SneakyThrows; @@ -67,10 +66,7 @@ public class GraphicExtractorService { private List getCharacterBBoxes(List textPositionSequences) { return textPositionSequences.stream() - .map(pos -> pos.getTextPositions() - .stream() - .map(tp -> SearchTextWithTextPositionFactory.mapRedTextPositionToInitialUserSpace(tp, pos)) - .collect(RectangleTransformations.collectBBox())) + .map(BoundingBox::getBBoxPdf) .map(Box::new) .collect(Collectors.toList()); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/HeaderFooterDetection.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/HeaderFooterDetection.java index 24ed41d..f010a98 100644 --- 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/HeaderFooterDetection.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/HeaderFooterDetection.java @@ -96,7 +96,7 @@ public class HeaderFooterDetection { continue; } - int distance = calculateHammingDistanceWithPreprocessing(testString, paddedString); + int distance = StringDistances.hamming(testString, paddedString); double normalizedScore = 1 - (double) distance / Math.max(testString.length(), paddedString.length()); score += normalizedScore * (j < weights.length ? weights[j] : 1); } @@ -180,44 +180,4 @@ public class HeaderFooterDetection { return headerCandidates; } - - /** - * Calculate the Hamming distance between two strings after preprocessing to make them the same length - * and replacing all digits with a special character '@' since they are a common occurrence in headers/footers. - * - * @param firstCandidate First string - * @param secondCandidate Second string - * @return The Hamming distance between the two preprocessed strings. 
- */ - private int calculateHammingDistanceWithPreprocessing(String firstCandidate, String secondCandidate) { - - int maxLength = Math.max(firstCandidate.length(), secondCandidate.length()); - - String cleanFirstCandidate = padString(firstCandidate, maxLength, '\0').replaceAll("\\d", "@"); - String cleanSecondCandidate = padString(secondCandidate, maxLength, '\0').replaceAll("\\d", "@"); - - int distance = 0; - for (int i = 0; i < maxLength; i++) { - if (cleanFirstCandidate.charAt(i) != cleanSecondCandidate.charAt(i)) { - distance++; - } - } - return distance; - } - - - private String padString(String input, int length, char padChar) { - - if (input.length() >= length) { - return input; - } - - StringBuilder sb = new StringBuilder(input); - - while (sb.length() < length) { - sb.append(padChar); - } - return sb.toString(); - } - } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/MarkedContentUtils.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/MarkedContentUtils.java index 3e87eb4..015f5bb 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/MarkedContentUtils.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/MarkedContentUtils.java @@ -48,7 +48,7 @@ public class MarkedContentUtils { return markedContentByYPosition.values() .stream() - .map(textPositions -> new TextPositionSequence(textPositions, 0, true).getBBoxInitialUserSpace()) + .map(textPositions -> new TextPositionSequence(textPositions, 0, true).getBBoxPdf()) .map(t -> new Rectangle2D.Double(t.getX(), t.getY() - Math.abs(t.getHeight()), t.getWidth(), Math.abs(t.getHeight()))) .collect(Collectors.toList()); } @@ -90,7 +90,7 @@ public class MarkedContentUtils { 
.map(content -> (TextPosition) content) .filter(content -> !content.getUnicode().equals(" ")) .map(textPositions -> new TextPositionSequence(List.of(textPositions), 0, true)) - .map(BoundingBox::getBBoxInitialUserSpace) + .map(BoundingBox::getBBoxPdf) .collect(Collectors.toList()); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/PositionUtils.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/PositionUtils.java index c22e2bb..a3e20d0 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/PositionUtils.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/PositionUtils.java @@ -108,7 +108,7 @@ public final class PositionUtils { } - public float getHeightDifferenceBetweenChunkWordAndDocumentWord(TextPageBlock textBlock, Float documentMostPopularWordHeight) { + public double getHeightDifferenceBetweenChunkWordAndDocumentWord(TextPageBlock textBlock, Double documentMostPopularWordHeight) { return textBlock.getMostPopularWordHeight() - documentMostPopularWordHeight; } @@ -116,7 +116,7 @@ public final class PositionUtils { public double getApproxLineCount(TextPageBlock textBlock) { - return textBlock.getHeight() / textBlock.getMostPopularWordHeight(); + return textBlock.getBBoxDirAdj().getHeight() / textBlock.getMostPopularWordHeight(); } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/SpreadsheetFinder.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/SpreadsheetFinder.java index b072352..048d020 100644 --- 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/SpreadsheetFinder.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/SpreadsheetFinder.java @@ -28,7 +28,7 @@ public class SpreadsheetFinder { Map edgesV = new HashMap<>(); for (Cell cell : cells) { - for (Point2D pt : getPoints(cell.getBBoxInitialUserSpace())) { + for (Point2D pt : getPoints(cell.getBBoxPdf())) { if (pointSet.contains(pt)) { // shared vertex, remove it pointSet.remove(pt); } else { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/StringDistances.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/StringDistances.java new file mode 100644 index 0000000..f224276 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/StringDistances.java @@ -0,0 +1,49 @@ +package com.knecon.fforesight.service.layoutparser.processor.utils; + +import lombok.experimental.UtilityClass; + +@UtilityClass +public class StringDistances { + + + /** + * Calculate the Hamming distance between two strings after preprocessing to make them the same length + * and replacing all digits with a special character '@' since they are a common occurrence in headers/footers. + * + * @param s1 First string + * @param s2 Second string + * @return The Hamming distance between the two preprocessed strings. 
+ */ + public int hamming(String s1, String s2) { + + int maxLength = Math.max(s1.length(), s2.length()); + + String cleanFirstCandidate = padString(s1, maxLength, '\0').replaceAll("\\d", "@"); + String cleanSecondCandidate = padString(s2, maxLength, '\0').replaceAll("\\d", "@"); + + int distance = 0; + for (int i = 0; i < maxLength; i++) { + if (cleanFirstCandidate.charAt(i) != cleanSecondCandidate.charAt(i)) { + distance++; + } + } + return distance; + } + + + private String padString(String input, int length, char padChar) { + + if (input.length() >= length) { + return input; + } + + StringBuilder sb = new StringBuilder(input); + + while (sb.length() < length) { + sb.append(padChar); + } + return sb.toString(); + } + + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionOperations.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionOperations.java index 5ab1843..fae4eba 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionOperations.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionOperations.java @@ -1,30 +1,136 @@ package com.knecon.fforesight.service.layoutparser.processor.utils; +import java.awt.geom.Rectangle2D; +import java.awt.geom.RectangularShape; +import java.util.Collection; +import java.util.Comparator; +import java.util.HashSet; import java.util.List; +import java.util.Set; import java.util.stream.Collectors; +import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.AngleFilter; +import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.TextBoundingBox; +import 
com.knecon.fforesight.service.layoutparser.processor.docstrum.model.UnionFind; +import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.DoubleUtils; +import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.FastAtan2; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; +import lombok.experimental.UtilityClass; + +@UtilityClass public class TextPositionOperations { - private static final TextPositionSequenceComparator comparator = new TextPositionSequenceComparator(); + public static final double ANGLE_TOLERANCE = Math.PI / 35; + public static final AngleFilter ANGLE_FILTER = new AngleFilter(-ANGLE_TOLERANCE, ANGLE_TOLERANCE); + public static final double MAX_LINE_HEIGHT_FACTOR = 0.66; // multiplied with max word height + public static final double MAX_WORD_DISTANCE_FACTOR = 3.5; // multiplied with max word width + + private static final double THRESHOLD = 5; + private static final Comparator COMPARATOR_DIR_ADJ = // + Comparator.comparing(TextBoundingBox::getDir) + .thenComparing(TextBoundingBox::getYDirAdj, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)) + .thenComparing(TextBoundingBox::getXDirAdj, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)); - public static List mergeAndSortTextPositionSequenceByYThenX(List textBlocks) { + public List mergeAndSort(List textBlocks) { - var sequence = textBlocks.stream().flatMap(tb -> tb.getSequences().stream()).collect(Collectors.toList()); - - // because the TextPositionSequenceComparator is not transitive, but - // JDK7+ enforces transitivity on comparators, we need to use - // a custom quicksort implementation (which is slower, unfortunately). 
- QuickSort.sort(sequence, comparator); - return sequence; + var sequences = textBlocks.stream() + .flatMap(tb -> tb.getSequences() + .stream()) + .collect(Collectors.toSet()); + return sortUsingLineDetection(sequences); } - public static List mergeTextPositionSequence(List textBlocks) { - return textBlocks.stream().flatMap(tb -> tb.getSequences().stream()).collect(Collectors.toList()); + public List sort(List sequences) { + + return sortUsingLineDetection(new HashSet<>(sequences)); + } + + + private List sortUsingLineDetection(Set sequences) { + + return groupByLine(sequences).stream() + .map(TextPositionOperations::sortByXDirAdj) + .filter(line -> !line.isEmpty()) + .sorted(Comparator.comparing(line -> line.get(0), COMPARATOR_DIR_ADJ)) + .flatMap(Collection::stream) + .toList(); + + } + + + private List sortByXDirAdj(Set line) { + + return line.stream() + .sorted(Comparator.comparing(TextPositionSequence::getXDirAdj)) + .toList(); + } + + + private Collection> groupByLine(Set sequences) { + + double maxLineDistance = sequences.stream() + .map(TextPositionSequence::getBBoxDirAdj) + .mapToDouble(RectangularShape::getHeight).average() + .orElse(10) * MAX_LINE_HEIGHT_FACTOR; + double maxXGap = sequences.stream() + .map(TextPositionSequence::getBBoxDirAdj) + .mapToDouble(RectangularShape::getWidth).average() + .orElse(75) * MAX_WORD_DISTANCE_FACTOR; + + UnionFind unionFind = new UnionFind<>(sequences); + + for (TextPositionSequence sequence : sequences) { + for (TextPositionSequence sequence2 : sequences) { + + if (sequence.equals(sequence2) || unionFind.inSameSet(sequence, sequence2)) { + continue; + } + + double angle = computeAngle(sequence.getBBoxDirAdj(), sequence2.getBBoxDirAdj()); + + double normalizedVerticalDistance = Math.abs(sequence.getBBoxDirAdj().getCenterY() - sequence2.getBBoxDirAdj().getCenterY()) / maxLineDistance; + double normalizedHorizontalDistance = Math.abs(sequence.getBBoxDirAdj().getCenterX() - sequence2.getBBoxDirAdj().getCenterX()) / 
maxXGap; + + if (sequence.getDir() != sequence2.getDir() + || Math.abs(sequence.getFontSize() - sequence2.getFontSize()) > 0.5 * Math.min(sequence.getFontSize(), + sequence2.getFontSize()) + || Math.pow(normalizedVerticalDistance, 2) + Math.pow(normalizedHorizontalDistance, 2) > 1 + || !ANGLE_FILTER.matches(angle)) { + continue; + } + + unionFind.union(sequence, sequence2); + } + } + + return unionFind.getGroups(); + } + + + public double computeAngle(Rectangle2D rect1, Rectangle2D rect2) { + + double rect1CentroidX = rect1.getCenterX(); + double rect1CentroidY = rect1.getCenterY(); + double rect2CentroidX = rect2.getCenterX(); + double rect2CentroidY = rect2.getCenterY(); + + double deltaX = rect2CentroidX - rect1CentroidX; + double deltaY = rect2CentroidY - rect1CentroidY; + + return FastAtan2.fastAtan2(deltaY, deltaX); + } + + + public List merge(List textBlocks) { + + return textBlocks.stream() + .map(TextPageBlock::getSequences) + .flatMap(Collection::stream) + .collect(Collectors.toList()); } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionSequenceComparator.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionSequenceComparator.java deleted file mode 100644 index 1c46ef5..0000000 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionSequenceComparator.java +++ /dev/null @@ -1,99 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.knecon.fforesight.service.layoutparser.processor.utils; - -import java.util.Comparator; - -import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; - -/** - * This class is a comparator for TextPosition operators. It handles - * pages with text in different directions by grouping the text based - * on direction and sorting in that direction. This allows continuous text - * in a given direction to be more easily grouped together. - * - * @author Ben Litchfield - */ -public class TextPositionSequenceComparator implements Comparator { - - @Override - public int compare(TextPositionSequence pos1, TextPositionSequence pos2) { - // only compare text that is in the same direction - int cmp1 = Float.compare(pos1.getDir().getDegrees(), pos2.getDir().getDegrees()); - if (cmp1 != 0) { - return cmp1; - } - - // get the text direction adjusted coordinates - double x1 = pos1.getBBox().getX(); - double x2 = pos2.getBBox().getX(); - - double pos1YBottom = pos1.getBBox().getMaxY(); - double pos2YBottom = pos2.getBBox().getMaxY(); - - // note that the coordinates have been adjusted so 0,0 is in upper left - double pos1YTop = pos1YBottom - pos1.getBBox().getHeight(); - double pos2YTop = pos2YBottom - pos2.getBBox().getHeight(); - - double yDifference = Math.abs(pos1YBottom - pos2YBottom); - - - // Adjust for text rotation - switch (pos1.getRotation()) { - case 0: - // 0 degrees (horizontal, top to bottom and left to right): Sort primarily by y-coordinates from top to bottom (pos1YBottom < pos2YBottom). 
- if (yDifference < .1 || (pos2YBottom >= pos1YTop && pos2YBottom <= pos1YBottom) || (pos1YBottom >= pos2YTop && pos1YBottom <= pos2YBottom)) { - return Double.compare(x1, x2); - } else if (pos1YBottom < pos2YBottom) { - return -1; - } else { - return 1; - } - case 90: - // 90 degrees (vertical, right to left): Sort by x-coordinates first (x1 > x2), then by y-coordinates from top to bottom (pos1YBottom < pos2YBottom). - if (x1 > x2) { - return -1; - } else if (x1 < x2) { - return 1; - } else { - return Double.compare(pos1YBottom, pos2YBottom); - } - case 180: - // 180 degrees (horizontal, bottom to top and right to left): Sort primarily by y-coordinates from bottom to top (pos1YBottom > pos2YBottom). - if (yDifference < .1 || (pos2YBottom >= pos1YTop && pos2YBottom <= pos1YBottom) || (pos1YBottom >= pos2YTop && pos1YBottom <= pos2YBottom)) { - return Double.compare(x2, x1); - - } else if (pos1YBottom > pos2YBottom) { - return -1; - } else { - return 1; - } - case 270: - // 270 degrees (vertical, left to right): Sort by x-coordinates in reverse (x2 > x1), then by y-coordinates from bottom to top (pos2YBottom > pos1YBottom). - if (x2 > x1) { - return -1; - } else if (x2 < x1) { - return 1; - } else { - return Double.compare(pos2YBottom, pos1YBottom); - } - default: - throw new RuntimeException("Rotation not supported. 
Only 0/90/180/270 degree rotation is supported."); - } - } - -} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutDebugLayer.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutDebugLayer.java index 74b724a..9cbe489 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutDebugLayer.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutDebugLayer.java @@ -51,7 +51,7 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig { VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(pageNumber, this.words); visualizationsOnPage.getColoredRectangles() .addAll(textPositionSequences.stream() - .map(BoundingBox::getBBoxInitialUserSpace) + .map(BoundingBox::getBBoxPdf) .map(rect -> new ColoredRectangle(rect, WORDS_COLOR, 1)) .toList()); } @@ -105,7 +105,7 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig { VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(pageNumber, this.cells); visualizationsOnPage.getColoredRectangles() .addAll(cells.stream() - .map(cell -> new ColoredRectangle(cell.getBBoxInitialUserSpace(), CELLS_COLOR, 1)) + .map(cell -> new ColoredRectangle(cell.getBBoxPdf(), CELLS_COLOR, 1)) .toList()); } @@ -119,7 +119,7 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig { VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(page, this.zones); visualizationsOnPage.getColoredRectangles() .addAll(zones.stream() - .map(BoundingBox::getBBoxInitialUserSpace) + .map(BoundingBox::getBBoxPdf) .map(zone -> new ColoredRectangle(zone, ZONES_COLOR, 1)) .toList()); @@ -144,7 +144,7 @@ public class 
LayoutDebugLayer extends LayoutDebugLayerConfig { VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(page, this.lines); visualizationsOnPage.getColoredRectangles() .addAll(lines.stream() - .map(BoundingBox::getBBoxInitialUserSpace) + .map(BoundingBox::getBBoxPdf) .map(line -> new ColoredRectangle(line, LINES_COLOR, 0.5f)) .toList()); } @@ -158,7 +158,7 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig { VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(page, zones); visualizationsOnPage.getColoredRectangles() .addAll(textPageBlocks.stream() - .map(rect -> new ColoredRectangle(rect.getBBoxInitialUserSpace(), ZONES_COLOR, 1)) + .map(rect -> new ColoredRectangle(rect.getBBoxPdf(), ZONES_COLOR, 1)) .toList()); } @@ -222,11 +222,11 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig { .flatMap(Collection::stream) .forEach(character -> { Color color = ROTATING_CHARACTER_COLOR.get(index.getAndIncrement() % ROTATING_CHARACTER_COLOR.size()); - Rectangle2D charBBox = character.getTextPosition().getBBoxInitialUserSpace(); + Rectangle2D charBBox = character.getTextPosition().getBBoxPdf(); characterVisualizations.getColoredRectangles().add(new ColoredRectangle(charBBox, color, 1)); character.getNeighbors() .forEach(neighbor -> { - Rectangle2D neighborBBox = neighbor.getCharacter().getTextPosition().getBBoxInitialUserSpace(); + Rectangle2D neighborBBox = neighbor.getCharacter().getTextPosition().getBBoxPdf(); Line2D line = new Line2D.Double(new Point2D.Double(charBBox.getCenterX(), charBBox.getCenterY()), new Point2D.Double(neighborBBox.getCenterX(), neighborBBox.getCenterY())); neighbourVisualizations.getColoredLines().add(new ColoredLine(line, color, 1)); diff --git a/layoutparser-service/layoutparser-service-server/build.gradle.kts b/layoutparser-service/layoutparser-service-server/build.gradle.kts index e25d176..590c419 100644 --- a/layoutparser-service/layoutparser-service-server/build.gradle.kts 
+++ b/layoutparser-service/layoutparser-service-server/build.gradle.kts @@ -38,7 +38,7 @@ dependencies { implementation("com.amazonaws:aws-java-sdk-s3:1.12.536") implementation("org.springframework.cloud:spring-cloud-starter-openfeign:4.0.4") implementation("net.logstash.logback:logstash-logback-encoder:7.4") - implementation("com.pdftron:PDFNet:10.7.0") + implementation("com.pdftron:PDFNet:10.11.0") // for integration testing only testImplementation(project(":viewer-doc-processor")) @@ -52,6 +52,8 @@ dependencies { testImplementation("com.fasterxml.jackson.datatype:jackson-datatype-jsr310:${jacksonVersion}") testImplementation("org.apache.pdfbox:pdfbox:${pdfBoxVersion}") testImplementation("org.apache.pdfbox:pdfbox-tools:${pdfBoxVersion}") + testImplementation("org.apache.commons:commons-text:1.12.0") + } /* @@ -89,6 +91,9 @@ tasks.named("bootBuildImage") { environment.put("BPE_DELIM_JAVA_TOOL_OPTIONS", " ") environment.put("BPE_APPEND_JAVA_TOOL_OPTIONS", "-Dfile.encoding=UTF-8") + builder.set("docker-proxy.knecon.com/paketobuildpacks/builder:base") + runImage.set("docker-proxy.knecon.com/paketobuildpacks/run:base-cnb") + imageName.set("nexus.knecon.com:5001/ff/${project.name}:${project.version}") if (project.hasProperty("buildbootDockerHostNetwork")) { network.set("host") @@ -99,6 +104,13 @@ tasks.named("bootBuildImage") { } verboseLogging.set(true) + builderRegistry { + username.set(providers.gradleProperty("mavenUser").getOrNull()) + password.set(providers.gradleProperty("mavenPassword").getOrNull()) + email.set(providers.gradleProperty("mavenEmail").getOrNull()) + url.set("https://docker-proxy.knecon.com:5001/") + } + publishRegistry { username.set(providers.gradleProperty("mavenUser").getOrNull()) password.set(providers.gradleProperty("mavenPassword").getOrNull()) @@ -106,4 +118,5 @@ tasks.named("bootBuildImage") { url.set("https://nexus.knecon.com:5001/") } } + } diff --git 
a/layoutparser-service/layoutparser-service-server/src/main/java/com/knecon/fforesight/service/layoutparser/server/PDFNetInitializer.java b/layoutparser-service/layoutparser-service-server/src/main/java/com/knecon/fforesight/service/layoutparser/server/PDFNetInitializer.java index 0fbf9d7..5671c9b 100644 --- a/layoutparser-service/layoutparser-service-server/src/main/java/com/knecon/fforesight/service/layoutparser/server/PDFNetInitializer.java +++ b/layoutparser-service/layoutparser-service-server/src/main/java/com/knecon/fforesight/service/layoutparser/server/PDFNetInitializer.java @@ -1,20 +1,18 @@ package com.knecon.fforesight.service.layoutparser.server; import org.springframework.beans.factory.annotation.Value; -import org.springframework.stereotype.Component; +import org.springframework.context.annotation.Bean; +import org.springframework.context.annotation.Configuration; import com.google.common.base.Strings; -import com.knecon.fforesight.service.layoutparser.processor.LayoutparserSettings; import com.pdftron.pdf.PDFNet; -import jakarta.annotation.PostConstruct; -import jakarta.annotation.PreDestroy; import lombok.RequiredArgsConstructor; import lombok.SneakyThrows; import lombok.extern.slf4j.Slf4j; @Slf4j -@Component +@Configuration @RequiredArgsConstructor public class PDFNetInitializer { @@ -22,26 +20,17 @@ public class PDFNetInitializer { private String pdftronLicense; + @Bean @SneakyThrows - @PostConstruct - // Do not change back to application runner, if it is application runner it takes messages from the queue before PDFNet is initialized, that leads to UnsatisfiedLinkError. 
public void init() { if (Strings.isNullOrEmpty(pdftronLicense)) { - return; + throw new IllegalArgumentException("PDFTRON_LICENSE not set!"); } log.info("Initializing Native Libraries"); log.info("Setting pdftron license: {}", pdftronLicense); PDFNet.setTempPath("/tmp/pdftron"); PDFNet.initialize(pdftronLicense); - - } - - - @PreDestroy - public void terminate() { - - PDFNet.terminate(); } } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java index 6e003a8..ce1a37a 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java @@ -27,23 +27,28 @@ import lombok.extern.slf4j.Slf4j; @Slf4j public class LayoutparserEnd2EndTest extends AbstractTest { + public static final LayoutParsingType LAYOUT_PARSING_TYPE = LayoutParsingType.DOCUMINE; + @Autowired private LayoutParsingPipeline layoutParsingPipeline; + @Test + @Disabled public void testLayoutParserEndToEnd() { - String filePath = "files/test-1.pdf"; + String filePath = "/home/kschuettler/Downloads/55974b3de7ed2915718a10458206bbd8.ORIGIN.pdf"; runForFile(filePath); } + @Test @Disabled @SneakyThrows public void testLayoutParserEndToEndWithFolder() { - String folder = "/Users/maverickstuder/Documents/Fforesight/layoutparser/layoutparser-service/layoutparser-service-server/src/test"; + String folder = "/home/kschuettler/Dokumente/TestFiles/ReadingOrder"; List pdfFiles = Files.walk(Path.of(folder)) .filter(path -> path.getFileName().toString().endsWith(".pdf")) .sorted(Comparator.comparing(Path::getFileName)) @@ -69,7 +74,8 @@ public class 
LayoutparserEnd2EndTest extends AbstractTest { file = new File(filePath); } - LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(fileName, LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH, true); + LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(fileName, LAYOUT_PARSING_TYPE, true); + prepareStorage(layoutParsingRequest, file); LayoutParsingFinishedEvent finishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest); diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/OutlineDetectionTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/OutlineDetectionTest.java index 0f486eb..818a99a 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/OutlineDetectionTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/OutlineDetectionTest.java @@ -57,9 +57,11 @@ public class OutlineDetectionTest extends AbstractTest { pdfNetInitializer.init(); } + @Test @SneakyThrows - public void testOutlineError(){ + public void testOutlineError() { + String fileName = "files/syngenta/CustomerFiles/Clarifynd/VV-470942.pdf"; ClassificationDocument classificationDocument = parseLayout(fileName, LayoutParsingType.CLARIFYND); diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentReadingOrderTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentReadingOrderTest.java new file mode 100644 index 0000000..5ff7900 --- /dev/null +++ 
b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentReadingOrderTest.java @@ -0,0 +1,452 @@ +package com.knecon.fforesight.service.layoutparser.server.graph; + +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.awt.Color; +import java.awt.geom.Rectangle2D; +import java.io.FileOutputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.List; + +import org.apache.commons.text.similarity.LevenshteinDistance; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.springframework.beans.factory.annotation.Autowired; + +import com.iqser.red.storage.commons.service.StorageService; +import com.iqser.red.storage.commons.utils.FileSystemBackedStorageService; +import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; +import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument; +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory; +import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionOperations; +import com.knecon.fforesight.service.layoutparser.server.PDFNetInitializer; +import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest; +import 
com.knecon.fforesight.service.viewerdoc.model.Standard14EmbeddableFont; +import com.pdftron.common.Matrix2D; +import com.pdftron.common.PDFNetException; +import com.pdftron.pdf.ColorPt; +import com.pdftron.pdf.ColorSpace; +import com.pdftron.pdf.Element; +import com.pdftron.pdf.ElementBuilder; +import com.pdftron.pdf.ElementWriter; +import com.pdftron.pdf.Font; +import com.pdftron.pdf.GState; +import com.pdftron.pdf.PDFDoc; +import com.pdftron.pdf.Page; +import com.pdftron.sdf.SDFDoc; + +import lombok.SneakyThrows; +import lombok.extern.slf4j.Slf4j; + +@Slf4j +public class DocumentReadingOrderTest extends BuildDocumentTest { + + private static final boolean DRAW_DIR_ADJ_COORDS = false; + public static final List LAYOUT_PARSING_TYPES = List.of(LayoutParsingType.DOCUMINE, + LayoutParsingType.DOCUMINE_OLD, + LayoutParsingType.REDACT_MANAGER, + LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH); + + @Autowired + PDFNetInitializer pdfNetInitializer; + + @Autowired + StorageService storageService; + + + @BeforeEach + public void before() { + + pdfNetInitializer.init(); + } + + + @AfterEach + public void cleanUp() { + + ((FileSystemBackedStorageService) storageService).clearStorage(); + } + + + @Test + public void readingOrderTestSeite14() { + + String pdfFile = "files/syngenta/CustomerFiles/SinglePages/Seite14.pdf"; + String expectedText = """ + 27 + 26 APPENDICES SECTION + APPENDIX 1 Analytical Report + syngenta + A16148F + Batch ID 533158 (GP-080305) + Batch Identification 533158 + Product Design Code A16148F + Product Denomination SYN524464 FS (500) + Product by Common Name SYN524464 FS (500) + Other Product Code(s) GP-080305 + Source Technology & Projects, Syngenta Crop Protection, Inc. + Chemical Analysis + (Active Ingredient Content) + Identity of the Active Ingredient* Confirmed + Content of SYN524464* 45.6% (wt/wt) or 534 g/L + Methodology Used for Characterization HPLC + The Active Ingredient content is within the FAO limits. 
+ Physical Analysis + Appearance* pink opaque liquid + Density* 1171 g/L + Stability: + Storage Temperature <30°℃ + Expiration date March 2009 + The stability of this test substance will be determined concurrently through reanalysis of material held + in inventory under GLP conditions at Syngenta Crop Protection, Inc., Greensboro, NC + This Certificate of Analysis is summarizing data (marked with an asterisk) from a study that has been + performed in compliance with Good Laboratory Practices per 40 CFR Part 160 Raw data, + documentation, protocols, any amendments to study protocols and reports pertaining to this study are + maintained in the Syngenta Crop Protection Archives in Greensboro, NC. + Authorization' + 26 Mar 2008 + Dorothea Jeffery Date + Group Leader I + Analytical & Product Chemistry Department + Document 10350420.doc Certificate of Analysis + Page 1 of 1 Study T000973-08 + Report Number: 11813-08 Page 14 of 14 + """; + + assertSimilarReadingOrder(expectedText, pdfFile); + } + + + @Test + public void readingOrderTestTiltedText() { + + String pdfFile = "files/syngenta/CustomerFiles/SinglePages/tiltedText.pdf"; + String expectedText = """ + However there was no consistency in the areas affected either between sexes or at different + ages, in general other measurements for the same structures at other levels showed no + differences, all were within the historical control range of mean values and none of these + differences is considered to be related to treatment (Appendix K). + 7. DISCUSSION + The purpose of this study, which was to determine the potential for developmental + neurotoxicity in the assessment and evaluation of the toxic characteristics of lambda- + cyhalothrin in rats, was successfully accomplished. + There was evidence of toxicity characterised by lower bodyweights and food consumption in + dams receiving 60 or 150 ppm lambda-cyhalothrin during gestation and also post partum in + the 150 ppm group only. 
+ There were no treatment-related effects of administration of lambda-cyhalothrin on + reproductive parameters: there were no effects on gestation length, mean litter size or on pup + bodyweight at birth. + There was evidence of toxicity in F1 animals receiving 150 ppm. This was seen as slightly + higher pup mortality up to day 5 and lower bodyweights from day 5, reaching a maximum of + approximately 8-9% below control on day 22. + There was a small difference in the age at which male rats in the 150 ppm group reached + preputial separation, but this was too small to be of toxicological significance. + No effects were seen on motor activity or response to auditory startle. + There was no clear evidence of any effects in the learning and memory assessment in + weanling (age 21-24 days) or young adult animals (age 59-62 days). However, at day 21 + swimming speeds of females receiving 150 ppm were slightly slower than controls. The + difference is considered to reflect a difference in swimming performance rather than an effect + on learning or memory. + No neuropathological effect of treatment with lambda-cyhalothrin was detected from a + detailed microscopic examination of the selected F1 animals post mortem on day 12 or 63. + LAMBDA-CYHALOTHRIN: DEVELOPMENTAL NEUROTOXICITY STUDY IN RATS + CTL/RR0969/REGULATORY/REPORT - 34 + """; + + assertSimilarReadingOrder(expectedText, pdfFile); + } + + + @Test + public void readingOrderTest402Study() { + + String pdfFile = "files/SinglePages/402StudyPage5.pdf"; + String expectedText = """ + 2.0 INTRODUCTION + 2.1 Purpose + Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor + invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et + accusam et justo duo dolores et ea rebum. + Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. 
Lorem + ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt + ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et + justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est + Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed + diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam + voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd + gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. + 2.2 Guidelines + Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor + invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. + At vero eos et accusam et justo duo dolores et ea rebum. + Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem + ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt + ut labore et dolore magna aliquyam erat, sed diam voluptua. + At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no + sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, + consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore + magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et + ea rebum. + Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. + Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie consequat, vel + illum dolore eu feugiat nulla facilisis at vero eros et accumsan et iusto odio dignissim qui + blandit praesent luptatum zzril delenit augue duis dolore te feugait nulla facilisi. Lorem + ipsum dolor sit amet. 
+ 2.3 Test Facility + Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor + invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et + accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata + sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur + sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna + aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea + rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. + Report Number: 20/080-002P 5 + """; + + assertSimilarReadingOrder(expectedText, pdfFile); + } + + + @Test + public void readingOrderTest402StudyRotated() { + + String pdfFile = "files/SinglePages/402StudyPage5_rotated.pdf"; + String expectedText = """ + 2.0 INTRODUCTION + 2.1 Purpose + Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor + invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et + accusam et justo duo dolores et ea rebum. + Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem + ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt + ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et + justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est + Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed + diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam + voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd + gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. + 2.2 Guidelines + Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor + invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. 
+ At vero eos et accusam et justo duo dolores et ea rebum. + Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem + ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt + ut labore et dolore magna aliquyam erat, sed diam voluptua. + At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no + sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, + consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore + magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et + ea rebum. + Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. + Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie consequat, vel + illum dolore eu feugiat nulla facilisis at vero eros et accumsan et iusto odio dignissim qui + blandit praesent luptatum zzril delenit augue duis dolore te feugait nulla facilisi. Lorem + ipsum dolor sit amet. + 2.3 Test Facility + Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor + invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et + accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata + sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur + sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna + aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea + rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. 
+ Report Number: 20/080-002P 5 + """; + + assertSimilarReadingOrder(expectedText, pdfFile); + } + + + private void assertSimilarReadingOrder(String expectedText, String pdfFile) { + + List expectedLines = List.of(expectedText.split("\n")); + for (LayoutParsingType layoutParsingType : LAYOUT_PARSING_TYPES) { + + log.info("Evaluating for {}", layoutParsingType); + + ClassificationDocument classificationDocument = parseLayout(pdfFile, layoutParsingType); + + if (DRAW_DIR_ADJ_COORDS) { + drawDirAdjCoords(pdfFile, classificationDocument, layoutParsingType); + } + + Document document = DocumentGraphFactory.buildDocumentGraph(layoutParsingType, classificationDocument); + List readLines = getTextAsLines(document); + readLines.forEach(log::info); + + int correctCount = 0; + int maxLineOffset = 0; + for (int i = 0; i < expectedLines.size(); i++) { + String expectedLine = expectedLines.get(i); + int mostSimilarLine = 0; + double maxSimilarity = 0; + for (int j = 0; j < readLines.size(); j++) { + String readLine = readLines.get(j); + double similarity = similarity(expectedLine, readLine); + if (similarity > maxSimilarity) { + maxSimilarity = similarity; + mostSimilarLine = j; + } + } + if (readLines.get(mostSimilarLine).trim().equals(expectedLine.trim())) { + correctCount++; + int lineOffset = Math.abs(mostSimilarLine - i); + if (lineOffset > 0) { + log.info("Line {} offset by {}", readLines.get(mostSimilarLine), lineOffset); + } + if (lineOffset > maxLineOffset) { + maxLineOffset = lineOffset; + } + } else { + log.error("Lines {}-{} do not match: \n Expected: {}\n Actual: {}", i, mostSimilarLine, expectedLine, readLines.get(mostSimilarLine)); + } + } + double correctLinesFactor = (double) correctCount / (double) readLines.size(); + double averageLineOffset = (double) maxLineOffset / (double) readLines.size(); + + log.info("Difference in number of lines: {}", Math.abs(expectedLines.size() - readLines.size())); + log.info("Correct lines factor: {}", correctLinesFactor); + 
log.info("Max order offset: {}, avg: {}", maxLineOffset, averageLineOffset); + // In the rotated document one line is read as two + + assertTrue(Math.abs(expectedLines.size() - readLines.size()) <= 1); + // Most of the errors come from the similarity metric finding different lines in 402 study, as the lines are too similar, or a miss classification of Footers + assertTrue(averageLineOffset < 1); + assertTrue(correctLinesFactor > 0.9); + } + } + + + public List getTextAsLines(Document document) { + + return document.getTextBlock().getAtomicTextBlocks() + .stream() + .filter(atb -> !atb.isEmpty()) + .map(DocumentReadingOrderTest::getLines) + .flatMap(List::stream) + .toList(); + } + + + private static List getLines(AtomicTextBlock atomicTextBlock) { + + int numberOfLines = atomicTextBlock.numberOfLines(); + List lines = new ArrayList<>(numberOfLines); + for (int line = 0; line < numberOfLines; line++) { + lines.add(atomicTextBlock.getLine(line).toString()); + } + return lines; + } + + + private static double similarity(String s1, String s2) { + + LevenshteinDistance levenshteinDistance = new LevenshteinDistance(); + + int max = Math.max(s1.length(), s2.length()); + int dist = levenshteinDistance.apply(s1, s2); + return 1 - (double) dist / (double) max; + } + + + @SneakyThrows + private void drawDirAdjCoords(String filename, ClassificationDocument classificationDocument, LayoutParsingType layoutParsingType) { + + try (PDFDoc pdfDoc = new PDFDoc(); ElementWriter writer = new ElementWriter(); ElementBuilder builder = new ElementBuilder()) { + + Standard14EmbeddableFont font = Standard14EmbeddableFont.helvetica(); + Font helvetica = Font.create(pdfDoc, Font.e_helvetica); + for (ClassificationPage classificationDocumentPage : classificationDocument.getPages()) { + int count = 0; + Page page = pdfDoc.pageCreate(); + writer.begin(page); + for (AbstractPageBlock abstractBlock : classificationDocumentPage.getTextBlocks()) { + + if (abstractBlock instanceof TextPageBlock 
textBlock) { + for (TextPositionSequence sequence : TextPositionOperations.mergeAndSort(List.of(textBlock))) { + + float stringWidth; + try { + stringWidth = font.getStringWidth(sequence.toString()); + } catch (Exception e) { + stringWidth = font.getFont().getAverageFontWidth() * sequence.toString().length(); + } + double fontSize = (sequence.getBBoxDirAdj().getWidth() / stringWidth) * 1000; + try (Matrix2D textMatrix = new Matrix2D(1, + 0, + 0, + 1, + sequence.getXDirAdj(), + page.getCropBox().getHeight() - sequence.getYDirAdj() - sequence.getHeightDirAdj())) { + writeText(sequence.toString(), textMatrix, builder, helvetica, fontSize, writer, Color.BLACK); + writeText(String.valueOf(count), textMatrix.translate(-(2 + (5 * String.valueOf(count).length())), 0), builder, helvetica, 8, writer, Color.RED); + count++; + } + + writeBBox(sequence.getBBoxDirAdj(), builder, page, writer, Color.BLACK); + } + writeBBox(textBlock.getBBoxDirAdj(), builder, page, writer, Color.BLUE); + } + } + writer.end(); + pdfDoc.pagePushBack(page); + + } + + Path stem = Path.of("/tmp/READING_ORDER_TEST/"); + Files.createDirectories(stem); + try (var out = new FileOutputStream(stem.resolve(layoutParsingType.name() + "_" + Path.of(filename).getFileName()).toFile() + "_dirAdjCoordinates.pdf")) { + pdfDoc.save(out, SDFDoc.SaveMode.LINEARIZED, null); + } + } + } + + + private static void writeBBox(Rectangle2D r, ElementBuilder builder, Page page, ElementWriter writer, Color color) throws PDFNetException { + + Element rect = builder.createRect(r.getX(), page.getCropBox().getHeight() - r.getY(), r.getWidth(), -r.getHeight()); + float[] comp = color.getColorComponents(null); + rect.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB()); + try (ColorPt colorpt = new ColorPt(comp[0], comp[1], comp[2])) { + rect.getGState().setStrokeColor(colorpt); + } + rect.setPathStroke(true); + writer.writeElement(rect); + } + + + private static void writeText(String string, + Matrix2D matrix2D, + 
ElementBuilder builder, + Font helvetica, + double fontSize, + ElementWriter writer, + Color color) throws PDFNetException { + + Element text = builder.createTextBegin(helvetica, fontSize); + text.getGState().setFillColorSpace(ColorSpace.createDeviceRGB()); + float[] colorComponents = color.getColorComponents(null); + try (ColorPt colorpt = new ColorPt(colorComponents[0], colorComponents[1], colorComponents[2])) { + text.getGState().setFillColor(colorpt); + } + text.setTextMatrix(matrix2D); + text.getGState().setTextRenderMode(GState.e_fill_text); + writer.writeElement(text); + + text = builder.createTextRun(string); + writer.writeElement(text); + text = builder.createTextEnd(); + writer.writeElement(text); + + } + +} diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/model/TextPositionSequenceTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/model/TextPositionSequenceTest.java deleted file mode 100644 index 0922e4b..0000000 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/model/TextPositionSequenceTest.java +++ /dev/null @@ -1,60 +0,0 @@ -package com.knecon.fforesight.service.layoutparser.server.model; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.ByteArrayInputStream; -import java.nio.charset.StandardCharsets; - -import org.apache.pdfbox.util.Matrix; -import org.junit.jupiter.api.Test; - -import com.fasterxml.jackson.databind.ObjectMapper; -import com.iqser.red.storage.commons.properties.StorageProperties; -import com.iqser.red.storage.commons.service.ObjectSerializer; -import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection; -import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; - -import lombok.SneakyThrows; - -public class 
TextPositionSequenceTest { - - private static final String TEXT_POSITION_SEQUENCE_AS_JSON = "{\n" // - + " \"page\": 1,\n" // - + " \"textPositions\": [],\n" // - + " \"dir\": 180.0,\n" // - + " \"rotation\": 0,\n" // - + " \"pageHeight\": 800,\n" // - + " \"pageWidth\": 600\n" // - + "}"; - - private final ObjectSerializer objectSerializer = new ObjectSerializer(new ObjectMapper()); - - - @Test - @SneakyThrows - public void testDeserializationWithJackson() { - - TextPositionSequence textPositionSequence = objectSerializer.deserialize(new ByteArrayInputStream(TEXT_POSITION_SEQUENCE_AS_JSON.getBytes(StandardCharsets.UTF_8)), - TextPositionSequence.class); - - assertPropertiesAfterJsonDeserialization(textPositionSequence); - } - - - private void assertPropertiesAfterJsonDeserialization(TextPositionSequence textPositionSequence) { - - assertThat(textPositionSequence.getPage()).isEqualTo(1); - assertThat(textPositionSequence.getTextPositions()).hasSize(0); - assertThat(textPositionSequence.getDir()).isEqualTo(TextDirection.HALF_CIRCLE); - assertThat(textPositionSequence.getRotation()).isEqualTo(0); - assertThat(textPositionSequence.getPageHeight()).isEqualTo(800f); - assertThat(textPositionSequence.getPageWidth()).isEqualTo(600f); - } - - - private Matrix createIdentityMatrix() { - - return new Matrix(); - } - -} \ No newline at end of file diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/PageContentExtractorTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/PageContentExtractorTest.java index ec7d002..dbb838a 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/PageContentExtractorTest.java +++ 
b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/PageContentExtractorTest.java @@ -3,12 +3,10 @@ package com.knecon.fforesight.service.layoutparser.server.services; import java.nio.file.Path; import java.util.List; -import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; import com.knecon.fforesight.service.layoutparser.processor.model.PageContents; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; -import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor; import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw; @@ -29,7 +27,7 @@ class PageContentExtractorTest { textPositionPerPage.stream() .map(t -> t.getSortedTextPositionSequences() .stream() - .map(TextPositionSequence::getBBoxInitialUserSpace) + .map(TextPositionSequence::getBBoxPdf) .map(List::of) .toList()) .toList(), tmpFileName); diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/BuildDocumentTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/BuildDocumentTest.java index 6857f68..92ffe11 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/BuildDocumentTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/BuildDocumentTest.java @@ -1,6 +1,8 @@ package com.knecon.fforesight.service.layoutparser.server.utils; +import java.awt.geom.Rectangle2D; import java.io.File; +import java.io.FileOutputStream; import java.nio.file.Path; import java.util.Map; @@ -10,11 +12,27 @@ import 
org.springframework.core.io.ClassPathResource; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline; +import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument; +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse; import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory; +import com.knecon.fforesight.service.viewerdoc.model.Standard14EmbeddableFont; +import com.pdftron.common.Matrix2D; +import com.pdftron.pdf.ColorPt; +import com.pdftron.pdf.ColorSpace; +import com.pdftron.pdf.Element; +import com.pdftron.pdf.ElementBuilder; +import com.pdftron.pdf.ElementWriter; +import com.pdftron.pdf.Font; +import com.pdftron.pdf.GState; +import com.pdftron.pdf.PDFDoc; +import com.pdftron.pdf.Page; +import com.pdftron.sdf.SDFDoc; import lombok.SneakyThrows; @@ -48,14 +66,14 @@ public abstract class BuildDocumentTest extends AbstractTest { @SneakyThrows protected Document buildGraph(String filename, LayoutParsingType layoutParsingType) { - if (!filename.startsWith("files") && filename.startsWith("/")) { - LayoutParsingRequest layoutParsingRequest = 
buildDefaultLayoutParsingRequest(Path.of(filename).getFileName().toString(), LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH, true); + LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(Path.of(filename).getFileName().toString(), layoutParsingType, true); prepareStorage(layoutParsingRequest, new File(filename)); return DocumentGraphFactory.buildDocumentGraph(layoutParsingType, layoutParsingPipeline.parseLayout(layoutParsingType, new File(filename), - layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId().get()), + layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId() + .get()), new TableServiceResponse(), new VisualLayoutParsingResponse(), layoutParsingRequest.identifier())); @@ -65,10 +83,12 @@ public abstract class BuildDocumentTest extends AbstractTest { } else { prepareStorage(filename); } - return DocumentGraphFactory.buildDocumentGraph(layoutParsingType, parseLayout(filename, layoutParsingType)); + var classificationDocument = parseLayout(filename, layoutParsingType); + return DocumentGraphFactory.buildDocumentGraph(layoutParsingType, classificationDocument); } } + } diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/files/SinglePages/402StudyPage5.pdf b/layoutparser-service/layoutparser-service-server/src/test/resources/files/SinglePages/402StudyPage5.pdf new file mode 100644 index 0000000..7e3d7a8 --- /dev/null +++ b/layoutparser-service/layoutparser-service-server/src/test/resources/files/SinglePages/402StudyPage5.pdf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23511d9cb1ae4caa55945b0973651bb6d8b77dc7340cc4e9208df357f00c2d93 +size 587600 diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/files/SinglePages/402StudyPage5_rotated.pdf b/layoutparser-service/layoutparser-service-server/src/test/resources/files/SinglePages/402StudyPage5_rotated.pdf new file mode 100644 index 
0000000..a30e21d --- /dev/null +++ b/layoutparser-service/layoutparser-service-server/src/test/resources/files/SinglePages/402StudyPage5_rotated.pdf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b6f582f1b0d5cc7022b7d754c0f05ab8fb4d8259fafdb6e58922522c906c233 +size 587386 diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/files/SinglePages/DocumineIdentifierProblem.pdf b/layoutparser-service/layoutparser-service-server/src/test/resources/files/SinglePages/DocumineIdentifierProblem.pdf index a13ba29..319d941 100644 Binary files a/layoutparser-service/layoutparser-service-server/src/test/resources/files/SinglePages/DocumineIdentifierProblem.pdf and b/layoutparser-service/layoutparser-service-server/src/test/resources/files/SinglePages/DocumineIdentifierProblem.pdf differ diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/files/brokenTableOnOcr_ocred 1.pdf b/layoutparser-service/layoutparser-service-server/src/test/resources/files/brokenTableOnOcr_ocred 1.pdf index 1a00988..4e163ae 100644 Binary files a/layoutparser-service/layoutparser-service-server/src/test/resources/files/brokenTableOnOcr_ocred 1.pdf and b/layoutparser-service/layoutparser-service-server/src/test/resources/files/brokenTableOnOcr_ocred 1.pdf differ diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/files/headerFooterTest3Pages.pdf b/layoutparser-service/layoutparser-service-server/src/test/resources/files/headerFooterTest3Pages.pdf index f80c5b7..d386a12 100644 Binary files a/layoutparser-service/layoutparser-service-server/src/test/resources/files/headerFooterTest3Pages.pdf and b/layoutparser-service/layoutparser-service-server/src/test/resources/files/headerFooterTest3Pages.pdf differ diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/files/knecon_extracted_msg_WG_ BAP-BFB, Windtests.pdf 
b/layoutparser-service/layoutparser-service-server/src/test/resources/files/knecon_extracted_msg_WG_ BAP-BFB, Windtests.pdf index e42b4cd..aea0dfe 100644 Binary files a/layoutparser-service/layoutparser-service-server/src/test/resources/files/knecon_extracted_msg_WG_ BAP-BFB, Windtests.pdf and b/layoutparser-service/layoutparser-service-server/src/test/resources/files/knecon_extracted_msg_WG_ BAP-BFB, Windtests.pdf differ diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/files/syngenta b/layoutparser-service/layoutparser-service-server/src/test/resources/files/syngenta index c6fd9e8..0da08b1 160000 --- a/layoutparser-service/layoutparser-service-server/src/test/resources/files/syngenta +++ b/layoutparser-service/layoutparser-service-server/src/test/resources/files/syngenta @@ -1 +1 @@ -Subproject commit c6fd9e849f3efd7d1507401f63629b91dec9f4ec +Subproject commit 0da08b1d9d1bc815a3fb51aa9638eafea2cf02d5 diff --git a/layoutparser-service/viewer-doc-processor/build.gradle b/layoutparser-service/viewer-doc-processor/build.gradle index c97d1a2..199c1cb 100644 --- a/layoutparser-service/viewer-doc-processor/build.gradle +++ b/layoutparser-service/viewer-doc-processor/build.gradle @@ -12,7 +12,7 @@ dependencies { implementation("org.apache.pdfbox:pdfbox-tools:${pdfBoxVersion}") implementation("org.slf4j:slf4j-api:1.7.25") implementation("com.knecon.fforesight:tracing-commons:0.5.0") - implementation("com.pdftron:PDFNet:10.7.0") + implementation("com.pdftron:PDFNet:10.11.0") testImplementation("org.apache.logging.log4j:log4j-slf4j-impl:2.22.1") testImplementation("org.junit.jupiter:junit-jupiter") diff --git a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/model/Standard14EmbeddableFont.java b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/model/Standard14EmbeddableFont.java index a12dbd2..a4b6926 100644 --- 
a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/model/Standard14EmbeddableFont.java +++ b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/model/Standard14EmbeddableFont.java @@ -10,12 +10,14 @@ import org.apache.pdfbox.pdmodel.font.Standard14Fonts; import com.pdftron.pdf.Font; import com.pdftron.pdf.PDFDoc; +import lombok.Getter; import lombok.RequiredArgsConstructor; import lombok.SneakyThrows; @RequiredArgsConstructor public class Standard14EmbeddableFont implements EmbeddableFont { + @Getter private final PDType1Font font; private final int pdfTronIdentifier; diff --git a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/PDFTronViewerDocumentService.java b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/PDFTronViewerDocumentService.java index ee2ded6..1b7fe8c 100644 --- a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/PDFTronViewerDocumentService.java +++ b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/PDFTronViewerDocumentService.java @@ -96,15 +96,18 @@ public class PDFTronViewerDocumentService { boolean isCurrentVersion = ViewerDocVersioningUtility.docIsCurrentVersion(pdfDoc); int pageNumber = 1; - for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); pageNumber++) { + try (PageIterator iterator = pdfDoc.getPageIterator()) { + while (iterator.hasNext()) { - Page page = iterator.next(); + Page page = iterator.next(); - if (isCurrentVersion) { - pageContentCleaner.removeMarkedContent(page); + if (isCurrentVersion) { + pageContentCleaner.removeMarkedContent(page); + } + + visualizationWriter.drawVisualizationsOnPage(pageNumber, page); + pageNumber++; } - - visualizationWriter.drawVisualizationsOnPage(pageNumber, page); } 
ViewerDocVersioningUtility.setVersionInDocument(pdfDoc); diff --git a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/VisualizationWriter.java b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/VisualizationWriter.java index 6e625d5..8c9927d 100644 --- a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/VisualizationWriter.java +++ b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/VisualizationWriter.java @@ -343,12 +343,7 @@ public class VisualizationWriter { @SneakyThrows private static AffineTransform getTextDeRotationTransform(Page page) { - return AffineTransform.getQuadrantRotateInstance(switch (page.getRotation()) { - case 90 -> 3; - case 180 -> 2; - case 270 -> 1; - default -> 0; - }); + return AffineTransform.getQuadrantRotateInstance(page.getRotation()); } } diff --git a/layoutparser-service/viewer-doc-processor/src/test/java/com/knecon/fforesight/service/viewerdoc/service/PageContentCleanerTest.java b/layoutparser-service/viewer-doc-processor/src/test/java/com/knecon/fforesight/service/viewerdoc/service/PageContentCleanerTest.java index c1a92aa..618a8d6 100644 --- a/layoutparser-service/viewer-doc-processor/src/test/java/com/knecon/fforesight/service/viewerdoc/service/PageContentCleanerTest.java +++ b/layoutparser-service/viewer-doc-processor/src/test/java/com/knecon/fforesight/service/viewerdoc/service/PageContentCleanerTest.java @@ -61,11 +61,12 @@ class PageContentCleanerTest { .markedContentToRemove(Set.of(LayerIdentifier.KNECON_OCR_DEBUG.markedContentName())) .build(); - for (PageIterator iterator = doc.getPageIterator(); iterator.hasNext(); ) { + try (PageIterator iterator = doc.getPageIterator()) { + while (iterator.hasNext()) { + Page page = iterator.next(); - Page page = iterator.next(); - - 
pageContentCleaner.removeMarkedContent(page); + pageContentCleaner.removeMarkedContent(page); + } } doc.save(out, SDFDoc.SaveMode.REMOVE_UNUSED, null);