hotfix reading order

This commit is contained in:
Kilian Schüttler 2024-08-09 11:49:12 +02:00
parent b900cfaf31
commit 69bcd4f68d
77 changed files with 1297 additions and 756 deletions

View File

@ -42,6 +42,15 @@ tasks.jacocoTestReport {
} }
allprojects { allprojects {
// Applies the same Javadoc options to every Javadoc task of the project this block is evaluated for.
tasks.withType<Javadoc> {
options {
// Cast so that the add*Option helpers below resolve on the options object.
this as StandardJavadocDocletOptions
// NOTE(review): presumably forwarded to the javadoc tool as -Xdoclint:none (doclint checks off) — confirm against the Gradle docs.
addBooleanOption("Xdoclint:none", true)
// NOTE(review): presumably forwarded as -Xmaxwarns 1 (caps the number of reported warnings) — confirm.
addStringOption("Xmaxwarns", "1")
}
}
publishing { publishing {
publications { publications {
create<MavenPublication>(name) { create<MavenPublication>(name) {
@ -64,6 +73,7 @@ java {
withJavadocJar() withJavadocJar()
} }
repositories { repositories {
mavenLocal() mavenLocal()
mavenCentral() mavenCentral()

View File

@ -19,6 +19,7 @@ public class SimplifiedText {
@Schema(description = "Number of pages in the entire document.") @Schema(description = "Number of pages in the entire document.")
private int numberOfPages; private int numberOfPages;
@Schema(description = "A List of simplified Sections, which contains almost exclusively text.") @Schema(description = "A List of simplified Sections, which contains almost exclusively text.")
@Builder.Default
private List<SimplifiedSectionText> sectionTexts = new ArrayList<>(); private List<SimplifiedSectionText> sectionTexts = new ArrayList<>();
} }

View File

@ -28,4 +28,6 @@ dependencies {
implementation("org.tinspin:tinspin-indexes:2.1.3") implementation("org.tinspin:tinspin-indexes:2.1.3")
implementation("org.commonmark:commonmark:0.22.0") implementation("org.commonmark:commonmark:0.22.0")
implementation("org.commonmark:commonmark-ext-gfm-tables:0.22.0") implementation("org.commonmark:commonmark-ext-gfm-tables:0.22.0")
implementation("com.pdftron:PDFNet:10.11.0")
} }

View File

@ -69,6 +69,7 @@ import com.knecon.fforesight.service.layoutparser.processor.services.parsing.PDF
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService; import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService;
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils; import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation; import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionOperations;
import io.micrometer.observation.Observation; import io.micrometer.observation.Observation;
import io.micrometer.observation.ObservationRegistry; import io.micrometer.observation.ObservationRegistry;
@ -117,14 +118,18 @@ public class LayoutParsingPipeline {
log.info("Starting layout parsing for {}", layoutParsingRequest.identifier()); log.info("Starting layout parsing for {}", layoutParsingRequest.identifier());
File originFile = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId()); File originFile = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId());
File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId()).orElse(originFile); File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId())
.orElse(originFile);
VisualLayoutParsingResponse visualLayoutParsingResponse = layoutParsingRequest.visualLayoutParsingFileId() VisualLayoutParsingResponse visualLayoutParsingResponse = layoutParsingRequest.visualLayoutParsingFileId()
.map(layoutParsingStorageService::getVisualLayoutParsingFile).orElse(new VisualLayoutParsingResponse()); .map(layoutParsingStorageService::getVisualLayoutParsingFile)
.orElse(new VisualLayoutParsingResponse());
ImageServiceResponse imageServiceResponse = layoutParsingRequest.imagesFileStorageId() ImageServiceResponse imageServiceResponse = layoutParsingRequest.imagesFileStorageId()
.map(layoutParsingStorageService::getImagesFile).orElse(new ImageServiceResponse()); .map(layoutParsingStorageService::getImagesFile)
.orElse(new ImageServiceResponse());
TableServiceResponse tableServiceResponse = layoutParsingRequest.tablesFileStorageId() TableServiceResponse tableServiceResponse = layoutParsingRequest.tablesFileStorageId()
.map(layoutParsingStorageService::getTablesFile).orElse(new TableServiceResponse()); .map(layoutParsingStorageService::getTablesFile)
.orElse(new TableServiceResponse());
ClassificationDocument classificationDocument = parseLayout(settings.getLayoutParsingTypeOverride() == null // ClassificationDocument classificationDocument = parseLayout(settings.getLayoutParsingTypeOverride() == null //
? layoutParsingRequest.layoutParsingType() : settings.getLayoutParsingTypeOverride(), ? layoutParsingRequest.layoutParsingType() : settings.getLayoutParsingTypeOverride(),
@ -137,8 +142,7 @@ public class LayoutParsingPipeline {
log.info("Building document graph for {}", layoutParsingRequest.identifier()); log.info("Building document graph for {}", layoutParsingRequest.identifier());
Document documentGraph = observeBuildDocumentGraph(settings.getLayoutParsingTypeOverride() == null // Document documentGraph = observeBuildDocumentGraph(settings.getLayoutParsingTypeOverride() == null //
? layoutParsingRequest.layoutParsingType() : settings.getLayoutParsingTypeOverride(), ? layoutParsingRequest.layoutParsingType() : settings.getLayoutParsingTypeOverride(), classificationDocument);
classificationDocument);
log.info("Creating viewer document for {}", layoutParsingRequest.identifier()); log.info("Creating viewer document for {}", layoutParsingRequest.identifier());
@ -271,11 +275,11 @@ public class LayoutParsingPipeline {
stripper.setStartPage(pageNumber); stripper.setStartPage(pageNumber);
stripper.setEndPage(pageNumber); stripper.setEndPage(pageNumber);
stripper.setPdpage(pdPage); stripper.setPdpage(pdPage);
if (layoutParsingType.equals(LayoutParsingType.DOCUMINE_OLD)) {
stripper.setSortByPosition(true);
}
stripper.getText(originDocument); stripper.getText(originDocument);
List<TextPositionSequence> words = stripper.getTextPositionSequences(); List<TextPositionSequence> words = stripper.getTextPositionSequences();
if (layoutParsingType.equals(LayoutParsingType.DOCUMINE_OLD)) {
words = TextPositionOperations.sort(words);
}
classificationDocument.getLayoutDebugLayer().addTextVisualizations(words, pageNumber); classificationDocument.getLayoutDebugLayer().addTextVisualizations(words, pageNumber);
PDRectangle pdr = pdPage.getMediaBox(); PDRectangle pdr = pdPage.getMediaBox();

View File

@ -1,6 +1,7 @@
package com.knecon.fforesight.service.layoutparser.processor.docstrum; package com.knecon.fforesight.service.layoutparser.processor.docstrum;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.EnumMap;
import java.util.List; import java.util.List;
import java.util.stream.Collectors; import java.util.stream.Collectors;
@ -26,6 +27,7 @@ import lombok.RequiredArgsConstructor;
@RequiredArgsConstructor @RequiredArgsConstructor
public class DocstrumSegmentationService { public class DocstrumSegmentationService {
public static final double SAME_DIRECTION_THRESHOLD = 0.9;
private final NearestNeighbourService nearestNeighbourService; private final NearestNeighbourService nearestNeighbourService;
private final SpacingService spacingService; private final SpacingService spacingService;
private final LineBuilderService lineBuilderService; private final LineBuilderService lineBuilderService;
@ -35,13 +37,44 @@ public class DocstrumSegmentationService {
public List<Zone> segmentPage(List<TextPositionSequence> textPositions, boolean xyOrder, CleanRulings usedRulings, LayoutDebugLayer visualizations) { public List<Zone> segmentPage(List<TextPositionSequence> textPositions, boolean xyOrder, CleanRulings usedRulings, LayoutDebugLayer visualizations) {
List<Zone> zones = new ArrayList<>(); EnumMap<TextDirection, Integer> directionCounts = new EnumMap<>(TextDirection.class);
zones.addAll(computeZones(textPositions, usedRulings, visualizations, TextDirection.ZERO));
zones.addAll(computeZones(textPositions, usedRulings, visualizations, TextDirection.QUARTER_CIRCLE));
zones.addAll(computeZones(textPositions, usedRulings, visualizations, TextDirection.HALF_CIRCLE));
zones.addAll(computeZones(textPositions, usedRulings, visualizations, TextDirection.THREE_QUARTER_CIRCLE));
return readingOrderService.resolve(zones, xyOrder); List<Zone> newZones = computeZones(textPositions, usedRulings, visualizations, TextDirection.ZERO);
directionCounts.put(TextDirection.ZERO, newZones.size());
List<Zone> zones = new ArrayList<>(newZones);
newZones = computeZones(textPositions, usedRulings, visualizations, TextDirection.QUARTER_CIRCLE);
directionCounts.put(TextDirection.QUARTER_CIRCLE, newZones.size());
zones.addAll(newZones);
newZones = computeZones(textPositions, usedRulings, visualizations, TextDirection.HALF_CIRCLE);
directionCounts.put(TextDirection.HALF_CIRCLE, newZones.size());
zones.addAll(newZones);
newZones = computeZones(textPositions, usedRulings, visualizations, TextDirection.THREE_QUARTER_CIRCLE);
directionCounts.put(TextDirection.THREE_QUARTER_CIRCLE, newZones.size());
zones.addAll(newZones);
return readingOrderService.resolve(zones, xyOrder, mostSameDirection(directionCounts));
}
/**
 * Reports whether one single text direction accounts for more than
 * {@link #SAME_DIRECTION_THRESHOLD} of all counted zones.
 *
 * @param directionCounts number of zones per text direction; must contain an entry for
 *                        ZERO, QUARTER_CIRCLE, HALF_CIRCLE and THREE_QUARTER_CIRCLE, because
 *                        each value is unboxed and a missing key would raise a NullPointerException
 * @return {@code true} if the share of any of the four directions exceeds the threshold,
 *         {@code false} otherwise
 */
private boolean mostSameDirection(EnumMap<TextDirection, Integer> directionCounts) {
// Sum over every entry of the map; used as the denominator of each ratio below.
int total = directionCounts.values()
.stream()
.mapToInt(i -> i).sum();
// The cast to double forces floating point division. With total == 0 every ratio is NaN,
// all comparisons are false and the method falls through to "return false".
if ((double) directionCounts.get(TextDirection.ZERO) / total > SAME_DIRECTION_THRESHOLD) {
return true;
} else if ((double) directionCounts.get(TextDirection.QUARTER_CIRCLE) / total > SAME_DIRECTION_THRESHOLD) {
return true;
} else if ((double) directionCounts.get(TextDirection.HALF_CIRCLE) / total > SAME_DIRECTION_THRESHOLD) {
return true;
} else if ((double) directionCounts.get(TextDirection.THREE_QUARTER_CIRCLE) / total > SAME_DIRECTION_THRESHOLD) {
return true;
}
return false;
} }

View File

@ -15,10 +15,16 @@ public class AngleFilter {
public boolean matches(Neighbor neighbor) { public boolean matches(Neighbor neighbor) {
return matches(neighbor.getAngle());
}
public boolean matches(double angle) {
if (lowerAngle <= upperAngle) { if (lowerAngle <= upperAngle) {
return lowerAngle <= neighbor.getAngle() && neighbor.getAngle() < upperAngle; return lowerAngle <= angle && angle < upperAngle;
} else { } else {
return lowerAngle <= neighbor.getAngle() || neighbor.getAngle() < upperAngle; return lowerAngle <= angle || angle < upperAngle;
} }
} }

View File

@ -7,8 +7,12 @@ import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import lombok.Data; import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.experimental.SuperBuilder;
@Data @Data
@SuperBuilder
@NoArgsConstructor
public abstract class BoundingBox { public abstract class BoundingBox {
// Java coordinate system: (0, 0) is always upper left, x is increasing left to right and y is increasing from top to bottom. // Java coordinate system: (0, 0) is always upper left, x is increasing left to right and y is increasing from top to bottom.
@ -19,7 +23,7 @@ public abstract class BoundingBox {
// This rotates completely in 90 degree steps with page rotation. // This rotates completely in 90 degree steps with page rotation.
// Needs to be used when writing to a PDF. // Needs to be used when writing to a PDF.
// Also, these are definitely correct and should be used whenever possible. // Also, these are definitely correct and should be used whenever possible.
protected Rectangle2D bBoxInitialUserSpace; protected Rectangle2D bBoxPdf;
protected static final float VERTICAL_COMPARISON_THRESHOLD = 0.4f; protected static final float VERTICAL_COMPARISON_THRESHOLD = 0.4f;
@ -50,25 +54,25 @@ public abstract class BoundingBox {
public double getPdfMinX() { public double getPdfMinX() {
return bBoxInitialUserSpace.getMinX(); return bBoxPdf.getMinX();
} }
public double getPdfMaxX() { public double getPdfMaxX() {
return bBoxInitialUserSpace.getMaxX(); return bBoxPdf.getMaxX();
} }
public double getPdfMinY() { public double getPdfMinY() {
return bBoxInitialUserSpace.getMinY(); return bBoxPdf.getMinY();
} }
public double getPdfMaxY() { public double getPdfMaxY() {
return bBoxInitialUserSpace.getMaxY(); return bBoxPdf.getMaxY();
} }
@ -129,13 +133,31 @@ public abstract class BoundingBox {
} }
public boolean intersectsY(BoundingBox other) { private boolean intersectsX(BoundingBox other, float threshold) {
return this.getX() - threshold <= other.getMaxX() && this.getMaxX() + threshold >= other.getX();
}
public boolean intersectsPdf(BoundingBox other) {
return this.intersectsXPdf(other) && this.intersectsYPdf(other);
}
public boolean intersectsPdf(BoundingBox other, float yThreshold, float xThreshold) {
return this.intersectsXPdf(other, xThreshold) && this.intersectsYPdf(other, yThreshold);
}
public boolean intersectsYPdf(BoundingBox other) {
return this.getPdfMinY() <= other.getPdfMaxY() && this.getPdfMaxY() >= other.getPdfMinY(); return this.getPdfMinY() <= other.getPdfMaxY() && this.getPdfMaxY() >= other.getPdfMinY();
} }
public boolean intersectsYJava(BoundingBox other) { public boolean intersectsY(BoundingBox other) {
return this.getY() <= other.getMaxY() && this.getMaxY() >= other.getY(); return this.getY() <= other.getMaxY() && this.getMaxY() >= other.getY();
} }
@ -143,25 +165,31 @@ public abstract class BoundingBox {
public boolean intersectsY(BoundingBox other, float threshold) { public boolean intersectsY(BoundingBox other, float threshold) {
return this.getY() - threshold <= other.getMaxY() && this.getMaxY() + threshold >= other.getY();
}
public boolean intersectsYPdf(BoundingBox other, float threshold) {
return this.getPdfMinY() - threshold <= other.getPdfMaxY() && this.getPdfMaxY() + threshold >= other.getPdfMinY(); return this.getPdfMinY() - threshold <= other.getPdfMaxY() && this.getPdfMaxY() + threshold >= other.getPdfMinY();
} }
public boolean intersectsX(BoundingBox other) { public boolean intersectsXPdf(BoundingBox other) {
return this.getPdfMinX() <= other.getPdfMaxX() && this.getPdfMaxX() >= other.getPdfMinX(); return this.getPdfMinX() <= other.getPdfMaxX() && this.getPdfMaxX() >= other.getPdfMinX();
} }
public boolean intersectsXJava(BoundingBox other) { public boolean intersectsX(BoundingBox other) {
return this.getX() <= other.getMaxX() && this.getMaxX() >= other.getMinX(); return this.getX() <= other.getMaxX() && this.getMaxX() >= other.getMinX();
} }
public boolean intersectsX(BoundingBox other, float threshold) { public boolean intersectsXPdf(BoundingBox other, float threshold) {
return this.getPdfMinX() - threshold <= other.getMaxX() && this.getMaxX() + threshold >= other.getPdfMinX(); return this.getPdfMinX() - threshold <= other.getPdfMaxX() && this.getMaxX() + threshold >= other.getPdfMinX();
} }
@ -170,8 +198,8 @@ public abstract class BoundingBox {
this.bBox = components.stream() this.bBox = components.stream()
.map(BoundingBox::getBBox) .map(BoundingBox::getBBox)
.collect(RectangleTransformations.collectBBox()); .collect(RectangleTransformations.collectBBox());
this.bBoxInitialUserSpace = components.stream() this.bBoxPdf = components.stream()
.map(BoundingBox::getBBoxInitialUserSpace) .map(BoundingBox::getBBoxPdf)
.collect(RectangleTransformations.collectBBox()); .collect(RectangleTransformations.collectBBox());
} }
@ -229,25 +257,25 @@ public abstract class BoundingBox {
public boolean rightOf(BoundingBox other) { public boolean rightOf(BoundingBox other) {
return this.intersectsYJava(other) && other.getMaxX() <= this.getMinX(); return this.intersectsY(other) && other.getMaxX() <= this.getMinX();
} }
public boolean leftOf(BoundingBox other) { public boolean leftOf(BoundingBox other) {
return this.intersectsYJava(other) && other.getMinX() >= this.getMaxX(); return this.intersectsY(other) && other.getMinX() >= this.getMaxX();
} }
public boolean isAbove(BoundingBox other) { public boolean isAbove(BoundingBox other) {
return this.intersectsXJava(other) && other.getMinY() >= this.getMaxY(); return this.intersectsX(other) && other.getMinY() >= this.getMaxY();
} }
public boolean isBelow(BoundingBox other) { public boolean isBelow(BoundingBox other) {
return this.intersectsXJava(other) && this.getMinY() >= other.getMaxY(); return this.intersectsX(other) && this.getMinY() >= other.getMaxY();
} }
} }

View File

@ -35,7 +35,7 @@ public class Character {
public double getHeight() { public double getHeight() {
return textPosition.getHeightDir(); return textPosition.getHeightDirAdj();
} }
@ -65,9 +65,9 @@ public class Character {
double s = Math.sin(-0); double s = Math.sin(-0);
double c = Math.cos(-0); double c = Math.cos(-0);
xs[0] = c * x - s * y; xs[0] = c * x - s * y;
xs[1] = c * (x + textPosition.getWidthDirAdj()) - s * (y + textPosition.getHeightDir()); xs[1] = c * (x + textPosition.getWidthDirAdj()) - s * (y + textPosition.getHeightDirAdj());
xs[2] = c * other.x - s * other.y; xs[2] = c * other.x - s * other.y;
xs[3] = c * (other.x + other.textPosition.getWidthDirAdj()) - s * (other.y + other.textPosition.getHeightDir()); xs[3] = c * (other.x + other.textPosition.getWidthDirAdj()) - s * (other.y + other.textPosition.getHeightDirAdj());
boolean overlapping = xs[1] >= xs[2] && xs[3] >= xs[0]; boolean overlapping = xs[1] >= xs[2] && xs[3] >= xs[0];
Arrays.sort(xs); Arrays.sort(xs);
return Math.abs(xs[2] - xs[1]) * (overlapping ? 1 : -1); return Math.abs(xs[2] - xs[1]) * (overlapping ? 1 : -1);

View File

@ -1,18 +1,28 @@
package com.knecon.fforesight.service.layoutparser.processor.docstrum.model; package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;
import static com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence.BOLD;
import static com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence.BOLD_ITALIC;
import static com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence.ITALIC;
import static com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence.STANDARD;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
import java.util.Comparator;
import java.util.EnumMap;
import java.util.List; import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.FastAtan2; import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.FastAtan2;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.model.text.FontStyle;
import lombok.Data; import lombok.Data;
import lombok.EqualsAndHashCode; import lombok.EqualsAndHashCode;
@Data @Data
@EqualsAndHashCode(onlyExplicitlyIncluded = true) @EqualsAndHashCode(onlyExplicitlyIncluded = true, callSuper = false)
public class Line extends BoundingBox { public class Line extends TextBoundingBox {
private static final double WORD_DISTANCE_MULTIPLIER = 0.18; private static final double WORD_DISTANCE_MULTIPLIER = 0.18;
@ -28,6 +38,8 @@ public class Line extends BoundingBox {
private final double height; private final double height;
private FontStyle fontStyle;
private final List<Character> characters; private final List<Character> characters;
private final List<TextPositionSequence> words = new ArrayList<>(); private final List<TextPositionSequence> words = new ArrayList<>();
@ -67,6 +79,29 @@ public class Line extends BoundingBox {
height = computeHeight(); height = computeHeight();
computeWords(wordSpacing * WORD_DISTANCE_MULTIPLIER); computeWords(wordSpacing * WORD_DISTANCE_MULTIPLIER);
buildBBox(); buildBBox();
computeFontStyle();
}
/**
 * Sets {@code fontStyle} to the style with the highest word count among {@code words}.
 * <p>
 * The style reported by each word is matched against the statically imported constants
 * STANDARD, BOLD, ITALIC and BOLD_ITALIC; a word reporting any other value is not counted.
 * Falls back to {@link FontStyle#REGULAR} only if the counter map has no entries.
 */
private void computeFontStyle() {
// One zero-initialised counter per FontStyle constant, so every lookup below finds an entry.
EnumMap<FontStyle, AtomicInteger> fontStyleCounter = new EnumMap<>(FontStyle.class);
for (FontStyle fontStyle : FontStyle.values()) {
fontStyleCounter.put(fontStyle, new AtomicInteger(0));
}
for (TextPositionSequence word : words) {
// Translate the word-level style constant to the FontStyle enum and bump its counter.
switch (word.getFontStyle()) {
case STANDARD -> fontStyleCounter.get(FontStyle.REGULAR).getAndIncrement();
case BOLD -> fontStyleCounter.get(FontStyle.BOLD).getAndIncrement();
case ITALIC -> fontStyleCounter.get(FontStyle.ITALIC).getAndIncrement();
case BOLD_ITALIC -> fontStyleCounter.get(FontStyle.BOLD_ITALIC).getAndIncrement();
}
}
// Pick the entry with the largest count.
fontStyle = fontStyleCounter.entrySet()
.stream()
.max(Comparator.comparing(entry -> entry.getValue().get()))
.map(Map.Entry::getKey)
.orElse(FontStyle.REGULAR);
} }

View File

@ -0,0 +1,102 @@
package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;
import java.awt.geom.Rectangle2D;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import lombok.EqualsAndHashCode;
import lombok.Getter;
import lombok.NoArgsConstructor;
import lombok.Setter;
import lombok.experimental.SuperBuilder;
/**
 * A {@link BoundingBox} that additionally stores a second rectangle, {@code bBoxDirAdj},
 * and the {@link TextDirection} it belongs to, and exposes plain accessors for the
 * coordinates of that rectangle.
 */
@Getter
@Setter
@SuperBuilder
@NoArgsConstructor
@EqualsAndHashCode(callSuper = false)
public abstract class TextBoundingBox extends BoundingBox {

    protected Rectangle2D bBoxDirAdj;

    protected TextDirection dir;


    /**
     * Recomputes the boxes of this element from its components.
     * <p>
     * After delegating to the superclass, {@code bBoxDirAdj} becomes the box collected from the
     * {@code bBoxDirAdj} of every component that is itself a {@link TextBoundingBox}, and
     * {@code dir} becomes the direction shared by those components. Without any such component
     * the direction is {@link TextDirection#ZERO}.
     *
     * @param components the elements this box should enclose
     * @throws IllegalArgumentException if the text components report more than one distinct direction
     */
    @Override
    public void setToBBoxOfComponents(List<? extends BoundingBox> components) {

        super.setToBBoxOfComponents(components);

        // Only text components carry a dir-adjusted box and a direction.
        List<TextBoundingBox> textComponents = components.stream()
                .filter(TextBoundingBox.class::isInstance)
                .map(TextBoundingBox.class::cast)
                .collect(Collectors.toList());

        this.bBoxDirAdj = textComponents.stream()
                .map(TextBoundingBox::getBBoxDirAdj)
                .collect(RectangleTransformations.collectBBox());

        Set<TextDirection> distinctDirections = textComponents.stream()
                .map(TextBoundingBox::getDir)
                .collect(Collectors.toSet());

        if (distinctDirections.size() > 1) {
            throw new IllegalArgumentException("More than one text direction found");
        }
        dir = distinctDirections.isEmpty() ? TextDirection.ZERO : distinctDirections.iterator().next();
    }


    /** @return the x coordinate of {@code bBoxDirAdj} */
    public double getXDirAdj() {

        return bBoxDirAdj.getX();
    }


    /** @return the y coordinate of {@code bBoxDirAdj} */
    public double getYDirAdj() {

        return bBoxDirAdj.getY();
    }


    /** @return the width of {@code bBoxDirAdj} */
    public double getWidthDirAdj() {

        return bBoxDirAdj.getWidth();
    }


    /** @return the height of {@code bBoxDirAdj} */
    public double getHeightDirAdj() {

        return bBoxDirAdj.getHeight();
    }


    /** @return the largest x coordinate of {@code bBoxDirAdj} */
    public double getMaxXDirAdj() {

        return bBoxDirAdj.getMaxX();
    }


    /** @return the largest y coordinate of {@code bBoxDirAdj} */
    public double getMaxYDirAdj() {

        return bBoxDirAdj.getMaxY();
    }


    /** @return the y coordinate of the centre of {@code bBoxDirAdj} */
    public double getCenterYDirAdj() {

        return bBoxDirAdj.getCenterY();
    }


    /** @return the x coordinate of the centre of {@code bBoxDirAdj} */
    public double getCenterXDirAdj() {

        return bBoxDirAdj.getCenterX();
    }

}

View File

@ -6,9 +6,11 @@ import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import lombok.Data; import lombok.Data;
import lombok.EqualsAndHashCode;
@Data @Data
public class Zone extends BoundingBox { @EqualsAndHashCode(callSuper = false)
public class Zone extends TextBoundingBox {
private List<Line> lines; private List<Line> lines;

View File

@ -1,15 +1,17 @@
package com.knecon.fforesight.service.layoutparser.processor.docstrum.service; package com.knecon.fforesight.service.layoutparser.processor.docstrum.service;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Comparator; import java.util.Comparator;
import java.util.HashMap; import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.ListIterator; import java.util.ListIterator;
import java.util.Map; import java.util.Map;
import java.util.stream.Collectors;
import org.springframework.stereotype.Service; import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox; import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.TextBoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone; import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.DoubleUtils; import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.DoubleUtils;
@ -19,21 +21,30 @@ public class ReadingOrderService {
private static final double THRESHOLD = 5; private static final double THRESHOLD = 5;
public static final double MULTI_COLUMN_DETECTION_THRESHOLD = 1.5; public static final double MULTI_COLUMN_DETECTION_THRESHOLD = 1.5;
private static final Comparator<TextBoundingBox> COMPARATOR = //
Comparator.comparing(TextBoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
.thenComparing(TextBoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD));
public List<Zone> resolve(List<Zone> zones, boolean xyReadingOrder) { private static final Comparator<TextBoundingBox> COMPARATOR_DIR_ADJ = //
Comparator.comparing(TextBoundingBox::getYDirAdj, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
.thenComparing(TextBoundingBox::getXDirAdj, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD));
public List<Zone> resolve(List<Zone> zones, boolean xyReadingOrder, boolean useDirAdjCoords) {
if (zones.isEmpty() || zones.size() == 1) { if (zones.isEmpty() || zones.size() == 1) {
return zones; return zones;
} }
if (xyReadingOrder) { if (xyReadingOrder) {
return resolveSingleColumnReadingOrder(zones); return resolveSingleColumnReadingOrder(zones, useDirAdjCoords);
} }
Map<Long, Integer> histogram = new HashMap<>(); Map<Long, Integer> histogram = new HashMap<>();
for (Zone zone : zones) { for (Zone zone : zones) {
long minY = Math.round(zone.getBBox().getMinY()); Rectangle2D bbox = useDirAdjCoords ? zone.getBBoxDirAdj() : zone.getBBox();
long maxY = Math.round(zone.getBBox().getMaxY()); long minY = Math.round(bbox.getMinY());
long maxY = Math.round(bbox.getMaxY());
for (long i = minY; i <= maxY; i++) { for (long i = minY; i <= maxY; i++) {
histogram.put(i, histogram.getOrDefault(i, 0) + 1); histogram.put(i, histogram.getOrDefault(i, 0) + 1);
} }
@ -43,24 +54,32 @@ public class ReadingOrderService {
.stream() .stream()
.mapToInt(Integer::intValue).average() .mapToInt(Integer::intValue).average()
.orElse(1) < MULTI_COLUMN_DETECTION_THRESHOLD) { .orElse(1) < MULTI_COLUMN_DETECTION_THRESHOLD) {
return resolveSingleColumnReadingOrder(zones); return resolveSingleColumnReadingOrder(zones, useDirAdjCoords);
} else { } else {
return resolveMultiColumnReadingOder(zones); return resolveMultiColumnReadingOder(zones, useDirAdjCoords);
} }
} }
private static List<Zone> resolveSingleColumnReadingOrder(List<Zone> zones) { private static List<Zone> resolveSingleColumnReadingOrder(List<Zone> zones, boolean useDirAdjCoords) {
zones.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)) if (useDirAdjCoords) {
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))); return zones.stream()
.collect(Collectors.groupingBy(TextBoundingBox::getDir)).values()
.stream()
.flatMap(words -> words.stream()
.sorted(COMPARATOR_DIR_ADJ))
.toList();
}
zones.sort(COMPARATOR);
return zones; return zones;
} }
private List<Zone> resolveMultiColumnReadingOder(List<Zone> zones) { private List<Zone> resolveMultiColumnReadingOder(List<Zone> zones, boolean useDirAdjCoords) {
// Simple reading order resolver for multi column page layout as described here : https://pub.towardsai.net/advanced-rag-02-unveiling-pdf-parsing-b84ae866344e // Simple reading order resolver for multi column page layout as described here : https://pub.towardsai.net/advanced-rag-02-unveiling-pdf-parsing-b84ae866344e
// TODO implement a more fancy reading order resolver see https://github.com/BobLd/DocumentLayoutAnalysis/blob/master/README.md#reading-order // TODO implement a more fancy reading order resolver see https://github.com/BobLd/DocumentLayoutAnalysis/blob/master/README.md#reading-order
@ -69,11 +88,12 @@ public class ReadingOrderService {
double maxX = Double.NEGATIVE_INFINITY; double maxX = Double.NEGATIVE_INFINITY;
for (Zone zone : zones) { for (Zone zone : zones) {
if (zone.getX() < minX) { Rectangle2D bbox = useDirAdjCoords ? zone.getBBoxDirAdj() : zone.getBBox();
minX = zone.getX(); if (bbox.getX() < minX) {
minX = zone.getXDirAdj();
} }
if (zone.getX() + zone.getWidth() > maxX) { if (bbox.getMaxX() > maxX) {
maxX = zone.getX() + zone.getWidth(); maxX = zone.getMaxXDirAdj();
} }
} }
@ -82,24 +102,27 @@ public class ReadingOrderService {
List<Zone> leftOf = new ArrayList<>(); List<Zone> leftOf = new ArrayList<>();
List<Zone> rightOf = new ArrayList<>(); List<Zone> rightOf = new ArrayList<>();
List<Zone> middle = new ArrayList<>(); List<Zone> middle = new ArrayList<>();
for (Zone zone : zones) { for (Zone zone : zones) {
if (zone.getX() < midLineXCoordinate && zone.getX() + zone.getWidth() < midLineXCoordinate) { Rectangle2D bbox = useDirAdjCoords ? zone.getBBoxDirAdj() : zone.getBBox();
if (bbox.getX() < midLineXCoordinate && bbox.getX() + bbox.getWidth() < midLineXCoordinate) {
leftOf.add(zone); leftOf.add(zone);
} else if (zone.getX() > midLineXCoordinate && zone.getX() + zone.getWidth() > midLineXCoordinate) { } else if (bbox.getX() > midLineXCoordinate && bbox.getX() + bbox.getWidth() > midLineXCoordinate) {
rightOf.add(zone); rightOf.add(zone);
} else { } else {
middle.add(zone); middle.add(zone);
} }
} }
leftOf.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)) if (useDirAdjCoords) {
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))); leftOf.sort(COMPARATOR_DIR_ADJ);
rightOf.sort(COMPARATOR_DIR_ADJ);
rightOf.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)) middle.sort(COMPARATOR_DIR_ADJ);
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))); } else {
leftOf.sort(COMPARATOR);
middle.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)) rightOf.sort(COMPARATOR);
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))); middle.sort(COMPARATOR);
}
/* /*
List<Zone> leftNotIntersecting = new ArrayList<>(); List<Zone> leftNotIntersecting = new ArrayList<>();
for (Zone leftZone : leftOf) { for (Zone leftZone : leftOf) {
@ -151,8 +174,9 @@ public class ReadingOrderService {
while (itty.hasNext()) { while (itty.hasNext()) {
Zone current = itty.next(); Zone current = itty.next();
Rectangle2D bbox = useDirAdjCoords ? current.getBBoxDirAdj() : current.getBBox();
for (int i = 0; i < sortedZones.size(); i++) { for (int i = 0; i < sortedZones.size(); i++) {
if (current.getY() < sortedZones.get(i).getY()) { if (bbox.getY() < sortedZones.get(i).getY()) {
sortedZones.add(i, current); sortedZones.add(i, current);
itty.remove(); itty.remove();
break; break;

View File

@ -1,5 +1,7 @@
package com.knecon.fforesight.service.layoutparser.processor.docstrum.service; package com.knecon.fforesight.service.layoutparser.processor.docstrum.service;
import static com.knecon.fforesight.service.layoutparser.processor.model.SectionIdentifier.numericalIdentifierPattern;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Comparator; import java.util.Comparator;
import java.util.HashSet; import java.util.HashSet;
@ -21,7 +23,7 @@ public class ZoneBuilderService {
private static final double MIN_HORIZONTAL_DISTANCE_MULTIPLIER = -0.5; private static final double MIN_HORIZONTAL_DISTANCE_MULTIPLIER = -0.5;
private static final double MAX_VERTICAL_DISTANCE_MULTIPLIER = 1.2; private static final double MAX_VERTICAL_DISTANCE_MULTIPLIER = 1.2;
private static final double MIN_HORIZONTAL_MERGE_DISTANCE_MULTIPLIER = -3.0; private static final double MIN_HORIZONTAL_MERGE_DISTANCE_MULTIPLIER = -7;
private static final double MAX_VERTICAL_MERGE_DISTANCE_MULTIPLIER = 0.5; private static final double MAX_VERTICAL_MERGE_DISTANCE_MULTIPLIER = 0.5;
@ -38,7 +40,7 @@ public class ZoneBuilderService {
double minHorizontalDistance = characterSpacing * MIN_HORIZONTAL_DISTANCE_MULTIPLIER; double minHorizontalDistance = characterSpacing * MIN_HORIZONTAL_DISTANCE_MULTIPLIER;
double maxVerticalDistance = lineSpacing * MAX_VERTICAL_DISTANCE_MULTIPLIER; double maxVerticalDistance = lineSpacing * MAX_VERTICAL_DISTANCE_MULTIPLIER;
double minHorizontalMergeDistance = characterSpacing * MIN_HORIZONTAL_MERGE_DISTANCE_MULTIPLIER; double minHorizontalMergeDistance = lineSpacing * MIN_HORIZONTAL_MERGE_DISTANCE_MULTIPLIER;
double maxVerticalMergeDistance = lineSpacing * MAX_VERTICAL_MERGE_DISTANCE_MULTIPLIER; double maxVerticalMergeDistance = lineSpacing * MAX_VERTICAL_MERGE_DISTANCE_MULTIPLIER;
UnionFind<Line> unionFind = new UnionFind<>(new HashSet<>(lines)); UnionFind<Line> unionFind = new UnionFind<>(new HashSet<>(lines));
@ -54,11 +56,26 @@ public class ZoneBuilderService {
return; return;
} }
double scale = Math.min(outerLine.getHeight(), innerLine.getHeight()) / meanHeight; // if (!innerLine.getFontStyle().equals(outerLine.getFontStyle()) //
scale = Math.max(MIN_LINE_SIZE_SCALE, Math.min(scale, MAX_LINE_SIZE_SCALE)); // && !outerLine.intersectsY(innerLine, -2f)) {
// return;
// }
double horizontalDistance = outerLine.horizontalDistance(innerLine) / scale; double horizontalScale = Math.min(outerLine.getHeightDirAdj(), innerLine.getHeightDirAdj()) / meanHeight;
double verticalDistance = outerLine.verticalDistance(innerLine) / scale; horizontalScale = Math.max(MIN_LINE_SIZE_SCALE, Math.min(horizontalScale, MAX_LINE_SIZE_SCALE));
double verticalScale = horizontalScale;
// if (innerLine.toString().endsWith(":")
// || outerLine.toString().endsWith(":")
// || numericalIdentifierPattern.matcher(innerLine.toString()).matches()
// || numericalIdentifierPattern.matcher(outerLine.toString()).matches()) {
//
// horizontalScale *= 5;
// verticalScale /= 10;
// }
double horizontalDistance = outerLine.horizontalDistance(innerLine) / horizontalScale;
double verticalDistance = outerLine.verticalDistance(innerLine) / verticalScale;
if ((!(minHorizontalDistance <= horizontalDistance) || !(verticalDistance <= maxVerticalDistance)) // if ((!(minHorizontalDistance <= horizontalDistance) || !(verticalDistance <= maxVerticalDistance)) //
&& (!(minHorizontalMergeDistance <= horizontalDistance) || !(verticalDistance <= maxVerticalMergeDistance))) { && (!(minHorizontalMergeDistance <= horizontalDistance) || !(verticalDistance <= maxVerticalMergeDistance))) {
@ -87,7 +104,7 @@ public class ZoneBuilderService {
double weights = 0.0; double weights = 0.0;
for (Line line : lines) { for (Line line : lines) {
double weight = line.getLength(); double weight = line.getLength();
meanHeight += line.getHeight() * weight; meanHeight += line.getHeightDirAdj() * weight;
weights += weight; weights += weight;
} }
meanHeight /= weights; meanHeight /= weights;

View File

@ -32,6 +32,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.He
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock; import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.FontStyle;
public class MarkdownMapper extends AbstractNodeVisitor { public class MarkdownMapper extends AbstractNodeVisitor {
@ -297,12 +298,6 @@ public class MarkdownMapper extends AbstractNodeVisitor {
} }
enum FontStyle {
REGULAR,
BOLD,
ITALIC,
BOLD_ITALIC;
}
record FontStyleChange(boolean enter, FontStyle style) { record FontStyleChange(boolean enter, FontStyle style) {

View File

@ -18,6 +18,7 @@ import lombok.RequiredArgsConstructor;
@Data @Data
@RequiredArgsConstructor @RequiredArgsConstructor
public class ClassificationPage { public class ClassificationPage {
@NonNull @NonNull
@ -44,7 +45,7 @@ public class ClassificationPage {
private float pageWidth; private float pageWidth;
private float pageHeight; private float pageHeight;
CleanRulings cleanRulings; private CleanRulings cleanRulings;
private Map<String, List<Rectangle2D>> markedContentBboxPerType = new HashMap<>(); private Map<String, List<Rectangle2D>> markedContentBboxPerType = new HashMap<>();

View File

@ -12,10 +12,10 @@ import lombok.Getter;
@Getter @Getter
public class FloatFrequencyCounter { public class FloatFrequencyCounter {
Map<Float, Integer> countPerValue = new HashMap<>(); Map<Double, Integer> countPerValue = new HashMap<>();
public void add(float value) { public void add(double value) {
if (!countPerValue.containsKey(value)) { if (!countPerValue.containsKey(value)) {
countPerValue.put(value, 1); countPerValue.put(value, 1);
@ -25,9 +25,9 @@ public class FloatFrequencyCounter {
} }
public void addAll(Map<Float, Integer> otherCounter) { public void addAll(Map<Double, Integer> otherCounter) {
for (Map.Entry<Float, Integer> entry : otherCounter.entrySet()) { for (Map.Entry<Double, Integer> entry : otherCounter.entrySet()) {
if (countPerValue.containsKey(entry.getKey())) { if (countPerValue.containsKey(entry.getKey())) {
countPerValue.put(entry.getKey(), countPerValue.get(entry.getKey()) + entry.getValue()); countPerValue.put(entry.getKey(), countPerValue.get(entry.getKey()) + entry.getValue());
} else { } else {
@ -37,10 +37,10 @@ public class FloatFrequencyCounter {
} }
public Float getMostPopular() { public Double getMostPopular() {
Map.Entry<Float, Integer> mostPopular = null; Map.Entry<Double, Integer> mostPopular = null;
for (Map.Entry<Float, Integer> entry : countPerValue.entrySet()) { for (Map.Entry<Double, Integer> entry : countPerValue.entrySet()) {
if (mostPopular == null || entry.getValue() >= mostPopular.getValue()) { if (mostPopular == null || entry.getValue() >= mostPopular.getValue()) {
mostPopular = entry; mostPopular = entry;
} }
@ -49,11 +49,11 @@ public class FloatFrequencyCounter {
} }
public List<Float> getHighterThanMostPopular() { public List<Double> getHigherThanMostPopular() {
Float mostPopular = getMostPopular(); Double mostPopular = getMostPopular();
List<Float> higher = new ArrayList<>(); List<Double> higher = new ArrayList<>();
for (Float value : countPerValue.keySet()) { for (Double value : countPerValue.keySet()) {
if (value > mostPopular) { if (value > mostPopular) {
higher.add(value); higher.add(value);
} }
@ -63,10 +63,10 @@ public class FloatFrequencyCounter {
} }
public Float getHighest() { public Double getHighest() {
Float highest = null; Double highest = null;
for (Float value : countPerValue.keySet()) { for (Double value : countPerValue.keySet()) {
if (highest == null || value > highest) { if (highest == null || value > highest) {
highest = value; highest = value;
} }

View File

@ -15,7 +15,7 @@ import lombok.experimental.FieldDefaults;
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class SectionIdentifier { public class SectionIdentifier {
static Pattern numericalIdentifierPattern = Pattern.compile("^[\\s]?(\\d+)[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?"); public static Pattern numericalIdentifierPattern = Pattern.compile("^[\\s]?(\\d+)[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?");
public enum Format { public enum Format {
EMPTY, EMPTY,

View File

@ -145,10 +145,7 @@ public class AtomicTextBlock implements TextBlock {
} }
public static AtomicTextBlock fromAtomicTextBlockData(DocumentTextData documentTextData, public static AtomicTextBlock fromAtomicTextBlockData(DocumentTextData documentTextData, DocumentPositionData documentPositionData, SemanticNode parent, Page page) {
DocumentPositionData documentPositionData,
SemanticNode parent,
Page page) {
return AtomicTextBlock.builder() return AtomicTextBlock.builder()
.id(documentTextData.getId()) .id(documentTextData.getId())
@ -156,8 +153,10 @@ public class AtomicTextBlock implements TextBlock {
.page(page) .page(page)
.textRange(new TextRange(documentTextData.getStart(), documentTextData.getEnd())) .textRange(new TextRange(documentTextData.getStart(), documentTextData.getEnd()))
.searchText(documentTextData.getSearchText()) .searchText(documentTextData.getSearchText())
.lineBreaks(Arrays.stream(documentTextData.getLineBreaks()).boxed().toList()) .lineBreaks(Arrays.stream(documentTextData.getLineBreaks()).boxed()
.stringIdxToPositionIdx(Arrays.stream(documentPositionData.getStringIdxToPositionIdx()).boxed().toList()) .toList())
.stringIdxToPositionIdx(Arrays.stream(documentPositionData.getStringIdxToPositionIdx()).boxed()
.toList())
.positions(toRectangle2DList(documentPositionData.getPositions())) .positions(toRectangle2DList(documentPositionData.getPositions()))
.parent(parent) .parent(parent)
.build(); .build();
@ -166,7 +165,9 @@ public class AtomicTextBlock implements TextBlock {
private static List<Rectangle2D> toRectangle2DList(float[][] positions) { private static List<Rectangle2D> toRectangle2DList(float[][] positions) {
return Arrays.stream(positions).map(floatArr -> (Rectangle2D) new Rectangle2D.Float(floatArr[0], floatArr[1], floatArr[2], floatArr[3])).toList(); return Arrays.stream(positions)
.map(floatArr -> (Rectangle2D) new Rectangle2D.Float(floatArr[0], floatArr[1], floatArr[2], floatArr[3]))
.toList();
} }
@ -176,6 +177,9 @@ public class AtomicTextBlock implements TextBlock {
throw new IndexOutOfBoundsException(format("line %d out of range for AtomicTextBlock with %d lines", lineNumber, numberOfLines())); throw new IndexOutOfBoundsException(format("line %d out of range for AtomicTextBlock with %d lines", lineNumber, numberOfLines()));
} }
if (lineNumber == 0) { if (lineNumber == 0) {
if (lineBreaks.isEmpty()) {
return searchText;
}
return subSequence(textRange.start(), lineBreaks.get(0) + textRange.start()); return subSequence(textRange.start(), lineBreaks.get(0) + textRange.start());
} else if (lineNumber == numberOfLines() - 1) { } else if (lineNumber == numberOfLines() - 1) {
return subSequence(lineBreaks.get(lineBreaks.size() - 1) + textRange.start(), textRange.end()); return subSequence(lineBreaks.get(lineBreaks.size() - 1) + textRange.start(), textRange.end());
@ -255,7 +259,10 @@ public class AtomicTextBlock implements TextBlock {
protected List<Integer> getAllLineBreaksInBoundary(TextRange textRange) { protected List<Integer> getAllLineBreaksInBoundary(TextRange textRange) {
return getLineBreaks().stream().map(linebreak -> linebreak + this.textRange.start()).filter(textRange::contains).toList(); return getLineBreaks().stream()
.map(linebreak -> linebreak + this.textRange.start())
.filter(textRange::contains)
.toList();
} }

View File

@ -10,8 +10,8 @@ import lombok.NonNull;
import lombok.RequiredArgsConstructor; import lombok.RequiredArgsConstructor;
@Data @Data
@RequiredArgsConstructor
@AllArgsConstructor @AllArgsConstructor
@RequiredArgsConstructor
public class ClassifiedImage { public class ClassifiedImage {
@NonNull @NonNull
@ -20,11 +20,19 @@ public class ClassifiedImage {
private ImageType imageType; private ImageType imageType;
private boolean sourceByAi; private boolean sourceByAi;
private boolean isAppendedToSection; private boolean isAppendedToSection;
@NonNull
private boolean hasTransparency; private boolean hasTransparency;
@NonNull
private int page; private int page;
@NonNull @NonNull
private String representation; private String representation;
public ClassifiedImage(@NonNull Rectangle2D position, @NonNull ImageType imageType, boolean hasTransparency, int page, @NonNull String representation) {
this.position = position;
this.imageType = imageType;
this.hasTransparency = hasTransparency;
this.page = page;
this.representation = representation;
}
} }

View File

@ -35,14 +35,14 @@ public class Cell extends BoundingBox {
public Cell(Point2D topLeft, Point2D bottomRight) { public Cell(Point2D topLeft, Point2D bottomRight) {
this.bBoxInitialUserSpace = new Rectangle2D.Double(topLeft.getX(), topLeft.getY(), (bottomRight.getX() - topLeft.getX()), (bottomRight.getY() - topLeft.getY())); this.bBoxPdf = new Rectangle2D.Double(topLeft.getX(), topLeft.getY(), (bottomRight.getX() - topLeft.getX()), (bottomRight.getY() - topLeft.getY()));
this.bBox = bBoxInitialUserSpace; this.bBox = bBoxPdf;
} }
public Cell(Rectangle2D bBoxInitialUserSpace, AffineTransform initialUserSpaceToJava) { public Cell(Rectangle2D bBoxInitialUserSpace, AffineTransform initialUserSpaceToJava) {
this.bBoxInitialUserSpace = bBoxInitialUserSpace; this.bBoxPdf = bBoxInitialUserSpace;
this.bBox = initialUserSpaceToJava.createTransformedShape(bBoxInitialUserSpace).getBounds2D(); this.bBox = initialUserSpaceToJava.createTransformedShape(bBoxInitialUserSpace).getBounds2D();
} }
@ -50,7 +50,7 @@ public class Cell extends BoundingBox {
public static Cell copy(Cell cell) { public static Cell copy(Cell cell) {
Cell copy = new Cell(); Cell copy = new Cell();
copy.bBoxInitialUserSpace = cell.bBoxInitialUserSpace; copy.bBoxPdf = cell.bBoxPdf;
copy.bBox = cell.bBox; copy.bBox = cell.bBox;
return copy; return copy;
} }

View File

@ -70,7 +70,7 @@ public class CleanRulings {
public boolean lineBetween(BoundingBox a, BoundingBox b) { public boolean lineBetween(BoundingBox a, BoundingBox b) {
return lineBetween(a.getBBoxInitialUserSpace(), b.getBBoxInitialUserSpace()); return lineBetween(a.getBBoxPdf(), b.getBBoxPdf());
} }

View File

@ -263,8 +263,8 @@ public class TablePageBlock extends AbstractPageBlock {
cells.stream() cells.stream()
.map(originalCell -> new CellWithIntersection(originalCell, .map(originalCell -> new CellWithIntersection(originalCell,
RectangleTransformations.calculateIntersectedArea(cellFromGridStructure.getBBoxInitialUserSpace(), RectangleTransformations.calculateIntersectedArea(cellFromGridStructure.getBBoxPdf(),
originalCell.getBBoxInitialUserSpace()))) originalCell.getBBoxPdf())))
.filter(cellWithIntersection -> cellWithIntersection.intersectedArea > 0) .filter(cellWithIntersection -> cellWithIntersection.intersectedArea > 0)
.filter(cellWithIntersection -> cellWithIntersection.originalCell.getArea() > cellWithIntersection.intersectedArea * CELL_AREA_CONTAINED_THRESHOLD) .filter(cellWithIntersection -> cellWithIntersection.originalCell.getArea() > cellWithIntersection.intersectedArea * CELL_AREA_CONTAINED_THRESHOLD)
.max(Comparator.comparing(CellWithIntersection::intersectedArea)) .max(Comparator.comparing(CellWithIntersection::intersectedArea))

View File

@ -0,0 +1,9 @@
package com.knecon.fforesight.service.layoutparser.processor.model.text;
public enum FontStyle {
REGULAR,
BOLD,
ITALIC,
BOLD_ITALIC;
}

View File

@ -5,64 +5,50 @@ import java.awt.geom.Rectangle2D;
import org.apache.pdfbox.text.TextPosition; import org.apache.pdfbox.text.TextPosition;
import com.fasterxml.jackson.annotation.JsonIgnore; import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.TextBoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox; import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.FastAtan2;
import lombok.AccessLevel;
import lombok.AllArgsConstructor; import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data; import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor; import lombok.NoArgsConstructor;
import lombok.SneakyThrows; import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
@Data @Data
@Builder @SuperBuilder
@NoArgsConstructor @NoArgsConstructor
@AllArgsConstructor @AllArgsConstructor
public class RedTextPosition extends BoundingBox { @EqualsAndHashCode(callSuper = true)
@FieldDefaults(level = AccessLevel.PRIVATE)
public class RedTextPosition extends TextBoundingBox {
public final static int HEIGHT_PADDING = 2; public final static int HEIGHT_PADDING = 2;
private Rectangle2D.Float bBoxDirAdj; // adjusted to text rotation String unicode;
@JsonIgnore // estimated using the TextMatrix in radians
private int rotation; float exactDir;
@JsonIgnore float widthOfSpace;
private float pageHeight;
@JsonIgnore float fontSizeInPt;
private float pageWidth;
private String unicode; String fontName;
@JsonIgnore
private float dir;
// not used in reanalysis
@JsonIgnore
private float widthOfSpace;
// not used in reanalysis
@JsonIgnore
private float fontSizeInPt;
// not used in reanalysis
@JsonIgnore
private String fontName;
@SneakyThrows @SneakyThrows
public static RedTextPosition fromTextPosition(TextPosition textPosition) { public static RedTextPosition fromTextPosition(TextPosition textPosition) {
var pos = new RedTextPosition(); var pos = new RedTextPosition();
pos.setRotation(textPosition.getRotation());
pos.setPageHeight(textPosition.getPageHeight());
pos.setPageWidth(textPosition.getPageWidth());
pos.setUnicode(textPosition.getUnicode()); pos.setUnicode(textPosition.getUnicode());
pos.setDir(textPosition.getDir());
pos.setWidthOfSpace(textPosition.getWidthOfSpace()); pos.setWidthOfSpace(textPosition.getWidthOfSpace());
pos.setFontSizeInPt(textPosition.getFontSizeInPt()); pos.setFontSizeInPt(textPosition.getFontSizeInPt());
pos.setFontName(textPosition.getFont().getName()); pos.setFontName(textPosition.getFont().getName());
pos.setExactDir((float) FastAtan2.fastAtan2(textPosition.getTextMatrix().getShearY(), textPosition.getTextMatrix().getScaleX()));
pos.setDir(TextDirection.fromDegrees(textPosition.getDir()));
//TODO: There is a mismatch in the java coords of the text and the rulings, //TODO: There is a mismatch in the java coords of the text and the rulings,
// I guess if we start with the initial user space positions and transform them the same way we do the rulings it would work. // I guess if we start with the initial user space positions and transform them the same way we do the rulings it would work.
@ -73,18 +59,18 @@ public class RedTextPosition extends BoundingBox {
textPosition.getYDirAdj() - textHeight, textPosition.getYDirAdj() - textHeight,
textPosition.getWidthDirAdj(), textPosition.getWidthDirAdj(),
textHeight + HEIGHT_PADDING); textHeight + HEIGHT_PADDING);
pos.setBBoxDirAdj(dirAdjPosition); pos.setBBoxDirAdj(dirAdjPosition);
AffineTransform affineTransform = getRotationMatrix(TextDirection.fromDegrees(textPosition.getDir()), textPosition.getPageWidth(), textPosition.getPageHeight()); AffineTransform affineTransform = getRotationMatrix(TextDirection.fromDegrees(textPosition.getDir()), textPosition.getPageWidth(), textPosition.getPageHeight());
Rectangle2D bBoxInitialUserSpace = affineTransform.createTransformedShape(dirAdjPosition).getBounds2D(); Rectangle2D bBoxInitialUserSpace = affineTransform.createTransformedShape(dirAdjPosition).getBounds2D();
pos.setBBoxInitialUserSpace(bBoxInitialUserSpace); // These are definitely correct pos.setBBoxPdf(bBoxInitialUserSpace); // These are definitely correct
return pos; return pos;
} }
private static AffineTransform getRotationMatrix(TextDirection textDirection, float pageWidth, float pageHeight) { private static AffineTransform getRotationMatrix(TextDirection textDirection, float pageWidth, float pageHeight) {
AffineTransform transform = new AffineTransform(); AffineTransform transform = new AffineTransform();
@ -103,32 +89,4 @@ public class RedTextPosition extends BoundingBox {
return transform; return transform;
} }
@JsonIgnore
public float getXDirAdj() {
return this.bBoxDirAdj.x;
}
@JsonIgnore
public float getYDirAdj() {
return this.bBoxDirAdj.y;
}
@JsonIgnore
public float getWidthDirAdj() {
return this.bBoxDirAdj.width;
}
@JsonIgnore
public float getHeightDir() {
return this.bBoxDirAdj.height;
}
} }

View File

@ -44,4 +44,15 @@ public enum TextDirection {
throw new IllegalArgumentException(String.format("A value of %f is not supported by TextDirection", degrees)); throw new IllegalArgumentException(String.format("A value of %f is not supported by TextDirection", degrees));
} }
public int getRotation() {
return switch (this) {
case ZERO -> 0;
case QUARTER_CIRCLE -> 1;
case HALF_CIRCLE -> 2;
case THREE_QUARTER_CIRCLE -> 3;
};
}
} }

View File

@ -8,6 +8,7 @@ import com.fasterxml.jackson.annotation.JsonIgnore;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter; import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType; import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities; import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities;
import lombok.AllArgsConstructor; import lombok.AllArgsConstructor;
@ -26,17 +27,19 @@ public class TextPageBlock extends AbstractPageBlock {
@Builder.Default @Builder.Default
private List<TextPositionSequence> sequences = new ArrayList<>(); private List<TextPositionSequence> sequences = new ArrayList<>();
private Rectangle2D bBoxDirAdj;
private String mostPopularWordFont; private String mostPopularWordFont;
private String mostPopularWordStyle; private String mostPopularWordStyle;
private float mostPopularWordFontSize; private double mostPopularWordFontSize;
private float mostPopularWordHeight; private double mostPopularWordHeight;
private float mostPopularWordSpaceWidth; private double mostPopularWordSpaceWidth;
private float highestFontSize; private double highestFontSize;
private PageBlockType classification; private PageBlockType classification;
@ -51,34 +54,24 @@ public class TextPageBlock extends AbstractPageBlock {
} }
@JsonIgnore
public TextDirection getDir() { public TextDirection getDir() {
return sequences.get(0).getDir(); return sequences.get(0).getDir();
} }
@JsonIgnore
public float getPageHeight() {
return sequences.get(0).getPageHeight();
}
@JsonIgnore
public float getPageWidth() {
return sequences.get(0).getPageWidth();
}
private void calculateBBox() { private void calculateBBox() {
if (sequences == null) { if (sequences == null) {
this.bBox = new Rectangle2D.Double(); this.bBox = new Rectangle2D.Double();
this.bBoxInitialUserSpace = new Rectangle2D.Double(); this.bBoxPdf = new Rectangle2D.Double();
this.bBoxDirAdj = new Rectangle2D.Double();
return; return;
} }
this.bBoxDirAdj = sequences.stream()
.map(TextPositionSequence::getBBoxDirAdj)
.collect(RectangleTransformations.collectBBox());
setToBBoxOfComponents(sequences); setToBBoxOfComponents(sequences);
} }

View File

@ -1,6 +1,7 @@
package com.knecon.fforesight.service.layoutparser.processor.model.text; package com.knecon.fforesight.service.layoutparser.processor.model.text;
import java.awt.geom.Rectangle2D; import static com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition.HEIGHT_PADDING;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
import java.util.Locale; import java.util.Locale;
@ -8,8 +9,7 @@ import java.util.stream.Collectors;
import org.apache.pdfbox.text.TextPosition; import org.apache.pdfbox.text.TextPosition;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox; import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.TextBoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import lombok.AllArgsConstructor; import lombok.AllArgsConstructor;
import lombok.Builder; import lombok.Builder;
@ -23,23 +23,21 @@ import lombok.extern.slf4j.Slf4j;
@Builder @Builder
@NoArgsConstructor @NoArgsConstructor
@AllArgsConstructor @AllArgsConstructor
@EqualsAndHashCode(onlyExplicitlyIncluded = true, callSuper = false) @EqualsAndHashCode(onlyExplicitlyIncluded = true, callSuper = true) // needs the bbox to be unique
public class TextPositionSequence extends BoundingBox implements CharSequence { public class TextPositionSequence extends TextBoundingBox implements CharSequence {
public static final int HEIGHT_PADDING = 2; public static final String STANDARD = "standard";
public static final String BOLD_ITALIC = "bold, italic";
public static final String BOLD = "bold";
public static final String ITALIC = "italic";
@EqualsAndHashCode.Include @EqualsAndHashCode.Include
private int page; private int page;
@EqualsAndHashCode.Include @EqualsAndHashCode.Include
@Builder.Default
private List<RedTextPosition> textPositions = new ArrayList<>(); private List<RedTextPosition> textPositions = new ArrayList<>();
private Rectangle2D bBoxDirAdj;
@EqualsAndHashCode.Include
private TextDirection dir;
private int rotation;
private float pageHeight;
private float pageWidth;
private boolean isParagraphStart; private boolean isParagraphStart;
private boolean strikethrough; private boolean strikethrough;
private boolean underline; private boolean underline;
@ -51,10 +49,6 @@ public class TextPositionSequence extends BoundingBox implements CharSequence {
.map(RedTextPosition::fromTextPosition) .map(RedTextPosition::fromTextPosition)
.collect(Collectors.toList()); .collect(Collectors.toList());
this.page = pageNumber; this.page = pageNumber;
this.dir = TextDirection.fromDegrees(textPositions.get(0).getDir());
this.rotation = textPositions.get(0).getRotation();
this.pageHeight = textPositions.get(0).getPageHeight();
this.pageWidth = textPositions.get(0).getPageWidth();
this.isParagraphStart = isParagraphStart; this.isParagraphStart = isParagraphStart;
calculateBBox(); calculateBBox();
} }
@ -62,9 +56,6 @@ public class TextPositionSequence extends BoundingBox implements CharSequence {
private void calculateBBox() { private void calculateBBox() {
this.bBoxDirAdj = textPositions.stream()
.map(RedTextPosition::getBBoxDirAdj)
.collect(RectangleTransformations.collectBBox());
setToBBoxOfComponents(getTextPositions()); setToBBoxOfComponents(getTextPositions());
} }
@ -73,10 +64,6 @@ public class TextPositionSequence extends BoundingBox implements CharSequence {
this.textPositions = textPositions; this.textPositions = textPositions;
this.page = page; this.page = page;
this.dir = TextDirection.fromDegrees(textPositions.get(0).getDir());
this.rotation = textPositions.get(0).getRotation();
this.pageHeight = textPositions.get(0).getPageHeight();
this.pageWidth = textPositions.get(0).getPageWidth();
calculateBBox(); calculateBBox();
} }
@ -112,9 +99,6 @@ public class TextPositionSequence extends BoundingBox implements CharSequence {
textPositionSequence.textPositions = textPositions.subList(start, end); textPositionSequence.textPositions = textPositions.subList(start, end);
textPositionSequence.page = page; textPositionSequence.page = page;
textPositionSequence.dir = dir; textPositionSequence.dir = dir;
textPositionSequence.rotation = rotation;
textPositionSequence.pageHeight = pageHeight;
textPositionSequence.pageWidth = pageWidth;
textPositionSequence.setToBBoxOfComponents(getTextPositions()); textPositionSequence.setToBBoxOfComponents(getTextPositions());
return textPositionSequence; return textPositionSequence;
} }
@ -141,10 +125,6 @@ public class TextPositionSequence extends BoundingBox implements CharSequence {
this.textPositions.add(textPosition); this.textPositions.add(textPosition);
this.page = textPositionSequence.getPage(); this.page = textPositionSequence.getPage();
this.dir = textPositionSequence.getDir();
this.rotation = textPositionSequence.getRotation();
this.pageHeight = textPositionSequence.getPageHeight();
this.pageWidth = textPositionSequence.getPageWidth();
calculateBBox(); calculateBBox();
} }
@ -152,79 +132,18 @@ public class TextPositionSequence extends BoundingBox implements CharSequence {
public void add(TextPosition textPosition) { public void add(TextPosition textPosition) {
this.textPositions.add(RedTextPosition.fromTextPosition(textPosition)); this.textPositions.add(RedTextPosition.fromTextPosition(textPosition));
this.dir = TextDirection.fromDegrees(textPositions.get(0).getDir());
this.rotation = textPositions.get(0).getRotation();
this.pageHeight = textPositions.get(0).getPageHeight();
this.pageWidth = textPositions.get(0).getPageWidth();
calculateBBox(); calculateBBox();
} }
public double getTextHeightNoPadding() {
/** return textPositions.get(0).getHeightDirAdj();
* This value is adjusted so that 0,0 is upper left and it is adjusted based on the text direction.
* This method ignores the page rotation but takes the text rotation and adjusts the coordinates to awt.
*
* @return the text direction adjusted minX value
*/
public float getMinXDirAdj() {
return textPositions.get(0).getXDirAdj();
} }
/** public double getTextHeight() {
* This value is adjusted so that 0,0 is upper left and it is adjusted based on the text direction.
* This method ignores the page rotation but takes the text rotation and adjusts the coordinates to awt.
*
* @return the text direction adjusted maxX value
*/
public float getMaxXDirAdj() { return textPositions.get(0).getHeightDirAdj() + HEIGHT_PADDING;
return textPositions.get(textPositions.size() - 1).getXDirAdj() + textPositions.get(textPositions.size() - 1).getWidthDirAdj() + HEIGHT_PADDING;
}
/**
* This value is adjusted so that 0,0 is upper left and it is adjusted based on the text direction.
* This method ignores the page rotation but takes the text rotation and adjusts the coordinates to awt.
*
* @return the text direction adjusted minY value. The upper border of the bounding box of the word.
*/
public float getMinYDirAdj() {
return textPositions.get(0).getYDirAdj() - getTextHeight();
}
/**
* This value is adjusted so that 0,0 is upper left and it is adjusted based on the text direction.
* This method ignores the page rotation but takes the text rotation and adjusts the coordinates to awt.
*
* @return the text direction adjusted maxY value. The lower border of the bounding box of the word.
*/
public float getMaxYDirAdj() {
return textPositions.get(0).getYDirAdj();
}
public float getTextHeightNoPadding() {
return textPositions.get(0).getHeightDir();
}
public float getTextHeight() {
return textPositions.get(0).getHeightDir() + HEIGHT_PADDING;
} }
@ -240,18 +159,18 @@ public class TextPositionSequence extends BoundingBox implements CharSequence {
public String getFontStyle() { public String getFontStyle() {
if (textPositions.get(0).getFontName() == null) { if (textPositions.get(0).getFontName() == null) {
return "standard"; return STANDARD;
} }
String lowercaseFontName = textPositions.get(0).getFontName().toLowerCase(Locale.ROOT); String lowercaseFontName = textPositions.get(0).getFontName().toLowerCase(Locale.ROOT);
if (lowercaseFontName.contains("bold") && lowercaseFontName.contains("italic")) { if (lowercaseFontName.contains(BOLD) && lowercaseFontName.contains(ITALIC)) {
return "bold, italic"; return BOLD_ITALIC;
} else if (lowercaseFontName.contains("bold")) { } else if (lowercaseFontName.contains(BOLD)) {
return "bold"; return BOLD;
} else if (lowercaseFontName.contains("italic")) { } else if (lowercaseFontName.contains(ITALIC)) {
return "italic"; return ITALIC;
} else { } else {
return "standard"; return STANDARD;
} }
} }

View File

@ -56,7 +56,7 @@ public class ImageServiceResponseAdapter {
classificationPage.getImages().forEach(image -> { classificationPage.getImages().forEach(image -> {
if (image.getImageType().equals(ImageType.OTHER)) { if (image.getImageType().equals(ImageType.OTHER)) {
for (AbstractPageBlock textblock : classificationPage.getTextBlocks()) { for (AbstractPageBlock textblock : classificationPage.getTextBlocks()) {
if (image.getPosition().contains(textblock.getBBoxInitialUserSpace())) { if (image.getPosition().contains(textblock.getBBoxPdf())) {
image.setImageType(ImageType.OCR); image.setImageType(ImageType.OCR);
return; return;
} }

View File

@ -14,6 +14,7 @@ import lombok.NoArgsConstructor;
@AllArgsConstructor @AllArgsConstructor
public class Classification { public class Classification {
@Builder.Default
private Map<String, Float> probabilities = new HashMap<>(); private Map<String, Float> probabilities = new HashMap<>();
private String label; private String label;

View File

@ -22,8 +22,10 @@ public class ImageServiceResponse {
@JsonProperty(value = "imageMetadata") @JsonProperty(value = "imageMetadata")
@JsonAlias("data") @JsonAlias("data")
@Builder.Default
private List<ImageMetadata> data = new ArrayList<>(); private List<ImageMetadata> data = new ArrayList<>();
@Builder.Default
private List<ImageMetadata> dataCV = new ArrayList<>(); private List<ImageMetadata> dataCV = new ArrayList<>();

View File

@ -15,6 +15,7 @@ import lombok.NoArgsConstructor;
public class TableData { public class TableData {
private PageInfo pageInfo; private PageInfo pageInfo;
@Builder.Default
private List<TableCells> tableCells = new ArrayList<>(); private List<TableCells> tableCells = new ArrayList<>();
} }

View File

@ -19,7 +19,7 @@ public class TableServiceResponse {
private String operation; private String operation;
private String targetFileExtension; private String targetFileExtension;
private String responseFileExtension; private String responseFileExtension;
@Builder.Default
private List<TableData> data = new ArrayList<>(); private List<TableData> data = new ArrayList<>();
} }

View File

@ -6,7 +6,6 @@ import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation; import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import lombok.AllArgsConstructor; import lombok.AllArgsConstructor;
import lombok.experimental.UtilityClass; import lombok.experimental.UtilityClass;
@ -38,7 +37,7 @@ public class GapDetectionService {
for (TextPositionSequence currentTextPosition : sortedTextPositionSequences.subList(1, sortedTextPositionSequences.size())) { for (TextPositionSequence currentTextPosition : sortedTextPositionSequences.subList(1, sortedTextPositionSequences.size())) {
double yDifference = Math.abs(currentTextPosition.getMaxYDirAdj() - previousTextPosition.getMaxYDirAdj()); double yDifference = Math.abs(currentTextPosition.getMaxYDirAdj() - previousTextPosition.getMaxYDirAdj());
double xGap = Math.abs(previousTextPosition.getMaxXDirAdj() - currentTextPosition.getMinXDirAdj()); double xGap = Math.abs(previousTextPosition.getMaxXDirAdj() - currentTextPosition.getXDirAdj());
Rectangle2D previousTextPositionBBox = toRectangle2D(previousTextPosition); Rectangle2D previousTextPositionBBox = toRectangle2D(previousTextPosition);
Rectangle2D currentTextPositionBBox = toRectangle2D(currentTextPosition); Rectangle2D currentTextPositionBBox = toRectangle2D(currentTextPosition);

View File

@ -71,7 +71,7 @@ public class LineDetectionService {
/**
 * Decides whether two consecutive text position sequences are separated by a horizontal gap.
 * <p>
 * The absolute difference between the previous sequence's {@code getMaxXDirAdj()} and the current
 * sequence's X coordinate ({@code getMinXDirAdj()} before the change, {@code getXDirAdj()} after it)
 * is compared against {@code avgTextPositionHeight * X_GAP_FACTOR}.
 *
 * @param currentTextPosition   the sequence being inspected
 * @param previousTextPosition  the sequence preceding it
 * @param avgTextPositionHeight the average height used to scale the gap threshold
 * @return {@code true} if the horizontal distance is strictly greater than the scaled threshold
 */
private static boolean isXGap(TextPositionSequence currentTextPosition, TextPositionSequence previousTextPosition, double avgTextPositionHeight) { private static boolean isXGap(TextPositionSequence currentTextPosition, TextPositionSequence previousTextPosition, double avgTextPositionHeight) {
return Math.abs(previousTextPosition.getMaxXDirAdj() - currentTextPosition.getMinXDirAdj()) > (avgTextPositionHeight * X_GAP_FACTOR); return Math.abs(previousTextPosition.getMaxXDirAdj() - currentTextPosition.getXDirAdj()) > (avgTextPositionHeight * X_GAP_FACTOR);
} }
@ -83,7 +83,7 @@ public class LineDetectionService {
/**
 * Decides whether the current text position sequence starts a new line relative to the previous one.
 * <p>
 * The absolute difference of the two sequences' Y coordinates ({@code getMinYDirAdj()} before the
 * change, {@code getYDirAdj()} after it) is compared against the average text position height.
 *
 * @param currentTextPosition   the sequence being inspected
 * @param previousTextPosition  the sequence preceding it
 * @param avgTextPositionHeight the threshold for the vertical distance
 * @return {@code true} if the vertical distance is strictly greater than {@code avgTextPositionHeight}
 */
private static boolean isNewLine(TextPositionSequence currentTextPosition, TextPositionSequence previousTextPosition, double avgTextPositionHeight) { private static boolean isNewLine(TextPositionSequence currentTextPosition, TextPositionSequence previousTextPosition, double avgTextPositionHeight) {
return Math.abs(previousTextPosition.getMinYDirAdj() - currentTextPosition.getMinYDirAdj()) > avgTextPositionHeight; return Math.abs(previousTextPosition.getYDirAdj() - currentTextPosition.getYDirAdj()) > avgTextPositionHeight;
} }

View File

@ -78,7 +78,7 @@ public class TableExtractionService {
List<Cell> containedCells = new ArrayList<>(); List<Cell> containedCells = new ArrayList<>();
for (Cell c : cells) { for (Cell c : cells) {
if (c.hasMinimumSize() && area.contains(c.getBBoxInitialUserSpace())) { if (c.hasMinimumSize() && area.contains(c.getBBoxPdf())) {
containedCells.add(c); containedCells.add(c);
} }
} }

View File

@ -31,13 +31,13 @@ public class TextRulingsClassifier {
private static void handleVerticalText(CleanRulings cleanRulings, TextPositionSequence word) { private static void handleVerticalText(CleanRulings cleanRulings, TextPositionSequence word) {
float lowerY = (float) (word.getBBoxInitialUserSpace().getMinY() + TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth()); float lowerY = (float) (word.getBBoxPdf().getMinY() + TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth());
float upperY = (float) (word.getBBoxInitialUserSpace().getMaxY() - TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth()); float upperY = (float) (word.getBBoxPdf().getMaxY() - TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth());
float strikethroughCenterX = (float) word.getBBoxInitialUserSpace().getCenterX(); float strikethroughCenterX = (float) word.getBBoxPdf().getCenterX();
float strikethroughBoxHeight = (float) ((word.getBBoxDirAdj().getHeight() * STRIKETHROUGH_ZONE) / 2); float strikethroughBoxHeight = (float) ((word.getBBoxDirAdj().getHeight() * STRIKETHROUGH_ZONE) / 2);
float underlineCenterX = (float) (word.getDir().equals(TextDirection.QUARTER_CIRCLE) ? word.getBBoxInitialUserSpace().getMaxX() : word.getBBoxInitialUserSpace().getMinX()); float underlineCenterX = (float) (word.getDir().equals(TextDirection.QUARTER_CIRCLE) ? word.getBBoxPdf().getMaxX() : word.getBBoxPdf().getMinX());
float underlineBoxHeight = (float) ((word.getBBoxDirAdj().getHeight() * UNDERLINE_ZONE) / 2); float underlineBoxHeight = (float) ((word.getBBoxDirAdj().getHeight() * UNDERLINE_ZONE) / 2);
float leftX = Math.min(underlineCenterX - underlineBoxHeight, strikethroughCenterX - strikethroughBoxHeight); float leftX = Math.min(underlineCenterX - underlineBoxHeight, strikethroughCenterX - strikethroughBoxHeight);
@ -65,13 +65,13 @@ public class TextRulingsClassifier {
private static void handleHorizontalText(CleanRulings cleanRulings, TextPositionSequence word) { private static void handleHorizontalText(CleanRulings cleanRulings, TextPositionSequence word) {
float leftX = (float) (word.getBBoxInitialUserSpace().getMinX() + TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth()); float leftX = (float) (word.getBBoxPdf().getMinX() + TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth());
float rightX = (float) (word.getBBoxInitialUserSpace().getMaxX() - TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth()); float rightX = (float) (word.getBBoxPdf().getMaxX() - TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth());
float strikethroughCenterY = (float) word.getBBoxInitialUserSpace().getCenterY(); float strikethroughCenterY = (float) word.getBBoxPdf().getCenterY();
float strikethroughBoxHeight = (float) ((word.getBBoxDirAdj().getHeight() * STRIKETHROUGH_ZONE) / 2); float strikethroughBoxHeight = (float) ((word.getBBoxDirAdj().getHeight() * STRIKETHROUGH_ZONE) / 2);
float underlineCenterY = (float) (word.getDir().equals(TextDirection.ZERO) ? word.getBBoxInitialUserSpace().getMinY() : word.getBBoxInitialUserSpace().getMaxY()); float underlineCenterY = (float) (word.getDir().equals(TextDirection.ZERO) ? word.getBBoxPdf().getMinY() : word.getBBoxPdf().getMaxY());
float underlineBoxHeight = (float) ((word.getBBoxDirAdj().getHeight() * UNDERLINE_ZONE) / 2); float underlineBoxHeight = (float) ((word.getBBoxDirAdj().getHeight() * UNDERLINE_ZONE) / 2);
float lowerY = Math.min(underlineCenterY - underlineBoxHeight, strikethroughCenterY - strikethroughBoxHeight); float lowerY = Math.min(underlineCenterY - underlineBoxHeight, strikethroughCenterY - strikethroughBoxHeight);

View File

@ -2,12 +2,10 @@ package com.knecon.fforesight.service.layoutparser.processor.services.blockifica
import static com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocstrumBlockificationService.buildTextBlock; import static com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocstrumBlockificationService.buildTextBlock;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
import java.util.ListIterator; import java.util.ListIterator;
import java.util.Locale; import java.util.Locale;
import java.util.function.Function;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.springframework.stereotype.Service; import org.springframework.stereotype.Service;
@ -19,8 +17,6 @@ import com.knecon.fforesight.service.layoutparser.processor.model.SectionIdentif
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject; import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.services.factory.SearchTextWithTextPositionFactory;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import lombok.Data; import lombok.Data;
@ -29,14 +25,6 @@ public class BlockificationPostprocessingService {
private static final float BLOCK_TO_OUTLINE_DISTANCE_THRESHOLD = 5.0f; private static final float BLOCK_TO_OUTLINE_DISTANCE_THRESHOLD = 5.0f;
private static final Function<TextPageBlock, Rectangle2D> blockToBoundingBox = (abstractPageBlock) -> abstractPageBlock.getSequences()
.stream()
.map(textPositionSequence -> textPositionSequence.getTextPositions()
.stream()
.map(tp -> SearchTextWithTextPositionFactory.mapRedTextPositionToInitialUserSpace(tp, textPositionSequence))
.collect(RectangleTransformations.collectBBox()))
.collect(RectangleTransformations.collectBBox());
public OutlineObject sanitizeOutlineBlocks(ClassificationPage classificationPage, OutlineObject notFoundOutlineObject) { public OutlineObject sanitizeOutlineBlocks(ClassificationPage classificationPage, OutlineObject notFoundOutlineObject) {
@ -63,13 +51,13 @@ public class BlockificationPostprocessingService {
} }
if (!contextsOverlap(notFoundOutlineObjectProcessionContext, firstOutlineObjectProcessionContext)) { if (!contextsOverlap(notFoundOutlineObjectProcessionContext, firstOutlineObjectProcessionContext)) {
notFoundOutlineObject.setFound(selectMatch(classificationPage, notFoundOutlineObjectProcessionContext)); notFoundOutlineObject.setFound(selectMatch(classificationPage, notFoundOutlineObjectProcessionContext, pageHeight));
} }
if (firstOutlineObject != null) { if (firstOutlineObject != null) {
// re-create the context for the updated blocks // re-create the context for the updated blocks
firstOutlineObjectProcessionContext = new OutlineProcessionContext(firstOutlineObject); firstOutlineObjectProcessionContext = new OutlineProcessionContext(firstOutlineObject);
processTextBlocks(getTextPageBlocks(classificationPage), pageHeight, firstOutlineObjectProcessionContext); processTextBlocks(getTextPageBlocks(classificationPage), pageHeight, firstOutlineObjectProcessionContext);
firstOutlineObject.setFound(selectMatch(classificationPage, firstOutlineObjectProcessionContext)); firstOutlineObject.setFound(selectMatch(classificationPage, firstOutlineObjectProcessionContext, pageHeight));
} }
} }
@ -77,7 +65,7 @@ public class BlockificationPostprocessingService {
outlineObjectListIterator.forEachRemaining(outlineObject -> { outlineObjectListIterator.forEachRemaining(outlineObject -> {
OutlineProcessionContext outlineObjectProcessionContext = new OutlineProcessionContext(outlineObject); OutlineProcessionContext outlineObjectProcessionContext = new OutlineProcessionContext(outlineObject);
processTextBlocks(getTextPageBlocks(classificationPage), pageHeight, outlineObjectProcessionContext); processTextBlocks(getTextPageBlocks(classificationPage), pageHeight, outlineObjectProcessionContext);
outlineObject.setFound(selectMatch(classificationPage, outlineObjectProcessionContext)); outlineObject.setFound(selectMatch(classificationPage, outlineObjectProcessionContext, pageHeight));
}); });
if (!outlineObjects.isEmpty()) { if (!outlineObjects.isEmpty()) {
@ -160,7 +148,7 @@ public class BlockificationPostprocessingService {
} }
private boolean selectMatch(ClassificationPage classificationPage, OutlineProcessionContext context) { private boolean selectMatch(ClassificationPage classificationPage, OutlineProcessionContext context, float pageHeight) {
OutlineObject outlineObject = context.outlineObject; OutlineObject outlineObject = context.outlineObject;
TextPageBlock directMatch = context.directMatch; TextPageBlock directMatch = context.directMatch;
@ -168,8 +156,8 @@ public class BlockificationPostprocessingService {
TextPageBlock splitCandidate = context.splitCandidate; TextPageBlock splitCandidate = context.splitCandidate;
PageBlockType headlineType = PageBlockType.getHeadlineType(outlineObject.getTreeDepth()); PageBlockType headlineType = PageBlockType.getHeadlineType(outlineObject.getTreeDepth());
double distanceToDirectMatch = directMatch != null ? calculateDistance(outlineObject, directMatch) : Double.MAX_VALUE; double distanceToDirectMatch = directMatch != null ? calculateDistance(outlineObject, directMatch, pageHeight) : Double.MAX_VALUE;
double distanceToSplitCandidate = splitCandidate != null ? calculateDistance(outlineObject, splitCandidate) : Double.MAX_VALUE; double distanceToSplitCandidate = splitCandidate != null ? calculateDistance(outlineObject, splitCandidate, pageHeight) : Double.MAX_VALUE;
double distanceToBestMergeCandidates = Double.MAX_VALUE; double distanceToBestMergeCandidates = Double.MAX_VALUE;
List<TextPageBlock> bestMergeCandidateCombination = new ArrayList<>(); List<TextPageBlock> bestMergeCandidateCombination = new ArrayList<>();
@ -189,7 +177,7 @@ public class BlockificationPostprocessingService {
for (List<TextPageBlock> combination : combinations) { for (List<TextPageBlock> combination : combinations) {
double averageDistance = combination.stream() double averageDistance = combination.stream()
.map(block -> calculateDistance(outlineObject, block)) .map(block -> calculateDistance(outlineObject, block, pageHeight))
.mapToDouble(Double::doubleValue).average() .mapToDouble(Double::doubleValue).average()
.orElse(Double.MAX_VALUE); .orElse(Double.MAX_VALUE);
if (distanceToBestMergeCandidates > averageDistance) { if (distanceToBestMergeCandidates > averageDistance) {
@ -418,10 +406,10 @@ public class BlockificationPostprocessingService {
} }
/**
 * Computes the Euclidean distance between an outline object's point and the (minX, minY) corner of a
 * text page block.
 * <p>
 * The outline point's Y is subtracted from the page height before the block's minY is subtracted -
 * presumably to bring both values into the same vertical orientation; confirm against the coordinate
 * systems of OutlineObject and TextPageBlock. Before the change the page height is read from
 * {@code pageBlock.getPageHeight()}, after the change it is the {@code pageHeight} parameter.
 *
 * @param outlineObject the outline entry whose point is compared
 * @param pageBlock     the candidate text block
 * @return the square root of the summed squared X and Y deltas
 */
private double calculateDistance(OutlineObject outlineObject, TextPageBlock pageBlock) { private double calculateDistance(OutlineObject outlineObject, TextPageBlock pageBlock, float pageHeight) {
double deltaX = outlineObject.getPoint().getX() - pageBlock.getMinX(); double deltaX = outlineObject.getPoint().getX() - pageBlock.getMinX();
double deltaY = pageBlock.getPageHeight() - outlineObject.getPoint().getY() - pageBlock.getMinY(); double deltaY = pageHeight - outlineObject.getPoint().getY() - pageBlock.getMinY();
return Math.sqrt(deltaX * deltaX + deltaY * deltaY); return Math.sqrt(deltaX * deltaX + deltaY * deltaY);
} }

View File

@ -1,7 +1,6 @@
package com.knecon.fforesight.service.layoutparser.processor.services.blockification; package com.knecon.fforesight.service.layoutparser.processor.services.blockification;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Comparator;
import java.util.List; import java.util.List;
import java.util.ListIterator; import java.util.ListIterator;
@ -10,7 +9,6 @@ import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.DocstrumSegmentationService; import com.knecon.fforesight.service.layoutparser.processor.docstrum.DocstrumSegmentationService;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone; import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.DoubleUtils;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
@ -40,7 +38,7 @@ public class DocstrumBlockificationService {
CleanRulings usedRulings = rulings.withoutTextRulings(); CleanRulings usedRulings = rulings.withoutTextRulings();
var zones = docstrumSegmentationService.segmentPage(textPositions, xyOrder, usedRulings, visualizations); List<Zone> zones = docstrumSegmentationService.segmentPage(textPositions, xyOrder, usedRulings, visualizations);
if (!textPositions.isEmpty()) { if (!textPositions.isEmpty()) {
visualizations.addZoneVisualizations(zones, textPositions.get(0).getPage()); visualizations.addZoneVisualizations(zones, textPositions.get(0).getPage());
@ -48,11 +46,7 @@ public class DocstrumBlockificationService {
visualizations.addCharactersWithNeighbours(zones, textPositions.get(0).getPage()); visualizations.addCharactersWithNeighbours(zones, textPositions.get(0).getPage());
} }
var pageBlocks = toAbstractPageBlocks(zones, xyOrder, usedRulings); var pageBlocks = toAbstractPageBlocks(zones);
if (xyOrder) {
sortPageBlocksXThenY(pageBlocks);
}
var classificationPage = new ClassificationPage(pageBlocks); var classificationPage = new ClassificationPage(pageBlocks);
classificationPage.setCleanRulings(rulings); classificationPage.setCleanRulings(rulings);
@ -73,21 +67,7 @@ public class DocstrumBlockificationService {
} }
private static void sortPageBlocksXThenY(List<AbstractPageBlock> pageBlocks) { private List<AbstractPageBlock> toAbstractPageBlocks(List<Zone> zones) {
pageBlocks.sort(Comparator.comparing(AbstractPageBlock::getMinY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
.thenComparing(AbstractPageBlock::getMinX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
pageBlocks.sort(new Comparator<AbstractPageBlock>() {
@Override
public int compare(AbstractPageBlock o1, AbstractPageBlock o2) {
return Math.abs(o1.getMinY() - o2.getMinY()) < 5 && o1.getMinX() < o2.getMinX() == true ? -1 : 0;
}
});
}
private List<AbstractPageBlock> toAbstractPageBlocks(List<Zone> zones, boolean xyOrder, CleanRulings usedRulings) {
List<AbstractPageBlock> abstractPageBlocks = new ArrayList<>(); List<AbstractPageBlock> abstractPageBlocks = new ArrayList<>();
zones.forEach(zone -> { zones.forEach(zone -> {
@ -296,6 +276,10 @@ public class DocstrumBlockificationService {
continue; continue;
} }
// if (!current.getMostPopularWordStyle().equals(inner.getMostPopularWordStyle())) {
// continue;
// }
if (current.getDir() == inner.getDir() && current.intersects(inner, yThreshold, xThreshold)) { if (current.getDir() == inner.getDir() && current.intersects(inner, yThreshold, xThreshold)) {
boolean toDuplicate = current.isToDuplicate() || inner.isToDuplicate(); boolean toDuplicate = current.isToDuplicate() || inner.isToDuplicate();

View File

@ -1,9 +1,6 @@
package com.knecon.fforesight.service.layoutparser.processor.services.blockification; package com.knecon.fforesight.service.layoutparser.processor.services.blockification;
import static java.util.stream.Collectors.toSet;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Comparator;
import java.util.List; import java.util.List;
import java.util.regex.Matcher; import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
@ -13,10 +10,8 @@ import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.model.Orientation; import com.knecon.fforesight.service.layoutparser.processor.model.Orientation;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
@ -44,31 +39,30 @@ public class DocuMineBlockificationService {
CleanRulings usedRulings = cleanRulings.withoutTextRulings(); CleanRulings usedRulings = cleanRulings.withoutTextRulings();
float minX = 1000; double minX = 1000;
float maxX = 0; double maxX = 0;
float minY = 1000; double minY = 1000;
float maxY = 0; double maxY = 0;
TextPositionSequence prev = null; TextPositionSequence prev = null;
boolean wasSplitted = false; boolean wasSplitted = false;
Float splitX1 = null; Double splitX1 = null;
for (TextPositionSequence word : textPositions) { for (TextPositionSequence word : textPositions) {
boolean lineSeparation = prev != null && word.getMinYDirAdj() - maxY > Math.min(word.getHeight(), prev.getHeight()) * 1.25; boolean lineSeparation = prev != null && word.getYDirAdj() - prev.getMaxYDirAdj() > Math.min(word.getHeight(), prev.getHeight()) * 1.25;
boolean startFromTop = prev != null && word.getMinYDirAdj() < prev.getMinYDirAdj() - prev.getTextHeight(); boolean startFromTop = prev != null && word.getYDirAdj() < prev.getYDirAdj() - prev.getTextHeight();
boolean splitByX = prev != null && maxX + 50 < word.getMinXDirAdj() && prev.getMinYDirAdj() == word.getMinYDirAdj(); boolean splitByX = prev != null && maxX + 50 < word.getXDirAdj() && prev.getYDirAdj() == word.getYDirAdj();
boolean negativeXGap = prev != null && word.getMinXDirAdj() - minX < -5; boolean negativeXGap = prev != null && word.getXDirAdj() - minX < -5;
boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj(); boolean newLineAfterSplit = prev != null && word.getYDirAdj() != prev.getYDirAdj() && wasSplitted && splitX1 != word.getXDirAdj();
boolean isSplitByRuling = prev != null && usedRulings.lineBetween(prev, word); boolean isSplitByRuling = prev != null && usedRulings.lineBetween(prev, word);
boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir()); boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir());
boolean splitByOtherFontAndOtherY = prev != null && Math.abs(prev.getMaxYDirAdj() - word.getMaxYDirAdj()) > word.getTextHeight() * 0.2 && (word.getFontStyle() boolean splitByOtherFontAndOtherY = prev != null && Math.abs(prev.getMaxYDirAdj() - word.getMaxYDirAdj()) > word.getTextHeight() * 0.2 //
.contains("bold") && (word.getFontStyle().contains("bold") && !prev.getFontStyle().contains("bold") //
&& !prev.getFontStyle() || prev.getFontStyle().contains("bold") && !word.getFontStyle().contains("bold"));
.contains("bold") || prev.getFontStyle().contains("bold") && !word.getFontStyle().contains("bold"));
Matcher matcher = pattern.matcher(chunkWords.stream() Matcher matcher = pattern.matcher(chunkWords.stream()
.collect(Collectors.joining(" ")).toString()); .collect(Collectors.joining(" ")).toString());
boolean startsOnSameX = Math.abs(minX - word.getMinXDirAdj()) < 5 && matcher.matches(); boolean startsOnSameX = Math.abs(minX - word.getXDirAdj()) < 5 && matcher.matches();
if (prev != null && (lineSeparation || startFromTop || splitByDir || isSplitByRuling || splitByOtherFontAndOtherY || negativeXGap || startsOnSameX)) { if (prev != null && (lineSeparation || startFromTop || splitByDir || isSplitByRuling || splitByOtherFontAndOtherY || negativeXGap || startsOnSameX)) {
@ -84,7 +78,7 @@ public class DocuMineBlockificationService {
if (splitByX && !isSplitByRuling) { if (splitByX && !isSplitByRuling) {
wasSplitted = true; wasSplitted = true;
cb1.setOrientation(Orientation.LEFT); cb1.setOrientation(Orientation.LEFT);
splitX1 = word.getMinXDirAdj(); splitX1 = word.getXDirAdj();
} else if (newLineAfterSplit && !isSplitByRuling) { } else if (newLineAfterSplit && !isSplitByRuling) {
wasSplitted = false; wasSplitted = false;
cb1.setOrientation(Orientation.RIGHT); cb1.setOrientation(Orientation.RIGHT);
@ -107,14 +101,14 @@ public class DocuMineBlockificationService {
chunkWords.add(word); chunkWords.add(word);
prev = word; prev = word;
if (word.getMinXDirAdj() < minX) { if (word.getXDirAdj() < minX) {
minX = word.getMinXDirAdj(); minX = word.getXDirAdj();
} }
if (word.getMaxXDirAdj() > maxX) { if (word.getMaxXDirAdj() > maxX) {
maxX = word.getMaxXDirAdj(); maxX = word.getMaxXDirAdj();
} }
if (word.getMinYDirAdj() < minY) { if (word.getYDirAdj() < minY) {
minY = word.getMinYDirAdj(); minY = word.getYDirAdj();
} }
if (word.getMaxYDirAdj() > maxY) { if (word.getMaxYDirAdj() > maxY) {
maxY = word.getMaxYDirAdj(); maxY = word.getMaxYDirAdj();
@ -126,7 +120,5 @@ public class DocuMineBlockificationService {
return new ClassificationPage(textPageBlocks); return new ClassificationPage(textPageBlocks);
} }
} }

View File

@ -38,18 +38,18 @@ public class RedactManagerBlockificationService {
List<TextPositionSequence> chunkWords = new ArrayList<>(); List<TextPositionSequence> chunkWords = new ArrayList<>();
List<AbstractPageBlock> chunkBlockList = new ArrayList<>(); List<AbstractPageBlock> chunkBlockList = new ArrayList<>();
float minX = 1000, maxX = 0, minY = 1000, maxY = 0; double minX = 1000, maxX = 0, minY = 1000, maxY = 0;
TextPositionSequence prev = null; TextPositionSequence prev = null;
boolean wasSplitted = false; boolean wasSplitted = false;
Float splitX1 = null; Double splitX1 = null;
for (TextPositionSequence word : textPositions) { for (TextPositionSequence word : textPositions) {
boolean lineSeparation = word.getMinYDirAdj() - maxY > word.getHeight() * 1.25; boolean lineSeparation = word.getYDirAdj() - maxY > word.getHeight() * 1.25;
boolean startFromTop = prev != null && word.getMinYDirAdj() < prev.getMinYDirAdj() - prev.getTextHeight(); boolean startFromTop = prev != null && word.getYDirAdj() < prev.getYDirAdj() - prev.getTextHeight();
boolean splitByX = prev != null && maxX + 50 < word.getMinXDirAdj() && prev.getMinYDirAdj() == word.getMinYDirAdj(); boolean splitByX = prev != null && maxX + 50 < word.getXDirAdj() && prev.getYDirAdj() == word.getYDirAdj();
boolean xIsBeforeFirstX = prev != null && word.getMinXDirAdj() < minX; boolean xIsBeforeFirstX = prev != null && word.getXDirAdj() < minX;
boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj(); boolean newLineAfterSplit = prev != null && word.getYDirAdj() != prev.getYDirAdj() && wasSplitted && splitX1 != word.getXDirAdj();
boolean isSplitByRuling = prev != null && usedRulings.lineBetween(prev, word); boolean isSplitByRuling = prev != null && usedRulings.lineBetween(prev, word);
boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir()); boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir());
@ -69,7 +69,7 @@ public class RedactManagerBlockificationService {
if (splitByX && !isSplitByRuling) { if (splitByX && !isSplitByRuling) {
wasSplitted = true; wasSplitted = true;
cb1.setOrientation(Orientation.LEFT); cb1.setOrientation(Orientation.LEFT);
splitX1 = word.getMinXDirAdj(); splitX1 = word.getXDirAdj();
} else if (newLineAfterSplit && !isSplitByRuling) { } else if (newLineAfterSplit && !isSplitByRuling) {
wasSplitted = false; wasSplitted = false;
cb1.setOrientation(Orientation.RIGHT); cb1.setOrientation(Orientation.RIGHT);
@ -92,14 +92,14 @@ public class RedactManagerBlockificationService {
chunkWords.add(word); chunkWords.add(word);
prev = word; prev = word;
if (word.getMinXDirAdj() < minX) { if (word.getXDirAdj() < minX) {
minX = word.getMinXDirAdj(); minX = word.getXDirAdj();
} }
if (word.getMaxXDirAdj() > maxX) { if (word.getMaxXDirAdj() > maxX) {
maxX = word.getMaxXDirAdj(); maxX = word.getMaxXDirAdj();
} }
if (word.getMinYDirAdj() < minY) { if (word.getYDirAdj() < minY) {
minY = word.getMinYDirAdj(); minY = word.getYDirAdj();
} }
if (word.getMaxYDirAdj() > maxY) { if (word.getMaxYDirAdj() > maxY) {
maxY = word.getMaxYDirAdj(); maxY = word.getMaxYDirAdj();

View File

@ -23,7 +23,7 @@ public class ClarifyndClassificationService {
public void classifyDocument(ClassificationDocument document) { public void classifyDocument(ClassificationDocument document) {
List<Float> headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular(); List<Double> headlineFontSizes = document.getFontSizeCounter().getHigherThanMostPopular();
log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue()); log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());
@ -35,7 +35,7 @@ public class ClarifyndClassificationService {
} }
private void classifyPage(HeadlineClassificationService headlineClassificationService, ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes) { private void classifyPage(HeadlineClassificationService headlineClassificationService, ClassificationPage page, ClassificationDocument document, List<Double> headlineFontSizes) {
for (AbstractPageBlock textBlock : page.getTextBlocks()) { for (AbstractPageBlock textBlock : page.getTextBlocks()) {
if (textBlock instanceof TextPageBlock) { if (textBlock instanceof TextPageBlock) {
@ -45,7 +45,7 @@ public class ClarifyndClassificationService {
} }
private void classifyBlock(HeadlineClassificationService headlineClassificationService, TextPageBlock textBlock, ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes) { private void classifyBlock(HeadlineClassificationService headlineClassificationService, TextPageBlock textBlock, ClassificationPage page, ClassificationDocument document, List<Double> headlineFontSizes) {
var bodyTextFrame = page.getBodyTextFrame(); var bodyTextFrame = page.getBodyTextFrame();

View File

@ -31,7 +31,7 @@ public class DocuMineClassificationService {
public void classifyDocument(ClassificationDocument document) { public void classifyDocument(ClassificationDocument document) {
List<Float> headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular(); List<Double> headlineFontSizes = document.getFontSizeCounter().getHigherThanMostPopular();
log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue()); log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());
@ -46,7 +46,7 @@ public class DocuMineClassificationService {
private void classifyPage(HeadlineClassificationService headlineClassificationService, private void classifyPage(HeadlineClassificationService headlineClassificationService,
ClassificationPage page, ClassificationPage page,
ClassificationDocument document, ClassificationDocument document,
List<Float> headlineFontSizes) { List<Double> headlineFontSizes) {
for (AbstractPageBlock textBlock : page.getTextBlocks()) { for (AbstractPageBlock textBlock : page.getTextBlocks()) {
if (textBlock instanceof TextPageBlock) { if (textBlock instanceof TextPageBlock) {
@ -60,7 +60,7 @@ public class DocuMineClassificationService {
TextPageBlock textBlock, TextPageBlock textBlock,
ClassificationPage page, ClassificationPage page,
ClassificationDocument document, ClassificationDocument document,
List<Float> headlineFontSizes) { List<Double> headlineFontSizes) {
log.debug("headlineFontSizes: {}", headlineFontSizes); log.debug("headlineFontSizes: {}", headlineFontSizes);
var bodyTextFrame = page.getBodyTextFrame(); var bodyTextFrame = page.getBodyTextFrame();

View File

@ -25,7 +25,7 @@ public class RedactManagerClassificationService {
public void classifyDocument(ClassificationDocument document) { public void classifyDocument(ClassificationDocument document) {
List<Float> headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular(); List<Double> headlineFontSizes = document.getFontSizeCounter().getHigherThanMostPopular();
log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue()); log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());
@ -37,7 +37,7 @@ public class RedactManagerClassificationService {
} }
private void classifyPage(HeadlineClassificationService headlineClassificationService, ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes) { private void classifyPage(HeadlineClassificationService headlineClassificationService, ClassificationPage page, ClassificationDocument document, List<Double> headlineFontSizes) {
for (AbstractPageBlock textBlock : page.getTextBlocks()) { for (AbstractPageBlock textBlock : page.getTextBlocks()) {
if (textBlock instanceof TextPageBlock) { if (textBlock instanceof TextPageBlock) {
@ -47,7 +47,7 @@ public class RedactManagerClassificationService {
} }
private void classifyBlock(HeadlineClassificationService headlineClassificationService, TextPageBlock textBlock, ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes) { private void classifyBlock(HeadlineClassificationService headlineClassificationService, TextPageBlock textBlock, ClassificationPage page, ClassificationDocument document, List<Double> headlineFontSizes) {
var bodyTextFrame = page.getBodyTextFrame(); var bodyTextFrame = page.getBodyTextFrame();
@ -56,7 +56,7 @@ public class RedactManagerClassificationService {
return; return;
} }
if (document.getFontSizeCounter().getMostPopular() == null) { if (document.getFontSizeCounter().getMostPopular() == null) {
textBlock.setClassification(PageBlockType.OTHER); textBlock.setClassification(PageBlockType.PARAGRAPH);
return; return;
} }
@ -129,7 +129,7 @@ public class RedactManagerClassificationService {
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)) { } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)) {
textBlock.setClassification(PageBlockType.PARAGRAPH_UNKNOWN); textBlock.setClassification(PageBlockType.PARAGRAPH_UNKNOWN);
} else { } else {
textBlock.setClassification(PageBlockType.OTHER); textBlock.setClassification(PageBlockType.PARAGRAPH);
} }
} }

View File

@ -111,10 +111,10 @@ public class DocumentGraphFactory {
textBlocks.add(originalTextBlock); textBlocks.add(originalTextBlock);
textBlocks.addAll(textBlocksToMerge); textBlocks.addAll(textBlocksToMerge);
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(textBlocks), node, context, page); AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock2(TextPositionOperations.mergeAndSort(textBlocks), node, context, page);
if (node instanceof DuplicatedParagraph duplicatedParagraph) { if (node instanceof DuplicatedParagraph duplicatedParagraph) {
AtomicTextBlock unsortedTextBlock = context.textBlockFactory.buildAtomicTextBlock(textBlocks.stream() AtomicTextBlock unsortedTextBlock = context.textBlockFactory.buildAtomicTextBlock2(textBlocks.stream()
.flatMap(tb -> tb.getSequences() .flatMap(tb -> tb.getSequences()
.stream()) .stream())
.collect(Collectors.toList()), node, context, page); .collect(Collectors.toList()), node, context, page);
@ -191,7 +191,7 @@ public class DocumentGraphFactory {
Page page = context.getPage(textBlocks.get(0).getPage()); Page page = context.getPage(textBlocks.get(0).getPage());
Footer footer = Footer.builder().documentTree(context.getDocumentTree()).build(); Footer footer = Footer.builder().documentTree(context.getDocumentTree()).build();
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeTextPositionSequence(textBlocks), footer, context, page); AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock2(TextPositionOperations.merge(textBlocks), footer, context, page);
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(footer); List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(footer);
footer.setTreeId(tocId); footer.setTreeId(tocId);
footer.setLeafTextBlock(textBlock); footer.setLeafTextBlock(textBlock);
@ -203,7 +203,7 @@ public class DocumentGraphFactory {
Page page = context.getPage(textBlocks.get(0).getPage()); Page page = context.getPage(textBlocks.get(0).getPage());
Header header = Header.builder().documentTree(context.getDocumentTree()).build(); Header header = Header.builder().documentTree(context.getDocumentTree()).build();
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeTextPositionSequence(textBlocks), header, 0, page); AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.merge(textBlocks), header, 0, page);
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(header); List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(header);
header.setTreeId(tocId); header.setTreeId(tocId);
header.setLeafTextBlock(textBlock); header.setLeafTextBlock(textBlock);

View File

@ -1,6 +1,7 @@
package com.knecon.fforesight.service.layoutparser.processor.services.factory; package com.knecon.fforesight.service.layoutparser.processor.services.factory;
import java.awt.geom.AffineTransform; import static com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition.HEIGHT_PADDING;
import java.awt.geom.Rectangle2D; import java.awt.geom.Rectangle2D;
import java.util.Collection; import java.util.Collection;
import java.util.Collections; import java.util.Collections;
@ -11,7 +12,6 @@ import java.util.Objects;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange; import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange;
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition; import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import lombok.experimental.UtilityClass; import lombok.experimental.UtilityClass;
@ -19,14 +19,13 @@ import lombok.experimental.UtilityClass;
@UtilityClass @UtilityClass
public class SearchTextWithTextPositionFactory { public class SearchTextWithTextPositionFactory {
public final int HEIGHT_PADDING = 2;
// when checking for a hyphen linebreak, we need to check after a linebreak if the last hyphen was less than three symbols away. // when checking for a hyphen linebreak, we need to check after a linebreak if the last hyphen was less than three symbols away.
// We detect a linebreak as either a "\n" character or if two adjacent symbol's position differ in y-coordinates by at least one character height. // We detect a linebreak as either a "\n" character or if two adjacent symbol's position differ in y-coordinates by at least one character height.
// If there is a hyphen linebreak, the hyphen will be 1 position in front of a "\n" or 2 positions in front of the character which has a lower y-coordinate // If there is a hyphen linebreak, the hyphen will be 1 position in front of a "\n" or 2 positions in front of the character which has a lower y-coordinate
// This is why, we need to initialize this to < -2, otherwise, if the very first symbol is a \n we would detect a hyphen linebreak that isn't there. // This is why, we need to initialize this to < -2, otherwise, if the very first symbol is a \n we would detect a hyphen linebreak that isn't there.
// Also, Integer.MIN_VALUE is a bad idea due to potential overflow during arithmetic operations. This is why the default should be -3. // Also, Integer.MIN_VALUE is a bad idea due to potential overflow during arithmetic operations. This is why the default should be -3.
public final int MAX_HYPHEN_LINEBREAK_DISTANCE = 3; public final int MAX_HYPHEN_LINEBREAK_DISTANCE = 3;
public static final double LINEBREAK_DELTA_TOLERANCE = 1.05; public static final double LINEBREAK_DELTA_TOLERANCE = 1.5;
public SearchTextWithTextPositionDto buildSearchTextToTextPositionDto(List<TextPositionSequence> sequences) { public SearchTextWithTextPositionDto buildSearchTextToTextPositionDto(List<TextPositionSequence> sequences) {
@ -38,15 +37,13 @@ public class SearchTextWithTextPositionFactory {
Context context = new Context(); Context context = new Context();
RedTextPosition currentTextPosition = sequences.get(0).getTextPositions() RedTextPosition currentTextPosition = sequences.get(0).getTextPositions().get(0);
.get(0);
RedTextPosition previousTextPosition = RedTextPosition.builder().unicode(" ").bBoxDirAdj(currentTextPosition.getBBoxDirAdj()).build(); RedTextPosition previousTextPosition = RedTextPosition.builder().unicode(" ").bBoxDirAdj(currentTextPosition.getBBoxDirAdj()).build();
for (TextPositionSequence word : sequences) { for (TextPositionSequence word : sequences) {
for (int i = 0; i < word.getTextPositions().size(); ++i) { for (int i = 0; i < word.getTextPositions().size(); ++i) {
currentTextPosition = word.getTextPositions() currentTextPosition = word.getTextPositions().get(i);
.get(i);
if (isLineBreak(currentTextPosition, previousTextPosition)) { if (isLineBreak(currentTextPosition, previousTextPosition)) {
removeHyphenLinebreaks(context); removeHyphenLinebreaks(context);
context.lineBreaksStringIdx.add(context.stringIdx); context.lineBreaksStringIdx.add(context.stringIdx);
@ -68,11 +65,10 @@ public class SearchTextWithTextPositionFactory {
++context.stringIdx; ++context.stringIdx;
} }
List<Rectangle2D> positions = sequences.stream() List<Rectangle2D> positions = sequences.stream()
.map(TextPositionSequence::getTextPositions) .map(TextPositionSequence::getTextPositions)
.flatMap(Collection::stream) .flatMap(Collection::stream)
.map(RedTextPosition::getBBoxInitialUserSpace) .map(RedTextPosition::getBBoxPdf)
.toList(); .toList();
assert context.stringBuilder.length() == context.stringIdxToPositionIdx.size(); assert context.stringBuilder.length() == context.stringIdxToPositionIdx.size();
@ -161,8 +157,8 @@ public class SearchTextWithTextPositionFactory {
return false; return false;
} }
double deltaY = Math.abs(currentPosition.getYDirAdj() - previousPosition.getYDirAdj()) * LINEBREAK_DELTA_TOLERANCE; double deltaY = (Math.abs(currentPosition.getYDirAdj() - previousPosition.getYDirAdj()) * LINEBREAK_DELTA_TOLERANCE) + (2 * HEIGHT_PADDING);
return deltaY >= currentPosition.getHeightDir() || deltaY >= previousPosition.getHeightDir(); return deltaY >= currentPosition.getHeightDirAdj() || deltaY >= previousPosition.getHeightDirAdj();
} }
@ -188,32 +184,6 @@ public class SearchTextWithTextPositionFactory {
} }
/**
 * Maps a single text position from direction-adjusted coordinates to a rectangle in the page's initial user space.
 * <p>
 * The source rectangle starts at the position's dir-adjusted x, spans its dir-adjusted width, and covers the
 * sequence's text height extended by {@code HEIGHT_PADDING} on both vertical ends. The transform is built by
 * concatenating a rotation by the sequence direction's radians (around a pivot that depends on the direction),
 * a vertical translation, and a flip of the y-axis.
 *
 * @param textPosition the text position whose dir-adjusted x, y and width are used
 * @param sequence     the sequence providing text height, direction and page dimensions
 * @return the bounding box of the transformed rectangle
 */
public Rectangle2D mapRedTextPositionToInitialUserSpace(RedTextPosition textPosition, TextPositionSequence sequence) {

    // Text height plus one padding; the second padding is added to the rectangle height below.
    float paddedHeight = sequence.getTextHeight() + HEIGHT_PADDING;
    Rectangle2D dirAdjBox = new Rectangle2D.Double(textPosition.getXDirAdj(),
                                                   textPosition.getYDirAdj() - paddedHeight,
                                                   textPosition.getWidthDirAdj(),
                                                   paddedHeight + HEIGHT_PADDING);

    // Pick the rotation pivot and the vertical shift for the sequence's direction.
    TextDirection direction = sequence.getDir();
    double pivotX;
    double pivotY;
    double shiftY;
    if (direction == TextDirection.ZERO || direction == TextDirection.HALF_CIRCLE) {
        pivotX = sequence.getPageWidth() / 2f;
        pivotY = sequence.getPageHeight() / 2f;
        shiftY = sequence.getPageHeight();
    } else if (direction == TextDirection.QUARTER_CIRCLE) {
        pivotX = sequence.getPageWidth() / 2f;
        pivotY = sequence.getPageWidth() / 2f;
        shiftY = sequence.getPageWidth();
    } else {
        pivotX = sequence.getPageHeight() / 2f;
        pivotY = sequence.getPageHeight() / 2f;
        shiftY = sequence.getPageWidth();
    }

    AffineTransform toUserSpace = new AffineTransform();
    toUserSpace.rotate(direction.getRadians(), pivotX, pivotY);
    toUserSpace.translate(0f, shiftY);
    toUserSpace.scale(1., -1.);

    return toUserSpace.createTransformedShape(dirAdjBox).getBounds2D();
}
private class Context { private class Context {
List<Integer> stringIdxToPositionIdx = new LinkedList<>(); List<Integer> stringIdxToPositionIdx = new LinkedList<>();

View File

@ -234,7 +234,7 @@ public class SectionNodeFactory {
.filter(abstractTextContainer -> !abstractTextContainer.equals(atc)) .filter(abstractTextContainer -> !abstractTextContainer.equals(atc))
.filter(abstractTextContainer -> abstractTextContainer.getPage() == atc.getPage()) .filter(abstractTextContainer -> abstractTextContainer.getPage() == atc.getPage())
.filter(abstractTextContainer -> abstractTextContainer instanceof TextPageBlock) .filter(abstractTextContainer -> abstractTextContainer instanceof TextPageBlock)
.filter(abstractTextContainer -> abstractTextContainer.intersectsY(atc)) .filter(abstractTextContainer -> abstractTextContainer.intersectsYPdf(atc))
.map(abstractTextContainer -> (TextPageBlock) abstractTextContainer) .map(abstractTextContainer -> (TextPageBlock) abstractTextContainer)
.filter(abstractTextContainer -> abstractTextContainer.getDir() == atc.getDir()) .filter(abstractTextContainer -> abstractTextContainer.getDir() == atc.getDir())
.filter(abstractTextContainer -> !abstractTextContainer.isToDuplicate()) .filter(abstractTextContainer -> !abstractTextContainer.isToDuplicate())

View File

@ -136,7 +136,7 @@ public class TableNodeFactory {
.row(rowIndex) .row(rowIndex)
.col(colIndex) .col(colIndex)
.header(cell.isHeaderCell()) .header(cell.isHeaderCell())
.bBox(cell.getBBoxInitialUserSpace()) .bBox(cell.getBBoxPdf())
.build(); .build();
page.getMainBody().add(tableCell); page.getMainBody().add(tableCell);
@ -148,7 +148,7 @@ public class TableNodeFactory {
tableCell.setLeafTextBlock(context.getTextBlockFactory().emptyTextBlock(tableNode, context, page)); tableCell.setLeafTextBlock(context.getTextBlockFactory().emptyTextBlock(tableNode, context, page));
} else if (cell.getTextBlocks().size() == 1) { } else if (cell.getTextBlocks().size() == 1) {
textBlock = context.getTextBlockFactory() textBlock = context.getTextBlockFactory()
.buildAtomicTextBlock(cell.getTextBlocks() .buildAtomicTextBlock2(cell.getTextBlocks()
.get(0).getSequences(), tableCell, context, page); .get(0).getSequences(), tableCell, context, page);
tableCell.setLeafTextBlock(textBlock); tableCell.setLeafTextBlock(textBlock);
} else if (firstTextBlockIsHeadline(cell)) { } else if (firstTextBlockIsHeadline(cell)) {
@ -163,8 +163,8 @@ public class TableNodeFactory {
context, context,
document); document);
} else if (cellAreaIsSmallerThanPageAreaTimesThreshold(cell, page)) { } else if (cellAreaIsSmallerThanPageAreaTimesThreshold(cell, page)) {
List<TextPositionSequence> sequences = TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(cell.getTextBlocks()); List<TextPositionSequence> sequences = TextPositionOperations.mergeAndSort(cell.getTextBlocks());
textBlock = context.getTextBlockFactory().buildAtomicTextBlock(sequences, tableCell, context, page); textBlock = context.getTextBlockFactory().buildAtomicTextBlock2(sequences, tableCell, context, page);
tableCell.setLeafTextBlock(textBlock); tableCell.setLeafTextBlock(textBlock);
} else { } else {
cell.getTextBlocks() cell.getTextBlocks()

View File

@ -17,7 +17,7 @@ public class TextBlockFactory {
long textBlockIdx; long textBlockIdx;
public AtomicTextBlock buildAtomicTextBlock(List<TextPositionSequence> sequences, SemanticNode parent, DocumentGraphFactory.Context context, Page page) { public AtomicTextBlock buildAtomicTextBlock2(List<TextPositionSequence> sequences, SemanticNode parent, DocumentGraphFactory.Context context, Page page) {
Integer numberOnPage = context.getAndIncrementTextBlockNumberOnPage(page); Integer numberOnPage = context.getAndIncrementTextBlockNumberOnPage(page);
return buildAtomicTextBlock(sequences, parent, numberOnPage, page); return buildAtomicTextBlock(sequences, parent, numberOnPage, page);
@ -32,13 +32,13 @@ public class TextBlockFactory {
long idx = textBlockIdx; long idx = textBlockIdx;
textBlockIdx++; textBlockIdx++;
String orientation; String orientation;
int textDirection; int textRotation;
if (sequences.isEmpty()) { if (sequences.isEmpty()) {
orientation = null; orientation = null;
textDirection = 0; textRotation = 0;
} else { } else {
orientation = sequences.get(0).getDir().toString(); orientation = sequences.get(0).getDir().toString();
textDirection = sequences.get(0).getRotation(); textRotation = sequences.get(0).getDir().getRotation();
} }
return AtomicTextBlock.fromSearchTextWithTextPosition(searchTextWithTextPositionDto.getSearchText(), return AtomicTextBlock.fromSearchTextWithTextPosition(searchTextWithTextPositionDto.getSearchText(),
searchTextWithTextPositionDto.getLineBreaks(), searchTextWithTextPositionDto.getLineBreaks(),
@ -52,7 +52,7 @@ public class TextBlockFactory {
page, page,
offset, offset,
orientation, orientation,
textDirection); textRotation);
} }

View File

@ -8,12 +8,11 @@ import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.PDPage;
import org.springframework.stereotype.Service; import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.services.factory.SearchTextWithTextPositionFactory;
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation; import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import lombok.RequiredArgsConstructor; import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows; import lombok.SneakyThrows;
@ -67,10 +66,7 @@ public class GraphicExtractorService {
private List<Box> getCharacterBBoxes(List<TextPositionSequence> textPositionSequences) { private List<Box> getCharacterBBoxes(List<TextPositionSequence> textPositionSequences) {
return textPositionSequences.stream() return textPositionSequences.stream()
.map(pos -> pos.getTextPositions() .map(BoundingBox::getBBoxPdf)
.stream()
.map(tp -> SearchTextWithTextPositionFactory.mapRedTextPositionToInitialUserSpace(tp, pos))
.collect(RectangleTransformations.collectBBox()))
.map(Box::new) .map(Box::new)
.collect(Collectors.toList()); .collect(Collectors.toList());
} }

View File

@ -96,7 +96,7 @@ public class HeaderFooterDetection {
continue; continue;
} }
int distance = calculateHammingDistanceWithPreprocessing(testString, paddedString); int distance = StringDistances.hamming(testString, paddedString);
double normalizedScore = 1 - (double) distance / Math.max(testString.length(), paddedString.length()); double normalizedScore = 1 - (double) distance / Math.max(testString.length(), paddedString.length());
score += normalizedScore * (j < weights.length ? weights[j] : 1); score += normalizedScore * (j < weights.length ? weights[j] : 1);
} }
@ -180,44 +180,4 @@ public class HeaderFooterDetection {
return headerCandidates; return headerCandidates;
} }
/**
 * Computes the Hamming distance between two strings after preprocessing them.
 * <p>
 * Both strings are padded with NUL characters to the length of the longer one, and every digit is replaced
 * with the placeholder character '@', since digits are a common occurrence in headers/footers. The result is
 * the number of positions at which the two preprocessed strings hold different characters.
 *
 * @param firstCandidate  First string
 * @param secondCandidate Second string
 * @return The Hamming distance between the two preprocessed strings.
 */
private int calculateHammingDistanceWithPreprocessing(String firstCandidate, String secondCandidate) {

    int targetLength = Math.max(firstCandidate.length(), secondCandidate.length());

    // Replacing one digit by one '@' keeps the padded length, so both arrays have targetLength entries.
    char[] left = padString(firstCandidate, targetLength, '\0').replaceAll("\\d", "@").toCharArray();
    char[] right = padString(secondCandidate, targetLength, '\0').replaceAll("\\d", "@").toCharArray();

    int mismatches = 0;
    for (int pos = 0; pos < targetLength; pos++) {
        mismatches += left[pos] == right[pos] ? 0 : 1;
    }
    return mismatches;
}
/**
 * Right-pads a string with a fill character until it reaches the requested length.
 *
 * @param input   the string to pad
 * @param length  the minimum length of the result
 * @param padChar the character appended to reach {@code length}
 * @return {@code input} itself if it already has at least {@code length} characters, otherwise a new string
 *         of exactly {@code length} characters
 */
private String padString(String input, int length, char padChar) {

    if (input.length() >= length) {
        return input;
    }
    // The guard above guarantees a positive number of missing characters.
    int missing = length - input.length();
    return input + String.valueOf(padChar).repeat(missing);
}
} }

View File

@ -48,7 +48,7 @@ public class MarkedContentUtils {
return markedContentByYPosition.values() return markedContentByYPosition.values()
.stream() .stream()
.map(textPositions -> new TextPositionSequence(textPositions, 0, true).getBBoxInitialUserSpace()) .map(textPositions -> new TextPositionSequence(textPositions, 0, true).getBBoxPdf())
.map(t -> new Rectangle2D.Double(t.getX(), t.getY() - Math.abs(t.getHeight()), t.getWidth(), Math.abs(t.getHeight()))) .map(t -> new Rectangle2D.Double(t.getX(), t.getY() - Math.abs(t.getHeight()), t.getWidth(), Math.abs(t.getHeight())))
.collect(Collectors.toList()); .collect(Collectors.toList());
} }
@ -90,7 +90,7 @@ public class MarkedContentUtils {
.map(content -> (TextPosition) content) .map(content -> (TextPosition) content)
.filter(content -> !content.getUnicode().equals(" ")) .filter(content -> !content.getUnicode().equals(" "))
.map(textPositions -> new TextPositionSequence(List.of(textPositions), 0, true)) .map(textPositions -> new TextPositionSequence(List.of(textPositions), 0, true))
.map(BoundingBox::getBBoxInitialUserSpace) .map(BoundingBox::getBBoxPdf)
.collect(Collectors.toList()); .collect(Collectors.toList());
} }

View File

@ -108,7 +108,7 @@ public final class PositionUtils {
} }
public float getHeightDifferenceBetweenChunkWordAndDocumentWord(TextPageBlock textBlock, Float documentMostPopularWordHeight) { public double getHeightDifferenceBetweenChunkWordAndDocumentWord(TextPageBlock textBlock, Double documentMostPopularWordHeight) {
return textBlock.getMostPopularWordHeight() - documentMostPopularWordHeight; return textBlock.getMostPopularWordHeight() - documentMostPopularWordHeight;
} }
@ -116,7 +116,7 @@ public final class PositionUtils {
public double getApproxLineCount(TextPageBlock textBlock) { public double getApproxLineCount(TextPageBlock textBlock) {
return textBlock.getHeight() / textBlock.getMostPopularWordHeight(); return textBlock.getBBoxDirAdj().getHeight() / textBlock.getMostPopularWordHeight();
} }
} }

View File

@ -28,7 +28,7 @@ public class SpreadsheetFinder {
Map<Point2D, Point2D> edgesV = new HashMap<>(); Map<Point2D, Point2D> edgesV = new HashMap<>();
for (Cell cell : cells) { for (Cell cell : cells) {
for (Point2D pt : getPoints(cell.getBBoxInitialUserSpace())) { for (Point2D pt : getPoints(cell.getBBoxPdf())) {
if (pointSet.contains(pt)) { // shared vertex, remove it if (pointSet.contains(pt)) { // shared vertex, remove it
pointSet.remove(pt); pointSet.remove(pt);
} else { } else {

View File

@ -0,0 +1,49 @@
package com.knecon.fforesight.service.layoutparser.processor.utils;
import lombok.experimental.UtilityClass;
@UtilityClass
public class StringDistances {

    /**
     * Calculate the Hamming distance between two strings after preprocessing to make them the same length
     * and replacing all digits with a special character '@' since they are a common occurrence in headers/footers.
     * The shorter string is padded with NUL characters, so every position beyond its end that does not hold a
     * NUL character in the longer string counts as a mismatch.
     *
     * @param s1 First string
     * @param s2 Second string
     * @return The Hamming distance between the two preprocessed strings.
     */
    public int hamming(String s1, String s2) {

        int targetLength = Math.max(s1.length(), s2.length());
        String left = normalize(s1, targetLength);
        String right = normalize(s2, targetLength);

        int mismatches = 0;
        for (int pos = 0; pos < targetLength; pos++) {
            mismatches += left.charAt(pos) == right.charAt(pos) ? 0 : 1;
        }
        return mismatches;
    }


    /**
     * Pads the input with NUL characters to the given length and masks every digit with '@'.
     */
    private String normalize(String input, int length) {

        return padString(input, length, '\0').replaceAll("\\d", "@");
    }


    /**
     * Right-pads the input with {@code padChar} up to {@code length}; inputs that are already long enough are
     * returned unchanged.
     */
    private String padString(String input, int length, char padChar) {

        if (input.length() >= length) {
            return input;
        }
        return input + String.valueOf(padChar).repeat(length - input.length());
    }

}

View File

@ -1,30 +1,136 @@
package com.knecon.fforesight.service.layoutparser.processor.utils; package com.knecon.fforesight.service.layoutparser.processor.utils;
import java.awt.geom.Rectangle2D;
import java.awt.geom.RectangularShape;
import java.util.Collection;
import java.util.Comparator;
import java.util.HashSet;
import java.util.List; import java.util.List;
import java.util.Set;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.AngleFilter;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.TextBoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.UnionFind;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.DoubleUtils;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.FastAtan2;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import lombok.experimental.UtilityClass;
@UtilityClass
public class TextPositionOperations { public class TextPositionOperations {
private static final TextPositionSequenceComparator comparator = new TextPositionSequenceComparator(); public static final double ANGLE_TOLERANCE = Math.PI / 35;
// Accepts angles in [-ANGLE_TOLERANCE, ANGLE_TOLERANCE]; groupByLine only joins two sequences whose centre-to-centre angle matches this filter.
public static final AngleFilter ANGLE_FILTER = new AngleFilter(-ANGLE_TOLERANCE, ANGLE_TOLERANCE);
public static final double MAX_LINE_HEIGHT_FACTOR = 0.66; // multiplied with the average sequence height in groupByLine (fallback 10 if there are no sequences)
public static final double MAX_WORD_DISTANCE_FACTOR = 3.5; // multiplied with the average sequence width in groupByLine (fallback 75 if there are no sequences)
// Passed as third argument to DoubleUtils.compareDouble below — presumably the tolerance within which two coordinates compare as equal; TODO confirm.
private static final double THRESHOLD = 5;
// Orders by text direction first, then by dir-adjusted y, then by dir-adjusted x, the latter two via DoubleUtils.compareDouble with THRESHOLD.
// NOTE(review): if compareDouble treats values closer than THRESHOLD as equal, this ordering is not transitive
// (a ~ b and b ~ c does not imply a ~ c); verify that the sort in sortUsingLineDetection tolerates that.
private static final Comparator<TextBoundingBox> COMPARATOR_DIR_ADJ = //
Comparator.comparing(TextBoundingBox::getDir)
.thenComparing(TextBoundingBox::getYDirAdj, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
.thenComparing(TextBoundingBox::getXDirAdj, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD));
public static List<TextPositionSequence> mergeAndSortTextPositionSequenceByYThenX(List<TextPageBlock> textBlocks) { public List<TextPositionSequence> mergeAndSort(List<TextPageBlock> textBlocks) {
var sequence = textBlocks.stream().flatMap(tb -> tb.getSequences().stream()).collect(Collectors.toList()); var sequences = textBlocks.stream()
.flatMap(tb -> tb.getSequences()
// because the TextPositionSequenceComparator is not transitive, but .stream())
// JDK7+ enforces transitivity on comparators, we need to use .collect(Collectors.toSet());
// a custom quicksort implementation (which is slower, unfortunately). return sortUsingLineDetection(sequences);
QuickSort.sort(sequence, comparator);
return sequence;
} }
public static List<TextPositionSequence> mergeTextPositionSequence(List<TextPageBlock> textBlocks) {
return textBlocks.stream().flatMap(tb -> tb.getSequences().stream()).collect(Collectors.toList()); public List<TextPositionSequence> sort(List<TextPositionSequence> sequences) {
return sortUsingLineDetection(new HashSet<>(sequences));
}
/**
 * Sorts the sequences by first grouping them into lines and then ordering the lines.
 * <p>
 * Each group returned by {@code groupByLine} is sorted by its dir-adjusted x-coordinate, empty lines are
 * dropped, the remaining lines are ordered by comparing their first sequence with {@code COMPARATOR_DIR_ADJ},
 * and finally all lines are concatenated.
 * <p>
 * NOTE(review): the line order relies on {@code COMPARATOR_DIR_ADJ}, which compares with a tolerance — confirm
 * it satisfies the {@link Comparator} contract for all inputs.
 *
 * @param sequences the sequences to sort
 * @return an unmodifiable list containing the sequences line by line
 */
private List<TextPositionSequence> sortUsingLineDetection(Set<TextPositionSequence> sequences) {

    // A line is represented by its first sequence after sorting by x.
    Comparator<List<TextPositionSequence>> byFirstSequence = (lineA, lineB) -> COMPARATOR_DIR_ADJ.compare(lineA.get(0), lineB.get(0));

    return groupByLine(sequences).stream()
            .map(TextPositionOperations::sortByXDirAdj)
            .filter(line -> !line.isEmpty())
            .sorted(byFirstSequence)
            .flatMap(Collection::stream)
            .toList();
}
/**
 * Orders the sequences of one line by ascending dir-adjusted x-coordinate.
 *
 * @param line the sequences belonging to a single line
 * @return an unmodifiable list of the sequences sorted by {@code getXDirAdj()}
 */
private List<TextPositionSequence> sortByXDirAdj(Set<TextPositionSequence> line) {

    Comparator<TextPositionSequence> byXDirAdj = Comparator.comparing(TextPositionSequence::getXDirAdj);
    return line.stream()
            .sorted(byXDirAdj)
            .toList();
}
/**
 * Partitions the sequences into groups that are treated as lines.
 * <p>
 * Every ordered pair of sequences is examined and the two sequences are unioned when all of the following hold:
 * <ul>
 * <li>they have the same direction,</li>
 * <li>their font sizes differ by at most half of the smaller font size,</li>
 * <li>the distance between their box centres, normalized by {@code maxLineDistance} vertically and by
 * {@code maxXGap} horizontally, lies within the unit circle,</li>
 * <li>the angle between their box centres, as returned by {@code computeAngle}, matches {@code ANGLE_FILTER}.</li>
 * </ul>
 * {@code maxLineDistance} is the average box height (10 if there are no sequences) times
 * {@code MAX_LINE_HEIGHT_FACTOR}; {@code maxXGap} is the average box width (75 if there are no sequences) times
 * {@code MAX_WORD_DISTANCE_FACTOR}.
 *
 * @param sequences the sequences to group
 * @return the groups held by the union-find structure after all pairs were examined
 */
private Collection<Set<TextPositionSequence>> groupByLine(Set<TextPositionSequence> sequences) {

    double averageHeight = sequences.stream()
            .map(TextPositionSequence::getBBoxDirAdj)
            .mapToDouble(RectangularShape::getHeight)
            .average()
            .orElse(10);
    double averageWidth = sequences.stream()
            .map(TextPositionSequence::getBBoxDirAdj)
            .mapToDouble(RectangularShape::getWidth)
            .average()
            .orElse(75);
    double maxLineDistance = averageHeight * MAX_LINE_HEIGHT_FACTOR;
    double maxXGap = averageWidth * MAX_WORD_DISTANCE_FACTOR;

    UnionFind<TextPositionSequence> lines = new UnionFind<>(sequences);

    for (TextPositionSequence first : sequences) {
        Rectangle2D firstBox = first.getBBoxDirAdj();

        for (TextPositionSequence second : sequences) {
            // Nothing to do for the sequence itself or for pairs that were already joined.
            if (first.equals(second) || lines.inSameSet(first, second)) {
                continue;
            }
            if (first.getDir() != second.getDir()) {
                continue;
            }
            double fontSizeGap = Math.abs(first.getFontSize() - second.getFontSize());
            if (fontSizeGap > 0.5 * Math.min(first.getFontSize(), second.getFontSize())) {
                continue;
            }

            Rectangle2D secondBox = second.getBBoxDirAdj();
            double normalizedVerticalDistance = Math.abs(firstBox.getCenterY() - secondBox.getCenterY()) / maxLineDistance;
            double normalizedHorizontalDistance = Math.abs(firstBox.getCenterX() - secondBox.getCenterX()) / maxXGap;
            // Elliptical neighbourhood: the centres must lie within the unit circle after normalization.
            if (Math.pow(normalizedVerticalDistance, 2) + Math.pow(normalizedHorizontalDistance, 2) > 1) {
                continue;
            }
            // The angle is directed from first to second, which is why both orders of a pair are examined.
            if (!ANGLE_FILTER.matches(computeAngle(firstBox, secondBox))) {
                continue;
            }

            lines.union(first, second);
        }
    }

    return lines.getGroups();
}
/**
 * Computes the angle of the vector pointing from the centre of the first rectangle to the centre of the second.
 *
 * @param rect1 the rectangle whose centre is the start of the vector
 * @param rect2 the rectangle whose centre is the end of the vector
 * @return the result of {@code FastAtan2.fastAtan2(deltaY, deltaX)} for that vector — presumably in radians,
 *         as it is compared against a tolerance derived from {@code Math.PI}; TODO confirm
 */
public double computeAngle(Rectangle2D rect1, Rectangle2D rect2) {

    double deltaX = rect2.getCenterX() - rect1.getCenterX();
    double deltaY = rect2.getCenterY() - rect1.getCenterY();
    return FastAtan2.fastAtan2(deltaY, deltaX);
}
public List<TextPositionSequence> merge(List<TextPageBlock> textBlocks) {
return textBlocks.stream()
.map(TextPageBlock::getSequences)
.flatMap(Collection::stream)
.collect(Collectors.toList());
} }
} }

View File

@ -1,99 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.knecon.fforesight.service.layoutparser.processor.utils;
import java.util.Comparator;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
/**
 * Comparator for {@link TextPositionSequence}s. Sequences with different text directions are ordered by the
 * degrees of their direction; sequences with the same direction are ordered by their bounding boxes, using a
 * rule that depends on the rotation of the first argument. This allows continuous text in a given direction to
 * be more easily grouped together.
 *
 * @author Ben Litchfield
 */
public class TextPositionSequenceComparator implements Comparator<TextPositionSequence> {

    /**
     * Compares two sequences.
     *
     * @param pos1 first sequence; its rotation selects the ordering rule
     * @param pos2 second sequence
     * @return a negative value, zero or a positive value if {@code pos1} sorts before, equal to or after {@code pos2}
     * @throws RuntimeException if the rotation of {@code pos1} is not 0, 90, 180 or 270
     */
    @Override
    public int compare(TextPositionSequence pos1, TextPositionSequence pos2) {

        // Only text with the same direction is compared by position.
        int directionOrder = Float.compare(pos1.getDir().getDegrees(), pos2.getDir().getDegrees());
        if (directionOrder != 0) {
            return directionOrder;
        }

        // Coordinates are read from getBBox(); the original code notes that 0,0 is the upper left corner.
        double x1 = pos1.getBBox().getX();
        double x2 = pos2.getBBox().getX();
        double bottom1 = pos1.getBBox().getMaxY();
        double bottom2 = pos2.getBBox().getMaxY();
        double top1 = bottom1 - pos1.getBBox().getHeight();
        double top2 = bottom2 - pos2.getBBox().getHeight();

        // Same line: (nearly) equal bottoms, or the bottom of one box lies within the vertical extent of the other.
        boolean onSameLine = Math.abs(bottom1 - bottom2) < .1
                || (bottom2 >= top1 && bottom2 <= bottom1)
                || (bottom1 >= top2 && bottom1 <= bottom2);

        return switch (pos1.getRotation()) {
            // 0 degrees: same line -> ascending x, otherwise smaller bottom first.
            case 0 -> onSameLine ? Double.compare(x1, x2) : (bottom1 < bottom2 ? -1 : 1);
            // 90 degrees: larger x first, ties broken by ascending bottom.
            case 90 -> x1 > x2 ? -1 : (x1 < x2 ? 1 : Double.compare(bottom1, bottom2));
            // 180 degrees: same line -> descending x, otherwise larger bottom first.
            case 180 -> onSameLine ? Double.compare(x2, x1) : (bottom1 > bottom2 ? -1 : 1);
            // 270 degrees: smaller x first, ties broken by descending bottom.
            case 270 -> x2 > x1 ? -1 : (x2 < x1 ? 1 : Double.compare(bottom2, bottom1));
            default -> throw new RuntimeException("Rotation not supported. Only 0/90/180/270 degree rotation is supported.");
        };
    }

}

View File

@ -51,7 +51,7 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(pageNumber, this.words); VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(pageNumber, this.words);
visualizationsOnPage.getColoredRectangles() visualizationsOnPage.getColoredRectangles()
.addAll(textPositionSequences.stream() .addAll(textPositionSequences.stream()
.map(BoundingBox::getBBoxInitialUserSpace) .map(BoundingBox::getBBoxPdf)
.map(rect -> new ColoredRectangle(rect, WORDS_COLOR, 1)) .map(rect -> new ColoredRectangle(rect, WORDS_COLOR, 1))
.toList()); .toList());
} }
@ -105,7 +105,7 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(pageNumber, this.cells); VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(pageNumber, this.cells);
visualizationsOnPage.getColoredRectangles() visualizationsOnPage.getColoredRectangles()
.addAll(cells.stream() .addAll(cells.stream()
.map(cell -> new ColoredRectangle(cell.getBBoxInitialUserSpace(), CELLS_COLOR, 1)) .map(cell -> new ColoredRectangle(cell.getBBoxPdf(), CELLS_COLOR, 1))
.toList()); .toList());
} }
@ -119,7 +119,7 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(page, this.zones); VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(page, this.zones);
visualizationsOnPage.getColoredRectangles() visualizationsOnPage.getColoredRectangles()
.addAll(zones.stream() .addAll(zones.stream()
.map(BoundingBox::getBBoxInitialUserSpace) .map(BoundingBox::getBBoxPdf)
.map(zone -> new ColoredRectangle(zone, ZONES_COLOR, 1)) .map(zone -> new ColoredRectangle(zone, ZONES_COLOR, 1))
.toList()); .toList());
@ -144,7 +144,7 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(page, this.lines); VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(page, this.lines);
visualizationsOnPage.getColoredRectangles() visualizationsOnPage.getColoredRectangles()
.addAll(lines.stream() .addAll(lines.stream()
.map(BoundingBox::getBBoxInitialUserSpace) .map(BoundingBox::getBBoxPdf)
.map(line -> new ColoredRectangle(line, LINES_COLOR, 0.5f)) .map(line -> new ColoredRectangle(line, LINES_COLOR, 0.5f))
.toList()); .toList());
} }
@ -158,7 +158,7 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(page, zones); VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(page, zones);
visualizationsOnPage.getColoredRectangles() visualizationsOnPage.getColoredRectangles()
.addAll(textPageBlocks.stream() .addAll(textPageBlocks.stream()
.map(rect -> new ColoredRectangle(rect.getBBoxInitialUserSpace(), ZONES_COLOR, 1)) .map(rect -> new ColoredRectangle(rect.getBBoxPdf(), ZONES_COLOR, 1))
.toList()); .toList());
} }
@ -222,11 +222,11 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
.flatMap(Collection::stream) .flatMap(Collection::stream)
.forEach(character -> { .forEach(character -> {
Color color = ROTATING_CHARACTER_COLOR.get(index.getAndIncrement() % ROTATING_CHARACTER_COLOR.size()); Color color = ROTATING_CHARACTER_COLOR.get(index.getAndIncrement() % ROTATING_CHARACTER_COLOR.size());
Rectangle2D charBBox = character.getTextPosition().getBBoxInitialUserSpace(); Rectangle2D charBBox = character.getTextPosition().getBBoxPdf();
characterVisualizations.getColoredRectangles().add(new ColoredRectangle(charBBox, color, 1)); characterVisualizations.getColoredRectangles().add(new ColoredRectangle(charBBox, color, 1));
character.getNeighbors() character.getNeighbors()
.forEach(neighbor -> { .forEach(neighbor -> {
Rectangle2D neighborBBox = neighbor.getCharacter().getTextPosition().getBBoxInitialUserSpace(); Rectangle2D neighborBBox = neighbor.getCharacter().getTextPosition().getBBoxPdf();
Line2D line = new Line2D.Double(new Point2D.Double(charBBox.getCenterX(), charBBox.getCenterY()), Line2D line = new Line2D.Double(new Point2D.Double(charBBox.getCenterX(), charBBox.getCenterY()),
new Point2D.Double(neighborBBox.getCenterX(), neighborBBox.getCenterY())); new Point2D.Double(neighborBBox.getCenterX(), neighborBBox.getCenterY()));
neighbourVisualizations.getColoredLines().add(new ColoredLine(line, color, 1)); neighbourVisualizations.getColoredLines().add(new ColoredLine(line, color, 1));

View File

@ -38,7 +38,7 @@ dependencies {
implementation("com.amazonaws:aws-java-sdk-s3:1.12.536") implementation("com.amazonaws:aws-java-sdk-s3:1.12.536")
implementation("org.springframework.cloud:spring-cloud-starter-openfeign:4.0.4") implementation("org.springframework.cloud:spring-cloud-starter-openfeign:4.0.4")
implementation("net.logstash.logback:logstash-logback-encoder:7.4") implementation("net.logstash.logback:logstash-logback-encoder:7.4")
implementation("com.pdftron:PDFNet:10.7.0") implementation("com.pdftron:PDFNet:10.11.0")
// for integration testing only // for integration testing only
testImplementation(project(":viewer-doc-processor")) testImplementation(project(":viewer-doc-processor"))
@ -52,6 +52,8 @@ dependencies {
testImplementation("com.fasterxml.jackson.datatype:jackson-datatype-jsr310:${jacksonVersion}") testImplementation("com.fasterxml.jackson.datatype:jackson-datatype-jsr310:${jacksonVersion}")
testImplementation("org.apache.pdfbox:pdfbox:${pdfBoxVersion}") testImplementation("org.apache.pdfbox:pdfbox:${pdfBoxVersion}")
testImplementation("org.apache.pdfbox:pdfbox-tools:${pdfBoxVersion}") testImplementation("org.apache.pdfbox:pdfbox-tools:${pdfBoxVersion}")
testImplementation("org.apache.commons:commons-text:1.12.0")
} }
/* /*
@ -89,6 +91,9 @@ tasks.named<BootBuildImage>("bootBuildImage") {
environment.put("BPE_DELIM_JAVA_TOOL_OPTIONS", " ") environment.put("BPE_DELIM_JAVA_TOOL_OPTIONS", " ")
environment.put("BPE_APPEND_JAVA_TOOL_OPTIONS", "-Dfile.encoding=UTF-8") environment.put("BPE_APPEND_JAVA_TOOL_OPTIONS", "-Dfile.encoding=UTF-8")
builder.set("docker-proxy.knecon.com/paketobuildpacks/builder:base")
runImage.set("docker-proxy.knecon.com/paketobuildpacks/run:base-cnb")
imageName.set("nexus.knecon.com:5001/ff/${project.name}:${project.version}") imageName.set("nexus.knecon.com:5001/ff/${project.name}:${project.version}")
if (project.hasProperty("buildbootDockerHostNetwork")) { if (project.hasProperty("buildbootDockerHostNetwork")) {
network.set("host") network.set("host")
@ -99,6 +104,13 @@ tasks.named<BootBuildImage>("bootBuildImage") {
} }
verboseLogging.set(true) verboseLogging.set(true)
builderRegistry {
username.set(providers.gradleProperty("mavenUser").getOrNull())
password.set(providers.gradleProperty("mavenPassword").getOrNull())
email.set(providers.gradleProperty("mavenEmail").getOrNull())
url.set("https://docker-proxy.knecon.com:5001/")
}
publishRegistry { publishRegistry {
username.set(providers.gradleProperty("mavenUser").getOrNull()) username.set(providers.gradleProperty("mavenUser").getOrNull())
password.set(providers.gradleProperty("mavenPassword").getOrNull()) password.set(providers.gradleProperty("mavenPassword").getOrNull())
@ -106,4 +118,5 @@ tasks.named<BootBuildImage>("bootBuildImage") {
url.set("https://nexus.knecon.com:5001/") url.set("https://nexus.knecon.com:5001/")
} }
} }
} }

View File

@ -1,20 +1,18 @@
package com.knecon.fforesight.service.layoutparser.server; package com.knecon.fforesight.service.layoutparser.server;
import org.springframework.beans.factory.annotation.Value; import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Component; import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import com.google.common.base.Strings; import com.google.common.base.Strings;
import com.knecon.fforesight.service.layoutparser.processor.LayoutparserSettings;
import com.pdftron.pdf.PDFNet; import com.pdftron.pdf.PDFNet;
import jakarta.annotation.PostConstruct;
import jakarta.annotation.PreDestroy;
import lombok.RequiredArgsConstructor; import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows; import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
@Slf4j @Slf4j
@Component @Configuration
@RequiredArgsConstructor @RequiredArgsConstructor
public class PDFNetInitializer { public class PDFNetInitializer {
@ -22,26 +20,17 @@ public class PDFNetInitializer {
private String pdftronLicense; private String pdftronLicense;
@Bean
@SneakyThrows @SneakyThrows
@PostConstruct
// Do not change back to application runner, if it is application runner it takes messages from the queue before PDFNet is initialized, that leads to UnsatisfiedLinkError.
public void init() { public void init() {
if (Strings.isNullOrEmpty(pdftronLicense)) { if (Strings.isNullOrEmpty(pdftronLicense)) {
return; throw new IllegalArgumentException("PDFTRON_LICENSE not set!");
} }
log.info("Initializing Native Libraries"); log.info("Initializing Native Libraries");
log.info("Setting pdftron license: {}", pdftronLicense); log.info("Setting pdftron license: {}", pdftronLicense);
PDFNet.setTempPath("/tmp/pdftron"); PDFNet.setTempPath("/tmp/pdftron");
PDFNet.initialize(pdftronLicense); PDFNet.initialize(pdftronLicense);
}
@PreDestroy
public void terminate() {
PDFNet.terminate();
} }
} }

View File

@ -27,23 +27,28 @@ import lombok.extern.slf4j.Slf4j;
@Slf4j @Slf4j
public class LayoutparserEnd2EndTest extends AbstractTest { public class LayoutparserEnd2EndTest extends AbstractTest {
public static final LayoutParsingType LAYOUT_PARSING_TYPE = LayoutParsingType.DOCUMINE;
@Autowired @Autowired
private LayoutParsingPipeline layoutParsingPipeline; private LayoutParsingPipeline layoutParsingPipeline;
@Test @Test
@Disabled
public void testLayoutParserEndToEnd() { public void testLayoutParserEndToEnd() {
String filePath = "files/test-1.pdf"; String filePath = "/home/kschuettler/Downloads/55974b3de7ed2915718a10458206bbd8.ORIGIN.pdf";
runForFile(filePath); runForFile(filePath);
} }
@Test @Test
@Disabled @Disabled
@SneakyThrows @SneakyThrows
public void testLayoutParserEndToEndWithFolder() { public void testLayoutParserEndToEndWithFolder() {
String folder = "/Users/maverickstuder/Documents/Fforesight/layoutparser/layoutparser-service/layoutparser-service-server/src/test"; String folder = "/home/kschuettler/Dokumente/TestFiles/ReadingOrder";
List<Path> pdfFiles = Files.walk(Path.of(folder)) List<Path> pdfFiles = Files.walk(Path.of(folder))
.filter(path -> path.getFileName().toString().endsWith(".pdf")) .filter(path -> path.getFileName().toString().endsWith(".pdf"))
.sorted(Comparator.comparing(Path::getFileName)) .sorted(Comparator.comparing(Path::getFileName))
@ -69,7 +74,8 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
file = new File(filePath); file = new File(filePath);
} }
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(fileName, LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH, true); LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(fileName, LAYOUT_PARSING_TYPE, true);
prepareStorage(layoutParsingRequest, file); prepareStorage(layoutParsingRequest, file);
LayoutParsingFinishedEvent finishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest); LayoutParsingFinishedEvent finishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest);

View File

@ -57,9 +57,11 @@ public class OutlineDetectionTest extends AbstractTest {
pdfNetInitializer.init(); pdfNetInitializer.init();
} }
@Test @Test
@SneakyThrows @SneakyThrows
public void testOutlineError() { public void testOutlineError() {
String fileName = "files/syngenta/CustomerFiles/Clarifynd/VV-470942.pdf"; String fileName = "files/syngenta/CustomerFiles/Clarifynd/VV-470942.pdf";
ClassificationDocument classificationDocument = parseLayout(fileName, LayoutParsingType.CLARIFYND); ClassificationDocument classificationDocument = parseLayout(fileName, LayoutParsingType.CLARIFYND);

View File

@ -0,0 +1,452 @@
package com.knecon.fforesight.service.layoutparser.server.graph;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.awt.Color;
import java.awt.geom.Rectangle2D;
import java.io.FileOutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.text.similarity.LevenshteinDistance;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired;
import com.iqser.red.storage.commons.service.StorageService;
import com.iqser.red.storage.commons.utils.FileSystemBackedStorageService;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionOperations;
import com.knecon.fforesight.service.layoutparser.server.PDFNetInitializer;
import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest;
import com.knecon.fforesight.service.viewerdoc.model.Standard14EmbeddableFont;
import com.pdftron.common.Matrix2D;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.ColorPt;
import com.pdftron.pdf.ColorSpace;
import com.pdftron.pdf.Element;
import com.pdftron.pdf.ElementBuilder;
import com.pdftron.pdf.ElementWriter;
import com.pdftron.pdf.Font;
import com.pdftron.pdf.GState;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.Page;
import com.pdftron.sdf.SDFDoc;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
@Slf4j
public class DocumentReadingOrderTest extends BuildDocumentTest {
// Debug switch: when true, assertSimilarReadingOrder additionally calls drawDirAdjCoords for every parsed document.
private static final boolean DRAW_DIR_ADJ_COORDS = false;
// Layout parsing types that assertSimilarReadingOrder iterates over for every test document.
public static final List<LayoutParsingType> LAYOUT_PARSING_TYPES = List.of(LayoutParsingType.DOCUMINE,
LayoutParsingType.DOCUMINE_OLD,
LayoutParsingType.REDACT_MANAGER,
LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH);
// Injected initializer; its init() method is called before each test.
@Autowired
PDFNetInitializer pdfNetInitializer;
// Injected storage; cast to FileSystemBackedStorageService and cleared after each test.
@Autowired
StorageService storageService;
/**
 * Runs before each test and calls {@code init()} on the injected {@code PDFNetInitializer}.
 */
@BeforeEach
public void before() {
pdfNetInitializer.init();
}
/**
 * Runs after each test and clears the storage. The injected {@code StorageService} is cast to
 * {@code FileSystemBackedStorageService}, so the cast fails if a different implementation is injected.
 */
@AfterEach
public void cleanUp() {

    FileSystemBackedStorageService fileSystemStorage = (FileSystemBackedStorageService) storageService;
    fileSystemStorage.clearStorage();
}
/**
 * Checks the reading order extracted from the single-page fixture {@code Seite14.pdf} against the expected
 * line sequence below, using the tolerances enforced by {@code assertSimilarReadingOrder}.
 */
@Test
public void readingOrderTestSeite14() {
String pdfFile = "files/syngenta/CustomerFiles/SinglePages/Seite14.pdf";
// Expected page text, one extracted line per text-block line.
String expectedText = """
27
26 APPENDICES SECTION
APPENDIX 1 Analytical Report
syngenta
A16148F
Batch ID 533158 (GP-080305)
Batch Identification 533158
Product Design Code A16148F
Product Denomination SYN524464 FS (500)
Product by Common Name SYN524464 FS (500)
Other Product Code(s) GP-080305
Source Technology & Projects, Syngenta Crop Protection, Inc.
Chemical Analysis
(Active Ingredient Content)
Identity of the Active Ingredient* Confirmed
Content of SYN524464* 45.6% (wt/wt) or 534 g/L
Methodology Used for Characterization HPLC
The Active Ingredient content is within the FAO limits.
Physical Analysis
Appearance* pink opaque liquid
Density* 1171 g/L
Stability:
Storage Temperature <30°
Expiration date March 2009
The stability of this test substance will be determined concurrently through reanalysis of material held
in inventory under GLP conditions at Syngenta Crop Protection, Inc., Greensboro, NC
This Certificate of Analysis is summarizing data (marked with an asterisk) from a study that has been
performed in compliance with Good Laboratory Practices per 40 CFR Part 160 Raw data,
documentation, protocols, any amendments to study protocols and reports pertaining to this study are
maintained in the Syngenta Crop Protection Archives in Greensboro, NC.
Authorization'
26 Mar 2008
Dorothea Jeffery Date
Group Leader I
Analytical & Product Chemistry Department
Document 10350420.doc Certificate of Analysis
Page 1 of 1 Study T000973-08
Report Number: 11813-08 Page 14 of 14
""";
assertSimilarReadingOrder(expectedText, pdfFile);
}
/**
 * Checks the reading order extracted from the single-page fixture {@code tiltedText.pdf} against the expected
 * line sequence below, using the tolerances enforced by {@code assertSimilarReadingOrder}.
 */
@Test
public void readingOrderTestTiltedText() {
String pdfFile = "files/syngenta/CustomerFiles/SinglePages/tiltedText.pdf";
// Expected page text, one extracted line per text-block line.
String expectedText = """
However there was no consistency in the areas affected either between sexes or at different
ages, in general other measurements for the same structures at other levels showed no
differences, all were within the historical control range of mean values and none of these
differences is considered to be related to treatment (Appendix K).
7. DISCUSSION
The purpose of this study, which was to determine the potential for developmental
neurotoxicity in the assessment and evaluation of the toxic characteristics of lambda-
cyhalothrin in rats, was successfully accomplished.
There was evidence of toxicity characterised by lower bodyweights and food consumption in
dams receiving 60 or 150 ppm lambda-cyhalothrin during gestation and also post partum in
the 150 ppm group only.
There were no treatment-related effects of administration of lambda-cyhalothrin on
reproductive parameters: there were no effects on gestation length, mean litter size or on pup
bodyweight at birth.
There was evidence of toxicity in F1 animals receiving 150 ppm. This was seen as slightly
higher pup mortality up to day 5 and lower bodyweights from day 5, reaching a maximum of
approximately 8-9% below control on day 22.
There was a small difference in the age at which male rats in the 150 ppm group reached
preputial separation, but this was too small to be of toxicological significance.
No effects were seen on motor activity or response to auditory startle.
There was no clear evidence of any effects in the learning and memory assessment in
weanling (age 21-24 days) or young adult animals (age 59-62 days). However, at day 21
swimming speeds of females receiving 150 ppm were slightly slower than controls. The
difference is considered to reflect a difference in swimming performance rather than an effect
on learning or memory.
No neuropathological effect of treatment with lambda-cyhalothrin was detected from a
detailed microscopic examination of the selected F1 animals post mortem on day 12 or 63.
LAMBDA-CYHALOTHRIN: DEVELOPMENTAL NEUROTOXICITY STUDY IN RATS
CTL/RR0969/REGULATORY/REPORT - 34
""";
assertSimilarReadingOrder(expectedText, pdfFile);
}
/**
 * Checks the reading order extracted from the single-page fixture {@code 402StudyPage5.pdf} against the
 * expected line sequence below, using the tolerances enforced by {@code assertSimilarReadingOrder}.
 */
@Test
public void readingOrderTest402Study() {
String pdfFile = "files/SinglePages/402StudyPage5.pdf";
// Expected page text, one extracted line per text-block line.
String expectedText = """
2.0 INTRODUCTION
2.1 Purpose
Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor
invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et
accusam et justo duo dolores et ea rebum.
Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem
ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt
ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et
justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est
Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed
diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam
voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd
gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.
2.2 Guidelines
Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor
invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua.
At vero eos et accusam et justo duo dolores et ea rebum.
Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem
ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt
ut labore et dolore magna aliquyam erat, sed diam voluptua.
At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no
sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet,
consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore
magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et
ea rebum.
Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.
Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie consequat, vel
illum dolore eu feugiat nulla facilisis at vero eros et accumsan et iusto odio dignissim qui
blandit praesent luptatum zzril delenit augue duis dolore te feugait nulla facilisi. Lorem
ipsum dolor sit amet.
2.3 Test Facility
Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor
invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et
accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata
sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur
sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna
aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea
rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.
Report Number: 20/080-002P 5
""";
assertSimilarReadingOrder(expectedText, pdfFile);
}
/**
 * Checks the reading order extracted from the fixture {@code 402StudyPage5_rotated.pdf} against the expected
 * line sequence below (the same text as in {@code readingOrderTest402Study}), using the tolerances enforced by
 * {@code assertSimilarReadingOrder}.
 */
@Test
public void readingOrderTest402StudyRotated() {
String pdfFile = "files/SinglePages/402StudyPage5_rotated.pdf";
// Expected page text, one extracted line per text-block line.
String expectedText = """
2.0 INTRODUCTION
2.1 Purpose
Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor
invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et
accusam et justo duo dolores et ea rebum.
Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem
ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt
ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et
justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est
Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed
diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam
voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd
gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.
2.2 Guidelines
Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor
invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua.
At vero eos et accusam et justo duo dolores et ea rebum.
Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem
ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt
ut labore et dolore magna aliquyam erat, sed diam voluptua.
At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no
sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet,
consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore
magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et
ea rebum.
Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.
Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie consequat, vel
illum dolore eu feugiat nulla facilisis at vero eros et accumsan et iusto odio dignissim qui
blandit praesent luptatum zzril delenit augue duis dolore te feugait nulla facilisi. Lorem
ipsum dolor sit amet.
2.3 Test Facility
Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor
invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et
accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata
sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur
sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna
aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea
rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.
Report Number: 20/080-002P 5
""";
assertSimilarReadingOrder(expectedText, pdfFile);
}
/**
 * Parses {@code pdfFile} with every entry of {@code LAYOUT_PARSING_TYPES} and asserts that the extracted lines
 * resemble {@code expectedText}.
 * <p>
 * For each expected line the most similar extracted line is looked up; the expected line counts as correct when
 * both are equal after trimming. The method then asserts that
 * <ul>
 * <li>at least one line was extracted,</li>
 * <li>the number of expected and extracted lines differs by at most one,</li>
 * <li>the largest index offset of a correct line, divided by the number of extracted lines, is below 1,</li>
 * <li>the number of correct lines, divided by the number of extracted lines, is above 0.9.</li>
 * </ul>
 *
 * @param expectedText expected page text; it is split into lines at line feeds
 * @param pdfFile      path of the PDF handed to {@code parseLayout}
 */
private void assertSimilarReadingOrder(String expectedText, String pdfFile) {

    List<String> expectedLines = List.of(expectedText.split("\n"));

    for (LayoutParsingType layoutParsingType : LAYOUT_PARSING_TYPES) {
        log.info("Evaluating for {}", layoutParsingType);
        ClassificationDocument classificationDocument = parseLayout(pdfFile, layoutParsingType);
        if (DRAW_DIR_ADJ_COORDS) {
            drawDirAdjCoords(pdfFile, classificationDocument, layoutParsingType);
        }
        Document document = DocumentGraphFactory.buildDocumentGraph(layoutParsingType, classificationDocument);
        List<String> readLines = getTextAsLines(document);
        readLines.forEach(log::info);

        // Fail with a clear message instead of an IndexOutOfBoundsException (readLines.get(0)) and NaN ratios below.
        assertTrue(!readLines.isEmpty(), "No text lines were extracted from " + pdfFile + " using " + layoutParsingType);

        int correctCount = 0;
        int maxLineOffset = 0;
        for (int i = 0; i < expectedLines.size(); i++) {
            String expectedLine = expectedLines.get(i);
            int mostSimilarLine = indexOfMostSimilarLine(expectedLine, readLines);
            String actualLine = readLines.get(mostSimilarLine);

            if (actualLine.trim().equals(expectedLine.trim())) {
                correctCount++;
                int lineOffset = Math.abs(mostSimilarLine - i);
                if (lineOffset > 0) {
                    log.info("Line {} offset by {}", actualLine, lineOffset);
                }
                if (lineOffset > maxLineOffset) {
                    maxLineOffset = lineOffset;
                }
            } else {
                log.error("Lines {}-{} do not match: \n Expected: {}\n Actual: {}", i, mostSimilarLine, expectedLine, actualLine);
            }
        }

        int lineCountDifference = Math.abs(expectedLines.size() - readLines.size());
        double correctLinesFactor = (double) correctCount / (double) readLines.size();
        // Despite the name (kept to match the log output) this is the maximum offset divided by the number of read lines.
        double averageLineOffset = (double) maxLineOffset / (double) readLines.size();
        log.info("Difference in number of lines: {}", lineCountDifference);
        log.info("Correct lines factor: {}", correctLinesFactor);
        log.info("Max order offset: {}, avg: {}", maxLineOffset, averageLineOffset);

        // In the rotated document one line is read as two
        assertTrue(lineCountDifference <= 1,
                   "Expected " + expectedLines.size() + " lines but read " + readLines.size() + " from " + pdfFile + " using " + layoutParsingType);
        // Most of the errors come from the similarity metric finding different lines in 402 study, as the lines are too similar, or a misclassification of Footers
        assertTrue(averageLineOffset < 1, "Line offset ratio " + averageLineOffset + " is too high for " + pdfFile + " using " + layoutParsingType);
        assertTrue(correctLinesFactor > 0.9, "Correct lines factor " + correctLinesFactor + " is too low for " + pdfFile + " using " + layoutParsingType);
    }
}

/**
 * Returns the index of the entry of {@code readLines} with the highest similarity to {@code expectedLine}.
 * The first entry wins on ties; 0 is returned when no entry has a similarity above 0.
 */
private static int indexOfMostSimilarLine(String expectedLine, List<String> readLines) {

    int mostSimilarLine = 0;
    double maxSimilarity = 0;
    for (int j = 0; j < readLines.size(); j++) {
        double score = similarity(expectedLine, readLines.get(j));
        if (score > maxSimilarity) {
            maxSimilarity = score;
            mostSimilarLine = j;
        }
    }
    return mostSimilarLine;
}
/**
 * Collects the text of a document line by line: every non-empty atomic text block of the document's text block
 * contributes its lines, in block order.
 *
 * @param document the document graph to read
 * @return the lines of all non-empty atomic text blocks as an unmodifiable list
 */
public List<String> getTextAsLines(Document document) {

    return document.getTextBlock()
            .getAtomicTextBlocks()
            .stream()
            .filter(block -> !block.isEmpty())
            .flatMap(block -> getLines(block).stream())
            .toList();
}
/**
 * Returns the string form of every line of the given atomic text block, from line 0 up to
 * {@code numberOfLines() - 1}.
 *
 * @param atomicTextBlock the block to read
 * @return a new list with one entry per line
 */
private static List<String> getLines(AtomicTextBlock atomicTextBlock) {

    final int lineCount = atomicTextBlock.numberOfLines();
    final List<String> result = new ArrayList<>(lineCount);
    int lineIndex = 0;
    while (lineIndex < lineCount) {
        result.add(atomicTextBlock.getLine(lineIndex).toString());
        lineIndex++;
    }
    return result;
}
/**
 * Computes a normalized similarity between two strings based on their Levenshtein distance.
 * <p>
 * The result is {@code 1 - distance / max(length(s1), length(s2))}, so identical strings yield 1
 * and strings that differ in every position yield 0.
 *
 * @param s1 the first string, must not be null
 * @param s2 the second string, must not be null
 * @return the similarity in the range [0, 1]; two empty strings are considered identical (1)
 */
private static double similarity(String s1, String s2) {

    int max = Math.max(s1.length(), s2.length());
    if (max == 0) {
        // Both strings are empty: without this guard the division below evaluates to 0.0 / 0.0 = NaN,
        // which never compares greater than any other similarity.
        return 1;
    }
    LevenshteinDistance levenshteinDistance = new LevenshteinDistance();
    int dist = levenshteinDistance.apply(s1, s2);
    return 1 - (double) dist / (double) max;
}
/**
 * Debug helper that renders the direction-adjusted coordinates of a classified document into a new PDF.
 * <p>
 * For every page of the classification document a fresh PDF page is created. For each text block on the page,
 * every merged and sorted text sequence is drawn as black text with a black bounding box and a red running
 * index (restarting at 0 on each page) placed to the left of it; the text block itself is outlined in blue.
 * The result is saved to {@code /tmp/READING_ORDER_TEST/<layoutParsingType>_<file name>_dirAdjCoordinates.pdf}.
 * <p>
 * NOTE(review): pages are created with {@code pdfDoc.pageCreate()} defaults rather than the dimensions of the
 * source page - confirm this is intended.
 * NOTE(review): the Matrix2D returned by {@code translate} is not closed here - confirm whether it owns a
 * separate native resource.
 *
 * @param filename               name or path of the processed file; only its last path element is used for the output name
 * @param classificationDocument the classified document whose pages and text blocks are drawn
 * @param layoutParsingType      the parsing type, used as prefix of the output file name
 */
@SneakyThrows
private void drawDirAdjCoords(String filename, ClassificationDocument classificationDocument, LayoutParsingType layoutParsingType) {

    try (PDFDoc pdfDoc = new PDFDoc(); ElementWriter writer = new ElementWriter(); ElementBuilder builder = new ElementBuilder()) {
        // The embeddable font is only used for measuring text, the PDFTron font for actually drawing it.
        Standard14EmbeddableFont measuringFont = Standard14EmbeddableFont.helvetica();
        Font helvetica = Font.create(pdfDoc, Font.e_helvetica);

        for (ClassificationPage classificationPage : classificationDocument.getPages()) {
            int sequenceIndex = 0;
            Page page = pdfDoc.pageCreate();
            writer.begin(page);

            for (AbstractPageBlock pageBlock : classificationPage.getTextBlocks()) {
                if (!(pageBlock instanceof TextPageBlock textBlock)) {
                    continue;
                }

                for (TextPositionSequence sequence : TextPositionOperations.mergeAndSort(List.of(textBlock))) {
                    String text = sequence.toString();

                    float stringWidth;
                    try {
                        stringWidth = measuringFont.getStringWidth(text);
                    } catch (Exception e) {
                        // Width could not be computed for this text: approximate it with the average glyph width.
                        stringWidth = measuringFont.getFont().getAverageFontWidth() * text.length();
                    }

                    // Choose the font size so the drawn text spans the width of the sequence's bounding box.
                    // presumably the measured width is in 1/1000 text units, hence the factor - TODO confirm
                    double fontSize = (sequence.getBBoxDirAdj().getWidth() / stringWidth) * 1000;

                    String label = String.valueOf(sequenceIndex);
                    try (Matrix2D textMatrix = new Matrix2D(1,
                                                            0,
                                                            0,
                                                            1,
                                                            sequence.getXDirAdj(),
                                                            page.getCropBox().getHeight() - sequence.getYDirAdj() - sequence.getHeightDirAdj())) {
                        writeText(text, textMatrix, builder, helvetica, fontSize, writer, Color.BLACK);
                        // The index label is shifted left by an amount that grows with its number of digits.
                        writeText(label, textMatrix.translate(-(2 + (5 * label.length())), 0), builder, helvetica, 8, writer, Color.RED);
                        sequenceIndex++;
                    }

                    writeBBox(sequence.getBBoxDirAdj(), builder, page, writer, Color.BLACK);
                }

                writeBBox(textBlock.getBBoxDirAdj(), builder, page, writer, Color.BLUE);
            }

            writer.end();
            pdfDoc.pagePushBack(page);
        }

        Path outputDirectory = Path.of("/tmp/READING_ORDER_TEST/");
        Files.createDirectories(outputDirectory);
        String outputFile = outputDirectory.resolve(layoutParsingType.name() + "_" + Path.of(filename).getFileName()).toFile() + "_dirAdjCoordinates.pdf";
        try (var out = new FileOutputStream(outputFile)) {
            pdfDoc.save(out, SDFDoc.SaveMode.LINEARIZED, null);
        }
    }
}
/**
 * Strokes the outline of a rectangle onto the page currently being written.
 * <p>
 * The rectangle's y coordinate is subtracted from the crop box height and its height is negated
 * before the rectangle is handed to the element builder.
 *
 * @param r       the rectangle to outline
 * @param builder the element builder used to create the rectangle element
 * @param page    the page whose crop box height is used for the y conversion
 * @param writer  the writer the finished element is written to
 * @param color   the stroke color; its RGB components are used
 * @throws PDFNetException if any PDFNet call fails
 */
private static void writeBBox(Rectangle2D r, ElementBuilder builder, Page page, ElementWriter writer, Color color) throws PDFNetException {

    double flippedY = page.getCropBox().getHeight() - r.getY();
    Element outline = builder.createRect(r.getX(), flippedY, r.getWidth(), -r.getHeight());

    float[] rgb = color.getColorComponents(null);
    outline.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
    try (ColorPt strokeColor = new ColorPt(rgb[0], rgb[1], rgb[2])) {
        outline.getGState().setStrokeColor(strokeColor);
    }

    outline.setPathStroke(true);
    writer.writeElement(outline);
}
/**
 * Writes a single filled text run to the page currently being written.
 * <p>
 * Emits a text-begin element carrying the font, fill color, text matrix and render mode,
 * followed by the text run itself and a text-end element.
 *
 * @param string    the text to draw
 * @param matrix2D  the text matrix that positions the run
 * @param builder   the element builder used to create the text elements
 * @param helvetica the font to draw with
 * @param fontSize  the font size to draw with
 * @param writer    the writer the elements are written to
 * @param color     the fill color; its RGB components are used
 * @throws PDFNetException if any PDFNet call fails
 */
private static void writeText(String string,
                              Matrix2D matrix2D,
                              ElementBuilder builder,
                              Font helvetica,
                              double fontSize,
                              ElementWriter writer,
                              Color color) throws PDFNetException {

    Element textBegin = builder.createTextBegin(helvetica, fontSize);
    textBegin.getGState().setFillColorSpace(ColorSpace.createDeviceRGB());

    float[] rgb = color.getColorComponents(null);
    try (ColorPt fillColor = new ColorPt(rgb[0], rgb[1], rgb[2])) {
        textBegin.getGState().setFillColor(fillColor);
    }

    textBegin.setTextMatrix(matrix2D);
    textBegin.getGState().setTextRenderMode(GState.e_fill_text);
    writer.writeElement(textBegin);

    // Each element is written before the next one is requested from the builder.
    writer.writeElement(builder.createTextRun(string));
    writer.writeElement(builder.createTextEnd());
}
}

View File

@ -1,60 +0,0 @@
package com.knecon.fforesight.service.layoutparser.server.model;
import static org.assertj.core.api.Assertions.assertThat;
import java.io.ByteArrayInputStream;
import java.nio.charset.StandardCharsets;
import org.apache.pdfbox.util.Matrix;
import org.junit.jupiter.api.Test;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.iqser.red.storage.commons.properties.StorageProperties;
import com.iqser.red.storage.commons.service.ObjectSerializer;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import lombok.SneakyThrows;
/**
 * Verifies that a {@link TextPositionSequence} can be deserialized from JSON via the {@link ObjectSerializer}.
 */
public class TextPositionSequenceTest {

    // Minimal JSON representation of a sequence without any text positions.
    private static final String TEXT_POSITION_SEQUENCE_AS_JSON = "{\n" //
            + " \"page\": 1,\n" //
            + " \"textPositions\": [],\n" //
            + " \"dir\": 180.0,\n" //
            + " \"rotation\": 0,\n" //
            + " \"pageHeight\": 800,\n" //
            + " \"pageWidth\": 600\n" //
            + "}";

    private final ObjectSerializer objectSerializer = new ObjectSerializer(new ObjectMapper());


    /**
     * Deserializes the JSON fixture and checks that every property arrives with the expected value.
     */
    @Test
    @SneakyThrows
    public void testDeserializationWithJackson() {

        TextPositionSequence textPositionSequence = objectSerializer.deserialize(new ByteArrayInputStream(TEXT_POSITION_SEQUENCE_AS_JSON.getBytes(StandardCharsets.UTF_8)),
                                                                                 TextPositionSequence.class);

        assertPropertiesAfterJsonDeserialization(textPositionSequence);
    }


    /**
     * Asserts that the given sequence carries exactly the values defined in the JSON fixture.
     *
     * @param textPositionSequence the deserialized sequence to check
     */
    private void assertPropertiesAfterJsonDeserialization(TextPositionSequence textPositionSequence) {

        assertThat(textPositionSequence.getPage()).isEqualTo(1);
        assertThat(textPositionSequence.getTextPositions()).isEmpty();
        // A "dir" of 180.0 in the fixture is expected to map to the HALF_CIRCLE direction.
        assertThat(textPositionSequence.getDir()).isEqualTo(TextDirection.HALF_CIRCLE);
        assertThat(textPositionSequence.getRotation()).isEqualTo(0);
        assertThat(textPositionSequence.getPageHeight()).isEqualTo(800f);
        assertThat(textPositionSequence.getPageWidth()).isEqualTo(600f);
    }

}

View File

@ -3,12 +3,10 @@ package com.knecon.fforesight.service.layoutparser.server.services;
import java.nio.file.Path; import java.nio.file.Path;
import java.util.List; import java.util.List;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents; import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor; import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw; import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
@ -29,7 +27,7 @@ class PageContentExtractorTest {
textPositionPerPage.stream() textPositionPerPage.stream()
.map(t -> t.getSortedTextPositionSequences() .map(t -> t.getSortedTextPositionSequences()
.stream() .stream()
.map(TextPositionSequence::getBBoxInitialUserSpace) .map(TextPositionSequence::getBBoxPdf)
.map(List::of) .map(List::of)
.toList()) .toList())
.toList(), tmpFileName); .toList(), tmpFileName);

View File

@ -1,6 +1,8 @@
package com.knecon.fforesight.service.layoutparser.server.utils; package com.knecon.fforesight.service.layoutparser.server.utils;
import java.awt.geom.Rectangle2D;
import java.io.File; import java.io.File;
import java.io.FileOutputStream;
import java.nio.file.Path; import java.nio.file.Path;
import java.util.Map; import java.util.Map;
@ -10,11 +12,27 @@ import org.springframework.core.io.ClassPathResource;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline; import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse;
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory; import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
import com.knecon.fforesight.service.viewerdoc.model.Standard14EmbeddableFont;
import com.pdftron.common.Matrix2D;
import com.pdftron.pdf.ColorPt;
import com.pdftron.pdf.ColorSpace;
import com.pdftron.pdf.Element;
import com.pdftron.pdf.ElementBuilder;
import com.pdftron.pdf.ElementWriter;
import com.pdftron.pdf.Font;
import com.pdftron.pdf.GState;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.Page;
import com.pdftron.sdf.SDFDoc;
import lombok.SneakyThrows; import lombok.SneakyThrows;
@ -48,14 +66,14 @@ public abstract class BuildDocumentTest extends AbstractTest {
@SneakyThrows @SneakyThrows
protected Document buildGraph(String filename, LayoutParsingType layoutParsingType) { protected Document buildGraph(String filename, LayoutParsingType layoutParsingType) {
if (!filename.startsWith("files") && filename.startsWith("/")) { if (!filename.startsWith("files") && filename.startsWith("/")) {
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(Path.of(filename).getFileName().toString(), LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH, true); LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(Path.of(filename).getFileName().toString(), layoutParsingType, true);
prepareStorage(layoutParsingRequest, new File(filename)); prepareStorage(layoutParsingRequest, new File(filename));
return DocumentGraphFactory.buildDocumentGraph(layoutParsingType, return DocumentGraphFactory.buildDocumentGraph(layoutParsingType,
layoutParsingPipeline.parseLayout(layoutParsingType, layoutParsingPipeline.parseLayout(layoutParsingType,
new File(filename), new File(filename),
layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId().get()), layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId()
.get()),
new TableServiceResponse(), new TableServiceResponse(),
new VisualLayoutParsingResponse(), new VisualLayoutParsingResponse(),
layoutParsingRequest.identifier())); layoutParsingRequest.identifier()));
@ -65,10 +83,12 @@ public abstract class BuildDocumentTest extends AbstractTest {
} else { } else {
prepareStorage(filename); prepareStorage(filename);
} }
return DocumentGraphFactory.buildDocumentGraph(layoutParsingType, parseLayout(filename, layoutParsingType)); var classificationDocument = parseLayout(filename, layoutParsingType);
return DocumentGraphFactory.buildDocumentGraph(layoutParsingType, classificationDocument);
} }
} }
} }

@ -1 +1 @@
Subproject commit c6fd9e849f3efd7d1507401f63629b91dec9f4ec Subproject commit 0da08b1d9d1bc815a3fb51aa9638eafea2cf02d5

View File

@ -12,7 +12,7 @@ dependencies {
implementation("org.apache.pdfbox:pdfbox-tools:${pdfBoxVersion}") implementation("org.apache.pdfbox:pdfbox-tools:${pdfBoxVersion}")
implementation("org.slf4j:slf4j-api:1.7.25") implementation("org.slf4j:slf4j-api:1.7.25")
implementation("com.knecon.fforesight:tracing-commons:0.5.0") implementation("com.knecon.fforesight:tracing-commons:0.5.0")
implementation("com.pdftron:PDFNet:10.7.0") implementation("com.pdftron:PDFNet:10.11.0")
testImplementation("org.apache.logging.log4j:log4j-slf4j-impl:2.22.1") testImplementation("org.apache.logging.log4j:log4j-slf4j-impl:2.22.1")
testImplementation("org.junit.jupiter:junit-jupiter") testImplementation("org.junit.jupiter:junit-jupiter")

View File

@ -10,12 +10,14 @@ import org.apache.pdfbox.pdmodel.font.Standard14Fonts;
import com.pdftron.pdf.Font; import com.pdftron.pdf.Font;
import com.pdftron.pdf.PDFDoc; import com.pdftron.pdf.PDFDoc;
import lombok.Getter;
import lombok.RequiredArgsConstructor; import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows; import lombok.SneakyThrows;
@RequiredArgsConstructor @RequiredArgsConstructor
public class Standard14EmbeddableFont implements EmbeddableFont { public class Standard14EmbeddableFont implements EmbeddableFont {
@Getter
private final PDType1Font font; private final PDType1Font font;
private final int pdfTronIdentifier; private final int pdfTronIdentifier;

View File

@ -96,7 +96,8 @@ public class PDFTronViewerDocumentService {
boolean isCurrentVersion = ViewerDocVersioningUtility.docIsCurrentVersion(pdfDoc); boolean isCurrentVersion = ViewerDocVersioningUtility.docIsCurrentVersion(pdfDoc);
int pageNumber = 1; int pageNumber = 1;
for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); pageNumber++) { try (PageIterator iterator = pdfDoc.getPageIterator()) {
while (iterator.hasNext()) {
Page page = iterator.next(); Page page = iterator.next();
@ -105,6 +106,8 @@ public class PDFTronViewerDocumentService {
} }
visualizationWriter.drawVisualizationsOnPage(pageNumber, page); visualizationWriter.drawVisualizationsOnPage(pageNumber, page);
pageNumber++;
}
} }
ViewerDocVersioningUtility.setVersionInDocument(pdfDoc); ViewerDocVersioningUtility.setVersionInDocument(pdfDoc);

View File

@ -343,12 +343,7 @@ public class VisualizationWriter {
@SneakyThrows @SneakyThrows
private static AffineTransform getTextDeRotationTransform(Page page) { private static AffineTransform getTextDeRotationTransform(Page page) {
return AffineTransform.getQuadrantRotateInstance(switch (page.getRotation()) { return AffineTransform.getQuadrantRotateInstance(page.getRotation());
case 90 -> 3;
case 180 -> 2;
case 270 -> 1;
default -> 0;
});
} }
} }

View File

@ -61,12 +61,13 @@ class PageContentCleanerTest {
.markedContentToRemove(Set.of(LayerIdentifier.KNECON_OCR_DEBUG.markedContentName())) .markedContentToRemove(Set.of(LayerIdentifier.KNECON_OCR_DEBUG.markedContentName()))
.build(); .build();
for (PageIterator iterator = doc.getPageIterator(); iterator.hasNext(); ) { try (PageIterator iterator = doc.getPageIterator()) {
while (iterator.hasNext()) {
Page page = iterator.next(); Page page = iterator.next();
pageContentCleaner.removeMarkedContent(page); pageContentCleaner.removeMarkedContent(page);
} }
}
doc.save(out, SDFDoc.SaveMode.REMOVE_UNUSED, null); doc.save(out, SDFDoc.SaveMode.REMOVE_UNUSED, null);
} }