hotfix reading order

This commit is contained in:
Kilian Schüttler 2024-08-09 11:49:12 +02:00
parent b900cfaf31
commit 69bcd4f68d
77 changed files with 1297 additions and 756 deletions

View File

@ -42,6 +42,15 @@ tasks.jacocoTestReport {
}
allprojects {
tasks.withType<Javadoc> {
options {
this as StandardJavadocDocletOptions
addBooleanOption("Xdoclint:none", true)
addStringOption("Xmaxwarns", "1")
}
}
publishing {
publications {
create<MavenPublication>(name) {
@ -64,6 +73,7 @@ java {
withJavadocJar()
}
repositories {
mavenLocal()
mavenCentral()

View File

@ -19,6 +19,7 @@ public class SimplifiedText {
@Schema(description = "Number of pages in the entire document.")
private int numberOfPages;
@Schema(description = "A List of simplified Sections, which contains almost exclusively text.")
@Builder.Default
private List<SimplifiedSectionText> sectionTexts = new ArrayList<>();
}

View File

@ -28,4 +28,6 @@ dependencies {
implementation("org.tinspin:tinspin-indexes:2.1.3")
implementation("org.commonmark:commonmark:0.22.0")
implementation("org.commonmark:commonmark-ext-gfm-tables:0.22.0")
implementation("com.pdftron:PDFNet:10.11.0")
}

View File

@ -69,6 +69,7 @@ import com.knecon.fforesight.service.layoutparser.processor.services.parsing.PDF
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService;
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionOperations;
import io.micrometer.observation.Observation;
import io.micrometer.observation.ObservationRegistry;
@ -117,14 +118,18 @@ public class LayoutParsingPipeline {
log.info("Starting layout parsing for {}", layoutParsingRequest.identifier());
File originFile = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId());
File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId()).orElse(originFile);
File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId())
.orElse(originFile);
VisualLayoutParsingResponse visualLayoutParsingResponse = layoutParsingRequest.visualLayoutParsingFileId()
.map(layoutParsingStorageService::getVisualLayoutParsingFile).orElse(new VisualLayoutParsingResponse());
.map(layoutParsingStorageService::getVisualLayoutParsingFile)
.orElse(new VisualLayoutParsingResponse());
ImageServiceResponse imageServiceResponse = layoutParsingRequest.imagesFileStorageId()
.map(layoutParsingStorageService::getImagesFile).orElse(new ImageServiceResponse());
.map(layoutParsingStorageService::getImagesFile)
.orElse(new ImageServiceResponse());
TableServiceResponse tableServiceResponse = layoutParsingRequest.tablesFileStorageId()
.map(layoutParsingStorageService::getTablesFile).orElse(new TableServiceResponse());
.map(layoutParsingStorageService::getTablesFile)
.orElse(new TableServiceResponse());
ClassificationDocument classificationDocument = parseLayout(settings.getLayoutParsingTypeOverride() == null //
? layoutParsingRequest.layoutParsingType() : settings.getLayoutParsingTypeOverride(),
@ -137,8 +142,7 @@ public class LayoutParsingPipeline {
log.info("Building document graph for {}", layoutParsingRequest.identifier());
Document documentGraph = observeBuildDocumentGraph(settings.getLayoutParsingTypeOverride() == null //
? layoutParsingRequest.layoutParsingType() : settings.getLayoutParsingTypeOverride(),
classificationDocument);
? layoutParsingRequest.layoutParsingType() : settings.getLayoutParsingTypeOverride(), classificationDocument);
log.info("Creating viewer document for {}", layoutParsingRequest.identifier());
@ -147,7 +151,7 @@ public class LayoutParsingPipeline {
log.info("Storing resulting files for {}", layoutParsingRequest.identifier());
layoutParsingStorageService.storeDocumentData(layoutParsingRequest, DocumentDataMapper.toDocumentData(documentGraph));
if(layoutParsingRequest.documentMarkdownFileStorageId().isPresent()) {
if (layoutParsingRequest.documentMarkdownFileStorageId().isPresent()) {
layoutParsingStorageService.storeMarkdownFile(layoutParsingRequest.documentMarkdownFileStorageId().get(), new MarkdownMapper().toMarkdownContent(documentGraph));
}
layoutParsingStorageService.storeSimplifiedText(layoutParsingRequest, simplifiedSectionTextService.toSimplifiedText(documentGraph));
@ -271,11 +275,11 @@ public class LayoutParsingPipeline {
stripper.setStartPage(pageNumber);
stripper.setEndPage(pageNumber);
stripper.setPdpage(pdPage);
if (layoutParsingType.equals(LayoutParsingType.DOCUMINE_OLD)) {
stripper.setSortByPosition(true);
}
stripper.getText(originDocument);
List<TextPositionSequence> words = stripper.getTextPositionSequences();
if (layoutParsingType.equals(LayoutParsingType.DOCUMINE_OLD)) {
words = TextPositionOperations.sort(words);
}
classificationDocument.getLayoutDebugLayer().addTextVisualizations(words, pageNumber);
PDRectangle pdr = pdPage.getMediaBox();

View File

@ -1,6 +1,7 @@
package com.knecon.fforesight.service.layoutparser.processor.docstrum;
import java.util.ArrayList;
import java.util.EnumMap;
import java.util.List;
import java.util.stream.Collectors;
@ -26,6 +27,7 @@ import lombok.RequiredArgsConstructor;
@RequiredArgsConstructor
public class DocstrumSegmentationService {
public static final double SAME_DIRECTION_THRESHOLD = 0.9;
private final NearestNeighbourService nearestNeighbourService;
private final SpacingService spacingService;
private final LineBuilderService lineBuilderService;
@ -35,13 +37,44 @@ public class DocstrumSegmentationService {
/**
 * Segments the words of one page into zones and returns them in reading order.
 * <p>
 * Zones are computed separately for each of the four text directions (in the order ZERO,
 * QUARTER_CIRCLE, HALF_CIRCLE, THREE_QUARTER_CIRCLE) and concatenated in that order. The number
 * of zones found per direction is recorded and used to tell the reading order resolver whether
 * one direction clearly dominates the page.
 *
 * @param textPositions  the words of the page
 * @param xyOrder        forwarded unchanged to the reading order resolver
 * @param usedRulings    rulings passed on to the zone computation
 * @param visualizations debug layer passed on to the zone computation
 * @return the zones of all four directions, ordered by the reading order resolver
 */
public List<Zone> segmentPage(List<TextPositionSequence> textPositions, boolean xyOrder, CleanRulings usedRulings, LayoutDebugLayer visualizations) {

    TextDirection[] directions = {TextDirection.ZERO, TextDirection.QUARTER_CIRCLE, TextDirection.HALF_CIRCLE, TextDirection.THREE_QUARTER_CIRCLE};

    EnumMap<TextDirection, Integer> directionCounts = new EnumMap<>(TextDirection.class);
    List<Zone> zones = new ArrayList<>();

    for (TextDirection direction : directions) {
        List<Zone> zonesOfDirection = computeZones(textPositions, usedRulings, visualizations, direction);
        directionCounts.put(direction, zonesOfDirection.size());
        zones.addAll(zonesOfDirection);
    }

    boolean singleDirectionDominates = mostSameDirection(directionCounts);
    return readingOrderService.resolve(zones, xyOrder, singleDirectionDominates);
}
/**
 * Tells whether a single text direction accounts for more than {@link #SAME_DIRECTION_THRESHOLD}
 * of all counted zones.
 * <p>
 * The directions are checked in the order ZERO, QUARTER_CIRCLE, HALF_CIRCLE,
 * THREE_QUARTER_CIRCLE; the first one exceeding the threshold ends the search. If the counts sum
 * to zero, every share is NaN and the method returns {@code false}.
 *
 * @param directionCounts number of zones per text direction; must hold a count for each direction that gets checked
 * @return {@code true} if one direction's share of the total is above the threshold
 */
private boolean mostSameDirection(EnumMap<TextDirection, Integer> directionCounts) {

    int zoneTotal = 0;
    for (int count : directionCounts.values()) {
        zoneTotal += count;
    }

    TextDirection[] candidates = {TextDirection.ZERO, TextDirection.QUARTER_CIRCLE, TextDirection.HALF_CIRCLE, TextDirection.THREE_QUARTER_CIRCLE};
    for (TextDirection candidate : candidates) {
        double share = (double) directionCounts.get(candidate) / zoneTotal;
        if (share > SAME_DIRECTION_THRESHOLD) {
            return true;
        }
    }
    return false;
}

View File

@ -15,10 +15,16 @@ public class AngleFilter {
public boolean matches(Neighbor neighbor) {
return matches(neighbor.getAngle());
}
/**
 * Checks whether the given angle passes this filter.
 * <p>
 * When {@code lowerAngle <= upperAngle} the angle must lie in the half-open interval
 * {@code [lowerAngle, upperAngle)}. Otherwise it is enough for the angle to be at least
 * {@code lowerAngle} or below {@code upperAngle} (presumably a range that wraps around the
 * angle origin; confirm against the callers).
 *
 * @param angle the angle to test
 * @return {@code true} if the angle is accepted by the filter
 */
public boolean matches(double angle) {

    boolean atOrAboveLower = lowerAngle <= angle;
    boolean belowUpper = angle < upperAngle;
    return lowerAngle <= upperAngle ? (atOrAboveLower && belowUpper) : (atOrAboveLower || belowUpper);
}

View File

@ -7,8 +7,12 @@ import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.experimental.SuperBuilder;
@Data
@SuperBuilder
@NoArgsConstructor
public abstract class BoundingBox {
// Java coordinate system: (0, 0) is always upper left, x is increasing left to right and y is increasing from top to bottom.
@ -19,7 +23,7 @@ public abstract class BoundingBox {
// This rotates completely in 90 degree steps with page rotation.
// Needs to be used when writing to a PDF.
// Also, these are definitely correct and should be used whenever possible.
protected Rectangle2D bBoxInitialUserSpace;
protected Rectangle2D bBoxPdf;
protected static final float VERTICAL_COMPARISON_THRESHOLD = 0.4f;
@ -50,25 +54,25 @@ public abstract class BoundingBox {
public double getPdfMinX() {
return bBoxInitialUserSpace.getMinX();
return bBoxPdf.getMinX();
}
public double getPdfMaxX() {
return bBoxInitialUserSpace.getMaxX();
return bBoxPdf.getMaxX();
}
public double getPdfMinY() {
return bBoxInitialUserSpace.getMinY();
return bBoxPdf.getMinY();
}
public double getPdfMaxY() {
return bBoxInitialUserSpace.getMaxY();
return bBoxPdf.getMaxY();
}
@ -129,13 +133,31 @@ public abstract class BoundingBox {
}
public boolean intersectsY(BoundingBox other) {
private boolean intersectsX(BoundingBox other, float threshold) {
return this.getX() - threshold <= other.getMaxX() && this.getMaxX() + threshold >= other.getX();
}
public boolean intersectsPdf(BoundingBox other) {
return this.intersectsXPdf(other) && this.intersectsYPdf(other);
}
public boolean intersectsPdf(BoundingBox other, float yThreshold, float xThreshold) {
return this.intersectsXPdf(other, xThreshold) && this.intersectsYPdf(other, yThreshold);
}
public boolean intersectsYPdf(BoundingBox other) {
return this.getPdfMinY() <= other.getPdfMaxY() && this.getPdfMaxY() >= other.getPdfMinY();
}
public boolean intersectsYJava(BoundingBox other) {
public boolean intersectsY(BoundingBox other) {
return this.getY() <= other.getMaxY() && this.getMaxY() >= other.getY();
}
@ -143,25 +165,31 @@ public abstract class BoundingBox {
public boolean intersectsY(BoundingBox other, float threshold) {
return this.getY() - threshold <= other.getMaxY() && this.getMaxY() + threshold >= other.getY();
}
public boolean intersectsYPdf(BoundingBox other, float threshold) {
return this.getPdfMinY() - threshold <= other.getPdfMaxY() && this.getPdfMaxY() + threshold >= other.getPdfMinY();
}
public boolean intersectsX(BoundingBox other) {
public boolean intersectsXPdf(BoundingBox other) {
return this.getPdfMinX() <= other.getPdfMaxX() && this.getPdfMaxX() >= other.getPdfMinX();
}
public boolean intersectsXJava(BoundingBox other) {
public boolean intersectsX(BoundingBox other) {
return this.getX() <= other.getMaxX() && this.getMaxX() >= other.getMinX();
}
public boolean intersectsX(BoundingBox other, float threshold) {
/**
 * Checks whether this box and {@code other} overlap on the x-axis of the PDF bounding boxes,
 * allowing a tolerance on both sides.
 * <p>
 * Both sides of each comparison use the PDF-space extents ({@code getPdfMinX}/{@code getPdfMaxX}),
 * consistent with {@link #intersectsXPdf(BoundingBox)}; mixing in {@code getMaxX()} would compare
 * values from two different coordinate systems.
 *
 * @param other     the box to compare against
 * @param threshold tolerance by which this box is widened on the left and on the right
 * @return {@code true} if the widened x-range of this box overlaps the x-range of {@code other}
 */
public boolean intersectsXPdf(BoundingBox other, float threshold) {

    return this.getPdfMinX() - threshold <= other.getPdfMaxX() && this.getPdfMaxX() + threshold >= other.getPdfMinX();
}
@ -170,8 +198,8 @@ public abstract class BoundingBox {
this.bBox = components.stream()
.map(BoundingBox::getBBox)
.collect(RectangleTransformations.collectBBox());
this.bBoxInitialUserSpace = components.stream()
.map(BoundingBox::getBBoxInitialUserSpace)
this.bBoxPdf = components.stream()
.map(BoundingBox::getBBoxPdf)
.collect(RectangleTransformations.collectBBox());
}
@ -229,25 +257,25 @@ public abstract class BoundingBox {
public boolean rightOf(BoundingBox other) {
return this.intersectsYJava(other) && other.getMaxX() <= this.getMinX();
return this.intersectsY(other) && other.getMaxX() <= this.getMinX();
}
public boolean leftOf(BoundingBox other) {
return this.intersectsYJava(other) && other.getMinX() >= this.getMaxX();
return this.intersectsY(other) && other.getMinX() >= this.getMaxX();
}
public boolean isAbove(BoundingBox other) {
return this.intersectsXJava(other) && other.getMinY() >= this.getMaxY();
return this.intersectsX(other) && other.getMinY() >= this.getMaxY();
}
public boolean isBelow(BoundingBox other) {
return this.intersectsXJava(other) && this.getMinY() >= other.getMaxY();
return this.intersectsX(other) && this.getMinY() >= other.getMaxY();
}
}

View File

@ -35,7 +35,7 @@ public class Character {
public double getHeight() {
return textPosition.getHeightDir();
return textPosition.getHeightDirAdj();
}
@ -65,9 +65,9 @@ public class Character {
double s = Math.sin(-0);
double c = Math.cos(-0);
xs[0] = c * x - s * y;
xs[1] = c * (x + textPosition.getWidthDirAdj()) - s * (y + textPosition.getHeightDir());
xs[1] = c * (x + textPosition.getWidthDirAdj()) - s * (y + textPosition.getHeightDirAdj());
xs[2] = c * other.x - s * other.y;
xs[3] = c * (other.x + other.textPosition.getWidthDirAdj()) - s * (other.y + other.textPosition.getHeightDir());
xs[3] = c * (other.x + other.textPosition.getWidthDirAdj()) - s * (other.y + other.textPosition.getHeightDirAdj());
boolean overlapping = xs[1] >= xs[2] && xs[3] >= xs[0];
Arrays.sort(xs);
return Math.abs(xs[2] - xs[1]) * (overlapping ? 1 : -1);

View File

@ -1,18 +1,28 @@
package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;
import static com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence.BOLD;
import static com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence.BOLD_ITALIC;
import static com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence.ITALIC;
import static com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence.STANDARD;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.EnumMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.FastAtan2;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.model.text.FontStyle;
import lombok.Data;
import lombok.EqualsAndHashCode;
@Data
@EqualsAndHashCode(onlyExplicitlyIncluded = true)
public class Line extends BoundingBox {
@EqualsAndHashCode(onlyExplicitlyIncluded = true, callSuper = false)
public class Line extends TextBoundingBox {
private static final double WORD_DISTANCE_MULTIPLIER = 0.18;
@ -28,6 +38,8 @@ public class Line extends BoundingBox {
private final double height;
private FontStyle fontStyle;
private final List<Character> characters;
private final List<TextPositionSequence> words = new ArrayList<>();
@ -67,6 +79,29 @@ public class Line extends BoundingBox {
height = computeHeight();
computeWords(wordSpacing * WORD_DISTANCE_MULTIPLIER);
buildBBox();
computeFontStyle();
}
/**
 * Sets {@code fontStyle} to the style that occurs most often among the words of this line.
 * <p>
 * Each word's font style is mapped to a {@link FontStyle}; words with any other style value are
 * not counted. On a tie the style declared first in {@link FontStyle} wins.
 */
private void computeFontStyle() {

    EnumMap<FontStyle, AtomicInteger> occurrences = new EnumMap<>(FontStyle.class);
    for (FontStyle style : FontStyle.values()) {
        occurrences.put(style, new AtomicInteger());
    }

    for (TextPositionSequence word : words) {
        FontStyle wordStyle = switch (word.getFontStyle()) {
            case STANDARD -> FontStyle.REGULAR;
            case BOLD -> FontStyle.BOLD;
            case ITALIC -> FontStyle.ITALIC;
            case BOLD_ITALIC -> FontStyle.BOLD_ITALIC;
            default -> null;
        };
        if (wordStyle != null) {
            occurrences.get(wordStyle).incrementAndGet();
        }
    }

    // Strictly-greater comparison keeps the earliest style on equal counts.
    FontStyle dominant = FontStyle.REGULAR;
    int highestCount = -1;
    for (Map.Entry<FontStyle, AtomicInteger> entry : occurrences.entrySet()) {
        int count = entry.getValue().get();
        if (count > highestCount) {
            highestCount = count;
            dominant = entry.getKey();
        }
    }
    fontStyle = dominant;
}
@ -144,8 +179,8 @@ public class Line extends BoundingBox {
private void buildBBox() {
this.setToBBoxOfComponents(characters.stream()
.map(Character::getTextPosition)
.toList());
.map(Character::getTextPosition)
.toList());
}

View File

@ -0,0 +1,102 @@
package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;
import java.awt.geom.Rectangle2D;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import lombok.EqualsAndHashCode;
import lombok.Getter;
import lombok.NoArgsConstructor;
import lombok.Setter;
import lombok.experimental.SuperBuilder;
/**
 * A {@link BoundingBox} that additionally carries a second rectangle, {@code bBoxDirAdj}, and the
 * {@link TextDirection} of its content.
 * <p>
 * NOTE(review): "DirAdj" presumably stands for direction-adjusted coordinates, i.e. coordinates
 * expressed relative to the text direction; confirm against the code that fills the field.
 */
@Getter
@Setter
@SuperBuilder
@NoArgsConstructor
@EqualsAndHashCode(callSuper = false)
public abstract class TextBoundingBox extends BoundingBox {

    protected Rectangle2D bBoxDirAdj;

    protected TextDirection dir;


    /**
     * Recomputes all bounding boxes of this object from the given components.
     * <p>
     * After the base class has updated its boxes, {@code bBoxDirAdj} becomes the collected box of
     * all components that are themselves {@link TextBoundingBox}es, and {@code dir} is set to
     * their common text direction ({@link TextDirection#ZERO} if there is no such component).
     *
     * @param components the boxes to enclose
     * @throws IllegalArgumentException if the text components carry more than one distinct direction
     */
    @Override
    public void setToBBoxOfComponents(List<? extends BoundingBox> components) {

        super.setToBBoxOfComponents(components);

        // Only components that carry direction-adjusted data contribute below.
        List<TextBoundingBox> textComponents = components.stream()
                .filter(TextBoundingBox.class::isInstance)
                .map(TextBoundingBox.class::cast)
                .toList();

        this.bBoxDirAdj = textComponents.stream()
                .map(TextBoundingBox::getBBoxDirAdj)
                .collect(RectangleTransformations.collectBBox());

        Set<TextDirection> distinctDirections = textComponents.stream()
                .map(TextBoundingBox::getDir)
                .collect(Collectors.toSet());

        switch (distinctDirections.size()) {
            case 0 -> dir = TextDirection.ZERO;
            case 1 -> dir = distinctDirections.iterator().next();
            default -> throw new IllegalArgumentException("More than one text direction found");
        }
    }


    /** @return the x coordinate of {@code bBoxDirAdj} */
    public double getXDirAdj() {

        return bBoxDirAdj.getX();
    }


    /** @return the y coordinate of {@code bBoxDirAdj} */
    public double getYDirAdj() {

        return bBoxDirAdj.getY();
    }


    /** @return the width of {@code bBoxDirAdj} */
    public double getWidthDirAdj() {

        return bBoxDirAdj.getWidth();
    }


    /** @return the height of {@code bBoxDirAdj} */
    public double getHeightDirAdj() {

        return bBoxDirAdj.getHeight();
    }


    /** @return the largest x coordinate of {@code bBoxDirAdj} */
    public double getMaxXDirAdj() {

        return bBoxDirAdj.getMaxX();
    }


    /** @return the largest y coordinate of {@code bBoxDirAdj} */
    public double getMaxYDirAdj() {

        return bBoxDirAdj.getMaxY();
    }


    /** @return the y coordinate of the center of {@code bBoxDirAdj} */
    public double getCenterYDirAdj() {

        return bBoxDirAdj.getCenterY();
    }


    /** @return the x coordinate of the center of {@code bBoxDirAdj} */
    public double getCenterXDirAdj() {

        return bBoxDirAdj.getCenterX();
    }

}

View File

@ -6,9 +6,11 @@ import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import lombok.Data;
import lombok.EqualsAndHashCode;
@Data
public class Zone extends BoundingBox {
@EqualsAndHashCode(callSuper = false)
public class Zone extends TextBoundingBox {
private List<Line> lines;

View File

@ -1,15 +1,17 @@
package com.knecon.fforesight.service.layoutparser.processor.docstrum.service;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.ListIterator;
import java.util.Map;
import java.util.stream.Collectors;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.TextBoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.DoubleUtils;
@ -19,21 +21,30 @@ public class ReadingOrderService {
private static final double THRESHOLD = 5;
public static final double MULTI_COLUMN_DETECTION_THRESHOLD = 1.5;
private static final Comparator<TextBoundingBox> COMPARATOR = //
Comparator.comparing(TextBoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
.thenComparing(TextBoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD));
public List<Zone> resolve(List<Zone> zones, boolean xyReadingOrder) {
private static final Comparator<TextBoundingBox> COMPARATOR_DIR_ADJ = //
Comparator.comparing(TextBoundingBox::getYDirAdj, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
.thenComparing(TextBoundingBox::getXDirAdj, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD));
public List<Zone> resolve(List<Zone> zones, boolean xyReadingOrder, boolean useDirAdjCoords) {
if (zones.isEmpty() || zones.size() == 1) {
return zones;
}
if (xyReadingOrder) {
return resolveSingleColumnReadingOrder(zones);
return resolveSingleColumnReadingOrder(zones, useDirAdjCoords);
}
Map<Long, Integer> histogram = new HashMap<>();
for (Zone zone : zones) {
long minY = Math.round(zone.getBBox().getMinY());
long maxY = Math.round(zone.getBBox().getMaxY());
Rectangle2D bbox = useDirAdjCoords ? zone.getBBoxDirAdj() : zone.getBBox();
long minY = Math.round(bbox.getMinY());
long maxY = Math.round(bbox.getMaxY());
for (long i = minY; i <= maxY; i++) {
histogram.put(i, histogram.getOrDefault(i, 0) + 1);
}
@ -43,24 +54,32 @@ public class ReadingOrderService {
.stream()
.mapToInt(Integer::intValue).average()
.orElse(1) < MULTI_COLUMN_DETECTION_THRESHOLD) {
return resolveSingleColumnReadingOrder(zones);
return resolveSingleColumnReadingOrder(zones, useDirAdjCoords);
} else {
return resolveMultiColumnReadingOder(zones);
return resolveMultiColumnReadingOder(zones, useDirAdjCoords);
}
}
private static List<Zone> resolveSingleColumnReadingOrder(List<Zone> zones) {
/**
 * Orders the zones of a single-column page by y and then by x, using {@code THRESHOLD} as
 * tolerance for both comparisons.
 * <p>
 * With {@code useDirAdjCoords} the zones are grouped by text direction, each group is sorted on
 * its direction-adjusted coordinates, and the groups are concatenated in the natural (declaration)
 * order of the direction enum. The result is a new, unmodifiable list. Without it the given list
 * is sorted in place on its regular coordinates and returned.
 *
 * @param zones           the zones to order
 * @param useDirAdjCoords whether to sort per text direction on direction-adjusted coordinates
 * @return the zones in reading order
 */
private static List<Zone> resolveSingleColumnReadingOrder(List<Zone> zones, boolean useDirAdjCoords) {

    if (useDirAdjCoords) {
        // groupingBy gives no guarantee about the map type; a hash-based map keyed by enum
        // constants iterates in identity-hash order, which may differ from run to run. Sorting the
        // groups by key makes the resulting reading order reproducible.
        return zones.stream()
                .collect(Collectors.groupingBy(TextBoundingBox::getDir))
                .entrySet()
                .stream()
                .sorted(Map.Entry.comparingByKey())
                .flatMap(group -> group.getValue()
                        .stream()
                        .sorted(COMPARATOR_DIR_ADJ))
                .toList();
    }

    zones.sort(COMPARATOR);
    return zones;
}
private List<Zone> resolveMultiColumnReadingOder(List<Zone> zones) {
private List<Zone> resolveMultiColumnReadingOder(List<Zone> zones, boolean useDirAdjCoords) {
// Simple reading order resolver for multi column page layout as described here : https://pub.towardsai.net/advanced-rag-02-unveiling-pdf-parsing-b84ae866344e
// TODO implement a more fancy reading order resolver see https://github.com/BobLd/DocumentLayoutAnalysis/blob/master/README.md#reading-order
@ -69,11 +88,12 @@ public class ReadingOrderService {
double maxX = Double.NEGATIVE_INFINITY;
for (Zone zone : zones) {
if (zone.getX() < minX) {
minX = zone.getX();
Rectangle2D bbox = useDirAdjCoords ? zone.getBBoxDirAdj() : zone.getBBox();
if (bbox.getX() < minX) {
minX = zone.getXDirAdj();
}
if (zone.getX() + zone.getWidth() > maxX) {
maxX = zone.getX() + zone.getWidth();
if (bbox.getMaxX() > maxX) {
maxX = zone.getMaxXDirAdj();
}
}
@ -82,24 +102,27 @@ public class ReadingOrderService {
List<Zone> leftOf = new ArrayList<>();
List<Zone> rightOf = new ArrayList<>();
List<Zone> middle = new ArrayList<>();
for (Zone zone : zones) {
if (zone.getX() < midLineXCoordinate && zone.getX() + zone.getWidth() < midLineXCoordinate) {
Rectangle2D bbox = useDirAdjCoords ? zone.getBBoxDirAdj() : zone.getBBox();
if (bbox.getX() < midLineXCoordinate && bbox.getX() + bbox.getWidth() < midLineXCoordinate) {
leftOf.add(zone);
} else if (zone.getX() > midLineXCoordinate && zone.getX() + zone.getWidth() > midLineXCoordinate) {
} else if (bbox.getX() > midLineXCoordinate && bbox.getX() + bbox.getWidth() > midLineXCoordinate) {
rightOf.add(zone);
} else {
middle.add(zone);
}
}
leftOf.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
rightOf.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
middle.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
if (useDirAdjCoords) {
leftOf.sort(COMPARATOR_DIR_ADJ);
rightOf.sort(COMPARATOR_DIR_ADJ);
middle.sort(COMPARATOR_DIR_ADJ);
} else {
leftOf.sort(COMPARATOR);
rightOf.sort(COMPARATOR);
middle.sort(COMPARATOR);
}
/*
List<Zone> leftNotIntersecting = new ArrayList<>();
for (Zone leftZone : leftOf) {
@ -151,8 +174,9 @@ public class ReadingOrderService {
while (itty.hasNext()) {
Zone current = itty.next();
Rectangle2D bbox = useDirAdjCoords ? current.getBBoxDirAdj() : current.getBBox();
for (int i = 0; i < sortedZones.size(); i++) {
if (current.getY() < sortedZones.get(i).getY()) {
if (bbox.getY() < sortedZones.get(i).getY()) {
sortedZones.add(i, current);
itty.remove();
break;

View File

@ -1,5 +1,7 @@
package com.knecon.fforesight.service.layoutparser.processor.docstrum.service;
import static com.knecon.fforesight.service.layoutparser.processor.model.SectionIdentifier.numericalIdentifierPattern;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashSet;
@ -21,7 +23,7 @@ public class ZoneBuilderService {
private static final double MIN_HORIZONTAL_DISTANCE_MULTIPLIER = -0.5;
private static final double MAX_VERTICAL_DISTANCE_MULTIPLIER = 1.2;
private static final double MIN_HORIZONTAL_MERGE_DISTANCE_MULTIPLIER = -3.0;
private static final double MIN_HORIZONTAL_MERGE_DISTANCE_MULTIPLIER = -7;
private static final double MAX_VERTICAL_MERGE_DISTANCE_MULTIPLIER = 0.5;
@ -38,7 +40,7 @@ public class ZoneBuilderService {
double minHorizontalDistance = characterSpacing * MIN_HORIZONTAL_DISTANCE_MULTIPLIER;
double maxVerticalDistance = lineSpacing * MAX_VERTICAL_DISTANCE_MULTIPLIER;
double minHorizontalMergeDistance = characterSpacing * MIN_HORIZONTAL_MERGE_DISTANCE_MULTIPLIER;
double minHorizontalMergeDistance = lineSpacing * MIN_HORIZONTAL_MERGE_DISTANCE_MULTIPLIER;
double maxVerticalMergeDistance = lineSpacing * MAX_VERTICAL_MERGE_DISTANCE_MULTIPLIER;
UnionFind<Line> unionFind = new UnionFind<>(new HashSet<>(lines));
@ -54,11 +56,26 @@ public class ZoneBuilderService {
return;
}
double scale = Math.min(outerLine.getHeight(), innerLine.getHeight()) / meanHeight;
scale = Math.max(MIN_LINE_SIZE_SCALE, Math.min(scale, MAX_LINE_SIZE_SCALE));
// if (!innerLine.getFontStyle().equals(outerLine.getFontStyle()) //
// && !outerLine.intersectsY(innerLine, -2f)) {
// return;
// }
double horizontalDistance = outerLine.horizontalDistance(innerLine) / scale;
double verticalDistance = outerLine.verticalDistance(innerLine) / scale;
double horizontalScale = Math.min(outerLine.getHeightDirAdj(), innerLine.getHeightDirAdj()) / meanHeight;
horizontalScale = Math.max(MIN_LINE_SIZE_SCALE, Math.min(horizontalScale, MAX_LINE_SIZE_SCALE));
double verticalScale = horizontalScale;
// if (innerLine.toString().endsWith(":")
// || outerLine.toString().endsWith(":")
// || numericalIdentifierPattern.matcher(innerLine.toString()).matches()
// || numericalIdentifierPattern.matcher(outerLine.toString()).matches()) {
//
// horizontalScale *= 5;
// verticalScale /= 10;
// }
double horizontalDistance = outerLine.horizontalDistance(innerLine) / horizontalScale;
double verticalDistance = outerLine.verticalDistance(innerLine) / verticalScale;
if ((!(minHorizontalDistance <= horizontalDistance) || !(verticalDistance <= maxVerticalDistance)) //
&& (!(minHorizontalMergeDistance <= horizontalDistance) || !(verticalDistance <= maxVerticalMergeDistance))) {
@ -87,7 +104,7 @@ public class ZoneBuilderService {
double weights = 0.0;
for (Line line : lines) {
double weight = line.getLength();
meanHeight += line.getHeight() * weight;
meanHeight += line.getHeightDirAdj() * weight;
weights += weight;
}
meanHeight /= weights;

View File

@ -32,6 +32,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.He
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.FontStyle;
public class MarkdownMapper extends AbstractNodeVisitor {
@ -297,12 +298,6 @@ public class MarkdownMapper extends AbstractNodeVisitor {
}
enum FontStyle {
REGULAR,
BOLD,
ITALIC,
BOLD_ITALIC;
}
record FontStyleChange(boolean enter, FontStyle style) {

View File

@ -18,6 +18,7 @@ import lombok.RequiredArgsConstructor;
@Data
@RequiredArgsConstructor
public class ClassificationPage {
@NonNull
@ -25,7 +26,7 @@ public class ClassificationPage {
private List<OutlineObject> outlineObjects = new ArrayList<>();
private List<AbstractPageBlock> headlines = new ArrayList<>();
private List<AbstractPageBlock> headlines = new ArrayList<>();
private List<ClassifiedImage> images = new ArrayList<>();
@ -44,7 +45,7 @@ public class ClassificationPage {
private float pageWidth;
private float pageHeight;
CleanRulings cleanRulings;
private CleanRulings cleanRulings;
private Map<String, List<Rectangle2D>> markedContentBboxPerType = new HashMap<>();

View File

@ -12,10 +12,10 @@ import lombok.Getter;
@Getter
public class FloatFrequencyCounter {
Map<Float, Integer> countPerValue = new HashMap<>();
Map<Double, Integer> countPerValue = new HashMap<>();
public void add(float value) {
public void add(double value) {
if (!countPerValue.containsKey(value)) {
countPerValue.put(value, 1);
@ -25,9 +25,9 @@ public class FloatFrequencyCounter {
}
public void addAll(Map<Float, Integer> otherCounter) {
public void addAll(Map<Double, Integer> otherCounter) {
for (Map.Entry<Float, Integer> entry : otherCounter.entrySet()) {
for (Map.Entry<Double, Integer> entry : otherCounter.entrySet()) {
if (countPerValue.containsKey(entry.getKey())) {
countPerValue.put(entry.getKey(), countPerValue.get(entry.getKey()) + entry.getValue());
} else {
@ -37,10 +37,10 @@ public class FloatFrequencyCounter {
}
public Float getMostPopular() {
public Double getMostPopular() {
Map.Entry<Float, Integer> mostPopular = null;
for (Map.Entry<Float, Integer> entry : countPerValue.entrySet()) {
Map.Entry<Double, Integer> mostPopular = null;
for (Map.Entry<Double, Integer> entry : countPerValue.entrySet()) {
if (mostPopular == null || entry.getValue() >= mostPopular.getValue()) {
mostPopular = entry;
}
@ -49,11 +49,11 @@ public class FloatFrequencyCounter {
}
public List<Float> getHighterThanMostPopular() {
public List<Double> getHigherThanMostPopular() {
Float mostPopular = getMostPopular();
List<Float> higher = new ArrayList<>();
for (Float value : countPerValue.keySet()) {
Double mostPopular = getMostPopular();
List<Double> higher = new ArrayList<>();
for (Double value : countPerValue.keySet()) {
if (value > mostPopular) {
higher.add(value);
}
@ -63,10 +63,10 @@ public class FloatFrequencyCounter {
}
public Float getHighest() {
public Double getHighest() {
Float highest = null;
for (Float value : countPerValue.keySet()) {
Double highest = null;
for (Double value : countPerValue.keySet()) {
if (highest == null || value > highest) {
highest = value;
}

View File

@ -15,7 +15,7 @@ import lombok.experimental.FieldDefaults;
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class SectionIdentifier {
static Pattern numericalIdentifierPattern = Pattern.compile("^[\\s]?(\\d+)[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?");
public static Pattern numericalIdentifierPattern = Pattern.compile("^[\\s]?(\\d+)[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?");
public enum Format {
EMPTY,

View File

@ -145,10 +145,7 @@ public class AtomicTextBlock implements TextBlock {
}
public static AtomicTextBlock fromAtomicTextBlockData(DocumentTextData documentTextData,
DocumentPositionData documentPositionData,
SemanticNode parent,
Page page) {
public static AtomicTextBlock fromAtomicTextBlockData(DocumentTextData documentTextData, DocumentPositionData documentPositionData, SemanticNode parent, Page page) {
return AtomicTextBlock.builder()
.id(documentTextData.getId())
@ -156,8 +153,10 @@ public class AtomicTextBlock implements TextBlock {
.page(page)
.textRange(new TextRange(documentTextData.getStart(), documentTextData.getEnd()))
.searchText(documentTextData.getSearchText())
.lineBreaks(Arrays.stream(documentTextData.getLineBreaks()).boxed().toList())
.stringIdxToPositionIdx(Arrays.stream(documentPositionData.getStringIdxToPositionIdx()).boxed().toList())
.lineBreaks(Arrays.stream(documentTextData.getLineBreaks()).boxed()
.toList())
.stringIdxToPositionIdx(Arrays.stream(documentPositionData.getStringIdxToPositionIdx()).boxed()
.toList())
.positions(toRectangle2DList(documentPositionData.getPositions()))
.parent(parent)
.build();
@ -166,7 +165,9 @@ public class AtomicTextBlock implements TextBlock {
private static List<Rectangle2D> toRectangle2DList(float[][] positions) {
return Arrays.stream(positions).map(floatArr -> (Rectangle2D) new Rectangle2D.Float(floatArr[0], floatArr[1], floatArr[2], floatArr[3])).toList();
return Arrays.stream(positions)
.map(floatArr -> (Rectangle2D) new Rectangle2D.Float(floatArr[0], floatArr[1], floatArr[2], floatArr[3]))
.toList();
}
@ -176,6 +177,9 @@ public class AtomicTextBlock implements TextBlock {
throw new IndexOutOfBoundsException(format("line %d out of range for AtomicTextBlock with %d lines", lineNumber, numberOfLines()));
}
if (lineNumber == 0) {
if (lineBreaks.isEmpty()) {
return searchText;
}
return subSequence(textRange.start(), lineBreaks.get(0) + textRange.start());
} else if (lineNumber == numberOfLines() - 1) {
return subSequence(lineBreaks.get(lineBreaks.size() - 1) + textRange.start(), textRange.end());
@ -195,9 +199,9 @@ public class AtomicTextBlock implements TextBlock {
public int getNextLinebreak(int fromIndex) {
return lineBreaks.stream()//
.filter(linebreak -> linebreak > fromIndex - textRange.start()) //
.findFirst() //
.orElse(searchText.length()) + textRange.start();
.filter(linebreak -> linebreak > fromIndex - textRange.start()) //
.findFirst() //
.orElse(searchText.length()) + textRange.start();
}
@ -205,9 +209,9 @@ public class AtomicTextBlock implements TextBlock {
public int getPreviousLinebreak(int fromIndex) {
return lineBreaks.stream()//
.filter(linebreak -> linebreak <= fromIndex - textRange.start())//
.reduce((a, b) -> b)//
.orElse(0) + textRange.start();
.filter(linebreak -> linebreak <= fromIndex - textRange.start())//
.reduce((a, b) -> b)//
.orElse(0) + textRange.start();
}
@ -255,7 +259,10 @@ public class AtomicTextBlock implements TextBlock {
protected List<Integer> getAllLineBreaksInBoundary(TextRange textRange) {
return getLineBreaks().stream().map(linebreak -> linebreak + this.textRange.start()).filter(textRange::contains).toList();
return getLineBreaks().stream()
.map(linebreak -> linebreak + this.textRange.start())
.filter(textRange::contains)
.toList();
}

View File

@ -10,8 +10,8 @@ import lombok.NonNull;
import lombok.RequiredArgsConstructor;
@Data
@RequiredArgsConstructor
@AllArgsConstructor
@RequiredArgsConstructor
public class ClassifiedImage {
@NonNull
@ -20,11 +20,19 @@ public class ClassifiedImage {
private ImageType imageType;
private boolean sourceByAi;
private boolean isAppendedToSection;
@NonNull
private boolean hasTransparency;
@NonNull
private int page;
@NonNull
private String representation;
/**
 * Creates a classified image from the five values passed in and stores each one
 * unchanged in the field of the same name.
 * <p>
 * Fields that are not parameters of this constructor (for example {@code sourceByAi}
 * and {@code isAppendedToSection}) are not assigned here and keep their default values.
 *
 * @param position        bounding rectangle of the image; must not be {@code null}
 * @param imageType       classification assigned to the image; must not be {@code null}
 * @param hasTransparency transparency flag stored on the image
 * @param page            page value stored on the image
 *                        (NOTE(review): whether this is 0- or 1-based is not visible here - confirm)
 * @param representation  representation string of the image; must not be {@code null}
 *                        (NOTE(review): exact format is not visible here - confirm against the producer)
 * @throws NullPointerException if a parameter annotated with {@code @NonNull} is {@code null};
 *                              the check is generated by Lombok's {@code @NonNull}, not written by hand
 */
public ClassifiedImage(@NonNull Rectangle2D position, @NonNull ImageType imageType, boolean hasTransparency, int page, @NonNull String representation) {
this.position = position;
this.imageType = imageType;
this.hasTransparency = hasTransparency;
this.page = page;
this.representation = representation;
}
}

View File

@ -35,14 +35,14 @@ public class Cell extends BoundingBox {
public Cell(Point2D topLeft, Point2D bottomRight) {
this.bBoxInitialUserSpace = new Rectangle2D.Double(topLeft.getX(), topLeft.getY(), (bottomRight.getX() - topLeft.getX()), (bottomRight.getY() - topLeft.getY()));
this.bBox = bBoxInitialUserSpace;
this.bBoxPdf = new Rectangle2D.Double(topLeft.getX(), topLeft.getY(), (bottomRight.getX() - topLeft.getX()), (bottomRight.getY() - topLeft.getY()));
this.bBox = bBoxPdf;
}
public Cell(Rectangle2D bBoxInitialUserSpace, AffineTransform initialUserSpaceToJava) {
this.bBoxInitialUserSpace = bBoxInitialUserSpace;
this.bBoxPdf = bBoxInitialUserSpace;
this.bBox = initialUserSpaceToJava.createTransformedShape(bBoxInitialUserSpace).getBounds2D();
}
@ -50,7 +50,7 @@ public class Cell extends BoundingBox {
public static Cell copy(Cell cell) {
Cell copy = new Cell();
copy.bBoxInitialUserSpace = cell.bBoxInitialUserSpace;
copy.bBoxPdf = cell.bBoxPdf;
copy.bBox = cell.bBox;
return copy;
}

View File

@ -70,7 +70,7 @@ public class CleanRulings {
public boolean lineBetween(BoundingBox a, BoundingBox b) {
return lineBetween(a.getBBoxInitialUserSpace(), b.getBBoxInitialUserSpace());
return lineBetween(a.getBBoxPdf(), b.getBBoxPdf());
}

View File

@ -263,8 +263,8 @@ public class TablePageBlock extends AbstractPageBlock {
cells.stream()
.map(originalCell -> new CellWithIntersection(originalCell,
RectangleTransformations.calculateIntersectedArea(cellFromGridStructure.getBBoxInitialUserSpace(),
originalCell.getBBoxInitialUserSpace())))
RectangleTransformations.calculateIntersectedArea(cellFromGridStructure.getBBoxPdf(),
originalCell.getBBoxPdf())))
.filter(cellWithIntersection -> cellWithIntersection.intersectedArea > 0)
.filter(cellWithIntersection -> cellWithIntersection.originalCell.getArea() > cellWithIntersection.intersectedArea * CELL_AREA_CONTAINED_THRESHOLD)
.max(Comparator.comparing(CellWithIntersection::intersectedArea))

View File

@ -0,0 +1,9 @@
package com.knecon.fforesight.service.layoutparser.processor.model.text;
/**
 * Font style of a piece of text: regular, bold, italic, or bold and italic combined.
 * <p>
 * The declaration order of the constants is part of the contract for any code that
 * relies on {@link #values()} or {@link #ordinal()}; append new styles at the end.
 */
public enum FontStyle {

    /** Neither bold nor italic. */
    REGULAR,

    /** Bold only. */
    BOLD,

    /** Italic only. */
    ITALIC,

    /** Both bold and italic. */
    BOLD_ITALIC
}

View File

@ -5,64 +5,50 @@ import java.awt.geom.Rectangle2D;
import org.apache.pdfbox.text.TextPosition;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.TextBoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.FastAtan2;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
@Data
@Builder
@SuperBuilder
@NoArgsConstructor
@AllArgsConstructor
public class RedTextPosition extends BoundingBox {
@EqualsAndHashCode(callSuper = true)
@FieldDefaults(level = AccessLevel.PRIVATE)
public class RedTextPosition extends TextBoundingBox {
public final static int HEIGHT_PADDING = 2;
private Rectangle2D.Float bBoxDirAdj; // adjusted to text rotation
String unicode;
@JsonIgnore
private int rotation;
// estimated using the TextMatrix in radians
float exactDir;
@JsonIgnore
private float pageHeight;
float widthOfSpace;
@JsonIgnore
private float pageWidth;
float fontSizeInPt;
private String unicode;
@JsonIgnore
private float dir;
// not used in reanalysis
@JsonIgnore
private float widthOfSpace;
// not used in reanalysis
@JsonIgnore
private float fontSizeInPt;
// not used in reanalysis
@JsonIgnore
private String fontName;
String fontName;
@SneakyThrows
public static RedTextPosition fromTextPosition(TextPosition textPosition) {
var pos = new RedTextPosition();
pos.setRotation(textPosition.getRotation());
pos.setPageHeight(textPosition.getPageHeight());
pos.setPageWidth(textPosition.getPageWidth());
pos.setUnicode(textPosition.getUnicode());
pos.setDir(textPosition.getDir());
pos.setWidthOfSpace(textPosition.getWidthOfSpace());
pos.setFontSizeInPt(textPosition.getFontSizeInPt());
pos.setFontName(textPosition.getFont().getName());
pos.setExactDir((float) FastAtan2.fastAtan2(textPosition.getTextMatrix().getShearY(), textPosition.getTextMatrix().getScaleX()));
pos.setDir(TextDirection.fromDegrees(textPosition.getDir()));
//TODO: There is a mismatch in the java coords of the text and the rulings,
// I guess if we start with the initial user space positions and transform them the same way we do the rulings it would work.
@ -73,18 +59,18 @@ public class RedTextPosition extends BoundingBox {
textPosition.getYDirAdj() - textHeight,
textPosition.getWidthDirAdj(),
textHeight + HEIGHT_PADDING);
pos.setBBoxDirAdj(dirAdjPosition);
AffineTransform affineTransform = getRotationMatrix(TextDirection.fromDegrees(textPosition.getDir()), textPosition.getPageWidth(), textPosition.getPageHeight());
Rectangle2D bBoxInitialUserSpace = affineTransform.createTransformedShape(dirAdjPosition).getBounds2D();
pos.setBBoxInitialUserSpace(bBoxInitialUserSpace); // These are definitely correct
pos.setBBoxPdf(bBoxInitialUserSpace); // These are definitely correct
return pos;
}
private static AffineTransform getRotationMatrix(TextDirection textDirection, float pageWidth, float pageHeight) {
AffineTransform transform = new AffineTransform();
@ -103,32 +89,4 @@ public class RedTextPosition extends BoundingBox {
return transform;
}
@JsonIgnore
public float getXDirAdj() {
return this.bBoxDirAdj.x;
}
@JsonIgnore
public float getYDirAdj() {
return this.bBoxDirAdj.y;
}
@JsonIgnore
public float getWidthDirAdj() {
return this.bBoxDirAdj.width;
}
@JsonIgnore
public float getHeightDir() {
return this.bBoxDirAdj.height;
}
}

View File

@ -44,4 +44,15 @@ public enum TextDirection {
throw new IllegalArgumentException(String.format("A value of %f is not supported by TextDirection", degrees));
}
/**
 * Maps this text direction to an integer index in the range 0-3:
 * {@code ZERO -> 0}, {@code QUARTER_CIRCLE -> 1}, {@code HALF_CIRCLE -> 2},
 * {@code THREE_QUARTER_CIRCLE -> 3}.
 * <p>
 * NOTE(review): judging by the constant names the result is the number of quarter
 * turns, not an angle in degrees - confirm against the callers before relying on it.
 *
 * @return the index (0, 1, 2 or 3) associated with this direction
 */
public int getRotation() {
// Switch expression without a default branch: the compiler rejects this method
// if a constant is ever added to the enum without being mapped here.
return switch (this) {
case ZERO -> 0;
case QUARTER_CIRCLE -> 1;
case HALF_CIRCLE -> 2;
case THREE_QUARTER_CIRCLE -> 3;
};
}
}

View File

@ -8,6 +8,7 @@ import com.fasterxml.jackson.annotation.JsonIgnore;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities;
import lombok.AllArgsConstructor;
@ -26,17 +27,19 @@ public class TextPageBlock extends AbstractPageBlock {
@Builder.Default
private List<TextPositionSequence> sequences = new ArrayList<>();
private Rectangle2D bBoxDirAdj;
private String mostPopularWordFont;
private String mostPopularWordStyle;
private float mostPopularWordFontSize;
private double mostPopularWordFontSize;
private float mostPopularWordHeight;
private double mostPopularWordHeight;
private float mostPopularWordSpaceWidth;
private double mostPopularWordSpaceWidth;
private float highestFontSize;
private double highestFontSize;
private PageBlockType classification;
@ -51,34 +54,24 @@ public class TextPageBlock extends AbstractPageBlock {
}
@JsonIgnore
public TextDirection getDir() {
return sequences.get(0).getDir();
}
@JsonIgnore
public float getPageHeight() {
return sequences.get(0).getPageHeight();
}
@JsonIgnore
public float getPageWidth() {
return sequences.get(0).getPageWidth();
}
private void calculateBBox() {
if (sequences == null) {
this.bBox = new Rectangle2D.Double();
this.bBoxInitialUserSpace = new Rectangle2D.Double();
this.bBoxPdf = new Rectangle2D.Double();
this.bBoxDirAdj = new Rectangle2D.Double();
return;
}
this.bBoxDirAdj = sequences.stream()
.map(TextPositionSequence::getBBoxDirAdj)
.collect(RectangleTransformations.collectBBox());
setToBBoxOfComponents(sequences);
}

View File

@ -1,6 +1,7 @@
package com.knecon.fforesight.service.layoutparser.processor.model.text;
import java.awt.geom.Rectangle2D;
import static com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition.HEIGHT_PADDING;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
@ -8,8 +9,7 @@ import java.util.stream.Collectors;
import org.apache.pdfbox.text.TextPosition;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.TextBoundingBox;
import lombok.AllArgsConstructor;
import lombok.Builder;
@ -23,23 +23,21 @@ import lombok.extern.slf4j.Slf4j;
@Builder
@NoArgsConstructor
@AllArgsConstructor
@EqualsAndHashCode(onlyExplicitlyIncluded = true, callSuper = false)
public class TextPositionSequence extends BoundingBox implements CharSequence {
@EqualsAndHashCode(onlyExplicitlyIncluded = true, callSuper = true) // needs the bbox to be unique
public class TextPositionSequence extends TextBoundingBox implements CharSequence {
public static final int HEIGHT_PADDING = 2;
public static final String STANDARD = "standard";
public static final String BOLD_ITALIC = "bold, italic";
public static final String BOLD = "bold";
public static final String ITALIC = "italic";
@EqualsAndHashCode.Include
private int page;
@EqualsAndHashCode.Include
@Builder.Default
private List<RedTextPosition> textPositions = new ArrayList<>();
private Rectangle2D bBoxDirAdj;
@EqualsAndHashCode.Include
private TextDirection dir;
private int rotation;
private float pageHeight;
private float pageWidth;
private boolean isParagraphStart;
private boolean strikethrough;
private boolean underline;
@ -51,10 +49,6 @@ public class TextPositionSequence extends BoundingBox implements CharSequence {
.map(RedTextPosition::fromTextPosition)
.collect(Collectors.toList());
this.page = pageNumber;
this.dir = TextDirection.fromDegrees(textPositions.get(0).getDir());
this.rotation = textPositions.get(0).getRotation();
this.pageHeight = textPositions.get(0).getPageHeight();
this.pageWidth = textPositions.get(0).getPageWidth();
this.isParagraphStart = isParagraphStart;
calculateBBox();
}
@ -62,9 +56,6 @@ public class TextPositionSequence extends BoundingBox implements CharSequence {
private void calculateBBox() {
this.bBoxDirAdj = textPositions.stream()
.map(RedTextPosition::getBBoxDirAdj)
.collect(RectangleTransformations.collectBBox());
setToBBoxOfComponents(getTextPositions());
}
@ -73,10 +64,6 @@ public class TextPositionSequence extends BoundingBox implements CharSequence {
this.textPositions = textPositions;
this.page = page;
this.dir = TextDirection.fromDegrees(textPositions.get(0).getDir());
this.rotation = textPositions.get(0).getRotation();
this.pageHeight = textPositions.get(0).getPageHeight();
this.pageWidth = textPositions.get(0).getPageWidth();
calculateBBox();
}
@ -112,9 +99,6 @@ public class TextPositionSequence extends BoundingBox implements CharSequence {
textPositionSequence.textPositions = textPositions.subList(start, end);
textPositionSequence.page = page;
textPositionSequence.dir = dir;
textPositionSequence.rotation = rotation;
textPositionSequence.pageHeight = pageHeight;
textPositionSequence.pageWidth = pageWidth;
textPositionSequence.setToBBoxOfComponents(getTextPositions());
return textPositionSequence;
}
@ -141,10 +125,6 @@ public class TextPositionSequence extends BoundingBox implements CharSequence {
this.textPositions.add(textPosition);
this.page = textPositionSequence.getPage();
this.dir = textPositionSequence.getDir();
this.rotation = textPositionSequence.getRotation();
this.pageHeight = textPositionSequence.getPageHeight();
this.pageWidth = textPositionSequence.getPageWidth();
calculateBBox();
}
@ -152,79 +132,18 @@ public class TextPositionSequence extends BoundingBox implements CharSequence {
public void add(TextPosition textPosition) {
this.textPositions.add(RedTextPosition.fromTextPosition(textPosition));
this.dir = TextDirection.fromDegrees(textPositions.get(0).getDir());
this.rotation = textPositions.get(0).getRotation();
this.pageHeight = textPositions.get(0).getPageHeight();
this.pageWidth = textPositions.get(0).getPageWidth();
calculateBBox();
}
public double getTextHeightNoPadding() {
/**
* This value is adjusted so that 0,0 is upper left and it is adjusted based on the text direction.
* This method ignores the page rotation but takes the text rotation and adjusts the coordinates to awt.
*
* @return the text direction adjusted minX value
*/
public float getMinXDirAdj() {
return textPositions.get(0).getXDirAdj();
return textPositions.get(0).getHeightDirAdj();
}
/**
* This value is adjusted so that 0,0 is upper left and it is adjusted based on the text direction.
* This method ignores the page rotation but takes the text rotation and adjusts the coordinates to awt.
*
* @return the text direction adjusted maxX value
*/
public double getTextHeight() {
public float getMaxXDirAdj() {
return textPositions.get(textPositions.size() - 1).getXDirAdj() + textPositions.get(textPositions.size() - 1).getWidthDirAdj() + HEIGHT_PADDING;
}
/**
* This value is adjusted so that 0,0 is upper left and it is adjusted based on the text direction.
* This method ignores the page rotation but takes the text rotation and adjusts the coordinates to awt.
*
* @return the text direction adjusted minY value. The upper border of the bounding box of the word.
*/
public float getMinYDirAdj() {
return textPositions.get(0).getYDirAdj() - getTextHeight();
}
/**
* This value is adjusted so that 0,0 is upper left and it is adjusted based on the text direction.
* This method ignores the page rotation but takes the text rotation and adjusts the coordinates to awt.
*
* @return the text direction adjusted maxY value. The lower border of the bounding box of the word.
*/
public float getMaxYDirAdj() {
return textPositions.get(0).getYDirAdj();
}
public float getTextHeightNoPadding() {
return textPositions.get(0).getHeightDir();
}
public float getTextHeight() {
return textPositions.get(0).getHeightDir() + HEIGHT_PADDING;
return textPositions.get(0).getHeightDirAdj() + HEIGHT_PADDING;
}
@ -240,18 +159,18 @@ public class TextPositionSequence extends BoundingBox implements CharSequence {
public String getFontStyle() {
if (textPositions.get(0).getFontName() == null) {
return "standard";
return STANDARD;
}
String lowercaseFontName = textPositions.get(0).getFontName().toLowerCase(Locale.ROOT);
if (lowercaseFontName.contains("bold") && lowercaseFontName.contains("italic")) {
return "bold, italic";
} else if (lowercaseFontName.contains("bold")) {
return "bold";
} else if (lowercaseFontName.contains("italic")) {
return "italic";
if (lowercaseFontName.contains(BOLD) && lowercaseFontName.contains(ITALIC)) {
return BOLD_ITALIC;
} else if (lowercaseFontName.contains(BOLD)) {
return BOLD;
} else if (lowercaseFontName.contains(ITALIC)) {
return ITALIC;
} else {
return "standard";
return STANDARD;
}
}

View File

@ -56,7 +56,7 @@ public class ImageServiceResponseAdapter {
classificationPage.getImages().forEach(image -> {
if (image.getImageType().equals(ImageType.OTHER)) {
for (AbstractPageBlock textblock : classificationPage.getTextBlocks()) {
if (image.getPosition().contains(textblock.getBBoxInitialUserSpace())) {
if (image.getPosition().contains(textblock.getBBoxPdf())) {
image.setImageType(ImageType.OCR);
return;
}

View File

@ -14,6 +14,7 @@ import lombok.NoArgsConstructor;
@AllArgsConstructor
public class Classification {
@Builder.Default
private Map<String, Float> probabilities = new HashMap<>();
private String label;

View File

@ -22,8 +22,10 @@ public class ImageServiceResponse {
@JsonProperty(value = "imageMetadata")
@JsonAlias("data")
@Builder.Default
private List<ImageMetadata> data = new ArrayList<>();
@Builder.Default
private List<ImageMetadata> dataCV = new ArrayList<>();

View File

@ -15,6 +15,7 @@ import lombok.NoArgsConstructor;
public class TableData {
private PageInfo pageInfo;
@Builder.Default
private List<TableCells> tableCells = new ArrayList<>();
}

View File

@ -19,7 +19,7 @@ public class TableServiceResponse {
private String operation;
private String targetFileExtension;
private String responseFileExtension;
@Builder.Default
private List<TableData> data = new ArrayList<>();
}

View File

@ -6,7 +6,6 @@ import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import lombok.AllArgsConstructor;
import lombok.experimental.UtilityClass;
@ -38,7 +37,7 @@ public class GapDetectionService {
for (TextPositionSequence currentTextPosition : sortedTextPositionSequences.subList(1, sortedTextPositionSequences.size())) {
double yDifference = Math.abs(currentTextPosition.getMaxYDirAdj() - previousTextPosition.getMaxYDirAdj());
double xGap = Math.abs(previousTextPosition.getMaxXDirAdj() - currentTextPosition.getMinXDirAdj());
double xGap = Math.abs(previousTextPosition.getMaxXDirAdj() - currentTextPosition.getXDirAdj());
Rectangle2D previousTextPositionBBox = toRectangle2D(previousTextPosition);
Rectangle2D currentTextPositionBBox = toRectangle2D(currentTextPosition);

View File

@ -71,7 +71,7 @@ public class LineDetectionService {
private static boolean isXGap(TextPositionSequence currentTextPosition, TextPositionSequence previousTextPosition, double avgTextPositionHeight) {
return Math.abs(previousTextPosition.getMaxXDirAdj() - currentTextPosition.getMinXDirAdj()) > (avgTextPositionHeight * X_GAP_FACTOR);
return Math.abs(previousTextPosition.getMaxXDirAdj() - currentTextPosition.getXDirAdj()) > (avgTextPositionHeight * X_GAP_FACTOR);
}
@ -83,7 +83,7 @@ public class LineDetectionService {
private static boolean isNewLine(TextPositionSequence currentTextPosition, TextPositionSequence previousTextPosition, double avgTextPositionHeight) {
return Math.abs(previousTextPosition.getMinYDirAdj() - currentTextPosition.getMinYDirAdj()) > avgTextPositionHeight;
return Math.abs(previousTextPosition.getYDirAdj() - currentTextPosition.getYDirAdj()) > avgTextPositionHeight;
}

View File

@ -78,7 +78,7 @@ public class TableExtractionService {
List<Cell> containedCells = new ArrayList<>();
for (Cell c : cells) {
if (c.hasMinimumSize() && area.contains(c.getBBoxInitialUserSpace())) {
if (c.hasMinimumSize() && area.contains(c.getBBoxPdf())) {
containedCells.add(c);
}
}

View File

@ -31,13 +31,13 @@ public class TextRulingsClassifier {
private static void handleVerticalText(CleanRulings cleanRulings, TextPositionSequence word) {
float lowerY = (float) (word.getBBoxInitialUserSpace().getMinY() + TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth());
float upperY = (float) (word.getBBoxInitialUserSpace().getMaxY() - TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth());
float lowerY = (float) (word.getBBoxPdf().getMinY() + TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth());
float upperY = (float) (word.getBBoxPdf().getMaxY() - TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth());
float strikethroughCenterX = (float) word.getBBoxInitialUserSpace().getCenterX();
float strikethroughCenterX = (float) word.getBBoxPdf().getCenterX();
float strikethroughBoxHeight = (float) ((word.getBBoxDirAdj().getHeight() * STRIKETHROUGH_ZONE) / 2);
float underlineCenterX = (float) (word.getDir().equals(TextDirection.QUARTER_CIRCLE) ? word.getBBoxInitialUserSpace().getMaxX() : word.getBBoxInitialUserSpace().getMinX());
float underlineCenterX = (float) (word.getDir().equals(TextDirection.QUARTER_CIRCLE) ? word.getBBoxPdf().getMaxX() : word.getBBoxPdf().getMinX());
float underlineBoxHeight = (float) ((word.getBBoxDirAdj().getHeight() * UNDERLINE_ZONE) / 2);
float leftX = Math.min(underlineCenterX - underlineBoxHeight, strikethroughCenterX - strikethroughBoxHeight);
@ -65,13 +65,13 @@ public class TextRulingsClassifier {
private static void handleHorizontalText(CleanRulings cleanRulings, TextPositionSequence word) {
float leftX = (float) (word.getBBoxInitialUserSpace().getMinX() + TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth());
float rightX = (float) (word.getBBoxInitialUserSpace().getMaxX() - TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth());
float leftX = (float) (word.getBBoxPdf().getMinX() + TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth());
float rightX = (float) (word.getBBoxPdf().getMaxX() - TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth());
float strikethroughCenterY = (float) word.getBBoxInitialUserSpace().getCenterY();
float strikethroughCenterY = (float) word.getBBoxPdf().getCenterY();
float strikethroughBoxHeight = (float) ((word.getBBoxDirAdj().getHeight() * STRIKETHROUGH_ZONE) / 2);
float underlineCenterY = (float) (word.getDir().equals(TextDirection.ZERO) ? word.getBBoxInitialUserSpace().getMinY() : word.getBBoxInitialUserSpace().getMaxY());
float underlineCenterY = (float) (word.getDir().equals(TextDirection.ZERO) ? word.getBBoxPdf().getMinY() : word.getBBoxPdf().getMaxY());
float underlineBoxHeight = (float) ((word.getBBoxDirAdj().getHeight() * UNDERLINE_ZONE) / 2);
float lowerY = Math.min(underlineCenterY - underlineBoxHeight, strikethroughCenterY - strikethroughBoxHeight);

View File

@ -2,12 +2,10 @@ package com.knecon.fforesight.service.layoutparser.processor.services.blockifica
import static com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocstrumBlockificationService.buildTextBlock;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.List;
import java.util.ListIterator;
import java.util.Locale;
import java.util.function.Function;
import org.apache.commons.lang3.StringUtils;
import org.springframework.stereotype.Service;
@ -19,8 +17,6 @@ import com.knecon.fforesight.service.layoutparser.processor.model.SectionIdentif
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.services.factory.SearchTextWithTextPositionFactory;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import lombok.Data;
@ -29,14 +25,6 @@ public class BlockificationPostprocessingService {
private static final float BLOCK_TO_OUTLINE_DISTANCE_THRESHOLD = 5.0f;
private static final Function<TextPageBlock, Rectangle2D> blockToBoundingBox = (abstractPageBlock) -> abstractPageBlock.getSequences()
.stream()
.map(textPositionSequence -> textPositionSequence.getTextPositions()
.stream()
.map(tp -> SearchTextWithTextPositionFactory.mapRedTextPositionToInitialUserSpace(tp, textPositionSequence))
.collect(RectangleTransformations.collectBBox()))
.collect(RectangleTransformations.collectBBox());
public OutlineObject sanitizeOutlineBlocks(ClassificationPage classificationPage, OutlineObject notFoundOutlineObject) {
@ -63,13 +51,13 @@ public class BlockificationPostprocessingService {
}
if (!contextsOverlap(notFoundOutlineObjectProcessionContext, firstOutlineObjectProcessionContext)) {
notFoundOutlineObject.setFound(selectMatch(classificationPage, notFoundOutlineObjectProcessionContext));
notFoundOutlineObject.setFound(selectMatch(classificationPage, notFoundOutlineObjectProcessionContext, pageHeight));
}
if (firstOutlineObject != null) {
// re-create the context for the updated blocks
firstOutlineObjectProcessionContext = new OutlineProcessionContext(firstOutlineObject);
processTextBlocks(getTextPageBlocks(classificationPage), pageHeight, firstOutlineObjectProcessionContext);
firstOutlineObject.setFound(selectMatch(classificationPage, firstOutlineObjectProcessionContext));
firstOutlineObject.setFound(selectMatch(classificationPage, firstOutlineObjectProcessionContext, pageHeight));
}
}
@ -77,7 +65,7 @@ public class BlockificationPostprocessingService {
outlineObjectListIterator.forEachRemaining(outlineObject -> {
OutlineProcessionContext outlineObjectProcessionContext = new OutlineProcessionContext(outlineObject);
processTextBlocks(getTextPageBlocks(classificationPage), pageHeight, outlineObjectProcessionContext);
outlineObject.setFound(selectMatch(classificationPage, outlineObjectProcessionContext));
outlineObject.setFound(selectMatch(classificationPage, outlineObjectProcessionContext, pageHeight));
});
if (!outlineObjects.isEmpty()) {
@ -160,7 +148,7 @@ public class BlockificationPostprocessingService {
}
private boolean selectMatch(ClassificationPage classificationPage, OutlineProcessionContext context) {
private boolean selectMatch(ClassificationPage classificationPage, OutlineProcessionContext context, float pageHeight) {
OutlineObject outlineObject = context.outlineObject;
TextPageBlock directMatch = context.directMatch;
@ -168,8 +156,8 @@ public class BlockificationPostprocessingService {
TextPageBlock splitCandidate = context.splitCandidate;
PageBlockType headlineType = PageBlockType.getHeadlineType(outlineObject.getTreeDepth());
double distanceToDirectMatch = directMatch != null ? calculateDistance(outlineObject, directMatch) : Double.MAX_VALUE;
double distanceToSplitCandidate = splitCandidate != null ? calculateDistance(outlineObject, splitCandidate) : Double.MAX_VALUE;
double distanceToDirectMatch = directMatch != null ? calculateDistance(outlineObject, directMatch, pageHeight) : Double.MAX_VALUE;
double distanceToSplitCandidate = splitCandidate != null ? calculateDistance(outlineObject, splitCandidate, pageHeight) : Double.MAX_VALUE;
double distanceToBestMergeCandidates = Double.MAX_VALUE;
List<TextPageBlock> bestMergeCandidateCombination = new ArrayList<>();
@ -189,7 +177,7 @@ public class BlockificationPostprocessingService {
for (List<TextPageBlock> combination : combinations) {
double averageDistance = combination.stream()
.map(block -> calculateDistance(outlineObject, block))
.map(block -> calculateDistance(outlineObject, block, pageHeight))
.mapToDouble(Double::doubleValue).average()
.orElse(Double.MAX_VALUE);
if (distanceToBestMergeCandidates > averageDistance) {
@ -273,7 +261,7 @@ public class BlockificationPostprocessingService {
List<TextPositionSequence> postSequence = new ArrayList<>();
StringBuilder currentSequence = new StringBuilder();
if (target.isBlank()){
if (target.isBlank()) {
return new WordSequenceResult();
}
@ -418,10 +406,10 @@ public class BlockificationPostprocessingService {
}
private double calculateDistance(OutlineObject outlineObject, TextPageBlock pageBlock) {
private double calculateDistance(OutlineObject outlineObject, TextPageBlock pageBlock, float pageHeight) {
double deltaX = outlineObject.getPoint().getX() - pageBlock.getMinX();
double deltaY = pageBlock.getPageHeight() - outlineObject.getPoint().getY() - pageBlock.getMinY();
double deltaY = pageHeight - outlineObject.getPoint().getY() - pageBlock.getMinY();
return Math.sqrt(deltaX * deltaX + deltaY * deltaY);
}

View File

@ -1,7 +1,6 @@
package com.knecon.fforesight.service.layoutparser.processor.services.blockification;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.ListIterator;
@ -10,7 +9,6 @@ import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.DocstrumSegmentationService;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.DoubleUtils;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
@ -40,7 +38,7 @@ public class DocstrumBlockificationService {
CleanRulings usedRulings = rulings.withoutTextRulings();
var zones = docstrumSegmentationService.segmentPage(textPositions, xyOrder, usedRulings, visualizations);
List<Zone> zones = docstrumSegmentationService.segmentPage(textPositions, xyOrder, usedRulings, visualizations);
if (!textPositions.isEmpty()) {
visualizations.addZoneVisualizations(zones, textPositions.get(0).getPage());
@ -48,11 +46,7 @@ public class DocstrumBlockificationService {
visualizations.addCharactersWithNeighbours(zones, textPositions.get(0).getPage());
}
var pageBlocks = toAbstractPageBlocks(zones, xyOrder, usedRulings);
if (xyOrder) {
sortPageBlocksXThenY(pageBlocks);
}
var pageBlocks = toAbstractPageBlocks(zones);
var classificationPage = new ClassificationPage(pageBlocks);
classificationPage.setCleanRulings(rulings);
@ -73,21 +67,7 @@ public class DocstrumBlockificationService {
}
private static void sortPageBlocksXThenY(List<AbstractPageBlock> pageBlocks) {
pageBlocks.sort(Comparator.comparing(AbstractPageBlock::getMinY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
.thenComparing(AbstractPageBlock::getMinX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
pageBlocks.sort(new Comparator<AbstractPageBlock>() {
@Override
public int compare(AbstractPageBlock o1, AbstractPageBlock o2) {
return Math.abs(o1.getMinY() - o2.getMinY()) < 5 && o1.getMinX() < o2.getMinX() == true ? -1 : 0;
}
});
}
private List<AbstractPageBlock> toAbstractPageBlocks(List<Zone> zones, boolean xyOrder, CleanRulings usedRulings) {
private List<AbstractPageBlock> toAbstractPageBlocks(List<Zone> zones) {
List<AbstractPageBlock> abstractPageBlocks = new ArrayList<>();
zones.forEach(zone -> {
@ -296,6 +276,10 @@ public class DocstrumBlockificationService {
continue;
}
// if (!current.getMostPopularWordStyle().equals(inner.getMostPopularWordStyle())) {
// continue;
// }
if (current.getDir() == inner.getDir() && current.intersects(inner, yThreshold, xThreshold)) {
boolean toDuplicate = current.isToDuplicate() || inner.isToDuplicate();

View File

@ -1,9 +1,6 @@
package com.knecon.fforesight.service.layoutparser.processor.services.blockification;
import static java.util.stream.Collectors.toSet;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@ -13,10 +10,8 @@ import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.model.Orientation;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
@ -44,31 +39,30 @@ public class DocuMineBlockificationService {
CleanRulings usedRulings = cleanRulings.withoutTextRulings();
float minX = 1000;
float maxX = 0;
float minY = 1000;
float maxY = 0;
double minX = 1000;
double maxX = 0;
double minY = 1000;
double maxY = 0;
TextPositionSequence prev = null;
boolean wasSplitted = false;
Float splitX1 = null;
Double splitX1 = null;
for (TextPositionSequence word : textPositions) {
boolean lineSeparation = prev != null && word.getMinYDirAdj() - maxY > Math.min(word.getHeight(), prev.getHeight()) * 1.25;
boolean startFromTop = prev != null && word.getMinYDirAdj() < prev.getMinYDirAdj() - prev.getTextHeight();
boolean splitByX = prev != null && maxX + 50 < word.getMinXDirAdj() && prev.getMinYDirAdj() == word.getMinYDirAdj();
boolean negativeXGap = prev != null && word.getMinXDirAdj() - minX < -5;
boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj();
boolean lineSeparation = prev != null && word.getYDirAdj() - prev.getMaxYDirAdj() > Math.min(word.getHeight(), prev.getHeight()) * 1.25;
boolean startFromTop = prev != null && word.getYDirAdj() < prev.getYDirAdj() - prev.getTextHeight();
boolean splitByX = prev != null && maxX + 50 < word.getXDirAdj() && prev.getYDirAdj() == word.getYDirAdj();
boolean negativeXGap = prev != null && word.getXDirAdj() - minX < -5;
boolean newLineAfterSplit = prev != null && word.getYDirAdj() != prev.getYDirAdj() && wasSplitted && splitX1 != word.getXDirAdj();
boolean isSplitByRuling = prev != null && usedRulings.lineBetween(prev, word);
boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir());
boolean splitByOtherFontAndOtherY = prev != null && Math.abs(prev.getMaxYDirAdj() - word.getMaxYDirAdj()) > word.getTextHeight() * 0.2 && (word.getFontStyle()
.contains("bold")
&& !prev.getFontStyle()
.contains("bold") || prev.getFontStyle().contains("bold") && !word.getFontStyle().contains("bold"));
boolean splitByOtherFontAndOtherY = prev != null && Math.abs(prev.getMaxYDirAdj() - word.getMaxYDirAdj()) > word.getTextHeight() * 0.2 //
&& (word.getFontStyle().contains("bold") && !prev.getFontStyle().contains("bold") //
|| prev.getFontStyle().contains("bold") && !word.getFontStyle().contains("bold"));
Matcher matcher = pattern.matcher(chunkWords.stream()
.collect(Collectors.joining(" ")).toString());
boolean startsOnSameX = Math.abs(minX - word.getMinXDirAdj()) < 5 && matcher.matches();
boolean startsOnSameX = Math.abs(minX - word.getXDirAdj()) < 5 && matcher.matches();
if (prev != null && (lineSeparation || startFromTop || splitByDir || isSplitByRuling || splitByOtherFontAndOtherY || negativeXGap || startsOnSameX)) {
@ -84,7 +78,7 @@ public class DocuMineBlockificationService {
if (splitByX && !isSplitByRuling) {
wasSplitted = true;
cb1.setOrientation(Orientation.LEFT);
splitX1 = word.getMinXDirAdj();
splitX1 = word.getXDirAdj();
} else if (newLineAfterSplit && !isSplitByRuling) {
wasSplitted = false;
cb1.setOrientation(Orientation.RIGHT);
@ -107,14 +101,14 @@ public class DocuMineBlockificationService {
chunkWords.add(word);
prev = word;
if (word.getMinXDirAdj() < minX) {
minX = word.getMinXDirAdj();
if (word.getXDirAdj() < minX) {
minX = word.getXDirAdj();
}
if (word.getMaxXDirAdj() > maxX) {
maxX = word.getMaxXDirAdj();
}
if (word.getMinYDirAdj() < minY) {
minY = word.getMinYDirAdj();
if (word.getYDirAdj() < minY) {
minY = word.getYDirAdj();
}
if (word.getMaxYDirAdj() > maxY) {
maxY = word.getMaxYDirAdj();
@ -126,7 +120,5 @@ public class DocuMineBlockificationService {
return new ClassificationPage(textPageBlocks);
}
}

View File

@ -38,18 +38,18 @@ public class RedactManagerBlockificationService {
List<TextPositionSequence> chunkWords = new ArrayList<>();
List<AbstractPageBlock> chunkBlockList = new ArrayList<>();
float minX = 1000, maxX = 0, minY = 1000, maxY = 0;
double minX = 1000, maxX = 0, minY = 1000, maxY = 0;
TextPositionSequence prev = null;
boolean wasSplitted = false;
Float splitX1 = null;
Double splitX1 = null;
for (TextPositionSequence word : textPositions) {
boolean lineSeparation = word.getMinYDirAdj() - maxY > word.getHeight() * 1.25;
boolean startFromTop = prev != null && word.getMinYDirAdj() < prev.getMinYDirAdj() - prev.getTextHeight();
boolean splitByX = prev != null && maxX + 50 < word.getMinXDirAdj() && prev.getMinYDirAdj() == word.getMinYDirAdj();
boolean xIsBeforeFirstX = prev != null && word.getMinXDirAdj() < minX;
boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj();
boolean lineSeparation = word.getYDirAdj() - maxY > word.getHeight() * 1.25;
boolean startFromTop = prev != null && word.getYDirAdj() < prev.getYDirAdj() - prev.getTextHeight();
boolean splitByX = prev != null && maxX + 50 < word.getXDirAdj() && prev.getYDirAdj() == word.getYDirAdj();
boolean xIsBeforeFirstX = prev != null && word.getXDirAdj() < minX;
boolean newLineAfterSplit = prev != null && word.getYDirAdj() != prev.getYDirAdj() && wasSplitted && splitX1 != word.getXDirAdj();
boolean isSplitByRuling = prev != null && usedRulings.lineBetween(prev, word);
boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir());
@ -69,7 +69,7 @@ public class RedactManagerBlockificationService {
if (splitByX && !isSplitByRuling) {
wasSplitted = true;
cb1.setOrientation(Orientation.LEFT);
splitX1 = word.getMinXDirAdj();
splitX1 = word.getXDirAdj();
} else if (newLineAfterSplit && !isSplitByRuling) {
wasSplitted = false;
cb1.setOrientation(Orientation.RIGHT);
@ -92,14 +92,14 @@ public class RedactManagerBlockificationService {
chunkWords.add(word);
prev = word;
if (word.getMinXDirAdj() < minX) {
minX = word.getMinXDirAdj();
if (word.getXDirAdj() < minX) {
minX = word.getXDirAdj();
}
if (word.getMaxXDirAdj() > maxX) {
maxX = word.getMaxXDirAdj();
}
if (word.getMinYDirAdj() < minY) {
minY = word.getMinYDirAdj();
if (word.getYDirAdj() < minY) {
minY = word.getYDirAdj();
}
if (word.getMaxYDirAdj() > maxY) {
maxY = word.getMaxYDirAdj();

View File

@ -23,7 +23,7 @@ public class ClarifyndClassificationService {
public void classifyDocument(ClassificationDocument document) {
List<Float> headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular();
List<Double> headlineFontSizes = document.getFontSizeCounter().getHigherThanMostPopular();
log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());
@ -35,7 +35,7 @@ public class ClarifyndClassificationService {
}
private void classifyPage(HeadlineClassificationService headlineClassificationService, ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes) {
private void classifyPage(HeadlineClassificationService headlineClassificationService, ClassificationPage page, ClassificationDocument document, List<Double> headlineFontSizes) {
for (AbstractPageBlock textBlock : page.getTextBlocks()) {
if (textBlock instanceof TextPageBlock) {
@ -45,7 +45,7 @@ public class ClarifyndClassificationService {
}
private void classifyBlock(HeadlineClassificationService headlineClassificationService, TextPageBlock textBlock, ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes) {
private void classifyBlock(HeadlineClassificationService headlineClassificationService, TextPageBlock textBlock, ClassificationPage page, ClassificationDocument document, List<Double> headlineFontSizes) {
var bodyTextFrame = page.getBodyTextFrame();

View File

@ -31,7 +31,7 @@ public class DocuMineClassificationService {
public void classifyDocument(ClassificationDocument document) {
List<Float> headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular();
List<Double> headlineFontSizes = document.getFontSizeCounter().getHigherThanMostPopular();
log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());
@ -46,7 +46,7 @@ public class DocuMineClassificationService {
private void classifyPage(HeadlineClassificationService headlineClassificationService,
ClassificationPage page,
ClassificationDocument document,
List<Float> headlineFontSizes) {
List<Double> headlineFontSizes) {
for (AbstractPageBlock textBlock : page.getTextBlocks()) {
if (textBlock instanceof TextPageBlock) {
@ -60,7 +60,7 @@ public class DocuMineClassificationService {
TextPageBlock textBlock,
ClassificationPage page,
ClassificationDocument document,
List<Float> headlineFontSizes) {
List<Double> headlineFontSizes) {
log.debug("headlineFontSizes: {}", headlineFontSizes);
var bodyTextFrame = page.getBodyTextFrame();

View File

@ -25,7 +25,7 @@ public class RedactManagerClassificationService {
public void classifyDocument(ClassificationDocument document) {
List<Float> headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular();
List<Double> headlineFontSizes = document.getFontSizeCounter().getHigherThanMostPopular();
log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());
@ -37,7 +37,7 @@ public class RedactManagerClassificationService {
}
private void classifyPage(HeadlineClassificationService headlineClassificationService, ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes) {
private void classifyPage(HeadlineClassificationService headlineClassificationService, ClassificationPage page, ClassificationDocument document, List<Double> headlineFontSizes) {
for (AbstractPageBlock textBlock : page.getTextBlocks()) {
if (textBlock instanceof TextPageBlock) {
@ -47,7 +47,7 @@ public class RedactManagerClassificationService {
}
private void classifyBlock(HeadlineClassificationService headlineClassificationService, TextPageBlock textBlock, ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes) {
private void classifyBlock(HeadlineClassificationService headlineClassificationService, TextPageBlock textBlock, ClassificationPage page, ClassificationDocument document, List<Double> headlineFontSizes) {
var bodyTextFrame = page.getBodyTextFrame();
@ -56,7 +56,7 @@ public class RedactManagerClassificationService {
return;
}
if (document.getFontSizeCounter().getMostPopular() == null) {
textBlock.setClassification(PageBlockType.OTHER);
textBlock.setClassification(PageBlockType.PARAGRAPH);
return;
}
@ -129,7 +129,7 @@ public class RedactManagerClassificationService {
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)) {
textBlock.setClassification(PageBlockType.PARAGRAPH_UNKNOWN);
} else {
textBlock.setClassification(PageBlockType.OTHER);
textBlock.setClassification(PageBlockType.PARAGRAPH);
}
}

View File

@ -111,10 +111,10 @@ public class DocumentGraphFactory {
textBlocks.add(originalTextBlock);
textBlocks.addAll(textBlocksToMerge);
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(textBlocks), node, context, page);
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock2(TextPositionOperations.mergeAndSort(textBlocks), node, context, page);
if (node instanceof DuplicatedParagraph duplicatedParagraph) {
AtomicTextBlock unsortedTextBlock = context.textBlockFactory.buildAtomicTextBlock(textBlocks.stream()
AtomicTextBlock unsortedTextBlock = context.textBlockFactory.buildAtomicTextBlock2(textBlocks.stream()
.flatMap(tb -> tb.getSequences()
.stream())
.collect(Collectors.toList()), node, context, page);
@ -191,7 +191,7 @@ public class DocumentGraphFactory {
Page page = context.getPage(textBlocks.get(0).getPage());
Footer footer = Footer.builder().documentTree(context.getDocumentTree()).build();
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeTextPositionSequence(textBlocks), footer, context, page);
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock2(TextPositionOperations.merge(textBlocks), footer, context, page);
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(footer);
footer.setTreeId(tocId);
footer.setLeafTextBlock(textBlock);
@ -203,7 +203,7 @@ public class DocumentGraphFactory {
Page page = context.getPage(textBlocks.get(0).getPage());
Header header = Header.builder().documentTree(context.getDocumentTree()).build();
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeTextPositionSequence(textBlocks), header, 0, page);
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.merge(textBlocks), header, 0, page);
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(header);
header.setTreeId(tocId);
header.setLeafTextBlock(textBlock);

View File

@ -1,6 +1,7 @@
package com.knecon.fforesight.service.layoutparser.processor.services.factory;
import java.awt.geom.AffineTransform;
import static com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition.HEIGHT_PADDING;
import java.awt.geom.Rectangle2D;
import java.util.Collection;
import java.util.Collections;
@ -11,7 +12,6 @@ import java.util.Objects;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange;
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import lombok.experimental.UtilityClass;
@ -19,14 +19,13 @@ import lombok.experimental.UtilityClass;
@UtilityClass
public class SearchTextWithTextPositionFactory {
public final int HEIGHT_PADDING = 2;
// When checking for a hyphen linebreak, we need to check after a linebreak whether the last hyphen was less than three symbols away.
// We detect a linebreak either as a "\n" character or as two adjacent symbols whose positions differ in y-coordinate by at least one character height.
// If there is a hyphen linebreak, the hyphen will be 1 position in front of a "\n" or 2 positions in front of the character which has a lower y-coordinate.
// This is why we need to initialize this to a value < -2; otherwise, if the very first symbol is a "\n", we would detect a hyphen linebreak that isn't there.
// Also, Integer.MIN_VALUE is a bad idea due to potential overflow during arithmetic operations. This is why the default should be -3.
public final int MAX_HYPHEN_LINEBREAK_DISTANCE = 3;
public static final double LINEBREAK_DELTA_TOLERANCE = 1.05;
public static final double LINEBREAK_DELTA_TOLERANCE = 1.5;
public SearchTextWithTextPositionDto buildSearchTextToTextPositionDto(List<TextPositionSequence> sequences) {
@ -38,15 +37,13 @@ public class SearchTextWithTextPositionFactory {
Context context = new Context();
RedTextPosition currentTextPosition = sequences.get(0).getTextPositions()
.get(0);
RedTextPosition currentTextPosition = sequences.get(0).getTextPositions().get(0);
RedTextPosition previousTextPosition = RedTextPosition.builder().unicode(" ").bBoxDirAdj(currentTextPosition.getBBoxDirAdj()).build();
for (TextPositionSequence word : sequences) {
for (int i = 0; i < word.getTextPositions().size(); ++i) {
currentTextPosition = word.getTextPositions()
.get(i);
currentTextPosition = word.getTextPositions().get(i);
if (isLineBreak(currentTextPosition, previousTextPosition)) {
removeHyphenLinebreaks(context);
context.lineBreaksStringIdx.add(context.stringIdx);
@ -68,11 +65,10 @@ public class SearchTextWithTextPositionFactory {
++context.stringIdx;
}
List<Rectangle2D> positions = sequences.stream()
.map(TextPositionSequence::getTextPositions)
.flatMap(Collection::stream)
.map(RedTextPosition::getBBoxInitialUserSpace)
.map(RedTextPosition::getBBoxPdf)
.toList();
assert context.stringBuilder.length() == context.stringIdxToPositionIdx.size();
@ -161,8 +157,8 @@ public class SearchTextWithTextPositionFactory {
return false;
}
double deltaY = Math.abs(currentPosition.getYDirAdj() - previousPosition.getYDirAdj()) * LINEBREAK_DELTA_TOLERANCE;
return deltaY >= currentPosition.getHeightDir() || deltaY >= previousPosition.getHeightDir();
double deltaY = (Math.abs(currentPosition.getYDirAdj() - previousPosition.getYDirAdj()) * LINEBREAK_DELTA_TOLERANCE) + (2 * HEIGHT_PADDING);
return deltaY >= currentPosition.getHeightDirAdj() || deltaY >= previousPosition.getHeightDirAdj();
}
@ -188,32 +184,6 @@ public class SearchTextWithTextPositionFactory {
}
public Rectangle2D mapRedTextPositionToInitialUserSpace(RedTextPosition textPosition, TextPositionSequence sequence) {
float textHeight = sequence.getTextHeight() + HEIGHT_PADDING;
Rectangle2D rectangle2D = new Rectangle2D.Double(textPosition.getXDirAdj(),
textPosition.getYDirAdj() - textHeight,
textPosition.getWidthDirAdj(),
textHeight + HEIGHT_PADDING);
AffineTransform transform = new AffineTransform();
if (sequence.getDir() == TextDirection.ZERO || sequence.getDir() == TextDirection.HALF_CIRCLE) {
transform.rotate(sequence.getDir().getRadians(), sequence.getPageWidth() / 2f, sequence.getPageHeight() / 2f);
transform.translate(0f, sequence.getPageHeight());
} else if (sequence.getDir() == TextDirection.QUARTER_CIRCLE) {
transform.rotate(sequence.getDir().getRadians(), sequence.getPageWidth() / 2f, sequence.getPageWidth() / 2f);
transform.translate(0f, sequence.getPageWidth());
} else {
transform.rotate(sequence.getDir().getRadians(), sequence.getPageHeight() / 2f, sequence.getPageHeight() / 2f);
transform.translate(0f, sequence.getPageWidth());
}
transform.scale(1., -1.);
return transform.createTransformedShape(rectangle2D).getBounds2D();
}
private class Context {
List<Integer> stringIdxToPositionIdx = new LinkedList<>();

View File

@ -234,7 +234,7 @@ public class SectionNodeFactory {
.filter(abstractTextContainer -> !abstractTextContainer.equals(atc))
.filter(abstractTextContainer -> abstractTextContainer.getPage() == atc.getPage())
.filter(abstractTextContainer -> abstractTextContainer instanceof TextPageBlock)
.filter(abstractTextContainer -> abstractTextContainer.intersectsY(atc))
.filter(abstractTextContainer -> abstractTextContainer.intersectsYPdf(atc))
.map(abstractTextContainer -> (TextPageBlock) abstractTextContainer)
.filter(abstractTextContainer -> abstractTextContainer.getDir() == atc.getDir())
.filter(abstractTextContainer -> !abstractTextContainer.isToDuplicate())

View File

@ -136,7 +136,7 @@ public class TableNodeFactory {
.row(rowIndex)
.col(colIndex)
.header(cell.isHeaderCell())
.bBox(cell.getBBoxInitialUserSpace())
.bBox(cell.getBBoxPdf())
.build();
page.getMainBody().add(tableCell);
@ -148,7 +148,7 @@ public class TableNodeFactory {
tableCell.setLeafTextBlock(context.getTextBlockFactory().emptyTextBlock(tableNode, context, page));
} else if (cell.getTextBlocks().size() == 1) {
textBlock = context.getTextBlockFactory()
.buildAtomicTextBlock(cell.getTextBlocks()
.buildAtomicTextBlock2(cell.getTextBlocks()
.get(0).getSequences(), tableCell, context, page);
tableCell.setLeafTextBlock(textBlock);
} else if (firstTextBlockIsHeadline(cell)) {
@ -163,8 +163,8 @@ public class TableNodeFactory {
context,
document);
} else if (cellAreaIsSmallerThanPageAreaTimesThreshold(cell, page)) {
List<TextPositionSequence> sequences = TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(cell.getTextBlocks());
textBlock = context.getTextBlockFactory().buildAtomicTextBlock(sequences, tableCell, context, page);
List<TextPositionSequence> sequences = TextPositionOperations.mergeAndSort(cell.getTextBlocks());
textBlock = context.getTextBlockFactory().buildAtomicTextBlock2(sequences, tableCell, context, page);
tableCell.setLeafTextBlock(textBlock);
} else {
cell.getTextBlocks()

View File

@ -17,7 +17,7 @@ public class TextBlockFactory {
long textBlockIdx;
public AtomicTextBlock buildAtomicTextBlock(List<TextPositionSequence> sequences, SemanticNode parent, DocumentGraphFactory.Context context, Page page) {
public AtomicTextBlock buildAtomicTextBlock2(List<TextPositionSequence> sequences, SemanticNode parent, DocumentGraphFactory.Context context, Page page) {
Integer numberOnPage = context.getAndIncrementTextBlockNumberOnPage(page);
return buildAtomicTextBlock(sequences, parent, numberOnPage, page);
@ -32,27 +32,27 @@ public class TextBlockFactory {
long idx = textBlockIdx;
textBlockIdx++;
String orientation;
int textDirection;
int textRotation;
if (sequences.isEmpty()) {
orientation = null;
textDirection = 0;
textRotation = 0;
} else {
orientation = sequences.get(0).getDir().toString();
textDirection = sequences.get(0).getRotation();
textRotation = sequences.get(0).getDir().getRotation();
}
return AtomicTextBlock.fromSearchTextWithTextPosition(searchTextWithTextPositionDto.getSearchText(),
searchTextWithTextPositionDto.getLineBreaks(),
searchTextWithTextPositionDto.getBoldTextBoundaries(),
searchTextWithTextPositionDto.getItalicTextBoundaries(),
searchTextWithTextPositionDto.getPositions(),
searchTextWithTextPositionDto.getStringIdxToPositionIdx(),
idx,
parent,
numberOnPage,
page,
offset,
orientation,
textDirection);
searchTextWithTextPositionDto.getLineBreaks(),
searchTextWithTextPositionDto.getBoldTextBoundaries(),
searchTextWithTextPositionDto.getItalicTextBoundaries(),
searchTextWithTextPositionDto.getPositions(),
searchTextWithTextPositionDto.getStringIdxToPositionIdx(),
idx,
parent,
numberOnPage,
page,
offset,
orientation,
textRotation);
}

View File

@ -8,12 +8,11 @@ import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.services.factory.SearchTextWithTextPositionFactory;
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
@ -67,10 +66,7 @@ public class GraphicExtractorService {
private List<Box> getCharacterBBoxes(List<TextPositionSequence> textPositionSequences) {
return textPositionSequences.stream()
.map(pos -> pos.getTextPositions()
.stream()
.map(tp -> SearchTextWithTextPositionFactory.mapRedTextPositionToInitialUserSpace(tp, pos))
.collect(RectangleTransformations.collectBBox()))
.map(BoundingBox::getBBoxPdf)
.map(Box::new)
.collect(Collectors.toList());
}

View File

@ -96,7 +96,7 @@ public class HeaderFooterDetection {
continue;
}
int distance = calculateHammingDistanceWithPreprocessing(testString, paddedString);
int distance = StringDistances.hamming(testString, paddedString);
double normalizedScore = 1 - (double) distance / Math.max(testString.length(), paddedString.length());
score += normalizedScore * (j < weights.length ? weights[j] : 1);
}
@ -180,44 +180,4 @@ public class HeaderFooterDetection {
return headerCandidates;
}
/**
* Calculate the Hamming distance between two strings after preprocessing to make them the same length
* and replacing all digits with a special character '@' since they are a common occurrence in headers/footers.
*
* @param firstCandidate First string
* @param secondCandidate Second string
* @return The Hamming distance between the two preprocessed strings.
*/
private int calculateHammingDistanceWithPreprocessing(String firstCandidate, String secondCandidate) {
int maxLength = Math.max(firstCandidate.length(), secondCandidate.length());
String cleanFirstCandidate = padString(firstCandidate, maxLength, '\0').replaceAll("\\d", "@");
String cleanSecondCandidate = padString(secondCandidate, maxLength, '\0').replaceAll("\\d", "@");
int distance = 0;
for (int i = 0; i < maxLength; i++) {
if (cleanFirstCandidate.charAt(i) != cleanSecondCandidate.charAt(i)) {
distance++;
}
}
return distance;
}
private String padString(String input, int length, char padChar) {
if (input.length() >= length) {
return input;
}
StringBuilder sb = new StringBuilder(input);
while (sb.length() < length) {
sb.append(padChar);
}
return sb.toString();
}
}

View File

@ -48,7 +48,7 @@ public class MarkedContentUtils {
return markedContentByYPosition.values()
.stream()
.map(textPositions -> new TextPositionSequence(textPositions, 0, true).getBBoxInitialUserSpace())
.map(textPositions -> new TextPositionSequence(textPositions, 0, true).getBBoxPdf())
.map(t -> new Rectangle2D.Double(t.getX(), t.getY() - Math.abs(t.getHeight()), t.getWidth(), Math.abs(t.getHeight())))
.collect(Collectors.toList());
}
@ -90,7 +90,7 @@ public class MarkedContentUtils {
.map(content -> (TextPosition) content)
.filter(content -> !content.getUnicode().equals(" "))
.map(textPositions -> new TextPositionSequence(List.of(textPositions), 0, true))
.map(BoundingBox::getBBoxInitialUserSpace)
.map(BoundingBox::getBBoxPdf)
.collect(Collectors.toList());
}

View File

@ -108,7 +108,7 @@ public final class PositionUtils {
}
public float getHeightDifferenceBetweenChunkWordAndDocumentWord(TextPageBlock textBlock, Float documentMostPopularWordHeight) {
public double getHeightDifferenceBetweenChunkWordAndDocumentWord(TextPageBlock textBlock, Double documentMostPopularWordHeight) {
return textBlock.getMostPopularWordHeight() - documentMostPopularWordHeight;
}
@ -116,7 +116,7 @@ public final class PositionUtils {
public double getApproxLineCount(TextPageBlock textBlock) {
return textBlock.getHeight() / textBlock.getMostPopularWordHeight();
return textBlock.getBBoxDirAdj().getHeight() / textBlock.getMostPopularWordHeight();
}
}

View File

@ -28,7 +28,7 @@ public class SpreadsheetFinder {
Map<Point2D, Point2D> edgesV = new HashMap<>();
for (Cell cell : cells) {
for (Point2D pt : getPoints(cell.getBBoxInitialUserSpace())) {
for (Point2D pt : getPoints(cell.getBBoxPdf())) {
if (pointSet.contains(pt)) { // shared vertex, remove it
pointSet.remove(pt);
} else {

View File

@ -0,0 +1,49 @@
package com.knecon.fforesight.service.layoutparser.processor.utils;
import lombok.experimental.UtilityClass;
@UtilityClass
public class StringDistances {

    /** Character the shorter string is virtually padded with, so both inputs have the same length. */
    private static final char PADDING = '\0';

    /** Placeholder every ASCII digit is mapped to before comparison. */
    private static final char DIGIT_PLACEHOLDER = '@';


    /**
     * Calculate the Hamming distance between two strings after preprocessing to make them the same length
     * and replacing all digits with a special character '@' since they are a common occurrence in headers/footers.
     * <p>
     * The shorter string is treated as if it were padded with NUL characters up to the length of the longer one,
     * so every surplus position of the longer string counts as a mismatch (unless it holds a NUL character itself).
     * Only the ASCII digits '0' to '9' are normalized, which is what the regex class {@code \d} matches by default.
     * <p>
     * The comparison is done position by position without building intermediate strings or compiling a regex,
     * because this method is called repeatedly while scoring header/footer candidates.
     *
     * @param s1 First string, must not be null
     * @param s2 Second string, must not be null
     * @return The Hamming distance between the two preprocessed strings, between 0 and the length of the longer string.
     */
    public int hamming(String s1, String s2) {

        int maxLength = Math.max(s1.length(), s2.length());

        int distance = 0;
        for (int i = 0; i < maxLength; i++) {
            if (normalizedCharAt(s1, i) != normalizedCharAt(s2, i)) {
                distance++;
            }
        }
        return distance;
    }


    /**
     * Returns the character used for comparison at the given index: the padding character beyond the end of the
     * input, the digit placeholder for ASCII digits, and the unchanged character otherwise.
     */
    private char normalizedCharAt(String input, int index) {

        if (index >= input.length()) {
            return PADDING;
        }
        char c = input.charAt(index);
        return c >= '0' && c <= '9' ? DIGIT_PLACEHOLDER : c;
    }

}

View File

@ -1,30 +1,136 @@
package com.knecon.fforesight.service.layoutparser.processor.utils;
import java.awt.geom.Rectangle2D;
import java.awt.geom.RectangularShape;
import java.util.Collection;
import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.AngleFilter;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.TextBoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.UnionFind;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.DoubleUtils;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.FastAtan2;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import lombok.experimental.UtilityClass;
@UtilityClass
public class TextPositionOperations {
private static final TextPositionSequenceComparator comparator = new TextPositionSequenceComparator();
// Half-width of the accepted angle window: PI / 35 rad, i.e. roughly 5.1 degrees.
public static final double ANGLE_TOLERANCE = Math.PI / 35;
// Filter constructed with the bounds [-ANGLE_TOLERANCE, ANGLE_TOLERANCE]; groupByLine only joins two sequences
// when the angle between their bounding-box centres matches this filter.
public static final AngleFilter ANGLE_FILTER = new AngleFilter(-ANGLE_TOLERANCE, ANGLE_TOLERANCE);
public static final double MAX_LINE_HEIGHT_FACTOR = 0.66; // multiplied with the AVERAGE dir-adjusted bounding-box height of the sequences (see groupByLine)
public static final double MAX_WORD_DISTANCE_FACTOR = 3.5; // multiplied with the AVERAGE dir-adjusted bounding-box width of the sequences (see groupByLine)
// Tolerance handed to DoubleUtils.compareDouble for the Y and X comparisons below.
// NOTE(review): presumably coordinates closer than this compare as equal - confirm in DoubleUtils. If so, the
// resulting comparator is not strictly transitive.
private static final double THRESHOLD = 5;
// Orders bounding boxes by direction first, then by dir-adjusted Y, then by dir-adjusted X (both with THRESHOLD).
private static final Comparator<TextBoundingBox> COMPARATOR_DIR_ADJ = //
Comparator.comparing(TextBoundingBox::getDir)
.thenComparing(TextBoundingBox::getYDirAdj, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
.thenComparing(TextBoundingBox::getXDirAdj, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD));
// NOTE(review): two variants of this method are visible in this span - one sorting a flattened list with
// QuickSort and the class-level comparator, one delegating to sortUsingLineDetection. This looks like a rendered
// diff showing the replaced and the replacing implementation together - confirm against the actual file.
public static List<TextPositionSequence> mergeAndSortTextPositionSequenceByYThenX(List<TextPageBlock> textBlocks) {
// Flattens the sequences of all given text blocks and returns them in sorted order.
public List<TextPositionSequence> mergeAndSort(List<TextPageBlock> textBlocks) {
var sequence = textBlocks.stream().flatMap(tb -> tb.getSequences().stream()).collect(Collectors.toList());
// because the TextPositionSequenceComparator is not transitive, but
// JDK7+ enforces transitivity on comparators, we need to use
// a custom quicksort implementation (which is slower, unfortunately).
QuickSort.sort(sequence, comparator);
return sequence;
// Collecting into a Set drops sequences that are equal according to equals/hashCode before the line grouping.
// NOTE(review): presumably an intended de-duplication - confirm how TextPositionSequence defines equality.
var sequences = textBlocks.stream()
.flatMap(tb -> tb.getSequences()
.stream())
.collect(Collectors.toSet());
return sortUsingLineDetection(sequences);
}
// NOTE(review): this span shows two method headers with one closing brace (a flattening method without sorting
// and a sort method) - presumably the removed and the added side of a rendered diff; confirm against the file.
public static List<TextPositionSequence> mergeTextPositionSequence(List<TextPageBlock> textBlocks) {
return textBlocks.stream().flatMap(tb -> tb.getSequences().stream()).collect(Collectors.toList());
// Sorts the given sequences via sortUsingLineDetection. The list is copied into a HashSet first, so equal
// sequences are collapsed and the input order is not used.
public List<TextPositionSequence> sort(List<TextPositionSequence> sequences) {
return sortUsingLineDetection(new HashSet<>(sequences));
}
/**
 * Brings the sequences into reading order by first detecting lines and then ordering those lines.
 * <p>
 * The sequences are grouped into lines by {@code groupByLine}; every line is sorted by {@code sortByXDirAdj};
 * the lines themselves are ordered by comparing their first sequence with {@code COMPARATOR_DIR_ADJ}, and
 * finally all lines are concatenated.
 *
 * @param sequences the sequences to order
 * @return an unmodifiable list holding all sequences line by line
 */
private List<TextPositionSequence> sortUsingLineDetection(Set<TextPositionSequence> sequences) {

    Comparator<List<TextPositionSequence>> byFirstSequenceOfLine = Comparator.comparing(line -> line.get(0), COMPARATOR_DIR_ADJ);

    return groupByLine(sequences).stream()
            .map(TextPositionOperations::sortByXDirAdj)
            .filter(sortedLine -> !sortedLine.isEmpty())
            .sorted(byFirstSequenceOfLine)
            .flatMap(List::stream)
            .toList();
}
/**
 * Sorts the sequences of one line by ascending {@code getXDirAdj()}.
 *
 * @param line the sequences belonging to one detected line
 * @return an unmodifiable list of the same sequences ordered by their dir-adjusted X value
 */
private List<TextPositionSequence> sortByXDirAdj(Set<TextPositionSequence> line) {

    Comparator<TextPositionSequence> byXDirAdj = Comparator.comparing(TextPositionSequence::getXDirAdj);
    return line.stream().sorted(byXDirAdj).toList();
}
/**
 * Partitions the sequences into lines with a union-find over all ordered pairs of sequences.
 * <p>
 * Two sequences are joined when all of the following hold:
 * <ul>
 * <li>they have the same direction,</li>
 * <li>their font sizes differ by at most half of the smaller font size,</li>
 * <li>the distance of their bounding-box centres lies inside an ellipse whose vertical semi-axis is the average
 * bounding-box height times {@code MAX_LINE_HEIGHT_FACTOR} and whose horizontal semi-axis is the average
 * bounding-box width times {@code MAX_WORD_DISTANCE_FACTOR},</li>
 * <li>the angle from the first centre to the second centre matches {@code ANGLE_FILTER}.</li>
 * </ul>
 * Every pair is visited in both orders. For an empty input the averages fall back to 10 (height) and 75 (width).
 *
 * @param sequences the sequences to group
 * @return the groups reported by the union-find, one set per detected line
 */
private Collection<Set<TextPositionSequence>> groupByLine(Set<TextPositionSequence> sequences) {

    double averageHeight = sequences.stream()
            .mapToDouble(sequence -> sequence.getBBoxDirAdj().getHeight())
            .average()
            .orElse(10);
    double averageWidth = sequences.stream()
            .mapToDouble(sequence -> sequence.getBBoxDirAdj().getWidth())
            .average()
            .orElse(75);

    double maxLineDistance = averageHeight * MAX_LINE_HEIGHT_FACTOR;
    // despite its name this bounds the distance between centres, not the gap between box edges
    double maxXGap = averageWidth * MAX_WORD_DISTANCE_FACTOR;

    UnionFind<TextPositionSequence> lines = new UnionFind<>(sequences);

    for (TextPositionSequence first : sequences) {
        Rectangle2D firstBox = first.getBBoxDirAdj();

        for (TextPositionSequence second : sequences) {
            if (first.equals(second) || lines.inSameSet(first, second)) {
                continue;
            }
            if (first.getDir() != second.getDir()) {
                continue;
            }

            double smallerFontSize = Math.min(first.getFontSize(), second.getFontSize());
            if (Math.abs(first.getFontSize() - second.getFontSize()) > 0.5 * smallerFontSize) {
                continue;
            }

            Rectangle2D secondBox = second.getBBoxDirAdj();
            double verticalShare = Math.abs(firstBox.getCenterY() - secondBox.getCenterY()) / maxLineDistance;
            double horizontalShare = Math.abs(firstBox.getCenterX() - secondBox.getCenterX()) / maxXGap;
            if (Math.pow(verticalShare, 2) + Math.pow(horizontalShare, 2) > 1) {
                continue;
            }

            if (ANGLE_FILTER.matches(computeAngle(firstBox, secondBox))) {
                lines.union(first, second);
            }
        }
    }

    return lines.getGroups();
}
/**
 * Computes the angle of the vector pointing from the centre of {@code rect1} to the centre of {@code rect2}.
 *
 * @param rect1 the rectangle the vector starts at
 * @param rect2 the rectangle the vector points to
 * @return the result of {@code FastAtan2.fastAtan2(deltaY, deltaX)} for the centre-to-centre vector
 */
public double computeAngle(Rectangle2D rect1, Rectangle2D rect2) {

    double horizontalDelta = rect2.getCenterX() - rect1.getCenterX();
    double verticalDelta = rect2.getCenterY() - rect1.getCenterY();
    return FastAtan2.fastAtan2(verticalDelta, horizontalDelta);
}
/**
 * Flattens the sequences of all given text blocks into one list, keeping block order and the order of the
 * sequences inside each block. No sorting takes place.
 *
 * @param textBlocks the blocks whose sequences are collected
 * @return a list with the sequences of all blocks
 */
public List<TextPositionSequence> merge(List<TextPageBlock> textBlocks) {

    return textBlocks.stream()
            .flatMap(block -> block.getSequences().stream())
            .collect(Collectors.toList());
}
}

View File

@ -1,99 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.knecon.fforesight.service.layoutparser.processor.utils;
import java.util.Comparator;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
/**
 * Comparator for {@link TextPositionSequence}s that first separates text by direction and then orders sequences
 * of the same direction according to the rotation of the first argument.
 * <p>
 * For rotations 0 and 180 two sequences whose vertical extents touch are treated as being on the same line and
 * are ordered by X only. That relation is not transitive, so this comparator does not guarantee a transitive
 * ordering.
 *
 * @author Ben Litchfield
 */
public class TextPositionSequenceComparator implements Comparator<TextPositionSequence> {

    /**
     * Compares two sequences.
     *
     * @param pos1 the first sequence; its rotation selects the ordering rule
     * @param pos2 the second sequence
     * @return a negative value, zero or a positive value when {@code pos1} sorts before, with or after {@code pos2}
     * @throws RuntimeException when the rotation of {@code pos1} is not 0, 90, 180 or 270
     */
    @Override
    public int compare(TextPositionSequence pos1, TextPositionSequence pos2) {

        // sequences with different directions are ordered by direction alone
        int byDirection = Float.compare(pos1.getDir().getDegrees(), pos2.getDir().getDegrees());
        if (byDirection != 0) {
            return byDirection;
        }

        double left1 = pos1.getBBox().getX();
        double left2 = pos2.getBBox().getX();
        double bottom1 = pos1.getBBox().getMaxY();
        double bottom2 = pos2.getBBox().getMaxY();
        // the original code notes that 0,0 is the upper left corner, hence top = bottom - height
        double top1 = bottom1 - pos1.getBBox().getHeight();
        double top2 = bottom2 - pos2.getBBox().getHeight();

        boolean onSameLine = sharesLine(top1, bottom1, top2, bottom2);

        switch (pos1.getRotation()) {
            case 0:
                // horizontal text: left to right within a line, otherwise smaller bottom edge first
                if (onSameLine) {
                    return Double.compare(left1, left2);
                }
                return bottom1 < bottom2 ? -1 : 1;
            case 90:
                // vertical text: larger X first, ties resolved by ascending bottom edge
                if (left1 > left2) {
                    return -1;
                }
                if (left1 < left2) {
                    return 1;
                }
                return Double.compare(bottom1, bottom2);
            case 180:
                // upside-down text: right to left within a line, otherwise larger bottom edge first
                if (onSameLine) {
                    return Double.compare(left2, left1);
                }
                return bottom1 > bottom2 ? -1 : 1;
            case 270:
                // vertical text: smaller X first, ties resolved by descending bottom edge
                if (left2 > left1) {
                    return -1;
                }
                if (left2 < left1) {
                    return 1;
                }
                return Double.compare(bottom2, bottom1);
            default:
                throw new RuntimeException("Rotation not supported. Only 0/90/180/270 degree rotation is supported.");
        }
    }


    /**
     * Tells whether two vertical extents belong to the same line: either the bottom edges are less than 0.1 apart
     * or the bottom edge of one extent lies within the other extent.
     */
    private static boolean sharesLine(double top1, double bottom1, double top2, double bottom2) {

        if (Math.abs(bottom1 - bottom2) < .1) {
            return true;
        }
        boolean secondBottomInsideFirst = bottom2 >= top1 && bottom2 <= bottom1;
        boolean firstBottomInsideSecond = bottom1 >= top2 && bottom1 <= bottom2;
        return secondBottomInsideFirst || firstBottomInsideSecond;
    }

}

View File

@ -51,7 +51,7 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(pageNumber, this.words);
visualizationsOnPage.getColoredRectangles()
.addAll(textPositionSequences.stream()
.map(BoundingBox::getBBoxInitialUserSpace)
.map(BoundingBox::getBBoxPdf)
.map(rect -> new ColoredRectangle(rect, WORDS_COLOR, 1))
.toList());
}
@ -105,7 +105,7 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(pageNumber, this.cells);
visualizationsOnPage.getColoredRectangles()
.addAll(cells.stream()
.map(cell -> new ColoredRectangle(cell.getBBoxInitialUserSpace(), CELLS_COLOR, 1))
.map(cell -> new ColoredRectangle(cell.getBBoxPdf(), CELLS_COLOR, 1))
.toList());
}
@ -119,7 +119,7 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(page, this.zones);
visualizationsOnPage.getColoredRectangles()
.addAll(zones.stream()
.map(BoundingBox::getBBoxInitialUserSpace)
.map(BoundingBox::getBBoxPdf)
.map(zone -> new ColoredRectangle(zone, ZONES_COLOR, 1))
.toList());
@ -144,7 +144,7 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(page, this.lines);
visualizationsOnPage.getColoredRectangles()
.addAll(lines.stream()
.map(BoundingBox::getBBoxInitialUserSpace)
.map(BoundingBox::getBBoxPdf)
.map(line -> new ColoredRectangle(line, LINES_COLOR, 0.5f))
.toList());
}
@ -158,7 +158,7 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(page, zones);
visualizationsOnPage.getColoredRectangles()
.addAll(textPageBlocks.stream()
.map(rect -> new ColoredRectangle(rect.getBBoxInitialUserSpace(), ZONES_COLOR, 1))
.map(rect -> new ColoredRectangle(rect.getBBoxPdf(), ZONES_COLOR, 1))
.toList());
}
@ -222,11 +222,11 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
.flatMap(Collection::stream)
.forEach(character -> {
Color color = ROTATING_CHARACTER_COLOR.get(index.getAndIncrement() % ROTATING_CHARACTER_COLOR.size());
Rectangle2D charBBox = character.getTextPosition().getBBoxInitialUserSpace();
Rectangle2D charBBox = character.getTextPosition().getBBoxPdf();
characterVisualizations.getColoredRectangles().add(new ColoredRectangle(charBBox, color, 1));
character.getNeighbors()
.forEach(neighbor -> {
Rectangle2D neighborBBox = neighbor.getCharacter().getTextPosition().getBBoxInitialUserSpace();
Rectangle2D neighborBBox = neighbor.getCharacter().getTextPosition().getBBoxPdf();
Line2D line = new Line2D.Double(new Point2D.Double(charBBox.getCenterX(), charBBox.getCenterY()),
new Point2D.Double(neighborBBox.getCenterX(), neighborBBox.getCenterY()));
neighbourVisualizations.getColoredLines().add(new ColoredLine(line, color, 1));

View File

@ -38,7 +38,7 @@ dependencies {
implementation("com.amazonaws:aws-java-sdk-s3:1.12.536")
implementation("org.springframework.cloud:spring-cloud-starter-openfeign:4.0.4")
implementation("net.logstash.logback:logstash-logback-encoder:7.4")
implementation("com.pdftron:PDFNet:10.7.0")
implementation("com.pdftron:PDFNet:10.11.0")
// for integration testing only
testImplementation(project(":viewer-doc-processor"))
@ -52,6 +52,8 @@ dependencies {
testImplementation("com.fasterxml.jackson.datatype:jackson-datatype-jsr310:${jacksonVersion}")
testImplementation("org.apache.pdfbox:pdfbox:${pdfBoxVersion}")
testImplementation("org.apache.pdfbox:pdfbox-tools:${pdfBoxVersion}")
testImplementation("org.apache.commons:commons-text:1.12.0")
}
/*
@ -89,6 +91,9 @@ tasks.named<BootBuildImage>("bootBuildImage") {
environment.put("BPE_DELIM_JAVA_TOOL_OPTIONS", " ")
environment.put("BPE_APPEND_JAVA_TOOL_OPTIONS", "-Dfile.encoding=UTF-8")
builder.set("docker-proxy.knecon.com/paketobuildpacks/builder:base")
runImage.set("docker-proxy.knecon.com/paketobuildpacks/run:base-cnb")
imageName.set("nexus.knecon.com:5001/ff/${project.name}:${project.version}")
if (project.hasProperty("buildbootDockerHostNetwork")) {
network.set("host")
@ -99,6 +104,13 @@ tasks.named<BootBuildImage>("bootBuildImage") {
}
verboseLogging.set(true)
builderRegistry {
username.set(providers.gradleProperty("mavenUser").getOrNull())
password.set(providers.gradleProperty("mavenPassword").getOrNull())
email.set(providers.gradleProperty("mavenEmail").getOrNull())
url.set("https://docker-proxy.knecon.com:5001/")
}
publishRegistry {
username.set(providers.gradleProperty("mavenUser").getOrNull())
password.set(providers.gradleProperty("mavenPassword").getOrNull())
@ -106,4 +118,5 @@ tasks.named<BootBuildImage>("bootBuildImage") {
url.set("https://nexus.knecon.com:5001/")
}
}
}

View File

@ -1,20 +1,18 @@
package com.knecon.fforesight.service.layoutparser.server;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Component;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import com.google.common.base.Strings;
import com.knecon.fforesight.service.layoutparser.processor.LayoutparserSettings;
import com.pdftron.pdf.PDFNet;
import jakarta.annotation.PostConstruct;
import jakarta.annotation.PreDestroy;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@Component
@Configuration
@RequiredArgsConstructor
public class PDFNetInitializer {
@ -22,26 +20,17 @@ public class PDFNetInitializer {
private String pdftronLicense;
/**
 * Initializes the PDFNet native library: checks that a license is configured, sets the PDFNet temp path to
 * /tmp/pdftron and calls PDFNet.initialize with the configured license.
 */
@Bean
@SneakyThrows
@PostConstruct
// Do not change this back to an application runner: with an application runner, messages are taken from the queue
// before PDFNet is initialized, which leads to an UnsatisfiedLinkError.
public void init() {
if (Strings.isNullOrEmpty(pdftronLicense)) {
// NOTE(review): both a silent return and an exception are visible here for a missing license - presumably the
// removed and the added line of a rendered diff; confirm which one the actual file contains.
return;
throw new IllegalArgumentException("PDFTRON_LICENSE not set!");
}
log.info("Initializing Native Libraries");
// NOTE(review): the next statement writes the license key itself to the log in clear text. Consider logging only
// that a license was found, so the key does not end up in log files.
log.info("Setting pdftron license: {}", pdftronLicense);
PDFNet.setTempPath("/tmp/pdftron");
PDFNet.initialize(pdftronLicense);
}
/**
 * Calls PDFNet.terminate(); annotated with @PreDestroy so it runs as a destroy callback of this bean.
 */
@PreDestroy
public void terminate() {
PDFNet.terminate();
}
}

View File

@ -27,23 +27,28 @@ import lombok.extern.slf4j.Slf4j;
@Slf4j
public class LayoutparserEnd2EndTest extends AbstractTest {
public static final LayoutParsingType LAYOUT_PARSING_TYPE = LayoutParsingType.DOCUMINE;
@Autowired
private LayoutParsingPipeline layoutParsingPipeline;
// Manual end-to-end run for a single file; disabled so it does not run as part of the regular test suite.
@Test
@Disabled
public void testLayoutParserEndToEnd() {
// NOTE(review): two assignments of filePath are visible (presumably the removed and the added line of a rendered
// diff). The second one is an absolute path inside one developer's home directory and will not exist on other
// machines - consider keeping a path relative to the test resources such as "files/test-1.pdf".
String filePath = "files/test-1.pdf";
String filePath = "/home/kschuettler/Downloads/55974b3de7ed2915718a10458206bbd8.ORIGIN.pdf";
runForFile(filePath);
}
@Test
@Disabled
@SneakyThrows
public void testLayoutParserEndToEndWithFolder() {
String folder = "/Users/maverickstuder/Documents/Fforesight/layoutparser/layoutparser-service/layoutparser-service-server/src/test";
String folder = "/home/kschuettler/Dokumente/TestFiles/ReadingOrder";
List<Path> pdfFiles = Files.walk(Path.of(folder))
.filter(path -> path.getFileName().toString().endsWith(".pdf"))
.sorted(Comparator.comparing(Path::getFileName))
@ -69,7 +74,8 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
file = new File(filePath);
}
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(fileName, LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH, true);
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(fileName, LAYOUT_PARSING_TYPE, true);
prepareStorage(layoutParsingRequest, file);
LayoutParsingFinishedEvent finishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest);

View File

@ -57,9 +57,11 @@ public class OutlineDetectionTest extends AbstractTest {
pdfNetInitializer.init();
}
@Test
@SneakyThrows
public void testOutlineError(){
public void testOutlineError() {
String fileName = "files/syngenta/CustomerFiles/Clarifynd/VV-470942.pdf";
ClassificationDocument classificationDocument = parseLayout(fileName, LayoutParsingType.CLARIFYND);

View File

@ -0,0 +1,452 @@
package com.knecon.fforesight.service.layoutparser.server.graph;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.awt.Color;
import java.awt.geom.Rectangle2D;
import java.io.FileOutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.text.similarity.LevenshteinDistance;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired;
import com.iqser.red.storage.commons.service.StorageService;
import com.iqser.red.storage.commons.utils.FileSystemBackedStorageService;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionOperations;
import com.knecon.fforesight.service.layoutparser.server.PDFNetInitializer;
import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest;
import com.knecon.fforesight.service.viewerdoc.model.Standard14EmbeddableFont;
import com.pdftron.common.Matrix2D;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.ColorPt;
import com.pdftron.pdf.ColorSpace;
import com.pdftron.pdf.Element;
import com.pdftron.pdf.ElementBuilder;
import com.pdftron.pdf.ElementWriter;
import com.pdftron.pdf.Font;
import com.pdftron.pdf.GState;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.Page;
import com.pdftron.sdf.SDFDoc;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
@Slf4j
public class DocumentReadingOrderTest extends BuildDocumentTest {
// Debug switch: when true, assertSimilarReadingOrder additionally calls drawDirAdjCoords, which writes a PDF with
// the dir-adjusted coordinates of every sequence to /tmp/READING_ORDER_TEST/.
private static final boolean DRAW_DIR_ADJ_COORDS = false;
// Every reading order test is evaluated once for each of these layout parsing types.
public static final List<LayoutParsingType> LAYOUT_PARSING_TYPES = List.of(LayoutParsingType.DOCUMINE,
LayoutParsingType.DOCUMINE_OLD,
LayoutParsingType.REDACT_MANAGER,
LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH);
// Used in before() to initialize PDFNet ahead of every test.
@Autowired
PDFNetInitializer pdfNetInitializer;
// Cleared after every test in cleanUp(), which casts it to FileSystemBackedStorageService.
@Autowired
StorageService storageService;
/**
 * Runs the PDFNet initialization before each test.
 */
@BeforeEach
public void before() {
pdfNetInitializer.init();
}
/**
 * Clears the storage after each test.
 */
@AfterEach
public void cleanUp() {
// the unchecked cast fails with a ClassCastException if the injected StorageService is not file-system backed
((FileSystemBackedStorageService) storageService).clearStorage();
}
/**
 * Reading order check for the single page Seite14.pdf: for every layout parsing type in LAYOUT_PARSING_TYPES the
 * lines read from the document must resemble the expected lines below (see assertSimilarReadingOrder).
 */
@Test
public void readingOrderTestSeite14() {
String pdfFile = "files/syngenta/CustomerFiles/SinglePages/Seite14.pdf";
// expected page text, one entry per line, in the expected reading order
String expectedText = """
27
26 APPENDICES SECTION
APPENDIX 1 Analytical Report
syngenta
A16148F
Batch ID 533158 (GP-080305)
Batch Identification 533158
Product Design Code A16148F
Product Denomination SYN524464 FS (500)
Product by Common Name SYN524464 FS (500)
Other Product Code(s) GP-080305
Source Technology & Projects, Syngenta Crop Protection, Inc.
Chemical Analysis
(Active Ingredient Content)
Identity of the Active Ingredient* Confirmed
Content of SYN524464* 45.6% (wt/wt) or 534 g/L
Methodology Used for Characterization HPLC
The Active Ingredient content is within the FAO limits.
Physical Analysis
Appearance* pink opaque liquid
Density* 1171 g/L
Stability:
Storage Temperature <30°
Expiration date March 2009
The stability of this test substance will be determined concurrently through reanalysis of material held
in inventory under GLP conditions at Syngenta Crop Protection, Inc., Greensboro, NC
This Certificate of Analysis is summarizing data (marked with an asterisk) from a study that has been
performed in compliance with Good Laboratory Practices per 40 CFR Part 160 Raw data,
documentation, protocols, any amendments to study protocols and reports pertaining to this study are
maintained in the Syngenta Crop Protection Archives in Greensboro, NC.
Authorization'
26 Mar 2008
Dorothea Jeffery Date
Group Leader I
Analytical & Product Chemistry Department
Document 10350420.doc Certificate of Analysis
Page 1 of 1 Study T000973-08
Report Number: 11813-08 Page 14 of 14
""";
assertSimilarReadingOrder(expectedText, pdfFile);
}
/**
 * Reading order check for the single page tiltedText.pdf: for every layout parsing type in LAYOUT_PARSING_TYPES
 * the lines read from the document must resemble the expected lines below (see assertSimilarReadingOrder).
 */
@Test
public void readingOrderTestTiltedText() {
String pdfFile = "files/syngenta/CustomerFiles/SinglePages/tiltedText.pdf";
// expected page text, one entry per line, in the expected reading order
String expectedText = """
However there was no consistency in the areas affected either between sexes or at different
ages, in general other measurements for the same structures at other levels showed no
differences, all were within the historical control range of mean values and none of these
differences is considered to be related to treatment (Appendix K).
7. DISCUSSION
The purpose of this study, which was to determine the potential for developmental
neurotoxicity in the assessment and evaluation of the toxic characteristics of lambda-
cyhalothrin in rats, was successfully accomplished.
There was evidence of toxicity characterised by lower bodyweights and food consumption in
dams receiving 60 or 150 ppm lambda-cyhalothrin during gestation and also post partum in
the 150 ppm group only.
There were no treatment-related effects of administration of lambda-cyhalothrin on
reproductive parameters: there were no effects on gestation length, mean litter size or on pup
bodyweight at birth.
There was evidence of toxicity in F1 animals receiving 150 ppm. This was seen as slightly
higher pup mortality up to day 5 and lower bodyweights from day 5, reaching a maximum of
approximately 8-9% below control on day 22.
There was a small difference in the age at which male rats in the 150 ppm group reached
preputial separation, but this was too small to be of toxicological significance.
No effects were seen on motor activity or response to auditory startle.
There was no clear evidence of any effects in the learning and memory assessment in
weanling (age 21-24 days) or young adult animals (age 59-62 days). However, at day 21
swimming speeds of females receiving 150 ppm were slightly slower than controls. The
difference is considered to reflect a difference in swimming performance rather than an effect
on learning or memory.
No neuropathological effect of treatment with lambda-cyhalothrin was detected from a
detailed microscopic examination of the selected F1 animals post mortem on day 12 or 63.
LAMBDA-CYHALOTHRIN: DEVELOPMENTAL NEUROTOXICITY STUDY IN RATS
CTL/RR0969/REGULATORY/REPORT - 34
""";
assertSimilarReadingOrder(expectedText, pdfFile);
}
/**
 * Reading order check for the single page 402StudyPage5.pdf: for every layout parsing type in
 * LAYOUT_PARSING_TYPES the lines read from the document must resemble the expected lines below
 * (see assertSimilarReadingOrder).
 */
@Test
public void readingOrderTest402Study() {
String pdfFile = "files/SinglePages/402StudyPage5.pdf";
// expected page text, one entry per line; note that several lines occur more than once in this text
String expectedText = """
2.0 INTRODUCTION
2.1 Purpose
Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor
invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et
accusam et justo duo dolores et ea rebum.
Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem
ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt
ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et
justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est
Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed
diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam
voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd
gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.
2.2 Guidelines
Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor
invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua.
At vero eos et accusam et justo duo dolores et ea rebum.
Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem
ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt
ut labore et dolore magna aliquyam erat, sed diam voluptua.
At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no
sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet,
consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore
magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et
ea rebum.
Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.
Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie consequat, vel
illum dolore eu feugiat nulla facilisis at vero eros et accumsan et iusto odio dignissim qui
blandit praesent luptatum zzril delenit augue duis dolore te feugait nulla facilisi. Lorem
ipsum dolor sit amet.
2.3 Test Facility
Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor
invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et
accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata
sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur
sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna
aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea
rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.
Report Number: 20/080-002P 5
""";
assertSimilarReadingOrder(expectedText, pdfFile);
}
/**
 * Reading order check for the single page 402StudyPage5_rotated.pdf: for every layout parsing type in
 * LAYOUT_PARSING_TYPES the lines read from the document must resemble the expected lines below
 * (see assertSimilarReadingOrder).
 */
@Test
public void readingOrderTest402StudyRotated() {
String pdfFile = "files/SinglePages/402StudyPage5_rotated.pdf";
// NOTE(review): this expected text looks identical to the one in readingOrderTest402Study - if that is
// intended, a shared constant would avoid the duplication; verify before merging the two literals.
String expectedText = """
2.0 INTRODUCTION
2.1 Purpose
Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor
invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et
accusam et justo duo dolores et ea rebum.
Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem
ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt
ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et
justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est
Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed
diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam
voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd
gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.
2.2 Guidelines
Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor
invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua.
At vero eos et accusam et justo duo dolores et ea rebum.
Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem
ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt
ut labore et dolore magna aliquyam erat, sed diam voluptua.
At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no
sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet,
consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore
magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et
ea rebum.
Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.
Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie consequat, vel
illum dolore eu feugiat nulla facilisis at vero eros et accumsan et iusto odio dignissim qui
blandit praesent luptatum zzril delenit augue duis dolore te feugait nulla facilisi. Lorem
ipsum dolor sit amet.
2.3 Test Facility
Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor
invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et
accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata
sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur
sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna
aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea
rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.
Report Number: 20/080-002P 5
""";
assertSimilarReadingOrder(expectedText, pdfFile);
}
/**
 * Parses {@code pdfFile} with every layout parsing type in {@code LAYOUT_PARSING_TYPES} and asserts that the
 * lines of the resulting document graph resemble {@code expectedText} in number, content and order.
 * <p>
 * Each expected line is paired with the read line of highest {@code similarity}. The pair counts as correct when
 * both lines are equal after trimming; its offset is the distance between the two line indices.
 *
 * @param expectedText the expected page text, lines separated by {@code \n}
 * @param pdfFile      the PDF to parse, passed to {@code parseLayout}
 */
private void assertSimilarReadingOrder(String expectedText, String pdfFile) {

    List<String> expectedLines = List.of(expectedText.split("\n"));

    for (LayoutParsingType layoutParsingType : LAYOUT_PARSING_TYPES) {
        log.info("Evaluating for {}", layoutParsingType);
        ClassificationDocument classificationDocument = parseLayout(pdfFile, layoutParsingType);
        if (DRAW_DIR_ADJ_COORDS) {
            drawDirAdjCoords(pdfFile, classificationDocument, layoutParsingType);
        }
        Document document = DocumentGraphFactory.buildDocumentGraph(layoutParsingType, classificationDocument);
        List<String> readLines = getTextAsLines(document);
        readLines.forEach(log::info);

        String context = " (" + pdfFile + ", " + layoutParsingType + ")";
        // fail with a readable message instead of an IndexOutOfBoundsException further down
        assertTrue(!readLines.isEmpty(), "No lines were read" + context);

        int correctCount = 0;
        int maxLineOffset = 0;
        long totalLineOffset = 0;
        for (int i = 0; i < expectedLines.size(); i++) {
            String expectedLine = expectedLines.get(i);
            int mostSimilarLine = indexOfMostSimilarLine(expectedLine, readLines);
            String actualLine = readLines.get(mostSimilarLine);

            if (actualLine.trim().equals(expectedLine.trim())) {
                correctCount++;
                int lineOffset = Math.abs(mostSimilarLine - i);
                if (lineOffset > 0) {
                    log.info("Line {} offset by {}", actualLine, lineOffset);
                }
                totalLineOffset += lineOffset;
                maxLineOffset = Math.max(maxLineOffset, lineOffset);
            } else {
                log.error("Lines {}-{} do not match: \n Expected: {}\n Actual: {}", i, mostSimilarLine, expectedLine, actualLine);
            }
        }

        int lineCountDifference = Math.abs(expectedLines.size() - readLines.size());
        double correctLinesFactor = (double) correctCount / (double) readLines.size();
        // This is the quantity the test has always asserted on: the largest offset relative to the number of read
        // lines. It used to be called "average", which it is not.
        double normalizedMaxLineOffset = (double) maxLineOffset / (double) readLines.size();
        // True average over the correctly matched lines; logged for diagnosis only.
        double averageLineOffset = correctCount == 0 ? 0 : (double) totalLineOffset / (double) correctCount;

        log.info("Difference in number of lines: {}", lineCountDifference);
        log.info("Correct lines factor: {}", correctLinesFactor);
        log.info("Max order offset: {}, avg: {}", maxLineOffset, averageLineOffset);

        // In the rotated document one line is read as two
        assertTrue(lineCountDifference <= 1, "Number of lines differs by " + lineCountDifference + context);

        // Most of the errors come from the similarity metric finding different lines in 402 study, as the lines are too similar, or a miss classification of Footers
        assertTrue(normalizedMaxLineOffset < 1, "Max line offset " + maxLineOffset + " is too large for " + readLines.size() + " read lines" + context);
        assertTrue(correctLinesFactor > 0.9, "Correct lines factor " + correctLinesFactor + " is not above 0.9" + context);
    }
}


/**
 * Returns the index of the first line in {@code readLines} with the highest similarity to {@code expectedLine},
 * or 0 when no line has a similarity above 0.
 */
private static int indexOfMostSimilarLine(String expectedLine, List<String> readLines) {

    int mostSimilarLine = 0;
    double maxSimilarity = 0;
    for (int j = 0; j < readLines.size(); j++) {
        double similarity = similarity(expectedLine, readLines.get(j));
        if (similarity > maxSimilarity) {
            maxSimilarity = similarity;
            mostSimilarLine = j;
        }
    }
    return mostSimilarLine;
}
/**
 * Returns the text of the document as a flat list of lines: all non-empty atomic text blocks of the document's
 * text block are visited in order and the lines of each block are appended.
 *
 * @param document the document graph to read
 * @return an unmodifiable list with the string form of every line
 */
public List<String> getTextAsLines(Document document) {

    return document.getTextBlock().getAtomicTextBlocks().stream()
            .filter(block -> !block.isEmpty())
            .flatMap(block -> getLines(block).stream())
            .toList();
}
/**
 * Collects the string form of every line of the given atomic text block, from line 0 to
 * {@code numberOfLines() - 1}.
 *
 * @param atomicTextBlock the block to read
 * @return a new list with one entry per line
 */
private static List<String> getLines(AtomicTextBlock atomicTextBlock) {

    int lineCount = atomicTextBlock.numberOfLines();
    List<String> result = new ArrayList<>(lineCount);
    int index = 0;
    while (index < lineCount) {
        result.add(atomicTextBlock.getLine(index).toString());
        index++;
    }
    return result;
}
/**
 * Computes a similarity score for two strings: 1 minus the Levenshtein distance divided by the length of the
 * longer string.
 *
 * @param s1 the first string
 * @param s2 the second string
 * @return 1 for identical strings (including two empty strings), lower values the more the strings differ
 */
private static double similarity(String s1, String s2) {

    int max = Math.max(s1.length(), s2.length());
    if (max == 0) {
        // both strings are empty and therefore identical; without this guard the result would be 0/0 = NaN
        return 1;
    }
    // the default instance is shared, which avoids one allocation per compared pair of lines
    int dist = LevenshteinDistance.getDefaultInstance().apply(s1, s2);
    return 1 - (double) dist / (double) max;
}
/**
 * Debug helper: creates a new PDF with one page per classification page and draws every text sequence at its
 * dir-adjusted coordinates, together with a running number, its bounding box and the bounding box of its text
 * block. The result is saved below /tmp/READING_ORDER_TEST/ with the layout parsing type and the file name of
 * {@code filename} in its name. Only called when DRAW_DIR_ADJ_COORDS is true.
 */
@SneakyThrows
private void drawDirAdjCoords(String filename, ClassificationDocument classificationDocument, LayoutParsingType layoutParsingType) {
try (PDFDoc pdfDoc = new PDFDoc(); ElementWriter writer = new ElementWriter(); ElementBuilder builder = new ElementBuilder()) {
// font is only used to measure string widths, helvetica is the font the text is written with
Standard14EmbeddableFont font = Standard14EmbeddableFont.helvetica();
Font helvetica = Font.create(pdfDoc, Font.e_helvetica);
for (ClassificationPage classificationDocumentPage : classificationDocument.getPages()) {
// running number of the sequences on this page; restarts at 0 for every page
int count = 0;
Page page = pdfDoc.pageCreate();
writer.begin(page);
for (AbstractPageBlock abstractBlock : classificationDocumentPage.getTextBlocks()) {
if (abstractBlock instanceof TextPageBlock textBlock) {
// sequences are sorted per text block, so the numbers show the order produced by mergeAndSort within a block
for (TextPositionSequence sequence : TextPositionOperations.mergeAndSort(List.of(textBlock))) {
float stringWidth;
try {
stringWidth = font.getStringWidth(sequence.toString());
} catch (Exception e) {
// fall back to an estimate based on the average font width when the width cannot be determined
stringWidth = font.getFont().getAverageFontWidth() * sequence.toString().length();
}
// scale the font so that the drawn string is as wide as the sequence's bounding box
// NOTE(review): the factor 1000 presumably converts from 1/1000 text space units - confirm
double fontSize = (sequence.getBBoxDirAdj().getWidth() / stringWidth) * 1000;
// NOTE(review): the Y value is mirrored at the crop box height, presumably to convert from a top-left origin
// to the bottom-left origin of the PDF page - confirm
try (Matrix2D textMatrix = new Matrix2D(1,
0,
0,
1,
sequence.getXDirAdj(),
page.getCropBox().getHeight() - sequence.getYDirAdj() - sequence.getHeightDirAdj())) {
writeText(sequence.toString(), textMatrix, builder, helvetica, fontSize, writer, Color.BLACK);
// the running number is written in red to the left of the sequence, shifted further left the more digits it has
writeText(String.valueOf(count), textMatrix.translate(-(2 + (5 * String.valueOf(count).length())), 0), builder, helvetica, 8, writer, Color.RED);
count++;
}
writeBBox(sequence.getBBoxDirAdj(), builder, page, writer, Color.BLACK);
}
writeBBox(textBlock.getBBoxDirAdj(), builder, page, writer, Color.BLUE);
}
}
writer.end();
pdfDoc.pagePushBack(page);
}
Path stem = Path.of("/tmp/READING_ORDER_TEST/");
Files.createDirectories(stem);
// the target name is built by string concatenation: resolved file path + "_dirAdjCoordinates.pdf"
try (var out = new FileOutputStream(stem.resolve(layoutParsingType.name() + "_" + Path.of(filename).getFileName()).toFile() + "_dirAdjCoordinates.pdf")) {
pdfDoc.save(out, SDFDoc.SaveMode.LINEARIZED, null);
}
}
}
/**
 * Strokes the outline of a rectangle onto the page that is currently being written.
 * The y coordinate is measured from the top of the page's crop box and the height is negated,
 * so the rectangle extends downwards from the flipped y position.
 *
 * @param r       the rectangle to draw
 * @param builder element builder used to create the path element
 * @param page    page whose crop box height is used to flip the y coordinate
 * @param writer  writer the element is written to
 * @param color   stroke color; its RGB components are used
 * @throws PDFNetException if a PDFTron call fails
 */
private static void writeBBox(Rectangle2D r, ElementBuilder builder, Page page, ElementWriter writer, Color color) throws PDFNetException {

    double flippedY = page.getCropBox().getHeight() - r.getY();
    Element outline = builder.createRect(r.getX(), flippedY, r.getWidth(), -r.getHeight());

    float[] rgb = color.getColorComponents(null);
    outline.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
    try (ColorPt strokeColor = new ColorPt(rgb[0], rgb[1], rgb[2])) {
        outline.getGState().setStrokeColor(strokeColor);
    }

    outline.setPathStroke(true);
    writer.writeElement(outline);
}
/**
 * Writes a single filled text run to the given writer as a text-begin / text-run / text-end
 * element triple.
 *
 * @param string    the text to draw
 * @param matrix2D  text matrix set on the text-begin element
 * @param builder   element builder used to create the text elements
 * @param helvetica font used for the text
 * @param fontSize  font size used for the text
 * @param writer    writer the elements are written to
 * @param color     fill color; its RGB components are used
 * @throws PDFNetException if a PDFTron call fails
 */
private static void writeText(String string,
                              Matrix2D matrix2D,
                              ElementBuilder builder,
                              Font helvetica,
                              double fontSize,
                              ElementWriter writer,
                              Color color) throws PDFNetException {

    Element textBegin = builder.createTextBegin(helvetica, fontSize);
    textBegin.getGState().setFillColorSpace(ColorSpace.createDeviceRGB());
    float[] rgb = color.getColorComponents(null);
    try (ColorPt fillColor = new ColorPt(rgb[0], rgb[1], rgb[2])) {
        textBegin.getGState().setFillColor(fillColor);
    }
    textBegin.setTextMatrix(matrix2D);
    textBegin.getGState().setTextRenderMode(GState.e_fill_text);
    writer.writeElement(textBegin);

    writer.writeElement(builder.createTextRun(string));
    writer.writeElement(builder.createTextEnd());
}
}

View File

@ -1,60 +0,0 @@
package com.knecon.fforesight.service.layoutparser.server.model;
import static org.assertj.core.api.Assertions.assertThat;
import java.io.ByteArrayInputStream;
import java.nio.charset.StandardCharsets;
import org.apache.pdfbox.util.Matrix;
import org.junit.jupiter.api.Test;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.iqser.red.storage.commons.properties.StorageProperties;
import com.iqser.red.storage.commons.service.ObjectSerializer;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import lombok.SneakyThrows;
/**
 * Checks that a {@link TextPositionSequence} can be deserialized from JSON via the
 * {@link ObjectSerializer} and that the deserialized properties match the JSON input.
 */
public class TextPositionSequenceTest {

    private static final String TEXT_POSITION_SEQUENCE_AS_JSON = "{\n" //
            + " \"page\": 1,\n" //
            + " \"textPositions\": [],\n" //
            + " \"dir\": 180.0,\n" //
            + " \"rotation\": 0,\n" //
            + " \"pageHeight\": 800,\n" //
            + " \"pageWidth\": 600\n" //
            + "}";

    private final ObjectSerializer objectSerializer = new ObjectSerializer(new ObjectMapper());


    @Test
    @SneakyThrows
    public void testDeserializationWithJackson() {

        TextPositionSequence textPositionSequence = objectSerializer.deserialize(new ByteArrayInputStream(TEXT_POSITION_SEQUENCE_AS_JSON.getBytes(StandardCharsets.UTF_8)),
                                                                                 TextPositionSequence.class);
        assertPropertiesAfterJsonDeserialization(textPositionSequence);
    }


    /**
     * Asserts that the sequence carries exactly the values of {@link #TEXT_POSITION_SEQUENCE_AS_JSON};
     * the JSON direction value 180.0 is expected to map to {@link TextDirection#HALF_CIRCLE}.
     */
    private void assertPropertiesAfterJsonDeserialization(TextPositionSequence textPositionSequence) {

        assertThat(textPositionSequence.getPage()).isEqualTo(1);
        assertThat(textPositionSequence.getTextPositions()).isEmpty();
        assertThat(textPositionSequence.getDir()).isEqualTo(TextDirection.HALF_CIRCLE);
        assertThat(textPositionSequence.getRotation()).isEqualTo(0);
        assertThat(textPositionSequence.getPageHeight()).isEqualTo(800f);
        assertThat(textPositionSequence.getPageWidth()).isEqualTo(600f);
    }

}

View File

@ -3,12 +3,10 @@ package com.knecon.fforesight.service.layoutparser.server.services;
import java.nio.file.Path;
import java.util.List;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
@ -29,7 +27,7 @@ class PageContentExtractorTest {
textPositionPerPage.stream()
.map(t -> t.getSortedTextPositionSequences()
.stream()
.map(TextPositionSequence::getBBoxInitialUserSpace)
.map(TextPositionSequence::getBBoxPdf)
.map(List::of)
.toList())
.toList(), tmpFileName);

View File

@ -1,6 +1,8 @@
package com.knecon.fforesight.service.layoutparser.server.utils;
import java.awt.geom.Rectangle2D;
import java.io.File;
import java.io.FileOutputStream;
import java.nio.file.Path;
import java.util.Map;
@ -10,11 +12,27 @@ import org.springframework.core.io.ClassPathResource;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse;
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
import com.knecon.fforesight.service.viewerdoc.model.Standard14EmbeddableFont;
import com.pdftron.common.Matrix2D;
import com.pdftron.pdf.ColorPt;
import com.pdftron.pdf.ColorSpace;
import com.pdftron.pdf.Element;
import com.pdftron.pdf.ElementBuilder;
import com.pdftron.pdf.ElementWriter;
import com.pdftron.pdf.Font;
import com.pdftron.pdf.GState;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.Page;
import com.pdftron.sdf.SDFDoc;
import lombok.SneakyThrows;
@ -48,14 +66,14 @@ public abstract class BuildDocumentTest extends AbstractTest {
@SneakyThrows
protected Document buildGraph(String filename, LayoutParsingType layoutParsingType) {
if (!filename.startsWith("files") && filename.startsWith("/")) {
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(Path.of(filename).getFileName().toString(), LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH, true);
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(Path.of(filename).getFileName().toString(), layoutParsingType, true);
prepareStorage(layoutParsingRequest, new File(filename));
return DocumentGraphFactory.buildDocumentGraph(layoutParsingType,
layoutParsingPipeline.parseLayout(layoutParsingType,
new File(filename),
layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId().get()),
layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId()
.get()),
new TableServiceResponse(),
new VisualLayoutParsingResponse(),
layoutParsingRequest.identifier()));
@ -65,10 +83,12 @@ public abstract class BuildDocumentTest extends AbstractTest {
} else {
prepareStorage(filename);
}
return DocumentGraphFactory.buildDocumentGraph(layoutParsingType, parseLayout(filename, layoutParsingType));
var classificationDocument = parseLayout(filename, layoutParsingType);
return DocumentGraphFactory.buildDocumentGraph(layoutParsingType, classificationDocument);
}
}
}

@ -1 +1 @@
Subproject commit c6fd9e849f3efd7d1507401f63629b91dec9f4ec
Subproject commit 0da08b1d9d1bc815a3fb51aa9638eafea2cf02d5

View File

@ -12,7 +12,7 @@ dependencies {
implementation("org.apache.pdfbox:pdfbox-tools:${pdfBoxVersion}")
implementation("org.slf4j:slf4j-api:1.7.25")
implementation("com.knecon.fforesight:tracing-commons:0.5.0")
implementation("com.pdftron:PDFNet:10.7.0")
implementation("com.pdftron:PDFNet:10.11.0")
testImplementation("org.apache.logging.log4j:log4j-slf4j-impl:2.22.1")
testImplementation("org.junit.jupiter:junit-jupiter")

View File

@ -10,12 +10,14 @@ import org.apache.pdfbox.pdmodel.font.Standard14Fonts;
import com.pdftron.pdf.Font;
import com.pdftron.pdf.PDFDoc;
import lombok.Getter;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
@RequiredArgsConstructor
public class Standard14EmbeddableFont implements EmbeddableFont {
@Getter
private final PDType1Font font;
private final int pdfTronIdentifier;

View File

@ -96,15 +96,18 @@ public class PDFTronViewerDocumentService {
boolean isCurrentVersion = ViewerDocVersioningUtility.docIsCurrentVersion(pdfDoc);
int pageNumber = 1;
for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); pageNumber++) {
try (PageIterator iterator = pdfDoc.getPageIterator()) {
while (iterator.hasNext()) {
Page page = iterator.next();
Page page = iterator.next();
if (isCurrentVersion) {
pageContentCleaner.removeMarkedContent(page);
if (isCurrentVersion) {
pageContentCleaner.removeMarkedContent(page);
}
visualizationWriter.drawVisualizationsOnPage(pageNumber, page);
pageNumber++;
}
visualizationWriter.drawVisualizationsOnPage(pageNumber, page);
}
ViewerDocVersioningUtility.setVersionInDocument(pdfDoc);

View File

@ -343,12 +343,7 @@ public class VisualizationWriter {
/**
 * Builds a quadrant-rotation {@link AffineTransform} derived from the rotation of the given page.
 * <p>
 * NOTE(review): this span contains two return statements — one mapping the rotation values
 * 90/180/270 to 3/2/1 quadrants (anything else to 0), and one passing {@code page.getRotation()}
 * through as the number of quadrants. Only the first can ever execute; confirm which variant is
 * intended and what unit {@code page.getRotation()} reports.
 *
 * @param page the page whose rotation determines the transform
 * @return the rotation transform for the page
 */
@SneakyThrows
private static AffineTransform getTextDeRotationTransform(Page page) {
return AffineTransform.getQuadrantRotateInstance(switch (page.getRotation()) {
case 90 -> 3;
case 180 -> 2;
case 270 -> 1;
default -> 0;
});
return AffineTransform.getQuadrantRotateInstance(page.getRotation());
}
}

View File

@ -61,11 +61,12 @@ class PageContentCleanerTest {
.markedContentToRemove(Set.of(LayerIdentifier.KNECON_OCR_DEBUG.markedContentName()))
.build();
for (PageIterator iterator = doc.getPageIterator(); iterator.hasNext(); ) {
try (PageIterator iterator = doc.getPageIterator()) {
while (iterator.hasNext()) {
Page page = iterator.next();
Page page = iterator.next();
pageContentCleaner.removeMarkedContent(page);
pageContentCleaner.removeMarkedContent(page);
}
}
doc.save(out, SDFDoc.SaveMode.REMOVE_UNUSED, null);