hotfix reading order
This commit is contained in:
parent
b900cfaf31
commit
69bcd4f68d
@ -42,6 +42,15 @@ tasks.jacocoTestReport {
|
||||
}
|
||||
|
||||
allprojects {
|
||||
|
||||
tasks.withType<Javadoc> {
|
||||
options {
|
||||
this as StandardJavadocDocletOptions
|
||||
addBooleanOption("Xdoclint:none", true)
|
||||
addStringOption("Xmaxwarns", "1")
|
||||
}
|
||||
}
|
||||
|
||||
publishing {
|
||||
publications {
|
||||
create<MavenPublication>(name) {
|
||||
@ -64,6 +73,7 @@ java {
|
||||
withJavadocJar()
|
||||
}
|
||||
|
||||
|
||||
repositories {
|
||||
mavenLocal()
|
||||
mavenCentral()
|
||||
|
||||
@ -19,6 +19,7 @@ public class SimplifiedText {
|
||||
@Schema(description = "Number of pages in the entire document.")
|
||||
private int numberOfPages;
|
||||
@Schema(description = "A List of simplified Sections, which contains almost exclusively text.")
|
||||
@Builder.Default
|
||||
private List<SimplifiedSectionText> sectionTexts = new ArrayList<>();
|
||||
|
||||
}
|
||||
|
||||
@ -28,4 +28,6 @@ dependencies {
|
||||
implementation("org.tinspin:tinspin-indexes:2.1.3")
|
||||
implementation("org.commonmark:commonmark:0.22.0")
|
||||
implementation("org.commonmark:commonmark-ext-gfm-tables:0.22.0")
|
||||
implementation("com.pdftron:PDFNet:10.11.0")
|
||||
|
||||
}
|
||||
|
||||
@ -69,6 +69,7 @@ import com.knecon.fforesight.service.layoutparser.processor.services.parsing.PDF
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionOperations;
|
||||
|
||||
import io.micrometer.observation.Observation;
|
||||
import io.micrometer.observation.ObservationRegistry;
|
||||
@ -117,14 +118,18 @@ public class LayoutParsingPipeline {
|
||||
log.info("Starting layout parsing for {}", layoutParsingRequest.identifier());
|
||||
|
||||
File originFile = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId());
|
||||
File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId()).orElse(originFile);
|
||||
File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId())
|
||||
.orElse(originFile);
|
||||
|
||||
VisualLayoutParsingResponse visualLayoutParsingResponse = layoutParsingRequest.visualLayoutParsingFileId()
|
||||
.map(layoutParsingStorageService::getVisualLayoutParsingFile).orElse(new VisualLayoutParsingResponse());
|
||||
.map(layoutParsingStorageService::getVisualLayoutParsingFile)
|
||||
.orElse(new VisualLayoutParsingResponse());
|
||||
ImageServiceResponse imageServiceResponse = layoutParsingRequest.imagesFileStorageId()
|
||||
.map(layoutParsingStorageService::getImagesFile).orElse(new ImageServiceResponse());
|
||||
.map(layoutParsingStorageService::getImagesFile)
|
||||
.orElse(new ImageServiceResponse());
|
||||
TableServiceResponse tableServiceResponse = layoutParsingRequest.tablesFileStorageId()
|
||||
.map(layoutParsingStorageService::getTablesFile).orElse(new TableServiceResponse());
|
||||
.map(layoutParsingStorageService::getTablesFile)
|
||||
.orElse(new TableServiceResponse());
|
||||
|
||||
ClassificationDocument classificationDocument = parseLayout(settings.getLayoutParsingTypeOverride() == null //
|
||||
? layoutParsingRequest.layoutParsingType() : settings.getLayoutParsingTypeOverride(),
|
||||
@ -137,8 +142,7 @@ public class LayoutParsingPipeline {
|
||||
log.info("Building document graph for {}", layoutParsingRequest.identifier());
|
||||
|
||||
Document documentGraph = observeBuildDocumentGraph(settings.getLayoutParsingTypeOverride() == null //
|
||||
? layoutParsingRequest.layoutParsingType() : settings.getLayoutParsingTypeOverride(),
|
||||
classificationDocument);
|
||||
? layoutParsingRequest.layoutParsingType() : settings.getLayoutParsingTypeOverride(), classificationDocument);
|
||||
|
||||
log.info("Creating viewer document for {}", layoutParsingRequest.identifier());
|
||||
|
||||
@ -147,7 +151,7 @@ public class LayoutParsingPipeline {
|
||||
log.info("Storing resulting files for {}", layoutParsingRequest.identifier());
|
||||
|
||||
layoutParsingStorageService.storeDocumentData(layoutParsingRequest, DocumentDataMapper.toDocumentData(documentGraph));
|
||||
if(layoutParsingRequest.documentMarkdownFileStorageId().isPresent()) {
|
||||
if (layoutParsingRequest.documentMarkdownFileStorageId().isPresent()) {
|
||||
layoutParsingStorageService.storeMarkdownFile(layoutParsingRequest.documentMarkdownFileStorageId().get(), new MarkdownMapper().toMarkdownContent(documentGraph));
|
||||
}
|
||||
layoutParsingStorageService.storeSimplifiedText(layoutParsingRequest, simplifiedSectionTextService.toSimplifiedText(documentGraph));
|
||||
@ -271,11 +275,11 @@ public class LayoutParsingPipeline {
|
||||
stripper.setStartPage(pageNumber);
|
||||
stripper.setEndPage(pageNumber);
|
||||
stripper.setPdpage(pdPage);
|
||||
if (layoutParsingType.equals(LayoutParsingType.DOCUMINE_OLD)) {
|
||||
stripper.setSortByPosition(true);
|
||||
}
|
||||
stripper.getText(originDocument);
|
||||
List<TextPositionSequence> words = stripper.getTextPositionSequences();
|
||||
if (layoutParsingType.equals(LayoutParsingType.DOCUMINE_OLD)) {
|
||||
words = TextPositionOperations.sort(words);
|
||||
}
|
||||
classificationDocument.getLayoutDebugLayer().addTextVisualizations(words, pageNumber);
|
||||
|
||||
PDRectangle pdr = pdPage.getMediaBox();
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.docstrum;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.EnumMap;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
@ -26,6 +27,7 @@ import lombok.RequiredArgsConstructor;
|
||||
@RequiredArgsConstructor
|
||||
public class DocstrumSegmentationService {
|
||||
|
||||
public static final double SAME_DIRECTION_THRESHOLD = 0.9;
|
||||
private final NearestNeighbourService nearestNeighbourService;
|
||||
private final SpacingService spacingService;
|
||||
private final LineBuilderService lineBuilderService;
|
||||
@ -35,13 +37,44 @@ public class DocstrumSegmentationService {
|
||||
|
||||
public List<Zone> segmentPage(List<TextPositionSequence> textPositions, boolean xyOrder, CleanRulings usedRulings, LayoutDebugLayer visualizations) {
|
||||
|
||||
List<Zone> zones = new ArrayList<>();
|
||||
zones.addAll(computeZones(textPositions, usedRulings, visualizations, TextDirection.ZERO));
|
||||
zones.addAll(computeZones(textPositions, usedRulings, visualizations, TextDirection.QUARTER_CIRCLE));
|
||||
zones.addAll(computeZones(textPositions, usedRulings, visualizations, TextDirection.HALF_CIRCLE));
|
||||
zones.addAll(computeZones(textPositions, usedRulings, visualizations, TextDirection.THREE_QUARTER_CIRCLE));
|
||||
EnumMap<TextDirection, Integer> directionCounts = new EnumMap<>(TextDirection.class);
|
||||
|
||||
return readingOrderService.resolve(zones, xyOrder);
|
||||
List<Zone> newZones = computeZones(textPositions, usedRulings, visualizations, TextDirection.ZERO);
|
||||
directionCounts.put(TextDirection.ZERO, newZones.size());
|
||||
List<Zone> zones = new ArrayList<>(newZones);
|
||||
|
||||
newZones = computeZones(textPositions, usedRulings, visualizations, TextDirection.QUARTER_CIRCLE);
|
||||
directionCounts.put(TextDirection.QUARTER_CIRCLE, newZones.size());
|
||||
zones.addAll(newZones);
|
||||
|
||||
newZones = computeZones(textPositions, usedRulings, visualizations, TextDirection.HALF_CIRCLE);
|
||||
directionCounts.put(TextDirection.HALF_CIRCLE, newZones.size());
|
||||
zones.addAll(newZones);
|
||||
|
||||
newZones = computeZones(textPositions, usedRulings, visualizations, TextDirection.THREE_QUARTER_CIRCLE);
|
||||
directionCounts.put(TextDirection.THREE_QUARTER_CIRCLE, newZones.size());
|
||||
zones.addAll(newZones);
|
||||
|
||||
return readingOrderService.resolve(zones, xyOrder, mostSameDirection(directionCounts));
|
||||
}
|
||||
|
||||
|
||||
private boolean mostSameDirection(EnumMap<TextDirection, Integer> directionCounts) {
|
||||
|
||||
int total = directionCounts.values()
|
||||
.stream()
|
||||
.mapToInt(i -> i).sum();
|
||||
|
||||
if ((double) directionCounts.get(TextDirection.ZERO) / total > SAME_DIRECTION_THRESHOLD) {
|
||||
return true;
|
||||
} else if ((double) directionCounts.get(TextDirection.QUARTER_CIRCLE) / total > SAME_DIRECTION_THRESHOLD) {
|
||||
return true;
|
||||
} else if ((double) directionCounts.get(TextDirection.HALF_CIRCLE) / total > SAME_DIRECTION_THRESHOLD) {
|
||||
return true;
|
||||
} else if ((double) directionCounts.get(TextDirection.THREE_QUARTER_CIRCLE) / total > SAME_DIRECTION_THRESHOLD) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -15,10 +15,16 @@ public class AngleFilter {
|
||||
|
||||
public boolean matches(Neighbor neighbor) {
|
||||
|
||||
return matches(neighbor.getAngle());
|
||||
}
|
||||
|
||||
|
||||
public boolean matches(double angle) {
|
||||
|
||||
if (lowerAngle <= upperAngle) {
|
||||
return lowerAngle <= neighbor.getAngle() && neighbor.getAngle() < upperAngle;
|
||||
return lowerAngle <= angle && angle < upperAngle;
|
||||
} else {
|
||||
return lowerAngle <= neighbor.getAngle() || neighbor.getAngle() < upperAngle;
|
||||
return lowerAngle <= angle || angle < upperAngle;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -7,8 +7,12 @@ import java.util.List;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.experimental.SuperBuilder;
|
||||
|
||||
@Data
|
||||
@SuperBuilder
|
||||
@NoArgsConstructor
|
||||
public abstract class BoundingBox {
|
||||
|
||||
// Java coordinate system: (0, 0) is always upper left, x is increasing left to right and y is increasing from top to bottom.
|
||||
@ -19,7 +23,7 @@ public abstract class BoundingBox {
|
||||
// This rotates completely in 90 degree steps with page rotation.
|
||||
// Needs to be used when writing to a PDF.
|
||||
// Also, these are definitely correct and should be used whenever possible.
|
||||
protected Rectangle2D bBoxInitialUserSpace;
|
||||
protected Rectangle2D bBoxPdf;
|
||||
|
||||
protected static final float VERTICAL_COMPARISON_THRESHOLD = 0.4f;
|
||||
|
||||
@ -50,25 +54,25 @@ public abstract class BoundingBox {
|
||||
|
||||
public double getPdfMinX() {
|
||||
|
||||
return bBoxInitialUserSpace.getMinX();
|
||||
return bBoxPdf.getMinX();
|
||||
}
|
||||
|
||||
|
||||
public double getPdfMaxX() {
|
||||
|
||||
return bBoxInitialUserSpace.getMaxX();
|
||||
return bBoxPdf.getMaxX();
|
||||
}
|
||||
|
||||
|
||||
public double getPdfMinY() {
|
||||
|
||||
return bBoxInitialUserSpace.getMinY();
|
||||
return bBoxPdf.getMinY();
|
||||
}
|
||||
|
||||
|
||||
public double getPdfMaxY() {
|
||||
|
||||
return bBoxInitialUserSpace.getMaxY();
|
||||
return bBoxPdf.getMaxY();
|
||||
}
|
||||
|
||||
|
||||
@ -129,13 +133,31 @@ public abstract class BoundingBox {
|
||||
}
|
||||
|
||||
|
||||
public boolean intersectsY(BoundingBox other) {
|
||||
private boolean intersectsX(BoundingBox other, float threshold) {
|
||||
|
||||
return this.getX() - threshold <= other.getMaxX() && this.getMaxX() + threshold >= other.getX();
|
||||
}
|
||||
|
||||
|
||||
public boolean intersectsPdf(BoundingBox other) {
|
||||
|
||||
return this.intersectsXPdf(other) && this.intersectsYPdf(other);
|
||||
}
|
||||
|
||||
|
||||
public boolean intersectsPdf(BoundingBox other, float yThreshold, float xThreshold) {
|
||||
|
||||
return this.intersectsXPdf(other, xThreshold) && this.intersectsYPdf(other, yThreshold);
|
||||
}
|
||||
|
||||
|
||||
public boolean intersectsYPdf(BoundingBox other) {
|
||||
|
||||
return this.getPdfMinY() <= other.getPdfMaxY() && this.getPdfMaxY() >= other.getPdfMinY();
|
||||
}
|
||||
|
||||
|
||||
public boolean intersectsYJava(BoundingBox other) {
|
||||
public boolean intersectsY(BoundingBox other) {
|
||||
|
||||
return this.getY() <= other.getMaxY() && this.getMaxY() >= other.getY();
|
||||
}
|
||||
@ -143,25 +165,31 @@ public abstract class BoundingBox {
|
||||
|
||||
public boolean intersectsY(BoundingBox other, float threshold) {
|
||||
|
||||
return this.getY() - threshold <= other.getMaxY() && this.getMaxY() + threshold >= other.getY();
|
||||
}
|
||||
|
||||
|
||||
public boolean intersectsYPdf(BoundingBox other, float threshold) {
|
||||
|
||||
return this.getPdfMinY() - threshold <= other.getPdfMaxY() && this.getPdfMaxY() + threshold >= other.getPdfMinY();
|
||||
}
|
||||
|
||||
|
||||
public boolean intersectsX(BoundingBox other) {
|
||||
public boolean intersectsXPdf(BoundingBox other) {
|
||||
|
||||
return this.getPdfMinX() <= other.getPdfMaxX() && this.getPdfMaxX() >= other.getPdfMinX();
|
||||
}
|
||||
|
||||
|
||||
public boolean intersectsXJava(BoundingBox other) {
|
||||
public boolean intersectsX(BoundingBox other) {
|
||||
|
||||
return this.getX() <= other.getMaxX() && this.getMaxX() >= other.getMinX();
|
||||
}
|
||||
|
||||
|
||||
public boolean intersectsX(BoundingBox other, float threshold) {
|
||||
public boolean intersectsXPdf(BoundingBox other, float threshold) {
|
||||
|
||||
return this.getPdfMinX() - threshold <= other.getMaxX() && this.getMaxX() + threshold >= other.getPdfMinX();
|
||||
return this.getPdfMinX() - threshold <= other.getPdfMaxX() && this.getMaxX() + threshold >= other.getPdfMinX();
|
||||
}
|
||||
|
||||
|
||||
@ -170,8 +198,8 @@ public abstract class BoundingBox {
|
||||
this.bBox = components.stream()
|
||||
.map(BoundingBox::getBBox)
|
||||
.collect(RectangleTransformations.collectBBox());
|
||||
this.bBoxInitialUserSpace = components.stream()
|
||||
.map(BoundingBox::getBBoxInitialUserSpace)
|
||||
this.bBoxPdf = components.stream()
|
||||
.map(BoundingBox::getBBoxPdf)
|
||||
.collect(RectangleTransformations.collectBBox());
|
||||
}
|
||||
|
||||
@ -229,25 +257,25 @@ public abstract class BoundingBox {
|
||||
|
||||
public boolean rightOf(BoundingBox other) {
|
||||
|
||||
return this.intersectsYJava(other) && other.getMaxX() <= this.getMinX();
|
||||
return this.intersectsY(other) && other.getMaxX() <= this.getMinX();
|
||||
}
|
||||
|
||||
|
||||
public boolean leftOf(BoundingBox other) {
|
||||
|
||||
return this.intersectsYJava(other) && other.getMinX() >= this.getMaxX();
|
||||
return this.intersectsY(other) && other.getMinX() >= this.getMaxX();
|
||||
}
|
||||
|
||||
|
||||
public boolean isAbove(BoundingBox other) {
|
||||
|
||||
return this.intersectsXJava(other) && other.getMinY() >= this.getMaxY();
|
||||
return this.intersectsX(other) && other.getMinY() >= this.getMaxY();
|
||||
}
|
||||
|
||||
|
||||
public boolean isBelow(BoundingBox other) {
|
||||
|
||||
return this.intersectsXJava(other) && this.getMinY() >= other.getMaxY();
|
||||
return this.intersectsX(other) && this.getMinY() >= other.getMaxY();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -35,7 +35,7 @@ public class Character {
|
||||
|
||||
public double getHeight() {
|
||||
|
||||
return textPosition.getHeightDir();
|
||||
return textPosition.getHeightDirAdj();
|
||||
}
|
||||
|
||||
|
||||
@ -65,9 +65,9 @@ public class Character {
|
||||
double s = Math.sin(-0);
|
||||
double c = Math.cos(-0);
|
||||
xs[0] = c * x - s * y;
|
||||
xs[1] = c * (x + textPosition.getWidthDirAdj()) - s * (y + textPosition.getHeightDir());
|
||||
xs[1] = c * (x + textPosition.getWidthDirAdj()) - s * (y + textPosition.getHeightDirAdj());
|
||||
xs[2] = c * other.x - s * other.y;
|
||||
xs[3] = c * (other.x + other.textPosition.getWidthDirAdj()) - s * (other.y + other.textPosition.getHeightDir());
|
||||
xs[3] = c * (other.x + other.textPosition.getWidthDirAdj()) - s * (other.y + other.textPosition.getHeightDirAdj());
|
||||
boolean overlapping = xs[1] >= xs[2] && xs[3] >= xs[0];
|
||||
Arrays.sort(xs);
|
||||
return Math.abs(xs[2] - xs[1]) * (overlapping ? 1 : -1);
|
||||
|
||||
@ -1,18 +1,28 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;
|
||||
|
||||
import static com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence.BOLD;
|
||||
import static com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence.BOLD_ITALIC;
|
||||
import static com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence.ITALIC;
|
||||
import static com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence.STANDARD;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Comparator;
|
||||
import java.util.EnumMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.FastAtan2;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.FontStyle;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
|
||||
@Data
|
||||
@EqualsAndHashCode(onlyExplicitlyIncluded = true)
|
||||
public class Line extends BoundingBox {
|
||||
@EqualsAndHashCode(onlyExplicitlyIncluded = true, callSuper = false)
|
||||
public class Line extends TextBoundingBox {
|
||||
|
||||
private static final double WORD_DISTANCE_MULTIPLIER = 0.18;
|
||||
|
||||
@ -28,6 +38,8 @@ public class Line extends BoundingBox {
|
||||
|
||||
private final double height;
|
||||
|
||||
private FontStyle fontStyle;
|
||||
|
||||
private final List<Character> characters;
|
||||
private final List<TextPositionSequence> words = new ArrayList<>();
|
||||
|
||||
@ -67,6 +79,29 @@ public class Line extends BoundingBox {
|
||||
height = computeHeight();
|
||||
computeWords(wordSpacing * WORD_DISTANCE_MULTIPLIER);
|
||||
buildBBox();
|
||||
computeFontStyle();
|
||||
}
|
||||
|
||||
|
||||
private void computeFontStyle() {
|
||||
|
||||
EnumMap<FontStyle, AtomicInteger> fontStyleCounter = new EnumMap<>(FontStyle.class);
|
||||
for (FontStyle fontStyle : FontStyle.values()) {
|
||||
fontStyleCounter.put(fontStyle, new AtomicInteger(0));
|
||||
}
|
||||
for (TextPositionSequence word : words) {
|
||||
switch (word.getFontStyle()) {
|
||||
case STANDARD -> fontStyleCounter.get(FontStyle.REGULAR).getAndIncrement();
|
||||
case BOLD -> fontStyleCounter.get(FontStyle.BOLD).getAndIncrement();
|
||||
case ITALIC -> fontStyleCounter.get(FontStyle.ITALIC).getAndIncrement();
|
||||
case BOLD_ITALIC -> fontStyleCounter.get(FontStyle.BOLD_ITALIC).getAndIncrement();
|
||||
}
|
||||
}
|
||||
fontStyle = fontStyleCounter.entrySet()
|
||||
.stream()
|
||||
.max(Comparator.comparing(entry -> entry.getValue().get()))
|
||||
.map(Map.Entry::getKey)
|
||||
.orElse(FontStyle.REGULAR);
|
||||
}
|
||||
|
||||
|
||||
@ -144,8 +179,8 @@ public class Line extends BoundingBox {
|
||||
private void buildBBox() {
|
||||
|
||||
this.setToBBoxOfComponents(characters.stream()
|
||||
.map(Character::getTextPosition)
|
||||
.toList());
|
||||
.map(Character::getTextPosition)
|
||||
.toList());
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -0,0 +1,102 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.Getter;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.Setter;
|
||||
import lombok.experimental.SuperBuilder;
|
||||
|
||||
@Getter
|
||||
@Setter
|
||||
@SuperBuilder
|
||||
@NoArgsConstructor
|
||||
@EqualsAndHashCode(callSuper = false)
|
||||
public abstract class TextBoundingBox extends BoundingBox {
|
||||
|
||||
protected Rectangle2D bBoxDirAdj;
|
||||
|
||||
protected TextDirection dir;
|
||||
|
||||
|
||||
@Override
|
||||
public void setToBBoxOfComponents(List<? extends BoundingBox> components) {
|
||||
|
||||
super.setToBBoxOfComponents(components);
|
||||
this.bBoxDirAdj = components.stream()
|
||||
.filter(c -> c instanceof TextBoundingBox)
|
||||
.map(c -> (TextBoundingBox) c)
|
||||
.map(TextBoundingBox::getBBoxDirAdj)
|
||||
.collect(RectangleTransformations.collectBBox());
|
||||
|
||||
Set<TextDirection> textDirections = components.stream()
|
||||
.filter(c -> c instanceof TextBoundingBox)
|
||||
.map(c -> (TextBoundingBox) c)
|
||||
.map(TextBoundingBox::getDir)
|
||||
.collect(Collectors.toSet());
|
||||
|
||||
if (textDirections.isEmpty()) {
|
||||
dir = TextDirection.ZERO;
|
||||
} else if (textDirections.size() > 1) {
|
||||
throw new IllegalArgumentException("More than one text direction found");
|
||||
} else {
|
||||
dir = textDirections.iterator().next();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public double getXDirAdj() {
|
||||
|
||||
return this.bBoxDirAdj.getX();
|
||||
}
|
||||
|
||||
|
||||
public double getYDirAdj() {
|
||||
|
||||
return this.bBoxDirAdj.getY();
|
||||
}
|
||||
|
||||
|
||||
public double getWidthDirAdj() {
|
||||
|
||||
return this.bBoxDirAdj.getWidth();
|
||||
}
|
||||
|
||||
|
||||
public double getHeightDirAdj() {
|
||||
|
||||
return this.bBoxDirAdj.getHeight();
|
||||
}
|
||||
|
||||
|
||||
public double getMaxXDirAdj() {
|
||||
|
||||
return this.bBoxDirAdj.getMaxX();
|
||||
}
|
||||
|
||||
|
||||
public double getMaxYDirAdj() {
|
||||
|
||||
return this.bBoxDirAdj.getMaxY();
|
||||
}
|
||||
|
||||
|
||||
public double getCenterYDirAdj() {
|
||||
|
||||
return this.bBoxDirAdj.getCenterY();
|
||||
}
|
||||
|
||||
|
||||
public double getCenterXDirAdj() {
|
||||
|
||||
return this.bBoxDirAdj.getCenterX();
|
||||
}
|
||||
|
||||
}
|
||||
@ -6,9 +6,11 @@ import java.util.List;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
|
||||
@Data
|
||||
public class Zone extends BoundingBox {
|
||||
@EqualsAndHashCode(callSuper = false)
|
||||
public class Zone extends TextBoundingBox {
|
||||
|
||||
private List<Line> lines;
|
||||
|
||||
|
||||
@ -1,15 +1,17 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.docstrum.service;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.ListIterator;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.TextBoundingBox;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.DoubleUtils;
|
||||
|
||||
@ -19,21 +21,30 @@ public class ReadingOrderService {
|
||||
private static final double THRESHOLD = 5;
|
||||
public static final double MULTI_COLUMN_DETECTION_THRESHOLD = 1.5;
|
||||
|
||||
private static final Comparator<TextBoundingBox> COMPARATOR = //
|
||||
Comparator.comparing(TextBoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
|
||||
.thenComparing(TextBoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD));
|
||||
|
||||
public List<Zone> resolve(List<Zone> zones, boolean xyReadingOrder) {
|
||||
private static final Comparator<TextBoundingBox> COMPARATOR_DIR_ADJ = //
|
||||
Comparator.comparing(TextBoundingBox::getYDirAdj, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
|
||||
.thenComparing(TextBoundingBox::getXDirAdj, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD));
|
||||
|
||||
|
||||
public List<Zone> resolve(List<Zone> zones, boolean xyReadingOrder, boolean useDirAdjCoords) {
|
||||
|
||||
if (zones.isEmpty() || zones.size() == 1) {
|
||||
return zones;
|
||||
}
|
||||
|
||||
if (xyReadingOrder) {
|
||||
return resolveSingleColumnReadingOrder(zones);
|
||||
return resolveSingleColumnReadingOrder(zones, useDirAdjCoords);
|
||||
}
|
||||
|
||||
Map<Long, Integer> histogram = new HashMap<>();
|
||||
for (Zone zone : zones) {
|
||||
long minY = Math.round(zone.getBBox().getMinY());
|
||||
long maxY = Math.round(zone.getBBox().getMaxY());
|
||||
Rectangle2D bbox = useDirAdjCoords ? zone.getBBoxDirAdj() : zone.getBBox();
|
||||
long minY = Math.round(bbox.getMinY());
|
||||
long maxY = Math.round(bbox.getMaxY());
|
||||
for (long i = minY; i <= maxY; i++) {
|
||||
histogram.put(i, histogram.getOrDefault(i, 0) + 1);
|
||||
}
|
||||
@ -43,24 +54,32 @@ public class ReadingOrderService {
|
||||
.stream()
|
||||
.mapToInt(Integer::intValue).average()
|
||||
.orElse(1) < MULTI_COLUMN_DETECTION_THRESHOLD) {
|
||||
return resolveSingleColumnReadingOrder(zones);
|
||||
return resolveSingleColumnReadingOrder(zones, useDirAdjCoords);
|
||||
} else {
|
||||
|
||||
return resolveMultiColumnReadingOder(zones);
|
||||
return resolveMultiColumnReadingOder(zones, useDirAdjCoords);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
private static List<Zone> resolveSingleColumnReadingOrder(List<Zone> zones) {
|
||||
private static List<Zone> resolveSingleColumnReadingOrder(List<Zone> zones, boolean useDirAdjCoords) {
|
||||
|
||||
zones.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
|
||||
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
|
||||
if (useDirAdjCoords) {
|
||||
return zones.stream()
|
||||
.collect(Collectors.groupingBy(TextBoundingBox::getDir)).values()
|
||||
.stream()
|
||||
.flatMap(words -> words.stream()
|
||||
.sorted(COMPARATOR_DIR_ADJ))
|
||||
.toList();
|
||||
}
|
||||
|
||||
zones.sort(COMPARATOR);
|
||||
return zones;
|
||||
}
|
||||
|
||||
|
||||
private List<Zone> resolveMultiColumnReadingOder(List<Zone> zones) {
|
||||
private List<Zone> resolveMultiColumnReadingOder(List<Zone> zones, boolean useDirAdjCoords) {
|
||||
|
||||
// Simple reading order resolver for multi column page layout as described here : https://pub.towardsai.net/advanced-rag-02-unveiling-pdf-parsing-b84ae866344e
|
||||
// TODO implement a more fancy reading order resolver see https://github.com/BobLd/DocumentLayoutAnalysis/blob/master/README.md#reading-order
|
||||
@ -69,11 +88,12 @@ public class ReadingOrderService {
|
||||
double maxX = Double.NEGATIVE_INFINITY;
|
||||
|
||||
for (Zone zone : zones) {
    // Read both extremes from the box selected by useDirAdjCoords. Previously the comparison
    // used this box but the stored value always came from the direction-adjusted getters,
    // so the non-adjusted path mixed two coordinate systems.
    Rectangle2D bbox = useDirAdjCoords ? zone.getBBoxDirAdj() : zone.getBBox();
    if (bbox.getX() < minX) {
        minX = bbox.getX();
    }
    if (bbox.getMaxX() > maxX) {
        maxX = bbox.getMaxX();
    }
}
|
||||
|
||||
@ -82,24 +102,27 @@ public class ReadingOrderService {
|
||||
List<Zone> leftOf = new ArrayList<>();
|
||||
List<Zone> rightOf = new ArrayList<>();
|
||||
List<Zone> middle = new ArrayList<>();
|
||||
|
||||
for (Zone zone : zones) {
|
||||
if (zone.getX() < midLineXCoordinate && zone.getX() + zone.getWidth() < midLineXCoordinate) {
|
||||
Rectangle2D bbox = useDirAdjCoords ? zone.getBBoxDirAdj() : zone.getBBox();
|
||||
if (bbox.getX() < midLineXCoordinate && bbox.getX() + bbox.getWidth() < midLineXCoordinate) {
|
||||
leftOf.add(zone);
|
||||
} else if (zone.getX() > midLineXCoordinate && zone.getX() + zone.getWidth() > midLineXCoordinate) {
|
||||
} else if (bbox.getX() > midLineXCoordinate && bbox.getX() + bbox.getWidth() > midLineXCoordinate) {
|
||||
rightOf.add(zone);
|
||||
} else {
|
||||
middle.add(zone);
|
||||
}
|
||||
}
|
||||
|
||||
leftOf.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
|
||||
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
|
||||
|
||||
rightOf.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
|
||||
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
|
||||
|
||||
middle.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
|
||||
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
|
||||
if (useDirAdjCoords) {
|
||||
leftOf.sort(COMPARATOR_DIR_ADJ);
|
||||
rightOf.sort(COMPARATOR_DIR_ADJ);
|
||||
middle.sort(COMPARATOR_DIR_ADJ);
|
||||
} else {
|
||||
leftOf.sort(COMPARATOR);
|
||||
rightOf.sort(COMPARATOR);
|
||||
middle.sort(COMPARATOR);
|
||||
}
|
||||
/*
|
||||
List<Zone> leftNotIntersecting = new ArrayList<>();
|
||||
for (Zone leftZone : leftOf) {
|
||||
@ -151,8 +174,9 @@ public class ReadingOrderService {
|
||||
|
||||
while (itty.hasNext()) {
|
||||
Zone current = itty.next();
|
||||
Rectangle2D bbox = useDirAdjCoords ? current.getBBoxDirAdj() : current.getBBox();
|
||||
for (int i = 0; i < sortedZones.size(); i++) {
|
||||
if (current.getY() < sortedZones.get(i).getY()) {
|
||||
if (bbox.getY() < sortedZones.get(i).getY()) {
|
||||
sortedZones.add(i, current);
|
||||
itty.remove();
|
||||
break;
|
||||
|
||||
@ -1,5 +1,7 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.docstrum.service;
|
||||
|
||||
import static com.knecon.fforesight.service.layoutparser.processor.model.SectionIdentifier.numericalIdentifierPattern;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashSet;
|
||||
@ -21,7 +23,7 @@ public class ZoneBuilderService {
|
||||
private static final double MIN_HORIZONTAL_DISTANCE_MULTIPLIER = -0.5;
|
||||
private static final double MAX_VERTICAL_DISTANCE_MULTIPLIER = 1.2;
|
||||
|
||||
private static final double MIN_HORIZONTAL_MERGE_DISTANCE_MULTIPLIER = -3.0;
|
||||
private static final double MIN_HORIZONTAL_MERGE_DISTANCE_MULTIPLIER = -7;
|
||||
|
||||
private static final double MAX_VERTICAL_MERGE_DISTANCE_MULTIPLIER = 0.5;
|
||||
|
||||
@ -38,7 +40,7 @@ public class ZoneBuilderService {
|
||||
|
||||
double minHorizontalDistance = characterSpacing * MIN_HORIZONTAL_DISTANCE_MULTIPLIER;
|
||||
double maxVerticalDistance = lineSpacing * MAX_VERTICAL_DISTANCE_MULTIPLIER;
|
||||
double minHorizontalMergeDistance = characterSpacing * MIN_HORIZONTAL_MERGE_DISTANCE_MULTIPLIER;
|
||||
double minHorizontalMergeDistance = lineSpacing * MIN_HORIZONTAL_MERGE_DISTANCE_MULTIPLIER;
|
||||
double maxVerticalMergeDistance = lineSpacing * MAX_VERTICAL_MERGE_DISTANCE_MULTIPLIER;
|
||||
|
||||
UnionFind<Line> unionFind = new UnionFind<>(new HashSet<>(lines));
|
||||
@ -54,11 +56,26 @@ public class ZoneBuilderService {
|
||||
return;
|
||||
}
|
||||
|
||||
double scale = Math.min(outerLine.getHeight(), innerLine.getHeight()) / meanHeight;
|
||||
scale = Math.max(MIN_LINE_SIZE_SCALE, Math.min(scale, MAX_LINE_SIZE_SCALE));
|
||||
// if (!innerLine.getFontStyle().equals(outerLine.getFontStyle()) //
|
||||
// && !outerLine.intersectsY(innerLine, -2f)) {
|
||||
// return;
|
||||
// }
|
||||
|
||||
double horizontalDistance = outerLine.horizontalDistance(innerLine) / scale;
|
||||
double verticalDistance = outerLine.verticalDistance(innerLine) / scale;
|
||||
double horizontalScale = Math.min(outerLine.getHeightDirAdj(), innerLine.getHeightDirAdj()) / meanHeight;
|
||||
horizontalScale = Math.max(MIN_LINE_SIZE_SCALE, Math.min(horizontalScale, MAX_LINE_SIZE_SCALE));
|
||||
double verticalScale = horizontalScale;
|
||||
|
||||
// if (innerLine.toString().endsWith(":")
|
||||
// || outerLine.toString().endsWith(":")
|
||||
// || numericalIdentifierPattern.matcher(innerLine.toString()).matches()
|
||||
// || numericalIdentifierPattern.matcher(outerLine.toString()).matches()) {
|
||||
//
|
||||
// horizontalScale *= 5;
|
||||
// verticalScale /= 10;
|
||||
// }
|
||||
|
||||
double horizontalDistance = outerLine.horizontalDistance(innerLine) / horizontalScale;
|
||||
double verticalDistance = outerLine.verticalDistance(innerLine) / verticalScale;
|
||||
|
||||
if ((!(minHorizontalDistance <= horizontalDistance) || !(verticalDistance <= maxVerticalDistance)) //
|
||||
&& (!(minHorizontalMergeDistance <= horizontalDistance) || !(verticalDistance <= maxVerticalMergeDistance))) {
|
||||
@ -87,7 +104,7 @@ public class ZoneBuilderService {
|
||||
double weights = 0.0;
|
||||
for (Line line : lines) {
|
||||
double weight = line.getLength();
|
||||
meanHeight += line.getHeight() * weight;
|
||||
meanHeight += line.getHeightDirAdj() * weight;
|
||||
weights += weight;
|
||||
}
|
||||
meanHeight /= weights;
|
||||
|
||||
@ -32,6 +32,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.He
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.FontStyle;
|
||||
|
||||
public class MarkdownMapper extends AbstractNodeVisitor {
|
||||
|
||||
@ -297,12 +298,6 @@ public class MarkdownMapper extends AbstractNodeVisitor {
|
||||
}
|
||||
|
||||
|
||||
enum FontStyle {
|
||||
REGULAR,
|
||||
BOLD,
|
||||
ITALIC,
|
||||
BOLD_ITALIC;
|
||||
}
|
||||
|
||||
record FontStyleChange(boolean enter, FontStyle style) {
|
||||
|
||||
|
||||
@ -18,6 +18,7 @@ import lombok.RequiredArgsConstructor;
|
||||
|
||||
@Data
|
||||
@RequiredArgsConstructor
|
||||
|
||||
public class ClassificationPage {
|
||||
|
||||
@NonNull
|
||||
@ -25,7 +26,7 @@ public class ClassificationPage {
|
||||
|
||||
private List<OutlineObject> outlineObjects = new ArrayList<>();
|
||||
|
||||
private List<AbstractPageBlock> headlines = new ArrayList<>();
|
||||
private List<AbstractPageBlock> headlines = new ArrayList<>();
|
||||
|
||||
private List<ClassifiedImage> images = new ArrayList<>();
|
||||
|
||||
@ -44,7 +45,7 @@ public class ClassificationPage {
|
||||
private float pageWidth;
|
||||
private float pageHeight;
|
||||
|
||||
CleanRulings cleanRulings;
|
||||
private CleanRulings cleanRulings;
|
||||
|
||||
private Map<String, List<Rectangle2D>> markedContentBboxPerType = new HashMap<>();
|
||||
|
||||
|
||||
@ -12,10 +12,10 @@ import lombok.Getter;
|
||||
@Getter
|
||||
public class FloatFrequencyCounter {
|
||||
|
||||
Map<Float, Integer> countPerValue = new HashMap<>();
|
||||
Map<Double, Integer> countPerValue = new HashMap<>();
|
||||
|
||||
|
||||
public void add(float value) {
|
||||
public void add(double value) {
|
||||
|
||||
if (!countPerValue.containsKey(value)) {
|
||||
countPerValue.put(value, 1);
|
||||
@ -25,9 +25,9 @@ public class FloatFrequencyCounter {
|
||||
}
|
||||
|
||||
|
||||
public void addAll(Map<Float, Integer> otherCounter) {
|
||||
public void addAll(Map<Double, Integer> otherCounter) {
|
||||
|
||||
for (Map.Entry<Float, Integer> entry : otherCounter.entrySet()) {
|
||||
for (Map.Entry<Double, Integer> entry : otherCounter.entrySet()) {
|
||||
if (countPerValue.containsKey(entry.getKey())) {
|
||||
countPerValue.put(entry.getKey(), countPerValue.get(entry.getKey()) + entry.getValue());
|
||||
} else {
|
||||
@ -37,10 +37,10 @@ public class FloatFrequencyCounter {
|
||||
}
|
||||
|
||||
|
||||
public Float getMostPopular() {
|
||||
public Double getMostPopular() {
|
||||
|
||||
Map.Entry<Float, Integer> mostPopular = null;
|
||||
for (Map.Entry<Float, Integer> entry : countPerValue.entrySet()) {
|
||||
Map.Entry<Double, Integer> mostPopular = null;
|
||||
for (Map.Entry<Double, Integer> entry : countPerValue.entrySet()) {
|
||||
if (mostPopular == null || entry.getValue() >= mostPopular.getValue()) {
|
||||
mostPopular = entry;
|
||||
}
|
||||
@ -49,11 +49,11 @@ public class FloatFrequencyCounter {
|
||||
}
|
||||
|
||||
|
||||
public List<Float> getHighterThanMostPopular() {
|
||||
public List<Double> getHigherThanMostPopular() {
|
||||
|
||||
Float mostPopular = getMostPopular();
|
||||
List<Float> higher = new ArrayList<>();
|
||||
for (Float value : countPerValue.keySet()) {
|
||||
Double mostPopular = getMostPopular();
|
||||
List<Double> higher = new ArrayList<>();
|
||||
for (Double value : countPerValue.keySet()) {
|
||||
if (value > mostPopular) {
|
||||
higher.add(value);
|
||||
}
|
||||
@ -63,10 +63,10 @@ public class FloatFrequencyCounter {
|
||||
}
|
||||
|
||||
|
||||
public Float getHighest() {
|
||||
public Double getHighest() {
|
||||
|
||||
Float highest = null;
|
||||
for (Float value : countPerValue.keySet()) {
|
||||
Double highest = null;
|
||||
for (Double value : countPerValue.keySet()) {
|
||||
if (highest == null || value > highest) {
|
||||
highest = value;
|
||||
}
|
||||
|
||||
@ -15,7 +15,7 @@ import lombok.experimental.FieldDefaults;
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class SectionIdentifier {
|
||||
|
||||
static Pattern numericalIdentifierPattern = Pattern.compile("^[\\s]?(\\d+)[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?");
|
||||
public static Pattern numericalIdentifierPattern = Pattern.compile("^[\\s]?(\\d+)[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?");
|
||||
|
||||
public enum Format {
|
||||
EMPTY,
|
||||
|
||||
@ -145,10 +145,7 @@ public class AtomicTextBlock implements TextBlock {
|
||||
}
|
||||
|
||||
|
||||
public static AtomicTextBlock fromAtomicTextBlockData(DocumentTextData documentTextData,
|
||||
DocumentPositionData documentPositionData,
|
||||
SemanticNode parent,
|
||||
Page page) {
|
||||
public static AtomicTextBlock fromAtomicTextBlockData(DocumentTextData documentTextData, DocumentPositionData documentPositionData, SemanticNode parent, Page page) {
|
||||
|
||||
return AtomicTextBlock.builder()
|
||||
.id(documentTextData.getId())
|
||||
@ -156,8 +153,10 @@ public class AtomicTextBlock implements TextBlock {
|
||||
.page(page)
|
||||
.textRange(new TextRange(documentTextData.getStart(), documentTextData.getEnd()))
|
||||
.searchText(documentTextData.getSearchText())
|
||||
.lineBreaks(Arrays.stream(documentTextData.getLineBreaks()).boxed().toList())
|
||||
.stringIdxToPositionIdx(Arrays.stream(documentPositionData.getStringIdxToPositionIdx()).boxed().toList())
|
||||
.lineBreaks(Arrays.stream(documentTextData.getLineBreaks()).boxed()
|
||||
.toList())
|
||||
.stringIdxToPositionIdx(Arrays.stream(documentPositionData.getStringIdxToPositionIdx()).boxed()
|
||||
.toList())
|
||||
.positions(toRectangle2DList(documentPositionData.getPositions()))
|
||||
.parent(parent)
|
||||
.build();
|
||||
@ -166,7 +165,9 @@ public class AtomicTextBlock implements TextBlock {
|
||||
|
||||
private static List<Rectangle2D> toRectangle2DList(float[][] positions) {
|
||||
|
||||
return Arrays.stream(positions).map(floatArr -> (Rectangle2D) new Rectangle2D.Float(floatArr[0], floatArr[1], floatArr[2], floatArr[3])).toList();
|
||||
return Arrays.stream(positions)
|
||||
.map(floatArr -> (Rectangle2D) new Rectangle2D.Float(floatArr[0], floatArr[1], floatArr[2], floatArr[3]))
|
||||
.toList();
|
||||
}
|
||||
|
||||
|
||||
@ -176,6 +177,9 @@ public class AtomicTextBlock implements TextBlock {
|
||||
throw new IndexOutOfBoundsException(format("line %d out of range for AtomicTextBlock with %d lines", lineNumber, numberOfLines()));
|
||||
}
|
||||
if (lineNumber == 0) {
|
||||
if (lineBreaks.isEmpty()) {
|
||||
return searchText;
|
||||
}
|
||||
return subSequence(textRange.start(), lineBreaks.get(0) + textRange.start());
|
||||
} else if (lineNumber == numberOfLines() - 1) {
|
||||
return subSequence(lineBreaks.get(lineBreaks.size() - 1) + textRange.start(), textRange.end());
|
||||
@ -195,9 +199,9 @@ public class AtomicTextBlock implements TextBlock {
|
||||
public int getNextLinebreak(int fromIndex) {
|
||||
|
||||
return lineBreaks.stream()//
|
||||
.filter(linebreak -> linebreak > fromIndex - textRange.start()) //
|
||||
.findFirst() //
|
||||
.orElse(searchText.length()) + textRange.start();
|
||||
.filter(linebreak -> linebreak > fromIndex - textRange.start()) //
|
||||
.findFirst() //
|
||||
.orElse(searchText.length()) + textRange.start();
|
||||
}
|
||||
|
||||
|
||||
@ -205,9 +209,9 @@ public class AtomicTextBlock implements TextBlock {
|
||||
public int getPreviousLinebreak(int fromIndex) {
|
||||
|
||||
return lineBreaks.stream()//
|
||||
.filter(linebreak -> linebreak <= fromIndex - textRange.start())//
|
||||
.reduce((a, b) -> b)//
|
||||
.orElse(0) + textRange.start();
|
||||
.filter(linebreak -> linebreak <= fromIndex - textRange.start())//
|
||||
.reduce((a, b) -> b)//
|
||||
.orElse(0) + textRange.start();
|
||||
}
|
||||
|
||||
|
||||
@ -255,7 +259,10 @@ public class AtomicTextBlock implements TextBlock {
|
||||
|
||||
protected List<Integer> getAllLineBreaksInBoundary(TextRange textRange) {
|
||||
|
||||
return getLineBreaks().stream().map(linebreak -> linebreak + this.textRange.start()).filter(textRange::contains).toList();
|
||||
return getLineBreaks().stream()
|
||||
.map(linebreak -> linebreak + this.textRange.start())
|
||||
.filter(textRange::contains)
|
||||
.toList();
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -10,8 +10,8 @@ import lombok.NonNull;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
|
||||
@Data
|
||||
@RequiredArgsConstructor
|
||||
@AllArgsConstructor
|
||||
@RequiredArgsConstructor
|
||||
public class ClassifiedImage {
|
||||
|
||||
@NonNull
|
||||
@ -20,11 +20,19 @@ public class ClassifiedImage {
|
||||
private ImageType imageType;
|
||||
private boolean sourceByAi;
|
||||
private boolean isAppendedToSection;
|
||||
@NonNull
|
||||
private boolean hasTransparency;
|
||||
@NonNull
|
||||
private int page;
|
||||
@NonNull
|
||||
private String representation;
|
||||
|
||||
|
||||
/**
 * Creates a classified image from its position, image type, transparency flag, page and representation.
 * <p>
 * Only the five fields passed in are assigned here; the remaining boolean fields of the class
 * ({@code sourceByAi}, {@code isAppendedToSection}) are not touched and keep their default value.
 * NOTE(review): the Lombok {@code @NonNull} parameter annotations are expected to generate null checks
 * for {@code position}, {@code imageType} and {@code representation} - confirm against the Lombok configuration.
 *
 * @param position        bounding rectangle of the image
 * @param imageType       classification of the image
 * @param hasTransparency whether the image has transparency
 * @param page            page the image belongs to
 * @param representation  string representation of the image
 */
public ClassifiedImage(@NonNull Rectangle2D position, @NonNull ImageType imageType, boolean hasTransparency, int page, @NonNull String representation) {
|
||||
|
||||
this.position = position;
|
||||
this.imageType = imageType;
|
||||
this.hasTransparency = hasTransparency;
|
||||
this.page = page;
|
||||
this.representation = representation;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -35,14 +35,14 @@ public class Cell extends BoundingBox {
|
||||
|
||||
public Cell(Point2D topLeft, Point2D bottomRight) {
|
||||
|
||||
this.bBoxInitialUserSpace = new Rectangle2D.Double(topLeft.getX(), topLeft.getY(), (bottomRight.getX() - topLeft.getX()), (bottomRight.getY() - topLeft.getY()));
|
||||
this.bBox = bBoxInitialUserSpace;
|
||||
this.bBoxPdf = new Rectangle2D.Double(topLeft.getX(), topLeft.getY(), (bottomRight.getX() - topLeft.getX()), (bottomRight.getY() - topLeft.getY()));
|
||||
this.bBox = bBoxPdf;
|
||||
}
|
||||
|
||||
|
||||
public Cell(Rectangle2D bBoxInitialUserSpace, AffineTransform initialUserSpaceToJava) {
|
||||
|
||||
this.bBoxInitialUserSpace = bBoxInitialUserSpace;
|
||||
this.bBoxPdf = bBoxInitialUserSpace;
|
||||
this.bBox = initialUserSpaceToJava.createTransformedShape(bBoxInitialUserSpace).getBounds2D();
|
||||
}
|
||||
|
||||
@ -50,7 +50,7 @@ public class Cell extends BoundingBox {
|
||||
public static Cell copy(Cell cell) {
|
||||
|
||||
Cell copy = new Cell();
|
||||
copy.bBoxInitialUserSpace = cell.bBoxInitialUserSpace;
|
||||
copy.bBoxPdf = cell.bBoxPdf;
|
||||
copy.bBox = cell.bBox;
|
||||
return copy;
|
||||
}
|
||||
|
||||
@ -70,7 +70,7 @@ public class CleanRulings {
|
||||
|
||||
public boolean lineBetween(BoundingBox a, BoundingBox b) {
|
||||
|
||||
return lineBetween(a.getBBoxInitialUserSpace(), b.getBBoxInitialUserSpace());
|
||||
return lineBetween(a.getBBoxPdf(), b.getBBoxPdf());
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -263,8 +263,8 @@ public class TablePageBlock extends AbstractPageBlock {
|
||||
|
||||
cells.stream()
|
||||
.map(originalCell -> new CellWithIntersection(originalCell,
|
||||
RectangleTransformations.calculateIntersectedArea(cellFromGridStructure.getBBoxInitialUserSpace(),
|
||||
originalCell.getBBoxInitialUserSpace())))
|
||||
RectangleTransformations.calculateIntersectedArea(cellFromGridStructure.getBBoxPdf(),
|
||||
originalCell.getBBoxPdf())))
|
||||
.filter(cellWithIntersection -> cellWithIntersection.intersectedArea > 0)
|
||||
.filter(cellWithIntersection -> cellWithIntersection.originalCell.getArea() > cellWithIntersection.intersectedArea * CELL_AREA_CONTAINED_THRESHOLD)
|
||||
.max(Comparator.comparing(CellWithIntersection::intersectedArea))
|
||||
|
||||
@ -0,0 +1,9 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.text;
|
||||
|
||||
|
||||
/**
 * Font style of a piece of text: regular, bold, italic, or bold and italic combined.
 */
public enum FontStyle {
|
||||
REGULAR,
|
||||
BOLD,
|
||||
ITALIC,
|
||||
BOLD_ITALIC;
|
||||
}
|
||||
@ -5,64 +5,50 @@ import java.awt.geom.Rectangle2D;
|
||||
|
||||
import org.apache.pdfbox.text.TextPosition;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.TextBoundingBox;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.FastAtan2;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.experimental.SuperBuilder;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@SuperBuilder
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
public class RedTextPosition extends BoundingBox {
|
||||
@EqualsAndHashCode(callSuper = true)
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class RedTextPosition extends TextBoundingBox {
|
||||
|
||||
public final static int HEIGHT_PADDING = 2;
|
||||
|
||||
private Rectangle2D.Float bBoxDirAdj; // adjusted to text rotation
|
||||
String unicode;
|
||||
|
||||
@JsonIgnore
|
||||
private int rotation;
|
||||
// estimated using the TextMatrix in radians
|
||||
float exactDir;
|
||||
|
||||
@JsonIgnore
|
||||
private float pageHeight;
|
||||
float widthOfSpace;
|
||||
|
||||
@JsonIgnore
|
||||
private float pageWidth;
|
||||
float fontSizeInPt;
|
||||
|
||||
private String unicode;
|
||||
|
||||
@JsonIgnore
|
||||
private float dir;
|
||||
|
||||
// not used in reanalysis
|
||||
@JsonIgnore
|
||||
private float widthOfSpace;
|
||||
|
||||
// not used in reanalysis
|
||||
@JsonIgnore
|
||||
private float fontSizeInPt;
|
||||
|
||||
// not used in reanalysis
|
||||
@JsonIgnore
|
||||
private String fontName;
|
||||
String fontName;
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public static RedTextPosition fromTextPosition(TextPosition textPosition) {
|
||||
|
||||
var pos = new RedTextPosition();
|
||||
pos.setRotation(textPosition.getRotation());
|
||||
pos.setPageHeight(textPosition.getPageHeight());
|
||||
pos.setPageWidth(textPosition.getPageWidth());
|
||||
pos.setUnicode(textPosition.getUnicode());
|
||||
pos.setDir(textPosition.getDir());
|
||||
pos.setWidthOfSpace(textPosition.getWidthOfSpace());
|
||||
pos.setFontSizeInPt(textPosition.getFontSizeInPt());
|
||||
pos.setFontName(textPosition.getFont().getName());
|
||||
pos.setExactDir((float) FastAtan2.fastAtan2(textPosition.getTextMatrix().getShearY(), textPosition.getTextMatrix().getScaleX()));
|
||||
pos.setDir(TextDirection.fromDegrees(textPosition.getDir()));
|
||||
|
||||
//TODO: There is a mismatch in the java coords of the text and the rulings,
|
||||
// I guess if we start with the initial user space positions and transform them the same way we do the rulings it would work.
|
||||
@ -73,18 +59,18 @@ public class RedTextPosition extends BoundingBox {
|
||||
textPosition.getYDirAdj() - textHeight,
|
||||
textPosition.getWidthDirAdj(),
|
||||
textHeight + HEIGHT_PADDING);
|
||||
|
||||
pos.setBBoxDirAdj(dirAdjPosition);
|
||||
|
||||
AffineTransform affineTransform = getRotationMatrix(TextDirection.fromDegrees(textPosition.getDir()), textPosition.getPageWidth(), textPosition.getPageHeight());
|
||||
Rectangle2D bBoxInitialUserSpace = affineTransform.createTransformedShape(dirAdjPosition).getBounds2D();
|
||||
|
||||
pos.setBBoxInitialUserSpace(bBoxInitialUserSpace); // These are definitely correct
|
||||
pos.setBBoxPdf(bBoxInitialUserSpace); // These are definitely correct
|
||||
|
||||
return pos;
|
||||
}
|
||||
|
||||
|
||||
|
||||
private static AffineTransform getRotationMatrix(TextDirection textDirection, float pageWidth, float pageHeight) {
|
||||
|
||||
AffineTransform transform = new AffineTransform();
|
||||
@ -103,32 +89,4 @@ public class RedTextPosition extends BoundingBox {
|
||||
return transform;
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public float getXDirAdj() {
|
||||
|
||||
return this.bBoxDirAdj.x;
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public float getYDirAdj() {
|
||||
|
||||
return this.bBoxDirAdj.y;
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public float getWidthDirAdj() {
|
||||
|
||||
return this.bBoxDirAdj.width;
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public float getHeightDir() {
|
||||
|
||||
return this.bBoxDirAdj.height;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -44,4 +44,15 @@ public enum TextDirection {
|
||||
|
||||
throw new IllegalArgumentException(String.format("A value of %f is not supported by TextDirection", degrees));
|
||||
}
|
||||
|
||||
|
||||
/**
 * Maps this text direction to an integer index in the range 0 to 3:
 * {@code ZERO} is 0, {@code QUARTER_CIRCLE} is 1, {@code HALF_CIRCLE} is 2 and {@code THREE_QUARTER_CIRCLE} is 3.
 * NOTE(review): the result is an index, not an angle in degrees - presumably the number of quarter turns; confirm against callers.
 *
 * @return the index of this direction, 0 through 3
 */
public int getRotation() {
|
||||
|
||||
return switch (this) {
|
||||
case ZERO -> 0;
|
||||
case QUARTER_CIRCLE -> 1;
|
||||
case HALF_CIRCLE -> 2;
|
||||
case THREE_QUARTER_CIRCLE -> 3;
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
@ -8,6 +8,7 @@ import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
@ -26,17 +27,19 @@ public class TextPageBlock extends AbstractPageBlock {
|
||||
@Builder.Default
|
||||
private List<TextPositionSequence> sequences = new ArrayList<>();
|
||||
|
||||
private Rectangle2D bBoxDirAdj;
|
||||
|
||||
private String mostPopularWordFont;
|
||||
|
||||
private String mostPopularWordStyle;
|
||||
|
||||
private float mostPopularWordFontSize;
|
||||
private double mostPopularWordFontSize;
|
||||
|
||||
private float mostPopularWordHeight;
|
||||
private double mostPopularWordHeight;
|
||||
|
||||
private float mostPopularWordSpaceWidth;
|
||||
private double mostPopularWordSpaceWidth;
|
||||
|
||||
private float highestFontSize;
|
||||
private double highestFontSize;
|
||||
|
||||
private PageBlockType classification;
|
||||
|
||||
@ -51,34 +54,24 @@ public class TextPageBlock extends AbstractPageBlock {
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public TextDirection getDir() {
|
||||
|
||||
return sequences.get(0).getDir();
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public float getPageHeight() {
|
||||
|
||||
return sequences.get(0).getPageHeight();
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public float getPageWidth() {
|
||||
|
||||
return sequences.get(0).getPageWidth();
|
||||
}
|
||||
|
||||
|
||||
private void calculateBBox() {
|
||||
|
||||
if (sequences == null) {
|
||||
this.bBox = new Rectangle2D.Double();
|
||||
this.bBoxInitialUserSpace = new Rectangle2D.Double();
|
||||
this.bBoxPdf = new Rectangle2D.Double();
|
||||
this.bBoxDirAdj = new Rectangle2D.Double();
|
||||
return;
|
||||
}
|
||||
this.bBoxDirAdj = sequences.stream()
|
||||
.map(TextPositionSequence::getBBoxDirAdj)
|
||||
.collect(RectangleTransformations.collectBBox());
|
||||
|
||||
setToBBoxOfComponents(sequences);
|
||||
}
|
||||
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.text;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import static com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition.HEIGHT_PADDING;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
@ -8,8 +9,7 @@ import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.pdfbox.text.TextPosition;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.TextBoundingBox;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
@ -23,23 +23,21 @@ import lombok.extern.slf4j.Slf4j;
|
||||
@Builder
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
@EqualsAndHashCode(onlyExplicitlyIncluded = true, callSuper = false)
|
||||
public class TextPositionSequence extends BoundingBox implements CharSequence {
|
||||
@EqualsAndHashCode(onlyExplicitlyIncluded = true, callSuper = true) // needs the bbox to be unique
|
||||
public class TextPositionSequence extends TextBoundingBox implements CharSequence {
|
||||
|
||||
public static final int HEIGHT_PADDING = 2;
|
||||
public static final String STANDARD = "standard";
|
||||
public static final String BOLD_ITALIC = "bold, italic";
|
||||
public static final String BOLD = "bold";
|
||||
public static final String ITALIC = "italic";
|
||||
|
||||
@EqualsAndHashCode.Include
|
||||
private int page;
|
||||
|
||||
@EqualsAndHashCode.Include
|
||||
@Builder.Default
|
||||
private List<RedTextPosition> textPositions = new ArrayList<>();
|
||||
|
||||
private Rectangle2D bBoxDirAdj;
|
||||
@EqualsAndHashCode.Include
|
||||
private TextDirection dir;
|
||||
private int rotation;
|
||||
private float pageHeight;
|
||||
private float pageWidth;
|
||||
private boolean isParagraphStart;
|
||||
private boolean strikethrough;
|
||||
private boolean underline;
|
||||
@ -51,10 +49,6 @@ public class TextPositionSequence extends BoundingBox implements CharSequence {
|
||||
.map(RedTextPosition::fromTextPosition)
|
||||
.collect(Collectors.toList());
|
||||
this.page = pageNumber;
|
||||
this.dir = TextDirection.fromDegrees(textPositions.get(0).getDir());
|
||||
this.rotation = textPositions.get(0).getRotation();
|
||||
this.pageHeight = textPositions.get(0).getPageHeight();
|
||||
this.pageWidth = textPositions.get(0).getPageWidth();
|
||||
this.isParagraphStart = isParagraphStart;
|
||||
calculateBBox();
|
||||
}
|
||||
@ -62,9 +56,6 @@ public class TextPositionSequence extends BoundingBox implements CharSequence {
|
||||
|
||||
private void calculateBBox() {
|
||||
|
||||
this.bBoxDirAdj = textPositions.stream()
|
||||
.map(RedTextPosition::getBBoxDirAdj)
|
||||
.collect(RectangleTransformations.collectBBox());
|
||||
setToBBoxOfComponents(getTextPositions());
|
||||
}
|
||||
|
||||
@ -73,10 +64,6 @@ public class TextPositionSequence extends BoundingBox implements CharSequence {
|
||||
|
||||
this.textPositions = textPositions;
|
||||
this.page = page;
|
||||
this.dir = TextDirection.fromDegrees(textPositions.get(0).getDir());
|
||||
this.rotation = textPositions.get(0).getRotation();
|
||||
this.pageHeight = textPositions.get(0).getPageHeight();
|
||||
this.pageWidth = textPositions.get(0).getPageWidth();
|
||||
calculateBBox();
|
||||
}
|
||||
|
||||
@ -112,9 +99,6 @@ public class TextPositionSequence extends BoundingBox implements CharSequence {
|
||||
textPositionSequence.textPositions = textPositions.subList(start, end);
|
||||
textPositionSequence.page = page;
|
||||
textPositionSequence.dir = dir;
|
||||
textPositionSequence.rotation = rotation;
|
||||
textPositionSequence.pageHeight = pageHeight;
|
||||
textPositionSequence.pageWidth = pageWidth;
|
||||
textPositionSequence.setToBBoxOfComponents(getTextPositions());
|
||||
return textPositionSequence;
|
||||
}
|
||||
@ -141,10 +125,6 @@ public class TextPositionSequence extends BoundingBox implements CharSequence {
|
||||
|
||||
this.textPositions.add(textPosition);
|
||||
this.page = textPositionSequence.getPage();
|
||||
this.dir = textPositionSequence.getDir();
|
||||
this.rotation = textPositionSequence.getRotation();
|
||||
this.pageHeight = textPositionSequence.getPageHeight();
|
||||
this.pageWidth = textPositionSequence.getPageWidth();
|
||||
calculateBBox();
|
||||
}
|
||||
|
||||
@ -152,79 +132,18 @@ public class TextPositionSequence extends BoundingBox implements CharSequence {
|
||||
public void add(TextPosition textPosition) {
|
||||
|
||||
this.textPositions.add(RedTextPosition.fromTextPosition(textPosition));
|
||||
this.dir = TextDirection.fromDegrees(textPositions.get(0).getDir());
|
||||
this.rotation = textPositions.get(0).getRotation();
|
||||
this.pageHeight = textPositions.get(0).getPageHeight();
|
||||
this.pageWidth = textPositions.get(0).getPageWidth();
|
||||
calculateBBox();
|
||||
}
|
||||
|
||||
public double getTextHeightNoPadding() {
|
||||
|
||||
/**
|
||||
* This value is adjusted so that 0,0 is upper left and it is adjusted based on the text direction.
|
||||
* This method ignores the page rotation but takes the text rotation and adjusts the coordinates to awt.
|
||||
*
|
||||
* @return the text direction adjusted minX value
|
||||
*/
|
||||
|
||||
public float getMinXDirAdj() {
|
||||
|
||||
return textPositions.get(0).getXDirAdj();
|
||||
|
||||
return textPositions.get(0).getHeightDirAdj();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* This value is adjusted so that 0,0 is upper left and it is adjusted based on the text direction.
|
||||
* This method ignores the page rotation but takes the text rotation and adjusts the coordinates to awt.
|
||||
*
|
||||
* @return the text direction adjusted maxX value
|
||||
*/
|
||||
public double getTextHeight() {
|
||||
|
||||
public float getMaxXDirAdj() {
|
||||
|
||||
return textPositions.get(textPositions.size() - 1).getXDirAdj() + textPositions.get(textPositions.size() - 1).getWidthDirAdj() + HEIGHT_PADDING;
|
||||
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* This value is adjusted so that 0,0 is upper left and it is adjusted based on the text direction.
|
||||
* This method ignores the page rotation but takes the text rotation and adjusts the coordinates to awt.
|
||||
*
|
||||
* @return the text direction adjusted minY value. The upper border of the bounding box of the word.
|
||||
*/
|
||||
|
||||
public float getMinYDirAdj() {
|
||||
|
||||
return textPositions.get(0).getYDirAdj() - getTextHeight();
|
||||
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* This value is adjusted so that 0,0 is upper left and it is adjusted based on the text direction.
|
||||
* This method ignores the page rotation but takes the text rotation and adjusts the coordinates to awt.
|
||||
*
|
||||
* @return the text direction adjusted maxY value. The lower border of the bounding box of the word.
|
||||
*/
|
||||
|
||||
public float getMaxYDirAdj() {
|
||||
|
||||
return textPositions.get(0).getYDirAdj();
|
||||
|
||||
}
|
||||
|
||||
|
||||
public float getTextHeightNoPadding() {
|
||||
|
||||
return textPositions.get(0).getHeightDir();
|
||||
}
|
||||
|
||||
|
||||
public float getTextHeight() {
|
||||
|
||||
return textPositions.get(0).getHeightDir() + HEIGHT_PADDING;
|
||||
return textPositions.get(0).getHeightDirAdj() + HEIGHT_PADDING;
|
||||
}
|
||||
|
||||
|
||||
@ -240,18 +159,18 @@ public class TextPositionSequence extends BoundingBox implements CharSequence {
|
||||
public String getFontStyle() {
|
||||
|
||||
if (textPositions.get(0).getFontName() == null) {
|
||||
return "standard";
|
||||
return STANDARD;
|
||||
}
|
||||
String lowercaseFontName = textPositions.get(0).getFontName().toLowerCase(Locale.ROOT);
|
||||
|
||||
if (lowercaseFontName.contains("bold") && lowercaseFontName.contains("italic")) {
|
||||
return "bold, italic";
|
||||
} else if (lowercaseFontName.contains("bold")) {
|
||||
return "bold";
|
||||
} else if (lowercaseFontName.contains("italic")) {
|
||||
return "italic";
|
||||
if (lowercaseFontName.contains(BOLD) && lowercaseFontName.contains(ITALIC)) {
|
||||
return BOLD_ITALIC;
|
||||
} else if (lowercaseFontName.contains(BOLD)) {
|
||||
return BOLD;
|
||||
} else if (lowercaseFontName.contains(ITALIC)) {
|
||||
return ITALIC;
|
||||
} else {
|
||||
return "standard";
|
||||
return STANDARD;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -56,7 +56,7 @@ public class ImageServiceResponseAdapter {
|
||||
classificationPage.getImages().forEach(image -> {
|
||||
if (image.getImageType().equals(ImageType.OTHER)) {
|
||||
for (AbstractPageBlock textblock : classificationPage.getTextBlocks()) {
|
||||
if (image.getPosition().contains(textblock.getBBoxInitialUserSpace())) {
|
||||
if (image.getPosition().contains(textblock.getBBoxPdf())) {
|
||||
image.setImageType(ImageType.OCR);
|
||||
return;
|
||||
}
|
||||
|
||||
@ -14,6 +14,7 @@ import lombok.NoArgsConstructor;
|
||||
@AllArgsConstructor
|
||||
public class Classification {
|
||||
|
||||
@Builder.Default
|
||||
private Map<String, Float> probabilities = new HashMap<>();
|
||||
private String label;
|
||||
|
||||
|
||||
@ -22,8 +22,10 @@ public class ImageServiceResponse {
|
||||
|
||||
@JsonProperty(value = "imageMetadata")
|
||||
@JsonAlias("data")
|
||||
@Builder.Default
|
||||
private List<ImageMetadata> data = new ArrayList<>();
|
||||
|
||||
@Builder.Default
|
||||
private List<ImageMetadata> dataCV = new ArrayList<>();
|
||||
|
||||
|
||||
|
||||
@ -15,6 +15,7 @@ import lombok.NoArgsConstructor;
|
||||
public class TableData {
|
||||
|
||||
private PageInfo pageInfo;
|
||||
@Builder.Default
|
||||
private List<TableCells> tableCells = new ArrayList<>();
|
||||
|
||||
}
|
||||
|
||||
@ -19,7 +19,7 @@ public class TableServiceResponse {
|
||||
private String operation;
|
||||
private String targetFileExtension;
|
||||
private String responseFileExtension;
|
||||
|
||||
@Builder.Default
|
||||
private List<TableData> data = new ArrayList<>();
|
||||
|
||||
}
|
||||
|
||||
@ -6,7 +6,6 @@ import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.experimental.UtilityClass;
|
||||
@ -38,7 +37,7 @@ public class GapDetectionService {
|
||||
for (TextPositionSequence currentTextPosition : sortedTextPositionSequences.subList(1, sortedTextPositionSequences.size())) {
|
||||
|
||||
double yDifference = Math.abs(currentTextPosition.getMaxYDirAdj() - previousTextPosition.getMaxYDirAdj());
|
||||
double xGap = Math.abs(previousTextPosition.getMaxXDirAdj() - currentTextPosition.getMinXDirAdj());
|
||||
double xGap = Math.abs(previousTextPosition.getMaxXDirAdj() - currentTextPosition.getXDirAdj());
|
||||
Rectangle2D previousTextPositionBBox = toRectangle2D(previousTextPosition);
|
||||
Rectangle2D currentTextPositionBBox = toRectangle2D(currentTextPosition);
|
||||
|
||||
|
||||
@ -71,7 +71,7 @@ public class LineDetectionService {
|
||||
|
||||
private static boolean isXGap(TextPositionSequence currentTextPosition, TextPositionSequence previousTextPosition, double avgTextPositionHeight) {
|
||||
|
||||
return Math.abs(previousTextPosition.getMaxXDirAdj() - currentTextPosition.getMinXDirAdj()) > (avgTextPositionHeight * X_GAP_FACTOR);
|
||||
return Math.abs(previousTextPosition.getMaxXDirAdj() - currentTextPosition.getXDirAdj()) > (avgTextPositionHeight * X_GAP_FACTOR);
|
||||
}
|
||||
|
||||
|
||||
@ -83,7 +83,7 @@ public class LineDetectionService {
|
||||
|
||||
private static boolean isNewLine(TextPositionSequence currentTextPosition, TextPositionSequence previousTextPosition, double avgTextPositionHeight) {
|
||||
|
||||
return Math.abs(previousTextPosition.getMinYDirAdj() - currentTextPosition.getMinYDirAdj()) > avgTextPositionHeight;
|
||||
return Math.abs(previousTextPosition.getYDirAdj() - currentTextPosition.getYDirAdj()) > avgTextPositionHeight;
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -78,7 +78,7 @@ public class TableExtractionService {
|
||||
|
||||
List<Cell> containedCells = new ArrayList<>();
|
||||
for (Cell c : cells) {
|
||||
if (c.hasMinimumSize() && area.contains(c.getBBoxInitialUserSpace())) {
|
||||
if (c.hasMinimumSize() && area.contains(c.getBBoxPdf())) {
|
||||
containedCells.add(c);
|
||||
}
|
||||
}
|
||||
|
||||
@ -31,13 +31,13 @@ public class TextRulingsClassifier {
|
||||
|
||||
private static void handleVerticalText(CleanRulings cleanRulings, TextPositionSequence word) {
|
||||
|
||||
float lowerY = (float) (word.getBBoxInitialUserSpace().getMinY() + TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth());
|
||||
float upperY = (float) (word.getBBoxInitialUserSpace().getMaxY() - TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth());
|
||||
float lowerY = (float) (word.getBBoxPdf().getMinY() + TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth());
|
||||
float upperY = (float) (word.getBBoxPdf().getMaxY() - TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth());
|
||||
|
||||
float strikethroughCenterX = (float) word.getBBoxInitialUserSpace().getCenterX();
|
||||
float strikethroughCenterX = (float) word.getBBoxPdf().getCenterX();
|
||||
float strikethroughBoxHeight = (float) ((word.getBBoxDirAdj().getHeight() * STRIKETHROUGH_ZONE) / 2);
|
||||
|
||||
float underlineCenterX = (float) (word.getDir().equals(TextDirection.QUARTER_CIRCLE) ? word.getBBoxInitialUserSpace().getMaxX() : word.getBBoxInitialUserSpace().getMinX());
|
||||
float underlineCenterX = (float) (word.getDir().equals(TextDirection.QUARTER_CIRCLE) ? word.getBBoxPdf().getMaxX() : word.getBBoxPdf().getMinX());
|
||||
float underlineBoxHeight = (float) ((word.getBBoxDirAdj().getHeight() * UNDERLINE_ZONE) / 2);
|
||||
|
||||
float leftX = Math.min(underlineCenterX - underlineBoxHeight, strikethroughCenterX - strikethroughBoxHeight);
|
||||
@ -65,13 +65,13 @@ public class TextRulingsClassifier {
|
||||
|
||||
private static void handleHorizontalText(CleanRulings cleanRulings, TextPositionSequence word) {
|
||||
|
||||
float leftX = (float) (word.getBBoxInitialUserSpace().getMinX() + TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth());
|
||||
float rightX = (float) (word.getBBoxInitialUserSpace().getMaxX() - TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth());
|
||||
float leftX = (float) (word.getBBoxPdf().getMinX() + TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth());
|
||||
float rightX = (float) (word.getBBoxPdf().getMaxX() - TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth());
|
||||
|
||||
float strikethroughCenterY = (float) word.getBBoxInitialUserSpace().getCenterY();
|
||||
float strikethroughCenterY = (float) word.getBBoxPdf().getCenterY();
|
||||
float strikethroughBoxHeight = (float) ((word.getBBoxDirAdj().getHeight() * STRIKETHROUGH_ZONE) / 2);
|
||||
|
||||
float underlineCenterY = (float) (word.getDir().equals(TextDirection.ZERO) ? word.getBBoxInitialUserSpace().getMinY() : word.getBBoxInitialUserSpace().getMaxY());
|
||||
float underlineCenterY = (float) (word.getDir().equals(TextDirection.ZERO) ? word.getBBoxPdf().getMinY() : word.getBBoxPdf().getMaxY());
|
||||
float underlineBoxHeight = (float) ((word.getBBoxDirAdj().getHeight() * UNDERLINE_ZONE) / 2);
|
||||
|
||||
float lowerY = Math.min(underlineCenterY - underlineBoxHeight, strikethroughCenterY - strikethroughBoxHeight);
|
||||
|
||||
@ -2,12 +2,10 @@ package com.knecon.fforesight.service.layoutparser.processor.services.blockifica
|
||||
|
||||
import static com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocstrumBlockificationService.buildTextBlock;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.ListIterator;
|
||||
import java.util.Locale;
|
||||
import java.util.function.Function;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.springframework.stereotype.Service;
|
||||
@ -19,8 +17,6 @@ import com.knecon.fforesight.service.layoutparser.processor.model.SectionIdentif
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.factory.SearchTextWithTextPositionFactory;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@ -29,14 +25,6 @@ public class BlockificationPostprocessingService {
|
||||
|
||||
private static final float BLOCK_TO_OUTLINE_DISTANCE_THRESHOLD = 5.0f;
|
||||
|
||||
private static final Function<TextPageBlock, Rectangle2D> blockToBoundingBox = (abstractPageBlock) -> abstractPageBlock.getSequences()
|
||||
.stream()
|
||||
.map(textPositionSequence -> textPositionSequence.getTextPositions()
|
||||
.stream()
|
||||
.map(tp -> SearchTextWithTextPositionFactory.mapRedTextPositionToInitialUserSpace(tp, textPositionSequence))
|
||||
.collect(RectangleTransformations.collectBBox()))
|
||||
.collect(RectangleTransformations.collectBBox());
|
||||
|
||||
|
||||
public OutlineObject sanitizeOutlineBlocks(ClassificationPage classificationPage, OutlineObject notFoundOutlineObject) {
|
||||
|
||||
@ -63,13 +51,13 @@ public class BlockificationPostprocessingService {
|
||||
}
|
||||
|
||||
if (!contextsOverlap(notFoundOutlineObjectProcessionContext, firstOutlineObjectProcessionContext)) {
|
||||
notFoundOutlineObject.setFound(selectMatch(classificationPage, notFoundOutlineObjectProcessionContext));
|
||||
notFoundOutlineObject.setFound(selectMatch(classificationPage, notFoundOutlineObjectProcessionContext, pageHeight));
|
||||
}
|
||||
if (firstOutlineObject != null) {
|
||||
// re-create the context for the updated blocks
|
||||
firstOutlineObjectProcessionContext = new OutlineProcessionContext(firstOutlineObject);
|
||||
processTextBlocks(getTextPageBlocks(classificationPage), pageHeight, firstOutlineObjectProcessionContext);
|
||||
firstOutlineObject.setFound(selectMatch(classificationPage, firstOutlineObjectProcessionContext));
|
||||
firstOutlineObject.setFound(selectMatch(classificationPage, firstOutlineObjectProcessionContext, pageHeight));
|
||||
}
|
||||
|
||||
}
|
||||
@ -77,7 +65,7 @@ public class BlockificationPostprocessingService {
|
||||
outlineObjectListIterator.forEachRemaining(outlineObject -> {
|
||||
OutlineProcessionContext outlineObjectProcessionContext = new OutlineProcessionContext(outlineObject);
|
||||
processTextBlocks(getTextPageBlocks(classificationPage), pageHeight, outlineObjectProcessionContext);
|
||||
outlineObject.setFound(selectMatch(classificationPage, outlineObjectProcessionContext));
|
||||
outlineObject.setFound(selectMatch(classificationPage, outlineObjectProcessionContext, pageHeight));
|
||||
});
|
||||
|
||||
if (!outlineObjects.isEmpty()) {
|
||||
@ -160,7 +148,7 @@ public class BlockificationPostprocessingService {
|
||||
}
|
||||
|
||||
|
||||
private boolean selectMatch(ClassificationPage classificationPage, OutlineProcessionContext context) {
|
||||
private boolean selectMatch(ClassificationPage classificationPage, OutlineProcessionContext context, float pageHeight) {
|
||||
|
||||
OutlineObject outlineObject = context.outlineObject;
|
||||
TextPageBlock directMatch = context.directMatch;
|
||||
@ -168,8 +156,8 @@ public class BlockificationPostprocessingService {
|
||||
TextPageBlock splitCandidate = context.splitCandidate;
|
||||
PageBlockType headlineType = PageBlockType.getHeadlineType(outlineObject.getTreeDepth());
|
||||
|
||||
double distanceToDirectMatch = directMatch != null ? calculateDistance(outlineObject, directMatch) : Double.MAX_VALUE;
|
||||
double distanceToSplitCandidate = splitCandidate != null ? calculateDistance(outlineObject, splitCandidate) : Double.MAX_VALUE;
|
||||
double distanceToDirectMatch = directMatch != null ? calculateDistance(outlineObject, directMatch, pageHeight) : Double.MAX_VALUE;
|
||||
double distanceToSplitCandidate = splitCandidate != null ? calculateDistance(outlineObject, splitCandidate, pageHeight) : Double.MAX_VALUE;
|
||||
|
||||
double distanceToBestMergeCandidates = Double.MAX_VALUE;
|
||||
List<TextPageBlock> bestMergeCandidateCombination = new ArrayList<>();
|
||||
@ -189,7 +177,7 @@ public class BlockificationPostprocessingService {
|
||||
|
||||
for (List<TextPageBlock> combination : combinations) {
|
||||
double averageDistance = combination.stream()
|
||||
.map(block -> calculateDistance(outlineObject, block))
|
||||
.map(block -> calculateDistance(outlineObject, block, pageHeight))
|
||||
.mapToDouble(Double::doubleValue).average()
|
||||
.orElse(Double.MAX_VALUE);
|
||||
if (distanceToBestMergeCandidates > averageDistance) {
|
||||
@ -273,7 +261,7 @@ public class BlockificationPostprocessingService {
|
||||
List<TextPositionSequence> postSequence = new ArrayList<>();
|
||||
StringBuilder currentSequence = new StringBuilder();
|
||||
|
||||
if (target.isBlank()){
|
||||
if (target.isBlank()) {
|
||||
return new WordSequenceResult();
|
||||
}
|
||||
|
||||
@ -418,10 +406,10 @@ public class BlockificationPostprocessingService {
|
||||
}
|
||||
|
||||
|
||||
private double calculateDistance(OutlineObject outlineObject, TextPageBlock pageBlock) {
|
||||
private double calculateDistance(OutlineObject outlineObject, TextPageBlock pageBlock, float pageHeight) {
|
||||
|
||||
double deltaX = outlineObject.getPoint().getX() - pageBlock.getMinX();
|
||||
double deltaY = pageBlock.getPageHeight() - outlineObject.getPoint().getY() - pageBlock.getMinY();
|
||||
double deltaY = pageHeight - outlineObject.getPoint().getY() - pageBlock.getMinY();
|
||||
return Math.sqrt(deltaX * deltaX + deltaY * deltaY);
|
||||
}
|
||||
|
||||
|
||||
@ -1,7 +1,6 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.blockification;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
import java.util.ListIterator;
|
||||
|
||||
@ -10,7 +9,6 @@ import org.springframework.stereotype.Service;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.DocstrumSegmentationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.DoubleUtils;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
@ -40,7 +38,7 @@ public class DocstrumBlockificationService {
|
||||
|
||||
CleanRulings usedRulings = rulings.withoutTextRulings();
|
||||
|
||||
var zones = docstrumSegmentationService.segmentPage(textPositions, xyOrder, usedRulings, visualizations);
|
||||
List<Zone> zones = docstrumSegmentationService.segmentPage(textPositions, xyOrder, usedRulings, visualizations);
|
||||
|
||||
if (!textPositions.isEmpty()) {
|
||||
visualizations.addZoneVisualizations(zones, textPositions.get(0).getPage());
|
||||
@ -48,11 +46,7 @@ public class DocstrumBlockificationService {
|
||||
visualizations.addCharactersWithNeighbours(zones, textPositions.get(0).getPage());
|
||||
}
|
||||
|
||||
var pageBlocks = toAbstractPageBlocks(zones, xyOrder, usedRulings);
|
||||
|
||||
if (xyOrder) {
|
||||
sortPageBlocksXThenY(pageBlocks);
|
||||
}
|
||||
var pageBlocks = toAbstractPageBlocks(zones);
|
||||
|
||||
var classificationPage = new ClassificationPage(pageBlocks);
|
||||
classificationPage.setCleanRulings(rulings);
|
||||
@ -73,21 +67,7 @@ public class DocstrumBlockificationService {
|
||||
}
|
||||
|
||||
|
||||
private static void sortPageBlocksXThenY(List<AbstractPageBlock> pageBlocks) {
|
||||
|
||||
pageBlocks.sort(Comparator.comparing(AbstractPageBlock::getMinY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
|
||||
.thenComparing(AbstractPageBlock::getMinX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
|
||||
pageBlocks.sort(new Comparator<AbstractPageBlock>() {
|
||||
@Override
|
||||
public int compare(AbstractPageBlock o1, AbstractPageBlock o2) {
|
||||
|
||||
return Math.abs(o1.getMinY() - o2.getMinY()) < 5 && o1.getMinX() < o2.getMinX() == true ? -1 : 0;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
private List<AbstractPageBlock> toAbstractPageBlocks(List<Zone> zones, boolean xyOrder, CleanRulings usedRulings) {
|
||||
private List<AbstractPageBlock> toAbstractPageBlocks(List<Zone> zones) {
|
||||
|
||||
List<AbstractPageBlock> abstractPageBlocks = new ArrayList<>();
|
||||
zones.forEach(zone -> {
|
||||
@ -296,6 +276,10 @@ public class DocstrumBlockificationService {
|
||||
continue;
|
||||
}
|
||||
|
||||
// if (!current.getMostPopularWordStyle().equals(inner.getMostPopularWordStyle())) {
|
||||
// continue;
|
||||
// }
|
||||
|
||||
if (current.getDir() == inner.getDir() && current.intersects(inner, yThreshold, xThreshold)) {
|
||||
|
||||
boolean toDuplicate = current.isToDuplicate() || inner.isToDuplicate();
|
||||
|
||||
@ -1,9 +1,6 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.blockification;
|
||||
|
||||
import static java.util.stream.Collectors.toSet;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
@ -13,10 +10,8 @@ import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.Orientation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
|
||||
@ -44,31 +39,30 @@ public class DocuMineBlockificationService {
|
||||
|
||||
CleanRulings usedRulings = cleanRulings.withoutTextRulings();
|
||||
|
||||
float minX = 1000;
|
||||
float maxX = 0;
|
||||
float minY = 1000;
|
||||
float maxY = 0;
|
||||
double minX = 1000;
|
||||
double maxX = 0;
|
||||
double minY = 1000;
|
||||
double maxY = 0;
|
||||
TextPositionSequence prev = null;
|
||||
|
||||
boolean wasSplitted = false;
|
||||
Float splitX1 = null;
|
||||
Double splitX1 = null;
|
||||
for (TextPositionSequence word : textPositions) {
|
||||
|
||||
boolean lineSeparation = prev != null && word.getMinYDirAdj() - maxY > Math.min(word.getHeight(), prev.getHeight()) * 1.25;
|
||||
boolean startFromTop = prev != null && word.getMinYDirAdj() < prev.getMinYDirAdj() - prev.getTextHeight();
|
||||
boolean splitByX = prev != null && maxX + 50 < word.getMinXDirAdj() && prev.getMinYDirAdj() == word.getMinYDirAdj();
|
||||
boolean negativeXGap = prev != null && word.getMinXDirAdj() - minX < -5;
|
||||
boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj();
|
||||
boolean lineSeparation = prev != null && word.getYDirAdj() - prev.getMaxYDirAdj() > Math.min(word.getHeight(), prev.getHeight()) * 1.25;
|
||||
boolean startFromTop = prev != null && word.getYDirAdj() < prev.getYDirAdj() - prev.getTextHeight();
|
||||
boolean splitByX = prev != null && maxX + 50 < word.getXDirAdj() && prev.getYDirAdj() == word.getYDirAdj();
|
||||
boolean negativeXGap = prev != null && word.getXDirAdj() - minX < -5;
|
||||
boolean newLineAfterSplit = prev != null && word.getYDirAdj() != prev.getYDirAdj() && wasSplitted && splitX1 != word.getXDirAdj();
|
||||
boolean isSplitByRuling = prev != null && usedRulings.lineBetween(prev, word);
|
||||
boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir());
|
||||
boolean splitByOtherFontAndOtherY = prev != null && Math.abs(prev.getMaxYDirAdj() - word.getMaxYDirAdj()) > word.getTextHeight() * 0.2 && (word.getFontStyle()
|
||||
.contains("bold")
|
||||
&& !prev.getFontStyle()
|
||||
.contains("bold") || prev.getFontStyle().contains("bold") && !word.getFontStyle().contains("bold"));
|
||||
boolean splitByOtherFontAndOtherY = prev != null && Math.abs(prev.getMaxYDirAdj() - word.getMaxYDirAdj()) > word.getTextHeight() * 0.2 //
|
||||
&& (word.getFontStyle().contains("bold") && !prev.getFontStyle().contains("bold") //
|
||||
|| prev.getFontStyle().contains("bold") && !word.getFontStyle().contains("bold"));
|
||||
|
||||
Matcher matcher = pattern.matcher(chunkWords.stream()
|
||||
.collect(Collectors.joining(" ")).toString());
|
||||
boolean startsOnSameX = Math.abs(minX - word.getMinXDirAdj()) < 5 && matcher.matches();
|
||||
boolean startsOnSameX = Math.abs(minX - word.getXDirAdj()) < 5 && matcher.matches();
|
||||
|
||||
if (prev != null && (lineSeparation || startFromTop || splitByDir || isSplitByRuling || splitByOtherFontAndOtherY || negativeXGap || startsOnSameX)) {
|
||||
|
||||
@ -84,7 +78,7 @@ public class DocuMineBlockificationService {
|
||||
if (splitByX && !isSplitByRuling) {
|
||||
wasSplitted = true;
|
||||
cb1.setOrientation(Orientation.LEFT);
|
||||
splitX1 = word.getMinXDirAdj();
|
||||
splitX1 = word.getXDirAdj();
|
||||
} else if (newLineAfterSplit && !isSplitByRuling) {
|
||||
wasSplitted = false;
|
||||
cb1.setOrientation(Orientation.RIGHT);
|
||||
@ -107,14 +101,14 @@ public class DocuMineBlockificationService {
|
||||
chunkWords.add(word);
|
||||
|
||||
prev = word;
|
||||
if (word.getMinXDirAdj() < minX) {
|
||||
minX = word.getMinXDirAdj();
|
||||
if (word.getXDirAdj() < minX) {
|
||||
minX = word.getXDirAdj();
|
||||
}
|
||||
if (word.getMaxXDirAdj() > maxX) {
|
||||
maxX = word.getMaxXDirAdj();
|
||||
}
|
||||
if (word.getMinYDirAdj() < minY) {
|
||||
minY = word.getMinYDirAdj();
|
||||
if (word.getYDirAdj() < minY) {
|
||||
minY = word.getYDirAdj();
|
||||
}
|
||||
if (word.getMaxYDirAdj() > maxY) {
|
||||
maxY = word.getMaxYDirAdj();
|
||||
@ -126,7 +120,5 @@ public class DocuMineBlockificationService {
|
||||
return new ClassificationPage(textPageBlocks);
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
@ -38,18 +38,18 @@ public class RedactManagerBlockificationService {
|
||||
List<TextPositionSequence> chunkWords = new ArrayList<>();
|
||||
List<AbstractPageBlock> chunkBlockList = new ArrayList<>();
|
||||
|
||||
float minX = 1000, maxX = 0, minY = 1000, maxY = 0;
|
||||
double minX = 1000, maxX = 0, minY = 1000, maxY = 0;
|
||||
TextPositionSequence prev = null;
|
||||
|
||||
boolean wasSplitted = false;
|
||||
Float splitX1 = null;
|
||||
Double splitX1 = null;
|
||||
for (TextPositionSequence word : textPositions) {
|
||||
|
||||
boolean lineSeparation = word.getMinYDirAdj() - maxY > word.getHeight() * 1.25;
|
||||
boolean startFromTop = prev != null && word.getMinYDirAdj() < prev.getMinYDirAdj() - prev.getTextHeight();
|
||||
boolean splitByX = prev != null && maxX + 50 < word.getMinXDirAdj() && prev.getMinYDirAdj() == word.getMinYDirAdj();
|
||||
boolean xIsBeforeFirstX = prev != null && word.getMinXDirAdj() < minX;
|
||||
boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj();
|
||||
boolean lineSeparation = word.getYDirAdj() - maxY > word.getHeight() * 1.25;
|
||||
boolean startFromTop = prev != null && word.getYDirAdj() < prev.getYDirAdj() - prev.getTextHeight();
|
||||
boolean splitByX = prev != null && maxX + 50 < word.getXDirAdj() && prev.getYDirAdj() == word.getYDirAdj();
|
||||
boolean xIsBeforeFirstX = prev != null && word.getXDirAdj() < minX;
|
||||
boolean newLineAfterSplit = prev != null && word.getYDirAdj() != prev.getYDirAdj() && wasSplitted && splitX1 != word.getXDirAdj();
|
||||
boolean isSplitByRuling = prev != null && usedRulings.lineBetween(prev, word);
|
||||
boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir());
|
||||
|
||||
@ -69,7 +69,7 @@ public class RedactManagerBlockificationService {
|
||||
if (splitByX && !isSplitByRuling) {
|
||||
wasSplitted = true;
|
||||
cb1.setOrientation(Orientation.LEFT);
|
||||
splitX1 = word.getMinXDirAdj();
|
||||
splitX1 = word.getXDirAdj();
|
||||
} else if (newLineAfterSplit && !isSplitByRuling) {
|
||||
wasSplitted = false;
|
||||
cb1.setOrientation(Orientation.RIGHT);
|
||||
@ -92,14 +92,14 @@ public class RedactManagerBlockificationService {
|
||||
chunkWords.add(word);
|
||||
|
||||
prev = word;
|
||||
if (word.getMinXDirAdj() < minX) {
|
||||
minX = word.getMinXDirAdj();
|
||||
if (word.getXDirAdj() < minX) {
|
||||
minX = word.getXDirAdj();
|
||||
}
|
||||
if (word.getMaxXDirAdj() > maxX) {
|
||||
maxX = word.getMaxXDirAdj();
|
||||
}
|
||||
if (word.getMinYDirAdj() < minY) {
|
||||
minY = word.getMinYDirAdj();
|
||||
if (word.getYDirAdj() < minY) {
|
||||
minY = word.getYDirAdj();
|
||||
}
|
||||
if (word.getMaxYDirAdj() > maxY) {
|
||||
maxY = word.getMaxYDirAdj();
|
||||
|
||||
@ -23,7 +23,7 @@ public class ClarifyndClassificationService {
|
||||
|
||||
public void classifyDocument(ClassificationDocument document) {
|
||||
|
||||
List<Float> headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular();
|
||||
List<Double> headlineFontSizes = document.getFontSizeCounter().getHigherThanMostPopular();
|
||||
|
||||
log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());
|
||||
|
||||
@ -35,7 +35,7 @@ public class ClarifyndClassificationService {
|
||||
}
|
||||
|
||||
|
||||
private void classifyPage(HeadlineClassificationService headlineClassificationService, ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes) {
|
||||
private void classifyPage(HeadlineClassificationService headlineClassificationService, ClassificationPage page, ClassificationDocument document, List<Double> headlineFontSizes) {
|
||||
|
||||
for (AbstractPageBlock textBlock : page.getTextBlocks()) {
|
||||
if (textBlock instanceof TextPageBlock) {
|
||||
@ -45,7 +45,7 @@ public class ClarifyndClassificationService {
|
||||
}
|
||||
|
||||
|
||||
private void classifyBlock(HeadlineClassificationService headlineClassificationService, TextPageBlock textBlock, ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes) {
|
||||
private void classifyBlock(HeadlineClassificationService headlineClassificationService, TextPageBlock textBlock, ClassificationPage page, ClassificationDocument document, List<Double> headlineFontSizes) {
|
||||
|
||||
var bodyTextFrame = page.getBodyTextFrame();
|
||||
|
||||
|
||||
@ -31,7 +31,7 @@ public class DocuMineClassificationService {
|
||||
|
||||
public void classifyDocument(ClassificationDocument document) {
|
||||
|
||||
List<Float> headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular();
|
||||
List<Double> headlineFontSizes = document.getFontSizeCounter().getHigherThanMostPopular();
|
||||
|
||||
log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());
|
||||
|
||||
@ -46,7 +46,7 @@ public class DocuMineClassificationService {
|
||||
private void classifyPage(HeadlineClassificationService headlineClassificationService,
|
||||
ClassificationPage page,
|
||||
ClassificationDocument document,
|
||||
List<Float> headlineFontSizes) {
|
||||
List<Double> headlineFontSizes) {
|
||||
|
||||
for (AbstractPageBlock textBlock : page.getTextBlocks()) {
|
||||
if (textBlock instanceof TextPageBlock) {
|
||||
@ -60,7 +60,7 @@ public class DocuMineClassificationService {
|
||||
TextPageBlock textBlock,
|
||||
ClassificationPage page,
|
||||
ClassificationDocument document,
|
||||
List<Float> headlineFontSizes) {
|
||||
List<Double> headlineFontSizes) {
|
||||
|
||||
log.debug("headlineFontSizes: {}", headlineFontSizes);
|
||||
var bodyTextFrame = page.getBodyTextFrame();
|
||||
|
||||
@ -25,7 +25,7 @@ public class RedactManagerClassificationService {
|
||||
|
||||
public void classifyDocument(ClassificationDocument document) {
|
||||
|
||||
List<Float> headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular();
|
||||
List<Double> headlineFontSizes = document.getFontSizeCounter().getHigherThanMostPopular();
|
||||
|
||||
log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());
|
||||
|
||||
@ -37,7 +37,7 @@ public class RedactManagerClassificationService {
|
||||
}
|
||||
|
||||
|
||||
private void classifyPage(HeadlineClassificationService headlineClassificationService, ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes) {
|
||||
private void classifyPage(HeadlineClassificationService headlineClassificationService, ClassificationPage page, ClassificationDocument document, List<Double> headlineFontSizes) {
|
||||
|
||||
for (AbstractPageBlock textBlock : page.getTextBlocks()) {
|
||||
if (textBlock instanceof TextPageBlock) {
|
||||
@ -47,7 +47,7 @@ public class RedactManagerClassificationService {
|
||||
}
|
||||
|
||||
|
||||
private void classifyBlock(HeadlineClassificationService headlineClassificationService, TextPageBlock textBlock, ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes) {
|
||||
private void classifyBlock(HeadlineClassificationService headlineClassificationService, TextPageBlock textBlock, ClassificationPage page, ClassificationDocument document, List<Double> headlineFontSizes) {
|
||||
|
||||
var bodyTextFrame = page.getBodyTextFrame();
|
||||
|
||||
@ -56,7 +56,7 @@ public class RedactManagerClassificationService {
|
||||
return;
|
||||
}
|
||||
if (document.getFontSizeCounter().getMostPopular() == null) {
|
||||
textBlock.setClassification(PageBlockType.OTHER);
|
||||
textBlock.setClassification(PageBlockType.PARAGRAPH);
|
||||
return;
|
||||
}
|
||||
|
||||
@ -129,7 +129,7 @@ public class RedactManagerClassificationService {
|
||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)) {
|
||||
textBlock.setClassification(PageBlockType.PARAGRAPH_UNKNOWN);
|
||||
} else {
|
||||
textBlock.setClassification(PageBlockType.OTHER);
|
||||
textBlock.setClassification(PageBlockType.PARAGRAPH);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -111,10 +111,10 @@ public class DocumentGraphFactory {
|
||||
textBlocks.add(originalTextBlock);
|
||||
textBlocks.addAll(textBlocksToMerge);
|
||||
|
||||
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(textBlocks), node, context, page);
|
||||
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock2(TextPositionOperations.mergeAndSort(textBlocks), node, context, page);
|
||||
|
||||
if (node instanceof DuplicatedParagraph duplicatedParagraph) {
|
||||
AtomicTextBlock unsortedTextBlock = context.textBlockFactory.buildAtomicTextBlock(textBlocks.stream()
|
||||
AtomicTextBlock unsortedTextBlock = context.textBlockFactory.buildAtomicTextBlock2(textBlocks.stream()
|
||||
.flatMap(tb -> tb.getSequences()
|
||||
.stream())
|
||||
.collect(Collectors.toList()), node, context, page);
|
||||
@ -191,7 +191,7 @@ public class DocumentGraphFactory {
|
||||
|
||||
Page page = context.getPage(textBlocks.get(0).getPage());
|
||||
Footer footer = Footer.builder().documentTree(context.getDocumentTree()).build();
|
||||
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeTextPositionSequence(textBlocks), footer, context, page);
|
||||
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock2(TextPositionOperations.merge(textBlocks), footer, context, page);
|
||||
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(footer);
|
||||
footer.setTreeId(tocId);
|
||||
footer.setLeafTextBlock(textBlock);
|
||||
@ -203,7 +203,7 @@ public class DocumentGraphFactory {
|
||||
|
||||
Page page = context.getPage(textBlocks.get(0).getPage());
|
||||
Header header = Header.builder().documentTree(context.getDocumentTree()).build();
|
||||
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeTextPositionSequence(textBlocks), header, 0, page);
|
||||
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.merge(textBlocks), header, 0, page);
|
||||
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(header);
|
||||
header.setTreeId(tocId);
|
||||
header.setLeafTextBlock(textBlock);
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.factory;
|
||||
|
||||
import java.awt.geom.AffineTransform;
|
||||
import static com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition.HEIGHT_PADDING;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
@ -11,7 +12,6 @@ import java.util.Objects;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
@ -19,14 +19,13 @@ import lombok.experimental.UtilityClass;
|
||||
@UtilityClass
|
||||
public class SearchTextWithTextPositionFactory {
|
||||
|
||||
public final int HEIGHT_PADDING = 2;
|
||||
// When checking for a hyphen linebreak, we need to check after a linebreak whether the last hyphen was less than three symbols away.
|
||||
// We detect a linebreak either as a "\n" character or when the positions of two adjacent symbols differ in y-coordinate by at least one character height.
|
||||
// If there is a hyphen linebreak, the hyphen will be 1 position in front of a "\n" or 2 positions in front of the character which has a lower y-coordinate
|
||||
// This is why we need to initialize this to a value < -2; otherwise, if the very first symbol is a \n, we would detect a hyphen linebreak that isn't there.
|
||||
// Also, Integer.MIN_VALUE is a bad idea due to potential overflow during arithmetic operations. This is why the default should be -3.
|
||||
public final int MAX_HYPHEN_LINEBREAK_DISTANCE = 3;
|
||||
public static final double LINEBREAK_DELTA_TOLERANCE = 1.05;
|
||||
public static final double LINEBREAK_DELTA_TOLERANCE = 1.5;
|
||||
|
||||
|
||||
public SearchTextWithTextPositionDto buildSearchTextToTextPositionDto(List<TextPositionSequence> sequences) {
|
||||
@ -38,15 +37,13 @@ public class SearchTextWithTextPositionFactory {
|
||||
|
||||
Context context = new Context();
|
||||
|
||||
RedTextPosition currentTextPosition = sequences.get(0).getTextPositions()
|
||||
.get(0);
|
||||
RedTextPosition currentTextPosition = sequences.get(0).getTextPositions().get(0);
|
||||
RedTextPosition previousTextPosition = RedTextPosition.builder().unicode(" ").bBoxDirAdj(currentTextPosition.getBBoxDirAdj()).build();
|
||||
|
||||
for (TextPositionSequence word : sequences) {
|
||||
for (int i = 0; i < word.getTextPositions().size(); ++i) {
|
||||
|
||||
currentTextPosition = word.getTextPositions()
|
||||
.get(i);
|
||||
currentTextPosition = word.getTextPositions().get(i);
|
||||
if (isLineBreak(currentTextPosition, previousTextPosition)) {
|
||||
removeHyphenLinebreaks(context);
|
||||
context.lineBreaksStringIdx.add(context.stringIdx);
|
||||
@ -68,11 +65,10 @@ public class SearchTextWithTextPositionFactory {
|
||||
++context.stringIdx;
|
||||
}
|
||||
|
||||
|
||||
List<Rectangle2D> positions = sequences.stream()
|
||||
.map(TextPositionSequence::getTextPositions)
|
||||
.flatMap(Collection::stream)
|
||||
.map(RedTextPosition::getBBoxInitialUserSpace)
|
||||
.map(RedTextPosition::getBBoxPdf)
|
||||
.toList();
|
||||
|
||||
assert context.stringBuilder.length() == context.stringIdxToPositionIdx.size();
|
||||
@ -161,8 +157,8 @@ public class SearchTextWithTextPositionFactory {
|
||||
return false;
|
||||
}
|
||||
|
||||
double deltaY = Math.abs(currentPosition.getYDirAdj() - previousPosition.getYDirAdj()) * LINEBREAK_DELTA_TOLERANCE;
|
||||
return deltaY >= currentPosition.getHeightDir() || deltaY >= previousPosition.getHeightDir();
|
||||
double deltaY = (Math.abs(currentPosition.getYDirAdj() - previousPosition.getYDirAdj()) * LINEBREAK_DELTA_TOLERANCE) + (2 * HEIGHT_PADDING);
|
||||
return deltaY >= currentPosition.getHeightDirAdj() || deltaY >= previousPosition.getHeightDirAdj();
|
||||
}
|
||||
|
||||
|
||||
@ -188,32 +184,6 @@ public class SearchTextWithTextPositionFactory {
|
||||
}
|
||||
|
||||
|
||||
public Rectangle2D mapRedTextPositionToInitialUserSpace(RedTextPosition textPosition, TextPositionSequence sequence) {
|
||||
|
||||
float textHeight = sequence.getTextHeight() + HEIGHT_PADDING;
|
||||
Rectangle2D rectangle2D = new Rectangle2D.Double(textPosition.getXDirAdj(),
|
||||
textPosition.getYDirAdj() - textHeight,
|
||||
textPosition.getWidthDirAdj(),
|
||||
textHeight + HEIGHT_PADDING);
|
||||
|
||||
AffineTransform transform = new AffineTransform();
|
||||
|
||||
if (sequence.getDir() == TextDirection.ZERO || sequence.getDir() == TextDirection.HALF_CIRCLE) {
|
||||
transform.rotate(sequence.getDir().getRadians(), sequence.getPageWidth() / 2f, sequence.getPageHeight() / 2f);
|
||||
transform.translate(0f, sequence.getPageHeight());
|
||||
} else if (sequence.getDir() == TextDirection.QUARTER_CIRCLE) {
|
||||
transform.rotate(sequence.getDir().getRadians(), sequence.getPageWidth() / 2f, sequence.getPageWidth() / 2f);
|
||||
transform.translate(0f, sequence.getPageWidth());
|
||||
} else {
|
||||
transform.rotate(sequence.getDir().getRadians(), sequence.getPageHeight() / 2f, sequence.getPageHeight() / 2f);
|
||||
transform.translate(0f, sequence.getPageWidth());
|
||||
}
|
||||
transform.scale(1., -1.);
|
||||
|
||||
return transform.createTransformedShape(rectangle2D).getBounds2D();
|
||||
}
|
||||
|
||||
|
||||
private class Context {
|
||||
|
||||
List<Integer> stringIdxToPositionIdx = new LinkedList<>();
|
||||
|
||||
@ -234,7 +234,7 @@ public class SectionNodeFactory {
|
||||
.filter(abstractTextContainer -> !abstractTextContainer.equals(atc))
|
||||
.filter(abstractTextContainer -> abstractTextContainer.getPage() == atc.getPage())
|
||||
.filter(abstractTextContainer -> abstractTextContainer instanceof TextPageBlock)
|
||||
.filter(abstractTextContainer -> abstractTextContainer.intersectsY(atc))
|
||||
.filter(abstractTextContainer -> abstractTextContainer.intersectsYPdf(atc))
|
||||
.map(abstractTextContainer -> (TextPageBlock) abstractTextContainer)
|
||||
.filter(abstractTextContainer -> abstractTextContainer.getDir() == atc.getDir())
|
||||
.filter(abstractTextContainer -> !abstractTextContainer.isToDuplicate())
|
||||
|
||||
@ -136,7 +136,7 @@ public class TableNodeFactory {
|
||||
.row(rowIndex)
|
||||
.col(colIndex)
|
||||
.header(cell.isHeaderCell())
|
||||
.bBox(cell.getBBoxInitialUserSpace())
|
||||
.bBox(cell.getBBoxPdf())
|
||||
.build();
|
||||
page.getMainBody().add(tableCell);
|
||||
|
||||
@ -148,7 +148,7 @@ public class TableNodeFactory {
|
||||
tableCell.setLeafTextBlock(context.getTextBlockFactory().emptyTextBlock(tableNode, context, page));
|
||||
} else if (cell.getTextBlocks().size() == 1) {
|
||||
textBlock = context.getTextBlockFactory()
|
||||
.buildAtomicTextBlock(cell.getTextBlocks()
|
||||
.buildAtomicTextBlock2(cell.getTextBlocks()
|
||||
.get(0).getSequences(), tableCell, context, page);
|
||||
tableCell.setLeafTextBlock(textBlock);
|
||||
} else if (firstTextBlockIsHeadline(cell)) {
|
||||
@ -163,8 +163,8 @@ public class TableNodeFactory {
|
||||
context,
|
||||
document);
|
||||
} else if (cellAreaIsSmallerThanPageAreaTimesThreshold(cell, page)) {
|
||||
List<TextPositionSequence> sequences = TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(cell.getTextBlocks());
|
||||
textBlock = context.getTextBlockFactory().buildAtomicTextBlock(sequences, tableCell, context, page);
|
||||
List<TextPositionSequence> sequences = TextPositionOperations.mergeAndSort(cell.getTextBlocks());
|
||||
textBlock = context.getTextBlockFactory().buildAtomicTextBlock2(sequences, tableCell, context, page);
|
||||
tableCell.setLeafTextBlock(textBlock);
|
||||
} else {
|
||||
cell.getTextBlocks()
|
||||
|
||||
@ -17,7 +17,7 @@ public class TextBlockFactory {
|
||||
long textBlockIdx;
|
||||
|
||||
|
||||
public AtomicTextBlock buildAtomicTextBlock(List<TextPositionSequence> sequences, SemanticNode parent, DocumentGraphFactory.Context context, Page page) {
|
||||
public AtomicTextBlock buildAtomicTextBlock2(List<TextPositionSequence> sequences, SemanticNode parent, DocumentGraphFactory.Context context, Page page) {
|
||||
|
||||
Integer numberOnPage = context.getAndIncrementTextBlockNumberOnPage(page);
|
||||
return buildAtomicTextBlock(sequences, parent, numberOnPage, page);
|
||||
@ -32,27 +32,27 @@ public class TextBlockFactory {
|
||||
long idx = textBlockIdx;
|
||||
textBlockIdx++;
|
||||
String orientation;
|
||||
int textDirection;
|
||||
int textRotation;
|
||||
if (sequences.isEmpty()) {
|
||||
orientation = null;
|
||||
textDirection = 0;
|
||||
textRotation = 0;
|
||||
} else {
|
||||
orientation = sequences.get(0).getDir().toString();
|
||||
textDirection = sequences.get(0).getRotation();
|
||||
textRotation = sequences.get(0).getDir().getRotation();
|
||||
}
|
||||
return AtomicTextBlock.fromSearchTextWithTextPosition(searchTextWithTextPositionDto.getSearchText(),
|
||||
searchTextWithTextPositionDto.getLineBreaks(),
|
||||
searchTextWithTextPositionDto.getBoldTextBoundaries(),
|
||||
searchTextWithTextPositionDto.getItalicTextBoundaries(),
|
||||
searchTextWithTextPositionDto.getPositions(),
|
||||
searchTextWithTextPositionDto.getStringIdxToPositionIdx(),
|
||||
idx,
|
||||
parent,
|
||||
numberOnPage,
|
||||
page,
|
||||
offset,
|
||||
orientation,
|
||||
textDirection);
|
||||
searchTextWithTextPositionDto.getLineBreaks(),
|
||||
searchTextWithTextPositionDto.getBoldTextBoundaries(),
|
||||
searchTextWithTextPositionDto.getItalicTextBoundaries(),
|
||||
searchTextWithTextPositionDto.getPositions(),
|
||||
searchTextWithTextPositionDto.getStringIdxToPositionIdx(),
|
||||
idx,
|
||||
parent,
|
||||
numberOnPage,
|
||||
page,
|
||||
offset,
|
||||
orientation,
|
||||
textRotation);
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -8,12 +8,11 @@ import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.factory.SearchTextWithTextPositionFactory;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
@ -67,10 +66,7 @@ public class GraphicExtractorService {
|
||||
private List<Box> getCharacterBBoxes(List<TextPositionSequence> textPositionSequences) {
|
||||
|
||||
return textPositionSequences.stream()
|
||||
.map(pos -> pos.getTextPositions()
|
||||
.stream()
|
||||
.map(tp -> SearchTextWithTextPositionFactory.mapRedTextPositionToInitialUserSpace(tp, pos))
|
||||
.collect(RectangleTransformations.collectBBox()))
|
||||
.map(BoundingBox::getBBoxPdf)
|
||||
.map(Box::new)
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
@ -96,7 +96,7 @@ public class HeaderFooterDetection {
|
||||
continue;
|
||||
}
|
||||
|
||||
int distance = calculateHammingDistanceWithPreprocessing(testString, paddedString);
|
||||
int distance = StringDistances.hamming(testString, paddedString);
|
||||
double normalizedScore = 1 - (double) distance / Math.max(testString.length(), paddedString.length());
|
||||
score += normalizedScore * (j < weights.length ? weights[j] : 1);
|
||||
}
|
||||
@ -180,44 +180,4 @@ public class HeaderFooterDetection {
|
||||
return headerCandidates;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Calculate the Hamming distance between two strings after preprocessing to make them the same length
|
||||
* and replacing all digits with a special character '@' since they are a common occurrence in headers/footers.
|
||||
*
|
||||
* @param firstCandidate First string
|
||||
* @param secondCandidate Second string
|
||||
* @return The Hamming distance between the two preprocessed strings.
|
||||
*/
|
||||
private int calculateHammingDistanceWithPreprocessing(String firstCandidate, String secondCandidate) {
|
||||
|
||||
int maxLength = Math.max(firstCandidate.length(), secondCandidate.length());
|
||||
|
||||
String cleanFirstCandidate = padString(firstCandidate, maxLength, '\0').replaceAll("\\d", "@");
|
||||
String cleanSecondCandidate = padString(secondCandidate, maxLength, '\0').replaceAll("\\d", "@");
|
||||
|
||||
int distance = 0;
|
||||
for (int i = 0; i < maxLength; i++) {
|
||||
if (cleanFirstCandidate.charAt(i) != cleanSecondCandidate.charAt(i)) {
|
||||
distance++;
|
||||
}
|
||||
}
|
||||
return distance;
|
||||
}
|
||||
|
||||
|
||||
private String padString(String input, int length, char padChar) {
|
||||
|
||||
if (input.length() >= length) {
|
||||
return input;
|
||||
}
|
||||
|
||||
StringBuilder sb = new StringBuilder(input);
|
||||
|
||||
while (sb.length() < length) {
|
||||
sb.append(padChar);
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -48,7 +48,7 @@ public class MarkedContentUtils {
|
||||
|
||||
return markedContentByYPosition.values()
|
||||
.stream()
|
||||
.map(textPositions -> new TextPositionSequence(textPositions, 0, true).getBBoxInitialUserSpace())
|
||||
.map(textPositions -> new TextPositionSequence(textPositions, 0, true).getBBoxPdf())
|
||||
.map(t -> new Rectangle2D.Double(t.getX(), t.getY() - Math.abs(t.getHeight()), t.getWidth(), Math.abs(t.getHeight())))
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
@ -90,7 +90,7 @@ public class MarkedContentUtils {
|
||||
.map(content -> (TextPosition) content)
|
||||
.filter(content -> !content.getUnicode().equals(" "))
|
||||
.map(textPositions -> new TextPositionSequence(List.of(textPositions), 0, true))
|
||||
.map(BoundingBox::getBBoxInitialUserSpace)
|
||||
.map(BoundingBox::getBBoxPdf)
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
|
||||
@ -108,7 +108,7 @@ public final class PositionUtils {
|
||||
}
|
||||
|
||||
|
||||
public float getHeightDifferenceBetweenChunkWordAndDocumentWord(TextPageBlock textBlock, Float documentMostPopularWordHeight) {
|
||||
public double getHeightDifferenceBetweenChunkWordAndDocumentWord(TextPageBlock textBlock, Double documentMostPopularWordHeight) {
|
||||
|
||||
return textBlock.getMostPopularWordHeight() - documentMostPopularWordHeight;
|
||||
}
|
||||
@ -116,7 +116,7 @@ public final class PositionUtils {
|
||||
|
||||
public double getApproxLineCount(TextPageBlock textBlock) {
|
||||
|
||||
return textBlock.getHeight() / textBlock.getMostPopularWordHeight();
|
||||
return textBlock.getBBoxDirAdj().getHeight() / textBlock.getMostPopularWordHeight();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -28,7 +28,7 @@ public class SpreadsheetFinder {
|
||||
Map<Point2D, Point2D> edgesV = new HashMap<>();
|
||||
|
||||
for (Cell cell : cells) {
|
||||
for (Point2D pt : getPoints(cell.getBBoxInitialUserSpace())) {
|
||||
for (Point2D pt : getPoints(cell.getBBoxPdf())) {
|
||||
if (pointSet.contains(pt)) { // shared vertex, remove it
|
||||
pointSet.remove(pt);
|
||||
} else {
|
||||
|
||||
@ -0,0 +1,49 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.utils;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
/**
 * String distance helpers used when comparing header/footer candidates.
 * <p>
 * All members are static and the class cannot be instantiated.
 */
public final class StringDistances {

    private StringDistances() {

        throw new UnsupportedOperationException("This is a utility class and cannot be instantiated");
    }


    /**
     * Calculate the Hamming distance between two strings after preprocessing to make them the same length
     * and replacing all digits with a special character '@' since they are a common occurrence in headers/footers.
     * <p>
     * The shorter string is treated as if it were padded with {@code '\0'} up to the length of the longer one.
     * Only the ASCII digits {@code 0-9} are masked. The comparison is done in a single pass without
     * building intermediate strings or compiling a regular expression.
     *
     * @param s1 First string
     * @param s2 Second string
     * @return The Hamming distance between the two preprocessed strings.
     * @throws NullPointerException if either argument is {@code null}
     */
    public static int hamming(String s1, String s2) {

        int commonLength = Math.min(s1.length(), s2.length());
        int distance = 0;

        // Positions present in both strings: compare with digits masked.
        for (int i = 0; i < commonLength; i++) {
            if (maskDigit(s1.charAt(i)) != maskDigit(s2.charAt(i))) {
                distance++;
            }
        }

        // Overhanging tail of the longer string: it is compared against the '\0' padding,
        // so every character except a literal '\0' counts as a mismatch.
        String longer = s1.length() >= s2.length() ? s1 : s2;
        for (int i = commonLength; i < longer.length(); i++) {
            if (longer.charAt(i) != '\0') {
                distance++;
            }
        }
        return distance;
    }


    /**
     * Maps the ASCII digits {@code 0-9} to {@code '@'} and returns every other character unchanged.
     */
    private static char maskDigit(char c) {

        return c >= '0' && c <= '9' ? '@' : c;
    }

}
|
||||
@ -1,30 +1,136 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.utils;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.awt.geom.RectangularShape;
|
||||
import java.util.Collection;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.AngleFilter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.TextBoundingBox;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.UnionFind;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.DoubleUtils;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.FastAtan2;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class TextPositionOperations {
|
||||
|
||||
private static final TextPositionSequenceComparator comparator = new TextPositionSequenceComparator();
|
||||
public static final double ANGLE_TOLERANCE = Math.PI / 35;
|
||||
public static final AngleFilter ANGLE_FILTER = new AngleFilter(-ANGLE_TOLERANCE, ANGLE_TOLERANCE);
|
||||
public static final double MAX_LINE_HEIGHT_FACTOR = 0.66; // multiplied with max word height
|
||||
public static final double MAX_WORD_DISTANCE_FACTOR = 3.5; // multiplied with max word width
|
||||
|
||||
private static final double THRESHOLD = 5;
|
||||
private static final Comparator<TextBoundingBox> COMPARATOR_DIR_ADJ = //
|
||||
Comparator.comparing(TextBoundingBox::getDir)
|
||||
.thenComparing(TextBoundingBox::getYDirAdj, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
|
||||
.thenComparing(TextBoundingBox::getXDirAdj, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD));
|
||||
|
||||
|
||||
public static List<TextPositionSequence> mergeAndSortTextPositionSequenceByYThenX(List<TextPageBlock> textBlocks) {
|
||||
public List<TextPositionSequence> mergeAndSort(List<TextPageBlock> textBlocks) {
|
||||
|
||||
var sequence = textBlocks.stream().flatMap(tb -> tb.getSequences().stream()).collect(Collectors.toList());
|
||||
|
||||
// because the TextPositionSequenceComparator is not transitive, but
|
||||
// JDK7+ enforces transitivity on comparators, we need to use
|
||||
// a custom quicksort implementation (which is slower, unfortunately).
|
||||
QuickSort.sort(sequence, comparator);
|
||||
return sequence;
|
||||
var sequences = textBlocks.stream()
|
||||
.flatMap(tb -> tb.getSequences()
|
||||
.stream())
|
||||
.collect(Collectors.toSet());
|
||||
return sortUsingLineDetection(sequences);
|
||||
}
|
||||
|
||||
public static List<TextPositionSequence> mergeTextPositionSequence(List<TextPageBlock> textBlocks) {
|
||||
|
||||
return textBlocks.stream().flatMap(tb -> tb.getSequences().stream()).collect(Collectors.toList());
|
||||
public List<TextPositionSequence> sort(List<TextPositionSequence> sequences) {
|
||||
|
||||
return sortUsingLineDetection(new HashSet<>(sequences));
|
||||
}
|
||||
|
||||
|
||||
private List<TextPositionSequence> sortUsingLineDetection(Set<TextPositionSequence> sequences) {
|
||||
|
||||
return groupByLine(sequences).stream()
|
||||
.map(TextPositionOperations::sortByXDirAdj)
|
||||
.filter(line -> !line.isEmpty())
|
||||
.sorted(Comparator.comparing(line -> line.get(0), COMPARATOR_DIR_ADJ))
|
||||
.flatMap(Collection::stream)
|
||||
.toList();
|
||||
|
||||
}
|
||||
|
||||
|
||||
private List<TextPositionSequence> sortByXDirAdj(Set<TextPositionSequence> line) {
|
||||
|
||||
return line.stream()
|
||||
.sorted(Comparator.comparing(TextPositionSequence::getXDirAdj))
|
||||
.toList();
|
||||
}
|
||||
|
||||
|
||||
private Collection<Set<TextPositionSequence>> groupByLine(Set<TextPositionSequence> sequences) {
|
||||
|
||||
double maxLineDistance = sequences.stream()
|
||||
.map(TextPositionSequence::getBBoxDirAdj)
|
||||
.mapToDouble(RectangularShape::getHeight).average()
|
||||
.orElse(10) * MAX_LINE_HEIGHT_FACTOR;
|
||||
double maxXGap = sequences.stream()
|
||||
.map(TextPositionSequence::getBBoxDirAdj)
|
||||
.mapToDouble(RectangularShape::getWidth).average()
|
||||
.orElse(75) * MAX_WORD_DISTANCE_FACTOR;
|
||||
|
||||
UnionFind<TextPositionSequence> unionFind = new UnionFind<>(sequences);
|
||||
|
||||
for (TextPositionSequence sequence : sequences) {
|
||||
for (TextPositionSequence sequence2 : sequences) {
|
||||
|
||||
if (sequence.equals(sequence2) || unionFind.inSameSet(sequence, sequence2)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
double angle = computeAngle(sequence.getBBoxDirAdj(), sequence2.getBBoxDirAdj());
|
||||
|
||||
double normalizedVerticalDistance = Math.abs(sequence.getBBoxDirAdj().getCenterY() - sequence2.getBBoxDirAdj().getCenterY()) / maxLineDistance;
|
||||
double normalizedHorizontalDistance = Math.abs(sequence.getBBoxDirAdj().getCenterX() - sequence2.getBBoxDirAdj().getCenterX()) / maxXGap;
|
||||
|
||||
if (sequence.getDir() != sequence2.getDir()
|
||||
|| Math.abs(sequence.getFontSize() - sequence2.getFontSize()) > 0.5 * Math.min(sequence.getFontSize(),
|
||||
sequence2.getFontSize())
|
||||
|| Math.pow(normalizedVerticalDistance, 2) + Math.pow(normalizedHorizontalDistance, 2) > 1
|
||||
|| !ANGLE_FILTER.matches(angle)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
unionFind.union(sequence, sequence2);
|
||||
}
|
||||
}
|
||||
|
||||
return unionFind.getGroups();
|
||||
}
|
||||
|
||||
|
||||
public double computeAngle(Rectangle2D rect1, Rectangle2D rect2) {
|
||||
|
||||
double rect1CentroidX = rect1.getCenterX();
|
||||
double rect1CentroidY = rect1.getCenterY();
|
||||
double rect2CentroidX = rect2.getCenterX();
|
||||
double rect2CentroidY = rect2.getCenterY();
|
||||
|
||||
double deltaX = rect2CentroidX - rect1CentroidX;
|
||||
double deltaY = rect2CentroidY - rect1CentroidY;
|
||||
|
||||
return FastAtan2.fastAtan2(deltaY, deltaX);
|
||||
}
|
||||
|
||||
|
||||
public List<TextPositionSequence> merge(List<TextPageBlock> textBlocks) {
|
||||
|
||||
return textBlocks.stream()
|
||||
.map(TextPageBlock::getSequences)
|
||||
.flatMap(Collection::stream)
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,99 +0,0 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package com.knecon.fforesight.service.layoutparser.processor.utils;
|
||||
|
||||
import java.util.Comparator;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
|
||||
/**
|
||||
* This class is a comparator for TextPosition operators. It handles
|
||||
* pages with text in different directions by grouping the text based
|
||||
* on direction and sorting in that direction. This allows continuous text
|
||||
* in a given direction to be more easily grouped together.
|
||||
*
|
||||
* @author Ben Litchfield
|
||||
*/
|
||||
public class TextPositionSequenceComparator implements Comparator<TextPositionSequence> {
|
||||
|
||||
@Override
|
||||
public int compare(TextPositionSequence pos1, TextPositionSequence pos2) {
|
||||
// only compare text that is in the same direction
|
||||
int cmp1 = Float.compare(pos1.getDir().getDegrees(), pos2.getDir().getDegrees());
|
||||
if (cmp1 != 0) {
|
||||
return cmp1;
|
||||
}
|
||||
|
||||
// get the text direction adjusted coordinates
|
||||
double x1 = pos1.getBBox().getX();
|
||||
double x2 = pos2.getBBox().getX();
|
||||
|
||||
double pos1YBottom = pos1.getBBox().getMaxY();
|
||||
double pos2YBottom = pos2.getBBox().getMaxY();
|
||||
|
||||
// note that the coordinates have been adjusted so 0,0 is in upper left
|
||||
double pos1YTop = pos1YBottom - pos1.getBBox().getHeight();
|
||||
double pos2YTop = pos2YBottom - pos2.getBBox().getHeight();
|
||||
|
||||
double yDifference = Math.abs(pos1YBottom - pos2YBottom);
|
||||
|
||||
|
||||
// Adjust for text rotation
|
||||
switch (pos1.getRotation()) {
|
||||
case 0:
|
||||
// 0 degrees (horizontal, top to bottom and left to right): Sort primarily by y-coordinates from top to bottom (pos1YBottom < pos2YBottom).
|
||||
if (yDifference < .1 || (pos2YBottom >= pos1YTop && pos2YBottom <= pos1YBottom) || (pos1YBottom >= pos2YTop && pos1YBottom <= pos2YBottom)) {
|
||||
return Double.compare(x1, x2);
|
||||
} else if (pos1YBottom < pos2YBottom) {
|
||||
return -1;
|
||||
} else {
|
||||
return 1;
|
||||
}
|
||||
case 90:
|
||||
// 90 degrees (vertical, right to left): Sort by x-coordinates first (x1 > x2), then by y-coordinates from top to bottom (pos1YBottom < pos2YBottom).
|
||||
if (x1 > x2) {
|
||||
return -1;
|
||||
} else if (x1 < x2) {
|
||||
return 1;
|
||||
} else {
|
||||
return Double.compare(pos1YBottom, pos2YBottom);
|
||||
}
|
||||
case 180:
|
||||
// 180 degrees (horizontal, bottom to top and right to left): Sort primarily by y-coordinates from bottom to top (pos1YBottom > pos2YBottom).
|
||||
if (yDifference < .1 || (pos2YBottom >= pos1YTop && pos2YBottom <= pos1YBottom) || (pos1YBottom >= pos2YTop && pos1YBottom <= pos2YBottom)) {
|
||||
return Double.compare(x2, x1);
|
||||
|
||||
} else if (pos1YBottom > pos2YBottom) {
|
||||
return -1;
|
||||
} else {
|
||||
return 1;
|
||||
}
|
||||
case 270:
|
||||
// 270 degrees (vertical, left to right): Sort by x-coordinates in reverse (x2 > x1), then by y-coordinates from bottom to top (pos2YBottom > pos1YBottom).
|
||||
if (x2 > x1) {
|
||||
return -1;
|
||||
} else if (x2 < x1) {
|
||||
return 1;
|
||||
} else {
|
||||
return Double.compare(pos2YBottom, pos1YBottom);
|
||||
}
|
||||
default:
|
||||
throw new RuntimeException("Rotation not supported. Only 0/90/180/270 degree rotation is supported.");
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@ -51,7 +51,7 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
|
||||
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(pageNumber, this.words);
|
||||
visualizationsOnPage.getColoredRectangles()
|
||||
.addAll(textPositionSequences.stream()
|
||||
.map(BoundingBox::getBBoxInitialUserSpace)
|
||||
.map(BoundingBox::getBBoxPdf)
|
||||
.map(rect -> new ColoredRectangle(rect, WORDS_COLOR, 1))
|
||||
.toList());
|
||||
}
|
||||
@ -105,7 +105,7 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
|
||||
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(pageNumber, this.cells);
|
||||
visualizationsOnPage.getColoredRectangles()
|
||||
.addAll(cells.stream()
|
||||
.map(cell -> new ColoredRectangle(cell.getBBoxInitialUserSpace(), CELLS_COLOR, 1))
|
||||
.map(cell -> new ColoredRectangle(cell.getBBoxPdf(), CELLS_COLOR, 1))
|
||||
.toList());
|
||||
}
|
||||
|
||||
@ -119,7 +119,7 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
|
||||
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(page, this.zones);
|
||||
visualizationsOnPage.getColoredRectangles()
|
||||
.addAll(zones.stream()
|
||||
.map(BoundingBox::getBBoxInitialUserSpace)
|
||||
.map(BoundingBox::getBBoxPdf)
|
||||
.map(zone -> new ColoredRectangle(zone, ZONES_COLOR, 1))
|
||||
.toList());
|
||||
|
||||
@ -144,7 +144,7 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
|
||||
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(page, this.lines);
|
||||
visualizationsOnPage.getColoredRectangles()
|
||||
.addAll(lines.stream()
|
||||
.map(BoundingBox::getBBoxInitialUserSpace)
|
||||
.map(BoundingBox::getBBoxPdf)
|
||||
.map(line -> new ColoredRectangle(line, LINES_COLOR, 0.5f))
|
||||
.toList());
|
||||
}
|
||||
@ -158,7 +158,7 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
|
||||
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(page, zones);
|
||||
visualizationsOnPage.getColoredRectangles()
|
||||
.addAll(textPageBlocks.stream()
|
||||
.map(rect -> new ColoredRectangle(rect.getBBoxInitialUserSpace(), ZONES_COLOR, 1))
|
||||
.map(rect -> new ColoredRectangle(rect.getBBoxPdf(), ZONES_COLOR, 1))
|
||||
.toList());
|
||||
}
|
||||
|
||||
@ -222,11 +222,11 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
|
||||
.flatMap(Collection::stream)
|
||||
.forEach(character -> {
|
||||
Color color = ROTATING_CHARACTER_COLOR.get(index.getAndIncrement() % ROTATING_CHARACTER_COLOR.size());
|
||||
Rectangle2D charBBox = character.getTextPosition().getBBoxInitialUserSpace();
|
||||
Rectangle2D charBBox = character.getTextPosition().getBBoxPdf();
|
||||
characterVisualizations.getColoredRectangles().add(new ColoredRectangle(charBBox, color, 1));
|
||||
character.getNeighbors()
|
||||
.forEach(neighbor -> {
|
||||
Rectangle2D neighborBBox = neighbor.getCharacter().getTextPosition().getBBoxInitialUserSpace();
|
||||
Rectangle2D neighborBBox = neighbor.getCharacter().getTextPosition().getBBoxPdf();
|
||||
Line2D line = new Line2D.Double(new Point2D.Double(charBBox.getCenterX(), charBBox.getCenterY()),
|
||||
new Point2D.Double(neighborBBox.getCenterX(), neighborBBox.getCenterY()));
|
||||
neighbourVisualizations.getColoredLines().add(new ColoredLine(line, color, 1));
|
||||
|
||||
@ -38,7 +38,7 @@ dependencies {
|
||||
implementation("com.amazonaws:aws-java-sdk-s3:1.12.536")
|
||||
implementation("org.springframework.cloud:spring-cloud-starter-openfeign:4.0.4")
|
||||
implementation("net.logstash.logback:logstash-logback-encoder:7.4")
|
||||
implementation("com.pdftron:PDFNet:10.7.0")
|
||||
implementation("com.pdftron:PDFNet:10.11.0")
|
||||
|
||||
// for integration testing only
|
||||
testImplementation(project(":viewer-doc-processor"))
|
||||
@ -52,6 +52,8 @@ dependencies {
|
||||
testImplementation("com.fasterxml.jackson.datatype:jackson-datatype-jsr310:${jacksonVersion}")
|
||||
testImplementation("org.apache.pdfbox:pdfbox:${pdfBoxVersion}")
|
||||
testImplementation("org.apache.pdfbox:pdfbox-tools:${pdfBoxVersion}")
|
||||
testImplementation("org.apache.commons:commons-text:1.12.0")
|
||||
|
||||
}
|
||||
|
||||
/*
|
||||
@ -89,6 +91,9 @@ tasks.named<BootBuildImage>("bootBuildImage") {
|
||||
environment.put("BPE_DELIM_JAVA_TOOL_OPTIONS", " ")
|
||||
environment.put("BPE_APPEND_JAVA_TOOL_OPTIONS", "-Dfile.encoding=UTF-8")
|
||||
|
||||
builder.set("docker-proxy.knecon.com/paketobuildpacks/builder:base")
|
||||
runImage.set("docker-proxy.knecon.com/paketobuildpacks/run:base-cnb")
|
||||
|
||||
imageName.set("nexus.knecon.com:5001/ff/${project.name}:${project.version}")
|
||||
if (project.hasProperty("buildbootDockerHostNetwork")) {
|
||||
network.set("host")
|
||||
@ -99,6 +104,13 @@ tasks.named<BootBuildImage>("bootBuildImage") {
|
||||
}
|
||||
verboseLogging.set(true)
|
||||
|
||||
builderRegistry {
|
||||
username.set(providers.gradleProperty("mavenUser").getOrNull())
|
||||
password.set(providers.gradleProperty("mavenPassword").getOrNull())
|
||||
email.set(providers.gradleProperty("mavenEmail").getOrNull())
|
||||
url.set("https://docker-proxy.knecon.com:5001/")
|
||||
}
|
||||
|
||||
publishRegistry {
|
||||
username.set(providers.gradleProperty("mavenUser").getOrNull())
|
||||
password.set(providers.gradleProperty("mavenPassword").getOrNull())
|
||||
@ -106,4 +118,5 @@ tasks.named<BootBuildImage>("bootBuildImage") {
|
||||
url.set("https://nexus.knecon.com:5001/")
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,20 +1,18 @@
|
||||
package com.knecon.fforesight.service.layoutparser.server;
|
||||
|
||||
import org.springframework.beans.factory.annotation.Value;
|
||||
import org.springframework.stereotype.Component;
|
||||
import org.springframework.context.annotation.Bean;
|
||||
import org.springframework.context.annotation.Configuration;
|
||||
|
||||
import com.google.common.base.Strings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.LayoutparserSettings;
|
||||
import com.pdftron.pdf.PDFNet;
|
||||
|
||||
import jakarta.annotation.PostConstruct;
|
||||
import jakarta.annotation.PreDestroy;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@Component
|
||||
@Configuration
|
||||
@RequiredArgsConstructor
|
||||
public class PDFNetInitializer {
|
||||
|
||||
@ -22,26 +20,17 @@ public class PDFNetInitializer {
|
||||
private String pdftronLicense;
|
||||
|
||||
|
||||
@Bean
|
||||
@SneakyThrows
|
||||
@PostConstruct
|
||||
// Do not change this back to an ApplicationRunner: with an ApplicationRunner the service takes messages from the queue before PDFNet is initialized, which leads to an UnsatisfiedLinkError.
|
||||
public void init() {
|
||||
|
||||
if (Strings.isNullOrEmpty(pdftronLicense)) {
|
||||
return;
|
||||
throw new IllegalArgumentException("PDFTRON_LICENSE not set!");
|
||||
}
|
||||
log.info("Initializing Native Libraries");
|
||||
log.info("Setting pdftron license: {}", pdftronLicense);
|
||||
PDFNet.setTempPath("/tmp/pdftron");
|
||||
PDFNet.initialize(pdftronLicense);
|
||||
|
||||
}
|
||||
|
||||
|
||||
@PreDestroy
|
||||
public void terminate() {
|
||||
|
||||
PDFNet.terminate();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -27,23 +27,28 @@ import lombok.extern.slf4j.Slf4j;
|
||||
@Slf4j
|
||||
public class LayoutparserEnd2EndTest extends AbstractTest {
|
||||
|
||||
public static final LayoutParsingType LAYOUT_PARSING_TYPE = LayoutParsingType.DOCUMINE;
|
||||
|
||||
@Autowired
|
||||
private LayoutParsingPipeline layoutParsingPipeline;
|
||||
|
||||
|
||||
@Test
|
||||
@Disabled
|
||||
public void testLayoutParserEndToEnd() {
|
||||
|
||||
String filePath = "files/test-1.pdf";
|
||||
String filePath = "/home/kschuettler/Downloads/55974b3de7ed2915718a10458206bbd8.ORIGIN.pdf";
|
||||
|
||||
runForFile(filePath);
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@Disabled
|
||||
@SneakyThrows
|
||||
public void testLayoutParserEndToEndWithFolder() {
|
||||
|
||||
String folder = "/Users/maverickstuder/Documents/Fforesight/layoutparser/layoutparser-service/layoutparser-service-server/src/test";
|
||||
String folder = "/home/kschuettler/Dokumente/TestFiles/ReadingOrder";
|
||||
List<Path> pdfFiles = Files.walk(Path.of(folder))
|
||||
.filter(path -> path.getFileName().toString().endsWith(".pdf"))
|
||||
.sorted(Comparator.comparing(Path::getFileName))
|
||||
@ -69,7 +74,8 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
|
||||
file = new File(filePath);
|
||||
}
|
||||
|
||||
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(fileName, LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH, true);
|
||||
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(fileName, LAYOUT_PARSING_TYPE, true);
|
||||
|
||||
prepareStorage(layoutParsingRequest, file);
|
||||
|
||||
LayoutParsingFinishedEvent finishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest);
|
||||
|
||||
@ -57,9 +57,11 @@ public class OutlineDetectionTest extends AbstractTest {
|
||||
pdfNetInitializer.init();
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testOutlineError(){
|
||||
public void testOutlineError() {
|
||||
|
||||
String fileName = "files/syngenta/CustomerFiles/Clarifynd/VV-470942.pdf";
|
||||
|
||||
ClassificationDocument classificationDocument = parseLayout(fileName, LayoutParsingType.CLARIFYND);
|
||||
|
||||
@ -0,0 +1,452 @@
|
||||
package com.knecon.fforesight.service.layoutparser.server.graph;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
import java.awt.Color;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.io.FileOutputStream;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.commons.text.similarity.LevenshteinDistance;
|
||||
import org.junit.jupiter.api.AfterEach;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
|
||||
import com.iqser.red.storage.commons.service.StorageService;
|
||||
import com.iqser.red.storage.commons.utils.FileSystemBackedStorageService;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionOperations;
|
||||
import com.knecon.fforesight.service.layoutparser.server.PDFNetInitializer;
|
||||
import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.Standard14EmbeddableFont;
|
||||
import com.pdftron.common.Matrix2D;
|
||||
import com.pdftron.common.PDFNetException;
|
||||
import com.pdftron.pdf.ColorPt;
|
||||
import com.pdftron.pdf.ColorSpace;
|
||||
import com.pdftron.pdf.Element;
|
||||
import com.pdftron.pdf.ElementBuilder;
|
||||
import com.pdftron.pdf.ElementWriter;
|
||||
import com.pdftron.pdf.Font;
|
||||
import com.pdftron.pdf.GState;
|
||||
import com.pdftron.pdf.PDFDoc;
|
||||
import com.pdftron.pdf.Page;
|
||||
import com.pdftron.sdf.SDFDoc;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
public class DocumentReadingOrderTest extends BuildDocumentTest {
|
||||
|
||||
private static final boolean DRAW_DIR_ADJ_COORDS = false;
|
||||
public static final List<LayoutParsingType> LAYOUT_PARSING_TYPES = List.of(LayoutParsingType.DOCUMINE,
|
||||
LayoutParsingType.DOCUMINE_OLD,
|
||||
LayoutParsingType.REDACT_MANAGER,
|
||||
LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH);
|
||||
|
||||
@Autowired
|
||||
PDFNetInitializer pdfNetInitializer;
|
||||
|
||||
@Autowired
|
||||
StorageService storageService;
|
||||
|
||||
|
||||
@BeforeEach
|
||||
public void before() {
|
||||
|
||||
pdfNetInitializer.init();
|
||||
}
|
||||
|
||||
|
||||
@AfterEach
|
||||
public void cleanUp() {
|
||||
|
||||
((FileSystemBackedStorageService) storageService).clearStorage();
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void readingOrderTestSeite14() {
|
||||
|
||||
String pdfFile = "files/syngenta/CustomerFiles/SinglePages/Seite14.pdf";
|
||||
String expectedText = """
|
||||
27
|
||||
26 APPENDICES SECTION
|
||||
APPENDIX 1 Analytical Report
|
||||
syngenta
|
||||
A16148F
|
||||
Batch ID 533158 (GP-080305)
|
||||
Batch Identification 533158
|
||||
Product Design Code A16148F
|
||||
Product Denomination SYN524464 FS (500)
|
||||
Product by Common Name SYN524464 FS (500)
|
||||
Other Product Code(s) GP-080305
|
||||
Source Technology & Projects, Syngenta Crop Protection, Inc.
|
||||
Chemical Analysis
|
||||
(Active Ingredient Content)
|
||||
Identity of the Active Ingredient* Confirmed
|
||||
Content of SYN524464* 45.6% (wt/wt) or 534 g/L
|
||||
Methodology Used for Characterization HPLC
|
||||
The Active Ingredient content is within the FAO limits.
|
||||
Physical Analysis
|
||||
Appearance* pink opaque liquid
|
||||
Density* 1171 g/L
|
||||
Stability:
|
||||
Storage Temperature <30°℃
|
||||
Expiration date March 2009
|
||||
The stability of this test substance will be determined concurrently through reanalysis of material held
|
||||
in inventory under GLP conditions at Syngenta Crop Protection, Inc., Greensboro, NC
|
||||
This Certificate of Analysis is summarizing data (marked with an asterisk) from a study that has been
|
||||
performed in compliance with Good Laboratory Practices per 40 CFR Part 160 Raw data,
|
||||
documentation, protocols, any amendments to study protocols and reports pertaining to this study are
|
||||
maintained in the Syngenta Crop Protection Archives in Greensboro, NC.
|
||||
Authorization'
|
||||
26 Mar 2008
|
||||
Dorothea Jeffery Date
|
||||
Group Leader I
|
||||
Analytical & Product Chemistry Department
|
||||
Document 10350420.doc Certificate of Analysis
|
||||
Page 1 of 1 Study T000973-08
|
||||
Report Number: 11813-08 Page 14 of 14
|
||||
""";
|
||||
|
||||
assertSimilarReadingOrder(expectedText, pdfFile);
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void readingOrderTestTiltedText() {
|
||||
|
||||
String pdfFile = "files/syngenta/CustomerFiles/SinglePages/tiltedText.pdf";
|
||||
String expectedText = """
|
||||
However there was no consistency in the areas affected either between sexes or at different
|
||||
ages, in general other measurements for the same structures at other levels showed no
|
||||
differences, all were within the historical control range of mean values and none of these
|
||||
differences is considered to be related to treatment (Appendix K).
|
||||
7. DISCUSSION
|
||||
The purpose of this study, which was to determine the potential for developmental
|
||||
neurotoxicity in the assessment and evaluation of the toxic characteristics of lambda-
|
||||
cyhalothrin in rats, was successfully accomplished.
|
||||
There was evidence of toxicity characterised by lower bodyweights and food consumption in
|
||||
dams receiving 60 or 150 ppm lambda-cyhalothrin during gestation and also post partum in
|
||||
the 150 ppm group only.
|
||||
There were no treatment-related effects of administration of lambda-cyhalothrin on
|
||||
reproductive parameters: there were no effects on gestation length, mean litter size or on pup
|
||||
bodyweight at birth.
|
||||
There was evidence of toxicity in F1 animals receiving 150 ppm. This was seen as slightly
|
||||
higher pup mortality up to day 5 and lower bodyweights from day 5, reaching a maximum of
|
||||
approximately 8-9% below control on day 22.
|
||||
There was a small difference in the age at which male rats in the 150 ppm group reached
|
||||
preputial separation, but this was too small to be of toxicological significance.
|
||||
No effects were seen on motor activity or response to auditory startle.
|
||||
There was no clear evidence of any effects in the learning and memory assessment in
|
||||
weanling (age 21-24 days) or young adult animals (age 59-62 days). However, at day 21
|
||||
swimming speeds of females receiving 150 ppm were slightly slower than controls. The
|
||||
difference is considered to reflect a difference in swimming performance rather than an effect
|
||||
on learning or memory.
|
||||
No neuropathological effect of treatment with lambda-cyhalothrin was detected from a
|
||||
detailed microscopic examination of the selected F1 animals post mortem on day 12 or 63.
|
||||
LAMBDA-CYHALOTHRIN: DEVELOPMENTAL NEUROTOXICITY STUDY IN RATS
|
||||
CTL/RR0969/REGULATORY/REPORT - 34
|
||||
""";
|
||||
|
||||
assertSimilarReadingOrder(expectedText, pdfFile);
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void readingOrderTest402Study() {
|
||||
|
||||
String pdfFile = "files/SinglePages/402StudyPage5.pdf";
|
||||
String expectedText = """
|
||||
2.0 INTRODUCTION
|
||||
2.1 Purpose
|
||||
Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor
|
||||
invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et
|
||||
accusam et justo duo dolores et ea rebum.
|
||||
Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem
|
||||
ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt
|
||||
ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et
|
||||
justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est
|
||||
Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed
|
||||
diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam
|
||||
voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd
|
||||
gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.
|
||||
2.2 Guidelines
|
||||
Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor
|
||||
invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua.
|
||||
At vero eos et accusam et justo duo dolores et ea rebum.
|
||||
Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem
|
||||
ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt
|
||||
ut labore et dolore magna aliquyam erat, sed diam voluptua.
|
||||
At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no
|
||||
sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet,
|
||||
consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore
|
||||
magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et
|
||||
ea rebum.
|
||||
Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.
|
||||
Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie consequat, vel
|
||||
illum dolore eu feugiat nulla facilisis at vero eros et accumsan et iusto odio dignissim qui
|
||||
blandit praesent luptatum zzril delenit augue duis dolore te feugait nulla facilisi. Lorem
|
||||
ipsum dolor sit amet.
|
||||
2.3 Test Facility
|
||||
Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor
|
||||
invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et
|
||||
accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata
|
||||
sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur
|
||||
sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna
|
||||
aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea
|
||||
rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.
|
||||
Report Number: 20/080-002P 5
|
||||
""";
|
||||
|
||||
assertSimilarReadingOrder(expectedText, pdfFile);
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void readingOrderTest402StudyRotated() {
|
||||
|
||||
String pdfFile = "files/SinglePages/402StudyPage5_rotated.pdf";
|
||||
String expectedText = """
|
||||
2.0 INTRODUCTION
|
||||
2.1 Purpose
|
||||
Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor
|
||||
invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et
|
||||
accusam et justo duo dolores et ea rebum.
|
||||
Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem
|
||||
ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt
|
||||
ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et
|
||||
justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est
|
||||
Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed
|
||||
diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam
|
||||
voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd
|
||||
gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.
|
||||
2.2 Guidelines
|
||||
Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor
|
||||
invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua.
|
||||
At vero eos et accusam et justo duo dolores et ea rebum.
|
||||
Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem
|
||||
ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt
|
||||
ut labore et dolore magna aliquyam erat, sed diam voluptua.
|
||||
At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no
|
||||
sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet,
|
||||
consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore
|
||||
magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et
|
||||
ea rebum.
|
||||
Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.
|
||||
Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie consequat, vel
|
||||
illum dolore eu feugiat nulla facilisis at vero eros et accumsan et iusto odio dignissim qui
|
||||
blandit praesent luptatum zzril delenit augue duis dolore te feugait nulla facilisi. Lorem
|
||||
ipsum dolor sit amet.
|
||||
2.3 Test Facility
|
||||
Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor
|
||||
invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et
|
||||
accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata
|
||||
sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur
|
||||
sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna
|
||||
aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea
|
||||
rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.
|
||||
Report Number: 20/080-002P 5
|
||||
""";
|
||||
|
||||
assertSimilarReadingOrder(expectedText, pdfFile);
|
||||
}
|
||||
|
||||
|
||||
private void assertSimilarReadingOrder(String expectedText, String pdfFile) {
|
||||
|
||||
List<String> expectedLines = List.of(expectedText.split("\n"));
|
||||
for (LayoutParsingType layoutParsingType : LAYOUT_PARSING_TYPES) {
|
||||
|
||||
log.info("Evaluating for {}", layoutParsingType);
|
||||
|
||||
ClassificationDocument classificationDocument = parseLayout(pdfFile, layoutParsingType);
|
||||
|
||||
if (DRAW_DIR_ADJ_COORDS) {
|
||||
drawDirAdjCoords(pdfFile, classificationDocument, layoutParsingType);
|
||||
}
|
||||
|
||||
Document document = DocumentGraphFactory.buildDocumentGraph(layoutParsingType, classificationDocument);
|
||||
List<String> readLines = getTextAsLines(document);
|
||||
readLines.forEach(log::info);
|
||||
|
||||
int correctCount = 0;
|
||||
int maxLineOffset = 0;
|
||||
for (int i = 0; i < expectedLines.size(); i++) {
|
||||
String expectedLine = expectedLines.get(i);
|
||||
int mostSimilarLine = 0;
|
||||
double maxSimilarity = 0;
|
||||
for (int j = 0; j < readLines.size(); j++) {
|
||||
String readLine = readLines.get(j);
|
||||
double similarity = similarity(expectedLine, readLine);
|
||||
if (similarity > maxSimilarity) {
|
||||
maxSimilarity = similarity;
|
||||
mostSimilarLine = j;
|
||||
}
|
||||
}
|
||||
if (readLines.get(mostSimilarLine).trim().equals(expectedLine.trim())) {
|
||||
correctCount++;
|
||||
int lineOffset = Math.abs(mostSimilarLine - i);
|
||||
if (lineOffset > 0) {
|
||||
log.info("Line {} offset by {}", readLines.get(mostSimilarLine), lineOffset);
|
||||
}
|
||||
if (lineOffset > maxLineOffset) {
|
||||
maxLineOffset = lineOffset;
|
||||
}
|
||||
} else {
|
||||
log.error("Lines {}-{} do not match: \n Expected: {}\n Actual: {}", i, mostSimilarLine, expectedLine, readLines.get(mostSimilarLine));
|
||||
}
|
||||
}
|
||||
double correctLinesFactor = (double) correctCount / (double) readLines.size();
|
||||
double averageLineOffset = (double) maxLineOffset / (double) readLines.size();
|
||||
|
||||
log.info("Difference in number of lines: {}", Math.abs(expectedLines.size() - readLines.size()));
|
||||
log.info("Correct lines factor: {}", correctLinesFactor);
|
||||
log.info("Max order offset: {}, avg: {}", maxLineOffset, averageLineOffset);
|
||||
// In the rotated document one line is read as two
|
||||
|
||||
assertTrue(Math.abs(expectedLines.size() - readLines.size()) <= 1);
|
||||
// Most of the errors come from the similarity metric finding different lines in 402 study, as the lines are too similar, or a miss classification of Footers
|
||||
assertTrue(averageLineOffset < 1);
|
||||
assertTrue(correctLinesFactor > 0.9);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public List<String> getTextAsLines(Document document) {
|
||||
|
||||
return document.getTextBlock().getAtomicTextBlocks()
|
||||
.stream()
|
||||
.filter(atb -> !atb.isEmpty())
|
||||
.map(DocumentReadingOrderTest::getLines)
|
||||
.flatMap(List::stream)
|
||||
.toList();
|
||||
}
|
||||
|
||||
|
||||
private static List<String> getLines(AtomicTextBlock atomicTextBlock) {
|
||||
|
||||
int numberOfLines = atomicTextBlock.numberOfLines();
|
||||
List<String> lines = new ArrayList<>(numberOfLines);
|
||||
for (int line = 0; line < numberOfLines; line++) {
|
||||
lines.add(atomicTextBlock.getLine(line).toString());
|
||||
}
|
||||
return lines;
|
||||
}
|
||||
|
||||
|
||||
private static double similarity(String s1, String s2) {
|
||||
|
||||
LevenshteinDistance levenshteinDistance = new LevenshteinDistance();
|
||||
|
||||
int max = Math.max(s1.length(), s2.length());
|
||||
int dist = levenshteinDistance.apply(s1, s2);
|
||||
return 1 - (double) dist / (double) max;
|
||||
}
|
||||
|
||||
|
||||
/**
 * Debug helper: renders the direction-adjusted coordinates of a parsed document into a new PDF.
 * <p>
 * One new page is created per classification page. For every {@code TextPageBlock} on it, each text position sequence
 * returned by {@code TextPositionOperations.mergeAndSort} is written in black Helvetica at its direction-adjusted
 * position, preceded by a red running number that restarts at 0 on every page. The sequence's bounding box is stroked
 * in black and the block's bounding box in blue.
 * <p>
 * The result is saved to {@code /tmp/READING_ORDER_TEST/<layoutParsingType>_<file name>_dirAdjCoordinates.pdf}.
 * Checked exceptions are rethrown unchecked through {@code @SneakyThrows}.
 *
 * @param filename               path of the parsed PDF; only its file name is used for the output name
 * @param classificationDocument parsed document whose pages and text blocks are drawn
 * @param layoutParsingType      parsing type, used as prefix of the output file name
 */
@SneakyThrows
|
||||
private void drawDirAdjCoords(String filename, ClassificationDocument classificationDocument, LayoutParsingType layoutParsingType) {
|
||||
|
||||
try (PDFDoc pdfDoc = new PDFDoc(); ElementWriter writer = new ElementWriter(); ElementBuilder builder = new ElementBuilder()) {
|
||||
|
||||
// 'font' is only used to measure string widths; 'helvetica' is the font the text is actually written with.
Standard14EmbeddableFont font = Standard14EmbeddableFont.helvetica();
|
||||
Font helvetica = Font.create(pdfDoc, Font.e_helvetica);
|
||||
for (ClassificationPage classificationDocumentPage : classificationDocument.getPages()) {
|
||||
// running number of the sequences on this page
int count = 0;
|
||||
// NOTE(review): the page is created without explicit dimensions, so it presumably does not take over the size of the source page - confirm.
Page page = pdfDoc.pageCreate();
|
||||
writer.begin(page);
|
||||
for (AbstractPageBlock abstractBlock : classificationDocumentPage.getTextBlocks()) {
|
||||
|
||||
// blocks that are not TextPageBlocks are not drawn
if (abstractBlock instanceof TextPageBlock textBlock) {
|
||||
for (TextPositionSequence sequence : TextPositionOperations.mergeAndSort(List.of(textBlock))) {
|
||||
|
||||
float stringWidth;
|
||||
try {
|
||||
stringWidth = font.getStringWidth(sequence.toString());
|
||||
} catch (Exception e) {
|
||||
// best-effort fallback when the width cannot be measured: average glyph width times number of characters
stringWidth = font.getFont().getAverageFontWidth() * sequence.toString().length();
|
||||
}
|
||||
// scale the font size so that the written string is as wide as the sequence's direction-adjusted bounding box
double fontSize = (sequence.getBBoxDirAdj().getWidth() / stringWidth) * 1000;
|
||||
// pure translation matrix; the y coordinate is flipped against the crop box height
try (Matrix2D textMatrix = new Matrix2D(1,
|
||||
0,
|
||||
0,
|
||||
1,
|
||||
sequence.getXDirAdj(),
|
||||
page.getCropBox().getHeight() - sequence.getYDirAdj() - sequence.getHeightDirAdj())) {
|
||||
writeText(sequence.toString(), textMatrix, builder, helvetica, fontSize, writer, Color.BLACK);
|
||||
// the running number is shifted left by 2 plus 5 units per digit
writeText(String.valueOf(count), textMatrix.translate(-(2 + (5 * String.valueOf(count).length())), 0), builder, helvetica, 8, writer, Color.RED);
|
||||
count++;
|
||||
}
|
||||
|
||||
writeBBox(sequence.getBBoxDirAdj(), builder, page, writer, Color.BLACK);
|
||||
}
|
||||
writeBBox(textBlock.getBBoxDirAdj(), builder, page, writer, Color.BLUE);
|
||||
}
|
||||
}
|
||||
writer.end();
|
||||
pdfDoc.pagePushBack(page);
|
||||
|
||||
}
|
||||
|
||||
Path stem = Path.of("/tmp/READING_ORDER_TEST/");
|
||||
Files.createDirectories(stem);
|
||||
// the suffix is appended to the string form of the resolved file, i.e. after the original file name including its extension
try (var out = new FileOutputStream(stem.resolve(layoutParsingType.name() + "_" + Path.of(filename).getFileName()).toFile() + "_dirAdjCoordinates.pdf")) {
|
||||
pdfDoc.save(out, SDFDoc.SaveMode.LINEARIZED, null);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private static void writeBBox(Rectangle2D r, ElementBuilder builder, Page page, ElementWriter writer, Color color) throws PDFNetException {
|
||||
|
||||
Element rect = builder.createRect(r.getX(), page.getCropBox().getHeight() - r.getY(), r.getWidth(), -r.getHeight());
|
||||
float[] comp = color.getColorComponents(null);
|
||||
rect.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
|
||||
try (ColorPt colorpt = new ColorPt(comp[0], comp[1], comp[2])) {
|
||||
rect.getGState().setStrokeColor(colorpt);
|
||||
}
|
||||
rect.setPathStroke(true);
|
||||
writer.writeElement(rect);
|
||||
}
|
||||
|
||||
|
||||
private static void writeText(String string,
|
||||
Matrix2D matrix2D,
|
||||
ElementBuilder builder,
|
||||
Font helvetica,
|
||||
double fontSize,
|
||||
ElementWriter writer,
|
||||
Color color) throws PDFNetException {
|
||||
|
||||
Element text = builder.createTextBegin(helvetica, fontSize);
|
||||
text.getGState().setFillColorSpace(ColorSpace.createDeviceRGB());
|
||||
float[] colorComponents = color.getColorComponents(null);
|
||||
try (ColorPt colorpt = new ColorPt(colorComponents[0], colorComponents[1], colorComponents[2])) {
|
||||
text.getGState().setFillColor(colorpt);
|
||||
}
|
||||
text.setTextMatrix(matrix2D);
|
||||
text.getGState().setTextRenderMode(GState.e_fill_text);
|
||||
writer.writeElement(text);
|
||||
|
||||
text = builder.createTextRun(string);
|
||||
writer.writeElement(text);
|
||||
text = builder.createTextEnd();
|
||||
writer.writeElement(text);
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,60 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.server.model;
|
||||
|
||||
import static org.assertj.core.api.Assertions.assertThat;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
|
||||
import org.apache.pdfbox.util.Matrix;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.iqser.red.storage.commons.properties.StorageProperties;
|
||||
import com.iqser.red.storage.commons.service.ObjectSerializer;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
public class TextPositionSequenceTest {
|
||||
|
||||
private static final String TEXT_POSITION_SEQUENCE_AS_JSON = "{\n" //
|
||||
+ " \"page\": 1,\n" //
|
||||
+ " \"textPositions\": [],\n" //
|
||||
+ " \"dir\": 180.0,\n" //
|
||||
+ " \"rotation\": 0,\n" //
|
||||
+ " \"pageHeight\": 800,\n" //
|
||||
+ " \"pageWidth\": 600\n" //
|
||||
+ "}";
|
||||
|
||||
private final ObjectSerializer objectSerializer = new ObjectSerializer(new ObjectMapper());
|
||||
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testDeserializationWithJackson() {
|
||||
|
||||
TextPositionSequence textPositionSequence = objectSerializer.deserialize(new ByteArrayInputStream(TEXT_POSITION_SEQUENCE_AS_JSON.getBytes(StandardCharsets.UTF_8)),
|
||||
TextPositionSequence.class);
|
||||
|
||||
assertPropertiesAfterJsonDeserialization(textPositionSequence);
|
||||
}
|
||||
|
||||
|
||||
private void assertPropertiesAfterJsonDeserialization(TextPositionSequence textPositionSequence) {
|
||||
|
||||
assertThat(textPositionSequence.getPage()).isEqualTo(1);
|
||||
assertThat(textPositionSequence.getTextPositions()).hasSize(0);
|
||||
assertThat(textPositionSequence.getDir()).isEqualTo(TextDirection.HALF_CIRCLE);
|
||||
assertThat(textPositionSequence.getRotation()).isEqualTo(0);
|
||||
assertThat(textPositionSequence.getPageHeight()).isEqualTo(800f);
|
||||
assertThat(textPositionSequence.getPageWidth()).isEqualTo(600f);
|
||||
}
|
||||
|
||||
|
||||
private Matrix createIdentityMatrix() {
|
||||
|
||||
return new Matrix();
|
||||
}
|
||||
|
||||
}
|
||||
@ -3,12 +3,10 @@ package com.knecon.fforesight.service.layoutparser.server.services;
|
||||
import java.nio.file.Path;
|
||||
import java.util.List;
|
||||
|
||||
import org.junit.jupiter.api.Disabled;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
|
||||
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
|
||||
|
||||
@ -29,7 +27,7 @@ class PageContentExtractorTest {
|
||||
textPositionPerPage.stream()
|
||||
.map(t -> t.getSortedTextPositionSequences()
|
||||
.stream()
|
||||
.map(TextPositionSequence::getBBoxInitialUserSpace)
|
||||
.map(TextPositionSequence::getBBoxPdf)
|
||||
.map(List::of)
|
||||
.toList())
|
||||
.toList(), tmpFileName);
|
||||
|
||||
@ -1,6 +1,8 @@
|
||||
package com.knecon.fforesight.service.layoutparser.server.utils;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.io.File;
|
||||
import java.io.FileOutputStream;
|
||||
import java.nio.file.Path;
|
||||
import java.util.Map;
|
||||
|
||||
@ -10,11 +12,27 @@ import org.springframework.core.io.ClassPathResource;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.Standard14EmbeddableFont;
|
||||
import com.pdftron.common.Matrix2D;
|
||||
import com.pdftron.pdf.ColorPt;
|
||||
import com.pdftron.pdf.ColorSpace;
|
||||
import com.pdftron.pdf.Element;
|
||||
import com.pdftron.pdf.ElementBuilder;
|
||||
import com.pdftron.pdf.ElementWriter;
|
||||
import com.pdftron.pdf.Font;
|
||||
import com.pdftron.pdf.GState;
|
||||
import com.pdftron.pdf.PDFDoc;
|
||||
import com.pdftron.pdf.Page;
|
||||
import com.pdftron.sdf.SDFDoc;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
@ -48,14 +66,14 @@ public abstract class BuildDocumentTest extends AbstractTest {
|
||||
@SneakyThrows
|
||||
protected Document buildGraph(String filename, LayoutParsingType layoutParsingType) {
|
||||
|
||||
|
||||
if (!filename.startsWith("files") && filename.startsWith("/")) {
|
||||
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(Path.of(filename).getFileName().toString(), LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH, true);
|
||||
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(Path.of(filename).getFileName().toString(), layoutParsingType, true);
|
||||
prepareStorage(layoutParsingRequest, new File(filename));
|
||||
return DocumentGraphFactory.buildDocumentGraph(layoutParsingType,
|
||||
layoutParsingPipeline.parseLayout(layoutParsingType,
|
||||
new File(filename),
|
||||
layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId().get()),
|
||||
layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId()
|
||||
.get()),
|
||||
new TableServiceResponse(),
|
||||
new VisualLayoutParsingResponse(),
|
||||
layoutParsingRequest.identifier()));
|
||||
@ -65,10 +83,12 @@ public abstract class BuildDocumentTest extends AbstractTest {
|
||||
} else {
|
||||
prepareStorage(filename);
|
||||
}
|
||||
return DocumentGraphFactory.buildDocumentGraph(layoutParsingType, parseLayout(filename, layoutParsingType));
|
||||
var classificationDocument = parseLayout(filename, layoutParsingType);
|
||||
return DocumentGraphFactory.buildDocumentGraph(layoutParsingType, classificationDocument);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
BIN
layoutparser-service/layoutparser-service-server/src/test/resources/files/SinglePages/402StudyPage5.pdf
(Stored with Git LFS)
Normal file
BIN
layoutparser-service/layoutparser-service-server/src/test/resources/files/SinglePages/402StudyPage5.pdf
(Stored with Git LFS)
Normal file
Binary file not shown.
BIN
layoutparser-service/layoutparser-service-server/src/test/resources/files/SinglePages/402StudyPage5_rotated.pdf
(Stored with Git LFS)
Normal file
BIN
layoutparser-service/layoutparser-service-server/src/test/resources/files/SinglePages/402StudyPage5_rotated.pdf
(Stored with Git LFS)
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -1 +1 @@
|
||||
Subproject commit c6fd9e849f3efd7d1507401f63629b91dec9f4ec
|
||||
Subproject commit 0da08b1d9d1bc815a3fb51aa9638eafea2cf02d5
|
||||
@ -12,7 +12,7 @@ dependencies {
|
||||
implementation("org.apache.pdfbox:pdfbox-tools:${pdfBoxVersion}")
|
||||
implementation("org.slf4j:slf4j-api:1.7.25")
|
||||
implementation("com.knecon.fforesight:tracing-commons:0.5.0")
|
||||
implementation("com.pdftron:PDFNet:10.7.0")
|
||||
implementation("com.pdftron:PDFNet:10.11.0")
|
||||
|
||||
testImplementation("org.apache.logging.log4j:log4j-slf4j-impl:2.22.1")
|
||||
testImplementation("org.junit.jupiter:junit-jupiter")
|
||||
|
||||
@ -10,12 +10,14 @@ import org.apache.pdfbox.pdmodel.font.Standard14Fonts;
|
||||
import com.pdftron.pdf.Font;
|
||||
import com.pdftron.pdf.PDFDoc;
|
||||
|
||||
import lombok.Getter;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
@RequiredArgsConstructor
|
||||
public class Standard14EmbeddableFont implements EmbeddableFont {
|
||||
|
||||
@Getter
|
||||
private final PDType1Font font;
|
||||
|
||||
private final int pdfTronIdentifier;
|
||||
|
||||
@ -96,15 +96,18 @@ public class PDFTronViewerDocumentService {
|
||||
boolean isCurrentVersion = ViewerDocVersioningUtility.docIsCurrentVersion(pdfDoc);
|
||||
|
||||
int pageNumber = 1;
|
||||
for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); pageNumber++) {
|
||||
try (PageIterator iterator = pdfDoc.getPageIterator()) {
|
||||
while (iterator.hasNext()) {
|
||||
|
||||
Page page = iterator.next();
|
||||
Page page = iterator.next();
|
||||
|
||||
if (isCurrentVersion) {
|
||||
pageContentCleaner.removeMarkedContent(page);
|
||||
if (isCurrentVersion) {
|
||||
pageContentCleaner.removeMarkedContent(page);
|
||||
}
|
||||
|
||||
visualizationWriter.drawVisualizationsOnPage(pageNumber, page);
|
||||
pageNumber++;
|
||||
}
|
||||
|
||||
visualizationWriter.drawVisualizationsOnPage(pageNumber, page);
|
||||
}
|
||||
|
||||
ViewerDocVersioningUtility.setVersionInDocument(pdfDoc);
|
||||
|
||||
@ -343,12 +343,7 @@ public class VisualizationWriter {
|
||||
@SneakyThrows
|
||||
private static AffineTransform getTextDeRotationTransform(Page page) {
|
||||
|
||||
return AffineTransform.getQuadrantRotateInstance(switch (page.getRotation()) {
|
||||
case 90 -> 3;
|
||||
case 180 -> 2;
|
||||
case 270 -> 1;
|
||||
default -> 0;
|
||||
});
|
||||
return AffineTransform.getQuadrantRotateInstance(page.getRotation());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -61,11 +61,12 @@ class PageContentCleanerTest {
|
||||
.markedContentToRemove(Set.of(LayerIdentifier.KNECON_OCR_DEBUG.markedContentName()))
|
||||
.build();
|
||||
|
||||
for (PageIterator iterator = doc.getPageIterator(); iterator.hasNext(); ) {
|
||||
try (PageIterator iterator = doc.getPageIterator()) {
|
||||
while (iterator.hasNext()) {
|
||||
Page page = iterator.next();
|
||||
|
||||
Page page = iterator.next();
|
||||
|
||||
pageContentCleaner.removeMarkedContent(page);
|
||||
pageContentCleaner.removeMarkedContent(page);
|
||||
}
|
||||
}
|
||||
|
||||
doc.save(out, SDFDoc.SaveMode.REMOVE_UNUSED, null);
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user