Merge branch 'main' into RED-7074

# Conflicts:
#	layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java
#	layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java
#	layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/SectionsBuilderService.java
#	layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TableExtractionService.java
#	layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java
#	layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java
#	layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java
#	layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SectionNodeFactory.java
#	layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/HeadlinesGoldStandardIntegrationTest.java
This commit is contained in:
maverickstuder 2024-05-15 14:17:59 +02:00
commit 61c90fc30d
77 changed files with 3204 additions and 1858 deletions

1
.gitattributes vendored Normal file
View File

@ -0,0 +1 @@
*.pdf filter=lfs diff=lfs merge=lfs -text

4
.gitmodules vendored
View File

@ -1,8 +1,8 @@
[submodule "layoutparser-service/layoutparser-service-server/src/test/resources/files/basf"]
path = layoutparser-service/layoutparser-service-server/src/test/resources/files/basf
url = https://gitlab.knecon.com/fforesight/documents/basf.git
url = ssh://git@git.knecon.com:22222/fforesight/documents/basf.git
update = merge
[submodule "layoutparser-service/layoutparser-service-server/src/test/resources/files/syngenta"]
path = layoutparser-service/layoutparser-service-server/src/test/resources/files/syngenta
url = https://gitlab.knecon.com/fforesight/documents/syngenta.git
url = ssh://git@git.knecon.com:22222/fforesight/documents/syngenta.git
update = merge

View File

@ -5,6 +5,7 @@ public enum LayoutParsingType {
REDACT_MANAGER_OLD,
REDACT_MANAGER_PARAGRAPH_DEBUG,
DOCUMINE,
DOCUMINE_OLD,
CLARIFYND,
CLARIFYND_PARAGRAPH_DEBUG
}

View File

@ -52,6 +52,7 @@ import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBui
import com.knecon.fforesight.service.layoutparser.processor.services.SimplifiedSectionTextService;
import com.knecon.fforesight.service.layoutparser.processor.services.TableExtractionService;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.BlockificationPostprocessingService;
import com.knecon.fforesight.service.layoutparser.processor.services.TextRulingsClassifier;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocstrumBlockificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocuMineBlockificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.RedactManagerBlockificationService;
@ -59,12 +60,14 @@ import com.knecon.fforesight.service.layoutparser.processor.services.classificat
import com.knecon.fforesight.service.layoutparser.processor.services.classification.DocuMineClassificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
import com.knecon.fforesight.service.layoutparser.processor.services.graphics.Box;
import com.knecon.fforesight.service.layoutparser.processor.services.graphics.GraphicExtractorService;
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper;
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.TaasDocumentDataMapper;
import com.knecon.fforesight.service.layoutparser.processor.services.parsing.PDFLinesTextStripper;
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService;
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
import io.micrometer.observation.Observation;
import io.micrometer.observation.ObservationRegistry;
@ -104,6 +107,7 @@ public class LayoutParsingPipeline {
OutlineExtractorService outlineExtractorService;
OutlineValidationService outlineValidationService;
TOCEnrichmentService tocEnrichmentService;
LayoutparserSettings settings;
public LayoutParsingFinishedEvent parseLayoutAndSaveFilesToStorage(LayoutParsingRequest layoutParsingRequest) throws IOException {
@ -136,7 +140,8 @@ public class LayoutParsingPipeline {
.get());
}
ClassificationDocument classificationDocument = parseLayout(layoutParsingRequest.layoutParsingType(),
ClassificationDocument classificationDocument = parseLayout(settings.getLayoutParsingTypeOverride() == null //
? layoutParsingRequest.layoutParsingType() : settings.getLayoutParsingTypeOverride(),
originFile,
imageServiceResponse,
tableServiceResponse,
@ -145,11 +150,12 @@ public class LayoutParsingPipeline {
log.info("Building document graph for {}", layoutParsingRequest.identifier());
Document documentGraph = observeBuildDocumentGraph(layoutParsingRequest.layoutParsingType(), classificationDocument);
Document documentGraph = observeBuildDocumentGraph(settings.getLayoutParsingTypeOverride() == null //
? layoutParsingRequest.layoutParsingType() : settings.getLayoutParsingTypeOverride(), classificationDocument);
log.info("Creating viewer document for {}", layoutParsingRequest.identifier());
layoutGridService.addLayoutGrid(viewerDocumentFile, documentGraph, viewerDocumentFile, false);
layoutGridService.addLayoutGrid(viewerDocumentFile, documentGraph, viewerDocumentFile, false, layoutParsingRequest.visualLayoutParsingFileId().isPresent());
log.info("Storing resulting files for {}", layoutParsingRequest.identifier());
@ -239,6 +245,11 @@ public class LayoutParsingPipeline {
Map<Integer, List<ClassifiedImage>> pdfImages = imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse);
Map<Integer, List<ClassifiedImage>> signatures = visualLayoutParsingAdapter.buildExtractedSignaturesPerPage(visualLayoutParsingResponse);
ClassificationDocument classificationDocument = new ClassificationDocument();
if (settings.isDebug() || identifier.containsKey("debug")) {
classificationDocument.getVisualizations().setActive(true);
}
List<ClassificationPage> classificationPages = new ArrayList<>();
OutlineObject lastProcessedOutlineObject = null;
@ -267,10 +278,12 @@ public class LayoutParsingPipeline {
stripper.setStartPage(pageNumber);
stripper.setEndPage(pageNumber);
stripper.setPdpage(pdPage);
if (layoutParsingType.equals(LayoutParsingType.DOCUMINE)) {
if (layoutParsingType.equals(LayoutParsingType.DOCUMINE_OLD)) {
stripper.setSortByPosition(true);
}
stripper.getText(originDocument);
List<TextPositionSequence> words = stripper.getTextPositionSequences();
classificationDocument.getVisualizations().addTextVisualizations(words, pageNumber);
PDRectangle pdr = pdPage.getMediaBox();
@ -278,16 +291,21 @@ public class LayoutParsingPipeline {
boolean isLandscape = pdr.getWidth() > pdr.getHeight() && (rotation == 0 || rotation == 180) || pdr.getHeight() > pdr.getWidth() && (rotation == 90 || rotation == 270);
PDRectangle cropbox = pdPage.getCropBox();
CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(pdfTableCells.get(pageNumber), stripper.getRulings());
classificationDocument.getVisualizations().addRulingVisualization(stripper.getRulings(), pageNumber);
CleanRulings cleanRulings = rulingCleaningService.deduplicateAndStraightenRulings(pdfTableCells.get(pageNumber), stripper.getRulings());
List<Cell> emptyTableCells = TableExtractionService.findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical());
PageInformation pageInformation = PageInformation.fromPDPage(pageNumber, pdPage);
List<Cell> emptyTableCells = TableExtractionService.findCells(cleanRulings.getHorizontals(), cleanRulings.getVerticals(), pageInformation);
classificationDocument.getVisualizations().addCellVisualizations(emptyTableCells, pageNumber);
var graphics = graphicExtractorService.extractPathElementGraphics(originDocument,
TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(words, cleanRulings);
List<Box> graphics = graphicExtractorService.extractPathElementGraphics(originDocument,
pdPage,
pageNumber,
cleanRulings,
stripper.getTextPositionSequences(),
emptyTableCells,
false);
pdfImages.computeIfAbsent(pageNumber, x -> new ArrayList<>())
@ -296,10 +314,13 @@ public class LayoutParsingPipeline {
.toList());
ClassificationPage classificationPage = switch (layoutParsingType) {
case REDACT_MANAGER_OLD -> redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells);
case DOCUMINE -> docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
case REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, true);
case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, false);
case REDACT_MANAGER_OLD ->
redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings, classificationDocument.getVisualizations());
case DOCUMINE_OLD -> docuMineBlockificationService.blockify(words, cleanRulings);
case DOCUMINE, REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG ->
docstrumBlockificationService.blockify(words, cleanRulings, true, classificationDocument.getVisualizations(), layoutParsingType);
case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG ->
docstrumBlockificationService.blockify(words, cleanRulings, false, classificationDocument.getVisualizations(), layoutParsingType);
};
classificationPage.setCleanRulings(cleanRulings);
@ -321,11 +342,12 @@ public class LayoutParsingPipeline {
lastProcessedOutlineObject = blockificationPostprocessingService.sanitizeOutlineBlocks(classificationPage, notFoundOutlineObject);
}
classificationDocument.getVisualizations().addMarkedContentVisualizations(stripper.getMarkedContents(), pageNumber);
// MarkedContent needs to be converted at this point, otherwise it leads to GC Problems in Pdfbox.
classificationPage.setMarkedContentBboxPerType(convertMarkedContents(stripper.getMarkedContents()));
// If images is ocr needs to be calculated before textBlocks are moved into tables, otherwise findOcr algorithm needs to be adopted.
if (pdfImages != null && pdfImages.containsKey(pageNumber)) {
if (pdfImages.containsKey(pageNumber)) {
classificationPage.setImages(pdfImages.get(pageNumber));
imageServiceResponseAdapter.findOcr(classificationPage);
}
@ -340,12 +362,6 @@ public class LayoutParsingPipeline {
tableExtractionService.extractTables(emptyTableCells, classificationPage);
if (layoutParsingType == LayoutParsingType.REDACT_MANAGER) {
docstrumBlockificationService.combineBlocks(classificationPage);
} else if (layoutParsingType == LayoutParsingType.CLARIFYND) {
docstrumBlockificationService.mergeIntersectingBlocks(classificationPage.getTextBlocks(), 0, 6.5f);
}
buildPageStatistics(classificationPage);
increaseDocumentStatistics(classificationPage, classificationDocument);
@ -356,11 +372,14 @@ public class LayoutParsingPipeline {
log.info("Calculating BodyTextFrame for {}", identifier);
bodyTextFrameService.setBodyTextFrames(classificationDocument, layoutParsingType);
for (ClassificationPage page : classificationDocument.getPages()) {
classificationDocument.getVisualizations().addCleanRulingVisualization(page.getCleanRulings(), page.getPageNumber());
}
log.info("Classify TextBlocks for {}", identifier);
switch (layoutParsingType) {
case REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_OLD, CLARIFYND_PARAGRAPH_DEBUG ->
redactManagerClassificationService.classifyDocument(classificationDocument);
case DOCUMINE -> docuMineClassificationService.classifyDocument(classificationDocument);
case DOCUMINE_OLD, DOCUMINE -> docuMineClassificationService.classifyDocument(classificationDocument);
case CLARIFYND -> clarifyndClassificationService.classifyDocument(classificationDocument);
}

View File

@ -0,0 +1,20 @@
package com.knecon.fforesight.service.layoutparser.processor;
import org.springframework.boot.context.properties.ConfigurationProperties;
import org.springframework.context.annotation.Configuration;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import lombok.AccessLevel;
import lombok.Data;
import lombok.experimental.FieldDefaults;
/**
 * Configuration properties of the layout parser, bound from properties with the prefix {@code layoutparser}.
 * Getters and setters are generated by Lombok ({@code @Data}); all fields are private via {@code @FieldDefaults}.
 */
@Data
@Configuration
@ConfigurationProperties("layoutparser")
@FieldDefaults(level = AccessLevel.PRIVATE)
public class LayoutparserSettings {
// When true, LayoutParsingPipeline activates the debug visualizations of the classification document.
boolean debug;
// Optional override: when non-null, LayoutParsingPipeline uses this type instead of the one sent with the request.
LayoutParsingType layoutParsingTypeOverride;
}

View File

@ -7,14 +7,18 @@ import java.util.stream.Collectors;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Character;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Line;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.LineBuilderService;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.NearestNeighbourService;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.ReadingOrderService;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.SpacingService;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.ZoneBuilderService;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations;
import lombok.RequiredArgsConstructor;
@ -29,31 +33,37 @@ public class DocstrumSegmentationService {
private final ReadingOrderService readingOrderService;
public List<Zone> segmentPage(List<TextPositionSequence> textPositions, boolean xyOrder) {
public List<Zone> segmentPage(List<TextPositionSequence> textPositions, boolean xyOrder, CleanRulings usedRulings, LayoutparsingVisualizations visualizations) {
List<Zone> zones = new ArrayList<>();
zones.addAll(computeZones(textPositions, TextDirection.ZERO));
zones.addAll(computeZones(textPositions, TextDirection.QUARTER_CIRCLE));
zones.addAll(computeZones(textPositions, TextDirection.HALF_CIRCLE));
zones.addAll(computeZones(textPositions, TextDirection.THREE_QUARTER_CIRCLE));
zones.addAll(computeZones(textPositions, usedRulings, visualizations, TextDirection.ZERO));
zones.addAll(computeZones(textPositions, usedRulings, visualizations, TextDirection.QUARTER_CIRCLE));
zones.addAll(computeZones(textPositions, usedRulings, visualizations, TextDirection.HALF_CIRCLE));
zones.addAll(computeZones(textPositions, usedRulings, visualizations, TextDirection.THREE_QUARTER_CIRCLE));
return readingOrderService.resolve(zones, xyOrder);
}
private List<Zone> computeZones(List<TextPositionSequence> textPositions, TextDirection direction) {
private List<Zone> computeZones(List<TextPositionSequence> textPositions, CleanRulings rulings, LayoutparsingVisualizations visualizations, TextDirection direction) {
var positions = textPositions.stream().filter(t -> t.getDir() == direction).map(TextPositionSequence::getTextPositions).flatMap(List::stream).toList();
List<RedTextPosition> positions = textPositions.stream()
.filter(t -> t.getDir() == direction)
.map(TextPositionSequence::getTextPositions)
.flatMap(List::stream)
.toList();
var characters = positions.stream().map(Character::new).collect(Collectors.toList());
List<Character> characters = positions.stream()
.map(Character::new)
.collect(Collectors.toList());
nearestNeighbourService.findNearestNeighbors(characters);
var characterSpacing = spacingService.computeCharacterSpacing(characters);
var lineSpacing = Math.min(spacingService.computeLineSpacing(characters), 20);
double characterSpacing = spacingService.computeCharacterSpacing(characters);
double lineSpacing = Math.min(spacingService.computeLineSpacing(characters), 20);
var lines = lineBuilderService.buildLines(characters, characterSpacing, lineSpacing);
return zoneBuilderService.buildZones(lines, characterSpacing, lineSpacing);
List<Line> lines = lineBuilderService.buildLines(characters, characterSpacing, lineSpacing, rulings);
return zoneBuilderService.buildZones(lines, characterSpacing, lineSpacing, rulings);
}
}

View File

@ -1,13 +1,27 @@
package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;
import java.awt.geom.Rectangle2D;
import java.util.Comparator;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import lombok.Data;
@Data
public abstract class BoundingBox {
private Rectangle2D bBox;
// Java coordinate system: (0, 0) is always upper left, x is increasing left to right and y is increasing from top to bottom.
// should be used when determining reading order or other tasks which require coordinates in a harmonized system.
protected Rectangle2D bBox; // I would not trust this coordinate when comparing rulings and text, due to the text positions being slightly off.
// PDF coordinate system: depends on page rotation, (0, 0) is lower left corner, x is increasing left to right and y from bottom to top.
// This rotates completely in 90 degree steps with page rotation.
// Needs to be used when writing to a PDF.
// Also, these are definitely correct and should be used whenever possible.
protected Rectangle2D bBoxInitialUserSpace;
protected static final float VERTICAL_COMPARISON_THRESHOLD = 0.4f;
public double getX() {
@ -22,6 +36,42 @@ public abstract class BoundingBox {
}
/**
 * @return the smallest x coordinate of the Java-coordinate-system bounding box
 */
public double getMinX() {

    final Rectangle2D javaBox = this.bBox;
    return javaBox.getMinX();
}
/**
 * @return the smallest y coordinate of the Java-coordinate-system bounding box
 */
public double getMinY() {

    final Rectangle2D javaBox = this.bBox;
    return javaBox.getMinY();
}
/**
 * @return the smallest x coordinate of the bounding box in initial user space (PDF coordinates)
 */
public double getPdfMinX() {

    final Rectangle2D pdfBox = this.bBoxInitialUserSpace;
    return pdfBox.getMinX();
}
/**
 * @return the largest x coordinate of the bounding box in initial user space (PDF coordinates)
 */
public double getPdfMaxX() {

    final Rectangle2D pdfBox = this.bBoxInitialUserSpace;
    return pdfBox.getMaxX();
}
/**
 * @return the smallest y coordinate of the bounding box in initial user space (PDF coordinates)
 */
public double getPdfMinY() {

    final Rectangle2D pdfBox = this.bBoxInitialUserSpace;
    return pdfBox.getMinY();
}
/**
 * @return the largest y coordinate of the bounding box in initial user space (PDF coordinates)
 */
public double getPdfMaxY() {

    final Rectangle2D pdfBox = this.bBoxInitialUserSpace;
    return pdfBox.getMaxY();
}
public double getWidth() {
return bBox.getWidth();
@ -34,21 +84,170 @@ public abstract class BoundingBox {
}
/**
 * @return the largest x coordinate of the Java-coordinate-system bounding box
 */
public double getMaxX() {

    final Rectangle2D javaBox = this.bBox;
    return javaBox.getMaxX();
}
/**
 * @return the largest y coordinate of the Java-coordinate-system bounding box
 */
public double getMaxY() {

    final Rectangle2D javaBox = this.bBox;
    return javaBox.getMaxY();
}
/**
 * @return the area (height times width) of the Java-coordinate-system bounding box
 */
public double getArea() {

    final double boxHeight = bBox.getHeight();
    final double boxWidth = bBox.getWidth();
    return boxHeight * boxWidth;
}
public boolean contains(Rectangle2D contained, double tolerance) {
public boolean contains(BoundingBox contained) {
return bBox.getX() <= contained.getX() + tolerance && bBox.getY() <= contained.getY() + tolerance && bBox.getX() + bBox.getWidth() >= contained.getX() + contained.getWidth() - tolerance && bBox.getY() + bBox.getHeight() >= contained.getY() + contained.getHeight() - tolerance;
return contains(contained, 0);
}
/**
 * Checks whether the given box lies inside this box, compared in PDF coordinates (initial user space).
 *
 * @param contained the box expected to lie inside this one
 * @param tolerance slack by which {@code contained} may stick out on every side and still count as contained
 * @return true if all four edges of {@code contained} are within this box, allowing for {@code tolerance}
 */
public boolean contains(BoundingBox contained, double tolerance) {

    boolean leftEdgeInside = getPdfMinX() <= contained.getPdfMinX() + tolerance;
    boolean lowerEdgeInside = getPdfMinY() <= contained.getPdfMinY() + tolerance;
    boolean rightEdgeInside = getPdfMaxX() >= contained.getPdfMaxX() - tolerance;
    boolean upperEdgeInside = getPdfMaxY() >= contained.getPdfMaxY() - tolerance;
    return leftEdgeInside && lowerEdgeInside && rightEdgeInside && upperEdgeInside;
}
/**
 * @param other the box to test against
 * @return true if this box and {@code other} overlap on both the x and the y axis
 */
public boolean intersects(BoundingBox other) {

    if (!this.intersectsX(other)) {
        return false;
    }
    return this.intersectsY(other);
}
/**
 * Intersection test with a separate tolerance per axis.
 *
 * @param other      the box to test against
 * @param yThreshold tolerance passed to the y-axis overlap check
 * @param xThreshold tolerance passed to the x-axis overlap check
 * @return true if both the x-axis and the y-axis check succeed with their respective thresholds
 */
public boolean intersects(BoundingBox other, float yThreshold, float xThreshold) {

    if (!this.intersectsX(other, xThreshold)) {
        return false;
    }
    return this.intersectsY(other, yThreshold);
}
public boolean intersectsY(BoundingBox other) {
return this.getBBox().getMinY() <= other.getBBox().getMaxY() && this.getBBox().getMaxY() >= other.getBBox().getMinY();
return this.getPdfMinY() <= other.getPdfMaxY() && this.getPdfMaxY() >= other.getPdfMinY();
}
/**
 * Y-axis overlap test using the Java-coordinate-system bounding boxes.
 *
 * @param other the box to test against
 * @return true if the y ranges of both boxes overlap or touch
 */
public boolean intersectsYJava(BoundingBox other) {

    boolean startsBeforeOtherEnds = this.getY() <= other.getMaxY();
    boolean endsAfterOtherStarts = this.getMaxY() >= other.getY();
    return startsBeforeOtherEnds && endsAfterOtherStarts;
}
/**
 * Y-axis overlap test in PDF coordinates, with this box grown by {@code threshold} on both ends.
 *
 * @param other     the box to test against
 * @param threshold amount by which this box's y range is extended in both directions
 * @return true if the extended y range of this box overlaps or touches the y range of {@code other}
 */
public boolean intersectsY(BoundingBox other, float threshold) {

    double extendedMinY = this.getPdfMinY() - threshold;
    double extendedMaxY = this.getPdfMaxY() + threshold;
    return extendedMinY <= other.getPdfMaxY() && extendedMaxY >= other.getPdfMinY();
}
/**
 * X-axis overlap test in PDF coordinates (initial user space).
 *
 * @param other the box to test against
 * @return true if the x ranges of both boxes overlap or touch
 */
public boolean intersectsX(BoundingBox other) {

    boolean startsBeforeOtherEnds = this.getPdfMinX() <= other.getPdfMaxX();
    boolean endsAfterOtherStarts = this.getPdfMaxX() >= other.getPdfMinX();
    return startsBeforeOtherEnds && endsAfterOtherStarts;
}
/**
 * X-axis overlap test using the Java-coordinate-system bounding boxes.
 *
 * @param other the box to test against
 * @return true if the x ranges of both boxes overlap or touch
 */
public boolean intersectsXJava(BoundingBox other) {

    boolean startsBeforeOtherEnds = this.getX() <= other.getMaxX();
    boolean endsAfterOtherStarts = this.getMaxX() >= other.getMinX();
    return startsBeforeOtherEnds && endsAfterOtherStarts;
}
/**
 * X-axis overlap test in PDF coordinates, with this box grown by {@code threshold} on both ends.
 * <p>
 * All four coordinates are taken from the PDF (initial user space) bounding boxes, mirroring
 * {@code intersectsY(BoundingBox, float)}. Comparing a PDF coordinate against a Java coordinate would give
 * wrong results whenever the two coordinate systems differ.
 *
 * @param other     the box to test against
 * @param threshold amount by which this box's x range is extended in both directions
 * @return true if the extended x range of this box overlaps or touches the x range of {@code other}
 */
public boolean intersectsX(BoundingBox other, float threshold) {

    return this.getPdfMinX() - threshold <= other.getPdfMaxX() && this.getPdfMaxX() + threshold >= other.getPdfMinX();
}
/**
 * Recomputes both bounding boxes of this element from the bounding boxes of its components.
 * The Java-coordinate box is collected from the components' Java boxes, the PDF box from their PDF boxes.
 *
 * @param components the elements whose bounding boxes are combined
 */
public void setToBBoxOfComponents(List<? extends BoundingBox> components) {

    Rectangle2D combinedJavaBox = components.stream()
            .map(BoundingBox::getBBox)
            .collect(RectangleTransformations.collectBBox());
    this.bBox = combinedJavaBox;

    Rectangle2D combinedPdfBox = components.stream()
            .map(BoundingBox::getBBoxInitialUserSpace)
            .collect(RectangleTransformations.collectBBox());
    this.bBoxInitialUserSpace = combinedPdfBox;
}
/**
 * Length of the shared y range of both boxes in PDF coordinates.
 *
 * @param other the box to compare with
 * @return the overlap length, or 0 if the y ranges do not overlap
 */
public double verticalOverlap(BoundingBox other) {

    double lowerOfTheTops = Math.min(this.getPdfMaxY(), other.getPdfMaxY());
    double higherOfTheBottoms = Math.max(this.getPdfMinY(), other.getPdfMinY());
    return Math.max(0, lowerOfTheTops - higherOfTheBottoms);
}
/**
 * Orders boxes by PDF min x when they overlap vertically by more than {@code VERTICAL_COMPARISON_THRESHOLD}
 * times their average height, and by PDF max y otherwise. Equal boxes compare as 0.
 * Because the sort key depends on the pair being compared, this ordering may not be transitive.
 */
public static final Comparator<BoundingBox> ILL_DEFINED_ORDER = (first, second) -> {

    if (first.equals(second)) {
        return 0;
    }
    double averageHeight = (first.getHeight() + second.getHeight()) / 2;
    boolean overlapsEnough = first.verticalOverlap(second) > VERTICAL_COMPARISON_THRESHOLD * averageHeight;
    return overlapsEnough //
            ? Double.compare(first.getPdfMinX(), second.getPdfMinX()) //
            : Double.compare(first.getPdfMaxY(), second.getPdfMaxY());
};
/**
 * Horizontal gap between this box and {@code other}, measured on the Java-coordinate-system boxes.
 * If this box is not {@code leftOf} the other one, the other box is treated as the left one.
 *
 * @param other the box to measure against
 * @return the gap between the left box's max x and the right box's min x, never less than 0
 */
public double horizontalDistance(BoundingBox other) {

    boolean thisIsLeft = this.leftOf(other);
    Rectangle2D leftBox = thisIsLeft ? this.getBBox() : other.getBBox();
    Rectangle2D rightBox = thisIsLeft ? other.getBBox() : this.getBBox();

    double gap = rightBox.getMinX() - leftBox.getMaxX();
    return Math.max(0, gap);
}
/**
 * Vertical gap between this box and {@code other}, measured on the Java-coordinate-system boxes.
 * If this box is not {@code isAbove} the other one, the other box is treated as the upper one.
 *
 * @param other the box to measure against
 * @return the gap between the upper box's max y and the lower box's min y, never less than 0
 */
public double verticalDistance(BoundingBox other) {

    boolean thisIsUpper = this.isAbove(other);
    Rectangle2D upperBox = thisIsUpper ? this.getBBox() : other.getBBox();
    Rectangle2D lowerBox = thisIsUpper ? other.getBBox() : this.getBBox();

    double gap = lowerBox.getMinY() - upperBox.getMaxY();
    return Math.max(0, gap);
}
/**
 * @param other the box to compare with
 * @return true if both boxes overlap on the y axis (Java coordinates) and this box starts at or after the right
 * edge of {@code other}
 */
public boolean rightOf(BoundingBox other) {

    if (!this.intersectsYJava(other)) {
        return false;
    }
    return other.getMaxX() <= this.getMinX();
}
/**
 * @param other the box to compare with
 * @return true if both boxes overlap on the y axis (Java coordinates) and this box ends at or before the left
 * edge of {@code other}
 */
public boolean leftOf(BoundingBox other) {

    if (!this.intersectsYJava(other)) {
        return false;
    }
    return other.getMinX() >= this.getMaxX();
}
/**
 * Java coordinates are used, where y grows from top to bottom, so "above" means a smaller y.
 *
 * @param other the box to compare with
 * @return true if both boxes overlap on the x axis (Java coordinates) and this box ends at or before the top
 * edge of {@code other}
 */
public boolean isAbove(BoundingBox other) {

    if (!this.intersectsXJava(other)) {
        return false;
    }
    return other.getMinY() >= this.getMaxY();
}
/**
 * Java coordinates are used, where y grows from top to bottom, so "below" means a larger y.
 *
 * @param other the box to compare with
 * @return true if both boxes overlap on the x axis (Java coordinates) and this box starts at or after the
 * bottom edge of {@code other}
 */
public boolean isBelow(BoundingBox other) {

    if (!this.intersectsXJava(other)) {
        return false;
    }
    return this.getMinY() >= other.getMaxY();
}
}

View File

@ -27,8 +27,8 @@ public class Character {
public Character(RedTextPosition chunk) {
this.x = chunk.getXDirAdj() + chunk.getWidthDirAdj() / 2;
this.y = chunk.getYDirAdj() + chunk.getHeightDir() / 2;
this.x = chunk.getBBoxDirAdj().getCenterX();
this.y = chunk.getBBoxDirAdj().getCenterY();
this.textPosition = chunk;
}

View File

@ -0,0 +1,324 @@
package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
/**
 * Histogram-based detector for vertical column dividers.
 * <p>
 * Bounding boxes are accumulated into a fixed-size histogram over the x range {@code [min, max]} via
 * {@link #add(BoundingBox)}. Column dividers are then located where the histogram drops close to zero, using the
 * histogram's derivative (see {@link #determineColumnsWithDerivative(double[])}).
 * <p>
 * WIP, mostly working, needs to be tested a bit more.
 */
public class ColumnDetector {

    /** If the derivative's value range is below this fraction of the histogram mean, no columns are reported. */
    public static final double MAX_VALUE_THRESHOLD = 0.5;
    /** Number of histogram bins. */
    final static int bins_num = 512;
    /** First derivative index considered when searching for extrema; currently the whole histogram is searched. */
    final static int globalStartIdx = 0;
    /** Exclusive last derivative index considered when searching for extrema; currently the whole histogram is searched. */
    final static int globalEndIdx = bins_num;
    /** Derivative values below this threshold are treated as zero. */
    public static final double DERIVATIVE_ZERO_THRESHOLD = 1e-10;
    /** A histogram minimum only counts as column divider if it is below this fraction of the histogram mean. */
    public static final double MINIMUM_THRESHOLD_FOR_COLUMNS = 0.05;
    /** Derivative values at least this fraction of the global maximum/minimum count as near-global extrema. */
    public static final double NEAR_GLOBAL_THRESHOLD = 0.5;
    /** Fraction of the histogram that is searched for a zero next to each derivative extremum. */
    private static final double SEARCH_WINDOW_FRACTION = 0.1;

    double minY;
    double maxY;
    // NOTE(review): despite its name this holds the y range (maxY - minY), not the midpoint. It is not read in this class.
    double midY;
    double[] histogram;
    double min;
    double max;
    /** Width of a single histogram bin. */
    double resolution;
    // NOTE(review): accumulates the updated bin values in add(), not the added weights - confirm this is intended.
    double sum;
    /** Number of boxes added so far. */
    int N;


    /**
     * @param min  lower x bound covered by the histogram
     * @param max  upper x bound covered by the histogram
     * @param minY lower y bound, used to weight boxes by their distance to the vertical middle
     * @param maxY upper y bound, used to weight boxes by their distance to the vertical middle
     */
    public ColumnDetector(double min, double max, double minY, double maxY) {

        this.min = min;
        this.max = max;
        this.minY = minY;
        this.maxY = maxY;
        this.midY = maxY - minY;
        this.resolution = (max - min) / bins_num;
        this.histogram = new double[bins_num];
    }


    /**
     * Adds the weight of the given box to every histogram bin covered by its x range.
     * The bin range is clamped to the histogram, so boxes reaching beyond {@code [min, max]} only contribute
     * to the bins that exist instead of causing an {@link ArrayIndexOutOfBoundsException}.
     *
     * @param zone the box to add
     */
    public void add(BoundingBox zone) {

        N++;
        double weight = computeWeight(zone);
        int start = Math.max(0, (int) ((zone.getMinX() - min) / resolution));
        int end = Math.min(histogram.length, (int) ((zone.getMaxX() - min) / resolution));
        for (int i = start; i < end; i++) {
            histogram[i] += weight;
            sum += histogram[i];
        }
    }


    /**
     * Weight of a box: its height, scaled down the further its vertical center is from the middle of the y range.
     */
    private double computeWeight(BoundingBox zone) {

        double areaWeight = zone.getBBox().getHeight();
        double relativeDistance = relativeDistanceToMiddle(zone.getBBox().getCenterY());

        double distanceWeight;
        if (relativeDistance < 0.6) {
            distanceWeight = 1;
        } else if (relativeDistance < 0.8) {
            distanceWeight = 0.8;
        } else {
            distanceWeight = 0.1;
        }

        return areaWeight * distanceWeight;
    }


    /**
     * Distance of {@code y} to the middle of {@code [minY, maxY]}, relative to half of that range.
     */
    private double relativeDistanceToMiddle(double y) {

        double range = (maxY - minY) / 2;
        double mid = minY + range;

        return Math.abs(y - mid) / range;
    }


    /**
     * Computes the numerical derivative of the histogram: central differences in the interior and one-sided
     * differences at both ends.
     *
     * @return a new array with the same length as the histogram
     */
    public double[] computeDerivative() {

        int length = histogram.length;
        double[] derivative = new double[length];

        for (int i = 0; i < length; i++) {
            if (i == 0) {
                derivative[i] = (histogram[i + 1] - histogram[i]) / resolution;
            } else if (i == length - 1) {
                derivative[i] = (histogram[i] - histogram[i - 1]) / resolution;
            } else {
                derivative[i] = (histogram[i + 1] - histogram[i - 1]) / (2 * resolution);
            }
        }

        return derivative;
    }


    /**
     * @param arr   the values
     * @param start first index (inclusive)
     * @param end   last index (exclusive)
     * @return the arithmetic mean of {@code arr[start..end)}, or 0 if the range is empty
     */
    public double calcMean(double[] arr, int start, int end) {

        if (start == end) {
            return 0;
        }
        double total = 0;
        for (int i = start; i < end; i++) {
            total += arr[i];
        }
        return total / (end - start);
    }


    /**
     * Finds columns by looking at the near-global maxima/minima of the derivative.
     * For each found minimum, we step to the right until we hit a 0 in the derivative, which indicates a minimum in
     * the main histogram. If this minimum is below a threshold, it is deemed a column divider.
     * Same goes for maxima, but stepping to the left, since a minimum of the function is always to the left of a
     * maximum in its derivative.
     *
     * @param derivative the derivative of the histogram, see {@link #computeDerivative()}
     * @return the x coordinates of the detected column dividers in ascending order; empty if the derivative is too
     * flat compared to the histogram mean
     */
    public List<Double> determineColumnsWithDerivative(double[] derivative) {

        assert derivative.length == histogram.length;

        Set<Integer> columnIndices = new HashSet<>();
        double mean = calcMean(histogram, 0, histogram.length);
        double maxDvValue = calcMax(derivative);
        double minDvValue = calcMin(derivative);

        // a nearly flat histogram has no pronounced gaps, so there is nothing to detect
        if (maxDvValue - minDvValue < mean * MAX_VALUE_THRESHOLD) {
            return Collections.emptyList();
        }

        Extrema derivativeExtrema = calculateNearGlobalExtrema(derivative, maxDvValue, minDvValue);

        columnIndices.addAll(findZerosToTheRightOfMinima(derivative, derivativeExtrema.minima(), mean));
        columnIndices.addAll(findZerosToTheLeftOfMaxima(derivative, derivativeExtrema.maxima(), mean));

        return columnIndices.stream()
                .sorted(Comparator.naturalOrder())
                .map(this::calculateXCoordinateFromIdx)
                .toList();
    }


    /**
     * For each derivative maximum, walks to the left and collects the first run of indices whose derivative is
     * below {@link #DERIVATIVE_ZERO_THRESHOLD}. The middle of that run is reported if the histogram is low enough there.
     */
    private List<Integer> findZerosToTheLeftOfMaxima(double[] derivative, List<Integer> derivativeMaxima, double mean) {

        List<Integer> columnsLeftOfMaxima = new ArrayList<>();
        for (int derivativeMaximum : derivativeMaxima) {
            // the highest derivative will always be at least one step away from the lowest value.
            int maximaIdx = derivativeMaximum - 1;
            // search through 10% of the array to the left, but at least one step and at most to the left edge
            int endIdx = (int) Math.max(globalStartIdx, Math.min(maximaIdx - 1, maximaIdx - SEARCH_WINDOW_FRACTION * bins_num));

            List<Integer> consecutiveZeroes = new ArrayList<>();
            for (int j = maximaIdx; j >= endIdx; j--) {
                // NOTE(review): negative slopes are also below the threshold and therefore count as "zero" here
                if (derivative[j] < DERIVATIVE_ZERO_THRESHOLD) {
                    consecutiveZeroes.add(j);
                } else if (!consecutiveZeroes.isEmpty()) {
                    break;
                }
            }
            addMiddleIfColumnGap(consecutiveZeroes, mean, columnsLeftOfMaxima);
        }
        return columnsLeftOfMaxima;
    }


    /**
     * For each derivative minimum, walks to the right and collects the first run of indices whose derivative is
     * below {@link #DERIVATIVE_ZERO_THRESHOLD}. The middle of that run is reported if the histogram is low enough there.
     */
    private List<Integer> findZerosToTheRightOfMinima(double[] derivative, List<Integer> derivativeMinima, double mean) {

        List<Integer> columnsRightOfMinima = new ArrayList<>();
        for (int derivativeMinimum : derivativeMinima) {
            // the lowest derivative will always be at least one step earlier than the lowest value.
            int minimaIdx = derivativeMinimum + 1;
            // search through 10% of the array to the right, but at least one step and at most to the right edge
            int endIdx = (int) Math.min(globalEndIdx, Math.max(minimaIdx + 1, minimaIdx + SEARCH_WINDOW_FRACTION * bins_num));

            List<Integer> consecutiveZeroes = new ArrayList<>();
            for (int j = minimaIdx; j < endIdx; j++) {
                // NOTE(review): negative slopes are also below the threshold and therefore count as "zero" here
                if (derivative[j] < DERIVATIVE_ZERO_THRESHOLD) {
                    consecutiveZeroes.add(j);
                } else if (!consecutiveZeroes.isEmpty()) {
                    break;
                }
            }
            addMiddleIfColumnGap(consecutiveZeroes, mean, columnsRightOfMinima);
        }
        return columnsRightOfMinima;
    }


    /**
     * Adds the middle index of {@code zeroRun} to {@code target} if the histogram value at that index is below
     * {@link #MINIMUM_THRESHOLD_FOR_COLUMNS} times {@code mean}. Does nothing for an empty run.
     */
    private void addMiddleIfColumnGap(List<Integer> zeroRun, double mean, List<Integer> target) {

        if (zeroRun.isEmpty()) {
            return;
        }
        int middleMinimumIdx = zeroRun.get(zeroRun.size() / 2);
        if (histogram[middleMinimumIdx] < mean * MINIMUM_THRESHOLD_FOR_COLUMNS) {
            target.add(middleMinimumIdx);
        }
    }


    /**
     * @return the largest value of {@code array}, or negative infinity for an empty array
     */
    private double calcMax(double[] array) {

        double largest = Double.NEGATIVE_INFINITY;
        for (double value : array) {
            if (value > largest) {
                largest = value;
            }
        }
        return largest;
    }


    /**
     * @return the smallest value of {@code array}, or positive infinity for an empty array
     */
    private double calcMin(double[] array) {

        double smallest = Double.POSITIVE_INFINITY;
        for (double value : array) {
            if (value < smallest) {
                smallest = value;
            }
        }
        return smallest;
    }


    /**
     * Collects the indices whose derivative is within {@link #NEAR_GLOBAL_THRESHOLD} of the global maximum or
     * minimum. Of every run of directly consecutive indices only the first one is kept.
     */
    private Extrema calculateNearGlobalExtrema(double[] derivative, double maxDvValue, double minDvValue) {

        List<Integer> nearGlobalDvMaximaIdx = new ArrayList<>();
        List<Integer> nearGlobalDvMinimaIdx = new ArrayList<>();
        for (int i = globalStartIdx; i < globalEndIdx; i++) {
            if (derivative[i] <= minDvValue * NEAR_GLOBAL_THRESHOLD) {
                nearGlobalDvMinimaIdx.add(i);
            }
            if (derivative[i] >= maxDvValue * NEAR_GLOBAL_THRESHOLD) {
                nearGlobalDvMaximaIdx.add(i);
            }
        }

        return new Extrema(removeConsecutive(nearGlobalDvMaximaIdx), removeConsecutive(nearGlobalDvMinimaIdx));
    }


    /** Indices of the near-global maxima and minima of the derivative. */
    private record Extrema(List<Integer> maxima, List<Integer> minima) {

    }


    /**
     * Converts a histogram index to an x coordinate: the right edge of the bin with the given index.
     */
    private Double calculateXCoordinateFromIdx(int globalMinIdx) {

        return min + ((globalMinIdx + 1) * resolution);
    }


    /**
     * Keeps the first number and every number that is not exactly one greater than its predecessor in the input.
     *
     * @param numbers the input numbers, may be null or empty
     * @return a new list; empty if {@code numbers} is null or empty
     */
    public static List<Integer> removeConsecutive(List<Integer> numbers) {

        List<Integer> result = new ArrayList<>();
        if (numbers == null || numbers.isEmpty()) {
            return result;
        }

        result.add(numbers.get(0)); // Add the first number

        for (int i = 1; i < numbers.size(); i++) {
            if (numbers.get(i) != numbers.get(i - 1) + 1) {
                result.add(numbers.get(i)); // Add non-consecutive numbers
            }
        }

        return result;
    }


    /**
     * Replaces the histogram by a smoothed version: every bin becomes the kernel-weighted sum of its neighbours,
     * with the kernel centered on the bin. Neighbours outside the histogram do not contribute.
     *
     * @param kernel the smoothing weights, e.g. from {@link #createGaussianKernel(int, double)}
     */
    public void kernelSmooth(double[] kernel) {

        double[] newFrequencies = new double[histogram.length];
        int shift = (kernel.length - 1) / 2;
        for (int i = 0; i < kernel.length; i++) {
            // restrict j so that the target index (j - i + shift) stays inside the histogram
            int jStart = Math.max(0, i - shift);
            int jEnd = Math.min(histogram.length, histogram.length + i - shift);
            for (int j = jStart; j < jEnd; j++) {
                newFrequencies[j - i + shift] += kernel[i] * histogram[j];
            }
        }
        histogram = newFrequencies;
    }


    /**
     * Creates a Gaussian kernel whose entries sum to 1.
     *
     * @param length       the requested kernel length; the actual size is {@code 2 * (length / 2) + 1}, i.e. always odd
     * @param stdDeviation the standard deviation of the Gaussian, in bins
     * @return the normalized kernel
     */
    public double[] createGaussianKernel(int length, double stdDeviation) {

        int r = length / 2;

        int size = 2 * r + 1;
        double[] kernel = new double[size];
        double kernelSum = 0;
        double b = 2 * (stdDeviation) * (stdDeviation);
        double a = 1 / Math.sqrt(Math.PI * b);
        for (int i = 0; i < size; i++) {
            kernel[i] = a * Math.exp(-(i - r) * (i - r) / b);
            kernelSum += kernel[i];
        }

        for (int i = 0; i < size; i++) {
            kernel[i] /= kernelSum;
        }
        return kernel;
    }

}

View File

@ -1,10 +1,10 @@
package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.FastAtan2;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import lombok.Data;
@ -72,7 +72,7 @@ public class Line extends BoundingBox {
public double getAngle() {
return Math.atan2(y1 - y0, x1 - x0);
return FastAtan2.fastAtan2(y1 - y0, x1 - x0);
}
@ -84,7 +84,9 @@ public class Line extends BoundingBox {
private double computeHeight() {
return characters.stream().map(Character::getHeight).reduce(0d, Double::sum) / characters.size();
return characters.stream()
.map(Character::getHeight)
.reduce(0d, Double::sum) / characters.size();
}
@ -116,7 +118,7 @@ public class Line extends BoundingBox {
double ym = (y0 + y1) / 2;
double yn = (other.y0 + other.y1) / 2;
return Math.abs(ym - yn) / Math.sqrt(1);
return Math.abs(ym - yn);
}
@ -141,21 +143,9 @@ public class Line extends BoundingBox {
private void buildBBox() {
double minX = Double.POSITIVE_INFINITY;
double minY = Double.POSITIVE_INFINITY;
double maxX = Double.NEGATIVE_INFINITY;
double maxY = Double.NEGATIVE_INFINITY;
for (Character character : characters) {
minX = Math.min(minX, character.getTextPosition().getXDirAdj());
minY = Math.min(minY, character.getTextPosition().getYDirAdj());
maxX = Math.max(maxX, character.getTextPosition().getXDirAdj() + character.getTextPosition().getWidthDirAdj());
maxY = Math.max(maxY, character.getTextPosition().getYDirAdj() + character.getTextPosition().getHeightDir());
}
this.setBBox(new Rectangle2D.Double(minX, minY, maxX - minX, maxY - minY));
this.setToBBoxOfComponents(characters.stream()
.map(Character::getTextPosition)
.toList());
}

View File

@ -1,9 +1,10 @@
package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;
import java.awt.geom.Rectangle2D;
import java.util.Comparator;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import lombok.Data;
@Data
@ -15,29 +16,9 @@ public class Zone extends BoundingBox {
@SuppressWarnings("PMD.ConstructorCallsOverridableMethod")
public Zone(List<Line> lines) {
lines.sort(Comparator.comparingDouble(Line::getY));
lines.sort(Comparator.comparingDouble(Line::getY0));
this.lines = lines;
buildBBox();
}
public void buildBBox() {
double minX = Double.POSITIVE_INFINITY;
double minY = Double.POSITIVE_INFINITY;
double maxX = Double.NEGATIVE_INFINITY;
double maxY = Double.NEGATIVE_INFINITY;
for (Line line : lines) {
minX = Math.min(minX, line.getX());
minY = Math.min(minY, line.getY());
maxX = Math.max(maxX, line.getX() + line.getWidth());
maxY = Math.max(maxY, line.getY() + line.getHeight());
}
this.setBBox(new Rectangle2D.Double(minX, minY, maxX - minX, maxY - minY));
setToBBoxOfComponents(lines);
}

View File

@ -1,6 +1,5 @@
package com.knecon.fforesight.service.layoutparser.processor.docstrum.service;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
@ -11,43 +10,49 @@ import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Angle
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Character;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Line;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.UnionFind;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
@Service
public class LineBuilderService {
private static final double CHARACTER_SPACING_DISTANCE_MULTIPLIER = 3.5;
private static final double MAX_VERTICAL_CHARACTER_DISTANCE = 0.67;
private static final double LINE_SPACING_THRESHOLD_MULTIPLIER = 0.67;
private static final double ANGLE_TOLERANCE = Math.PI / 6;
public List<Line> buildLines(List<Character> characters, double characterSpacing, double lineSpacing) {
public List<Line> buildLines(List<Character> characters, double characterSpacing, double lineSpacing, CleanRulings rulings) {
double maxHorizontalDistance = characterSpacing * CHARACTER_SPACING_DISTANCE_MULTIPLIER;
double maxVerticalDistance = lineSpacing * MAX_VERTICAL_CHARACTER_DISTANCE;
double maxVerticalDistance = lineSpacing * LINE_SPACING_THRESHOLD_MULTIPLIER;
UnionFind<Character> unionFind = new UnionFind<>(new HashSet<>(characters));
AngleFilter filter = new AngleFilter(-ANGLE_TOLERANCE, ANGLE_TOLERANCE);
AngleFilter angleFilter = new AngleFilter(-ANGLE_TOLERANCE, ANGLE_TOLERANCE);
characters.forEach(character -> {
character.getNeighbors().forEach(neighbor -> {
double x = neighbor.getHorizontalDistance() / maxHorizontalDistance;
double y = neighbor.getVerticalDistance() / maxVerticalDistance;
if (character.getTextPosition().getDir() == neighbor.getCharacter().getTextPosition().getDir() && filter.matches(neighbor) && Math.pow(x, 2) + Math.pow(y,
2) <= 1) {
unionFind.union(character, neighbor.getCharacter());
}
});
character.getNeighbors()
.forEach(neighbor -> {
double normalizedHorizontalDistance = neighbor.getHorizontalDistance() / maxHorizontalDistance;
double normalizedVerticalDistance = neighbor.getVerticalDistance() / maxVerticalDistance;
if (character.getTextPosition().getDir() != neighbor.getCharacter().getTextPosition().getDir() //
|| !angleFilter.matches(neighbor) //
|| Math.pow(normalizedHorizontalDistance, 2) + Math.pow(normalizedVerticalDistance, 2) > 1 //
|| rulings.lineBetween(character.getTextPosition(), neighbor.getCharacter().getTextPosition())) {
return;
}
unionFind.union(character, neighbor.getCharacter());
});
});
List<Line> lines = new ArrayList<>();
unionFind.getGroups().forEach(group -> {
List<Character> lineCharacters = new ArrayList<>(group);
lineCharacters.sort(Comparator.comparingDouble(Character::getX));
lines.add(new Line(lineCharacters, characterSpacing));
});
return lines;
return unionFind.getGroups()
.stream()
.map(lineCharacters -> lineCharacters.stream()
.sorted(Comparator.comparingDouble(Character::getX))
.toList())
.map(lineCharacters -> new Line(lineCharacters, characterSpacing))
.toList();
}
}

View File

@ -39,7 +39,10 @@ public class ReadingOrderService {
}
}
if (histogram.values().stream().mapToInt(Integer::intValue).average().orElse(1) < MULTI_COLUMN_DETECTION_THRESHOLD) {
if (histogram.values()
.stream()
.mapToInt(Integer::intValue).average()
.orElse(1) < MULTI_COLUMN_DETECTION_THRESHOLD) {
return resolveSingleColumnReadingOrder(zones);
} else {
@ -52,7 +55,7 @@ public class ReadingOrderService {
private static List<Zone> resolveSingleColumnReadingOrder(List<Zone> zones) {
zones.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
return zones;
}
@ -90,14 +93,14 @@ public class ReadingOrderService {
}
leftOf.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
rightOf.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
middle.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
/*
List<Zone> leftNotIntersecting = new ArrayList<>();
for (Zone leftZone : leftOf) {
boolean intersects = false;
@ -139,7 +142,7 @@ public class ReadingOrderService {
middle.addAll(leftNotIntersecting);
middle.addAll(rightNotIntersecting);
*/
List<Zone> sortedZones = new ArrayList<>();
sortedZones.addAll(leftOf);
sortedZones.addAll(rightOf);

View File

@ -5,6 +5,7 @@ import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import org.springframework.stereotype.Service;
@ -12,6 +13,7 @@ import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Chara
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Line;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.UnionFind;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
@Service
public class ZoneBuilderService {
@ -29,12 +31,10 @@ public class ZoneBuilderService {
private static final double ANGLE_TOLERANCE = Math.PI / 6;
private static final int MAX_ZONES = 300;
private static final double MAX_VERTICAL_MERGE_DISTANCE = 0.5;
public List<Zone> buildZones(List<Line> lines, double characterSpacing, double lineSpacing) {
public List<Zone> buildZones(List<Line> lines, double characterSpacing, double lineSpacing, CleanRulings rulings) {
double minHorizontalDistance = characterSpacing * MIN_HORIZONTAL_DISTANCE_MULTIPLIER;
double maxVerticalDistance = lineSpacing * MAX_VERTICAL_DISTANCE_MULTIPLIER;
@ -45,38 +45,39 @@ public class ZoneBuilderService {
double meanHeight = calculateMeanHeight(lines);
lines.forEach(outerLine -> //
lines.forEach(innerLine -> {
lines.forEach(outerLine -> {
lines.forEach(innerLine -> {
double scale = Math.min(outerLine.getHeight(), innerLine.getHeight()) / meanHeight;
scale = Math.max(MIN_LINE_SIZE_SCALE, Math.min(scale, MAX_LINE_SIZE_SCALE));
if (innerLine == outerLine //
|| unionFind.inSameSet(outerLine, innerLine)//
|| outerLine.angularDifference(innerLine) > ANGLE_TOLERANCE) {
return;
}
if (!unionFind.inSameSet(outerLine, innerLine) && outerLine.angularDifference(innerLine) <= ANGLE_TOLERANCE) {
double scale = Math.min(outerLine.getHeight(), innerLine.getHeight()) / meanHeight;
scale = Math.max(MIN_LINE_SIZE_SCALE, Math.min(scale, MAX_LINE_SIZE_SCALE));
double horizontalDistance = outerLine.horizontalDistance(innerLine) / scale;
double verticalDistance = outerLine.verticalDistance(innerLine) / scale;
double horizontalDistance = outerLine.horizontalDistance(innerLine) / scale;
double verticalDistance = outerLine.verticalDistance(innerLine) / scale;
if (minHorizontalDistance <= horizontalDistance && verticalDistance <= maxVerticalDistance //
|| minHorizontalMergeDistance <= horizontalDistance && verticalDistance <= maxVerticalMergeDistance) {
unionFind.union(outerLine, innerLine);
}
}
}));
if ((!(minHorizontalDistance <= horizontalDistance) || !(verticalDistance <= maxVerticalDistance)) //
&& (!(minHorizontalMergeDistance <= horizontalDistance) || !(verticalDistance <= maxVerticalMergeDistance))) {
return;
}
List<Zone> zones = new ArrayList<>();
unionFind.getGroups().forEach(group -> {
zones.add(mergeLinesInZone(new ArrayList<>(group), characterSpacing, lineSpacing));
if (rulings.lineBetween(outerLine, innerLine)) {
return;
}
unionFind.union(outerLine, innerLine);
});
});
if (zones.size() > MAX_ZONES) {
List<Line> oneZoneLines = new ArrayList<>();
for (Zone zone : zones) {
oneZoneLines.addAll(zone.getLines());
}
return List.of(mergeLinesInZone(oneZoneLines, characterSpacing, lineSpacing));
}
return zones;
return unionFind.getGroups()
.stream()
.map(group -> mergeLinesInZone(new ArrayList<>(group), characterSpacing, lineSpacing))
.toList();
}
@ -103,35 +104,40 @@ public class ZoneBuilderService {
UnionFind<Line> unionFind = new UnionFind<>(new HashSet<>(lines));
lines.forEach(outer -> {
lines.forEach(inner -> {
if (inner != outer) {
if (inner == outer) {
return;
}
double horizontalDistance = outer.horizontalDistance(inner);
double verticalDistance = outer.verticalDistance(inner);
double horizontalDistance = outer.horizontalDistance(inner);
double verticalDistance = outer.verticalDistance(inner);
if (horizontalDistance <= maxHorizontalDistance && minVerticalDistance <= verticalDistance && verticalDistance <= maxVerticalDistance) {
unionFind.union(outer, inner);
} else if (minVerticalDistance <= verticalDistance && verticalDistance <= maxVerticalDistance && Math.abs(horizontalDistance - Math.min(outer.getLength(),
inner.getLength())) < 0.1) {
boolean characterOverlap = false;
int overlappingCount = 0;
for (Character outerCharacter : outer.getCharacters()) {
for (Character innerCharacter : inner.getCharacters()) {
double characterOverlapDistance = outerCharacter.overlappingDistance(innerCharacter);
if (characterOverlapDistance > 2) {
characterOverlap = true;
}
if (characterOverlapDistance > 0) {
overlappingCount++;
}
if (horizontalDistance <= maxHorizontalDistance && minVerticalDistance <= verticalDistance && verticalDistance <= maxVerticalDistance) {
unionFind.union(outer, inner);
} else if (minVerticalDistance <= verticalDistance
&& verticalDistance <= maxVerticalDistance
&& Math.abs(horizontalDistance - Math.min(outer.getLength(), inner.getLength())) < 0.1) {
boolean characterOverlap = false;
int overlappingCount = 0;
for (Character outerCharacter : outer.getCharacters()) {
for (Character innerCharacter : inner.getCharacters()) {
double characterOverlapDistance = outerCharacter.overlappingDistance(innerCharacter);
if (characterOverlapDistance > 2) {
characterOverlap = true;
}
if (characterOverlapDistance > 0) {
overlappingCount++;
}
}
if (!characterOverlap && overlappingCount <= 2) {
unionFind.union(outer, inner);
}
}
if (!characterOverlap && overlappingCount <= 2) {
unionFind.union(outer, inner);
}
}
});
});
@ -146,7 +152,9 @@ public class ZoneBuilderService {
outputZone.add(new Line(characters, characterSpacing));
}
return new Zone(outputZone);
return new Zone(outputZone.stream()
.sorted(Comparator.comparing(Line::getY0))
.collect(Collectors.toList()));
}
}

View File

@ -1,7 +1,10 @@
package com.knecon.fforesight.service.layoutparser.processor.model;
import java.awt.geom.Rectangle2D;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import lombok.AllArgsConstructor;
@ -13,16 +16,8 @@ import lombok.NoArgsConstructor;
@AllArgsConstructor
@NoArgsConstructor
@EqualsAndHashCode(callSuper = true)
public abstract class AbstractPageBlock extends Rectangle {
public abstract class AbstractPageBlock extends BoundingBox {
@JsonIgnore
protected float minX;
@JsonIgnore
protected float maxX;
@JsonIgnore
protected float minY;
@JsonIgnore
protected float maxY;
@JsonIgnore
protected PageBlockType classification;
@JsonIgnore
@ -41,63 +36,6 @@ public abstract class AbstractPageBlock extends Rectangle {
}
/**
 * Compares this block's bounds with the PDF-space bounds of the given text block.
 * <p>
 * NOTE(review): horizontally this block must span the other one, but vertically the comparison runs the
 * other way round ({@code minY >= other min}, {@code maxY <= other max}) — presumably because of differing
 * coordinate orientations; confirm.
 *
 * @param other the text block to compare with
 * @return {@code true} if all four bound comparisons hold
 */
public boolean containsBlock(TextPageBlock other) {

    boolean spansHorizontally = this.minX <= other.getPdfMinX() && this.maxX >= other.getPdfMaxX();
    return spansHorizontally && this.minY >= other.getPdfMinY() && this.maxY <= other.getPdfMaxY();
}
/**
 * Compares this block's bounds with those of another page block.
 * <p>
 * NOTE(review): the X comparisons require this block to span the other one, while the Y comparisons are
 * reversed ({@code minY >= other.minY}, {@code maxY <= other.maxY}); {@code contains(Rectangle)} uses the
 * non-reversed Y direction — confirm which one is intended.
 *
 * @param other the block to compare with
 * @return {@code true} if all four bound comparisons hold
 */
public boolean contains(AbstractPageBlock other) {

    boolean spansHorizontally = this.minX <= other.minX && this.maxX >= other.maxX;
    return spansHorizontally && this.minY >= other.minY && this.maxY <= other.maxY;
}
/**
 * Checks whether the given rectangle lies on the same page and within this block's bounds.
 *
 * @param other the rectangle to test, described by its top-left point, width and height
 * @return {@code true} if the pages match and the rectangle's extent is inside {@code [minX, maxX] x [minY, maxY]}
 */
public boolean contains(Rectangle other) {

    if (page != other.getPage()) {
        return false;
    }
    boolean withinX = this.minX <= other.getTopLeft().getX() && this.maxX >= other.getTopLeft().getX() + other.getWidth();
    return withinX && this.minY <= other.getTopLeft().getY() && this.maxY >= other.getTopLeft().getY() + other.getHeight();
}
/**
 * @return the vertical extent of this block, {@code maxY - minY}
 */
@JsonIgnore
public float getHeight() {

    return this.maxY - this.minY;
}
/**
 * @return the horizontal extent of this block, {@code maxX - minX}
 */
@JsonIgnore
public float getWidth() {

    return this.maxX - this.minX;
}
/**
 * Checks whether the vertical ranges of this block and the given block overlap; touching ranges count.
 *
 * @param apb the block to compare with
 * @return {@code true} if the two Y ranges overlap or touch
 */
public boolean intersectsY(AbstractPageBlock apb) {

    if (!(this.minY <= apb.getMaxY())) {
        return false;
    }
    return this.maxY >= apb.getMinY();
}
/**
 * Checks whether this block and the given block overlap once both axes are widened by a tolerance.
 *
 * @param apb        the block to compare with
 * @param yThreshold tolerance applied on the Y axis
 * @param xThreshold tolerance applied on the X axis
 * @return {@code true} if the blocks nearly overlap on both axes
 */
public boolean almostIntersects(AbstractPageBlock apb, float yThreshold, float xThreshold) {

    // X is evaluated first; Y is only checked when X already matches.
    if (!this.almostIntersectsX(apb, xThreshold)) {
        return false;
    }
    return this.almostIntersectsY(apb, yThreshold);
}
/**
 * Checks whether the Y ranges overlap after extending this block's range by {@code threshold} on both ends.
 *
 * @param apb       the block to compare with
 * @param threshold tolerance subtracted from {@code minY} and added to {@code maxY}
 * @return {@code true} if the widened Y range overlaps or touches the other block's Y range
 */
private boolean almostIntersectsY(AbstractPageBlock apb, float threshold) {

    if (!(this.minY - threshold <= apb.getMaxY())) {
        return false;
    }
    return this.maxY + threshold >= apb.getMinY();
}
/**
 * Checks whether the X ranges overlap after extending this block's range by {@code threshold} on both ends.
 *
 * @param apb       the block to compare with
 * @param threshold tolerance subtracted from {@code minX} and added to {@code maxX}
 * @return {@code true} if the widened X range overlaps or touches the other block's X range
 */
private boolean almostIntersectsX(AbstractPageBlock apb, float threshold) {

    if (!(this.minX - threshold <= apb.getMaxX())) {
        return false;
    }
    return this.maxX + threshold >= apb.getMinX();
}
public abstract boolean isEmpty();
}

View File

@ -7,6 +7,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.outline.Outlin
import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContents;
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.model.text.UnclassifiedText;
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations;
import lombok.Data;
import lombok.NoArgsConstructor;
@ -24,6 +25,7 @@ public class ClassificationDocument {
private FloatFrequencyCounter fontSizeCounter = new FloatFrequencyCounter();
private StringFrequencyCounter fontCounter = new StringFrequencyCounter();
private StringFrequencyCounter fontStyleCounter = new StringFrequencyCounter();
private LayoutparsingVisualizations visualizations = new LayoutparsingVisualizations();
private boolean headlines;
private long rulesVersion;

View File

@ -12,6 +12,7 @@ import java.util.stream.Stream;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
@ -40,6 +41,8 @@ public class Document implements GenericSemanticNode {
@Builder.Default
Set<RedactionEntity> entities = new HashSet<>();
LayoutparsingVisualizations visualizations;
@Override
public NodeType getType() {

View File

@ -1,11 +1,13 @@
package com.knecon.fforesight.service.layoutparser.processor.model.table;
import java.awt.geom.AffineTransform;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities;
@ -18,7 +20,7 @@ import lombok.NoArgsConstructor;
@Data
@EqualsAndHashCode(callSuper = true)
@NoArgsConstructor
public class Cell extends Rectangle {
public class Cell extends BoundingBox {
private List<TextPageBlock> textBlocks = new ArrayList<>();
@ -33,13 +35,24 @@ public class Cell extends Rectangle {
public Cell(Point2D topLeft, Point2D bottomRight) {
super((float) topLeft.getY(), (float) topLeft.getX(), (float) (bottomRight.getX() - topLeft.getX()), (float) (bottomRight.getY() - topLeft.getY()));
this.bBoxInitialUserSpace = new Rectangle2D.Double(topLeft.getX(), topLeft.getY(), (bottomRight.getX() - topLeft.getX()), (bottomRight.getY() - topLeft.getY()));
this.bBox = bBoxInitialUserSpace;
}
public Cell(Rectangle2D r) {
public Cell(Rectangle2D bBoxInitialUserSpace, AffineTransform initialUserSpaceToJava) {
super((float) r.getY(), (float) r.getX(), (float) r.getWidth(), (float) r.getHeight());
this.bBoxInitialUserSpace = bBoxInitialUserSpace;
this.bBox = initialUserSpaceToJava.createTransformedShape(bBoxInitialUserSpace).getBounds2D();
}
/**
 * Creates a new cell that has the same bounding boxes as the given cell.
 * <p>
 * Only {@code bBox} and {@code bBoxInitialUserSpace} are taken over, and they are shared by reference.
 * The text blocks are not copied; the new cell starts with its own empty list.
 *
 * @param cell the cell whose bounding boxes are reused
 * @return a new cell with the same bounding boxes
 */
public static Cell copy(Cell cell) {

    Cell duplicate = new Cell();
    duplicate.bBox = cell.bBox;
    duplicate.bBoxInitialUserSpace = cell.bBoxInitialUserSpace;
    return duplicate;
}

View File

@ -1,15 +1,206 @@
package com.knecon.fforesight.service.layoutparser.processor.model.table;
import java.awt.geom.Line2D;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.LinkedList;
import java.util.List;
import java.util.stream.Stream;
import lombok.Builder;
import lombok.Data;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
@Data
@Builder
import lombok.Getter;
@Getter
public class CleanRulings {
List<Ruling> horizontal;
List<Ruling> vertical;
List<Ruling> horizontals; // unmodifiable sorted by Y list
List<Ruling> verticals; // unmodifiable sorted by X list
/**
 * Creates the ruling collection from the given horizontal and vertical rulings.
 * <p>
 * Every ruling is first checked with {@code assertHorizontal} / {@code assertVertical}. The rulings are then
 * stored in unmodifiable lists, horizontals sorted ascending by {@code y1} and verticals ascending by {@code x1}.
 *
 * @param horizontals the horizontal rulings, in any order
 * @param verticals   the vertical rulings, in any order
 */
public CleanRulings(List<Ruling> horizontals, List<Ruling> verticals) {

    horizontals.forEach(Ruling::assertHorizontal);
    this.horizontals = horizontals.stream()
            .sorted(Comparator.comparingDouble(Line2D.Float::getY1))
            .toList();

    verticals.forEach(Ruling::assertVertical);
    this.verticals = verticals.stream()
            .sorted(Comparator.comparingDouble(Line2D.Float::getX1))
            .toList();
}
/**
 * Returns a new {@link CleanRulings} that contains only the rulings classified as
 * {@link Ruling.Classification#TABLE_LINE}.
 * <p>
 * Rulings without a classification are treated as "not a table line".
 *
 * @return the table-line subset of these rulings
 */
public CleanRulings getTableLines() {

    return new CleanRulings(horizontals.stream()
                                    .filter(CleanRulings::isTableLine)
                                    .toList(),
                            verticals.stream()
                                    .filter(CleanRulings::isTableLine)
                                    .toList());
}


/** Null-safe check whether the ruling is classified as a table line. */
private static boolean isTableLine(Ruling ruling) {

    // Identity comparison: enums are singletons, and == does not throw when the classification is null.
    return ruling.getClassification() == Ruling.Classification.TABLE_LINE;
}
/**
 * Returns a new {@link CleanRulings} without the rulings classified as
 * {@link Ruling.Classification#UNDERLINE} or {@link Ruling.Classification#STRIKETROUGH}.
 * <p>
 * Rulings without a classification are kept.
 *
 * @return these rulings minus the text decoration rulings
 */
public CleanRulings withoutTextRulings() {

    return new CleanRulings(horizontals.stream()
                                    .filter(ruling -> !isTextRuling(ruling))
                                    .toList(),
                            verticals.stream()
                                    .filter(ruling -> !isTextRuling(ruling))
                                    .toList());
}


/** Null-safe check whether the ruling is classified as an underline or a strike-through. */
private static boolean isTextRuling(Ruling ruling) {

    // Identity comparison: enums are singletons, and == does not throw when the classification is null.
    Ruling.Classification classification = ruling.getClassification();
    return classification == Ruling.Classification.UNDERLINE || classification == Ruling.Classification.STRIKETROUGH;
}
/**
 * Combines all rulings into one list.
 *
 * @return a new mutable list with all horizontal rulings followed by all vertical rulings
 */
public List<Ruling> buildAll() {

    List<Ruling> combined = new ArrayList<>(horizontals);
    combined.addAll(verticals);
    return combined;
}
/**
 * Checks whether a ruling separates the two bounding boxes, based on their boxes in initial user space.
 *
 * @param a the first bounding box
 * @param b the second bounding box
 * @return {@code true} if a ruling crosses the connection between the two box centres
 */
public boolean lineBetween(BoundingBox a, BoundingBox b) {

    Rectangle2D first = a.getBBoxInitialUserSpace();
    Rectangle2D second = b.getBBoxInitialUserSpace();
    return lineBetween(first, second);
}
/**
 * Checks whether a ruling crosses the straight connection between the centres of the two rectangles.
 *
 * @param a the first rectangle
 * @param b the second rectangle
 * @return {@code true} if a ruling intersects the centre-to-centre connection
 */
public boolean lineBetween(Rectangle2D a, Rectangle2D b) {

    Point2D centerOfA = new Point2D.Double(a.getCenterX(), a.getCenterY());
    Point2D centerOfB = new Point2D.Double(b.getCenterX(), b.getCenterY());
    return lineBetween(centerOfA, centerOfB);
}
/**
 * Checks whether any ruling intersects the straight connection between the two points.
 * <p>
 * A horizontal connection is only tested against vertical rulings in its X interval, and a vertical
 * connection only against horizontal rulings in its Y interval. Any other connection is tested against
 * both candidate sets, verticals first.
 *
 * @param p1 start point of the connection
 * @param p2 end point of the connection
 * @return {@code true} if at least one candidate ruling intersects the connection
 */
public boolean lineBetween(Point2D p1, Point2D p2) {

    Ruling connection = new Ruling(p1, p2);

    if (connection.isHorizontal()) {
        for (Ruling vertical : getVerticalsInXInterval(connection.x1, connection.x2)) {
            if (vertical.intersectsLine(connection)) {
                return true;
            }
        }
        return false;
    }

    if (connection.isVertical()) {
        for (Ruling horizontal : getHorizontalsInYInterval(connection.y1, connection.y2)) {
            if (horizontal.intersectsLine(connection)) {
                return true;
            }
        }
        return false;
    }

    // Diagonal connection: gather both candidate sets up front, then test verticals before horizontals.
    List<Ruling> verticalCandidates = getVerticalsInXInterval(connection.x1, connection.x2);
    List<Ruling> horizontalCandidates = getHorizontalsInYInterval(connection.y1, connection.y2);
    for (Ruling candidate : verticalCandidates) {
        if (candidate.intersectsLine(connection)) {
            return true;
        }
    }
    for (Ruling candidate : horizontalCandidates) {
        if (candidate.intersectsLine(connection)) {
            return true;
        }
    }
    return false;
}
/**
 * Returns the horizontal rulings that fall into the Y interval spanned by the two values, in ascending
 * {@code y1} order.
 * <p>
 * The scan starts at the index reported by {@code findFirstHorizontalRulingIdxAbove} for the smaller bound
 * and stops at the first ruling whose {@code y1} exceeds the larger bound. The two bounds may be passed in
 * any order.
 *
 * @param y1 one end of the interval
 * @param y2 the other end of the interval
 * @return the matching rulings; an empty list if there are none or if a bound is NaN
 */
public List<Ruling> getHorizontalsInYInterval(float y1, float y2) {

    float lowerBound = Math.min(y1, y2);
    float upperBound = Math.max(y1, y2);

    if (horizontals.isEmpty() || Float.isNaN(lowerBound) || Float.isNaN(upperBound)) {
        return Collections.emptyList();
    }

    int startIdx = findFirstHorizontalRulingIdxAbove(lowerBound);
    if (startIdx == -1) {
        return Collections.emptyList();
    }

    List<Ruling> matches = new LinkedList<>();
    for (Ruling candidate : horizontals.subList(startIdx, horizontals.size())) {
        if (candidate.y1 > upperBound) {
            break; // sorted by y1, so nothing further can match
        }
        matches.add(candidate);
    }
    return matches;
}
/**
 * Finds the index of the first horizontal ruling whose {@code y1} is greater than or equal to {@code y}.
 * <p>
 * This is a lower-bound binary search over {@code horizontals}, which is sorted ascending by {@code y1}.
 * When several rulings share the same {@code y1} (for example segments of one table line), the leftmost of
 * them is returned, so a forward scan from the result does not skip any of them.
 *
 * @param y the lower bound
 * @return the index of the first ruling with {@code y1 >= y}, or -1 if there is none
 */
private int findFirstHorizontalRulingIdxAbove(float y) {

    int low = 0;
    int high = horizontals.size(); // exclusive

    while (low < high) {
        int mid = low + (high - low) / 2;
        if (horizontals.get(mid).y1 < y) {
            low = mid + 1;
        } else {
            // Equal or greater: keep searching to the left for an earlier ruling that also qualifies.
            high = mid;
        }
    }

    return low < horizontals.size() ? low : -1;
}
/**
 * Returns the vertical rulings that fall into the X interval spanned by the two values, in ascending
 * {@code x1} order.
 * <p>
 * The scan starts at the index reported by {@code findFirstVerticalRulingIdxRightOf} for the smaller bound
 * and stops at the first ruling whose {@code x1} exceeds the larger bound. The two bounds may be passed in
 * any order.
 *
 * @param x1 one end of the interval
 * @param x2 the other end of the interval
 * @return the matching rulings; an empty list if there are none or if a bound is NaN
 */
public List<Ruling> getVerticalsInXInterval(float x1, float x2) {

    float lowerBound = Math.min(x1, x2);
    float upperBound = Math.max(x1, x2);

    if (verticals.isEmpty() || Float.isNaN(lowerBound) || Float.isNaN(upperBound)) {
        return Collections.emptyList();
    }

    int startIdx = findFirstVerticalRulingIdxRightOf(lowerBound);
    if (startIdx == -1) {
        return Collections.emptyList();
    }

    List<Ruling> matches = new LinkedList<>();
    for (Ruling vertical : verticals.subList(startIdx, verticals.size())) {
        if (vertical.x1 > upperBound) {
            break; // sorted by x1, so nothing further can match
        }
        matches.add(vertical);
    }
    return matches;
}
/**
 * Finds the index of the first vertical ruling whose {@code x1} is greater than or equal to {@code x}.
 * <p>
 * This is a lower-bound binary search over {@code verticals}, which is sorted ascending by {@code x1}.
 * When several rulings share the same {@code x1} (for example segments of one table line), the leftmost of
 * them is returned, so a forward scan from the result does not skip any of them.
 *
 * @param x the lower bound
 * @return the index of the first ruling with {@code x1 >= x}, or -1 if there is none
 */
private int findFirstVerticalRulingIdxRightOf(float x) {

    int low = 0;
    int high = verticals.size(); // exclusive

    while (low < high) {
        int mid = low + (high - low) / 2;
        if (verticals.get(mid).x1 < x) {
            low = mid + 1;
        } else {
            // Equal or greater: keep searching to the left for an earlier ruling that also qualifies.
            high = mid;
        }
    }

    return low < verticals.size() ? low : -1;
}
}

View File

@ -1,218 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.model.table;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.util.Comparator;
import java.util.List;
/**
 * Float rectangle with top/left/bottom/right accessors and a few overlap helpers.
 * <p>
 * Note: inside this class the simple names {@code Float} and {@code Double} resolve to the nested classes
 * inherited from {@link Rectangle2D}, which is why the boxed types are written as {@code java.lang.Float}
 * and {@code java.lang.Double}.
 */
@SuppressWarnings("all")
public class Rectangle extends Rectangle2D.Float {

    /** Vertical overlap above which {@link #ILL_DEFINED_ORDER} compares by x instead of by bottom edge. */
    protected static final float VERTICAL_COMPARISON_THRESHOLD = 0.4f;

    /**
     * Ill-defined comparator, from when Rectangle was Comparable.
     * <p>
     * see https://github.com/tabulapdf/tabula-java/issues/116
     *
     * @deprecated with no replacement
     */
    @Deprecated
    public static final Comparator<Rectangle> ILL_DEFINED_ORDER = new Comparator<Rectangle>() {
        @Override
        public int compare(Rectangle o1, Rectangle o2) {

            if (o1.equals(o2)) {
                return 0;
            }
            if (o1.verticalOverlap(o2) > VERTICAL_COMPARISON_THRESHOLD) {
                return o1.isLtrDominant() == -1 && o2.isLtrDominant() == -1 ? -java.lang.Double.compare(o1.getX(), o2.getX()) : java.lang.Double.compare(o1.getX(), o2.getX());
            } else {
                return java.lang.Float.compare(o1.getBottom(), o2.getBottom());
            }
        }
    };


    /** Creates an empty rectangle at the origin. */
    public Rectangle() {

        super();
    }


    /**
     * Creates a rectangle from its top-left corner and size. Note the argument order: top (y) comes before left (x).
     *
     * @param top    y coordinate of the top edge
     * @param left   x coordinate of the left edge
     * @param width  width of the rectangle
     * @param height height of the rectangle
     */
    public Rectangle(float top, float left, float width, float height) {

        super();
        this.setRect(left, top, width, height);
    }


    /**
     * Computes the minimum bounding box of the given rectangles.
     *
     * @param rectangles the rectangles to enclose
     * @return minimum bounding box that contains all the rectangles; an empty rectangle at the origin if the list is empty
     */
    public static Rectangle boundingBoxOf(List<? extends Rectangle> rectangles) {

        if (rectangles.isEmpty()) {
            return new Rectangle();
        }

        // The maxima start at negative infinity. Float.MIN_VALUE is the smallest POSITIVE float and would
        // yield a wrong box whenever all coordinates are negative.
        float minx = java.lang.Float.POSITIVE_INFINITY;
        float miny = java.lang.Float.POSITIVE_INFINITY;
        float maxx = java.lang.Float.NEGATIVE_INFINITY;
        float maxy = java.lang.Float.NEGATIVE_INFINITY;

        for (Rectangle r : rectangles) {
            minx = (float) Math.min(r.getMinX(), minx);
            miny = (float) Math.min(r.getMinY(), miny);
            maxx = (float) Math.max(r.getMaxX(), maxx);
            maxy = (float) Math.max(r.getMaxY(), maxy);
        }
        return new Rectangle(miny, minx, maxx - minx, maxy - miny);
    }


    /**
     * Compares using {@link #ILL_DEFINED_ORDER}.
     *
     * @param other the rectangle to compare with
     * @return the comparator result
     */
    public int compareTo(Rectangle other) {

        return ILL_DEFINED_ORDER.compare(this, other);
    }


    // I'm bad at Java and need this for fancy sorting in
    // technology.tabula.TextChunk.
    public int isLtrDominant() {

        return 0;
    }


    /** @return width times height */
    public float getArea() {

        return this.width * this.height;
    }


    /**
     * @param other the rectangle to compare with
     * @return the length of the shared vertical range, 0 if the rectangles do not overlap vertically
     */
    public float verticalOverlap(Rectangle other) {

        return Math.max(0, Math.min(this.getBottom(), other.getBottom()) - Math.max(this.getTop(), other.getTop()));
    }


    /** @return {@code true} if the vertical overlap with {@code other} is positive */
    public boolean verticallyOverlaps(Rectangle other) {

        return verticalOverlap(other) > 0;
    }


    /**
     * @param other the rectangle to compare with
     * @return the length of the shared horizontal range, 0 if the rectangles do not overlap horizontally
     */
    public float horizontalOverlap(Rectangle other) {

        return Math.max(0, Math.min(this.getRight(), other.getRight()) - Math.max(this.getLeft(), other.getLeft()));
    }


    /** @return {@code true} if the horizontal overlap with {@code other} is positive */
    public boolean horizontallyOverlaps(Rectangle other) {

        return horizontalOverlap(other) > 0;
    }


    /**
     * Relates the vertical overlap to the height of the shorter of the two rectangles.
     *
     * @param other the rectangle to compare with
     * @return overlap divided by the smaller height; 0 if none of the four overlap cases applies
     */
    public float verticalOverlapRatio(Rectangle other) {

        float rv = 0, delta = Math.min(this.getBottom() - this.getTop(), other.getBottom() - other.getTop());

        if (other.getTop() <= this.getTop() && this.getTop() <= other.getBottom() && other.getBottom() <= this.getBottom()) {
            rv = (other.getBottom() - this.getTop()) / delta;
        } else if (this.getTop() <= other.getTop() && other.getTop() <= this.getBottom() && this.getBottom() <= other.getBottom()) {
            rv = (this.getBottom() - other.getTop()) / delta;
        } else if (this.getTop() <= other.getTop() && other.getTop() <= other.getBottom() && other.getBottom() <= this.getBottom()) {
            rv = (other.getBottom() - other.getTop()) / delta;
        } else if (other.getTop() <= this.getTop() && this.getTop() <= this.getBottom() && this.getBottom() <= other.getBottom()) {
            rv = (this.getBottom() - this.getTop()) / delta;
        }

        return rv;
    }


    /**
     * @param other the rectangle to compare with
     * @return intersection area divided by union area
     */
    public float overlapRatio(Rectangle other) {

        double intersectionWidth = Math.max(0, Math.min(this.getRight(), other.getRight()) - Math.max(this.getLeft(), other.getLeft()));
        double intersectionHeight = Math.max(0, Math.min(this.getBottom(), other.getBottom()) - Math.max(this.getTop(), other.getTop()));
        double intersectionArea = Math.max(0, intersectionWidth * intersectionHeight);
        double unionArea = this.getArea() + other.getArea() - intersectionArea;

        return (float) (intersectionArea / unionArea);
    }


    /**
     * Grows this rectangle in place to the union with {@code other}.
     *
     * @param other the rectangle to merge in
     * @return this rectangle
     */
    public Rectangle merge(Rectangle other) {

        this.setRect(this.createUnion(other));
        return this;
    }


    public float getTop() {

        return (float) this.getMinY();
    }


    /** Moves the top edge and adjusts the height so that the bottom edge stays in place. */
    public void setTop(float top) {

        float deltaHeight = top - this.y;
        this.setRect(this.x, top, this.width, this.height - deltaHeight);
    }


    public float getRight() {

        return (float) this.getMaxX();
    }


    /** Moves the right edge by changing the width; the left edge stays in place. */
    public void setRight(float right) {

        this.setRect(this.x, this.y, right - this.x, this.height);
    }


    public float getLeft() {

        return (float) this.getMinX();
    }


    /** Moves the left edge and adjusts the width so that the right edge stays in place. */
    public void setLeft(float left) {

        float deltaWidth = left - this.x;
        this.setRect(left, this.y, this.width - deltaWidth, this.height);
    }


    public float getBottom() {

        return (float) this.getMaxY();
    }


    /** Moves the bottom edge by changing the height; the top edge stays in place. */
    public void setBottom(float bottom) {

        this.setRect(this.x, this.y, this.width, bottom - this.y);
    }


    /** @return the four corners in the order top-left, top-right, bottom-right, bottom-left */
    public Point2D[] getPoints() {

        return new Point2D[]{new Point2D.Float(this.getLeft(), this.getTop()), new Point2D.Float(this.getRight(), this.getTop()), new Point2D.Float(this.getRight(),
                                                                                                                                                       this.getBottom()), new Point2D.Float(this.getLeft(), this.getBottom())};
    }


    /** Appends the bottom and right edge to the string produced by the superclass. */
    @Override
    public String toString() {

        StringBuilder sb = new StringBuilder();
        String s = super.toString();
        sb.append(s.substring(0, s.length() - 1));
        sb.append(String.format(",bottom=%f,right=%f]", this.getBottom(), this.getRight()));
        return sb.toString();
    }

}

View File

@ -4,16 +4,14 @@ import java.awt.geom.Line2D;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.Formatter;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import com.knecon.fforesight.service.layoutparser.processor.utils.CohenSutherlandClipping;
import com.knecon.fforesight.service.layoutparser.processor.utils.DoubleComparisons;
import lombok.Getter;
import lombok.Setter;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@ -23,10 +21,24 @@ public class Ruling extends Line2D.Float {
public static final int PERPENDICULAR_UNIT_EXPAND_AMOUNT = 2;
public static final int COLINEAR_OR_PARALLEL_UNIT_EXPAND_AMOUNT = 2;
/**
 * Kind of line a ruling represents. New rulings start as {@link #OTHER}.
 * <p>
 * NOTE(review): {@code STRIKETROUGH} is presumably a misspelling of "strikethrough"; the constant is
 * referenced under this name elsewhere (e.g. in CleanRulings), so a rename has to update all usages.
 */
public enum Classification {
TABLE_LINE,
UNDERLINE,
STRIKETROUGH,
HEADER_SEPARATOR,
FOOTER_SEPARATOR,
OTHER
}
@Getter
@Setter
private Classification classification;
/**
 * Creates a ruling between the two points, classified as {@link Classification#OTHER}.
 *
 * @param p1 start point of the ruling
 * @param p2 end point of the ruling
 */
public Ruling(Point2D p1, Point2D p2) {
super(p1, p2);
// Default classification until a more specific one is assigned via the setter.
this.classification = Classification.OTHER;
}
@ -60,126 +72,32 @@ public class Ruling extends Line2D.Float {
}
// log(n) implementation of find_intersections
// based on http://people.csail.mit.edu/indyk/6.838-old/handouts/lec2.pdf
public static Map<Point2D, Ruling[]> findIntersections(List<Ruling> horizontals, List<Ruling> verticals) {
class SortObject {
protected SOType type;
protected float position;
protected Ruling ruling;
public SortObject(SOType type, float position, Ruling ruling) {
this.type = type;
this.position = position;
this.ruling = ruling;
}
public void assertHorizontal() {
if (isHorizontal()) {
return;
}
List<SortObject> sos = new ArrayList<>();
TreeMap<Ruling, Boolean> tree = new TreeMap<>(new Comparator<Ruling>() {
@Override
public int compare(Ruling o1, Ruling o2) {
return java.lang.Double.compare(o1.getTop(), o2.getTop());
}
});
TreeMap<Point2D, Ruling[]> rv = new TreeMap<>(new Comparator<Point2D>() {
@Override
public int compare(Point2D o1, Point2D o2) {
if (o1.getY() > o2.getY()) {
return 1;
}
if (o1.getY() < o2.getY()) {
return -1;
}
if (o1.getX() > o2.getX()) {
return 1;
}
if (o1.getX() < o2.getX()) {
return -1;
}
return 0;
}
});
for (Ruling h : horizontals) {
sos.add(new SortObject(SOType.HLEFT, h.getLeft() - PERPENDICULAR_UNIT_EXPAND_AMOUNT, h));
sos.add(new SortObject(SOType.HRIGHT, h.getRight() + PERPENDICULAR_UNIT_EXPAND_AMOUNT, h));
}
for (Ruling v : verticals) {
sos.add(new SortObject(SOType.VERTICAL, v.getLeft(), v));
}
Collections.sort(sos, new Comparator<SortObject>() {
@Override
public int compare(SortObject a, SortObject b) {
int rv;
if (DoubleComparisons.feq(a.position, b.position)) {
if (a.type == SOType.VERTICAL && b.type == SOType.HLEFT) {
rv = 1;
} else if (a.type == SOType.VERTICAL && b.type == SOType.HRIGHT) {
rv = -1;
} else if (a.type == SOType.HLEFT && b.type == SOType.VERTICAL) {
rv = -1;
} else if (a.type == SOType.HRIGHT && b.type == SOType.VERTICAL) {
rv = 1;
} else {
rv = java.lang.Double.compare(a.position, b.position);
}
} else {
return java.lang.Double.compare(a.position, b.position);
}
return rv;
}
});
for (SortObject so : sos) {
switch (so.type) {
case VERTICAL:
for (Map.Entry<Ruling, Boolean> h : tree.entrySet()) {
try {
Point2D i = h.getKey().intersectionPoint(so.ruling);
if (i == null) {
continue;
}
rv.put(i, new Ruling[]{h.getKey().expand(PERPENDICULAR_UNIT_EXPAND_AMOUNT), so.ruling.expand(PERPENDICULAR_UNIT_EXPAND_AMOUNT)});
} catch (UnsupportedOperationException e) {
log.info("Some line are oblique, ignoring...");
continue;
}
}
break;
case HRIGHT:
tree.remove(so.ruling);
break;
case HLEFT:
tree.put(so.ruling, true);
break;
}
}
return rv;
throw new IllegalArgumentException("Ruling " + this + " is not horizontal");
}
public boolean vertical() {
/**
 * Verifies that this ruling is vertical.
 *
 * @throws IllegalArgumentException if {@link #isVertical()} returns {@code false}
 */
public void assertVertical() {
    if (!isVertical()) {
        throw new IllegalArgumentException("Ruling " + this + " is not vertical");
    }
}
/**
 * Tells whether this ruling is vertical.
 *
 * @return {@code true} if the ruling has a length greater than zero and its two x coordinates
 *         compare equal according to {@code DoubleComparisons.feq}
 */
public boolean isVertical() {
    boolean hasLength = this.length() > 0;
    // the coordinate comparison is only evaluated for rulings with a positive length
    return hasLength && DoubleComparisons.feq(this.x1, this.x2);
}
public boolean horizontal() {
/**
 * Tells whether this ruling is horizontal.
 *
 * @return {@code true} if the ruling has a length greater than zero and its two y coordinates
 *         compare equal according to {@code DoubleComparisons.feq}
 */
public boolean isHorizontal() {
    boolean hasLength = this.length() > 0;
    // the coordinate comparison is only evaluated for rulings with a positive length
    return hasLength && DoubleComparisons.feq(this.y1, this.y2);
}
@ -188,36 +106,36 @@ public class Ruling extends Line2D.Float {
// these are used to have a single collapse method (in page, currently)
public boolean oblique() {
public boolean isOblique() {
return !(this.vertical() || this.horizontal());
return !(this.isVertical() || this.isHorizontal());
}
public float getPosition() {
if (this.oblique()) {
if (this.isOblique()) {
throw new UnsupportedOperationException();
}
return this.vertical() ? this.getLeft() : this.getTop();
return this.isVertical() ? this.getLeft() : this.getTop();
}
public float getStart() {
if (this.oblique()) {
if (this.isOblique()) {
throw new UnsupportedOperationException();
}
return this.vertical() ? this.getTop() : this.getLeft();
return this.isVertical() ? this.getTop() : this.getLeft();
}
public void setStart(float v) {
if (this.oblique()) {
if (this.isOblique()) {
throw new UnsupportedOperationException();
}
if (this.vertical()) {
if (this.isVertical()) {
this.setTop(v);
} else {
this.setLeft(v);
@ -227,19 +145,19 @@ public class Ruling extends Line2D.Float {
public float getEnd() {
if (this.oblique()) {
if (this.isOblique()) {
throw new UnsupportedOperationException();
}
return this.vertical() ? this.getBottom() : this.getRight();
return this.isVertical() ? this.getBottom() : this.getRight();
}
public void setEnd(float v) {
if (this.oblique()) {
if (this.isOblique()) {
throw new UnsupportedOperationException();
}
if (this.vertical()) {
if (this.isVertical()) {
this.setBottom(v);
} else {
this.setRight(v);
@ -249,10 +167,10 @@ public class Ruling extends Line2D.Float {
public void setStartEnd(float start, float end) {
if (this.oblique()) {
if (this.isOblique()) {
throw new UnsupportedOperationException();
}
if (this.vertical()) {
if (this.isVertical()) {
this.setTop(start);
this.setBottom(end);
} else {
@ -264,7 +182,7 @@ public class Ruling extends Line2D.Float {
public boolean perpendicularTo(Ruling other) {
return this.vertical() == other.horizontal();
return this.isVertical() == other.isHorizontal();
}
@ -318,30 +236,6 @@ public class Ruling extends Line2D.Float {
}
/**
 * Computes the point where this ruling and {@code other} cross, after both have been expanded by
 * {@code PERPENDICULAR_UNIT_EXPAND_AMOUNT}.
 *
 * @param other the ruling to intersect with
 * @return the point made of the vertical ruling's left coordinate and the horizontal ruling's top
 *         coordinate, or {@code null} if the expanded rulings do not intersect or are not one
 *         horizontal and one vertical ruling
 */
public Point2D intersectionPoint(Ruling other) {
    Ruling expandedThis = this.expand(PERPENDICULAR_UNIT_EXPAND_AMOUNT);
    Ruling expandedOther = other.expand(PERPENDICULAR_UNIT_EXPAND_AMOUNT);

    if (!expandedThis.intersectsLine(expandedOther)) {
        return null;
    }

    if (expandedThis.horizontal() && expandedOther.vertical()) {
        return new Point2D.Float(expandedOther.getLeft(), expandedThis.getTop());
    }
    if (expandedThis.vertical() && expandedOther.horizontal()) {
        return new Point2D.Float(expandedThis.getLeft(), expandedOther.getTop());
    }

    log.warn("lines must be orthogonal, vertical and horizontal");
    return null;
}
@Override
public boolean equals(Object other) {
@ -451,16 +345,9 @@ public class Ruling extends Line2D.Float {
final float TOLERANCE = 1;
return Math.abs(ruling.getX1() - x1) < TOLERANCE &&//
Math.abs(ruling.getY1() - y1) < TOLERANCE &&//
Math.abs(ruling.getX2() - x2) < TOLERANCE &&//
Math.abs(ruling.getY2() - y2) < TOLERANCE;
}
private enum SOType {
VERTICAL,
HRIGHT,
HLEFT
Math.abs(ruling.getY1() - y1) < TOLERANCE &&//
Math.abs(ruling.getX2() - x2) < TOLERANCE &&//
Math.abs(ruling.getY2() - y2) < TOLERANCE;
}
}

View File

@ -36,14 +36,11 @@ public class TablePageBlock extends AbstractPageBlock {
private List<Cell> cells;
public TablePageBlock(List<Cell> cells, Rectangle area, int rotation) {
public TablePageBlock(List<Cell> cells, int rotation) {
setToBBoxOfComponents(cells);
this.cells = cells;
addCells(cells);
minX = area.getLeft();
minY = area.getBottom();
maxX = area.getRight();
maxY = area.getTop();
classification = PageBlockType.TABLE;
this.rotation = rotation;
}
@ -230,15 +227,15 @@ public class TablePageBlock extends AbstractPageBlock {
return new ArrayList<>();
}
Set<Float> uniqueX = new HashSet<>();
Set<Float> uniqueY = new HashSet<>();
Set<Double> uniqueX = new HashSet<>();
Set<Double> uniqueY = new HashSet<>();
cells.stream()
.filter(c -> !c.getTextBlocks().isEmpty() || c.getHeight() > 3 && c.getWidth() > 3)
.forEach(c -> {
uniqueX.add(c.getLeft());
uniqueX.add(c.getRight());
uniqueY.add(c.getBottom());
uniqueY.add(c.getTop());
uniqueX.add(c.getPdfMinX());
uniqueX.add(c.getPdfMaxX());
uniqueY.add(c.getPdfMinY());
uniqueY.add(c.getPdfMaxY());
});
var sortedUniqueX = uniqueX.stream()
@ -250,22 +247,24 @@ public class TablePageBlock extends AbstractPageBlock {
List<List<Cell>> rowsOfCells = new ArrayList<>();
Float prevY = null;
Double prevY = null;
for (Float y : sortedUniqueY) {
for (Double y : sortedUniqueY) {
List<Cell> row = new ArrayList<>();
Float prevX = null;
for (Float x : sortedUniqueX) {
Double prevX = null;
for (Double x : sortedUniqueX) {
if (prevY != null && prevX != null) {
var cellFromGridStructure = new Cell(new Point2D.Float(prevX, prevY), new Point2D.Float(x, y));
var cellFromGridStructure = new Cell(new Point2D.Double(prevX, prevY), new Point2D.Double(x, y));
if (cellFromGridStructure.hasMinimumSize()) {
cells.stream()
.map(originalCell -> new CellWithIntersection(originalCell, RectangleTransformations.calculateIntersectedArea(cellFromGridStructure, originalCell)))
.map(originalCell -> new CellWithIntersection(originalCell,
RectangleTransformations.calculateIntersectedArea(cellFromGridStructure.getBBoxInitialUserSpace(),
originalCell.getBBoxInitialUserSpace())))
.filter(cellWithIntersection -> cellWithIntersection.intersectedArea > 0)
.filter(cellWithIntersection -> cellWithIntersection.originalCell.getArea() > cellWithIntersection.intersectedArea * CELL_AREA_CONTAINED_THRESHOLD)
.max(Comparator.comparing(CellWithIntersection::intersectedArea))

View File

@ -1,8 +1,12 @@
package com.knecon.fforesight.service.layoutparser.processor.model.text;
import java.awt.geom.AffineTransform;
import java.awt.geom.Rectangle2D;
import org.apache.pdfbox.text.TextPosition;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
import lombok.AllArgsConstructor;
import lombok.Builder;
@ -14,9 +18,11 @@ import lombok.SneakyThrows;
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class RedTextPosition {
public class RedTextPosition extends BoundingBox {
private float[] position;
public final static int HEIGHT_PADDING = 2;
private Rectangle2D.Float bBoxDirAdj; // adjusted to text rotation
@JsonIgnore
private int rotation;
@ -58,43 +64,71 @@ public class RedTextPosition {
pos.setFontSizeInPt(textPosition.getFontSizeInPt());
pos.setFontName(textPosition.getFont().getName());
var position = new float[4];
//TODO: There is a mismatch in the java coords of the text and the rulings,
// I guess if we start with the initial user space positions and transform them the same way we do the rulings it would work.
pos.setBBox(new Rectangle2D.Float(textPosition.getX(), textPosition.getY(), textPosition.getWidthDirAdj(), textPosition.getHeight()));
position[0] = textPosition.getXDirAdj();
position[1] = textPosition.getYDirAdj();
position[2] = textPosition.getWidthDirAdj();
position[3] = textPosition.getHeightDir();
float textHeight = textPosition.getHeight() + HEIGHT_PADDING;
Rectangle2D.Float dirAdjPosition = new Rectangle2D.Float(textPosition.getXDirAdj(),
textPosition.getYDirAdj() - textHeight,
textPosition.getWidthDirAdj(),
textHeight + HEIGHT_PADDING);
pos.setBBoxDirAdj(dirAdjPosition);
AffineTransform affineTransform = getRotationMatrix(TextDirection.fromDegrees(textPosition.getDir()), textPosition.getPageWidth(), textPosition.getPageHeight());
Rectangle2D bBoxInitialUserSpace = affineTransform.createTransformedShape(dirAdjPosition).getBounds2D();
pos.setBBoxInitialUserSpace(bBoxInitialUserSpace); // These are definitely correct
pos.setPosition(position);
return pos;
}
/**
 * Builds the transform that is applied to direction-adjusted text boxes: a rotation by the text
 * direction's angle, followed by a vertical translation and a flip of the y axis.
 *
 * @param textDirection the text direction; selects the rotation anchor and the translation
 * @param pageWidth     the page width
 * @param pageHeight    the page height
 * @return a new transform
 */
private static AffineTransform getRotationMatrix(TextDirection textDirection, float pageWidth, float pageHeight) {
    // rotation anchor and vertical shift depend on the text direction
    final float anchorX;
    final float anchorY;
    final float verticalShift;
    if (textDirection == TextDirection.ZERO || textDirection == TextDirection.HALF_CIRCLE) {
        anchorX = pageWidth / 2f;
        anchorY = pageHeight / 2f;
        verticalShift = pageHeight;
    } else if (textDirection == TextDirection.QUARTER_CIRCLE) {
        anchorX = pageWidth / 2f;
        anchorY = pageWidth / 2f;
        verticalShift = pageWidth;
    } else {
        anchorX = pageHeight / 2f;
        anchorY = pageHeight / 2f;
        verticalShift = pageWidth;
    }

    AffineTransform transform = new AffineTransform();
    transform.rotate(textDirection.getRadians(), anchorX, anchorY);
    transform.translate(0f, verticalShift);
    transform.scale(1., -1.);
    return transform;
}
@JsonIgnore
public float getXDirAdj() {
return position[0];
return this.bBoxDirAdj.x;
}
@JsonIgnore
public float getYDirAdj() {
return position[1];
return this.bBoxDirAdj.y;
}
@JsonIgnore
public float getWidthDirAdj() {
return position[2];
return this.bBoxDirAdj.width;
}
@JsonIgnore
public float getHeightDir() {
return position[3];
return this.bBoxDirAdj.height;
}
}

View File

@ -1,16 +1,13 @@
package com.knecon.fforesight.service.layoutparser.processor.model.text;
import static java.util.stream.Collectors.toSet;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.utils.DoubleComparisons;
import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities;
import lombok.AllArgsConstructor;
@ -29,34 +26,31 @@ public class TextPageBlock extends AbstractPageBlock {
@Builder.Default
private List<TextPositionSequence> sequences = new ArrayList<>();
@JsonIgnore
private int rotation;
@JsonIgnore
private String mostPopularWordFont;
@JsonIgnore
private String mostPopularWordStyle;
@JsonIgnore
private float mostPopularWordFontSize;
@JsonIgnore
private float mostPopularWordHeight;
@JsonIgnore
private float mostPopularWordSpaceWidth;
@JsonIgnore
private float highestFontSize;
@JsonIgnore
private PageBlockType classification;
@JsonIgnore
private boolean toDuplicate;
/**
 * Creates a text block from the given sequences.
 * The list is stored as is (not copied); the frequency counters and the bounding box are then derived from it.
 *
 * @param sequences the sequences that make up this block
 */
public TextPageBlock(List<TextPositionSequence> sequences) {
this.sequences = sequences;
calculateFrequencyCounters();
calculateBBox();
}
@JsonIgnore
public TextDirection getDir() {
@ -64,6 +58,17 @@ public class TextPageBlock extends AbstractPageBlock {
}
/**
 * Recomputes the bounding boxes of this block from its sequences.
 * Without a sequence list both boxes are reset to empty rectangles.
 */
private void calculateBBox() {
    if (sequences != null) {
        setToBBoxOfComponents(sequences);
        return;
    }
    this.bBox = new Rectangle2D.Double();
    this.bBoxInitialUserSpace = new Rectangle2D.Double();
}
@JsonIgnore
public float getPageHeight() {
@ -80,18 +85,28 @@ public class TextPageBlock extends AbstractPageBlock {
public static TextPageBlock merge(List<TextPageBlock> textBlocksToMerge) {
if (textBlocksToMerge.isEmpty()) {
throw new IllegalArgumentException("Need to provide at least one TextPageBlock.");
}
if (textBlocksToMerge.stream()
.map(AbstractPageBlock::getPage)
.distinct()
.count() != 1) {
throw new IllegalArgumentException("Cannot merge textBlocks on different pages.");
}
List<TextPositionSequence> sequences = textBlocksToMerge.stream()
.map(TextPageBlock::getSequences)
.flatMap(java.util.Collection::stream)
.toList();
sequences = new ArrayList<>(sequences);
return fromTextPositionSequences(sequences);
return new TextPageBlock(sequences);
}
public static TextPageBlock fromTextPositionSequences(List<TextPositionSequence> wordBlockList) {
TextPageBlock textBlock = null;
private void calculateFrequencyCounters() {
FloatFrequencyCounter lineHeightFrequencyCounter = new FloatFrequencyCounter();
FloatFrequencyCounter fontSizeFrequencyCounter = new FloatFrequencyCounter();
@ -99,7 +114,7 @@ public class TextPageBlock extends AbstractPageBlock {
StringFrequencyCounter fontFrequencyCounter = new StringFrequencyCounter();
StringFrequencyCounter styleFrequencyCounter = new StringFrequencyCounter();
for (TextPositionSequence wordBlock : wordBlockList) {
for (TextPositionSequence wordBlock : sequences) {
lineHeightFrequencyCounter.add(wordBlock.getTextHeight());
fontSizeFrequencyCounter.add(wordBlock.getFontSize());
@ -107,161 +122,23 @@ public class TextPageBlock extends AbstractPageBlock {
fontFrequencyCounter.add(wordBlock.getFont());
styleFrequencyCounter.add(wordBlock.getFontStyle());
if (textBlock == null) {
textBlock = new TextPageBlock(wordBlock.getMinXDirAdj(),
wordBlock.getMaxXDirAdj(),
wordBlock.getMinYDirAdj(),
wordBlock.getMaxYDirAdj(),
wordBlockList,
wordBlock.getRotation());
} else {
TextPageBlock spatialEntity = textBlock.union(wordBlock);
textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight());
}
}
if (textBlock != null) {
textBlock.setMostPopularWordFont(fontFrequencyCounter.getMostPopular());
textBlock.setMostPopularWordStyle(styleFrequencyCounter.getMostPopular());
textBlock.setMostPopularWordFontSize(fontSizeFrequencyCounter.getMostPopular());
textBlock.setMostPopularWordHeight(lineHeightFrequencyCounter.getMostPopular());
textBlock.setMostPopularWordSpaceWidth(spaceFrequencyCounter.getMostPopular());
textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest());
}
if (textBlock != null
&& textBlock.getSequences() != null
&& textBlock.getSequences()
.stream()
.map(t -> DoubleComparisons.round(t.getMinYDirAdj(), 3))
.collect(toSet()).size() == 1) {
textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj));
}
return textBlock;
}
/**
 * Returns the minimum x value of this block in the PDF coordinate system.
 * The origin {0,0} of that system rotates with the rotation, which is read from {@code getDir()}:
 * 0 -> LowerLeft
 * 90 -> UpperLeft
 * 180 -> UpperRight
 * 270 -> LowerRight
 *
 * @return the minX value in the PDF coordinate system
 */
@JsonIgnore
public float getPdfMinX() {
    var degrees = getDir().getDegrees();
    if (degrees == 90) {
        return minY;
    }
    if (degrees == 180) {
        return getPageWidth() - maxX;
    }
    if (degrees == 270) {
        return getPageWidth() - maxY;
    }
    return minX;
}
/**
 * Returns the maximum x value of this block in the PDF coordinate system.
 * The origin {0,0} of that system rotates with the rotation, which is read from {@code getDir()}:
 * 0 -> LowerLeft
 * 90 -> UpperLeft
 * 180 -> UpperRight
 * 270 -> LowerRight
 *
 * @return the maxX value in the PDF coordinate system
 */
@JsonIgnore
public float getPdfMaxX() {
    var degrees = getDir().getDegrees();
    if (degrees == 90) {
        return maxY;
    }
    if (degrees == 180) {
        return getPageWidth() - minX;
    }
    if (degrees == 270) {
        return getPageWidth() - minY;
    }
    return maxX;
}
/**
 * Returns the minimum y value of this block in the PDF coordinate system.
 * The origin {0,0} of that system rotates with the rotation, which is read from {@code getDir()}:
 * 0 -> LowerLeft
 * 90 -> UpperLeft
 * 180 -> UpperRight
 * 270 -> LowerRight
 * <p>
 * NOTE(review): for 180 degrees this returns {@code maxY} while {@code getPdfMaxY()} returns
 * {@code minY} - confirm that this ordering is intended.
 *
 * @return the minY value in the PDF coordinate system
 */
@JsonIgnore
public float getPdfMinY() {
    var degrees = getDir().getDegrees();
    if (degrees == 90) {
        return minX;
    }
    if (degrees == 180) {
        return maxY;
    }
    if (degrees == 270) {
        return getPageHeight() - maxX;
    }
    return getPageHeight() - maxY;
}
/**
 * Returns the maximum y value of this block in the PDF coordinate system.
 * The origin {0,0} of that system rotates with the rotation, which is read from {@code getDir()}:
 * 0 -> LowerLeft
 * 90 -> UpperLeft
 * 180 -> UpperRight
 * 270 -> LowerRight
 * <p>
 * NOTE(review): for 180 degrees this returns {@code minY} while {@code getPdfMinY()} returns
 * {@code maxY} - confirm that this ordering is intended.
 *
 * @return the maxY value in the PDF coordinate system
 */
@JsonIgnore
public float getPdfMaxY() {
    var degrees = getDir().getDegrees();
    if (degrees == 90) {
        return maxX;
    }
    if (degrees == 180) {
        return minY;
    }
    if (degrees == 270) {
        return getPageHeight() - minX;
    }
    return getPageHeight() - minY;
}
public TextPageBlock(float minX, float maxX, float minY, float maxY, List<TextPositionSequence> sequences, int rotation) {
this.minX = minX;
this.maxX = maxX;
this.minY = minY;
this.maxY = maxY;
this.sequences = sequences;
this.rotation = rotation;
setMostPopularWordFont(fontFrequencyCounter.getMostPopular());
setMostPopularWordStyle(styleFrequencyCounter.getMostPopular());
setMostPopularWordFontSize(fontSizeFrequencyCounter.getMostPopular());
setMostPopularWordHeight(lineHeightFrequencyCounter.getMostPopular());
setMostPopularWordSpaceWidth(spaceFrequencyCounter.getMostPopular());
setHighestFontSize(fontSizeFrequencyCounter.getHighest());
}
/**
 * Returns a copy of this block that additionally contains the given sequence.
 * <p>
 * The sequence is appended exactly once, and the frequency counters and the bounding box are
 * recomputed on the returned copy. Both happen inside {@link #add(TextPositionSequence)}, which
 * is invoked on the copy rather than on this block.
 *
 * @param r the sequence to include
 * @return the copy including {@code r}
 */
public TextPageBlock union(TextPositionSequence r) {
    TextPageBlock union = this.copy();
    // Delegate to add(...) on the copy: it appends r and refreshes the copy's derived state.
    union.add(r);
    return union;
}
@ -269,80 +146,32 @@ public class TextPageBlock extends AbstractPageBlock {
/**
 * Returns a copy of this block that additionally contains the sequences of the given block.
 * <p>
 * The sequences are appended exactly once, and the frequency counters and the bounding box are
 * recomputed on the returned copy. Both happen inside {@link #add(TextPageBlock)}, which is
 * invoked on the copy rather than on this block.
 *
 * @param r the block whose sequences are included
 * @return the copy including the sequences of {@code r}
 */
public TextPageBlock union(TextPageBlock r) {
    TextPageBlock union = this.copy();
    // Delegate to add(...) on the copy: it appends r's sequences and refreshes the copy's derived state.
    union.add(r);
    return union;
}
/**
 * Merges the given block into this one: widens the min/max bounds where {@code r} exceeds them,
 * appends all of {@code r}'s sequences and recomputes the frequency counters and the bounding box.
 * NOTE(review): the manual bound updates run before calculateBBox(); they may be redundant - confirm.
 *
 * @param r the block to merge into this block
 */
public void add(TextPageBlock r) {
if (r.getMinX() < minX) {
minX = r.getMinX();
}
if (r.getMaxX() > maxX) {
maxX = r.getMaxX();
}
if (r.getMinY() < minY) {
minY = r.getMinY();
}
if (r.getMaxY() > maxY) {
maxY = r.getMaxY();
}
sequences.addAll(r.getSequences());
calculateFrequencyCounters();
calculateBBox();
}
/**
 * Adds a single sequence to this block: passes it to setCoordinates, appends it to the sequence
 * list and recomputes the frequency counters and the bounding box.
 *
 * @param r the sequence to add
 */
public void add(TextPositionSequence r) {
setCoordinates(r);
sequences.add(r);
calculateFrequencyCounters();
calculateBBox();
}
public TextPageBlock copy() {
return new TextPageBlock(minX, maxX, minY, maxY, sequences, rotation);
}
public void resize(float x1, float y1, float width, float height) {
set(x1, y1, x1 + width, y1 + height);
}
public void resize() {
minX = Float.MAX_VALUE;
minY = Float.MAX_VALUE;
maxX = Float.MIN_VALUE;
maxY = Float.MIN_VALUE;
sequences.forEach(this::setCoordinates);
}
private void setCoordinates(TextPositionSequence sequence) {
if (sequence.getMinXDirAdj() < minX) {
minX = sequence.getMinXDirAdj();
}
if (sequence.getMaxXDirAdj() > maxX) {
maxX = sequence.getMaxXDirAdj();
}
if (sequence.getMinYDirAdj() < minY) {
minY = sequence.getMinYDirAdj();
}
if (sequence.getMaxYDirAdj() > maxY) {
maxY = sequence.getMaxYDirAdj();
}
}
public void set(float x1, float y1, float x2, float y2) {
this.minX = Math.min(x1, x2);
this.maxX = Math.max(x1, x2);
this.minY = Math.min(y1, y2);
this.maxY = Math.max(y1, y2);
return new TextPageBlock(new ArrayList<>(sequences));
}

View File

@ -1,7 +1,6 @@
package com.knecon.fforesight.service.layoutparser.processor.model.text;
import java.awt.geom.AffineTransform;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
@ -9,15 +8,14 @@ import java.util.stream.Collectors;
import org.apache.pdfbox.text.TextPosition;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@ -25,8 +23,8 @@ import lombok.extern.slf4j.Slf4j;
@Builder
@NoArgsConstructor
@AllArgsConstructor
@EqualsAndHashCode(onlyExplicitlyIncluded = true)
public class TextPositionSequence implements CharSequence {
@EqualsAndHashCode(onlyExplicitlyIncluded = true, callSuper = false)
public class TextPositionSequence extends BoundingBox implements CharSequence {
public static final int HEIGHT_PADDING = 2;
@ -36,29 +34,38 @@ public class TextPositionSequence implements CharSequence {
@EqualsAndHashCode.Include
private List<RedTextPosition> textPositions = new ArrayList<>();
private Rectangle2D bBoxDirAdj;
@EqualsAndHashCode.Include
private TextDirection dir;
private int rotation;
private float pageHeight;
private float pageWidth;
private boolean isParagraphStart;
private boolean strikethrough;
private boolean underline;
public TextPositionSequence(int page) {
public TextPositionSequence(List<TextPosition> textPositions, int pageNumber, boolean isParagraphStart) {
this.page = page;
}
public TextPositionSequence(List<TextPosition> textPositions, int page, boolean isParagraphStart) {
this.textPositions = textPositions.stream().map(RedTextPosition::fromTextPosition).collect(Collectors.toList());
this.page = page;
this.textPositions = textPositions.stream()
.map(RedTextPosition::fromTextPosition)
.collect(Collectors.toList());
this.page = pageNumber;
this.dir = TextDirection.fromDegrees(textPositions.get(0).getDir());
this.rotation = textPositions.get(0).getRotation();
this.pageHeight = textPositions.get(0).getPageHeight();
this.pageWidth = textPositions.get(0).getPageWidth();
this.isParagraphStart = isParagraphStart;
calculateBBox();
}
/**
 * Recomputes the bounding boxes of this sequence from its text positions: first the
 * direction-adjusted box as the collected box of all positions, then the boxes maintained by
 * {@code setToBBoxOfComponents}.
 */
private void calculateBBox() {
    Rectangle2D dirAdjBounds = textPositions.stream()
            .map(RedTextPosition::getBBoxDirAdj)
            .collect(RectangleTransformations.collectBBox());
    this.bBoxDirAdj = dirAdjBounds;
    setToBBoxOfComponents(getTextPositions());
}
@ -70,6 +77,7 @@ public class TextPositionSequence implements CharSequence {
this.rotation = textPositions.get(0).getRotation();
this.pageHeight = textPositions.get(0).getPageHeight();
this.pageWidth = textPositions.get(0).getPageWidth();
calculateBBox();
}
@ -107,7 +115,7 @@ public class TextPositionSequence implements CharSequence {
textPositionSequence.rotation = rotation;
textPositionSequence.pageHeight = pageHeight;
textPositionSequence.pageWidth = pageWidth;
textPositionSequence.setToBBoxOfComponents(getTextPositions());
return textPositionSequence;
}
@ -137,18 +145,18 @@ public class TextPositionSequence implements CharSequence {
this.rotation = textPositionSequence.getRotation();
this.pageHeight = textPositionSequence.getPageHeight();
this.pageWidth = textPositionSequence.getPageWidth();
calculateBBox();
}
/**
 * Appends the given text position and refreshes the values that are taken from the first
 * position of the sequence (direction, rotation, page size) as well as the bounding box.
 *
 * @param textPosition the PDFBox text position to append
 */
public void add(TextPosition textPosition) {
    RedTextPosition converted = RedTextPosition.fromTextPosition(textPosition);
    this.textPositions.add(converted);

    // direction, rotation and page size always come from the first position
    RedTextPosition first = textPositions.get(0);
    this.dir = TextDirection.fromDegrees(first.getDir());
    this.rotation = first.getRotation();
    this.pageHeight = first.getPageHeight();
    this.pageWidth = first.getPageWidth();

    calculateBBox();
}
@ -220,18 +228,6 @@ public class TextPositionSequence implements CharSequence {
}
/**
 * Returns the direction-adjusted height of this sequence.
 *
 * @return {@code getMaxYDirAdj() - getMinYDirAdj()}
 */
public float getHeight() {
    float upper = getMaxYDirAdj();
    float lower = getMinYDirAdj();
    return upper - lower;
}
/**
 * Returns the direction-adjusted width of this sequence.
 *
 * @return {@code getMaxXDirAdj() - getMinXDirAdj()}
 */
public float getWidth() {
    float upper = getMaxXDirAdj();
    float lower = getMinXDirAdj();
    return upper - lower;
}
public String getFont() {
if (textPositions.get(0).getFontName() == null) {
@ -271,54 +267,5 @@ public class TextPositionSequence implements CharSequence {
return textPositions.get(0).getWidthOfSpace();
}
/**
 * Returns the bounding box of this word in the PDF coordinate system, whose origin {0,0} rotates
 * with the page rotation:
 * 0 -> LowerLeft
 * 90 -> UpperLeft
 * 180 -> UpperRight
 * 270 -> LowerRight
 * <p>
 * The box spans from the first to the last text position, padded vertically by
 * {@code HEIGHT_PADDING}, and is mapped through a transform chosen by the text direction.
 *
 * @return bounding box of the word in the PDF coordinate system
 */
@SneakyThrows
public Rectangle getRectangle() {

    log.debug("Page: '{}', Word: '{}', Rotation: '{}', textRotation {}", page, this, rotation, dir);

    float textHeight = getTextHeight();

    RedTextPosition head = textPositions.get(0);
    RedTextPosition tail = textPositions.get(textPositions.size() - 1);

    Point2D bottomLeft = new Point2D.Double(head.getXDirAdj(), head.getYDirAdj() - HEIGHT_PADDING);
    Point2D topRight = new Point2D.Double(tail.getXDirAdj() + tail.getWidthDirAdj(), tail.getYDirAdj() + textHeight + HEIGHT_PADDING);

    // rotation anchor and vertical shift depend on the text direction
    final float anchorX;
    final float anchorY;
    final float verticalShift;
    if (dir == TextDirection.ZERO || dir == TextDirection.HALF_CIRCLE) {
        anchorX = pageWidth / 2f;
        anchorY = pageHeight / 2f;
        verticalShift = pageHeight + textHeight;
    } else if (dir == TextDirection.QUARTER_CIRCLE) {
        anchorX = pageWidth / 2f;
        anchorY = pageWidth / 2f;
        verticalShift = pageWidth + textHeight;
    } else {
        anchorX = pageHeight / 2f;
        anchorY = pageHeight / 2f;
        verticalShift = pageWidth + textHeight;
    }

    AffineTransform transform = new AffineTransform();
    transform.rotate(dir.getRadians(), anchorX, anchorY);
    transform.translate(0f, verticalShift);
    transform.scale(1., -1.);

    bottomLeft = transform.transform(bottomLeft, null);
    topRight = transform.transform(topRight, null);

    Point origin = new Point((float) bottomLeft.getX(), (float) bottomLeft.getY());
    float width = (float) (topRight.getX() - bottomLeft.getX());
    float height = (float) (topRight.getY() - bottomLeft.getY());
    return new Rectangle(origin, width, height, page);
}
}

View File

@ -9,6 +9,7 @@ import java.util.Map;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
@ -54,11 +55,12 @@ public class ImageServiceResponseAdapter {
classificationPage.getImages().forEach(image -> {
if (image.getImageType().equals(ImageType.OTHER)) {
classificationPage.getTextBlocks().forEach(textblock -> {
if (image.getPosition().contains(textblock.getMinX(), textblock.getMinY(), textblock.getWidth(), textblock.getHeight())) {
for (AbstractPageBlock textblock : classificationPage.getTextBlocks()) {
if (image.getPosition().contains(textblock.getBBoxInitialUserSpace())) {
image.setImageType(ImageType.OCR);
return;
}
});
}
}
});
}

View File

@ -31,8 +31,9 @@ public class BodyTextFrameService {
Rectangle bodyTextFrame = calculateBodyTextFrame(classificationDocument.getPages(), classificationDocument.getFontSizeCounter(), false, layoutParsingType);
Rectangle landscapeBodyTextFrame = calculateBodyTextFrame(classificationDocument.getPages(), classificationDocument.getFontSizeCounter(), true, layoutParsingType);
for (ClassificationPage page : classificationDocument.getPages()) {
// var updatedBodyTextFrame = getBodyTextFrameFromRulings(page, bodyTextFrame, landscapeBodyTextFrame);
var updatedBodyTextFrame = getBodyTextFrameFromRulings(page, bodyTextFrame, landscapeBodyTextFrame);
setBodyTextFrameAdjustedToPage(page, bodyTextFrame, landscapeBodyTextFrame);
classificationDocument.getVisualizations().addMainBodyVisualization(page.isLandscape() ? landscapeBodyTextFrame : bodyTextFrame, page.getPageNumber());
}
}
@ -58,24 +59,26 @@ public class BodyTextFrameService {
private List<Ruling> getPotentialFooterRulings(ClassificationPage page) {
return page.getCleanRulings()
.getHorizontal()
return page.getCleanRulings().getHorizontals()
.stream()
.filter(ruling -> ruling.getClassification().equals(Ruling.Classification.OTHER))
.filter(ruling -> ruling.getY1() < page.getPageHeight() * RULING_HEIGHT_THRESHOLD)
.filter(ruling -> ruling.getWidth() > RULING_WIDTH_THRESHOLD * page.getPageWidth())
.sorted(Comparator.comparingDouble(Ruling::getTop))
.peek(ruling -> ruling.setClassification(Ruling.Classification.FOOTER_SEPARATOR))
.toList();
}
private List<Ruling> getPotentialHeaderRulings(ClassificationPage page) {
return page.getCleanRulings()
.getHorizontal()
return page.getCleanRulings().getHorizontals()
.stream()
.filter(ruling -> ruling.getClassification().equals(Ruling.Classification.OTHER))
.filter(ruling -> ruling.getY1() > page.getPageHeight() * (1 - RULING_HEIGHT_THRESHOLD))
.filter(ruling -> ruling.getWidth() > RULING_WIDTH_THRESHOLD * page.getPageWidth())
.sorted(Comparator.comparingDouble(Ruling::getBottom).reversed())
.peek(ruling -> ruling.setClassification(Ruling.Classification.HEADER_SEPARATOR))
.toList();
}
@ -99,16 +102,16 @@ public class BodyTextFrameService {
if (page.getPageWidth() > page.getPageHeight() && page.getRotation() == 270) {
textFrame = new Rectangle(new Point(textFrame.getTopLeft().getY(), page.getPageHeight() - textFrame.getTopLeft().getX() - textFrame.getWidth()),
textFrame.getHeight(),
textFrame.getWidth(),
0);
textFrame.getHeight(),
textFrame.getWidth(),
0);
} else if (page.getPageWidth() > page.getPageHeight() && page.getRotation() != 0) {
textFrame = new Rectangle(new Point(textFrame.getTopLeft().getY(), textFrame.getTopLeft().getX()), textFrame.getHeight(), textFrame.getWidth(), page.getPageNumber());
} else if (page.getRotation() == 180) {
textFrame = new Rectangle(new Point(textFrame.getTopLeft().getX(), page.getPageHeight() - textFrame.getTopLeft().getY() - textFrame.getHeight()),
textFrame.getWidth(),
textFrame.getHeight(),
0);
textFrame.getWidth(),
textFrame.getHeight(),
0);
}
page.setBodyTextFrame(textFrame);
}
@ -152,14 +155,17 @@ public class BodyTextFrameService {
}
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) || MarkedContentUtils.intersects(textBlock,
page.getMarkedContentBboxPerType(),
MarkedContentUtils.FOOTER)) {
page.getMarkedContentBboxPerType(),
MarkedContentUtils.FOOTER)) {
continue;
}
float approxLineCount = PositionUtils.getApproxLineCount(textBlock);
if (layoutParsingType.equals(LayoutParsingType.DOCUMINE) && approxLineCount < approximateHeaderLineCount && textBlock.getMaxY() >= page.getPageHeight() - (page.getPageHeight() / 10) || !layoutParsingType.equals(
LayoutParsingType.DOCUMINE) && approxLineCount < approximateHeaderLineCount) {
double approxLineCount = PositionUtils.getApproxLineCount(textBlock);
if (layoutParsingType.equals(LayoutParsingType.DOCUMINE_OLD) //
&& approxLineCount < approximateHeaderLineCount //
&& textBlock.getMaxY() >= page.getPageHeight() - (page.getPageHeight() / 10)//
|| !layoutParsingType.equals(LayoutParsingType.DOCUMINE_OLD) //
&& approxLineCount < approximateHeaderLineCount) {
continue;
}
@ -185,10 +191,10 @@ public class BodyTextFrameService {
}
}
}
return new Rectangle(new Point(expansionsRectangle.minX, expansionsRectangle.minY),
expansionsRectangle.maxX - expansionsRectangle.minX,
expansionsRectangle.maxY - expansionsRectangle.minY,
0);
return new Rectangle(new Point((float) expansionsRectangle.minX, (float) expansionsRectangle.minY),
(float) (expansionsRectangle.maxX - expansionsRectangle.minX),
(float) (expansionsRectangle.maxY - expansionsRectangle.minY),
0);
}
@ -226,10 +232,10 @@ public class BodyTextFrameService {
private class BodyTextFrameExpansionsRectangle {
float minX = 10000;
float maxX = -100;
float minY = 10000;
float maxY = -100;
double minX = 10000;
double maxX = -100;
double minY = 10000;
double maxY = -100;
}

View File

@ -44,9 +44,9 @@ public class GapDetectionService {
if (yDifference > avgTextPositionHeight * Y_GAP_FACTOR) {
yGapContext.addGap(mainBodyTextFrame.getMinX(),
previousTextPositionBBox.getMaxY(),
mainBodyTextFrame.getWidth(),
-(previousTextPositionBBox.getMaxY() - currentTextPositionBBox.getMinY()));
previousTextPositionBBox.getMaxY(),
mainBodyTextFrame.getWidth(),
-(previousTextPositionBBox.getMaxY() - currentTextPositionBBox.getMinY()));
}
if (yDifference > avgTextPositionHeight * NEW_LINE_FACTOR) {
@ -69,32 +69,37 @@ public class GapDetectionService {
private static Rectangle2D toRectangle2D(TextPositionSequence textPosition) {
return mirrorY(RectangleTransformations.toRectangle2D(textPosition.getRectangle()));
return mirrorY(textPosition.getBBox());
}
/**
 * Returns a copy of the given rectangle whose height is non-negative.
 * The y origin of the result is the smaller of the rectangle's two vertical edges,
 * so a rectangle described with a negative height keeps covering the same area.
 * x and width are passed through unchanged.
 *
 * @param rectangle2D rectangle whose height may be negative
 * @return a new {@link Rectangle2D.Double} with the same extent and a non-negative height
 */
private static Rectangle2D mirrorY(Rectangle2D rectangle2D) {

    double lowerEdge = Math.min(rectangle2D.getMinY(), rectangle2D.getMaxY());
    double positiveHeight = Math.abs(rectangle2D.getHeight());
    return new Rectangle2D.Double(rectangle2D.getX(), lowerEdge, rectangle2D.getWidth(), positiveHeight);
}
private static void addGapToLine(Rectangle2D currentTextPosition, Rectangle2D previousTextPosition, XGapsContext context) {
context.gapsInCurrentLine.add(new Rectangle2D.Double(previousTextPosition.getMaxX(),
previousTextPosition.getMinY(),
currentTextPosition.getMinX() - previousTextPosition.getMaxX(),
(previousTextPosition.getHeight() + currentTextPosition.getHeight()) / 2));
previousTextPosition.getMinY(),
currentTextPosition.getMinX() - previousTextPosition.getMaxX(),
(previousTextPosition.getHeight() + currentTextPosition.getHeight()) / 2));
}
private static void assertAllTextPositionsHaveSameDir(List<TextPositionSequence> textPositionSequences) {
assert textPositionSequences.stream().map(TextPositionSequence::getDir).allMatch(a -> a.equals(textPositionSequences.get(0).getDir()));
assert textPositionSequences.stream()
.map(TextPositionSequence::getDir)
.allMatch(a -> a.equals(textPositionSequences.get(0).getDir()));
}
private static double getAvgTextPositionHeight(List<TextPositionSequence> textPositionSequences) {
return textPositionSequences.stream().mapToDouble(TextPositionSequence::getHeight).average().orElseThrow();
return textPositionSequences.stream()
.mapToDouble(TextPositionSequence::getHeight).average().orElseThrow();
}
@ -142,9 +147,9 @@ public class GapDetectionService {
public void addGapToRightEdgeOfMainBody(Rectangle2D textPosition) {
Rectangle2D leftGap = new Rectangle2D.Double(textPosition.getMaxX(),
textPosition.getMinY(),
mainBodyTextFrame.getMaxX() - textPosition.getMaxX(),
textPosition.getHeight());
textPosition.getMinY(),
mainBodyTextFrame.getMaxX() - textPosition.getMaxX(),
textPosition.getHeight());
gapsInCurrentLine.add(leftGap);
}
@ -152,9 +157,9 @@ public class GapDetectionService {
public void addGapFromLeftEdgeOfMainBody(Rectangle2D textPosition) {
Rectangle2D leftGap = new Rectangle2D.Double(mainBodyTextFrame.getMinX(),
textPosition.getMinY(),
textPosition.getMinX() - mainBodyTextFrame.getMinX(),
textPosition.getHeight());
textPosition.getMinY(),
textPosition.getMinX() - mainBodyTextFrame.getMinX(),
textPosition.getHeight());
gapsInCurrentLine.add(leftGap);
}

View File

@ -180,7 +180,7 @@ public class LineDetectionService {
private Rectangle2D textPositionBBox(List<TextPositionSequence> textPositionSequences) {
return RectangleTransformations.rectangleBBox(textPositionSequences.stream().map(TextPositionSequence::getRectangle).toList());
return RectangleTransformations.rectangle2DBBox(textPositionSequences.stream().map(TextPositionSequence::getBBox).toList());
}

View File

@ -3,6 +3,7 @@ package com.knecon.fforesight.service.layoutparser.processor.services;
import static com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators.X_FIRST_RULING_COMPARATOR;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
@ -12,9 +13,9 @@ import java.util.stream.Collectors;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Rectangle;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableCells;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import com.knecon.fforesight.service.layoutparser.processor.utils.UnionFind;
import lombok.RequiredArgsConstructor;
@ -31,7 +32,7 @@ public class RulingCleaningService {
private static final float THRESHOLD_Y_HORIZONTAL = 3;
public CleanRulings getCleanRulings(List<TableCells> tableCells, List<Ruling> rulings) {
public CleanRulings deduplicateAndStraightenRulings(List<TableCells> tableCells, List<Ruling> rulings) {
Rulings verticalAndHorizontalRulingLines;
@ -45,43 +46,45 @@ public class RulingCleaningService {
verticalAndHorizontalRulingLines.horizontalLines.sort(X_FIRST_RULING_COMPARATOR);
verticalAndHorizontalRulingLines = cleanRulings(verticalAndHorizontalRulingLines);
return CleanRulings.builder().vertical(verticalAndHorizontalRulingLines.verticalLines()).horizontal(verticalAndHorizontalRulingLines.horizontalLines()).build();
return new CleanRulings(verticalAndHorizontalRulingLines.horizontalLines(), verticalAndHorizontalRulingLines.verticalLines());
}
private Rulings cleanRulings(Rulings rulings) {
List<List<Rectangle>> groupedOverlappingVerticalRectangles = groupOverlappingRectangles(rulings.verticalLines.stream()
.map(RulingCleaningService::getOverlapRectangle)
.distinct()
.toList());
List<Ruling> cleanedVerticalRulings = groupedOverlappingVerticalRectangles.stream()
.map(rectList -> getXCenteredRuling(Rectangle.boundingBoxOf(rectList)))
.toList();
List<List<Rectangle>> groupedOverlappingHorizontalRectangles = groupOverlappingRectangles(rulings.horizontalLines.stream()
List<List<Rectangle2D>> groupedOverlappingVerticalRectangles = groupOverlappingRectangles(rulings.verticalLines.stream()
.map(RulingCleaningService::getOverlapRectangle)
.distinct()
.toList());
List<Ruling> cleanedVerticalRulings = groupedOverlappingVerticalRectangles.stream()
.map(rectList -> getXCenteredRuling(RectangleTransformations.rectangle2DBBox(rectList)))
.filter(ruling -> ruling.length() > 0)
.toList();
List<List<Rectangle2D>> groupedOverlappingHorizontalRectangles = groupOverlappingRectangles(rulings.horizontalLines.stream()
.map(RulingCleaningService::getOverlapRectangle)
.distinct()
.toList());
List<Ruling> cleanedHorizontalRulings = groupedOverlappingHorizontalRectangles.stream()
.map(rectList -> getYCenteredRuling(Rectangle.boundingBoxOf(rectList)))
.map(rectList -> getYCenteredRuling(RectangleTransformations.rectangle2DBBox(rectList)))
.filter(ruling -> ruling.length() > 0)
.collect(Collectors.toList());
return new Rulings(cleanedVerticalRulings, cleanedHorizontalRulings);
}
private List<List<Rectangle>> groupOverlappingRectangles(List<Rectangle> rectangles) {
private List<List<Rectangle2D>> groupOverlappingRectangles(List<Rectangle2D> rectangles) {
UnionFind<Rectangle> unionFind = new UnionFind<>();
UnionFind<Rectangle2D> unionFind = new UnionFind<>();
for (int i = 0; i < rectangles.size(); i++) {
for (int j = i + 1; j < rectangles.size(); j++) {
Rectangle rectangle1 = rectangles.get(i);
Rectangle rectangle2 = rectangles.get(j);
Rectangle2D rectangle1 = rectangles.get(i);
Rectangle2D rectangle2 = rectangles.get(j);
// we can stop early when we are too far off because of x-y-sorting
if(rectangle1.getRight() < rectangle2.getLeft() && rectangle1.getBottom() < rectangle2.getTop()) {
if (rectangle1.getMaxX() < rectangle2.getMinX() && rectangle1.getMaxY() < rectangle2.getMinY()) {
break;
}
@ -91,66 +94,66 @@ public class RulingCleaningService {
}
}
Map<Rectangle, List<Rectangle>> groups = new HashMap<>();
for (Rectangle rectangle : rectangles) {
Rectangle root = unionFind.find(rectangle);
Map<Rectangle2D, List<Rectangle2D>> groups = new HashMap<>();
for (Rectangle2D rectangle : rectangles) {
Rectangle2D root = unionFind.find(rectangle);
groups.computeIfAbsent(root, k -> new ArrayList<>()).add(rectangle);
}
return new ArrayList<>(groups.values());
}
private static Rectangle getOverlapRectangle(Ruling ruling) {
private static Rectangle2D getOverlapRectangle(Ruling ruling) {
float top;
float left;
float y;
float x;
float w;
float h;
if (ruling.x1 < ruling.x2) {
left = ruling.x1;
x = ruling.x1;
w = ruling.x2 - ruling.x1;
} else {
left = ruling.x2;
x = ruling.x2;
w = ruling.x1 - ruling.x2;
}
if (ruling.y1 < ruling.y2) {
top = ruling.y1;
y = ruling.y1;
h = ruling.y2 - ruling.y1;
} else {
top = ruling.y2;
y = ruling.y2;
h = ruling.y1 - ruling.y2;
}
if (ruling.horizontal()) {
return new Rectangle(top - THRESHOLD_Y_HORIZONTAL, left - THRESHOLD_X_HORIZONTAL, w + 2 * THRESHOLD_X_HORIZONTAL, h + 2 * THRESHOLD_Y_HORIZONTAL);
if (ruling.isHorizontal()) {
return new Rectangle2D.Double(x - THRESHOLD_X_HORIZONTAL, y - THRESHOLD_Y_HORIZONTAL, w + 2 * THRESHOLD_X_HORIZONTAL, h + 2 * THRESHOLD_Y_HORIZONTAL);
} else {
return new Rectangle(top - THRESHOLD_Y_VERTICAL, left - THRESHOLD_X_VERTICAL, w + 2 * THRESHOLD_X_VERTICAL, h + 2 * THRESHOLD_Y_VERTICAL);
return new Rectangle2D.Double(x - THRESHOLD_X_VERTICAL, y - THRESHOLD_Y_VERTICAL, w + 2 * THRESHOLD_X_VERTICAL, h + 2 * THRESHOLD_Y_VERTICAL);
}
}
public static Ruling getXCenteredRuling(Rectangle rectangle) {
public static Ruling getXCenteredRuling(Rectangle2D rectangle) {
float x = (float) rectangle.getCenterX();
float y1 = rectangle.getTop();
float y2 = rectangle.getBottom();
double x = rectangle.getCenterX();
double y1 = rectangle.getMinY();
double y2 = rectangle.getMaxY();
Point2D point1 = new Point2D.Float(x, y1 + THRESHOLD_Y_VERTICAL);
Point2D point2 = new Point2D.Float(x, y2 - THRESHOLD_Y_VERTICAL);
Point2D point1 = new Point2D.Double(x, y1 + THRESHOLD_Y_VERTICAL);
Point2D point2 = new Point2D.Double(x, y2 - THRESHOLD_Y_VERTICAL);
return new Ruling(point1, point2);
}
public static Ruling getYCenteredRuling(Rectangle rectangle) {
public static Ruling getYCenteredRuling(Rectangle2D rectangle) {
float x1 = rectangle.getLeft();
float x2 = rectangle.getRight();
float y = (float) rectangle.getCenterY();
double x1 = rectangle.getX();
double x2 = rectangle.getMaxX();
double y = rectangle.getCenterY();
Point2D point1 = new Point2D.Float(x1 + THRESHOLD_X_HORIZONTAL, y);
Point2D point2 = new Point2D.Float(x2 - THRESHOLD_X_HORIZONTAL, y);
Point2D point1 = new Point2D.Double(x1 + THRESHOLD_X_HORIZONTAL, y);
Point2D point2 = new Point2D.Double(x2 - THRESHOLD_X_HORIZONTAL, y);
return new Ruling(point1, point2);
}
@ -160,14 +163,14 @@ public class RulingCleaningService {
List<Ruling> vrs = new ArrayList<>();
for (Ruling vr : rulings) {
if (vr.vertical()) {
if (vr.isVertical()) {
vrs.add(vr);
}
}
List<Ruling> hrs = new ArrayList<>();
for (Ruling hr : rulings) {
if (hr.horizontal()) {
if (hr.isHorizontal()) {
hrs.add(hr);
}
}

View File

@ -159,10 +159,10 @@ public class SectionsBuilderService {
}
}
for (ClassificationSection section : sectionsOnPage) {
Float xMin = null;
Float yMin = null;
Float xMax = null;
Float yMax = null;
Double xMin = null;
Double yMin = null;
Double xMax = null;
Double yMax = null;
for (AbstractPageBlock abs : section.getPageBlocks()) {
if (abs.getPage() != page.getPageNumber()) {
@ -244,7 +244,7 @@ public class SectionsBuilderService {
.get(0)
.stream()
.map(cell -> {
Cell fakeCell = new Cell(cell.getPoints()[0], cell.getPoints()[2]);
Cell fakeCell = Cell.copy(cell);
fakeCell.setHeaderCells(Collections.singletonList(cell));
return fakeCell;
})

View File

@ -3,6 +3,8 @@ package com.knecon.fforesight.service.layoutparser.processor.services;
import static com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators.CELL_SIZE_COMPARATOR;
import static com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators.RECTANGLE_SIZE_COMPARATOR;
import java.awt.geom.AffineTransform;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
@ -11,22 +13,26 @@ import java.util.stream.Collectors;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Rectangle;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms;
import com.knecon.fforesight.service.layoutparser.processor.utils.DoubleComparisons;
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangularIntersectionFinder;
import com.knecon.fforesight.service.layoutparser.processor.utils.SpreadsheetFinder;
import lombok.SneakyThrows;
@Service
public class TableExtractionService {
private static final int MAX_TABLE_CONTAINED_CELLS_WITH_TEXT = 1;
private static final int TEXT_BLOCK_CONTAINMENT_TOLERANCE = 2;
private static final double TABLE_UNIFORMITY_THRESHOLD = 0.7;
@ -59,29 +65,31 @@ public class TableExtractionService {
}
}
var cells = new ArrayList<>(new HashSet<>(emptyCells));
DoubleComparisons.sort(cells, Rectangle.ILL_DEFINED_ORDER);
List<Cell> cells = new ArrayList<>(new HashSet<>(emptyCells));
DoubleComparisons.sort(cells, BoundingBox.ILL_DEFINED_ORDER);
List<Rectangle> spreadsheetAreas = SpreadsheetFinder.findSpreadsheetsFromCells(cells);
List<Rectangle2D> spreadsheetAreas = SpreadsheetFinder.findSpreadsheetsFromCells(cells);
// sort spreadsheetAreas by size (height * width) ascending so that cells are placed in the smallest tables first
// this way no cell duplication occurs when tables are contained in other tables and only the most inner table contains the cells
spreadsheetAreas.sort(RECTANGLE_SIZE_COMPARATOR);
List<TablePageBlock> tables = new ArrayList<>();
for (Rectangle area : spreadsheetAreas) {
for (Rectangle2D area : spreadsheetAreas) {
List<Cell> containedCells = new ArrayList<>();
for (Cell c : cells) {
if (c.hasMinimumSize() && area.contains(c)) {
if (c.hasMinimumSize() && area.contains(c.getBBoxInitialUserSpace())) {
containedCells.add(c);
}
}
var containedCellsWithText = containedCells.stream().filter(cell -> !cell.getTextBlocks().isEmpty()).toList();
var containedCellsWithText = containedCells.stream()
.filter(cell -> !cell.getTextBlocks().isEmpty())
.toList();
// verify if table would contain fewer cells with text than the threshold allows
if (containedCellsWithText.size() >= MAX_TABLE_CONTAINED_CELLS_WITH_TEXT && checkIfTableCellsAreUniform(containedCells)) {
tables.add(new TablePageBlock(containedCells, area, page.getRotation()));
tables.add(new TablePageBlock(containedCells, page.getRotation()));
cells.removeAll(containedCells);
}
}
@ -90,14 +98,18 @@ public class TableExtractionService {
int position = -1;
for (AbstractPageBlock pageBlock : page.getTextBlocks()) {
if (pageBlock instanceof TextPageBlock ? table.containsBlock((TextPageBlock) pageBlock) : table.contains(pageBlock) && position == -1) {
if (pageBlock instanceof TextPageBlock ? table.contains(pageBlock) : table.contains(pageBlock) && position == -1) {
position = page.getTextBlocks().indexOf(pageBlock);
}
}
if (position != -1) {
page.getTextBlocks().add(position, table);
var toBeRemoved = table.getCells().stream().map(Cell::getTextBlocks).flatMap(List::stream).toList();
var toBeRemoved = table.getCells()
.stream()
.map(Cell::getTextBlocks)
.flatMap(List::stream)
.toList();
// remove text blocks from the page that were also added with the table (from its contained cells)
page.getTextBlocks().removeAll(toBeRemoved);
}
@ -112,7 +124,7 @@ public class TableExtractionService {
}
Map<Long, List<Long>> cellsGroupedByRoundedWidth = containedCells.stream()
.map(Rectangle::getWidth)
.map(BoundingBox::getWidth)
.map(size -> Math.round(size / 10.0) * 10)
.collect(Collectors.groupingBy(Long::longValue));
@ -122,25 +134,26 @@ public class TableExtractionService {
private boolean doesCellContainTextBlock(Cell cell, TextPageBlock textBlock) {
if(cell.isEmpty() || textBlock.getSequences().isEmpty()) {
return false;
}
double x = textBlock.getPdfMinX();
double y = textBlock.getPdfMinY();
double w = textBlock.getPdfMaxX() - textBlock.getPdfMinX();
double h = textBlock.getPdfMaxY() - textBlock.getPdfMinY();
if (w <= 0 || h <= 0) {
return false;
}
double x0 = cell.getX();
double y0 = cell.getY();
return (x >= x0 - TEXT_BLOCK_CONTAINMENT_TOLERANCE && y >= y0 - TEXT_BLOCK_CONTAINMENT_TOLERANCE && (x + w) <= x0 + cell.getWidth() + 2 * TEXT_BLOCK_CONTAINMENT_TOLERANCE && (y + h) <= y0 + cell.getHeight() + 2 * TEXT_BLOCK_CONTAINMENT_TOLERANCE);
return cell.contains(textBlock, RedTextPosition.HEIGHT_PADDING);
}
public static List<Cell> findCells(List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
@SneakyThrows
public static List<Cell> findCells(List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines, PageInformation pageInformation) {
return RectangularIntersectionFinder.find(horizontalRulingLines, verticalRulingLines).stream().map(Cell::new).collect(Collectors.toList());
AffineTransform affineTransform = CoordinateTransforms.calculateInitialUserSpaceCoordsToImageCoords(pageInformation, 1);
/*
switch (pageInformation.rotationDegrees()) {
case 90 -> affineTransform.translate(RedTextPosition.HEIGHT_PADDING, 0); //although this is wrong, our text coordinates are wrong as well
case 180 -> affineTransform.translate(0, RedTextPosition.HEIGHT_PADDING);
case 270 -> affineTransform.translate(-RedTextPosition.HEIGHT_PADDING, 0);
default -> affineTransform.translate(0, -RedTextPosition.HEIGHT_PADDING);
}
*/
return RectangularIntersectionFinder.find(horizontalRulingLines, verticalRulingLines)
.stream()
.map(rect -> new Cell(rect, affineTransform))
.collect(Collectors.toList());
}
}

View File

@ -0,0 +1,99 @@
package com.knecon.fforesight.service.layoutparser.processor.services;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import lombok.experimental.UtilityClass;
/**
 * Flags words as underlined or struck through based on the rulings that run alongside them.
 * A ruling that is still classified as {@code OTHER}, spans the inset extent of a word and lies
 * inside the word's strikethrough or underline zone is re-classified accordingly, and the word
 * is marked with the matching flag.
 */
@UtilityClass
public class TextRulingsClassifier {

    // Multiplied with the text height; determines the height of the intersection interval for strikethrough lines.
    private static final double STRIKETHROUGH_ZONE = 0.5;
    // Multiplied with the text height; determines the height of the intersection interval for underline lines.
    private static final double UNDERLINE_ZONE = 0.2;
    // Multiplied with the word width and removed from both ends of the word; a ruling covering the remaining
    // extent is considered as strikethrough/underline.
    private static final double TEXT_BBOX_THRESHOLD_FACTOR = 0.15;


    /**
     * Classifies rulings as underline/strikethrough for every given word and flags the words.
     * Words with direction {@code ZERO} or {@code HALF_CIRCLE} are matched against horizontal rulings,
     * all other words against vertical rulings.
     *
     * @param words        words to check; their underline/strikethrough flags are set in place
     * @param cleanRulings rulings of the page; matching rulings are re-classified in place
     */
    public static void classifyUnderlinedAndStrikethroughText(List<TextPositionSequence> words, CleanRulings cleanRulings) {

        for (TextPositionSequence word : words) {
            TextDirection direction = word.getDir();
            boolean readsHorizontally = direction.equals(TextDirection.ZERO) || direction.equals(TextDirection.HALF_CIRCLE);
            if (readsHorizontally) {
                handleHorizontalText(cleanRulings, word);
            } else {
                handleVerticalText(cleanRulings, word);
            }
        }
    }


    /**
     * Handles a word that is not horizontal: looks for vertical rulings next to or through it.
     */
    private static void handleVerticalText(CleanRulings cleanRulings, TextPositionSequence word) {

        var bbox = word.getBBoxInitialUserSpace();

        // Shrink the word extent on both ends so a ruling does not have to cover the full word.
        // NOTE(review): the inset is derived from word.getWidth() for vertical text as well - confirm this is the extent along the reading direction.
        double inset = TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth();
        float spanMinY = (float) (bbox.getMinY() + inset);
        float spanMaxY = (float) (bbox.getMaxY() - inset);

        // The strikethrough zone is centered on the word, the underline zone on one of its x edges.
        float strikeCenter = (float) bbox.getCenterX();
        float strikeHalf = halfZoneExtent(word, STRIKETHROUGH_ZONE);
        float underlineCenter = (float) (word.getDir().equals(TextDirection.QUARTER_CIRCLE) ? bbox.getMaxX() : bbox.getMinX());
        float underlineHalf = halfZoneExtent(word, UNDERLINE_ZONE);

        float strikeFrom = strikeCenter - strikeHalf;
        float strikeTo = strikeCenter + strikeHalf;
        float underlineFrom = underlineCenter - underlineHalf;
        float underlineTo = underlineCenter + underlineHalf;

        float searchMinX = Math.min(underlineFrom, strikeFrom);
        float searchMaxX = Math.max(underlineTo, strikeTo);

        // NOTE(review): the span check assumes ruling.y1 <= ruling.y2 - confirm that rulings are stored with ordered endpoints.
        List<Ruling> candidates = cleanRulings.getVerticalsInXInterval(searchMinX, searchMaxX)
                .stream()
                .filter(ruling -> ruling.getClassification().equals(Ruling.Classification.OTHER) && ruling.y1 <= spanMinY && spanMaxY <= ruling.y2)
                .toList();

        for (Ruling candidate : candidates) {
            if (strikeFrom < candidate.x1 && candidate.x1 < strikeTo) {
                candidate.setClassification(Ruling.Classification.STRIKETROUGH);
                word.setStrikethrough(true);
            }
            if (underlineFrom < candidate.x1 && candidate.x1 < underlineTo) {
                candidate.setClassification(Ruling.Classification.UNDERLINE);
                word.setUnderline(true);
            }
        }
    }


    /**
     * Handles a horizontal word: looks for horizontal rulings below/above or through it.
     */
    private static void handleHorizontalText(CleanRulings cleanRulings, TextPositionSequence word) {

        var bbox = word.getBBoxInitialUserSpace();

        // Shrink the word extent on both ends so a ruling does not have to cover the full word.
        double inset = TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth();
        float spanMinX = (float) (bbox.getMinX() + inset);
        float spanMaxX = (float) (bbox.getMaxX() - inset);

        // The strikethrough zone is centered on the word, the underline zone on one of its y edges.
        float strikeCenter = (float) bbox.getCenterY();
        float strikeHalf = halfZoneExtent(word, STRIKETHROUGH_ZONE);
        float underlineCenter = (float) (word.getDir().equals(TextDirection.ZERO) ? bbox.getMinY() : bbox.getMaxY());
        float underlineHalf = halfZoneExtent(word, UNDERLINE_ZONE);

        float strikeFrom = strikeCenter - strikeHalf;
        float strikeTo = strikeCenter + strikeHalf;
        float underlineFrom = underlineCenter - underlineHalf;
        float underlineTo = underlineCenter + underlineHalf;

        float searchMinY = Math.min(underlineFrom, strikeFrom);
        float searchMaxY = Math.max(underlineTo, strikeTo);

        // NOTE(review): the span check assumes ruling.x1 <= ruling.x2 - confirm that rulings are stored with ordered endpoints.
        List<Ruling> candidates = cleanRulings.getHorizontalsInYInterval(searchMinY, searchMaxY)
                .stream()
                .filter(ruling -> ruling.getClassification().equals(Ruling.Classification.OTHER) && ruling.x1 <= spanMinX && spanMaxX <= ruling.x2)
                .toList();

        for (Ruling candidate : candidates) {
            if (strikeFrom < candidate.y1 && candidate.y1 < strikeTo) {
                candidate.setClassification(Ruling.Classification.STRIKETROUGH);
                word.setStrikethrough(true);
            }
            if (underlineFrom < candidate.y1 && candidate.y1 < underlineTo) {
                candidate.setClassification(Ruling.Classification.UNDERLINE);
                word.setUnderline(true);
            }
        }
    }


    /**
     * Half of the zone height: the direction-adjusted text height scaled by the given zone factor, divided by two.
     */
    private static float halfZoneExtent(TextPositionSequence word, double zoneFactor) {

        return (float) ((word.getBBoxDirAdj().getHeight() * zoneFactor) / 2);
    }

}

View File

@ -1,7 +1,5 @@
package com.knecon.fforesight.service.layoutparser.processor.services.blockification;
import static java.util.stream.Collectors.toSet;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
@ -9,21 +7,17 @@ import java.util.ListIterator;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.DocstrumSegmentationService;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.DoubleUtils;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil;
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations;
import lombok.RequiredArgsConstructor;
@ -37,22 +31,60 @@ public class DocstrumBlockificationService {
static final float THRESHOLD = 1f;
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Cell> cells, boolean xyOrder) {
public ClassificationPage blockify(List<TextPositionSequence> textPositions,
CleanRulings rulings,
boolean xyOrder,
LayoutparsingVisualizations visualizations,
LayoutParsingType layoutParsingType) {
CleanRulings usedRulings = RectangleTransformations.extractRulings(cells);
CleanRulings usedRulings = rulings.withoutTextRulings();
var zones = docstrumSegmentationService.segmentPage(textPositions, xyOrder);
var pageBlocks = toAbstractPageBlocks(zones, usedRulings.getHorizontal(), usedRulings.getVertical(), xyOrder);
var zones = docstrumSegmentationService.segmentPage(textPositions, xyOrder, usedRulings, visualizations);
if (!textPositions.isEmpty()) {
visualizations.addZoneVisualizations(zones, textPositions.get(0).getPage());
visualizations.addLineVisualizationsFromZones(zones, textPositions.get(0).getPage());
visualizations.addCharactersWithNeighbours(zones, textPositions.get(0).getPage());
}
var pageBlocks = toAbstractPageBlocks(zones, xyOrder, usedRulings);
if (xyOrder) {
sortPageBlocksXThenY(pageBlocks);
}
var classificationPage = new ClassificationPage(pageBlocks);
classificationPage.setCleanRulings(rulings);
mergeIntersectingBlocks(classificationPage.getTextBlocks(), 0, 0);
mergeIntersectingBlocks(classificationPage, usedRulings, 0, 0);
if (layoutParsingType == LayoutParsingType.DOCUMINE || layoutParsingType == LayoutParsingType.REDACT_MANAGER) {
combineBlocks(classificationPage);
}
if (layoutParsingType == LayoutParsingType.CLARIFYND) {
mergeIntersectingBlocks(classificationPage, usedRulings, 0, 6.5f);
}
return classificationPage;
}
private List<AbstractPageBlock> toAbstractPageBlocks(List<Zone> zones, List<Ruling> horizontalRulings, List<Ruling> verticalRulings, boolean xyOrder) {
/**
 * Sorts the page blocks in place in two passes.
 * <p>
 * Pass one orders the blocks by {@code minY} and then by {@code minX}, both compared through
 * {@code DoubleUtils.compareDouble} with {@code THRESHOLD}.
 * Pass two re-orders blocks whose {@code minY} values differ by less than 5 so that the block with the
 * smaller {@code minX} comes first; all other pairs compare as equal and keep their order from pass one.
 * <p>
 * The comparator of pass two reports both directions of a pair (-1 and +1). Returning only -1 or 0 breaks
 * the sign-symmetry requirement of {@link Comparator}, which can make {@link List#sort} produce
 * order-dependent results or throw an {@link IllegalArgumentException}.
 * NOTE(review): tolerance-based comparison is still not transitive in general - confirm this is acceptable for the expected inputs.
 *
 * @param pageBlocks blocks of one page; the list is re-ordered in place
 */
private static void sortPageBlocksXThenY(List<AbstractPageBlock> pageBlocks) {

    pageBlocks.sort(Comparator.comparing(AbstractPageBlock::getMinY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
            .thenComparing(AbstractPageBlock::getMinX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));

    pageBlocks.sort((first, second) -> {
        // Blocks that do not start at (almost) the same y are left in the order of the first pass.
        boolean startAtSameHeight = Math.abs(first.getMinY() - second.getMinY()) < 5;
        if (!startAtSameHeight) {
            return 0;
        }
        if (first.getMinX() < second.getMinX()) {
            return -1;
        }
        if (first.getMinX() > second.getMinX()) {
            return 1;
        }
        return 0;
    });
}
private List<AbstractPageBlock> toAbstractPageBlocks(List<Zone> zones, boolean xyOrder, CleanRulings usedRulings) {
List<AbstractPageBlock> abstractPageBlocks = new ArrayList<>();
zones.forEach(zone -> {
@ -66,7 +98,7 @@ public class DocstrumBlockificationService {
});
});
abstractPageBlocks.addAll(splitZonesAtRulings(textPositionSequences, horizontalRulings, verticalRulings));
abstractPageBlocks.add(buildTextBlock(textPositionSequences, 0));
});
if (xyOrder) {
@ -89,6 +121,7 @@ public class DocstrumBlockificationService {
TextPageBlock previous = new TextPageBlock();
ListIterator<AbstractPageBlock> itty = page.getTextBlocks().listIterator();
CleanRulings usedRulings = page.getCleanRulings().withoutTextRulings();
while (itty.hasNext()) {
AbstractPageBlock block = itty.next();
@ -100,7 +133,7 @@ public class DocstrumBlockificationService {
if (previous != null && !previous.getSequences().isEmpty()) {
if (current.getDir() != previous.getDir()) {
if (current.getDir() != previous.getDir() || usedRulings.lineBetween(current, previous)) {
previous = current;
continue;
}
@ -120,7 +153,7 @@ public class DocstrumBlockificationService {
continue;
}
if (previous.almostIntersects(current, 0, 0)) {
if (previous.intersects(current)) {
previous = combineBlocksAndResetIterator(previous, current, itty, previous.isToDuplicate());
continue;
}
@ -139,7 +172,7 @@ public class DocstrumBlockificationService {
previous = current;
}
mergeIntersectingBlocks(page.getTextBlocks(), 0, 6.5f);
mergeIntersectingBlocks(page, usedRulings, 0, 6.5f);
}
@ -230,8 +263,9 @@ public class DocstrumBlockificationService {
}
public void mergeIntersectingBlocks(List<AbstractPageBlock> blocks, float xThreshold, float yThreshold) {
public void mergeIntersectingBlocks(ClassificationPage page, CleanRulings usedRulings, float xThreshold, float yThreshold) {
var blocks = page.getTextBlocks();
ListIterator<AbstractPageBlock> itty = blocks.listIterator();
while (itty.hasNext()) {
AbstractPageBlock block = itty.next();
@ -267,7 +301,11 @@ public class DocstrumBlockificationService {
TextPageBlock inner = (TextPageBlock) abstractPageBlock;
if (current.getDir() == inner.getDir() && current.almostIntersects(inner, yThreshold, xThreshold)) {
if (usedRulings.lineBetween(current, blocks.get(i))) {
continue;
}
if (current.getDir() == inner.getDir() && current.intersects(inner, yThreshold, xThreshold)) {
boolean toDuplicate = current.isToDuplicate() || inner.isToDuplicate();
current.getSequences().addAll(inner.getSequences());
@ -351,111 +389,7 @@ public class DocstrumBlockificationService {
/**
 * Builds one {@link TextPageBlock} covering all given words and stores the most frequent
 * font, style, font size, text height and space width of those words on it.
 *
 * @param wordBlockList words that make up the block
 * @param indexOnPage   not read by this method
 * @return the assembled block, or {@code null} when {@code wordBlockList} is empty
 */
public static TextPageBlock buildTextBlock(List<TextPositionSequence> wordBlockList, int indexOnPage) {

    FloatFrequencyCounter heights = new FloatFrequencyCounter();
    FloatFrequencyCounter fontSizes = new FloatFrequencyCounter();
    FloatFrequencyCounter spaceWidths = new FloatFrequencyCounter();
    StringFrequencyCounter fonts = new StringFrequencyCounter();
    StringFrequencyCounter styles = new StringFrequencyCounter();

    TextPageBlock block = null;
    for (TextPositionSequence word : wordBlockList) {
        heights.add(word.getTextHeight());
        fontSizes.add(word.getFontSize());
        spaceWidths.add(word.getSpaceWidth());
        fonts.add(word.getFont());
        styles.add(word.getFontStyle());

        if (block != null) {
            // Enlarge the block to the union of itself and the current word.
            TextPageBlock merged = block.union(word);
            block.resize(merged.getMinX(), merged.getMinY(), merged.getWidth(), merged.getHeight());
            continue;
        }

        // The first word seeds the block; the complete word list is handed to the constructor.
        block = new TextPageBlock(word.getMinXDirAdj(), word.getMaxXDirAdj(), word.getMinYDirAdj(), word.getMaxYDirAdj(), wordBlockList, word.getRotation());
    }

    if (block == null) {
        return null;
    }

    block.setMostPopularWordFont(fonts.getMostPopular());
    block.setMostPopularWordStyle(styles.getMostPopular());
    block.setMostPopularWordFontSize(fontSizes.getMostPopular());
    block.setMostPopularWordHeight(heights.getMostPopular());
    block.setMostPopularWordSpaceWidth(spaceWidths.getMostPopular());
    block.setHighestFontSize(fontSizes.getHighest());

    var sequences = block.getSequences();
    if (sequences != null) {
        // Only when every word shares the same minY (rounded to 3 decimals) are the words ordered by minX.
        long distinctRoundedMinY = sequences.stream()
                .map(sequence -> round(sequence.getMinYDirAdj(), 3))
                .distinct()
                .count();
        if (distinctRoundedMinY == 1) {
            sequences.sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj));
        }
    }
    return block;
}
/**
 * Tests whether a ruling crosses one of the segments connecting the current chunk with {@code word}.
 * Two segments are probed, each against both ruling lists:
 * (maxX, minY) to (word minX, word minY) and (minX, minY) to (word minX, word maxY).
 *
 * @param minX                  minimum x of the current chunk
 * @param minY                  minimum y of the current chunk
 * @param maxX                  maximum x of the current chunk
 * @param maxY                  not read by this method
 * @param word                  the word that may be appended to the chunk
 * @param horizontalRulingLines horizontal rulings to test
 * @param verticalRulingLines   vertical rulings to test
 * @return {@code true} as soon as one of the four probes finds an intersecting ruling
 */
private boolean isSplitByRuling(float minX,
                                float minY,
                                float maxX,
                                float maxY,
                                TextPositionSequence word,
                                List<Ruling> horizontalRulingLines,
                                List<Ruling> verticalRulingLines) {

    // Word properties shared by all probes are read once; the getters are assumed to be side-effect free.
    float wordMinX = word.getMinXDirAdj();
    float degrees = word.getDir().getDegrees();
    float width = word.getPageWidth();
    float height = word.getPageHeight();

    if (isSplitByRuling(maxX, minY, wordMinX, word.getMinYDirAdj(), verticalRulingLines, degrees, width, height)) {
        return true;
    }
    if (isSplitByRuling(minX, minY, wordMinX, word.getMaxYDirAdj(), horizontalRulingLines, degrees, width, height)) {
        return true;
    }
    if (isSplitByRuling(maxX, minY, wordMinX, word.getMinYDirAdj(), horizontalRulingLines, degrees, width, height)) {
        return true;
    }
    return isSplitByRuling(minX, minY, wordMinX, word.getMaxYDirAdj(), verticalRulingLines, degrees, width, height);
}
/**
 * Checks whether any ruling, once converted to text-direction-adjusted coordinates, intersects the
 * segment from ({@code previousX2}, {@code previousY1}) to ({@code currentX1}, {@code currentY1}).
 *
 * @param rulingLines rulings to test
 * @param dir         text direction in degrees, passed to the coordinate conversion
 * @param pageWidth   page width, passed to the coordinate conversion
 * @param pageHeight  page height, passed to the coordinate conversion
 * @return {@code true} if at least one converted ruling intersects the segment
 */
private boolean isSplitByRuling(float previousX2, float previousY1, float currentX1, float currentY1, List<Ruling> rulingLines, float dir, float pageWidth, float pageHeight) {

    // anyMatch short-circuits, so rulings after the first hit are neither converted nor tested.
    return rulingLines.stream()
            .map(ruling -> RulingTextDirAdjustUtil.convertToDirAdj(ruling, dir, pageWidth, pageHeight))
            .anyMatch(line -> line.intersectsLine(previousX2, previousY1, currentX1, currentY1));
}
private static double round(float value, int decimalPoints) {
var d = Math.pow(10, decimalPoints);
return Math.round(value * d) / d;
return new TextPageBlock(wordBlockList);
}
}

View File

@ -15,11 +15,10 @@ import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBl
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.model.Orientation;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil;
@Service
public class DocuMineBlockificationService {
@ -34,15 +33,16 @@ public class DocuMineBlockificationService {
* This method must use text direction adjusted positions (DirAdj), where {0,0} is on the upper left. Never try to change this!
* Rulings (Table lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling.
*
* @param textPositions The words of a page.
* @param horizontalRulingLines Horizontal table lines.
* @param verticalRulingLines Vertical table lines.
* @param textPositions The textPositions of a page.
* @param cleanRulings All rulings on a page
* @return Page object that contains the Textblock and text statistics.
*/
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
public ClassificationPage blockify(List<TextPositionSequence> textPositions, CleanRulings cleanRulings) {
List<TextPositionSequence> chunkWords = new ArrayList<>();
List<AbstractPageBlock> chunkBlockList1 = new ArrayList<>();
List<AbstractPageBlock> textPageBlocks = new ArrayList<>();
CleanRulings usedRulings = cleanRulings.withoutTextRulings();
float minX = 1000;
float maxX = 0;
@ -59,23 +59,26 @@ public class DocuMineBlockificationService {
boolean splitByX = prev != null && maxX + 50 < word.getMinXDirAdj() && prev.getMinYDirAdj() == word.getMinYDirAdj();
boolean negativeXGap = prev != null && word.getMinXDirAdj() - minX < -5;
boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj();
boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, horizontalRulingLines, verticalRulingLines);
boolean isSplitByRuling = prev != null && usedRulings.lineBetween(prev, word);
boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir());
boolean splitByOtherFontAndOtherY = prev != null && Math.abs(prev.getMaxYDirAdj() - word.getMaxYDirAdj()) > word.getTextHeight() * 0.2 && (word.getFontStyle()
.contains("bold") && !prev.getFontStyle().contains("bold") || prev.getFontStyle().contains("bold") && !word.getFontStyle().contains("bold"));
.contains("bold")
&& !prev.getFontStyle()
.contains("bold") || prev.getFontStyle().contains("bold") && !word.getFontStyle().contains("bold"));
Matcher matcher = pattern.matcher(chunkWords.stream().collect(Collectors.joining(" ")).toString());
Matcher matcher = pattern.matcher(chunkWords.stream()
.collect(Collectors.joining(" ")).toString());
boolean startsOnSameX = Math.abs(minX - word.getMinXDirAdj()) < 5 && matcher.matches();
if (prev != null && (lineSeparation || startFromTop || splitByDir || isSplitByRuling || splitByOtherFontAndOtherY || negativeXGap || startsOnSameX)) {
Orientation prevOrientation = null;
if (!chunkBlockList1.isEmpty()) {
prevOrientation = chunkBlockList1.get(chunkBlockList1.size() - 1).getOrientation();
if (!textPageBlocks.isEmpty()) {
prevOrientation = textPageBlocks.get(textPageBlocks.size() - 1).getOrientation();
}
TextPageBlock cb1 = buildTextBlock(chunkWords);
chunkBlockList1.add(cb1);
TextPageBlock cb1 = new TextPageBlock(chunkWords);
textPageBlocks.add(cb1);
chunkWords = new ArrayList<>();
if (splitByX && !isSplitByRuling) {
@ -86,7 +89,11 @@ public class DocuMineBlockificationService {
wasSplitted = false;
cb1.setOrientation(Orientation.RIGHT);
splitX1 = null;
} else if (prevOrientation != null && prevOrientation.equals(Orientation.RIGHT) && (lineSeparation || !startFromTop || !splitByX || !newLineAfterSplit || !isSplitByRuling)) {
} else if (prevOrientation != null && prevOrientation.equals(Orientation.RIGHT) && (lineSeparation
|| !startFromTop
|| !splitByX
|| !newLineAfterSplit
|| !isSplitByRuling)) {
cb1.setOrientation(Orientation.LEFT);
}
@ -114,128 +121,12 @@ public class DocuMineBlockificationService {
}
}
TextPageBlock cb1 = buildTextBlock(chunkWords);
if (cb1 != null) {
chunkBlockList1.add(cb1);
}
textPageBlocks.add(new TextPageBlock(chunkWords));
return new ClassificationPage(chunkBlockList1);
return new ClassificationPage(textPageBlocks);
}
/**
 * Compares two values with a tolerance.
 *
 * @return {@code true} if the absolute difference of {@code f1} and {@code f2} is smaller than {@code THRESHOLD}
 */
private boolean equalsWithThreshold(float f1, float f2) {

    float distance = Math.abs(f1 - f2);
    return distance < THRESHOLD;
}
/**
 * Builds one {@link TextPageBlock} covering all given words and stores the most frequent
 * font, style, font size, text height and space width of those words on it.
 *
 * @param wordBlockList words that make up the block
 * @return the assembled block, or {@code null} when {@code wordBlockList} is empty
 */
private TextPageBlock buildTextBlock(List<TextPositionSequence> wordBlockList) {

    FloatFrequencyCounter heights = new FloatFrequencyCounter();
    FloatFrequencyCounter fontSizes = new FloatFrequencyCounter();
    FloatFrequencyCounter spaceWidths = new FloatFrequencyCounter();
    StringFrequencyCounter fonts = new StringFrequencyCounter();
    StringFrequencyCounter styles = new StringFrequencyCounter();

    TextPageBlock block = null;
    for (TextPositionSequence word : wordBlockList) {
        heights.add(word.getTextHeight());
        fontSizes.add(word.getFontSize());
        spaceWidths.add(word.getSpaceWidth());
        fonts.add(word.getFont());
        styles.add(word.getFontStyle());

        if (block != null) {
            // Enlarge the block to the union of itself and the current word.
            TextPageBlock merged = block.union(word);
            block.resize(merged.getMinX(), merged.getMinY(), merged.getWidth(), merged.getHeight());
            continue;
        }

        // The first word seeds the block; the complete word list is handed to the constructor.
        block = new TextPageBlock(word.getMinXDirAdj(), word.getMaxXDirAdj(), word.getMinYDirAdj(), word.getMaxYDirAdj(), wordBlockList, word.getRotation());
    }

    if (block == null) {
        return null;
    }

    block.setMostPopularWordFont(fonts.getMostPopular());
    block.setMostPopularWordStyle(styles.getMostPopular());
    block.setMostPopularWordFontSize(fontSizes.getMostPopular());
    block.setMostPopularWordHeight(heights.getMostPopular());
    block.setMostPopularWordSpaceWidth(spaceWidths.getMostPopular());
    block.setHighestFontSize(fontSizes.getHighest());

    var sequences = block.getSequences();
    if (sequences != null) {
        // Only when every word shares the same minY (rounded to 3 decimals) are the words ordered by minX.
        long distinctRoundedMinY = sequences.stream()
                .map(sequence -> round(sequence.getMinYDirAdj(), 3))
                .distinct()
                .count();
        if (distinctRoundedMinY == 1) {
            sequences.sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj));
        }
    }
    return block;
}
/**
 * Tests whether a ruling crosses one of the segments connecting the current chunk with {@code word}.
 * Two segments are probed, each against both ruling lists:
 * (maxX, minY) to (word minX, word minY) and (minX, minY) to (word minX, word maxY).
 *
 * @param minX                  minimum x of the current chunk
 * @param minY                  minimum y of the current chunk
 * @param maxX                  maximum x of the current chunk
 * @param maxY                  not read by this method
 * @param word                  the word that may be appended to the chunk
 * @param horizontalRulingLines horizontal rulings to test
 * @param verticalRulingLines   vertical rulings to test
 * @return {@code true} as soon as one of the four probes finds an intersecting ruling
 */
private boolean isSplitByRuling(float minX,
                                float minY,
                                float maxX,
                                float maxY,
                                TextPositionSequence word,
                                List<Ruling> horizontalRulingLines,
                                List<Ruling> verticalRulingLines) {

    // Word properties shared by all probes are read once; the getters are assumed to be side-effect free.
    float wordMinX = word.getMinXDirAdj();
    float degrees = word.getDir().getDegrees();
    float width = word.getPageWidth();
    float height = word.getPageHeight();

    if (isSplitByRuling(maxX, minY, wordMinX, word.getMinYDirAdj(), verticalRulingLines, degrees, width, height)) {
        return true;
    }
    if (isSplitByRuling(minX, minY, wordMinX, word.getMaxYDirAdj(), horizontalRulingLines, degrees, width, height)) {
        return true;
    }
    if (isSplitByRuling(maxX, minY, wordMinX, word.getMinYDirAdj(), horizontalRulingLines, degrees, width, height)) {
        return true;
    }
    return isSplitByRuling(minX, minY, wordMinX, word.getMaxYDirAdj(), verticalRulingLines, degrees, width, height);
}
/**
 * Checks whether any ruling, once converted to text-direction-adjusted coordinates, intersects the
 * segment from ({@code previousX2}, {@code previousY1}) to ({@code currentX1}, {@code currentY1}).
 *
 * @param rulingLines rulings to test
 * @param dir         text direction in degrees, passed to the coordinate conversion
 * @param pageWidth   page width, passed to the coordinate conversion
 * @param pageHeight  page height, passed to the coordinate conversion
 * @return {@code true} if at least one converted ruling intersects the segment
 */
private boolean isSplitByRuling(float previousX2, float previousY1, float currentX1, float currentY1, List<Ruling> rulingLines, float dir, float pageWidth, float pageHeight) {

    // anyMatch short-circuits, so rulings after the first hit are neither converted nor tested.
    return rulingLines.stream()
            .map(ruling -> RulingTextDirAdjustUtil.convertToDirAdj(ruling, dir, pageWidth, pageHeight))
            .anyMatch(line -> line.intersectsLine(previousX2, previousY1, currentX1, currentY1));
}
/**
 * Rounds {@code value} to the given number of decimal places.
 *
 * @param value         the number to round
 * @param decimalPoints number of decimal places to keep
 * @return {@code value} scaled by 10^decimalPoints, rounded with {@link Math#round(double)}, and scaled back
 */
private double round(float value, int decimalPoints) {

    final double scale = Math.pow(10, decimalPoints);
    final long scaled = Math.round(value * scale);
    return scaled / scale;
}
}

View File

@ -13,14 +13,11 @@ import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBl
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.model.Orientation;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil;
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations;
@SuppressWarnings("all")
@Service
@ -34,12 +31,13 @@ public class RedactManagerBlockificationService {
* This method must use text direction adjusted positions (DirAdj), where {0,0} is on the upper left. Never try to change this!
* Rulings (TablePageBlock lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling.
*
* @param textPositions The words of a page.
* @param textPositions The words of a page.
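* @param cleanRulings   Rulings of the page; text rulings are removed before they are used to split blocks.
* @param visualizations Receives the text blocks built for this page so they can be visualized.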
* @return Page object that contains the Textblock and text statistics.
*/
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Cell> cells) {
public ClassificationPage blockify(List<TextPositionSequence> textPositions, CleanRulings cleanRulings, LayoutparsingVisualizations visualizations) {
CleanRulings usedRulings = RectangleTransformations.extractRulings(cells);
CleanRulings usedRulings = cleanRulings.withoutTextRulings();
int indexOnPage = 0;
List<TextPositionSequence> chunkWords = new ArrayList<>();
@ -57,7 +55,7 @@ public class RedactManagerBlockificationService {
boolean splitByX = prev != null && maxX + 50 < word.getMinXDirAdj() && prev.getMinYDirAdj() == word.getMinYDirAdj();
boolean xIsBeforeFirstX = prev != null && word.getMinXDirAdj() < minX;
boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj();
boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, usedRulings.getHorizontal(), usedRulings.getVertical());
boolean isSplitByRuling = prev != null && usedRulings.lineBetween(prev, word);
boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir());
if (prev != null && (lineSeparation || startFromTop || splitByX || splitByDir || isSplitByRuling)) {
@ -67,7 +65,7 @@ public class RedactManagerBlockificationService {
prevOrientation = chunkBlockList.get(chunkBlockList.size() - 1).getOrientation();
}
TextPageBlock cb1 = buildTextBlock(chunkWords, indexOnPage);
TextPageBlock cb1 = new TextPageBlock(chunkWords);
indexOnPage++;
chunkBlockList.add(cb1);
@ -81,7 +79,11 @@ public class RedactManagerBlockificationService {
wasSplitted = false;
cb1.setOrientation(Orientation.RIGHT);
splitX1 = null;
} else if (prevOrientation != null && prevOrientation.equals(Orientation.RIGHT) && (lineSeparation || !startFromTop || !splitByX || !newLineAfterSplit || !isSplitByRuling)) {
} else if (prevOrientation != null && prevOrientation.equals(Orientation.RIGHT) && (lineSeparation
|| !startFromTop
|| !splitByX
|| !newLineAfterSplit
|| !isSplitByRuling)) {
cb1.setOrientation(Orientation.LEFT);
}
@ -109,8 +111,8 @@ public class RedactManagerBlockificationService {
}
}
TextPageBlock cb1 = buildTextBlock(chunkWords, indexOnPage);
if (cb1 != null) {
if (!chunkWords.isEmpty()) {
TextPageBlock cb1 = new TextPageBlock(chunkWords);
chunkBlockList.add(cb1);
}
@ -150,8 +152,11 @@ public class RedactManagerBlockificationService {
TextPageBlock block = (TextPageBlock) itty.next();
if (previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation().equals(Orientation.LEFT) && equalsWithThreshold(block.getMaxY(),
previous.getMaxY()) || previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation()
.equals(Orientation.RIGHT) && equalsWithThreshold(block.getMaxY(), previous.getMaxY())) {
previous.getMaxY())
|| previous != null
&& previous.getOrientation().equals(Orientation.LEFT)
&& block.getOrientation().equals(Orientation.RIGHT)
&& equalsWithThreshold(block.getMaxY(), previous.getMaxY())) {
previous.add(block);
itty.remove();
continue;
@ -159,123 +164,19 @@ public class RedactManagerBlockificationService {
previous = block;
}
if (!textPositions.isEmpty()) {
visualizations.addTextBlockVisualizations(chunkBlockList.stream()
.map(tb -> (TextPageBlock) tb)
.toList(), textPositions.get(0).getPage());
}
return new ClassificationPage(chunkBlockList);
}
private boolean equalsWithThreshold(float f1, float f2) {
private boolean equalsWithThreshold(double f1, double f2) {
return Math.abs(f1 - f2) < THRESHOLD;
}
/**
 * Builds one {@link TextPageBlock} covering all given words and stores the most frequent
 * font, style, font size, text height and space width of those words on it.
 *
 * @param wordBlockList words that make up the block
 * @param indexOnPage   not read by this method
 * @return the assembled block, or {@code null} when {@code wordBlockList} is empty
 */
private TextPageBlock buildTextBlock(List<TextPositionSequence> wordBlockList, int indexOnPage) {

    FloatFrequencyCounter heights = new FloatFrequencyCounter();
    FloatFrequencyCounter fontSizes = new FloatFrequencyCounter();
    FloatFrequencyCounter spaceWidths = new FloatFrequencyCounter();
    StringFrequencyCounter fonts = new StringFrequencyCounter();
    StringFrequencyCounter styles = new StringFrequencyCounter();

    TextPageBlock block = null;
    for (TextPositionSequence word : wordBlockList) {
        heights.add(word.getTextHeight());
        fontSizes.add(word.getFontSize());
        spaceWidths.add(word.getSpaceWidth());
        fonts.add(word.getFont());
        styles.add(word.getFontStyle());

        if (block != null) {
            // Enlarge the block to the union of itself and the current word.
            TextPageBlock merged = block.union(word);
            block.resize(merged.getMinX(), merged.getMinY(), merged.getWidth(), merged.getHeight());
            continue;
        }

        // The first word seeds the block; the complete word list is handed to the constructor.
        block = new TextPageBlock(word.getMinXDirAdj(), word.getMaxXDirAdj(), word.getMinYDirAdj(), word.getMaxYDirAdj(), wordBlockList, word.getRotation());
    }

    if (block == null) {
        return null;
    }

    block.setMostPopularWordFont(fonts.getMostPopular());
    block.setMostPopularWordStyle(styles.getMostPopular());
    block.setMostPopularWordFontSize(fontSizes.getMostPopular());
    block.setMostPopularWordHeight(heights.getMostPopular());
    block.setMostPopularWordSpaceWidth(spaceWidths.getMostPopular());
    block.setHighestFontSize(fontSizes.getHighest());

    var sequences = block.getSequences();
    if (sequences != null) {
        // Only when every word shares the same minY (rounded to 3 decimals) are the words ordered by minX.
        long distinctRoundedMinY = sequences.stream()
                .map(sequence -> round(sequence.getMinYDirAdj(), 3))
                .distinct()
                .count();
        if (distinctRoundedMinY == 1) {
            sequences.sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj));
        }
    }
    return block;
}
/**
 * Tests whether a ruling crosses one of the segments connecting the current chunk with {@code word}.
 * Two segments are probed, each against both ruling lists:
 * (maxX, minY) to (word minX, word minY) and (minX, minY) to (word minX, word maxY).
 *
 * @param minX                  minimum x of the current chunk
 * @param minY                  minimum y of the current chunk
 * @param maxX                  maximum x of the current chunk
 * @param maxY                  not read by this method
 * @param word                  the word that may be appended to the chunk
 * @param horizontalRulingLines horizontal rulings to test
 * @param verticalRulingLines   vertical rulings to test
 * @return {@code true} as soon as one of the four probes finds an intersecting ruling
 */
private boolean isSplitByRuling(float minX,
                                float minY,
                                float maxX,
                                float maxY,
                                TextPositionSequence word,
                                List<Ruling> horizontalRulingLines,
                                List<Ruling> verticalRulingLines) {

    // Word properties shared by all probes are read once; the getters are assumed to be side-effect free.
    float wordMinX = word.getMinXDirAdj();
    float degrees = word.getDir().getDegrees();
    float width = word.getPageWidth();
    float height = word.getPageHeight();

    if (isSplitByRuling(maxX, minY, wordMinX, word.getMinYDirAdj(), verticalRulingLines, degrees, width, height)) {
        return true;
    }
    if (isSplitByRuling(minX, minY, wordMinX, word.getMaxYDirAdj(), horizontalRulingLines, degrees, width, height)) {
        return true;
    }
    if (isSplitByRuling(maxX, minY, wordMinX, word.getMinYDirAdj(), horizontalRulingLines, degrees, width, height)) {
        return true;
    }
    return isSplitByRuling(minX, minY, wordMinX, word.getMaxYDirAdj(), verticalRulingLines, degrees, width, height);
}
/**
 * Checks whether any ruling, once converted to text-direction-adjusted coordinates, intersects the
 * segment from ({@code previousX2}, {@code previousY1}) to ({@code currentX1}, {@code currentY1}).
 *
 * @param rulingLines rulings to test
 * @param dir         text direction in degrees, passed to the coordinate conversion
 * @param pageWidth   page width, passed to the coordinate conversion
 * @param pageHeight  page height, passed to the coordinate conversion
 * @return {@code true} if at least one converted ruling intersects the segment
 */
private boolean isSplitByRuling(float previousX2, float previousY1, float currentX1, float currentY1, List<Ruling> rulingLines, float dir, float pageWidth, float pageHeight) {

    // anyMatch short-circuits, so rulings after the first hit are neither converted nor tested.
    return rulingLines.stream()
            .map(ruling -> RulingTextDirAdjustUtil.convertToDirAdj(ruling, dir, pageWidth, pageHeight))
            .anyMatch(line -> line.intersectsLine(previousX2, previousY1, currentX1, currentY1));
}
/**
 * Rounds {@code value} to the given number of decimal places.
 *
 * @param value         the number to round
 * @param decimalPoints number of decimal places to keep
 * @return {@code value} scaled by 10^decimalPoints, rounded with {@link Math#round(double)}, and scaled back
 */
private double round(float value, int decimalPoints) {

    final double scale = Math.pow(10, decimalPoints);
    final long scaled = Math.round(value * scale);
    return scaled / scale;
}
}

View File

@ -14,6 +14,8 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.utils.HeaderFooterDetection;
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
import lombok.RequiredArgsConstructor;
@ -25,7 +27,7 @@ import lombok.extern.slf4j.Slf4j;
public class DocuMineClassificationService {
private final HeadlineClassificationService headlineClassificationService;
private static final Pattern pattern = Pattern.compile("^(\\d{1,2}\\.){1,3}\\d{1,2}\\.?\\s[0-9A-Za-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE);
private static final Pattern pattern = Pattern.compile("^([1-9]\\d?\\.){1,3}\\d{1,2}\\.?\\s[0-9A-Za-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE);
private static final Pattern pattern2 = Pattern.compile("\\p{L}{3,}", Pattern.CASE_INSENSITIVE);
private static final Pattern pattern3 = Pattern.compile("^(\\d{1,1}\\.){1,3}\\d{1,2}\\.?\\s[a-z]{1,2}\\/[a-z]{1,2}.*");
@ -72,15 +74,26 @@ public class DocuMineClassificationService {
return;
}
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER)
|| PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null
|| textBlock.getHighestFontSize() <= document.getFontSizeCounter()
.getMostPopular())) {
|| (PositionUtils.isOverBodyTextFrame(bodyTextFrame,
textBlock,
page.getRotation()) && (document.getFontSizeCounter().getMostPopular()
== null
|| textBlock.getHighestFontSize()
<= document.getFontSizeCounter()
.getMostPopular()))
|| HeaderFooterDetection.isLikelyHeader(textBlock, document, page)) {
textBlock.setClassification(PageBlockType.HEADER);
} else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER)
|| PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null
|| textBlock.getHighestFontSize() <= document.getFontSizeCounter()
.getMostPopular())) {
|| (PositionUtils.isUnderBodyTextFrame(bodyTextFrame,
textBlock,
page.getRotation())
&& (document.getFontSizeCounter().getMostPopular()
== null
|| textBlock.getHighestFontSize()
<= document.getFontSizeCounter()
.getMostPopular()))
|| HeaderFooterDetection.isLikelyFooter(textBlock, document, page)) {
textBlock.setClassification(PageBlockType.FOOTER);
} else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter().getMostPopular()) > 2.5
&& textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks().size() == 1)) {
@ -92,19 +105,19 @@ public class DocuMineClassificationService {
|| textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular())
&& PositionUtils.getApproxLineCount(textBlock) < 5.9
&& (textBlock.getMostPopularWordStyle().contains("bold") && Character.isDigit(textBlock.toString().charAt(0)) && !matcher2.matches() && !textBlock.toString()
&& (textBlock.getMostPopularWordStyle().contains("bold") && Character.isDigit(textBlock.toString().charAt(0)) && matcher2.reset().find() && !textBlock.toString()
.contains(":")
|| textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT)) && !matcher2.matches() && !textBlock.toString().contains(":")
|| textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT)) && matcher2.reset().find() && !textBlock.toString().contains(":")
|| textBlock.toString().startsWith("APPENDIX")
|| textBlock.toString().startsWith("FIGURE")
|| textBlock.toString().startsWith("TABLE"))
&& !textBlock.toString().endsWith(":")
&& matcher2.find()) {
&& matcher2.reset().find()) {
PageBlockType headlineType = PageBlockType.getHeadlineType(1);
headlineClassificationService.classifyHeadline(textBlock, headlineType);
document.setHeadlines(true);
} else if (matcher.find() && PositionUtils.getApproxLineCount(textBlock) < 2.9 && matcher2.find() && !matcher3.matches()) {
} else if (matcher.reset().find() && PositionUtils.getApproxLineCount(textBlock) < 2.9 && matcher2.reset().find() && !matcher3.reset().matches()) {
PageBlockType headlineType = PageBlockType.getHeadlineType(2);
headlineClassificationService.classifyHeadline(textBlock, headlineType);
document.setHeadlines(true);

View File

@ -52,6 +52,9 @@ public class DocumentGraphFactory {
public Document buildDocumentGraph(LayoutParsingType layoutParsingType, ClassificationDocument document) {
Document documentGraph = new Document();
documentGraph.setVisualizations(document.getVisualizations());
Context context = new Context(documentGraph);
document.getPages()
@ -88,14 +91,18 @@ public class DocumentGraphFactory {
}
public void addParagraphOrHeadline(GenericSemanticNode parentNode, TextPageBlock originalTextBlock, Context context, List<TextPageBlock> textBlocksToMerge) {
public void addParagraphOrHeadline(GenericSemanticNode parentNode,
TextPageBlock originalTextBlock,
Context context,
List<TextPageBlock> textBlocksToMerge,
LayoutParsingType layoutParsingType) {
Page page = context.getPage(originalTextBlock.getPage());
GenericSemanticNode node;
if (originalTextBlock.isHeadline()) {
node = Headline.builder().documentTree(context.getDocumentTree()).build();
} else if (originalTextBlock.isToDuplicate()) {
} else if (originalTextBlock.isToDuplicate() && layoutParsingType.equals(LayoutParsingType.REDACT_MANAGER)) {
node = DuplicatedParagraph.builder().documentTree(context.getDocumentTree()).build();
} else {
node = Paragraph.builder().documentTree(context.getDocumentTree()).build();
@ -274,8 +281,7 @@ public class DocumentGraphFactory {
return pages.keySet()
.stream()
.filter(page -> page.getNumber() == pageIndex)
.findFirst()
.orElseThrow(() -> new NoSuchElementException(format("ClassificationPage with number %d not found", pageIndex)));
.findFirst().orElseThrow(() -> new NoSuchElementException(format("ClassificationPage with number %d not found", pageIndex)));
}
}

View File

@ -2,6 +2,7 @@ package com.knecon.fforesight.service.layoutparser.processor.services.factory;
import java.awt.geom.AffineTransform;
import java.awt.geom.Rectangle2D;
import java.util.Collection;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
@ -29,19 +30,22 @@ public class SearchTextWithTextPositionFactory {
public SearchTextWithTextPositionDto buildSearchTextToTextPositionDto(List<TextPositionSequence> sequences) {
if (sequences.isEmpty() || sequences.stream().allMatch(sequence -> sequence.getTextPositions().isEmpty())) {
if (sequences.isEmpty() || sequences.stream()
.allMatch(sequence -> sequence.getTextPositions().isEmpty())) {
return SearchTextWithTextPositionDto.empty();
}
Context context = new Context();
RedTextPosition currentTextPosition = sequences.get(0).getTextPositions().get(0);
RedTextPosition previousTextPosition = RedTextPosition.builder().unicode(" ").position(currentTextPosition.getPosition()).build();
RedTextPosition currentTextPosition = sequences.get(0).getTextPositions()
.get(0);
RedTextPosition previousTextPosition = RedTextPosition.builder().unicode(" ").bBoxDirAdj(currentTextPosition.getBBoxDirAdj()).build();
for (TextPositionSequence word : sequences) {
for (int i = 0; i < word.getTextPositions().size(); ++i) {
currentTextPosition = word.getTextPositions().get(i);
currentTextPosition = word.getTextPositions()
.get(i);
if (isLineBreak(currentTextPosition, previousTextPosition)) {
removeHyphenLinebreaks(context);
context.lineBreaksStringIdx.add(context.stringIdx);
@ -57,18 +61,21 @@ public class SearchTextWithTextPositionFactory {
++context.positionIdx;
}
previousTextPosition = RedTextPosition.builder().unicode(" ").position(previousTextPosition.getPosition()).build();
previousTextPosition = RedTextPosition.builder().unicode(" ").bBoxDirAdj(previousTextPosition.getBBoxDirAdj()).build();
context.stringBuilder.append(" ");
context.stringIdxToPositionIdx.add(context.positionIdx);
++context.stringIdx;
}
assert context.stringBuilder.length() == context.stringIdxToPositionIdx.size();
List<Rectangle2D> positions = sequences.stream()
.flatMap(sequence -> sequence.getTextPositions().stream().map(textPosition -> mapRedTextPositionToInitialUserSpace(textPosition, sequence)))
.map(TextPositionSequence::getTextPositions)
.flatMap(Collection::stream)
.map(RedTextPosition::getBBoxInitialUserSpace)
.toList();
assert context.stringBuilder.length() == context.stringIdxToPositionIdx.size();
return SearchTextWithTextPositionDto.builder()
.searchText(context.stringBuilder.toString())
.lineBreaks(context.lineBreaksStringIdx)
@ -153,7 +160,7 @@ public class SearchTextWithTextPositionFactory {
return false;
}
float deltaY = Math.abs(currentPosition.getYDirAdj() - previousPosition.getYDirAdj());
double deltaY = Math.abs(currentPosition.getYDirAdj() - previousPosition.getYDirAdj());
return deltaY >= currentPosition.getHeightDir();
}
@ -167,16 +174,16 @@ public class SearchTextWithTextPositionFactory {
private boolean isHyphen(String unicodeCharacter) {
return Objects.equals(unicodeCharacter, "-") || //
Objects.equals(unicodeCharacter, "~") || //
Objects.equals(unicodeCharacter, "") || //
Objects.equals(unicodeCharacter, "") || //
Objects.equals(unicodeCharacter, "") || //
Objects.equals(unicodeCharacter, "") || //
Objects.equals(unicodeCharacter, "") || //
Objects.equals(unicodeCharacter, "") || //
Objects.equals(unicodeCharacter, "") || //
Objects.equals(unicodeCharacter, "") || //
Objects.equals(unicodeCharacter, "\u00AD");
Objects.equals(unicodeCharacter, "~") || //
Objects.equals(unicodeCharacter, "") || //
Objects.equals(unicodeCharacter, "") || //
Objects.equals(unicodeCharacter, "") || //
Objects.equals(unicodeCharacter, "") || //
Objects.equals(unicodeCharacter, "") || //
Objects.equals(unicodeCharacter, "") || //
Objects.equals(unicodeCharacter, "") || //
Objects.equals(unicodeCharacter, "") || //
Objects.equals(unicodeCharacter, "\u00AD");
}

View File

@ -140,15 +140,15 @@ public class SectionNodeFactory {
if (abstractPageBlock instanceof TextPageBlock) {
switch (layoutParsingType) {
case REDACT_MANAGER, CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> {
case REDACT_MANAGER, DOCUMINE, CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> {
alreadyMerged.add(abstractPageBlock);
remainingBlocks.remove(abstractPageBlock);
DocumentGraphFactory.addParagraphOrHeadline(section, (TextPageBlock) abstractPageBlock, context, new ArrayList<>());
DocumentGraphFactory.addParagraphOrHeadline(section, (TextPageBlock) abstractPageBlock, context, new ArrayList<>(), layoutParsingType);
}
default -> {
List<TextPageBlock> textBlocks = findTextBlocksWithSameClassificationAndAlignsY((TextPageBlock) abstractPageBlock, remainingBlocks);
alreadyMerged.addAll(textBlocks);
DocumentGraphFactory.addParagraphOrHeadline(section, (TextPageBlock) abstractPageBlock, context, textBlocks);
DocumentGraphFactory.addParagraphOrHeadline(section, (TextPageBlock) abstractPageBlock, context, textBlocks, layoutParsingType);
}
}
} else if (abstractPageBlock instanceof TablePageBlock tablePageBlock) {

View File

@ -45,7 +45,10 @@ public class TableNodeFactory {
.flatMap(Collection::stream)
.toList();
Table table = Table.builder().documentTree(context.getDocumentTree()).numberOfCols(mergedRows.isEmpty() ? 0 : mergedRows.get(0).size()).numberOfRows(mergedRows.size())
Table table = Table.builder()
.documentTree(context.getDocumentTree())
.numberOfCols(mergedRows.isEmpty() ? 0 : mergedRows.get(0).size())
.numberOfRows(mergedRows.size())
.build();
pages.forEach(page -> addTableToPage(page, parentNode, table));
@ -128,7 +131,12 @@ public class TableNodeFactory {
Page page = context.getPage(cell.getPageNumber());
TableCell tableCell = TableCell.builder().documentTree(context.getDocumentTree()).row(rowIndex).col(colIndex).header(cell.isHeaderCell()).bBox(cell.getBounds2D())
TableCell tableCell = TableCell.builder()
.documentTree(context.getDocumentTree())
.row(rowIndex)
.col(colIndex)
.header(cell.isHeaderCell())
.bBox(cell.getBBoxInitialUserSpace())
.build();
page.getMainBody().add(tableCell);
@ -160,7 +168,7 @@ public class TableNodeFactory {
tableCell.setLeafTextBlock(textBlock);
} else {
cell.getTextBlocks()
.forEach(tb -> DocumentGraphFactory.addParagraphOrHeadline(tableCell, tb, context, emptyList()));
.forEach(tb -> DocumentGraphFactory.addParagraphOrHeadline(tableCell, tb, context, emptyList(), layoutParsingType));
}
}

View File

@ -13,6 +13,9 @@ import org.apache.pdfbox.rendering.ImageType;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms;
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
import lombok.SneakyThrows;
@Service
@ -30,7 +33,7 @@ public class FindGraphicsRaster {
var renderer = new PDFRenderer(doc);
var img = renderer.renderImageWithDPI(pageInformation.number() - 1, DPI, ImageType.GRAY);
var imageCtm = getImageCTM(pageInformation, img.getWidth());
var imageCtm = CoordinateTransforms.calculateImageCoordsToInitialUserSpaceCoords(pageInformation, CoordinateTransforms.calculateScalingFactor(pageInformation, img.getWidth()));
return findCCBoundingBoxes(img, remove, THRESHOLD, DPI / 72, imageCtm);
}
@ -131,42 +134,4 @@ public class FindGraphicsRaster {
}
public AffineTransform getImageCTM(PageInformation pageInformation, int imageWidth) {
double scalingFactor = calculateScalingFactor(pageInformation, imageWidth);
AffineTransform imageToCropBoxScaling = new AffineTransform(scalingFactor, 0, 0, scalingFactor, -pageInformation.minX(), -pageInformation.minY());
AffineTransform mirrorMatrix = new AffineTransform(1, 0, 0, -1, 0, pageInformation.height());
AffineTransform rotationMatrix = switch (pageInformation.rotationDegrees()) {
case 90 -> new AffineTransform(0, 1, -1, 0, pageInformation.height(), 0);
case 180 -> new AffineTransform(-1, 0, 0, -1, pageInformation.width(), pageInformation.height());
case 270 -> new AffineTransform(0, -1, 1, 0, pageInformation.width() - pageInformation.height(), pageInformation.height()); // results from 90 + 180 rotations
default -> new AffineTransform();
};
// matrix multiplication is performed from right to left, so the order is reversed.
// scaling -> mirror -> rotation
AffineTransform resultMatrix = new AffineTransform();
resultMatrix.concatenate(rotationMatrix);
resultMatrix.concatenate(mirrorMatrix);
resultMatrix.concatenate(imageToCropBoxScaling);
return resultMatrix;
}
private double calculateScalingFactor(PageInformation pageInformation, int imageWidth) {
// PDFBox always returns page height and width based on rotation
double pageWidth;
if (pageInformation.rotationDegrees() == 90 || pageInformation.rotationDegrees() == 270) {
pageWidth = pageInformation.height();
} else {
pageWidth = pageInformation.width();
}
return pageWidth / imageWidth;
}
}

View File

@ -1,7 +1,6 @@
package com.knecon.fforesight.service.layoutparser.processor.services.graphics;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
@ -9,10 +8,11 @@ import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.services.factory.SearchTextWithTextPositionFactory;
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import lombok.RequiredArgsConstructor;
@ -22,6 +22,9 @@ import lombok.SneakyThrows;
@RequiredArgsConstructor
public class GraphicExtractorService {
private static final int MIN_GRAPHICS_SIDE_LENGTH = 30;
private static final int MIN_GRAPHICS_AREA = 500;
private final GraphicsClusteringService graphicsClusteringService;
private final FindGraphicsRaster findGraphicsRaster;
@ -32,33 +35,32 @@ public class GraphicExtractorService {
int pageNumber,
CleanRulings cleanRulings,
List<TextPositionSequence> textPositionSequences,
List<Cell> emptyTableCells,
boolean graphicsRaster) {
var characterBBoxes = getCharacterBBoxes(textPositionSequences);
var tableLineBBoxes = getLineBBoxesFromTableCells(emptyTableCells);
var underLineBBoxes = getUnderlineBBoxes(cleanRulings, characterBBoxes);
var strikeThroughBBoxes = getStrikeThroughBBoxes(cleanRulings, characterBBoxes);
List<Box> characterBBoxes = getCharacterBBoxes(textPositionSequences);
List<Box> classifiedRulingsBoxes = getLineBBoxesOfAllClassifiedRulings(cleanRulings);
GraphicBBDetector graphicBBDetector = new GraphicBBDetector(pdPage, true);
var graphicBBoxes = graphicBBDetector.findGraphicBB();
List<Box> graphicBBoxes = graphicBBDetector.findGraphicBB();
if (graphicsRaster) {
// This should only be used if ocr was performed, it is currently in an early stage and needs to be improved.
graphicBBoxes.addAll(findGraphicsRaster.findCCBoundingBoxes(pdDocument,
characterBBoxes.stream().map(box -> new Rectangle2D.Double(box.x1 - 2, box.y1 - 2, box.width() + 4, box.height() + 4)).collect(Collectors.toList()),
PageInformation.fromPDPage(pageNumber, pdPage)));
characterBBoxes.stream()
.map(box -> new Rectangle2D.Double(box.x1 - 2, box.y1 - 2, box.width() + 4, box.height() + 4))
.collect(Collectors.toList()),
PageInformation.fromPDPage(pageNumber, pdPage)));
}
var filteredGraphicBBoxes = graphicBBoxes.stream()
.filter(box -> !box.intersectsAny(tableLineBBoxes, 4))
.filter(box -> !box.intersectsAny(underLineBBoxes, 4))
.filter(box -> !box.intersectsAny(strikeThroughBBoxes, 4))
List<Box> filteredGraphicBBoxes = graphicBBoxes.stream()
.filter(box -> !box.intersectsAny(classifiedRulingsBoxes, 4))
.collect(Collectors.toList());
var clusters = graphicsClusteringService.getClusters(filteredGraphicBBoxes, 14);
List<Box> clusters = graphicsClusteringService.getClusters(filteredGraphicBBoxes, 14);
return clusters.stream().filter(box -> box.area() > 500 && box.height() > 50 && box.width() > 50).toList();
return clusters.stream()
.filter(box -> box.area() > MIN_GRAPHICS_AREA && box.height() > MIN_GRAPHICS_SIDE_LENGTH && box.width() > MIN_GRAPHICS_SIDE_LENGTH)
.toList();
}
@ -74,34 +76,13 @@ public class GraphicExtractorService {
}
private List<Box> getLineBBoxesFromTableCells(List<Cell> emptyTableCells) {
private List<Box> getLineBBoxesOfAllClassifiedRulings(CleanRulings cleanRulings) {
List<Box> expandedTableLines = new ArrayList<>();
emptyTableCells.forEach(cell -> {
expandedTableLines.add(new Box(new Rectangle2D.Double(cell.x, cell.y - 1, cell.width, 2)));
expandedTableLines.add(new Box(new Rectangle2D.Double(cell.x, cell.y + cell.height - 1, cell.width, 2)));
expandedTableLines.add(new Box(new Rectangle2D.Double(cell.x - 1, cell.y, 2, cell.height)));
expandedTableLines.add(new Box(new Rectangle2D.Double(cell.x + cell.width - 1, cell.y, 2, cell.height)));
});
return expandedTableLines;
}
private List<Box> getUnderlineBBoxes(CleanRulings cleanRulings, List<Box> characterBBoxes) {
return cleanRulings.getHorizontal()
return cleanRulings.buildAll()
.stream()
.filter(ruling -> !ruling.getClassification().equals(Ruling.Classification.OTHER))
.map(h -> new Box(h.x1, h.y1, h.x2, h.y2))
.filter(box -> box.intersectsAnyAndOver(characterBBoxes, 6))
.collect(Collectors.toList());
}
private List<Box> getStrikeThroughBBoxes(CleanRulings cleanRulings, List<Box> characterBBoxes) {
return cleanRulings.getHorizontal().stream().map(h -> new Box(h.x1, h.y1, h.x2, h.y2)).filter(box -> box.intersectsCenter(characterBBoxes, 2)).collect(Collectors.toList());
}
}

View File

@ -82,7 +82,6 @@ public class LegacyPDFStreamEngine extends PDFStreamEngine {
private int pageRotation;
private PDRectangle pageSize;
private Matrix translateMatrix;
private final GlyphList glyphList;
private final Map<COSDictionary, Float> fontHeightMap = new WeakHashMap<COSDictionary, Float>();
@ -134,12 +133,6 @@ public class LegacyPDFStreamEngine extends PDFStreamEngine {
this.pageRotation = page.getRotation();
this.pageSize = page.getCropBox();
if (pageSize.getLowerLeftX() == 0 && pageSize.getLowerLeftY() == 0) {
translateMatrix = null;
} else {
// translation matrix for cropbox
translateMatrix = Matrix.getTranslateInstance(-pageSize.getLowerLeftX(), -pageSize.getLowerLeftY());
}
super.processPage(page);
}
@ -265,62 +258,52 @@ public class LegacyPDFStreamEngine extends PDFStreamEngine {
}
}
// adjust for cropbox if needed
Matrix translatedTextRenderingMatrix;
if (translateMatrix == null) {
translatedTextRenderingMatrix = textRenderingMatrix;
} else {
translatedTextRenderingMatrix = Matrix.concatenate(translateMatrix, textRenderingMatrix);
nextX -= pageSize.getLowerLeftX();
nextY -= pageSize.getLowerLeftY();
}
// This is a hack for unicode letter with 2 chars e.g. RA see unicodeProblem.pdf
if (unicodeMapping.length() == 2) {
processTextPosition(new TextPosition(pageRotation,
pageSize.getWidth(),
pageSize.getHeight(),
translatedTextRenderingMatrix,
nextX,
nextY,
Math.abs(dyDisplay),
dxDisplay,
Math.abs(spaceWidthDisplay),
Character.toString(unicodeMapping.charAt(0)),
new int[]{code},
font,
fontSize,
(int) (fontSize * textMatrix.getScalingFactorX())));
pageSize.getWidth(),
pageSize.getHeight(),
textRenderingMatrix,
nextX,
nextY,
Math.abs(dyDisplay),
dxDisplay,
Math.abs(spaceWidthDisplay),
Character.toString(unicodeMapping.charAt(0)),
new int[]{code},
font,
fontSize,
(int) (fontSize * textMatrix.getScalingFactorX())));
processTextPosition(new TextPosition(pageRotation,
pageSize.getWidth(),
pageSize.getHeight(),
translatedTextRenderingMatrix,
nextX,
nextY,
Math.abs(dyDisplay),
dxDisplay,
Math.abs(spaceWidthDisplay),
Character.toString(unicodeMapping.charAt(1)),
new int[]{code},
font,
fontSize,
(int) (fontSize * textMatrix.getScalingFactorX())));
pageSize.getWidth(),
pageSize.getHeight(),
textRenderingMatrix,
nextX,
nextY,
Math.abs(dyDisplay),
dxDisplay,
Math.abs(spaceWidthDisplay),
Character.toString(unicodeMapping.charAt(1)),
new int[]{code},
font,
fontSize,
(int) (fontSize * textMatrix.getScalingFactorX())));
} else {
processTextPosition(new TextPosition(pageRotation,
pageSize.getWidth(),
pageSize.getHeight(),
translatedTextRenderingMatrix,
nextX,
nextY,
Math.abs(dyDisplay),
dxDisplay,
Math.abs(spaceWidthDisplay),
unicodeMapping,
new int[]{code},
font,
fontSize,
(int) (fontSize * textMatrix.getScalingFactorX())));
pageSize.getWidth(),
pageSize.getHeight(),
textRenderingMatrix,
nextX,
nextY,
Math.abs(dyDisplay),
dxDisplay,
Math.abs(spaceWidthDisplay),
unicodeMapping,
new int[]{code},
font,
fontSize,
(int) (fontSize * textMatrix.getScalingFactorX())));
}
}

View File

@ -1007,7 +1007,7 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
/**
* Set the desired word separator for output text. The PDFBox text extraction algorithm will output a space
* character if there is enough space between two words. By default a space character is used. If you need and
* character if there is enough space between two words. By default a space character is used. If you need an
* accurate count of characters that are found in a PDF document then you might want to set the word separator to
* the empty string.
*
@ -1703,7 +1703,7 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
/**
* Write a list of string containing a whole line of a document.
*
* @param line a list with the words of the given line
* @param line a list with the words of the given line
* @throws IOException if something went wrong
*/
private void writeLine(List<WordWithTextPositions> line, boolean isParagraphEnd) throws IOException {
@ -1744,9 +1744,9 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
/**
* Handles the LTR and RTL direction of the given words. The whole implementation stands and falls with the given
* word. If the word is a full line, the results will be the best. If the word contains of single words or
* characters, the order of the characters in a word or words in a line may wrong, due to RTL and LTR marks and
* Handles the LTR and RTL direction of the given words. The whole implementation stands and falls with the given
* word. If the word is a full line, the results will be the best. If the word consists of single words or
* characters, the order of the characters in a word or words in a line may be wrong, due to RTL and LTR marks and
* characters!
* <p>
* Based on http://www.nesterovsky-bros.com/weblog/2013/07/28/VisualToLogicalConversionInJava.aspx

View File

@ -65,12 +65,20 @@ public class LayoutGridService {
@SneakyThrows
@Observed(name = "ViewerDocumentService", contextualName = "create-viewer-document")
public void addLayoutGrid(File originFile, Document document, File destinationFile, boolean layerVisibilityDefaultValue) {
public void addLayoutGrid(File originFile, Document document, File destinationFile, boolean layerVisibilityDefaultValue, boolean writeVisualLayoutParsingGrid) {
List<Visualizations> allVisualizations;
Visualizations layoutGrid = this.addLayoutGrid(document, layerVisibilityDefaultValue, false);
Visualizations visualLayoutGrid = this.addLayoutGrid(document, layerVisibilityDefaultValue, true);
if (writeVisualLayoutParsingGrid) {
Visualizations visualLayoutGrid = this.addLayoutGrid(document, layerVisibilityDefaultValue, true);
allVisualizations = Stream.concat(Stream.of(layoutGrid, visualLayoutGrid), document.getVisualizations().streamAll())
.toList();
} else {
allVisualizations = Stream.concat(Stream.of(layoutGrid), document.getVisualizations().streamAll())
.toList();
}
viewerDocumentService.addVisualizationsOnPage(originFile, destinationFile, List.of(layoutGrid, visualLayoutGrid));
viewerDocumentService.addVisualizationsOnPage(originFile, destinationFile, allVisualizations);
}
@ -130,7 +138,10 @@ public class LayoutGridService {
}
for (Page page : table.getPages()) {
Optional<Integer> optionalFirstRowOnPage = table.streamCol(0).filter(tableCell -> tableCell.isOnPage(page.getNumber())).map(TableCell::getRow).findFirst();
Optional<Integer> optionalFirstRowOnPage = table.streamCol(0)
.filter(tableCell -> tableCell.isOnPage(page.getNumber()))
.map(TableCell::getRow)
.findFirst();
if (optionalFirstRowOnPage.isEmpty()) {
continue;
}
@ -170,14 +181,17 @@ public class LayoutGridService {
private static Stream<Rectangle2D> streamBBoxOfCellsOnPage(Stream<TableCell> table, Page page) {
return table.filter(tableCell -> tableCell.isOnPage(page.getNumber())).map(TableCell::getBBox).map(bBoxMap -> bBoxMap.get(page));
return table.filter(tableCell -> tableCell.isOnPage(page.getNumber()))
.map(TableCell::getBBox)
.map(bBoxMap -> bBoxMap.get(page));
}
private void addSection(SemanticNode semanticNode, LayoutGrid layoutGrid, Color color) {
Map<Page, Rectangle2D> bBoxMap = semanticNode.getBBox();
List<SemanticNode> subSections = semanticNode.streamAllSubNodesOfType(NodeType.SECTION).toList();
List<SemanticNode> subSections = semanticNode.streamAllSubNodesOfType(NodeType.SECTION)
.toList();
Page firstPage = semanticNode.getFirstPage();
String treeIdString = buildTreeIdString(semanticNode);
if (!subSections.isEmpty()) {
@ -197,7 +211,10 @@ public class LayoutGridService {
}
return;
}
List<Page> pagesInOrder = bBoxMap.keySet().stream().sorted(Comparator.comparingInt(Page::getNumber)).collect(Collectors.toList());
List<Page> pagesInOrder = bBoxMap.keySet()
.stream()
.sorted(Comparator.comparingInt(Page::getNumber))
.collect(Collectors.toList());
pagesInOrder.remove(0);
addLinesForFirstPageOfSection(semanticNode, color, firstPage, layoutGrid);
var lastPage = pagesInOrder.remove(pagesInOrder.size() - 1);
@ -294,7 +311,10 @@ public class LayoutGridService {
private String buildTreeIdString(SemanticNode semanticNode) {
return semanticNode.getTreeId().stream().map(Object::toString).collect(Collectors.joining("."));
return semanticNode.getTreeId()
.stream()
.map(Object::toString)
.collect(Collectors.joining("."));
}

View File

@ -0,0 +1,56 @@
package com.knecon.fforesight.service.layoutparser.processor.utils;
import java.awt.geom.AffineTransform;
import lombok.SneakyThrows;
import lombok.experimental.UtilityClass;
@UtilityClass
public class CoordinateTransforms {

    /**
     * Builds the transform that maps coordinates of a rendered page image to the page's initial user space.
     * <p>
     * The transform is composed of three steps, applied to a point in this order:
     * <ol>
     * <li>uniform scaling by {@code scalingFactor} combined with a translation by {@code (-minX, -minY)},</li>
     * <li>mirroring of the y axis ({@code y -> height - y}),</li>
     * <li>a rotation chosen from the page's rotation of 90, 180 or 270 degrees (identity otherwise).</li>
     * </ol>
     *
     * @param pageInformation page geometry (bounds and rotation) the image was rendered from
     * @param scalingFactor   ratio of page units per image pixel, see {@link #calculateScalingFactor}
     * @return a new transform from image coordinates to initial user space coordinates
     */
    public AffineTransform calculateImageCoordsToInitialUserSpaceCoords(PageInformation pageInformation, double scalingFactor) {

        AffineTransform rotation;
        int degrees = pageInformation.rotationDegrees();
        if (degrees == 90) {
            rotation = new AffineTransform(0, 1, -1, 0, pageInformation.height(), 0);
        } else if (degrees == 180) {
            rotation = new AffineTransform(-1, 0, 0, -1, pageInformation.width(), pageInformation.height());
        } else if (degrees == 270) {
            // results from 90 + 180 rotations
            rotation = new AffineTransform(0, -1, 1, 0, pageInformation.width() - pageInformation.height(), pageInformation.height());
        } else {
            rotation = new AffineTransform();
        }

        AffineTransform flipY = new AffineTransform(1, 0, 0, -1, 0, pageInformation.height());
        AffineTransform scaleAndShift = new AffineTransform(scalingFactor, 0, 0, scalingFactor, -pageInformation.minX(), -pageInformation.minY());

        // concatenate() right-multiplies, so the step applied to a point first is concatenated last:
        // scaling -> mirror -> rotation
        AffineTransform imageToUserSpace = new AffineTransform();
        imageToUserSpace.concatenate(rotation);
        imageToUserSpace.concatenate(flipY);
        imageToUserSpace.concatenate(scaleAndShift);
        return imageToUserSpace;
    }


    /**
     * Builds the inverse of {@link #calculateImageCoordsToInitialUserSpaceCoords}, i.e. the transform from initial
     * user space coordinates to image coordinates.
     * <p>
     * A non-invertible transform (e.g. a scaling factor of 0) surfaces as a sneakily thrown
     * {@code NoninvertibleTransformException}.
     *
     * @param pageInformation page geometry (bounds and rotation) the image was rendered from
     * @param scalingFactor   ratio of page units per image pixel
     * @return a new transform from initial user space coordinates to image coordinates
     */
    @SneakyThrows
    public AffineTransform calculateInitialUserSpaceCoordsToImageCoords(PageInformation pageInformation, double scalingFactor) {

        AffineTransform imageToUserSpace = calculateImageCoordsToInitialUserSpaceCoords(pageInformation, scalingFactor);
        return imageToUserSpace.createInverse();
    }


    /**
     * Computes how many page units one image pixel covers horizontally.
     *
     * @param pageInformation page geometry (bounds and rotation)
     * @param imageWidth      width of the rendered image in pixels
     * @return the page width as displayed (height for 90/270 degree rotations) divided by {@code imageWidth}
     */
    public double calculateScalingFactor(PageInformation pageInformation, double imageWidth) {

        // PDFBox always returns page height and width based on rotation
        int degrees = pageInformation.rotationDegrees();
        boolean sidesSwapped = degrees == 90 || degrees == 270;
        double displayedPageWidth = sidesSwapped ? pageInformation.height() : pageInformation.width();
        return displayedPageWidth / imageWidth;
    }

}

View File

@ -1,10 +1,10 @@
package com.knecon.fforesight.service.layoutparser.processor.utils;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.util.Comparator;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Rectangle;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
public class GeometricComparators {
@ -58,7 +58,7 @@ public class GeometricComparators {
return cell1Size.compareTo(cell2Size);
};
public static final Comparator<Rectangle> RECTANGLE_SIZE_COMPARATOR = (rect1, rect2) -> {
public static final Comparator<Rectangle2D> RECTANGLE_SIZE_COMPARATOR = (rect1, rect2) -> {
Double rect1Size = rect1.getHeight() * rect1.getWidth();
Double rect2Size = rect2.getHeight() * rect2.getWidth();

View File

@ -0,0 +1,223 @@
package com.knecon.fforesight.service.layoutparser.processor.utils;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import lombok.experimental.UtilityClass;
@UtilityClass
public class HeaderFooterDetection {

    // A block counts as header/footer once its averaged, weighted similarity exceeds this value.
    private static final double THRESHOLD = 0.5;
    // Documents with fewer pages are never analysed, see isLikelyHeader/isLikelyFooter.
    private static final int MIN_PAGES_FOR_DETECTION = 3;
    // Upper bound for the number of neighbouring pages that are compared.
    private static final int MAX_WINDOW = 8;
    // Number of leading/trailing text blocks per page that are considered header/footer candidates.
    private static final int CANDIDATES_PER_PAGE = 3;
    // Weight will go from 1.0 to 0.5 because the first element is the most likely to be the header on the page.
    private static final double[] headerWeights = {1.0, 0.75, 0.5};
    // Weight will go from 0.5 to 1.0 because the last element is the most likely to be the footer on the page.
    private static final double[] footerWeights = {0.5, 0.75, 1.0};


    /**
     * Decides whether a text block is likely a footer by comparing its text with the last text blocks of the
     * neighbouring pages.
     *
     * @param textPageBlock      the block to test
     * @param document           the document the block belongs to
     * @param classificationPage the page the block is located on
     * @return {@code true} if the block's text is similar enough to footer candidates of nearby pages;
     * always {@code false} for documents with fewer than 3 pages
     */
    public boolean isLikelyFooter(TextPageBlock textPageBlock, ClassificationDocument document, ClassificationPage classificationPage) {

        int numberOfPages = document.getPages().size();
        if (numberOfPages < MIN_PAGES_FOR_DETECTION) {
            // If the document has 1 or 2 pages this may lead to more false positives than actual findings.
            return false;
        }

        int window = Math.min(numberOfPages, MAX_WINDOW);
        List<ClassificationPage> nearestPages = findNearestPages(classificationPage, document.getPages(), window);
        List<List<AbstractPageBlock>> footerCandidates = getFooterCandidates(nearestPages);
        return detectHeadersOrFootersByPageAssociation(textPageBlock.getText(), footerCandidates, window, footerWeights);
    }


    /**
     * Decides whether a text block is likely a header by comparing its text with the first text blocks of the
     * neighbouring pages.
     *
     * @param textPageBlock      the block to test
     * @param document           the document the block belongs to
     * @param classificationPage the page the block is located on
     * @return {@code true} if the block's text is similar enough to header candidates of nearby pages;
     * always {@code false} for documents with fewer than 3 pages
     */
    public boolean isLikelyHeader(TextPageBlock textPageBlock, ClassificationDocument document, ClassificationPage classificationPage) {

        int numberOfPages = document.getPages().size();
        if (numberOfPages < MIN_PAGES_FOR_DETECTION) {
            // If the document has 1 or 2 pages this may lead to more false positives than actual findings.
            return false;
        }

        int window = Math.min(numberOfPages, MAX_WINDOW);
        List<ClassificationPage> nearestPages = findNearestPages(classificationPage, document.getPages(), window);
        List<List<AbstractPageBlock>> headerCandidates = getHeaderCandidates(nearestPages);
        return detectHeadersOrFootersByPageAssociation(textPageBlock.getText(), headerCandidates, window, headerWeights);
    }


    /**
     * Scores {@code testString} against the candidate blocks of the surrounding pages.
     * <p>
     * For every candidate position {@code j} the normalized similarity (1 - Hamming distance / longer length) to the
     * block at that position on each page is weighted with {@code weights[j]} (1 if there is no weight for
     * {@code j}) and averaged over all pages in the window. The method returns as soon as one average exceeds
     * {@link #THRESHOLD}.
     *
     * @param testString text of the block under test
     * @param candidates per page, the candidate blocks in reading order
     * @param window     number of candidate pages to look at before and after each candidate page
     * @param weights    weight per candidate position
     * @return {@code true} if any position yields an average score above the threshold
     */
    private boolean detectHeadersOrFootersByPageAssociation(String testString, List<List<AbstractPageBlock>> candidates, int window, double[] weights) {

        if (testString.isEmpty()) {
            // An empty text cannot be matched; comparing it with an empty (padding) candidate would additionally
            // divide by zero and turn the score into NaN.
            return false;
        }

        for (int i = 0; i < candidates.size(); i++) {

            List<List<String>> candidateStrings = collectFrontPaddedTexts(candidates, Math.max(i - window, 0), Math.min(i + window, candidates.size()));
            // all lists have the same size after padding
            int maxLen = candidateStrings.isEmpty() ? 0 : candidateStrings.get(0).size();

            // Compare the testString against each candidate in the window
            for (int j = 0; j < maxLen; j++) {
                double weight = j < weights.length ? weights[j] : 1;
                double score = 0.0;

                for (List<String> pageTexts : candidateStrings) {
                    String candidate = pageTexts.get(j);
                    if (differsTooMuchInLength(testString, candidate)) {
                        // skipped candidates contribute 0 but still count towards the average
                        continue;
                    }

                    int distance = calculateHammingDistanceWithPreprocessing(testString, candidate);
                    double normalizedScore = 1 - (double) distance / Math.max(testString.length(), candidate.length());
                    score += normalizedScore * weight;
                }

                score /= candidateStrings.size();

                // Early stop
                if (score > THRESHOLD) {
                    return true;
                }
            }
        }

        return false;
    }


    /**
     * Collects the texts of the candidate blocks of the pages {@code [fromIndex, toIndex)} and pads shorter lists
     * with empty strings at the front until all lists have the same length.
     * <p>
     * NOTE(review): front padding keeps the last blocks aligned, which matches the footer weights. For header
     * candidates it moves the first block of a page with fewer blocks away from index 0 - confirm this is intended.
     */
    private List<List<String>> collectFrontPaddedTexts(List<List<AbstractPageBlock>> candidates, int fromIndex, int toIndex) {

        List<List<String>> texts = new ArrayList<>();
        int maxLen = 0;
        for (int k = fromIndex; k < toIndex; k++) {
            List<String> pageTexts = candidates.get(k)
                    .stream()
                    .map(AbstractPageBlock::getText)
                    .collect(Collectors.toCollection(ArrayList::new));
            maxLen = Math.max(maxLen, pageTexts.size());
            texts.add(pageTexts);
        }

        for (List<String> pageTexts : texts) {
            while (pageTexts.size() < maxLen) {
                pageTexts.add(0, "");
            }
        }
        return texts;
    }


    /**
     * If both strings are at least 5 characters long and one string is more than twice as long as the other,
     * the distance calculation is skipped as it's time-consuming, and we can assume they are not similar enough.
     */
    private boolean differsTooMuchInLength(String first, String second) {

        boolean bothLongEnough = first.length() >= 5 && second.length() >= 5;
        return bothLongEnough && (first.length() > 2 * second.length() || second.length() > 2 * first.length());
    }


    /**
     * Find the pages surrounding a given page: up to {@code numNeighbors / 2} pages before and after it, clipped to
     * the bounds of the document. The given page itself is not part of the result.
     * For example: with {@code numNeighbors = 8} the nearest pages for page 6 of a 12-page document are
     * 2, 3, 4, 5, 7, 8, 9, 10; for page 4 they are 1, 2, 3, 5, 6, 7, 8.
     * <p>
     * The page number is used as 1-based index into {@code allPages}.
     *
     * @param currentPage  Current page to find the nearest ones.
     * @param allPages     All pages in the document.
     * @param numNeighbors Size of the window around the current page.
     * @return The nearest pages.
     */
    private List<ClassificationPage> findNearestPages(ClassificationPage currentPage, List<ClassificationPage> allPages, int numNeighbors) {

        int currentPageIndex = currentPage.getPageNumber() - 1;
        int halfWin = numNeighbors / 2;
        int start = Math.max(0, currentPageIndex - halfWin);
        int end = Math.min(allPages.size() - 1, currentPageIndex + halfWin);

        // Pages are read straight from the document: a cache keyed only by page index would be shared between
        // documents (all members of a utility class are static) and could hand out pages of another document.
        List<ClassificationPage> nearestPages = new ArrayList<>();
        for (int i = start; i <= end; i++) {
            if (i != currentPageIndex) {
                nearestPages.add(allPages.get(i));
            }
        }
        return nearestPages;
    }


    // Get the last 3 TextBlocks on the page as they are likely to be a footer
    // NOTE(review): pages without any text block are skipped here, while getHeaderCandidates adds an empty list
    // for them (which then counts towards the score average) - confirm which behaviour is intended.
    private List<List<AbstractPageBlock>> getFooterCandidates(List<ClassificationPage> pages) {

        List<List<AbstractPageBlock>> footerCandidates = new ArrayList<>();
        for (ClassificationPage page : pages) {
            List<TextPageBlock> textPageBlocks = textPageBlocksOf(page);
            int blockCount = textPageBlocks.size();
            if (blockCount > 0) {
                int start = Math.max(0, blockCount - CANDIDATES_PER_PAGE);
                footerCandidates.add(new ArrayList<>(textPageBlocks.subList(start, blockCount)));
            }
        }
        return footerCandidates;
    }


    // Get the first 3 TextBlocks on the page as they are likely to be a header
    private List<List<AbstractPageBlock>> getHeaderCandidates(List<ClassificationPage> pages) {

        List<List<AbstractPageBlock>> headerCandidates = new ArrayList<>();
        for (ClassificationPage page : pages) {
            List<TextPageBlock> textPageBlocks = textPageBlocksOf(page);
            int count = Math.min(CANDIDATES_PER_PAGE, textPageBlocks.size());
            headerCandidates.add(new ArrayList<>(textPageBlocks.subList(0, count)));
        }
        return headerCandidates;
    }


    // Returns the page's blocks that are TextPageBlocks, in their original order.
    private List<TextPageBlock> textPageBlocksOf(ClassificationPage page) {

        return page.getTextBlocks()
                .stream()
                .filter(TextPageBlock.class::isInstance)
                .map(TextPageBlock.class::cast)
                .toList();
    }


    /**
     * Calculate the Hamming distance between two strings after preprocessing to make them the same length
     * (the shorter one is treated as if padded with '\0') and replacing all digits with a special character '@'
     * since they are a common occurrence in headers/footers.
     *
     * @param firstCandidate  First string
     * @param secondCandidate Second string
     * @return The Hamming distance between the two preprocessed strings.
     */
    private int calculateHammingDistanceWithPreprocessing(String firstCandidate, String secondCandidate) {

        int maxLength = Math.max(firstCandidate.length(), secondCandidate.length());

        int distance = 0;
        for (int i = 0; i < maxLength; i++) {
            if (normalizedCharAt(firstCandidate, i) != normalizedCharAt(secondCandidate, i)) {
                distance++;
            }
        }
        return distance;
    }


    // Character used for the comparison: '\0' beyond the end of the string, '@' for the digits 0-9
    // (the characters matched by the regex \d), otherwise the character itself.
    private char normalizedCharAt(String input, int index) {

        if (index >= input.length()) {
            return '\0';
        }
        char c = input.charAt(index);
        return c >= '0' && c <= '9' ? '@' : c;
    }

}

View File

@ -1,12 +1,5 @@
package com.knecon.fforesight.service.layoutparser.processor.utils;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import lombok.experimental.UtilityClass;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
import org.apache.pdfbox.text.TextPosition;
import java.awt.geom.Rectangle2D;
import java.util.Collection;
import java.util.Collections;
@ -14,12 +7,23 @@ import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
import org.apache.pdfbox.text.TextPosition;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import lombok.experimental.UtilityClass;
@UtilityClass
public class MarkedContentUtils {
public static final String HEADER = "Header";
public static final String FOOTER = "Footer";
public List<Rectangle2D> getMarkedContentBboxPerLine(List<PDMarkedContent> markedContents, String subtype) {
if (markedContents == null) {
@ -31,7 +35,8 @@ public class MarkedContentUtils {
.filter(m -> m.getProperties() != null)
.filter(m -> m.getProperties().getItem("Subtype") != null)
.filter(m -> ((COSName) m.getProperties().getItem("Subtype")).getName().equals(subtype))
.map(PDMarkedContent::getContents).flatMap(Collection::stream)
.map(PDMarkedContent::getContents)
.flatMap(Collection::stream)
.filter(t -> t instanceof TextPosition)
.map(t -> (TextPosition) t)
.filter(t -> !t.getUnicode().equals(" "))
@ -41,16 +46,77 @@ public class MarkedContentUtils {
return Collections.emptyList();
}
return markedContentByYPosition.values().stream()
.map(textPositions -> new TextPositionSequence(textPositions.stream()
.toList(), 0, true)
.getRectangle())
.map(t -> new Rectangle2D.Float(t.getTopLeft().getX(), t.getTopLeft().getY() - Math.abs(t.getHeight()), t.getWidth(), Math.abs(t.getHeight()))).collect(Collectors.toList());
return markedContentByYPosition.values()
.stream()
.map(textPositions -> new TextPositionSequence(textPositions, 0, true).getBBoxInitialUserSpace())
.map(t -> new Rectangle2D.Double(t.getX(), t.getY() - Math.abs(t.getHeight()), t.getWidth(), Math.abs(t.getHeight())))
.collect(Collectors.toList());
}
/**
 * Converts marked contents into {@link MarkedContentPosition}s.
 * Marked contents whose content list is empty are left out.
 *
 * @param markedContents the marked contents to convert, may be {@code null}
 * @return one position per non-empty marked content, or an empty list if {@code markedContents} is {@code null}
 */
public List<MarkedContentPosition> getMarkedContentPositions(List<PDMarkedContent> markedContents) {

    return markedContents == null
            ? Collections.emptyList()
            : markedContents.stream()
                    .filter(markedContent -> !markedContent.getContents().isEmpty())
                    .map(markedContent -> MarkedContentPosition.fromPDMarkedContent(markedContent))
                    .toList();
}
public boolean intersects(TextPageBlock textBlock, Map<String, List<Rectangle2D>> markedContentBboxPerType, String type) {
return markedContentBboxPerType.get(type) != null && markedContentBboxPerType.get(type).stream().anyMatch(rectangle -> rectangle.intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight()));
return markedContentBboxPerType.get(type) != null && markedContentBboxPerType.get(type)
.stream()
.anyMatch(rectangle -> rectangle.intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight()));
}
/**
 * Tag, optional subtype and the bounding boxes of the text positions of one marked content.
 *
 * @param type          the tag of the marked content
 * @param subType       the name stored under the "Subtype" property, or {@code null} if there is none
 * @param textPositions one bounding box per non-space text position contained in the marked content
 */
public record MarkedContentPosition(String type, String subType, List<Rectangle2D> textPositions) {

    /**
     * Creates a position from a PDFBox marked content.
     *
     * @param markedContent the marked content to convert
     * @return the tag, subtype and text position boxes of {@code markedContent}
     */
    public static MarkedContentPosition fromPDMarkedContent(PDMarkedContent markedContent) {

        return new MarkedContentPosition(markedContent.getTag(), parseSubType(markedContent), parseTextPositions(markedContent.getContents()));
    }


    // Builds one bounding box per text position; other content objects and single spaces are ignored.
    private static List<Rectangle2D> parseTextPositions(List<Object> contents) {

        return contents.stream()
                .filter(content -> content instanceof TextPosition)
                .map(content -> (TextPosition) content)
                .filter(content -> !content.getUnicode().equals(" "))
                .map(textPositions -> new TextPositionSequence(List.of(textPositions), 0, true))
                .map(BoundingBox::getBBoxInitialUserSpace)
                .collect(Collectors.toList());
    }


    // Reads the "Subtype" property. Returns null if it is missing or not a name object, so that a malformed
    // entry does not abort the conversion with a ClassCastException.
    private static String parseSubType(PDMarkedContent markedContent) {

        if (markedContent == null || markedContent.getProperties() == null) {
            return null;
        }
        if (markedContent.getProperties().getItem("Subtype") instanceof COSName subTypeName) {
            return subTypeName.getName();
        }
        return null;
    }


    /**
     * Builds a display name for this marked content.
     *
     * @return the type if there is no subtype; the subtype alone for "Artifact" contents or if the type is
     * {@code null}; otherwise {@code type-subType}
     */
    public String formattedType() {

        if (subType == null || subType.isEmpty()) {
            return type;
        }
        // constant-first comparison: the tag may be null
        if (type == null || "Artifact".equals(type)) {
            return subType;
        }
        return String.format("%s-%s", type, subType);
    }

}
}

View File

@ -1,4 +1,4 @@
package com.knecon.fforesight.service.layoutparser.processor.services.graphics;
package com.knecon.fforesight.service.layoutparser.processor.utils;
import java.awt.geom.Rectangle2D;
@ -22,6 +22,15 @@ public record PageInformation(Rectangle2D mediabox, int number, int rotationDegr
}
/**
 * Height that accounts for page rotation.
 *
 * @return {@code width()} when the page is rotated by 90 or 270 degrees, otherwise {@code height()}
 */
public double heightRot() {
    boolean quarterTurn = rotationDegrees == 90 || rotationDegrees == 270;
    return quarterTurn ? width() : height();
}
public double width() {
return mediabox.getWidth();

View File

@ -114,7 +114,7 @@ public final class PositionUtils {
}
public Float getApproxLineCount(TextPageBlock textBlock) {
public double getApproxLineCount(TextPageBlock textBlock) {
return textBlock.getHeight() / textBlock.getMostPopularWordHeight();
}

View File

@ -52,7 +52,10 @@ public class RectangleTransformations {
public static Rectangle2D bBoxUnionAtomicTextBlock(List<AtomicTextBlock> atomicTextBlocks) {
return atomicTextBlocks.stream().flatMap(atomicTextBlock -> atomicTextBlock.getPositions().stream()).collect(new Rectangle2DBBoxCollector());
return atomicTextBlocks.stream()
.flatMap(atomicTextBlock -> atomicTextBlock.getPositions()
.stream())
.collect(new Rectangle2DBBoxCollector());
}
@ -77,7 +80,10 @@ public class RectangleTransformations {
public static Rectangle2D atomicTextBlockBBox(List<AtomicTextBlock> atomicTextBlocks) {
return atomicTextBlocks.stream().flatMap(atomicTextBlock -> atomicTextBlock.getPositions().stream()).collect(new Rectangle2DBBoxCollector());
return atomicTextBlocks.stream()
.flatMap(atomicTextBlock -> atomicTextBlock.getPositions()
.stream())
.collect(new Rectangle2DBBoxCollector());
}
@ -89,16 +95,18 @@ public class RectangleTransformations {
public static Rectangle2D rectangleBBox(List<Rectangle> rectangles) {
return rectangles.stream().map(RectangleTransformations::toRectangle2D).collect(new Rectangle2DBBoxCollector());
return rectangles.stream()
.map(RectangleTransformations::toRectangle2D)
.collect(new Rectangle2DBBoxCollector());
}
public static Rectangle2D toRectangle2D(Rectangle redactionLogRectangle) {
return new Rectangle2D.Double(redactionLogRectangle.getTopLeft().getX(),
redactionLogRectangle.getTopLeft().getY() + redactionLogRectangle.getHeight(),
redactionLogRectangle.getWidth(),
-redactionLogRectangle.getHeight());
redactionLogRectangle.getTopLeft().getY() + redactionLogRectangle.getHeight(),
redactionLogRectangle.getWidth(),
-redactionLogRectangle.getHeight());
}
@ -111,15 +119,16 @@ public class RectangleTransformations {
public static Rectangle toRedactionLogRectangle(Rectangle2D rectangle2D, int pageNumber) {
return new Rectangle(new Point((float) rectangle2D.getMinX(), (float) (rectangle2D.getMinY() + rectangle2D.getHeight())),
(float) rectangle2D.getWidth(),
-(float) rectangle2D.getHeight(),
pageNumber);
(float) rectangle2D.getWidth(),
-(float) rectangle2D.getHeight(),
pageNumber);
}
public static Rectangle2D rectangle2DBBox(List<Rectangle2D> rectangle2DList) {
return rectangle2DList.stream().collect(new Rectangle2DBBoxCollector());
return rectangle2DList.stream()
.collect(new Rectangle2DBBoxCollector());
}
@ -134,7 +143,8 @@ public class RectangleTransformations {
if (rectangle2DList.isEmpty()) {
return Collections.emptyList();
}
double splitThreshold = rectangle2DList.stream().mapToDouble(RectangularShape::getWidth).average().orElse(5) * 5.0;
double splitThreshold = rectangle2DList.stream()
.mapToDouble(RectangularShape::getWidth).average().orElse(5) * 5.0;
List<List<Rectangle2D>> rectangleListsWithGaps = new LinkedList<>();
List<Rectangle2D> rectangleListWithoutGaps = new LinkedList<>();
@ -171,7 +181,7 @@ public class RectangleTransformations {
verticalRulings.add(new Ruling(new Point2D.Float(rectangle.x + rectangle.width, rectangle.y),
new Point2D.Float(rectangle.x + rectangle.width, rectangle.y + rectangle.height)));
});
return CleanRulings.builder().vertical(verticalRulings).horizontal(horizontalRulings).build();
return new CleanRulings(verticalRulings, horizontalRulings);
}
@ -195,9 +205,9 @@ public class RectangleTransformations {
public BinaryOperator<BBox> combiner() {
return (b1, b2) -> new BBox(Math.min(b1.lowerLeftX, b2.lowerLeftX),
Math.min(b1.lowerLeftY, b2.lowerLeftY),
Math.max(b1.upperRightX, b2.upperRightX),
Math.max(b1.upperRightY, b2.upperRightY));
Math.min(b1.lowerLeftY, b2.lowerLeftY),
Math.max(b1.upperRightX, b2.upperRightX),
Math.max(b1.upperRightY, b2.upperRightY));
}

View File

@ -14,23 +14,24 @@ public class RectangularIntersectionFinder {
public static List<Rectangle2D> find(List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
// Fix for 211.pdf
for (Ruling r : horizontalRulingLines) {
if (r.getX2() < r.getX1()) {
double a = r.getX2();
r.x2 = (float) r.getX1();
r.x1 = (float) a;
}
}
// // Fix for 211.pdf
// for (Ruling r : horizontalRulingLines) {
// if (r.getX2() < r.getX1()) {
// double a = r.getX2();
// r.x2 = (float) r.getX1();
// r.x1 = (float) a;
// }
// }
List<Rectangle2D> foundRectangles = new ArrayList<>();
Map<Point2D, Ruling[]> intersectionPoints = Ruling.findIntersections(horizontalRulingLines, verticalRulingLines);
Map<Point2D, RulingIntersectionFinder.IntersectingRulings> intersectionPoints = RulingIntersectionFinder.findNaive(horizontalRulingLines, verticalRulingLines);
List<Point2D> intersectionPointsList = new ArrayList<>(intersectionPoints.keySet());
intersectionPointsList.sort(Y_FIRST_POINT_COMPARATOR);
for (int i = 0; i < intersectionPointsList.size(); i++) {
Point2D topLeft = intersectionPointsList.get(i);
Ruling[] hv = intersectionPoints.get(topLeft);
RulingIntersectionFinder.IntersectingRulings intersectingRulingsFromTopLeft = intersectionPoints.get(topLeft);
// CrossingPointsDirectlyBelow( topLeft );
List<Point2D> xPoints = new ArrayList<>();
@ -48,19 +49,24 @@ public class RectangularIntersectionFinder {
outer:
for (Point2D xPoint : xPoints) {
// is there a vertical edge b/w topLeft and xPoint?
if (!hv[1].equals(intersectionPoints.get(xPoint)[1])) {
if (!intersectingRulingsFromTopLeft.vertical().equals(intersectionPoints.get(xPoint).vertical())) {
continue;
}
for (Point2D yPoint : yPoints) {
// is there a horizontal edge b/w topLeft and yPoint ?
if (!hv[0].equals(intersectionPoints.get(yPoint)[0])) {
if (!intersectingRulingsFromTopLeft.horizontal().equals(intersectionPoints.get(yPoint).horizontal())) {
continue;
}
Point2D btmRight = new Point2D.Float((float) yPoint.getX(), (float) xPoint.getY());
if (intersectionPoints.containsKey(btmRight)
&& intersectionPoints.get(btmRight)[0].equals(intersectionPoints.get(xPoint)[0])
&& intersectionPoints.get(btmRight)[1].equals(intersectionPoints.get(yPoint)[1])) {
&& intersectionPoints.get(btmRight).horizontal().equals(intersectionPoints.get(xPoint).horizontal())
&& intersectionPoints.get(btmRight).vertical().equals(intersectionPoints.get(yPoint).vertical())) {
foundRectangles.add(new Rectangle2D.Double(topLeft.getX(), topLeft.getY(), btmRight.getX() - topLeft.getX(), btmRight.getY() - topLeft.getY()));
intersectionPoints.get(topLeft).horizontal().setClassification(Ruling.Classification.TABLE_LINE);
intersectionPoints.get(topLeft).vertical().setClassification(Ruling.Classification.TABLE_LINE);
intersectionPoints.get(btmRight).horizontal().setClassification(Ruling.Classification.TABLE_LINE);
intersectionPoints.get(btmRight).vertical().setClassification(Ruling.Classification.TABLE_LINE);
break outer;
}
}

View File

@ -0,0 +1,200 @@
package com.knecon.fforesight.service.layoutparser.processor.utils;
import java.awt.geom.Point2D;
import java.util.Collections;
import java.util.Comparator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.TreeMap;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import lombok.experimental.UtilityClass;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@UtilityClass
public class RulingIntersectionFinder {
// Amount passed to Ruling.expand(...) before testing two rulings for intersection; the sweep entry/exit
// positions of horizontal rulings are widened by the same amount.
public static final int PERPENDICULAR_UNIT_EXPAND_AMOUNT = 2;
// Orders points by their y-coordinate first and by their x-coordinate second.
public static final Comparator<Point2D> Y_THEN_X_POINT_COMPARATOR = Comparator.comparingDouble(Point2D::getY).thenComparing(Point2D::getX);
/*
 * The algorithm assumes there are only horizontal and vertical lines which are unique in their coordinates. (E.g. no overlapping horizontal lines exist)
 * As a high level overview, the algorithm uses a sweep line advancing from left to right.
 * It dynamically updates the horizontal rulings which are intersected by the current sweep line.
 * When the sweep line hits a vertical line, it then checks for all intersections with the currently intersected horizontal rulings.
 * The trick of the algorithm is using a binary search tree to store the currently intersected horizontal rulings. This way the lookup should be in O(log n).
 * This way the initial sorting step has the highest complexity class (O(n log n)) and thus determines the complexity class of the entire algorithm.
 * Unfortunately, the implementation here takes a few liberties compared to the original algorithm. The binary search tree is replaced by an ordered Set which is simply looped over.
 * Therefore, in this implementation's worst case, where all horizontal lines span the entire sweep, you are essentially performing the naive approach with a bunch of overhead.
 * Since we are using this implementation to find table cells, one can expect this worst case to always be the case.
 * A simple runtime comparison for a single page with the most lines we can expect (SinglePages/AbsolutelyEnormousTable.pdf with 30 horizontals and 144 verticals) shows this implementation takes roughly 14 ms, whereas the naive approach takes 7 ms. Both are negligible, but the naive approach is two times as fast.
 * If we would like to make this faster, we would need a better data structure for 'TreeMap<Ruling, Void> horizontalRulingsInCurrentSweep', where we can query the TreeMap for all horizontal rulings in a given interval in O(log n).
 */
/**
 * Sweep-line implementation to find line intersections, based on an algorithm running in O(P + n log n),
 * where n is the number of lines and P the number of intersections; see
 * <a href="http://people.csail.mit.edu/indyk/6.838-old/handouts/lec2.pdf">Segment Intersection by Piotr Indyk</a>.
 * As the implementation notes above explain, this implementation does not reach that bound.
 *
 * @param horizontals a list of non-overlapping horizontal rulings
 * @param verticals   a list of non-overlapping vertical rulings
 * @return a Map of each found intersection point pointing to the two lines forming the intersection.
 */
public Map<Point2D, IntersectingRulings> find(List<Ruling> horizontals, List<Ruling> verticals) {

    long start = System.currentTimeMillis();
    List<SweepStep> sweepTrajectory = buildSweepTrajectory(horizontals, verticals);

    // Keyed by top, with left/right as tie-breakers: with a top-only comparator two distinct rulings on the
    // same y-coordinate would count as the same key, so the second one would never be tracked and would be
    // dropped as soon as the first one leaves the sweep.
    Comparator<Ruling> byTop = Comparator.comparingDouble(Ruling::getTop);
    TreeMap<Ruling, Void> horizontalRulingsInCurrentSweep = new TreeMap<>(byTop.thenComparingDouble(Ruling::getLeft)
                                                                                  .thenComparingDouble(Ruling::getRight));
    TreeMap<Point2D, IntersectingRulings> intersections = new TreeMap<>(Y_THEN_X_POINT_COMPARATOR);

    for (SweepStep step : sweepTrajectory) {
        switch (step.type) {
            case VERTICAL: // check for intersections with currently intersected horizontal lines
                for (Ruling horizontalRuling : horizontalRulingsInCurrentSweep.navigableKeySet()) {
                    Optional<Point2D> intersectionPoint = findIntersectionPoint(horizontalRuling, step.ruling);
                    if (intersectionPoint.isEmpty()) {
                        continue;
                    }
                    intersections.put(intersectionPoint.get(), new IntersectingRulings(horizontalRuling, step.ruling));
                }
                break;
            case HORIZONTAL_ENTRY: // sweep line now intersects this horizontal ruling
                horizontalRulingsInCurrentSweep.put(step.ruling, null);
                break;
            case HORIZONTAL_EXIT: // sweep line no longer intersects this horizontal ruling
                horizontalRulingsInCurrentSweep.remove(step.ruling);
                break;
        }
    }
    log.debug("Finished building intersections with line sweep in {} ms", System.currentTimeMillis() - start);
    return intersections;
}
/**
 * Naive O(n^2) approach: tests every horizontal ruling against every vertical ruling.
 *
 * @param horizontals a list of non-overlapping horizontal rulings
 * @param verticals   a list of non-overlapping vertical rulings
 * @return a Map of each found intersection point pointing to the two lines forming the intersection.
 */
public Map<Point2D, IntersectingRulings> findNaive(List<Ruling> horizontals, List<Ruling> verticals) {

    long start = System.currentTimeMillis();
    TreeMap<Point2D, IntersectingRulings> intersections = new TreeMap<>(Y_THEN_X_POINT_COMPARATOR);

    for (Ruling horizontal : horizontals) {
        for (Ruling vertical : verticals) {
            // a later pair hitting the same point replaces the earlier entry
            findIntersectionPoint(horizontal, vertical).ifPresent(point -> intersections.put(point, new IntersectingRulings(horizontal, vertical)));
        }
    }

    log.debug("Finished building intersections naively in {} ms", System.currentTimeMillis() - start);
    return intersections;
}
// Builds the sorted event list for the sweep: an entry and an exit step per horizontal ruling
// (widened by PERPENDICULAR_UNIT_EXPAND_AMOUNT on both sides) and one step per vertical ruling.
private static List<SweepStep> buildSweepTrajectory(List<Ruling> horizontals, List<Ruling> verticals) {

    List<SweepStep> steps = new LinkedList<>();

    for (Ruling horizontal : horizontals) {
        float entryPosition = horizontal.getLeft() - PERPENDICULAR_UNIT_EXPAND_AMOUNT;
        steps.add(new SweepStep(SweepStep.Type.HORIZONTAL_ENTRY, entryPosition, horizontal));
        float exitPosition = horizontal.getRight() + PERPENDICULAR_UNIT_EXPAND_AMOUNT;
        steps.add(new SweepStep(SweepStep.Type.HORIZONTAL_EXIT, exitPosition, horizontal));
    }

    for (Ruling vertical : verticals) {
        steps.add(new SweepStep(SweepStep.Type.VERTICAL, vertical.getLeft(), vertical));
    }

    // ordering is defined by SweepStep.compareTo
    Collections.sort(steps);
    return steps;
}
/**
 * Determines whether a horizontal and a vertical ruling intersect after both have been expanded by
 * {@link #PERPENDICULAR_UNIT_EXPAND_AMOUNT}.
 *
 * @param horizontal the ruling expected to be horizontal
 * @param vertical   the ruling expected to be vertical
 * @return the point built from the vertical ruling's left and the horizontal ruling's top coordinate, or an
 * empty Optional if the rulings have the wrong orientation (a warning is logged) or do not intersect
 */
public Optional<Point2D> findIntersectionPoint(Ruling horizontal, Ruling vertical) {

    boolean orthogonalPair = horizontal.isHorizontal() && vertical.isVertical();
    if (!orthogonalPair) {
        log.warn("lines must be orthogonal, vertical and horizontal");
        return Optional.empty();
    }

    Ruling expandedHorizontal = horizontal.expand(PERPENDICULAR_UNIT_EXPAND_AMOUNT);
    Ruling expandedVertical = vertical.expand(PERPENDICULAR_UNIT_EXPAND_AMOUNT);

    if (expandedHorizontal.intersectsLine(expandedVertical)) {
        // the reported point uses the unexpanded coordinates
        return Optional.of(new Point2D.Float(vertical.getLeft(), horizontal.getTop()));
    }
    return Optional.empty();
}
private class SweepStep implements Comparable<SweepStep> {
protected Type type;
protected float y_position;
protected Ruling ruling;
private enum Type {
VERTICAL,
HORIZONTAL_EXIT,
HORIZONTAL_ENTRY
}
SweepStep(Type type, float y_position, Ruling ruling) {
this.type = type;
this.y_position = y_position;
this.ruling = ruling;
}
@Override
public int compareTo(SweepStep other) {
int rv;
if (DoubleComparisons.feq(y_position, other.y_position)) {
if (type == SweepStep.Type.VERTICAL && other.type == SweepStep.Type.HORIZONTAL_ENTRY) {
rv = 1;
} else if (type == SweepStep.Type.VERTICAL && other.type == SweepStep.Type.HORIZONTAL_EXIT) {
rv = -1;
} else if (type == SweepStep.Type.HORIZONTAL_ENTRY && other.type == SweepStep.Type.VERTICAL) {
rv = -1;
} else if (type == SweepStep.Type.HORIZONTAL_EXIT && other.type == SweepStep.Type.VERTICAL) {
rv = 1;
} else {
rv = Double.compare(y_position, other.y_position);
}
} else {
return Double.compare(y_position, other.y_position);
}
return rv;
}
}
/**
 * The pair of rulings forming one intersection point.
 *
 * @param horizontal the horizontal ruling of the intersection
 * @param vertical   the vertical ruling of the intersection
 */
public record IntersectingRulings(Ruling horizontal, Ruling vertical) {
}
}

View File

@ -4,6 +4,7 @@ import static com.knecon.fforesight.service.layoutparser.processor.utils.Geometr
import static com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators.Y_FIRST_POINT_COMPARATOR;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
@ -11,7 +12,7 @@ import java.util.List;
import java.util.Map;
import java.util.Set;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Rectangle;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
public class SpreadsheetFinder {
@ -19,15 +20,15 @@ public class SpreadsheetFinder {
private static final float AREA_TOLERANCE = 0.001f;
public static List<Rectangle> findSpreadsheetsFromCells(List<? extends Rectangle> cells) {
public static List<Rectangle2D> findSpreadsheetsFromCells(List<Cell> cells) {
// via: http://stackoverflow.com/questions/13746284/merging-multiple-adjacent-rectangles-into-one-polygon
List<Rectangle> rectangles = new ArrayList<>();
List<Rectangle2D> rectangles = new ArrayList<>();
Set<Point2D> pointSet = new HashSet<>();
Map<Point2D, Point2D> edgesH = new HashMap<>();
Map<Point2D, Point2D> edgesV = new HashMap<>();
for (Rectangle cell : cells) {
for (Point2D pt : cell.getPoints()) {
for (Cell cell : cells) {
for (Point2D pt : getPoints(cell.getBBoxInitialUserSpace())) {
if (pointSet.contains(pt)) { // shared vertex, remove it
pointSet.remove(pt);
} else {
@ -116,13 +117,22 @@ public class SpreadsheetFinder {
// do not add polygons with too many outer points as they are unlikely to be tables
if (poly.size() <= MAX_OUTER_POINT_TOLERANCE) {
rectangles.add(new Rectangle(top - AREA_TOLERANCE, left - AREA_TOLERANCE, right - left + 2 * AREA_TOLERANCE, bottom - top + 2 * AREA_TOLERANCE));
rectangles.add(new Rectangle2D.Double(left - AREA_TOLERANCE, top - AREA_TOLERANCE, right - left + (2 * AREA_TOLERANCE), bottom - top + (2 * AREA_TOLERANCE)));
}
}
return rectangles;
}
/**
 * Lists the four corner points of a rectangle.
 *
 * @param rectangle2D the rectangle whose corners are requested
 * @return an unmodifiable list in the order (x, y), (maxX, y), (maxX, maxY), (x, maxY)
 */
public static List<Point2D> getPoints(Rectangle2D rectangle2D) {

    double x = rectangle2D.getX();
    double y = rectangle2D.getY();
    double maxX = rectangle2D.getMaxX();
    double maxY = rectangle2D.getMaxY();

    Point2D first = new Point2D.Double(x, y);
    Point2D second = new Point2D.Double(maxX, y);
    Point2D third = new Point2D.Double(maxX, maxY);
    Point2D fourth = new Point2D.Double(x, maxY);
    return List.of(first, second, third, fourth);
}
private enum Direction {
HORIZONTAL,
VERTICAL

View File

@ -39,21 +39,21 @@ public class TextPositionSequenceComparator implements Comparator<TextPositionSe
}
// get the text direction adjusted coordinates
float x1 = pos1.getMinXDirAdj();
float x2 = pos2.getMinXDirAdj();
double x1 = pos1.getBBox().getX();
double x2 = pos2.getBBox().getX();
float pos1YBottom = pos1.getMaxYDirAdj();
float pos2YBottom = pos2.getMaxYDirAdj();
double pos1YBottom = pos1.getBBox().getMaxY();
double pos2YBottom = pos2.getBBox().getMaxY();
// note that the coordinates have been adjusted so 0,0 is in upper left
float pos1YTop = pos1YBottom - pos1.getTextHeightNoPadding();
float pos2YTop = pos2YBottom - pos2.getTextHeightNoPadding();
double pos1YTop = pos1YBottom - pos1.getBBox().getHeight();
double pos2YTop = pos2YBottom - pos2.getBBox().getHeight();
float yDifference = Math.abs(pos1YBottom - pos2YBottom);
double yDifference = Math.abs(pos1YBottom - pos2YBottom);
// we will do a simple tolerance comparison
if (yDifference < .1 || pos2YBottom >= pos1YTop && pos2YBottom <= pos1YBottom || pos1YBottom >= pos2YTop && pos1YBottom <= pos2YBottom) {
return Float.compare(x1, x2);
return Double.compare(x1, x2);
} else if (pos1YBottom < pos2YBottom) {
return -1;
} else {

View File

@ -0,0 +1,310 @@
package com.knecon.fforesight.service.layoutparser.processor.visualization;
import java.awt.Color;
import java.awt.geom.Line2D;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.util.Collection;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Stream;
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Line;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import com.knecon.fforesight.service.viewerdoc.ContentStreams;
import com.knecon.fforesight.service.viewerdoc.model.ColoredLine;
import com.knecon.fforesight.service.viewerdoc.model.ColoredRectangle;
import com.knecon.fforesight.service.viewerdoc.model.PlacedText;
import com.knecon.fforesight.service.viewerdoc.model.Standard14EmbeddableFont;
import com.knecon.fforesight.service.viewerdoc.model.Visualizations;
import com.knecon.fforesight.service.viewerdoc.model.VisualizationsOnPage;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.NoArgsConstructor;
import lombok.Setter;
import lombok.experimental.FieldDefaults;
@Getter
@NoArgsConstructor
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class LayoutparsingVisualizations {
static final Standard14EmbeddableFont FONT = Standard14EmbeddableFont.helvetica();
static final Color WORDS_COLOR = new Color(68, 84, 147);
static final Color LINES_COLOR = new Color(152, 45, 179);
static final Color ZONES_COLOR = new Color(131, 38, 38);
static final Color RULINGS_COLOR = new Color(21, 221, 174);
static final Color TABLE_RULINGS_COLOR = new Color(255, 175, 175);
static final Color HEADER_RULING_COLOR = new Color(171, 131, 6);
static final Color FOOTER_RULING_COLOR = new Color(106, 82, 2);
static final Color UNDERLINE_RULING_COLOR = new Color(6, 39, 171);
static final Color STRIKETROUGH_RULING_COLOR = new Color(171, 6, 6);
static final Color CELLS_COLOR = new Color(31, 214, 27);
static final Color MAIN_BODY_COLOR = new Color(171, 131, 6);
static final Color MARKED_CONTENT_COLOR = new Color(171, 131, 6);
static final List<Color> ROTATING_CHARACTER_COLOR = List.of(new Color(255, 87, 51),
new Color(255, 195, 0),
new Color(76, 175, 80),
new Color(33, 150, 243),
new Color(155, 89, 182),
new Color(233, 30, 99),
new Color(0, 188, 212),
new Color(121, 85, 72));
@Setter
boolean active;
final Visualizations words = Visualizations.builder().layer(ContentStreams.WORDS).build();
final Visualizations lines = Visualizations.builder().layer(ContentStreams.LINES).build();
final Visualizations zones = Visualizations.builder().layer(ContentStreams.ZONES).build();
final Visualizations mainBody = Visualizations.builder().layer(ContentStreams.MAIN_BODY).build();
final Visualizations clean_rulings = Visualizations.builder().layer(ContentStreams.CLEAN_RULINGS).build();
final Visualizations rulings = Visualizations.builder().layer(ContentStreams.RULINGS).build();
final Visualizations cells = Visualizations.builder().layer(ContentStreams.CELLS).build();
final Visualizations markedContent = Visualizations.builder().layer(ContentStreams.MARKED_CONTENT).build();
final Visualizations neighbours = Visualizations.builder().layer(ContentStreams.NEIGHBOURS).build();
final Visualizations characters = Visualizations.builder().layer(ContentStreams.CHARACTERS).build();
/**
 * Streams all visualization layers held by this instance.
 *
 * @return characters, neighbours, words, lines, zones, rulings, clean rulings, cells, main body and marked
 * content (in that order), or an empty stream when visualizations are not active
 */
public Stream<Visualizations> streamAll() {

    return active
            ? Stream.of(characters, neighbours, words, lines, zones, rulings, clean_rulings, cells, mainBody, markedContent)
            : Stream.empty();
}
/**
 * Adds one rectangle in {@code WORDS_COLOR} per text position sequence to the words layer of the given page.
 * Does nothing when visualizations are not active.
 *
 * @param textPositionSequences the sequences whose bounding boxes are drawn
 * @param pageNumber            the page number; the layer stores it under {@code pageNumber - 1}
 */
public void addTextVisualizations(List<TextPositionSequence> textPositionSequences, int pageNumber) {

    if (active) {
        VisualizationsOnPage wordsOnPage = getOrCreateVisualizationsOnPage(pageNumber, this.words);
        List<ColoredRectangle> wordBoxes = textPositionSequences.stream()
                .map(BoundingBox::getBBoxInitialUserSpace)
                .map(bbox -> new ColoredRectangle(bbox, WORDS_COLOR, 1))
                .toList();
        wordsOnPage.getColoredRectangles().addAll(wordBoxes);
    }
}
/**
 * Adds every ruling returned by {@code cleanRulings.buildAll()} as a line to the clean-rulings layer of the given
 * page, colored according to the ruling's classification. Does nothing when visualizations are not active.
 *
 * @param cleanRulings the cleaned rulings of the page
 * @param pageNumber   the page number; the layer stores it under {@code pageNumber - 1}
 */
public void addCleanRulingVisualization(CleanRulings cleanRulings, int pageNumber) {

    if (active) {
        VisualizationsOnPage cleanRulingsOnPage = getOrCreateVisualizationsOnPage(pageNumber, this.clean_rulings);
        List<ColoredLine> coloredLines = cleanRulings.buildAll()
                .stream()
                .map(ruling -> new ColoredLine(ruling, decideOnRulingColor(ruling), 0.5f))
                .toList();
        cleanRulingsOnPage.getColoredLines().addAll(coloredLines);
    }
}
/**
 * Adds the given rulings as lines to the rulings layer of the given page, colored according to each ruling's
 * classification. Does nothing when visualizations are not active.
 *
 * @param rulings    the rulings to draw
 * @param pageNumber the page number; the layer stores it under {@code pageNumber - 1}
 */
public void addRulingVisualization(List<Ruling> rulings, int pageNumber) {

    if (active) {
        VisualizationsOnPage rulingsOnPage = getOrCreateVisualizationsOnPage(pageNumber, this.rulings);
        List<ColoredLine> coloredLines = rulings.stream()
                .map(ruling -> new ColoredLine(ruling, decideOnRulingColor(ruling), 0.5f))
                .toList();
        rulingsOnPage.getColoredLines().addAll(coloredLines);
    }
}
// Maps a ruling's classification to its display color; classifications without a dedicated color use RULINGS_COLOR.
private Color decideOnRulingColor(Ruling ruling) {

    Color color;
    switch (ruling.getClassification()) {
        case TABLE_LINE -> color = TABLE_RULINGS_COLOR;
        case HEADER_SEPARATOR -> color = HEADER_RULING_COLOR;
        case FOOTER_SEPARATOR -> color = FOOTER_RULING_COLOR;
        case UNDERLINE -> color = UNDERLINE_RULING_COLOR;
        case STRIKETROUGH -> color = STRIKETROUGH_RULING_COLOR;
        default -> color = RULINGS_COLOR;
    }
    return color;
}
/**
 * Adds one rectangle in {@code CELLS_COLOR} per cell to the cells layer of the given page.
 * Does nothing when visualizations are not active.
 *
 * @param cells      the cells whose bounding boxes are drawn
 * @param pageNumber the page number; the layer stores it under {@code pageNumber - 1}
 */
public void addCellVisualizations(List<? extends BoundingBox> cells, int pageNumber) {

    if (active) {
        VisualizationsOnPage cellsOnPage = getOrCreateVisualizationsOnPage(pageNumber, this.cells);
        List<ColoredRectangle> cellBoxes = cells.stream()
                .map(cell -> new ColoredRectangle(cell.getBBoxInitialUserSpace(), CELLS_COLOR, 1))
                .toList();
        cellsOnPage.getColoredRectangles().addAll(cellBoxes);
    }
}
/**
 * Adds one rectangle in {@code ZONES_COLOR} per zone to the zones layer of the given page.
 * Does nothing when visualizations are not active.
 *
 * @param zones the zones whose bounding boxes are drawn
 * @param page  the page number; the layer stores it under {@code page - 1}
 */
public void addZoneVisualizations(List<Zone> zones, int page) {

    if (active) {
        VisualizationsOnPage zonesOnPage = getOrCreateVisualizationsOnPage(page, this.zones);
        List<ColoredRectangle> zoneBoxes = zones.stream()
                .map(BoundingBox::getBBoxInitialUserSpace)
                .map(bbox -> new ColoredRectangle(bbox, ZONES_COLOR, 1))
                .toList();
        zonesOnPage.getColoredRectangles().addAll(zoneBoxes);
    }
}
/**
 * Collects the lines of all given zones and adds them to the lines layer of the given page.
 * Does nothing when visualizations are not active.
 *
 * @param zones the zones whose lines are drawn
 * @param page  the page number, passed through to {@link #addLineVisualizations(List, int)}
 */
public void addLineVisualizationsFromZones(List<Zone> zones, int page) {

    // Same guard as in addLineVisualizations; checking it here avoids flattening all zone lines
    // into a list that would be thrown away immediately.
    if (!active) {
        return;
    }

    addLineVisualizations(zones.stream()
                                  .map(Zone::getLines)
                                  .flatMap(Collection::stream)
                                  .toList(), page);
}
/**
 * Adds one rectangle in {@code LINES_COLOR} per line to the lines layer of the given page.
 * Does nothing when visualizations are not active.
 *
 * @param lines the lines whose bounding boxes are drawn
 * @param page  the page number; the layer stores it under {@code page - 1}
 */
public void addLineVisualizations(List<Line> lines, int page) {

    if (active) {
        VisualizationsOnPage linesOnPage = getOrCreateVisualizationsOnPage(page, this.lines);
        List<ColoredRectangle> lineBoxes = lines.stream()
                .map(BoundingBox::getBBoxInitialUserSpace)
                .map(bbox -> new ColoredRectangle(bbox, LINES_COLOR, 0.5f))
                .toList();
        linesOnPage.getColoredRectangles().addAll(lineBoxes);
    }
}
/**
 * Adds one rectangle in {@code ZONES_COLOR} per text block to the zones layer of the given page.
 * Does nothing when visualizations are not active.
 *
 * @param textPageBlocks the text blocks whose bounding boxes are drawn
 * @param page           the page number; the layer stores it under {@code page - 1}
 */
public void addTextBlockVisualizations(List<TextPageBlock> textPageBlocks, int page) {

    if (active) {
        VisualizationsOnPage zonesOnPage = getOrCreateVisualizationsOnPage(page, zones);
        List<ColoredRectangle> blockBoxes = textPageBlocks.stream()
                .map(block -> new ColoredRectangle(block.getBBoxInitialUserSpace(), ZONES_COLOR, 1))
                .toList();
        zonesOnPage.getColoredRectangles().addAll(blockBoxes);
    }
}
/**
 * Adds the given rectangle in {@code MAIN_BODY_COLOR} to the main-body layer of the given page.
 * Does nothing when visualizations are not active.
 *
 * @param rectangle  the rectangle to draw; its top-left point, width and height are used unchanged
 * @param pageNumber the page number; the layer stores it under {@code pageNumber - 1}
 */
public void addMainBodyVisualization(Rectangle rectangle, int pageNumber) {

    if (active) {
        VisualizationsOnPage mainBodyOnPage = getOrCreateVisualizationsOnPage(pageNumber, mainBody);
        Rectangle2D mainBodyBox = new Rectangle2D.Double(rectangle.getTopLeft().getX(), rectangle.getTopLeft().getY(), rectangle.getWidth(), rectangle.getHeight());
        mainBodyOnPage.getColoredRectangles().add(new ColoredRectangle(mainBodyBox, MAIN_BODY_COLOR, 1));
    }
}
/**
 * Draws a bounding box and a numbered label for every non-empty marked-content section on the given page.
 * Does nothing when visualizations are not active.
 *
 * @param markedContents the marked-content sections to visualize
 * @param pageNumber     the page number; the layer stores it under {@code pageNumber - 1}
 */
public void addMarkedContentVisualizations(List<PDMarkedContent> markedContents, int pageNumber) {

    if (!active) {
        return;
    }

    VisualizationsOnPage markedContentOnPage = getOrCreateVisualizationsOnPage(pageNumber, markedContent);
    List<MarkedContentUtils.MarkedContentPosition> positions = MarkedContentUtils.getMarkedContentPositions(markedContents);

    for (int index = 0; index < positions.size(); index++) {
        MarkedContentUtils.MarkedContentPosition position = positions.get(index);

        var bbox = position.textPositions()
                .stream()
                .collect(RectangleTransformations.collectBBox());

        // label: formatted type followed by the running index of the section
        String type = position.formattedType() + " " + index;

        // Pushes the string to the left of the box: calculate string width, divide by font units (1000), multiply with font size (10), add small offset (6).
        float translationAmount = ((FONT.getStringWidth(type) / 100) + 6);

        Point2D labelPosition = new Point2D.Double(bbox.getX() - translationAmount, bbox.getY() + bbox.getHeight());
        markedContentOnPage.getPlacedTexts().add(PlacedText.textFacingUp(type, labelPosition, 10, Color.BLACK, FONT));
        markedContentOnPage.getColoredRectangles().add(new ColoredRectangle(bbox, MARKED_CONTENT_COLOR, 1));
    }
}
/**
 * Draws the bounding box of every character of the given zones in a rotating color and, in the same color,
 * a line from the character's box center to the box center of each of its neighbors.
 * Does nothing when visualizations are not active.
 *
 * @param zones the zones whose lines and characters are visualized
 * @param page  the page number; the layers store it under {@code page - 1}
 */
public void addCharactersWithNeighbours(List<Zone> zones, int page) {

    if (!active) {
        return;
    }

    VisualizationsOnPage characterVisualizations = getOrCreateVisualizationsOnPage(page, characters);
    VisualizationsOnPage neighbourVisualizations = getOrCreateVisualizationsOnPage(page, neighbours);

    // running counter over all characters of all zones; selects the rotating color
    int colorIndex = 0;
    for (Zone zone : zones) {
        for (Line line : zone.getLines()) {
            for (var character : line.getCharacters()) {
                Color color = ROTATING_CHARACTER_COLOR.get(colorIndex++ % ROTATING_CHARACTER_COLOR.size());
                Rectangle2D charBBox = character.getTextPosition().getBBoxInitialUserSpace();
                characterVisualizations.getColoredRectangles().add(new ColoredRectangle(charBBox, color, 1));

                Point2D charCenter = new Point2D.Double(charBBox.getCenterX(), charBBox.getCenterY());
                character.getNeighbors()
                        .forEach(neighbor -> {
                            Rectangle2D neighborBBox = neighbor.getCharacter().getTextPosition().getBBoxInitialUserSpace();
                            Point2D neighborCenter = new Point2D.Double(neighborBBox.getCenterX(), neighborBBox.getCenterY());
                            neighbourVisualizations.getColoredLines().add(new ColoredLine(new Line2D.Double(charCenter, neighborCenter), color, 1));
                        });
            }
        }
    }
}
// Returns the per-page container of the given layer, creating and registering an empty one on first access.
// Pages are stored zero-based, hence the key is page - 1.
private VisualizationsOnPage getOrCreateVisualizationsOnPage(int page, Visualizations visualizations) {

    int pageIndex = page - 1;
    var pages = visualizations.getVisualizationsOnPages();
    if (!pages.containsKey(pageIndex)) {
        pages.put(pageIndex, VisualizationsOnPage.builder().build());
    }
    return pages.get(pageIndex);
}
}

View File

@ -69,10 +69,10 @@ public class HeadlinesGoldStandardIntegrationTest {
public void testHeadlineDetection() {
List<Metrics> metrics = new ArrayList<>();
//metrics.add(getMetrics("files/syngenta/CustomerFiles/01 - CGA100251 - Acute Oral Toxicity (Up and Down Procedure) - Rat (1).pdf",
// "files/headlineTest/01 - CGA100251 - Acute Oral Toxicity (Up and Down Procedure) - Rat (1)_REDACTION_LOG.json"));
//metrics.add(getMetrics("files/syngenta/CustomerFiles/91 Trinexapac-ethyl_RAR_01_Volume_1_2018-02-23.pdf",
// "files/headlineTest/91 Trinexapac-ethyl_RAR_01_Volume_1_2018-02-23_REDACTION_LOG.json"));
metrics.add(getMetrics("files/syngenta/CustomerFiles/01 - CGA100251 - Acute Oral Toxicity (Up and Down Procedure) - Rat (1).pdf",
"files/headlineTest/01 - CGA100251 - Acute Oral Toxicity (Up and Down Procedure) - Rat (1)_REDACTION_LOG.json"));
metrics.add(getMetrics("files/syngenta/CustomerFiles/91 Trinexapac-ethyl_RAR_01_Volume_1_2018-02-23.pdf",
"files/headlineTest/91 Trinexapac-ethyl_RAR_01_Volume_1_2018-02-23_REDACTION_LOG.json"));
metrics.add(getMetrics("files/syngenta/CustomerFiles/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf", "files/headlineTest/S-Metolachlor_RAR_01_Volume_1_2018-09-06_REDACTION_LOG.json"));
double precision = metrics.stream().mapToDouble(Metrics::getPrecision).average().orElse(1.0);
@ -96,8 +96,8 @@ public class HeadlinesGoldStandardIntegrationTest {
goldStandardLog.getRedactionLogEntry().removeIf(r -> !r.isRedacted() || r.getChanges().get(r.getChanges().size() - 1).getType().equals(ChangeType.REMOVED));
goldStandardLog.getRedactionLogEntry().forEach(e -> goldStandardHeadlines.add(new Headline(e.getPositions().get(0).getPage(), e.getValue())));
Document documentGraph = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER,
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
Document documentGraph = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.DOCUMINE,
layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE,
pdfFileResource.getFile(),
new ImageServiceResponse(),
new TableServiceResponse(),

View File

@ -1,10 +1,20 @@
package com.knecon.fforesight.service.layoutparser.server;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.core.io.ClassPathResource;
import com.iqser.red.storage.commons.utils.FileSystemBackedStorageService;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
@ -20,28 +30,65 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
@Autowired
private LayoutParsingPipeline layoutParsingPipeline;
@Disabled
@Test
@SneakyThrows
public void testLayoutParserEndToEnd() {
prepareStorage("files/bdr/Wie weiter bei Kristeneinrichtungen.pdf");
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER);
LayoutParsingFinishedEvent finishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest);
Arrays.stream(finishedEvent.message().split("\n"))
.forEach(log::info);
String filePath = "files/Minimal Examples/RotateTextWithRulingsTestFile.pdf";
runForFile(filePath);
}
@Test
@Disabled
@SneakyThrows
public void testLayoutParserEndToEndWithFolder() {
String folder = "/home/kschuettler/Dokumente/TestFiles/large number of prod files";
List<Path> pdfFiles = Files.walk(Path.of(folder))
.filter(path -> path.getFileName().toString().endsWith(".pdf"))
.sorted(Comparator.comparing(Path::getFileName))
.peek(System.out::println)
.toList();
System.out.printf("Found %d pdf files to process %n", pdfFiles.size());
AtomicInteger count = new AtomicInteger(0);
pdfFiles.stream()
.peek(path -> log.info("{}/{}-{}", count.getAndIncrement(), pdfFiles.size(), path.getFileName()))
.forEach(path -> runForFile(path.toFile().toString()));
}
@Test
@SneakyThrows
public void testLayoutParserEndToEnd_RED_8747() {
private void runForFile(String filePath) {
String fileName = Path.of(filePath).getFileName().toString();
File file;
if (filePath.startsWith("files")) { // from resources
file = new ClassPathResource(filePath).getFile();
} else { // absolute path
file = new File(filePath);
}
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(fileName, LayoutParsingType.REDACT_MANAGER, true);
prepareStorage(layoutParsingRequest, file);
prepareStorage("files/syngenta/CustomerFiles/SinglePages/Page26_fRR A23317A PI0015600 CEU core part B6 - CZ.pdf");
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER_OLD);
LayoutParsingFinishedEvent finishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest);
Arrays.stream(finishedEvent.message().split("\n"))
.forEach(log::info);
File tmpFile = new File("/tmp/layoutparserEND2END/" + fileName + "_VIEWER.pdf");
assert tmpFile.getParentFile().exists() || tmpFile.getParentFile().mkdirs();
storageService.downloadTo(TENANT_ID, layoutParsingRequest.viewerDocumentStorageId(), tmpFile);
}
@AfterEach
public void cleanUpTmp() {
((FileSystemBackedStorageService) storageService).clearStorage();
}
}

View File

@ -23,6 +23,10 @@ import lombok.SneakyThrows;
public class ViewerDocumentTest extends BuildDocumentTest {
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null);
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
@Test
@SneakyThrows
public void testViewerDocument() {
@ -32,12 +36,10 @@ public class ViewerDocumentTest extends BuildDocumentTest {
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
var documentFile = new ClassPathResource(fileName).getFile();
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null);
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
long start = System.currentTimeMillis();
Document document = buildGraph(fileName, LayoutParsingType.DOCUMINE);
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true);
Document document = buildGraph(fileName, LayoutParsingType.REDACT_MANAGER);
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true, false);
System.out.printf("Total time: %.2fs%n", ((float) (System.currentTimeMillis() - start)) / 1000);
}
@ -55,17 +57,17 @@ public class ViewerDocumentTest extends BuildDocumentTest {
var tableResponse = mapper.readValue(new ClassPathResource(tableFileName).getInputStream(), TableServiceResponse.class);
var documentFile = new ClassPathResource(fileName).getFile();
var classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE,
documentFile,
new ImageServiceResponse(),
tableResponse,
new VisualLayoutParsingResponse(),
Map.of("file", Path.of(fileName).getFileName().toFile().toString()));
var classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE_OLD,
documentFile,
new ImageServiceResponse(),
tableResponse,
new VisualLayoutParsingResponse(),
Map.of("file", Path.of(fileName).getFileName().toFile().toString()));
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null);
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
Document document = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.DOCUMINE, classificationDocument);
Document document = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.DOCUMINE_OLD, classificationDocument);
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true);
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true, false);
}
}

View File

@ -0,0 +1,118 @@
package com.knecon.fforesight.service.layoutparser.server.model;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.util.Collections;
import java.util.List;
import java.util.stream.IntStream;
import org.junit.jupiter.api.Test;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
/**
 * Tests for {@link CleanRulings}: the {@code lineBetween} check and the interval look-ups
 * {@code getVerticalsInXInterval} / {@code getHorizontalsInYInterval}.
 */
class CleanRulingsTest {

    /**
     * Uses one vertical ruling at x=10 (y from 0 to 10) and one horizontal ruling at y=5 (x from 0 to 10).
     * Rectangles a, b, c are placed at y=6 and d, e, f at y=1, at x=1, 5 and 11 respectively.
     * Expects no line between a and itself or between a and b, and a line between a and each of c, d, e, f.
     */
    @Test
    public void testLineBetween() {

        List<Ruling> verticals = List.of(new Ruling(new Point2D.Double(10, 0), new Point2D.Double(10, 10)));
        List<Ruling> horizontals = List.of(new Ruling(new Point2D.Double(0, 5), new Point2D.Double(10, 5)));
        CleanRulings cleanRulings = new CleanRulings(horizontals, verticals);

        Rectangle2D a = new Rectangle2D.Double(1, 6, 3, 3);
        Rectangle2D b = new Rectangle2D.Double(5, 6, 3, 3);
        Rectangle2D c = new Rectangle2D.Double(11, 6, 3, 3);
        Rectangle2D d = new Rectangle2D.Double(1, 1, 3, 3);
        Rectangle2D e = new Rectangle2D.Double(5, 1, 3, 3);
        Rectangle2D f = new Rectangle2D.Double(11, 1, 3, 3);

        assertFalse(cleanRulings.lineBetween(a, a));
        assertFalse(cleanRulings.lineBetween(a, b));
        assertTrue(cleanRulings.lineBetween(a, c));
        assertTrue(cleanRulings.lineBetween(a, d));
        assertTrue(cleanRulings.lineBetween(a, e));
        assertTrue(cleanRulings.lineBetween(a, f));
    }


    /**
     * Interval look-ups against exactly one horizontal ruling (y=1) and one vertical ruling (x=1).
     * Covers intervals that miss the ruling, a reversed interval, a NaN bound, infinite bounds
     * and a tiny interval around the ruling's coordinate.
     */
    @Test
    public void testSingleLineInRange() {

        List<Ruling> horizontals = List.of(new Ruling(new Point2D.Float(0, 1), new Point2D.Float(100, 1)));
        List<Ruling> verticals = List.of(new Ruling(new Point2D.Float(1, 0), new Point2D.Float(1, 100)));
        CleanRulings cleanRulings = new CleanRulings(horizontals, verticals);

        assertEquals(0, cleanRulings.getVerticalsInXInterval(-2, -1).size());
        // Reversed interval (start > end) is expected to yield nothing.
        assertEquals(0, cleanRulings.getVerticalsInXInterval(-2, -5).size());
        // A NaN bound is expected to yield nothing.
        assertEquals(0, cleanRulings.getVerticalsInXInterval(-2, Float.NaN).size());
        assertEquals(1, cleanRulings.getVerticalsInXInterval(1, 10).size());
        assertEquals(0, cleanRulings.getVerticalsInXInterval(100, 101).size());
        assertEquals(verticals.size(), cleanRulings.getVerticalsInXInterval(Float.NEGATIVE_INFINITY, Float.POSITIVE_INFINITY).size());
        assertEquals(1, cleanRulings.getVerticalsInXInterval(1 - 1e-5f, 1 + 1e-5f).size());
        assertEquals(0, cleanRulings.getVerticalsInXInterval(1e-5f, 1 - 1e-5f).size());

        assertEquals(0, cleanRulings.getHorizontalsInYInterval(-2, -1).size());
        assertEquals(1, cleanRulings.getHorizontalsInYInterval(1, 10).size());
        assertEquals(0, cleanRulings.getHorizontalsInYInterval(100, 1001).size());
    }


    /**
     * Same interval look-ups as {@link #testSingleLineInRange()}, but against 101 horizontal rulings
     * (y = 0..100) and 101 vertical rulings (x = 0..100).
     */
    @Test
    public void testLinesInRange() {

        List<Ruling> horizontals = IntStream.range(0, 101)
                .mapToObj(y -> new Ruling(new Point2D.Float(0, y), new Point2D.Float(100, y)))
                .toList();
        List<Ruling> verticals = IntStream.range(0, 101)
                .mapToObj(x -> new Ruling(new Point2D.Float(x, 0), new Point2D.Float(x, 100)))
                .toList();
        CleanRulings cleanRulings = new CleanRulings(horizontals, verticals);

        assertEquals(0, cleanRulings.getVerticalsInXInterval(-2, -1).size());
        assertEquals(0, cleanRulings.getVerticalsInXInterval(-2, -5).size());
        assertEquals(0, cleanRulings.getVerticalsInXInterval(-2, Float.NaN).size());
        // Both interval ends are expected to be inclusive: x = 1..10 gives 10 rulings.
        assertEquals(10, cleanRulings.getVerticalsInXInterval(1, 10).size());
        assertEquals(1, cleanRulings.getVerticalsInXInterval(100, 101).size());
        assertEquals(verticals.size(), cleanRulings.getVerticalsInXInterval(Float.NEGATIVE_INFINITY, Float.POSITIVE_INFINITY).size());
        assertEquals(1, cleanRulings.getVerticalsInXInterval(-1e-5f, 1e-5f).size());
        assertEquals(1, cleanRulings.getVerticalsInXInterval(0, 0).size());
        assertEquals(0, cleanRulings.getVerticalsInXInterval(1e-5f, 1 - 1e-5f).size());

        assertEquals(0, cleanRulings.getHorizontalsInYInterval(-2, -1).size());
        assertEquals(10, cleanRulings.getHorizontalsInYInterval(1, 10).size());
        assertEquals(1, cleanRulings.getHorizontalsInYInterval(100, 1001).size());
    }


    /**
     * Checks that {@code getHorizontalsInYInterval} returns the same rulings as a linear filter over
     * all one million horizontals, and that it does so in less time.
     * <p>
     * Durations are taken with {@link System#nanoTime()}: the millisecond wall clock is too coarse
     * (both durations can round to the same value) and is not guaranteed to be monotonic.
     */
    @Test
    public void testLinesInRangePerformance() {

        List<Ruling> horizontals = IntStream.range(0, (int) 1e6)
                .mapToObj(y -> new Ruling(new Point2D.Float(0, y), new Point2D.Float(100, y)))
                .toList();
        CleanRulings cleanRulings = new CleanRulings(horizontals, Collections.emptyList());

        float startY = 29;
        float endY = 3000;

        long start = System.nanoTime();
        var result = cleanRulings.getHorizontalsInYInterval(startY, endY);
        long intervalLookupNanos = System.nanoTime() - start;

        start = System.nanoTime();
        var result2 = cleanRulings.getHorizontals()
                .stream()
                .filter(ruling -> ruling.getY1() >= startY && ruling.getY1() <= endY)
                .toList();
        long linearScanNanos = System.nanoTime() - start;

        assertEquals(result, result2);
        assertTrue(intervalLookupNanos < linearScanNanos,
                   "interval look-up took " + intervalLookupNanos + " ns, linear scan took " + linearScanNanos + " ns");
    }

}

View File

@ -0,0 +1,62 @@
package com.knecon.fforesight.service.layoutparser.server.model;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.util.List;
import org.junit.jupiter.api.Test;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
/**
 * Exercises {@code CleanRulings.lineBetween} with two vertical rulings and one horizontal ruling,
 * including a check that the result does not depend on argument order.
 */
public class RulingTest {

    /** Width and height shared by every probe rectangle. */
    private static final double PROBE_SIZE = 3;


    /**
     * Rulings: a vertical at x=10 (y 0..10), a vertical at x=5 (y 0..5) and a horizontal at y=5 (x 0..10).
     * Probes a, b, c sit at y=6 and d, e, f at y=1, at x=1, 5 and 11 respectively.
     */
    @Test
    public void testLineBetween() {

        Ruling longVertical = ruling(10, 0, 10, 10);
        Ruling shortVertical = ruling(5, 0, 5, 5);
        Ruling horizontal = ruling(0, 5, 10, 5);
        CleanRulings rulings = new CleanRulings(List.of(horizontal), List.of(longVertical, shortVertical));

        Rectangle2D a = probe(1, 6);
        Rectangle2D b = probe(5, 6);
        Rectangle2D c = probe(11, 6);
        Rectangle2D d = probe(1, 1);
        Rectangle2D e = probe(5, 1);
        Rectangle2D f = probe(11, 1);

        assertNoLineBetween(rulings, a, a, b);
        assertLineBetween(rulings, a, c, d, e, f);

        assertNoLineBetween(rulings, d, d);
        assertLineBetween(rulings, d, b, c, a, e, f);

        assertNoLineBetween(rulings, c, c);
        assertLineBetween(rulings, c, b, d, a, e);
        assertNoLineBetween(rulings, c, f);

        // lineBetween must give the same answer regardless of argument order.
        List<Rectangle2D> probes = List.of(a, b, c, d, e, f);
        probes.forEach(first -> probes.forEach(second -> assertEquals(rulings.lineBetween(first, second), rulings.lineBetween(second, first))));
    }


    /** Builds a ruling from (x1, y1) to (x2, y2). */
    private static Ruling ruling(double x1, double y1, double x2, double y2) {

        return new Ruling(new Point2D.Double(x1, y1), new Point2D.Double(x2, y2));
    }


    /** Builds a probe rectangle of {@link #PROBE_SIZE} x {@link #PROBE_SIZE} with its origin at (x, y). */
    private static Rectangle2D probe(double x, double y) {

        return new Rectangle2D.Double(x, y, PROBE_SIZE, PROBE_SIZE);
    }


    /** Asserts, in the given order, that a line lies between {@code from} and each of {@code others}. */
    private static void assertLineBetween(CleanRulings rulings, Rectangle2D from, Rectangle2D... others) {

        for (Rectangle2D other : others) {
            assertTrue(rulings.lineBetween(from, other));
        }
    }


    /** Asserts, in the given order, that no line lies between {@code from} and each of {@code others}. */
    private static void assertNoLineBetween(CleanRulings rulings, Rectangle2D from, Rectangle2D... others) {

        for (Rectangle2D other : others) {
            assertFalse(rulings.lineBetween(from, other));
        }
    }

}

View File

@ -52,28 +52,16 @@ public class PdfSegmentationServiceTest extends AbstractTest {
@Autowired
private ObjectMapper objectMapper;
@Autowired
private RedactManagerClassificationService redactManagerClassificationService;
@Autowired
private SectionsBuilderService sectionsBuilderService;
@SneakyThrows
public ClassificationDocument buildClassificationDocument(File originDocument, TableServiceResponse tableServiceResponse) {
ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_OLD,
originDocument,
new ImageServiceResponse(),
tableServiceResponse,
new VisualLayoutParsingResponse(),
Map.of("file","document"));
redactManagerClassificationService.classifyDocument(classificationDocument);
sectionsBuilderService.buildSections(classificationDocument);
return classificationDocument;
return layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
originDocument,
new ImageServiceResponse(),
tableServiceResponse,
new VisualLayoutParsingResponse(),
Map.of("file", "document"));
}
@ -133,7 +121,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
.get(0).getSequences().size()).isEqualTo(8);
assertThat(classificationDocument.getHeaders()
.get(0).getTextBlocks()
.get(0).toString()).isEqualTo(textToSearch);
.get(0).toString()).contains(textToSearch);
Document document = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER, classificationDocument);
@ -143,6 +131,17 @@ public class PdfSegmentationServiceTest extends AbstractTest {
}
@Test
@SneakyThrows
public void testTableAndCellRotations() {
String fileName = "files/Minimal Examples/simpleTablesRotated.pdf";
ClassPathResource pdfFileResource = new ClassPathResource(fileName);
ClassificationDocument classificationDocument = buildClassificationDocument(pdfFileResource.getFile());
}
@Disabled
@Test
public void testScanRotationBorderIsIgnored() throws IOException {
@ -157,7 +156,11 @@ public class PdfSegmentationServiceTest extends AbstractTest {
.flatMap(paragraph -> paragraph.getTables()
.stream())
.collect(Collectors.toList())).isNotEmpty();
var tables = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList();
var tables = document.getSections()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.toList();
// Quality of the table parsing is not good, because the file is rotated at scanning.
// We only assert that the table border is not the page border.
@ -179,12 +182,12 @@ public class PdfSegmentationServiceTest extends AbstractTest {
imageServiceResponse.getData()
.forEach(imageMetadata -> images.computeIfAbsent(imageMetadata.getPosition().getPageNumber(), x -> new ArrayList<>())
.add(new ClassifiedImage(new Rectangle2D.Double(imageMetadata.getPosition().getX1(),
imageMetadata.getPosition().getY1(),
imageMetadata.getGeometry().getWidth(),
imageMetadata.getGeometry().getHeight()),
ImageType.valueOf(imageMetadata.getClassification().getLabel().toUpperCase(Locale.ROOT)),
imageMetadata.isAlpha(),
imageMetadata.getPosition().getPageNumber())));
imageMetadata.getPosition().getY1(),
imageMetadata.getGeometry().getWidth(),
imageMetadata.getGeometry().getHeight()),
ImageType.valueOf(imageMetadata.getClassification().getLabel().toUpperCase(Locale.ROOT)),
imageMetadata.isAlpha(),
imageMetadata.getPosition().getPageNumber())));
System.out.println("object");
}
@ -196,11 +199,22 @@ public class PdfSegmentationServiceTest extends AbstractTest {
ClassPathResource pdfFileResource = new ClassPathResource("files/syngenta/CustomerFiles/SinglePages/Spanning Cells - Page131_S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf");
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
TablePageBlock table = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(0);
assertThat(document.getSections()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.collect(Collectors.toList())).isNotEmpty();
TablePageBlock table = document.getSections()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.toList()
.get(0);
assertThat(table.getColCount()).isEqualTo(6);
assertThat(table.getRowCount()).isEqualTo(13);
assertThat(table.getRows().stream().mapToInt(List::size).sum()).isEqualTo(6 * 13);
assertThat(table.getRows()
.stream()
.mapToInt(List::size).sum()).isEqualTo(6 * 13);
}
@ -373,29 +387,30 @@ public class PdfSegmentationServiceTest extends AbstractTest {
validateTable(document, 0, 8, 8, 0, 0);
List<List<String>> values = Arrays.asList(Arrays.asList("Annex point Reference within DAR/RAR",
"Author, date",
"Study title",
"Analytical method Author, date, No.",
"Technique, LOQ of the method, validated working range",
"Method meets analytical validation criteria",
"Remarks (in case validation criteria are not met)",
"Acceptability of the method"),
Arrays.asList("Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies"),
Arrays.asList("CA 7.1.2.1.1 DAR (2009)",
"Evans P.G. 2001 TMJ4569B, VV-323245",
"Azoxystrobin Laboratory Degradation Study in Three Soil Types, Sampled from Holland and the United Kingdom",
"Method: RAM 269 Johnson R.I., Tummon O.J., Earl M. 1995 RJ1864B, VV-377731 Johnson R.I., Tummon O.J., Earl M. 1998 RAM 269/02, VV-124072 Johnson R.I., Tummon O.J., Earl M. 2000 RAM 269/03, VV-123986 Validation: Robinson N.J. 2001 TMJ4617B, VV-895845",
"LC-MS/MS LOQ: 0.01 mg/kg (R401553 (SYN50165 7), R402173 (SYN501114 )) or 0.02mg/kg (azoxystrobin, R230310, R234886) Working range: 0.02-1.0 or 0.01-0.5 mg/kg (depending on analyte) Other supporting quantificati on methods: HPLC-UV GC-MSD",
"Y",
"N/A",
"Y"));
"Author, date",
"Study title",
"Analytical method Author, date, No.",
"Technique, LOQ of the method, validated working range",
"Method meets analytical validation criteria",
"Remarks (in case validation criteria are not met)",
"Acceptability of the method"),
Arrays.asList(
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies"),
Arrays.asList("CA 7.1.2.1.1 DAR (2009)",
"Evans P.G. 2001 TMJ4569B, VV-323245",
"Azoxystrobin Laboratory Degradation Study in Three Soil Types, Sampled from Holland and the United Kingdom",
"Method: RAM 269 Johnson R.I., Tummon O.J., Earl M. 1995 RJ1864B, VV-377731 Johnson R.I., Tummon O.J., Earl M. 1998 RAM 269/02, VV-124072 Johnson R.I., Tummon O.J., Earl M. 2000 RAM 269/03, VV-123986 Validation: Robinson N.J. 2001 TMJ4617B, VV-895845",
"LC-MS/MS LOQ: 0.01 mg/kg (R401553 (SYN50165 7), R402173 (SYN501114 )) or 0.02mg/kg (azoxystrobin, R230310, R234886) Working range: 0.02-1.0 or 0.01-0.5 mg/kg (depending on analyte) Other supporting quantificati on methods: HPLC-UV GC-MSD",
"Y",
"N/A",
"Y"));
validateTable(document, 0, values);
@ -785,6 +800,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
}
@Test
public void testMergedEntities_Page26() throws IOException {
@ -802,7 +818,11 @@ public class PdfSegmentationServiceTest extends AbstractTest {
@SneakyThrows
private void toHtml(ClassificationDocument document, String filename) {
var tables = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList();
var tables = document.getSections()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.toList();
StringBuilder sb = new StringBuilder();
int currentPage = 1;
@ -823,9 +843,19 @@ public class PdfSegmentationServiceTest extends AbstractTest {
private void validateTable(ClassificationDocument document, int tableIndex, int colCount, int rowCount, int emptyCellsCountCorrect, int emptyCellsCountIncorrect) {
TablePageBlock table = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(tableIndex);
TablePageBlock table = document.getSections()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.toList()
.get(tableIndex);
List<List<Cell>> rows = table.getRows();
int emptyCellsFoundFound = rows.stream().flatMap(List::stream).toList().stream().filter(f -> f.toString().isEmpty()).toList().size();
int emptyCellsFoundFound = rows.stream()
.flatMap(List::stream)
.toList()
.stream()
.filter(f -> f.toString().isEmpty())
.toList().size();
for (List<Cell> row : table.getRows()) {
row.forEach(r -> System.out.println(r.toString()));
@ -840,11 +870,20 @@ public class PdfSegmentationServiceTest extends AbstractTest {
private void validateTable(ClassificationDocument document, int tableIndex, List<List<String>> values) {
TablePageBlock table = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(tableIndex);
TablePageBlock table = document.getSections()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.toList()
.get(tableIndex);
List<List<Cell>> rows = table.getRows();
List<Cell> rowsFlattened = rows.stream().flatMap(List::stream).toList();
List<String> valuesFlattened = values.stream().flatMap(List::stream).toList();
List<Cell> rowsFlattened = rows.stream()
.flatMap(List::stream)
.toList();
List<String> valuesFlattened = values.stream()
.flatMap(List::stream)
.toList();
for (int i = 0; i < valuesFlattened.size(); i++) {
Cell cell = rowsFlattened.get(i);
@ -857,7 +896,11 @@ public class PdfSegmentationServiceTest extends AbstractTest {
private void validateTableSize(ClassificationDocument document, int tableSize) {
assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().size()).isEqualTo(tableSize);
assertThat(document.getSections()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.toList().size()).isEqualTo(tableSize);
}

View File

@ -28,29 +28,30 @@ class InvisibleTableDetectionServiceTest {
String fileName = "files/basf/CustomerFiles/invisible_tables_test-two-pages_ocred.pdf";
var tmpFileName = Path.of("/tmp/").resolve(Path.of(fileName).getFileName() + "_TABLE.pdf").toString();
List<PageInformation> pageContents = PageContentExtractor.getSortedPageContents(fileName).stream().map(PageInformationService::build).collect(Collectors.toList());
List<PageInformation> pageContents = PageContentExtractor.getSortedPageContents(fileName)
.stream()
.map(PageInformationService::build)
.collect(Collectors.toList());
int pageNumber = 1;
Rectangle2D tableBBox = pageContents.get(0)
.getPageContents()
.getSortedTextPositionSequences()
.subList(45, 152)
Rectangle2D tableBBox = pageContents.get(0).getPageContents().getSortedTextPositionSequences().subList(45, 152)
.stream()
.map(TextPositionSequence::getRectangle)
.map(RectangleTransformations::toRectangle2D)
.map(TextPositionSequence::getBBox)
.map(this::mirrorY)
.collect(RectangleTransformations.collectBBox());
List<TextPositionSequence> textPositionSequences = pageContents.get(0)
.getPageContents()
.getSortedTextPositionSequences()
List<TextPositionSequence> textPositionSequences = pageContents.get(0).getPageContents().getSortedTextPositionSequences()
.stream()
.filter(textPositionSequence -> tableBBox.contains(mirrorY(RectangleTransformations.toRectangle2D(textPositionSequence.getRectangle()))))
.filter(textPositionSequence -> tableBBox.contains(mirrorY(textPositionSequence.getBBox())))
.toList();
var table = InvisibleTableDetectionService.detectTable(textPositionSequences, tableBBox);
PdfDraw.drawRectanglesPerPage(fileName, List.of(table.stream().flatMap(Collection::stream).toList(), Collections.emptyList()), tmpFileName);
PdfDraw.drawRectanglesPerPage(fileName,
List.of(table.stream()
.flatMap(Collection::stream)
.toList(), Collections.emptyList()),
tmpFileName);
}

View File

@ -29,9 +29,7 @@ class PageContentExtractorTest {
textPositionPerPage.stream()
.map(t -> t.getSortedTextPositionSequences()
.stream()
.map(TextPositionSequence::getRectangle)
.map(RectangleTransformations::toRectangle2D)
//.map(textPositionSequence -> (Rectangle2D) new Rectangle2D.Double(textPositionSequence.getMaxXDirAdj(), textPositionSequence.getMaxYDirAdj(), textPositionSequence.getWidth(), textPositionSequence.getHeight()))
.map(TextPositionSequence::getBBoxInitialUserSpace)
.map(List::of)
.toList())
.toList(), tmpFileName);

View File

@ -52,8 +52,8 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
RulingCleaningService rulingCleaningService = new RulingCleaningService();
List<List<Rectangle2D>> rectanglesPerPage = new LinkedList<>();
for (PageContents pageContent : pageContents) {
CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(Collections.emptyList(), pageContent.getRulings());
List<Rectangle2D> rects = RectangularIntersectionFinder.find(cleanRulings.getHorizontal(), cleanRulings.getVertical());
CleanRulings cleanRulings = rulingCleaningService.deduplicateAndStraightenRulings(Collections.emptyList(), pageContent.getRulings());
List<Rectangle2D> rects = RectangularIntersectionFinder.find(cleanRulings.getHorizontals(), cleanRulings.getVerticals());
rectanglesPerPage.add(rects);
}
@ -72,15 +72,16 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
RulingCleaningService rulingCleaningService = new RulingCleaningService();
List<CleanRulings> cleanRulingsPerPage = new LinkedList<>();
for (PageContents pageContent : pageContents) {
cleanRulingsPerPage.add(rulingCleaningService.getCleanRulings(Collections.emptyList(), pageContent.getRulings()));
cleanRulingsPerPage.add(rulingCleaningService.deduplicateAndStraightenRulings(Collections.emptyList(), pageContent.getRulings()));
}
var cleanRulings = cleanRulingsPerPage.stream().map(CleanRulings::getVertical).collect(Collectors.toList());
var cleanRulings = cleanRulingsPerPage.stream().map(CleanRulings::getVerticals).collect(Collectors.toList());
PdfDraw.drawLinesPerPage(fileName, cleanRulings, lineFileName);
}
@Test
@Disabled
@SneakyThrows
public void testTableExtraction() {
@ -98,6 +99,7 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
}
@SneakyThrows
private void writeJsons(Path filename) {

View File

@ -0,0 +1,84 @@
package com.knecon.fforesight.service.layoutparser.server.services;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.util.Collections;
import java.util.List;
import org.junit.jupiter.api.Test;
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService;
import com.knecon.fforesight.service.layoutparser.processor.services.TextRulingsClassifier;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangularIntersectionFinder;
import lombok.SneakyThrows;
/**
 * Tests the classification of rulings as underline / strikethrough text decorations
 * by {@link TextRulingsClassifier}, using PDF fixtures from the test resources.
 */
public class RulingsClassifierTest {

    /**
     * For every page of the rotated-text fixture: each word reading "Underlined" must be flagged as
     * underlined, each word reading "Striketrough" must be flagged as strikethrough, exactly four rulings
     * of each kind must be classified, and no rulings may remain once text rulings are removed.
     */
    @Test
    @SneakyThrows
    public void textRulingExtractionTest() {

        RulingCleaningService cleaningService = new RulingCleaningService();

        for (PageContents page : loadPages("files/Minimal Examples/RotateTextWithRulingsTestFile.pdf", cleaningService)) {
            CleanRulings rulings = cleanAndClassify(cleaningService, page);

            assertTrue(page.getSortedTextPositionSequences()
                               .stream()
                               .noneMatch(word -> word.toString().equals("Underlined") && !word.isUnderline()));
            assertTrue(page.getSortedTextPositionSequences()
                               .stream()
                               .noneMatch(word -> word.toString().equals("Striketrough") && !word.isStrikethrough()));

            assertEquals(4, countClassifiedAs(rulings, Ruling.Classification.STRIKETROUGH));
            assertEquals(4, countClassifiedAs(rulings, Ruling.Classification.UNDERLINE));

            assertEquals(0, rulings.withoutTextRulings().buildAll().size());
        }
    }


    /**
     * For every page of the large-table fixture: all 30 horizontal and 144 vertical rulings must also be
     * reported by {@code getTableLines()} after the text-ruling classification has run.
     */
    @Test
    @SneakyThrows
    public void tableRulingExtractionTest() {

        RulingCleaningService cleaningService = new RulingCleaningService();

        for (PageContents page : loadPages("files/SinglePages/AbsolutelyEnormousTable.pdf", cleaningService)) {
            CleanRulings rulings = cleanAndClassify(cleaningService, page);

            assertEquals(30, rulings.getHorizontals().size());
            assertEquals(30, rulings.getTableLines().getHorizontals().size());
            assertEquals(144, rulings.getVerticals().size());
            assertEquals(144, rulings.getTableLines().getVerticals().size());
        }
    }


    /**
     * Extracts the sorted page contents of the given file.
     * The service parameter is unused here; it only documents that callers create it before extraction.
     */
    @SneakyThrows
    private static List<PageContents> loadPages(String fileName, RulingCleaningService cleaningService) {

        return PageContentExtractor.getSortedPageContents(fileName);
    }


    /**
     * Runs the steps both tests share on one page: clean the page's rulings, run the rectangular
     * intersection finder on them (its result is not used), then classify underline and strikethrough text.
     *
     * @return the cleaned rulings after classification
     */
    @SneakyThrows
    private static CleanRulings cleanAndClassify(RulingCleaningService cleaningService, PageContents page) {

        CleanRulings rulings = cleaningService.deduplicateAndStraightenRulings(Collections.emptyList(), page.getRulings());
        RectangularIntersectionFinder.find(rulings.getHorizontals(), rulings.getVerticals());
        TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(page.getSortedTextPositionSequences(), rulings);
        return rulings;
    }


    /** Counts the rulings returned by {@code buildAll()} whose classification equals {@code classification}. */
    private static long countClassifiedAs(CleanRulings rulings, Ruling.Classification classification) {

        return rulings.buildAll()
                .stream()
                .filter(ruling -> ruling.getClassification().equals(classification))
                .count();
    }

}

View File

@ -1,6 +1,9 @@
package com.knecon.fforesight.service.layoutparser.server.utils;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.nio.file.Path;
import java.util.Map;
import java.util.Optional;
@ -102,29 +105,22 @@ public abstract class AbstractTest {
}
/**
 * Stores the given stream under {@code ORIGIN_FILE_ID} for the current tenant and returns a
 * default layout parsing request of type {@code REDACT_MANAGER_OLD}.
 *
 * @param fileInputStream content of the origin file to store
 * @return the default request built for the stored file
 */
@SneakyThrows
protected LayoutParsingRequest prepareStorage(InputStream fileInputStream) {

    var tenantId = TenantContext.getTenantId();
    storageService.storeObject(tenantId, ORIGIN_FILE_ID, fileInputStream);

    LayoutParsingRequest defaultRequest = buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER_OLD);
    return defaultRequest;
}
protected LayoutParsingRequest buildDefaultLayoutParsingRequest(LayoutParsingType layoutParsingType) {
protected LayoutParsingRequest buildDefaultLayoutParsingRequest(String fileName, LayoutParsingType layoutParsingType, boolean debug) {
var identifier = debug ? Map.of("fileId", fileName, "debug", "true") : Map.of("fileId", fileName);
return LayoutParsingRequest.builder()
.identifier(Map.of("fileId", "1337"))
.identifier(identifier)
.layoutParsingType(layoutParsingType)
.originFileStorageId(ORIGIN_FILE_ID)
.tablesFileStorageId(Optional.of(TABLE_FILE_ID))
.imagesFileStorageId(Optional.of(IMAGE_FILE_ID))
.visualLayoutParsingFileId(Optional.of(VISUAL_LAYOUT_FILE))
.structureFileStorageId(STRUCTURE_FILE_ID)
.textBlockFileStorageId(TEXT_FILE_ID)
.positionBlockFileStorageId(POSITION_FILE_ID)
.pageFileStorageId(PAGES_FILE_ID)
.simplifiedTextStorageId(SIMPLIFIED_ID)
.viewerDocumentStorageId(VIEWER_DOCUMENT_ID)
.originFileStorageId(fileName + ORIGIN_FILE_ID)
.tablesFileStorageId(Optional.of(fileName + TABLE_FILE_ID))
.imagesFileStorageId(Optional.of(fileName + IMAGE_FILE_ID))
.visualLayoutParsingFileId(Optional.empty())
.structureFileStorageId(fileName + STRUCTURE_FILE_ID)
.textBlockFileStorageId(fileName + TEXT_FILE_ID)
.positionBlockFileStorageId(fileName + POSITION_FILE_ID)
.pageFileStorageId(fileName + PAGES_FILE_ID)
.simplifiedTextStorageId(fileName + SIMPLIFIED_ID)
.viewerDocumentStorageId(fileName + VIEWER_DOCUMENT_ID)
.build();
}
@ -148,10 +144,28 @@ public abstract class AbstractTest {
ClassPathResource imageInfoFileResource = new ClassPathResource(imageInfoFile);
ClassPathResource visualLayoutParsingResponseResource = new ClassPathResource(visualLayoutParsingResponseFile);
return prepareStorage(pdfFileResource.getInputStream(),
cvServiceResponseFileResource.getInputStream(),
imageInfoFileResource.getInputStream(),
visualLayoutParsingResponseResource.getInputStream());
return prepareStorage(Path.of(file).getFileName().toString(),
pdfFileResource.getInputStream(),
cvServiceResponseFileResource.getInputStream(),
imageInfoFileResource.getInputStream(),
visualLayoutParsingResponseResource.getInputStream());
}
/**
 * Stores the given PDF file, together with the bundled empty table, image and visual layout
 * responses, under the storage ids carried by the request.
 *
 * <p>All four streams are opened in a single try-with-resources so each of them is closed when
 * the call returns or fails. Previously only the PDF stream was closed; the three class path
 * streams were left open, and the visual layout stream is not even consumed when the request
 * has no visual layout id.
 *
 * @param layoutParsingRequest request whose storage ids determine where the objects are stored
 * @param file                 the PDF file to store as the origin file
 */
@SneakyThrows
protected void prepareStorage(LayoutParsingRequest layoutParsingRequest, File file) {

    // Closing right after the delegate returns relies on the streams being consumed during the
    // call; the original code already made that assumption for the PDF stream.
    try (var in = new FileInputStream(file);
            var cvServiceResponseStream = new ClassPathResource("cv_table_parsing_response/empty.json").getInputStream();
            var imageInfoStream = new ClassPathResource("image_service_response/empty.json").getInputStream();
            var visualLayoutParsingResponseStream = new ClassPathResource("visual_layout_parsing_response/empty.json").getInputStream()) {
        prepareStorage(layoutParsingRequest, in, cvServiceResponseStream, imageInfoStream, visualLayoutParsingResponseStream);
    }
}
@ -162,12 +176,29 @@ public abstract class AbstractTest {
storageService.storeObject(TenantContext.getTenantId(), TABLE_FILE_ID, cvServiceResponseFileStream);
storageService.storeObject(TenantContext.getTenantId(), ORIGIN_FILE_ID, fileStream);
return buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER_OLD);
return buildDefaultLayoutParsingRequest("test", LayoutParsingType.REDACT_MANAGER_OLD, true);
}
@SneakyThrows
protected LayoutParsingRequest prepareStorage(InputStream fileStream,
protected void prepareStorage(LayoutParsingRequest layoutParsingRequest,
        InputStream fileStream,
        InputStream cvServiceResponseFileStream,
        InputStream imageInfoStream,
        InputStream visualLayoutParsingResponseFileStream) {

    // Stores each stream under the id the request carries for it, in the order
    // image info -> table response -> origin file -> (optional) visual layout response.
    // The image and table ids are mandatory here: an empty Optional raises NoSuchElementException.
    storageService.storeObject(TenantContext.getTenantId(),
            layoutParsingRequest.imagesFileStorageId().orElseThrow(),
            imageInfoStream);
    storageService.storeObject(TenantContext.getTenantId(),
            layoutParsingRequest.tablesFileStorageId().orElseThrow(),
            cvServiceResponseFileStream);
    storageService.storeObject(TenantContext.getTenantId(),
            layoutParsingRequest.originFileStorageId(),
            fileStream);

    // The visual layout response is only stored when the request names a storage id for it.
    if (layoutParsingRequest.visualLayoutParsingFileId().isEmpty()) {
        return;
    }
    storageService.storeObject(TenantContext.getTenantId(),
            layoutParsingRequest.visualLayoutParsingFileId().orElseThrow(),
            visualLayoutParsingResponseFileStream);
}
@SneakyThrows
protected LayoutParsingRequest prepareStorage(String fileName,
InputStream fileStream,
InputStream cvServiceResponseFileStream,
InputStream imageInfoStream,
InputStream visualLayoutParsingResponseFileStream) {
@ -177,7 +208,7 @@ public abstract class AbstractTest {
storageService.storeObject(TenantContext.getTenantId(), ORIGIN_FILE_ID, fileStream);
storageService.storeObject(TenantContext.getTenantId(), VISUAL_LAYOUT_FILE, visualLayoutParsingResponseFileStream);
return buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER_OLD);
return buildDefaultLayoutParsingRequest(fileName, LayoutParsingType.REDACT_MANAGER_OLD, true);
}

View File

@ -1,11 +1,13 @@
package com.knecon.fforesight.service.layoutparser.server.utils;
import java.io.File;
import java.nio.file.Path;
import java.util.Map;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.core.io.ClassPathResource;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
@ -28,11 +30,11 @@ public abstract class BuildDocumentTest extends AbstractTest {
File fileResource = new ClassPathResource(filename).getFile();
prepareStorage(filename);
return layoutParsingPipeline.parseLayout(layoutParsingType,
fileResource,
layoutParsingStorageService.getImagesFile(IMAGE_FILE_ID),
new TableServiceResponse(),
new VisualLayoutParsingResponse(),
Map.of("file",filename));
fileResource,
layoutParsingStorageService.getImagesFile(IMAGE_FILE_ID),
new TableServiceResponse(),
new VisualLayoutParsingResponse(),
Map.of("file", filename, "debug", "true"));
}
@ -46,13 +48,25 @@ public abstract class BuildDocumentTest extends AbstractTest {
@SneakyThrows
protected Document buildGraph(String filename, LayoutParsingType layoutParsingType) {
if (filename.equals("files/syngenta/CustomerFiles/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf")) {
prepareStorage(filename, "cv_table_parsing_response/empty.json", "image_service_response/S-Metolachlor_RAR_01_Volume_1_2018-09-06.IMAGE_INFO.json");
if (!filename.startsWith("files") && filename.startsWith("/")) {
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(Path.of(filename).getFileName().toString(), LayoutParsingType.REDACT_MANAGER, true);
prepareStorage(layoutParsingRequest, new File(filename));
return DocumentGraphFactory.buildDocumentGraph(layoutParsingType,
layoutParsingPipeline.parseLayout(layoutParsingType,
new File(filename),
layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId().get()),
new TableServiceResponse(),
new VisualLayoutParsingResponse(),
layoutParsingRequest.identifier()));
} else {
prepareStorage(filename);
if (filename.equals("files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf")) {
prepareStorage(filename, "cv_table_parsing_response/empty.json", "image_service_response/S-Metolachlor_RAR_01_Volume_1_2018-09-06.IMAGE_INFO.json");
} else {
prepareStorage(filename);
}
return DocumentGraphFactory.buildDocumentGraph(layoutParsingType, parseLayout(filename, layoutParsingType));
}
return DocumentGraphFactory.buildDocumentGraph(layoutParsingType, parseLayout(filename, layoutParsingType));
}
}

View File

@ -26,6 +26,26 @@ public class ContentStreams {
// NOTE(review): the display name passed here is "escape start" although the COS name is
// ESCAPE_END — this looks like a copy-paste slip; confirm whether any consumer relies on the
// name before correcting it.
public static Identifier ESCAPE_END = new Identifier("escape start", COSName.getPDFName("ESCAPE_END"), false);
// The identifiers below all pass true as the third constructor argument (optionalContent).
// NOTE(review): like the field above they are public, static and non-final, so they can be
// reassigned from anywhere — confirm whether they are meant to be constants.
public static Identifier CLEAN_RULINGS = new Identifier("Cleaned Rulings", COSName.getPDFName("KNECON_CLEAN_RULINGS"), true);
public static Identifier RULINGS = new Identifier("Rulings", COSName.getPDFName("KNECON_RULINGS"), true);
public static Identifier WORDS = new Identifier("Words", COSName.getPDFName("KNECON_WORDS"), true);
public static Identifier ZONES = new Identifier("Text Zones", COSName.getPDFName("KNECON_ZONES"), true);
public static Identifier LINES = new Identifier("Text Lines", COSName.getPDFName("KNECON_LINES"), true);
public static Identifier CELLS = new Identifier("Cells", COSName.getPDFName("KNECON_CELLS"), true);
public static Identifier MAIN_BODY = new Identifier("Main Text Body", COSName.getPDFName("KNECON_MAIN_BODY"), true);
public static Identifier MARKED_CONTENT = new Identifier("Marked content", COSName.getPDFName("KNECON_MARKED_CONTENT"), true);
public static Identifier NEIGHBOURS = new Identifier("Neighbours", COSName.getPDFName("KNECON_NEIGHBOURS"), true);
public static Identifier CHARACTERS = new Identifier("Characters", COSName.getPDFName("KNECON_CHARACTERS"), true);
public static List<Identifier> allContentStreams = List.of(KNECON_LAYOUT,
KNECON_VISUAL_PARSING,
KNECON_OCR,
@ -33,7 +53,17 @@ public class ContentStreams {
KNECON_OCR_TEXT_DEBUG,
OTHER,
ESCAPE_START,
ESCAPE_END);
ESCAPE_END,
RULINGS,
CLEAN_RULINGS,
WORDS,
ZONES,
LINES,
MAIN_BODY,
MARKED_CONTENT,
NEIGHBOURS,
CHARACTERS,
CELLS);
public record Identifier(String name, COSName cosName, boolean optionalContent) {

View File

@ -1,5 +1,6 @@
package com.knecon.fforesight.service.viewerdoc.model;
import java.util.LinkedHashMap;
import java.util.Map;
import com.knecon.fforesight.service.viewerdoc.ContentStreams;
@ -17,7 +18,8 @@ import lombok.experimental.FieldDefaults;
public class Visualizations {
ContentStreams.Identifier layer;
Map<Integer, VisualizationsOnPage> visualizationsOnPages;
@Builder.Default
Map<Integer, VisualizationsOnPage> visualizationsOnPages = new LinkedHashMap<>();
boolean layerVisibilityDefaultValue;
}

View File

@ -53,12 +53,6 @@ public class ViewerDocumentService {
private final ObservationRegistry registry;
/**
 * Single-layer convenience overload: wraps the given visualizations in a one-element list and
 * delegates to the list-based overload.
 *
 * @param originFile      the file the visualizations are added to
 * @param destinationFile the file the result is written to
 * @param visualizations  the single layer of visualizations to add; must not be {@code null}
 */
public void addVisualizationsOnPage(File originFile, File destinationFile, Visualizations visualizations) {

    List<Visualizations> singleLayer = List.of(visualizations);
    addVisualizationsOnPage(originFile, destinationFile, singleLayer);
}
@Observed(name = "ViewerDocumentService", contextualName = "add-visualizations")
@SneakyThrows
public void addVisualizationsOnPage(File originFile, File destinationFile, List<Visualizations> visualizations) {
@ -70,9 +64,14 @@ public class ViewerDocumentService {
PDDocument pdDocument = openPDDocument(tmpFile.toFile());
enrichObservation(pdDocument, visualizations.stream().map(Visualizations::getLayer).toList());
enrichObservation(pdDocument,
visualizations.stream()
.map(Visualizations::getLayer)
.toList());
Set<ContentStreams.Identifier> allLayers = visualizations.stream().map(Visualizations::getLayer).collect(Collectors.toUnmodifiableSet());
Set<ContentStreams.Identifier> allLayers = visualizations.stream()
.map(Visualizations::getLayer)
.collect(Collectors.toUnmodifiableSet());
Map<ContentStreams.Identifier, PDOptionalContentGroup> optionalContentGroupMap = addLayersToDocument(visualizations, pdDocument);
@ -229,11 +228,11 @@ public class ViewerDocumentService {
Matrix textMatrix;
if (placedText.textMatrix().isEmpty()) {
textMatrix = new Matrix((float) textDeRotationMatrix.getScaleX(),
(float) textDeRotationMatrix.getShearX(),
(float) textDeRotationMatrix.getShearY(),
(float) textDeRotationMatrix.getScaleY(),
(float) placedText.lineStart().getX(),
(float) placedText.lineStart().getY());
(float) textDeRotationMatrix.getShearX(),
(float) textDeRotationMatrix.getShearY(),
(float) textDeRotationMatrix.getScaleY(),
(float) placedText.lineStart().getX(),
(float) placedText.lineStart().getY());
} else {
textMatrix = placedText.textMatrix().get();
}

View File

@ -12,4 +12,4 @@ commit_hash=$(git rev-parse --short=5 HEAD)
buildName="${USER}-${branch}-${commit_hash}"
gradle bootBuildImage --publishImage -PbuildbootDockerHostNetwork=true -Pversion=$buildName --no-build-cache
echo "nexus.knecon.com:5001/ff/${dir}-service-server:$buildName"
echo "nexus.knecon.com:5001/ff/layoutparser-service-server:$buildName"