Merge branch 'RED-7141' into 'main'

RED-7141: Implemented docstrum layout parsing

See merge request fforesight/layout-parser!108
This commit is contained in:
Dominique Eifländer 2024-03-08 14:27:59 +01:00
commit 0b4ad29dcb
63 changed files with 2170 additions and 901 deletions

View File

@ -55,6 +55,13 @@ public class DocumentStructure implements Serializable {
} }
@Schema(description = "Object containing the extra field names, a duplicate paragraph has in its properties field.")
public static class DuplicateParagraphProperties implements Serializable {
public static final String UNSORTED_TEXTBLOCK_ID = "utbid";
}
public static final String RECTANGLE_DELIMITER = ";"; public static final String RECTANGLE_DELIMITER = ";";

View File

@ -3,6 +3,5 @@ package com.knecon.fforesight.service.layoutparser.internal.api.queue;
public class LayoutParsingQueueNames { public class LayoutParsingQueueNames {
public static final String LAYOUT_PARSING_REQUEST_QUEUE = "layout_parsing_request_queue"; public static final String LAYOUT_PARSING_REQUEST_QUEUE = "layout_parsing_request_queue";
public static final String LAYOUT_PARSING_DLQ = "layout_parsing_dead_letter_queue";
public static final String LAYOUT_PARSING_FINISHED_EVENT_QUEUE = "layout_parsing_response_queue"; public static final String LAYOUT_PARSING_FINISHED_EVENT_QUEUE = "layout_parsing_response_queue";
} }

View File

@ -2,6 +2,9 @@ package com.knecon.fforesight.service.layoutparser.internal.api.queue;
public enum LayoutParsingType { public enum LayoutParsingType {
REDACT_MANAGER, REDACT_MANAGER,
TAAS, REDACT_MANAGER_OLD,
DOCUMINE REDACT_MANAGER_PARAGRAPH_DEBUG,
DOCUMINE,
CLARIFYND,
CLARIFYND_PARAGRAPH_DEBUG
} }

View File

@ -24,4 +24,5 @@ dependencies {
implementation("com.fasterxml.jackson.module:jackson-module-afterburner:${jacksonVersion}") implementation("com.fasterxml.jackson.module:jackson-module-afterburner:${jacksonVersion}")
implementation("com.fasterxml.jackson.datatype:jackson-datatype-jsr310:${jacksonVersion}") implementation("com.fasterxml.jackson.datatype:jackson-datatype-jsr310:${jacksonVersion}")
implementation("org.springframework.boot:spring-boot-starter-web:3.1.3") implementation("org.springframework.boot:spring-boot-starter-web:3.1.3")
implementation("org.jgrapht:jgrapht-core:1.5.2")
} }

View File

@ -28,6 +28,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage; import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
@ -43,12 +44,11 @@ import com.knecon.fforesight.service.layoutparser.processor.services.RulingClean
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService; import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
import com.knecon.fforesight.service.layoutparser.processor.services.SimplifiedSectionTextService; import com.knecon.fforesight.service.layoutparser.processor.services.SimplifiedSectionTextService;
import com.knecon.fforesight.service.layoutparser.processor.services.TableExtractionService; import com.knecon.fforesight.service.layoutparser.processor.services.TableExtractionService;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocstrumBlockificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocuMineBlockificationService; import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocuMineBlockificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.RedactManagerBlockificationService; import com.knecon.fforesight.service.layoutparser.processor.services.blockification.RedactManagerBlockificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.TaasBlockificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.classification.DocuMineClassificationService; import com.knecon.fforesight.service.layoutparser.processor.services.classification.DocuMineClassificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService; import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.classification.TaasClassificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory; import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper; import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper;
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.TaasDocumentDataMapper; import com.knecon.fforesight.service.layoutparser.processor.services.mapper.TaasDocumentDataMapper;
@ -76,16 +76,15 @@ public class LayoutParsingPipeline {
CvTableParsingAdapter cvTableParsingAdapter; CvTableParsingAdapter cvTableParsingAdapter;
LayoutParsingStorageService layoutParsingStorageService; LayoutParsingStorageService layoutParsingStorageService;
SectionsBuilderService sectionsBuilderService; SectionsBuilderService sectionsBuilderService;
TaasClassificationService taasClassificationService;
RedactManagerClassificationService redactManagerClassificationService; RedactManagerClassificationService redactManagerClassificationService;
DocuMineClassificationService docuMineClassificationService; DocuMineClassificationService docuMineClassificationService;
SimplifiedSectionTextService simplifiedSectionTextService; SimplifiedSectionTextService simplifiedSectionTextService;
BodyTextFrameService bodyTextFrameService; BodyTextFrameService bodyTextFrameService;
RulingCleaningService rulingCleaningService; RulingCleaningService rulingCleaningService;
TableExtractionService tableExtractionService; TableExtractionService tableExtractionService;
TaasBlockificationService taasBlockificationService;
DocuMineBlockificationService docuMineBlockificationService; DocuMineBlockificationService docuMineBlockificationService;
RedactManagerBlockificationService redactManagerBlockificationService; RedactManagerBlockificationService redactManagerBlockificationService;
DocstrumBlockificationService docstrumBlockificationService;
LayoutGridService layoutGridService; LayoutGridService layoutGridService;
ObservationRegistry observationRegistry; ObservationRegistry observationRegistry;
VisualLayoutParsingAdapter visualLayoutParsingAdapter; VisualLayoutParsingAdapter visualLayoutParsingAdapter;
@ -97,28 +96,21 @@ public class LayoutParsingPipeline {
log.info("Starting layout parsing for {}", layoutParsingRequest.identifier()); log.info("Starting layout parsing for {}", layoutParsingRequest.identifier());
File originFile = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId()); File originFile = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId());
File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId()) File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId()).orElse(originFile);
.orElse(originFile);
VisualLayoutParsingResponse visualLayoutParsingResponse = new VisualLayoutParsingResponse(); VisualLayoutParsingResponse visualLayoutParsingResponse = new VisualLayoutParsingResponse();
if (layoutParsingRequest.visualLayoutParsingFileId() if (layoutParsingRequest.visualLayoutParsingFileId().isPresent()) {
.isPresent()) { visualLayoutParsingResponse = layoutParsingStorageService.getVisualLayoutParsingFile(layoutParsingRequest.visualLayoutParsingFileId().get());
visualLayoutParsingResponse = layoutParsingStorageService.getVisualLayoutParsingFile(layoutParsingRequest.visualLayoutParsingFileId()
.get());
} }
ImageServiceResponse imageServiceResponse = new ImageServiceResponse(); ImageServiceResponse imageServiceResponse = new ImageServiceResponse();
if (layoutParsingRequest.imagesFileStorageId() if (layoutParsingRequest.imagesFileStorageId().isPresent()) {
.isPresent()) { imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId().get());
imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId()
.get());
} }
TableServiceResponse tableServiceResponse = new TableServiceResponse(); TableServiceResponse tableServiceResponse = new TableServiceResponse();
if (layoutParsingRequest.tablesFileStorageId() if (layoutParsingRequest.tablesFileStorageId().isPresent()) {
.isPresent()) { tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId().get());
tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId()
.get());
} }
ClassificationDocument classificationDocument = parseLayout(layoutParsingRequest.layoutParsingType(), ClassificationDocument classificationDocument = parseLayout(layoutParsingRequest.layoutParsingType(),
@ -130,7 +122,7 @@ public class LayoutParsingPipeline {
log.info("Building document graph for {}", layoutParsingRequest.identifier()); log.info("Building document graph for {}", layoutParsingRequest.identifier());
Document documentGraph = observeBuildDocumentGraph(classificationDocument); Document documentGraph = observeBuildDocumentGraph(layoutParsingRequest.layoutParsingType(), classificationDocument);
log.info("Creating viewer document for {}", layoutParsingRequest.identifier()); log.info("Creating viewer document for {}", layoutParsingRequest.identifier());
@ -142,7 +134,7 @@ public class LayoutParsingPipeline {
layoutParsingStorageService.storeSimplifiedText(layoutParsingRequest, simplifiedSectionTextService.toSimplifiedText(documentGraph)); layoutParsingStorageService.storeSimplifiedText(layoutParsingRequest, simplifiedSectionTextService.toSimplifiedText(documentGraph));
layoutParsingStorageService.storeViewerDocument(layoutParsingRequest, viewerDocumentFile); layoutParsingStorageService.storeViewerDocument(layoutParsingRequest, viewerDocumentFile);
if (layoutParsingRequest.layoutParsingType().equals(LayoutParsingType.TAAS)) { if (layoutParsingRequest.layoutParsingType().equals(LayoutParsingType.CLARIFYND)) {
log.info("Building research document data for {}", layoutParsingRequest.identifier()); log.info("Building research document data for {}", layoutParsingRequest.identifier());
var researchDocumentData = TaasDocumentDataMapper.fromDocument(documentGraph); var researchDocumentData = TaasDocumentDataMapper.fromDocument(documentGraph);
layoutParsingStorageService.storeResearchDocumentData(layoutParsingRequest, researchDocumentData); layoutParsingStorageService.storeResearchDocumentData(layoutParsingRequest, researchDocumentData);
@ -182,13 +174,13 @@ public class LayoutParsingPipeline {
} }
private Document observeBuildDocumentGraph(ClassificationDocument classificationDocument) { private Document observeBuildDocumentGraph(LayoutParsingType layoutParsingType, ClassificationDocument classificationDocument) {
AtomicReference<Document> documentReference = new AtomicReference<>(); AtomicReference<Document> documentReference = new AtomicReference<>();
Observation.createNotStarted("LayoutParsingPipeline", observationRegistry) Observation.createNotStarted("LayoutParsingPipeline", observationRegistry)
.contextualName("build-document-graph") .contextualName("build-document-graph")
.observe(() -> documentReference.set(DocumentGraphFactory.buildDocumentGraph(classificationDocument))); .observe(() -> documentReference.set(DocumentGraphFactory.buildDocumentGraph(layoutParsingType, classificationDocument)));
return documentReference.get(); return documentReference.get();
} }
@ -260,11 +252,16 @@ public class LayoutParsingPipeline {
PDRectangle cropbox = pdPage.getCropBox(); PDRectangle cropbox = pdPage.getCropBox();
CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(pdfTableCells.get(pageNumber), stripper.getRulings()); CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(pdfTableCells.get(pageNumber), stripper.getRulings());
List<Cell> emptyTableCells = TableExtractionService.findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical());
ClassificationPage classificationPage = switch (layoutParsingType) { ClassificationPage classificationPage = switch (layoutParsingType) {
case REDACT_MANAGER -> redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical()); case REDACT_MANAGER_OLD ->
case TAAS -> taasBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical()); redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
case DOCUMINE -> docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical()); case DOCUMINE -> docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
case REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, true);
case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, false);
}; };
classificationPage.setCleanRulings(cleanRulings); classificationPage.setCleanRulings(cleanRulings);
classificationPage.setRotation(rotation); classificationPage.setRotation(rotation);
classificationPage.setLandscape(isLandscape); classificationPage.setLandscape(isLandscape);
@ -289,7 +286,13 @@ public class LayoutParsingPipeline {
} }
} }
tableExtractionService.extractTables(cleanRulings, classificationPage); tableExtractionService.extractTables(emptyTableCells, classificationPage);
if (layoutParsingType == LayoutParsingType.REDACT_MANAGER) {
docstrumBlockificationService.combineBlocks(classificationPage);
} else if (layoutParsingType == LayoutParsingType.CLARIFYND) {
docstrumBlockificationService.mergeIntersectingBlocks(classificationPage.getTextBlocks());
}
buildPageStatistics(classificationPage); buildPageStatistics(classificationPage);
increaseDocumentStatistics(classificationPage, classificationDocument); increaseDocumentStatistics(classificationPage, classificationDocument);
@ -303,14 +306,21 @@ public class LayoutParsingPipeline {
bodyTextFrameService.setBodyTextFrames(classificationDocument, layoutParsingType); bodyTextFrameService.setBodyTextFrames(classificationDocument, layoutParsingType);
log.info("Classify TextBlocks for {}", identifier); log.info("Classify TextBlocks for {}", identifier);
switch (layoutParsingType) { switch (layoutParsingType) {
case TAAS -> taasClassificationService.classifyDocument(classificationDocument); case REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_OLD, CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG ->
redactManagerClassificationService.classifyDocument(classificationDocument);
case DOCUMINE -> docuMineClassificationService.classifyDocument(classificationDocument); case DOCUMINE -> docuMineClassificationService.classifyDocument(classificationDocument);
case REDACT_MANAGER -> redactManagerClassificationService.classifyDocument(classificationDocument);
} }
log.info("Building Sections for {}", identifier); log.info("Building Sections for {}", identifier);
switch (layoutParsingType) {
case CLARIFYND_PARAGRAPH_DEBUG, REDACT_MANAGER_PARAGRAPH_DEBUG -> sectionsBuilderService.buildParagraphDebugSections(classificationDocument);
default -> {
sectionsBuilderService.buildSections(classificationDocument); sectionsBuilderService.buildSections(classificationDocument);
sectionsBuilderService.addImagesToSections(classificationDocument); sectionsBuilderService.addImagesToSections(classificationDocument);
}
}
return classificationDocument; return classificationDocument;
} }

View File

@ -0,0 +1,59 @@
package com.knecon.fforesight.service.layoutparser.processor.docstrum;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Character;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.LineBuilderService;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.NearestNeighbourService;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.ReadingOrderService;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.SpacingService;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.ZoneBuilderService;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import lombok.RequiredArgsConstructor;
@Service
@RequiredArgsConstructor
public class DocstrumSegmentationService {
private final NearestNeighbourService nearestNeighbourService;
private final SpacingService spacingService;
private final LineBuilderService lineBuilderService;
private final ZoneBuilderService zoneBuilderService;
private final ReadingOrderService readingOrderService;
public List<Zone> segmentPage(List<TextPositionSequence> textPositions, boolean xyOrder) {
List<Zone> zones = new ArrayList<>();
zones.addAll(computeZones(textPositions, TextDirection.ZERO));
zones.addAll(computeZones(textPositions, TextDirection.QUARTER_CIRCLE));
zones.addAll(computeZones(textPositions, TextDirection.HALF_CIRCLE));
zones.addAll(computeZones(textPositions, TextDirection.THREE_QUARTER_CIRCLE));
return readingOrderService.resolve(zones, xyOrder);
}
private List<Zone> computeZones(List<TextPositionSequence> textPositions, TextDirection direction) {
var positions = textPositions.stream().filter(t -> t.getDir() == direction).map(TextPositionSequence::getTextPositions).flatMap(List::stream).toList();
var characters = positions.stream().map(Character::new).collect(Collectors.toList());
nearestNeighbourService.findNearestNeighbors(characters);
var characterSpacing = spacingService.computeCharacterSpacing(characters);
var lineSpacing = Math.min(spacingService.computeLineSpacing(characters), 20);
var lines = lineBuilderService.buildLines(characters, characterSpacing, lineSpacing);
return zoneBuilderService.buildZones(lines, characterSpacing, lineSpacing);
}
}

View File

@ -0,0 +1,25 @@
package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;
/**
 * Accepts neighbors whose angle lies in the half-open range [lowerAngle, upperAngle).
 * A lower bound below -PI/2 is shifted up by PI and an upper bound of PI/2 or more is
 * shifted down by PI; if the lower bound then exceeds the upper bound, the range wraps around.
 */
public class AngleFilter {

    protected double lowerAngle;
    protected double upperAngle;


    public AngleFilter(double lowerAngle, double upperAngle) {

        if (lowerAngle < -Math.PI / 2) {
            lowerAngle = lowerAngle + Math.PI;
        }
        if (upperAngle >= Math.PI / 2) {
            upperAngle = upperAngle - Math.PI;
        }
        this.lowerAngle = lowerAngle;
        this.upperAngle = upperAngle;
    }


    /**
     * @param neighbor the neighbor whose angle is tested
     * @return {@code true} if the neighbor's angle lies inside the configured range
     */
    public boolean matches(Neighbor neighbor) {

        double angle = neighbor.getAngle();
        boolean atOrAboveLower = lowerAngle <= angle;
        boolean belowUpper = angle < upperAngle;

        // Regular range: both bounds must hold. Wrapped range: one bound is enough.
        return lowerAngle <= upperAngle ? atOrAboveLower && belowUpper : atOrAboveLower || belowUpper;
    }

}

View File

@ -0,0 +1,54 @@
package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;
import java.awt.geom.Rectangle2D;
import lombok.Data;
/**
 * Base class for elements that carry a rectangular bounding box.
 * All accessors delegate to {@code bBox}, which must be set before they are used.
 */
@Data
public abstract class BoundingBox {

    private Rectangle2D bBox;


    public double getX() {

        return bBox.getX();
    }


    public double getY() {

        return bBox.getY();
    }


    public double getWidth() {

        return bBox.getWidth();
    }


    public double getHeight() {

        return bBox.getHeight();
    }


    /**
     * @return height multiplied by width of the bounding box
     */
    public double getArea() {

        double boxHeight = bBox.getHeight();
        double boxWidth = bBox.getWidth();
        return boxHeight * boxWidth;
    }


    /**
     * Checks whether {@code contained} lies inside this bounding box, allowing every edge
     * to stick out by at most {@code tolerance}.
     */
    public boolean contains(Rectangle2D contained, double tolerance) {

        boolean leftEdgeOk = bBox.getX() <= contained.getX() + tolerance;
        boolean topEdgeOk = bBox.getY() <= contained.getY() + tolerance;
        boolean rightEdgeOk = bBox.getX() + bBox.getWidth() >= contained.getX() + contained.getWidth() - tolerance;
        boolean bottomEdgeOk = bBox.getY() + bBox.getHeight() >= contained.getY() + contained.getHeight() - tolerance;
        return leftEdgeOk && topEdgeOk && rightEdgeOk && bottomEdgeOk;
    }


    /**
     * @return {@code true} if the y-ranges of both bounding boxes overlap or touch
     */
    public boolean intersectsY(BoundingBox other) {

        Rectangle2D own = this.getBBox();
        Rectangle2D theirs = other.getBBox();
        return own.getMinY() <= theirs.getMaxY() && own.getMaxY() >= theirs.getMinY();
    }

}

View File

@ -0,0 +1,86 @@
package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.FastAtan2;
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.Setter;
/**
 * Point representation of a single text position.
 * {@code x} is XDirAdj plus half of WidthDirAdj, {@code y} is YDirAdj plus half of HeightDir.
 * Equality and hash code are based on {@code x} and {@code y} only.
 */
@Data
@EqualsAndHashCode(onlyExplicitlyIncluded = true)
public class Character {

    @EqualsAndHashCode.Include
    private final double x;

    @EqualsAndHashCode.Include
    private final double y;

    private final RedTextPosition textPosition;

    @Setter
    private List<Neighbor> neighbors = new ArrayList<>();


    public Character(RedTextPosition chunk) {

        this.textPosition = chunk;
        this.x = chunk.getXDirAdj() + chunk.getWidthDirAdj() / 2;
        this.y = chunk.getYDirAdj() + chunk.getHeightDir() / 2;
    }


    /**
     * @return the HeightDir of the underlying text position
     */
    public double getHeight() {

        return textPosition.getHeightDir();
    }


    /**
     * @return the Euclidean distance between the two points
     */
    public double distance(Character character) {

        double deltaX = getX() - character.getX();
        double deltaY = getY() - character.getY();
        double squaredDistance = deltaX * deltaX + deltaY * deltaY;
        return Math.sqrt(squaredDistance);
    }


    public double horizontalDistance(Character character) {

        double deltaX = getX() - character.getX();
        return Math.abs(deltaX);
    }


    public double verticalDistance(Character character) {

        double deltaY = getY() - character.getY();
        return Math.abs(deltaY);
    }


    /**
     * Compares the interval [x, x + WidthDirAdj] of this character with the same interval of
     * {@code other}. The magnitude of the result is the distance between the two middle values
     * of the four interval ends; it is positive when the intervals overlap and negative otherwise.
     */
    public double overlappingDistance(Character other) {

        // The rotation angle is fixed to zero, so sin is 0 and cos is 1.
        double sin = Math.sin(-0);
        double cos = Math.cos(-0);

        RedTextPosition otherPosition = other.textPosition;
        double[] ends = {
                cos * x - sin * y,
                cos * (x + textPosition.getWidthDirAdj()) - sin * (y + textPosition.getHeightDir()),
                cos * other.x - sin * other.y,
                cos * (other.x + otherPosition.getWidthDirAdj()) - sin * (other.y + otherPosition.getHeightDir())};

        // Must be evaluated before sorting, while the ends are still grouped per character.
        boolean overlapping = ends[1] >= ends[2] && ends[3] >= ends[0];

        Arrays.sort(ends);
        double middleGap = Math.abs(ends[2] - ends[1]);
        return overlapping ? middleGap : -middleGap;
    }


    /**
     * Angle of the connection between both points, always measured from the point with the
     * smaller x towards the point with the larger x.
     */
    public double angle(Character character) {

        boolean thisIsFurtherRight = getX() > character.getX();
        Character right = thisIsFurtherRight ? this : character;
        Character left = thisIsFurtherRight ? character : this;
        return FastAtan2.fastAtan2(right.getY() - left.getY(), right.getX() - left.getX());
    }

}

View File

@ -0,0 +1,90 @@
package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;
/**
 * Fixed-range histogram with equally sized bins that supports kernel smoothing and peak lookup.
 */
public class Histogram {

    private static final double EPSILON = 1.0e-6;

    private final double min;
    private final double resolution;
    private double[] frequencies;


    /**
     * Creates a histogram covering [minValue, maxValue], widened by a small epsilon on both sides.
     *
     * @param minValue   smallest value that will be added
     * @param maxValue   largest value that will be added
     * @param resolution requested bin width; the effective width is adjusted so that a whole
     *                   number of bins (at least one) covers the range
     */
    public Histogram(double minValue, double maxValue, double resolution) {

        double range = maxValue - minValue;
        int binCount = Math.max(1, (int) Math.round(range / resolution));

        this.min = minValue - EPSILON;
        this.resolution = (range + 2 * EPSILON) / binCount;
        this.frequencies = new double[binCount];
    }


    /**
     * Replaces the frequencies by their convolution with the given kernel. Kernel entries that
     * would read outside of the histogram are ignored.
     */
    public void kernelSmooth(double[] kernel) {

        int binCount = frequencies.length;
        int shift = (kernel.length - 1) / 2;
        double[] smoothed = new double[binCount];

        for (int target = 0; target < binCount; target++) {
            double accumulated = 0;
            for (int k = 0; k < kernel.length; k++) {
                int source = target + k - shift;
                if (source >= 0 && source < binCount) {
                    accumulated += kernel[k] * frequencies[source];
                }
            }
            smoothed[target] = accumulated;
        }
        frequencies = smoothed;
    }


    /**
     * Creates a normalized Gaussian kernel with an odd number of entries.
     *
     * @param length       window length in value units; converted to bins via the resolution
     * @param stdDeviation standard deviation in value units
     * @return kernel whose entries sum up to 1
     */
    public double[] createGaussianKernel(double length, double stdDeviation) {

        int radius = (int) Math.round(length / resolution) / 2;
        double[] kernel = new double[2 * radius + 1];

        double sigmaInBins = stdDeviation / resolution;
        double b = 2 * sigmaInBins * sigmaInBins;
        double a = 1 / Math.sqrt(Math.PI * b);

        double total = 0;
        for (int i = 0; i < kernel.length; i++) {
            int offset = i - radius;
            kernel[i] = a * Math.exp(-offset * offset / b);
            total += kernel[i];
        }
        for (int i = 0; i < kernel.length; i++) {
            kernel[i] /= total;
        }
        return kernel;
    }


    public void gaussianSmooth(double windowLength, double stdDeviation) {

        double[] kernel = createGaussianKernel(windowLength, stdDeviation);
        kernelSmooth(kernel);
    }


    /**
     * Increments the bin that contains {@code value}.
     */
    public void add(double value) {

        int bin = (int) ((value - min) / resolution);
        frequencies[bin] += 1.0;
    }


    public int getSize() {

        return frequencies.length;
    }


    /**
     * @return the value at the centre of the first highest bin, extended to the right over
     * directly following bins of (almost) the same frequency
     */
    public double getPeakValue() {

        int peakIndex = 0;
        double peakFrequency = frequencies[0];
        for (int i = 1; i < frequencies.length; i++) {
            if (frequencies[i] > peakFrequency) {
                peakFrequency = frequencies[i];
                peakIndex = i;
            }
        }

        final double plateauTolerance = 0.0001;
        int peakEndIndex = peakIndex + 1;
        while (peakEndIndex < frequencies.length && Math.abs(frequencies[peakEndIndex] - peakFrequency) < plateauTolerance) {
            peakEndIndex++;
        }

        return ((double) peakIndex + peakEndIndex) / 2 * resolution + min;
    }

}

View File

@ -0,0 +1,170 @@
package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import lombok.Data;
import lombok.EqualsAndHashCode;
@Data
@EqualsAndHashCode(onlyExplicitlyIncluded = true)
public class Line extends BoundingBox {
private static final double WORD_DISTANCE_MULTIPLIER = 0.18;
@EqualsAndHashCode.Include
private final double x0;
@EqualsAndHashCode.Include
private final double y0;
@EqualsAndHashCode.Include
private final double x1;
@EqualsAndHashCode.Include
private final double y1;
private final double height;
private final List<Character> characters;
private final List<TextPositionSequence> words = new ArrayList<>();
public Line(List<Character> characters, double wordSpacing) {
this.characters = characters;
if (characters.size() >= 2) {
// linear regression
double sx = 0.0;
double sxx = 0.0;
double sxy = 0.0;
double sy = 0.0;
for (Character character : characters) {
sx += character.getX();
sxx += character.getX() * character.getX();
sxy += character.getX() * character.getY();
sy += character.getY();
}
double b = (characters.size() * sxy - sx * sy) / (characters.size() * sxx - sx * sx);
double a = (sy - b * sx) / characters.size();
this.x0 = characters.get(0).getX();
this.y0 = a + b * this.x0;
this.x1 = characters.get(characters.size() - 1).getX();
this.y1 = a + b * this.x1;
} else {
Character character = characters.get(0);
double dx = character.getTextPosition().getWidthDirAdj() / 3;
double dy = dx * Math.tan(0);
this.x0 = character.getX() - dx;
this.x1 = character.getX() + dx;
this.y0 = character.getY() - dy;
this.y1 = character.getY() + dy;
}
height = computeHeight();
computeWords(wordSpacing * WORD_DISTANCE_MULTIPLIER);
buildBBox();
}
public double getAngle() {
return Math.atan2(y1 - y0, x1 - x0);
}
public double getLength() {
return Math.sqrt((x0 - x1) * (x0 - x1) + (y0 - y1) * (y0 - y1));
}
private double computeHeight() {
return characters.stream().map(Character::getHeight).reduce(0d, Double::sum) / characters.size();
}
public double angularDifference(Line j) {
double diff = Math.abs(getAngle() - j.getAngle());
if (diff <= Math.PI / 2) {
return diff;
} else {
return Math.PI - diff;
}
}
public double horizontalDistance(Line other) {
double[] xs = new double[4];
xs[0] = x0;
xs[1] = x1;
xs[2] = other.x0;
xs[3] = other.x1;
boolean overlapping = xs[1] >= xs[2] && xs[3] >= xs[0];
Arrays.sort(xs);
return Math.abs(xs[2] - xs[1]) * (overlapping ? 1 : -1);
}
public double verticalDistance(Line other) {
double ym = (y0 + y1) / 2;
double yn = (other.y0 + other.y1) / 2;
return Math.abs(ym - yn) / Math.sqrt(1);
}
private void computeWords(double wordSpacing) {
TextPositionSequence word = new TextPositionSequence();
Character previous = null;
for (Character current : characters) {
if (previous != null) {
double dist = current.getTextPosition().getXDirAdj() - previous.getTextPosition().getXDirAdj() - previous.getTextPosition().getWidthDirAdj();
if (dist > wordSpacing) {
words.add(word);
word = new TextPositionSequence();
}
}
word.getTextPositions().add(current.getTextPosition());
previous = current;
}
words.add(word);
}
private void buildBBox() {
double minX = Double.POSITIVE_INFINITY;
double minY = Double.POSITIVE_INFINITY;
double maxX = Double.NEGATIVE_INFINITY;
double maxY = Double.NEGATIVE_INFINITY;
for (Character character : characters) {
minX = Math.min(minX, character.getTextPosition().getXDirAdj());
minY = Math.min(minY, character.getTextPosition().getYDirAdj());
maxX = Math.max(maxX, character.getTextPosition().getXDirAdj() + character.getTextPosition().getWidthDirAdj());
maxY = Math.max(maxY, character.getTextPosition().getYDirAdj() + character.getTextPosition().getHeightDir());
}
this.setBBox(new Rectangle2D.Double(minX, minY, maxX - minX, maxY - minY));
}
public String toString() {
StringBuilder sb = new StringBuilder();
words.forEach(word -> sb.append(word.toString()).append(" "));
return sb.toString().trim();
}
}

View File

@ -0,0 +1,43 @@
package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;
import lombok.Getter;
/**
 * Relation between an origin character and one of its neighboring characters.
 */
public class Neighbor {

    @Getter
    private final double distance;

    // Lazily computed angle between neighbor and origin; null until getAngle() is first called.
    private Double angle;

    private final Character originCharacter;

    @Getter
    private final Character character;


    /**
     * @param neighbor the neighboring character
     * @param origin   the character the neighbor belongs to
     */
    public Neighbor(Character neighbor, Character origin) {

        this.distance = neighbor.distance(origin);
        this.character = neighbor;
        this.originCharacter = origin;
    }


    public double getHorizontalDistance() {

        return character.horizontalDistance(originCharacter);
    }


    public double getVerticalDistance() {

        return character.verticalDistance(originCharacter);
    }


    /**
     * Returns the angle between the neighbor and the origin character. The value is computed on
     * the first call and cached afterwards; this is safe because both characters are final and
     * the angle only depends on their final coordinates.
     */
    public double getAngle() {

        if (angle == null) {
            angle = this.character.angle(this.originCharacter);
        }
        return angle;
    }

}

View File

@ -0,0 +1,31 @@
package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;
import java.util.Collection;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.Set;
/**
 * Extension of the JGraphT union-find structure that can return its elements grouped by set.
 */
public class UnionFind<T> extends org.jgrapht.alg.util.UnionFind<T> {

    public UnionFind(Set<T> elements) {

        super(elements);
    }


    /**
     * Groups all elements by the representative returned from {@code find}.
     *
     * @return one set per representative; groups and their members keep the iteration order of
     * the parent map
     */
    public Collection<Set<T>> getGroups() {

        Map<T, Set<T>> groupsByRepresentative = new LinkedHashMap<>();
        for (T element : getParentMap().keySet()) {
            groupsByRepresentative.computeIfAbsent(find(element), representative -> new LinkedHashSet<>()).add(element);
        }
        return groupsByRepresentative.values();
    }

}

View File

@ -0,0 +1,51 @@
package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;
import java.awt.geom.Rectangle2D;
import java.util.Comparator;
import java.util.List;
import lombok.Data;
@Data
public class Zone extends BoundingBox {
private List<Line> lines;
@SuppressWarnings("PMD.ConstructorCallsOverridableMethod")
public Zone(List<Line> lines) {
lines.sort(Comparator.comparingDouble(Line::getY));
this.lines = lines;
buildBBox();
}
public void buildBBox() {
double minX = Double.POSITIVE_INFINITY;
double minY = Double.POSITIVE_INFINITY;
double maxX = Double.NEGATIVE_INFINITY;
double maxY = Double.NEGATIVE_INFINITY;
for (Line line : lines) {
minX = Math.min(minX, line.getX());
minY = Math.min(minY, line.getY());
maxX = Math.max(maxX, line.getX() + line.getWidth());
maxY = Math.max(maxY, line.getY() + line.getHeight());
}
this.setBBox(new Rectangle2D.Double(minX, minY, maxX - minX, maxY - minY));
}
public String toString() {
StringBuilder sb = new StringBuilder();
lines.forEach(line -> sb.append(line.toString()).append("\n"));
return sb.toString().trim();
}
}

View File

@ -0,0 +1,53 @@
package com.knecon.fforesight.service.layoutparser.processor.docstrum.service;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.AngleFilter;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Character;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Line;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.UnionFind;
/**
 * Groups characters into text lines by joining each character with those of its neighbours that
 * pass the direction, angle and distance checks below.
 */
@Service
public class LineBuilderService {

    private static final double CHARACTER_SPACING_DISTANCE_MULTIPLIER = 3.5;
    private static final double MAX_VERTICAL_CHARACTER_DISTANCE = 0.67;
    private static final double ANGLE_TOLERANCE = Math.PI / 6;


    /**
     * Builds lines from the given characters.
     * A character is joined with a neighbour when both report the same text direction, the
     * neighbour passes the angle filter of +/- {@code ANGLE_TOLERANCE}, and the neighbour lies
     * inside the ellipse whose radii are the scaled character spacing and the scaled line spacing.
     *
     * @param characters the characters to group; their neighbours must already be set
     * @param characterSpacing spacing value scaled by {@code CHARACTER_SPACING_DISTANCE_MULTIPLIER}
     * @param lineSpacing spacing value scaled by {@code MAX_VERTICAL_CHARACTER_DISTANCE}
     * @return one line per group, each built from its characters sorted by {@code Character::getX}
     */
    public List<Line> buildLines(List<Character> characters, double characterSpacing, double lineSpacing) {

        double horizontalRadius = characterSpacing * CHARACTER_SPACING_DISTANCE_MULTIPLIER;
        double verticalRadius = lineSpacing * MAX_VERTICAL_CHARACTER_DISTANCE;

        UnionFind<Character> characterGroups = new UnionFind<>(new HashSet<>(characters));
        AngleFilter angleFilter = new AngleFilter(-ANGLE_TOLERANCE, ANGLE_TOLERANCE);

        for (Character character : characters) {
            for (var neighbor : character.getNeighbors()) {
                double normalizedX = neighbor.getHorizontalDistance() / horizontalRadius;
                double normalizedY = neighbor.getVerticalDistance() / verticalRadius;

                if (character.getTextPosition().getDir() != neighbor.getCharacter().getTextPosition().getDir()) {
                    continue;
                }
                if (!angleFilter.matches(neighbor)) {
                    continue;
                }
                boolean insideEllipse = Math.pow(normalizedX, 2) + Math.pow(normalizedY, 2) <= 1;
                if (insideEllipse) {
                    characterGroups.union(character, neighbor.getCharacter());
                }
            }
        }

        List<Line> result = new ArrayList<>();
        for (var group : characterGroups.getGroups()) {
            List<Character> members = new ArrayList<>(group);
            members.sort(Comparator.comparingDouble(Character::getX));
            result.add(new Line(members, characterSpacing));
        }
        return result;
    }

}

View File

@ -0,0 +1,128 @@
package com.knecon.fforesight.service.layoutparser.processor.docstrum.service;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Character;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Neighbor;
/**
 * Determines, for every character, its nearest neighbouring characters and stores them on the
 * character via {@code setNeighbors}.
 */
@Service
public class NearestNeighbourService {
// Upper bound for the number of neighbours stored per character.
private static final int NUMBER_OF_NEIGHBOURS = 8;
// Amount by which the X search window is widened in every search iteration.
private static final double STEP = 16.0;
/**
 * Finds the nearest neighbours of every character and sets them on the character, sorted by
 * ascending distance.
 * Returns without doing anything for lists with fewer than two characters.
 * Side effect: the passed list is sorted in place by {@code Character::getX}.
 *
 * @param characters the characters to process
 */
public void findNearestNeighbors(List<Character> characters) {
if (characters.isEmpty() || characters.size() == 1) {
return;
}
characters.sort(Comparator.comparingDouble(Character::getX));
// With NUMBER_OF_NEIGHBOURS or fewer characters, every other character becomes a neighbour.
int maxNeighborCount = NUMBER_OF_NEIGHBOURS;
if (characters.size() <= NUMBER_OF_NEIGHBOURS) {
maxNeighborCount = characters.size() - 1;
}
for (int i = 0; i < characters.size(); i++) {
// One spare slot so a new candidate can be stored before the most distant one is evicted.
Neighbor[] candidates = new Neighbor[maxNeighborCount + 1];
int neighborInsertionIndex = 0;
int neighborCount = 0;
// [start, end) is the index window of the X-sorted list that has been examined so far.
int start = i;
int end = i + 1;
double distance = Double.POSITIVE_INFINITY;
// Widen the X window until it reaches the largest distance among the current candidates.
for (double searchDistance = 0; searchDistance < distance; ) {
searchDistance += STEP;
boolean newCandidatesFound = false;
// Extend the window towards smaller X values.
while (start > 0 && characters.get(i).getX() - characters.get(start - 1).getX() < searchDistance) {
start--;
candidates[neighborInsertionIndex] = new Neighbor(characters.get(start), characters.get(i));
neighborCount++;
if (neighborCount > maxNeighborCount) {
// Too many candidates: drop the most distant one and reuse its slot for the next insertion.
neighborInsertionIndex = clearMostDistant(candidates);
neighborCount--;
} else {
neighborInsertionIndex++;
}
newCandidatesFound = true;
}
// Extend the window towards larger X values.
while (end < characters.size() && characters.get(end).getX() - characters.get(i).getX() < searchDistance) {
candidates[neighborInsertionIndex] = new Neighbor(characters.get(end), characters.get(i));
neighborCount++;
if (neighborCount > maxNeighborCount) {
// Too many candidates: drop the most distant one and reuse its slot for the next insertion.
neighborInsertionIndex = clearMostDistant(candidates);
neighborCount--;
} else {
neighborInsertionIndex++;
}
end++;
newCandidatesFound = true;
}
// The search bound is only set once the candidate set is full.
if (newCandidatesFound && neighborCount >= maxNeighborCount) {
distance = maxDistance(candidates);
}
}
// NOTE(review): the loop above can only end after 'distance' was set, which requires
// neighborCount >= maxNeighborCount, so this branch looks unreachable — confirm.
if (neighborCount < maxNeighborCount) {
clearMostDistant(candidates);
}
// Copy the non-empty slots and order them by ascending distance.
List<Neighbor> candidatesList = new ArrayList<>(maxNeighborCount);
for (Neighbor candidate : candidates) {
if (candidate != null) {
candidatesList.add(candidate);
}
}
candidatesList.sort(Comparator.comparingDouble(Neighbor::getDistance));
assert candidatesList.size() == maxNeighborCount;
characters.get(i).setNeighbors(candidatesList);
}
}
/**
 * @param candidates candidate slots, empty slots are {@code null}
 * @return the largest distance among the non-null candidates, or 0 if there is none greater than 0
 */
private double maxDistance(Neighbor[] candidates) {
double maxDistance = 0;
for (Neighbor candidate : candidates) {
if (candidate == null) {
continue;
}
if (candidate.getDistance() > maxDistance) {
maxDistance = candidate.getDistance();
}
}
return maxDistance;
}
/**
 * Sets the slot of the most distant non-null candidate to {@code null}.
 * If no candidate has a distance greater than 0, slot 0 is cleared.
 *
 * @param candidates candidate slots, empty slots are {@code null}
 * @return the index of the cleared slot
 */
private int clearMostDistant(Neighbor[] candidates) {
double maxDistance = 0;
int maxIndex = 0;
for (int i = 0; i < candidates.length; i++) {
Neighbor candidate = candidates[i];
if (candidate == null) {
continue;
}
if (candidate.getDistance() > maxDistance) {
maxDistance = candidate.getDistance();
maxIndex = i;
}
}
candidates[maxIndex] = null;
return maxIndex;
}
}

View File

@ -0,0 +1,165 @@
package com.knecon.fforesight.service.layoutparser.processor.docstrum.service;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.ListIterator;
import java.util.Map;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.DoubleUtils;
/**
 * Puts the zones of a page into reading order, using either a plain Y-then-X ordering or a
 * simple two-column heuristic.
 */
@Service
public class ReadingOrderService {

    // Tolerance handed to DoubleUtils.compareDouble when comparing zone coordinates.
    private static final double THRESHOLD = 5;
    public static final double MULTI_COLUMN_DETECTION_THRESHOLD = 1.5;

    // Orders bounding boxes by Y first and X second, both compared with the THRESHOLD tolerance.
    private static final Comparator<BoundingBox> Y_THEN_X_ORDER = Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
            .thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD));


    /**
     * Resolves the reading order of the given zones.
     * Lists with fewer than two zones are returned unchanged. If {@code xyReadingOrder} is set,
     * the zones are sorted by Y then X. Otherwise a histogram counts, for every rounded Y
     * coordinate, how many zones cover it; when the average count is below
     * {@link #MULTI_COLUMN_DETECTION_THRESHOLD} the Y-then-X ordering is used, else the
     * multi-column ordering.
     *
     * @param zones the zones to order; sorted in place when the Y-then-X ordering is used
     * @param xyReadingOrder forces the Y-then-X ordering
     * @return the ordered zones (the same list instance for the Y-then-X ordering, a new list otherwise)
     */
    public List<Zone> resolve(List<Zone> zones, boolean xyReadingOrder) {

        if (zones.size() <= 1) {
            return zones;
        }

        if (xyReadingOrder) {
            return resolveSingleColumnReadingOrder(zones);
        }

        Map<Long, Integer> histogram = new HashMap<>();
        for (Zone zone : zones) {
            long minY = Math.round(zone.getBBox().getMinY());
            long maxY = Math.round(zone.getBBox().getMaxY());
            for (long y = minY; y <= maxY; y++) {
                histogram.merge(y, 1, Integer::sum);
            }
        }

        double averageZonesPerY = histogram.values().stream().mapToInt(Integer::intValue).average().orElse(1);
        if (averageZonesPerY < MULTI_COLUMN_DETECTION_THRESHOLD) {
            return resolveSingleColumnReadingOrder(zones);
        }
        return resolveMultiColumnReadingOrder(zones);
    }


    /**
     * Sorts the given list in place by Y then X and returns it.
     */
    private static List<Zone> resolveSingleColumnReadingOrder(List<Zone> zones) {

        zones.sort(Y_THEN_X_ORDER);
        return zones;
    }


    /**
     * Simple reading order resolver for multi column page layouts as described here:
     * https://pub.towardsai.net/advanced-rag-02-unveiling-pdf-parsing-b84ae866344e
     * Zones entirely left or right of the horizontal mid line form the two columns; all other
     * zones, and column zones without a vertically intersecting counterpart in the other column,
     * are merged into the result by their Y coordinate.
     */
    private List<Zone> resolveMultiColumnReadingOrder(List<Zone> zones) {

        // TODO implement a more fancy reading order resolver see https://github.com/BobLd/DocumentLayoutAnalysis/blob/master/README.md#reading-order

        double minX = Double.POSITIVE_INFINITY;
        double maxX = Double.NEGATIVE_INFINITY;

        for (Zone zone : zones) {
            if (zone.getX() < minX) {
                minX = zone.getX();
            }
            if (zone.getX() + zone.getWidth() > maxX) {
                maxX = zone.getX() + zone.getWidth();
            }
        }

        double midLineXCoordinate = (minX + maxX) / 2;

        List<Zone> leftOf = new ArrayList<>();
        List<Zone> rightOf = new ArrayList<>();
        List<Zone> middle = new ArrayList<>();
        for (Zone zone : zones) {
            if (zone.getX() < midLineXCoordinate && zone.getX() + zone.getWidth() < midLineXCoordinate) {
                leftOf.add(zone);
            } else if (zone.getX() > midLineXCoordinate && zone.getX() + zone.getWidth() > midLineXCoordinate) {
                rightOf.add(zone);
            } else {
                middle.add(zone);
            }
        }

        leftOf.sort(Y_THEN_X_ORDER);
        rightOf.sort(Y_THEN_X_ORDER);
        middle.sort(Y_THEN_X_ORDER);

        List<Zone> leftNotIntersecting = new ArrayList<>();
        for (Zone leftZone : leftOf) {
            if (!intersectsAnyInY(leftZone, rightOf)) {
                leftNotIntersecting.add(leftZone);
            }
        }

        List<Zone> rightNotIntersecting = new ArrayList<>();
        for (Zone rightZone : rightOf) {
            if (!intersectsAnyInY(rightZone, leftOf)) {
                rightNotIntersecting.add(rightZone);
            }
        }

        leftOf.removeAll(leftNotIntersecting);
        rightOf.removeAll(rightNotIntersecting);

        middle.addAll(leftNotIntersecting);
        middle.addAll(rightNotIntersecting);

        List<Zone> sortedZones = new ArrayList<>();
        sortedZones.addAll(leftOf);
        sortedZones.addAll(rightOf);

        // Insert every remaining zone in front of the first already placed zone with a larger Y;
        // zones for which no such position exists stay in 'middle' and are appended at the end.
        ListIterator<Zone> remaining = middle.listIterator();
        while (remaining.hasNext()) {
            Zone current = remaining.next();
            for (int i = 0; i < sortedZones.size(); i++) {
                if (current.getY() < sortedZones.get(i).getY()) {
                    sortedZones.add(i, current);
                    remaining.remove();
                    break;
                }
            }
        }

        sortedZones.addAll(middle);

        return sortedZones;
    }


    /**
     * Checks whether the zone intersects any of the candidates according to {@code intersectsY}.
     * All candidates are checked on purpose: the candidate lists are ordered by Y with a tolerance
     * and then by X, so they are not strictly ordered by their minimum Y and an early exit based
     * on that assumption could skip an intersecting zone.
     */
    private static boolean intersectsAnyInY(Zone zone, List<Zone> candidates) {

        for (Zone candidate : candidates) {
            if (zone.intersectsY(candidate)) {
                return true;
            }
        }
        return false;
    }

}

View File

@ -0,0 +1,56 @@
package com.knecon.fforesight.service.layoutparser.processor.docstrum.service;
import java.util.List;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.AngleFilter;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Character;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Histogram;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Neighbor;
/**
 * Estimates spacing values from the distances between characters and their neighbours by
 * taking the peak of a smoothed distance histogram.
 */
@Service
public class SpacingService {

    private static final double SPACING_HISTOGRAM_RESOLUTION = 0.5;
    private static final double SPACING_HISTOGRAM_SMOOTHING_LENGTH = 2.5;
    private static final double SPACING_HIST_SMOOTHING_STANDARD_DEVIATION = 0.5;
    private static final double ANGLE_TOLERANCE = Math.PI / 6;


    /**
     * @param characters characters whose neighbours are already set
     * @return the spacing computed from neighbours within {@code ANGLE_TOLERANCE} of angle 0
     */
    public double computeCharacterSpacing(List<Character> characters) {

        return computeSpacing(characters, 0);
    }


    /**
     * @param characters characters whose neighbours are already set
     * @return the spacing computed from neighbours within {@code ANGLE_TOLERANCE} of angle PI / 2
     */
    public double computeLineSpacing(List<Character> characters) {

        return computeSpacing(characters, Math.PI / 2);
    }


    /**
     * Fills a histogram ranging from 0 to the largest neighbour distance with the distances of all
     * neighbours accepted by the angle filter around {@code angle}, smooths it and returns its peak.
     */
    private double computeSpacing(List<Character> characters, double angle) {

        Histogram distances = new Histogram(0, largestNeighborDistance(characters), SPACING_HISTOGRAM_RESOLUTION);
        AngleFilter acceptedAngles = new AngleFilter(angle - ANGLE_TOLERANCE, angle + ANGLE_TOLERANCE);

        for (Character character : characters) {
            for (Neighbor neighbor : character.getNeighbors()) {
                if (!acceptedAngles.matches(neighbor)) {
                    continue;
                }
                distances.add(neighbor.getDistance());
            }
        }

        distances.gaussianSmooth(SPACING_HISTOGRAM_SMOOTHING_LENGTH, SPACING_HIST_SMOOTHING_STANDARD_DEVIATION);
        return distances.getPeakValue();
    }


    /**
     * @return the largest distance of any neighbour of any character, or negative infinity when
     * there are no neighbours at all
     */
    private static double largestNeighborDistance(List<Character> characters) {

        double largest = Double.NEGATIVE_INFINITY;
        for (Character character : characters) {
            for (Neighbor neighbor : character.getNeighbors()) {
                largest = Math.max(largest, neighbor.getDistance());
            }
        }
        return largest;
    }

}

View File

@ -0,0 +1,152 @@
package com.knecon.fforesight.service.layoutparser.processor.docstrum.service;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Character;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Line;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.UnionFind;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
/**
 * Groups lines into zones by joining pairs of lines whose angular difference and scaled
 * horizontal and vertical distances satisfy the limits defined below.
 */
@Service
public class ZoneBuilderService {

    private static final double MIN_HORIZONTAL_DISTANCE_MULTIPLIER = -0.5;
    private static final double MAX_VERTICAL_DISTANCE_MULTIPLIER = 1.2;

    private static final double MIN_HORIZONTAL_MERGE_DISTANCE_MULTIPLIER = -3.0;

    private static final double MAX_VERTICAL_MERGE_DISTANCE_MULTIPLIER = 0.5;

    private static final double MIN_LINE_SIZE_SCALE = 0.9;

    private static final double MAX_LINE_SIZE_SCALE = 2.5;

    private static final double ANGLE_TOLERANCE = Math.PI / 6;

    private static final int MAX_ZONES = 300;

    private static final double MAX_VERTICAL_MERGE_DISTANCE = 0.5;


    /**
     * Builds zones from the given lines.
     * Every pair of lines is compared; distances are divided by a scale derived from the smaller
     * of the two line heights relative to the length-weighted mean height, clamped to
     * [{@code MIN_LINE_SIZE_SCALE}, {@code MAX_LINE_SIZE_SCALE}]. If more than {@code MAX_ZONES}
     * zones result, all lines are re-merged into a single zone instead.
     *
     * @param lines the lines to group
     * @param characterSpacing spacing used to derive the horizontal limits
     * @param lineSpacing spacing used to derive the vertical limits
     * @return the zones, or an immutable single-element list when the zone limit was exceeded
     */
    public List<Zone> buildZones(List<Line> lines, double characterSpacing, double lineSpacing) {

        double minHorizontalDistance = characterSpacing * MIN_HORIZONTAL_DISTANCE_MULTIPLIER;
        double maxVerticalDistance = lineSpacing * MAX_VERTICAL_DISTANCE_MULTIPLIER;
        double minHorizontalMergeDistance = characterSpacing * MIN_HORIZONTAL_MERGE_DISTANCE_MULTIPLIER;
        double maxVerticalMergeDistance = lineSpacing * MAX_VERTICAL_MERGE_DISTANCE_MULTIPLIER;

        UnionFind<Line> lineGroups = new UnionFind<>(new HashSet<>(lines));
        double meanHeight = calculateMeanHeight(lines);

        for (Line outerLine : lines) {
            for (Line innerLine : lines) {
                double rawScale = Math.min(outerLine.getHeight(), innerLine.getHeight()) / meanHeight;
                double scale = Math.max(MIN_LINE_SIZE_SCALE, Math.min(rawScale, MAX_LINE_SIZE_SCALE));

                if (lineGroups.inSameSet(outerLine, innerLine) || !(outerLine.angularDifference(innerLine) <= ANGLE_TOLERANCE)) {
                    continue;
                }

                double horizontalDistance = outerLine.horizontalDistance(innerLine) / scale;
                double verticalDistance = outerLine.verticalDistance(innerLine) / scale;

                boolean withinZoneLimits = minHorizontalDistance <= horizontalDistance && verticalDistance <= maxVerticalDistance;
                boolean withinMergeLimits = minHorizontalMergeDistance <= horizontalDistance && verticalDistance <= maxVerticalMergeDistance;
                if (withinZoneLimits || withinMergeLimits) {
                    lineGroups.union(outerLine, innerLine);
                }
            }
        }

        List<Zone> zones = new ArrayList<>();
        for (Set<Line> group : lineGroups.getGroups()) {
            zones.add(new Zone(new ArrayList<>(group)));
        }

        if (zones.size() <= MAX_ZONES) {
            return zones;
        }

        // Too many zones: put all lines into one zone instead.
        List<Line> allLines = new ArrayList<>();
        for (Zone zone : zones) {
            allLines.addAll(zone.getLines());
        }
        return List.of(mergeLinesInZone(allLines, characterSpacing, lineSpacing));
    }


    /**
     * @return the mean line height, weighted by the length of each line
     */
    private double calculateMeanHeight(List<Line> lines) {

        double weightedHeightSum = 0.0;
        double totalWeight = 0.0;
        for (Line line : lines) {
            double lineLength = line.getLength();
            weightedHeightSum += line.getHeight() * lineLength;
            totalWeight += lineLength;
        }
        return weightedHeightSum / totalWeight;
    }


    /**
     * Joins lines that are vertically close and either have a horizontal distance of at most 0
     * or, when the horizontal distance is within 0.1 of the shorter line's length, show only
     * marginal character overlap. Each resulting group becomes one new line; all new lines are
     * returned as a single zone.
     */
    private Zone mergeLinesInZone(List<Line> lines, double characterSpacing, double lineSpacing) {

        double maxHorizontalDistance = 0;
        double minVerticalDistance = 0;
        double maxVerticalDistance = lineSpacing * MAX_VERTICAL_MERGE_DISTANCE;

        UnionFind<Line> lineGroups = new UnionFind<>(new HashSet<>(lines));

        for (Line outer : lines) {
            for (Line inner : lines) {
                if (inner == outer) {
                    continue;
                }

                double horizontalDistance = outer.horizontalDistance(inner);
                double verticalDistance = outer.verticalDistance(inner);

                boolean verticallyClose = minVerticalDistance <= verticalDistance && verticalDistance <= maxVerticalDistance;
                if (!verticallyClose) {
                    continue;
                }

                if (horizontalDistance <= maxHorizontalDistance) {
                    lineGroups.union(outer, inner);
                } else if (Math.abs(horizontalDistance - Math.min(outer.getLength(), inner.getLength())) < 0.1 && hasOnlyMarginalCharacterOverlap(outer, inner)) {
                    lineGroups.union(outer, inner);
                }
            }
        }

        List<Line> mergedLines = new ArrayList<>();
        for (Set<Line> group : lineGroups.getGroups()) {
            List<Character> groupCharacters = new ArrayList<>();
            for (Line line : group) {
                groupCharacters.addAll(line.getCharacters());
            }
            groupCharacters.sort(Comparator.comparingDouble(Character::getX));
            mergedLines.add(new Line(groupCharacters, characterSpacing));
        }

        return new Zone(mergedLines);
    }


    /**
     * @return true when no pair of characters of the two lines has an overlapping distance above 2
     * and at most two pairs have an overlapping distance above 0
     */
    private static boolean hasOnlyMarginalCharacterOverlap(Line outer, Line inner) {

        boolean strongOverlapFound = false;
        int overlappingPairs = 0;
        for (Character outerCharacter : outer.getCharacters()) {
            for (Character innerCharacter : inner.getCharacters()) {
                double overlap = outerCharacter.overlappingDistance(innerCharacter);
                if (overlap > 2) {
                    strongOverlapFound = true;
                }
                if (overlap > 0) {
                    overlappingPairs++;
                }
            }
        }
        return !strongOverlapFound && overlappingPairs <= 2;
    }

}

View File

@ -0,0 +1,15 @@
package com.knecon.fforesight.service.layoutparser.processor.docstrum.utils;
/**
 * Helper for comparing double values with a tolerance.
 */
public class DoubleUtils {

    /**
     * Compares two doubles after dividing each by {@code precision} and rounding to the nearest
     * long, so values that round to the same multiple compare as equal.
     * If either value is NaN the result of {@link Double#compare(double, double)} is returned.
     * A precision of 0 is treated as 1.
     *
     * @param d1 first value
     * @param d2 second value
     * @param precision size of the rounding step
     * @return a negative number, zero or a positive number, as for {@link Long#compare(long, long)}
     */
    public static int compareDouble(double d1, double d2, double precision) {

        boolean anyNaN = Double.isNaN(d1) || Double.isNaN(d2);
        if (anyNaN) {
            return Double.compare(d1, d2);
        }

        double step = precision == 0 ? 1 : precision;
        long firstBucket = Math.round(d1 / step);
        long secondBucket = Math.round(d2 / step);
        return Long.compare(firstBucket, secondBucket);
    }

}

View File

@ -0,0 +1,76 @@
package com.knecon.fforesight.service.layoutparser.processor.docstrum.utils;
/**
 * Table based approximation of {@link Math#atan2(double, double)}.
 * The ratio of the smaller to the larger coordinate magnitude is truncated to one of
 * {@code TABLE_RESOLUTION + 1} table slots; one table per octant pair holds the precomputed result.
 */
public class FastAtan2 {

    private static final int TABLE_RESOLUTION = 1000;
    private static final int TABLE_SIZE = TABLE_RESOLUTION + 1;

    // Full double precision; the former cast to float made axis results deviate from Math.PI.
    private static final double PI = Math.PI;
    private static final double HALF_PI = PI / 2;

    // Naming: P = positive, M = negative (order: y, x); the R tables are indexed by x / y instead of y / x.
    private static final double[] ATAN2 = new double[TABLE_SIZE];
    private static final double[] ATAN2_PM = new double[TABLE_SIZE];
    private static final double[] ATAN2_MP = new double[TABLE_SIZE];
    private static final double[] ATAN2_MM = new double[TABLE_SIZE];

    private static final double[] ATAN2_R = new double[TABLE_SIZE];
    private static final double[] ATAN2_RPM = new double[TABLE_SIZE];
    private static final double[] ATAN2_RMP = new double[TABLE_SIZE];
    private static final double[] ATAN2_RMM = new double[TABLE_SIZE];

    static {
        for (int i = 0; i <= TABLE_RESOLUTION; i++) {
            double ratio = (double) i / TABLE_RESOLUTION;
            double v = Math.atan2(ratio, 1);
            ATAN2[i] = v;
            ATAN2_PM[i] = PI - v;
            ATAN2_MP[i] = -v;
            ATAN2_MM[i] = -PI + v;

            ATAN2_R[i] = HALF_PI - v;
            ATAN2_RPM[i] = HALF_PI + v;
            ATAN2_RMP[i] = -HALF_PI + v;
            ATAN2_RMM[i] = -HALF_PI - v;
        }
    }


    /**
     * Approximates {@code Math.atan2(y, x)} by table lookup.
     * Inputs for which the ratio is NaN (for example {@code (0, 0)}) use table slot 0.
     *
     * @param y the ordinate coordinate
     * @param x the abscissa coordinate
     * @return the approximated angle in radians
     */
    public static double fastAtan2(double y, double x) {

        if (y < 0) {
            if (x < 0) {
                // both negative: (y < x) is equivalent to (-y > -x)
                return y < x ? ATAN2_RMM[tableIndex(x, y)] : ATAN2_MM[tableIndex(y, x)];
            }
            double absY = -y;
            return absY > x ? ATAN2_RMP[tableIndex(x, absY)] : ATAN2_MP[tableIndex(absY, x)];
        }

        if (x < 0) {
            double absX = -x;
            return y > absX ? ATAN2_RPM[tableIndex(absX, y)] : ATAN2_PM[tableIndex(y, absX)];
        }

        return y > x ? ATAN2_R[tableIndex(x, y)] : ATAN2[tableIndex(y, x)];
    }


    /**
     * Maps a ratio to its table slot by truncation; a NaN ratio maps to slot 0.
     */
    private static int tableIndex(double numerator, double denominator) {

        return (int) (numerator / denominator * TABLE_RESOLUTION);
    }

}

View File

@ -15,7 +15,6 @@ import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.No
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree; import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity; import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock; import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlockCollector;
import lombok.AccessLevel; import lombok.AccessLevel;
import lombok.AllArgsConstructor; import lombok.AllArgsConstructor;
@ -52,7 +51,7 @@ public class Document implements GenericSemanticNode {
public TextBlock getTextBlock() { public TextBlock getTextBlock() {
if (textBlock == null) { if (textBlock == null) {
textBlock = streamTerminalTextBlocksInOrder().collect(new TextBlockCollector()); textBlock = GenericSemanticNode.super.getTextBlock();
} }
return textBlock; return textBlock;
} }
@ -67,8 +66,7 @@ public class Document implements GenericSemanticNode {
public Stream<TextBlock> streamTerminalTextBlocksInOrder() { public Stream<TextBlock> streamTerminalTextBlocksInOrder() {
return streamAllNodes().filter(SemanticNode::isLeaf) return streamAllNodes().filter(SemanticNode::isLeaf).map(SemanticNode::getTextBlock);
.map(SemanticNode::getLeafTextBlock);
} }

View File

@ -0,0 +1,34 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
import java.util.stream.Stream;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlockCollector;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.SuperBuilder;
/**
 * A {@link Paragraph} that additionally carries a second text block, {@code unsortedLeafTextBlock}.
 */
@Data
@EqualsAndHashCode(callSuper = true)
@SuperBuilder
public class DuplicatedParagraph extends Paragraph {
// Second text block of this paragraph; combined with the inherited leafTextBlock in getTextBlock().
TextBlock unsortedLeafTextBlock;
/**
 * @return a text block collected from {@code leafTextBlock} followed by {@code unsortedLeafTextBlock}
 */
@Override
public TextBlock getTextBlock() {
return Stream.of(leafTextBlock, unsortedLeafTextBlock).collect(new TextBlockCollector());
}
/**
 * Delegates to the superclass implementation.
 * NOTE(review): presumably declared explicitly so that Lombok's {@code @Data} does not generate
 * a toString for this class — confirm.
 */
@Override
public String toString() {
return super.toString();
}
}

View File

@ -18,11 +18,12 @@ import lombok.Builder;
import lombok.Data; import lombok.Data;
import lombok.EqualsAndHashCode; import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults; import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
@Data @Data
@Builder @SuperBuilder
@AllArgsConstructor @AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE) @FieldDefaults(level = AccessLevel.PROTECTED)
public class Paragraph implements GenericSemanticNode { public class Paragraph implements GenericSemanticNode {
@Builder.Default @Builder.Default

View File

@ -11,7 +11,6 @@ import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.No
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree; import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity; import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock; import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlockCollector;
import lombok.AccessLevel; import lombok.AccessLevel;
import lombok.AllArgsConstructor; import lombok.AllArgsConstructor;
@ -62,9 +61,7 @@ public class Section implements GenericSemanticNode {
public TextBlock getTextBlock() { public TextBlock getTextBlock() {
if (textBlock == null) { if (textBlock == null) {
textBlock = streamAllSubNodes().filter(SemanticNode::isLeaf) textBlock = GenericSemanticNode.super.getTextBlock();
.map(SemanticNode::getLeafTextBlock)
.collect(new TextBlockCollector());
} }
return textBlock; return textBlock;
} }

View File

@ -20,6 +20,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.E
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity; import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock; import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock; import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlockCollector;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
public interface SemanticNode { public interface SemanticNode {
@ -39,7 +40,10 @@ public interface SemanticNode {
* *
* @return TextBlock containing all AtomicTextBlocks that are located under this Node. * @return TextBlock containing all AtomicTextBlocks that are located under this Node.
*/ */
TextBlock getTextBlock(); default TextBlock getTextBlock() {
return streamAllSubNodes().filter(SemanticNode::isLeaf).map(SemanticNode::getTextBlock).collect(new TextBlockCollector());
}
/** /**

View File

@ -48,7 +48,6 @@ public class Table implements SemanticNode {
@EqualsAndHashCode.Exclude @EqualsAndHashCode.Exclude
Map<Page, Rectangle2D> bBoxCache; Map<Page, Rectangle2D> bBoxCache;
/** /**
* Streams all entities in this table, that appear in a row, which contains any of the provided strings. * Streams all entities in this table, that appear in a row, which contains any of the provided strings.
* *
@ -332,9 +331,7 @@ public class Table implements SemanticNode {
public TextBlock getTextBlock() { public TextBlock getTextBlock() {
if (textBlock == null) { if (textBlock == null) {
textBlock = streamAllSubNodes().filter(SemanticNode::isLeaf) textBlock = SemanticNode.super.getTextBlock();
.map(SemanticNode::getLeafTextBlock)
.collect(new TextBlockCollector());
} }
return textBlock; return textBlock;
} }

View File

@ -1,7 +1,6 @@
package com.knecon.fforesight.service.layoutparser.processor.model.text; package com.knecon.fforesight.service.layoutparser.processor.model.text;
import org.apache.pdfbox.text.TextPosition; import org.apache.pdfbox.text.TextPosition;
import org.springframework.beans.BeanUtils;
import com.fasterxml.jackson.annotation.JsonIgnore; import com.fasterxml.jackson.annotation.JsonIgnore;
@ -50,10 +49,14 @@ public class RedTextPosition {
public static RedTextPosition fromTextPosition(TextPosition textPosition) { public static RedTextPosition fromTextPosition(TextPosition textPosition) {
var pos = new RedTextPosition(); var pos = new RedTextPosition();
BeanUtils.copyProperties(textPosition, pos); pos.setRotation(textPosition.getRotation());
pos.setFontName(textPosition.getFont().getName()); pos.setPageHeight(textPosition.getPageHeight());
pos.setPageWidth(textPosition.getPageWidth());
pos.setUnicode(textPosition.getUnicode());
pos.setDir(textPosition.getDir());
pos.setWidthOfSpace(textPosition.getWidthOfSpace());
pos.setFontSizeInPt(textPosition.getFontSizeInPt()); pos.setFontSizeInPt(textPosition.getFontSizeInPt());
pos.setFontName(textPosition.getFont().getName());
var position = new float[4]; var position = new float[4];

View File

@ -53,6 +53,9 @@ public class TextPageBlock extends AbstractPageBlock {
@JsonIgnore @JsonIgnore
private PageBlockType classification; private PageBlockType classification;
@JsonIgnore
private boolean toDuplicate;
@JsonIgnore @JsonIgnore
public TextDirection getDir() { public TextDirection getDir() {
@ -82,6 +85,7 @@ public class TextPageBlock extends AbstractPageBlock {
return fromTextPositionSequences(sequences); return fromTextPositionSequences(sequences);
} }
public static TextPageBlock fromTextPositionSequences(List<TextPositionSequence> wordBlockList) { public static TextPageBlock fromTextPositionSequences(List<TextPositionSequence> wordBlockList) {
TextPageBlock textBlock = null; TextPageBlock textBlock = null;
@ -133,7 +137,6 @@ public class TextPageBlock extends AbstractPageBlock {
} }
/** /**
* Returns the minX value in pdf coordinate system. * Returns the minX value in pdf coordinate system.
* Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation. * Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation.
@ -362,7 +365,22 @@ public class TextPageBlock extends AbstractPageBlock {
} }
return TextNormalizationUtilities.removeHyphenLineBreaks(sb.toString()); return TextNormalizationUtilities.removeHyphenLineBreaks(sb.toString());
}
/**
 * Estimates the number of text lines in this block.
 * Starts at one and adds a line whenever the {@code maxYDirAdj} of a word exceeds that of the
 * preceding word by more than the word's text height.
 *
 * @return the estimated number of lines; 1 when there are no or only one sequence
 */
public int getNumberOfLines() {
int numberOfLines = 1;
TextPositionSequence previous = null;
for (TextPositionSequence word : sequences) {
if (previous != null) {
// A jump larger than the word's own height is counted as the start of a new line.
if (word.getMaxYDirAdj() - previous.getMaxYDirAdj() > word.getTextHeight()) {
numberOfLines++;
}
}
previous = word;
}
return numberOfLines;
} }

View File

@ -15,6 +15,7 @@ import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlo
import lombok.AllArgsConstructor; import lombok.AllArgsConstructor;
import lombok.Builder; import lombok.Builder;
import lombok.Data; import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor; import lombok.NoArgsConstructor;
import lombok.SneakyThrows; import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
@ -24,12 +25,18 @@ import lombok.extern.slf4j.Slf4j;
@Builder @Builder
@NoArgsConstructor @NoArgsConstructor
@AllArgsConstructor @AllArgsConstructor
@EqualsAndHashCode(onlyExplicitlyIncluded = true)
public class TextPositionSequence implements CharSequence { public class TextPositionSequence implements CharSequence {
public static final int HEIGHT_PADDING = 2; public static final int HEIGHT_PADDING = 2;
@EqualsAndHashCode.Include
private int page; private int page;
@EqualsAndHashCode.Include
private List<RedTextPosition> textPositions = new ArrayList<>(); private List<RedTextPosition> textPositions = new ArrayList<>();
@EqualsAndHashCode.Include
private TextDirection dir; private TextDirection dir;
private int rotation; private int rotation;
private float pageHeight; private float pageHeight;
@ -55,6 +62,17 @@ public class TextPositionSequence implements CharSequence {
} }
/**
 * Creates a sequence from the given text positions. Direction, rotation, page height and page
 * width are taken from the first text position, so the list must not be empty.
 *
 * @param textPositions the text positions of this sequence; stored without copying
 * @param page the page number
 */
public TextPositionSequence(List<RedTextPosition> textPositions, int page) {

    RedTextPosition first = textPositions.get(0);

    this.textPositions = textPositions;
    this.page = page;
    this.dir = TextDirection.fromDegrees(first.getDir());
    this.rotation = first.getRotation();
    this.pageHeight = first.getPageHeight();
    this.pageWidth = first.getPageWidth();
}
@Override @Override
public int length() { public int length() {

View File

@ -25,6 +25,7 @@ public class BodyTextFrameService {
private static final float RULING_HEIGHT_THRESHOLD = 0.15f; // multiplied with page height. Header/Footer Rulings must be within that border of the page. private static final float RULING_HEIGHT_THRESHOLD = 0.15f; // multiplied with page height. Header/Footer Rulings must be within that border of the page.
private static final float RULING_WIDTH_THRESHOLD = 0.75f; // multiplied with page width. Header/Footer Rulings must be at least that wide. private static final float RULING_WIDTH_THRESHOLD = 0.75f; // multiplied with page width. Header/Footer Rulings must be at least that wide.
public void setBodyTextFrames(ClassificationDocument classificationDocument, LayoutParsingType layoutParsingType) { public void setBodyTextFrames(ClassificationDocument classificationDocument, LayoutParsingType layoutParsingType) {
Rectangle bodyTextFrame = calculateBodyTextFrame(classificationDocument.getPages(), classificationDocument.getFontSizeCounter(), false, layoutParsingType); Rectangle bodyTextFrame = calculateBodyTextFrame(classificationDocument.getPages(), classificationDocument.getFontSizeCounter(), false, layoutParsingType);
@ -132,12 +133,7 @@ public class BodyTextFrameService {
boolean landscape, boolean landscape,
LayoutParsingType layoutParsingType) { LayoutParsingType layoutParsingType) {
float approximateHeaderLineCount; float approximateHeaderLineCount = 2.9f;
if (layoutParsingType.equals(LayoutParsingType.TAAS)) {
approximateHeaderLineCount = 3.3f;
} else {
approximateHeaderLineCount = 2.9f;
}
BodyTextFrameExpansionsRectangle expansionsRectangle = new BodyTextFrameExpansionsRectangle(); BodyTextFrameExpansionsRectangle expansionsRectangle = new BodyTextFrameExpansionsRectangle();
@ -155,8 +151,9 @@ public class BodyTextFrameService {
continue; continue;
} }
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) || MarkedContentUtils.intersects(textBlock,
|| MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER)) { page.getMarkedContentBboxPerType(),
MarkedContentUtils.FOOTER)) {
continue; continue;
} }

View File

@ -7,6 +7,7 @@ import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import org.apache.logging.log4j.util.Strings;
import org.springframework.stereotype.Service; import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
@ -110,6 +111,20 @@ public class SectionsBuilderService {
} }
/**
 * Builds one section per text block of every page and stores the result on the document.
 * Each block is tagged with the number of the page it was found on before its section is built;
 * every section is built with an empty string as second argument.
 *
 * @param document the document whose pages are turned into single-block sections
 */
public void buildParagraphDebugSections(ClassificationDocument document) {

    List<ClassificationSection> debugSections = new ArrayList<>();
    for (var page : document.getPages()) {
        for (var block : page.getTextBlocks()) {
            block.setPage(page.getPageNumber());
            debugSections.add(buildTextBlock(List.of(block), Strings.EMPTY));
        }
    }
    document.setSections(debugSections);
}
public void addImagesToSections(ClassificationDocument document) { public void addImagesToSections(ClassificationDocument document) {
Map<Integer, List<ClassificationSection>> sectionMap = new HashMap<>(); Map<Integer, List<ClassificationSection>> sectionMap = new HashMap<>();

View File

@ -14,7 +14,6 @@ import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Rectangle; import com.knecon.fforesight.service.layoutparser.processor.model.table.Rectangle;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
@ -41,19 +40,18 @@ public class TableExtractionService {
* <p> * <p>
* DirAdj (Text direction adjusted) values can not be used here. * DirAdj (Text direction adjusted) values can not be used here.
* *
* @param cleanRulings The lines used to build the table. * @param emptyCells The cells used to build the table.
* @param page Page object that contains textblocks and statistics. * @param page Page object that contains textblocks and statistics.
*/ */
public void extractTables(CleanRulings cleanRulings, ClassificationPage page) { public void extractTables(List<Cell> emptyCells, ClassificationPage page) {
List<Cell> cells = findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical());
// sort cells by size (height * width) ascending so that textBlocks are always assigned to the smallest cells that contain them // sort cells by size (height * width) ascending so that textBlocks are always assigned to the smallest cells that contain them
cells.sort(CELL_SIZE_COMPARATOR); emptyCells.sort(CELL_SIZE_COMPARATOR);
for (AbstractPageBlock abstractPageBlock : page.getTextBlocks()) { for (AbstractPageBlock abstractPageBlock : page.getTextBlocks()) {
TextPageBlock textBlock = (TextPageBlock) abstractPageBlock; TextPageBlock textBlock = (TextPageBlock) abstractPageBlock;
for (Cell cell : cells) { for (Cell cell : emptyCells) {
if (cell.hasMinimumSize() && doesCellContainTextBlock(cell, textBlock)) { if (cell.hasMinimumSize() && doesCellContainTextBlock(cell, textBlock)) {
cell.addTextBlock(textBlock); cell.addTextBlock(textBlock);
break; break;
@ -61,7 +59,7 @@ public class TableExtractionService {
} }
} }
cells = new ArrayList<>(new HashSet<>(cells)); var cells = new ArrayList<>(new HashSet<>(emptyCells));
DoubleComparisons.sort(cells, Rectangle.ILL_DEFINED_ORDER); DoubleComparisons.sort(cells, Rectangle.ILL_DEFINED_ORDER);
List<Rectangle> spreadsheetAreas = SpreadsheetFinder.findSpreadsheetsFromCells(cells); List<Rectangle> spreadsheetAreas = SpreadsheetFinder.findSpreadsheetsFromCells(cells);
@ -79,9 +77,7 @@ public class TableExtractionService {
} }
} }
var containedCellsWithText = containedCells.stream() var containedCellsWithText = containedCells.stream().filter(cell -> !cell.getTextBlocks().isEmpty()).toList();
.filter(cell -> !cell.getTextBlocks().isEmpty())
.toList();
// verify if table would contain fewer cells with text than the threshold allows // verify if table would contain fewer cells with text than the threshold allows
if (containedCellsWithText.size() >= MAX_TABLE_CONTAINED_CELLS_WITH_TEXT && checkIfTableCellsAreUniform(containedCells)) { if (containedCellsWithText.size() >= MAX_TABLE_CONTAINED_CELLS_WITH_TEXT && checkIfTableCellsAreUniform(containedCells)) {
@ -101,11 +97,7 @@ public class TableExtractionService {
if (position != -1) { if (position != -1) {
page.getTextBlocks().add(position, table); page.getTextBlocks().add(position, table);
var toBeRemoved = table.getCells() var toBeRemoved = table.getCells().stream().map(Cell::getTextBlocks).flatMap(List::stream).toList();
.stream()
.map(Cell::getTextBlocks)
.flatMap(List::stream)
.toList();
// remove text blocks from the page that were also added with the table (from its contained cells) // remove text blocks from the page that were also added with the table (from its contained cells)
page.getTextBlocks().removeAll(toBeRemoved); page.getTextBlocks().removeAll(toBeRemoved);
} }
@ -139,19 +131,13 @@ public class TableExtractionService {
} }
double x0 = cell.getX(); double x0 = cell.getX();
double y0 = cell.getY(); double y0 = cell.getY();
return (x >= x0 - TEXT_BLOCK_CONTAINMENT_TOLERANCE return (x >= x0 - TEXT_BLOCK_CONTAINMENT_TOLERANCE && y >= y0 - TEXT_BLOCK_CONTAINMENT_TOLERANCE && (x + w) <= x0 + cell.getWidth() + 2 * TEXT_BLOCK_CONTAINMENT_TOLERANCE && (y + h) <= y0 + cell.getHeight() + 2 * TEXT_BLOCK_CONTAINMENT_TOLERANCE);
&& y >= y0 - TEXT_BLOCK_CONTAINMENT_TOLERANCE
&& (x + w) <= x0 + cell.getWidth() + 2 * TEXT_BLOCK_CONTAINMENT_TOLERANCE
&& (y + h) <= y0 + cell.getHeight() + 2 * TEXT_BLOCK_CONTAINMENT_TOLERANCE);
} }
public static List<Cell> findCells(List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) { public static List<Cell> findCells(List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
return RectangularIntersectionFinder.find(horizontalRulingLines, verticalRulingLines) return RectangularIntersectionFinder.find(horizontalRulingLines, verticalRulingLines).stream().map(Cell::new).collect(Collectors.toList());
.stream()
.map(Cell::new)
.collect(Collectors.toList());
} }
} }

View File

@ -0,0 +1,423 @@
package com.knecon.fforesight.service.layoutparser.processor.services.blockification;
import static java.util.stream.Collectors.toSet;
import java.awt.geom.Point2D;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.ListIterator;
import java.util.Set;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.DocstrumSegmentationService;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.utils.QuickSort;
import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil;
import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionSequenceComparator;
import lombok.RequiredArgsConstructor;
@SuppressWarnings("all")
@Service
@RequiredArgsConstructor
public class DocstrumBlockificationService {
// Segmentation service whose segmentPage(...) result (zones) is converted into page blocks by blockify.
private final DocstrumSegmentationService docstrumSegmentationService;
// Upper bound (exclusive) for the absolute difference of two coordinates that are still treated as equal.
static final float THRESHOLD = 1f;
/**
 * Segments the words of a page into zones and converts those zones into page blocks.
 * <p>
 * Underlines and strikethroughs also show up as rulings, but they must not split blocks.
 * For that reason the four borders of every given cell are used as rulings instead.
 *
 * @param textPositions the words of the page
 * @param cells         the cells whose borders are used to split zones
 * @param xyOrder       handed through unchanged to the segmentation service
 * @return a page holding the blocks created from the segmented zones
 */
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Cell> cells, boolean xyOrder) {

    List<Ruling> horizontalCellBorders = new ArrayList<>();
    List<Ruling> verticalCellBorders = new ArrayList<>();

    for (Cell cell : cells) {
        var x0 = cell.x;
        var y0 = cell.y;
        var x1 = cell.x + cell.width;
        var y1 = cell.y + cell.height;

        // the two borders running along x (at y0 and at y1)
        horizontalCellBorders.add(new Ruling(new Point2D.Float(x0, y0), new Point2D.Float(x1, y0)));
        horizontalCellBorders.add(new Ruling(new Point2D.Float(x0, y1), new Point2D.Float(x1, y1)));
        // the two borders running along y (at x0 and at x1)
        verticalCellBorders.add(new Ruling(new Point2D.Float(x0, y0), new Point2D.Float(x0, y1)));
        verticalCellBorders.add(new Ruling(new Point2D.Float(x1, y0), new Point2D.Float(x1, y1)));
    }

    List<Zone> zones = docstrumSegmentationService.segmentPage(textPositions, xyOrder);
    return new ClassificationPage(toAbstractPageBlocks(zones, horizontalCellBorders, verticalCellBorders));
}
/**
 * Converts zones into page blocks. All words of a zone are collected line by line
 * and the resulting word list is then split wherever a ruling separates it.
 *
 * @param zones             the zones to convert
 * @param horizontalRulings horizontal rulings used for splitting
 * @param verticalRulings   vertical rulings used for splitting
 * @return the blocks of all zones, in zone order
 */
private List<AbstractPageBlock> toAbstractPageBlocks(List<Zone> zones, List<Ruling> horizontalRulings, List<Ruling> verticalRulings) {

    List<AbstractPageBlock> pageBlocks = new ArrayList<>();
    for (Zone zone : zones) {
        List<TextPositionSequence> zoneWords = new ArrayList<>();
        zone.getLines()
                .forEach(line -> line.getWords()
                        .forEach(word -> zoneWords.add(new TextPositionSequence(word.getTextPositions(), word.getPage()))));
        pageBlocks.addAll(splitZonesAtRulings(zoneWords, horizontalRulings, verticalRulings));
    }
    return pageBlocks;
}
/**
 * Combines text blocks of a page that belong together.
 * <p>
 * Intersecting blocks are merged first. Afterwards the blocks are walked in list order and each
 * text block is compared with the text block handled before it; the pair is merged when one of the
 * merge rules below applies. Finally intersecting blocks are merged once more, because the combined
 * blocks may now overlap others.
 * <p>
 * After a merge the merged block becomes the new {@code previous}. Earlier the stale, pre-merge
 * block was kept as {@code previous}: it was no longer part of the list and still carried its old
 * bounding box, so follow-up comparisons were made against outdated data.
 *
 * @param page the page whose text blocks are combined in place
 */
public void combineBlocks(ClassificationPage page) {

    mergeIntersectingBlocks(page.getTextBlocks());

    TextPageBlock previous = new TextPageBlock();
    ListIterator<AbstractPageBlock> itty = page.getTextBlocks().listIterator();
    while (itty.hasNext()) {
        AbstractPageBlock block = itty.next();
        // NOTE(review): tables are skipped without resetting 'previous'. combineBlocksAndResetIterator replaces
        // the element directly in front of 'current', which would be the table in that case — confirm that no
        // table can sit between two text blocks when this method runs.
        if (block instanceof TablePageBlock) {
            continue;
        }

        TextPageBlock current = (TextPageBlock) block;

        if (previous != null && !previous.getSequences().isEmpty()) {

            // blocks with different text directions are never combined
            if (current.getDir() != previous.getDir()) {
                previous = current;
                continue;
            }

            if (areTheOnlyTwoBlocksOnHeightsWithBothMoreThanTwoLines(previous, current, page)) {
                previous = combineBlocksAndResetIterator(previous, current, itty, true);
                continue;
            }

            if (previous.almostIntersects(current, 0, 0)) {
                previous = combineBlocksAndResetIterator(previous, current, itty, previous.isToDuplicate());
                continue;
            }

            if (isSameTopOrBottomWithPreviousHavingMaxTwoLinesAndCurrentThanOneAndMax4OtherBlocksOnHeight(previous, current, page)) {
                previous = combineBlocksAndResetIterator(previous, current, itty, false);
                continue;
            }

            if (isOnlyIntersectingYAndOnelineOrPrevoiusTwoLines(previous, current, page)) {
                previous = combineBlocksAndResetIterator(previous, current, itty, false);
                continue;
            }
        }
        previous = current;
    }

    mergeIntersectingBlocks(page.getTextBlocks());
}


/**
 * Merge rule: both blocks intersect in y, the line counts allow a merge
 * (see {@link #hasCombinableLineCounts}) and no other block of the page lies in their y range.
 */
private boolean isOnlyIntersectingYAndOnelineOrPrevoiusTwoLines(TextPageBlock previous, TextPageBlock current, ClassificationPage page) {

    return current.intersectsY(previous) //
            && hasCombinableLineCounts(previous, current) //
            && numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, page.getTextBlocks()) <= 0;
}


/**
 * Merge rule: both blocks share (within {@code THRESHOLD}) their max y or their min y, the line counts
 * allow a merge, no other block lies between them and at most four other blocks lie in their y range.
 */
private boolean isSameTopOrBottomWithPreviousHavingMaxTwoLinesAndCurrentThanOneAndMax4OtherBlocksOnHeight(TextPageBlock previous,
                                                                                                          TextPageBlock current,
                                                                                                          ClassificationPage page) {

    return (Math.abs(previous.getMaxY() - current.getMaxY()) < THRESHOLD || Math.abs(previous.getMinY() - current.getMinY()) < THRESHOLD) //
            && hasCombinableLineCounts(previous, current) //
            && !hasBetween(current, previous, page.getTextBlocks()) //
            && numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, page.getTextBlocks()) <= 4;
}


/**
 * Merge rule: both blocks have at least two lines, intersect in y and no other block of the page
 * lies in their y range.
 */
private boolean areTheOnlyTwoBlocksOnHeightsWithBothMoreThanTwoLines(TextPageBlock previous, TextPageBlock current, ClassificationPage page) {

    return previous.getNumberOfLines() >= 2 && current.getNumberOfLines() >= 2 //
            && previous.intersectsY(current) //
            && numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, page.getTextBlocks()) == 0;
}


/**
 * Line count condition shared by two merge rules: either the previous block has exactly one line and the
 * current one at least one, or the previous block has exactly two lines and the current one exactly one.
 */
private boolean hasCombinableLineCounts(TextPageBlock previous, TextPageBlock current) {

    return (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1) //
            || (previous.getNumberOfLines() == 2 && current.getNumberOfLines() == 1);
}


/**
 * Appends the sequences of {@code current} to {@code previous}, builds a fresh block from the combined
 * sequences and swaps it into the list: {@code current} is removed and the element in front of it is
 * replaced by the merged block. The iterator ends up behind the merged block.
 *
 * @param previous    the block that receives the sequences of {@code current}
 * @param current     the block last returned by {@code itty.next()}; it is removed from the list
 * @param itty        the iterator the caller is walking the block list with
 * @param toDuplicate value of the duplicate flag set on the merged block
 * @return the merged block that is now part of the list
 */
private TextPageBlock combineBlocksAndResetIterator(TextPageBlock previous, TextPageBlock current, ListIterator<AbstractPageBlock> itty, boolean toDuplicate) {

    previous.getSequences().addAll(current.getSequences());
    TextPageBlock merged = buildTextBlock(previous.getSequences(), 0);
    merged.setToDuplicate(toDuplicate);

    itty.remove();    // removes 'current', the element last returned by next()
    itty.previous();  // steps back onto the element in front of it ...
    itty.set(merged); // ... and replaces that element with the merged block
    itty.next();      // moves the cursor behind the merged block again
    return merged;
}
/**
 * Checks whether some third block sits between the two given blocks: it must intersect {@code other}
 * in y, start at or after the max x of {@code other} and end at or before the min x of {@code block}.
 * The two given blocks themselves (compared by identity) are ignored.
 *
 * @param block     the block whose min x is the far limit
 * @param other     the block whose max x is the near limit
 * @param allBlocks all blocks to search through
 * @return {@code true} if at least one such block exists
 */
private boolean hasBetween(TextPageBlock block, TextPageBlock other, List<AbstractPageBlock> allBlocks) {

    return allBlocks.stream()
            .filter(candidate -> candidate != other && candidate != block)
            .anyMatch(candidate -> other.intersectsY(candidate) //
                    && other.getMaxX() <= candidate.getMinX() //
                    && candidate.getMaxX() <= block.getMinX());
}
/**
 * Counts the blocks, other than the two given ones (compared by identity), whose y extent overlaps
 * the range spanned from the smaller min y to the smaller max y of the two given blocks.
 * <p>
 * NOTE(review): the upper bound is the minimum of both max y values, not the maximum — presumably
 * intended ("smaller blocks" in the name), confirm.
 *
 * @param block     first block of the pair
 * @param other     second block of the pair
 * @param allBlocks all blocks to check against
 * @return the number of overlapping other blocks
 */
private int numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(TextPageBlock block, TextPageBlock other, List<AbstractPageBlock> allBlocks) {

    double lowerBound = Math.min(block.getMinY(), other.getMinY());
    double upperBound = Math.min(block.getMaxY(), other.getMaxY());

    return (int) allBlocks.stream()
            .filter(candidate -> candidate != other && candidate != block)
            .filter(candidate -> lowerBound <= candidate.getMaxY() && upperBound >= candidate.getMinY())
            .count();
}
/**
 * Merges text blocks of the same text direction that (almost) intersect each other.
 * <p>
 * Every text block that is not flagged as duplicate is compared with all other such text blocks of
 * the list. An intersecting partner hands over its sequences, the combined sequences are sorted and
 * a new block built from them replaces the current list entry; the partner is marked for removal.
 * All marked blocks are removed from the list at the very end. Tables are never touched.
 * Blocks already marked for removal are skipped as partners only; the outer walk still visits them.
 *
 * @param blocks the block list, modified in place
 */
public void mergeIntersectingBlocks(List<AbstractPageBlock> blocks) {

    Set<AbstractPageBlock> absorbed = new HashSet<>();
    ListIterator<AbstractPageBlock> iterator = blocks.listIterator();

    while (iterator.hasNext()) {
        AbstractPageBlock next = iterator.next();
        if (next instanceof TablePageBlock) {
            continue;
        }
        TextPageBlock current = (TextPageBlock) next;
        if (current.isToDuplicate()) {
            continue;
        }

        for (int index = 0; index < blocks.size(); index++) {
            AbstractPageBlock candidate = blocks.get(index);
            if (absorbed.contains(candidate) || candidate == current || candidate instanceof TablePageBlock) {
                continue;
            }

            TextPageBlock partner = (TextPageBlock) candidate;
            if (partner.isToDuplicate()) {
                continue;
            }
            if (current.getDir() != partner.getDir() || !current.almostIntersects(partner, 0, 0)) {
                continue;
            }

            current.getSequences().addAll(partner.getSequences());
            QuickSort.sort(current.getSequences(), new TextPositionSequenceComparator());
            current = buildTextBlock(current.getSequences(), 0);
            absorbed.add(partner);
            // swap the grown block into the slot the iterator currently points at
            iterator.set(current);
        }
    }

    blocks.removeAll(absorbed);
}
/**
 * Splits the words of a zone into blocks. A new block is started whenever the text direction changes
 * between two consecutive words or a ruling separates the next word from the words collected so far.
 * <p>
 * The bounding box of the collected words starts out "empty" (min = largest float, max = smallest
 * float), so the first word of every chunk always defines it. The former start values 1000 and 0
 * clamped the box for coordinates above 1000 or below 0. Rulings are only checked once a chunk holds
 * at least one word; the check for the first word was computed but never used.
 *
 * @param textPositions        the words of the zone, in reading order
 * @param horizontalRulingLines horizontal rulings that may split the zone
 * @param verticalRulingLines   vertical rulings that may split the zone
 * @return the resulting blocks; empty if no words were given
 */
public List<AbstractPageBlock> splitZonesAtRulings(List<TextPositionSequence> textPositions, List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {

    int indexOnPage = 0;
    List<TextPositionSequence> chunkWords = new ArrayList<>();
    List<AbstractPageBlock> chunkBlockList = new ArrayList<>();

    float minX = Float.MAX_VALUE;
    float maxX = -Float.MAX_VALUE;
    float minY = Float.MAX_VALUE;
    float maxY = -Float.MAX_VALUE;
    TextPositionSequence prev = null;

    for (TextPositionSequence word : textPositions) {

        boolean startNewChunk = prev != null //
                && (!prev.getDir().equals(word.getDir()) //
                || isSplitByRuling(minX, minY, maxX, maxY, word, horizontalRulingLines, verticalRulingLines));

        if (startNewChunk) {
            chunkBlockList.add(buildTextBlock(chunkWords, indexOnPage));
            indexOnPage++;
            chunkWords = new ArrayList<>();

            // reset the bounding box; the current word re-initialises it below
            minX = Float.MAX_VALUE;
            maxX = -Float.MAX_VALUE;
            minY = Float.MAX_VALUE;
            maxY = -Float.MAX_VALUE;
        }

        chunkWords.add(word);
        prev = word;

        if (word.getMinXDirAdj() < minX) {
            minX = word.getMinXDirAdj();
        }
        if (word.getMaxXDirAdj() > maxX) {
            maxX = word.getMaxXDirAdj();
        }
        if (word.getMinYDirAdj() < minY) {
            minY = word.getMinYDirAdj();
        }
        if (word.getMaxYDirAdj() > maxY) {
            maxY = word.getMaxYDirAdj();
        }
    }

    // buildTextBlock yields null for an empty word list, i.e. when no words were given at all
    TextPageBlock lastBlock = buildTextBlock(chunkWords, indexOnPage);
    if (lastBlock != null) {
        chunkBlockList.add(lastBlock);
    }

    return chunkBlockList;
}
/**
 * Tells whether two values differ by less than {@code THRESHOLD}.
 *
 * @param f1 first value
 * @param f2 second value
 * @return {@code true} if the absolute difference is below the threshold
 */
private boolean equalsWithThreshold(float f1, float f2) {

    float difference = f1 - f2;
    return Math.abs(difference) < THRESHOLD;
}
/**
 * Builds a text block around the given words. The block starts with the extent of the first word and
 * is grown to the union with every following word. Font, style, font size, word height and space width
 * statistics of the words are recorded on the block. If all words share the same min y (rounded to
 * three decimals) the sequences of the block are sorted by their min x.
 *
 * @param wordBlockList the words of the block; also handed to the block as its sequences
 * @param indexOnPage   not used by this method
 * @return the block, or {@code null} if the word list is empty
 */
private TextPageBlock buildTextBlock(List<TextPositionSequence> wordBlockList, int indexOnPage) {

    FloatFrequencyCounter heightCounter = new FloatFrequencyCounter();
    FloatFrequencyCounter fontSizeCounter = new FloatFrequencyCounter();
    FloatFrequencyCounter spaceWidthCounter = new FloatFrequencyCounter();
    StringFrequencyCounter fontCounter = new StringFrequencyCounter();
    StringFrequencyCounter styleCounter = new StringFrequencyCounter();

    TextPageBlock result = null;
    for (TextPositionSequence word : wordBlockList) {

        heightCounter.add(word.getTextHeight());
        fontSizeCounter.add(word.getFontSize());
        spaceWidthCounter.add(word.getSpaceWidth());
        fontCounter.add(word.getFont());
        styleCounter.add(word.getFontStyle());

        if (result != null) {
            TextPageBlock grown = result.union(word);
            result.resize(grown.getMinX(), grown.getMinY(), grown.getWidth(), grown.getHeight());
            continue;
        }

        result = new TextPageBlock(word.getMinXDirAdj(), word.getMaxXDirAdj(), word.getMinYDirAdj(), word.getMaxYDirAdj(), wordBlockList, word.getRotation());
    }

    if (result == null) {
        return null;
    }

    result.setMostPopularWordFont(fontCounter.getMostPopular());
    result.setMostPopularWordStyle(styleCounter.getMostPopular());
    result.setMostPopularWordFontSize(fontSizeCounter.getMostPopular());
    result.setMostPopularWordHeight(heightCounter.getMostPopular());
    result.setMostPopularWordSpaceWidth(spaceWidthCounter.getMostPopular());
    result.setHighestFontSize(fontSizeCounter.getHighest());

    var sequences = result.getSequences();
    if (sequences != null) {
        // exactly one distinct rounded min y value: all words share it
        long distinctMinYValues = sequences.stream()
                .map(sequence -> round(sequence.getMinYDirAdj(), 3))
                .distinct()
                .count();
        if (distinctMinYValues == 1) {
            sequences.sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj));
        }
    }

    return result;
}
/**
 * Checks whether a ruling separates the given word from the bounding box collected so far.
 * Two connecting segments are tested, each against the vertical and the horizontal rulings:
 * from (maxX, minY) to the word's (min x, min y), and from (minX, minY) to the word's (min x, max y).
 * {@code maxY} is not used.
 *
 * @param minX                  min x of the words collected so far
 * @param minY                  min y of the words collected so far
 * @param maxX                  max x of the words collected so far
 * @param maxY                  max y of the words collected so far (unused)
 * @param word                  the word about to be added
 * @param horizontalRulingLines horizontal rulings
 * @param verticalRulingLines   vertical rulings
 * @return {@code true} if any ruling intersects one of the connecting segments
 */
private boolean isSplitByRuling(float minX,
                                float minY,
                                float maxX,
                                float maxY,
                                TextPositionSequence word,
                                List<Ruling> horizontalRulingLines,
                                List<Ruling> verticalRulingLines) {

    var wordMinX = word.getMinXDirAdj();
    var wordMinY = word.getMinYDirAdj();
    var wordMaxY = word.getMaxYDirAdj();
    var degrees = word.getDir().getDegrees();
    var pageWidth = word.getPageWidth();
    var pageHeight = word.getPageHeight();

    return isSplitByRuling(maxX, minY, wordMinX, wordMinY, verticalRulingLines, degrees, pageWidth, pageHeight) //
            || isSplitByRuling(minX, minY, wordMinX, wordMaxY, horizontalRulingLines, degrees, pageWidth, pageHeight) //
            || isSplitByRuling(maxX, minY, wordMinX, wordMinY, horizontalRulingLines, degrees, pageWidth, pageHeight) //
            || isSplitByRuling(minX, minY, wordMinX, wordMaxY, verticalRulingLines, degrees, pageWidth, pageHeight);
}
/**
 * Checks whether any of the given rulings, after being converted to text-direction adjusted
 * coordinates, intersects the segment from (previousX2, previousY1) to (currentX1, currentY1).
 *
 * @param previousX2  x of the segment start
 * @param previousY1  y of the segment start
 * @param currentX1   x of the segment end
 * @param currentY1   y of the segment end
 * @param rulingLines the rulings to test
 * @param dir         text direction handed to the coordinate conversion
 * @param pageWidth   page width handed to the coordinate conversion
 * @param pageHeight  page height handed to the coordinate conversion
 * @return {@code true} as soon as one ruling intersects the segment
 */
private boolean isSplitByRuling(float previousX2, float previousY1, float currentX1, float currentY1, List<Ruling> rulingLines, float dir, float pageWidth, float pageHeight) {

    return rulingLines.stream()
            .map(ruling -> RulingTextDirAdjustUtil.convertToDirAdj(ruling, dir, pageWidth, pageHeight))
            .anyMatch(line -> line.intersectsLine(previousX2, previousY1, currentX1, currentY1));
}
/**
 * Rounds a value to the given number of decimal places.
 *
 * @param value         the value to round
 * @param decimalPoints the number of decimal places to keep
 * @return the rounded value
 */
private double round(float value, int decimalPoints) {

    double scale = Math.pow(10, decimalPoints);
    long scaled = Math.round(value * scale);
    return scaled / scale;
}
}

View File

@ -1,330 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.services.blockification;
// TODO: figure out, why this fails the build
// import static com.knecon.fforesight.service.layoutparser.processor.services.factory.SearchTextWithTextPositionFactory.HEIGHT_PADDING;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.Orientation;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil;
import org.springframework.stereotype.Service;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Stream;
@Service
@SuppressWarnings("all")
public class TaasBlockificationService {
// Upper bound (exclusive) for the absolute difference of two coordinates that are still treated as equal.
private static final float THRESHOLD = 1f;
// Factor applied to a word/text height when deciding whether a vertical gap separates two blocks.
private static final float Y_GAP_SPLIT_HEIGHT_MODIFIER = 1.25f; // multiplied with text height
// Y threshold handed to almostIntersects(...) when merging almost intersecting blocks.
private static final float INTERSECTS_Y_THRESHOLD = 4;// 2 * HEIGHT_PADDING // This is exactly 2 times our position height padding. This is required to find boxes that are visually intersecting.
// A word on the same line that starts more than this far right of the current max x starts a new block.
private static final int X_GAP_SPLIT_CONSTANT = 50;
// Max absolute difference of two x coordinates to count as aligned.
// NOTE(review): also used as the offset "size() - X_ALIGNMENT_THRESHOLD" to address the last block of a list.
public static final int X_ALIGNMENT_THRESHOLD = 1;
// A word whose min x minus the current min x is below this value starts a new block.
public static final int NEGATIVE_X_GAP_THRESHOLD = -5;
// Case-insensitive pattern for list markers: at the start of the text a number from 1 to 20 or a single
// letter, optional whitespace and then '.' or ')'; or the character U+F0B7. The '^' anchor belongs to the
// first alternative only.
private Pattern listIdentifier = Pattern.compile("^(?:(?:[1-9]|1\\d|20|[ivxlc]|[a-z])\\s*(?:[.)]))|\\uF0B7", Pattern.CASE_INSENSITIVE);
/**
 * Builds the text blocks of a page in three steps: fine granular blocks are created by expanding
 * minX/maxX and minY/maxY with every word that is not split off by one of the split conditions,
 * then consecutive blocks are merged, and finally almost intersecting blocks are merged until
 * nothing changes anymore.
 * <p>
 * This method must use text direction adjusted positions (DirAdj), where {0,0} is on the upper left.
 * Never try to change this! Rulings (table lines) must be adjusted to the text direction as well
 * when checking whether a block is split by a ruling.
 *
 * @param textPositions         The words of a page.
 * @param horizontalRulingLines Horizontal table lines.
 * @param verticalRulingLines   Vertical table lines.
 * @return ClassificationPage object that contains the text blocks.
 */
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {

    List<TextPageBlock> fineGranularBlocks = constructFineGranularTextPageBlocks(textPositions, horizontalRulingLines, verticalRulingLines);
    List<TextPageBlock> alignedBlocks = mergeTextPageBlocksAligningX(fineGranularBlocks);
    List<TextPageBlock> convergedBlocks = mergeIntersectingTextBlocksUntilConvergence(alignedBlocks);

    return new ClassificationPage(new ArrayList<AbstractPageBlock>(convergedBlocks));
}
/**
 * Repeats the merge of almost intersecting blocks until a pass no longer changes the number of blocks.
 * At least one pass is always executed.
 *
 * @param classificationTextBlocks the blocks to merge
 * @return the blocks after the last merge pass
 */
private List<TextPageBlock> mergeIntersectingTextBlocksUntilConvergence(List<TextPageBlock> classificationTextBlocks) {

    List<TextPageBlock> merged = classificationTextBlocks;
    int sizeBeforePass;
    do {
        sizeBeforePass = merged.size();
        merged = mergeTextPageBlocksAlmostIntersecting(merged);
    } while (merged.size() != sizeBeforePass);

    return merged;
}
/**
 * Groups consecutive blocks and merges every group into one block. A block joins the group of its
 * predecessor when it is vertically close to it, uses the same most popular font and font size and
 * does not start with a list marker; otherwise it opens a new group.
 * <p>
 * Despite the method name, x alignment does not take part in the decision: the alignment values were
 * computed but never read and have been removed together with the unused gap variable and the
 * commented-out alternatives.
 *
 * @param classificationTextBlocks the blocks in reading order
 * @return one merged block per group; an empty, modifiable list if no blocks were given
 */
private List<TextPageBlock> mergeTextPageBlocksAligningX(List<TextPageBlock> classificationTextBlocks) {

    if (classificationTextBlocks.isEmpty()) {
        return new ArrayList<>();
    }

    List<List<TextPageBlock>> textBlocksToMerge = new LinkedList<>();
    List<TextPageBlock> currentTextBlocksToMerge = new LinkedList<>();
    textBlocksToMerge.add(currentTextBlocksToMerge);

    TextPageBlock previousTextBlock = null;
    for (TextPageBlock currentTextBlock : classificationTextBlocks) {

        if (previousTextBlock != null) {
            boolean isListIdentifier = listIdentifier.matcher(currentTextBlock.getText()).find();
            // true when the distance between both blocks is SMALL enough to keep them together
            boolean closeInY = Math.abs(currentTextBlock.getPdfMaxY() - previousTextBlock.getPdfMinY()) < previousTextBlock.getMostPopularWordHeight() * Y_GAP_SPLIT_HEIGHT_MODIFIER;
            // NOTE(review): the font sizes are compared with '=='; if the getter returns a boxed Float this
            // compares references instead of values — confirm the getter's return type.
            boolean sameFont = previousTextBlock.getMostPopularWordFont().equals(currentTextBlock.getMostPopularWordFont()) //
                    && previousTextBlock.getMostPopularWordFontSize() == currentTextBlock.getMostPopularWordFontSize();

            boolean continuesGroup = closeInY && sameFont && !isListIdentifier;
            if (!continuesGroup) {
                currentTextBlocksToMerge = new LinkedList<>();
                textBlocksToMerge.add(currentTextBlocksToMerge);
            }
        }

        currentTextBlocksToMerge.add(currentTextBlock);
        previousTextBlock = currentTextBlock;
    }

    return textBlocksToMerge.stream().map(TextPageBlock::merge).toList();
}
/**
 * Performs one merge pass over the blocks: every block that has not been consumed yet becomes the seed
 * of a group and pulls in all not yet consumed blocks that almost intersect it; each group is then
 * merged into one block.
 * <p>
 * The grouping is written as a plain loop. The former stream pipeline relied on a filter reading a set
 * that a {@code peek} stage was modifying at the same time, which only works for strictly sequential
 * evaluation and is discouraged for streams.
 *
 * @param textPageBlocks the blocks to merge
 * @return one merged block per group, in the order of the group seeds
 */
private List<TextPageBlock> mergeTextPageBlocksAlmostIntersecting(List<TextPageBlock> textPageBlocks) {

    Set<TextPageBlock> alreadyMerged = new HashSet<>();
    List<List<TextPageBlock>> textBlocksToMerge = new LinkedList<>();

    for (TextPageBlock seed : textPageBlocks) {
        // add(..) returns false if the block was consumed by an earlier group
        if (!alreadyMerged.add(seed)) {
            continue;
        }

        List<TextPageBlock> group = new ArrayList<>();
        group.add(seed);
        for (TextPageBlock candidate : textPageBlocks) {
            // the seed itself is already marked as merged and therefore never added twice
            if (seed.almostIntersects(candidate, INTERSECTS_Y_THRESHOLD, 0) && alreadyMerged.add(candidate)) {
                group.add(candidate);
            }
        }
        textBlocksToMerge.add(group);
    }

    return textBlocksToMerge.stream().map(TextPageBlock::merge).toList();
}
/**
 * Merges blocks of the given list in two passes, based on their orientation; merged blocks are removed
 * from the list.
 * <p>
 * Pass 1: a LEFT block is added to the last remembered LEFT block (and a RIGHT block to the last
 * remembered RIGHT block) when the remembered block's min y is greater than the block's min y and the
 * block's max y plus its most popular word height exceeds the remembered block's min y.
 * <p>
 * Pass 2: a LEFT or RIGHT block is added to its direct predecessor when the predecessor is a LEFT block
 * and both have the same max y within the threshold.
 * <p>
 * NOTE(review): no call of this method is visible in this part of the file.
 *
 * @param classificationTextBlocks the blocks to process, modified in place
 */
private void assignOrientations(List<TextPageBlock> classificationTextBlocks) {
Iterator<TextPageBlock> itty = classificationTextBlocks.iterator();
// last block of each orientation that was kept in the list
TextPageBlock previousLeft = null;
TextPageBlock previousRight = null;
// pass 1: merge per orientation
while (itty.hasNext()) {
TextPageBlock block = (TextPageBlock) itty.next();
if (previousLeft != null && block.getOrientation().equals(Orientation.LEFT)) {
if (previousLeft.getMinY() > block.getMinY() && block.getMaxY() + block.getMostPopularWordHeight() > previousLeft.getMinY()) {
previousLeft.add(block);
itty.remove();
continue;
}
}
if (previousRight != null && block.getOrientation().equals(Orientation.RIGHT)) {
if (previousRight.getMinY() > block.getMinY() && block.getMaxY() + block.getMostPopularWordHeight() > previousRight.getMinY()) {
previousRight.add(block);
itty.remove();
continue;
}
}
// the block was not merged: remember it as the latest block of its orientation
if (block.getOrientation().equals(Orientation.LEFT)) {
previousLeft = block;
} else if (block.getOrientation().equals(Orientation.RIGHT)) {
previousRight = block;
}
}
// pass 2: merge a block into its direct predecessor
itty = classificationTextBlocks.iterator();
TextPageBlock previous = null;
while (itty.hasNext()) {
TextPageBlock block = (TextPageBlock) itty.next();
// both alternatives require a LEFT predecessor with the same max y; they differ only in the
// orientation of the current block (LEFT in the first, RIGHT in the second alternative)
if (previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation().equals(Orientation.LEFT) && equalsWithThreshold(
block.getMaxY(),
previous.getMaxY()) || previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation()
.equals(Orientation.RIGHT) && equalsWithThreshold(block.getMaxY(), previous.getMaxY())) {
previous.add(block);
itty.remove();
continue;
}
previous = block;
}
}
/**
 * Walks the words of a page in the given order and groups them into fine granular text blocks.
 * The bounding box (minX/maxX/minY/maxY) of the current group grows with every word; the group is
 * closed and a new one started as soon as one of the split conditions evaluated below holds.
 * Closed blocks may additionally be tagged with an orientation (LEFT/RIGHT).
 *
 * @param textPositions         the words of the page
 * @param horizontalRulingLines horizontal rulings that may split a block
 * @param verticalRulingLines   vertical rulings that may split a block
 * @return the blocks in the order they were closed; empty if no block could be built
 */
private List<TextPageBlock> constructFineGranularTextPageBlocks(List<TextPositionSequence> textPositions,
List<Ruling> horizontalRulingLines,
List<Ruling> verticalRulingLines) {
int indexOnPage = 0;
List<TextPositionSequence> wordClusterToCombine = new ArrayList<>();
List<TextPageBlock> classificationTextBlocks = new ArrayList<>();
// bounding box of the words collected for the current block; 1000/0 act as "empty" start values
// NOTE(review): these start values presumably assume coordinates within 0..1000 — confirm
float minX = 1000, maxX = 0, minY = 1000, maxY = 0;
TextPositionSequence prev = null;
// TODO: make static final constant
// wasSplitted/splitX1 remember that the last block was closed because of an inline x gap and at which x the split-off word started
boolean wasSplitted = false;
Float splitX1 = null;
for (TextPositionSequence word : textPositions) {
Matcher listIdentifierPattern = listIdentifier.matcher(word.toString());
// the word starts further below the current box than the scaled height of the smaller of both words
boolean yGap = prev != null && word.getMinYDirAdj() - maxY > Math.min(word.getHeight(), prev.getHeight()) * Y_GAP_SPLIT_HEIGHT_MODIFIER;
boolean sameLine = prev != null && equalsWithThreshold(prev.getMinYDirAdj(), word.getMinYDirAdj());
// on the same line, but starting far to the right of the current box
boolean positiveXGapInline = prev != null && maxX + X_GAP_SPLIT_CONSTANT < word.getMinXDirAdj() && sameLine;
// the word starts clearly left of the current box
boolean negativeXGap = prev != null && word.getMinXDirAdj() - minX < NEGATIVE_X_GAP_THRESHOLD;
// the word starts more than one text height above the previous word
boolean startFromTop = prev != null && word.getMinYDirAdj() < prev.getMinYDirAdj() - prev.getTextHeight();
// a different y than the previous word after an inline split, and not at the x where the split-off word started
boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj();
boolean splitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, horizontalRulingLines, verticalRulingLines);
// NOTE(review): splitByDir is computed but not part of the split condition below
boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir());
boolean fontChange = prev != null && (!word.getFont().equals(prev.getFont()) || !word.getFontStyle()
.equals(prev.getFontStyle()) || word.getFontSize() != prev.getFontSize());
boolean newline = prev != null && Math.abs(word.getMinYDirAdj() - prev.getMinYDirAdj()) > word.getHeight();
// the whole word has to match the list marker pattern
boolean isListIdentifier = listIdentifierPattern.matches();
// close the current block if any split condition holds; a font change or list marker only splits on a new line
if (prev != null && (prev.isParagraphStart() || negativeXGap || positiveXGapInline || yGap || startFromTop || splitByRuling || (newline && (fontChange || isListIdentifier)))) {
// if (prev != null && (lineSeparation || startFromTop || splitByX || splitByDir || isSplitByRuling)) {
Orientation prevOrientation = null;
if (!classificationTextBlocks.isEmpty()) {
// X_ALIGNMENT_THRESHOLD has the value 1 here, so this reads the orientation of the last closed block
prevOrientation = classificationTextBlocks.get(classificationTextBlocks.size() - X_ALIGNMENT_THRESHOLD).getOrientation();
}
TextPageBlock classificationTextBlock = TextPageBlock.fromTextPositionSequences(wordClusterToCombine);
classificationTextBlocks.add(classificationTextBlock);
wordClusterToCombine = new ArrayList<>();
// tag the block that was just closed with an orientation
if (positiveXGapInline && !splitByRuling) {
wasSplitted = true;
classificationTextBlock.setOrientation(Orientation.LEFT);
splitX1 = word.getMinXDirAdj();
} else if (newLineAfterSplit && !splitByRuling) {
wasSplitted = false;
classificationTextBlock.setOrientation(Orientation.RIGHT);
splitX1 = null;
} else if (prevOrientation != null && prevOrientation.equals(Orientation.RIGHT) && (yGap || !startFromTop || !positiveXGapInline || !newLineAfterSplit || !splitByRuling)) {
classificationTextBlock.setOrientation(Orientation.LEFT);
}
// reset the bounding box for the next block
minX = 1000;
maxX = 0;
minY = 1000;
maxY = 0;
// prev is assigned again right below, so this reset has no lasting effect
prev = null;
}
wordClusterToCombine.add(word);
prev = word;
// grow the bounding box of the current block by the word
if (word.getMinXDirAdj() < minX) {
minX = word.getMinXDirAdj();
}
if (word.getMaxXDirAdj() > maxX) {
maxX = word.getMaxXDirAdj();
}
if (word.getMinYDirAdj() < minY) {
minY = word.getMinYDirAdj();
}
if (word.getMaxYDirAdj() > maxY) {
maxY = word.getMaxYDirAdj();
}
}
// close the block of the remaining words; the result is only added if it is not null
TextPageBlock classificationTextBlock = TextPageBlock.fromTextPositionSequences(wordClusterToCombine);
if (classificationTextBlock != null) {
classificationTextBlocks.add(classificationTextBlock);
}
return classificationTextBlocks;
}
/**
 * Tells whether two coordinates are close enough to be treated as equal.
 *
 * @param f1 first value
 * @param f2 second value
 * @return {@code true} if the absolute difference of both values is strictly below {@code THRESHOLD}
 */
private boolean equalsWithThreshold(float f1, float f2) {

    float distance = Math.abs(f1 - f2);
    return distance < THRESHOLD;
}
/**
 * Checks whether a ruling lies between the current word cluster and the given word.
 * <p>
 * Four connecting segments are probed in this order, stopping at the first hit:
 * cluster (maxX, minY) to word (minX, minY) against the vertical rulings,
 * cluster (minX, minY) to word (minX, maxY) against the horizontal rulings,
 * cluster (maxX, minY) to word (minX, minY) against the horizontal rulings,
 * cluster (minX, minY) to word (minX, maxY) against the vertical rulings.
 *
 * @param minX                  left edge of the current cluster
 * @param minY                  top edge of the current cluster
 * @param maxX                  right edge of the current cluster
 * @param maxY                  bottom edge of the current cluster (not used by the checks)
 * @param word                  the word that is about to be appended
 * @param horizontalRulingLines horizontal rulings of the page
 * @param verticalRulingLines   vertical rulings of the page
 * @return {@code true} if any of the probed segments intersects a ruling
 */
private boolean isSplitByRuling(float minX,
                                float minY,
                                float maxX,
                                float maxY,
                                TextPositionSequence word,
                                List<Ruling> horizontalRulingLines,
                                List<Ruling> verticalRulingLines) {

    float wordMinX = word.getMinXDirAdj();
    float wordMinY = word.getMinYDirAdj();
    float degrees = word.getDir().getDegrees();
    float pageWidth = word.getPageWidth();
    float pageHeight = word.getPageHeight();

    if (isSplitByRuling(maxX, minY, wordMinX, wordMinY, verticalRulingLines, degrees, pageWidth, pageHeight)) {
        return true;
    }
    if (isSplitByRuling(minX, minY, wordMinX, word.getMaxYDirAdj(), horizontalRulingLines, degrees, pageWidth, pageHeight)) {
        return true;
    }
    if (isSplitByRuling(maxX, minY, wordMinX, wordMinY, horizontalRulingLines, degrees, pageWidth, pageHeight)) {
        return true;
    }
    return isSplitByRuling(minX, minY, wordMinX, word.getMaxYDirAdj(), verticalRulingLines, degrees, pageWidth, pageHeight);
}
/**
 * Checks whether the segment from (previousX2, previousY1) to (currentX1, currentY1) crosses any of the
 * given rulings. Each ruling is first converted via {@code RulingTextDirAdjustUtil.convertToDirAdj} using
 * the text direction and page dimensions.
 *
 * @param previousX2  x coordinate of the segment start
 * @param previousY1  y coordinate of the segment start
 * @param currentX1   x coordinate of the segment end
 * @param currentY1   y coordinate of the segment end
 * @param rulingLines rulings to test against
 * @param dir         text direction in degrees
 * @param pageWidth   width of the page
 * @param pageHeight  height of the page
 * @return {@code true} if at least one converted ruling intersects the segment
 */
private boolean isSplitByRuling(float previousX2, float previousY1, float currentX1, float currentY1, List<Ruling> rulingLines, float dir, float pageWidth, float pageHeight) {

    return rulingLines.stream()
            .map(ruling -> RulingTextDirAdjustUtil.convertToDirAdj(ruling, dir, pageWidth, pageHeight))
            .anyMatch(adjusted -> adjusted.intersectsLine(previousX2, previousY1, currentX1, currentY1));
}
}

View File

@ -1,114 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.services.classification;
import java.util.List;
import java.util.regex.Pattern;
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.services.BodyTextFrameService;
import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
/**
 * Assigns a {@link PageBlockType} to every text block of a document, based on its position relative to
 * the body text frame, marked content, and how its font size and style compare to the document-wide
 * most popular values.
 */
@Slf4j
@Service
@RequiredArgsConstructor
public class TaasClassificationService {

    /** Matches text that consists of digits only; compiled once instead of on every classified block. */
    private static final Pattern DIGITS_ONLY = Pattern.compile("[0-9]+");

    private final BodyTextFrameService bodyTextFrameService;


    /**
     * Classifies all text blocks on all pages of the document.
     *
     * @param document the document whose pages are classified in place
     */
    public void classifyDocument(ClassificationDocument document) {

        // NOTE(review): presumably the font sizes larger than the most popular one - confirm in the counter class.
        List<Float> headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular();

        log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());

        for (ClassificationPage page : document.getPages()) {
            classifyPage(page, document, headlineFontSizes);
        }
    }


    /**
     * Classifies every {@link TextPageBlock} of a single page; other block types are skipped.
     *
     * @param page              the page to process
     * @param document          the document the page belongs to
     * @param headlineFontSizes font sizes that map to headline levels (index 0 is level 1)
     */
    public void classifyPage(ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes) {

        for (AbstractPageBlock textBlock : page.getTextBlocks()) {
            if (textBlock instanceof TextPageBlock textPageBlock) {
                classifyBlock(textPageBlock, page, document, headlineFontSizes);
            }
        }
    }


    /**
     * Classifies a single text block. The checks are evaluated in order and the first match wins:
     * header, footer, title (first page only), headline by font size, bold single-size headline,
     * bold paragraph, regular paragraph, italic paragraph, unknown paragraph inside the body frame,
     * and finally {@code OTHER}. If the document has no most popular font size, the block is {@code OTHER}.
     *
     * @param textBlock         the block to classify (modified in place)
     * @param page              the page containing the block
     * @param document          the document containing the page; its headline flag is set when a headline is found
     * @param headlineFontSizes font sizes that map to headline levels (index 0 is level 1)
     */
    public void classifyBlock(TextPageBlock textBlock, ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes) {

        var bodyTextFrame = page.getBodyTextFrame();

        // Without a reference font size none of the comparisons below are possible.
        if (document.getFontSizeCounter().getMostPopular() == null) {
            textBlock.setClassification(PageBlockType.OTHER);
            return;
        }

        if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER)
                || PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation())) {
            textBlock.setClassification(PageBlockType.HEADER);
        } else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER)
                || PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation())) {
            textBlock.setClassification(PageBlockType.FOOTER);
        } else if (page.getPageNumber() == 1
                && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter().getMostPopular()) > 2.5
                        && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular()
                    || page.getTextBlocks().size() == 1)) {
            // Blocks consisting of digits only are not treated as title; their classification is left untouched.
            if (!DIGITS_ONLY.matcher(textBlock.toString()).matches()) {
                textBlock.setClassification(PageBlockType.TITLE);
            }
        } else if (textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular()
                && PositionUtils.getApproxLineCount(textBlock) < 4.9
                && (textBlock.getMostPopularWordStyle().equals("bold")
                    || !document.getFontStyleCounter().getCountPerValue().containsKey("bold")
                       && textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular() + 1)
                && textBlock.getSequences().get(0).getTextPositions().get(0).getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) {

            // The headline level is the 1-based position of the block's font size in headlineFontSizes.
            for (int i = 1; i <= headlineFontSizes.size(); i++) {
                if (textBlock.getMostPopularWordFontSize() == headlineFontSizes.get(i - 1)) {
                    textBlock.setClassification(PageBlockType.getHeadlineType(i));
                    document.setHeadlines(true);
                }
            }
        } else if (!textBlock.getText().startsWith("Figure ")
                && PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)
                && textBlock.getMostPopularWordStyle().equals("bold")
                && !document.getFontStyleCounter().getMostPopular().equals("bold")
                && PositionUtils.getApproxLineCount(textBlock) < 2.9
                && textBlock.getSequences().get(0).getTextPositions().get(0).getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) {
            // Short bold blocks become a headline one level below the font-size based headline levels.
            textBlock.setClassification(PageBlockType.getHeadlineType(headlineFontSizes.size() + 1));
            document.setHeadlines(true);
        } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)
                && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()
                && textBlock.getMostPopularWordStyle().equals("bold")
                && !document.getFontStyleCounter().getMostPopular().equals("bold")) {
            textBlock.setClassification(PageBlockType.PARAGRAPH_BOLD);
        } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)
                && textBlock.getMostPopularWordFont().equals(document.getFontCounter().getMostPopular())
                && textBlock.getMostPopularWordStyle().equals(document.getFontStyleCounter().getMostPopular())
                && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()) {
            textBlock.setClassification(PageBlockType.PARAGRAPH);
        } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)
                && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()
                && textBlock.getMostPopularWordStyle().equals("italic")
                && !document.getFontStyleCounter().getMostPopular().equals("italic")
                && PositionUtils.getApproxLineCount(textBlock) < 2.9) {
            textBlock.setClassification(PageBlockType.PARAGRAPH_ITALIC);
        } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)) {
            textBlock.setClassification(PageBlockType.PARAGRAPH_UNKNOWN);
        } else {
            textBlock.setClassification(PageBlockType.OTHER);
        }
    }

}

View File

@ -13,8 +13,10 @@ import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.NoSuchElementException; import java.util.NoSuchElementException;
import java.util.Set; import java.util.Set;
import java.util.stream.Collectors;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationFooter; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationFooter;
@ -22,6 +24,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree; import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.DuplicatedParagraph;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Footer; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Footer;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.GenericSemanticNode; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.GenericSemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Header; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Header;
@ -46,14 +49,14 @@ import lombok.experimental.UtilityClass;
@UtilityClass @UtilityClass
public class DocumentGraphFactory { public class DocumentGraphFactory {
public Document buildDocumentGraph(ClassificationDocument document) { public Document buildDocumentGraph(LayoutParsingType layoutParsingType, ClassificationDocument document) {
Document documentGraph = new Document(); Document documentGraph = new Document();
Context context = new Context(documentGraph); Context context = new Context(documentGraph);
document.getPages().forEach(context::buildAndAddPageWithCounter); document.getPages().forEach(context::buildAndAddPageWithCounter);
document.getSections().stream().flatMap(section -> section.getImages().stream()).forEach(image -> context.getImages().add(image)); document.getSections().stream().flatMap(section -> section.getImages().stream()).forEach(image -> context.getImages().add(image));
addSections(document, context); addSections(layoutParsingType, document, context);
addHeaderAndFooterToEachPage(document, context); addHeaderAndFooterToEachPage(document, context);
documentGraph.setNumberOfPages(context.pages.size()); documentGraph.setNumberOfPages(context.pages.size());
@ -64,9 +67,9 @@ public class DocumentGraphFactory {
} }
private void addSections(ClassificationDocument document, Context context) { private void addSections(LayoutParsingType layoutParsingType, ClassificationDocument document, Context context) {
document.getSections().forEach(section -> SectionNodeFactory.addSection(null, section.getNonEmptyPageBlocks(), section.getImages(), context)); document.getSections().forEach(section -> SectionNodeFactory.addSection(layoutParsingType, null, section.getNonEmptyPageBlocks(), section.getImages(), context));
} }
@ -77,6 +80,8 @@ public class DocumentGraphFactory {
GenericSemanticNode node; GenericSemanticNode node;
if (originalTextBlock.isHeadline()) { if (originalTextBlock.isHeadline()) {
node = Headline.builder().documentTree(context.getDocumentTree()).build(); node = Headline.builder().documentTree(context.getDocumentTree()).build();
} else if (originalTextBlock.isToDuplicate()) {
node = DuplicatedParagraph.builder().documentTree(context.getDocumentTree()).build();
} else { } else {
node = Paragraph.builder().documentTree(context.getDocumentTree()).build(); node = Paragraph.builder().documentTree(context.getDocumentTree()).build();
} }
@ -86,7 +91,16 @@ public class DocumentGraphFactory {
List<TextPageBlock> textBlocks = new ArrayList<>(); List<TextPageBlock> textBlocks = new ArrayList<>();
textBlocks.add(originalTextBlock); textBlocks.add(originalTextBlock);
textBlocks.addAll(textBlocksToMerge); textBlocks.addAll(textBlocksToMerge);
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(textBlocks), node, context, page); AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(textBlocks), node, context, page);
if (node instanceof DuplicatedParagraph duplicatedParagraph) {
AtomicTextBlock unsortedTextBlock = context.textBlockFactory.buildAtomicTextBlock(textBlocks.stream()
.flatMap(tb -> tb.getSequences().stream())
.collect(Collectors.toList()), node, context, page);
duplicatedParagraph.setUnsortedLeafTextBlock(unsortedTextBlock);
}
List<Integer> treeId = context.documentTree.createNewChildEntryAndReturnId(parentNode, node); List<Integer> treeId = context.documentTree.createNewChildEntryAndReturnId(parentNode, node);
node.setLeafTextBlock(textBlock); node.setLeafTextBlock(textBlock);
node.setTreeId(treeId); node.setTreeId(treeId);

View File

@ -4,19 +4,21 @@ import static java.lang.String.format;
import static java.util.Collections.emptyList; import static java.util.Collections.emptyList;
import static java.util.stream.Collectors.groupingBy; import static java.util.stream.Collectors.groupingBy;
import java.util.ArrayList;
import java.util.HashSet; import java.util.HashSet;
import java.util.LinkedList; import java.util.LinkedList;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Set; import java.util.Set;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.GenericSemanticNode; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.GenericSemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.utils.TableMergingUtility; import com.knecon.fforesight.service.layoutparser.processor.utils.TableMergingUtility;
import lombok.experimental.UtilityClass; import lombok.experimental.UtilityClass;
@ -24,7 +26,11 @@ import lombok.experimental.UtilityClass;
@UtilityClass @UtilityClass
public class SectionNodeFactory { public class SectionNodeFactory {
public void addSection(GenericSemanticNode parentNode, List<AbstractPageBlock> pageBlocks, List<ClassifiedImage> images, DocumentGraphFactory.Context context) { public void addSection(LayoutParsingType layoutParsingType,
GenericSemanticNode parentNode,
List<AbstractPageBlock> pageBlocks,
List<ClassifiedImage> images,
DocumentGraphFactory.Context context) {
if (pageBlocks.isEmpty()) { if (pageBlocks.isEmpty()) {
return; return;
@ -37,11 +43,11 @@ public class SectionNodeFactory {
section.setTreeId(getTreeId(parentNode, context, section)); section.setTreeId(getTreeId(parentNode, context, section));
addFirstHeadlineDirectlyToSection(pageBlocks, context, section); addFirstHeadlineDirectlyToSection(layoutParsingType, pageBlocks, context, section);
if (containsTablesAndTextBlocks(pageBlocks)) { if (containsTablesAndTextBlocks(pageBlocks)) {
splitPageBlocksIntoSubSections(pageBlocks).forEach(subSectionPageBlocks -> addSection(section, subSectionPageBlocks, emptyList(), context)); splitPageBlocksIntoSubSections(pageBlocks).forEach(subSectionPageBlocks -> addSection(layoutParsingType, section, subSectionPageBlocks, emptyList(), context));
} else { } else {
addTablesAndParagraphsAndHeadlinesToSection(pageBlocks, context, section); addTablesAndParagraphsAndHeadlinesToSection(layoutParsingType, pageBlocks, context, section);
} }
images.stream().distinct().forEach(image -> DocumentGraphFactory.addImage(section, image, context)); images.stream().distinct().forEach(image -> DocumentGraphFactory.addImage(section, image, context));
@ -58,16 +64,19 @@ public class SectionNodeFactory {
} }
private void addFirstHeadlineDirectlyToSection(List<AbstractPageBlock> pageBlocks, DocumentGraphFactory.Context context, Section section) { private void addFirstHeadlineDirectlyToSection(LayoutParsingType layoutParsingType, List<AbstractPageBlock> pageBlocks, DocumentGraphFactory.Context context, Section section) {
if (pageBlocks.get(0).isHeadline()) { if (pageBlocks.get(0).isHeadline()) {
addTablesAndParagraphsAndHeadlinesToSection(List.of(pageBlocks.get(0)), context, section); addTablesAndParagraphsAndHeadlinesToSection(layoutParsingType, List.of(pageBlocks.get(0)), context, section);
pageBlocks.remove(0); pageBlocks.remove(0);
} }
} }
private void addTablesAndParagraphsAndHeadlinesToSection(List<AbstractPageBlock> pageBlocks, DocumentGraphFactory.Context context, Section section) { private void addTablesAndParagraphsAndHeadlinesToSection(LayoutParsingType layoutParsingType,
List<AbstractPageBlock> pageBlocks,
DocumentGraphFactory.Context context,
Section section) {
Set<AbstractPageBlock> alreadyMerged = new HashSet<>(); Set<AbstractPageBlock> alreadyMerged = new HashSet<>();
List<AbstractPageBlock> remainingBlocks = new LinkedList<>(pageBlocks); List<AbstractPageBlock> remainingBlocks = new LinkedList<>(pageBlocks);
@ -80,13 +89,23 @@ public class SectionNodeFactory {
remainingBlocks.removeAll(alreadyMerged); remainingBlocks.removeAll(alreadyMerged);
if (abstractPageBlock instanceof TextPageBlock) { if (abstractPageBlock instanceof TextPageBlock) {
switch (layoutParsingType) {
case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> {
alreadyMerged.add(abstractPageBlock);
remainingBlocks.remove(abstractPageBlock);
DocumentGraphFactory.addParagraphOrHeadline(section, (TextPageBlock) abstractPageBlock, context, new ArrayList<>());
}
default -> {
List<TextPageBlock> textBlocks = findTextBlocksWithSameClassificationAndAlignsY((TextPageBlock) abstractPageBlock, remainingBlocks); List<TextPageBlock> textBlocks = findTextBlocksWithSameClassificationAndAlignsY((TextPageBlock) abstractPageBlock, remainingBlocks);
alreadyMerged.addAll(textBlocks); alreadyMerged.addAll(textBlocks);
DocumentGraphFactory.addParagraphOrHeadline(section, (TextPageBlock) abstractPageBlock, context, textBlocks); DocumentGraphFactory.addParagraphOrHeadline(section, (TextPageBlock) abstractPageBlock, context, textBlocks);
}
}
} else if (abstractPageBlock instanceof TablePageBlock tablePageBlock) { } else if (abstractPageBlock instanceof TablePageBlock tablePageBlock) {
List<TablePageBlock> tablesToMerge = TableMergingUtility.findConsecutiveTablesWithSameColCountAndSameHeaders(tablePageBlock, remainingBlocks); List<TablePageBlock> tablesToMerge = TableMergingUtility.findConsecutiveTablesWithSameColCountAndSameHeaders(tablePageBlock, remainingBlocks);
alreadyMerged.addAll(tablesToMerge); alreadyMerged.addAll(tablesToMerge);
TableNodeFactory.addTable(section, tablesToMerge, context); TableNodeFactory.addTable(layoutParsingType, section, tablesToMerge, context);
} else { } else {
throw new RuntimeException(format("Unhandled AbstractPageBlockType %s!", abstractPageBlock.getClass())); throw new RuntimeException(format("Unhandled AbstractPageBlockType %s!", abstractPageBlock.getClass()));
} }
@ -171,6 +190,7 @@ public class SectionNodeFactory {
.filter(abstractTextContainer -> abstractTextContainer.intersectsY(atc)) .filter(abstractTextContainer -> abstractTextContainer.intersectsY(atc))
.map(abstractTextContainer -> (TextPageBlock) abstractTextContainer) .map(abstractTextContainer -> (TextPageBlock) abstractTextContainer)
.filter(abstractTextContainer -> abstractTextContainer.getDir() == atc.getDir()) .filter(abstractTextContainer -> abstractTextContainer.getDir() == atc.getDir())
.filter(abstractTextContainer -> !abstractTextContainer.isToDuplicate())
.toList(); .toList();
} }

View File

@ -7,16 +7,17 @@ import java.util.List;
import java.util.Set; import java.util.Set;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.GenericSemanticNode; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.GenericSemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock; import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionOperations; import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionOperations;
import lombok.experimental.UtilityClass; import lombok.experimental.UtilityClass;
@ -27,7 +28,7 @@ public class TableNodeFactory {
public final double TABLE_CELL_MERGE_CONTENTS_SIZE_THRESHOLD = 0.05; public final double TABLE_CELL_MERGE_CONTENTS_SIZE_THRESHOLD = 0.05;
public void addTable(GenericSemanticNode parentNode, List<TablePageBlock> tablesToMerge, DocumentGraphFactory.Context context) { public void addTable(LayoutParsingType layoutParsingType, GenericSemanticNode parentNode, List<TablePageBlock> tablesToMerge, DocumentGraphFactory.Context context) {
setPageNumberInCells(tablesToMerge); setPageNumberInCells(tablesToMerge);
Set<Page> pages = tablesToMerge.stream().map(AbstractPageBlock::getPage).map(context::getPage).collect(Collectors.toSet()); Set<Page> pages = tablesToMerge.stream().map(AbstractPageBlock::getPage).map(context::getPage).collect(Collectors.toSet());
@ -43,7 +44,7 @@ public class TableNodeFactory {
List<Integer> treeId = context.getDocumentTree().createNewChildEntryAndReturnId(parentNode, table); List<Integer> treeId = context.getDocumentTree().createNewChildEntryAndReturnId(parentNode, table);
table.setTreeId(treeId); table.setTreeId(treeId);
addTableCells(mergedRows, table, context); addTableCells(layoutParsingType, mergedRows, table, context);
ifTableHasNoHeadersSetFirstRowAsHeaders(table); ifTableHasNoHeadersSetFirstRowAsHeaders(table);
} }
@ -88,18 +89,18 @@ public class TableNodeFactory {
} }
private void addTableCells(List<List<Cell>> rows, Table table, DocumentGraphFactory.Context context) { private void addTableCells(LayoutParsingType layoutParsingType, List<List<Cell>> rows, Table table, DocumentGraphFactory.Context context) {
for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) { for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) {
for (int colIndex = 0; colIndex < rows.get(rowIndex).size(); colIndex++) { for (int colIndex = 0; colIndex < rows.get(rowIndex).size(); colIndex++) {
addTableCell(rows.get(rowIndex).get(colIndex), rowIndex, colIndex, table, context); addTableCell(layoutParsingType, rows.get(rowIndex).get(colIndex), rowIndex, colIndex, table, context);
} }
} }
} }
@SuppressWarnings("PMD.UnusedPrivateMethod") // PMD actually flags this wrong @SuppressWarnings("PMD.UnusedPrivateMethod") // PMD actually flags this wrong
private void addTableCell(Cell cell, int rowIndex, int colIndex, Table tableNode, DocumentGraphFactory.Context context) { private void addTableCell(LayoutParsingType layoutParsingType, Cell cell, int rowIndex, int colIndex, Table tableNode, DocumentGraphFactory.Context context) {
Page page = context.getPage(cell.getPageNumber()); Page page = context.getPage(cell.getPageNumber());
@ -116,7 +117,7 @@ public class TableNodeFactory {
textBlock = context.getTextBlockFactory().buildAtomicTextBlock(cell.getTextBlocks().get(0).getSequences(), tableCell, context, page); textBlock = context.getTextBlockFactory().buildAtomicTextBlock(cell.getTextBlocks().get(0).getSequences(), tableCell, context, page);
tableCell.setLeafTextBlock(textBlock); tableCell.setLeafTextBlock(textBlock);
} else if (firstTextBlockIsHeadline(cell)) { } else if (firstTextBlockIsHeadline(cell)) {
SectionNodeFactory.addSection(tableCell, cell.getTextBlocks().stream().map(tb -> (AbstractPageBlock) tb).toList(), emptyList(), context); SectionNodeFactory.addSection(layoutParsingType, tableCell, cell.getTextBlocks().stream().map(tb -> (AbstractPageBlock) tb).toList(), emptyList(), context);
} else if (cellAreaIsSmallerThanPageAreaTimesThreshold(cell, page)) { } else if (cellAreaIsSmallerThanPageAreaTimesThreshold(cell, page)) {
List<TextPositionSequence> sequences = TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(cell.getTextBlocks()); List<TextPositionSequence> sequences = TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(cell.getTextBlocks());
textBlock = context.getTextBlockFactory().buildAtomicTextBlock(sequences, tableCell, context, page); textBlock = context.getTextBlockFactory().buildAtomicTextBlock(sequences, tableCell, context, page);

View File

@ -8,8 +8,6 @@ import java.util.Map;
import java.util.Set; import java.util.Set;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import javax.xml.parsers.DocumentBuilder;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPage; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPage;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData;
@ -18,6 +16,7 @@ import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.Do
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree; import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.DuplicatedParagraph;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Image; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Image;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
@ -33,27 +32,20 @@ public class DocumentDataMapper {
public DocumentData toDocumentData(Document document) { public DocumentData toDocumentData(Document document) {
List<DocumentTextData> documentTextData = document.streamTerminalTextBlocksInOrder() List<DocumentTextData> documentTextData = document.streamTerminalTextBlocksInOrder()
.flatMap(textBlock -> textBlock.getAtomicTextBlocks() .flatMap(textBlock -> textBlock.getAtomicTextBlocks().stream())
.stream())
.distinct() .distinct()
.map(DocumentDataMapper::toAtomicTextBlockData) .map(DocumentDataMapper::toAtomicTextBlockData)
.toList(); .toList();
List<DocumentPositionData> atomicPositionBlockData = document.streamTerminalTextBlocksInOrder() List<DocumentPositionData> atomicPositionBlockData = document.streamTerminalTextBlocksInOrder()
.flatMap(textBlock -> textBlock.getAtomicTextBlocks() .flatMap(textBlock -> textBlock.getAtomicTextBlocks().stream())
.stream())
.distinct() .distinct()
.map(DocumentDataMapper::toAtomicPositionBlockData) .map(DocumentDataMapper::toAtomicPositionBlockData)
.toList(); .toList();
Set<Long> nonEmptyTextBlocks = documentTextData.stream() Set<Long> nonEmptyTextBlocks = documentTextData.stream().mapToLong(DocumentTextData::getId).boxed().collect(Collectors.toSet());
.mapToLong(DocumentTextData::getId).boxed()
.collect(Collectors.toSet());
List<DocumentPage> documentPageData = document.getPages() List<DocumentPage> documentPageData = document.getPages().stream().map(DocumentDataMapper::toPageData).toList();
.stream()
.map(DocumentDataMapper::toPageData)
.toList();
DocumentStructure tableOfContentsData = toDocumentTreeData(document.getDocumentTree()); DocumentStructure tableOfContentsData = toDocumentTreeData(document.getDocumentTree());
return DocumentData.builder() return DocumentData.builder()
.documentTextData(documentTextData.toArray(new DocumentTextData[0])) .documentTextData(documentTextData.toArray(new DocumentTextData[0]))
@ -84,22 +76,17 @@ public class DocumentDataMapper {
case TABLE -> PropertiesMapper.buildTableProperties((Table) entry.getNode()); case TABLE -> PropertiesMapper.buildTableProperties((Table) entry.getNode());
case TABLE_CELL -> PropertiesMapper.buildTableCellProperties((TableCell) entry.getNode()); case TABLE_CELL -> PropertiesMapper.buildTableCellProperties((TableCell) entry.getNode());
case IMAGE -> PropertiesMapper.buildImageProperties((Image) entry.getNode()); case IMAGE -> PropertiesMapper.buildImageProperties((Image) entry.getNode());
case PARAGRAPH ->
entry.getNode() instanceof DuplicatedParagraph duplicatedParagraph ? PropertiesMapper.buildDuplicateParagraphProperties(duplicatedParagraph) : new HashMap<>();
default -> new HashMap<>(); default -> new HashMap<>();
}; };
DocumentStructure.EntryData.EntryDataBuilder documentBuilder = DocumentStructure.EntryData.builder() DocumentStructure.EntryData.EntryDataBuilder documentBuilder = DocumentStructure.EntryData.builder()
.treeId(toPrimitiveIntArray(entry.getTreeId())) .treeId(toPrimitiveIntArray(entry.getTreeId()))
.children(entry.getChildren() .children(entry.getChildren().stream().map(DocumentDataMapper::toEntryData).toList())
.stream()
.map(DocumentDataMapper::toEntryData)
.toList())
.type(entry.getType()) .type(entry.getType())
.atomicBlockIds(atomicTextBlocks) .atomicBlockIds(atomicTextBlocks)
.pageNumbers(entry.getNode().getPages() .pageNumbers(entry.getNode().getPages().stream().map(Page::getNumber).map(Integer::longValue).toArray(Long[]::new))
.stream()
.map(Page::getNumber)
.map(Integer::longValue)
.toArray(Long[]::new))
.properties(properties); .properties(properties);
if (entry.getNode() != null) { if (entry.getNode() != null) {
documentBuilder.engines(entry.getNode().getEngines()); documentBuilder.engines(entry.getNode().getEngines());
@ -112,10 +99,7 @@ public class DocumentDataMapper {
private Long[] toAtomicTextBlockIds(TextBlock textBlock) { private Long[] toAtomicTextBlockIds(TextBlock textBlock) {
return textBlock.getAtomicTextBlocks() return textBlock.getAtomicTextBlocks().stream().map(AtomicTextBlock::getId).toArray(Long[]::new);
.stream()
.map(AtomicTextBlock::getId)
.toArray(Long[]::new);
} }
@ -167,9 +151,7 @@ public class DocumentDataMapper {
private int[] toPrimitiveIntArray(List<Integer> list) { private int[] toPrimitiveIntArray(List<Integer> list) {
return list.stream() return list.stream().mapToInt(Integer::intValue).toArray();
.mapToInt(Integer::intValue)
.toArray();
} }
} }

View File

@ -7,13 +7,14 @@ import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.NoSuchElementException; import java.util.NoSuchElementException;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPage; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPage;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree; import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.DuplicatedParagraph;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Footer; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Footer;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Header; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Header;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Headline; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Headline;
@ -61,7 +62,7 @@ public class DocumentGraphMapper {
SemanticNode node = switch (entryData.getType()) { SemanticNode node = switch (entryData.getType()) {
case SECTION -> buildSection(context); case SECTION -> buildSection(context);
case PARAGRAPH -> buildParagraph(context); case PARAGRAPH -> buildParagraph(context, entryData.getProperties());
case HEADLINE -> buildHeadline(context); case HEADLINE -> buildHeadline(context);
case HEADER -> buildHeader(context); case HEADER -> buildHeader(context);
case FOOTER -> buildFooter(context); case FOOTER -> buildFooter(context);
@ -140,7 +141,17 @@ public class DocumentGraphMapper {
} }
private Paragraph buildParagraph(Context context) { private Paragraph buildParagraph(Context context, Map<String, String> properties) {
if (PropertiesMapper.isDuplicateParagraph(properties)) {
DuplicatedParagraph duplicatedParagraph = DuplicatedParagraph.builder().documentTree(context.documentTree).build();
Long[] unsortedTextblockIds = PropertiesMapper.getUnsortedTextblockIds(properties);
duplicatedParagraph.setUnsortedLeafTextBlock(toTextBlock(unsortedTextblockIds, context, duplicatedParagraph));
return duplicatedParagraph;
}
return Paragraph.builder().documentTree(context.documentTree).build(); return Paragraph.builder().documentTree(context.documentTree).build();
} }

View File

@ -1,17 +1,19 @@
package com.knecon.fforesight.service.layoutparser.processor.services.mapper; package com.knecon.fforesight.service.layoutparser.processor.services.mapper;
import java.awt.geom.Rectangle2D; import java.awt.geom.Rectangle2D;
import java.util.Collections; import java.util.Arrays;
import java.util.HashMap; import java.util.HashMap;
import java.util.Locale; import java.util.Locale;
import java.util.Map; import java.util.Map;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.DuplicatedParagraph;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Image; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Image;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
public class PropertiesMapper { public class PropertiesMapper {
@ -76,6 +78,32 @@ public class PropertiesMapper {
} }
/**
 * Builds the properties map that is stored for a duplicated paragraph.
 * <p>
 * The map contains a single entry: the ids of the atomic text blocks of the paragraph's unsorted leaf text block,
 * rendered with {@link Arrays#toString(Object[])} and stored under
 * {@link DocumentStructure.DuplicateParagraphProperties#UNSORTED_TEXTBLOCK_ID}.
 *
 * @param duplicatedParagraph the duplicated paragraph whose unsorted leaf text block is serialized
 * @return a new mutable map holding the serialized id list
 */
public static Map<String, String> buildDuplicateParagraphProperties(DuplicatedParagraph duplicatedParagraph) {

    Long[] unsortedIds = toAtomicTextBlockIds(duplicatedParagraph.getUnsortedLeafTextBlock());
    String serializedIds = Arrays.toString(unsortedIds);

    Map<String, String> duplicateProperties = new HashMap<>();
    duplicateProperties.put(DocumentStructure.DuplicateParagraphProperties.UNSORTED_TEXTBLOCK_ID, serializedIds);
    return duplicateProperties;
}
/**
 * Tells whether a properties map belongs to a duplicated paragraph.
 * <p>
 * The decision is based solely on the presence of the
 * {@link DocumentStructure.DuplicateParagraphProperties#UNSORTED_TEXTBLOCK_ID} key; the value is not inspected.
 *
 * @param properties the properties of a paragraph entry, must not be {@code null}
 * @return {@code true} if the map contains the unsorted text block id key
 */
public static boolean isDuplicateParagraph(Map<String, String> properties) {

    String markerKey = DocumentStructure.DuplicateParagraphProperties.UNSORTED_TEXTBLOCK_ID;
    return properties.containsKey(markerKey);
}
/**
 * Reads the serialized atomic text block ids of a duplicated paragraph from its properties.
 * <p>
 * Looks up the value stored under {@link DocumentStructure.DuplicateParagraphProperties#UNSORTED_TEXTBLOCK_ID}
 * and hands it to {@link #toLongArray(String)} for parsing.
 *
 * @param properties the properties of a duplicated paragraph entry
 * @return the parsed ids
 */
public static Long[] getUnsortedTextblockIds(Map<String, String> properties) {

    String serializedIds = properties.get(DocumentStructure.DuplicateParagraphProperties.UNSORTED_TEXTBLOCK_ID);
    return toLongArray(serializedIds);
}
/**
 * Parses a bracketed, comma-separated id list such as {@code "[1, 2, 3]"} (the format written by
 * {@link Arrays#toString(Object[])}) back into a {@code Long[]}.
 * <p>
 * The first and last character of the input are dropped as the enclosing brackets. Each remaining token is trimmed
 * before parsing, because {@code Arrays.toString} separates elements with {@code ", "} and
 * {@link Long#valueOf(String)} rejects surrounding whitespace.
 *
 * @param ids the serialized id list including its enclosing brackets
 * @return the parsed ids in input order; an empty array for {@code "[]"}
 * @throws NumberFormatException if a token is not a valid long
 */
public static Long[] toLongArray(String ids) {

    String content = ids.substring(1, ids.length() - 1).trim();
    if (content.isEmpty()) {
        // "[]": splitting an empty string would yield one empty token, which Long.valueOf cannot parse.
        return new Long[0];
    }
    return Arrays.stream(content.split(","))
            .map(String::trim)
            .map(Long::valueOf)
            .toArray(Long[]::new);
}
private static ImageType parseImageType(String imageType) { private static ImageType parseImageType(String imageType) {
return switch (imageType) { return switch (imageType) {
@ -101,4 +129,10 @@ public class PropertiesMapper {
rectangle2D.getHeight()); rectangle2D.getHeight());
} }
/**
 * Collects the ids of all atomic text blocks of the given text block, in the order returned by
 * {@code getAtomicTextBlocks()}.
 *
 * @param textBlock the text block whose atomic text blocks are read
 * @return the ids as a {@code Long[]}
 */
private static Long[] toAtomicTextBlockIds(TextBlock textBlock) {

    var atomicBlocks = textBlock.getAtomicTextBlocks();
    return atomicBlocks.stream()
            .map(atomicBlock -> atomicBlock.getId())
            .toArray(Long[]::new);
}
} }

View File

@ -237,8 +237,13 @@ public class PDFLinesTextStripper extends PDFTextStripper {
int startIndex = 0; int startIndex = 0;
RedTextPosition previous = null; RedTextPosition previous = null;
float direction = -1;
for (int i = 0; i <= textPositions.size() - 1; i++) { for (int i = 0; i <= textPositions.size() - 1; i++) {
if (direction == -1) {
direction = textPositions.get(i).getDir();
}
if (!textPositionSequences.isEmpty()) { if (!textPositionSequences.isEmpty()) {
previous = textPositionSequences.get(textPositionSequences.size() - 1) previous = textPositionSequences.get(textPositionSequences.size() - 1)
.getTextPositions() .getTextPositions()
@ -250,6 +255,13 @@ public class PDFLinesTextStripper extends PDFTextStripper {
continue; continue;
} }
if (textPositions.get(i).getDir() != direction && startIndex != i) {
List<TextPosition> sublist = textPositions.subList(startIndex, i);
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart));
startIndex = i;
direction = textPositions.get(i).getDir();
}
// Strange but sometimes this is happening, for example: Metolachlor2.pdf // Strange but sometimes this is happening, for example: Metolachlor2.pdf
if (checkIfCurrentPositionIsToTheRightOfPreviousPosition(i, textPositions)) { if (checkIfCurrentPositionIsToTheRightOfPreviousPosition(i, textPositions)) {
List<TextPosition> sublist = textPositions.subList(startIndex, i); List<TextPosition> sublist = textPositions.subList(startIndex, i);
@ -329,6 +341,7 @@ public class PDFLinesTextStripper extends PDFTextStripper {
.getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()) < maximumGapSize; .getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()) < maximumGapSize;
} }
@Override @Override
public String getText(PDDocument doc) throws IOException { public String getText(PDDocument doc) throws IOException {

View File

@ -20,6 +20,7 @@ import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.DuplicatedParagraph;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
@ -53,6 +54,8 @@ public class LayoutGridService {
static Color INNER_LINES_COLOR = new Color(255, 175, 175); static Color INNER_LINES_COLOR = new Color(255, 175, 175);
static Color PARAGRAPH_COLOR = new Color(70, 130, 180); static Color PARAGRAPH_COLOR = new Color(70, 130, 180);
static Color DUPLICATE_PARAGRAPH_COLOR = new Color(70, 180, 101);
static Color TABLE_COLOR = new Color(102, 205, 170); static Color TABLE_COLOR = new Color(102, 205, 170);
static Color SECTION_COLOR = new Color(50, 50, 50); static Color SECTION_COLOR = new Color(50, 50, 50);
static Color HEADLINE_COLOR = new Color(162, 56, 56); static Color HEADLINE_COLOR = new Color(162, 56, 56);
@ -100,6 +103,11 @@ public class LayoutGridService {
case IMAGE -> IMAGE_COLOR; case IMAGE -> IMAGE_COLOR;
default -> null; default -> null;
}; };
if (semanticNode instanceof DuplicatedParagraph) {
color = DUPLICATE_PARAGRAPH_COLOR;
}
if (isNotSectionOrTableCellOrDocument(semanticNode)) { if (isNotSectionOrTableCellOrDocument(semanticNode)) {
addAsRectangle(semanticNode, layoutGrid, color); addAsRectangle(semanticNode, layoutGrid, color);
} }

View File

@ -2,7 +2,6 @@ package com.knecon.fforesight.service.layoutparser.processor.utils;
import static java.lang.String.format; import static java.lang.String.format;
import java.awt.geom.Area;
import java.awt.geom.Rectangle2D; import java.awt.geom.Rectangle2D;
import java.awt.geom.RectangularShape; import java.awt.geom.RectangularShape;
import java.util.Collections; import java.util.Collections;
@ -40,11 +39,10 @@ public class RectangleTransformations {
public static double calculateIntersectedArea(Rectangle2D r1, Rectangle2D r2) { public static double calculateIntersectedArea(Rectangle2D r1, Rectangle2D r2) {
Area a1 = new Area(r1); double xOverlap = Math.max(0, Math.min(r1.getMaxX(), r2.getMaxX()) - Math.max(r1.getMinX(), r2.getMinX()));
Area a2 = new Area(r2); double yOverlap = Math.max(0, Math.min(r1.getMaxY(), r2.getMaxY()) - Math.max(r1.getY(), r2.getY()));
a1.intersect(a2);
Rectangle2D intersection = a1.getBounds2D(); return xOverlap * yOverlap;
return intersection.getWidth() * intersection.getHeight();
} }

View File

@ -3,7 +3,6 @@ package com.knecon.fforesight.service.layoutparser.processor.utils;
import java.util.List; import java.util.List;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;

View File

@ -11,14 +11,13 @@ import org.springframework.context.annotation.Import;
import com.amazonaws.services.s3.model.metrics.MetricsConfiguration; import com.amazonaws.services.s3.model.metrics.MetricsConfiguration;
import com.iqser.red.storage.commons.StorageAutoConfiguration; import com.iqser.red.storage.commons.StorageAutoConfiguration;
import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingServiceProcessorConfiguration; import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingServiceProcessorConfiguration;
import com.knecon.fforesight.service.layoutparser.server.queue.MessagingConfiguration;
import com.knecon.fforesight.tenantcommons.MultiTenancyAutoConfiguration; import com.knecon.fforesight.tenantcommons.MultiTenancyAutoConfiguration;
import io.micrometer.observation.ObservationRegistry; import io.micrometer.observation.ObservationRegistry;
import io.micrometer.observation.aop.ObservedAspect; import io.micrometer.observation.aop.ObservedAspect;
@ImportAutoConfiguration({MultiTenancyAutoConfiguration.class}) @ImportAutoConfiguration({MultiTenancyAutoConfiguration.class})
@Import({MetricsConfiguration.class, StorageAutoConfiguration.class, LayoutParsingServiceProcessorConfiguration.class, MessagingConfiguration.class}) @Import({MetricsConfiguration.class, StorageAutoConfiguration.class, LayoutParsingServiceProcessorConfiguration.class})
@SpringBootApplication(exclude = {SecurityAutoConfiguration.class, ManagementWebSecurityAutoConfiguration.class}) @SpringBootApplication(exclude = {SecurityAutoConfiguration.class, ManagementWebSecurityAutoConfiguration.class})
public class Application { public class Application {

View File

@ -37,7 +37,7 @@ public class MessageHandler {
LayoutParsingRequest layoutParsingRequest = objectMapper.readValue(message.getBody(), LayoutParsingRequest.class); LayoutParsingRequest layoutParsingRequest = objectMapper.readValue(message.getBody(), LayoutParsingRequest.class);
if (layoutParsingRequest.layoutParsingType().equals(LayoutParsingType.TAAS) && layoutParsingRequest.researchDocumentStorageId() == null) { if (layoutParsingRequest.layoutParsingType().equals(LayoutParsingType.CLARIFYND) && layoutParsingRequest.researchDocumentStorageId() == null) {
throw new IllegalArgumentException("ResearchDocumentDataStorageId is null!"); throw new IllegalArgumentException("ResearchDocumentDataStorageId is null!");
} }
log.info("Layout parsing request received {}", layoutParsingRequest.identifier()); log.info("Layout parsing request received {}", layoutParsingRequest.identifier());

View File

@ -1,37 +0,0 @@
package com.knecon.fforesight.service.layoutparser.server.queue;
import org.springframework.amqp.core.Queue;
import org.springframework.amqp.core.QueueBuilder;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingQueueNames;
import lombok.RequiredArgsConstructor;
/**
 * Spring configuration declaring the durable queues used by the layout parser: the request queue, the response
 * (finished event) queue and the dead letter queue.
 */
@Configuration
@RequiredArgsConstructor
public class MessagingConfiguration {

    /**
     * Durable request queue. Its dead-letter arguments route to the exchange named {@code ""} with the dead letter
     * queue name as routing key (presumably the broker's default exchange, so messages land in the queue of that
     * name; verify against the broker configuration).
     */
    @Bean
    public Queue layoutparsingRequestQueue() {

        QueueBuilder requestQueue = QueueBuilder.durable(LayoutParsingQueueNames.LAYOUT_PARSING_REQUEST_QUEUE);
        requestQueue = requestQueue.withArgument("x-dead-letter-exchange", "");
        requestQueue = requestQueue.withArgument("x-dead-letter-routing-key", LayoutParsingQueueNames.LAYOUT_PARSING_DLQ);
        return requestQueue.build();
    }


    /**
     * Durable queue for the layout parsing finished events.
     */
    @Bean
    public Queue layoutparsingResponseQueue() {

        QueueBuilder responseQueue = QueueBuilder.durable(LayoutParsingQueueNames.LAYOUT_PARSING_FINISHED_EVENT_QUEUE);
        return responseQueue.build();
    }


    /**
     * Durable dead letter queue referenced by the request queue's dead-letter routing key.
     */
    @Bean
    public Queue layoutparsingDLQ() {

        QueueBuilder deadLetterQueue = QueueBuilder.durable(LayoutParsingQueueNames.LAYOUT_PARSING_DLQ);
        return deadLetterQueue.build();
    }

}

View File

@ -48,7 +48,8 @@ public class BdrJsonBuildTest extends AbstractTest {
@SneakyThrows @SneakyThrows
protected Document buildGraph(File file) { protected Document buildGraph(File file) {
return DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.TAAS, return DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.CLARIFYND,
layoutParsingPipeline.parseLayout(LayoutParsingType.CLARIFYND,
file, file,
new ImageServiceResponse(), new ImageServiceResponse(),
new TableServiceResponse(), new TableServiceResponse(),

View File

@ -95,7 +95,8 @@ public class HeadlinesGoldStandardIntegrationTest {
goldStandardLog.getRedactionLogEntry().removeIf(r -> !r.isRedacted() || r.getChanges().get(r.getChanges().size() - 1).getType().equals(ChangeType.REMOVED)); goldStandardLog.getRedactionLogEntry().removeIf(r -> !r.isRedacted() || r.getChanges().get(r.getChanges().size() - 1).getType().equals(ChangeType.REMOVED));
goldStandardLog.getRedactionLogEntry().forEach(e -> goldStandardHeadlines.add(new Headline(e.getPositions().get(0).getPage(), e.getValue()))); goldStandardLog.getRedactionLogEntry().forEach(e -> goldStandardHeadlines.add(new Headline(e.getPositions().get(0).getPage(), e.getValue())));
Document documentGraph = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER, Document documentGraph = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_OLD,
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_OLD,
pdfFileResource.getFile(), pdfFileResource.getFile(),
new ImageServiceResponse(), new ImageServiceResponse(),
new TableServiceResponse(), new TableServiceResponse(),

View File

@ -26,7 +26,7 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
public void testLayoutParserEndToEnd() { public void testLayoutParserEndToEnd() {
prepareStorage("files/bdr/Wie weiter bei Kristeneinrichtungen.pdf"); prepareStorage("files/bdr/Wie weiter bei Kristeneinrichtungen.pdf");
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER); LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER_OLD);
LayoutParsingFinishedEvent finishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest); LayoutParsingFinishedEvent finishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest);
Arrays.stream(finishedEvent.message().split("\n")).forEach(log::info); Arrays.stream(finishedEvent.message().split("\n")).forEach(log::info);
} }

View File

@ -55,7 +55,8 @@ public class DocumentGraphJsonWritingTest extends BuildDocumentTest {
@SneakyThrows @SneakyThrows
private void writeJsons(Path filename) { private void writeJsons(Path filename) {
Document documentGraph = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER, Document documentGraph = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_OLD,
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_OLD,
filename.toFile(), filename.toFile(),
new ImageServiceResponse(), new ImageServiceResponse(),
new TableServiceResponse(), new TableServiceResponse(),

View File

@ -26,15 +26,15 @@ public class ViewerDocumentTest extends BuildDocumentTest {
@SneakyThrows @SneakyThrows
public void testViewerDocument() { public void testViewerDocument() {
String fileName = "files/SinglePages/T5 VV-640252-Page16.pdf"; String fileName = "files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf"; String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
var documentFile = new ClassPathResource(fileName).getFile(); var documentFile = new ClassPathResource(fileName).getFile();
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null); ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null);
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService); LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
Document document = buildGraph(fileName, LayoutParsingType.REDACT_MANAGER);
long start = System.currentTimeMillis(); long start = System.currentTimeMillis();
Document document = buildGraph(fileName, LayoutParsingType.REDACT_MANAGER);
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true); layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true);
System.out.printf("Total time: %.2fs%n", ((float) (System.currentTimeMillis() - start)) / 1000); System.out.printf("Total time: %.2fs%n", ((float) (System.currentTimeMillis() - start)) / 1000);
} }
@ -57,10 +57,11 @@ public class ViewerDocumentTest extends BuildDocumentTest {
documentFile, documentFile,
new ImageServiceResponse(), new ImageServiceResponse(),
tableResponse, tableResponse,
new VisualLayoutParsingResponse(),Path.of(fileName).getFileName().toFile().toString()); new VisualLayoutParsingResponse(),
Path.of(fileName).getFileName().toFile().toString());
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null); ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null);
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService); LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
Document document = DocumentGraphFactory.buildDocumentGraph(classificationDocument); Document document = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.DOCUMINE, classificationDocument);
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true); layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true);
} }

View File

@ -56,7 +56,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
@SneakyThrows @SneakyThrows
public ClassificationDocument buildClassificationDocument(File originDocument, TableServiceResponse tableServiceResponse) { public ClassificationDocument buildClassificationDocument(File originDocument, TableServiceResponse tableServiceResponse) {
ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER, ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_OLD,
originDocument, originDocument,
new ImageServiceResponse(), new ImageServiceResponse(),
tableServiceResponse, tableServiceResponse,
@ -112,16 +112,8 @@ public class PdfSegmentationServiceTest extends AbstractTest {
var tableServiceResponse = objectMapper.readValue(cvTablesResource.getInputStream(), TableServiceResponse.class); var tableServiceResponse = objectMapper.readValue(cvTablesResource.getInputStream(), TableServiceResponse.class);
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile(), tableServiceResponse); ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile(), tableServiceResponse);
assertThat(document.getSections() assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
.stream() var tables = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList();
.flatMap(paragraph -> paragraph.getTables()
.stream())
.collect(Collectors.toList())).isNotEmpty();
var tables = document.getSections()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.toList();
// Quality of the table parsing is not good, because the file is rotated at scanning. // Quality of the table parsing is not good, because the file is rotated at scanning.
// We only assert that the table border is not the page border. // We only assert that the table border is not the page border.
@ -160,22 +152,11 @@ public class PdfSegmentationServiceTest extends AbstractTest {
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Spanning Cells.pdf"); ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Spanning Cells.pdf");
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile()); ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
assertThat(document.getSections() assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
.stream() TablePageBlock table = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(0);
.flatMap(paragraph -> paragraph.getTables()
.stream())
.collect(Collectors.toList())).isNotEmpty();
TablePageBlock table = document.getSections()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.toList()
.get(0);
assertThat(table.getColCount()).isEqualTo(6); assertThat(table.getColCount()).isEqualTo(6);
assertThat(table.getRowCount()).isEqualTo(13); assertThat(table.getRowCount()).isEqualTo(13);
assertThat(table.getRows() assertThat(table.getRows().stream().mapToInt(List::size).sum()).isEqualTo(6 * 13);
.stream()
.mapToInt(List::size).sum()).isEqualTo(6 * 13);
} }
@ -185,37 +166,15 @@ public class PdfSegmentationServiceTest extends AbstractTest {
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Merge Table.pdf"); ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Merge Table.pdf");
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile()); ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
assertThat(document.getSections() assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
.stream() TablePageBlock firstTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(0);
.flatMap(paragraph -> paragraph.getTables()
.stream())
.collect(Collectors.toList())).isNotEmpty();
TablePageBlock firstTable = document.getSections()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.toList()
.get(0);
assertThat(firstTable.getColCount()).isEqualTo(8); assertThat(firstTable.getColCount()).isEqualTo(8);
assertThat(firstTable.getRowCount()).isEqualTo(1); assertThat(firstTable.getRowCount()).isEqualTo(1);
TablePageBlock secondTable = document.getSections() TablePageBlock secondTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(1);
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.toList()
.get(1);
assertThat(secondTable.getColCount()).isEqualTo(8); assertThat(secondTable.getColCount()).isEqualTo(8);
assertThat(secondTable.getRowCount()).isEqualTo(2); assertThat(secondTable.getRowCount()).isEqualTo(2);
List<List<Cell>> firstTableHeaderCells = firstTable.getRows() List<List<Cell>> firstTableHeaderCells = firstTable.getRows().get(0).stream().map(Collections::singletonList).collect(Collectors.toList());
.get(0) assertThat(secondTable.getRows().stream().allMatch(row -> row.stream().map(Cell::getHeaderCells).toList().equals(firstTableHeaderCells))).isTrue();
.stream()
.map(Collections::singletonList)
.collect(Collectors.toList());
assertThat(secondTable.getRows()
.stream()
.allMatch(row -> row.stream()
.map(Cell::getHeaderCells)
.toList().equals(firstTableHeaderCells))).isTrue();
} }
@ -225,37 +184,15 @@ public class PdfSegmentationServiceTest extends AbstractTest {
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Merge Multi Page Table.pdf"); ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Merge Multi Page Table.pdf");
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile()); ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
assertThat(document.getSections() assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
.stream() TablePageBlock firstTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(0);
.flatMap(paragraph -> paragraph.getTables()
.stream())
.collect(Collectors.toList())).isNotEmpty();
TablePageBlock firstTable = document.getSections()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.toList()
.get(0);
assertThat(firstTable.getColCount()).isEqualTo(9); assertThat(firstTable.getColCount()).isEqualTo(9);
assertThat(firstTable.getRowCount()).isEqualTo(5); assertThat(firstTable.getRowCount()).isEqualTo(5);
TablePageBlock secondTable = document.getSections() TablePageBlock secondTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(1);
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.toList()
.get(1);
assertThat(secondTable.getColCount()).isEqualTo(9); assertThat(secondTable.getColCount()).isEqualTo(9);
assertThat(secondTable.getRowCount()).isEqualTo(6); assertThat(secondTable.getRowCount()).isEqualTo(6);
List<List<Cell>> firstTableHeaderCells = firstTable.getRows() List<List<Cell>> firstTableHeaderCells = firstTable.getRows().get(firstTable.getRowCount() - 1).stream().map(Cell::getHeaderCells).collect(Collectors.toList());
.get(firstTable.getRowCount() - 1) assertThat(secondTable.getRows().stream().allMatch(row -> row.stream().map(Cell::getHeaderCells).toList().equals(firstTableHeaderCells))).isTrue();
.stream()
.map(Cell::getHeaderCells)
.collect(Collectors.toList());
assertThat(secondTable.getRows()
.stream()
.allMatch(row -> row.stream()
.map(Cell::getHeaderCells)
.toList().equals(firstTableHeaderCells))).isTrue();
} }
@ -265,37 +202,15 @@ public class PdfSegmentationServiceTest extends AbstractTest {
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Rotated Table Headers.pdf"); ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Rotated Table Headers.pdf");
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile()); ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
assertThat(document.getSections() assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
.stream() TablePageBlock firstTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(0);
.flatMap(paragraph -> paragraph.getTables()
.stream())
.collect(Collectors.toList())).isNotEmpty();
TablePageBlock firstTable = document.getSections()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.toList()
.get(0);
assertThat(firstTable.getColCount()).isEqualTo(8); assertThat(firstTable.getColCount()).isEqualTo(8);
assertThat(firstTable.getRowCount()).isEqualTo(1); assertThat(firstTable.getRowCount()).isEqualTo(1);
TablePageBlock secondTable = document.getSections() TablePageBlock secondTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(1);
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.toList()
.get(1);
assertThat(secondTable.getColCount()).isEqualTo(8); assertThat(secondTable.getColCount()).isEqualTo(8);
assertThat(secondTable.getRowCount()).isEqualTo(6); assertThat(secondTable.getRowCount()).isEqualTo(6);
List<List<Cell>> firstTableHeaderCells = firstTable.getRows() List<List<Cell>> firstTableHeaderCells = firstTable.getRows().get(0).stream().map(Collections::singletonList).collect(Collectors.toList());
.get(0) assertThat(secondTable.getRows().stream().allMatch(row -> row.stream().map(Cell::getHeaderCells).toList().equals(firstTableHeaderCells))).isTrue();
.stream()
.map(Collections::singletonList)
.collect(Collectors.toList());
assertThat(secondTable.getRows()
.stream()
.allMatch(row -> row.stream()
.map(Cell::getHeaderCells)
.toList().equals(firstTableHeaderCells))).isTrue();
} }
@ -352,8 +267,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
"Method meets analytical validation criteria", "Method meets analytical validation criteria",
"Remarks (in case validation criteria are not met)", "Remarks (in case validation criteria are not met)",
"Acceptability of the method"), "Acceptability of the method"),
Arrays.asList( Arrays.asList("Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
@ -757,11 +671,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
@SneakyThrows @SneakyThrows
private void toHtml(ClassificationDocument document, String filename) { private void toHtml(ClassificationDocument document, String filename) {
var tables = document.getSections() var tables = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList();
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.toList();
StringBuilder sb = new StringBuilder(); StringBuilder sb = new StringBuilder();
int currentPage = 1; int currentPage = 1;
@ -782,19 +692,9 @@ public class PdfSegmentationServiceTest extends AbstractTest {
private void validateTable(ClassificationDocument document, int tableIndex, int colCount, int rowCount, int emptyCellsCountCorrect, int emptyCellsCountIncorrect) { private void validateTable(ClassificationDocument document, int tableIndex, int colCount, int rowCount, int emptyCellsCountCorrect, int emptyCellsCountIncorrect) {
TablePageBlock table = document.getSections() TablePageBlock table = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(tableIndex);
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.toList()
.get(tableIndex);
List<List<Cell>> rows = table.getRows(); List<List<Cell>> rows = table.getRows();
int emptyCellsFoundFound = rows.stream() int emptyCellsFoundFound = rows.stream().flatMap(List::stream).toList().stream().filter(f -> f.toString().isEmpty()).toList().size();
.flatMap(List::stream)
.toList()
.stream()
.filter(f -> f.toString().isEmpty())
.toList().size();
for (List<Cell> row : table.getRows()) { for (List<Cell> row : table.getRows()) {
row.forEach(r -> System.out.println(r.toString())); row.forEach(r -> System.out.println(r.toString()));
@ -809,20 +709,11 @@ public class PdfSegmentationServiceTest extends AbstractTest {
private void validateTable(ClassificationDocument document, int tableIndex, List<List<String>> values) { private void validateTable(ClassificationDocument document, int tableIndex, List<List<String>> values) {
TablePageBlock table = document.getSections() TablePageBlock table = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(tableIndex);
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.toList()
.get(tableIndex);
List<List<Cell>> rows = table.getRows(); List<List<Cell>> rows = table.getRows();
List<Cell> rowsFlattened = rows.stream() List<Cell> rowsFlattened = rows.stream().flatMap(List::stream).toList();
.flatMap(List::stream) List<String> valuesFlattened = values.stream().flatMap(List::stream).toList();
.toList();
List<String> valuesFlattened = values.stream()
.flatMap(List::stream)
.toList();
for (int i = 0; i < valuesFlattened.size(); i++) { for (int i = 0; i < valuesFlattened.size(); i++) {
Cell cell = rowsFlattened.get(i); Cell cell = rowsFlattened.get(i);
@ -835,11 +726,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
private void validateTableSize(ClassificationDocument document, int tableSize) { private void validateTableSize(ClassificationDocument document, int tableSize) {
assertThat(document.getSections() assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().size()).isEqualTo(tableSize);
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.toList().size()).isEqualTo(tableSize);
} }

View File

@ -21,7 +21,7 @@ class BodyTextFrameServiceTest extends BuildDocumentTest {
String filename = "files/211.pdf"; String filename = "files/211.pdf";
String outputFilename = "/tmp/" + Path.of(filename).getFileName() + "_MAINBODY.pdf"; String outputFilename = "/tmp/" + Path.of(filename).getFileName() + "_MAINBODY.pdf";
ClassificationDocument document = parseLayout(filename, LayoutParsingType.TAAS); ClassificationDocument document = parseLayout(filename, LayoutParsingType.CLARIFYND);
PdfDraw.drawRectanglesPerPage(filename, PdfDraw.drawRectanglesPerPage(filename,
document.getPages().stream().map(page -> List.of(RectangleTransformations.toRectangle2D(page.getBodyTextFrame()))).toList(), document.getPages().stream().map(page -> List.of(RectangleTransformations.toRectangle2D(page.getBodyTextFrame()))).toList(),
outputFilename); outputFilename);

View File

@ -99,13 +99,15 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
@SneakyThrows @SneakyThrows
private void writeJsons(Path filename) { private void writeJsons(Path filename) {
Document documentGraphBefore = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER, Document documentGraphBefore = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_OLD,
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_OLD,
filename.toFile(), filename.toFile(),
new ImageServiceResponse(), new ImageServiceResponse(),
new TableServiceResponse(), new TableServiceResponse(),
new VisualLayoutParsingResponse(), new VisualLayoutParsingResponse(),
filename.toFile().toString())); filename.toFile().toString()));
Document documentGraphAfter = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER, Document documentGraphAfter = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_OLD,
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_OLD,
filename.toFile(), filename.toFile(),
new ImageServiceResponse(), new ImageServiceResponse(),
new TableServiceResponse(), new TableServiceResponse(),

View File

@ -20,7 +20,6 @@ import org.springframework.context.annotation.Import;
import org.springframework.context.annotation.Primary; import org.springframework.context.annotation.Primary;
import org.springframework.core.io.ClassPathResource; import org.springframework.core.io.ClassPathResource;
import org.springframework.test.context.junit.jupiter.SpringExtension; import org.springframework.test.context.junit.jupiter.SpringExtension;
import org.xmlunit.builder.Input;
import com.iqser.red.commons.jackson.ObjectMapperFactory; import com.iqser.red.commons.jackson.ObjectMapperFactory;
import com.iqser.red.storage.commons.service.StorageService; import com.iqser.red.storage.commons.service.StorageService;
@ -68,7 +67,7 @@ public abstract class AbstractTest {
protected LayoutParsingRequest buildStandardLayoutParsingRequest() { protected LayoutParsingRequest buildStandardLayoutParsingRequest() {
return LayoutParsingRequest.builder() return LayoutParsingRequest.builder()
.layoutParsingType(LayoutParsingType.REDACT_MANAGER) .layoutParsingType(LayoutParsingType.REDACT_MANAGER_OLD)
.originFileStorageId(ORIGIN_FILE_ID) .originFileStorageId(ORIGIN_FILE_ID)
.tablesFileStorageId(Optional.of(TABLE_FILE_ID)) .tablesFileStorageId(Optional.of(TABLE_FILE_ID))
.imagesFileStorageId(Optional.of(IMAGE_FILE_ID)) .imagesFileStorageId(Optional.of(IMAGE_FILE_ID))
@ -107,7 +106,7 @@ public abstract class AbstractTest {
protected LayoutParsingRequest prepareStorage(InputStream fileInputStream) { protected LayoutParsingRequest prepareStorage(InputStream fileInputStream) {
storageService.storeObject(TenantContext.getTenantId(), ORIGIN_FILE_ID, fileInputStream); storageService.storeObject(TenantContext.getTenantId(), ORIGIN_FILE_ID, fileInputStream);
return buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER); return buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER_OLD);
} }
@ -140,6 +139,7 @@ public abstract class AbstractTest {
return prepareStorage(pdfFileResource.getInputStream(), cvServiceResponseFileResource.getInputStream(), imageInfoFileResource.getInputStream()); return prepareStorage(pdfFileResource.getInputStream(), cvServiceResponseFileResource.getInputStream(), imageInfoFileResource.getInputStream());
} }
@SneakyThrows @SneakyThrows
protected LayoutParsingRequest prepareStorage(String file, String cvServiceResponseFile, String imageInfoFile, String visualLayoutParsingResponseFile) { protected LayoutParsingRequest prepareStorage(String file, String cvServiceResponseFile, String imageInfoFile, String visualLayoutParsingResponseFile) {
@ -148,9 +148,13 @@ public abstract class AbstractTest {
ClassPathResource imageInfoFileResource = new ClassPathResource(imageInfoFile); ClassPathResource imageInfoFileResource = new ClassPathResource(imageInfoFile);
ClassPathResource visualLayoutParsingResponseResource = new ClassPathResource(visualLayoutParsingResponseFile); ClassPathResource visualLayoutParsingResponseResource = new ClassPathResource(visualLayoutParsingResponseFile);
return prepareStorage(pdfFileResource.getInputStream(), cvServiceResponseFileResource.getInputStream(), imageInfoFileResource.getInputStream(), visualLayoutParsingResponseResource.getInputStream()); return prepareStorage(pdfFileResource.getInputStream(),
cvServiceResponseFileResource.getInputStream(),
imageInfoFileResource.getInputStream(),
visualLayoutParsingResponseResource.getInputStream());
} }
@SneakyThrows @SneakyThrows
protected LayoutParsingRequest prepareStorage(InputStream fileStream, InputStream cvServiceResponseFileStream, InputStream imageInfoStream) { protected LayoutParsingRequest prepareStorage(InputStream fileStream, InputStream cvServiceResponseFileStream, InputStream imageInfoStream) {
@ -158,18 +162,22 @@ public abstract class AbstractTest {
storageService.storeObject(TenantContext.getTenantId(), TABLE_FILE_ID, cvServiceResponseFileStream); storageService.storeObject(TenantContext.getTenantId(), TABLE_FILE_ID, cvServiceResponseFileStream);
storageService.storeObject(TenantContext.getTenantId(), ORIGIN_FILE_ID, fileStream); storageService.storeObject(TenantContext.getTenantId(), ORIGIN_FILE_ID, fileStream);
return buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER); return buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER_OLD);
} }
@SneakyThrows @SneakyThrows
protected LayoutParsingRequest prepareStorage(InputStream fileStream, InputStream cvServiceResponseFileStream, InputStream imageInfoStream, InputStream visualLayoutParsingResponseFileStream) { protected LayoutParsingRequest prepareStorage(InputStream fileStream,
InputStream cvServiceResponseFileStream,
InputStream imageInfoStream,
InputStream visualLayoutParsingResponseFileStream) {
storageService.storeObject(TenantContext.getTenantId(), IMAGE_FILE_ID, imageInfoStream); storageService.storeObject(TenantContext.getTenantId(), IMAGE_FILE_ID, imageInfoStream);
storageService.storeObject(TenantContext.getTenantId(), TABLE_FILE_ID, cvServiceResponseFileStream); storageService.storeObject(TenantContext.getTenantId(), TABLE_FILE_ID, cvServiceResponseFileStream);
storageService.storeObject(TenantContext.getTenantId(), ORIGIN_FILE_ID, fileStream); storageService.storeObject(TenantContext.getTenantId(), ORIGIN_FILE_ID, fileStream);
storageService.storeObject(TenantContext.getTenantId(), VISUAL_LAYOUT_FILE, visualLayoutParsingResponseFileStream); storageService.storeObject(TenantContext.getTenantId(), VISUAL_LAYOUT_FILE, visualLayoutParsingResponseFileStream);
return buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER); return buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER_OLD);
} }

View File

@ -26,14 +26,19 @@ public abstract class BuildDocumentTest extends AbstractTest {
File fileResource = new ClassPathResource(filename).getFile(); File fileResource = new ClassPathResource(filename).getFile();
prepareStorage(filename); prepareStorage(filename);
return layoutParsingPipeline.parseLayout(layoutParsingType, fileResource, layoutParsingStorageService.getImagesFile(IMAGE_FILE_ID), new TableServiceResponse(), new VisualLayoutParsingResponse(),filename); return layoutParsingPipeline.parseLayout(layoutParsingType,
fileResource,
layoutParsingStorageService.getImagesFile(IMAGE_FILE_ID),
new TableServiceResponse(),
new VisualLayoutParsingResponse(),
filename);
} }
@SneakyThrows @SneakyThrows
protected Document buildGraph(String filename) { protected Document buildGraph(String filename) {
return buildGraph(filename, LayoutParsingType.REDACT_MANAGER); return buildGraph(filename, LayoutParsingType.REDACT_MANAGER_OLD);
} }
@ -46,7 +51,7 @@ public abstract class BuildDocumentTest extends AbstractTest {
prepareStorage(filename); prepareStorage(filename);
} }
return DocumentGraphFactory.buildDocumentGraph(parseLayout(filename, layoutParsingType)); return DocumentGraphFactory.buildDocumentGraph(layoutParsingType, parseLayout(filename, layoutParsingType));
} }
} }

View File

@ -1,6 +1,6 @@
plugins { plugins {
id("com.knecon.fforesight.java-conventions") id("com.knecon.fforesight.java-conventions")
id("io.freefair.lombok") version "8.2.2" id("io.freefair.lombok") version "8.4"
} }
description = "Library for adding/removing layers in the viewer document" description = "Library for adding/removing layers in the viewer document"