RED-8825: added split by ruling into every step of docstrum

This commit is contained in:
Kilian Schuettler 2024-04-22 10:48:35 +02:00
parent 6a691183dc
commit e4663ac8db
30 changed files with 537 additions and 220 deletions

View File

@ -5,6 +5,7 @@ public enum LayoutParsingType {
REDACT_MANAGER_OLD,
REDACT_MANAGER_PARAGRAPH_DEBUG,
DOCUMINE,
DOCUMINE_OLD,
CLARIFYND,
CLARIFYND_PARAGRAPH_DEBUG
}

View File

@ -101,24 +101,20 @@ public class LayoutParsingPipeline {
log.info("Starting layout parsing for {}", layoutParsingRequest.identifier());
File originFile = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId());
File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId())
.orElse(originFile);
File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId()).orElse(originFile);
VisualLayoutParsingResponse visualLayoutParsingResponse = new VisualLayoutParsingResponse();
if (layoutParsingRequest.visualLayoutParsingFileId()
.isPresent()) {
if (layoutParsingRequest.visualLayoutParsingFileId().isPresent()) {
visualLayoutParsingResponse = layoutParsingStorageService.getVisualLayoutParsingFile(layoutParsingRequest.visualLayoutParsingFileId().get());
}
ImageServiceResponse imageServiceResponse = new ImageServiceResponse();
if (layoutParsingRequest.imagesFileStorageId()
.isPresent()) {
if (layoutParsingRequest.imagesFileStorageId().isPresent()) {
imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId().get());
}
TableServiceResponse tableServiceResponse = new TableServiceResponse();
if (layoutParsingRequest.tablesFileStorageId()
.isPresent()) {
if (layoutParsingRequest.tablesFileStorageId().isPresent()) {
tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId().get());
}
@ -135,7 +131,7 @@ public class LayoutParsingPipeline {
log.info("Creating viewer document for {}", layoutParsingRequest.identifier());
layoutGridService.addLayoutGrid(viewerDocumentFile, documentGraph, viewerDocumentFile, false);
layoutGridService.addLayoutGrid(viewerDocumentFile, documentGraph, viewerDocumentFile, false, layoutParsingRequest.visualLayoutParsingFileId().isPresent());
log.info("Storing resulting files for {}", layoutParsingRequest.identifier());
@ -251,12 +247,12 @@ public class LayoutParsingPipeline {
stripper.setStartPage(pageNumber);
stripper.setEndPage(pageNumber);
stripper.setPdpage(pdPage);
if (layoutParsingType.equals(LayoutParsingType.DOCUMINE)) {
if (layoutParsingType.equals(LayoutParsingType.DOCUMINE_OLD)) {
stripper.setSortByPosition(true);
}
stripper.getText(originDocument);
classificationDocument.getVisualizations().addTextVisualizations(stripper.getTextPositionSequences(), pageNumber);
List<TextPositionSequence> words = stripper.getTextPositionSequences();
classificationDocument.getVisualizations().addTextVisualizations(words, pageNumber);
PDRectangle pdr = pdPage.getMediaBox();
@ -266,9 +262,7 @@ public class LayoutParsingPipeline {
PDRectangle cropbox = pdPage.getCropBox();
CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(pdfTableCells.get(pageNumber), stripper.getRulings());
classificationDocument.getVisualizations().addCleanRulingVisualization(cleanRulings, pageNumber);
List<Cell> emptyTableCells = TableExtractionService.findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical());
List<Cell> emptyTableCells = TableExtractionService.findCells(cleanRulings.getHorizontals(), cleanRulings.getVerticals());
var graphics = graphicExtractorService.extractPathElementGraphics(originDocument,
pdPage,
@ -287,12 +281,11 @@ public class LayoutParsingPipeline {
ClassificationPage classificationPage = switch (layoutParsingType) {
case REDACT_MANAGER_OLD ->
redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells);
case DOCUMINE -> docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
case REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG ->
docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, true, classificationDocument.getVisualizations());
case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG ->
docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, false, classificationDocument.getVisualizations());
redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, classificationDocument.getVisualizations());
case DOCUMINE_OLD -> docuMineBlockificationService.blockify(words, cleanRulings.getHorizontals(), cleanRulings.getVerticals());
case DOCUMINE, REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG ->
docstrumBlockificationService.blockify(words, cleanRulings, true, classificationDocument.getVisualizations());
case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> docstrumBlockificationService.blockify(words, cleanRulings, false, classificationDocument.getVisualizations());
};
classificationPage.setCleanRulings(cleanRulings);
@ -313,7 +306,7 @@ public class LayoutParsingPipeline {
}
if (signatures.containsKey(pageNumber)) {
if (classificationPage.getImages() == null || classificationPage.getImages().size() == 0) {
if (classificationPage.getImages() == null || classificationPage.getImages().isEmpty()) {
classificationPage.setImages(signatures.get(pageNumber));
} else {
classificationPage.getImages().addAll(signatures.get(pageNumber));
@ -325,7 +318,7 @@ public class LayoutParsingPipeline {
if (layoutParsingType == LayoutParsingType.REDACT_MANAGER) {
docstrumBlockificationService.combineBlocks(classificationPage);
} else if (layoutParsingType == LayoutParsingType.CLARIFYND) {
docstrumBlockificationService.mergeIntersectingBlocks(classificationPage.getTextBlocks(), 0, 6.5f);
docstrumBlockificationService.mergeIntersectingBlocks(classificationPage, 0, 6.5f);
}
buildPageStatistics(classificationPage);
@ -338,11 +331,14 @@ public class LayoutParsingPipeline {
log.info("Calculating BodyTextFrame for {}", identifier);
bodyTextFrameService.setBodyTextFrames(classificationDocument, layoutParsingType);
for (ClassificationPage page : classificationDocument.getPages()) {
classificationDocument.getVisualizations().addCleanRulingVisualization(page.getCleanRulings(), page.getPageNumber());
}
log.info("Classify TextBlocks for {}", identifier);
switch (layoutParsingType) {
case REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_OLD, CLARIFYND_PARAGRAPH_DEBUG ->
redactManagerClassificationService.classifyDocument(classificationDocument);
case DOCUMINE -> docuMineClassificationService.classifyDocument(classificationDocument);
case DOCUMINE_OLD, DOCUMINE -> docuMineClassificationService.classifyDocument(classificationDocument);
case CLARIFYND -> clarifyndClassificationService.classifyDocument(classificationDocument);
}

View File

@ -14,9 +14,11 @@ import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.Nea
import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.ReadingOrderService;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.SpacingService;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.ZoneBuilderService;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations;
import lombok.RequiredArgsConstructor;
@ -31,31 +33,37 @@ public class DocstrumSegmentationService {
private final ReadingOrderService readingOrderService;
public List<Zone> segmentPage(List<TextPositionSequence> textPositions, boolean xyOrder) {
public List<Zone> segmentPage(List<TextPositionSequence> textPositions, boolean xyOrder, CleanRulings usedRulings, LayoutparsingVisualizations visualizations) {
List<Zone> zones = new ArrayList<>();
zones.addAll(computeZones(textPositions, TextDirection.ZERO));
zones.addAll(computeZones(textPositions, TextDirection.QUARTER_CIRCLE));
zones.addAll(computeZones(textPositions, TextDirection.HALF_CIRCLE));
zones.addAll(computeZones(textPositions, TextDirection.THREE_QUARTER_CIRCLE));
zones.addAll(computeZones(textPositions, usedRulings, visualizations, TextDirection.ZERO));
zones.addAll(computeZones(textPositions, usedRulings, visualizations, TextDirection.QUARTER_CIRCLE));
zones.addAll(computeZones(textPositions, usedRulings, visualizations, TextDirection.HALF_CIRCLE));
zones.addAll(computeZones(textPositions, usedRulings, visualizations, TextDirection.THREE_QUARTER_CIRCLE));
return readingOrderService.resolve(zones, xyOrder);
}
private List<Zone> computeZones(List<TextPositionSequence> textPositions, TextDirection direction) {
private List<Zone> computeZones(List<TextPositionSequence> textPositions, CleanRulings rulings, LayoutparsingVisualizations visualizations, TextDirection direction) {
List<RedTextPosition> positions = textPositions.stream().filter(t -> t.getDir() == direction).map(TextPositionSequence::getTextPositions).flatMap(List::stream).toList();
List<RedTextPosition> positions = textPositions.stream()
.filter(t -> t.getDir() == direction)
.map(TextPositionSequence::getTextPositions)
.flatMap(List::stream)
.toList();
List<Character> characters = positions.stream().map(Character::new).collect(Collectors.toList());
List<Character> characters = positions.stream()
.map(Character::new)
.collect(Collectors.toList());
nearestNeighbourService.findNearestNeighbors(characters);
double characterSpacing = spacingService.computeCharacterSpacing(characters);
double lineSpacing = Math.min(spacingService.computeLineSpacing(characters), 20);
List<Line> lines = lineBuilderService.buildLines(characters, characterSpacing, lineSpacing);
return zoneBuilderService.buildZones(lines, characterSpacing, lineSpacing);
List<Line> lines = lineBuilderService.buildLines(characters, characterSpacing, lineSpacing, rulings);
return zoneBuilderService.buildZones(lines, characterSpacing, lineSpacing, rulings);
}
}

View File

@ -4,6 +4,7 @@ import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.FastAtan2;
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
@ -73,7 +74,7 @@ public class Line extends BoundingBox {
public double getAngle() {
return Math.atan2(y1 - y0, x1 - x0);
return FastAtan2.fastAtan2(y1 - y0, x1 - x0);
}

View File

@ -16,7 +16,7 @@ public class Zone extends BoundingBox {
@SuppressWarnings("PMD.ConstructorCallsOverridableMethod")
public Zone(List<Line> lines) {
lines.sort(Comparator.comparingDouble(Line::getY));
lines.sort(Comparator.comparingDouble(Line::getY0));
this.lines = lines;
buildBBox();
}

View File

@ -10,34 +10,39 @@ import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Angle
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Character;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Line;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.UnionFind;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
@Service
public class LineBuilderService {
private static final double CHARACTER_SPACING_DISTANCE_MULTIPLIER = 3.5;
private static final double MAX_VERTICAL_CHARACTER_DISTANCE = 0.67;
private static final double LINE_SPACING_THRESHOLD_MULTIPLIER = 0.67;
private static final double ANGLE_TOLERANCE = Math.PI / 6;
public List<Line> buildLines(List<Character> characters, double characterSpacing, double lineSpacing) {
public List<Line> buildLines(List<Character> characters, double characterSpacing, double lineSpacing, CleanRulings rulings) {
double maxHorizontalDistance = characterSpacing * CHARACTER_SPACING_DISTANCE_MULTIPLIER;
double maxVerticalDistance = lineSpacing * MAX_VERTICAL_CHARACTER_DISTANCE;
double maxVerticalDistance = lineSpacing * LINE_SPACING_THRESHOLD_MULTIPLIER;
UnionFind<Character> unionFind = new UnionFind<>(new HashSet<>(characters));
AngleFilter filter = new AngleFilter(-ANGLE_TOLERANCE, ANGLE_TOLERANCE);
AngleFilter angleFilter = new AngleFilter(-ANGLE_TOLERANCE, ANGLE_TOLERANCE);
characters.forEach(character -> {
character.getNeighbors()
.forEach(neighbor -> {
double normalizedHorizontalDistance = neighbor.getHorizontalDistance() / maxHorizontalDistance;
double normalizedVerticalDistance = neighbor.getVerticalDistance() / maxVerticalDistance;
if (character.getTextPosition().getDir() == neighbor.getCharacter().getTextPosition().getDir() //
&& filter.matches(neighbor) //
&& Math.pow(normalizedHorizontalDistance, 2) + Math.pow(normalizedVerticalDistance, 2) <= 1) {
unionFind.union(character, neighbor.getCharacter());
if (character.getTextPosition().getDir() != neighbor.getCharacter().getTextPosition().getDir() //
|| !angleFilter.matches(neighbor) //
|| Math.pow(normalizedHorizontalDistance, 2) + Math.pow(normalizedVerticalDistance, 2) > 1 //
|| rulings.lineBetween(character, neighbor.getCharacter())) {
return;
}
unionFind.union(character, neighbor.getCharacter());
});
});

View File

@ -5,6 +5,7 @@ import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import org.springframework.stereotype.Service;
@ -12,6 +13,7 @@ import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Chara
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Line;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.UnionFind;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
@Service
public class ZoneBuilderService {
@ -29,12 +31,10 @@ public class ZoneBuilderService {
private static final double ANGLE_TOLERANCE = Math.PI / 6;
private static final int MAX_ZONES = 300;
private static final double MAX_VERTICAL_MERGE_DISTANCE = 0.5;
public List<Zone> buildZones(List<Line> lines, double characterSpacing, double lineSpacing) {
public List<Zone> buildZones(List<Line> lines, double characterSpacing, double lineSpacing, CleanRulings rulings) {
double minHorizontalDistance = characterSpacing * MIN_HORIZONTAL_DISTANCE_MULTIPLIER;
double maxVerticalDistance = lineSpacing * MAX_VERTICAL_DISTANCE_MULTIPLIER;
@ -60,29 +60,23 @@ public class ZoneBuilderService {
double horizontalDistance = outerLine.horizontalDistance(innerLine) / scale;
double verticalDistance = outerLine.verticalDistance(innerLine) / scale;
if (minHorizontalDistance <= horizontalDistance && verticalDistance <= maxVerticalDistance //
|| minHorizontalMergeDistance <= horizontalDistance && verticalDistance <= maxVerticalMergeDistance) {
unionFind.union(outerLine, innerLine);
if ((!(minHorizontalDistance <= horizontalDistance) || !(verticalDistance <= maxVerticalDistance)) //
&& (!(minHorizontalMergeDistance <= horizontalDistance) || !(verticalDistance <= maxVerticalMergeDistance))) {
return;
}
if (rulings.lineBetween(outerLine.getBBox(), innerLine.getBBox())) {
return;
}
unionFind.union(outerLine, innerLine);
});
});
List<Zone> zones = unionFind.getGroups()
return unionFind.getGroups()
.stream()
.map(group -> mergeLinesInZone(new ArrayList<>(group), characterSpacing, lineSpacing))
.toList();
if (zones.size() > MAX_ZONES) {
List<Line> oneZoneLines = new ArrayList<>();
for (Zone zone : zones) {
oneZoneLines.addAll(zone.getLines());
}
return List.of(mergeLinesInZone(oneZoneLines, characterSpacing, lineSpacing));
}
return zones;
}
@ -157,7 +151,7 @@ public class ZoneBuilderService {
outputZone.add(new Line(characters, characterSpacing));
}
return new Zone(outputZone);
return new Zone(outputZone.stream().sorted(Comparator.comparing(Line::getY0)).collect(Collectors.toList()));
}
}

View File

@ -1,5 +1,7 @@
package com.knecon.fforesight.service.layoutparser.processor.model;
import java.awt.geom.Rectangle2D;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
@ -15,6 +17,8 @@ import lombok.NoArgsConstructor;
@EqualsAndHashCode(callSuper = true)
public abstract class AbstractPageBlock extends Rectangle {
protected Rectangle2D bBox; // in initial user space
@JsonIgnore
protected float minX;
@JsonIgnore

View File

@ -1,15 +1,80 @@
package com.knecon.fforesight.service.layoutparser.processor.model.table;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Character;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class CleanRulings {
List<Ruling> horizontal;
List<Ruling> vertical;
List<Ruling> horizontals;
List<Ruling> verticals;
/**
 * Builds a new {@code CleanRulings} that contains only the rulings classified as
 * {@link Ruling.Classification#TABLE_LINE}. This instance is left unchanged.
 * <p>
 * The classification is compared with {@code ==}, so a ruling whose classification is
 * {@code null} is filtered out instead of causing a {@link NullPointerException}.
 *
 * @return a new instance holding the filtered horizontal and vertical rulings as unmodifiable lists
 */
public CleanRulings getTableLines() {

    List<Ruling> horizontalTableLines = horizontals.stream()
            .filter(ruling -> ruling.getClassification() == Ruling.Classification.TABLE_LINE)
            .toList();
    List<Ruling> verticalTableLines = verticals.stream()
            .filter(ruling -> ruling.getClassification() == Ruling.Classification.TABLE_LINE)
            .toList();
    return new CleanRulings(horizontalTableLines, verticalTableLines);
}
/**
 * Checks whether a ruling lies between two characters.
 * <p>
 * The initial-user-space position of each character's text position is looked up and
 * handed to the matching {@code lineBetween} overload.
 *
 * @param a the first character
 * @param b the second character
 * @return the result of the delegated {@code lineBetween} check
 */
public boolean lineBetween(Character a, Character b) {

    // 'var' keeps the static type returned by getInitialUserSpacePosition(), so the
    // same overload is selected as when the calls are passed inline.
    var positionOfA = a.getTextPosition().getInitialUserSpacePosition();
    var positionOfB = b.getTextPosition().getInitialUserSpacePosition();
    return lineBetween(positionOfA, positionOfB);
}
/**
 * Checks whether a ruling crosses the segment connecting the centers of two rectangles.
 * <p>
 * Rectangles that intersect each other are never considered separated.
 *
 * @param a the first rectangle
 * @param b the second rectangle
 * @return {@code false} if the rectangles intersect, otherwise the result of
 *         {@link #lineBetween(Point2D, Point2D)} for the two center points
 */
public boolean lineBetween(Rectangle2D a, Rectangle2D b) {

    boolean overlapping = a.intersects(b);
    if (!overlapping) {
        Point2D centerOfA = new Point2D.Double(a.getCenterX(), a.getCenterY());
        Point2D centerOfB = new Point2D.Double(b.getCenterX(), b.getCenterY());
        return lineBetween(centerOfA, centerOfB);
    }
    return false;
}
/**
 * Checks whether any stored ruling intersects the straight segment from {@code p1} to {@code p2}.
 * <p>
 * The segment is wrapped in a temporary {@link Ruling}. If that ruling reports itself as
 * horizontal, only the vertical rulings are tested; if it reports itself as vertical, only
 * the horizontal rulings are tested. Otherwise both lists are tested.
 *
 * @param p1 start point of the connecting segment
 * @param p2 end point of the connecting segment
 * @return {@code true} if at least one of the tested rulings intersects the segment
 */
public boolean lineBetween(Point2D p1, Point2D p2) {

    Ruling connection = new Ruling(p1, p2);

    if (connection.isHorizontal()) {
        return verticals.stream()
                .anyMatch(vertical -> vertical.intersectsLine(connection));
    }
    if (connection.isVertical()) {
        return horizontals.stream()
                .anyMatch(horizontal -> horizontal.intersectsLine(connection));
    }

    // Neither horizontal nor vertical: test both lists in place. This method is called once per
    // candidate pair during line and zone building, so copying both lists into a merged list
    // (buildAll()) for every query is avoided. Horizontals are still tested before verticals.
    return horizontals.stream()
                   .anyMatch(horizontal -> horizontal.intersectsLine(connection))
           || verticals.stream()
                   .anyMatch(vertical -> vertical.intersectsLine(connection));
}
/**
 * Collects all rulings into one freshly allocated, mutable list.
 *
 * @return a new list containing every horizontal ruling followed by every vertical ruling
 */
public List<Ruling> buildAll() {

    int totalCount = horizontals.size() + verticals.size();
    List<Ruling> combined = new ArrayList<>(totalCount);
    combined.addAll(horizontals);
    combined.addAll(verticals);
    return combined;
}
}

View File

@ -10,6 +10,8 @@ import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.utils.CohenSutherlandClipping;
import com.knecon.fforesight.service.layoutparser.processor.utils.DoubleComparisons;
import lombok.Getter;
import lombok.Setter;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@ -19,10 +21,24 @@ public class Ruling extends Line2D.Float {
public static final int PERPENDICULAR_UNIT_EXPAND_AMOUNT = 2;
public static final int COLINEAR_OR_PARALLEL_UNIT_EXPAND_AMOUNT = 2;
/**
 * Role that has been assigned to a ruling.
 * <p>
 * NOTE(review): the meaning of each constant is only partly visible from its usages;
 * the per-constant notes below state what the visible code does with it.
 */
public enum Classification {
// Selected by CleanRulings.getTableLines().
TABLE_LINE,
// NOTE(review): no usage visible here; presumably a line drawn under text - confirm.
UNDERLINE,
// NOTE(review): spelled without the second 'h' of "strikethrough". Renaming would change
// the public constant, so the spelling is kept; no usage visible here.
STRIKETROUGH,
// Set by BodyTextFrameService.getPotentialHeaderRulings on the horizontal rulings it selects.
HEADER_SEPARATOR,
// Set by BodyTextFrameService.getPotentialFooterRulings on the horizontal rulings it selects.
FOOTER_SEPARATOR,
// Value assigned by the Ruling(Point2D, Point2D) constructor; the header/footer detection
// only considers rulings that still carry this value.
OTHER
}
@Getter
@Setter
private Classification classification;
/**
 * Creates a ruling as the line segment from {@code p1} to {@code p2}.
 * <p>
 * The classification is initialised to {@link Classification#OTHER}.
 *
 * @param p1 start point of the segment
 * @param p2 end point of the segment
 */
public Ruling(Point2D p1, Point2D p2) {
super(p1, p2);
// Explicit starting value so the field is not left null by this constructor.
this.classification = Classification.OTHER;
}

View File

@ -1,6 +1,7 @@
package com.knecon.fforesight.service.layoutparser.processor.model.table;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
@ -129,8 +130,7 @@ public class TablePageBlock extends AbstractPageBlock {
List<Cell> cellsToTheTop = new ArrayList<>();
for (int i = 0; i < rowIndex; i++) {
try {
cellsToTheTop.add(rows.get(i)
.get(colIndex));
cellsToTheTop.add(rows.get(i).get(colIndex));
} catch (IndexOutOfBoundsException e) {
log.debug("No cell {} in row {}, ignoring.", colIndex, rowIndex);
}
@ -145,8 +145,7 @@ public class TablePageBlock extends AbstractPageBlock {
if (lastHeaderCell != null) {
cell.getHeaderCells().add(lastHeaderCell);
}
if (!cell.getTextBlocks().isEmpty() && cell.getTextBlocks()
.get(0).getMostPopularWordStyle().equals("bold")) {
if (!cell.getTextBlocks().isEmpty() && cell.getTextBlocks().get(0).getMostPopularWordStyle().equals("bold")) {
cell.setHeaderCell(true);
}
}
@ -210,8 +209,7 @@ public class TablePageBlock extends AbstractPageBlock {
for (int i = 0; i < rowsOfCellsMatrix.size(); i++) {
for (int j = 0; j < rowsOfCellsMatrix.get(i).size(); j++) {
addCellToRowAndCol(rowsOfCellsMatrix.get(i)
.get(j), i, j);
addCellToRowAndCol(rowsOfCellsMatrix.get(i).get(j), i, j);
}
}
@ -413,6 +411,16 @@ public class TablePageBlock extends AbstractPageBlock {
}
/**
 * Returns the bounding box of this table, computing it on first access.
 * <p>
 * While {@code bBox} is {@code null} it is derived from {@code cells} through
 * {@code RectangleTransformations.collectBBox()} and stored. Once set, this method
 * returns the stored value without recomputing it.
 *
 * @return the stored bounding box
 */
public Rectangle2D getBBox() {

    if (this.bBox != null) {
        return this.bBox;
    }
    // presumably the union of all cell rectangles - behaviour of collectBBox() is defined elsewhere
    this.bBox = cells.stream()
            .collect(RectangleTransformations.collectBBox());
    return this.bBox;
}
record CellWithIntersection(Cell originalCell, double intersectedArea) {
}

View File

@ -2,6 +2,7 @@ package com.knecon.fforesight.service.layoutparser.processor.model.text;
import static java.util.stream.Collectors.toSet;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
@ -11,6 +12,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBl
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.utils.DoubleComparisons;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities;
import lombok.AllArgsConstructor;
@ -80,7 +82,10 @@ public class TextPageBlock extends AbstractPageBlock {
public static TextPageBlock merge(List<TextPageBlock> textBlocksToMerge) {
List<TextPositionSequence> sequences = textBlocksToMerge.stream().map(TextPageBlock::getSequences).flatMap(java.util.Collection::stream).toList();
List<TextPositionSequence> sequences = textBlocksToMerge.stream()
.map(TextPageBlock::getSequences)
.flatMap(java.util.Collection::stream)
.toList();
sequences = new ArrayList<>(sequences);
return fromTextPositionSequences(sequences);
}
@ -106,11 +111,11 @@ public class TextPageBlock extends AbstractPageBlock {
if (textBlock == null) {
textBlock = new TextPageBlock(wordBlock.getMinXDirAdj(),
wordBlock.getMaxXDirAdj(),
wordBlock.getMinYDirAdj(),
wordBlock.getMaxYDirAdj(),
wordBlockList,
wordBlock.getRotation());
wordBlock.getMaxXDirAdj(),
wordBlock.getMinYDirAdj(),
wordBlock.getMaxYDirAdj(),
wordBlockList,
wordBlock.getRotation());
} else {
TextPageBlock spatialEntity = textBlock.union(wordBlock);
textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight());
@ -126,17 +131,29 @@ public class TextPageBlock extends AbstractPageBlock {
textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest());
}
if (textBlock != null && textBlock.getSequences() != null && textBlock.getSequences()
.stream()
.map(t -> DoubleComparisons.round(t.getMinYDirAdj(), 3))
.collect(toSet())
.size() == 1) {
if (textBlock != null
&& textBlock.getSequences() != null
&& textBlock.getSequences()
.stream()
.map(t -> DoubleComparisons.round(t.getMinYDirAdj(), 3))
.collect(toSet()).size() == 1) {
textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj));
}
return textBlock;
}
/**
 * Returns the bounding box of this text block, computing it on first access.
 * <p>
 * While {@code bBox} is {@code null} it is derived from the bounding boxes of all
 * {@code sequences} through {@code RectangleTransformations.collectBBox()} and stored.
 * Once set, this method returns the stored value without recomputing it.
 *
 * @return the stored bounding box
 */
public Rectangle2D getBBox() {

    if (this.bBox != null) {
        return this.bBox;
    }
    // presumably the union of the per-sequence boxes - behaviour of collectBBox() is defined elsewhere
    this.bBox = sequences.stream()
            .map(TextPositionSequence::getBoundingBox)
            .collect(RectangleTransformations.collectBBox());
    return this.bBox;
}
/**
* Returns the minX value in pdf coordinate system.
* Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation.

View File

@ -31,7 +31,7 @@ public class BodyTextFrameService {
Rectangle bodyTextFrame = calculateBodyTextFrame(classificationDocument.getPages(), classificationDocument.getFontSizeCounter(), false, layoutParsingType);
Rectangle landscapeBodyTextFrame = calculateBodyTextFrame(classificationDocument.getPages(), classificationDocument.getFontSizeCounter(), true, layoutParsingType);
for (ClassificationPage page : classificationDocument.getPages()) {
// var updatedBodyTextFrame = getBodyTextFrameFromRulings(page, bodyTextFrame, landscapeBodyTextFrame);
var updatedBodyTextFrame = getBodyTextFrameFromRulings(page, bodyTextFrame, landscapeBodyTextFrame);
setBodyTextFrameAdjustedToPage(page, bodyTextFrame, landscapeBodyTextFrame);
classificationDocument.getVisualizations().addMainBodyVisualization(page.isLandscape() ? landscapeBodyTextFrame : bodyTextFrame, page.getPageNumber());
}
@ -59,24 +59,26 @@ public class BodyTextFrameService {
private List<Ruling> getPotentialFooterRulings(ClassificationPage page) {
return page.getCleanRulings()
.getHorizontal()
return page.getCleanRulings().getHorizontals()
.stream()
.filter(ruling -> ruling.getClassification().equals(Ruling.Classification.OTHER))
.filter(ruling -> ruling.getY1() < page.getPageHeight() * RULING_HEIGHT_THRESHOLD)
.filter(ruling -> ruling.getWidth() > RULING_WIDTH_THRESHOLD * page.getPageWidth())
.sorted(Comparator.comparingDouble(Ruling::getTop))
.peek(ruling -> ruling.setClassification(Ruling.Classification.FOOTER_SEPARATOR))
.toList();
}
private List<Ruling> getPotentialHeaderRulings(ClassificationPage page) {
return page.getCleanRulings()
.getHorizontal()
return page.getCleanRulings().getHorizontals()
.stream()
.filter(ruling -> ruling.getClassification().equals(Ruling.Classification.OTHER))
.filter(ruling -> ruling.getY1() > page.getPageHeight() * (1 - RULING_HEIGHT_THRESHOLD))
.filter(ruling -> ruling.getWidth() > RULING_WIDTH_THRESHOLD * page.getPageWidth())
.sorted(Comparator.comparingDouble(Ruling::getBottom).reversed())
.peek(ruling -> ruling.setClassification(Ruling.Classification.HEADER_SEPARATOR))
.toList();
}
@ -100,16 +102,16 @@ public class BodyTextFrameService {
if (page.getPageWidth() > page.getPageHeight() && page.getRotation() == 270) {
textFrame = new Rectangle(new Point(textFrame.getTopLeft().getY(), page.getPageHeight() - textFrame.getTopLeft().getX() - textFrame.getWidth()),
textFrame.getHeight(),
textFrame.getWidth(),
0);
textFrame.getHeight(),
textFrame.getWidth(),
0);
} else if (page.getPageWidth() > page.getPageHeight() && page.getRotation() != 0) {
textFrame = new Rectangle(new Point(textFrame.getTopLeft().getY(), textFrame.getTopLeft().getX()), textFrame.getHeight(), textFrame.getWidth(), page.getPageNumber());
} else if (page.getRotation() == 180) {
textFrame = new Rectangle(new Point(textFrame.getTopLeft().getX(), page.getPageHeight() - textFrame.getTopLeft().getY() - textFrame.getHeight()),
textFrame.getWidth(),
textFrame.getHeight(),
0);
textFrame.getWidth(),
textFrame.getHeight(),
0);
}
page.setBodyTextFrame(textFrame);
}
@ -153,14 +155,16 @@ public class BodyTextFrameService {
}
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) || MarkedContentUtils.intersects(textBlock,
page.getMarkedContentBboxPerType(),
MarkedContentUtils.FOOTER)) {
page.getMarkedContentBboxPerType(),
MarkedContentUtils.FOOTER)) {
continue;
}
float approxLineCount = PositionUtils.getApproxLineCount(textBlock);
if (layoutParsingType.equals(LayoutParsingType.DOCUMINE) && approxLineCount < approximateHeaderLineCount && textBlock.getMaxY() >= page.getPageHeight() - (page.getPageHeight() / 10) || !layoutParsingType.equals(
LayoutParsingType.DOCUMINE) && approxLineCount < approximateHeaderLineCount) {
if (layoutParsingType.equals(LayoutParsingType.DOCUMINE_OLD)
&& approxLineCount < approximateHeaderLineCount
&& textBlock.getMaxY() >= page.getPageHeight() - (page.getPageHeight() / 10)
|| !layoutParsingType.equals(LayoutParsingType.DOCUMINE_OLD) && approxLineCount < approximateHeaderLineCount) {
continue;
}
@ -187,9 +191,9 @@ public class BodyTextFrameService {
}
}
return new Rectangle(new Point(expansionsRectangle.minX, expansionsRectangle.minY),
expansionsRectangle.maxX - expansionsRectangle.minX,
expansionsRectangle.maxY - expansionsRectangle.minY,
0);
expansionsRectangle.maxX - expansionsRectangle.minX,
expansionsRectangle.maxY - expansionsRectangle.minY,
0);
}

View File

@ -45,7 +45,7 @@ public class RulingCleaningService {
verticalAndHorizontalRulingLines.horizontalLines.sort(X_FIRST_RULING_COMPARATOR);
verticalAndHorizontalRulingLines = cleanRulings(verticalAndHorizontalRulingLines);
return CleanRulings.builder().vertical(verticalAndHorizontalRulingLines.verticalLines()).horizontal(verticalAndHorizontalRulingLines.horizontalLines()).build();
return CleanRulings.builder().verticals(verticalAndHorizontalRulingLines.verticalLines()).horizontals(verticalAndHorizontalRulingLines.horizontalLines()).build();
}

View File

@ -26,7 +26,7 @@ import com.knecon.fforesight.service.layoutparser.processor.utils.SpreadsheetFin
public class TableExtractionService {
private static final int MAX_TABLE_CONTAINED_CELLS_WITH_TEXT = 1;
private static final int TEXT_BLOCK_CONTAINMENT_TOLERANCE = 2;
private static final double TEXT_BLOCK_CONTAINMENT_TOLERANCE = 0.02;
private static final double TABLE_UNIFORMITY_THRESHOLD = 0.7;
@ -77,7 +77,9 @@ public class TableExtractionService {
}
}
var containedCellsWithText = containedCells.stream().filter(cell -> !cell.getTextBlocks().isEmpty()).toList();
var containedCellsWithText = containedCells.stream()
.filter(cell -> !cell.getTextBlocks().isEmpty())
.toList();
// verify if table would contain fewer cells with text than the threshold allows
if (containedCellsWithText.size() >= MAX_TABLE_CONTAINED_CELLS_WITH_TEXT && checkIfTableCellsAreUniform(containedCells)) {
@ -97,7 +99,11 @@ public class TableExtractionService {
if (position != -1) {
page.getTextBlocks().add(position, table);
var toBeRemoved = table.getCells().stream().map(Cell::getTextBlocks).flatMap(List::stream).toList();
var toBeRemoved = table.getCells()
.stream()
.map(Cell::getTextBlocks)
.flatMap(List::stream)
.toList();
// remove text blocks from the page that were also added with the table (from its contained cells)
page.getTextBlocks().removeAll(toBeRemoved);
}
@ -122,22 +128,27 @@ public class TableExtractionService {
private boolean doesCellContainTextBlock(Cell cell, TextPageBlock textBlock) {
double x = textBlock.getPdfMinX();
double y = textBlock.getPdfMinY();
double w = textBlock.getPdfMaxX() - textBlock.getPdfMinX();
double h = textBlock.getPdfMaxY() - textBlock.getPdfMinY();
double x = textBlock.getBBox().getX();
double y = textBlock.getBBox().getY();
double w = textBlock.getBBox().getWidth();
double h = textBlock.getBBox().getHeight();
if (cell.isEmpty() || w <= 0 || h <= 0) {
return false;
}
double x0 = cell.getX();
double y0 = cell.getY();
return (x >= x0 - TEXT_BLOCK_CONTAINMENT_TOLERANCE && y >= y0 - TEXT_BLOCK_CONTAINMENT_TOLERANCE && (x + w) <= x0 + cell.getWidth() + 2 * TEXT_BLOCK_CONTAINMENT_TOLERANCE && (y + h) <= y0 + cell.getHeight() + 2 * TEXT_BLOCK_CONTAINMENT_TOLERANCE);
double xTol = TEXT_BLOCK_CONTAINMENT_TOLERANCE * w;
double yTol = TEXT_BLOCK_CONTAINMENT_TOLERANCE * h;
return (x >= x0 - xTol && y >= y0 - yTol && (x + w) <= x0 + cell.getWidth() + 2 * xTol && (y + h) <= y0 + cell.getHeight() + 2 * yTol);
}
/**
 * Creates one {@link Cell} for every rectangle that {@code RectangularIntersectionFinder.find} returns
 * for the given rulings.
 *
 * @param horizontalRulingLines the horizontal rulings handed to the intersection finder
 * @param verticalRulingLines   the vertical rulings handed to the intersection finder
 * @return the cells built from the found rectangles
 */
public static List<Cell> findCells(List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
return RectangularIntersectionFinder.find(horizontalRulingLines, verticalRulingLines).stream().map(Cell::new).collect(Collectors.toList());
return RectangularIntersectionFinder.find(horizontalRulingLines, verticalRulingLines)
.stream()
.map(Cell::new)
.collect(Collectors.toList());
}
}

View File

@ -15,16 +15,14 @@ import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.Doubl
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil;
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations;
import lombok.RequiredArgsConstructor;
@ -38,29 +36,52 @@ public class DocstrumBlockificationService {
static final float THRESHOLD = 1f;
/**
 * Builds the {@link ClassificationPage} for the text positions of one page.
 * <p>
 * The page is segmented into zones, the zones are recorded as debug visualizations, converted into
 * page blocks and, if {@code xyOrder} is set, sorted via {@code sortPageBlocksXThenY}. Finally the
 * blocks of the resulting page are merged via {@code mergeIntersectingBlocks} with both thresholds set to 0.
 *
 * @param textPositions  the text position sequences of the page
 * @param xyOrder        passed on to the segmentation and block conversion; also enables the final sort
 * @param visualizations receives the zone, line and character visualizations
 * @return the classification page holding the merged blocks
 */
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Cell> cells, boolean xyOrder, LayoutparsingVisualizations visualizations) {
public ClassificationPage blockify(List<TextPositionSequence> textPositions, CleanRulings rulings, boolean xyOrder, LayoutparsingVisualizations visualizations) {
CleanRulings usedRulings = RectangleTransformations.extractRulings(cells);
CleanRulings usedRulings = rulings.getTableLines();
var zones = docstrumSegmentationService.segmentPage(textPositions, xyOrder);
var zones = docstrumSegmentationService.segmentPage(textPositions, xyOrder, usedRulings, visualizations);
// the page number for the visualizations is read from the first text position, so an empty page is skipped
if (!textPositions.isEmpty()) {
visualizations.addZoneVisualizations(zones, textPositions.get(0).getPage());
visualizations.addLineVisualizations(zones, textPositions.get(0).getPage());
visualizations.addLineVisualizationsFromZones(zones, textPositions.get(0).getPage());
visualizations.addCharactersWithNeighbours(zones, textPositions.get(0).getPage());
}
var pageBlocks = toAbstractPageBlocks(zones, usedRulings.getHorizontal(), usedRulings.getVertical(), xyOrder);
var pageBlocks = toAbstractPageBlocks(zones, usedRulings.getHorizontals(), usedRulings.getVerticals(), xyOrder, usedRulings);
if (xyOrder) {
sortPageBlocksXThenY(pageBlocks);
}
var classificationPage = new ClassificationPage(pageBlocks);
// the page keeps the full rulings passed in, not only the table lines used above
classificationPage.setCleanRulings(rulings);
mergeIntersectingBlocks(classificationPage.getTextBlocks(), 0, 0);
mergeIntersectingBlocks(classificationPage, 0, 0);
return classificationPage;
}
private List<AbstractPageBlock> toAbstractPageBlocks(List<Zone> zones, List<Ruling> horizontalRulings, List<Ruling> verticalRulings, boolean xyOrder) {
/**
 * Sorts the given page blocks in place in two passes.
 * <p>
 * The first pass orders the blocks by minY and then by minX, comparing both values through
 * {@code DoubleUtils.compareDouble} with {@link #THRESHOLD}. The second pass orders blocks whose
 * minY values differ by less than 5 by their minX; all other pairs compare as equal, so the stable
 * sort keeps them in the order produced by the first pass.
 *
 * @param pageBlocks the blocks to sort; the list itself is modified
 */
private static void sortPageBlocksXThenY(List<AbstractPageBlock> pageBlocks) {

    pageBlocks.sort(Comparator.comparing(AbstractPageBlock::getMinY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
            .thenComparing(AbstractPageBlock::getMinX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));

    // The comparator has to be antisymmetric: compare(a, b) and compare(b, a) must have opposite signs.
    // Answering only -1 or 0 breaks that contract and makes the outcome of List.sort undefined.
    pageBlocks.sort((o1, o2) -> {
        boolean sameRow = Math.abs(o1.getMinY() - o2.getMinY()) < 5;
        if (!sameRow) {
            return 0;
        }
        return Double.compare(o1.getMinX(), o2.getMinX());
    });
}
private List<AbstractPageBlock> toAbstractPageBlocks(List<Zone> zones,
List<Ruling> horizontalRulings,
List<Ruling> verticalRulings,
boolean xyOrder,
CleanRulings usedRulings) {
List<AbstractPageBlock> abstractPageBlocks = new ArrayList<>();
zones.forEach(zone -> {
@ -74,21 +95,9 @@ public class DocstrumBlockificationService {
});
});
abstractPageBlocks.addAll(splitZonesAtRulings(textPositionSequences, horizontalRulings, verticalRulings));
abstractPageBlocks.add(buildTextBlock(textPositionSequences, 0));
});
if (xyOrder) {
abstractPageBlocks.sort(Comparator.comparing(AbstractPageBlock::getMinY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
.thenComparing(AbstractPageBlock::getMinX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
abstractPageBlocks.sort(new Comparator<AbstractPageBlock>() {
@Override
public int compare(AbstractPageBlock o1, AbstractPageBlock o2) {
return Math.abs(o1.getMinY() - o2.getMinY()) < 5 && o1.getMinX() < o2.getMinX() == true ? -1 : 0;
}
});
}
return abstractPageBlocks;
}
@ -137,7 +146,7 @@ public class DocstrumBlockificationService {
previous = current;
}
mergeIntersectingBlocks(page.getTextBlocks(), 0, 6.5f);
mergeIntersectingBlocks(page, 0, 6.5f);
}
@ -218,8 +227,9 @@ public class DocstrumBlockificationService {
}
public void mergeIntersectingBlocks(List<AbstractPageBlock> blocks, float xThreshold, float yThreshold) {
public void mergeIntersectingBlocks(ClassificationPage page, float xThreshold, float yThreshold) {
var blocks = page.getTextBlocks();
ListIterator<AbstractPageBlock> itty = blocks.listIterator();
while (itty.hasNext()) {
AbstractPageBlock block = itty.next();
@ -246,6 +256,10 @@ public class DocstrumBlockificationService {
TextPageBlock inner = (TextPageBlock) blocks.get(i);
if (page.getCleanRulings().lineBetween(inner.getBBox(), current.getBBox())) {
continue;
}
if (current.getDir() == inner.getDir() && current.almostIntersects(inner, yThreshold, xThreshold)) {
boolean toDuplicate = current.isToDuplicate() || inner.isToDuplicate();

View File

@ -21,6 +21,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageB
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil;
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations;
@SuppressWarnings("all")
@Service
@ -34,10 +35,11 @@ public class RedactManagerBlockificationService {
* This method must use text direction adjusted positions (DirAdj). Where {0,0} is on the upper left. Never try to change this!
* Rulings (TablePageBlock lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling.
*
* @param textPositions The words of a page.
* @param textPositions The words of a page.
* @param visualizations
* @return Page object that contains the Textblock and text statistics.
*/
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Cell> cells) {
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Cell> cells, LayoutparsingVisualizations visualizations) {
CleanRulings usedRulings = RectangleTransformations.extractRulings(cells);
@ -57,7 +59,7 @@ public class RedactManagerBlockificationService {
boolean splitByX = prev != null && maxX + 50 < word.getMinXDirAdj() && prev.getMinYDirAdj() == word.getMinYDirAdj();
boolean xIsBeforeFirstX = prev != null && word.getMinXDirAdj() < minX;
boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj();
boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, usedRulings.getHorizontal(), usedRulings.getVertical());
boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, usedRulings.getHorizontals(), usedRulings.getVerticals());
boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir());
if (prev != null && (lineSeparation || startFromTop || splitByX || splitByDir || isSplitByRuling)) {
@ -160,6 +162,8 @@ public class RedactManagerBlockificationService {
previous = block;
}
visualizations.addTextBlockVisualizations(chunkBlockList.stream().map(tb -> (TextPageBlock) tb).toList(), textPositions.get(0).getPage());
return new ClassificationPage(chunkBlockList);
}

View File

@ -65,12 +65,18 @@ public class LayoutGridService {
/**
 * Collects the layout grid visualizations of a document together with the document's own
 * visualizations and passes them to {@code viewerDocumentService.addVisualizationsOnPage}.
 * <p>
 * A grid built with {@code false} as last argument is always included. The grid built with
 * {@code true} as last argument is only included when {@code writeVisualLayoutParsingGrid} is set.
 *
 * @param originFile                   forwarded to the viewer document service as source file
 * @param document                     the document whose grids and visualizations are collected
 * @param destinationFile              forwarded to the viewer document service as target file
 * @param layerVisibilityDefaultValue  forwarded to the grid creation
 * @param writeVisualLayoutParsingGrid whether the second grid is added to the output
 */
@SneakyThrows
@Observed(name = "ViewerDocumentService", contextualName = "create-viewer-document")
public void addLayoutGrid(File originFile, Document document, File destinationFile, boolean layerVisibilityDefaultValue) {
public void addLayoutGrid(File originFile, Document document, File destinationFile, boolean layerVisibilityDefaultValue, boolean writeVisualLayoutParsingGrid) {
List<Visualizations> allVisualizations;
Visualizations layoutGrid = this.addLayoutGrid(document, layerVisibilityDefaultValue, false);
Visualizations visualLayoutGrid = this.addLayoutGrid(document, layerVisibilityDefaultValue, true);
List<Visualizations> allVisualizations = Stream.concat(Stream.of(layoutGrid, visualLayoutGrid), document.getVisualizations().streamAll()).toList();
if (writeVisualLayoutParsingGrid) {
// NOTE(review): the third argument presumably selects the visual layout parsing variant of the grid - confirm
Visualizations visualLayoutGrid = this.addLayoutGrid(document, layerVisibilityDefaultValue, true);
allVisualizations = Stream.concat(Stream.of(layoutGrid, visualLayoutGrid), document.getVisualizations().streamAll())
.toList();
} else {
allVisualizations = Stream.concat(Stream.of(layoutGrid), document.getVisualizations().streamAll())
.toList();
}
viewerDocumentService.addVisualizationsOnPage(originFile, destinationFile, allVisualizations);
}
@ -132,7 +138,10 @@ public class LayoutGridService {
}
for (Page page : table.getPages()) {
Optional<Integer> optionalFirstRowOnPage = table.streamCol(0).filter(tableCell -> tableCell.isOnPage(page.getNumber())).map(TableCell::getRow).findFirst();
Optional<Integer> optionalFirstRowOnPage = table.streamCol(0)
.filter(tableCell -> tableCell.isOnPage(page.getNumber()))
.map(TableCell::getRow)
.findFirst();
if (optionalFirstRowOnPage.isEmpty()) {
continue;
}
@ -172,14 +181,17 @@ public class LayoutGridService {
/**
 * Keeps the table cells that are on the given page and maps each of them to the bounding box
 * its bounding box map holds for that page.
 *
 * @param table the table cells to look at
 * @param page  the page the cells must be on
 * @return the bounding boxes of the matching cells on that page
 */
private static Stream<Rectangle2D> streamBBoxOfCellsOnPage(Stream<TableCell> table, Page page) {
return table.filter(tableCell -> tableCell.isOnPage(page.getNumber())).map(TableCell::getBBox).map(bBoxMap -> bBoxMap.get(page));
return table.filter(tableCell -> tableCell.isOnPage(page.getNumber()))
.map(TableCell::getBBox)
.map(bBoxMap -> bBoxMap.get(page));
}
private void addSection(SemanticNode semanticNode, LayoutGrid layoutGrid, Color color) {
Map<Page, Rectangle2D> bBoxMap = semanticNode.getBBox();
List<SemanticNode> subSections = semanticNode.streamAllSubNodesOfType(NodeType.SECTION).toList();
List<SemanticNode> subSections = semanticNode.streamAllSubNodesOfType(NodeType.SECTION)
.toList();
Page firstPage = semanticNode.getFirstPage();
if (!subSections.isEmpty()) {
addPlacedText(firstPage, bBoxMap.get(firstPage), buildTreeIdString(semanticNode), layoutGrid);
@ -198,7 +210,10 @@ public class LayoutGridService {
}
return;
}
List<Page> pagesInOrder = bBoxMap.keySet().stream().sorted(Comparator.comparingInt(Page::getNumber)).collect(Collectors.toList());
List<Page> pagesInOrder = bBoxMap.keySet()
.stream()
.sorted(Comparator.comparingInt(Page::getNumber))
.collect(Collectors.toList());
pagesInOrder.remove(0);
addLinesForFirstPageOfSection(semanticNode, color, firstPage, layoutGrid);
var lastPage = pagesInOrder.remove(pagesInOrder.size() - 1);
@ -295,7 +310,10 @@ public class LayoutGridService {
/**
 * Renders the tree id of a node as text by joining the string form of its entries with ".".
 *
 * @param semanticNode the node whose tree id is rendered
 * @return the entries of the tree id separated by dots
 */
private String buildTreeIdString(SemanticNode semanticNode) {
return semanticNode.getTreeId().stream().map(Object::toString).collect(Collectors.joining("."));
return semanticNode.getTreeId()
.stream()
.map(Object::toString)
.collect(Collectors.joining("."));
}

View File

@ -182,7 +182,7 @@ public class RectangleTransformations {
verticalRulings.add(new Ruling(new Point2D.Float(rectangle.x + rectangle.width, rectangle.y),
new Point2D.Float(rectangle.x + rectangle.width, rectangle.y + rectangle.height)));
});
return CleanRulings.builder().vertical(verticalRulings).horizontal(horizontalRulings).build();
return CleanRulings.builder().verticals(verticalRulings).horizontals(horizontalRulings).build();
}

View File

@ -63,6 +63,10 @@ public class RectangularIntersectionFinder {
&& intersectionPoints.get(btmRight).vertical().equals(intersectionPoints.get(yPoint).vertical())) {
foundRectangles.add(new Rectangle2D.Double(topLeft.getX(), topLeft.getY(), btmRight.getX() - topLeft.getX(), btmRight.getY() - topLeft.getY()));
intersectionPoints.get(topLeft).horizontal().setClassification(Ruling.Classification.TABLE_LINE);
intersectionPoints.get(topLeft).vertical().setClassification(Ruling.Classification.TABLE_LINE);
intersectionPoints.get(btmRight).horizontal().setClassification(Ruling.Classification.TABLE_LINE);
intersectionPoints.get(btmRight).vertical().setClassification(Ruling.Classification.TABLE_LINE);
break outer;
}
}

View File

@ -17,7 +17,9 @@ import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Bound
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Line;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
@ -47,8 +49,16 @@ public class LayoutparsingVisualizations {
static final Color WORDS_COLOR = new Color(68, 84, 147);
static final Color LINES_COLOR = new Color(152, 45, 179);
static final Color ZONES_COLOR = new Color(131, 38, 38);
static final Color RULINGS_COLOR = new Color(21, 221, 174);
static final Color TABLE_RULINGS_COLOR = new Color(255, 175, 175);
static final Color HEADER_RULING_COLOR = new Color(171, 131, 6);
static final Color FOOTER_RULING_COLOR = new Color(106, 82, 2);
static final Color UNDERLINE_RULING_COLOR = new Color(6, 39, 171);
static final Color STRIKETROUGH_RULING_COLOR = new Color(171, 6, 6);
static final Color CELLS_COLOR = new Color(31, 214, 27);
static final Color MAIN_BODY_COLOR = new Color(171, 131, 6);
static final Color MARKED_CONTENT_COLOR = new Color(171, 131, 6);
@ -98,14 +108,15 @@ public class LayoutparsingVisualizations {
if (!active) {
return;
}
List<ColoredRectangle> list = textPositionSequences.stream()
.map(textPositionSequence -> textPositionSequence.getTextPositions()
.stream()
.map(RedTextPosition::getInitialUserSpacePosition)
.collect(RectangleTransformations.collectBBox()))
.map(rect -> new ColoredRectangle(rect, WORDS_COLOR, 1))
.toList();
this.words.getVisualizationsOnPages().put(pageNumber - 1, VisualizationsOnPage.builder().coloredRectangles(list).build());
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(pageNumber, this.words);
visualizationsOnPage.getColoredRectangles()
.addAll(textPositionSequences.stream()
.map(textPositionSequence -> textPositionSequence.getTextPositions()
.stream()
.map(RedTextPosition::getInitialUserSpacePosition)
.collect(RectangleTransformations.collectBBox()))
.map(rect -> new ColoredRectangle(rect, WORDS_COLOR, 1))
.toList());
}
@ -114,14 +125,25 @@ public class LayoutparsingVisualizations {
if (!active) {
return;
}
this.rulings.getVisualizationsOnPages()
.put(pageNumber - 1,
VisualizationsOnPage.builder()
.coloredLines(Stream.of(cleanRulings.getHorizontal(), cleanRulings.getVertical())
.flatMap(Collection::stream)
.map(ruling -> new ColoredLine(ruling, RULINGS_COLOR, 1))
.toList())
.build());
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(pageNumber, this.rulings);
visualizationsOnPage.getColoredLines()
.addAll(Stream.of(cleanRulings.getHorizontals(), cleanRulings.getVerticals())
.flatMap(Collection::stream)
.map(ruling -> new ColoredLine(ruling, decideOnRulingColor(ruling), 1))
.toList());
}
/**
 * Picks the color a ruling is drawn with from its classification.
 * Classifications without a dedicated color fall back to {@code RULINGS_COLOR}.
 *
 * @param ruling the ruling to be drawn
 * @return the color assigned to the ruling's classification
 */
private Color decideOnRulingColor(Ruling ruling) {

    switch (ruling.getClassification()) {
        case TABLE_LINE:
            return TABLE_RULINGS_COLOR;
        case HEADER_SEPARATOR:
            return HEADER_RULING_COLOR;
        case FOOTER_SEPARATOR:
            return FOOTER_RULING_COLOR;
        case UNDERLINE:
            return UNDERLINE_RULING_COLOR;
        case STRIKETROUGH:
            return STRIKETROUGH_RULING_COLOR;
        default:
            return RULINGS_COLOR;
    }
}
@ -130,13 +152,11 @@ public class LayoutparsingVisualizations {
if (!active) {
return;
}
this.cells.getVisualizationsOnPages()
.put(pageNumber - 1,
VisualizationsOnPage.builder()
.coloredRectangles(cells.stream()
.map(ruling -> new ColoredRectangle(ruling, CELLS_COLOR, 1))
.toList())
.build());
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(pageNumber, this.cells);
visualizationsOnPage.getColoredRectangles()
.addAll(cells.stream()
.map(ruling -> new ColoredRectangle(ruling, CELLS_COLOR, 1))
.toList());
}
@ -146,33 +166,50 @@ public class LayoutparsingVisualizations {
return;
}
this.zones.getVisualizationsOnPages()
.put(page - 1,
VisualizationsOnPage.builder()
.coloredRectangles(zones.stream()
.map(BoundingBox::getBBox)
.map(zone -> new ColoredRectangle(zone, ZONES_COLOR, 1))
.toList())
.build());
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(page, this.zones);
visualizationsOnPage.getColoredRectangles()
.addAll(zones.stream()
.map(BoundingBox::getBBox)
.map(zone -> new ColoredRectangle(zone, ZONES_COLOR, 1))
.toList());
}
public void addLineVisualizations(List<Zone> zones, int page) {
/**
 * Gathers the lines of all given zones into one list and adds them as line visualizations for the page.
 *
 * @param zones the zones whose lines are visualized
 * @param page  the page number handed on to {@code addLineVisualizations}
 */
public void addLineVisualizationsFromZones(List<Zone> zones, int page) {

    List<Line> linesOfAllZones = zones.stream()
            .flatMap(zone -> zone.getLines().stream())
            .toList();
    addLineVisualizations(linesOfAllZones, page);
}
/**
 * Adds one rectangle in {@code LINES_COLOR} per line, taken from the line's bounding box, to the
 * line visualizations of the given page. Does nothing while the visualizations are not active.
 *
 * @param lines the lines to visualize
 * @param page  the page number; the visualizations are stored under {@code page - 1}
 */
public void addLineVisualizations(List<Line> lines, int page) {
if (!active) {
return;
}
this.lines.getVisualizationsOnPages()
.put(page - 1,
VisualizationsOnPage.builder()
.coloredRectangles(zones.stream()
.map(Zone::getLines)
.flatMap(Collection::stream)
.map(BoundingBox::getBBox)
.map(line -> new ColoredRectangle(line, LINES_COLOR, 1))
.toList())
.build());
// appends to the page's existing entry instead of replacing it
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(page, this.lines);
visualizationsOnPage.getColoredRectangles()
.addAll(lines.stream()
.map(BoundingBox::getBBox)
.map(line -> new ColoredRectangle(line, LINES_COLOR, 0.3f))
.toList());
}
/**
 * Adds the bounding box of every given text block as a rectangle in {@code ZONES_COLOR} to the
 * zone visualizations of the page. Does nothing while the visualizations are not active.
 *
 * @param textPageBlocks the text blocks to outline
 * @param page           the page number used to look up the page's visualizations
 */
public void addTextBlockVisualizations(List<TextPageBlock> textPageBlocks, int page) {

    if (!active) {
        return;
    }

    VisualizationsOnPage zonesOnPage = getOrCreateVisualizationsOnPage(page, zones);
    List<ColoredRectangle> blockOutlines = textPageBlocks.stream()
            .map(textPageBlock -> new ColoredRectangle(textPageBlock.getBBox(), ZONES_COLOR, 1))
            .toList();
    zonesOnPage.getColoredRectangles().addAll(blockOutlines);
}
@ -181,14 +218,11 @@ public class LayoutparsingVisualizations {
if (!active) {
return;
}
this.mainBody.getVisualizationsOnPages()
.put(pageNumber - 1,
VisualizationsOnPage.builder()
.coloredRectangles(List.of(new ColoredRectangle(new Rectangle2D.Double(rectangle.getTopLeft().getX(),
rectangle.getTopLeft().getY(),
rectangle.getWidth(),
rectangle.getHeight()), MAIN_BODY_COLOR, 1)))
.build());
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(pageNumber, mainBody);
visualizationsOnPage.getColoredRectangles()
.add(new ColoredRectangle(new Rectangle2D.Double(rectangle.getTopLeft().getX(), rectangle.getTopLeft().getY(), rectangle.getWidth(), rectangle.getHeight()),
MAIN_BODY_COLOR,
1));
}
@ -197,9 +231,11 @@ public class LayoutparsingVisualizations {
if (!active) {
return;
}
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(pageNumber, markedContent);
List<MarkedContentUtils.MarkedContentPosition> markedContentBBoxMapBySubType = MarkedContentUtils.getMarkedContentPositions(markedContents, pdPage);
VisualizationsOnPage visualizationsOnPage = VisualizationsOnPage.builder().build();
this.markedContent.getVisualizationsOnPages().put(pageNumber - 1, visualizationsOnPage);
markedContentBBoxMapBySubType.forEach(markedContentPosition -> {
var bbox = markedContentPosition.textPositions()
@ -224,10 +260,8 @@ public class LayoutparsingVisualizations {
return;
}
VisualizationsOnPage neighbourVisualizations = VisualizationsOnPage.builder().build();
neighbours.getVisualizationsOnPages().put(page - 1, neighbourVisualizations);
VisualizationsOnPage characterVisualizations = VisualizationsOnPage.builder().build();
characters.getVisualizationsOnPages().put(page - 1, characterVisualizations);
VisualizationsOnPage characterVisualizations = getOrCreateVisualizationsOnPage(page, characters);
VisualizationsOnPage neighbourVisualizations = getOrCreateVisualizationsOnPage(page, neighbours);
AtomicInteger index = new AtomicInteger(0);
zones.forEach(zone -> zone.getLines()
@ -249,4 +283,15 @@ public class LayoutparsingVisualizations {
}
/**
 * Looks up the visualizations stored for a page and creates an empty entry if there is none yet.
 * Entries are kept under the key {@code page - 1}.
 *
 * @param page           the page number
 * @param visualizations the visualizations holding the per-page entries
 * @return the existing entry for the page, or the newly registered empty one
 */
private VisualizationsOnPage getOrCreateVisualizationsOnPage(int page, Visualizations visualizations) {

    int pageKey = page - 1;
    if (!visualizations.getVisualizationsOnPages().containsKey(pageKey)) {
        VisualizationsOnPage freshEntry = VisualizationsOnPage.builder().build();
        visualizations.getVisualizationsOnPages().put(pageKey, freshEntry);
        return freshEntry;
    }
    return visualizations.getVisualizationsOnPages().get(pageKey);
}
}

View File

@ -96,8 +96,8 @@ public class HeadlinesGoldStandardIntegrationTest {
goldStandardLog.getRedactionLogEntry().removeIf(r -> !r.isRedacted() || r.getChanges().get(r.getChanges().size() - 1).getType().equals(ChangeType.REMOVED));
goldStandardLog.getRedactionLogEntry().forEach(e -> goldStandardHeadlines.add(new Headline(e.getPositions().get(0).getPage(), e.getValue())));
Document documentGraph = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_OLD,
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_OLD,
Document documentGraph = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.DOCUMINE,
layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE,
pdfFileResource.getFile(),
new ImageServiceResponse(),
new TableServiceResponse(),

View File

@ -9,7 +9,6 @@ import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.core.io.ClassPathResource;
@ -34,17 +33,17 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
@Test
public void testLayoutParserEndToEnd() {
String filePath = "files/bdr/Wie weiter bei Kristeneinrichtungen.pdf";
String filePath = "files/SinglePages/VV-931175_Page1.pdf";
runForFile(filePath);
}
@Test
@Disabled
// @Disabled
@SneakyThrows
public void testLayoutParserEndToEndWithFolder() {
String folder = "/home/kschuettler/Dokumente/TestFiles/syn-dm-testfiles-pdftron-ocred";
String folder = "/home/kschuettler/iqser/fforesight/layout-parser/layoutparser/layoutparser-service/layoutparser-service-server/src/test/resources/files/SinglePages";
List<Path> pdfFiles = Files.walk(Path.of(folder))
.filter(path -> path.getFileName().toString().endsWith(".pdf"))
.sorted(Comparator.comparing(Path::getFileName))
@ -70,7 +69,7 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
file = new File(filePath);
}
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(fileName, LayoutParsingType.REDACT_MANAGER, true);
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(fileName, LayoutParsingType.DOCUMINE, true);
prepareStorage(layoutParsingRequest, file);
LayoutParsingFinishedEvent finishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest);

View File

@ -38,7 +38,7 @@ public class ViewerDocumentTest extends BuildDocumentTest {
long start = System.currentTimeMillis();
Document document = buildGraph(fileName, LayoutParsingType.REDACT_MANAGER);
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true);
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true, false);
System.out.printf("Total time: %.2fs%n", ((float) (System.currentTimeMillis() - start)) / 1000);
}
@ -56,7 +56,7 @@ public class ViewerDocumentTest extends BuildDocumentTest {
var tableResponse = mapper.readValue(new ClassPathResource(tableFileName).getInputStream(), TableServiceResponse.class);
var documentFile = new ClassPathResource(fileName).getFile();
var classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE,
var classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE_OLD,
documentFile,
new ImageServiceResponse(),
tableResponse,
@ -64,9 +64,9 @@ public class ViewerDocumentTest extends BuildDocumentTest {
Map.of("file", Path.of(fileName).getFileName().toFile().toString()));
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null);
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
Document document = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.DOCUMINE, classificationDocument);
Document document = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.DOCUMINE_OLD, classificationDocument);
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true);
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true, false);
}
}

View File

@ -0,0 +1,38 @@
package com.knecon.fforesight.service.layoutparser.server.model;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.util.List;
import org.junit.jupiter.api.Test;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
/**
 * Tests {@code CleanRulings.lineBetween} with one horizontal and one vertical ruling.
 */
class CleanRulingsTest {

    /**
     * Uses a vertical ruling at x = 10 (y from 0 to 10) and a horizontal ruling at y = 5 (x from 0 to 10)
     * and checks for six 3x3 rectangles whether a line is reported between the first one and each of them.
     */
    @Test
    public void testLineBetween() {

        Ruling verticalAtX10 = new Ruling(new Point2D.Double(10, 0), new Point2D.Double(10, 10));
        Ruling horizontalAtY5 = new Ruling(new Point2D.Double(0, 5), new Point2D.Double(10, 5));
        CleanRulings cleanRulings = new CleanRulings(List.of(horizontalAtY5), List.of(verticalAtX10));

        // rectangles starting at y = 6
        Rectangle2D leftAtY6 = new Rectangle2D.Double(1, 6, 3, 3);
        Rectangle2D midAtY6 = new Rectangle2D.Double(5, 6, 3, 3);
        Rectangle2D rightAtY6 = new Rectangle2D.Double(11, 6, 3, 3);

        // rectangles starting at y = 1
        Rectangle2D leftAtY1 = new Rectangle2D.Double(1, 1, 3, 3);
        Rectangle2D midAtY1 = new Rectangle2D.Double(5, 1, 3, 3);
        Rectangle2D rightAtY1 = new Rectangle2D.Double(11, 1, 3, 3);

        assertFalse(cleanRulings.lineBetween(leftAtY6, leftAtY6));
        assertFalse(cleanRulings.lineBetween(leftAtY6, midAtY6));
        assertTrue(cleanRulings.lineBetween(leftAtY6, rightAtY6));
        assertTrue(cleanRulings.lineBetween(leftAtY6, leftAtY1));
        assertTrue(cleanRulings.lineBetween(leftAtY6, midAtY1));
        assertTrue(cleanRulings.lineBetween(leftAtY6, rightAtY1));
    }

}

View File

@ -0,0 +1,62 @@
package com.knecon.fforesight.service.layoutparser.server.model;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.util.List;
import org.junit.jupiter.api.Test;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
/**
 * Tests {@code CleanRulings.lineBetween} with two vertical rulings and one horizontal ruling.
 */
public class RulingTest {

    /**
     * Uses vertical rulings at x = 10 (y from 0 to 10) and x = 5 (y from 0 to 5) plus a horizontal
     * ruling at y = 5 (x from 0 to 10). Checks the result for pairs of six 3x3 rectangles and
     * verifies that the result does not depend on the argument order.
     */
    @Test
    public void testLineBetween() {

        Ruling verticalAtX10 = new Ruling(new Point2D.Double(10, 0), new Point2D.Double(10, 10));
        Ruling verticalAtX5 = new Ruling(new Point2D.Double(5, 0), new Point2D.Double(5, 5));
        Ruling horizontalAtY5 = new Ruling(new Point2D.Double(0, 5), new Point2D.Double(10, 5));
        CleanRulings cleanRulings = new CleanRulings(List.of(horizontalAtY5), List.of(verticalAtX10, verticalAtX5));

        // rectangles starting at y = 6
        Rectangle2D leftAtY6 = new Rectangle2D.Double(1, 6, 3, 3);
        Rectangle2D midAtY6 = new Rectangle2D.Double(5, 6, 3, 3);
        Rectangle2D rightAtY6 = new Rectangle2D.Double(11, 6, 3, 3);

        // rectangles starting at y = 1
        Rectangle2D leftAtY1 = new Rectangle2D.Double(1, 1, 3, 3);
        Rectangle2D midAtY1 = new Rectangle2D.Double(5, 1, 3, 3);
        Rectangle2D rightAtY1 = new Rectangle2D.Double(11, 1, 3, 3);

        // pairs starting from the left rectangle at y = 6
        assertFalse(cleanRulings.lineBetween(leftAtY6, leftAtY6));
        assertFalse(cleanRulings.lineBetween(leftAtY6, midAtY6));
        assertTrue(cleanRulings.lineBetween(leftAtY6, rightAtY6));
        assertTrue(cleanRulings.lineBetween(leftAtY6, leftAtY1));
        assertTrue(cleanRulings.lineBetween(leftAtY6, midAtY1));
        assertTrue(cleanRulings.lineBetween(leftAtY6, rightAtY1));

        // pairs starting from the left rectangle at y = 1
        assertFalse(cleanRulings.lineBetween(leftAtY1, leftAtY1));
        assertTrue(cleanRulings.lineBetween(leftAtY1, midAtY6));
        assertTrue(cleanRulings.lineBetween(leftAtY1, rightAtY6));
        assertTrue(cleanRulings.lineBetween(leftAtY1, leftAtY6));
        assertTrue(cleanRulings.lineBetween(leftAtY1, midAtY1));
        assertTrue(cleanRulings.lineBetween(leftAtY1, rightAtY1));

        // pairs starting from the right rectangle at y = 6
        assertFalse(cleanRulings.lineBetween(rightAtY6, rightAtY6));
        assertTrue(cleanRulings.lineBetween(rightAtY6, midAtY6));
        assertTrue(cleanRulings.lineBetween(rightAtY6, leftAtY1));
        assertTrue(cleanRulings.lineBetween(rightAtY6, leftAtY6));
        assertTrue(cleanRulings.lineBetween(rightAtY6, midAtY1));
        assertFalse(cleanRulings.lineBetween(rightAtY6, rightAtY1));

        // the result must be the same for both argument orders
        List<Rectangle2D> rectangles = List.of(leftAtY6, midAtY6, rightAtY6, leftAtY1, midAtY1, rightAtY1);
        for (Rectangle2D first : rectangles) {
            for (Rectangle2D second : rectangles) {
                assertEquals(cleanRulings.lineBetween(first, second), cleanRulings.lineBetween(second, first));
            }
        }
    }

}

View File

@ -62,7 +62,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
@SneakyThrows
public ClassificationDocument buildClassificationDocument(File originDocument, TableServiceResponse tableServiceResponse) {
ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_OLD,
ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
originDocument,
new ImageServiceResponse(),
tableServiceResponse,

View File

@ -53,7 +53,7 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
List<List<Rectangle2D>> rectanglesPerPage = new LinkedList<>();
for (PageContents pageContent : pageContents) {
CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(Collections.emptyList(), pageContent.getRulings());
List<Rectangle2D> rects = RectangularIntersectionFinder.find(cleanRulings.getHorizontal(), cleanRulings.getVertical());
List<Rectangle2D> rects = RectangularIntersectionFinder.find(cleanRulings.getHorizontals(), cleanRulings.getVerticals());
rectanglesPerPage.add(rects);
}
@ -74,7 +74,7 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
for (PageContents pageContent : pageContents) {
cleanRulingsPerPage.add(rulingCleaningService.getCleanRulings(Collections.emptyList(), pageContent.getRulings()));
}
var cleanRulings = cleanRulingsPerPage.stream().map(CleanRulings::getVertical).collect(Collectors.toList());
var cleanRulings = cleanRulingsPerPage.stream().map(CleanRulings::getVerticals).collect(Collectors.toList());
PdfDraw.drawLinesPerPage(fileName, cleanRulings, lineFileName);
}

View File

@ -114,7 +114,7 @@ public abstract class AbstractTest {
.originFileStorageId(fileName + ORIGIN_FILE_ID)
.tablesFileStorageId(Optional.of(fileName + TABLE_FILE_ID))
.imagesFileStorageId(Optional.of(fileName + IMAGE_FILE_ID))
.visualLayoutParsingFileId(Optional.of(fileName + VISUAL_LAYOUT_FILE))
.visualLayoutParsingFileId(Optional.empty())
.structureFileStorageId(fileName + STRUCTURE_FILE_ID)
.textBlockFileStorageId(fileName + TEXT_FILE_ID)
.positionBlockFileStorageId(fileName + POSITION_FILE_ID)
@ -190,7 +190,9 @@ public abstract class AbstractTest {
storageService.storeObject(TenantContext.getTenantId(), layoutParsingRequest.imagesFileStorageId().get(), imageInfoStream);
storageService.storeObject(TenantContext.getTenantId(), layoutParsingRequest.tablesFileStorageId().get(), cvServiceResponseFileStream);
storageService.storeObject(TenantContext.getTenantId(), layoutParsingRequest.originFileStorageId(), fileStream);
storageService.storeObject(TenantContext.getTenantId(), layoutParsingRequest.visualLayoutParsingFileId().get(), visualLayoutParsingResponseFileStream);
if (layoutParsingRequest.visualLayoutParsingFileId().isPresent()) {
storageService.storeObject(TenantContext.getTenantId(), layoutParsingRequest.visualLayoutParsingFileId().get(), visualLayoutParsingResponseFileStream);
}
}

View File

@ -41,6 +41,7 @@ public class ContentStreams {
public static Identifier MARKED_CONTENT = new Identifier("Marked content", COSName.getPDFName("KNECON_MARKED_CONTENT"), true);
public static Identifier NEIGHBOURS = new Identifier("Neighbours", COSName.getPDFName("KNECON_NEIGHBOURS"), true);
public static Identifier CHARACTERS = new Identifier("Characters", COSName.getPDFName("KNECON_CHARACTERS"), true);
public static List<Identifier> allContentStreams = List.of(KNECON_LAYOUT,