RED-8825: added split by ruling into every step of docstrum
This commit is contained in:
parent
6a691183dc
commit
e4663ac8db
@ -5,6 +5,7 @@ public enum LayoutParsingType {
|
||||
REDACT_MANAGER_OLD,
|
||||
REDACT_MANAGER_PARAGRAPH_DEBUG,
|
||||
DOCUMINE,
|
||||
DOCUMINE_OLD,
|
||||
CLARIFYND,
|
||||
CLARIFYND_PARAGRAPH_DEBUG
|
||||
}
|
||||
|
||||
@ -101,24 +101,20 @@ public class LayoutParsingPipeline {
|
||||
log.info("Starting layout parsing for {}", layoutParsingRequest.identifier());
|
||||
|
||||
File originFile = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId());
|
||||
File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId())
|
||||
.orElse(originFile);
|
||||
File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId()).orElse(originFile);
|
||||
|
||||
VisualLayoutParsingResponse visualLayoutParsingResponse = new VisualLayoutParsingResponse();
|
||||
if (layoutParsingRequest.visualLayoutParsingFileId()
|
||||
.isPresent()) {
|
||||
if (layoutParsingRequest.visualLayoutParsingFileId().isPresent()) {
|
||||
visualLayoutParsingResponse = layoutParsingStorageService.getVisualLayoutParsingFile(layoutParsingRequest.visualLayoutParsingFileId().get());
|
||||
}
|
||||
|
||||
ImageServiceResponse imageServiceResponse = new ImageServiceResponse();
|
||||
if (layoutParsingRequest.imagesFileStorageId()
|
||||
.isPresent()) {
|
||||
if (layoutParsingRequest.imagesFileStorageId().isPresent()) {
|
||||
imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId().get());
|
||||
}
|
||||
|
||||
TableServiceResponse tableServiceResponse = new TableServiceResponse();
|
||||
if (layoutParsingRequest.tablesFileStorageId()
|
||||
.isPresent()) {
|
||||
if (layoutParsingRequest.tablesFileStorageId().isPresent()) {
|
||||
tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId().get());
|
||||
}
|
||||
|
||||
@ -135,7 +131,7 @@ public class LayoutParsingPipeline {
|
||||
|
||||
log.info("Creating viewer document for {}", layoutParsingRequest.identifier());
|
||||
|
||||
layoutGridService.addLayoutGrid(viewerDocumentFile, documentGraph, viewerDocumentFile, false);
|
||||
layoutGridService.addLayoutGrid(viewerDocumentFile, documentGraph, viewerDocumentFile, false, layoutParsingRequest.visualLayoutParsingFileId().isPresent());
|
||||
|
||||
log.info("Storing resulting files for {}", layoutParsingRequest.identifier());
|
||||
|
||||
@ -251,12 +247,12 @@ public class LayoutParsingPipeline {
|
||||
stripper.setStartPage(pageNumber);
|
||||
stripper.setEndPage(pageNumber);
|
||||
stripper.setPdpage(pdPage);
|
||||
if (layoutParsingType.equals(LayoutParsingType.DOCUMINE)) {
|
||||
if (layoutParsingType.equals(LayoutParsingType.DOCUMINE_OLD)) {
|
||||
stripper.setSortByPosition(true);
|
||||
}
|
||||
stripper.getText(originDocument);
|
||||
|
||||
classificationDocument.getVisualizations().addTextVisualizations(stripper.getTextPositionSequences(), pageNumber);
|
||||
List<TextPositionSequence> words = stripper.getTextPositionSequences();
|
||||
classificationDocument.getVisualizations().addTextVisualizations(words, pageNumber);
|
||||
|
||||
PDRectangle pdr = pdPage.getMediaBox();
|
||||
|
||||
@ -266,9 +262,7 @@ public class LayoutParsingPipeline {
|
||||
PDRectangle cropbox = pdPage.getCropBox();
|
||||
CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(pdfTableCells.get(pageNumber), stripper.getRulings());
|
||||
|
||||
classificationDocument.getVisualizations().addCleanRulingVisualization(cleanRulings, pageNumber);
|
||||
|
||||
List<Cell> emptyTableCells = TableExtractionService.findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
List<Cell> emptyTableCells = TableExtractionService.findCells(cleanRulings.getHorizontals(), cleanRulings.getVerticals());
|
||||
|
||||
var graphics = graphicExtractorService.extractPathElementGraphics(originDocument,
|
||||
pdPage,
|
||||
@ -287,12 +281,11 @@ public class LayoutParsingPipeline {
|
||||
|
||||
ClassificationPage classificationPage = switch (layoutParsingType) {
|
||||
case REDACT_MANAGER_OLD ->
|
||||
redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells);
|
||||
case DOCUMINE -> docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
case REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG ->
|
||||
docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, true, classificationDocument.getVisualizations());
|
||||
case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG ->
|
||||
docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, false, classificationDocument.getVisualizations());
|
||||
redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, classificationDocument.getVisualizations());
|
||||
case DOCUMINE_OLD -> docuMineBlockificationService.blockify(words, cleanRulings.getHorizontals(), cleanRulings.getVerticals());
|
||||
case DOCUMINE, REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG ->
|
||||
docstrumBlockificationService.blockify(words, cleanRulings, true, classificationDocument.getVisualizations());
|
||||
case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> docstrumBlockificationService.blockify(words, cleanRulings, false, classificationDocument.getVisualizations());
|
||||
};
|
||||
|
||||
classificationPage.setCleanRulings(cleanRulings);
|
||||
@ -313,7 +306,7 @@ public class LayoutParsingPipeline {
|
||||
}
|
||||
|
||||
if (signatures.containsKey(pageNumber)) {
|
||||
if (classificationPage.getImages() == null || classificationPage.getImages().size() == 0) {
|
||||
if (classificationPage.getImages() == null || classificationPage.getImages().isEmpty()) {
|
||||
classificationPage.setImages(signatures.get(pageNumber));
|
||||
} else {
|
||||
classificationPage.getImages().addAll(signatures.get(pageNumber));
|
||||
@ -325,7 +318,7 @@ public class LayoutParsingPipeline {
|
||||
if (layoutParsingType == LayoutParsingType.REDACT_MANAGER) {
|
||||
docstrumBlockificationService.combineBlocks(classificationPage);
|
||||
} else if (layoutParsingType == LayoutParsingType.CLARIFYND) {
|
||||
docstrumBlockificationService.mergeIntersectingBlocks(classificationPage.getTextBlocks(), 0, 6.5f);
|
||||
docstrumBlockificationService.mergeIntersectingBlocks(classificationPage, 0, 6.5f);
|
||||
}
|
||||
|
||||
buildPageStatistics(classificationPage);
|
||||
@ -338,11 +331,14 @@ public class LayoutParsingPipeline {
|
||||
|
||||
log.info("Calculating BodyTextFrame for {}", identifier);
|
||||
bodyTextFrameService.setBodyTextFrames(classificationDocument, layoutParsingType);
|
||||
for (ClassificationPage page : classificationDocument.getPages()) {
|
||||
classificationDocument.getVisualizations().addCleanRulingVisualization(page.getCleanRulings(), page.getPageNumber());
|
||||
}
|
||||
log.info("Classify TextBlocks for {}", identifier);
|
||||
switch (layoutParsingType) {
|
||||
case REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_OLD, CLARIFYND_PARAGRAPH_DEBUG ->
|
||||
redactManagerClassificationService.classifyDocument(classificationDocument);
|
||||
case DOCUMINE -> docuMineClassificationService.classifyDocument(classificationDocument);
|
||||
case DOCUMINE_OLD, DOCUMINE -> docuMineClassificationService.classifyDocument(classificationDocument);
|
||||
case CLARIFYND -> clarifyndClassificationService.classifyDocument(classificationDocument);
|
||||
}
|
||||
|
||||
|
||||
@ -14,9 +14,11 @@ import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.Nea
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.ReadingOrderService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.SpacingService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.ZoneBuilderService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
|
||||
@ -31,31 +33,37 @@ public class DocstrumSegmentationService {
|
||||
private final ReadingOrderService readingOrderService;
|
||||
|
||||
|
||||
public List<Zone> segmentPage(List<TextPositionSequence> textPositions, boolean xyOrder) {
|
||||
public List<Zone> segmentPage(List<TextPositionSequence> textPositions, boolean xyOrder, CleanRulings usedRulings, LayoutparsingVisualizations visualizations) {
|
||||
|
||||
List<Zone> zones = new ArrayList<>();
|
||||
zones.addAll(computeZones(textPositions, TextDirection.ZERO));
|
||||
zones.addAll(computeZones(textPositions, TextDirection.QUARTER_CIRCLE));
|
||||
zones.addAll(computeZones(textPositions, TextDirection.HALF_CIRCLE));
|
||||
zones.addAll(computeZones(textPositions, TextDirection.THREE_QUARTER_CIRCLE));
|
||||
zones.addAll(computeZones(textPositions, usedRulings, visualizations, TextDirection.ZERO));
|
||||
zones.addAll(computeZones(textPositions, usedRulings, visualizations, TextDirection.QUARTER_CIRCLE));
|
||||
zones.addAll(computeZones(textPositions, usedRulings, visualizations, TextDirection.HALF_CIRCLE));
|
||||
zones.addAll(computeZones(textPositions, usedRulings, visualizations, TextDirection.THREE_QUARTER_CIRCLE));
|
||||
|
||||
return readingOrderService.resolve(zones, xyOrder);
|
||||
}
|
||||
|
||||
|
||||
private List<Zone> computeZones(List<TextPositionSequence> textPositions, TextDirection direction) {
|
||||
private List<Zone> computeZones(List<TextPositionSequence> textPositions, CleanRulings rulings, LayoutparsingVisualizations visualizations, TextDirection direction) {
|
||||
|
||||
List<RedTextPosition> positions = textPositions.stream().filter(t -> t.getDir() == direction).map(TextPositionSequence::getTextPositions).flatMap(List::stream).toList();
|
||||
List<RedTextPosition> positions = textPositions.stream()
|
||||
.filter(t -> t.getDir() == direction)
|
||||
.map(TextPositionSequence::getTextPositions)
|
||||
.flatMap(List::stream)
|
||||
.toList();
|
||||
|
||||
List<Character> characters = positions.stream().map(Character::new).collect(Collectors.toList());
|
||||
List<Character> characters = positions.stream()
|
||||
.map(Character::new)
|
||||
.collect(Collectors.toList());
|
||||
|
||||
nearestNeighbourService.findNearestNeighbors(characters);
|
||||
|
||||
double characterSpacing = spacingService.computeCharacterSpacing(characters);
|
||||
double lineSpacing = Math.min(spacingService.computeLineSpacing(characters), 20);
|
||||
|
||||
List<Line> lines = lineBuilderService.buildLines(characters, characterSpacing, lineSpacing);
|
||||
return zoneBuilderService.buildZones(lines, characterSpacing, lineSpacing);
|
||||
List<Line> lines = lineBuilderService.buildLines(characters, characterSpacing, lineSpacing, rulings);
|
||||
return zoneBuilderService.buildZones(lines, characterSpacing, lineSpacing, rulings);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -4,6 +4,7 @@ import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.FastAtan2;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
@ -73,7 +74,7 @@ public class Line extends BoundingBox {
|
||||
|
||||
public double getAngle() {
|
||||
|
||||
return Math.atan2(y1 - y0, x1 - x0);
|
||||
return FastAtan2.fastAtan2(y1 - y0, x1 - x0);
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -16,7 +16,7 @@ public class Zone extends BoundingBox {
|
||||
@SuppressWarnings("PMD.ConstructorCallsOverridableMethod")
|
||||
public Zone(List<Line> lines) {
|
||||
|
||||
lines.sort(Comparator.comparingDouble(Line::getY));
|
||||
lines.sort(Comparator.comparingDouble(Line::getY0));
|
||||
this.lines = lines;
|
||||
buildBBox();
|
||||
}
|
||||
|
||||
@ -10,34 +10,39 @@ import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Angle
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Character;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Line;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.UnionFind;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
|
||||
@Service
|
||||
public class LineBuilderService {
|
||||
|
||||
private static final double CHARACTER_SPACING_DISTANCE_MULTIPLIER = 3.5;
|
||||
private static final double MAX_VERTICAL_CHARACTER_DISTANCE = 0.67;
|
||||
private static final double LINE_SPACING_THRESHOLD_MULTIPLIER = 0.67;
|
||||
private static final double ANGLE_TOLERANCE = Math.PI / 6;
|
||||
|
||||
|
||||
public List<Line> buildLines(List<Character> characters, double characterSpacing, double lineSpacing) {
|
||||
public List<Line> buildLines(List<Character> characters, double characterSpacing, double lineSpacing, CleanRulings rulings) {
|
||||
|
||||
double maxHorizontalDistance = characterSpacing * CHARACTER_SPACING_DISTANCE_MULTIPLIER;
|
||||
double maxVerticalDistance = lineSpacing * MAX_VERTICAL_CHARACTER_DISTANCE;
|
||||
double maxVerticalDistance = lineSpacing * LINE_SPACING_THRESHOLD_MULTIPLIER;
|
||||
|
||||
UnionFind<Character> unionFind = new UnionFind<>(new HashSet<>(characters));
|
||||
|
||||
AngleFilter filter = new AngleFilter(-ANGLE_TOLERANCE, ANGLE_TOLERANCE);
|
||||
AngleFilter angleFilter = new AngleFilter(-ANGLE_TOLERANCE, ANGLE_TOLERANCE);
|
||||
|
||||
characters.forEach(character -> {
|
||||
character.getNeighbors()
|
||||
.forEach(neighbor -> {
|
||||
double normalizedHorizontalDistance = neighbor.getHorizontalDistance() / maxHorizontalDistance;
|
||||
double normalizedVerticalDistance = neighbor.getVerticalDistance() / maxVerticalDistance;
|
||||
if (character.getTextPosition().getDir() == neighbor.getCharacter().getTextPosition().getDir() //
|
||||
&& filter.matches(neighbor) //
|
||||
&& Math.pow(normalizedHorizontalDistance, 2) + Math.pow(normalizedVerticalDistance, 2) <= 1) {
|
||||
unionFind.union(character, neighbor.getCharacter());
|
||||
|
||||
if (character.getTextPosition().getDir() != neighbor.getCharacter().getTextPosition().getDir() //
|
||||
|| !angleFilter.matches(neighbor) //
|
||||
|| Math.pow(normalizedHorizontalDistance, 2) + Math.pow(normalizedVerticalDistance, 2) > 1 //
|
||||
|| rulings.lineBetween(character, neighbor.getCharacter())) {
|
||||
return;
|
||||
}
|
||||
|
||||
unionFind.union(character, neighbor.getCharacter());
|
||||
});
|
||||
});
|
||||
|
||||
|
||||
@ -5,6 +5,7 @@ import java.util.Comparator;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
@ -12,6 +13,7 @@ import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Chara
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Line;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.UnionFind;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
|
||||
@Service
|
||||
public class ZoneBuilderService {
|
||||
@ -29,12 +31,10 @@ public class ZoneBuilderService {
|
||||
|
||||
private static final double ANGLE_TOLERANCE = Math.PI / 6;
|
||||
|
||||
private static final int MAX_ZONES = 300;
|
||||
|
||||
private static final double MAX_VERTICAL_MERGE_DISTANCE = 0.5;
|
||||
|
||||
|
||||
public List<Zone> buildZones(List<Line> lines, double characterSpacing, double lineSpacing) {
|
||||
public List<Zone> buildZones(List<Line> lines, double characterSpacing, double lineSpacing, CleanRulings rulings) {
|
||||
|
||||
double minHorizontalDistance = characterSpacing * MIN_HORIZONTAL_DISTANCE_MULTIPLIER;
|
||||
double maxVerticalDistance = lineSpacing * MAX_VERTICAL_DISTANCE_MULTIPLIER;
|
||||
@ -60,29 +60,23 @@ public class ZoneBuilderService {
|
||||
double horizontalDistance = outerLine.horizontalDistance(innerLine) / scale;
|
||||
double verticalDistance = outerLine.verticalDistance(innerLine) / scale;
|
||||
|
||||
if (minHorizontalDistance <= horizontalDistance && verticalDistance <= maxVerticalDistance //
|
||||
|| minHorizontalMergeDistance <= horizontalDistance && verticalDistance <= maxVerticalMergeDistance) {
|
||||
|
||||
unionFind.union(outerLine, innerLine);
|
||||
if ((!(minHorizontalDistance <= horizontalDistance) || !(verticalDistance <= maxVerticalDistance)) //
|
||||
&& (!(minHorizontalMergeDistance <= horizontalDistance) || !(verticalDistance <= maxVerticalMergeDistance))) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (rulings.lineBetween(outerLine.getBBox(), innerLine.getBBox())) {
|
||||
return;
|
||||
}
|
||||
unionFind.union(outerLine, innerLine);
|
||||
|
||||
});
|
||||
});
|
||||
|
||||
List<Zone> zones = unionFind.getGroups()
|
||||
return unionFind.getGroups()
|
||||
.stream()
|
||||
.map(group -> mergeLinesInZone(new ArrayList<>(group), characterSpacing, lineSpacing))
|
||||
.toList();
|
||||
|
||||
if (zones.size() > MAX_ZONES) {
|
||||
List<Line> oneZoneLines = new ArrayList<>();
|
||||
for (Zone zone : zones) {
|
||||
oneZoneLines.addAll(zone.getLines());
|
||||
}
|
||||
return List.of(mergeLinesInZone(oneZoneLines, characterSpacing, lineSpacing));
|
||||
}
|
||||
|
||||
return zones;
|
||||
}
|
||||
|
||||
|
||||
@ -157,7 +151,7 @@ public class ZoneBuilderService {
|
||||
outputZone.add(new Line(characters, characterSpacing));
|
||||
}
|
||||
|
||||
return new Zone(outputZone);
|
||||
return new Zone(outputZone.stream().sorted(Comparator.comparing(Line::getY0)).collect(Collectors.toList()));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,5 +1,7 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
@ -15,6 +17,8 @@ import lombok.NoArgsConstructor;
|
||||
@EqualsAndHashCode(callSuper = true)
|
||||
public abstract class AbstractPageBlock extends Rectangle {
|
||||
|
||||
protected Rectangle2D bBox; // in initial user space
|
||||
|
||||
@JsonIgnore
|
||||
protected float minX;
|
||||
@JsonIgnore
|
||||
|
||||
@ -1,15 +1,80 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.table;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Character;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
public class CleanRulings {
|
||||
|
||||
List<Ruling> horizontal;
|
||||
List<Ruling> vertical;
|
||||
List<Ruling> horizontals;
|
||||
List<Ruling> verticals;
|
||||
|
||||
|
||||
public CleanRulings getTableLines() {
|
||||
|
||||
return new CleanRulings(horizontals.stream()
|
||||
.filter(ruling -> ruling.getClassification().equals(Ruling.Classification.TABLE_LINE))
|
||||
.toList(),
|
||||
verticals.stream()
|
||||
.filter(ruling -> ruling.getClassification().equals(Ruling.Classification.TABLE_LINE))
|
||||
.toList());
|
||||
}
|
||||
|
||||
|
||||
public boolean lineBetween(Character a, Character b) {
|
||||
|
||||
return lineBetween(a.getTextPosition().getInitialUserSpacePosition(), b.getTextPosition().getInitialUserSpacePosition());
|
||||
}
|
||||
|
||||
|
||||
public boolean lineBetween(Rectangle2D a, Rectangle2D b) {
|
||||
|
||||
if (a.intersects(b)) {
|
||||
return false;
|
||||
}
|
||||
return lineBetween(new Point2D.Double(a.getCenterX(), a.getCenterY()), new Point2D.Double(b.getCenterX(), b.getCenterY()));
|
||||
}
|
||||
|
||||
|
||||
public boolean lineBetween(Point2D p1, Point2D p2) {
|
||||
|
||||
Ruling ruling = new Ruling(p1, p2);
|
||||
|
||||
if (ruling.isHorizontal()) {
|
||||
return verticals.stream()
|
||||
.anyMatch(vertical -> vertical.intersectsLine(ruling));
|
||||
|
||||
}
|
||||
|
||||
if (ruling.isVertical()) {
|
||||
return horizontals.stream()
|
||||
.anyMatch(horizontal -> horizontal.intersectsLine(ruling));
|
||||
|
||||
}
|
||||
|
||||
return buildAll().stream()
|
||||
.anyMatch(other -> other.intersectsLine(ruling));
|
||||
}
|
||||
|
||||
|
||||
public List<Ruling> buildAll() {
|
||||
|
||||
ArrayList<Ruling> rulings = new ArrayList<>(horizontals.size() + verticals.size());
|
||||
rulings.addAll(horizontals);
|
||||
rulings.addAll(verticals);
|
||||
return rulings;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -10,6 +10,8 @@ import java.util.List;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.CohenSutherlandClipping;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.DoubleComparisons;
|
||||
|
||||
import lombok.Getter;
|
||||
import lombok.Setter;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@ -19,10 +21,24 @@ public class Ruling extends Line2D.Float {
|
||||
public static final int PERPENDICULAR_UNIT_EXPAND_AMOUNT = 2;
|
||||
public static final int COLINEAR_OR_PARALLEL_UNIT_EXPAND_AMOUNT = 2;
|
||||
|
||||
/**
 * Role assigned to a ruling. A newly constructed ruling starts out as {@link #OTHER}.
 */
public enum Classification {

    /** Ruling that belongs to a table; {@code CleanRulings.getTableLines()} selects exactly these. */
    TABLE_LINE,

    /** Presumably a line drawn beneath text; not assigned anywhere in the visible code. */
    UNDERLINE,

    /**
     * Presumably a line drawn through text (strike-through); not assigned anywhere in the
     * visible code. The constant name is kept as spelled because it is part of the public API.
     */
    STRIKETROUGH,

    /** Set by {@code BodyTextFrameService} on horizontal rulings it treats as header separators. */
    HEADER_SEPARATOR,

    /** Set by {@code BodyTextFrameService} on horizontal rulings it treats as footer separators. */
    FOOTER_SEPARATOR,

    /** Default classification for rulings with no more specific role. */
    OTHER
}
|
||||
|
||||
@Getter
|
||||
@Setter
|
||||
private Classification classification;
|
||||
|
||||
|
||||
/**
 * Creates a ruling running from {@code p1} to {@code p2}. The classification is initialised to
 * {@link Classification#OTHER} and can be changed later through the setter.
 *
 * @param p1 start point of the ruling
 * @param p2 end point of the ruling
 */
public Ruling(Point2D p1, Point2D p2) {

    super(p1, p2);
    classification = Classification.OTHER;
}
|
||||
|
||||
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.table;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
@ -129,8 +130,7 @@ public class TablePageBlock extends AbstractPageBlock {
|
||||
List<Cell> cellsToTheTop = new ArrayList<>();
|
||||
for (int i = 0; i < rowIndex; i++) {
|
||||
try {
|
||||
cellsToTheTop.add(rows.get(i)
|
||||
.get(colIndex));
|
||||
cellsToTheTop.add(rows.get(i).get(colIndex));
|
||||
} catch (IndexOutOfBoundsException e) {
|
||||
log.debug("No cell {} in row {}, ignoring.", colIndex, rowIndex);
|
||||
}
|
||||
@ -145,8 +145,7 @@ public class TablePageBlock extends AbstractPageBlock {
|
||||
if (lastHeaderCell != null) {
|
||||
cell.getHeaderCells().add(lastHeaderCell);
|
||||
}
|
||||
if (!cell.getTextBlocks().isEmpty() && cell.getTextBlocks()
|
||||
.get(0).getMostPopularWordStyle().equals("bold")) {
|
||||
if (!cell.getTextBlocks().isEmpty() && cell.getTextBlocks().get(0).getMostPopularWordStyle().equals("bold")) {
|
||||
cell.setHeaderCell(true);
|
||||
}
|
||||
}
|
||||
@ -210,8 +209,7 @@ public class TablePageBlock extends AbstractPageBlock {
|
||||
|
||||
for (int i = 0; i < rowsOfCellsMatrix.size(); i++) {
|
||||
for (int j = 0; j < rowsOfCellsMatrix.get(i).size(); j++) {
|
||||
addCellToRowAndCol(rowsOfCellsMatrix.get(i)
|
||||
.get(j), i, j);
|
||||
addCellToRowAndCol(rowsOfCellsMatrix.get(i).get(j), i, j);
|
||||
}
|
||||
}
|
||||
|
||||
@ -413,6 +411,16 @@ public class TablePageBlock extends AbstractPageBlock {
|
||||
}
|
||||
|
||||
|
||||
public Rectangle2D getBBox() {
|
||||
|
||||
if (this.bBox == null) {
|
||||
this.bBox = cells.stream()
|
||||
.collect(RectangleTransformations.collectBBox());
|
||||
}
|
||||
return this.bBox;
|
||||
}
|
||||
|
||||
|
||||
record CellWithIntersection(Cell originalCell, double intersectedArea) {
|
||||
|
||||
}
|
||||
|
||||
@ -2,6 +2,7 @@ package com.knecon.fforesight.service.layoutparser.processor.model.text;
|
||||
|
||||
import static java.util.stream.Collectors.toSet;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
@ -11,6 +12,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBl
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.DoubleComparisons;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
@ -80,7 +82,10 @@ public class TextPageBlock extends AbstractPageBlock {
|
||||
|
||||
public static TextPageBlock merge(List<TextPageBlock> textBlocksToMerge) {
|
||||
|
||||
List<TextPositionSequence> sequences = textBlocksToMerge.stream().map(TextPageBlock::getSequences).flatMap(java.util.Collection::stream).toList();
|
||||
List<TextPositionSequence> sequences = textBlocksToMerge.stream()
|
||||
.map(TextPageBlock::getSequences)
|
||||
.flatMap(java.util.Collection::stream)
|
||||
.toList();
|
||||
sequences = new ArrayList<>(sequences);
|
||||
return fromTextPositionSequences(sequences);
|
||||
}
|
||||
@ -106,11 +111,11 @@ public class TextPageBlock extends AbstractPageBlock {
|
||||
|
||||
if (textBlock == null) {
|
||||
textBlock = new TextPageBlock(wordBlock.getMinXDirAdj(),
|
||||
wordBlock.getMaxXDirAdj(),
|
||||
wordBlock.getMinYDirAdj(),
|
||||
wordBlock.getMaxYDirAdj(),
|
||||
wordBlockList,
|
||||
wordBlock.getRotation());
|
||||
wordBlock.getMaxXDirAdj(),
|
||||
wordBlock.getMinYDirAdj(),
|
||||
wordBlock.getMaxYDirAdj(),
|
||||
wordBlockList,
|
||||
wordBlock.getRotation());
|
||||
} else {
|
||||
TextPageBlock spatialEntity = textBlock.union(wordBlock);
|
||||
textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight());
|
||||
@ -126,17 +131,29 @@ public class TextPageBlock extends AbstractPageBlock {
|
||||
textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest());
|
||||
}
|
||||
|
||||
if (textBlock != null && textBlock.getSequences() != null && textBlock.getSequences()
|
||||
.stream()
|
||||
.map(t -> DoubleComparisons.round(t.getMinYDirAdj(), 3))
|
||||
.collect(toSet())
|
||||
.size() == 1) {
|
||||
if (textBlock != null
|
||||
&& textBlock.getSequences() != null
|
||||
&& textBlock.getSequences()
|
||||
.stream()
|
||||
.map(t -> DoubleComparisons.round(t.getMinYDirAdj(), 3))
|
||||
.collect(toSet()).size() == 1) {
|
||||
textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj));
|
||||
}
|
||||
return textBlock;
|
||||
}
|
||||
|
||||
|
||||
public Rectangle2D getBBox() {
|
||||
|
||||
if (this.bBox == null) {
|
||||
this.bBox = sequences.stream()
|
||||
.map(TextPositionSequence::getBoundingBox)
|
||||
.collect(RectangleTransformations.collectBBox());
|
||||
}
|
||||
return this.bBox;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Returns the minX value in pdf coordinate system.
|
||||
* Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation.
|
||||
|
||||
@ -31,7 +31,7 @@ public class BodyTextFrameService {
|
||||
Rectangle bodyTextFrame = calculateBodyTextFrame(classificationDocument.getPages(), classificationDocument.getFontSizeCounter(), false, layoutParsingType);
|
||||
Rectangle landscapeBodyTextFrame = calculateBodyTextFrame(classificationDocument.getPages(), classificationDocument.getFontSizeCounter(), true, layoutParsingType);
|
||||
for (ClassificationPage page : classificationDocument.getPages()) {
|
||||
// var updatedBodyTextFrame = getBodyTextFrameFromRulings(page, bodyTextFrame, landscapeBodyTextFrame);
|
||||
var updatedBodyTextFrame = getBodyTextFrameFromRulings(page, bodyTextFrame, landscapeBodyTextFrame);
|
||||
setBodyTextFrameAdjustedToPage(page, bodyTextFrame, landscapeBodyTextFrame);
|
||||
classificationDocument.getVisualizations().addMainBodyVisualization(page.isLandscape() ? landscapeBodyTextFrame : bodyTextFrame, page.getPageNumber());
|
||||
}
|
||||
@ -59,24 +59,26 @@ public class BodyTextFrameService {
|
||||
|
||||
private List<Ruling> getPotentialFooterRulings(ClassificationPage page) {
|
||||
|
||||
return page.getCleanRulings()
|
||||
.getHorizontal()
|
||||
return page.getCleanRulings().getHorizontals()
|
||||
.stream()
|
||||
.filter(ruling -> ruling.getClassification().equals(Ruling.Classification.OTHER))
|
||||
.filter(ruling -> ruling.getY1() < page.getPageHeight() * RULING_HEIGHT_THRESHOLD)
|
||||
.filter(ruling -> ruling.getWidth() > RULING_WIDTH_THRESHOLD * page.getPageWidth())
|
||||
.sorted(Comparator.comparingDouble(Ruling::getTop))
|
||||
.peek(ruling -> ruling.setClassification(Ruling.Classification.FOOTER_SEPARATOR))
|
||||
.toList();
|
||||
}
|
||||
|
||||
|
||||
private List<Ruling> getPotentialHeaderRulings(ClassificationPage page) {
|
||||
|
||||
return page.getCleanRulings()
|
||||
.getHorizontal()
|
||||
return page.getCleanRulings().getHorizontals()
|
||||
.stream()
|
||||
.filter(ruling -> ruling.getClassification().equals(Ruling.Classification.OTHER))
|
||||
.filter(ruling -> ruling.getY1() > page.getPageHeight() * (1 - RULING_HEIGHT_THRESHOLD))
|
||||
.filter(ruling -> ruling.getWidth() > RULING_WIDTH_THRESHOLD * page.getPageWidth())
|
||||
.sorted(Comparator.comparingDouble(Ruling::getBottom).reversed())
|
||||
.peek(ruling -> ruling.setClassification(Ruling.Classification.HEADER_SEPARATOR))
|
||||
.toList();
|
||||
}
|
||||
|
||||
@ -100,16 +102,16 @@ public class BodyTextFrameService {
|
||||
|
||||
if (page.getPageWidth() > page.getPageHeight() && page.getRotation() == 270) {
|
||||
textFrame = new Rectangle(new Point(textFrame.getTopLeft().getY(), page.getPageHeight() - textFrame.getTopLeft().getX() - textFrame.getWidth()),
|
||||
textFrame.getHeight(),
|
||||
textFrame.getWidth(),
|
||||
0);
|
||||
textFrame.getHeight(),
|
||||
textFrame.getWidth(),
|
||||
0);
|
||||
} else if (page.getPageWidth() > page.getPageHeight() && page.getRotation() != 0) {
|
||||
textFrame = new Rectangle(new Point(textFrame.getTopLeft().getY(), textFrame.getTopLeft().getX()), textFrame.getHeight(), textFrame.getWidth(), page.getPageNumber());
|
||||
} else if (page.getRotation() == 180) {
|
||||
textFrame = new Rectangle(new Point(textFrame.getTopLeft().getX(), page.getPageHeight() - textFrame.getTopLeft().getY() - textFrame.getHeight()),
|
||||
textFrame.getWidth(),
|
||||
textFrame.getHeight(),
|
||||
0);
|
||||
textFrame.getWidth(),
|
||||
textFrame.getHeight(),
|
||||
0);
|
||||
}
|
||||
page.setBodyTextFrame(textFrame);
|
||||
}
|
||||
@ -153,14 +155,16 @@ public class BodyTextFrameService {
|
||||
}
|
||||
|
||||
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) || MarkedContentUtils.intersects(textBlock,
|
||||
page.getMarkedContentBboxPerType(),
|
||||
MarkedContentUtils.FOOTER)) {
|
||||
page.getMarkedContentBboxPerType(),
|
||||
MarkedContentUtils.FOOTER)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
float approxLineCount = PositionUtils.getApproxLineCount(textBlock);
|
||||
if (layoutParsingType.equals(LayoutParsingType.DOCUMINE) && approxLineCount < approximateHeaderLineCount && textBlock.getMaxY() >= page.getPageHeight() - (page.getPageHeight() / 10) || !layoutParsingType.equals(
|
||||
LayoutParsingType.DOCUMINE) && approxLineCount < approximateHeaderLineCount) {
|
||||
if (layoutParsingType.equals(LayoutParsingType.DOCUMINE_OLD)
|
||||
&& approxLineCount < approximateHeaderLineCount
|
||||
&& textBlock.getMaxY() >= page.getPageHeight() - (page.getPageHeight() / 10)
|
||||
|| !layoutParsingType.equals(LayoutParsingType.DOCUMINE_OLD) && approxLineCount < approximateHeaderLineCount) {
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -187,9 +191,9 @@ public class BodyTextFrameService {
|
||||
}
|
||||
}
|
||||
return new Rectangle(new Point(expansionsRectangle.minX, expansionsRectangle.minY),
|
||||
expansionsRectangle.maxX - expansionsRectangle.minX,
|
||||
expansionsRectangle.maxY - expansionsRectangle.minY,
|
||||
0);
|
||||
expansionsRectangle.maxX - expansionsRectangle.minX,
|
||||
expansionsRectangle.maxY - expansionsRectangle.minY,
|
||||
0);
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -45,7 +45,7 @@ public class RulingCleaningService {
|
||||
verticalAndHorizontalRulingLines.horizontalLines.sort(X_FIRST_RULING_COMPARATOR);
|
||||
verticalAndHorizontalRulingLines = cleanRulings(verticalAndHorizontalRulingLines);
|
||||
|
||||
return CleanRulings.builder().vertical(verticalAndHorizontalRulingLines.verticalLines()).horizontal(verticalAndHorizontalRulingLines.horizontalLines()).build();
|
||||
return CleanRulings.builder().verticals(verticalAndHorizontalRulingLines.verticalLines()).horizontals(verticalAndHorizontalRulingLines.horizontalLines()).build();
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -26,7 +26,7 @@ import com.knecon.fforesight.service.layoutparser.processor.utils.SpreadsheetFin
|
||||
public class TableExtractionService {
|
||||
|
||||
private static final int MAX_TABLE_CONTAINED_CELLS_WITH_TEXT = 1;
|
||||
private static final int TEXT_BLOCK_CONTAINMENT_TOLERANCE = 2;
|
||||
private static final double TEXT_BLOCK_CONTAINMENT_TOLERANCE = 0.02;
|
||||
private static final double TABLE_UNIFORMITY_THRESHOLD = 0.7;
|
||||
|
||||
|
||||
@ -77,7 +77,9 @@ public class TableExtractionService {
|
||||
}
|
||||
}
|
||||
|
||||
var containedCellsWithText = containedCells.stream().filter(cell -> !cell.getTextBlocks().isEmpty()).toList();
|
||||
var containedCellsWithText = containedCells.stream()
|
||||
.filter(cell -> !cell.getTextBlocks().isEmpty())
|
||||
.toList();
|
||||
|
||||
// verify if table would contain fewer cells with text than the threshold allows
|
||||
if (containedCellsWithText.size() >= MAX_TABLE_CONTAINED_CELLS_WITH_TEXT && checkIfTableCellsAreUniform(containedCells)) {
|
||||
@ -97,7 +99,11 @@ public class TableExtractionService {
|
||||
if (position != -1) {
|
||||
page.getTextBlocks().add(position, table);
|
||||
|
||||
var toBeRemoved = table.getCells().stream().map(Cell::getTextBlocks).flatMap(List::stream).toList();
|
||||
var toBeRemoved = table.getCells()
|
||||
.stream()
|
||||
.map(Cell::getTextBlocks)
|
||||
.flatMap(List::stream)
|
||||
.toList();
|
||||
// remove text blocks from the page that were also added with the table (from its contained cells)
|
||||
page.getTextBlocks().removeAll(toBeRemoved);
|
||||
}
|
||||
@ -122,22 +128,27 @@ public class TableExtractionService {
|
||||
|
||||
private boolean doesCellContainTextBlock(Cell cell, TextPageBlock textBlock) {
|
||||
|
||||
double x = textBlock.getPdfMinX();
|
||||
double y = textBlock.getPdfMinY();
|
||||
double w = textBlock.getPdfMaxX() - textBlock.getPdfMinX();
|
||||
double h = textBlock.getPdfMaxY() - textBlock.getPdfMinY();
|
||||
double x = textBlock.getBBox().getX();
|
||||
double y = textBlock.getBBox().getY();
|
||||
double w = textBlock.getBBox().getWidth();
|
||||
double h = textBlock.getBBox().getHeight();
|
||||
if (cell.isEmpty() || w <= 0 || h <= 0) {
|
||||
return false;
|
||||
}
|
||||
double x0 = cell.getX();
|
||||
double y0 = cell.getY();
|
||||
return (x >= x0 - TEXT_BLOCK_CONTAINMENT_TOLERANCE && y >= y0 - TEXT_BLOCK_CONTAINMENT_TOLERANCE && (x + w) <= x0 + cell.getWidth() + 2 * TEXT_BLOCK_CONTAINMENT_TOLERANCE && (y + h) <= y0 + cell.getHeight() + 2 * TEXT_BLOCK_CONTAINMENT_TOLERANCE);
|
||||
double xTol = TEXT_BLOCK_CONTAINMENT_TOLERANCE * w;
|
||||
double yTol = TEXT_BLOCK_CONTAINMENT_TOLERANCE * h;
|
||||
return (x >= x0 - xTol && y >= y0 - yTol && (x + w) <= x0 + cell.getWidth() + 2 * xTol && (y + h) <= y0 + cell.getHeight() + 2 * yTol);
|
||||
}
|
||||
|
||||
|
||||
public static List<Cell> findCells(List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
|
||||
|
||||
return RectangularIntersectionFinder.find(horizontalRulingLines, verticalRulingLines).stream().map(Cell::new).collect(Collectors.toList());
|
||||
return RectangularIntersectionFinder.find(horizontalRulingLines, verticalRulingLines)
|
||||
.stream()
|
||||
.map(Cell::new)
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -15,16 +15,14 @@ import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.Doubl
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
|
||||
@ -38,29 +36,52 @@ public class DocstrumBlockificationService {
|
||||
static final float THRESHOLD = 1f;
|
||||
|
||||
|
||||
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Cell> cells, boolean xyOrder, LayoutparsingVisualizations visualizations) {
|
||||
public ClassificationPage blockify(List<TextPositionSequence> textPositions, CleanRulings rulings, boolean xyOrder, LayoutparsingVisualizations visualizations) {
|
||||
|
||||
CleanRulings usedRulings = RectangleTransformations.extractRulings(cells);
|
||||
CleanRulings usedRulings = rulings.getTableLines();
|
||||
|
||||
var zones = docstrumSegmentationService.segmentPage(textPositions, xyOrder);
|
||||
var zones = docstrumSegmentationService.segmentPage(textPositions, xyOrder, usedRulings, visualizations);
|
||||
|
||||
if (!textPositions.isEmpty()) {
|
||||
visualizations.addZoneVisualizations(zones, textPositions.get(0).getPage());
|
||||
visualizations.addLineVisualizations(zones, textPositions.get(0).getPage());
|
||||
visualizations.addLineVisualizationsFromZones(zones, textPositions.get(0).getPage());
|
||||
visualizations.addCharactersWithNeighbours(zones, textPositions.get(0).getPage());
|
||||
}
|
||||
|
||||
var pageBlocks = toAbstractPageBlocks(zones, usedRulings.getHorizontal(), usedRulings.getVertical(), xyOrder);
|
||||
var pageBlocks = toAbstractPageBlocks(zones, usedRulings.getHorizontals(), usedRulings.getVerticals(), xyOrder, usedRulings);
|
||||
|
||||
if (xyOrder) {
|
||||
sortPageBlocksXThenY(pageBlocks);
|
||||
}
|
||||
|
||||
var classificationPage = new ClassificationPage(pageBlocks);
|
||||
classificationPage.setCleanRulings(rulings);
|
||||
|
||||
mergeIntersectingBlocks(classificationPage.getTextBlocks(), 0, 0);
|
||||
mergeIntersectingBlocks(classificationPage, 0, 0);
|
||||
|
||||
return classificationPage;
|
||||
}
|
||||
|
||||
|
||||
private List<AbstractPageBlock> toAbstractPageBlocks(List<Zone> zones, List<Ruling> horizontalRulings, List<Ruling> verticalRulings, boolean xyOrder) {
|
||||
private static void sortPageBlocksXThenY(List<AbstractPageBlock> pageBlocks) {
|
||||
|
||||
pageBlocks.sort(Comparator.comparing(AbstractPageBlock::getMinY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
|
||||
.thenComparing(AbstractPageBlock::getMinX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
|
||||
pageBlocks.sort(new Comparator<AbstractPageBlock>() {
|
||||
@Override
|
||||
public int compare(AbstractPageBlock o1, AbstractPageBlock o2) {
|
||||
|
||||
return Math.abs(o1.getMinY() - o2.getMinY()) < 5 && o1.getMinX() < o2.getMinX() == true ? -1 : 0;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
private List<AbstractPageBlock> toAbstractPageBlocks(List<Zone> zones,
|
||||
List<Ruling> horizontalRulings,
|
||||
List<Ruling> verticalRulings,
|
||||
boolean xyOrder,
|
||||
CleanRulings usedRulings) {
|
||||
|
||||
List<AbstractPageBlock> abstractPageBlocks = new ArrayList<>();
|
||||
zones.forEach(zone -> {
|
||||
@ -74,21 +95,9 @@ public class DocstrumBlockificationService {
|
||||
});
|
||||
});
|
||||
|
||||
abstractPageBlocks.addAll(splitZonesAtRulings(textPositionSequences, horizontalRulings, verticalRulings));
|
||||
abstractPageBlocks.add(buildTextBlock(textPositionSequences, 0));
|
||||
});
|
||||
|
||||
if (xyOrder) {
|
||||
abstractPageBlocks.sort(Comparator.comparing(AbstractPageBlock::getMinY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
|
||||
.thenComparing(AbstractPageBlock::getMinX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
|
||||
abstractPageBlocks.sort(new Comparator<AbstractPageBlock>() {
|
||||
@Override
|
||||
public int compare(AbstractPageBlock o1, AbstractPageBlock o2) {
|
||||
|
||||
return Math.abs(o1.getMinY() - o2.getMinY()) < 5 && o1.getMinX() < o2.getMinX() == true ? -1 : 0;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
return abstractPageBlocks;
|
||||
}
|
||||
|
||||
@ -137,7 +146,7 @@ public class DocstrumBlockificationService {
|
||||
previous = current;
|
||||
}
|
||||
|
||||
mergeIntersectingBlocks(page.getTextBlocks(), 0, 6.5f);
|
||||
mergeIntersectingBlocks(page, 0, 6.5f);
|
||||
}
|
||||
|
||||
|
||||
@ -218,8 +227,9 @@ public class DocstrumBlockificationService {
|
||||
}
|
||||
|
||||
|
||||
public void mergeIntersectingBlocks(List<AbstractPageBlock> blocks, float xThreshold, float yThreshold) {
|
||||
public void mergeIntersectingBlocks(ClassificationPage page, float xThreshold, float yThreshold) {
|
||||
|
||||
var blocks = page.getTextBlocks();
|
||||
ListIterator<AbstractPageBlock> itty = blocks.listIterator();
|
||||
while (itty.hasNext()) {
|
||||
AbstractPageBlock block = itty.next();
|
||||
@ -246,6 +256,10 @@ public class DocstrumBlockificationService {
|
||||
|
||||
TextPageBlock inner = (TextPageBlock) blocks.get(i);
|
||||
|
||||
if (page.getCleanRulings().lineBetween(inner.getBBox(), current.getBBox())) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (current.getDir() == inner.getDir() && current.almostIntersects(inner, yThreshold, xThreshold)) {
|
||||
|
||||
boolean toDuplicate = current.isToDuplicate() || inner.isToDuplicate();
|
||||
|
||||
@ -21,6 +21,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageB
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations;
|
||||
|
||||
@SuppressWarnings("all")
|
||||
@Service
|
||||
@ -34,10 +35,11 @@ public class RedactManagerBlockificationService {
|
||||
* This method must use text direction adjusted positions (DirAdj), where {0,0} is on the upper left. Never try to change this!
|
||||
* Rulings (TablePageBlock lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling.
|
||||
*
|
||||
* @param textPositions The words of a page.
|
||||
* @param textPositions The words of a page.
|
||||
* @param visualizations
|
||||
* @return Page object that contains the Textblock and text statistics.
|
||||
*/
|
||||
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Cell> cells) {
|
||||
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Cell> cells, LayoutparsingVisualizations visualizations) {
|
||||
|
||||
CleanRulings usedRulings = RectangleTransformations.extractRulings(cells);
|
||||
|
||||
@ -57,7 +59,7 @@ public class RedactManagerBlockificationService {
|
||||
boolean splitByX = prev != null && maxX + 50 < word.getMinXDirAdj() && prev.getMinYDirAdj() == word.getMinYDirAdj();
|
||||
boolean xIsBeforeFirstX = prev != null && word.getMinXDirAdj() < minX;
|
||||
boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj();
|
||||
boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, usedRulings.getHorizontal(), usedRulings.getVertical());
|
||||
boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, usedRulings.getHorizontals(), usedRulings.getVerticals());
|
||||
boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir());
|
||||
|
||||
if (prev != null && (lineSeparation || startFromTop || splitByX || splitByDir || isSplitByRuling)) {
|
||||
@ -160,6 +162,8 @@ public class RedactManagerBlockificationService {
|
||||
previous = block;
|
||||
}
|
||||
|
||||
visualizations.addTextBlockVisualizations(chunkBlockList.stream().map(tb -> (TextPageBlock) tb).toList(), textPositions.get(0).getPage());
|
||||
|
||||
return new ClassificationPage(chunkBlockList);
|
||||
}
|
||||
|
||||
|
||||
@ -65,12 +65,18 @@ public class LayoutGridService {
|
||||
|
||||
@SneakyThrows
|
||||
@Observed(name = "ViewerDocumentService", contextualName = "create-viewer-document")
|
||||
public void addLayoutGrid(File originFile, Document document, File destinationFile, boolean layerVisibilityDefaultValue) {
|
||||
public void addLayoutGrid(File originFile, Document document, File destinationFile, boolean layerVisibilityDefaultValue, boolean writeVisualLayoutParsingGrid) {
|
||||
|
||||
List<Visualizations> allVisualizations;
|
||||
Visualizations layoutGrid = this.addLayoutGrid(document, layerVisibilityDefaultValue, false);
|
||||
Visualizations visualLayoutGrid = this.addLayoutGrid(document, layerVisibilityDefaultValue, true);
|
||||
|
||||
List<Visualizations> allVisualizations = Stream.concat(Stream.of(layoutGrid, visualLayoutGrid), document.getVisualizations().streamAll()).toList();
|
||||
if (writeVisualLayoutParsingGrid) {
|
||||
Visualizations visualLayoutGrid = this.addLayoutGrid(document, layerVisibilityDefaultValue, true);
|
||||
allVisualizations = Stream.concat(Stream.of(layoutGrid, visualLayoutGrid), document.getVisualizations().streamAll())
|
||||
.toList();
|
||||
} else {
|
||||
allVisualizations = Stream.concat(Stream.of(layoutGrid), document.getVisualizations().streamAll())
|
||||
.toList();
|
||||
}
|
||||
|
||||
viewerDocumentService.addVisualizationsOnPage(originFile, destinationFile, allVisualizations);
|
||||
}
|
||||
@ -132,7 +138,10 @@ public class LayoutGridService {
|
||||
}
|
||||
for (Page page : table.getPages()) {
|
||||
|
||||
Optional<Integer> optionalFirstRowOnPage = table.streamCol(0).filter(tableCell -> tableCell.isOnPage(page.getNumber())).map(TableCell::getRow).findFirst();
|
||||
Optional<Integer> optionalFirstRowOnPage = table.streamCol(0)
|
||||
.filter(tableCell -> tableCell.isOnPage(page.getNumber()))
|
||||
.map(TableCell::getRow)
|
||||
.findFirst();
|
||||
if (optionalFirstRowOnPage.isEmpty()) {
|
||||
continue;
|
||||
}
|
||||
@ -172,14 +181,17 @@ public class LayoutGridService {
|
||||
|
||||
private static Stream<Rectangle2D> streamBBoxOfCellsOnPage(Stream<TableCell> table, Page page) {
|
||||
|
||||
return table.filter(tableCell -> tableCell.isOnPage(page.getNumber())).map(TableCell::getBBox).map(bBoxMap -> bBoxMap.get(page));
|
||||
return table.filter(tableCell -> tableCell.isOnPage(page.getNumber()))
|
||||
.map(TableCell::getBBox)
|
||||
.map(bBoxMap -> bBoxMap.get(page));
|
||||
}
|
||||
|
||||
|
||||
private void addSection(SemanticNode semanticNode, LayoutGrid layoutGrid, Color color) {
|
||||
|
||||
Map<Page, Rectangle2D> bBoxMap = semanticNode.getBBox();
|
||||
List<SemanticNode> subSections = semanticNode.streamAllSubNodesOfType(NodeType.SECTION).toList();
|
||||
List<SemanticNode> subSections = semanticNode.streamAllSubNodesOfType(NodeType.SECTION)
|
||||
.toList();
|
||||
Page firstPage = semanticNode.getFirstPage();
|
||||
if (!subSections.isEmpty()) {
|
||||
addPlacedText(firstPage, bBoxMap.get(firstPage), buildTreeIdString(semanticNode), layoutGrid);
|
||||
@ -198,7 +210,10 @@ public class LayoutGridService {
|
||||
}
|
||||
return;
|
||||
}
|
||||
List<Page> pagesInOrder = bBoxMap.keySet().stream().sorted(Comparator.comparingInt(Page::getNumber)).collect(Collectors.toList());
|
||||
List<Page> pagesInOrder = bBoxMap.keySet()
|
||||
.stream()
|
||||
.sorted(Comparator.comparingInt(Page::getNumber))
|
||||
.collect(Collectors.toList());
|
||||
pagesInOrder.remove(0);
|
||||
addLinesForFirstPageOfSection(semanticNode, color, firstPage, layoutGrid);
|
||||
var lastPage = pagesInOrder.remove(pagesInOrder.size() - 1);
|
||||
@ -295,7 +310,10 @@ public class LayoutGridService {
|
||||
|
||||
private String buildTreeIdString(SemanticNode semanticNode) {
|
||||
|
||||
return semanticNode.getTreeId().stream().map(Object::toString).collect(Collectors.joining("."));
|
||||
return semanticNode.getTreeId()
|
||||
.stream()
|
||||
.map(Object::toString)
|
||||
.collect(Collectors.joining("."));
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -182,7 +182,7 @@ public class RectangleTransformations {
|
||||
verticalRulings.add(new Ruling(new Point2D.Float(rectangle.x + rectangle.width, rectangle.y),
|
||||
new Point2D.Float(rectangle.x + rectangle.width, rectangle.y + rectangle.height)));
|
||||
});
|
||||
return CleanRulings.builder().vertical(verticalRulings).horizontal(horizontalRulings).build();
|
||||
return CleanRulings.builder().verticals(verticalRulings).horizontals(horizontalRulings).build();
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -63,6 +63,10 @@ public class RectangularIntersectionFinder {
|
||||
&& intersectionPoints.get(btmRight).vertical().equals(intersectionPoints.get(yPoint).vertical())) {
|
||||
|
||||
foundRectangles.add(new Rectangle2D.Double(topLeft.getX(), topLeft.getY(), btmRight.getX() - topLeft.getX(), btmRight.getY() - topLeft.getY()));
|
||||
intersectionPoints.get(topLeft).horizontal().setClassification(Ruling.Classification.TABLE_LINE);
|
||||
intersectionPoints.get(topLeft).vertical().setClassification(Ruling.Classification.TABLE_LINE);
|
||||
intersectionPoints.get(btmRight).horizontal().setClassification(Ruling.Classification.TABLE_LINE);
|
||||
intersectionPoints.get(btmRight).vertical().setClassification(Ruling.Classification.TABLE_LINE);
|
||||
break outer;
|
||||
}
|
||||
}
|
||||
|
||||
@ -17,7 +17,9 @@ import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Bound
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Line;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
@ -47,8 +49,16 @@ public class LayoutparsingVisualizations {
|
||||
static final Color WORDS_COLOR = new Color(68, 84, 147);
|
||||
static final Color LINES_COLOR = new Color(152, 45, 179);
|
||||
static final Color ZONES_COLOR = new Color(131, 38, 38);
|
||||
|
||||
static final Color RULINGS_COLOR = new Color(21, 221, 174);
|
||||
static final Color TABLE_RULINGS_COLOR = new Color(255, 175, 175);
|
||||
static final Color HEADER_RULING_COLOR = new Color(171, 131, 6);
|
||||
static final Color FOOTER_RULING_COLOR = new Color(106, 82, 2);
|
||||
static final Color UNDERLINE_RULING_COLOR = new Color(6, 39, 171);
|
||||
static final Color STRIKETROUGH_RULING_COLOR = new Color(171, 6, 6);
|
||||
|
||||
static final Color CELLS_COLOR = new Color(31, 214, 27);
|
||||
|
||||
static final Color MAIN_BODY_COLOR = new Color(171, 131, 6);
|
||||
static final Color MARKED_CONTENT_COLOR = new Color(171, 131, 6);
|
||||
|
||||
@ -98,14 +108,15 @@ public class LayoutparsingVisualizations {
|
||||
if (!active) {
|
||||
return;
|
||||
}
|
||||
List<ColoredRectangle> list = textPositionSequences.stream()
|
||||
.map(textPositionSequence -> textPositionSequence.getTextPositions()
|
||||
.stream()
|
||||
.map(RedTextPosition::getInitialUserSpacePosition)
|
||||
.collect(RectangleTransformations.collectBBox()))
|
||||
.map(rect -> new ColoredRectangle(rect, WORDS_COLOR, 1))
|
||||
.toList();
|
||||
this.words.getVisualizationsOnPages().put(pageNumber - 1, VisualizationsOnPage.builder().coloredRectangles(list).build());
|
||||
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(pageNumber, this.words);
|
||||
visualizationsOnPage.getColoredRectangles()
|
||||
.addAll(textPositionSequences.stream()
|
||||
.map(textPositionSequence -> textPositionSequence.getTextPositions()
|
||||
.stream()
|
||||
.map(RedTextPosition::getInitialUserSpacePosition)
|
||||
.collect(RectangleTransformations.collectBBox()))
|
||||
.map(rect -> new ColoredRectangle(rect, WORDS_COLOR, 1))
|
||||
.toList());
|
||||
}
|
||||
|
||||
|
||||
@ -114,14 +125,25 @@ public class LayoutparsingVisualizations {
|
||||
if (!active) {
|
||||
return;
|
||||
}
|
||||
this.rulings.getVisualizationsOnPages()
|
||||
.put(pageNumber - 1,
|
||||
VisualizationsOnPage.builder()
|
||||
.coloredLines(Stream.of(cleanRulings.getHorizontal(), cleanRulings.getVertical())
|
||||
.flatMap(Collection::stream)
|
||||
.map(ruling -> new ColoredLine(ruling, RULINGS_COLOR, 1))
|
||||
.toList())
|
||||
.build());
|
||||
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(pageNumber, this.rulings);
|
||||
visualizationsOnPage.getColoredLines()
|
||||
.addAll(Stream.of(cleanRulings.getHorizontals(), cleanRulings.getVerticals())
|
||||
.flatMap(Collection::stream)
|
||||
.map(ruling -> new ColoredLine(ruling, decideOnRulingColor(ruling), 1))
|
||||
.toList());
|
||||
}
|
||||
|
||||
|
||||
private Color decideOnRulingColor(Ruling ruling) {
|
||||
|
||||
return switch (ruling.getClassification()) {
|
||||
case TABLE_LINE -> TABLE_RULINGS_COLOR;
|
||||
case HEADER_SEPARATOR -> HEADER_RULING_COLOR;
|
||||
case FOOTER_SEPARATOR -> FOOTER_RULING_COLOR;
|
||||
case UNDERLINE -> UNDERLINE_RULING_COLOR;
|
||||
case STRIKETROUGH -> STRIKETROUGH_RULING_COLOR;
|
||||
default -> RULINGS_COLOR;
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
@ -130,13 +152,11 @@ public class LayoutparsingVisualizations {
|
||||
if (!active) {
|
||||
return;
|
||||
}
|
||||
this.cells.getVisualizationsOnPages()
|
||||
.put(pageNumber - 1,
|
||||
VisualizationsOnPage.builder()
|
||||
.coloredRectangles(cells.stream()
|
||||
.map(ruling -> new ColoredRectangle(ruling, CELLS_COLOR, 1))
|
||||
.toList())
|
||||
.build());
|
||||
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(pageNumber, this.cells);
|
||||
visualizationsOnPage.getColoredRectangles()
|
||||
.addAll(cells.stream()
|
||||
.map(ruling -> new ColoredRectangle(ruling, CELLS_COLOR, 1))
|
||||
.toList());
|
||||
}
|
||||
|
||||
|
||||
@ -146,33 +166,50 @@ public class LayoutparsingVisualizations {
|
||||
return;
|
||||
}
|
||||
|
||||
this.zones.getVisualizationsOnPages()
|
||||
.put(page - 1,
|
||||
VisualizationsOnPage.builder()
|
||||
.coloredRectangles(zones.stream()
|
||||
.map(BoundingBox::getBBox)
|
||||
.map(zone -> new ColoredRectangle(zone, ZONES_COLOR, 1))
|
||||
.toList())
|
||||
.build());
|
||||
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(page, this.zones);
|
||||
visualizationsOnPage.getColoredRectangles()
|
||||
.addAll(zones.stream()
|
||||
.map(BoundingBox::getBBox)
|
||||
.map(zone -> new ColoredRectangle(zone, ZONES_COLOR, 1))
|
||||
.toList());
|
||||
|
||||
}
|
||||
|
||||
|
||||
public void addLineVisualizations(List<Zone> zones, int page) {
|
||||
public void addLineVisualizationsFromZones(List<Zone> zones, int page) {
|
||||
|
||||
addLineVisualizations(zones.stream()
|
||||
.map(Zone::getLines)
|
||||
.flatMap(Collection::stream)
|
||||
.toList(), page);
|
||||
|
||||
}
|
||||
|
||||
|
||||
public void addLineVisualizations(List<Line> lines, int page) {
|
||||
|
||||
if (!active) {
|
||||
return;
|
||||
}
|
||||
this.lines.getVisualizationsOnPages()
|
||||
.put(page - 1,
|
||||
VisualizationsOnPage.builder()
|
||||
.coloredRectangles(zones.stream()
|
||||
.map(Zone::getLines)
|
||||
.flatMap(Collection::stream)
|
||||
.map(BoundingBox::getBBox)
|
||||
.map(line -> new ColoredRectangle(line, LINES_COLOR, 1))
|
||||
.toList())
|
||||
.build());
|
||||
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(page, this.lines);
|
||||
visualizationsOnPage.getColoredRectangles()
|
||||
.addAll(lines.stream()
|
||||
.map(BoundingBox::getBBox)
|
||||
.map(line -> new ColoredRectangle(line, LINES_COLOR, 0.3f))
|
||||
.toList());
|
||||
}
|
||||
|
||||
|
||||
public void addTextBlockVisualizations(List<TextPageBlock> textPageBlocks, int page) {
|
||||
|
||||
if (!active) {
|
||||
return;
|
||||
}
|
||||
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(page, zones);
|
||||
visualizationsOnPage.getColoredRectangles()
|
||||
.addAll(textPageBlocks.stream()
|
||||
.map(rect -> new ColoredRectangle(rect.getBBox(), ZONES_COLOR, 1))
|
||||
.toList());
|
||||
}
|
||||
|
||||
|
||||
@ -181,14 +218,11 @@ public class LayoutparsingVisualizations {
|
||||
if (!active) {
|
||||
return;
|
||||
}
|
||||
this.mainBody.getVisualizationsOnPages()
|
||||
.put(pageNumber - 1,
|
||||
VisualizationsOnPage.builder()
|
||||
.coloredRectangles(List.of(new ColoredRectangle(new Rectangle2D.Double(rectangle.getTopLeft().getX(),
|
||||
rectangle.getTopLeft().getY(),
|
||||
rectangle.getWidth(),
|
||||
rectangle.getHeight()), MAIN_BODY_COLOR, 1)))
|
||||
.build());
|
||||
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(pageNumber, mainBody);
|
||||
visualizationsOnPage.getColoredRectangles()
|
||||
.add(new ColoredRectangle(new Rectangle2D.Double(rectangle.getTopLeft().getX(), rectangle.getTopLeft().getY(), rectangle.getWidth(), rectangle.getHeight()),
|
||||
MAIN_BODY_COLOR,
|
||||
1));
|
||||
}
|
||||
|
||||
|
||||
@ -197,9 +231,11 @@ public class LayoutparsingVisualizations {
|
||||
if (!active) {
|
||||
return;
|
||||
}
|
||||
|
||||
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(pageNumber, markedContent);
|
||||
|
||||
List<MarkedContentUtils.MarkedContentPosition> markedContentBBoxMapBySubType = MarkedContentUtils.getMarkedContentPositions(markedContents, pdPage);
|
||||
VisualizationsOnPage visualizationsOnPage = VisualizationsOnPage.builder().build();
|
||||
this.markedContent.getVisualizationsOnPages().put(pageNumber - 1, visualizationsOnPage);
|
||||
|
||||
markedContentBBoxMapBySubType.forEach(markedContentPosition -> {
|
||||
|
||||
var bbox = markedContentPosition.textPositions()
|
||||
@ -224,10 +260,8 @@ public class LayoutparsingVisualizations {
|
||||
return;
|
||||
}
|
||||
|
||||
VisualizationsOnPage neighbourVisualizations = VisualizationsOnPage.builder().build();
|
||||
neighbours.getVisualizationsOnPages().put(page - 1, neighbourVisualizations);
|
||||
VisualizationsOnPage characterVisualizations = VisualizationsOnPage.builder().build();
|
||||
characters.getVisualizationsOnPages().put(page - 1, characterVisualizations);
|
||||
VisualizationsOnPage characterVisualizations = getOrCreateVisualizationsOnPage(page, characters);
|
||||
VisualizationsOnPage neighbourVisualizations = getOrCreateVisualizationsOnPage(page, neighbours);
|
||||
|
||||
AtomicInteger index = new AtomicInteger(0);
|
||||
zones.forEach(zone -> zone.getLines()
|
||||
@ -249,4 +283,15 @@ public class LayoutparsingVisualizations {
|
||||
|
||||
}
|
||||
|
||||
|
||||
private VisualizationsOnPage getOrCreateVisualizationsOnPage(int page, Visualizations visualizations) {
|
||||
|
||||
if (visualizations.getVisualizationsOnPages().containsKey(page - 1)) {
|
||||
return visualizations.getVisualizationsOnPages().get(page - 1);
|
||||
}
|
||||
VisualizationsOnPage visualizationsOnPage = VisualizationsOnPage.builder().build();
|
||||
visualizations.getVisualizationsOnPages().put(page - 1, visualizationsOnPage);
|
||||
return visualizationsOnPage;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -96,8 +96,8 @@ public class HeadlinesGoldStandardIntegrationTest {
|
||||
goldStandardLog.getRedactionLogEntry().removeIf(r -> !r.isRedacted() || r.getChanges().get(r.getChanges().size() - 1).getType().equals(ChangeType.REMOVED));
|
||||
goldStandardLog.getRedactionLogEntry().forEach(e -> goldStandardHeadlines.add(new Headline(e.getPositions().get(0).getPage(), e.getValue())));
|
||||
|
||||
Document documentGraph = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_OLD,
|
||||
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_OLD,
|
||||
Document documentGraph = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.DOCUMINE,
|
||||
layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE,
|
||||
pdfFileResource.getFile(),
|
||||
new ImageServiceResponse(),
|
||||
new TableServiceResponse(),
|
||||
|
||||
@ -9,7 +9,6 @@ import java.util.List;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
|
||||
import org.junit.jupiter.api.AfterEach;
|
||||
import org.junit.jupiter.api.Disabled;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.core.io.ClassPathResource;
|
||||
@ -34,17 +33,17 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
|
||||
@Test
|
||||
public void testLayoutParserEndToEnd() {
|
||||
|
||||
String filePath = "files/bdr/Wie weiter bei Kristeneinrichtungen.pdf";
|
||||
String filePath = "files/SinglePages/VV-931175_Page1.pdf";
|
||||
|
||||
runForFile(filePath);
|
||||
}
|
||||
|
||||
@Test
|
||||
@Disabled
|
||||
// @Disabled
|
||||
@SneakyThrows
|
||||
public void testLayoutParserEndToEndWithFolder() {
|
||||
|
||||
String folder = "/home/kschuettler/Dokumente/TestFiles/syn-dm-testfiles-pdftron-ocred";
|
||||
String folder = "/home/kschuettler/iqser/fforesight/layout-parser/layoutparser/layoutparser-service/layoutparser-service-server/src/test/resources/files/SinglePages";
|
||||
List<Path> pdfFiles = Files.walk(Path.of(folder))
|
||||
.filter(path -> path.getFileName().toString().endsWith(".pdf"))
|
||||
.sorted(Comparator.comparing(Path::getFileName))
|
||||
@ -70,7 +69,7 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
|
||||
file = new File(filePath);
|
||||
}
|
||||
|
||||
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(fileName, LayoutParsingType.REDACT_MANAGER, true);
|
||||
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(fileName, LayoutParsingType.DOCUMINE, true);
|
||||
prepareStorage(layoutParsingRequest, file);
|
||||
|
||||
LayoutParsingFinishedEvent finishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest);
|
||||
|
||||
@ -38,7 +38,7 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
||||
|
||||
long start = System.currentTimeMillis();
|
||||
Document document = buildGraph(fileName, LayoutParsingType.REDACT_MANAGER);
|
||||
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true);
|
||||
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true, false);
|
||||
System.out.printf("Total time: %.2fs%n", ((float) (System.currentTimeMillis() - start)) / 1000);
|
||||
}
|
||||
|
||||
@ -56,7 +56,7 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
||||
var tableResponse = mapper.readValue(new ClassPathResource(tableFileName).getInputStream(), TableServiceResponse.class);
|
||||
var documentFile = new ClassPathResource(fileName).getFile();
|
||||
|
||||
var classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE,
|
||||
var classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE_OLD,
|
||||
documentFile,
|
||||
new ImageServiceResponse(),
|
||||
tableResponse,
|
||||
@ -64,9 +64,9 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
||||
Map.of("file", Path.of(fileName).getFileName().toFile().toString()));
|
||||
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null);
|
||||
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
|
||||
Document document = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.DOCUMINE, classificationDocument);
|
||||
Document document = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.DOCUMINE_OLD, classificationDocument);
|
||||
|
||||
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true);
|
||||
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true, false);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -0,0 +1,38 @@
|
||||
package com.knecon.fforesight.service.layoutparser.server.model;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertFalse;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.List;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
|
||||
class CleanRulingsTest {
|
||||
|
||||
@Test
|
||||
public void testLineBetween() {
|
||||
|
||||
List<Ruling> verticals = List.of(new Ruling(new Point2D.Double(10, 0), new Point2D.Double(10, 10)));
|
||||
List<Ruling> horizontals = List.of(new Ruling(new Point2D.Double(0, 5), new Point2D.Double(10, 5)));
|
||||
CleanRulings cleanRulings = new CleanRulings(horizontals, verticals);
|
||||
Rectangle2D a = new Rectangle2D.Double(1, 6, 3, 3);
|
||||
Rectangle2D b = new Rectangle2D.Double(5, 6, 3, 3);
|
||||
Rectangle2D c = new Rectangle2D.Double(11, 6, 3, 3);
|
||||
Rectangle2D d = new Rectangle2D.Double(1, 1, 3, 3);
|
||||
Rectangle2D e = new Rectangle2D.Double(5, 1, 3, 3);
|
||||
Rectangle2D f = new Rectangle2D.Double(11, 1, 3, 3);
|
||||
|
||||
assertFalse(cleanRulings.lineBetween(a, a));
|
||||
assertFalse(cleanRulings.lineBetween(a, b));
|
||||
assertTrue(cleanRulings.lineBetween(a, c));
|
||||
assertTrue(cleanRulings.lineBetween(a, d));
|
||||
assertTrue(cleanRulings.lineBetween(a, e));
|
||||
assertTrue(cleanRulings.lineBetween(a, f));
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,62 @@
|
||||
package com.knecon.fforesight.service.layoutparser.server.model;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertFalse;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.List;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
|
||||
public class RulingTest {
|
||||
|
||||
@Test
|
||||
public void testLineBetween() {
|
||||
|
||||
List<Ruling> verticals = List.of(new Ruling(new Point2D.Double(10, 0), new Point2D.Double(10, 10)), new Ruling(new Point2D.Double(5, 0), new Point2D.Double(5, 5)));
|
||||
List<Ruling> horizontals = List.of(new Ruling(new Point2D.Double(0, 5), new Point2D.Double(10, 5)));
|
||||
|
||||
CleanRulings cleanRulings = new CleanRulings(horizontals, verticals);
|
||||
|
||||
Rectangle2D a = new Rectangle2D.Double(1, 6, 3, 3);
|
||||
Rectangle2D b = new Rectangle2D.Double(5, 6, 3, 3);
|
||||
Rectangle2D c = new Rectangle2D.Double(11, 6, 3, 3);
|
||||
Rectangle2D d = new Rectangle2D.Double(1, 1, 3, 3);
|
||||
Rectangle2D e = new Rectangle2D.Double(5, 1, 3, 3);
|
||||
Rectangle2D f = new Rectangle2D.Double(11, 1, 3, 3);
|
||||
|
||||
assertFalse(cleanRulings.lineBetween(a, a));
|
||||
assertFalse(cleanRulings.lineBetween(a, b));
|
||||
assertTrue(cleanRulings.lineBetween(a, c));
|
||||
assertTrue(cleanRulings.lineBetween(a, d));
|
||||
assertTrue(cleanRulings.lineBetween(a, e));
|
||||
assertTrue(cleanRulings.lineBetween(a, f));
|
||||
|
||||
assertFalse(cleanRulings.lineBetween(d, d));
|
||||
assertTrue(cleanRulings.lineBetween(d, b));
|
||||
assertTrue(cleanRulings.lineBetween(d, c));
|
||||
assertTrue(cleanRulings.lineBetween(d, a));
|
||||
assertTrue(cleanRulings.lineBetween(d, e));
|
||||
assertTrue(cleanRulings.lineBetween(d, f));
|
||||
|
||||
assertFalse(cleanRulings.lineBetween(c, c));
|
||||
assertTrue(cleanRulings.lineBetween(c, b));
|
||||
assertTrue(cleanRulings.lineBetween(c, d));
|
||||
assertTrue(cleanRulings.lineBetween(c, a));
|
||||
assertTrue(cleanRulings.lineBetween(c, e));
|
||||
assertFalse(cleanRulings.lineBetween(c, f));
|
||||
|
||||
var all = List.of(a, b, c, d, e, f);
|
||||
for (Rectangle2D r1 : all) {
|
||||
for (Rectangle2D r2 : all) {
|
||||
assertEquals(cleanRulings.lineBetween(r1, r2), cleanRulings.lineBetween(r2, r1));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@ -62,7 +62,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
@SneakyThrows
|
||||
public ClassificationDocument buildClassificationDocument(File originDocument, TableServiceResponse tableServiceResponse) {
|
||||
|
||||
ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_OLD,
|
||||
ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||
originDocument,
|
||||
new ImageServiceResponse(),
|
||||
tableServiceResponse,
|
||||
|
||||
@ -53,7 +53,7 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
|
||||
List<List<Rectangle2D>> rectanglesPerPage = new LinkedList<>();
|
||||
for (PageContents pageContent : pageContents) {
|
||||
CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(Collections.emptyList(), pageContent.getRulings());
|
||||
List<Rectangle2D> rects = RectangularIntersectionFinder.find(cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
List<Rectangle2D> rects = RectangularIntersectionFinder.find(cleanRulings.getHorizontals(), cleanRulings.getVerticals());
|
||||
rectanglesPerPage.add(rects);
|
||||
}
|
||||
|
||||
@ -74,7 +74,7 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
|
||||
for (PageContents pageContent : pageContents) {
|
||||
cleanRulingsPerPage.add(rulingCleaningService.getCleanRulings(Collections.emptyList(), pageContent.getRulings()));
|
||||
}
|
||||
var cleanRulings = cleanRulingsPerPage.stream().map(CleanRulings::getVertical).collect(Collectors.toList());
|
||||
var cleanRulings = cleanRulingsPerPage.stream().map(CleanRulings::getVerticals).collect(Collectors.toList());
|
||||
PdfDraw.drawLinesPerPage(fileName, cleanRulings, lineFileName);
|
||||
|
||||
}
|
||||
|
||||
@ -114,7 +114,7 @@ public abstract class AbstractTest {
|
||||
.originFileStorageId(fileName + ORIGIN_FILE_ID)
|
||||
.tablesFileStorageId(Optional.of(fileName + TABLE_FILE_ID))
|
||||
.imagesFileStorageId(Optional.of(fileName + IMAGE_FILE_ID))
|
||||
.visualLayoutParsingFileId(Optional.of(fileName + VISUAL_LAYOUT_FILE))
|
||||
.visualLayoutParsingFileId(Optional.empty())
|
||||
.structureFileStorageId(fileName + STRUCTURE_FILE_ID)
|
||||
.textBlockFileStorageId(fileName + TEXT_FILE_ID)
|
||||
.positionBlockFileStorageId(fileName + POSITION_FILE_ID)
|
||||
@ -190,7 +190,9 @@ public abstract class AbstractTest {
|
||||
storageService.storeObject(TenantContext.getTenantId(), layoutParsingRequest.imagesFileStorageId().get(), imageInfoStream);
|
||||
storageService.storeObject(TenantContext.getTenantId(), layoutParsingRequest.tablesFileStorageId().get(), cvServiceResponseFileStream);
|
||||
storageService.storeObject(TenantContext.getTenantId(), layoutParsingRequest.originFileStorageId(), fileStream);
|
||||
storageService.storeObject(TenantContext.getTenantId(), layoutParsingRequest.visualLayoutParsingFileId().get(), visualLayoutParsingResponseFileStream);
|
||||
if (layoutParsingRequest.visualLayoutParsingFileId().isPresent()) {
|
||||
storageService.storeObject(TenantContext.getTenantId(), layoutParsingRequest.visualLayoutParsingFileId().get(), visualLayoutParsingResponseFileStream);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -41,6 +41,7 @@ public class ContentStreams {
|
||||
public static Identifier MARKED_CONTENT = new Identifier("Marked content", COSName.getPDFName("KNECON_MARKED_CONTENT"), true);
|
||||
|
||||
public static Identifier NEIGHBOURS = new Identifier("Neighbours", COSName.getPDFName("KNECON_NEIGHBOURS"), true);
|
||||
|
||||
public static Identifier CHARACTERS = new Identifier("Characters", COSName.getPDFName("KNECON_CHARACTERS"), true);
|
||||
|
||||
public static List<Identifier> allContentStreams = List.of(KNECON_LAYOUT,
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user