RED-8825: improve layoutparsing

* added improved debugging capabilities to viewer-doc
* refactored coordinates (wip)
* refactored line intersection algorithm
* removed cropbox correction from pdfbox text positions
This commit is contained in:
Kilian Schuettler 2024-04-17 17:26:04 +02:00
parent 6fb1a0bef3
commit 3dd215288a
34 changed files with 1147 additions and 599 deletions

View File

@ -101,29 +101,33 @@ public class LayoutParsingPipeline {
log.info("Starting layout parsing for {}", layoutParsingRequest.identifier());
File originFile = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId());
File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId()).orElse(originFile);
File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId())
.orElse(originFile);
VisualLayoutParsingResponse visualLayoutParsingResponse = new VisualLayoutParsingResponse();
if (layoutParsingRequest.visualLayoutParsingFileId().isPresent()) {
if (layoutParsingRequest.visualLayoutParsingFileId()
.isPresent()) {
visualLayoutParsingResponse = layoutParsingStorageService.getVisualLayoutParsingFile(layoutParsingRequest.visualLayoutParsingFileId().get());
}
ImageServiceResponse imageServiceResponse = new ImageServiceResponse();
if (layoutParsingRequest.imagesFileStorageId().isPresent()) {
if (layoutParsingRequest.imagesFileStorageId()
.isPresent()) {
imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId().get());
}
TableServiceResponse tableServiceResponse = new TableServiceResponse();
if (layoutParsingRequest.tablesFileStorageId().isPresent()) {
if (layoutParsingRequest.tablesFileStorageId()
.isPresent()) {
tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId().get());
}
ClassificationDocument classificationDocument = parseLayout(layoutParsingRequest.layoutParsingType(),
originFile,
imageServiceResponse,
tableServiceResponse,
visualLayoutParsingResponse,
layoutParsingRequest.identifier());
originFile,
imageServiceResponse,
tableServiceResponse,
visualLayoutParsingResponse,
layoutParsingRequest.identifier());
log.info("Building document graph for {}", layoutParsingRequest.identifier());
@ -155,25 +159,25 @@ public class LayoutParsingPipeline {
.numberOfPages(documentGraph.getNumberOfPages())
.duration(System.currentTimeMillis() - start)
.message(format("""
Layout parsing has finished in %.02f s.
identifiers: %s
%s
Files have been saved with Ids:
Structure: %s
Text: %s
Positions: %s
PageData: %s
Simplified Text: %s
Viewer Doc: %s""",
((float) (System.currentTimeMillis() - start)) / 1000,
layoutParsingRequest.identifier(),
buildSemanticNodeCountMessage(documentGraph.getNumberOfPages(), documentGraph.buildSemanticNodeCounts()),
layoutParsingRequest.structureFileStorageId(),
layoutParsingRequest.textBlockFileStorageId(),
layoutParsingRequest.positionBlockFileStorageId(),
layoutParsingRequest.pageFileStorageId(),
layoutParsingRequest.simplifiedTextStorageId(),
layoutParsingRequest.viewerDocumentStorageId()))
Layout parsing has finished in %.02f s.
identifiers: %s
%s
Files have been saved with Ids:
Structure: %s
Text: %s
Positions: %s
PageData: %s
Simplified Text: %s
Viewer Doc: %s""",
((float) (System.currentTimeMillis() - start)) / 1000,
layoutParsingRequest.identifier(),
buildSemanticNodeCountMessage(documentGraph.getNumberOfPages(), documentGraph.buildSemanticNodeCounts()),
layoutParsingRequest.structureFileStorageId(),
layoutParsingRequest.textBlockFileStorageId(),
layoutParsingRequest.positionBlockFileStorageId(),
layoutParsingRequest.pageFileStorageId(),
layoutParsingRequest.simplifiedTextStorageId(),
layoutParsingRequest.viewerDocumentStorageId()))
.build();
}
@ -194,14 +198,14 @@ public class LayoutParsingPipeline {
private String buildSemanticNodeCountMessage(int numberOfPages, Map<NodeType, Long> semanticNodeCounts) {
return String.format("%d pages with %d sections, %d headlines, %d paragraphs, %d tables with %d cells, %d headers, and %d footers parsed",
numberOfPages,
semanticNodeCounts.get(NodeType.SECTION) == null ? 0 : semanticNodeCounts.get(NodeType.SECTION),
semanticNodeCounts.get(NodeType.HEADLINE) == null ? 0 : semanticNodeCounts.get(NodeType.HEADLINE),
semanticNodeCounts.get(NodeType.PARAGRAPH) == null ? 0 : semanticNodeCounts.get(NodeType.PARAGRAPH),
semanticNodeCounts.get(NodeType.TABLE) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE),
semanticNodeCounts.get(NodeType.TABLE_CELL) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE_CELL),
semanticNodeCounts.get(NodeType.HEADER) == null ? 0 : semanticNodeCounts.get(NodeType.HEADER),
semanticNodeCounts.get(NodeType.FOOTER) == null ? 0 : semanticNodeCounts.get(NodeType.FOOTER));
numberOfPages,
semanticNodeCounts.get(NodeType.SECTION) == null ? 0 : semanticNodeCounts.get(NodeType.SECTION),
semanticNodeCounts.get(NodeType.HEADLINE) == null ? 0 : semanticNodeCounts.get(NodeType.HEADLINE),
semanticNodeCounts.get(NodeType.PARAGRAPH) == null ? 0 : semanticNodeCounts.get(NodeType.PARAGRAPH),
semanticNodeCounts.get(NodeType.TABLE) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE),
semanticNodeCounts.get(NodeType.TABLE_CELL) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE_CELL),
semanticNodeCounts.get(NodeType.HEADER) == null ? 0 : semanticNodeCounts.get(NodeType.HEADER),
semanticNodeCounts.get(NodeType.FOOTER) == null ? 0 : semanticNodeCounts.get(NodeType.FOOTER));
}
@ -220,6 +224,9 @@ public class LayoutParsingPipeline {
Map<Integer, List<ClassifiedImage>> pdfImages = imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse);
Map<Integer, List<ClassifiedImage>> signatures = visualLayoutParsingAdapter.buildExtractedSignaturesPerPage(visualLayoutParsingResponse);
ClassificationDocument classificationDocument = new ClassificationDocument();
classificationDocument.getVisualizations().setActive(identifier.containsKey("debug"));
List<ClassificationPage> classificationPages = new ArrayList<>();
long pageCount = originDocument.getNumberOfPages();
@ -249,6 +256,8 @@ public class LayoutParsingPipeline {
}
stripper.getText(originDocument);
classificationDocument.getVisualizations().addTextVisualizations(stripper.getTextPositionSequences(), pageNumber);
PDRectangle pdr = pdPage.getMediaBox();
int rotation = pdPage.getRotation();
@ -257,6 +266,8 @@ public class LayoutParsingPipeline {
PDRectangle cropbox = pdPage.getCropBox();
CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(pdfTableCells.get(pageNumber), stripper.getRulings());
classificationDocument.getVisualizations().addCleanRulingVisualization(cleanRulings, pageNumber);
List<Cell> emptyTableCells = TableExtractionService.findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical());
var graphics = graphicExtractorService.extractPathElementGraphics(originDocument,
@ -272,11 +283,16 @@ public class LayoutParsingPipeline {
.map(g -> new ClassifiedImage(new Rectangle2D.Double(g.x1, g.y1, g.width(), g.height()), ImageType.GRAPHIC, false, stripper.getPageNumber()))
.toList());
classificationDocument.getVisualizations().addCellVisualizations(emptyTableCells, pageNumber);
ClassificationPage classificationPage = switch (layoutParsingType) {
case REDACT_MANAGER_OLD -> redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells);
case REDACT_MANAGER_OLD ->
redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells);
case DOCUMINE -> docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
case REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, true);
case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, false);
case REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG ->
docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, true, classificationDocument.getVisualizations());
case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG ->
docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, false, classificationDocument.getVisualizations());
};
classificationPage.setCleanRulings(cleanRulings);
@ -286,8 +302,9 @@ public class LayoutParsingPipeline {
classificationPage.setPageWidth(cropbox.getWidth());
classificationPage.setPageHeight(cropbox.getHeight());
classificationDocument.getVisualizations().addMarkedContentVisualizations(stripper.getMarkedContents(), pageNumber, pdPage);
// MarkedContent needs to be converted at this point, otherwise it leads to GC Problems in Pdfbox.
classificationPage.setMarkedContentBboxPerType(convertMarkedContents(stripper.getMarkedContents()));
classificationPage.setMarkedContentBboxPerType(convertMarkedContents(stripper.getMarkedContents(), pdPage));
// Whether an image is OCR needs to be determined before textBlocks are moved into tables; otherwise the findOcr algorithm needs to be adapted.
if (pdfImages != null && pdfImages.containsKey(pageNumber)) {
@ -361,11 +378,11 @@ public class LayoutParsingPipeline {
}
private Map<String, List<Rectangle2D>> convertMarkedContents(List<PDMarkedContent> pdMarkedContents) {
private Map<String, List<Rectangle2D>> convertMarkedContents(List<PDMarkedContent> pdMarkedContents, PDPage pdPage) {
Map<String, List<Rectangle2D>> markedContentBboxes = new HashMap<>();
markedContentBboxes.put(MarkedContentUtils.HEADER, MarkedContentUtils.getMarkedContentBboxPerLine(pdMarkedContents, MarkedContentUtils.HEADER));
markedContentBboxes.put(MarkedContentUtils.FOOTER, MarkedContentUtils.getMarkedContentBboxPerLine(pdMarkedContents, MarkedContentUtils.FOOTER));
markedContentBboxes.put(MarkedContentUtils.HEADER, MarkedContentUtils.getMarkedContentBboxPerLine(pdMarkedContents, MarkedContentUtils.HEADER, pdPage));
markedContentBboxes.put(MarkedContentUtils.FOOTER, MarkedContentUtils.getMarkedContentBboxPerLine(pdMarkedContents, MarkedContentUtils.FOOTER, pdPage));
return markedContentBboxes;
}

View File

@ -7,12 +7,14 @@ import java.util.stream.Collectors;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Character;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Line;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.LineBuilderService;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.NearestNeighbourService;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.ReadingOrderService;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.SpacingService;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.ZoneBuilderService;
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
@ -43,16 +45,16 @@ public class DocstrumSegmentationService {
private List<Zone> computeZones(List<TextPositionSequence> textPositions, TextDirection direction) {
var positions = textPositions.stream().filter(t -> t.getDir() == direction).map(TextPositionSequence::getTextPositions).flatMap(List::stream).toList();
List<RedTextPosition> positions = textPositions.stream().filter(t -> t.getDir() == direction).map(TextPositionSequence::getTextPositions).flatMap(List::stream).toList();
var characters = positions.stream().map(Character::new).collect(Collectors.toList());
List<Character> characters = positions.stream().map(Character::new).collect(Collectors.toList());
nearestNeighbourService.findNearestNeighbors(characters);
var characterSpacing = spacingService.computeCharacterSpacing(characters);
var lineSpacing = Math.min(spacingService.computeLineSpacing(characters), 20);
double characterSpacing = spacingService.computeCharacterSpacing(characters);
double lineSpacing = Math.min(spacingService.computeLineSpacing(characters), 20);
var lines = lineBuilderService.buildLines(characters, characterSpacing, lineSpacing);
List<Line> lines = lineBuilderService.buildLines(characters, characterSpacing, lineSpacing);
return zoneBuilderService.buildZones(lines, characterSpacing, lineSpacing);
}

View File

@ -27,8 +27,8 @@ public class Character {
public Character(RedTextPosition chunk) {
this.x = chunk.getXDirAdj() + chunk.getWidthDirAdj() / 2;
this.y = chunk.getYDirAdj() + chunk.getHeightDir() / 2;
this.x = chunk.getDirectionAdjustedPosition().getCenterX();
this.y = chunk.getDirectionAdjustedPosition().getCenterY();
this.textPosition = chunk;
}
@ -82,5 +82,4 @@ public class Character {
return FastAtan2.fastAtan2(character.getY() - getY(), character.getX() - getX());
}
}
}

View File

@ -1,11 +1,12 @@
package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import lombok.Data;
import lombok.EqualsAndHashCode;
@ -84,7 +85,9 @@ public class Line extends BoundingBox {
private double computeHeight() {
return characters.stream().map(Character::getHeight).reduce(0d, Double::sum) / characters.size();
return characters.stream()
.map(Character::getHeight)
.reduce(0d, Double::sum) / characters.size();
}
@ -116,7 +119,7 @@ public class Line extends BoundingBox {
double ym = (y0 + y1) / 2;
double yn = (other.y0 + other.y1) / 2;
return Math.abs(ym - yn) / Math.sqrt(1);
return Math.abs(ym - yn);
}
@ -141,21 +144,10 @@ public class Line extends BoundingBox {
private void buildBBox() {
double minX = Double.POSITIVE_INFINITY;
double minY = Double.POSITIVE_INFINITY;
double maxX = Double.NEGATIVE_INFINITY;
double maxY = Double.NEGATIVE_INFINITY;
for (Character character : characters) {
minX = Math.min(minX, character.getTextPosition().getXDirAdj());
minY = Math.min(minY, character.getTextPosition().getYDirAdj());
maxX = Math.max(maxX, character.getTextPosition().getXDirAdj() + character.getTextPosition().getWidthDirAdj());
maxY = Math.max(maxY, character.getTextPosition().getYDirAdj() + character.getTextPosition().getHeightDir());
}
this.setBBox(new Rectangle2D.Double(minX, minY, maxX - minX, maxY - minY));
this.setBBox(characters.stream()
.map(Character::getTextPosition)
.map(RedTextPosition::getInitialUserSpacePosition)
.collect(RectangleTransformations.collectBBox()));
}

View File

@ -1,9 +1,10 @@
package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;
import java.awt.geom.Rectangle2D;
import java.util.Comparator;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import lombok.Data;
@Data
@ -23,21 +24,9 @@ public class Zone extends BoundingBox {
public void buildBBox() {
double minX = Double.POSITIVE_INFINITY;
double minY = Double.POSITIVE_INFINITY;
double maxX = Double.NEGATIVE_INFINITY;
double maxY = Double.NEGATIVE_INFINITY;
for (Line line : lines) {
minX = Math.min(minX, line.getX());
minY = Math.min(minY, line.getY());
maxX = Math.max(maxX, line.getX() + line.getWidth());
maxY = Math.max(maxY, line.getY() + line.getHeight());
}
this.setBBox(new Rectangle2D.Double(minX, minY, maxX - minX, maxY - minY));
this.setBBox(getLines().stream()
.map(BoundingBox::getBBox)
.collect(RectangleTransformations.collectBBox()));
}

View File

@ -1,6 +1,5 @@
package com.knecon.fforesight.service.layoutparser.processor.docstrum.service;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
@ -30,24 +29,25 @@ public class LineBuilderService {
AngleFilter filter = new AngleFilter(-ANGLE_TOLERANCE, ANGLE_TOLERANCE);
characters.forEach(character -> {
character.getNeighbors().forEach(neighbor -> {
double x = neighbor.getHorizontalDistance() / maxHorizontalDistance;
double y = neighbor.getVerticalDistance() / maxVerticalDistance;
if (character.getTextPosition().getDir() == neighbor.getCharacter().getTextPosition().getDir() && filter.matches(neighbor) && Math.pow(x, 2) + Math.pow(y,
2) <= 1) {
unionFind.union(character, neighbor.getCharacter());
}
});
character.getNeighbors()
.forEach(neighbor -> {
double normalizedHorizontalDistance = neighbor.getHorizontalDistance() / maxHorizontalDistance;
double normalizedVerticalDistance = neighbor.getVerticalDistance() / maxVerticalDistance;
if (character.getTextPosition().getDir() == neighbor.getCharacter().getTextPosition().getDir() //
&& filter.matches(neighbor) //
&& Math.pow(normalizedHorizontalDistance, 2) + Math.pow(normalizedVerticalDistance, 2) <= 1) {
unionFind.union(character, neighbor.getCharacter());
}
});
});
List<Line> lines = new ArrayList<>();
unionFind.getGroups().forEach(group -> {
List<Character> lineCharacters = new ArrayList<>(group);
lineCharacters.sort(Comparator.comparingDouble(Character::getX));
lines.add(new Line(lineCharacters, characterSpacing));
});
return lines;
return unionFind.getGroups()
.stream()
.map(lineCharacters -> lineCharacters.stream()
.sorted(Comparator.comparingDouble(Character::getX))
.toList())
.map(lineCharacters -> new Line(lineCharacters, characterSpacing))
.toList();
}
}

View File

@ -45,29 +45,35 @@ public class ZoneBuilderService {
double meanHeight = calculateMeanHeight(lines);
lines.forEach(outerLine -> //
lines.forEach(innerLine -> {
lines.forEach(outerLine -> {
lines.forEach(innerLine -> {
double scale = Math.min(outerLine.getHeight(), innerLine.getHeight()) / meanHeight;
scale = Math.max(MIN_LINE_SIZE_SCALE, Math.min(scale, MAX_LINE_SIZE_SCALE));
if (innerLine == outerLine //
|| unionFind.inSameSet(outerLine, innerLine)//
|| outerLine.angularDifference(innerLine) > ANGLE_TOLERANCE) {
return;
}
if (!unionFind.inSameSet(outerLine, innerLine) && outerLine.angularDifference(innerLine) <= ANGLE_TOLERANCE) {
double scale = Math.min(outerLine.getHeight(), innerLine.getHeight()) / meanHeight;
scale = Math.max(MIN_LINE_SIZE_SCALE, Math.min(scale, MAX_LINE_SIZE_SCALE));
double horizontalDistance = outerLine.horizontalDistance(innerLine) / scale;
double verticalDistance = outerLine.verticalDistance(innerLine) / scale;
double horizontalDistance = outerLine.horizontalDistance(innerLine) / scale;
double verticalDistance = outerLine.verticalDistance(innerLine) / scale;
if (minHorizontalDistance <= horizontalDistance && verticalDistance <= maxVerticalDistance //
|| minHorizontalMergeDistance <= horizontalDistance && verticalDistance <= maxVerticalMergeDistance) {
unionFind.union(outerLine, innerLine);
}
}
}));
if (minHorizontalDistance <= horizontalDistance && verticalDistance <= maxVerticalDistance //
|| minHorizontalMergeDistance <= horizontalDistance && verticalDistance <= maxVerticalMergeDistance) {
List<Zone> zones = new ArrayList<>();
unionFind.getGroups().forEach(group -> {
zones.add(mergeLinesInZone(new ArrayList<>(group), characterSpacing, lineSpacing));
unionFind.union(outerLine, innerLine);
}
});
});
List<Zone> zones = unionFind.getGroups()
.stream()
.map(group -> mergeLinesInZone(new ArrayList<>(group), characterSpacing, lineSpacing))
.toList();
if (zones.size() > MAX_ZONES) {
List<Line> oneZoneLines = new ArrayList<>();
for (Zone zone : zones) {
@ -103,35 +109,40 @@ public class ZoneBuilderService {
UnionFind<Line> unionFind = new UnionFind<>(new HashSet<>(lines));
lines.forEach(outer -> {
lines.forEach(inner -> {
if (inner != outer) {
if (inner == outer) {
return;
}
double horizontalDistance = outer.horizontalDistance(inner);
double verticalDistance = outer.verticalDistance(inner);
double horizontalDistance = outer.horizontalDistance(inner);
double verticalDistance = outer.verticalDistance(inner);
if (horizontalDistance <= maxHorizontalDistance && minVerticalDistance <= verticalDistance && verticalDistance <= maxVerticalDistance) {
unionFind.union(outer, inner);
} else if (minVerticalDistance <= verticalDistance && verticalDistance <= maxVerticalDistance && Math.abs(horizontalDistance - Math.min(outer.getLength(),
inner.getLength())) < 0.1) {
boolean characterOverlap = false;
int overlappingCount = 0;
for (Character outerCharacter : outer.getCharacters()) {
for (Character innerCharacter : inner.getCharacters()) {
double characterOverlapDistance = outerCharacter.overlappingDistance(innerCharacter);
if (characterOverlapDistance > 2) {
characterOverlap = true;
}
if (characterOverlapDistance > 0) {
overlappingCount++;
}
if (horizontalDistance <= maxHorizontalDistance && minVerticalDistance <= verticalDistance && verticalDistance <= maxVerticalDistance) {
unionFind.union(outer, inner);
} else if (minVerticalDistance <= verticalDistance
&& verticalDistance <= maxVerticalDistance
&& Math.abs(horizontalDistance - Math.min(outer.getLength(), inner.getLength())) < 0.1) {
boolean characterOverlap = false;
int overlappingCount = 0;
for (Character outerCharacter : outer.getCharacters()) {
for (Character innerCharacter : inner.getCharacters()) {
double characterOverlapDistance = outerCharacter.overlappingDistance(innerCharacter);
if (characterOverlapDistance > 2) {
characterOverlap = true;
}
if (characterOverlapDistance > 0) {
overlappingCount++;
}
}
if (!characterOverlap && overlappingCount <= 2) {
unionFind.union(outer, inner);
}
}
if (!characterOverlap && overlappingCount <= 2) {
unionFind.union(outer, inner);
}
}
});
});

View File

@ -5,6 +5,7 @@ import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.model.text.UnclassifiedText;
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations;
import lombok.Data;
import lombok.NoArgsConstructor;
@ -22,6 +23,7 @@ public class ClassificationDocument {
private FloatFrequencyCounter fontSizeCounter = new FloatFrequencyCounter();
private StringFrequencyCounter fontCounter = new StringFrequencyCounter();
private StringFrequencyCounter fontStyleCounter = new StringFrequencyCounter();
private LayoutparsingVisualizations visualizations = new LayoutparsingVisualizations();
private boolean headlines;
private long rulesVersion;

View File

@ -12,6 +12,7 @@ import java.util.stream.Stream;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
@ -40,6 +41,8 @@ public class Document implements GenericSemanticNode {
@Builder.Default
Set<RedactionEntity> entities = new HashSet<>();
LayoutparsingVisualizations visualizations;
@Override
public NodeType getType() {

View File

@ -4,12 +4,8 @@ import java.awt.geom.Line2D;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.Formatter;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import com.knecon.fforesight.service.layoutparser.processor.utils.CohenSutherlandClipping;
import com.knecon.fforesight.service.layoutparser.processor.utils.DoubleComparisons;
@ -60,126 +56,13 @@ public class Ruling extends Line2D.Float {
}
// log(n) implementation of find_intersections
// based on http://people.csail.mit.edu/indyk/6.838-old/handouts/lec2.pdf
public static Map<Point2D, Ruling[]> findIntersections(List<Ruling> horizontals, List<Ruling> verticals) {
class SortObject {
protected SOType type;
protected float position;
protected Ruling ruling;
public SortObject(SOType type, float position, Ruling ruling) {
this.type = type;
this.position = position;
this.ruling = ruling;
}
}
List<SortObject> sos = new ArrayList<>();
TreeMap<Ruling, Boolean> tree = new TreeMap<>(new Comparator<Ruling>() {
@Override
public int compare(Ruling o1, Ruling o2) {
return java.lang.Double.compare(o1.getTop(), o2.getTop());
}
});
TreeMap<Point2D, Ruling[]> rv = new TreeMap<>(new Comparator<Point2D>() {
@Override
public int compare(Point2D o1, Point2D o2) {
if (o1.getY() > o2.getY()) {
return 1;
}
if (o1.getY() < o2.getY()) {
return -1;
}
if (o1.getX() > o2.getX()) {
return 1;
}
if (o1.getX() < o2.getX()) {
return -1;
}
return 0;
}
});
for (Ruling h : horizontals) {
sos.add(new SortObject(SOType.HLEFT, h.getLeft() - PERPENDICULAR_UNIT_EXPAND_AMOUNT, h));
sos.add(new SortObject(SOType.HRIGHT, h.getRight() + PERPENDICULAR_UNIT_EXPAND_AMOUNT, h));
}
for (Ruling v : verticals) {
sos.add(new SortObject(SOType.VERTICAL, v.getLeft(), v));
}
Collections.sort(sos, new Comparator<SortObject>() {
@Override
public int compare(SortObject a, SortObject b) {
int rv;
if (DoubleComparisons.feq(a.position, b.position)) {
if (a.type == SOType.VERTICAL && b.type == SOType.HLEFT) {
rv = 1;
} else if (a.type == SOType.VERTICAL && b.type == SOType.HRIGHT) {
rv = -1;
} else if (a.type == SOType.HLEFT && b.type == SOType.VERTICAL) {
rv = -1;
} else if (a.type == SOType.HRIGHT && b.type == SOType.VERTICAL) {
rv = 1;
} else {
rv = java.lang.Double.compare(a.position, b.position);
}
} else {
return java.lang.Double.compare(a.position, b.position);
}
return rv;
}
});
for (SortObject so : sos) {
switch (so.type) {
case VERTICAL:
for (Map.Entry<Ruling, Boolean> h : tree.entrySet()) {
try {
Point2D i = h.getKey().intersectionPoint(so.ruling);
if (i == null) {
continue;
}
rv.put(i, new Ruling[]{h.getKey().expand(PERPENDICULAR_UNIT_EXPAND_AMOUNT), so.ruling.expand(PERPENDICULAR_UNIT_EXPAND_AMOUNT)});
} catch (UnsupportedOperationException e) {
log.info("Some line are oblique, ignoring...");
continue;
}
}
break;
case HRIGHT:
tree.remove(so.ruling);
break;
case HLEFT:
tree.put(so.ruling, true);
break;
}
}
return rv;
}
public boolean vertical() {
public boolean isVertical() {
return this.length() > 0 && DoubleComparisons.feq(this.x1, this.x2); //diff < ORIENTATION_CHECK_THRESHOLD;
}
public boolean horizontal() {
public boolean isHorizontal() {
return this.length() > 0 && DoubleComparisons.feq(this.y1, this.y2); //diff < ORIENTATION_CHECK_THRESHOLD;
}
@ -188,36 +71,36 @@ public class Ruling extends Line2D.Float {
// these are used to have a single collapse method (in page, currently)
public boolean oblique() {
public boolean isOblique() {
return !(this.vertical() || this.horizontal());
return !(this.isVertical() || this.isHorizontal());
}
public float getPosition() {
if (this.oblique()) {
if (this.isOblique()) {
throw new UnsupportedOperationException();
}
return this.vertical() ? this.getLeft() : this.getTop();
return this.isVertical() ? this.getLeft() : this.getTop();
}
public float getStart() {
if (this.oblique()) {
if (this.isOblique()) {
throw new UnsupportedOperationException();
}
return this.vertical() ? this.getTop() : this.getLeft();
return this.isVertical() ? this.getTop() : this.getLeft();
}
public void setStart(float v) {
if (this.oblique()) {
if (this.isOblique()) {
throw new UnsupportedOperationException();
}
if (this.vertical()) {
if (this.isVertical()) {
this.setTop(v);
} else {
this.setLeft(v);
@ -227,19 +110,19 @@ public class Ruling extends Line2D.Float {
public float getEnd() {
if (this.oblique()) {
if (this.isOblique()) {
throw new UnsupportedOperationException();
}
return this.vertical() ? this.getBottom() : this.getRight();
return this.isVertical() ? this.getBottom() : this.getRight();
}
public void setEnd(float v) {
if (this.oblique()) {
if (this.isOblique()) {
throw new UnsupportedOperationException();
}
if (this.vertical()) {
if (this.isVertical()) {
this.setBottom(v);
} else {
this.setRight(v);
@ -249,10 +132,10 @@ public class Ruling extends Line2D.Float {
public void setStartEnd(float start, float end) {
if (this.oblique()) {
if (this.isOblique()) {
throw new UnsupportedOperationException();
}
if (this.vertical()) {
if (this.isVertical()) {
this.setTop(start);
this.setBottom(end);
} else {
@ -264,7 +147,7 @@ public class Ruling extends Line2D.Float {
public boolean perpendicularTo(Ruling other) {
return this.vertical() == other.horizontal();
return this.isVertical() == other.isHorizontal();
}
@ -318,30 +201,6 @@ public class Ruling extends Line2D.Float {
}
public Point2D intersectionPoint(Ruling other) {
Ruling this_l = this.expand(PERPENDICULAR_UNIT_EXPAND_AMOUNT);
Ruling other_l = other.expand(PERPENDICULAR_UNIT_EXPAND_AMOUNT);
Ruling horizontal, vertical;
if (!this_l.intersectsLine(other_l)) {
return null;
}
if (this_l.horizontal() && other_l.vertical()) {
horizontal = this_l;
vertical = other_l;
} else if (this_l.vertical() && other_l.horizontal()) {
vertical = this_l;
horizontal = other_l;
} else {
log.warn("lines must be orthogonal, vertical and horizontal");
return null;
}
return new Point2D.Float(vertical.getLeft(), horizontal.getTop());
}
@Override
public boolean equals(Object other) {
@ -451,16 +310,9 @@ public class Ruling extends Line2D.Float {
final float TOLERANCE = 1;
return Math.abs(ruling.getX1() - x1) < TOLERANCE &&//
Math.abs(ruling.getY1() - y1) < TOLERANCE &&//
Math.abs(ruling.getX2() - x2) < TOLERANCE &&//
Math.abs(ruling.getY2() - y2) < TOLERANCE;
}
private enum SOType {
VERTICAL,
HRIGHT,
HLEFT
Math.abs(ruling.getY1() - y1) < TOLERANCE &&//
Math.abs(ruling.getX2() - x2) < TOLERANCE &&//
Math.abs(ruling.getY2() - y2) < TOLERANCE;
}
}

View File

@ -1,5 +1,8 @@
package com.knecon.fforesight.service.layoutparser.processor.model.text;
import java.awt.geom.AffineTransform;
import java.awt.geom.Rectangle2D;
import org.apache.pdfbox.text.TextPosition;
import com.fasterxml.jackson.annotation.JsonIgnore;
@ -16,7 +19,9 @@ import lombok.SneakyThrows;
@AllArgsConstructor
public class RedTextPosition {
private float[] position;
private final static int HEIGHT_PADDING = 2;
private Rectangle2D.Float directionAdjustedPosition;
private Rectangle2D initialUserSpacePosition;
@JsonIgnore
private int rotation;
@ -58,43 +63,65 @@ public class RedTextPosition {
pos.setFontSizeInPt(textPosition.getFontSizeInPt());
pos.setFontName(textPosition.getFont().getName());
var position = new float[4];
float textHeight = textPosition.getHeight() + HEIGHT_PADDING;
Rectangle2D.Float dirAdjPosition = new Rectangle2D.Float(textPosition.getXDirAdj(),
textPosition.getYDirAdj() - textHeight,
textPosition.getWidthDirAdj(),
textHeight + HEIGHT_PADDING);
pos.setDirectionAdjustedPosition(dirAdjPosition);
position[0] = textPosition.getXDirAdj();
position[1] = textPosition.getYDirAdj();
position[2] = textPosition.getWidthDirAdj();
position[3] = textPosition.getHeightDir();
AffineTransform affineTransform = getRotationMatrix(TextDirection.fromDegrees(textPosition.getDir()), textPosition.getPageWidth(), textPosition.getPageHeight());
Rectangle2D initialUserSpacePositionRect = affineTransform.createTransformedShape(dirAdjPosition).getBounds2D();
pos.setInitialUserSpacePosition(initialUserSpacePositionRect);
pos.setPosition(position);
return pos;
}
/**
 * Builds the affine transform that {@code fromTextPosition} applies to the direction-adjusted rectangle in order
 * to obtain the rectangle stored as initial user space position.
 * <p>
 * The transform is composed of a rotation by the radians of the text direction around a direction-dependent
 * anchor point, a vertical translation, and finally a flip of the y-axis.
 *
 * @param textDirection direction of the text, selects the rotation angle, the anchor and the translation
 * @param pageWidth     width of the page as reported by the text position
 * @param pageHeight    height of the page as reported by the text position
 * @return a new transform; nothing is shared between calls
 */
private static AffineTransform getRotationMatrix(TextDirection textDirection, float pageWidth, float pageHeight) {

    float anchorX;
    float anchorY;
    float verticalShift;

    if (textDirection == TextDirection.ZERO || textDirection == TextDirection.HALF_CIRCLE) {
        // Upright or upside-down text: rotate around the page center.
        anchorX = pageWidth / 2f;
        anchorY = pageHeight / 2f;
        verticalShift = pageHeight;
    } else if (textDirection == TextDirection.QUARTER_CIRCLE) {
        anchorX = pageWidth / 2f;
        anchorY = pageWidth / 2f;
        verticalShift = pageWidth;
    } else {
        // Every remaining direction (presumably the three-quarter rotation - TODO confirm against TextDirection).
        anchorX = pageHeight / 2f;
        anchorY = pageHeight / 2f;
        verticalShift = pageWidth;
    }

    AffineTransform transform = new AffineTransform();
    transform.rotate(textDirection.getRadians(), anchorX, anchorY);
    transform.translate(0f, verticalShift);
    // Mirror the y-axis as the last concatenated step.
    transform.scale(1., -1.);
    return transform;
}
@JsonIgnore
public float getXDirAdj() {
return position[0];
return this.directionAdjustedPosition.x;
}
@JsonIgnore
public float getYDirAdj() {
return position[1];
return this.directionAdjustedPosition.y;
}
@JsonIgnore
public float getWidthDirAdj() {
return position[2];
return this.directionAdjustedPosition.width;
}
@JsonIgnore
public float getHeightDir() {
return position[3];
return this.directionAdjustedPosition.height;
}
}

View File

@ -2,6 +2,7 @@ package com.knecon.fforesight.service.layoutparser.processor.model.text;
import java.awt.geom.AffineTransform;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
@ -11,6 +12,7 @@ import org.apache.pdfbox.text.TextPosition;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import lombok.AllArgsConstructor;
import lombok.Builder;
@ -44,21 +46,19 @@ public class TextPositionSequence implements CharSequence {
private boolean isParagraphStart;
public TextPositionSequence(int page) {
public TextPositionSequence(List<TextPosition> textPositions, int pageNumber, boolean isParagraphStart) {
this.page = page;
}
public TextPositionSequence(List<TextPosition> textPositions, int page, boolean isParagraphStart) {
this.textPositions = textPositions.stream().map(RedTextPosition::fromTextPosition).collect(Collectors.toList());
this.page = page;
this.textPositions = textPositions.stream()
.map(RedTextPosition::fromTextPosition)
.collect(Collectors.toList());
this.page = pageNumber;
this.dir = TextDirection.fromDegrees(textPositions.get(0).getDir());
this.rotation = textPositions.get(0).getRotation();
this.pageHeight = textPositions.get(0).getPageHeight();
this.pageWidth = textPositions.get(0).getPageWidth();
this.isParagraphStart = isParagraphStart;
}
@ -314,10 +314,18 @@ public class TextPositionSequence implements CharSequence {
topRight = transform.transform(topRight, null);
return new Rectangle( //
new Point((float) bottomLeft.getX(), (float) bottomLeft.getY()),
(float) (topRight.getX() - bottomLeft.getX()),
(float) (topRight.getY() - bottomLeft.getY()),
page);
new Point((float) bottomLeft.getX(), (float) bottomLeft.getY()),
(float) (topRight.getX() - bottomLeft.getX()),
(float) (topRight.getY() - bottomLeft.getY()),
page);
}
/**
 * Combines the initial user space rectangles of all text positions of this sequence into a single rectangle,
 * using the collector provided by {@code RectangleTransformations.collectBBox()}.
 *
 * @return the rectangle produced by the collector for this sequence's text positions
 */
public Rectangle2D getBoundingBox() {

    return getTextPositions().stream()
            .map(textPosition -> textPosition.getInitialUserSpacePosition())
            .collect(RectangleTransformations.collectBBox());
}
}

View File

@ -33,6 +33,7 @@ public class BodyTextFrameService {
for (ClassificationPage page : classificationDocument.getPages()) {
// var updatedBodyTextFrame = getBodyTextFrameFromRulings(page, bodyTextFrame, landscapeBodyTextFrame);
setBodyTextFrameAdjustedToPage(page, bodyTextFrame, landscapeBodyTextFrame);
classificationDocument.getVisualizations().addMainBodyVisualization(page.isLandscape() ? landscapeBodyTextFrame : bodyTextFrame, page.getPageNumber());
}
}

View File

@ -122,7 +122,7 @@ public class RulingCleaningService {
h = ruling.y1 - ruling.y2;
}
if (ruling.horizontal()) {
if (ruling.isHorizontal()) {
return new Rectangle(top - THRESHOLD_Y_HORIZONTAL, left - THRESHOLD_X_HORIZONTAL, w + 2 * THRESHOLD_X_HORIZONTAL, h + 2 * THRESHOLD_Y_HORIZONTAL);
} else {
return new Rectangle(top - THRESHOLD_Y_VERTICAL, left - THRESHOLD_X_VERTICAL, w + 2 * THRESHOLD_X_VERTICAL, h + 2 * THRESHOLD_Y_VERTICAL);
@ -160,14 +160,14 @@ public class RulingCleaningService {
List<Ruling> vrs = new ArrayList<>();
for (Ruling vr : rulings) {
if (vr.vertical()) {
if (vr.isVertical()) {
vrs.add(vr);
}
}
List<Ruling> hrs = new ArrayList<>();
for (Ruling hr : rulings) {
if (hr.horizontal()) {
if (hr.isHorizontal()) {
hrs.add(hr);
}
}

View File

@ -15,6 +15,7 @@ import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.Doubl
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
@ -37,11 +38,18 @@ public class DocstrumBlockificationService {
static final float THRESHOLD = 1f;
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Cell> cells, boolean xyOrder) {
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Cell> cells, boolean xyOrder, LayoutparsingVisualizations visualizations) {
CleanRulings usedRulings = RectangleTransformations.extractRulings(cells);
var zones = docstrumSegmentationService.segmentPage(textPositions, xyOrder);
if (!textPositions.isEmpty()) {
visualizations.addZoneVisualizations(zones, textPositions.get(0).getPage());
visualizations.addLineVisualizations(zones, textPositions.get(0).getPage());
visualizations.addCharactersWithNeighbours(zones, textPositions.get(0).getPage());
}
var pageBlocks = toAbstractPageBlocks(zones, usedRulings.getHorizontal(), usedRulings.getVertical(), xyOrder);
var classificationPage = new ClassificationPage(pageBlocks);
@ -58,18 +66,20 @@ public class DocstrumBlockificationService {
zones.forEach(zone -> {
List<TextPositionSequence> textPositionSequences = new ArrayList<>();
zone.getLines().forEach(line -> {
line.getWords().forEach(word -> {
textPositionSequences.add(new TextPositionSequence(word.getTextPositions(), word.getPage()));
});
});
zone.getLines()
.forEach(line -> {
line.getWords()
.forEach(word -> {
textPositionSequences.add(new TextPositionSequence(word.getTextPositions(), word.getPage()));
});
});
abstractPageBlocks.addAll(splitZonesAtRulings(textPositionSequences, horizontalRulings, verticalRulings));
});
if (xyOrder) {
abstractPageBlocks.sort(Comparator.comparing(AbstractPageBlock::getMinY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
.thenComparing(AbstractPageBlock::getMinX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
.thenComparing(AbstractPageBlock::getMinX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
abstractPageBlocks.sort(new Comparator<AbstractPageBlock>() {
@Override
public int compare(AbstractPageBlock o1, AbstractPageBlock o2) {
@ -134,8 +144,8 @@ public class DocstrumBlockificationService {
private boolean isOnlyIntersectingYAndOnelineOrPrevoiusTwoLines(TextPageBlock previous, TextPageBlock current, ClassificationPage page) {
return current.intersectsY(previous) //
&& (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1 || previous.getNumberOfLines() == 2 && current.getNumberOfLines() == 1) //
&& numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, page.getTextBlocks()) <= 0;
&& (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1 || previous.getNumberOfLines() == 2 && current.getNumberOfLines() == 1) //
&& numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, page.getTextBlocks()) <= 0;
}
@ -144,16 +154,16 @@ public class DocstrumBlockificationService {
ClassificationPage page) {
return (Math.abs(previous.getMaxY() - current.getMaxY()) < THRESHOLD || Math.abs(previous.getMinY() - current.getMinY()) < THRESHOLD) //
&& (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1 || previous.getNumberOfLines() >= 2 && current.getNumberOfLines() == 1) //
&& !hasBetween(current, previous, page.getTextBlocks()) && numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, page.getTextBlocks()) <= 4;
&& (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1 || previous.getNumberOfLines() >= 2 && current.getNumberOfLines() == 1) //
&& !hasBetween(current, previous, page.getTextBlocks()) && numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, page.getTextBlocks()) <= 4;
}
private boolean areTheOnlyTwoBlocksOnHeightsWithBothMoreThanTwoLines(TextPageBlock previous, TextPageBlock current, ClassificationPage page) {
return previous.getNumberOfLines() >= 2 && current.getNumberOfLines() >= 2 //
&& previous.intersectsY(current) //
&& numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, page.getTextBlocks()) == 0;
&& previous.intersectsY(current) //
&& numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, page.getTextBlocks()) == 0;
}
@ -213,7 +223,7 @@ public class DocstrumBlockificationService {
ListIterator<AbstractPageBlock> itty = blocks.listIterator();
while (itty.hasNext()) {
AbstractPageBlock block = itty.next();
if(block == null){
if (block == null) {
continue;
}
if (block instanceof TablePageBlock) {
@ -224,7 +234,7 @@ public class DocstrumBlockificationService {
for (int i = 0; i < blocks.size(); i++) {
if(blocks.get(i) == null){
if (blocks.get(i) == null) {
continue;
}
if (blocks.get(i) == current) {
@ -249,8 +259,8 @@ public class DocstrumBlockificationService {
}
}
var blocksIterator = blocks.iterator();
while(blocksIterator.hasNext()){
if(blocksIterator.next() == null){
while (blocksIterator.hasNext()) {
if (blocksIterator.next() == null) {
blocksIterator.remove();
}
}
@ -338,11 +348,11 @@ public class DocstrumBlockificationService {
if (textBlock == null) {
textBlock = new TextPageBlock(wordBlock.getMinXDirAdj(),
wordBlock.getMaxXDirAdj(),
wordBlock.getMinYDirAdj(),
wordBlock.getMaxYDirAdj(),
wordBlockList,
wordBlock.getRotation());
wordBlock.getMaxXDirAdj(),
wordBlock.getMinYDirAdj(),
wordBlock.getMaxYDirAdj(),
wordBlockList,
wordBlock.getRotation());
} else {
TextPageBlock spatialEntity = textBlock.union(wordBlock);
textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight());
@ -358,7 +368,12 @@ public class DocstrumBlockificationService {
textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest());
}
if (textBlock != null && textBlock.getSequences() != null && textBlock.getSequences().stream().map(t -> round(t.getMinYDirAdj(), 3)).collect(toSet()).size() == 1) {
if (textBlock != null
&& textBlock.getSequences() != null
&& textBlock.getSequences()
.stream()
.map(t -> round(t.getMinYDirAdj(), 3))
.collect(toSet()).size() == 1) {
textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj));
}
return textBlock;
@ -373,38 +388,34 @@ public class DocstrumBlockificationService {
List<Ruling> horizontalRulingLines,
List<Ruling> verticalRulingLines) {
return isSplitByRuling(maxX,
minY,
word.getMinXDirAdj(),
word.getMinYDirAdj(),
verticalRulingLines,
word.getDir().getDegrees(),
word.getPageWidth(),
word.getPageHeight()) //
|| isSplitByRuling(minX,
minY,
word.getMinXDirAdj(),
word.getMaxYDirAdj(),
horizontalRulingLines,
word.getDir().getDegrees(),
word.getPageWidth(),
word.getPageHeight()) //
|| isSplitByRuling(maxX,
minY,
word.getMinXDirAdj(),
word.getMinYDirAdj(),
horizontalRulingLines,
word.getDir().getDegrees(),
word.getPageWidth(),
word.getPageHeight()) //
|| isSplitByRuling(minX,
minY,
word.getMinXDirAdj(),
word.getMaxYDirAdj(),
verticalRulingLines,
word.getDir().getDegrees(),
word.getPageWidth(),
word.getPageHeight());
return isSplitByRuling(maxX, minY, word.getMinXDirAdj(), word.getMinYDirAdj(), verticalRulingLines, word.getDir().getDegrees(), word.getPageWidth(), word.getPageHeight())
//
|| isSplitByRuling(minX,
minY,
word.getMinXDirAdj(),
word.getMaxYDirAdj(),
horizontalRulingLines,
word.getDir().getDegrees(),
word.getPageWidth(),
word.getPageHeight())
//
|| isSplitByRuling(maxX,
minY,
word.getMinXDirAdj(),
word.getMinYDirAdj(),
horizontalRulingLines,
word.getDir().getDegrees(),
word.getPageWidth(),
word.getPageHeight())
//
|| isSplitByRuling(minX,
minY,
word.getMinXDirAdj(),
word.getMaxYDirAdj(),
verticalRulingLines,
word.getDir().getDegrees(),
word.getPageWidth(),
word.getPageHeight());
}

View File

@ -34,7 +34,7 @@ public class DocuMineBlockificationService {
* This method must use text direction adjusted positions (DirAdj), where {0,0} is on the upper left. Never try to change this!
* Rulings (Table lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling.
*
* @param textPositions The words of a page.
* @param textPositions The textPositions of a page.
* @param horizontalRulingLines Horizontal table lines.
* @param verticalRulingLines Vertical table lines.
* @return Page object that contains the Textblock and text statistics.

View File

@ -7,7 +7,6 @@ import static java.util.stream.Collectors.toList;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
@ -15,7 +14,6 @@ import java.util.NoSuchElementException;
import java.util.Set;
import java.util.stream.Collectors;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
@ -52,6 +50,9 @@ public class DocumentGraphFactory {
public Document buildDocumentGraph(LayoutParsingType layoutParsingType, ClassificationDocument document) {
Document documentGraph = new Document();
documentGraph.setVisualizations(document.getVisualizations());
Context context = new Context(documentGraph);
document.getPages()
@ -85,14 +86,11 @@ public class DocumentGraphFactory {
GenericSemanticNode node;
if (originalTextBlock.isHeadline()) {
node = Headline.builder().documentTree(context.getDocumentTree())
.build();
node = Headline.builder().documentTree(context.getDocumentTree()).build();
} else if (originalTextBlock.isToDuplicate()) {
node = DuplicatedParagraph.builder().documentTree(context.getDocumentTree())
.build();
node = DuplicatedParagraph.builder().documentTree(context.getDocumentTree()).build();
} else {
node = Paragraph.builder().documentTree(context.getDocumentTree())
.build();
node = Paragraph.builder().documentTree(context.getDocumentTree()).build();
}
page.getMainBody().add(node);
@ -178,8 +176,7 @@ public class DocumentGraphFactory {
private void addFooter(List<TextPageBlock> textBlocks, Context context) {
Page page = context.getPage(textBlocks.get(0).getPage());
Footer footer = Footer.builder().documentTree(context.getDocumentTree())
.build();
Footer footer = Footer.builder().documentTree(context.getDocumentTree()).build();
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeTextPositionSequence(textBlocks),
footer,
context,
@ -194,8 +191,7 @@ public class DocumentGraphFactory {
public void addHeader(List<TextPageBlock> textBlocks, Context context) {
Page page = context.getPage(textBlocks.get(0).getPage());
Header header = Header.builder().documentTree(context.getDocumentTree())
.build();
Header header = Header.builder().documentTree(context.getDocumentTree()).build();
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeTextPositionSequence(textBlocks), header, 0, page);
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(header);
header.setTreeId(tocId);
@ -207,8 +203,7 @@ public class DocumentGraphFactory {
private void addEmptyFooter(int pageIndex, Context context) {
Page page = context.getPage(pageIndex);
Footer footer = Footer.builder().documentTree(context.getDocumentTree())
.build();
Footer footer = Footer.builder().documentTree(context.getDocumentTree()).build();
AtomicTextBlock textBlock = context.textBlockFactory.emptyTextBlock(footer, context, page);
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(footer);
footer.setTreeId(tocId);
@ -220,8 +215,7 @@ public class DocumentGraphFactory {
private void addEmptyHeader(int pageIndex, Context context) {
Page page = context.getPage(pageIndex);
Header header = Header.builder().documentTree(context.getDocumentTree())
.build();
Header header = Header.builder().documentTree(context.getDocumentTree()).build();
AtomicTextBlock textBlock = context.textBlockFactory.emptyTextBlock(header, 0, page);
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(header);
header.setTreeId(tocId);

View File

@ -29,19 +29,22 @@ public class SearchTextWithTextPositionFactory {
public SearchTextWithTextPositionDto buildSearchTextToTextPositionDto(List<TextPositionSequence> sequences) {
if (sequences.isEmpty() || sequences.stream().allMatch(sequence -> sequence.getTextPositions().isEmpty())) {
if (sequences.isEmpty() || sequences.stream()
.allMatch(sequence -> sequence.getTextPositions().isEmpty())) {
return SearchTextWithTextPositionDto.empty();
}
Context context = new Context();
RedTextPosition currentTextPosition = sequences.get(0).getTextPositions().get(0);
RedTextPosition previousTextPosition = RedTextPosition.builder().unicode(" ").position(currentTextPosition.getPosition()).build();
RedTextPosition currentTextPosition = sequences.get(0).getTextPositions()
.get(0);
RedTextPosition previousTextPosition = RedTextPosition.builder().unicode(" ").directionAdjustedPosition(currentTextPosition.getDirectionAdjustedPosition()).build();
for (TextPositionSequence word : sequences) {
for (int i = 0; i < word.getTextPositions().size(); ++i) {
currentTextPosition = word.getTextPositions().get(i);
currentTextPosition = word.getTextPositions()
.get(i);
if (isLineBreak(currentTextPosition, previousTextPosition)) {
removeHyphenLinebreaks(context);
context.lineBreaksStringIdx.add(context.stringIdx);
@ -57,7 +60,7 @@ public class SearchTextWithTextPositionFactory {
++context.positionIdx;
}
previousTextPosition = RedTextPosition.builder().unicode(" ").position(previousTextPosition.getPosition()).build();
previousTextPosition = RedTextPosition.builder().unicode(" ").directionAdjustedPosition(previousTextPosition.getDirectionAdjustedPosition()).build();
context.stringBuilder.append(" ");
context.stringIdxToPositionIdx.add(context.positionIdx);
++context.stringIdx;
@ -66,7 +69,7 @@ public class SearchTextWithTextPositionFactory {
assert context.stringBuilder.length() == context.stringIdxToPositionIdx.size();
List<Rectangle2D> positions = sequences.stream()
.flatMap(sequence -> sequence.getTextPositions().stream().map(textPosition -> mapRedTextPositionToInitialUserSpace(textPosition, sequence)))
.map(TextPositionSequence::getBoundingBox)
.toList();
return SearchTextWithTextPositionDto.builder()
@ -153,7 +156,7 @@ public class SearchTextWithTextPositionFactory {
return false;
}
float deltaY = Math.abs(currentPosition.getYDirAdj() - previousPosition.getYDirAdj());
double deltaY = Math.abs(currentPosition.getYDirAdj() - previousPosition.getYDirAdj());
return deltaY >= currentPosition.getHeightDir();
}
@ -167,16 +170,16 @@ public class SearchTextWithTextPositionFactory {
private boolean isHyphen(String unicodeCharacter) {
return Objects.equals(unicodeCharacter, "-") || //
Objects.equals(unicodeCharacter, "~") || //
Objects.equals(unicodeCharacter, "") || //
Objects.equals(unicodeCharacter, "") || //
Objects.equals(unicodeCharacter, "") || //
Objects.equals(unicodeCharacter, "") || //
Objects.equals(unicodeCharacter, "") || //
Objects.equals(unicodeCharacter, "") || //
Objects.equals(unicodeCharacter, "") || //
Objects.equals(unicodeCharacter, "") || //
Objects.equals(unicodeCharacter, "\u00AD");
Objects.equals(unicodeCharacter, "~") || //
Objects.equals(unicodeCharacter, "") || //
Objects.equals(unicodeCharacter, "") || //
Objects.equals(unicodeCharacter, "") || //
Objects.equals(unicodeCharacter, "") || //
Objects.equals(unicodeCharacter, "") || //
Objects.equals(unicodeCharacter, "") || //
Objects.equals(unicodeCharacter, "") || //
Objects.equals(unicodeCharacter, "") || //
Objects.equals(unicodeCharacter, "\u00AD");
}

View File

@ -82,7 +82,6 @@ public class LegacyPDFStreamEngine extends PDFStreamEngine {
private int pageRotation;
private PDRectangle pageSize;
private Matrix translateMatrix;
private final GlyphList glyphList;
private final Map<COSDictionary, Float> fontHeightMap = new WeakHashMap<COSDictionary, Float>();
@ -134,12 +133,6 @@ public class LegacyPDFStreamEngine extends PDFStreamEngine {
this.pageRotation = page.getRotation();
this.pageSize = page.getCropBox();
if (pageSize.getLowerLeftX() == 0 && pageSize.getLowerLeftY() == 0) {
translateMatrix = null;
} else {
// translation matrix for cropbox
translateMatrix = Matrix.getTranslateInstance(-pageSize.getLowerLeftX(), -pageSize.getLowerLeftY());
}
super.processPage(page);
}
@ -265,62 +258,52 @@ public class LegacyPDFStreamEngine extends PDFStreamEngine {
}
}
// adjust for cropbox if needed
Matrix translatedTextRenderingMatrix;
if (translateMatrix == null) {
translatedTextRenderingMatrix = textRenderingMatrix;
} else {
translatedTextRenderingMatrix = Matrix.concatenate(translateMatrix, textRenderingMatrix);
nextX -= pageSize.getLowerLeftX();
nextY -= pageSize.getLowerLeftY();
}
// This is a hack for unicode letter with 2 chars e.g. RA see unicodeProblem.pdf
if (unicodeMapping.length() == 2) {
processTextPosition(new TextPosition(pageRotation,
pageSize.getWidth(),
pageSize.getHeight(),
translatedTextRenderingMatrix,
nextX,
nextY,
Math.abs(dyDisplay),
dxDisplay,
Math.abs(spaceWidthDisplay),
Character.toString(unicodeMapping.charAt(0)),
new int[]{code},
font,
fontSize,
(int) (fontSize * textMatrix.getScalingFactorX())));
pageSize.getWidth(),
pageSize.getHeight(),
textRenderingMatrix,
nextX,
nextY,
Math.abs(dyDisplay),
dxDisplay,
Math.abs(spaceWidthDisplay),
Character.toString(unicodeMapping.charAt(0)),
new int[]{code},
font,
fontSize,
(int) (fontSize * textMatrix.getScalingFactorX())));
processTextPosition(new TextPosition(pageRotation,
pageSize.getWidth(),
pageSize.getHeight(),
translatedTextRenderingMatrix,
nextX,
nextY,
Math.abs(dyDisplay),
dxDisplay,
Math.abs(spaceWidthDisplay),
Character.toString(unicodeMapping.charAt(1)),
new int[]{code},
font,
fontSize,
(int) (fontSize * textMatrix.getScalingFactorX())));
pageSize.getWidth(),
pageSize.getHeight(),
textRenderingMatrix,
nextX,
nextY,
Math.abs(dyDisplay),
dxDisplay,
Math.abs(spaceWidthDisplay),
Character.toString(unicodeMapping.charAt(1)),
new int[]{code},
font,
fontSize,
(int) (fontSize * textMatrix.getScalingFactorX())));
} else {
processTextPosition(new TextPosition(pageRotation,
pageSize.getWidth(),
pageSize.getHeight(),
translatedTextRenderingMatrix,
nextX,
nextY,
Math.abs(dyDisplay),
dxDisplay,
Math.abs(spaceWidthDisplay),
unicodeMapping,
new int[]{code},
font,
fontSize,
(int) (fontSize * textMatrix.getScalingFactorX())));
pageSize.getWidth(),
pageSize.getHeight(),
textRenderingMatrix,
nextX,
nextY,
Math.abs(dyDisplay),
dxDisplay,
Math.abs(spaceWidthDisplay),
unicodeMapping,
new int[]{code},
font,
fontSize,
(int) (fontSize * textMatrix.getScalingFactorX())));
}
}

View File

@ -1007,7 +1007,7 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
/**
* Set the desired word separator for output text. The PDFBox text extraction algorithm will output a space
* character if there is enough space between two words. By default a space character is used. If you need and
* character if there is enough space between two textPositions. By default a space character is used. If you need an
* accurate count of characters that are found in a PDF document then you might want to set the word separator to
* the empty string.
*
@ -1703,7 +1703,7 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
/**
* Write a list of string containing a whole line of a document.
*
* @param line a list with the words of the given line
* @param line a list with the textPositions of the given line
* @throws IOException if something went wrong
*/
private void writeLine(List<WordWithTextPositions> line, boolean isParagraphEnd) throws IOException {
@ -1744,9 +1744,9 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
/**
* Handles the LTR and RTL direction of the given words. The whole implementation stands and falls with the given
* word. If the word is a full line, the results will be the best. If the word contains of single words or
* characters, the order of the characters in a word or words in a line may wrong, due to RTL and LTR marks and
* Handles the LTR and RTL direction of the given textPositions. The whole implementation stands and falls with the given
* word. If the word is a full line, the results will be the best. If the word consists of single textPositions or
* characters, the order of the characters in a word or textPositions in a line may be wrong, due to RTL and LTR marks and
* characters!
* <p>
* Based on http://www.nesterovsky-bros.com/weblog/2013/07/28/VisualToLogicalConversionInJava.aspx

View File

@ -70,7 +70,9 @@ public class LayoutGridService {
Visualizations layoutGrid = this.addLayoutGrid(document, layerVisibilityDefaultValue, false);
Visualizations visualLayoutGrid = this.addLayoutGrid(document, layerVisibilityDefaultValue, true);
viewerDocumentService.addVisualizationsOnPage(originFile, destinationFile, List.of(layoutGrid, visualLayoutGrid));
List<Visualizations> allVisualizations = Stream.concat(Stream.of(layoutGrid, visualLayoutGrid), document.getVisualizations().streamAll()).toList();
viewerDocumentService.addVisualizationsOnPage(originFile, destinationFile, allVisualizations);
}

View File

@ -1,12 +1,5 @@
package com.knecon.fforesight.service.layoutparser.processor.utils;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import lombok.experimental.UtilityClass;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
import org.apache.pdfbox.text.TextPosition;
import java.awt.geom.Rectangle2D;
import java.util.Collection;
import java.util.Collections;
@ -14,13 +7,24 @@ import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
import org.apache.pdfbox.text.TextPosition;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import lombok.experimental.UtilityClass;
@UtilityClass
public class MarkedContentUtils {
public static final String HEADER = "Header";
public static final String FOOTER = "Footer";
public List<Rectangle2D> getMarkedContentBboxPerLine(List<PDMarkedContent> markedContents, String subtype) {
public List<Rectangle2D> getMarkedContentBboxPerLine(List<PDMarkedContent> markedContents, String subtype, PDPage pdPage) {
if (markedContents == null) {
return Collections.emptyList();
@ -31,7 +35,8 @@ public class MarkedContentUtils {
.filter(m -> m.getProperties() != null)
.filter(m -> m.getProperties().getItem("Subtype") != null)
.filter(m -> ((COSName) m.getProperties().getItem("Subtype")).getName().equals(subtype))
.map(PDMarkedContent::getContents).flatMap(Collection::stream)
.map(PDMarkedContent::getContents)
.flatMap(Collection::stream)
.filter(t -> t instanceof TextPosition)
.map(t -> (TextPosition) t)
.filter(t -> !t.getUnicode().equals(" "))
@ -41,16 +46,77 @@ public class MarkedContentUtils {
return Collections.emptyList();
}
return markedContentByYPosition.values().stream()
.map(textPositions -> new TextPositionSequence(textPositions.stream()
.toList(), 0, true)
.getRectangle())
.map(t -> new Rectangle2D.Float(t.getTopLeft().getX(), t.getTopLeft().getY() - Math.abs(t.getHeight()), t.getWidth(), Math.abs(t.getHeight()))).collect(Collectors.toList());
return markedContentByYPosition.values()
.stream()
.map(textPositions -> new TextPositionSequence(textPositions, 0, true).getRectangle())
.map(t -> new Rectangle2D.Float(t.getTopLeft().getX(), t.getTopLeft().getY() - Math.abs(t.getHeight()), t.getWidth(), Math.abs(t.getHeight())))
.collect(Collectors.toList());
}
/**
 * Converts every marked content that has at least one content element into a {@link MarkedContentPosition}.
 *
 * @param markedContents marked contents to convert, may be {@code null}
 * @param pdPage         page that is handed on to {@code MarkedContentPosition.fromPDMarkedContent}
 * @return one entry per marked content with non-empty contents, or an empty list if {@code markedContents} is {@code null}
 */
public List<MarkedContentPosition> getMarkedContentPositions(List<PDMarkedContent> markedContents, PDPage pdPage) {

    return markedContents == null
            ? Collections.emptyList()
            : markedContents.stream()
                    .filter(candidate -> !candidate.getContents().isEmpty())
                    .map(candidate -> MarkedContentPosition.fromPDMarkedContent(candidate, pdPage))
                    .toList();
}
public boolean intersects(TextPageBlock textBlock, Map<String, List<Rectangle2D>> markedContentBboxPerType, String type) {
return markedContentBboxPerType.get(type) != null && markedContentBboxPerType.get(type).stream().anyMatch(rectangle -> rectangle.intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight()));
return markedContentBboxPerType.get(type) != null && markedContentBboxPerType.get(type)
.stream()
.anyMatch(rectangle -> rectangle.intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight()));
}
/**
 * Type information and text rectangles extracted from a single {@link PDMarkedContent}.
 *
 * @param type          tag of the marked content as returned by {@link PDMarkedContent#getTag()}, may be {@code null}
 * @param subType       name stored under the "Subtype" property, or {@code null} if no such name is available
 * @param textPositions one rectangle per non-blank {@link TextPosition} found in the marked content
 */
public record MarkedContentPosition(String type, String subType, List<Rectangle2D> textPositions) {

    /**
     * Creates a position record from the tag, the "Subtype" property and the text contents of a marked content.
     *
     * @param markedContent marked content to convert, must not be {@code null}
     * @param pdPage        page the marked content belongs to; only passed through to the rectangle extraction
     * @return the populated record
     */
    public static MarkedContentPosition fromPDMarkedContent(PDMarkedContent markedContent, PDPage pdPage) {

        return new MarkedContentPosition(markedContent.getTag(), parseSubType(markedContent), parseTextPositions(markedContent.getContents(), pdPage));
    }


    /**
     * Extracts one rectangle for every {@link TextPosition} in {@code contents} whose unicode is not a single space.
     * Elements of any other type are ignored. {@code pdPage} is currently not used by this method.
     */
    private static List<Rectangle2D> parseTextPositions(List<Object> contents, PDPage pdPage) {

        return contents.stream()
                .filter(content -> content instanceof TextPosition)
                .map(content -> (TextPosition) content)
                .filter(content -> !content.getUnicode().equals(" "))
                .map(textPositions -> new TextPositionSequence(List.of(textPositions), 0, true).getRectangle())
                .map(t -> new Rectangle2D.Float(t.getTopLeft().getX(), t.getTopLeft().getY() - Math.abs(t.getHeight()), t.getWidth(), Math.abs(t.getHeight())))
                .collect(Collectors.toList());
    }


    /**
     * Reads the "Subtype" property of the marked content.
     *
     * @return the name stored under "Subtype", or {@code null} if the marked content, its properties or the
     * entry is missing, or if the entry is not a {@link COSName}
     */
    private static String parseSubType(PDMarkedContent markedContent) {

        if (markedContent == null || markedContent.getProperties() == null) {
            return null;
        }
        // The entry can be any COS object; a blind cast would fail with a ClassCastException on non-name values.
        if (markedContent.getProperties().getItem("Subtype") instanceof COSName subTypeName) {
            return subTypeName.getName();
        }
        return null;
    }


    /**
     * Builds a display name from type and sub type.
     *
     * @return {@code type} if there is no sub type; {@code subType} alone if the type is "Artifact" or missing;
     * otherwise {@code type-subType}
     */
    public String formattedType() {

        if (subType == null || subType.isEmpty()) {
            return type;
        }
        // A marked content without a tag yields a null type; comparing it unguarded would throw a NullPointerException.
        if (type == null || type.equals("Artifact")) {
            return subType;
        }
        return String.format("%s-%s", type, subType);
    }

}
}

View File

@ -52,7 +52,10 @@ public class RectangleTransformations {
public static Rectangle2D bBoxUnionAtomicTextBlock(List<AtomicTextBlock> atomicTextBlocks) {
return atomicTextBlocks.stream().flatMap(atomicTextBlock -> atomicTextBlock.getPositions().stream()).collect(new Rectangle2DBBoxCollector());
return atomicTextBlocks.stream()
.flatMap(atomicTextBlock -> atomicTextBlock.getPositions()
.stream())
.collect(new Rectangle2DBBoxCollector());
}
@ -77,7 +80,10 @@ public class RectangleTransformations {
public static Rectangle2D atomicTextBlockBBox(List<AtomicTextBlock> atomicTextBlocks) {
return atomicTextBlocks.stream().flatMap(atomicTextBlock -> atomicTextBlock.getPositions().stream()).collect(new Rectangle2DBBoxCollector());
return atomicTextBlocks.stream()
.flatMap(atomicTextBlock -> atomicTextBlock.getPositions()
.stream())
.collect(new Rectangle2DBBoxCollector());
}
@ -89,16 +95,18 @@ public class RectangleTransformations {
public static Rectangle2D rectangleBBox(List<Rectangle> rectangles) {
return rectangles.stream().map(RectangleTransformations::toRectangle2D).collect(new Rectangle2DBBoxCollector());
return rectangles.stream()
.map(RectangleTransformations::toRectangle2D)
.collect(new Rectangle2DBBoxCollector());
}
public static Rectangle2D toRectangle2D(Rectangle redactionLogRectangle) {
return new Rectangle2D.Double(redactionLogRectangle.getTopLeft().getX(),
redactionLogRectangle.getTopLeft().getY() + redactionLogRectangle.getHeight(),
redactionLogRectangle.getWidth(),
-redactionLogRectangle.getHeight());
redactionLogRectangle.getTopLeft().getY() + redactionLogRectangle.getHeight(),
redactionLogRectangle.getWidth(),
-redactionLogRectangle.getHeight());
}
@ -111,15 +119,16 @@ public class RectangleTransformations {
public static Rectangle toRedactionLogRectangle(Rectangle2D rectangle2D, int pageNumber) {
return new Rectangle(new Point((float) rectangle2D.getMinX(), (float) (rectangle2D.getMinY() + rectangle2D.getHeight())),
(float) rectangle2D.getWidth(),
-(float) rectangle2D.getHeight(),
pageNumber);
(float) rectangle2D.getWidth(),
-(float) rectangle2D.getHeight(),
pageNumber);
}
public static Rectangle2D rectangle2DBBox(List<Rectangle2D> rectangle2DList) {
return rectangle2DList.stream().collect(new Rectangle2DBBoxCollector());
return rectangle2DList.stream()
.collect(new Rectangle2DBBoxCollector());
}
@ -134,7 +143,9 @@ public class RectangleTransformations {
if (rectangle2DList.isEmpty()) {
return Collections.emptyList();
}
double splitThreshold = rectangle2DList.stream().mapToDouble(RectangularShape::getWidth).average().orElse(5) * 5.0;
double splitThreshold = rectangle2DList.stream()
.mapToDouble(RectangularShape::getWidth).average()
.orElse(5) * 5.0;
List<List<Rectangle2D>> rectangleListsWithGaps = new LinkedList<>();
List<Rectangle2D> rectangleListWithoutGaps = new LinkedList<>();
@ -195,9 +206,9 @@ public class RectangleTransformations {
public BinaryOperator<BBox> combiner() {
return (b1, b2) -> new BBox(Math.min(b1.lowerLeftX, b2.lowerLeftX),
Math.min(b1.lowerLeftY, b2.lowerLeftY),
Math.max(b1.upperRightX, b2.upperRightX),
Math.max(b1.upperRightY, b2.upperRightY));
Math.min(b1.lowerLeftY, b2.lowerLeftY),
Math.max(b1.upperRightX, b2.upperRightX),
Math.max(b1.upperRightY, b2.upperRightY));
}

View File

@ -14,23 +14,24 @@ public class RectangularIntersectionFinder {
public static List<Rectangle2D> find(List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
// Fix for 211.pdf
for (Ruling r : horizontalRulingLines) {
if (r.getX2() < r.getX1()) {
double a = r.getX2();
r.x2 = (float) r.getX1();
r.x1 = (float) a;
}
}
// // Fix for 211.pdf
// for (Ruling r : horizontalRulingLines) {
// if (r.getX2() < r.getX1()) {
// double a = r.getX2();
// r.x2 = (float) r.getX1();
// r.x1 = (float) a;
// }
// }
List<Rectangle2D> foundRectangles = new ArrayList<>();
Map<Point2D, Ruling[]> intersectionPoints = Ruling.findIntersections(horizontalRulingLines, verticalRulingLines);
Map<Point2D, RulingIntersectionFinder.IntersectingRulings> intersectionPoints = RulingIntersectionFinder.findNaive(horizontalRulingLines, verticalRulingLines);
List<Point2D> intersectionPointsList = new ArrayList<>(intersectionPoints.keySet());
intersectionPointsList.sort(Y_FIRST_POINT_COMPARATOR);
for (int i = 0; i < intersectionPointsList.size(); i++) {
Point2D topLeft = intersectionPointsList.get(i);
Ruling[] hv = intersectionPoints.get(topLeft);
RulingIntersectionFinder.IntersectingRulings intersectingRulingsFromTopLeft = intersectionPoints.get(topLeft);
// CrossingPointsDirectlyBelow( topLeft );
List<Point2D> xPoints = new ArrayList<>();
@ -48,18 +49,19 @@ public class RectangularIntersectionFinder {
outer:
for (Point2D xPoint : xPoints) {
// is there a vertical edge b/w topLeft and xPoint?
if (!hv[1].equals(intersectionPoints.get(xPoint)[1])) {
if (!intersectingRulingsFromTopLeft.vertical().equals(intersectionPoints.get(xPoint).vertical())) {
continue;
}
for (Point2D yPoint : yPoints) {
// is there a horizontal edge b/w topLeft and yPoint ?
if (!hv[0].equals(intersectionPoints.get(yPoint)[0])) {
if (!intersectingRulingsFromTopLeft.horizontal().equals(intersectionPoints.get(yPoint).horizontal())) {
continue;
}
Point2D btmRight = new Point2D.Float((float) yPoint.getX(), (float) xPoint.getY());
if (intersectionPoints.containsKey(btmRight)
&& intersectionPoints.get(btmRight)[0].equals(intersectionPoints.get(xPoint)[0])
&& intersectionPoints.get(btmRight)[1].equals(intersectionPoints.get(yPoint)[1])) {
&& intersectionPoints.get(btmRight).horizontal().equals(intersectionPoints.get(xPoint).horizontal())
&& intersectionPoints.get(btmRight).vertical().equals(intersectionPoints.get(yPoint).vertical())) {
foundRectangles.add(new Rectangle2D.Double(topLeft.getX(), topLeft.getY(), btmRight.getX() - topLeft.getX(), btmRight.getY() - topLeft.getY()));
break outer;
}

View File

@ -0,0 +1,201 @@
package com.knecon.fforesight.service.layoutparser.processor.utils;
import java.awt.geom.Point2D;
import java.util.Collections;
import java.util.Comparator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.TreeMap;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import lombok.experimental.UtilityClass;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@UtilityClass
public class RulingIntersectionFinder {
public static final int PERPENDICULAR_UNIT_EXPAND_AMOUNT = 2;
public static final Comparator<Point2D> Y_THEN_X_POINT_COMPARATOR = Comparator.comparingDouble(Point2D::getY).thenComparing(Point2D::getX);
/**
 * Implementation to find line intersections in O(P + n log n), where n is the number of lines and P the number of intersections,
 * based on <a href="http://people.csail.mit.edu/indyk/6.838-old/handouts/lec2.pdf">Segment Intersection by Piotr Indyk</a>.
 * The algorithm assumes there are only horizontal and vertical lines which are unique in their coordinates. (E.g. no overlapping horizontal lines exist)
 * As a high level overview, the algorithm uses a sweep line advancing from left to right.
 * It dynamically updates the horizontal rulings which are intersected by the current sweep line.
 * When the sweep line hits a vertical line, it then checks for all intersections with the currently intersected horizontal rulings.
 * The trick of the algorithm is using a binary search tree to store the currently intersected horizontal rulings. This way the lookup should be in O(log n).
 * This way the initial sorting step has the highest complexity class (O(n log n)) and thus determines the complexity class of the entire algorithm.
 *
 * Unfortunately, the implementation here takes a few liberties compared to the original algorithm. The binary search tree is replaced by an ordered set which is simply looped over.
 * Therefore, in this implementation's worst case, where all horizontal lines span the entire sweep, you are essentially performing the naive approach with a bunch of overhead.
 * Since we are using this implementation to find table cells, one can expect this worst case to always be the case.
 *
 * A simple runtime comparison for a single page with the most lines we can expect (SinglePages/AbsolutelyEnormousTable.pdf with 30 horizontals and 144 verticals) shows this implementation takes roughly 14 ms, whereas the naive approach takes 7 ms. Both are negligible, but the naive approach is two times as fast.
 *
 * If we would like to make this faster, we would need a better data structure for 'TreeMap<Ruling, Void> horizontalRulingsInCurrentSweep', where we can query the TreeMap for all horizontal rulings in a given interval in O(log n).
 *
 * @param horizontals a list of non-overlapping horizontal rulings
 * @param verticals   a list of non-overlapping vertical rulings
 * @return a Map of each found intersection point pointing to the two lines forming the intersection.
 */
public Map<Point2D, IntersectingRulings> find(List<Ruling> horizontals, List<Ruling> verticals) {

    long start = System.currentTimeMillis();
    List<SweepStep> sweepTrajectory = buildSweepTrajectory(horizontals, verticals);

    // Order by top, then break ties by left and right. A comparator on top alone makes the TreeMap treat two
    // distinct rulings at the same height as one key: the second one is never stored and the exit of the
    // first one removes the only entry, so intersections found by findNaive would be missed here.
    TreeMap<Ruling, Void> horizontalRulingsInCurrentSweep = new TreeMap<>(Comparator.<Ruling>comparingDouble(Ruling::getTop)
            .thenComparingDouble(Ruling::getLeft)
            .thenComparingDouble(Ruling::getRight));
    TreeMap<Point2D, IntersectingRulings> intersections = new TreeMap<>(Y_THEN_X_POINT_COMPARATOR);

    for (SweepStep step : sweepTrajectory) {
        switch (step.type) {
            case VERTICAL: // check for intersections with currently intersected horizontal lines
                for (Ruling horizontalRuling : horizontalRulingsInCurrentSweep.navigableKeySet()) {
                    Optional<Point2D> intersectionPoint = findIntersectionPoint(horizontalRuling, step.ruling);
                    if (intersectionPoint.isEmpty()) {
                        continue;
                    }
                    intersections.put(intersectionPoint.get(), new IntersectingRulings(horizontalRuling, step.ruling));
                }
                break;
            case HORIZONTAL_ENTRY: // sweep line now intersects this horizontal ruling
                horizontalRulingsInCurrentSweep.put(step.ruling, null);
                break;
            case HORIZONTAL_EXIT: // sweep line no longer intersects this horizontal ruling
                horizontalRulingsInCurrentSweep.remove(step.ruling);
                break;
        }
    }
    log.debug("Finished building intersections with line sweep in {} ms", System.currentTimeMillis() - start);
    return intersections;
}
/**
 * Brute-force intersection search: every horizontal ruling is tested against every vertical ruling, i.e. O(n^2).
 *
 * @param horizontals a list of non-overlapping horizontal rulings
 * @param verticals   a list of non-overlapping vertical rulings
 * @return a map, sorted by y and then x, from each intersection point to the two rulings that form it
 */
public Map<Point2D, IntersectingRulings> findNaive(List<Ruling> horizontals, List<Ruling> verticals) {

    long startMillis = System.currentTimeMillis();
    TreeMap<Point2D, IntersectingRulings> found = new TreeMap<>(Y_THEN_X_POINT_COMPARATOR);

    for (Ruling h : horizontals) {
        for (Ruling v : verticals) {
            // pairs without an intersection simply contribute nothing
            findIntersectionPoint(h, v).ifPresent(point -> found.put(point, new IntersectingRulings(h, v)));
        }
    }

    log.debug("Finished building intersections naively in {} ms", System.currentTimeMillis() - startMillis);
    return found;
}
/**
 * Creates the sorted list of events the sweep line passes through.
 * Each horizontal ruling yields an entry event at its left end and an exit event at its right end, both widened
 * by {@link #PERPENDICULAR_UNIT_EXPAND_AMOUNT}; each vertical ruling yields a single event at its left coordinate.
 *
 * @return all events in the order defined by {@link SweepStep#compareTo(SweepStep)}
 */
private static List<SweepStep> buildSweepTrajectory(List<Ruling> horizontals, List<Ruling> verticals) {

    List<SweepStep> steps = new LinkedList<>();

    horizontals.forEach(horizontal -> {
        steps.add(new SweepStep(SweepStep.Type.HORIZONTAL_ENTRY, horizontal.getLeft() - PERPENDICULAR_UNIT_EXPAND_AMOUNT, horizontal));
        steps.add(new SweepStep(SweepStep.Type.HORIZONTAL_EXIT, horizontal.getRight() + PERPENDICULAR_UNIT_EXPAND_AMOUNT, horizontal));
    });
    verticals.forEach(vertical -> steps.add(new SweepStep(SweepStep.Type.VERTICAL, vertical.getLeft(), vertical)));

    Collections.sort(steps);
    return steps;
}
/**
 * Determines where a horizontal and a vertical ruling cross.
 * Both rulings are expanded by {@link #PERPENDICULAR_UNIT_EXPAND_AMOUNT} before the intersection test.
 *
 * @param horizontal the ruling expected to be horizontal
 * @param vertical   the ruling expected to be vertical
 * @return the point (left of the vertical, top of the horizontal) if the expanded rulings intersect; empty if they
 * do not intersect or if the arguments are not a horizontal/vertical pair
 */
public Optional<Point2D> findIntersectionPoint(Ruling horizontal, Ruling vertical) {

    boolean orthogonalPair = horizontal.isHorizontal() && vertical.isVertical();
    if (!orthogonalPair) {
        log.warn("lines must be orthogonal, vertical and horizontal");
        return Optional.empty();
    }

    Ruling widenedHorizontal = horizontal.expand(PERPENDICULAR_UNIT_EXPAND_AMOUNT);
    Ruling widenedVertical = vertical.expand(PERPENDICULAR_UNIT_EXPAND_AMOUNT);

    if (widenedHorizontal.intersectsLine(widenedVertical)) {
        // the reported point uses the coordinates of the original, unexpanded rulings
        return Optional.of(new Point2D.Float(vertical.getLeft(), horizontal.getTop()));
    }
    return Optional.empty();
}
/**
 * A single event on the sweep trajectory: the position at which the sweep line reaches a vertical ruling
 * or enters/leaves a horizontal ruling.
 */
private class SweepStep implements Comparable<SweepStep> {

    protected Type type;
    // coordinate at which this event happens; the trajectory builder fills it from getLeft()/getRight() of the ruling
    protected float sweepPosition;
    protected Ruling ruling;

    private enum Type {
        VERTICAL,
        HORIZONTAL_EXIT,
        HORIZONTAL_ENTRY
    }

    public SweepStep(Type type, float sweepPosition, Ruling ruling) {

        this.type = type;
        this.sweepPosition = sweepPosition;
        this.ruling = ruling;
    }


    /**
     * Orders steps by position. For positions that {@code DoubleComparisons.feq} considers equal, a horizontal entry
     * sorts before a vertical and a vertical sorts before a horizontal exit; every other combination falls back to
     * the plain numeric comparison.
     * NOTE(review): feq presumably compares with a tolerance; if so, this ordering is not strictly transitive - confirm.
     */
    @Override
    public int compareTo(SweepStep other) {

        boolean samePosition = DoubleComparisons.feq(sweepPosition, other.sweepPosition);

        if (samePosition && type == Type.VERTICAL) {
            if (other.type == Type.HORIZONTAL_ENTRY) {
                return 1;
            }
            if (other.type == Type.HORIZONTAL_EXIT) {
                return -1;
            }
        }
        if (samePosition && other.type == Type.VERTICAL) {
            if (type == Type.HORIZONTAL_ENTRY) {
                return -1;
            }
            if (type == Type.HORIZONTAL_EXIT) {
                return 1;
            }
        }
        return Double.compare(sweepPosition, other.sweepPosition);
    }

}
/**
 * The pair of rulings that form one intersection point.
 *
 * @param horizontal the horizontal ruling of the intersection
 * @param vertical   the vertical ruling of the intersection
 */
public record IntersectingRulings(Ruling horizontal, Ruling vertical) {}
}

View File

@ -0,0 +1,252 @@
package com.knecon.fforesight.service.layoutparser.processor.visualization;
import java.awt.Color;
import java.awt.geom.Line2D;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.util.Collection;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Stream;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Line;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import com.knecon.fforesight.service.viewerdoc.ContentStreams;
import com.knecon.fforesight.service.viewerdoc.model.ColoredLine;
import com.knecon.fforesight.service.viewerdoc.model.ColoredRectangle;
import com.knecon.fforesight.service.viewerdoc.model.PlacedText;
import com.knecon.fforesight.service.viewerdoc.model.Standard14EmbeddableFont;
import com.knecon.fforesight.service.viewerdoc.model.Visualizations;
import com.knecon.fforesight.service.viewerdoc.model.VisualizationsOnPage;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.NoArgsConstructor;
import lombok.Setter;
import lombok.experimental.FieldDefaults;
@Getter
@NoArgsConstructor
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class LayoutparsingVisualizations {
static final Standard14EmbeddableFont FONT = Standard14EmbeddableFont.helvetica();
static final Color WORDS_COLOR = new Color(68, 84, 147);
static final Color LINES_COLOR = new Color(152, 45, 179);
static final Color ZONES_COLOR = new Color(131, 38, 38);
static final Color RULINGS_COLOR = new Color(21, 221, 174);
static final Color CELLS_COLOR = new Color(31, 214, 27);
static final Color MAIN_BODY_COLOR = new Color(171, 131, 6);
static final Color MARKED_CONTENT_COLOR = new Color(171, 131, 6);
static final List<Color> ROTATING_CHARACTER_COLOR = List.of(new Color(255, 87, 51),
new Color(255, 195, 0),
new Color(76, 175, 80),
new Color(33, 150, 243),
new Color(155, 89, 182),
new Color(233, 30, 99),
new Color(0, 188, 212),
new Color(121, 85, 72));
@Setter
boolean active = false;
final Visualizations words = Visualizations.builder().layer(ContentStreams.WORDS).build();
final Visualizations lines = Visualizations.builder().layer(ContentStreams.LINES).build();
final Visualizations zones = Visualizations.builder().layer(ContentStreams.ZONES).build();
final Visualizations mainBody = Visualizations.builder().layer(ContentStreams.MAIN_BODY).build();
final Visualizations rulings = Visualizations.builder().layer(ContentStreams.RULINGS).build();
final Visualizations cells = Visualizations.builder().layer(ContentStreams.CELLS).build();
final Visualizations markedContent = Visualizations.builder().layer(ContentStreams.MARKED_CONTENT).build();
final Visualizations neighbours = Visualizations.builder().layer(ContentStreams.NEIGHBOURS).build();
final Visualizations characters = Visualizations.builder().layer(ContentStreams.CHARACTERS).build();
/**
 * @return all visualization layers in a fixed order (characters, neighbours, words, lines, zones, rulings, cells,
 * main body, marked content), or an empty stream while visualizations are not active
 */
public Stream<Visualizations> streamAll() {

    return active
            ? Stream.of(characters, neighbours, words, lines, zones, rulings, cells, mainBody, markedContent)
            : Stream.empty();
}
/**
 * Registers one rectangle per text position sequence on the words layer. Does nothing while inactive.
 *
 * @param textPositionSequences the sequences whose bounding boxes are drawn
 * @param pageNumber            1-based page number; stored under {@code pageNumber - 1}
 */
public void addTextVisualizations(List<TextPositionSequence> textPositionSequences, int pageNumber) {

    if (!active) {
        return;
    }

    List<ColoredRectangle> wordBoxes = textPositionSequences.stream()
            .map(sequence -> sequence.getTextPositions())
            // bounding box over the initial user space positions of all characters of the sequence
            .map(positions -> positions.stream()
                    .map(RedTextPosition::getInitialUserSpacePosition)
                    .collect(RectangleTransformations.collectBBox()))
            .map(bbox -> new ColoredRectangle(bbox, WORDS_COLOR, 1))
            .toList();

    VisualizationsOnPage onPage = VisualizationsOnPage.builder().coloredRectangles(wordBoxes).build();
    this.words.getVisualizationsOnPages().put(pageNumber - 1, onPage);
}
/**
 * Registers all horizontal and vertical clean rulings of a page as lines on the rulings layer. Does nothing while inactive.
 *
 * @param cleanRulings the rulings to draw
 * @param pageNumber   1-based page number; stored under {@code pageNumber - 1}
 */
public void addCleanRulingVisualization(CleanRulings cleanRulings, int pageNumber) {

    if (!active) {
        return;
    }

    List<ColoredLine> rulingLines = Stream.of(cleanRulings.getHorizontal(), cleanRulings.getVertical())
            .flatMap(Collection::stream)
            .map(line -> new ColoredLine(line, RULINGS_COLOR, 1))
            .toList();

    VisualizationsOnPage onPage = VisualizationsOnPage.builder().coloredLines(rulingLines).build();
    this.rulings.getVisualizationsOnPages().put(pageNumber - 1, onPage);
}
/**
 * Registers the given cell rectangles on the cells layer. Does nothing while inactive.
 *
 * @param cells      the cell rectangles to draw
 * @param pageNumber 1-based page number; stored under {@code pageNumber - 1}
 */
public void addCellVisualizations(List<? extends Rectangle2D> cells, int pageNumber) {

    if (!active) {
        return;
    }

    List<ColoredRectangle> cellBoxes = cells.stream()
            .map(cell -> new ColoredRectangle(cell, CELLS_COLOR, 1))
            .toList();

    VisualizationsOnPage onPage = VisualizationsOnPage.builder().coloredRectangles(cellBoxes).build();
    // 'cells' alone refers to the parameter here, so the layer field is addressed explicitly
    this.cells.getVisualizationsOnPages().put(pageNumber - 1, onPage);
}
/**
 * Registers the bounding box of every zone on the zones layer. Does nothing while inactive.
 *
 * @param zones the zones to draw
 * @param page  1-based page number; stored under {@code page - 1}
 */
public void addZoneVisualizations(List<Zone> zones, int page) {

    if (!active) {
        return;
    }

    List<ColoredRectangle> zoneBoxes = zones.stream()
            .map(BoundingBox::getBBox)
            .map(bbox -> new ColoredRectangle(bbox, ZONES_COLOR, 1))
            .toList();

    VisualizationsOnPage onPage = VisualizationsOnPage.builder().coloredRectangles(zoneBoxes).build();
    // 'zones' alone refers to the parameter here, so the layer field is addressed explicitly
    this.zones.getVisualizationsOnPages().put(page - 1, onPage);
}
/**
 * Registers the bounding box of every line of every zone on the lines layer. Does nothing while inactive.
 *
 * @param zones the zones whose lines are drawn
 * @param page  1-based page number; stored under {@code page - 1}
 */
public void addLineVisualizations(List<Zone> zones, int page) {

    if (!active) {
        return;
    }

    List<ColoredRectangle> lineBoxes = zones.stream()
            .flatMap(zone -> zone.getLines().stream())
            .map(BoundingBox::getBBox)
            .map(bbox -> new ColoredRectangle(bbox, LINES_COLOR, 1))
            .toList();

    VisualizationsOnPage onPage = VisualizationsOnPage.builder().coloredRectangles(lineBoxes).build();
    this.lines.getVisualizationsOnPages().put(page - 1, onPage);
}
/**
 * Registers a single rectangle on the main body layer. Does nothing while inactive.
 *
 * @param rectangle  the main body area; its top-left point, width and height are copied as-is
 * @param pageNumber 1-based page number; stored under {@code pageNumber - 1}
 */
public void addMainBodyVisualization(Rectangle rectangle, int pageNumber) {

    if (!active) {
        return;
    }

    Rectangle2D mainBodyArea = new Rectangle2D.Double(rectangle.getTopLeft().getX(),
            rectangle.getTopLeft().getY(),
            rectangle.getWidth(),
            rectangle.getHeight());
    ColoredRectangle mainBodyBox = new ColoredRectangle(mainBodyArea, MAIN_BODY_COLOR, 1);

    VisualizationsOnPage onPage = VisualizationsOnPage.builder().coloredRectangles(List.of(mainBodyBox)).build();
    this.mainBody.getVisualizationsOnPages().put(pageNumber - 1, onPage);
}
/**
 * Registers, for every marked-content section, the bounding box of its text positions plus a label with the
 * formatted type on the marked content layer. Does nothing while inactive.
 *
 * @param markedContents the marked-content sections of the page
 * @param pageNumber     1-based page number; stored under {@code pageNumber - 1}
 * @param pdPage         the page the marked contents belong to
 */
public void addMarkedContentVisualizations(List<PDMarkedContent> markedContents, int pageNumber, PDPage pdPage) {

    if (!active) {
        return;
    }

    List<MarkedContentUtils.MarkedContentPosition> positions = MarkedContentUtils.getMarkedContentPositions(markedContents, pdPage);

    VisualizationsOnPage onPage = VisualizationsOnPage.builder().build();
    this.markedContent.getVisualizationsOnPages().put(pageNumber - 1, onPage);

    for (MarkedContentUtils.MarkedContentPosition position : positions) {
        var bbox = position.textPositions()
                .stream()
                .collect(RectangleTransformations.collectBBox());

        String label = position.formattedType();
        // horizontal shift applied to the label so that it starts left of the box's x coordinate
        float labelOffset = ((FONT.getStringWidth(label) / 1000) * 10 + (2 * 1) + 4);
        Point2D labelAnchor = new Point2D.Double(bbox.getX() - labelOffset, bbox.getY() + bbox.getHeight());

        onPage.getPlacedTexts().add(PlacedText.textFacingUp(label, labelAnchor, 10, Color.BLACK, FONT));
        onPage.getColoredRectangles().add(new ColoredRectangle(bbox, MARKED_CONTENT_COLOR, 1));
    }
}
/**
 * Registers a rectangle for every character of every line of the given zones on the characters layer, and a line
 * from the character's center to the center of each of its neighbors on the neighbours layer.
 * Colors rotate through {@code ROTATING_CHARACTER_COLOR}; a character and its neighbor lines share one color.
 * Does nothing while inactive.
 *
 * @param zones the zones whose characters are drawn
 * @param page  1-based page number; stored under {@code page - 1}
 */
public void addCharactersWithNeighbours(List<Zone> zones, int page) {

    if (!active) {
        return;
    }

    VisualizationsOnPage neighbourLayer = VisualizationsOnPage.builder().build();
    neighbours.getVisualizationsOnPages().put(page - 1, neighbourLayer);

    VisualizationsOnPage characterLayer = VisualizationsOnPage.builder().build();
    characters.getVisualizationsOnPages().put(page - 1, characterLayer);

    int colorIndex = 0;
    for (Zone zone : zones) {
        for (Line line : zone.getLines()) {
            for (var character : line.getCharacters()) {
                Color color = ROTATING_CHARACTER_COLOR.get(colorIndex++ % ROTATING_CHARACTER_COLOR.size());

                Rectangle2D charBBox = character.getTextPosition().getInitialUserSpacePosition();
                characterLayer.getColoredRectangles().add(new ColoredRectangle(charBBox, color, 1));
                Point2D charCenter = new Point2D.Double(charBBox.getCenterX(), charBBox.getCenterY());

                character.getNeighbors()
                        .forEach(neighbor -> {
                            Rectangle2D neighborBBox = neighbor.getCharacter().getTextPosition().getInitialUserSpacePosition();
                            Point2D neighborCenter = new Point2D.Double(neighborBBox.getCenterX(), neighborBBox.getCenterY());
                            neighbourLayer.getColoredLines().add(new ColoredLine(new Line2D.Double(charCenter, neighborCenter), color, 1));
                        });
            }
        }
    }
}
}

View File

@ -1,10 +1,20 @@
package com.knecon.fforesight.service.layoutparser.server;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.core.io.ClassPathResource;
import com.iqser.red.storage.commons.utils.FileSystemBackedStorageService;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
@ -22,26 +32,63 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
@Test
@SneakyThrows
public void testLayoutParserEndToEnd() {
prepareStorage("files/bdr/Wie weiter bei Kristeneinrichtungen.pdf");
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER);
LayoutParsingFinishedEvent finishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest);
Arrays.stream(finishedEvent.message().split("\n"))
.forEach(log::info);
String filePath = "files/bdr/Wie weiter bei Kristeneinrichtungen.pdf";
runForFile(filePath);
}
/**
 * Manual bulk run: processes every PDF below a local folder through the layout parser.
 * Disabled by default because it depends on a developer-specific absolute path.
 */
@Test
@Disabled
@SneakyThrows
public void testLayoutParserEndToEndWithFolder() {

    String folder = "/home/kschuettler/Dokumente/TestFiles/syn-dm-testfiles-pdftron-ocred";
    List<Path> pdfFiles;
    // Files.walk holds open directory handles until the stream is closed, so it must be closed explicitly
    try (var paths = Files.walk(Path.of(folder))) {
        pdfFiles = paths.filter(path -> path.getFileName().toString().endsWith(".pdf"))
                .sorted(Comparator.comparing(Path::getFileName))
                .peek(System.out::println)
                .toList();
    }

    System.out.printf("Found %d pdf files to process %n", pdfFiles.size());

    AtomicInteger count = new AtomicInteger(0);
    pdfFiles.stream()
            // increment first so the progress reads 1/N .. N/N instead of 0/N .. (N-1)/N
            .peek(path -> log.info("{}/{}-{}", count.incrementAndGet(), pdfFiles.size(), path.getFileName()))
            .forEach(path -> runForFile(path.toFile().toString()));
}
@Test
@SneakyThrows
public void testLayoutParserEndToEnd_RED_8747() {
private void runForFile(String filePath) {
String fileName = Path.of(filePath).getFileName().toString();
File file;
if (filePath.startsWith("files")) { // from resources
file = new ClassPathResource(filePath).getFile();
} else { // absolute path
file = new File(filePath);
}
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(fileName, LayoutParsingType.REDACT_MANAGER, true);
prepareStorage(layoutParsingRequest, file);
prepareStorage("files/syngenta/CustomerFiles/SinglePages/Page26_fRR A23317A PI0015600 CEU core part B6 - CZ.pdf");
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER_OLD);
LayoutParsingFinishedEvent finishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest);
Arrays.stream(finishedEvent.message().split("\n"))
.forEach(log::info);
File tmpFile = new File("/tmp/layout-E2E/" + fileName + "_VIEWER.pdf");
assert tmpFile.getParentFile().exists() || tmpFile.getParentFile().mkdirs();
storageService.downloadTo(TENANT_ID, layoutParsingRequest.viewerDocumentStorageId(), tmpFile);
}
/**
 * Clears the file-system backed storage after every test.
 */
@AfterEach
public void cleanUpTmp() {

    FileSystemBackedStorageService fileSystemStorage = (FileSystemBackedStorageService) storageService;
    fileSystemStorage.clearStorage();
}
}

View File

@ -23,6 +23,10 @@ import lombok.SneakyThrows;
public class ViewerDocumentTest extends BuildDocumentTest {
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null);
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
@Test
@SneakyThrows
public void testViewerDocument() {
@ -31,11 +35,9 @@ public class ViewerDocumentTest extends BuildDocumentTest {
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
var documentFile = new ClassPathResource(fileName).getFile();
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null);
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
long start = System.currentTimeMillis();
Document document = buildGraph(fileName, LayoutParsingType.DOCUMINE);
Document document = buildGraph(fileName, LayoutParsingType.REDACT_MANAGER);
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true);
System.out.printf("Total time: %.2fs%n", ((float) (System.currentTimeMillis() - start)) / 1000);
}
@ -55,11 +57,11 @@ public class ViewerDocumentTest extends BuildDocumentTest {
var documentFile = new ClassPathResource(fileName).getFile();
var classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE,
documentFile,
new ImageServiceResponse(),
tableResponse,
new VisualLayoutParsingResponse(),
Map.of("file", Path.of(fileName).getFileName().toFile().toString()));
documentFile,
new ImageServiceResponse(),
tableResponse,
new VisualLayoutParsingResponse(),
Map.of("file", Path.of(fileName).getFileName().toFile().toString()));
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null);
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
Document document = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.DOCUMINE, classificationDocument);

View File

@ -1,6 +1,9 @@
package com.knecon.fforesight.service.layoutparser.server.utils;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.nio.file.Path;
import java.util.Map;
import java.util.Optional;
@ -102,29 +105,22 @@ public abstract class AbstractTest {
}
@SneakyThrows
protected LayoutParsingRequest prepareStorage(InputStream fileInputStream) {
storageService.storeObject(TenantContext.getTenantId(), ORIGIN_FILE_ID, fileInputStream);
return buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER_OLD);
}
protected LayoutParsingRequest buildDefaultLayoutParsingRequest(LayoutParsingType layoutParsingType) {
protected LayoutParsingRequest buildDefaultLayoutParsingRequest(String fileName, LayoutParsingType layoutParsingType, boolean debug) {
var identifier = debug ? Map.of("fileId", fileName, "debug", "true") : Map.of("fileId", fileName);
return LayoutParsingRequest.builder()
.identifier(Map.of("fileId", "1337"))
.identifier(identifier)
.layoutParsingType(layoutParsingType)
.originFileStorageId(ORIGIN_FILE_ID)
.tablesFileStorageId(Optional.of(TABLE_FILE_ID))
.imagesFileStorageId(Optional.of(IMAGE_FILE_ID))
.visualLayoutParsingFileId(Optional.of(VISUAL_LAYOUT_FILE))
.structureFileStorageId(STRUCTURE_FILE_ID)
.textBlockFileStorageId(TEXT_FILE_ID)
.positionBlockFileStorageId(POSITION_FILE_ID)
.pageFileStorageId(PAGES_FILE_ID)
.simplifiedTextStorageId(SIMPLIFIED_ID)
.viewerDocumentStorageId(VIEWER_DOCUMENT_ID)
.originFileStorageId(fileName + ORIGIN_FILE_ID)
.tablesFileStorageId(Optional.of(fileName + TABLE_FILE_ID))
.imagesFileStorageId(Optional.of(fileName + IMAGE_FILE_ID))
.visualLayoutParsingFileId(Optional.of(fileName + VISUAL_LAYOUT_FILE))
.structureFileStorageId(fileName + STRUCTURE_FILE_ID)
.textBlockFileStorageId(fileName + TEXT_FILE_ID)
.positionBlockFileStorageId(fileName + POSITION_FILE_ID)
.pageFileStorageId(fileName + PAGES_FILE_ID)
.simplifiedTextStorageId(fileName + SIMPLIFIED_ID)
.viewerDocumentStorageId(fileName + VIEWER_DOCUMENT_ID)
.build();
}
@ -148,10 +144,28 @@ public abstract class AbstractTest {
ClassPathResource imageInfoFileResource = new ClassPathResource(imageInfoFile);
ClassPathResource visualLayoutParsingResponseResource = new ClassPathResource(visualLayoutParsingResponseFile);
return prepareStorage(pdfFileResource.getInputStream(),
cvServiceResponseFileResource.getInputStream(),
imageInfoFileResource.getInputStream(),
visualLayoutParsingResponseResource.getInputStream());
return prepareStorage(Path.of(file).getFileName().toString(),
pdfFileResource.getInputStream(),
cvServiceResponseFileResource.getInputStream(),
imageInfoFileResource.getInputStream(),
visualLayoutParsingResponseResource.getInputStream());
}
/**
 * Stores the given file as origin file for the request, together with the empty default responses of the
 * table, image and visual layout parsing services taken from the test resources.
 *
 * @param layoutParsingRequest the request whose storage ids determine where each object is stored
 * @param file                 the PDF to store as origin file
 */
@SneakyThrows
protected void prepareStorage(LayoutParsingRequest layoutParsingRequest, File file) {

    ClassPathResource cvServiceResponseFileResource = new ClassPathResource("cv_table_parsing_response/empty.json");
    ClassPathResource imageInfoFileResource = new ClassPathResource("image_service_response/empty.json");
    ClassPathResource visualLayoutParsingResponseResource = new ClassPathResource("visual_layout_parsing_response/empty.json");

    // every stream opened here is closed here; previously only the origin file stream was closed
    try (var in = new FileInputStream(file);
            var cvServiceResponseStream = cvServiceResponseFileResource.getInputStream();
            var imageInfoStream = imageInfoFileResource.getInputStream();
            var visualLayoutParsingResponseStream = visualLayoutParsingResponseResource.getInputStream()) {
        prepareStorage(layoutParsingRequest, in, cvServiceResponseStream, imageInfoStream, visualLayoutParsingResponseStream);
    }
}
@ -162,12 +176,27 @@ public abstract class AbstractTest {
storageService.storeObject(TenantContext.getTenantId(), TABLE_FILE_ID, cvServiceResponseFileStream);
storageService.storeObject(TenantContext.getTenantId(), ORIGIN_FILE_ID, fileStream);
return buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER_OLD);
return buildDefaultLayoutParsingRequest("test", LayoutParsingType.REDACT_MANAGER_OLD, true);
}
@SneakyThrows
protected LayoutParsingRequest prepareStorage(InputStream fileStream,
protected void prepareStorage(LayoutParsingRequest layoutParsingRequest,
InputStream fileStream,
InputStream cvServiceResponseFileStream,
InputStream imageInfoStream,
InputStream visualLayoutParsingResponseFileStream) {
storageService.storeObject(TenantContext.getTenantId(), layoutParsingRequest.imagesFileStorageId().get(), imageInfoStream);
storageService.storeObject(TenantContext.getTenantId(), layoutParsingRequest.tablesFileStorageId().get(), cvServiceResponseFileStream);
storageService.storeObject(TenantContext.getTenantId(), layoutParsingRequest.originFileStorageId(), fileStream);
storageService.storeObject(TenantContext.getTenantId(), layoutParsingRequest.visualLayoutParsingFileId().get(), visualLayoutParsingResponseFileStream);
}
@SneakyThrows
protected LayoutParsingRequest prepareStorage(String fileName,
InputStream fileStream,
InputStream cvServiceResponseFileStream,
InputStream imageInfoStream,
InputStream visualLayoutParsingResponseFileStream) {
@ -177,7 +206,7 @@ public abstract class AbstractTest {
storageService.storeObject(TenantContext.getTenantId(), ORIGIN_FILE_ID, fileStream);
storageService.storeObject(TenantContext.getTenantId(), VISUAL_LAYOUT_FILE, visualLayoutParsingResponseFileStream);
return buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER_OLD);
return buildDefaultLayoutParsingRequest(fileName, LayoutParsingType.REDACT_MANAGER_OLD, true);
}

View File

@ -1,11 +1,13 @@
package com.knecon.fforesight.service.layoutparser.server.utils;
import java.io.File;
import java.nio.file.Path;
import java.util.Map;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.core.io.ClassPathResource;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
@ -28,11 +30,11 @@ public abstract class BuildDocumentTest extends AbstractTest {
File fileResource = new ClassPathResource(filename).getFile();
prepareStorage(filename);
return layoutParsingPipeline.parseLayout(layoutParsingType,
fileResource,
layoutParsingStorageService.getImagesFile(IMAGE_FILE_ID),
new TableServiceResponse(),
new VisualLayoutParsingResponse(),
Map.of("file",filename));
fileResource,
layoutParsingStorageService.getImagesFile(IMAGE_FILE_ID),
new TableServiceResponse(),
new VisualLayoutParsingResponse(),
Map.of("file", filename, "debug", "true"));
}
@ -46,13 +48,25 @@ public abstract class BuildDocumentTest extends AbstractTest {
/**
 * Builds the document graph for {@code filename} via {@code DocumentGraphFactory.buildDocumentGraph}.
 * <p>
 * Names starting with "/" are opened directly with {@code new File(filename)}; every other
 * name is handed to {@code prepareStorage(filename, ...)} and {@code parseLayout(filename, layoutParsingType)}.
 *
 * @param filename          file name; NOTE(review): non-"/" names are presumably classpath resources — confirm
 * @param layoutParsingType parsing type passed to the layout parser and to the graph factory
 * @return the document graph built from the parsed layout
 */
@SneakyThrows
protected Document buildGraph(String filename, LayoutParsingType layoutParsingType) {
if (filename.equals("files/syngenta/CustomerFiles/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf")) {
prepareStorage(filename, "cv_table_parsing_response/empty.json", "image_service_response/S-Metolachlor_RAR_01_Volume_1_2018-09-06.IMAGE_INFO.json");
// NOTE(review): a name that starts with "/" can never start with "files", so the first
// test is redundant; startsWith("/") alone decides this branch.
if (!filename.startsWith("files") && filename.startsWith("/")) {
// NOTE(review): the request is built with LayoutParsingType.REDACT_MANAGER, while the parse
// below uses the layoutParsingType argument — confirm the difference is intended.
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(Path.of(filename).getFileName().toString(), LayoutParsingType.REDACT_MANAGER, true);
prepareStorage(layoutParsingRequest, new File(filename));
return DocumentGraphFactory.buildDocumentGraph(layoutParsingType,
layoutParsingPipeline.parseLayout(layoutParsingType,
new File(filename),
layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId().get()),
new TableServiceResponse(),
new VisualLayoutParsingResponse(),
layoutParsingRequest.identifier()));
} else {
prepareStorage(filename);
// This one document is stored with its own image-service response file instead of the default.
if (filename.equals("files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf")) {
prepareStorage(filename, "cv_table_parsing_response/empty.json", "image_service_response/S-Metolachlor_RAR_01_Volume_1_2018-09-06.IMAGE_INFO.json");
} else {
prepareStorage(filename);
}
return DocumentGraphFactory.buildDocumentGraph(layoutParsingType, parseLayout(filename, layoutParsingType));
}
return DocumentGraphFactory.buildDocumentGraph(layoutParsingType, parseLayout(filename, layoutParsingType));
}
}

View File

@ -26,6 +26,23 @@ public class ContentStreams {
// Identifier arguments are (display name, COS name, optionalContent flag).

// Not flagged as optional content. The display name previously read "escape start",
// which did not match the constant or its COS name.
// NOTE(review): presumably the counterpart of ESCAPE_START — confirm nothing looks this identifier up by its old display name.
public static Identifier ESCAPE_END = new Identifier("escape end", COSName.getPDFName("ESCAPE_END"), false);

// The identifiers below are all flagged as optional content (third argument true).
// NOTE(review): presumably debug overlays of intermediate layout-parsing results — confirm.
public static Identifier RULINGS = new Identifier("Rulings", COSName.getPDFName("KNECON_RULINGS"), true);
public static Identifier WORDS = new Identifier("Words", COSName.getPDFName("KNECON_WORDS"), true);
public static Identifier ZONES = new Identifier("Text Zones", COSName.getPDFName("KNECON_ZONES"), true);
public static Identifier LINES = new Identifier("Text Lines", COSName.getPDFName("KNECON_LINES"), true);
public static Identifier CELLS = new Identifier("Cells", COSName.getPDFName("KNECON_CELLS"), true);
public static Identifier MAIN_BODY = new Identifier("Main Text Body", COSName.getPDFName("KNECON_MAIN_BODY"), true);
public static Identifier MARKED_CONTENT = new Identifier("Marked content", COSName.getPDFName("KNECON_MARKED_CONTENT"), true);
public static Identifier NEIGHBOURS = new Identifier("Neighbours", COSName.getPDFName("KNECON_NEIGHBOURS"), true);
public static Identifier CHARACTERS = new Identifier("Characters", COSName.getPDFName("KNECON_CHARACTERS"), true);
public static List<Identifier> allContentStreams = List.of(KNECON_LAYOUT,
KNECON_VISUAL_PARSING,
KNECON_OCR,
@ -33,7 +50,16 @@ public class ContentStreams {
KNECON_OCR_TEXT_DEBUG,
OTHER,
ESCAPE_START,
ESCAPE_END);
ESCAPE_END,
RULINGS,
WORDS,
ZONES,
LINES,
MAIN_BODY,
MARKED_CONTENT,
NEIGHBOURS,
CHARACTERS,
CELLS);
public record Identifier(String name, COSName cosName, boolean optionalContent) {

View File

@ -1,5 +1,6 @@
package com.knecon.fforesight.service.viewerdoc.model;
import java.util.LinkedHashMap;
import java.util.Map;
import com.knecon.fforesight.service.viewerdoc.ContentStreams;
@ -17,7 +18,8 @@ import lombok.experimental.FieldDefaults;
// Bundles the visualizations of one layer: the layer identifier, the per-page
// visualizations, and a default-visibility flag for the layer.
public class Visualizations {
// Layer these visualizations belong to.
ContentStreams.Identifier layer;
Map<Integer, VisualizationsOnPage> visualizationsOnPages;
// Per-page visualizations, keyed by an Integer (presumably the page number or index — TODO confirm).
// @Builder.Default makes the Lombok builder use this initializer when the field is not set,
// so the map starts as an empty LinkedHashMap.
@Builder.Default
Map<Integer, VisualizationsOnPage> visualizationsOnPages = new LinkedHashMap<>();
// NOTE(review): presumably whether the layer is visible by default — confirm against the service that reads it.
boolean layerVisibilityDefaultValue;
}

View File

@ -53,12 +53,6 @@ public class ViewerDocumentService {
private final ObservationRegistry registry;
/**
 * Convenience overload for a single layer: wraps {@code visualizations} in a
 * one-element list and delegates to the list-based overload.
 *
 * @param originFile      passed through unchanged to the list-based overload
 * @param destinationFile passed through unchanged to the list-based overload
 * @param visualizations  the single layer to add; must not be {@code null} ({@code List.of} rejects null elements)
 */
public void addVisualizationsOnPage(File originFile, File destinationFile, Visualizations visualizations) {

    List<Visualizations> singleLayer = List.of(visualizations);
    addVisualizationsOnPage(originFile, destinationFile, singleLayer);
}
@Observed(name = "ViewerDocumentService", contextualName = "add-visualizations")
@SneakyThrows
public void addVisualizationsOnPage(File originFile, File destinationFile, List<Visualizations> visualizations) {
@ -70,9 +64,14 @@ public class ViewerDocumentService {
PDDocument pdDocument = openPDDocument(tmpFile.toFile());
enrichObservation(pdDocument, visualizations.stream().map(Visualizations::getLayer).toList());
enrichObservation(pdDocument,
visualizations.stream()
.map(Visualizations::getLayer)
.toList());
Set<ContentStreams.Identifier> allLayers = visualizations.stream().map(Visualizations::getLayer).collect(Collectors.toUnmodifiableSet());
Set<ContentStreams.Identifier> allLayers = visualizations.stream()
.map(Visualizations::getLayer)
.collect(Collectors.toUnmodifiableSet());
Map<ContentStreams.Identifier, PDOptionalContentGroup> optionalContentGroupMap = addLayersToDocument(visualizations, pdDocument);
@ -186,7 +185,8 @@ public class ViewerDocumentService {
contentStream.setFont(font, placedText.fontSize());
contentStream.beginText();
contentStream.setNonStrokingColor(placedText.color());
if (placedText.renderingMode().isPresent()) {
if (placedText.renderingMode()
.isPresent()) {
contentStream.setRenderingMode(placedText.renderingMode().get());
} else {
contentStream.setRenderingMode(RenderingMode.FILL);
@ -229,11 +229,11 @@ public class ViewerDocumentService {
Matrix textMatrix;
if (placedText.textMatrix().isEmpty()) {
textMatrix = new Matrix((float) textDeRotationMatrix.getScaleX(),
(float) textDeRotationMatrix.getShearX(),
(float) textDeRotationMatrix.getShearY(),
(float) textDeRotationMatrix.getScaleY(),
(float) placedText.lineStart().getX(),
(float) placedText.lineStart().getY());
(float) textDeRotationMatrix.getShearX(),
(float) textDeRotationMatrix.getShearY(),
(float) textDeRotationMatrix.getScaleY(),
(float) placedText.lineStart().getX(),
(float) placedText.lineStart().getY());
} else {
textMatrix = placedText.textMatrix().get();
}