RED-8825: improve layout parsing

* added improved debugging capabilities to viewer-doc
* refactored coordinates (wip)
* refactored line intersection algorithm
* removed cropbox correction from pdfbox text positions
This commit is contained in:
Kilian Schuettler 2024-04-17 17:26:04 +02:00
parent 6fb1a0bef3
commit 3dd215288a
34 changed files with 1147 additions and 599 deletions

View File

@ -101,29 +101,33 @@ public class LayoutParsingPipeline {
log.info("Starting layout parsing for {}", layoutParsingRequest.identifier()); log.info("Starting layout parsing for {}", layoutParsingRequest.identifier());
File originFile = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId()); File originFile = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId());
File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId()).orElse(originFile); File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId())
.orElse(originFile);
VisualLayoutParsingResponse visualLayoutParsingResponse = new VisualLayoutParsingResponse(); VisualLayoutParsingResponse visualLayoutParsingResponse = new VisualLayoutParsingResponse();
if (layoutParsingRequest.visualLayoutParsingFileId().isPresent()) { if (layoutParsingRequest.visualLayoutParsingFileId()
.isPresent()) {
visualLayoutParsingResponse = layoutParsingStorageService.getVisualLayoutParsingFile(layoutParsingRequest.visualLayoutParsingFileId().get()); visualLayoutParsingResponse = layoutParsingStorageService.getVisualLayoutParsingFile(layoutParsingRequest.visualLayoutParsingFileId().get());
} }
ImageServiceResponse imageServiceResponse = new ImageServiceResponse(); ImageServiceResponse imageServiceResponse = new ImageServiceResponse();
if (layoutParsingRequest.imagesFileStorageId().isPresent()) { if (layoutParsingRequest.imagesFileStorageId()
.isPresent()) {
imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId().get()); imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId().get());
} }
TableServiceResponse tableServiceResponse = new TableServiceResponse(); TableServiceResponse tableServiceResponse = new TableServiceResponse();
if (layoutParsingRequest.tablesFileStorageId().isPresent()) { if (layoutParsingRequest.tablesFileStorageId()
.isPresent()) {
tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId().get()); tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId().get());
} }
ClassificationDocument classificationDocument = parseLayout(layoutParsingRequest.layoutParsingType(), ClassificationDocument classificationDocument = parseLayout(layoutParsingRequest.layoutParsingType(),
originFile, originFile,
imageServiceResponse, imageServiceResponse,
tableServiceResponse, tableServiceResponse,
visualLayoutParsingResponse, visualLayoutParsingResponse,
layoutParsingRequest.identifier()); layoutParsingRequest.identifier());
log.info("Building document graph for {}", layoutParsingRequest.identifier()); log.info("Building document graph for {}", layoutParsingRequest.identifier());
@ -155,25 +159,25 @@ public class LayoutParsingPipeline {
.numberOfPages(documentGraph.getNumberOfPages()) .numberOfPages(documentGraph.getNumberOfPages())
.duration(System.currentTimeMillis() - start) .duration(System.currentTimeMillis() - start)
.message(format(""" .message(format("""
Layout parsing has finished in %.02f s. Layout parsing has finished in %.02f s.
identifiers: %s identifiers: %s
%s %s
Files have been saved with Ids: Files have been saved with Ids:
Structure: %s Structure: %s
Text: %s Text: %s
Positions: %s Positions: %s
PageData: %s PageData: %s
Simplified Text: %s Simplified Text: %s
Viewer Doc: %s""", Viewer Doc: %s""",
((float) (System.currentTimeMillis() - start)) / 1000, ((float) (System.currentTimeMillis() - start)) / 1000,
layoutParsingRequest.identifier(), layoutParsingRequest.identifier(),
buildSemanticNodeCountMessage(documentGraph.getNumberOfPages(), documentGraph.buildSemanticNodeCounts()), buildSemanticNodeCountMessage(documentGraph.getNumberOfPages(), documentGraph.buildSemanticNodeCounts()),
layoutParsingRequest.structureFileStorageId(), layoutParsingRequest.structureFileStorageId(),
layoutParsingRequest.textBlockFileStorageId(), layoutParsingRequest.textBlockFileStorageId(),
layoutParsingRequest.positionBlockFileStorageId(), layoutParsingRequest.positionBlockFileStorageId(),
layoutParsingRequest.pageFileStorageId(), layoutParsingRequest.pageFileStorageId(),
layoutParsingRequest.simplifiedTextStorageId(), layoutParsingRequest.simplifiedTextStorageId(),
layoutParsingRequest.viewerDocumentStorageId())) layoutParsingRequest.viewerDocumentStorageId()))
.build(); .build();
} }
@ -194,14 +198,14 @@ public class LayoutParsingPipeline {
private String buildSemanticNodeCountMessage(int numberOfPages, Map<NodeType, Long> semanticNodeCounts) { private String buildSemanticNodeCountMessage(int numberOfPages, Map<NodeType, Long> semanticNodeCounts) {
return String.format("%d pages with %d sections, %d headlines, %d paragraphs, %d tables with %d cells, %d headers, and %d footers parsed", return String.format("%d pages with %d sections, %d headlines, %d paragraphs, %d tables with %d cells, %d headers, and %d footers parsed",
numberOfPages, numberOfPages,
semanticNodeCounts.get(NodeType.SECTION) == null ? 0 : semanticNodeCounts.get(NodeType.SECTION), semanticNodeCounts.get(NodeType.SECTION) == null ? 0 : semanticNodeCounts.get(NodeType.SECTION),
semanticNodeCounts.get(NodeType.HEADLINE) == null ? 0 : semanticNodeCounts.get(NodeType.HEADLINE), semanticNodeCounts.get(NodeType.HEADLINE) == null ? 0 : semanticNodeCounts.get(NodeType.HEADLINE),
semanticNodeCounts.get(NodeType.PARAGRAPH) == null ? 0 : semanticNodeCounts.get(NodeType.PARAGRAPH), semanticNodeCounts.get(NodeType.PARAGRAPH) == null ? 0 : semanticNodeCounts.get(NodeType.PARAGRAPH),
semanticNodeCounts.get(NodeType.TABLE) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE), semanticNodeCounts.get(NodeType.TABLE) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE),
semanticNodeCounts.get(NodeType.TABLE_CELL) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE_CELL), semanticNodeCounts.get(NodeType.TABLE_CELL) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE_CELL),
semanticNodeCounts.get(NodeType.HEADER) == null ? 0 : semanticNodeCounts.get(NodeType.HEADER), semanticNodeCounts.get(NodeType.HEADER) == null ? 0 : semanticNodeCounts.get(NodeType.HEADER),
semanticNodeCounts.get(NodeType.FOOTER) == null ? 0 : semanticNodeCounts.get(NodeType.FOOTER)); semanticNodeCounts.get(NodeType.FOOTER) == null ? 0 : semanticNodeCounts.get(NodeType.FOOTER));
} }
@ -220,6 +224,9 @@ public class LayoutParsingPipeline {
Map<Integer, List<ClassifiedImage>> pdfImages = imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse); Map<Integer, List<ClassifiedImage>> pdfImages = imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse);
Map<Integer, List<ClassifiedImage>> signatures = visualLayoutParsingAdapter.buildExtractedSignaturesPerPage(visualLayoutParsingResponse); Map<Integer, List<ClassifiedImage>> signatures = visualLayoutParsingAdapter.buildExtractedSignaturesPerPage(visualLayoutParsingResponse);
ClassificationDocument classificationDocument = new ClassificationDocument(); ClassificationDocument classificationDocument = new ClassificationDocument();
classificationDocument.getVisualizations().setActive(identifier.containsKey("debug"));
List<ClassificationPage> classificationPages = new ArrayList<>(); List<ClassificationPage> classificationPages = new ArrayList<>();
long pageCount = originDocument.getNumberOfPages(); long pageCount = originDocument.getNumberOfPages();
@ -249,6 +256,8 @@ public class LayoutParsingPipeline {
} }
stripper.getText(originDocument); stripper.getText(originDocument);
classificationDocument.getVisualizations().addTextVisualizations(stripper.getTextPositionSequences(), pageNumber);
PDRectangle pdr = pdPage.getMediaBox(); PDRectangle pdr = pdPage.getMediaBox();
int rotation = pdPage.getRotation(); int rotation = pdPage.getRotation();
@ -257,6 +266,8 @@ public class LayoutParsingPipeline {
PDRectangle cropbox = pdPage.getCropBox(); PDRectangle cropbox = pdPage.getCropBox();
CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(pdfTableCells.get(pageNumber), stripper.getRulings()); CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(pdfTableCells.get(pageNumber), stripper.getRulings());
classificationDocument.getVisualizations().addCleanRulingVisualization(cleanRulings, pageNumber);
List<Cell> emptyTableCells = TableExtractionService.findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical()); List<Cell> emptyTableCells = TableExtractionService.findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical());
var graphics = graphicExtractorService.extractPathElementGraphics(originDocument, var graphics = graphicExtractorService.extractPathElementGraphics(originDocument,
@ -272,11 +283,16 @@ public class LayoutParsingPipeline {
.map(g -> new ClassifiedImage(new Rectangle2D.Double(g.x1, g.y1, g.width(), g.height()), ImageType.GRAPHIC, false, stripper.getPageNumber())) .map(g -> new ClassifiedImage(new Rectangle2D.Double(g.x1, g.y1, g.width(), g.height()), ImageType.GRAPHIC, false, stripper.getPageNumber()))
.toList()); .toList());
classificationDocument.getVisualizations().addCellVisualizations(emptyTableCells, pageNumber);
ClassificationPage classificationPage = switch (layoutParsingType) { ClassificationPage classificationPage = switch (layoutParsingType) {
case REDACT_MANAGER_OLD -> redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells); case REDACT_MANAGER_OLD ->
redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells);
case DOCUMINE -> docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical()); case DOCUMINE -> docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
case REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, true); case REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG ->
case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, false); docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, true, classificationDocument.getVisualizations());
case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG ->
docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, false, classificationDocument.getVisualizations());
}; };
classificationPage.setCleanRulings(cleanRulings); classificationPage.setCleanRulings(cleanRulings);
@ -286,8 +302,9 @@ public class LayoutParsingPipeline {
classificationPage.setPageWidth(cropbox.getWidth()); classificationPage.setPageWidth(cropbox.getWidth());
classificationPage.setPageHeight(cropbox.getHeight()); classificationPage.setPageHeight(cropbox.getHeight());
classificationDocument.getVisualizations().addMarkedContentVisualizations(stripper.getMarkedContents(), pageNumber, pdPage);
// MarkedContent needs to be converted at this point, otherwise it leads to GC Problems in Pdfbox. // MarkedContent needs to be converted at this point, otherwise it leads to GC Problems in Pdfbox.
classificationPage.setMarkedContentBboxPerType(convertMarkedContents(stripper.getMarkedContents())); classificationPage.setMarkedContentBboxPerType(convertMarkedContents(stripper.getMarkedContents(), pdPage));
// Whether an image is OCR needs to be calculated before textBlocks are moved into tables; otherwise the findOcr algorithm needs to be adapted. // Whether an image is OCR needs to be calculated before textBlocks are moved into tables; otherwise the findOcr algorithm needs to be adapted.
if (pdfImages != null && pdfImages.containsKey(pageNumber)) { if (pdfImages != null && pdfImages.containsKey(pageNumber)) {
@ -361,11 +378,11 @@ public class LayoutParsingPipeline {
} }
private Map<String, List<Rectangle2D>> convertMarkedContents(List<PDMarkedContent> pdMarkedContents) { private Map<String, List<Rectangle2D>> convertMarkedContents(List<PDMarkedContent> pdMarkedContents, PDPage pdPage) {
Map<String, List<Rectangle2D>> markedContentBboxes = new HashMap<>(); Map<String, List<Rectangle2D>> markedContentBboxes = new HashMap<>();
markedContentBboxes.put(MarkedContentUtils.HEADER, MarkedContentUtils.getMarkedContentBboxPerLine(pdMarkedContents, MarkedContentUtils.HEADER)); markedContentBboxes.put(MarkedContentUtils.HEADER, MarkedContentUtils.getMarkedContentBboxPerLine(pdMarkedContents, MarkedContentUtils.HEADER, pdPage));
markedContentBboxes.put(MarkedContentUtils.FOOTER, MarkedContentUtils.getMarkedContentBboxPerLine(pdMarkedContents, MarkedContentUtils.FOOTER)); markedContentBboxes.put(MarkedContentUtils.FOOTER, MarkedContentUtils.getMarkedContentBboxPerLine(pdMarkedContents, MarkedContentUtils.FOOTER, pdPage));
return markedContentBboxes; return markedContentBboxes;
} }

View File

@ -7,12 +7,14 @@ import java.util.stream.Collectors;
import org.springframework.stereotype.Service; import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Character; import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Character;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Line;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone; import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.LineBuilderService; import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.LineBuilderService;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.NearestNeighbourService; import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.NearestNeighbourService;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.ReadingOrderService; import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.ReadingOrderService;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.SpacingService; import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.SpacingService;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.ZoneBuilderService; import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.ZoneBuilderService;
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
@ -43,16 +45,16 @@ public class DocstrumSegmentationService {
private List<Zone> computeZones(List<TextPositionSequence> textPositions, TextDirection direction) { private List<Zone> computeZones(List<TextPositionSequence> textPositions, TextDirection direction) {
var positions = textPositions.stream().filter(t -> t.getDir() == direction).map(TextPositionSequence::getTextPositions).flatMap(List::stream).toList(); List<RedTextPosition> positions = textPositions.stream().filter(t -> t.getDir() == direction).map(TextPositionSequence::getTextPositions).flatMap(List::stream).toList();
var characters = positions.stream().map(Character::new).collect(Collectors.toList()); List<Character> characters = positions.stream().map(Character::new).collect(Collectors.toList());
nearestNeighbourService.findNearestNeighbors(characters); nearestNeighbourService.findNearestNeighbors(characters);
var characterSpacing = spacingService.computeCharacterSpacing(characters); double characterSpacing = spacingService.computeCharacterSpacing(characters);
var lineSpacing = Math.min(spacingService.computeLineSpacing(characters), 20); double lineSpacing = Math.min(spacingService.computeLineSpacing(characters), 20);
var lines = lineBuilderService.buildLines(characters, characterSpacing, lineSpacing); List<Line> lines = lineBuilderService.buildLines(characters, characterSpacing, lineSpacing);
return zoneBuilderService.buildZones(lines, characterSpacing, lineSpacing); return zoneBuilderService.buildZones(lines, characterSpacing, lineSpacing);
} }

View File

@ -27,8 +27,8 @@ public class Character {
public Character(RedTextPosition chunk) { public Character(RedTextPosition chunk) {
this.x = chunk.getXDirAdj() + chunk.getWidthDirAdj() / 2; this.x = chunk.getDirectionAdjustedPosition().getCenterX();
this.y = chunk.getYDirAdj() + chunk.getHeightDir() / 2; this.y = chunk.getDirectionAdjustedPosition().getCenterY();
this.textPosition = chunk; this.textPosition = chunk;
} }
@ -82,5 +82,4 @@ public class Character {
return FastAtan2.fastAtan2(character.getY() - getY(), character.getX() - getX()); return FastAtan2.fastAtan2(character.getY() - getY(), character.getX() - getX());
} }
} }
} }

View File

@ -1,11 +1,12 @@
package com.knecon.fforesight.service.layoutparser.processor.docstrum.model; package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
import java.util.List; import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import lombok.Data; import lombok.Data;
import lombok.EqualsAndHashCode; import lombok.EqualsAndHashCode;
@ -84,7 +85,9 @@ public class Line extends BoundingBox {
private double computeHeight() { private double computeHeight() {
return characters.stream().map(Character::getHeight).reduce(0d, Double::sum) / characters.size(); return characters.stream()
.map(Character::getHeight)
.reduce(0d, Double::sum) / characters.size();
} }
@ -116,7 +119,7 @@ public class Line extends BoundingBox {
double ym = (y0 + y1) / 2; double ym = (y0 + y1) / 2;
double yn = (other.y0 + other.y1) / 2; double yn = (other.y0 + other.y1) / 2;
return Math.abs(ym - yn) / Math.sqrt(1); return Math.abs(ym - yn);
} }
@ -141,21 +144,10 @@ public class Line extends BoundingBox {
private void buildBBox() { private void buildBBox() {
double minX = Double.POSITIVE_INFINITY; this.setBBox(characters.stream()
double minY = Double.POSITIVE_INFINITY; .map(Character::getTextPosition)
double maxX = Double.NEGATIVE_INFINITY; .map(RedTextPosition::getInitialUserSpacePosition)
double maxY = Double.NEGATIVE_INFINITY; .collect(RectangleTransformations.collectBBox()));
for (Character character : characters) {
minX = Math.min(minX, character.getTextPosition().getXDirAdj());
minY = Math.min(minY, character.getTextPosition().getYDirAdj());
maxX = Math.max(maxX, character.getTextPosition().getXDirAdj() + character.getTextPosition().getWidthDirAdj());
maxY = Math.max(maxY, character.getTextPosition().getYDirAdj() + character.getTextPosition().getHeightDir());
}
this.setBBox(new Rectangle2D.Double(minX, minY, maxX - minX, maxY - minY));
} }

View File

@ -1,9 +1,10 @@
package com.knecon.fforesight.service.layoutparser.processor.docstrum.model; package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;
import java.awt.geom.Rectangle2D;
import java.util.Comparator; import java.util.Comparator;
import java.util.List; import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import lombok.Data; import lombok.Data;
@Data @Data
@ -23,21 +24,9 @@ public class Zone extends BoundingBox {
public void buildBBox() { public void buildBBox() {
double minX = Double.POSITIVE_INFINITY; this.setBBox(getLines().stream()
double minY = Double.POSITIVE_INFINITY; .map(BoundingBox::getBBox)
double maxX = Double.NEGATIVE_INFINITY; .collect(RectangleTransformations.collectBBox()));
double maxY = Double.NEGATIVE_INFINITY;
for (Line line : lines) {
minX = Math.min(minX, line.getX());
minY = Math.min(minY, line.getY());
maxX = Math.max(maxX, line.getX() + line.getWidth());
maxY = Math.max(maxY, line.getY() + line.getHeight());
}
this.setBBox(new Rectangle2D.Double(minX, minY, maxX - minX, maxY - minY));
} }

View File

@ -1,6 +1,5 @@
package com.knecon.fforesight.service.layoutparser.processor.docstrum.service; package com.knecon.fforesight.service.layoutparser.processor.docstrum.service;
import java.util.ArrayList;
import java.util.Comparator; import java.util.Comparator;
import java.util.HashSet; import java.util.HashSet;
import java.util.List; import java.util.List;
@ -30,24 +29,25 @@ public class LineBuilderService {
AngleFilter filter = new AngleFilter(-ANGLE_TOLERANCE, ANGLE_TOLERANCE); AngleFilter filter = new AngleFilter(-ANGLE_TOLERANCE, ANGLE_TOLERANCE);
characters.forEach(character -> { characters.forEach(character -> {
character.getNeighbors().forEach(neighbor -> { character.getNeighbors()
double x = neighbor.getHorizontalDistance() / maxHorizontalDistance; .forEach(neighbor -> {
double y = neighbor.getVerticalDistance() / maxVerticalDistance; double normalizedHorizontalDistance = neighbor.getHorizontalDistance() / maxHorizontalDistance;
if (character.getTextPosition().getDir() == neighbor.getCharacter().getTextPosition().getDir() && filter.matches(neighbor) && Math.pow(x, 2) + Math.pow(y, double normalizedVerticalDistance = neighbor.getVerticalDistance() / maxVerticalDistance;
2) <= 1) { if (character.getTextPosition().getDir() == neighbor.getCharacter().getTextPosition().getDir() //
unionFind.union(character, neighbor.getCharacter()); && filter.matches(neighbor) //
} && Math.pow(normalizedHorizontalDistance, 2) + Math.pow(normalizedVerticalDistance, 2) <= 1) {
}); unionFind.union(character, neighbor.getCharacter());
}
});
}); });
List<Line> lines = new ArrayList<>(); return unionFind.getGroups()
unionFind.getGroups().forEach(group -> { .stream()
List<Character> lineCharacters = new ArrayList<>(group); .map(lineCharacters -> lineCharacters.stream()
lineCharacters.sort(Comparator.comparingDouble(Character::getX)); .sorted(Comparator.comparingDouble(Character::getX))
lines.add(new Line(lineCharacters, characterSpacing)); .toList())
}); .map(lineCharacters -> new Line(lineCharacters, characterSpacing))
.toList();
return lines;
} }
} }

View File

@ -45,29 +45,35 @@ public class ZoneBuilderService {
double meanHeight = calculateMeanHeight(lines); double meanHeight = calculateMeanHeight(lines);
lines.forEach(outerLine -> // lines.forEach(outerLine -> {
lines.forEach(innerLine -> { lines.forEach(innerLine -> {
double scale = Math.min(outerLine.getHeight(), innerLine.getHeight()) / meanHeight; if (innerLine == outerLine //
scale = Math.max(MIN_LINE_SIZE_SCALE, Math.min(scale, MAX_LINE_SIZE_SCALE)); || unionFind.inSameSet(outerLine, innerLine)//
|| outerLine.angularDifference(innerLine) > ANGLE_TOLERANCE) {
return;
}
if (!unionFind.inSameSet(outerLine, innerLine) && outerLine.angularDifference(innerLine) <= ANGLE_TOLERANCE) { double scale = Math.min(outerLine.getHeight(), innerLine.getHeight()) / meanHeight;
scale = Math.max(MIN_LINE_SIZE_SCALE, Math.min(scale, MAX_LINE_SIZE_SCALE));
double horizontalDistance = outerLine.horizontalDistance(innerLine) / scale; double horizontalDistance = outerLine.horizontalDistance(innerLine) / scale;
double verticalDistance = outerLine.verticalDistance(innerLine) / scale; double verticalDistance = outerLine.verticalDistance(innerLine) / scale;
if (minHorizontalDistance <= horizontalDistance && verticalDistance <= maxVerticalDistance // if (minHorizontalDistance <= horizontalDistance && verticalDistance <= maxVerticalDistance //
|| minHorizontalMergeDistance <= horizontalDistance && verticalDistance <= maxVerticalMergeDistance) { || minHorizontalMergeDistance <= horizontalDistance && verticalDistance <= maxVerticalMergeDistance) {
unionFind.union(outerLine, innerLine);
}
}
}));
List<Zone> zones = new ArrayList<>(); unionFind.union(outerLine, innerLine);
unionFind.getGroups().forEach(group -> { }
zones.add(mergeLinesInZone(new ArrayList<>(group), characterSpacing, lineSpacing));
});
}); });
List<Zone> zones = unionFind.getGroups()
.stream()
.map(group -> mergeLinesInZone(new ArrayList<>(group), characterSpacing, lineSpacing))
.toList();
if (zones.size() > MAX_ZONES) { if (zones.size() > MAX_ZONES) {
List<Line> oneZoneLines = new ArrayList<>(); List<Line> oneZoneLines = new ArrayList<>();
for (Zone zone : zones) { for (Zone zone : zones) {
@ -103,35 +109,40 @@ public class ZoneBuilderService {
UnionFind<Line> unionFind = new UnionFind<>(new HashSet<>(lines)); UnionFind<Line> unionFind = new UnionFind<>(new HashSet<>(lines));
lines.forEach(outer -> { lines.forEach(outer -> {
lines.forEach(inner -> { lines.forEach(inner -> {
if (inner != outer) { if (inner == outer) {
return;
}
double horizontalDistance = outer.horizontalDistance(inner); double horizontalDistance = outer.horizontalDistance(inner);
double verticalDistance = outer.verticalDistance(inner); double verticalDistance = outer.verticalDistance(inner);
if (horizontalDistance <= maxHorizontalDistance && minVerticalDistance <= verticalDistance && verticalDistance <= maxVerticalDistance) { if (horizontalDistance <= maxHorizontalDistance && minVerticalDistance <= verticalDistance && verticalDistance <= maxVerticalDistance) {
unionFind.union(outer, inner);
} else if (minVerticalDistance <= verticalDistance && verticalDistance <= maxVerticalDistance && Math.abs(horizontalDistance - Math.min(outer.getLength(), unionFind.union(outer, inner);
inner.getLength())) < 0.1) {
boolean characterOverlap = false; } else if (minVerticalDistance <= verticalDistance
int overlappingCount = 0; && verticalDistance <= maxVerticalDistance
for (Character outerCharacter : outer.getCharacters()) { && Math.abs(horizontalDistance - Math.min(outer.getLength(), inner.getLength())) < 0.1) {
for (Character innerCharacter : inner.getCharacters()) {
double characterOverlapDistance = outerCharacter.overlappingDistance(innerCharacter); boolean characterOverlap = false;
if (characterOverlapDistance > 2) { int overlappingCount = 0;
characterOverlap = true; for (Character outerCharacter : outer.getCharacters()) {
} for (Character innerCharacter : inner.getCharacters()) {
if (characterOverlapDistance > 0) { double characterOverlapDistance = outerCharacter.overlappingDistance(innerCharacter);
overlappingCount++; if (characterOverlapDistance > 2) {
} characterOverlap = true;
}
if (characterOverlapDistance > 0) {
overlappingCount++;
} }
} }
if (!characterOverlap && overlappingCount <= 2) { }
unionFind.union(outer, inner); if (!characterOverlap && overlappingCount <= 2) {
} unionFind.union(outer, inner);
} }
} }
}); });
}); });

View File

@ -5,6 +5,7 @@ import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter; import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.model.text.UnclassifiedText; import com.knecon.fforesight.service.layoutparser.processor.model.text.UnclassifiedText;
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations;
import lombok.Data; import lombok.Data;
import lombok.NoArgsConstructor; import lombok.NoArgsConstructor;
@ -22,6 +23,7 @@ public class ClassificationDocument {
private FloatFrequencyCounter fontSizeCounter = new FloatFrequencyCounter(); private FloatFrequencyCounter fontSizeCounter = new FloatFrequencyCounter();
private StringFrequencyCounter fontCounter = new StringFrequencyCounter(); private StringFrequencyCounter fontCounter = new StringFrequencyCounter();
private StringFrequencyCounter fontStyleCounter = new StringFrequencyCounter(); private StringFrequencyCounter fontStyleCounter = new StringFrequencyCounter();
private LayoutparsingVisualizations visualizations = new LayoutparsingVisualizations();
private boolean headlines; private boolean headlines;
private long rulesVersion; private long rulesVersion;

View File

@ -12,6 +12,7 @@ import java.util.stream.Stream;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree; import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity; import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock; import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
@ -40,6 +41,8 @@ public class Document implements GenericSemanticNode {
@Builder.Default @Builder.Default
Set<RedactionEntity> entities = new HashSet<>(); Set<RedactionEntity> entities = new HashSet<>();
LayoutparsingVisualizations visualizations;
@Override @Override
public NodeType getType() { public NodeType getType() {

View File

@ -4,12 +4,8 @@ import java.awt.geom.Line2D;
import java.awt.geom.Point2D; import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D; import java.awt.geom.Rectangle2D;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.Formatter; import java.util.Formatter;
import java.util.List; import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import com.knecon.fforesight.service.layoutparser.processor.utils.CohenSutherlandClipping; import com.knecon.fforesight.service.layoutparser.processor.utils.CohenSutherlandClipping;
import com.knecon.fforesight.service.layoutparser.processor.utils.DoubleComparisons; import com.knecon.fforesight.service.layoutparser.processor.utils.DoubleComparisons;
@ -60,126 +56,13 @@ public class Ruling extends Line2D.Float {
} }
// log(n) implementation of find_intersections public boolean isVertical() {
// based on http://people.csail.mit.edu/indyk/6.838-old/handouts/lec2.pdf
public static Map<Point2D, Ruling[]> findIntersections(List<Ruling> horizontals, List<Ruling> verticals) {
class SortObject {
protected SOType type;
protected float position;
protected Ruling ruling;
public SortObject(SOType type, float position, Ruling ruling) {
this.type = type;
this.position = position;
this.ruling = ruling;
}
}
List<SortObject> sos = new ArrayList<>();
TreeMap<Ruling, Boolean> tree = new TreeMap<>(new Comparator<Ruling>() {
@Override
public int compare(Ruling o1, Ruling o2) {
return java.lang.Double.compare(o1.getTop(), o2.getTop());
}
});
TreeMap<Point2D, Ruling[]> rv = new TreeMap<>(new Comparator<Point2D>() {
@Override
public int compare(Point2D o1, Point2D o2) {
if (o1.getY() > o2.getY()) {
return 1;
}
if (o1.getY() < o2.getY()) {
return -1;
}
if (o1.getX() > o2.getX()) {
return 1;
}
if (o1.getX() < o2.getX()) {
return -1;
}
return 0;
}
});
for (Ruling h : horizontals) {
sos.add(new SortObject(SOType.HLEFT, h.getLeft() - PERPENDICULAR_UNIT_EXPAND_AMOUNT, h));
sos.add(new SortObject(SOType.HRIGHT, h.getRight() + PERPENDICULAR_UNIT_EXPAND_AMOUNT, h));
}
for (Ruling v : verticals) {
sos.add(new SortObject(SOType.VERTICAL, v.getLeft(), v));
}
Collections.sort(sos, new Comparator<SortObject>() {
@Override
public int compare(SortObject a, SortObject b) {
int rv;
if (DoubleComparisons.feq(a.position, b.position)) {
if (a.type == SOType.VERTICAL && b.type == SOType.HLEFT) {
rv = 1;
} else if (a.type == SOType.VERTICAL && b.type == SOType.HRIGHT) {
rv = -1;
} else if (a.type == SOType.HLEFT && b.type == SOType.VERTICAL) {
rv = -1;
} else if (a.type == SOType.HRIGHT && b.type == SOType.VERTICAL) {
rv = 1;
} else {
rv = java.lang.Double.compare(a.position, b.position);
}
} else {
return java.lang.Double.compare(a.position, b.position);
}
return rv;
}
});
for (SortObject so : sos) {
switch (so.type) {
case VERTICAL:
for (Map.Entry<Ruling, Boolean> h : tree.entrySet()) {
try {
Point2D i = h.getKey().intersectionPoint(so.ruling);
if (i == null) {
continue;
}
rv.put(i, new Ruling[]{h.getKey().expand(PERPENDICULAR_UNIT_EXPAND_AMOUNT), so.ruling.expand(PERPENDICULAR_UNIT_EXPAND_AMOUNT)});
} catch (UnsupportedOperationException e) {
log.info("Some line are oblique, ignoring...");
continue;
}
}
break;
case HRIGHT:
tree.remove(so.ruling);
break;
case HLEFT:
tree.put(so.ruling, true);
break;
}
}
return rv;
}
public boolean vertical() {
return this.length() > 0 && DoubleComparisons.feq(this.x1, this.x2); //diff < ORIENTATION_CHECK_THRESHOLD; return this.length() > 0 && DoubleComparisons.feq(this.x1, this.x2); //diff < ORIENTATION_CHECK_THRESHOLD;
} }
public boolean horizontal() { public boolean isHorizontal() {
return this.length() > 0 && DoubleComparisons.feq(this.y1, this.y2); //diff < ORIENTATION_CHECK_THRESHOLD; return this.length() > 0 && DoubleComparisons.feq(this.y1, this.y2); //diff < ORIENTATION_CHECK_THRESHOLD;
} }
@ -188,36 +71,36 @@ public class Ruling extends Line2D.Float {
// these are used to have a single collapse method (in page, currently) // these are used to have a single collapse method (in page, currently)
public boolean oblique() { public boolean isOblique() {
return !(this.vertical() || this.horizontal()); return !(this.isVertical() || this.isHorizontal());
} }
public float getPosition() { public float getPosition() {
if (this.oblique()) { if (this.isOblique()) {
throw new UnsupportedOperationException(); throw new UnsupportedOperationException();
} }
return this.vertical() ? this.getLeft() : this.getTop(); return this.isVertical() ? this.getLeft() : this.getTop();
} }
public float getStart() { public float getStart() {
if (this.oblique()) { if (this.isOblique()) {
throw new UnsupportedOperationException(); throw new UnsupportedOperationException();
} }
return this.vertical() ? this.getTop() : this.getLeft(); return this.isVertical() ? this.getTop() : this.getLeft();
} }
public void setStart(float v) { public void setStart(float v) {
if (this.oblique()) { if (this.isOblique()) {
throw new UnsupportedOperationException(); throw new UnsupportedOperationException();
} }
if (this.vertical()) { if (this.isVertical()) {
this.setTop(v); this.setTop(v);
} else { } else {
this.setLeft(v); this.setLeft(v);
@ -227,19 +110,19 @@ public class Ruling extends Line2D.Float {
public float getEnd() { public float getEnd() {
if (this.oblique()) { if (this.isOblique()) {
throw new UnsupportedOperationException(); throw new UnsupportedOperationException();
} }
return this.vertical() ? this.getBottom() : this.getRight(); return this.isVertical() ? this.getBottom() : this.getRight();
} }
public void setEnd(float v) { public void setEnd(float v) {
if (this.oblique()) { if (this.isOblique()) {
throw new UnsupportedOperationException(); throw new UnsupportedOperationException();
} }
if (this.vertical()) { if (this.isVertical()) {
this.setBottom(v); this.setBottom(v);
} else { } else {
this.setRight(v); this.setRight(v);
@ -249,10 +132,10 @@ public class Ruling extends Line2D.Float {
public void setStartEnd(float start, float end) { public void setStartEnd(float start, float end) {
if (this.oblique()) { if (this.isOblique()) {
throw new UnsupportedOperationException(); throw new UnsupportedOperationException();
} }
if (this.vertical()) { if (this.isVertical()) {
this.setTop(start); this.setTop(start);
this.setBottom(end); this.setBottom(end);
} else { } else {
@ -264,7 +147,7 @@ public class Ruling extends Line2D.Float {
public boolean perpendicularTo(Ruling other) { public boolean perpendicularTo(Ruling other) {
return this.vertical() == other.horizontal(); return this.isVertical() == other.isHorizontal();
} }
@ -318,30 +201,6 @@ public class Ruling extends Line2D.Float {
} }
public Point2D intersectionPoint(Ruling other) {
Ruling this_l = this.expand(PERPENDICULAR_UNIT_EXPAND_AMOUNT);
Ruling other_l = other.expand(PERPENDICULAR_UNIT_EXPAND_AMOUNT);
Ruling horizontal, vertical;
if (!this_l.intersectsLine(other_l)) {
return null;
}
if (this_l.horizontal() && other_l.vertical()) {
horizontal = this_l;
vertical = other_l;
} else if (this_l.vertical() && other_l.horizontal()) {
vertical = this_l;
horizontal = other_l;
} else {
log.warn("lines must be orthogonal, vertical and horizontal");
return null;
}
return new Point2D.Float(vertical.getLeft(), horizontal.getTop());
}
@Override @Override
public boolean equals(Object other) { public boolean equals(Object other) {
@ -451,16 +310,9 @@ public class Ruling extends Line2D.Float {
final float TOLERANCE = 1; final float TOLERANCE = 1;
return Math.abs(ruling.getX1() - x1) < TOLERANCE &&// return Math.abs(ruling.getX1() - x1) < TOLERANCE &&//
Math.abs(ruling.getY1() - y1) < TOLERANCE &&// Math.abs(ruling.getY1() - y1) < TOLERANCE &&//
Math.abs(ruling.getX2() - x2) < TOLERANCE &&// Math.abs(ruling.getX2() - x2) < TOLERANCE &&//
Math.abs(ruling.getY2() - y2) < TOLERANCE; Math.abs(ruling.getY2() - y2) < TOLERANCE;
}
private enum SOType {
VERTICAL,
HRIGHT,
HLEFT
} }
} }

View File

@ -1,5 +1,8 @@
package com.knecon.fforesight.service.layoutparser.processor.model.text; package com.knecon.fforesight.service.layoutparser.processor.model.text;
import java.awt.geom.AffineTransform;
import java.awt.geom.Rectangle2D;
import org.apache.pdfbox.text.TextPosition; import org.apache.pdfbox.text.TextPosition;
import com.fasterxml.jackson.annotation.JsonIgnore; import com.fasterxml.jackson.annotation.JsonIgnore;
@ -16,7 +19,9 @@ import lombok.SneakyThrows;
@AllArgsConstructor @AllArgsConstructor
public class RedTextPosition { public class RedTextPosition {
private float[] position; private final static int HEIGHT_PADDING = 2;
private Rectangle2D.Float directionAdjustedPosition;
private Rectangle2D initialUserSpacePosition;
@JsonIgnore @JsonIgnore
private int rotation; private int rotation;
@ -58,43 +63,65 @@ public class RedTextPosition {
pos.setFontSizeInPt(textPosition.getFontSizeInPt()); pos.setFontSizeInPt(textPosition.getFontSizeInPt());
pos.setFontName(textPosition.getFont().getName()); pos.setFontName(textPosition.getFont().getName());
var position = new float[4]; float textHeight = textPosition.getHeight() + HEIGHT_PADDING;
Rectangle2D.Float dirAdjPosition = new Rectangle2D.Float(textPosition.getXDirAdj(),
textPosition.getYDirAdj() - textHeight,
textPosition.getWidthDirAdj(),
textHeight + HEIGHT_PADDING);
pos.setDirectionAdjustedPosition(dirAdjPosition);
position[0] = textPosition.getXDirAdj(); AffineTransform affineTransform = getRotationMatrix(TextDirection.fromDegrees(textPosition.getDir()), textPosition.getPageWidth(), textPosition.getPageHeight());
position[1] = textPosition.getYDirAdj(); Rectangle2D initialUserSpacePositionRect = affineTransform.createTransformedShape(dirAdjPosition).getBounds2D();
position[2] = textPosition.getWidthDirAdj();
position[3] = textPosition.getHeightDir(); pos.setInitialUserSpacePosition(initialUserSpacePositionRect);
pos.setPosition(position);
return pos; return pos;
} }
private static AffineTransform getRotationMatrix(TextDirection textDirection, float pageWidth, float pageHeight) {
AffineTransform transform = new AffineTransform();
if (textDirection == TextDirection.ZERO || textDirection == TextDirection.HALF_CIRCLE) {
transform.rotate(textDirection.getRadians(), pageWidth / 2f, pageHeight / 2f);
transform.translate(0f, pageHeight);
} else if (textDirection == TextDirection.QUARTER_CIRCLE) {
transform.rotate(textDirection.getRadians(), pageWidth / 2f, pageWidth / 2f);
transform.translate(0f, pageWidth);
} else {
transform.rotate(textDirection.getRadians(), pageHeight / 2f, pageHeight / 2f);
transform.translate(0f, pageWidth);
}
transform.scale(1., -1.);
return transform;
}
@JsonIgnore @JsonIgnore
public float getXDirAdj() { public float getXDirAdj() {
return position[0]; return this.directionAdjustedPosition.x;
} }
@JsonIgnore @JsonIgnore
public float getYDirAdj() { public float getYDirAdj() {
return position[1]; return this.directionAdjustedPosition.y;
} }
@JsonIgnore @JsonIgnore
public float getWidthDirAdj() { public float getWidthDirAdj() {
return position[2]; return this.directionAdjustedPosition.width;
} }
@JsonIgnore @JsonIgnore
public float getHeightDir() { public float getHeightDir() {
return position[3]; return this.directionAdjustedPosition.height;
} }
} }

View File

@ -2,6 +2,7 @@ package com.knecon.fforesight.service.layoutparser.processor.model.text;
import java.awt.geom.AffineTransform; import java.awt.geom.AffineTransform;
import java.awt.geom.Point2D; import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
import java.util.Locale; import java.util.Locale;
@ -11,6 +12,7 @@ import org.apache.pdfbox.text.TextPosition;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point; import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle; import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import lombok.AllArgsConstructor; import lombok.AllArgsConstructor;
import lombok.Builder; import lombok.Builder;
@ -44,21 +46,19 @@ public class TextPositionSequence implements CharSequence {
private boolean isParagraphStart; private boolean isParagraphStart;
public TextPositionSequence(int page) { public TextPositionSequence(List<TextPosition> textPositions, int pageNumber, boolean isParagraphStart) {
this.page = page; this.textPositions = textPositions.stream()
} .map(RedTextPosition::fromTextPosition)
.collect(Collectors.toList());
this.page = pageNumber;
public TextPositionSequence(List<TextPosition> textPositions, int page, boolean isParagraphStart) {
this.textPositions = textPositions.stream().map(RedTextPosition::fromTextPosition).collect(Collectors.toList());
this.page = page;
this.dir = TextDirection.fromDegrees(textPositions.get(0).getDir()); this.dir = TextDirection.fromDegrees(textPositions.get(0).getDir());
this.rotation = textPositions.get(0).getRotation(); this.rotation = textPositions.get(0).getRotation();
this.pageHeight = textPositions.get(0).getPageHeight(); this.pageHeight = textPositions.get(0).getPageHeight();
this.pageWidth = textPositions.get(0).getPageWidth(); this.pageWidth = textPositions.get(0).getPageWidth();
this.isParagraphStart = isParagraphStart; this.isParagraphStart = isParagraphStart;
} }
@ -314,10 +314,18 @@ public class TextPositionSequence implements CharSequence {
topRight = transform.transform(topRight, null); topRight = transform.transform(topRight, null);
return new Rectangle( // return new Rectangle( //
new Point((float) bottomLeft.getX(), (float) bottomLeft.getY()), new Point((float) bottomLeft.getX(), (float) bottomLeft.getY()),
(float) (topRight.getX() - bottomLeft.getX()), (float) (topRight.getX() - bottomLeft.getX()),
(float) (topRight.getY() - bottomLeft.getY()), (float) (topRight.getY() - bottomLeft.getY()),
page); page);
}
public Rectangle2D getBoundingBox() {
return getTextPositions().stream()
.map(RedTextPosition::getInitialUserSpacePosition)
.collect(RectangleTransformations.collectBBox());
} }
} }

View File

@ -33,6 +33,7 @@ public class BodyTextFrameService {
for (ClassificationPage page : classificationDocument.getPages()) { for (ClassificationPage page : classificationDocument.getPages()) {
// var updatedBodyTextFrame = getBodyTextFrameFromRulings(page, bodyTextFrame, landscapeBodyTextFrame); // var updatedBodyTextFrame = getBodyTextFrameFromRulings(page, bodyTextFrame, landscapeBodyTextFrame);
setBodyTextFrameAdjustedToPage(page, bodyTextFrame, landscapeBodyTextFrame); setBodyTextFrameAdjustedToPage(page, bodyTextFrame, landscapeBodyTextFrame);
classificationDocument.getVisualizations().addMainBodyVisualization(page.isLandscape() ? landscapeBodyTextFrame : bodyTextFrame, page.getPageNumber());
} }
} }

View File

@ -122,7 +122,7 @@ public class RulingCleaningService {
h = ruling.y1 - ruling.y2; h = ruling.y1 - ruling.y2;
} }
if (ruling.horizontal()) { if (ruling.isHorizontal()) {
return new Rectangle(top - THRESHOLD_Y_HORIZONTAL, left - THRESHOLD_X_HORIZONTAL, w + 2 * THRESHOLD_X_HORIZONTAL, h + 2 * THRESHOLD_Y_HORIZONTAL); return new Rectangle(top - THRESHOLD_Y_HORIZONTAL, left - THRESHOLD_X_HORIZONTAL, w + 2 * THRESHOLD_X_HORIZONTAL, h + 2 * THRESHOLD_Y_HORIZONTAL);
} else { } else {
return new Rectangle(top - THRESHOLD_Y_VERTICAL, left - THRESHOLD_X_VERTICAL, w + 2 * THRESHOLD_X_VERTICAL, h + 2 * THRESHOLD_Y_VERTICAL); return new Rectangle(top - THRESHOLD_Y_VERTICAL, left - THRESHOLD_X_VERTICAL, w + 2 * THRESHOLD_X_VERTICAL, h + 2 * THRESHOLD_Y_VERTICAL);
@ -160,14 +160,14 @@ public class RulingCleaningService {
List<Ruling> vrs = new ArrayList<>(); List<Ruling> vrs = new ArrayList<>();
for (Ruling vr : rulings) { for (Ruling vr : rulings) {
if (vr.vertical()) { if (vr.isVertical()) {
vrs.add(vr); vrs.add(vr);
} }
} }
List<Ruling> hrs = new ArrayList<>(); List<Ruling> hrs = new ArrayList<>();
for (Ruling hr : rulings) { for (Ruling hr : rulings) {
if (hr.horizontal()) { if (hr.isHorizontal()) {
hrs.add(hr); hrs.add(hr);
} }
} }

View File

@ -15,6 +15,7 @@ import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.Doubl
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter; import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
@ -37,11 +38,18 @@ public class DocstrumBlockificationService {
static final float THRESHOLD = 1f; static final float THRESHOLD = 1f;
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Cell> cells, boolean xyOrder) { public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Cell> cells, boolean xyOrder, LayoutparsingVisualizations visualizations) {
CleanRulings usedRulings = RectangleTransformations.extractRulings(cells); CleanRulings usedRulings = RectangleTransformations.extractRulings(cells);
var zones = docstrumSegmentationService.segmentPage(textPositions, xyOrder); var zones = docstrumSegmentationService.segmentPage(textPositions, xyOrder);
if (!textPositions.isEmpty()) {
visualizations.addZoneVisualizations(zones, textPositions.get(0).getPage());
visualizations.addLineVisualizations(zones, textPositions.get(0).getPage());
visualizations.addCharactersWithNeighbours(zones, textPositions.get(0).getPage());
}
var pageBlocks = toAbstractPageBlocks(zones, usedRulings.getHorizontal(), usedRulings.getVertical(), xyOrder); var pageBlocks = toAbstractPageBlocks(zones, usedRulings.getHorizontal(), usedRulings.getVertical(), xyOrder);
var classificationPage = new ClassificationPage(pageBlocks); var classificationPage = new ClassificationPage(pageBlocks);
@ -58,18 +66,20 @@ public class DocstrumBlockificationService {
zones.forEach(zone -> { zones.forEach(zone -> {
List<TextPositionSequence> textPositionSequences = new ArrayList<>(); List<TextPositionSequence> textPositionSequences = new ArrayList<>();
zone.getLines().forEach(line -> { zone.getLines()
line.getWords().forEach(word -> { .forEach(line -> {
textPositionSequences.add(new TextPositionSequence(word.getTextPositions(), word.getPage())); line.getWords()
}); .forEach(word -> {
}); textPositionSequences.add(new TextPositionSequence(word.getTextPositions(), word.getPage()));
});
});
abstractPageBlocks.addAll(splitZonesAtRulings(textPositionSequences, horizontalRulings, verticalRulings)); abstractPageBlocks.addAll(splitZonesAtRulings(textPositionSequences, horizontalRulings, verticalRulings));
}); });
if (xyOrder) { if (xyOrder) {
abstractPageBlocks.sort(Comparator.comparing(AbstractPageBlock::getMinY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)) abstractPageBlocks.sort(Comparator.comparing(AbstractPageBlock::getMinY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
.thenComparing(AbstractPageBlock::getMinX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))); .thenComparing(AbstractPageBlock::getMinX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
abstractPageBlocks.sort(new Comparator<AbstractPageBlock>() { abstractPageBlocks.sort(new Comparator<AbstractPageBlock>() {
@Override @Override
public int compare(AbstractPageBlock o1, AbstractPageBlock o2) { public int compare(AbstractPageBlock o1, AbstractPageBlock o2) {
@ -134,8 +144,8 @@ public class DocstrumBlockificationService {
private boolean isOnlyIntersectingYAndOnelineOrPrevoiusTwoLines(TextPageBlock previous, TextPageBlock current, ClassificationPage page) { private boolean isOnlyIntersectingYAndOnelineOrPrevoiusTwoLines(TextPageBlock previous, TextPageBlock current, ClassificationPage page) {
return current.intersectsY(previous) // return current.intersectsY(previous) //
&& (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1 || previous.getNumberOfLines() == 2 && current.getNumberOfLines() == 1) // && (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1 || previous.getNumberOfLines() == 2 && current.getNumberOfLines() == 1) //
&& numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, page.getTextBlocks()) <= 0; && numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, page.getTextBlocks()) <= 0;
} }
@ -144,16 +154,16 @@ public class DocstrumBlockificationService {
ClassificationPage page) { ClassificationPage page) {
return (Math.abs(previous.getMaxY() - current.getMaxY()) < THRESHOLD || Math.abs(previous.getMinY() - current.getMinY()) < THRESHOLD) // return (Math.abs(previous.getMaxY() - current.getMaxY()) < THRESHOLD || Math.abs(previous.getMinY() - current.getMinY()) < THRESHOLD) //
&& (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1 || previous.getNumberOfLines() >= 2 && current.getNumberOfLines() == 1) // && (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1 || previous.getNumberOfLines() >= 2 && current.getNumberOfLines() == 1) //
&& !hasBetween(current, previous, page.getTextBlocks()) && numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, page.getTextBlocks()) <= 4; && !hasBetween(current, previous, page.getTextBlocks()) && numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, page.getTextBlocks()) <= 4;
} }
private boolean areTheOnlyTwoBlocksOnHeightsWithBothMoreThanTwoLines(TextPageBlock previous, TextPageBlock current, ClassificationPage page) { private boolean areTheOnlyTwoBlocksOnHeightsWithBothMoreThanTwoLines(TextPageBlock previous, TextPageBlock current, ClassificationPage page) {
return previous.getNumberOfLines() >= 2 && current.getNumberOfLines() >= 2 // return previous.getNumberOfLines() >= 2 && current.getNumberOfLines() >= 2 //
&& previous.intersectsY(current) // && previous.intersectsY(current) //
&& numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, page.getTextBlocks()) == 0; && numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, page.getTextBlocks()) == 0;
} }
@ -213,7 +223,7 @@ public class DocstrumBlockificationService {
ListIterator<AbstractPageBlock> itty = blocks.listIterator(); ListIterator<AbstractPageBlock> itty = blocks.listIterator();
while (itty.hasNext()) { while (itty.hasNext()) {
AbstractPageBlock block = itty.next(); AbstractPageBlock block = itty.next();
if(block == null){ if (block == null) {
continue; continue;
} }
if (block instanceof TablePageBlock) { if (block instanceof TablePageBlock) {
@ -224,7 +234,7 @@ public class DocstrumBlockificationService {
for (int i = 0; i < blocks.size(); i++) { for (int i = 0; i < blocks.size(); i++) {
if(blocks.get(i) == null){ if (blocks.get(i) == null) {
continue; continue;
} }
if (blocks.get(i) == current) { if (blocks.get(i) == current) {
@ -249,8 +259,8 @@ public class DocstrumBlockificationService {
} }
} }
var blocksIterator = blocks.iterator(); var blocksIterator = blocks.iterator();
while(blocksIterator.hasNext()){ while (blocksIterator.hasNext()) {
if(blocksIterator.next() == null){ if (blocksIterator.next() == null) {
blocksIterator.remove(); blocksIterator.remove();
} }
} }
@ -338,11 +348,11 @@ public class DocstrumBlockificationService {
if (textBlock == null) { if (textBlock == null) {
textBlock = new TextPageBlock(wordBlock.getMinXDirAdj(), textBlock = new TextPageBlock(wordBlock.getMinXDirAdj(),
wordBlock.getMaxXDirAdj(), wordBlock.getMaxXDirAdj(),
wordBlock.getMinYDirAdj(), wordBlock.getMinYDirAdj(),
wordBlock.getMaxYDirAdj(), wordBlock.getMaxYDirAdj(),
wordBlockList, wordBlockList,
wordBlock.getRotation()); wordBlock.getRotation());
} else { } else {
TextPageBlock spatialEntity = textBlock.union(wordBlock); TextPageBlock spatialEntity = textBlock.union(wordBlock);
textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight()); textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight());
@ -358,7 +368,12 @@ public class DocstrumBlockificationService {
textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest()); textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest());
} }
if (textBlock != null && textBlock.getSequences() != null && textBlock.getSequences().stream().map(t -> round(t.getMinYDirAdj(), 3)).collect(toSet()).size() == 1) { if (textBlock != null
&& textBlock.getSequences() != null
&& textBlock.getSequences()
.stream()
.map(t -> round(t.getMinYDirAdj(), 3))
.collect(toSet()).size() == 1) {
textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj)); textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj));
} }
return textBlock; return textBlock;
@ -373,38 +388,34 @@ public class DocstrumBlockificationService {
List<Ruling> horizontalRulingLines, List<Ruling> horizontalRulingLines,
List<Ruling> verticalRulingLines) { List<Ruling> verticalRulingLines) {
return isSplitByRuling(maxX, return isSplitByRuling(maxX, minY, word.getMinXDirAdj(), word.getMinYDirAdj(), verticalRulingLines, word.getDir().getDegrees(), word.getPageWidth(), word.getPageHeight())
minY, //
word.getMinXDirAdj(), || isSplitByRuling(minX,
word.getMinYDirAdj(), minY,
verticalRulingLines, word.getMinXDirAdj(),
word.getDir().getDegrees(), word.getMaxYDirAdj(),
word.getPageWidth(), horizontalRulingLines,
word.getPageHeight()) // word.getDir().getDegrees(),
|| isSplitByRuling(minX, word.getPageWidth(),
minY, word.getPageHeight())
word.getMinXDirAdj(), //
word.getMaxYDirAdj(), || isSplitByRuling(maxX,
horizontalRulingLines, minY,
word.getDir().getDegrees(), word.getMinXDirAdj(),
word.getPageWidth(), word.getMinYDirAdj(),
word.getPageHeight()) // horizontalRulingLines,
|| isSplitByRuling(maxX, word.getDir().getDegrees(),
minY, word.getPageWidth(),
word.getMinXDirAdj(), word.getPageHeight())
word.getMinYDirAdj(), //
horizontalRulingLines, || isSplitByRuling(minX,
word.getDir().getDegrees(), minY,
word.getPageWidth(), word.getMinXDirAdj(),
word.getPageHeight()) // word.getMaxYDirAdj(),
|| isSplitByRuling(minX, verticalRulingLines,
minY, word.getDir().getDegrees(),
word.getMinXDirAdj(), word.getPageWidth(),
word.getMaxYDirAdj(), word.getPageHeight());
verticalRulingLines,
word.getDir().getDegrees(),
word.getPageWidth(),
word.getPageHeight());
} }

View File

@ -34,7 +34,7 @@ public class DocuMineBlockificationService {
* This method must use text direction adjusted postions (DirAdj). Where {0,0} is on the upper left. Never try to change this! * This method must use text direction adjusted postions (DirAdj). Where {0,0} is on the upper left. Never try to change this!
* Rulings (Table lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling. * Rulings (Table lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling.
* *
* @param textPositions The words of a page. * @param textPositions The textPositions of a page.
* @param horizontalRulingLines Horizontal table lines. * @param horizontalRulingLines Horizontal table lines.
* @param verticalRulingLines Vertical table lines. * @param verticalRulingLines Vertical table lines.
* @return Page object that contains the Textblock and text statistics. * @return Page object that contains the Textblock and text statistics.

View File

@ -7,7 +7,6 @@ import static java.util.stream.Collectors.toList;
import java.awt.geom.Rectangle2D; import java.awt.geom.Rectangle2D;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.HashMap; import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList; import java.util.LinkedList;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
@ -15,7 +14,6 @@ import java.util.NoSuchElementException;
import java.util.Set; import java.util.Set;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
@ -52,6 +50,9 @@ public class DocumentGraphFactory {
public Document buildDocumentGraph(LayoutParsingType layoutParsingType, ClassificationDocument document) { public Document buildDocumentGraph(LayoutParsingType layoutParsingType, ClassificationDocument document) {
Document documentGraph = new Document(); Document documentGraph = new Document();
documentGraph.setVisualizations(document.getVisualizations());
Context context = new Context(documentGraph); Context context = new Context(documentGraph);
document.getPages() document.getPages()
@ -85,14 +86,11 @@ public class DocumentGraphFactory {
GenericSemanticNode node; GenericSemanticNode node;
if (originalTextBlock.isHeadline()) { if (originalTextBlock.isHeadline()) {
node = Headline.builder().documentTree(context.getDocumentTree()) node = Headline.builder().documentTree(context.getDocumentTree()).build();
.build();
} else if (originalTextBlock.isToDuplicate()) { } else if (originalTextBlock.isToDuplicate()) {
node = DuplicatedParagraph.builder().documentTree(context.getDocumentTree()) node = DuplicatedParagraph.builder().documentTree(context.getDocumentTree()).build();
.build();
} else { } else {
node = Paragraph.builder().documentTree(context.getDocumentTree()) node = Paragraph.builder().documentTree(context.getDocumentTree()).build();
.build();
} }
page.getMainBody().add(node); page.getMainBody().add(node);
@ -178,8 +176,7 @@ public class DocumentGraphFactory {
private void addFooter(List<TextPageBlock> textBlocks, Context context) { private void addFooter(List<TextPageBlock> textBlocks, Context context) {
Page page = context.getPage(textBlocks.get(0).getPage()); Page page = context.getPage(textBlocks.get(0).getPage());
Footer footer = Footer.builder().documentTree(context.getDocumentTree()) Footer footer = Footer.builder().documentTree(context.getDocumentTree()).build();
.build();
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeTextPositionSequence(textBlocks), AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeTextPositionSequence(textBlocks),
footer, footer,
context, context,
@ -194,8 +191,7 @@ public class DocumentGraphFactory {
public void addHeader(List<TextPageBlock> textBlocks, Context context) { public void addHeader(List<TextPageBlock> textBlocks, Context context) {
Page page = context.getPage(textBlocks.get(0).getPage()); Page page = context.getPage(textBlocks.get(0).getPage());
Header header = Header.builder().documentTree(context.getDocumentTree()) Header header = Header.builder().documentTree(context.getDocumentTree()).build();
.build();
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeTextPositionSequence(textBlocks), header, 0, page); AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeTextPositionSequence(textBlocks), header, 0, page);
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(header); List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(header);
header.setTreeId(tocId); header.setTreeId(tocId);
@ -207,8 +203,7 @@ public class DocumentGraphFactory {
private void addEmptyFooter(int pageIndex, Context context) { private void addEmptyFooter(int pageIndex, Context context) {
Page page = context.getPage(pageIndex); Page page = context.getPage(pageIndex);
Footer footer = Footer.builder().documentTree(context.getDocumentTree()) Footer footer = Footer.builder().documentTree(context.getDocumentTree()).build();
.build();
AtomicTextBlock textBlock = context.textBlockFactory.emptyTextBlock(footer, context, page); AtomicTextBlock textBlock = context.textBlockFactory.emptyTextBlock(footer, context, page);
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(footer); List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(footer);
footer.setTreeId(tocId); footer.setTreeId(tocId);
@ -220,8 +215,7 @@ public class DocumentGraphFactory {
private void addEmptyHeader(int pageIndex, Context context) { private void addEmptyHeader(int pageIndex, Context context) {
Page page = context.getPage(pageIndex); Page page = context.getPage(pageIndex);
Header header = Header.builder().documentTree(context.getDocumentTree()) Header header = Header.builder().documentTree(context.getDocumentTree()).build();
.build();
AtomicTextBlock textBlock = context.textBlockFactory.emptyTextBlock(header, 0, page); AtomicTextBlock textBlock = context.textBlockFactory.emptyTextBlock(header, 0, page);
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(header); List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(header);
header.setTreeId(tocId); header.setTreeId(tocId);

View File

@ -29,19 +29,22 @@ public class SearchTextWithTextPositionFactory {
public SearchTextWithTextPositionDto buildSearchTextToTextPositionDto(List<TextPositionSequence> sequences) { public SearchTextWithTextPositionDto buildSearchTextToTextPositionDto(List<TextPositionSequence> sequences) {
if (sequences.isEmpty() || sequences.stream().allMatch(sequence -> sequence.getTextPositions().isEmpty())) { if (sequences.isEmpty() || sequences.stream()
.allMatch(sequence -> sequence.getTextPositions().isEmpty())) {
return SearchTextWithTextPositionDto.empty(); return SearchTextWithTextPositionDto.empty();
} }
Context context = new Context(); Context context = new Context();
RedTextPosition currentTextPosition = sequences.get(0).getTextPositions().get(0); RedTextPosition currentTextPosition = sequences.get(0).getTextPositions()
RedTextPosition previousTextPosition = RedTextPosition.builder().unicode(" ").position(currentTextPosition.getPosition()).build(); .get(0);
RedTextPosition previousTextPosition = RedTextPosition.builder().unicode(" ").directionAdjustedPosition(currentTextPosition.getDirectionAdjustedPosition()).build();
for (TextPositionSequence word : sequences) { for (TextPositionSequence word : sequences) {
for (int i = 0; i < word.getTextPositions().size(); ++i) { for (int i = 0; i < word.getTextPositions().size(); ++i) {
currentTextPosition = word.getTextPositions().get(i); currentTextPosition = word.getTextPositions()
.get(i);
if (isLineBreak(currentTextPosition, previousTextPosition)) { if (isLineBreak(currentTextPosition, previousTextPosition)) {
removeHyphenLinebreaks(context); removeHyphenLinebreaks(context);
context.lineBreaksStringIdx.add(context.stringIdx); context.lineBreaksStringIdx.add(context.stringIdx);
@ -57,7 +60,7 @@ public class SearchTextWithTextPositionFactory {
++context.positionIdx; ++context.positionIdx;
} }
previousTextPosition = RedTextPosition.builder().unicode(" ").position(previousTextPosition.getPosition()).build(); previousTextPosition = RedTextPosition.builder().unicode(" ").directionAdjustedPosition(previousTextPosition.getDirectionAdjustedPosition()).build();
context.stringBuilder.append(" "); context.stringBuilder.append(" ");
context.stringIdxToPositionIdx.add(context.positionIdx); context.stringIdxToPositionIdx.add(context.positionIdx);
++context.stringIdx; ++context.stringIdx;
@ -66,7 +69,7 @@ public class SearchTextWithTextPositionFactory {
assert context.stringBuilder.length() == context.stringIdxToPositionIdx.size(); assert context.stringBuilder.length() == context.stringIdxToPositionIdx.size();
List<Rectangle2D> positions = sequences.stream() List<Rectangle2D> positions = sequences.stream()
.flatMap(sequence -> sequence.getTextPositions().stream().map(textPosition -> mapRedTextPositionToInitialUserSpace(textPosition, sequence))) .map(TextPositionSequence::getBoundingBox)
.toList(); .toList();
return SearchTextWithTextPositionDto.builder() return SearchTextWithTextPositionDto.builder()
@ -153,7 +156,7 @@ public class SearchTextWithTextPositionFactory {
return false; return false;
} }
float deltaY = Math.abs(currentPosition.getYDirAdj() - previousPosition.getYDirAdj()); double deltaY = Math.abs(currentPosition.getYDirAdj() - previousPosition.getYDirAdj());
return deltaY >= currentPosition.getHeightDir(); return deltaY >= currentPosition.getHeightDir();
} }
@ -167,16 +170,16 @@ public class SearchTextWithTextPositionFactory {
private boolean isHyphen(String unicodeCharacter) { private boolean isHyphen(String unicodeCharacter) {
return Objects.equals(unicodeCharacter, "-") || // return Objects.equals(unicodeCharacter, "-") || //
Objects.equals(unicodeCharacter, "~") || // Objects.equals(unicodeCharacter, "~") || //
Objects.equals(unicodeCharacter, "") || // Objects.equals(unicodeCharacter, "") || //
Objects.equals(unicodeCharacter, "") || // Objects.equals(unicodeCharacter, "") || //
Objects.equals(unicodeCharacter, "") || // Objects.equals(unicodeCharacter, "") || //
Objects.equals(unicodeCharacter, "") || // Objects.equals(unicodeCharacter, "") || //
Objects.equals(unicodeCharacter, "") || // Objects.equals(unicodeCharacter, "") || //
Objects.equals(unicodeCharacter, "") || // Objects.equals(unicodeCharacter, "") || //
Objects.equals(unicodeCharacter, "") || // Objects.equals(unicodeCharacter, "") || //
Objects.equals(unicodeCharacter, "") || // Objects.equals(unicodeCharacter, "") || //
Objects.equals(unicodeCharacter, "\u00AD"); Objects.equals(unicodeCharacter, "\u00AD");
} }

View File

@ -82,7 +82,6 @@ public class LegacyPDFStreamEngine extends PDFStreamEngine {
private int pageRotation; private int pageRotation;
private PDRectangle pageSize; private PDRectangle pageSize;
private Matrix translateMatrix;
private final GlyphList glyphList; private final GlyphList glyphList;
private final Map<COSDictionary, Float> fontHeightMap = new WeakHashMap<COSDictionary, Float>(); private final Map<COSDictionary, Float> fontHeightMap = new WeakHashMap<COSDictionary, Float>();
@ -134,12 +133,6 @@ public class LegacyPDFStreamEngine extends PDFStreamEngine {
this.pageRotation = page.getRotation(); this.pageRotation = page.getRotation();
this.pageSize = page.getCropBox(); this.pageSize = page.getCropBox();
if (pageSize.getLowerLeftX() == 0 && pageSize.getLowerLeftY() == 0) {
translateMatrix = null;
} else {
// translation matrix for cropbox
translateMatrix = Matrix.getTranslateInstance(-pageSize.getLowerLeftX(), -pageSize.getLowerLeftY());
}
super.processPage(page); super.processPage(page);
} }
@ -265,62 +258,52 @@ public class LegacyPDFStreamEngine extends PDFStreamEngine {
} }
} }
// adjust for cropbox if needed
Matrix translatedTextRenderingMatrix;
if (translateMatrix == null) {
translatedTextRenderingMatrix = textRenderingMatrix;
} else {
translatedTextRenderingMatrix = Matrix.concatenate(translateMatrix, textRenderingMatrix);
nextX -= pageSize.getLowerLeftX();
nextY -= pageSize.getLowerLeftY();
}
// This is a hack for unicode letter with 2 chars e.g. RA see unicodeProblem.pdf // This is a hack for unicode letter with 2 chars e.g. RA see unicodeProblem.pdf
if (unicodeMapping.length() == 2) { if (unicodeMapping.length() == 2) {
processTextPosition(new TextPosition(pageRotation, processTextPosition(new TextPosition(pageRotation,
pageSize.getWidth(), pageSize.getWidth(),
pageSize.getHeight(), pageSize.getHeight(),
translatedTextRenderingMatrix, textRenderingMatrix,
nextX, nextX,
nextY, nextY,
Math.abs(dyDisplay), Math.abs(dyDisplay),
dxDisplay, dxDisplay,
Math.abs(spaceWidthDisplay), Math.abs(spaceWidthDisplay),
Character.toString(unicodeMapping.charAt(0)), Character.toString(unicodeMapping.charAt(0)),
new int[]{code}, new int[]{code},
font, font,
fontSize, fontSize,
(int) (fontSize * textMatrix.getScalingFactorX()))); (int) (fontSize * textMatrix.getScalingFactorX())));
processTextPosition(new TextPosition(pageRotation, processTextPosition(new TextPosition(pageRotation,
pageSize.getWidth(), pageSize.getWidth(),
pageSize.getHeight(), pageSize.getHeight(),
translatedTextRenderingMatrix, textRenderingMatrix,
nextX, nextX,
nextY, nextY,
Math.abs(dyDisplay), Math.abs(dyDisplay),
dxDisplay, dxDisplay,
Math.abs(spaceWidthDisplay), Math.abs(spaceWidthDisplay),
Character.toString(unicodeMapping.charAt(1)), Character.toString(unicodeMapping.charAt(1)),
new int[]{code}, new int[]{code},
font, font,
fontSize, fontSize,
(int) (fontSize * textMatrix.getScalingFactorX()))); (int) (fontSize * textMatrix.getScalingFactorX())));
} else { } else {
processTextPosition(new TextPosition(pageRotation, processTextPosition(new TextPosition(pageRotation,
pageSize.getWidth(), pageSize.getWidth(),
pageSize.getHeight(), pageSize.getHeight(),
translatedTextRenderingMatrix, textRenderingMatrix,
nextX, nextX,
nextY, nextY,
Math.abs(dyDisplay), Math.abs(dyDisplay),
dxDisplay, dxDisplay,
Math.abs(spaceWidthDisplay), Math.abs(spaceWidthDisplay),
unicodeMapping, unicodeMapping,
new int[]{code}, new int[]{code},
font, font,
fontSize, fontSize,
(int) (fontSize * textMatrix.getScalingFactorX()))); (int) (fontSize * textMatrix.getScalingFactorX())));
} }
} }

View File

@ -1007,7 +1007,7 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
/** /**
* Set the desired word separator for output text. The PDFBox text extraction algorithm will output a space * Set the desired word separator for output text. The PDFBox text extraction algorithm will output a space
* character if there is enough space between two words. By default a space character is used. If you need and * character if there is enough space between two textPositions. By default a space character is used. If you need and
* accurate count of characters that are found in a PDF document then you might want to set the word separator to * accurate count of characters that are found in a PDF document then you might want to set the word separator to
* the empty string. * the empty string.
* *
@ -1703,7 +1703,7 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
/** /**
* Write a list of string containing a whole line of a document. * Write a list of string containing a whole line of a document.
* *
* @param line a list with the words of the given line * @param line a list with the textPositions of the given line
* @throws IOException if something went wrong * @throws IOException if something went wrong
*/ */
private void writeLine(List<WordWithTextPositions> line, boolean isParagraphEnd) throws IOException { private void writeLine(List<WordWithTextPositions> line, boolean isParagraphEnd) throws IOException {
@ -1744,9 +1744,9 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
/** /**
* Handles the LTR and RTL direction of the given words. The whole implementation stands and falls with the given * Handles the LTR and RTL direction of the given textPositions. The whole implementation stands and falls with the given
* word. If the word is a full line, the results will be the best. If the word contains of single words or * word. If the word is a full line, the results will be the best. If the word contains of single textPositions or
* characters, the order of the characters in a word or words in a line may wrong, due to RTL and LTR marks and * characters, the order of the characters in a word or textPositions in a line may wrong, due to RTL and LTR marks and
* characters! * characters!
* <p> * <p>
* Based on http://www.nesterovsky-bros.com/weblog/2013/07/28/VisualToLogicalConversionInJava.aspx * Based on http://www.nesterovsky-bros.com/weblog/2013/07/28/VisualToLogicalConversionInJava.aspx

View File

@ -70,7 +70,9 @@ public class LayoutGridService {
Visualizations layoutGrid = this.addLayoutGrid(document, layerVisibilityDefaultValue, false); Visualizations layoutGrid = this.addLayoutGrid(document, layerVisibilityDefaultValue, false);
Visualizations visualLayoutGrid = this.addLayoutGrid(document, layerVisibilityDefaultValue, true); Visualizations visualLayoutGrid = this.addLayoutGrid(document, layerVisibilityDefaultValue, true);
viewerDocumentService.addVisualizationsOnPage(originFile, destinationFile, List.of(layoutGrid, visualLayoutGrid)); List<Visualizations> allVisualizations = Stream.concat(Stream.of(layoutGrid, visualLayoutGrid), document.getVisualizations().streamAll()).toList();
viewerDocumentService.addVisualizationsOnPage(originFile, destinationFile, allVisualizations);
} }

View File

@ -1,12 +1,5 @@
package com.knecon.fforesight.service.layoutparser.processor.utils; package com.knecon.fforesight.service.layoutparser.processor.utils;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import lombok.experimental.UtilityClass;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
import org.apache.pdfbox.text.TextPosition;
import java.awt.geom.Rectangle2D; import java.awt.geom.Rectangle2D;
import java.util.Collection; import java.util.Collection;
import java.util.Collections; import java.util.Collections;
@ -14,13 +7,24 @@ import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
import org.apache.pdfbox.text.TextPosition;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import lombok.experimental.UtilityClass;
@UtilityClass @UtilityClass
public class MarkedContentUtils { public class MarkedContentUtils {
public static final String HEADER = "Header"; public static final String HEADER = "Header";
public static final String FOOTER = "Footer"; public static final String FOOTER = "Footer";
public List<Rectangle2D> getMarkedContentBboxPerLine(List<PDMarkedContent> markedContents, String subtype) {
public List<Rectangle2D> getMarkedContentBboxPerLine(List<PDMarkedContent> markedContents, String subtype, PDPage pdPage) {
if (markedContents == null) { if (markedContents == null) {
return Collections.emptyList(); return Collections.emptyList();
@ -31,7 +35,8 @@ public class MarkedContentUtils {
.filter(m -> m.getProperties() != null) .filter(m -> m.getProperties() != null)
.filter(m -> m.getProperties().getItem("Subtype") != null) .filter(m -> m.getProperties().getItem("Subtype") != null)
.filter(m -> ((COSName) m.getProperties().getItem("Subtype")).getName().equals(subtype)) .filter(m -> ((COSName) m.getProperties().getItem("Subtype")).getName().equals(subtype))
.map(PDMarkedContent::getContents).flatMap(Collection::stream) .map(PDMarkedContent::getContents)
.flatMap(Collection::stream)
.filter(t -> t instanceof TextPosition) .filter(t -> t instanceof TextPosition)
.map(t -> (TextPosition) t) .map(t -> (TextPosition) t)
.filter(t -> !t.getUnicode().equals(" ")) .filter(t -> !t.getUnicode().equals(" "))
@ -41,16 +46,77 @@ public class MarkedContentUtils {
return Collections.emptyList(); return Collections.emptyList();
} }
return markedContentByYPosition.values().stream() return markedContentByYPosition.values()
.map(textPositions -> new TextPositionSequence(textPositions.stream() .stream()
.toList(), 0, true) .map(textPositions -> new TextPositionSequence(textPositions, 0, true).getRectangle())
.getRectangle()) .map(t -> new Rectangle2D.Float(t.getTopLeft().getX(), t.getTopLeft().getY() - Math.abs(t.getHeight()), t.getWidth(), Math.abs(t.getHeight())))
.map(t -> new Rectangle2D.Float(t.getTopLeft().getX(), t.getTopLeft().getY() - Math.abs(t.getHeight()), t.getWidth(), Math.abs(t.getHeight()))).collect(Collectors.toList()); .collect(Collectors.toList());
}
public List<MarkedContentPosition> getMarkedContentPositions(List<PDMarkedContent> markedContents, PDPage pdPage) {
if (markedContents == null) {
return Collections.emptyList();
}
return markedContents.stream()
.filter(m -> !m.getContents().isEmpty())
.map(markedContent -> MarkedContentPosition.fromPDMarkedContent(markedContent, pdPage))
.toList();
} }
public boolean intersects(TextPageBlock textBlock, Map<String, List<Rectangle2D>> markedContentBboxPerType, String type) { public boolean intersects(TextPageBlock textBlock, Map<String, List<Rectangle2D>> markedContentBboxPerType, String type) {
return markedContentBboxPerType.get(type) != null && markedContentBboxPerType.get(type).stream().anyMatch(rectangle -> rectangle.intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight()));
return markedContentBboxPerType.get(type) != null && markedContentBboxPerType.get(type)
.stream()
.anyMatch(rectangle -> rectangle.intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight()));
}
public record MarkedContentPosition(String type, String subType, List<Rectangle2D> textPositions) {
public static MarkedContentPosition fromPDMarkedContent(PDMarkedContent markedContent, PDPage pdPage) {
return new MarkedContentPosition(markedContent.getTag(), parseSubType(markedContent), parseTextPositions(markedContent.getContents(), pdPage));
}
private static List<Rectangle2D> parseTextPositions(List<Object> contents, PDPage pdPage) {
return contents.stream()
.filter(content -> content instanceof TextPosition)
.map(content -> (TextPosition) content)
.filter(content -> !content.getUnicode().equals(" "))
.map(textPositions -> new TextPositionSequence(List.of(textPositions), 0, true).getRectangle())
.map(t -> new Rectangle2D.Float(t.getTopLeft().getX(), t.getTopLeft().getY() - Math.abs(t.getHeight()), t.getWidth(), Math.abs(t.getHeight())))
.collect(Collectors.toList());
}
private static String parseSubType(PDMarkedContent markedContent) {
if (markedContent == null || markedContent.getProperties() == null || markedContent.getProperties().getItem("Subtype") == null) {
return null;
}
return ((COSName) markedContent.getProperties().getItem("Subtype")).getName();
}
public String formattedType() {
if (subType == null || subType.isEmpty()) {
return type;
}
if (type.equals("Artifact")) {
return subType;
}
return String.format("%s-%s", type, subType);
}
} }
} }

View File

@ -52,7 +52,10 @@ public class RectangleTransformations {
public static Rectangle2D bBoxUnionAtomicTextBlock(List<AtomicTextBlock> atomicTextBlocks) { public static Rectangle2D bBoxUnionAtomicTextBlock(List<AtomicTextBlock> atomicTextBlocks) {
return atomicTextBlocks.stream().flatMap(atomicTextBlock -> atomicTextBlock.getPositions().stream()).collect(new Rectangle2DBBoxCollector()); return atomicTextBlocks.stream()
.flatMap(atomicTextBlock -> atomicTextBlock.getPositions()
.stream())
.collect(new Rectangle2DBBoxCollector());
} }
@ -77,7 +80,10 @@ public class RectangleTransformations {
public static Rectangle2D atomicTextBlockBBox(List<AtomicTextBlock> atomicTextBlocks) { public static Rectangle2D atomicTextBlockBBox(List<AtomicTextBlock> atomicTextBlocks) {
return atomicTextBlocks.stream().flatMap(atomicTextBlock -> atomicTextBlock.getPositions().stream()).collect(new Rectangle2DBBoxCollector()); return atomicTextBlocks.stream()
.flatMap(atomicTextBlock -> atomicTextBlock.getPositions()
.stream())
.collect(new Rectangle2DBBoxCollector());
} }
@ -89,16 +95,18 @@ public class RectangleTransformations {
public static Rectangle2D rectangleBBox(List<Rectangle> rectangles) { public static Rectangle2D rectangleBBox(List<Rectangle> rectangles) {
return rectangles.stream().map(RectangleTransformations::toRectangle2D).collect(new Rectangle2DBBoxCollector()); return rectangles.stream()
.map(RectangleTransformations::toRectangle2D)
.collect(new Rectangle2DBBoxCollector());
} }
public static Rectangle2D toRectangle2D(Rectangle redactionLogRectangle) { public static Rectangle2D toRectangle2D(Rectangle redactionLogRectangle) {
return new Rectangle2D.Double(redactionLogRectangle.getTopLeft().getX(), return new Rectangle2D.Double(redactionLogRectangle.getTopLeft().getX(),
redactionLogRectangle.getTopLeft().getY() + redactionLogRectangle.getHeight(), redactionLogRectangle.getTopLeft().getY() + redactionLogRectangle.getHeight(),
redactionLogRectangle.getWidth(), redactionLogRectangle.getWidth(),
-redactionLogRectangle.getHeight()); -redactionLogRectangle.getHeight());
} }
@ -111,15 +119,16 @@ public class RectangleTransformations {
public static Rectangle toRedactionLogRectangle(Rectangle2D rectangle2D, int pageNumber) { public static Rectangle toRedactionLogRectangle(Rectangle2D rectangle2D, int pageNumber) {
return new Rectangle(new Point((float) rectangle2D.getMinX(), (float) (rectangle2D.getMinY() + rectangle2D.getHeight())), return new Rectangle(new Point((float) rectangle2D.getMinX(), (float) (rectangle2D.getMinY() + rectangle2D.getHeight())),
(float) rectangle2D.getWidth(), (float) rectangle2D.getWidth(),
-(float) rectangle2D.getHeight(), -(float) rectangle2D.getHeight(),
pageNumber); pageNumber);
} }
public static Rectangle2D rectangle2DBBox(List<Rectangle2D> rectangle2DList) { public static Rectangle2D rectangle2DBBox(List<Rectangle2D> rectangle2DList) {
return rectangle2DList.stream().collect(new Rectangle2DBBoxCollector()); return rectangle2DList.stream()
.collect(new Rectangle2DBBoxCollector());
} }
@ -134,7 +143,9 @@ public class RectangleTransformations {
if (rectangle2DList.isEmpty()) { if (rectangle2DList.isEmpty()) {
return Collections.emptyList(); return Collections.emptyList();
} }
double splitThreshold = rectangle2DList.stream().mapToDouble(RectangularShape::getWidth).average().orElse(5) * 5.0; double splitThreshold = rectangle2DList.stream()
.mapToDouble(RectangularShape::getWidth).average()
.orElse(5) * 5.0;
List<List<Rectangle2D>> rectangleListsWithGaps = new LinkedList<>(); List<List<Rectangle2D>> rectangleListsWithGaps = new LinkedList<>();
List<Rectangle2D> rectangleListWithoutGaps = new LinkedList<>(); List<Rectangle2D> rectangleListWithoutGaps = new LinkedList<>();
@ -195,9 +206,9 @@ public class RectangleTransformations {
public BinaryOperator<BBox> combiner() { public BinaryOperator<BBox> combiner() {
return (b1, b2) -> new BBox(Math.min(b1.lowerLeftX, b2.lowerLeftX), return (b1, b2) -> new BBox(Math.min(b1.lowerLeftX, b2.lowerLeftX),
Math.min(b1.lowerLeftY, b2.lowerLeftY), Math.min(b1.lowerLeftY, b2.lowerLeftY),
Math.max(b1.upperRightX, b2.upperRightX), Math.max(b1.upperRightX, b2.upperRightX),
Math.max(b1.upperRightY, b2.upperRightY)); Math.max(b1.upperRightY, b2.upperRightY));
} }

View File

@ -14,23 +14,24 @@ public class RectangularIntersectionFinder {
public static List<Rectangle2D> find(List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) { public static List<Rectangle2D> find(List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
// Fix for 211.pdf // // Fix for 211.pdf
for (Ruling r : horizontalRulingLines) { // for (Ruling r : horizontalRulingLines) {
if (r.getX2() < r.getX1()) { // if (r.getX2() < r.getX1()) {
double a = r.getX2(); // double a = r.getX2();
r.x2 = (float) r.getX1(); // r.x2 = (float) r.getX1();
r.x1 = (float) a; // r.x1 = (float) a;
} // }
} // }
List<Rectangle2D> foundRectangles = new ArrayList<>(); List<Rectangle2D> foundRectangles = new ArrayList<>();
Map<Point2D, Ruling[]> intersectionPoints = Ruling.findIntersections(horizontalRulingLines, verticalRulingLines); Map<Point2D, RulingIntersectionFinder.IntersectingRulings> intersectionPoints = RulingIntersectionFinder.findNaive(horizontalRulingLines, verticalRulingLines);
List<Point2D> intersectionPointsList = new ArrayList<>(intersectionPoints.keySet()); List<Point2D> intersectionPointsList = new ArrayList<>(intersectionPoints.keySet());
intersectionPointsList.sort(Y_FIRST_POINT_COMPARATOR); intersectionPointsList.sort(Y_FIRST_POINT_COMPARATOR);
for (int i = 0; i < intersectionPointsList.size(); i++) { for (int i = 0; i < intersectionPointsList.size(); i++) {
Point2D topLeft = intersectionPointsList.get(i); Point2D topLeft = intersectionPointsList.get(i);
Ruling[] hv = intersectionPoints.get(topLeft); RulingIntersectionFinder.IntersectingRulings intersectingRulingsFromTopLeft = intersectionPoints.get(topLeft);
// CrossingPointsDirectlyBelow( topLeft ); // CrossingPointsDirectlyBelow( topLeft );
List<Point2D> xPoints = new ArrayList<>(); List<Point2D> xPoints = new ArrayList<>();
@ -48,18 +49,19 @@ public class RectangularIntersectionFinder {
outer: outer:
for (Point2D xPoint : xPoints) { for (Point2D xPoint : xPoints) {
// is there a vertical edge b/w topLeft and xPoint? // is there a vertical edge b/w topLeft and xPoint?
if (!hv[1].equals(intersectionPoints.get(xPoint)[1])) { if (!intersectingRulingsFromTopLeft.vertical().equals(intersectionPoints.get(xPoint).vertical())) {
continue; continue;
} }
for (Point2D yPoint : yPoints) { for (Point2D yPoint : yPoints) {
// is there a horizontal edge b/w topLeft and yPoint ? // is there a horizontal edge b/w topLeft and yPoint ?
if (!hv[0].equals(intersectionPoints.get(yPoint)[0])) { if (!intersectingRulingsFromTopLeft.horizontal().equals(intersectionPoints.get(yPoint).horizontal())) {
continue; continue;
} }
Point2D btmRight = new Point2D.Float((float) yPoint.getX(), (float) xPoint.getY()); Point2D btmRight = new Point2D.Float((float) yPoint.getX(), (float) xPoint.getY());
if (intersectionPoints.containsKey(btmRight) if (intersectionPoints.containsKey(btmRight)
&& intersectionPoints.get(btmRight)[0].equals(intersectionPoints.get(xPoint)[0]) && intersectionPoints.get(btmRight).horizontal().equals(intersectionPoints.get(xPoint).horizontal())
&& intersectionPoints.get(btmRight)[1].equals(intersectionPoints.get(yPoint)[1])) { && intersectionPoints.get(btmRight).vertical().equals(intersectionPoints.get(yPoint).vertical())) {
foundRectangles.add(new Rectangle2D.Double(topLeft.getX(), topLeft.getY(), btmRight.getX() - topLeft.getX(), btmRight.getY() - topLeft.getY())); foundRectangles.add(new Rectangle2D.Double(topLeft.getX(), topLeft.getY(), btmRight.getX() - topLeft.getX(), btmRight.getY() - topLeft.getY()));
break outer; break outer;
} }

View File

@ -0,0 +1,201 @@
package com.knecon.fforesight.service.layoutparser.processor.utils;
import java.awt.geom.Point2D;
import java.util.Collections;
import java.util.Comparator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.TreeMap;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import lombok.experimental.UtilityClass;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@UtilityClass
public class RulingIntersectionFinder {
public static final int PERPENDICULAR_UNIT_EXPAND_AMOUNT = 2;
public static final Comparator<Point2D> Y_THEN_X_POINT_COMPARATOR = Comparator.comparingDouble(Point2D::getY).thenComparing(Point2D::getX);
/**
* Implementation to find line intersection in O(P + n log n), where n is the number of lines and P the numer of intersections
* based on <a href="http://people.csail.mit.edu/indyk/6.838-old/handouts/lec2.pdf">Segment Intersection by Piotr Indyk</a>
* The algorithm assumes there are only horizontal and vertical lines which are unique in their coordinates. (E.g. no overlapping horizontal lines exist)
* As a high level overview, the algorithm uses a sweep line advancing from left to right.
* It dynamically updates the horizontal rulings which are intersected by the current sweep line.
* When the sweep line hits a vertical line, it then checks for all intersections with the currently intersected horizontal rulings.
* THe trick of the algorithm is using a binary search tree to store the currently intersected horizontal rulings. This way the lookup should be in O(log n).
* This way the initial sorting step has the highest complexity class (O(n log n) and thus determines the complexity class of the entire algorithm
*
* Unfortunately, the implementation here takes a few liberties compared to the original algorithm. The binary search tree is replaced by an ordered Set which is simply looped over.
* Therefore, this implementation's worst case, where all horizontal lines span the entire sweep, you are essentially performing the naive approach with a bunch of overhead.
* Since we are using this implementation to find table cells, one can expect this worst case to always be the case.
*
* A simple runtime comparison for a single page with the most lines we can expect (SinglePages/AbsolutelyEnormousTable.pdf with 30 horizontals and 144 verticals) shows this implementation takes roughly 14 ms, whereas the naive approach takes 7 ms. Both are negligible, but the naive approach is two times as fast.
*
* If we would like to make this faster, we would need a better data structure for 'TreeMap<Ruling, Void> horizontalRulingsInCurrentSweep', where we can query the TreeMap for all horizontal rulings in a given interval in O(log n).
*
* @param horizontals a list of non-overlapping horizontal rulings
* @param verticals a list of non-overlapping vertical rulings
* @return a Map of each found intersection point pointing to the two lines forming the intersection.
*/
public Map<Point2D, IntersectingRulings> find(List<Ruling> horizontals, List<Ruling> verticals) {
long start = System.currentTimeMillis();
List<SweepStep> sweepTrajectory = buildSweepTrajectory(horizontals, verticals);
TreeMap<Ruling, Void> horizontalRulingsInCurrentSweep = new TreeMap<>(Comparator.comparingDouble(Ruling::getTop));
TreeMap<Point2D, IntersectingRulings> intersections = new TreeMap<>(Y_THEN_X_POINT_COMPARATOR);
for (SweepStep step : sweepTrajectory) {
switch (step.type) {
case VERTICAL: // check for intersections with currently intersected horizontal lines
for (Ruling horizontalRuling : horizontalRulingsInCurrentSweep.navigableKeySet()) {
Optional<Point2D> intersectionPoint = findIntersectionPoint(horizontalRuling, step.ruling);
if (intersectionPoint.isEmpty()) {
continue;
}
intersections.put(intersectionPoint.get(), new IntersectingRulings(horizontalRuling, step.ruling));
}
break;
case HORIZONTAL_ENTRY: // sweep line now intersects this horizontal ruling
horizontalRulingsInCurrentSweep.put(step.ruling, null);
break;
case HORIZONTAL_EXIT: // sweep line no longer intersects this horizontal ruling
horizontalRulingsInCurrentSweep.remove(step.ruling);
break;
}
}
log.debug("Finished building intersections with line sweep in {} ms", System.currentTimeMillis() - start);
return intersections;
}
/**
* Naive Approach in O(n^2) of finding intersections between lines by iterating over all lines.
*
* @param horizontals a list of non-overlapping horizontal rulings
* @param verticals a list of non-overlapping vertical rulings
* @return a Map of each found intersection point pointing to the two lines forming the intersection.
*/
public Map<Point2D, IntersectingRulings> findNaive(List<Ruling> horizontals, List<Ruling> verticals) {
long start = System.currentTimeMillis();
TreeMap<Point2D, IntersectingRulings> intersections = new TreeMap<>(Y_THEN_X_POINT_COMPARATOR);
for (Ruling horizontal : horizontals) {
for (Ruling vertical : verticals) {
Optional<Point2D> intersectionPoint = findIntersectionPoint(horizontal, vertical);
if (intersectionPoint.isEmpty()) {
continue;
}
intersections.put(intersectionPoint.get(), new IntersectingRulings(horizontal, vertical));
}
}
log.debug("Finished building intersections naively in {} ms", System.currentTimeMillis() - start);
return intersections;
}
private static List<SweepStep> buildSweepTrajectory(List<Ruling> horizontals, List<Ruling> verticals) {
List<SweepStep> sweepTrajectory = new LinkedList<>();
for (Ruling horizontalRuling : horizontals) {
sweepTrajectory.add(new SweepStep(SweepStep.Type.HORIZONTAL_ENTRY, horizontalRuling.getLeft() - PERPENDICULAR_UNIT_EXPAND_AMOUNT, horizontalRuling));
sweepTrajectory.add(new SweepStep(SweepStep.Type.HORIZONTAL_EXIT, horizontalRuling.getRight() + PERPENDICULAR_UNIT_EXPAND_AMOUNT, horizontalRuling));
}
for (Ruling verticalRuling : verticals) {
sweepTrajectory.add(new SweepStep(SweepStep.Type.VERTICAL, verticalRuling.getLeft(), verticalRuling));
}
Collections.sort(sweepTrajectory);
return sweepTrajectory;
}
/**
 * Computes the intersection point of one horizontal and one vertical ruling.
 * Both rulings are expanded by {@code PERPENDICULAR_UNIT_EXPAND_AMOUNT} before the intersection test,
 * so rulings that only almost touch are still reported.
 *
 * @param horizontal the ruling expected to be horizontal
 * @param vertical   the ruling expected to be vertical
 * @return the point (left of the vertical, top of the horizontal), or empty if the rulings are not a
 * horizontal/vertical pair or their expanded versions do not intersect
 */
public Optional<Point2D> findIntersectionPoint(Ruling horizontal, Ruling vertical) {

    boolean orthogonalPair = horizontal.isHorizontal() && vertical.isVertical();
    if (!orthogonalPair) {
        log.warn("lines must be orthogonal, vertical and horizontal");
        return Optional.empty();
    }

    Ruling widenedHorizontal = horizontal.expand(PERPENDICULAR_UNIT_EXPAND_AMOUNT);
    Ruling widenedVertical = vertical.expand(PERPENDICULAR_UNIT_EXPAND_AMOUNT);

    if (widenedHorizontal.intersectsLine(widenedVertical)) {
        // the reported point uses the coordinates of the original, non-expanded rulings
        return Optional.of(new Point2D.Float(vertical.getLeft(), horizontal.getTop()));
    }
    return Optional.empty();
}
/**
 * One event of the line sweep: a horizontal ruling entering or leaving the sweep, or a vertical ruling being hit.
 * Steps are ordered by position; at (approximately) equal positions an entry sorts before a vertical step and a
 * vertical step sorts before an exit.
 */
private class SweepStep implements Comparable<SweepStep> {

    private enum Type {
        VERTICAL,
        HORIZONTAL_EXIT,
        HORIZONTAL_ENTRY
    }

    protected Type type;
    // NOTE(review): despite the name, buildSweepTrajectory fills this from getLeft()/getRight() of the rulings.
    protected float y_position;
    protected Ruling ruling;


    public SweepStep(Type type, float y_position, Ruling ruling) {

        this.type = type;
        this.y_position = y_position;
        this.ruling = ruling;
    }


    @Override
    public int compareTo(SweepStep other) {

        // clearly different positions: plain numeric order
        if (!DoubleComparisons.feq(y_position, other.y_position)) {
            return Double.compare(y_position, other.y_position);
        }

        // same position (within tolerance): entry < vertical < exit
        if (type == Type.VERTICAL) {
            if (other.type == Type.HORIZONTAL_ENTRY) {
                return 1;
            }
            if (other.type == Type.HORIZONTAL_EXIT) {
                return -1;
            }
        } else if (other.type == Type.VERTICAL) {
            if (type == Type.HORIZONTAL_ENTRY) {
                return -1;
            }
            if (type == Type.HORIZONTAL_EXIT) {
                return 1;
            }
        }

        // no tie-break rule for this combination, fall back to the raw positions
        return Double.compare(y_position, other.y_position);
    }

}
/**
 * The pair of rulings that form one intersection point; used as the map value for each found intersection.
 *
 * @param horizontal the horizontal ruling of the pair
 * @param vertical   the vertical ruling of the pair
 */
public record IntersectingRulings(Ruling horizontal, Ruling vertical) {
}
}

View File

@ -0,0 +1,252 @@
package com.knecon.fforesight.service.layoutparser.processor.visualization;
import java.awt.Color;
import java.awt.geom.Line2D;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.util.Collection;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Stream;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Line;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import com.knecon.fforesight.service.viewerdoc.ContentStreams;
import com.knecon.fforesight.service.viewerdoc.model.ColoredLine;
import com.knecon.fforesight.service.viewerdoc.model.ColoredRectangle;
import com.knecon.fforesight.service.viewerdoc.model.PlacedText;
import com.knecon.fforesight.service.viewerdoc.model.Standard14EmbeddableFont;
import com.knecon.fforesight.service.viewerdoc.model.Visualizations;
import com.knecon.fforesight.service.viewerdoc.model.VisualizationsOnPage;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.NoArgsConstructor;
import lombok.Setter;
import lombok.experimental.FieldDefaults;
@Getter
@NoArgsConstructor
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class LayoutparsingVisualizations {
static final Standard14EmbeddableFont FONT = Standard14EmbeddableFont.helvetica();
static final Color WORDS_COLOR = new Color(68, 84, 147);
static final Color LINES_COLOR = new Color(152, 45, 179);
static final Color ZONES_COLOR = new Color(131, 38, 38);
static final Color RULINGS_COLOR = new Color(21, 221, 174);
static final Color CELLS_COLOR = new Color(31, 214, 27);
static final Color MAIN_BODY_COLOR = new Color(171, 131, 6);
static final Color MARKED_CONTENT_COLOR = new Color(171, 131, 6);
static final List<Color> ROTATING_CHARACTER_COLOR = List.of(new Color(255, 87, 51),
new Color(255, 195, 0),
new Color(76, 175, 80),
new Color(33, 150, 243),
new Color(155, 89, 182),
new Color(233, 30, 99),
new Color(0, 188, 212),
new Color(121, 85, 72));
@Setter
boolean active = false;
final Visualizations words = Visualizations.builder().layer(ContentStreams.WORDS).build();
final Visualizations lines = Visualizations.builder().layer(ContentStreams.LINES).build();
final Visualizations zones = Visualizations.builder().layer(ContentStreams.ZONES).build();
final Visualizations mainBody = Visualizations.builder().layer(ContentStreams.MAIN_BODY).build();
final Visualizations rulings = Visualizations.builder().layer(ContentStreams.RULINGS).build();
final Visualizations cells = Visualizations.builder().layer(ContentStreams.CELLS).build();
final Visualizations markedContent = Visualizations.builder().layer(ContentStreams.MARKED_CONTENT).build();
final Visualizations neighbours = Visualizations.builder().layer(ContentStreams.NEIGHBOURS).build();
final Visualizations characters = Visualizations.builder().layer(ContentStreams.CHARACTERS).build();
public Stream<Visualizations> streamAll() {
if (!active) {
return Stream.empty();
}
return Stream.of(characters, //
neighbours,//
words, //
lines, //
zones, //
rulings, //
cells, //
mainBody, //
markedContent //
);
}
public void addTextVisualizations(List<TextPositionSequence> textPositionSequences, int pageNumber) {
if (!active) {
return;
}
List<ColoredRectangle> list = textPositionSequences.stream()
.map(textPositionSequence -> textPositionSequence.getTextPositions()
.stream()
.map(RedTextPosition::getInitialUserSpacePosition)
.collect(RectangleTransformations.collectBBox()))
.map(rect -> new ColoredRectangle(rect, WORDS_COLOR, 1))
.toList();
this.words.getVisualizationsOnPages().put(pageNumber - 1, VisualizationsOnPage.builder().coloredRectangles(list).build());
}
public void addCleanRulingVisualization(CleanRulings cleanRulings, int pageNumber) {
if (!active) {
return;
}
this.rulings.getVisualizationsOnPages()
.put(pageNumber - 1,
VisualizationsOnPage.builder()
.coloredLines(Stream.of(cleanRulings.getHorizontal(), cleanRulings.getVertical())
.flatMap(Collection::stream)
.map(ruling -> new ColoredLine(ruling, RULINGS_COLOR, 1))
.toList())
.build());
}
public void addCellVisualizations(List<? extends Rectangle2D> cells, int pageNumber) {
if (!active) {
return;
}
this.cells.getVisualizationsOnPages()
.put(pageNumber - 1,
VisualizationsOnPage.builder()
.coloredRectangles(cells.stream()
.map(ruling -> new ColoredRectangle(ruling, CELLS_COLOR, 1))
.toList())
.build());
}
public void addZoneVisualizations(List<Zone> zones, int page) {
if (!active) {
return;
}
this.zones.getVisualizationsOnPages()
.put(page - 1,
VisualizationsOnPage.builder()
.coloredRectangles(zones.stream()
.map(BoundingBox::getBBox)
.map(zone -> new ColoredRectangle(zone, ZONES_COLOR, 1))
.toList())
.build());
}
public void addLineVisualizations(List<Zone> zones, int page) {
if (!active) {
return;
}
this.lines.getVisualizationsOnPages()
.put(page - 1,
VisualizationsOnPage.builder()
.coloredRectangles(zones.stream()
.map(Zone::getLines)
.flatMap(Collection::stream)
.map(BoundingBox::getBBox)
.map(line -> new ColoredRectangle(line, LINES_COLOR, 1))
.toList())
.build());
}
public void addMainBodyVisualization(Rectangle rectangle, int pageNumber) {
if (!active) {
return;
}
this.mainBody.getVisualizationsOnPages()
.put(pageNumber - 1,
VisualizationsOnPage.builder()
.coloredRectangles(List.of(new ColoredRectangle(new Rectangle2D.Double(rectangle.getTopLeft().getX(),
rectangle.getTopLeft().getY(),
rectangle.getWidth(),
rectangle.getHeight()), MAIN_BODY_COLOR, 1)))
.build());
}
public void addMarkedContentVisualizations(List<PDMarkedContent> markedContents, int pageNumber, PDPage pdPage) {
if (!active) {
return;
}
List<MarkedContentUtils.MarkedContentPosition> markedContentBBoxMapBySubType = MarkedContentUtils.getMarkedContentPositions(markedContents, pdPage);
VisualizationsOnPage visualizationsOnPage = VisualizationsOnPage.builder().build();
this.markedContent.getVisualizationsOnPages().put(pageNumber - 1, visualizationsOnPage);
markedContentBBoxMapBySubType.forEach(markedContentPosition -> {
var bbox = markedContentPosition.textPositions()
.stream()
.collect(RectangleTransformations.collectBBox());
String type = markedContentPosition.formattedType();
float translationAmount = ((FONT.getStringWidth(type) / 1000) * 10 + (2 * 1) + 4);
visualizationsOnPage.getPlacedTexts()
.add(PlacedText.textFacingUp(type, new Point2D.Double(bbox.getX() - translationAmount, bbox.getY() + bbox.getHeight()), 10, Color.BLACK, FONT));
visualizationsOnPage.getColoredRectangles().add(new ColoredRectangle(bbox, MARKED_CONTENT_COLOR, 1));
}
);
}
public void addCharactersWithNeighbours(List<Zone> zones, int page) {
if (!active) {
return;
}
VisualizationsOnPage neighbourVisualizations = VisualizationsOnPage.builder().build();
neighbours.getVisualizationsOnPages().put(page - 1, neighbourVisualizations);
VisualizationsOnPage characterVisualizations = VisualizationsOnPage.builder().build();
characters.getVisualizationsOnPages().put(page - 1, characterVisualizations);
AtomicInteger index = new AtomicInteger(0);
zones.forEach(zone -> zone.getLines()
.stream()
.map(Line::getCharacters)
.flatMap(Collection::stream)
.forEach(character -> {
Color color = ROTATING_CHARACTER_COLOR.get(index.getAndIncrement() % ROTATING_CHARACTER_COLOR.size());
Rectangle2D charBBox = character.getTextPosition().getInitialUserSpacePosition();
characterVisualizations.getColoredRectangles().add(new ColoredRectangle(charBBox, color, 1));
character.getNeighbors()
.forEach(neighbor -> {
Rectangle2D neighborBBox = neighbor.getCharacter().getTextPosition().getInitialUserSpacePosition();
Line2D line = new Line2D.Double(new Point2D.Double(charBBox.getCenterX(), charBBox.getCenterY()),
new Point2D.Double(neighborBBox.getCenterX(), neighborBBox.getCenterY()));
neighbourVisualizations.getColoredLines().add(new ColoredLine(line, color, 1));
});
}));
}
}

View File

@ -1,10 +1,20 @@
package com.knecon.fforesight.service.layoutparser.server; package com.knecon.fforesight.service.layoutparser.server;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Arrays; import java.util.Arrays;
import java.util.Comparator;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.core.io.ClassPathResource;
import com.iqser.red.storage.commons.utils.FileSystemBackedStorageService;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
@ -22,26 +32,63 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
@Test @Test
@SneakyThrows
public void testLayoutParserEndToEnd() { public void testLayoutParserEndToEnd() {
prepareStorage("files/bdr/Wie weiter bei Kristeneinrichtungen.pdf"); String filePath = "files/bdr/Wie weiter bei Kristeneinrichtungen.pdf";
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER);
LayoutParsingFinishedEvent finishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest); runForFile(filePath);
Arrays.stream(finishedEvent.message().split("\n")) }
.forEach(log::info);
@Test
@Disabled
@SneakyThrows
public void testLayoutParserEndToEndWithFolder() {
String folder = "/home/kschuettler/Dokumente/TestFiles/syn-dm-testfiles-pdftron-ocred";
List<Path> pdfFiles = Files.walk(Path.of(folder))
.filter(path -> path.getFileName().toString().endsWith(".pdf"))
.sorted(Comparator.comparing(Path::getFileName))
.peek(System.out::println)
.toList();
System.out.printf("Found %d pdf files to process %n", pdfFiles.size());
AtomicInteger count = new AtomicInteger(0);
pdfFiles.stream()
.peek(path -> log.info("{}/{}-{}", count.getAndIncrement(), pdfFiles.size(), path.getFileName()))
.forEach(path -> runForFile(path.toFile().toString()));
} }
@Test
@SneakyThrows @SneakyThrows
public void testLayoutParserEndToEnd_RED_8747() { private void runForFile(String filePath) {
String fileName = Path.of(filePath).getFileName().toString();
File file;
if (filePath.startsWith("files")) { // from resources
file = new ClassPathResource(filePath).getFile();
} else { // absolute path
file = new File(filePath);
}
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(fileName, LayoutParsingType.REDACT_MANAGER, true);
prepareStorage(layoutParsingRequest, file);
prepareStorage("files/syngenta/CustomerFiles/SinglePages/Page26_fRR A23317A PI0015600 CEU core part B6 - CZ.pdf");
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER_OLD);
LayoutParsingFinishedEvent finishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest); LayoutParsingFinishedEvent finishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest);
Arrays.stream(finishedEvent.message().split("\n")) Arrays.stream(finishedEvent.message().split("\n"))
.forEach(log::info); .forEach(log::info);
File tmpFile = new File("/tmp/layout-E2E/" + fileName + "_VIEWER.pdf");
assert tmpFile.getParentFile().exists() || tmpFile.getParentFile().mkdirs();
storageService.downloadTo(TENANT_ID, layoutParsingRequest.viewerDocumentStorageId(), tmpFile);
}
@AfterEach
public void cleanUpTmp() {
((FileSystemBackedStorageService) storageService).clearStorage();
} }
} }

View File

@ -23,6 +23,10 @@ import lombok.SneakyThrows;
public class ViewerDocumentTest extends BuildDocumentTest { public class ViewerDocumentTest extends BuildDocumentTest {
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null);
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
@Test @Test
@SneakyThrows @SneakyThrows
public void testViewerDocument() { public void testViewerDocument() {
@ -31,11 +35,9 @@ public class ViewerDocumentTest extends BuildDocumentTest {
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf"; String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
var documentFile = new ClassPathResource(fileName).getFile(); var documentFile = new ClassPathResource(fileName).getFile();
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null);
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
long start = System.currentTimeMillis(); long start = System.currentTimeMillis();
Document document = buildGraph(fileName, LayoutParsingType.DOCUMINE); Document document = buildGraph(fileName, LayoutParsingType.REDACT_MANAGER);
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true); layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true);
System.out.printf("Total time: %.2fs%n", ((float) (System.currentTimeMillis() - start)) / 1000); System.out.printf("Total time: %.2fs%n", ((float) (System.currentTimeMillis() - start)) / 1000);
} }
@ -55,11 +57,11 @@ public class ViewerDocumentTest extends BuildDocumentTest {
var documentFile = new ClassPathResource(fileName).getFile(); var documentFile = new ClassPathResource(fileName).getFile();
var classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE, var classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE,
documentFile, documentFile,
new ImageServiceResponse(), new ImageServiceResponse(),
tableResponse, tableResponse,
new VisualLayoutParsingResponse(), new VisualLayoutParsingResponse(),
Map.of("file", Path.of(fileName).getFileName().toFile().toString())); Map.of("file", Path.of(fileName).getFileName().toFile().toString()));
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null); ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null);
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService); LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
Document document = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.DOCUMINE, classificationDocument); Document document = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.DOCUMINE, classificationDocument);

View File

@ -1,6 +1,9 @@
package com.knecon.fforesight.service.layoutparser.server.utils; package com.knecon.fforesight.service.layoutparser.server.utils;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream; import java.io.InputStream;
import java.nio.file.Path;
import java.util.Map; import java.util.Map;
import java.util.Optional; import java.util.Optional;
@ -102,29 +105,22 @@ public abstract class AbstractTest {
} }
@SneakyThrows protected LayoutParsingRequest buildDefaultLayoutParsingRequest(String fileName, LayoutParsingType layoutParsingType, boolean debug) {
protected LayoutParsingRequest prepareStorage(InputStream fileInputStream) {
storageService.storeObject(TenantContext.getTenantId(), ORIGIN_FILE_ID, fileInputStream);
return buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER_OLD);
}
protected LayoutParsingRequest buildDefaultLayoutParsingRequest(LayoutParsingType layoutParsingType) {
var identifier = debug ? Map.of("fileId", fileName, "debug", "true") : Map.of("fileId", fileName);
return LayoutParsingRequest.builder() return LayoutParsingRequest.builder()
.identifier(Map.of("fileId", "1337")) .identifier(identifier)
.layoutParsingType(layoutParsingType) .layoutParsingType(layoutParsingType)
.originFileStorageId(ORIGIN_FILE_ID) .originFileStorageId(fileName + ORIGIN_FILE_ID)
.tablesFileStorageId(Optional.of(TABLE_FILE_ID)) .tablesFileStorageId(Optional.of(fileName + TABLE_FILE_ID))
.imagesFileStorageId(Optional.of(IMAGE_FILE_ID)) .imagesFileStorageId(Optional.of(fileName + IMAGE_FILE_ID))
.visualLayoutParsingFileId(Optional.of(VISUAL_LAYOUT_FILE)) .visualLayoutParsingFileId(Optional.of(fileName + VISUAL_LAYOUT_FILE))
.structureFileStorageId(STRUCTURE_FILE_ID) .structureFileStorageId(fileName + STRUCTURE_FILE_ID)
.textBlockFileStorageId(TEXT_FILE_ID) .textBlockFileStorageId(fileName + TEXT_FILE_ID)
.positionBlockFileStorageId(POSITION_FILE_ID) .positionBlockFileStorageId(fileName + POSITION_FILE_ID)
.pageFileStorageId(PAGES_FILE_ID) .pageFileStorageId(fileName + PAGES_FILE_ID)
.simplifiedTextStorageId(SIMPLIFIED_ID) .simplifiedTextStorageId(fileName + SIMPLIFIED_ID)
.viewerDocumentStorageId(VIEWER_DOCUMENT_ID) .viewerDocumentStorageId(fileName + VIEWER_DOCUMENT_ID)
.build(); .build();
} }
@ -148,10 +144,28 @@ public abstract class AbstractTest {
ClassPathResource imageInfoFileResource = new ClassPathResource(imageInfoFile); ClassPathResource imageInfoFileResource = new ClassPathResource(imageInfoFile);
ClassPathResource visualLayoutParsingResponseResource = new ClassPathResource(visualLayoutParsingResponseFile); ClassPathResource visualLayoutParsingResponseResource = new ClassPathResource(visualLayoutParsingResponseFile);
return prepareStorage(pdfFileResource.getInputStream(), return prepareStorage(Path.of(file).getFileName().toString(),
cvServiceResponseFileResource.getInputStream(), pdfFileResource.getInputStream(),
imageInfoFileResource.getInputStream(), cvServiceResponseFileResource.getInputStream(),
visualLayoutParsingResponseResource.getInputStream()); imageInfoFileResource.getInputStream(),
visualLayoutParsingResponseResource.getInputStream());
}
@SneakyThrows
protected void prepareStorage(LayoutParsingRequest layoutParsingRequest, File file) {
ClassPathResource cvServiceResponseFileResource = new ClassPathResource("cv_table_parsing_response/empty.json");
ClassPathResource imageInfoFileResource = new ClassPathResource("image_service_response/empty.json");
ClassPathResource visualLayoutParsingResponseResource = new ClassPathResource("visual_layout_parsing_response/empty.json");
try (var in = new FileInputStream(file)) {
prepareStorage(layoutParsingRequest,
in,
cvServiceResponseFileResource.getInputStream(),
imageInfoFileResource.getInputStream(),
visualLayoutParsingResponseResource.getInputStream());
}
} }
@ -162,12 +176,27 @@ public abstract class AbstractTest {
storageService.storeObject(TenantContext.getTenantId(), TABLE_FILE_ID, cvServiceResponseFileStream); storageService.storeObject(TenantContext.getTenantId(), TABLE_FILE_ID, cvServiceResponseFileStream);
storageService.storeObject(TenantContext.getTenantId(), ORIGIN_FILE_ID, fileStream); storageService.storeObject(TenantContext.getTenantId(), ORIGIN_FILE_ID, fileStream);
return buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER_OLD); return buildDefaultLayoutParsingRequest("test", LayoutParsingType.REDACT_MANAGER_OLD, true);
} }
@SneakyThrows @SneakyThrows
protected LayoutParsingRequest prepareStorage(InputStream fileStream, protected void prepareStorage(LayoutParsingRequest layoutParsingRequest,
InputStream fileStream,
InputStream cvServiceResponseFileStream,
InputStream imageInfoStream,
InputStream visualLayoutParsingResponseFileStream) {
storageService.storeObject(TenantContext.getTenantId(), layoutParsingRequest.imagesFileStorageId().get(), imageInfoStream);
storageService.storeObject(TenantContext.getTenantId(), layoutParsingRequest.tablesFileStorageId().get(), cvServiceResponseFileStream);
storageService.storeObject(TenantContext.getTenantId(), layoutParsingRequest.originFileStorageId(), fileStream);
storageService.storeObject(TenantContext.getTenantId(), layoutParsingRequest.visualLayoutParsingFileId().get(), visualLayoutParsingResponseFileStream);
}
@SneakyThrows
protected LayoutParsingRequest prepareStorage(String fileName,
InputStream fileStream,
InputStream cvServiceResponseFileStream, InputStream cvServiceResponseFileStream,
InputStream imageInfoStream, InputStream imageInfoStream,
InputStream visualLayoutParsingResponseFileStream) { InputStream visualLayoutParsingResponseFileStream) {
@ -177,7 +206,7 @@ public abstract class AbstractTest {
storageService.storeObject(TenantContext.getTenantId(), ORIGIN_FILE_ID, fileStream); storageService.storeObject(TenantContext.getTenantId(), ORIGIN_FILE_ID, fileStream);
storageService.storeObject(TenantContext.getTenantId(), VISUAL_LAYOUT_FILE, visualLayoutParsingResponseFileStream); storageService.storeObject(TenantContext.getTenantId(), VISUAL_LAYOUT_FILE, visualLayoutParsingResponseFileStream);
return buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER_OLD); return buildDefaultLayoutParsingRequest(fileName, LayoutParsingType.REDACT_MANAGER_OLD, true);
} }

View File

@ -1,11 +1,13 @@
package com.knecon.fforesight.service.layoutparser.server.utils; package com.knecon.fforesight.service.layoutparser.server.utils;
import java.io.File; import java.io.File;
import java.nio.file.Path;
import java.util.Map; import java.util.Map;
import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.core.io.ClassPathResource; import org.springframework.core.io.ClassPathResource;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline; import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
@ -28,11 +30,11 @@ public abstract class BuildDocumentTest extends AbstractTest {
File fileResource = new ClassPathResource(filename).getFile(); File fileResource = new ClassPathResource(filename).getFile();
prepareStorage(filename); prepareStorage(filename);
return layoutParsingPipeline.parseLayout(layoutParsingType, return layoutParsingPipeline.parseLayout(layoutParsingType,
fileResource, fileResource,
layoutParsingStorageService.getImagesFile(IMAGE_FILE_ID), layoutParsingStorageService.getImagesFile(IMAGE_FILE_ID),
new TableServiceResponse(), new TableServiceResponse(),
new VisualLayoutParsingResponse(), new VisualLayoutParsingResponse(),
Map.of("file",filename)); Map.of("file", filename, "debug", "true"));
} }
@ -46,13 +48,25 @@ public abstract class BuildDocumentTest extends AbstractTest {
@SneakyThrows @SneakyThrows
protected Document buildGraph(String filename, LayoutParsingType layoutParsingType) { protected Document buildGraph(String filename, LayoutParsingType layoutParsingType) {
if (filename.equals("files/syngenta/CustomerFiles/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf")) { if (!filename.startsWith("files") && filename.startsWith("/")) {
prepareStorage(filename, "cv_table_parsing_response/empty.json", "image_service_response/S-Metolachlor_RAR_01_Volume_1_2018-09-06.IMAGE_INFO.json"); LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(Path.of(filename).getFileName().toString(), LayoutParsingType.REDACT_MANAGER, true);
prepareStorage(layoutParsingRequest, new File(filename));
return DocumentGraphFactory.buildDocumentGraph(layoutParsingType,
layoutParsingPipeline.parseLayout(layoutParsingType,
new File(filename),
layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId().get()),
new TableServiceResponse(),
new VisualLayoutParsingResponse(),
layoutParsingRequest.identifier()));
} else { } else {
prepareStorage(filename); if (filename.equals("files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf")) {
prepareStorage(filename, "cv_table_parsing_response/empty.json", "image_service_response/S-Metolachlor_RAR_01_Volume_1_2018-09-06.IMAGE_INFO.json");
} else {
prepareStorage(filename);
}
return DocumentGraphFactory.buildDocumentGraph(layoutParsingType, parseLayout(filename, layoutParsingType));
} }
return DocumentGraphFactory.buildDocumentGraph(layoutParsingType, parseLayout(filename, layoutParsingType));
} }
} }

View File

@ -26,6 +26,23 @@ public class ContentStreams {
public static Identifier ESCAPE_END = new Identifier("escape start", COSName.getPDFName("ESCAPE_END"), false); public static Identifier ESCAPE_END = new Identifier("escape start", COSName.getPDFName("ESCAPE_END"), false);
public static Identifier RULINGS = new Identifier("Rulings", COSName.getPDFName("KNECON_RULINGS"), true);
public static Identifier WORDS = new Identifier("Words", COSName.getPDFName("KNECON_WORDS"), true);
public static Identifier ZONES = new Identifier("Text Zones", COSName.getPDFName("KNECON_ZONES"), true);
public static Identifier LINES = new Identifier("Text Lines", COSName.getPDFName("KNECON_LINES"), true);
public static Identifier CELLS = new Identifier("Cells", COSName.getPDFName("KNECON_CELLS"), true);
public static Identifier MAIN_BODY = new Identifier("Main Text Body", COSName.getPDFName("KNECON_MAIN_BODY"), true);
public static Identifier MARKED_CONTENT = new Identifier("Marked content", COSName.getPDFName("KNECON_MARKED_CONTENT"), true);
public static Identifier NEIGHBOURS = new Identifier("Neighbours", COSName.getPDFName("KNECON_NEIGHBOURS"), true);
public static Identifier CHARACTERS = new Identifier("Characters", COSName.getPDFName("KNECON_CHARACTERS"), true);
public static List<Identifier> allContentStreams = List.of(KNECON_LAYOUT, public static List<Identifier> allContentStreams = List.of(KNECON_LAYOUT,
KNECON_VISUAL_PARSING, KNECON_VISUAL_PARSING,
KNECON_OCR, KNECON_OCR,
@ -33,7 +50,16 @@ public class ContentStreams {
KNECON_OCR_TEXT_DEBUG, KNECON_OCR_TEXT_DEBUG,
OTHER, OTHER,
ESCAPE_START, ESCAPE_START,
ESCAPE_END); ESCAPE_END,
RULINGS,
WORDS,
ZONES,
LINES,
MAIN_BODY,
MARKED_CONTENT,
NEIGHBOURS,
CHARACTERS,
CELLS);
public record Identifier(String name, COSName cosName, boolean optionalContent) { public record Identifier(String name, COSName cosName, boolean optionalContent) {

View File

@ -1,5 +1,6 @@
package com.knecon.fforesight.service.viewerdoc.model; package com.knecon.fforesight.service.viewerdoc.model;
import java.util.LinkedHashMap;
import java.util.Map; import java.util.Map;
import com.knecon.fforesight.service.viewerdoc.ContentStreams; import com.knecon.fforesight.service.viewerdoc.ContentStreams;
@ -17,7 +18,8 @@ import lombok.experimental.FieldDefaults;
public class Visualizations { public class Visualizations {
ContentStreams.Identifier layer; ContentStreams.Identifier layer;
Map<Integer, VisualizationsOnPage> visualizationsOnPages; @Builder.Default
Map<Integer, VisualizationsOnPage> visualizationsOnPages = new LinkedHashMap<>();
boolean layerVisibilityDefaultValue; boolean layerVisibilityDefaultValue;
} }

View File

@ -53,12 +53,6 @@ public class ViewerDocumentService {
private final ObservationRegistry registry; private final ObservationRegistry registry;
public void addVisualizationsOnPage(File originFile, File destinationFile, Visualizations visualizations) {
addVisualizationsOnPage(originFile, destinationFile, List.of(visualizations));
}
@Observed(name = "ViewerDocumentService", contextualName = "add-visualizations") @Observed(name = "ViewerDocumentService", contextualName = "add-visualizations")
@SneakyThrows @SneakyThrows
public void addVisualizationsOnPage(File originFile, File destinationFile, List<Visualizations> visualizations) { public void addVisualizationsOnPage(File originFile, File destinationFile, List<Visualizations> visualizations) {
@ -70,9 +64,14 @@ public class ViewerDocumentService {
PDDocument pdDocument = openPDDocument(tmpFile.toFile()); PDDocument pdDocument = openPDDocument(tmpFile.toFile());
enrichObservation(pdDocument, visualizations.stream().map(Visualizations::getLayer).toList()); enrichObservation(pdDocument,
visualizations.stream()
.map(Visualizations::getLayer)
.toList());
Set<ContentStreams.Identifier> allLayers = visualizations.stream().map(Visualizations::getLayer).collect(Collectors.toUnmodifiableSet()); Set<ContentStreams.Identifier> allLayers = visualizations.stream()
.map(Visualizations::getLayer)
.collect(Collectors.toUnmodifiableSet());
Map<ContentStreams.Identifier, PDOptionalContentGroup> optionalContentGroupMap = addLayersToDocument(visualizations, pdDocument); Map<ContentStreams.Identifier, PDOptionalContentGroup> optionalContentGroupMap = addLayersToDocument(visualizations, pdDocument);
@ -186,7 +185,8 @@ public class ViewerDocumentService {
contentStream.setFont(font, placedText.fontSize()); contentStream.setFont(font, placedText.fontSize());
contentStream.beginText(); contentStream.beginText();
contentStream.setNonStrokingColor(placedText.color()); contentStream.setNonStrokingColor(placedText.color());
if (placedText.renderingMode().isPresent()) { if (placedText.renderingMode()
.isPresent()) {
contentStream.setRenderingMode(placedText.renderingMode().get()); contentStream.setRenderingMode(placedText.renderingMode().get());
} else { } else {
contentStream.setRenderingMode(RenderingMode.FILL); contentStream.setRenderingMode(RenderingMode.FILL);
@ -229,11 +229,11 @@ public class ViewerDocumentService {
Matrix textMatrix; Matrix textMatrix;
if (placedText.textMatrix().isEmpty()) { if (placedText.textMatrix().isEmpty()) {
textMatrix = new Matrix((float) textDeRotationMatrix.getScaleX(), textMatrix = new Matrix((float) textDeRotationMatrix.getScaleX(),
(float) textDeRotationMatrix.getShearX(), (float) textDeRotationMatrix.getShearX(),
(float) textDeRotationMatrix.getShearY(), (float) textDeRotationMatrix.getShearY(),
(float) textDeRotationMatrix.getScaleY(), (float) textDeRotationMatrix.getScaleY(),
(float) placedText.lineStart().getX(), (float) placedText.lineStart().getX(),
(float) placedText.lineStart().getY()); (float) placedText.lineStart().getY());
} else { } else {
textMatrix = placedText.textMatrix().get(); textMatrix = placedText.textMatrix().get();
} }