RED-8825: general improvements
* classify rulings as underline/strikethrough * improve performance of CleanRulings.lineBetween * use lineBetween where possible * wip, still todo: - Header/Footer by Ruling for all rotations - actually the ticket, optimizing layoutparsing for documine
This commit is contained in:
parent
e4663ac8db
commit
1916e626df
@ -45,6 +45,7 @@ import com.knecon.fforesight.service.layoutparser.processor.services.RulingClean
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.SimplifiedSectionTextService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.TableExtractionService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.TextRulingsClassifier;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocstrumBlockificationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocuMineBlockificationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.RedactManagerBlockificationService;
|
||||
@ -263,26 +264,21 @@ public class LayoutParsingPipeline {
|
||||
CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(pdfTableCells.get(pageNumber), stripper.getRulings());
|
||||
|
||||
List<Cell> emptyTableCells = TableExtractionService.findCells(cleanRulings.getHorizontals(), cleanRulings.getVerticals());
|
||||
classificationDocument.getVisualizations().addCellVisualizations(emptyTableCells, pageNumber);
|
||||
|
||||
var graphics = graphicExtractorService.extractPathElementGraphics(originDocument,
|
||||
pdPage,
|
||||
pageNumber,
|
||||
cleanRulings,
|
||||
stripper.getTextPositionSequences(),
|
||||
emptyTableCells,
|
||||
false);
|
||||
TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(words, cleanRulings);
|
||||
|
||||
var graphics = graphicExtractorService.extractPathElementGraphics(originDocument, pdPage, pageNumber, cleanRulings, stripper.getTextPositionSequences(), false);
|
||||
|
||||
pdfImages.computeIfAbsent(pageNumber, x -> new ArrayList<>())
|
||||
.addAll(graphics.stream()
|
||||
.map(g -> new ClassifiedImage(new Rectangle2D.Double(g.x1, g.y1, g.width(), g.height()), ImageType.GRAPHIC, false, stripper.getPageNumber()))
|
||||
.toList());
|
||||
|
||||
classificationDocument.getVisualizations().addCellVisualizations(emptyTableCells, pageNumber);
|
||||
.map(g -> new ClassifiedImage(new Rectangle2D.Double(g.x1, g.y1, g.width(), g.height()), ImageType.GRAPHIC, false, stripper.getPageNumber()))
|
||||
.toList());
|
||||
|
||||
ClassificationPage classificationPage = switch (layoutParsingType) {
|
||||
case REDACT_MANAGER_OLD ->
|
||||
redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, classificationDocument.getVisualizations());
|
||||
case DOCUMINE_OLD -> docuMineBlockificationService.blockify(words, cleanRulings.getHorizontals(), cleanRulings.getVerticals());
|
||||
redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings, classificationDocument.getVisualizations());
|
||||
case DOCUMINE_OLD -> docuMineBlockificationService.blockify(words, cleanRulings);
|
||||
case DOCUMINE, REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG ->
|
||||
docstrumBlockificationService.blockify(words, cleanRulings, true, classificationDocument.getVisualizations());
|
||||
case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> docstrumBlockificationService.blockify(words, cleanRulings, false, classificationDocument.getVisualizations());
|
||||
|
||||
@ -68,6 +68,7 @@ public class ZoneBuilderService {
|
||||
if (rulings.lineBetween(outerLine.getBBox(), innerLine.getBBox())) {
|
||||
return;
|
||||
}
|
||||
|
||||
unionFind.union(outerLine, innerLine);
|
||||
|
||||
});
|
||||
@ -151,7 +152,9 @@ public class ZoneBuilderService {
|
||||
outputZone.add(new Line(characters, characterSpacing));
|
||||
}
|
||||
|
||||
return new Zone(outputZone.stream().sorted(Comparator.comparing(Line::getY0)).collect(Collectors.toList()));
|
||||
return new Zone(outputZone.stream()
|
||||
.sorted(Comparator.comparing(Line::getY0))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,25 +1,37 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.table;
|
||||
|
||||
import java.awt.geom.Line2D;
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Character;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.Getter;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
@Getter
|
||||
public class CleanRulings {
|
||||
|
||||
List<Ruling> horizontals;
|
||||
List<Ruling> verticals;
|
||||
List<Ruling> horizontals; // unmodifiable sorted by Y list
|
||||
List<Ruling> verticals; // unmodifiable sorted by X list
|
||||
|
||||
|
||||
/**
 * Creates the rulings of one page from the given lines.
 * <p>
 * Both lists are validated and then stored as unmodifiable copies: horizontals ordered by ascending {@code y1},
 * verticals ordered by ascending {@code x1}.
 *
 * @param horizontals rulings that must all pass {@link Ruling#assertHorizontal()}
 * @param verticals   rulings that must all pass {@link Ruling#assertVertical()}
 * @throws IllegalArgumentException if a ruling has the wrong orientation
 */
public CleanRulings(List<Ruling> horizontals, List<Ruling> verticals) {

    // Validate explicitly rather than through Stream.peek, which the JDK documents as a debugging aid.
    horizontals.forEach(Ruling::assertHorizontal);
    this.horizontals = horizontals.stream()
            // comparingDouble avoids boxing every coordinate while keeping the same ordering
            .sorted(Comparator.comparingDouble(Line2D.Float::getY1))
            .toList();

    verticals.forEach(Ruling::assertVertical);
    this.verticals = verticals.stream()
            .sorted(Comparator.comparingDouble(Line2D.Float::getX1))
            .toList();
}
|
||||
|
||||
|
||||
public CleanRulings getTableLines() {
|
||||
@ -33,6 +45,28 @@ public class CleanRulings {
|
||||
}
|
||||
|
||||
|
||||
public CleanRulings withoutTextRulings() {
|
||||
|
||||
return new CleanRulings(horizontals.stream()
|
||||
.filter(ruling -> !(ruling.getClassification().equals(Ruling.Classification.UNDERLINE) || ruling.getClassification()
|
||||
.equals(Ruling.Classification.STRIKETROUGH)))
|
||||
.toList(),
|
||||
verticals.stream()
|
||||
.filter(ruling -> !(ruling.getClassification().equals(Ruling.Classification.UNDERLINE) || ruling.getClassification()
|
||||
.equals(Ruling.Classification.STRIKETROUGH)))
|
||||
.toList());
|
||||
}
|
||||
|
||||
|
||||
public List<Ruling> buildAll() {
|
||||
|
||||
ArrayList<Ruling> rulings = new ArrayList<>(horizontals.size() + verticals.size());
|
||||
rulings.addAll(horizontals);
|
||||
rulings.addAll(verticals);
|
||||
return rulings;
|
||||
}
|
||||
|
||||
|
||||
public boolean lineBetween(Character a, Character b) {
|
||||
|
||||
return lineBetween(a.getTextPosition().getInitialUserSpacePosition(), b.getTextPosition().getInitialUserSpacePosition());
|
||||
@ -53,28 +87,122 @@ public class CleanRulings {
|
||||
Ruling ruling = new Ruling(p1, p2);
|
||||
|
||||
if (ruling.isHorizontal()) {
|
||||
return verticals.stream()
|
||||
return getVerticalsInXInterval(ruling.x1, ruling.x2).stream()
|
||||
.anyMatch(vertical -> vertical.intersectsLine(ruling));
|
||||
|
||||
}
|
||||
|
||||
if (ruling.isVertical()) {
|
||||
return horizontals.stream()
|
||||
return getHorizontalsInYInterval(ruling.y1, ruling.y2).stream()
|
||||
.anyMatch(horizontal -> horizontal.intersectsLine(ruling));
|
||||
|
||||
}
|
||||
|
||||
return buildAll().stream()
|
||||
return Stream.of(getVerticalsInXInterval(ruling.x1, ruling.x2), getHorizontalsInYInterval(ruling.y1, ruling.y2))
|
||||
.flatMap(Collection::stream)
|
||||
.anyMatch(other -> other.intersectsLine(ruling));
|
||||
}
|
||||
|
||||
|
||||
public List<Ruling> buildAll() {
|
||||
public List<Ruling> getHorizontalsInYInterval(float y1, float y2) {
|
||||
|
||||
ArrayList<Ruling> rulings = new ArrayList<>(horizontals.size() + verticals.size());
|
||||
rulings.addAll(horizontals);
|
||||
rulings.addAll(verticals);
|
||||
return rulings;
|
||||
float startY = Math.min(y1, y2);
|
||||
float endY = Math.max(y1, y2);
|
||||
|
||||
if (horizontals.isEmpty() || Float.isNaN(startY) || Float.isNaN(endY)) {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
|
||||
int firstGreaterThanIdx = findFirstHorizontalRulingIdxAbove(startY);
|
||||
|
||||
if (firstGreaterThanIdx == -1) {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
|
||||
List<Ruling> result = new ArrayList<>();
|
||||
for (int i = firstGreaterThanIdx; i < horizontals.size(); i++) {
|
||||
Ruling horizontal = horizontals.get(i);
|
||||
if (horizontal.y1 > endY) {
|
||||
break;
|
||||
}
|
||||
result.add(horizontal);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
private int findFirstHorizontalRulingIdxAbove(float y) {
|
||||
|
||||
int low = 0;
|
||||
int high = horizontals.size() - 1;
|
||||
|
||||
while (low <= high) {
|
||||
int mid = low + (high - low) / 2;
|
||||
Line2D.Float midLine = horizontals.get(mid);
|
||||
float midY = midLine.y1;
|
||||
|
||||
if (midY == y) {
|
||||
return mid;
|
||||
} else if (midY > y) {
|
||||
high = mid - 1;
|
||||
} else {
|
||||
low = mid + 1;
|
||||
}
|
||||
}
|
||||
|
||||
// Return the index of the first element greater than y or -1 if not found
|
||||
return horizontals.size() > low && horizontals.get(low).y1 > y ? low : -1;
|
||||
}
|
||||
|
||||
|
||||
public List<Ruling> getVerticalsInXInterval(float x1, float x2) {
|
||||
|
||||
float startX = Math.min(x1, x2);
|
||||
float endX = Math.max(x1, x2);
|
||||
|
||||
if (verticals.isEmpty() || Float.isNaN(startX) || Float.isNaN(endX)) {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
|
||||
int firstGreaterThanIdx = findFirstVerticalRulingIdxRightOf(startX);
|
||||
|
||||
if (firstGreaterThanIdx == -1) {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
|
||||
List<Ruling> result = new ArrayList<>();
|
||||
for (int i = firstGreaterThanIdx; i < verticals.size(); i++) {
|
||||
Ruling horizontal = verticals.get(i);
|
||||
if (horizontal.x1 > endX) {
|
||||
break;
|
||||
}
|
||||
result.add(horizontal);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
private int findFirstVerticalRulingIdxRightOf(float x) {
|
||||
|
||||
int low = 0;
|
||||
int high = verticals.size() - 1;
|
||||
|
||||
while (low <= high) {
|
||||
int mid = low + (high - low) / 2;
|
||||
Line2D.Float midLine = verticals.get(mid);
|
||||
float midX = midLine.x1;
|
||||
|
||||
if (midX == x) {
|
||||
return mid;
|
||||
} else if (midX > x) {
|
||||
high = mid - 1;
|
||||
} else {
|
||||
low = mid + 1;
|
||||
}
|
||||
}
|
||||
|
||||
// Return the index of the first element greater than y or -1 if not found
|
||||
return verticals.size() > low && verticals.get(low).x1 > x ? low : -1;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -72,6 +72,25 @@ public class Ruling extends Line2D.Float {
|
||||
}
|
||||
|
||||
|
||||
public void assertHorizontal() {
|
||||
|
||||
if (isHorizontal()) {
|
||||
return;
|
||||
}
|
||||
throw new IllegalArgumentException("Ruling " + this + " is not horizontal");
|
||||
|
||||
}
|
||||
|
||||
|
||||
public void assertVertical() {
|
||||
|
||||
if (isVertical()) {
|
||||
return;
|
||||
}
|
||||
throw new IllegalArgumentException("Ruling " + this + " is not vertical");
|
||||
}
|
||||
|
||||
|
||||
public boolean isVertical() {
|
||||
|
||||
return this.length() > 0 && DoubleComparisons.feq(this.x1, this.x2); //diff < ORIENTATION_CHECK_THRESHOLD;
|
||||
|
||||
@ -44,6 +44,8 @@ public class TextPositionSequence implements CharSequence {
|
||||
private float pageHeight;
|
||||
private float pageWidth;
|
||||
private boolean isParagraphStart;
|
||||
private boolean strikethrough;
|
||||
private boolean underline;
|
||||
|
||||
|
||||
public TextPositionSequence(List<TextPosition> textPositions, int pageNumber, boolean isParagraphStart) {
|
||||
|
||||
@ -44,9 +44,9 @@ public class GapDetectionService {
|
||||
|
||||
if (yDifference > avgTextPositionHeight * Y_GAP_FACTOR) {
|
||||
yGapContext.addGap(mainBodyTextFrame.getMinX(),
|
||||
previousTextPositionBBox.getMaxY(),
|
||||
mainBodyTextFrame.getWidth(),
|
||||
-(previousTextPositionBBox.getMaxY() - currentTextPositionBBox.getMinY()));
|
||||
previousTextPositionBBox.getMaxY(),
|
||||
mainBodyTextFrame.getWidth(),
|
||||
-(previousTextPositionBBox.getMaxY() - currentTextPositionBBox.getMinY()));
|
||||
}
|
||||
if (yDifference > avgTextPositionHeight * NEW_LINE_FACTOR) {
|
||||
|
||||
@ -72,29 +72,34 @@ public class GapDetectionService {
|
||||
return mirrorY(RectangleTransformations.toRectangle2D(textPosition.getRectangle()));
|
||||
}
|
||||
|
||||
|
||||
private static Rectangle2D mirrorY(Rectangle2D rectangle2D) {
|
||||
|
||||
return new Rectangle2D.Double(rectangle2D.getX(), Math.min(rectangle2D.getMinY(), rectangle2D.getMaxY()), rectangle2D.getWidth(), Math.abs(rectangle2D.getHeight()));
|
||||
}
|
||||
|
||||
|
||||
private static void addGapToLine(Rectangle2D currentTextPosition, Rectangle2D previousTextPosition, XGapsContext context) {
|
||||
|
||||
context.gapsInCurrentLine.add(new Rectangle2D.Double(previousTextPosition.getMaxX(),
|
||||
previousTextPosition.getMinY(),
|
||||
currentTextPosition.getMinX() - previousTextPosition.getMaxX(),
|
||||
(previousTextPosition.getHeight() + currentTextPosition.getHeight()) / 2));
|
||||
previousTextPosition.getMinY(),
|
||||
currentTextPosition.getMinX() - previousTextPosition.getMaxX(),
|
||||
(previousTextPosition.getHeight() + currentTextPosition.getHeight()) / 2));
|
||||
}
|
||||
|
||||
|
||||
private static void assertAllTextPositionsHaveSameDir(List<TextPositionSequence> textPositionSequences) {
|
||||
|
||||
assert textPositionSequences.stream().map(TextPositionSequence::getDir).allMatch(a -> a.equals(textPositionSequences.get(0).getDir()));
|
||||
assert textPositionSequences.stream()
|
||||
.map(TextPositionSequence::getDir)
|
||||
.allMatch(a -> a.equals(textPositionSequences.get(0).getDir()));
|
||||
}
|
||||
|
||||
|
||||
private static double getAvgTextPositionHeight(List<TextPositionSequence> textPositionSequences) {
|
||||
|
||||
return textPositionSequences.stream().mapToDouble(TextPositionSequence::getHeight).average().orElseThrow();
|
||||
return textPositionSequences.stream()
|
||||
.mapToDouble(TextPositionSequence::getHeight).average().orElseThrow();
|
||||
}
|
||||
|
||||
|
||||
@ -142,9 +147,9 @@ public class GapDetectionService {
|
||||
public void addGapToRightEdgeOfMainBody(Rectangle2D textPosition) {
|
||||
|
||||
Rectangle2D leftGap = new Rectangle2D.Double(textPosition.getMaxX(),
|
||||
textPosition.getMinY(),
|
||||
mainBodyTextFrame.getMaxX() - textPosition.getMaxX(),
|
||||
textPosition.getHeight());
|
||||
textPosition.getMinY(),
|
||||
mainBodyTextFrame.getMaxX() - textPosition.getMaxX(),
|
||||
textPosition.getHeight());
|
||||
gapsInCurrentLine.add(leftGap);
|
||||
}
|
||||
|
||||
@ -152,9 +157,9 @@ public class GapDetectionService {
|
||||
public void addGapFromLeftEdgeOfMainBody(Rectangle2D textPosition) {
|
||||
|
||||
Rectangle2D leftGap = new Rectangle2D.Double(mainBodyTextFrame.getMinX(),
|
||||
textPosition.getMinY(),
|
||||
textPosition.getMinX() - mainBodyTextFrame.getMinX(),
|
||||
textPosition.getHeight());
|
||||
textPosition.getMinY(),
|
||||
textPosition.getMinX() - mainBodyTextFrame.getMinX(),
|
||||
textPosition.getHeight());
|
||||
gapsInCurrentLine.add(leftGap);
|
||||
}
|
||||
|
||||
|
||||
@ -45,7 +45,7 @@ public class RulingCleaningService {
|
||||
verticalAndHorizontalRulingLines.horizontalLines.sort(X_FIRST_RULING_COMPARATOR);
|
||||
verticalAndHorizontalRulingLines = cleanRulings(verticalAndHorizontalRulingLines);
|
||||
|
||||
return CleanRulings.builder().verticals(verticalAndHorizontalRulingLines.verticalLines()).horizontals(verticalAndHorizontalRulingLines.horizontalLines()).build();
|
||||
return new CleanRulings(verticalAndHorizontalRulingLines.horizontalLines(), verticalAndHorizontalRulingLines.verticalLines());
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -0,0 +1,99 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class TextRulingsClassifier {
|
||||
|
||||
private final static double STRIKETHROUGH_ZONE = 0.5; // multiplied with text height, determines height of intersection interval for strikethrough lines
|
||||
private final static double UNDERLINE_ZONE = 0.2; // multiplied with text height, determines height of intersection interval of underline lines
|
||||
private final static double TEXT_BBOX_THRESHOLD_FACTOR = 0.15; // multiplied with text width, subtracted from word width
|
||||
|
||||
|
||||
public static void classifyUnderlinedAndStrikethroughText(List<TextPositionSequence> words, CleanRulings cleanRulings) {
|
||||
|
||||
for (TextPositionSequence word : words) {
|
||||
if (word.getDir().equals(TextDirection.ZERO) || word.getDir().equals(TextDirection.HALF_CIRCLE)) {
|
||||
handleHorizontalText(cleanRulings, word);
|
||||
} else {
|
||||
handleVerticalText(cleanRulings, word);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private static void handleVerticalText(CleanRulings cleanRulings, TextPositionSequence word) {
|
||||
|
||||
float lowerY = (float) (word.getBoundingBox().getMinY() + TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth());
|
||||
float upperY = (float) (word.getBoundingBox().getMaxY() - TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth());
|
||||
|
||||
float strikethroughCenterX = (float) word.getBoundingBox().getCenterX();
|
||||
float strikethroughBoxHeight = (float) ((word.getHeight() * STRIKETHROUGH_ZONE) / 2);
|
||||
|
||||
float underlineCenterX = (float) (word.getDir().equals(TextDirection.QUARTER_CIRCLE) ? word.getBoundingBox().getMinX() : word.getBoundingBox().getMaxX());
|
||||
float underlineBoxHeight = (float) ((word.getHeight() * UNDERLINE_ZONE) / 2);
|
||||
|
||||
float leftX = Math.min(underlineCenterX - underlineBoxHeight, strikethroughCenterX - strikethroughBoxHeight);
|
||||
float rightX = Math.max(underlineCenterX + underlineBoxHeight, strikethroughCenterX + strikethroughBoxHeight);
|
||||
|
||||
List<Ruling> rulingsIntersectingWord = cleanRulings.getVerticalsInXInterval(leftX, rightX)
|
||||
.stream()
|
||||
.filter(ruling -> ruling.getClassification().equals(Ruling.Classification.OTHER))
|
||||
.filter(ruling -> ruling.y1 <= lowerY && upperY <= ruling.y2)
|
||||
.toList();
|
||||
|
||||
for (Ruling ruling : rulingsIntersectingWord) {
|
||||
if (strikethroughCenterX - strikethroughBoxHeight < ruling.x1 && ruling.x1 < strikethroughCenterX + strikethroughBoxHeight) {
|
||||
ruling.setClassification(Ruling.Classification.STRIKETROUGH);
|
||||
word.setStrikethrough(true);
|
||||
}
|
||||
|
||||
if (underlineCenterX - underlineBoxHeight < ruling.x1 && ruling.x1 < underlineCenterX + underlineBoxHeight) {
|
||||
ruling.setClassification(Ruling.Classification.UNDERLINE);
|
||||
word.setUnderline(true);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private static void handleHorizontalText(CleanRulings cleanRulings, TextPositionSequence word) {
|
||||
|
||||
float leftX = (float) (word.getBoundingBox().getMinX() + TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth());
|
||||
float rightX = (float) (word.getBoundingBox().getMaxX() - TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth());
|
||||
|
||||
float strikethroughCenterY = (float) word.getBoundingBox().getCenterY();
|
||||
float strikethroughBoxHeight = (float) ((word.getHeight() * STRIKETHROUGH_ZONE) / 2);
|
||||
|
||||
float underlineCenterY = (float) (word.getDir().equals(TextDirection.ZERO) ? word.getBoundingBox().getMinY() : word.getBoundingBox().getMaxY());
|
||||
float underlineBoxHeight = (float) ((word.getHeight() * UNDERLINE_ZONE) / 2);
|
||||
|
||||
float lowerY = Math.min(underlineCenterY - underlineBoxHeight, strikethroughCenterY - strikethroughBoxHeight);
|
||||
float upperY = Math.max(underlineCenterY + underlineBoxHeight, strikethroughCenterY + strikethroughBoxHeight);
|
||||
|
||||
List<Ruling> rulingsIntersectingWord = cleanRulings.getHorizontalsInYInterval(lowerY, upperY)
|
||||
.stream()
|
||||
.filter(ruling -> ruling.getClassification().equals(Ruling.Classification.OTHER))
|
||||
.filter(ruling -> ruling.x1 <= leftX && rightX <= ruling.x2)
|
||||
.toList();
|
||||
|
||||
for (Ruling ruling : rulingsIntersectingWord) {
|
||||
if (strikethroughCenterY - strikethroughBoxHeight < ruling.y1 && ruling.y1 < strikethroughCenterY + strikethroughBoxHeight) {
|
||||
ruling.setClassification(Ruling.Classification.STRIKETROUGH);
|
||||
word.setStrikethrough(true);
|
||||
}
|
||||
|
||||
if (underlineCenterY - underlineBoxHeight < ruling.y1 && ruling.y1 < underlineCenterY + underlineBoxHeight) {
|
||||
ruling.setClassification(Ruling.Classification.UNDERLINE);
|
||||
word.setUnderline(true);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@ -38,7 +38,7 @@ public class DocstrumBlockificationService {
|
||||
|
||||
public ClassificationPage blockify(List<TextPositionSequence> textPositions, CleanRulings rulings, boolean xyOrder, LayoutparsingVisualizations visualizations) {
|
||||
|
||||
CleanRulings usedRulings = rulings.getTableLines();
|
||||
CleanRulings usedRulings = rulings.withoutTextRulings();
|
||||
|
||||
var zones = docstrumSegmentationService.segmentPage(textPositions, xyOrder, usedRulings, visualizations);
|
||||
|
||||
|
||||
@ -15,11 +15,10 @@ import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBl
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.Orientation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil;
|
||||
|
||||
@Service
|
||||
public class DocuMineBlockificationService {
|
||||
@ -34,16 +33,17 @@ public class DocuMineBlockificationService {
|
||||
* This method must use text direction adjusted positions (DirAdj), where {0,0} is on the upper left. Never try to change this!
|
||||
* Rulings (Table lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling.
|
||||
*
|
||||
* @param textPositions The textPositions of a page.
|
||||
* @param horizontalRulingLines Horizontal table lines.
|
||||
* @param verticalRulingLines Vertical table lines.
|
||||
* @param textPositions The textPositions of a page.
|
||||
* @param cleanRulings All rulings on a page
|
||||
* @return Page object that contains the Textblock and text statistics.
|
||||
*/
|
||||
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
|
||||
public ClassificationPage blockify(List<TextPositionSequence> textPositions, CleanRulings cleanRulings) {
|
||||
|
||||
List<TextPositionSequence> chunkWords = new ArrayList<>();
|
||||
List<AbstractPageBlock> chunkBlockList1 = new ArrayList<>();
|
||||
|
||||
CleanRulings usedRulings = cleanRulings.withoutTextRulings();
|
||||
|
||||
float minX = 1000;
|
||||
float maxX = 0;
|
||||
float minY = 1000;
|
||||
@ -59,12 +59,15 @@ public class DocuMineBlockificationService {
|
||||
boolean splitByX = prev != null && maxX + 50 < word.getMinXDirAdj() && prev.getMinYDirAdj() == word.getMinYDirAdj();
|
||||
boolean negativeXGap = prev != null && word.getMinXDirAdj() - minX < -5;
|
||||
boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj();
|
||||
boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, horizontalRulingLines, verticalRulingLines);
|
||||
boolean isSplitByRuling = prev != null && usedRulings.lineBetween(prev.getBoundingBox(), word.getBoundingBox());
|
||||
boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir());
|
||||
boolean splitByOtherFontAndOtherY = prev != null && Math.abs(prev.getMaxYDirAdj() - word.getMaxYDirAdj()) > word.getTextHeight() * 0.2 && (word.getFontStyle()
|
||||
.contains("bold") && !prev.getFontStyle().contains("bold") || prev.getFontStyle().contains("bold") && !word.getFontStyle().contains("bold"));
|
||||
.contains("bold")
|
||||
&& !prev.getFontStyle()
|
||||
.contains("bold") || prev.getFontStyle().contains("bold") && !word.getFontStyle().contains("bold"));
|
||||
|
||||
Matcher matcher = pattern.matcher(chunkWords.stream().collect(Collectors.joining(" ")).toString());
|
||||
Matcher matcher = pattern.matcher(chunkWords.stream()
|
||||
.collect(Collectors.joining(" ")).toString());
|
||||
boolean startsOnSameX = Math.abs(minX - word.getMinXDirAdj()) < 5 && matcher.matches();
|
||||
|
||||
if (prev != null && (lineSeparation || startFromTop || splitByDir || isSplitByRuling || splitByOtherFontAndOtherY || negativeXGap || startsOnSameX)) {
|
||||
@ -86,7 +89,11 @@ public class DocuMineBlockificationService {
|
||||
wasSplitted = false;
|
||||
cb1.setOrientation(Orientation.RIGHT);
|
||||
splitX1 = null;
|
||||
} else if (prevOrientation != null && prevOrientation.equals(Orientation.RIGHT) && (lineSeparation || !startFromTop || !splitByX || !newLineAfterSplit || !isSplitByRuling)) {
|
||||
} else if (prevOrientation != null && prevOrientation.equals(Orientation.RIGHT) && (lineSeparation
|
||||
|| !startFromTop
|
||||
|| !splitByX
|
||||
|| !newLineAfterSplit
|
||||
|| !isSplitByRuling)) {
|
||||
cb1.setOrientation(Orientation.LEFT);
|
||||
}
|
||||
|
||||
@ -149,11 +156,11 @@ public class DocuMineBlockificationService {
|
||||
|
||||
if (textBlock == null) {
|
||||
textBlock = new TextPageBlock(wordBlock.getMinXDirAdj(),
|
||||
wordBlock.getMaxXDirAdj(),
|
||||
wordBlock.getMinYDirAdj(),
|
||||
wordBlock.getMaxYDirAdj(),
|
||||
wordBlockList,
|
||||
wordBlock.getRotation());
|
||||
wordBlock.getMaxXDirAdj(),
|
||||
wordBlock.getMinYDirAdj(),
|
||||
wordBlock.getMaxYDirAdj(),
|
||||
wordBlockList,
|
||||
wordBlock.getRotation());
|
||||
} else {
|
||||
TextPageBlock spatialEntity = textBlock.union(wordBlock);
|
||||
textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight());
|
||||
@ -169,68 +176,17 @@ public class DocuMineBlockificationService {
|
||||
textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest());
|
||||
}
|
||||
|
||||
if (textBlock != null && textBlock.getSequences() != null && textBlock.getSequences().stream().map(t -> round(t.getMinYDirAdj(), 3)).collect(toSet()).size() == 1) {
|
||||
if (textBlock != null
|
||||
&& textBlock.getSequences() != null
|
||||
&& textBlock.getSequences()
|
||||
.stream()
|
||||
.map(t -> round(t.getMinYDirAdj(), 3))
|
||||
.collect(toSet()).size() == 1) {
|
||||
textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj));
|
||||
}
|
||||
return textBlock;
|
||||
}
|
||||
|
||||
|
||||
private boolean isSplitByRuling(float minX,
|
||||
float minY,
|
||||
float maxX,
|
||||
float maxY,
|
||||
TextPositionSequence word,
|
||||
List<Ruling> horizontalRulingLines,
|
||||
List<Ruling> verticalRulingLines) {
|
||||
|
||||
return isSplitByRuling(maxX,
|
||||
minY,
|
||||
word.getMinXDirAdj(),
|
||||
word.getMinYDirAdj(),
|
||||
verticalRulingLines,
|
||||
word.getDir().getDegrees(),
|
||||
word.getPageWidth(),
|
||||
word.getPageHeight()) //
|
||||
|| isSplitByRuling(minX,
|
||||
minY,
|
||||
word.getMinXDirAdj(),
|
||||
word.getMaxYDirAdj(),
|
||||
horizontalRulingLines,
|
||||
word.getDir().getDegrees(),
|
||||
word.getPageWidth(),
|
||||
word.getPageHeight()) //
|
||||
|| isSplitByRuling(maxX,
|
||||
minY,
|
||||
word.getMinXDirAdj(),
|
||||
word.getMinYDirAdj(),
|
||||
horizontalRulingLines,
|
||||
word.getDir().getDegrees(),
|
||||
word.getPageWidth(),
|
||||
word.getPageHeight()) //
|
||||
|| isSplitByRuling(minX,
|
||||
minY,
|
||||
word.getMinXDirAdj(),
|
||||
word.getMaxYDirAdj(),
|
||||
verticalRulingLines,
|
||||
word.getDir().getDegrees(),
|
||||
word.getPageWidth(),
|
||||
word.getPageHeight()); //
|
||||
}
|
||||
|
||||
|
||||
private boolean isSplitByRuling(float previousX2, float previousY1, float currentX1, float currentY1, List<Ruling> rulingLines, float dir, float pageWidth, float pageHeight) {
|
||||
|
||||
for (Ruling ruling : rulingLines) {
|
||||
var line = RulingTextDirAdjustUtil.convertToDirAdj(ruling, dir, pageWidth, pageHeight);
|
||||
if (line.intersectsLine(previousX2, previousY1, currentX1, currentY1)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
private double round(float value, int decimalPoints) {
|
||||
|
||||
var d = Math.pow(10, decimalPoints);
|
||||
|
||||
@ -13,14 +13,10 @@ import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBl
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.Orientation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations;
|
||||
|
||||
@SuppressWarnings("all")
|
||||
@ -39,9 +35,9 @@ public class RedactManagerBlockificationService {
|
||||
* @param visualizations
|
||||
* @return Page object that contains the Textblock and text statistics.
|
||||
*/
|
||||
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Cell> cells, LayoutparsingVisualizations visualizations) {
|
||||
public ClassificationPage blockify(List<TextPositionSequence> textPositions, CleanRulings cleanRulings, LayoutparsingVisualizations visualizations) {
|
||||
|
||||
CleanRulings usedRulings = RectangleTransformations.extractRulings(cells);
|
||||
CleanRulings usedRulings = cleanRulings.withoutTextRulings();
|
||||
|
||||
int indexOnPage = 0;
|
||||
List<TextPositionSequence> chunkWords = new ArrayList<>();
|
||||
@ -59,7 +55,7 @@ public class RedactManagerBlockificationService {
|
||||
boolean splitByX = prev != null && maxX + 50 < word.getMinXDirAdj() && prev.getMinYDirAdj() == word.getMinYDirAdj();
|
||||
boolean xIsBeforeFirstX = prev != null && word.getMinXDirAdj() < minX;
|
||||
boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj();
|
||||
boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, usedRulings.getHorizontals(), usedRulings.getVerticals());
|
||||
boolean isSplitByRuling = prev != null && usedRulings.lineBetween(prev.getBoundingBox(), word.getBoundingBox());
|
||||
boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir());
|
||||
|
||||
if (prev != null && (lineSeparation || startFromTop || splitByX || splitByDir || isSplitByRuling)) {
|
||||
@ -83,7 +79,11 @@ public class RedactManagerBlockificationService {
|
||||
wasSplitted = false;
|
||||
cb1.setOrientation(Orientation.RIGHT);
|
||||
splitX1 = null;
|
||||
} else if (prevOrientation != null && prevOrientation.equals(Orientation.RIGHT) && (lineSeparation || !startFromTop || !splitByX || !newLineAfterSplit || !isSplitByRuling)) {
|
||||
} else if (prevOrientation != null && prevOrientation.equals(Orientation.RIGHT) && (lineSeparation
|
||||
|| !startFromTop
|
||||
|| !splitByX
|
||||
|| !newLineAfterSplit
|
||||
|| !isSplitByRuling)) {
|
||||
cb1.setOrientation(Orientation.LEFT);
|
||||
}
|
||||
|
||||
@ -152,8 +152,11 @@ public class RedactManagerBlockificationService {
|
||||
TextPageBlock block = (TextPageBlock) itty.next();
|
||||
|
||||
if (previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation().equals(Orientation.LEFT) && equalsWithThreshold(block.getMaxY(),
|
||||
previous.getMaxY()) || previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation()
|
||||
.equals(Orientation.RIGHT) && equalsWithThreshold(block.getMaxY(), previous.getMaxY())) {
|
||||
previous.getMaxY())
|
||||
|| previous != null
|
||||
&& previous.getOrientation().equals(Orientation.LEFT)
|
||||
&& block.getOrientation().equals(Orientation.RIGHT)
|
||||
&& equalsWithThreshold(block.getMaxY(), previous.getMaxY())) {
|
||||
previous.add(block);
|
||||
itty.remove();
|
||||
continue;
|
||||
@ -162,7 +165,9 @@ public class RedactManagerBlockificationService {
|
||||
previous = block;
|
||||
}
|
||||
|
||||
visualizations.addTextBlockVisualizations(chunkBlockList.stream().map(tb -> (TextPageBlock) tb).toList(), textPositions.get(0).getPage());
|
||||
visualizations.addTextBlockVisualizations(chunkBlockList.stream()
|
||||
.map(tb -> (TextPageBlock) tb)
|
||||
.toList(), textPositions.get(0).getPage());
|
||||
|
||||
return new ClassificationPage(chunkBlockList);
|
||||
}
|
||||
@ -194,11 +199,11 @@ public class RedactManagerBlockificationService {
|
||||
|
||||
if (textBlock == null) {
|
||||
textBlock = new TextPageBlock(wordBlock.getMinXDirAdj(),
|
||||
wordBlock.getMaxXDirAdj(),
|
||||
wordBlock.getMinYDirAdj(),
|
||||
wordBlock.getMaxYDirAdj(),
|
||||
wordBlockList,
|
||||
wordBlock.getRotation());
|
||||
wordBlock.getMaxXDirAdj(),
|
||||
wordBlock.getMinYDirAdj(),
|
||||
wordBlock.getMaxYDirAdj(),
|
||||
wordBlockList,
|
||||
wordBlock.getRotation());
|
||||
} else {
|
||||
TextPageBlock spatialEntity = textBlock.union(wordBlock);
|
||||
textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight());
|
||||
@ -214,68 +219,18 @@ public class RedactManagerBlockificationService {
|
||||
textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest());
|
||||
}
|
||||
|
||||
if (textBlock != null && textBlock.getSequences() != null && textBlock.getSequences().stream().map(t -> round(t.getMinYDirAdj(), 3)).collect(toSet()).size() == 1) {
|
||||
if (textBlock != null
|
||||
&& textBlock.getSequences() != null
|
||||
&& textBlock.getSequences()
|
||||
.stream()
|
||||
.map(t -> round(t.getMinYDirAdj(), 3))
|
||||
.collect(toSet()).size() == 1) {
|
||||
textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj));
|
||||
}
|
||||
return textBlock;
|
||||
}
|
||||
|
||||
|
||||
private boolean isSplitByRuling(float minX,
|
||||
float minY,
|
||||
float maxX,
|
||||
float maxY,
|
||||
TextPositionSequence word,
|
||||
List<Ruling> horizontalRulingLines,
|
||||
List<Ruling> verticalRulingLines) {
|
||||
|
||||
return isSplitByRuling(maxX,
|
||||
minY,
|
||||
word.getMinXDirAdj(),
|
||||
word.getMinYDirAdj(),
|
||||
verticalRulingLines,
|
||||
word.getDir().getDegrees(),
|
||||
word.getPageWidth(),
|
||||
word.getPageHeight()) //
|
||||
|| isSplitByRuling(minX,
|
||||
minY,
|
||||
word.getMinXDirAdj(),
|
||||
word.getMaxYDirAdj(),
|
||||
horizontalRulingLines,
|
||||
word.getDir().getDegrees(),
|
||||
word.getPageWidth(),
|
||||
word.getPageHeight()) //
|
||||
|| isSplitByRuling(maxX,
|
||||
minY,
|
||||
word.getMinXDirAdj(),
|
||||
word.getMinYDirAdj(),
|
||||
horizontalRulingLines,
|
||||
word.getDir().getDegrees(),
|
||||
word.getPageWidth(),
|
||||
word.getPageHeight()) //
|
||||
|| isSplitByRuling(minX,
|
||||
minY,
|
||||
word.getMinXDirAdj(),
|
||||
word.getMaxYDirAdj(),
|
||||
verticalRulingLines,
|
||||
word.getDir().getDegrees(),
|
||||
word.getPageWidth(),
|
||||
word.getPageHeight());
|
||||
}
|
||||
|
||||
|
||||
private boolean isSplitByRuling(float previousX2, float previousY1, float currentX1, float currentY1, List<Ruling> rulingLines, float dir, float pageWidth, float pageHeight) {
|
||||
|
||||
for (Ruling ruling : rulingLines) {
|
||||
var line = RulingTextDirAdjustUtil.convertToDirAdj(ruling, dir, pageWidth, pageHeight);
|
||||
if (line.intersectsLine(previousX2, previousY1, currentX1, currentY1)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
private double round(float value, int decimalPoints) {
|
||||
|
||||
var d = Math.pow(10, decimalPoints);
|
||||
|
||||
@ -80,14 +80,18 @@ public class DocumentGraphFactory {
|
||||
}
|
||||
|
||||
|
||||
public void addParagraphOrHeadline(GenericSemanticNode parentNode, TextPageBlock originalTextBlock, Context context, List<TextPageBlock> textBlocksToMerge) {
|
||||
public void addParagraphOrHeadline(GenericSemanticNode parentNode,
|
||||
TextPageBlock originalTextBlock,
|
||||
Context context,
|
||||
List<TextPageBlock> textBlocksToMerge,
|
||||
LayoutParsingType layoutParsingType) {
|
||||
|
||||
Page page = context.getPage(originalTextBlock.getPage());
|
||||
|
||||
GenericSemanticNode node;
|
||||
if (originalTextBlock.isHeadline()) {
|
||||
node = Headline.builder().documentTree(context.getDocumentTree()).build();
|
||||
} else if (originalTextBlock.isToDuplicate()) {
|
||||
} else if (originalTextBlock.isToDuplicate() && layoutParsingType.equals(LayoutParsingType.REDACT_MANAGER)) {
|
||||
node = DuplicatedParagraph.builder().documentTree(context.getDocumentTree()).build();
|
||||
} else {
|
||||
node = Paragraph.builder().documentTree(context.getDocumentTree()).build();
|
||||
@ -269,8 +273,7 @@ public class DocumentGraphFactory {
|
||||
return pages.keySet()
|
||||
.stream()
|
||||
.filter(page -> page.getNumber() == pageIndex)
|
||||
.findFirst()
|
||||
.orElseThrow(() -> new NoSuchElementException(format("ClassificationPage with number %d not found", pageIndex)));
|
||||
.findFirst().orElseThrow(() -> new NoSuchElementException(format("ClassificationPage with number %d not found", pageIndex)));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -49,8 +49,7 @@ public class SectionNodeFactory {
|
||||
|
||||
Map<Integer, List<AbstractPageBlock>> blocksPerPage = pageBlocks.stream()
|
||||
.collect(groupingBy(AbstractPageBlock::getPage));
|
||||
Section section = Section.builder().documentTree(context.getDocumentTree())
|
||||
.build();
|
||||
Section section = Section.builder().documentTree(context.getDocumentTree()).build();
|
||||
|
||||
context.getSections().add(section);
|
||||
blocksPerPage.keySet()
|
||||
@ -121,12 +120,12 @@ public class SectionNodeFactory {
|
||||
case REDACT_MANAGER, CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> {
|
||||
alreadyMerged.add(abstractPageBlock);
|
||||
remainingBlocks.remove(abstractPageBlock);
|
||||
DocumentGraphFactory.addParagraphOrHeadline(section, (TextPageBlock) abstractPageBlock, context, new ArrayList<>());
|
||||
DocumentGraphFactory.addParagraphOrHeadline(section, (TextPageBlock) abstractPageBlock, context, new ArrayList<>(), layoutParsingType);
|
||||
}
|
||||
default -> {
|
||||
List<TextPageBlock> textBlocks = findTextBlocksWithSameClassificationAndAlignsY((TextPageBlock) abstractPageBlock, remainingBlocks);
|
||||
alreadyMerged.addAll(textBlocks);
|
||||
DocumentGraphFactory.addParagraphOrHeadline(section, (TextPageBlock) abstractPageBlock, context, textBlocks);
|
||||
DocumentGraphFactory.addParagraphOrHeadline(section, (TextPageBlock) abstractPageBlock, context, textBlocks, layoutParsingType);
|
||||
}
|
||||
}
|
||||
} else if (abstractPageBlock instanceof TablePageBlock tablePageBlock) {
|
||||
|
||||
@ -159,7 +159,7 @@ public class TableNodeFactory {
|
||||
tableCell.setLeafTextBlock(textBlock);
|
||||
} else {
|
||||
cell.getTextBlocks()
|
||||
.forEach(tb -> DocumentGraphFactory.addParagraphOrHeadline(tableCell, tb, context, emptyList()));
|
||||
.forEach(tb -> DocumentGraphFactory.addParagraphOrHeadline(tableCell, tb, context, emptyList(), layoutParsingType));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -1,7 +1,6 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.graphics;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
@ -9,8 +8,8 @@ import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.factory.SearchTextWithTextPositionFactory;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
@ -32,33 +31,32 @@ public class GraphicExtractorService {
|
||||
int pageNumber,
|
||||
CleanRulings cleanRulings,
|
||||
List<TextPositionSequence> textPositionSequences,
|
||||
List<Cell> emptyTableCells,
|
||||
boolean graphicsRaster) {
|
||||
|
||||
var characterBBoxes = getCharacterBBoxes(textPositionSequences);
|
||||
var tableLineBBoxes = getLineBBoxesFromTableCells(emptyTableCells);
|
||||
var underLineBBoxes = getUnderlineBBoxes(cleanRulings, characterBBoxes);
|
||||
var strikeThroughBBoxes = getStrikeThroughBBoxes(cleanRulings, characterBBoxes);
|
||||
List<Box> characterBBoxes = getCharacterBBoxes(textPositionSequences);
|
||||
List<Box> classifiedRulingsBoxes = getLineBBoxesOfAllClassifiedRulings(cleanRulings);
|
||||
|
||||
GraphicBBDetector graphicBBDetector = new GraphicBBDetector(pdPage, true);
|
||||
var graphicBBoxes = graphicBBDetector.findGraphicBB();
|
||||
List<Box> graphicBBoxes = graphicBBDetector.findGraphicBB();
|
||||
|
||||
if (graphicsRaster) {
|
||||
// This should only be used if ocr was performed, it is currently in an early stage and needs to be improved.
|
||||
graphicBBoxes.addAll(findGraphicsRaster.findCCBoundingBoxes(pdDocument,
|
||||
characterBBoxes.stream().map(box -> new Rectangle2D.Double(box.x1 - 2, box.y1 - 2, box.width() + 4, box.height() + 4)).collect(Collectors.toList()),
|
||||
PageInformation.fromPDPage(pageNumber, pdPage)));
|
||||
characterBBoxes.stream()
|
||||
.map(box -> new Rectangle2D.Double(box.x1 - 2, box.y1 - 2, box.width() + 4, box.height() + 4))
|
||||
.collect(Collectors.toList()),
|
||||
PageInformation.fromPDPage(pageNumber, pdPage)));
|
||||
}
|
||||
|
||||
var filteredGraphicBBoxes = graphicBBoxes.stream()
|
||||
.filter(box -> !box.intersectsAny(tableLineBBoxes, 4))
|
||||
.filter(box -> !box.intersectsAny(underLineBBoxes, 4))
|
||||
.filter(box -> !box.intersectsAny(strikeThroughBBoxes, 4))
|
||||
List<Box> filteredGraphicBBoxes = graphicBBoxes.stream()
|
||||
.filter(box -> !box.intersectsAny(classifiedRulingsBoxes, 4))
|
||||
.collect(Collectors.toList());
|
||||
|
||||
var clusters = graphicsClusteringService.getClusters(filteredGraphicBBoxes, 14);
|
||||
List<Box> clusters = graphicsClusteringService.getClusters(filteredGraphicBBoxes, 14);
|
||||
|
||||
return clusters.stream().filter(box -> box.area() > 500 && box.height() > 50 && box.width() > 50).toList();
|
||||
return clusters.stream()
|
||||
.filter(box -> box.area() > 500 && box.height() > 50 && box.width() > 50)
|
||||
.toList();
|
||||
}
|
||||
|
||||
|
||||
@ -74,34 +72,13 @@ public class GraphicExtractorService {
|
||||
}
|
||||
|
||||
|
||||
private List<Box> getLineBBoxesFromTableCells(List<Cell> emptyTableCells) {
|
||||
private List<Box> getLineBBoxesOfAllClassifiedRulings(CleanRulings cleanRulings) {
|
||||
|
||||
List<Box> expandedTableLines = new ArrayList<>();
|
||||
|
||||
emptyTableCells.forEach(cell -> {
|
||||
expandedTableLines.add(new Box(new Rectangle2D.Double(cell.x, cell.y - 1, cell.width, 2)));
|
||||
expandedTableLines.add(new Box(new Rectangle2D.Double(cell.x, cell.y + cell.height - 1, cell.width, 2)));
|
||||
expandedTableLines.add(new Box(new Rectangle2D.Double(cell.x - 1, cell.y, 2, cell.height)));
|
||||
expandedTableLines.add(new Box(new Rectangle2D.Double(cell.x + cell.width - 1, cell.y, 2, cell.height)));
|
||||
});
|
||||
|
||||
return expandedTableLines;
|
||||
}
|
||||
|
||||
|
||||
private List<Box> getUnderlineBBoxes(CleanRulings cleanRulings, List<Box> characterBBoxes) {
|
||||
|
||||
return cleanRulings.getHorizontal()
|
||||
return cleanRulings.buildAll()
|
||||
.stream()
|
||||
.filter(ruling -> !ruling.getClassification().equals(Ruling.Classification.OTHER))
|
||||
.map(h -> new Box(h.x1, h.y1, h.x2, h.y2))
|
||||
.filter(box -> box.intersectsAnyAndOver(characterBBoxes, 6))
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
|
||||
private List<Box> getStrikeThroughBBoxes(CleanRulings cleanRulings, List<Box> characterBBoxes) {
|
||||
|
||||
return cleanRulings.getHorizontal().stream().map(h -> new Box(h.x1, h.y1, h.x2, h.y2)).filter(box -> box.intersectsCenter(characterBBoxes, 2)).collect(Collectors.toList());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -144,8 +144,7 @@ public class RectangleTransformations {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
double splitThreshold = rectangle2DList.stream()
|
||||
.mapToDouble(RectangularShape::getWidth).average()
|
||||
.orElse(5) * 5.0;
|
||||
.mapToDouble(RectangularShape::getWidth).average().orElse(5) * 5.0;
|
||||
|
||||
List<List<Rectangle2D>> rectangleListsWithGaps = new LinkedList<>();
|
||||
List<Rectangle2D> rectangleListWithoutGaps = new LinkedList<>();
|
||||
@ -182,7 +181,7 @@ public class RectangleTransformations {
|
||||
verticalRulings.add(new Ruling(new Point2D.Float(rectangle.x + rectangle.width, rectangle.y),
|
||||
new Point2D.Float(rectangle.x + rectangle.width, rectangle.y + rectangle.height)));
|
||||
});
|
||||
return CleanRulings.builder().verticals(verticalRulings).horizontals(horizontalRulings).build();
|
||||
return new CleanRulings(verticalRulings, horizontalRulings);
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -9,6 +9,7 @@ import java.util.List;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
|
||||
import org.junit.jupiter.api.AfterEach;
|
||||
import org.junit.jupiter.api.Disabled;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.core.io.ClassPathResource;
|
||||
@ -29,21 +30,21 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
|
||||
@Autowired
|
||||
private LayoutParsingPipeline layoutParsingPipeline;
|
||||
|
||||
|
||||
@Disabled
|
||||
@Test
|
||||
public void testLayoutParserEndToEnd() {
|
||||
|
||||
String filePath = "files/SinglePages/VV-931175_Page1.pdf";
|
||||
String filePath = "/home/kschuettler/Dokumente/TestFiles/tables with striketrough text.pdf";
|
||||
|
||||
runForFile(filePath);
|
||||
}
|
||||
|
||||
@Test
|
||||
// @Disabled
|
||||
@Disabled
|
||||
@SneakyThrows
|
||||
public void testLayoutParserEndToEndWithFolder() {
|
||||
|
||||
String folder = "/home/kschuettler/iqser/fforesight/layout-parser/layoutparser/layoutparser-service/layoutparser-service-server/src/test/resources/files/SinglePages";
|
||||
String folder = "/home/kschuettler/Dokumente/TestFiles/large number of prod files";
|
||||
List<Path> pdfFiles = Files.walk(Path.of(folder))
|
||||
.filter(path -> path.getFileName().toString().endsWith(".pdf"))
|
||||
.sorted(Comparator.comparing(Path::getFileName))
|
||||
@ -77,7 +78,7 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
|
||||
Arrays.stream(finishedEvent.message().split("\n"))
|
||||
.forEach(log::info);
|
||||
|
||||
File tmpFile = new File("/tmp/layout-E2E/" + fileName + "_VIEWER.pdf");
|
||||
File tmpFile = new File("/tmp/layoutparserEND2END/" + fileName + "_VIEWER.pdf");
|
||||
assert tmpFile.getParentFile().exists() || tmpFile.getParentFile().mkdirs();
|
||||
|
||||
storageService.downloadTo(TENANT_ID, layoutParsingRequest.viewerDocumentStorageId(), tmpFile);
|
||||
|
||||
@ -1,11 +1,14 @@
|
||||
package com.knecon.fforesight.service.layoutparser.server.model;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertFalse;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.stream.IntStream;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
@ -35,4 +38,81 @@ class CleanRulingsTest {
|
||||
assertTrue(cleanRulings.lineBetween(a, f));
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testSingleLineInRange() {
|
||||
|
||||
List<Ruling> horizontals = List.of(new Ruling(new Point2D.Float(0, 1), new Point2D.Float(100, 1)));
|
||||
List<Ruling> verticals = List.of(new Ruling(new Point2D.Float(1, 0), new Point2D.Float(1, 100)));
|
||||
|
||||
CleanRulings cleanRulings = new CleanRulings(horizontals, verticals);
|
||||
|
||||
assertEquals(0, cleanRulings.getVerticalsInXInterval(-2, -1).size());
|
||||
assertEquals(0, cleanRulings.getVerticalsInXInterval(-2, -5).size());
|
||||
assertEquals(0, cleanRulings.getVerticalsInXInterval(-2, Float.NaN).size());
|
||||
assertEquals(1, cleanRulings.getVerticalsInXInterval(1, 10).size());
|
||||
assertEquals(0, cleanRulings.getVerticalsInXInterval(100, 101).size());
|
||||
assertEquals(verticals.size(), cleanRulings.getVerticalsInXInterval(Float.NEGATIVE_INFINITY, Float.POSITIVE_INFINITY).size());
|
||||
assertEquals(1, cleanRulings.getVerticalsInXInterval(1 - 1e-5f, 1 + 1e-5f).size());
|
||||
assertEquals(0, cleanRulings.getVerticalsInXInterval(1e-5f, 1 - 1e-5f).size());
|
||||
|
||||
assertEquals(0, cleanRulings.getHorizontalsInYInterval(-2, -1).size());
|
||||
assertEquals(1, cleanRulings.getHorizontalsInYInterval(1, 10).size());
|
||||
assertEquals(0, cleanRulings.getHorizontalsInYInterval(100, 1001).size());
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testLinesInRange() {
|
||||
|
||||
List<Ruling> horizontals = IntStream.range(0, 101).boxed()
|
||||
.map(y -> new Ruling(new Point2D.Float(0, y), new Point2D.Float(100, y)))
|
||||
.toList();
|
||||
List<Ruling> verticals = IntStream.range(0, 101).boxed()
|
||||
.map(x -> new Ruling(new Point2D.Float(x, 0), new Point2D.Float(x, 100)))
|
||||
.toList();
|
||||
CleanRulings cleanRulings = new CleanRulings(horizontals, verticals);
|
||||
|
||||
assertEquals(0, cleanRulings.getVerticalsInXInterval(-2, -1).size());
|
||||
assertEquals(0, cleanRulings.getVerticalsInXInterval(-2, -5).size());
|
||||
assertEquals(0, cleanRulings.getVerticalsInXInterval(-2, Float.NaN).size());
|
||||
assertEquals(10, cleanRulings.getVerticalsInXInterval(1, 10).size());
|
||||
assertEquals(1, cleanRulings.getVerticalsInXInterval(100, 101).size());
|
||||
assertEquals(verticals.size(), cleanRulings.getVerticalsInXInterval(Float.NEGATIVE_INFINITY, Float.POSITIVE_INFINITY).size());
|
||||
assertEquals(1, cleanRulings.getVerticalsInXInterval(-1e-5f, 1e-5f).size());
|
||||
assertEquals(1, cleanRulings.getVerticalsInXInterval(0, 0).size());
|
||||
assertEquals(0, cleanRulings.getVerticalsInXInterval(1e-5f, 1 - 1e-5f).size());
|
||||
|
||||
assertEquals(0, cleanRulings.getHorizontalsInYInterval(-2, -1).size());
|
||||
assertEquals(10, cleanRulings.getHorizontalsInYInterval(1, 10).size());
|
||||
assertEquals(1, cleanRulings.getHorizontalsInYInterval(100, 1001).size());
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testLinesInRangePerformance() {
|
||||
|
||||
List<Ruling> horizontals = IntStream.range(0, (int) 1e6).boxed()
|
||||
.map(y -> new Ruling(new Point2D.Float(0, y), new Point2D.Float(100, y)))
|
||||
.toList();
|
||||
CleanRulings cleanRulings = new CleanRulings(horizontals, Collections.emptyList());
|
||||
|
||||
float startY = 29;
|
||||
float endY = 3000;
|
||||
long start = System.currentTimeMillis();
|
||||
var result = cleanRulings.getHorizontalsInYInterval(startY, endY);
|
||||
long time = System.currentTimeMillis() - start;
|
||||
|
||||
start = System.currentTimeMillis();
|
||||
var result2 = cleanRulings.getHorizontals()
|
||||
.stream()
|
||||
.filter(ruling -> ruling.getY1() >= startY && ruling.getY1() <= endY)
|
||||
.toList();
|
||||
long time2 = System.currentTimeMillis() - start;
|
||||
|
||||
assertEquals(result, result2);
|
||||
assertTrue(time < time2);
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
Loading…
x
Reference in New Issue
Block a user