RED-8825: general improvements

* classify rulings as underline/strikethrough
* improve performance of CleanRulings.lineBetween
* use lineBetween where possible
* wip, still todo:
 - Header/Footer by Ruling for all rotations
 - actually the ticket, optimizing layoutparsing for documine
This commit is contained in:
Kilian Schuettler 2024-04-29 17:15:19 +02:00
parent e4663ac8db
commit 1916e626df
18 changed files with 471 additions and 249 deletions

View File

@ -45,6 +45,7 @@ import com.knecon.fforesight.service.layoutparser.processor.services.RulingClean
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
import com.knecon.fforesight.service.layoutparser.processor.services.SimplifiedSectionTextService;
import com.knecon.fforesight.service.layoutparser.processor.services.TableExtractionService;
import com.knecon.fforesight.service.layoutparser.processor.services.TextRulingsClassifier;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocstrumBlockificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocuMineBlockificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.RedactManagerBlockificationService;
@ -263,26 +264,21 @@ public class LayoutParsingPipeline {
CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(pdfTableCells.get(pageNumber), stripper.getRulings());
List<Cell> emptyTableCells = TableExtractionService.findCells(cleanRulings.getHorizontals(), cleanRulings.getVerticals());
classificationDocument.getVisualizations().addCellVisualizations(emptyTableCells, pageNumber);
var graphics = graphicExtractorService.extractPathElementGraphics(originDocument,
pdPage,
pageNumber,
cleanRulings,
stripper.getTextPositionSequences(),
emptyTableCells,
false);
TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(words, cleanRulings);
var graphics = graphicExtractorService.extractPathElementGraphics(originDocument, pdPage, pageNumber, cleanRulings, stripper.getTextPositionSequences(), false);
pdfImages.computeIfAbsent(pageNumber, x -> new ArrayList<>())
.addAll(graphics.stream()
.map(g -> new ClassifiedImage(new Rectangle2D.Double(g.x1, g.y1, g.width(), g.height()), ImageType.GRAPHIC, false, stripper.getPageNumber()))
.toList());
classificationDocument.getVisualizations().addCellVisualizations(emptyTableCells, pageNumber);
.map(g -> new ClassifiedImage(new Rectangle2D.Double(g.x1, g.y1, g.width(), g.height()), ImageType.GRAPHIC, false, stripper.getPageNumber()))
.toList());
ClassificationPage classificationPage = switch (layoutParsingType) {
case REDACT_MANAGER_OLD ->
redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, classificationDocument.getVisualizations());
case DOCUMINE_OLD -> docuMineBlockificationService.blockify(words, cleanRulings.getHorizontals(), cleanRulings.getVerticals());
redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings, classificationDocument.getVisualizations());
case DOCUMINE_OLD -> docuMineBlockificationService.blockify(words, cleanRulings);
case DOCUMINE, REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG ->
docstrumBlockificationService.blockify(words, cleanRulings, true, classificationDocument.getVisualizations());
case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> docstrumBlockificationService.blockify(words, cleanRulings, false, classificationDocument.getVisualizations());

View File

@ -68,6 +68,7 @@ public class ZoneBuilderService {
if (rulings.lineBetween(outerLine.getBBox(), innerLine.getBBox())) {
return;
}
unionFind.union(outerLine, innerLine);
});
@ -151,7 +152,9 @@ public class ZoneBuilderService {
outputZone.add(new Line(characters, characterSpacing));
}
return new Zone(outputZone.stream().sorted(Comparator.comparing(Line::getY0)).collect(Collectors.toList()));
return new Zone(outputZone.stream()
.sorted(Comparator.comparing(Line::getY0))
.collect(Collectors.toList()));
}
}

View File

@ -1,25 +1,37 @@
package com.knecon.fforesight.service.layoutparser.processor.model.table;
import java.awt.geom.Line2D;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.stream.Stream;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Character;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.Getter;
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
@Getter
public class CleanRulings {
List<Ruling> horizontals;
List<Ruling> verticals;
List<Ruling> horizontals; // unmodifiable sorted by Y list
List<Ruling> verticals; // unmodifiable sorted by X list
/**
 * Builds the ruling container from separate horizontal and vertical ruling lists.
 * The given lists are not modified; each one is checked for orientation and then stored
 * as an unmodifiable list sorted ascending by y1 (horizontals) or x1 (verticals).
 *
 * @param horizontals rulings that must all pass {@link Ruling#assertHorizontal()}
 * @param verticals   rulings that must all pass {@link Ruling#assertVertical()}
 * @throws IllegalArgumentException if a ruling has the wrong orientation
 */
public CleanRulings(List<Ruling> horizontals, List<Ruling> verticals) {

    // Validate with an explicit traversal: Stream.peek is intended for debugging and the
    // stream implementation is allowed to skip its action, so it must not carry validation.
    horizontals.forEach(Ruling::assertHorizontal);
    this.horizontals = horizontals.stream()
            // comparingDouble yields the same order as comparing(getY1) without boxing each key
            .sorted(Comparator.comparingDouble(Line2D.Float::getY1))
            .toList();

    verticals.forEach(Ruling::assertVertical);
    this.verticals = verticals.stream()
            .sorted(Comparator.comparingDouble(Line2D.Float::getX1))
            .toList();
}
public CleanRulings getTableLines() {
@ -33,6 +45,28 @@ public class CleanRulings {
}
/**
 * Creates a new {@link CleanRulings} that contains every ruling of this instance except
 * those classified as UNDERLINE or STRIKETROUGH.
 *
 * @return a new instance without underline and strikethrough rulings
 */
public CleanRulings withoutTextRulings() {

    List<Ruling> remainingHorizontals = horizontals.stream()
            .filter(ruling -> !ruling.getClassification().equals(Ruling.Classification.UNDERLINE))
            .filter(ruling -> !ruling.getClassification().equals(Ruling.Classification.STRIKETROUGH))
            .toList();

    List<Ruling> remainingVerticals = verticals.stream()
            .filter(ruling -> !ruling.getClassification().equals(Ruling.Classification.UNDERLINE))
            .filter(ruling -> !ruling.getClassification().equals(Ruling.Classification.STRIKETROUGH))
            .toList();

    return new CleanRulings(remainingHorizontals, remainingVerticals);
}
/**
 * Collects all rulings into one new, mutable list: horizontals first, followed by verticals.
 *
 * @return a fresh list holding every horizontal and vertical ruling
 */
public List<Ruling> buildAll() {

    int totalSize = horizontals.size() + verticals.size();
    List<Ruling> combined = new ArrayList<>(totalSize);
    Stream.of(horizontals, verticals)
            .forEach(combined::addAll);
    return combined;
}
public boolean lineBetween(Character a, Character b) {
return lineBetween(a.getTextPosition().getInitialUserSpacePosition(), b.getTextPosition().getInitialUserSpacePosition());
@ -53,28 +87,122 @@ public class CleanRulings {
Ruling ruling = new Ruling(p1, p2);
if (ruling.isHorizontal()) {
return verticals.stream()
return getVerticalsInXInterval(ruling.x1, ruling.x2).stream()
.anyMatch(vertical -> vertical.intersectsLine(ruling));
}
if (ruling.isVertical()) {
return horizontals.stream()
return getHorizontalsInYInterval(ruling.y1, ruling.y2).stream()
.anyMatch(horizontal -> horizontal.intersectsLine(ruling));
}
return buildAll().stream()
return Stream.of(getVerticalsInXInterval(ruling.x1, ruling.x2), getHorizontalsInYInterval(ruling.y1, ruling.y2))
.flatMap(Collection::stream)
.anyMatch(other -> other.intersectsLine(ruling));
}
public List<Ruling> buildAll() {
public List<Ruling> getHorizontalsInYInterval(float y1, float y2) {
ArrayList<Ruling> rulings = new ArrayList<>(horizontals.size() + verticals.size());
rulings.addAll(horizontals);
rulings.addAll(verticals);
return rulings;
float startY = Math.min(y1, y2);
float endY = Math.max(y1, y2);
if (horizontals.isEmpty() || Float.isNaN(startY) || Float.isNaN(endY)) {
return Collections.emptyList();
}
int firstGreaterThanIdx = findFirstHorizontalRulingIdxAbove(startY);
if (firstGreaterThanIdx == -1) {
return Collections.emptyList();
}
List<Ruling> result = new ArrayList<>();
for (int i = firstGreaterThanIdx; i < horizontals.size(); i++) {
Ruling horizontal = horizontals.get(i);
if (horizontal.y1 > endY) {
break;
}
result.add(horizontal);
}
return result;
}
/**
 * Finds the index of the first ruling in {@code horizontals} whose y1 is greater than or
 * equal to {@code y}. Requires {@code horizontals} to be sorted ascending by y1.
 * <p>
 * This is a lower-bound search: when several rulings share the same y1 (for example
 * several segments on one line), the index of the first of them is returned, so a caller
 * scanning forward from the result sees all of them.
 *
 * @param y inclusive lower bound of the searched y1 values
 * @return index of the first ruling with y1 >= y, or -1 if there is none or y is NaN
 */
private int findFirstHorizontalRulingIdxAbove(float y) {

    if (Float.isNaN(y)) {
        // every comparison with NaN is false; report "not found" instead of a bogus index
        return -1;
    }

    int low = 0;
    int high = horizontals.size(); // exclusive upper end of the search window
    while (low < high) {
        int mid = low + (high - low) / 2;
        if (horizontals.get(mid).y1 < y) {
            low = mid + 1;
        } else {
            // keep mid in the window: an equal or greater element may still have equal predecessors
            high = mid;
        }
    }
    return low < horizontals.size() ? low : -1;
}
/**
 * Returns the vertical rulings whose x1 lies inside the interval spanned by the two
 * given x values. The arguments may be passed in either order.
 *
 * @param x1 one end of the interval
 * @param x2 the other end of the interval
 * @return the matching rulings in the order they are stored in {@code verticals};
 *         an empty list if there are no verticals, a bound is NaN, or nothing matches
 */
public List<Ruling> getVerticalsInXInterval(float x1, float x2) {

    // Math.min/Math.max return NaN if either argument is NaN, so checking the bounds covers both inputs.
    float lowerBound = Math.min(x1, x2);
    float upperBound = Math.max(x1, x2);
    boolean unusableInterval = Float.isNaN(lowerBound) || Float.isNaN(upperBound);
    if (verticals.isEmpty() || unusableInterval) {
        return Collections.emptyList();
    }

    int index = findFirstVerticalRulingIdxRightOf(lowerBound);
    if (index == -1) {
        return Collections.emptyList();
    }

    List<Ruling> matches = new ArrayList<>();
    // walk forward from the start index until a ruling lies beyond the upper bound
    while (index < verticals.size()) {
        Ruling vertical = verticals.get(index);
        if (vertical.x1 > upperBound) {
            break;
        }
        matches.add(vertical);
        index++;
    }
    return matches;
}
/**
 * Finds the index of the first ruling in {@code verticals} whose x1 is greater than or
 * equal to {@code x}. Requires {@code verticals} to be sorted ascending by x1.
 * <p>
 * This is a lower-bound search: when several rulings share the same x1, the index of the
 * first of them is returned, so a caller scanning forward from the result sees all of them.
 *
 * @param x inclusive lower bound of the searched x1 values
 * @return index of the first ruling with x1 >= x, or -1 if there is none or x is NaN
 */
private int findFirstVerticalRulingIdxRightOf(float x) {

    if (Float.isNaN(x)) {
        // every comparison with NaN is false; report "not found" instead of a bogus index
        return -1;
    }

    int low = 0;
    int high = verticals.size(); // exclusive upper end of the search window
    while (low < high) {
        int mid = low + (high - low) / 2;
        if (verticals.get(mid).x1 < x) {
            low = mid + 1;
        } else {
            // keep mid in the window: an equal or greater element may still have equal predecessors
            high = mid;
        }
    }
    return low < verticals.size() ? low : -1;
}
}

View File

@ -72,6 +72,25 @@ public class Ruling extends Line2D.Float {
}
/**
 * Verifies that this ruling is horizontal.
 *
 * @throws IllegalArgumentException if {@link #isHorizontal()} returns false
 */
public void assertHorizontal() {

    if (!isHorizontal()) {
        throw new IllegalArgumentException("Ruling " + this + " is not horizontal");
    }
}
/**
 * Verifies that this ruling is vertical.
 *
 * @throws IllegalArgumentException if {@link #isVertical()} returns false
 */
public void assertVertical() {

    if (!isVertical()) {
        throw new IllegalArgumentException("Ruling " + this + " is not vertical");
    }
}
public boolean isVertical() {
return this.length() > 0 && DoubleComparisons.feq(this.x1, this.x2); //diff < ORIENTATION_CHECK_THRESHOLD;

View File

@ -44,6 +44,8 @@ public class TextPositionSequence implements CharSequence {
private float pageHeight;
private float pageWidth;
private boolean isParagraphStart;
private boolean strikethrough;
private boolean underline;
public TextPositionSequence(List<TextPosition> textPositions, int pageNumber, boolean isParagraphStart) {

View File

@ -44,9 +44,9 @@ public class GapDetectionService {
if (yDifference > avgTextPositionHeight * Y_GAP_FACTOR) {
yGapContext.addGap(mainBodyTextFrame.getMinX(),
previousTextPositionBBox.getMaxY(),
mainBodyTextFrame.getWidth(),
-(previousTextPositionBBox.getMaxY() - currentTextPositionBBox.getMinY()));
previousTextPositionBBox.getMaxY(),
mainBodyTextFrame.getWidth(),
-(previousTextPositionBBox.getMaxY() - currentTextPositionBBox.getMinY()));
}
if (yDifference > avgTextPositionHeight * NEW_LINE_FACTOR) {
@ -72,29 +72,34 @@ public class GapDetectionService {
return mirrorY(RectangleTransformations.toRectangle2D(textPosition.getRectangle()));
}
/**
 * Returns a new rectangle with the same x and width as the input, whose y is the smaller
 * of the input's min/max y and whose height is the absolute value of the input's height.
 *
 * @param rectangle2D the rectangle to convert; it is not modified
 * @return a new {@link Rectangle2D.Double} with a non-negative height
 */
private static Rectangle2D mirrorY(Rectangle2D rectangle2D) {

    double smallerY = Math.min(rectangle2D.getMinY(), rectangle2D.getMaxY());
    double positiveHeight = Math.abs(rectangle2D.getHeight());
    return new Rectangle2D.Double(rectangle2D.getX(), smallerY, rectangle2D.getWidth(), positiveHeight);
}
private static void addGapToLine(Rectangle2D currentTextPosition, Rectangle2D previousTextPosition, XGapsContext context) {
context.gapsInCurrentLine.add(new Rectangle2D.Double(previousTextPosition.getMaxX(),
previousTextPosition.getMinY(),
currentTextPosition.getMinX() - previousTextPosition.getMaxX(),
(previousTextPosition.getHeight() + currentTextPosition.getHeight()) / 2));
previousTextPosition.getMinY(),
currentTextPosition.getMinX() - previousTextPosition.getMaxX(),
(previousTextPosition.getHeight() + currentTextPosition.getHeight()) / 2));
}
private static void assertAllTextPositionsHaveSameDir(List<TextPositionSequence> textPositionSequences) {
assert textPositionSequences.stream().map(TextPositionSequence::getDir).allMatch(a -> a.equals(textPositionSequences.get(0).getDir()));
assert textPositionSequences.stream()
.map(TextPositionSequence::getDir)
.allMatch(a -> a.equals(textPositionSequences.get(0).getDir()));
}
private static double getAvgTextPositionHeight(List<TextPositionSequence> textPositionSequences) {
return textPositionSequences.stream().mapToDouble(TextPositionSequence::getHeight).average().orElseThrow();
return textPositionSequences.stream()
.mapToDouble(TextPositionSequence::getHeight).average().orElseThrow();
}
@ -142,9 +147,9 @@ public class GapDetectionService {
public void addGapToRightEdgeOfMainBody(Rectangle2D textPosition) {
Rectangle2D leftGap = new Rectangle2D.Double(textPosition.getMaxX(),
textPosition.getMinY(),
mainBodyTextFrame.getMaxX() - textPosition.getMaxX(),
textPosition.getHeight());
textPosition.getMinY(),
mainBodyTextFrame.getMaxX() - textPosition.getMaxX(),
textPosition.getHeight());
gapsInCurrentLine.add(leftGap);
}
@ -152,9 +157,9 @@ public class GapDetectionService {
public void addGapFromLeftEdgeOfMainBody(Rectangle2D textPosition) {
Rectangle2D leftGap = new Rectangle2D.Double(mainBodyTextFrame.getMinX(),
textPosition.getMinY(),
textPosition.getMinX() - mainBodyTextFrame.getMinX(),
textPosition.getHeight());
textPosition.getMinY(),
textPosition.getMinX() - mainBodyTextFrame.getMinX(),
textPosition.getHeight());
gapsInCurrentLine.add(leftGap);
}

View File

@ -45,7 +45,7 @@ public class RulingCleaningService {
verticalAndHorizontalRulingLines.horizontalLines.sort(X_FIRST_RULING_COMPARATOR);
verticalAndHorizontalRulingLines = cleanRulings(verticalAndHorizontalRulingLines);
return CleanRulings.builder().verticals(verticalAndHorizontalRulingLines.verticalLines()).horizontals(verticalAndHorizontalRulingLines.horizontalLines()).build();
return new CleanRulings(verticalAndHorizontalRulingLines.horizontalLines(), verticalAndHorizontalRulingLines.verticalLines());
}

View File

@ -0,0 +1,99 @@
package com.knecon.fforesight.service.layoutparser.processor.services;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import lombok.experimental.UtilityClass;
/**
 * Flags words as underlined or struck through and classifies the rulings that decorate them
 * as {@code UNDERLINE} or {@code STRIKETROUGH}, based on where a ruling runs relative to a
 * word's bounding box.
 */
@UtilityClass
public class TextRulingsClassifier {

    private final static double STRIKETHROUGH_ZONE = 0.5; // multiplied with text height, determines height of intersection interval for strikethrough lines
    private final static double UNDERLINE_ZONE = 0.2; // multiplied with text height, determines height of intersection interval of underline lines
    private final static double TEXT_BBOX_THRESHOLD_FACTOR = 0.15; // multiplied with text width, subtracted from word width


    /**
     * Checks every word against the rulings and sets the word's underline/strikethrough flags
     * as well as the classification of the matching rulings.
     *
     * @param words        the words to check; their flags are updated in place
     * @param cleanRulings the rulings to check against; their classification is updated in place
     */
    public static void classifyUnderlinedAndStrikethroughText(List<TextPositionSequence> words, CleanRulings cleanRulings) {

        for (TextPositionSequence word : words) {
            if (word.getDir().equals(TextDirection.ZERO) || word.getDir().equals(TextDirection.HALF_CIRCLE)) {
                handleHorizontalText(cleanRulings, word);
            } else {
                handleVerticalText(cleanRulings, word);
            }
        }
    }


    /**
     * Handles words whose direction is neither ZERO nor HALF_CIRCLE by matching them against
     * vertical rulings.
     */
    private static void handleVerticalText(CleanRulings cleanRulings, TextPositionSequence word) {

        // the ruling has to span the word's y extent, shrunk by a tolerance on both ends
        float lowerY = (float) (word.getBoundingBox().getMinY() + TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth());
        float upperY = (float) (word.getBoundingBox().getMaxY() - TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth());

        float strikethroughCenterX = (float) word.getBoundingBox().getCenterX();
        float strikethroughBoxHeight = (float) ((word.getHeight() * STRIKETHROUGH_ZONE) / 2);

        float underlineCenterX = (float) (word.getDir().equals(TextDirection.QUARTER_CIRCLE) ? word.getBoundingBox().getMinX() : word.getBoundingBox().getMaxX());
        float underlineBoxHeight = (float) ((word.getHeight() * UNDERLINE_ZONE) / 2);

        // one x interval covering both the underline zone and the strikethrough zone
        float leftX = Math.min(underlineCenterX - underlineBoxHeight, strikethroughCenterX - strikethroughBoxHeight);
        float rightX = Math.max(underlineCenterX + underlineBoxHeight, strikethroughCenterX + strikethroughBoxHeight);

        List<Ruling> rulingsIntersectingWord = cleanRulings.getVerticalsInXInterval(leftX, rightX)
                .stream()
                .filter(TextRulingsClassifier::isTextRulingCandidate)
                .filter(ruling -> ruling.y1 <= lowerY && upperY <= ruling.y2)
                .toList();

        for (Ruling ruling : rulingsIntersectingWord) {
            // only a still unclassified ruling is (re)classified; a ruling that was classified by an
            // earlier word keeps that classification but still flags this word
            boolean unclassified = ruling.getClassification().equals(Ruling.Classification.OTHER);
            if (strikethroughCenterX - strikethroughBoxHeight < ruling.x1 && ruling.x1 < strikethroughCenterX + strikethroughBoxHeight) {
                if (unclassified) {
                    ruling.setClassification(Ruling.Classification.STRIKETROUGH);
                }
                word.setStrikethrough(true);
            }
            if (underlineCenterX - underlineBoxHeight < ruling.x1 && ruling.x1 < underlineCenterX + underlineBoxHeight) {
                if (unclassified) {
                    ruling.setClassification(Ruling.Classification.UNDERLINE);
                }
                word.setUnderline(true);
            }
        }
    }


    /**
     * Handles words with direction ZERO or HALF_CIRCLE by matching them against horizontal rulings.
     */
    private static void handleHorizontalText(CleanRulings cleanRulings, TextPositionSequence word) {

        // the ruling has to span the word's x extent, shrunk by a tolerance on both ends
        float leftX = (float) (word.getBoundingBox().getMinX() + TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth());
        float rightX = (float) (word.getBoundingBox().getMaxX() - TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth());

        float strikethroughCenterY = (float) word.getBoundingBox().getCenterY();
        float strikethroughBoxHeight = (float) ((word.getHeight() * STRIKETHROUGH_ZONE) / 2);

        float underlineCenterY = (float) (word.getDir().equals(TextDirection.ZERO) ? word.getBoundingBox().getMinY() : word.getBoundingBox().getMaxY());
        float underlineBoxHeight = (float) ((word.getHeight() * UNDERLINE_ZONE) / 2);

        // one y interval covering both the underline zone and the strikethrough zone
        float lowerY = Math.min(underlineCenterY - underlineBoxHeight, strikethroughCenterY - strikethroughBoxHeight);
        float upperY = Math.max(underlineCenterY + underlineBoxHeight, strikethroughCenterY + strikethroughBoxHeight);

        List<Ruling> rulingsIntersectingWord = cleanRulings.getHorizontalsInYInterval(lowerY, upperY)
                .stream()
                .filter(TextRulingsClassifier::isTextRulingCandidate)
                .filter(ruling -> ruling.x1 <= leftX && rightX <= ruling.x2)
                .toList();

        for (Ruling ruling : rulingsIntersectingWord) {
            // only a still unclassified ruling is (re)classified; a ruling that was classified by an
            // earlier word keeps that classification but still flags this word
            boolean unclassified = ruling.getClassification().equals(Ruling.Classification.OTHER);
            if (strikethroughCenterY - strikethroughBoxHeight < ruling.y1 && ruling.y1 < strikethroughCenterY + strikethroughBoxHeight) {
                if (unclassified) {
                    ruling.setClassification(Ruling.Classification.STRIKETROUGH);
                }
                word.setStrikethrough(true);
            }
            if (underlineCenterY - underlineBoxHeight < ruling.y1 && ruling.y1 < underlineCenterY + underlineBoxHeight) {
                if (unclassified) {
                    ruling.setClassification(Ruling.Classification.UNDERLINE);
                }
                word.setUnderline(true);
            }
        }
    }


    /**
     * A ruling can decorate text if it is still unclassified (OTHER) or was already classified as
     * underline/strikethrough by a previous word. Accepting the latter is what allows a single
     * ruling that runs along several words to flag each of them, not only the first one processed.
     */
    private static boolean isTextRulingCandidate(Ruling ruling) {

        Ruling.Classification classification = ruling.getClassification();
        return classification.equals(Ruling.Classification.OTHER)
                || classification.equals(Ruling.Classification.UNDERLINE)
                || classification.equals(Ruling.Classification.STRIKETROUGH);
    }

}

View File

@ -38,7 +38,7 @@ public class DocstrumBlockificationService {
public ClassificationPage blockify(List<TextPositionSequence> textPositions, CleanRulings rulings, boolean xyOrder, LayoutparsingVisualizations visualizations) {
CleanRulings usedRulings = rulings.getTableLines();
CleanRulings usedRulings = rulings.withoutTextRulings();
var zones = docstrumSegmentationService.segmentPage(textPositions, xyOrder, usedRulings, visualizations);

View File

@ -15,11 +15,10 @@ import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBl
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.model.Orientation;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil;
@Service
public class DocuMineBlockificationService {
@ -34,16 +33,17 @@ public class DocuMineBlockificationService {
* This method must use text direction adjusted postions (DirAdj). Where {0,0} is on the upper left. Never try to change this!
* Rulings (Table lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling.
*
* @param textPositions The textPositions of a page.
* @param horizontalRulingLines Horizontal table lines.
* @param verticalRulingLines Vertical table lines.
* @param textPositions The textPositions of a page.
* @param cleanRulings All rulings on a page
* @return Page object that contains the Textblock and text statistics.
*/
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
public ClassificationPage blockify(List<TextPositionSequence> textPositions, CleanRulings cleanRulings) {
List<TextPositionSequence> chunkWords = new ArrayList<>();
List<AbstractPageBlock> chunkBlockList1 = new ArrayList<>();
CleanRulings usedRulings = cleanRulings.withoutTextRulings();
float minX = 1000;
float maxX = 0;
float minY = 1000;
@ -59,12 +59,15 @@ public class DocuMineBlockificationService {
boolean splitByX = prev != null && maxX + 50 < word.getMinXDirAdj() && prev.getMinYDirAdj() == word.getMinYDirAdj();
boolean negativeXGap = prev != null && word.getMinXDirAdj() - minX < -5;
boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj();
boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, horizontalRulingLines, verticalRulingLines);
boolean isSplitByRuling = prev != null && usedRulings.lineBetween(prev.getBoundingBox(), word.getBoundingBox());
boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir());
boolean splitByOtherFontAndOtherY = prev != null && Math.abs(prev.getMaxYDirAdj() - word.getMaxYDirAdj()) > word.getTextHeight() * 0.2 && (word.getFontStyle()
.contains("bold") && !prev.getFontStyle().contains("bold") || prev.getFontStyle().contains("bold") && !word.getFontStyle().contains("bold"));
.contains("bold")
&& !prev.getFontStyle()
.contains("bold") || prev.getFontStyle().contains("bold") && !word.getFontStyle().contains("bold"));
Matcher matcher = pattern.matcher(chunkWords.stream().collect(Collectors.joining(" ")).toString());
Matcher matcher = pattern.matcher(chunkWords.stream()
.collect(Collectors.joining(" ")).toString());
boolean startsOnSameX = Math.abs(minX - word.getMinXDirAdj()) < 5 && matcher.matches();
if (prev != null && (lineSeparation || startFromTop || splitByDir || isSplitByRuling || splitByOtherFontAndOtherY || negativeXGap || startsOnSameX)) {
@ -86,7 +89,11 @@ public class DocuMineBlockificationService {
wasSplitted = false;
cb1.setOrientation(Orientation.RIGHT);
splitX1 = null;
} else if (prevOrientation != null && prevOrientation.equals(Orientation.RIGHT) && (lineSeparation || !startFromTop || !splitByX || !newLineAfterSplit || !isSplitByRuling)) {
} else if (prevOrientation != null && prevOrientation.equals(Orientation.RIGHT) && (lineSeparation
|| !startFromTop
|| !splitByX
|| !newLineAfterSplit
|| !isSplitByRuling)) {
cb1.setOrientation(Orientation.LEFT);
}
@ -149,11 +156,11 @@ public class DocuMineBlockificationService {
if (textBlock == null) {
textBlock = new TextPageBlock(wordBlock.getMinXDirAdj(),
wordBlock.getMaxXDirAdj(),
wordBlock.getMinYDirAdj(),
wordBlock.getMaxYDirAdj(),
wordBlockList,
wordBlock.getRotation());
wordBlock.getMaxXDirAdj(),
wordBlock.getMinYDirAdj(),
wordBlock.getMaxYDirAdj(),
wordBlockList,
wordBlock.getRotation());
} else {
TextPageBlock spatialEntity = textBlock.union(wordBlock);
textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight());
@ -169,68 +176,17 @@ public class DocuMineBlockificationService {
textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest());
}
if (textBlock != null && textBlock.getSequences() != null && textBlock.getSequences().stream().map(t -> round(t.getMinYDirAdj(), 3)).collect(toSet()).size() == 1) {
if (textBlock != null
&& textBlock.getSequences() != null
&& textBlock.getSequences()
.stream()
.map(t -> round(t.getMinYDirAdj(), 3))
.collect(toSet()).size() == 1) {
textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj));
}
return textBlock;
}
private boolean isSplitByRuling(float minX,
float minY,
float maxX,
float maxY,
TextPositionSequence word,
List<Ruling> horizontalRulingLines,
List<Ruling> verticalRulingLines) {
return isSplitByRuling(maxX,
minY,
word.getMinXDirAdj(),
word.getMinYDirAdj(),
verticalRulingLines,
word.getDir().getDegrees(),
word.getPageWidth(),
word.getPageHeight()) //
|| isSplitByRuling(minX,
minY,
word.getMinXDirAdj(),
word.getMaxYDirAdj(),
horizontalRulingLines,
word.getDir().getDegrees(),
word.getPageWidth(),
word.getPageHeight()) //
|| isSplitByRuling(maxX,
minY,
word.getMinXDirAdj(),
word.getMinYDirAdj(),
horizontalRulingLines,
word.getDir().getDegrees(),
word.getPageWidth(),
word.getPageHeight()) //
|| isSplitByRuling(minX,
minY,
word.getMinXDirAdj(),
word.getMaxYDirAdj(),
verticalRulingLines,
word.getDir().getDegrees(),
word.getPageWidth(),
word.getPageHeight()); //
}
private boolean isSplitByRuling(float previousX2, float previousY1, float currentX1, float currentY1, List<Ruling> rulingLines, float dir, float pageWidth, float pageHeight) {
for (Ruling ruling : rulingLines) {
var line = RulingTextDirAdjustUtil.convertToDirAdj(ruling, dir, pageWidth, pageHeight);
if (line.intersectsLine(previousX2, previousY1, currentX1, currentY1)) {
return true;
}
}
return false;
}
private double round(float value, int decimalPoints) {
var d = Math.pow(10, decimalPoints);

View File

@ -13,14 +13,10 @@ import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBl
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.model.Orientation;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil;
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations;
@SuppressWarnings("all")
@ -39,9 +35,9 @@ public class RedactManagerBlockificationService {
* @param visualizations
* @return Page object that contains the Textblock and text statistics.
*/
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Cell> cells, LayoutparsingVisualizations visualizations) {
public ClassificationPage blockify(List<TextPositionSequence> textPositions, CleanRulings cleanRulings, LayoutparsingVisualizations visualizations) {
CleanRulings usedRulings = RectangleTransformations.extractRulings(cells);
CleanRulings usedRulings = cleanRulings.withoutTextRulings();
int indexOnPage = 0;
List<TextPositionSequence> chunkWords = new ArrayList<>();
@ -59,7 +55,7 @@ public class RedactManagerBlockificationService {
boolean splitByX = prev != null && maxX + 50 < word.getMinXDirAdj() && prev.getMinYDirAdj() == word.getMinYDirAdj();
boolean xIsBeforeFirstX = prev != null && word.getMinXDirAdj() < minX;
boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj();
boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, usedRulings.getHorizontals(), usedRulings.getVerticals());
boolean isSplitByRuling = prev != null && usedRulings.lineBetween(prev.getBoundingBox(), word.getBoundingBox());
boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir());
if (prev != null && (lineSeparation || startFromTop || splitByX || splitByDir || isSplitByRuling)) {
@ -83,7 +79,11 @@ public class RedactManagerBlockificationService {
wasSplitted = false;
cb1.setOrientation(Orientation.RIGHT);
splitX1 = null;
} else if (prevOrientation != null && prevOrientation.equals(Orientation.RIGHT) && (lineSeparation || !startFromTop || !splitByX || !newLineAfterSplit || !isSplitByRuling)) {
} else if (prevOrientation != null && prevOrientation.equals(Orientation.RIGHT) && (lineSeparation
|| !startFromTop
|| !splitByX
|| !newLineAfterSplit
|| !isSplitByRuling)) {
cb1.setOrientation(Orientation.LEFT);
}
@ -152,8 +152,11 @@ public class RedactManagerBlockificationService {
TextPageBlock block = (TextPageBlock) itty.next();
if (previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation().equals(Orientation.LEFT) && equalsWithThreshold(block.getMaxY(),
previous.getMaxY()) || previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation()
.equals(Orientation.RIGHT) && equalsWithThreshold(block.getMaxY(), previous.getMaxY())) {
previous.getMaxY())
|| previous != null
&& previous.getOrientation().equals(Orientation.LEFT)
&& block.getOrientation().equals(Orientation.RIGHT)
&& equalsWithThreshold(block.getMaxY(), previous.getMaxY())) {
previous.add(block);
itty.remove();
continue;
@ -162,7 +165,9 @@ public class RedactManagerBlockificationService {
previous = block;
}
visualizations.addTextBlockVisualizations(chunkBlockList.stream().map(tb -> (TextPageBlock) tb).toList(), textPositions.get(0).getPage());
visualizations.addTextBlockVisualizations(chunkBlockList.stream()
.map(tb -> (TextPageBlock) tb)
.toList(), textPositions.get(0).getPage());
return new ClassificationPage(chunkBlockList);
}
@ -194,11 +199,11 @@ public class RedactManagerBlockificationService {
if (textBlock == null) {
textBlock = new TextPageBlock(wordBlock.getMinXDirAdj(),
wordBlock.getMaxXDirAdj(),
wordBlock.getMinYDirAdj(),
wordBlock.getMaxYDirAdj(),
wordBlockList,
wordBlock.getRotation());
wordBlock.getMaxXDirAdj(),
wordBlock.getMinYDirAdj(),
wordBlock.getMaxYDirAdj(),
wordBlockList,
wordBlock.getRotation());
} else {
TextPageBlock spatialEntity = textBlock.union(wordBlock);
textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight());
@ -214,68 +219,18 @@ public class RedactManagerBlockificationService {
textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest());
}
if (textBlock != null && textBlock.getSequences() != null && textBlock.getSequences().stream().map(t -> round(t.getMinYDirAdj(), 3)).collect(toSet()).size() == 1) {
if (textBlock != null
&& textBlock.getSequences() != null
&& textBlock.getSequences()
.stream()
.map(t -> round(t.getMinYDirAdj(), 3))
.collect(toSet()).size() == 1) {
textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj));
}
return textBlock;
}
private boolean isSplitByRuling(float minX,
float minY,
float maxX,
float maxY,
TextPositionSequence word,
List<Ruling> horizontalRulingLines,
List<Ruling> verticalRulingLines) {
return isSplitByRuling(maxX,
minY,
word.getMinXDirAdj(),
word.getMinYDirAdj(),
verticalRulingLines,
word.getDir().getDegrees(),
word.getPageWidth(),
word.getPageHeight()) //
|| isSplitByRuling(minX,
minY,
word.getMinXDirAdj(),
word.getMaxYDirAdj(),
horizontalRulingLines,
word.getDir().getDegrees(),
word.getPageWidth(),
word.getPageHeight()) //
|| isSplitByRuling(maxX,
minY,
word.getMinXDirAdj(),
word.getMinYDirAdj(),
horizontalRulingLines,
word.getDir().getDegrees(),
word.getPageWidth(),
word.getPageHeight()) //
|| isSplitByRuling(minX,
minY,
word.getMinXDirAdj(),
word.getMaxYDirAdj(),
verticalRulingLines,
word.getDir().getDegrees(),
word.getPageWidth(),
word.getPageHeight());
}
/**
 * Checks whether the segment from (previousX2, previousY1) to (currentX1, currentY1) is intersected by any of the
 * given rulings. Each ruling is first converted via {@code RulingTextDirAdjustUtil.convertToDirAdj} using the
 * supplied direction and page dimensions.
 *
 * @param previousX2  x coordinate of the segment start
 * @param previousY1  y coordinate of the segment start
 * @param currentX1   x coordinate of the segment end
 * @param currentY1   y coordinate of the segment end
 * @param rulingLines rulings to test
 * @param dir         text direction passed on to the conversion
 * @param pageWidth   page width passed on to the conversion
 * @param pageHeight  page height passed on to the conversion
 * @return {@code true} as soon as one converted ruling intersects the segment, {@code false} otherwise
 */
private boolean isSplitByRuling(float previousX2, float previousY1, float currentX1, float currentY1, List<Ruling> rulingLines, float dir, float pageWidth, float pageHeight) {

    // anyMatch short-circuits, so rulings after the first hit are neither converted nor tested
    return rulingLines.stream()
            .map(ruling -> RulingTextDirAdjustUtil.convertToDirAdj(ruling, dir, pageWidth, pageHeight))
            .anyMatch(adjusted -> adjusted.intersectsLine(previousX2, previousY1, currentX1, currentY1));
}
private double round(float value, int decimalPoints) {
var d = Math.pow(10, decimalPoints);

View File

@ -80,14 +80,18 @@ public class DocumentGraphFactory {
}
public void addParagraphOrHeadline(GenericSemanticNode parentNode, TextPageBlock originalTextBlock, Context context, List<TextPageBlock> textBlocksToMerge) {
public void addParagraphOrHeadline(GenericSemanticNode parentNode,
TextPageBlock originalTextBlock,
Context context,
List<TextPageBlock> textBlocksToMerge,
LayoutParsingType layoutParsingType) {
Page page = context.getPage(originalTextBlock.getPage());
GenericSemanticNode node;
if (originalTextBlock.isHeadline()) {
node = Headline.builder().documentTree(context.getDocumentTree()).build();
} else if (originalTextBlock.isToDuplicate()) {
} else if (originalTextBlock.isToDuplicate() && layoutParsingType.equals(LayoutParsingType.REDACT_MANAGER)) {
node = DuplicatedParagraph.builder().documentTree(context.getDocumentTree()).build();
} else {
node = Paragraph.builder().documentTree(context.getDocumentTree()).build();
@ -269,8 +273,7 @@ public class DocumentGraphFactory {
return pages.keySet()
.stream()
.filter(page -> page.getNumber() == pageIndex)
.findFirst()
.orElseThrow(() -> new NoSuchElementException(format("ClassificationPage with number %d not found", pageIndex)));
.findFirst().orElseThrow(() -> new NoSuchElementException(format("ClassificationPage with number %d not found", pageIndex)));
}
}

View File

@ -49,8 +49,7 @@ public class SectionNodeFactory {
Map<Integer, List<AbstractPageBlock>> blocksPerPage = pageBlocks.stream()
.collect(groupingBy(AbstractPageBlock::getPage));
Section section = Section.builder().documentTree(context.getDocumentTree())
.build();
Section section = Section.builder().documentTree(context.getDocumentTree()).build();
context.getSections().add(section);
blocksPerPage.keySet()
@ -121,12 +120,12 @@ public class SectionNodeFactory {
case REDACT_MANAGER, CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> {
alreadyMerged.add(abstractPageBlock);
remainingBlocks.remove(abstractPageBlock);
DocumentGraphFactory.addParagraphOrHeadline(section, (TextPageBlock) abstractPageBlock, context, new ArrayList<>());
DocumentGraphFactory.addParagraphOrHeadline(section, (TextPageBlock) abstractPageBlock, context, new ArrayList<>(), layoutParsingType);
}
default -> {
List<TextPageBlock> textBlocks = findTextBlocksWithSameClassificationAndAlignsY((TextPageBlock) abstractPageBlock, remainingBlocks);
alreadyMerged.addAll(textBlocks);
DocumentGraphFactory.addParagraphOrHeadline(section, (TextPageBlock) abstractPageBlock, context, textBlocks);
DocumentGraphFactory.addParagraphOrHeadline(section, (TextPageBlock) abstractPageBlock, context, textBlocks, layoutParsingType);
}
}
} else if (abstractPageBlock instanceof TablePageBlock tablePageBlock) {

View File

@ -159,7 +159,7 @@ public class TableNodeFactory {
tableCell.setLeafTextBlock(textBlock);
} else {
cell.getTextBlocks()
.forEach(tb -> DocumentGraphFactory.addParagraphOrHeadline(tableCell, tb, context, emptyList()));
.forEach(tb -> DocumentGraphFactory.addParagraphOrHeadline(tableCell, tb, context, emptyList(), layoutParsingType));
}
}

View File

@ -1,7 +1,6 @@
package com.knecon.fforesight.service.layoutparser.processor.services.graphics;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
@ -9,8 +8,8 @@ import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.services.factory.SearchTextWithTextPositionFactory;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
@ -32,33 +31,32 @@ public class GraphicExtractorService {
int pageNumber,
CleanRulings cleanRulings,
List<TextPositionSequence> textPositionSequences,
List<Cell> emptyTableCells,
boolean graphicsRaster) {
var characterBBoxes = getCharacterBBoxes(textPositionSequences);
var tableLineBBoxes = getLineBBoxesFromTableCells(emptyTableCells);
var underLineBBoxes = getUnderlineBBoxes(cleanRulings, characterBBoxes);
var strikeThroughBBoxes = getStrikeThroughBBoxes(cleanRulings, characterBBoxes);
List<Box> characterBBoxes = getCharacterBBoxes(textPositionSequences);
List<Box> classifiedRulingsBoxes = getLineBBoxesOfAllClassifiedRulings(cleanRulings);
GraphicBBDetector graphicBBDetector = new GraphicBBDetector(pdPage, true);
var graphicBBoxes = graphicBBDetector.findGraphicBB();
List<Box> graphicBBoxes = graphicBBDetector.findGraphicBB();
if (graphicsRaster) {
// This should only be used if ocr was performed, it is currently in an early stage and needs to be improved.
graphicBBoxes.addAll(findGraphicsRaster.findCCBoundingBoxes(pdDocument,
characterBBoxes.stream().map(box -> new Rectangle2D.Double(box.x1 - 2, box.y1 - 2, box.width() + 4, box.height() + 4)).collect(Collectors.toList()),
PageInformation.fromPDPage(pageNumber, pdPage)));
characterBBoxes.stream()
.map(box -> new Rectangle2D.Double(box.x1 - 2, box.y1 - 2, box.width() + 4, box.height() + 4))
.collect(Collectors.toList()),
PageInformation.fromPDPage(pageNumber, pdPage)));
}
var filteredGraphicBBoxes = graphicBBoxes.stream()
.filter(box -> !box.intersectsAny(tableLineBBoxes, 4))
.filter(box -> !box.intersectsAny(underLineBBoxes, 4))
.filter(box -> !box.intersectsAny(strikeThroughBBoxes, 4))
List<Box> filteredGraphicBBoxes = graphicBBoxes.stream()
.filter(box -> !box.intersectsAny(classifiedRulingsBoxes, 4))
.collect(Collectors.toList());
var clusters = graphicsClusteringService.getClusters(filteredGraphicBBoxes, 14);
List<Box> clusters = graphicsClusteringService.getClusters(filteredGraphicBBoxes, 14);
return clusters.stream().filter(box -> box.area() > 500 && box.height() > 50 && box.width() > 50).toList();
return clusters.stream()
.filter(box -> box.area() > 500 && box.height() > 50 && box.width() > 50)
.toList();
}
@ -74,34 +72,13 @@ public class GraphicExtractorService {
}
private List<Box> getLineBBoxesFromTableCells(List<Cell> emptyTableCells) {
private List<Box> getLineBBoxesOfAllClassifiedRulings(CleanRulings cleanRulings) {
List<Box> expandedTableLines = new ArrayList<>();
emptyTableCells.forEach(cell -> {
expandedTableLines.add(new Box(new Rectangle2D.Double(cell.x, cell.y - 1, cell.width, 2)));
expandedTableLines.add(new Box(new Rectangle2D.Double(cell.x, cell.y + cell.height - 1, cell.width, 2)));
expandedTableLines.add(new Box(new Rectangle2D.Double(cell.x - 1, cell.y, 2, cell.height)));
expandedTableLines.add(new Box(new Rectangle2D.Double(cell.x + cell.width - 1, cell.y, 2, cell.height)));
});
return expandedTableLines;
}
private List<Box> getUnderlineBBoxes(CleanRulings cleanRulings, List<Box> characterBBoxes) {
return cleanRulings.getHorizontal()
return cleanRulings.buildAll()
.stream()
.filter(ruling -> !ruling.getClassification().equals(Ruling.Classification.OTHER))
.map(h -> new Box(h.x1, h.y1, h.x2, h.y2))
.filter(box -> box.intersectsAnyAndOver(characterBBoxes, 6))
.collect(Collectors.toList());
}
private List<Box> getStrikeThroughBBoxes(CleanRulings cleanRulings, List<Box> characterBBoxes) {
return cleanRulings.getHorizontal().stream().map(h -> new Box(h.x1, h.y1, h.x2, h.y2)).filter(box -> box.intersectsCenter(characterBBoxes, 2)).collect(Collectors.toList());
}
}

View File

@ -144,8 +144,7 @@ public class RectangleTransformations {
return Collections.emptyList();
}
double splitThreshold = rectangle2DList.stream()
.mapToDouble(RectangularShape::getWidth).average()
.orElse(5) * 5.0;
.mapToDouble(RectangularShape::getWidth).average().orElse(5) * 5.0;
List<List<Rectangle2D>> rectangleListsWithGaps = new LinkedList<>();
List<Rectangle2D> rectangleListWithoutGaps = new LinkedList<>();
@ -182,7 +181,7 @@ public class RectangleTransformations {
verticalRulings.add(new Ruling(new Point2D.Float(rectangle.x + rectangle.width, rectangle.y),
new Point2D.Float(rectangle.x + rectangle.width, rectangle.y + rectangle.height)));
});
return CleanRulings.builder().verticals(verticalRulings).horizontals(horizontalRulings).build();
return new CleanRulings(verticalRulings, horizontalRulings);
}

View File

@ -9,6 +9,7 @@ import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.core.io.ClassPathResource;
@ -29,21 +30,21 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
@Autowired
private LayoutParsingPipeline layoutParsingPipeline;
@Disabled
@Test
public void testLayoutParserEndToEnd() {
String filePath = "files/SinglePages/VV-931175_Page1.pdf";
String filePath = "/home/kschuettler/Dokumente/TestFiles/tables with striketrough text.pdf";
runForFile(filePath);
}
@Test
// @Disabled
@Disabled
@SneakyThrows
public void testLayoutParserEndToEndWithFolder() {
String folder = "/home/kschuettler/iqser/fforesight/layout-parser/layoutparser/layoutparser-service/layoutparser-service-server/src/test/resources/files/SinglePages";
String folder = "/home/kschuettler/Dokumente/TestFiles/large number of prod files";
List<Path> pdfFiles = Files.walk(Path.of(folder))
.filter(path -> path.getFileName().toString().endsWith(".pdf"))
.sorted(Comparator.comparing(Path::getFileName))
@ -77,7 +78,7 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
Arrays.stream(finishedEvent.message().split("\n"))
.forEach(log::info);
File tmpFile = new File("/tmp/layout-E2E/" + fileName + "_VIEWER.pdf");
File tmpFile = new File("/tmp/layoutparserEND2END/" + fileName + "_VIEWER.pdf");
assert tmpFile.getParentFile().exists() || tmpFile.getParentFile().mkdirs();
storageService.downloadTo(TENANT_ID, layoutParsingRequest.viewerDocumentStorageId(), tmpFile);

View File

@ -1,11 +1,14 @@
package com.knecon.fforesight.service.layoutparser.server.model;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.util.Collections;
import java.util.List;
import java.util.stream.IntStream;
import org.junit.jupiter.api.Test;
@ -35,4 +38,81 @@ class CleanRulingsTest {
assertTrue(cleanRulings.lineBetween(a, f));
}
/**
 * Exercises the interval look-ups with exactly one horizontal ruling (at y = 1) and one vertical ruling (at x = 1).
 */
@Test
public void testSingleLineInRange() {

    Ruling horizontalAtY1 = new Ruling(new Point2D.Float(0, 1), new Point2D.Float(100, 1));
    Ruling verticalAtX1 = new Ruling(new Point2D.Float(1, 0), new Point2D.Float(1, 100));
    List<Ruling> horizontalRulings = List.of(horizontalAtY1);
    List<Ruling> verticalRulings = List.of(verticalAtX1);
    CleanRulings rulings = new CleanRulings(horizontalRulings, verticalRulings);

    // vertical rulings: interval below x = 1, inverted interval, NaN bound
    assertEquals(0, rulings.getVerticalsInXInterval(-2, -1).size());
    assertEquals(0, rulings.getVerticalsInXInterval(-2, -5).size());
    assertEquals(0, rulings.getVerticalsInXInterval(-2, Float.NaN).size());
    // interval starting exactly at x = 1
    assertEquals(1, rulings.getVerticalsInXInterval(1, 10).size());
    // interval above x = 1
    assertEquals(0, rulings.getVerticalsInXInterval(100, 101).size());
    // unbounded interval returns every vertical ruling
    assertEquals(verticalRulings.size(), rulings.getVerticalsInXInterval(Float.NEGATIVE_INFINITY, Float.POSITIVE_INFINITY).size());
    // tight interval around x = 1, and an interval ending just short of it
    assertEquals(1, rulings.getVerticalsInXInterval(1 - 1e-5f, 1 + 1e-5f).size());
    assertEquals(0, rulings.getVerticalsInXInterval(1e-5f, 1 - 1e-5f).size());

    // horizontal rulings: below, starting at, and above y = 1
    assertEquals(0, rulings.getHorizontalsInYInterval(-2, -1).size());
    assertEquals(1, rulings.getHorizontalsInYInterval(1, 10).size());
    assertEquals(0, rulings.getHorizontalsInYInterval(100, 1001).size());
}
/**
 * Exercises the interval look-ups on a grid of 101 horizontal rulings (y = 0..100) and 101 vertical rulings
 * (x = 0..100).
 */
@Test
public void testLinesInRange() {

    List<Ruling> horizontalRulings = IntStream.range(0, 101)
            .mapToObj(y -> new Ruling(new Point2D.Float(0, y), new Point2D.Float(100, y)))
            .toList();
    List<Ruling> verticalRulings = IntStream.range(0, 101)
            .mapToObj(x -> new Ruling(new Point2D.Float(x, 0), new Point2D.Float(x, 100)))
            .toList();
    CleanRulings rulings = new CleanRulings(horizontalRulings, verticalRulings);

    // vertical rulings: interval below the grid, inverted interval, NaN bound
    assertEquals(0, rulings.getVerticalsInXInterval(-2, -1).size());
    assertEquals(0, rulings.getVerticalsInXInterval(-2, -5).size());
    assertEquals(0, rulings.getVerticalsInXInterval(-2, Float.NaN).size());
    // both interval ends are counted: x = 1..10 yields ten rulings
    assertEquals(10, rulings.getVerticalsInXInterval(1, 10).size());
    // only the last ruling (x = 100) lies inside [100, 101]
    assertEquals(1, rulings.getVerticalsInXInterval(100, 101).size());
    // unbounded interval returns every vertical ruling
    assertEquals(verticalRulings.size(), rulings.getVerticalsInXInterval(Float.NEGATIVE_INFINITY, Float.POSITIVE_INFINITY).size());
    // tight and degenerate intervals around x = 0
    assertEquals(1, rulings.getVerticalsInXInterval(-1e-5f, 1e-5f).size());
    assertEquals(1, rulings.getVerticalsInXInterval(0, 0).size());
    // interval strictly between two neighbouring rulings
    assertEquals(0, rulings.getVerticalsInXInterval(1e-5f, 1 - 1e-5f).size());

    // horizontal rulings: below the grid, y = 1..10, and the last ruling only
    assertEquals(0, rulings.getHorizontalsInYInterval(-2, -1).size());
    assertEquals(10, rulings.getHorizontalsInYInterval(1, 10).size());
    assertEquals(1, rulings.getHorizontalsInYInterval(100, 1001).size());
}
/**
 * Compares {@code CleanRulings.getHorizontalsInYInterval} against a plain linear filter over one million
 * horizontal rulings: both must return the same rulings and the interval look-up must be faster.
 * <p>
 * Elapsed time is taken with {@link System#nanoTime()}. {@code System.currentTimeMillis()} is wall-clock time with
 * millisecond granularity, so both measurements could round to the same value and fail the strict comparison
 * even though the look-up was faster.
 */
@Test
public void testLinesInRangePerformance() {

    List<Ruling> horizontals = IntStream.range(0, 1_000_000)
            .mapToObj(y -> new Ruling(new Point2D.Float(0, y), new Point2D.Float(100, y)))
            .toList();
    CleanRulings cleanRulings = new CleanRulings(horizontals, Collections.emptyList());

    float startY = 29;
    float endY = 3000;

    // measured candidate: interval look-up
    long lookupStart = System.nanoTime();
    var lookupResult = cleanRulings.getHorizontalsInYInterval(startY, endY);
    long lookupNanos = System.nanoTime() - lookupStart;

    // baseline: linear scan over all horizontals
    long scanStart = System.nanoTime();
    var scanResult = cleanRulings.getHorizontals()
            .stream()
            .filter(ruling -> ruling.getY1() >= startY && ruling.getY1() <= endY)
            .toList();
    long scanNanos = System.nanoTime() - scanStart;

    // the linear scan is the reference, hence it is the expected value
    assertEquals(scanResult, lookupResult, "interval look-up must return the same rulings as the linear scan");
    assertTrue(lookupNanos < scanNanos, "interval look-up took " + lookupNanos + " ns, linear scan took " + scanNanos + " ns");
}
}