RED-7141: Fixed PR findings and improved speed

This commit is contained in:
Dominique Eifländer 2024-03-07 16:38:21 +01:00
parent 05523585c0
commit cb9127b4f3
9 changed files with 199 additions and 85 deletions

View File

@ -291,7 +291,7 @@ public class LayoutParsingPipeline {
if (layoutParsingType == LayoutParsingType.REDACT_MANAGER) {
docstrumBlockificationService.combineBlocks(classificationPage);
} else if (layoutParsingType == LayoutParsingType.CLARIFYND) {
docstrumBlockificationService.mergeZones(classificationPage.getTextBlocks());
docstrumBlockificationService.mergeIntersectingBlocks(classificationPage.getTextBlocks());
}
buildPageStatistics(classificationPage);

View File

@ -1,7 +1,6 @@
package com.knecon.fforesight.service.layoutparser.processor.model.text;
import org.apache.pdfbox.text.TextPosition;
import org.springframework.beans.BeanUtils;
import com.fasterxml.jackson.annotation.JsonIgnore;
@ -50,10 +49,14 @@ public class RedTextPosition {
public static RedTextPosition fromTextPosition(TextPosition textPosition) {
var pos = new RedTextPosition();
BeanUtils.copyProperties(textPosition, pos);
pos.setFontName(textPosition.getFont().getName());
pos.setRotation(textPosition.getRotation());
pos.setPageHeight(textPosition.getPageHeight());
pos.setPageWidth(textPosition.getPageWidth());
pos.setUnicode(textPosition.getUnicode());
pos.setDir(textPosition.getDir());
pos.setWidthOfSpace(textPosition.getWidthOfSpace());
pos.setFontSizeInPt(textPosition.getFontSizeInPt());
pos.setFontName(textPosition.getFont().getName());
var position = new float[4];

View File

@ -15,6 +15,7 @@ import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlo
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
@ -24,12 +25,18 @@ import lombok.extern.slf4j.Slf4j;
@Builder
@NoArgsConstructor
@AllArgsConstructor
@EqualsAndHashCode(onlyExplicitlyIncluded = true)
public class TextPositionSequence implements CharSequence {
public static final int HEIGHT_PADDING = 2;
@EqualsAndHashCode.Include
private int page;
@EqualsAndHashCode.Include
private List<RedTextPosition> textPositions = new ArrayList<>();
@EqualsAndHashCode.Include
private TextDirection dir;
private int rotation;
private float pageHeight;

View File

@ -22,6 +22,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFre
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.DocstrumSegmentationService;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Zone;
import com.knecon.fforesight.service.layoutparser.processor.utils.QuickSort;
import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil;
import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionSequenceComparator;
@ -51,8 +52,16 @@ public class DocstrumBlockificationService {
usedVerticalRulings.add(new Ruling(new Point2D.Float(cell.x + cell.width, cell.y), new Point2D.Float(cell.x + cell.width, cell.y + cell.height)));
});
List<AbstractPageBlock> abstractPageBlocks = new ArrayList<>();
var zones = docstrumSegmentationService.segmentPage(textPositions, xyOrder);
var pageBlocks = toAbstractPageBlocks(zones, usedHorizonalRulings, usedVerticalRulings);
return new ClassificationPage(pageBlocks);
}
private List<AbstractPageBlock> toAbstractPageBlocks(List<Zone> zones, List<Ruling> horizontalRulings, List<Ruling> verticalRulings) {
List<AbstractPageBlock> abstractPageBlocks = new ArrayList<>();
zones.forEach(zone -> {
List<TextPositionSequence> textPositionSequences = new ArrayList<>();
@ -62,20 +71,21 @@ public class DocstrumBlockificationService {
});
});
abstractPageBlocks.addAll(splitZonesAtRulings(textPositionSequences, usedHorizonalRulings, usedVerticalRulings));
abstractPageBlocks.addAll(splitZonesAtRulings(textPositionSequences, horizontalRulings, verticalRulings));
});
return new ClassificationPage(abstractPageBlocks);
return abstractPageBlocks;
}
public void combineBlocks(ClassificationPage page) {
mergeZones(page.getTextBlocks());
mergeIntersectingBlocks(page.getTextBlocks());
TextPageBlock previous = new TextPageBlock();
ListIterator<AbstractPageBlock> itty = page.getTextBlocks().listIterator();
while (itty.hasNext()) {
AbstractPageBlock block = itty.next();
if (block instanceof TablePageBlock) {
continue;
@ -84,60 +94,28 @@ public class DocstrumBlockificationService {
if (previous != null && !previous.getSequences().isEmpty()) {
if (current.getDir() == previous.getDir() //
&& previous.getNumberOfLines() >= 2 && current.getNumberOfLines() >= 2 //
&& previous.intersectsY(current) //
&& !hasBetween(current, previous, page.getTextBlocks()) //
&& numberOfYIntersections(current, previous, page.getTextBlocks()) == 0) {
previous.getSequences().addAll(current.getSequences());
previous = buildTextBlock(previous.getSequences(), 0);
previous.setToDuplicate(true);
itty.remove();
itty.previous();
itty.set(previous);
itty.next();
if (current.getDir() != previous.getDir()) {
previous = current;
continue;
}
if (current.getDir() == previous.getDir() && (previous.almostIntersects(current, 0, 0))) {
previous.getSequences().addAll(current.getSequences());
boolean toDuplicate = previous.isToDuplicate();
previous = buildTextBlock(previous.getSequences(), 0);
previous.setToDuplicate(toDuplicate);
itty.remove();
itty.previous();
itty.set(previous);
itty.next();
if (areTheOnlyTwoBlocksOnHeightsWithBothMoreThanTwoLines(previous, current, page)) {
combineBlocksAndResetIterator(previous, current, itty, true);
continue;
}
if (current.getDir() == previous.getDir() //
&& (Math.abs(previous.getMaxY() - current.getMaxY()) < THRESHOLD || Math.abs(previous.getMinY() - current.getMinY()) < THRESHOLD) //
&& (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1 || previous.getNumberOfLines() == 2 && current.getNumberOfLines() == 1) //
&& !hasBetween(current, previous, page.getTextBlocks()) && numberOfYIntersections(current, previous, page.getTextBlocks()) <= 4) {
previous.getSequences().addAll(current.getSequences());
previous = buildTextBlock(previous.getSequences(), 0);
itty.remove();
itty.previous();
itty.set(previous);
itty.next();
if (previous.almostIntersects(current, 0, 0)) {
combineBlocksAndResetIterator(previous, current, itty, previous.isToDuplicate());
continue;
}
if (current.getDir() == previous.getDir() //
&& current.intersectsY(previous) //
&& (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1 || previous.getNumberOfLines() == 2 && current.getNumberOfLines() == 1) //
&& !hasBetween(current, previous, page.getTextBlocks()) //
&& numberOfYIntersections(current, previous, page.getTextBlocks()) <= 0) {
previous.getSequences().addAll(current.getSequences());
previous = buildTextBlock(previous.getSequences(), 0);
itty.remove();
itty.previous();
itty.set(previous);
itty.next();
if (isSameTopOrBottomWithPreviousHavingMaxTwoLinesAndCurrentThanOneAndMax4OtherBlocksOnHeight(previous, current, page)) {
combineBlocksAndResetIterator(previous, current, itty, false);
continue;
}
if (isOnlyIntersectingYAndOnelineOrPrevoiusTwoLines(previous, current, page)) {
combineBlocksAndResetIterator(previous, current, itty, false);
continue;
}
@ -145,8 +123,45 @@ public class DocstrumBlockificationService {
previous = current;
}
mergeZones(page.getTextBlocks());
mergeIntersectingBlocks(page.getTextBlocks());
}
/**
 * Merge rule for short blocks that overlap vertically.
 *
 * <p>Holds when all of the following are true, checked in this order:
 * <ol>
 * <li>{@code current.intersectsY(previous)},</li>
 * <li>{@code previous} has exactly one line and {@code current} at least one, or {@code previous} has exactly two
 * lines and {@code current} exactly one,</li>
 * <li>{@code numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks} reports no (<= 0) intersections for the pair
 * against the page's text blocks.</li>
 * </ol>
 *
 * @param previous the block the iteration is currently merging into
 * @param current  the candidate block to be merged
 * @param page     page whose text blocks are passed to the intersection count
 * @return {@code true} if all three conditions hold
 */
private boolean isOnlyIntersectingYAndOnelineOrPrevoiusTwoLines(TextPageBlock previous, TextPageBlock current, ClassificationPage page) {

    if (!current.intersectsY(previous)) {
        return false;
    }

    boolean lineCountsAllowMerge = previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1 //
            || previous.getNumberOfLines() == 2 && current.getNumberOfLines() == 1;
    if (!lineCountsAllowMerge) {
        return false;
    }

    // Evaluated last, exactly as in the chained && form, so it only runs when the checks above pass.
    return numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, page.getTextBlocks()) <= 0;
}
/**
 * Merge rule for short blocks that share one vertical edge.
 *
 * <p>Holds when all of the following are true, checked in this order:
 * <ol>
 * <li>the {@code maxY} values or the {@code minY} values of both blocks differ by less than {@code THRESHOLD},</li>
 * <li>{@code previous} has exactly one line and {@code current} at least one, or {@code previous} has exactly two
 * lines and {@code current} exactly one,</li>
 * <li>{@code hasBetween} is false for the pair,</li>
 * <li>{@code numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks} reports at most 4 intersections.</li>
 * </ol>
 *
 * @param previous the block the iteration is currently merging into
 * @param current  the candidate block to be merged
 * @param page     page whose text blocks are passed to {@code hasBetween} and the intersection count
 * @return {@code true} if all four conditions hold
 */
private boolean isSameTopOrBottomWithPreviousHavingMaxTwoLinesAndCurrentThanOneAndMax4OtherBlocksOnHeight(TextPageBlock previous,
        TextPageBlock current,
        ClassificationPage page) {

    boolean sharesAnEdge = Math.abs(previous.getMaxY() - current.getMaxY()) < THRESHOLD //
            || Math.abs(previous.getMinY() - current.getMinY()) < THRESHOLD;
    if (!sharesAnEdge) {
        return false;
    }

    boolean lineCountsAllowMerge = previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1 //
            || previous.getNumberOfLines() == 2 && current.getNumberOfLines() == 1;
    if (!lineCountsAllowMerge) {
        return false;
    }

    if (hasBetween(current, previous, page.getTextBlocks())) {
        return false;
    }

    return numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, page.getTextBlocks()) <= 4;
}
/**
 * Merge rule for two multi-line blocks that overlap vertically.
 *
 * <p>Holds when both blocks have at least two lines (note: {@code >= 2}, despite "MoreThanTwoLines" in the name),
 * {@code previous.intersectsY(current)} is true, and
 * {@code numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks} reports exactly 0 intersections for the pair.
 *
 * @param previous the block the iteration is currently merging into
 * @param current  the candidate block to be merged
 * @param page     page whose text blocks are passed to the intersection count
 * @return {@code true} if all conditions hold
 */
private boolean areTheOnlyTwoBlocksOnHeightsWithBothMoreThanTwoLines(TextPageBlock previous, TextPageBlock current, ClassificationPage page) {

    if (previous.getNumberOfLines() < 2 || current.getNumberOfLines() < 2) {
        return false;
    }

    if (!previous.intersectsY(current)) {
        return false;
    }

    return numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, page.getTextBlocks()) == 0;
}
/**
 * Appends the sequences of {@code current} to {@code previous}, builds a new text block from the combined
 * sequences and puts that block into the list behind {@code itty} in place of the two originals.
 *
 * <p>Java passes object references by value, so assigning a new block to the {@code previous} parameter inside
 * this method can never update the variable of the caller. The rebuilt block is therefore returned; a caller that
 * keeps iterating should continue with it ({@code previous = combineBlocksAndResetIterator(...)}), otherwise its
 * next comparisons run against the old instance, which was not rebuilt and does not carry the {@code toDuplicate}
 * flag set here. Existing statement-style calls that ignore the result still compile and behave as before.
 *
 * @param previous    block that receives the sequences of {@code current}; its sequence list is mutated
 * @param current     block being merged away; expected to be the element {@code itty} returned last
 * @param itty        iterator over the page's blocks, positioned directly after {@code current}
 * @param toDuplicate value for the {@code toDuplicate} flag of the rebuilt block
 * @return the rebuilt block that now sits in the list
 */
private TextPageBlock combineBlocksAndResetIterator(TextPageBlock previous, TextPageBlock current, ListIterator<AbstractPageBlock> itty, boolean toDuplicate) {

    previous.getSequences().addAll(current.getSequences());
    TextPageBlock merged = buildTextBlock(previous.getSequences(), 0);
    merged.setToDuplicate(toDuplicate);

    // Drop the element returned last (current), step back one element, overwrite that element with the merged
    // block and move forward again so the caller's iteration resumes after it.
    // NOTE(review): this assumes the element directly before current is previous - confirm for pages where
    // skipped blocks (e.g. tables) can sit between the two.
    itty.remove();
    itty.previous();
    itty.set(merged);
    itty.next();

    return merged;
}
@ -167,7 +182,7 @@ public class DocstrumBlockificationService {
}
private int numberOfYIntersections(TextPageBlock block, TextPageBlock other, List<AbstractPageBlock> allBlocks) {
private int numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(TextPageBlock block, TextPageBlock other, List<AbstractPageBlock> allBlocks) {
double minY = Math.min(block.getMinY(), other.getMinY());
double maxY = Math.min(block.getMaxY(), other.getMaxY());
@ -188,9 +203,9 @@ public class DocstrumBlockificationService {
}
public void mergeZones(List<AbstractPageBlock> zones) {
public void mergeIntersectingBlocks(List<AbstractPageBlock> blocks) {
ListIterator<AbstractPageBlock> itty = zones.listIterator();
ListIterator<AbstractPageBlock> itty = blocks.listIterator();
Set<AbstractPageBlock> toRemove = new HashSet<>();
while (itty.hasNext()) {
AbstractPageBlock block = itty.next();
@ -204,19 +219,19 @@ public class DocstrumBlockificationService {
continue;
}
for (int i = 0; i < zones.size(); i++) {
for (int i = 0; i < blocks.size(); i++) {
if (toRemove.contains(zones.get(i))) {
if (toRemove.contains(blocks.get(i))) {
continue;
}
if (zones.get(i) == current) {
if (blocks.get(i) == current) {
continue;
}
if (zones.get(i) instanceof TablePageBlock) {
if (blocks.get(i) instanceof TablePageBlock) {
continue;
}
TextPageBlock inner = (TextPageBlock) zones.get(i);
TextPageBlock inner = (TextPageBlock) blocks.get(i);
if (inner.isToDuplicate()) {
continue;
@ -232,7 +247,7 @@ public class DocstrumBlockificationService {
}
}
}
zones.removeAll(toRemove);
blocks.removeAll(toRemove);
}

View File

@ -5,16 +5,23 @@ import java.util.Arrays;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.utils.FastAtan2;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.Setter;
@Data
@EqualsAndHashCode(onlyExplicitlyIncluded = true)
public class Character {
@EqualsAndHashCode.Include
private final double x;
@EqualsAndHashCode.Include
private final double y;
private final RedTextPosition textPosition;
@Setter
private List<Neighbor> neighbors = new ArrayList<>();
@ -67,18 +74,12 @@ public class Character {
}
public void setNeighbors(List<Neighbor> neighbors) {
this.neighbors = neighbors;
}
public double angle(Character character) {
if (getX() > character.getX()) {
return Math.atan2(getY() - character.getY(), getX() - character.getX());
return FastAtan2.atan2(getY() - character.getY(), getX() - character.getX());
} else {
return Math.atan2(character.getY() - getY(), character.getX() - getX());
return FastAtan2.atan2(character.getY() - getY(), character.getX() - getX());
}
}

View File

@ -8,16 +8,22 @@ import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import lombok.Data;
import lombok.EqualsAndHashCode;
@Data
@EqualsAndHashCode(onlyExplicitlyIncluded = true)
public class Line extends BoundingBox {
private static final double WORD_DISTANCE_MULTIPLIER = 0.18;
@EqualsAndHashCode.Include
private final double x0;
@EqualsAndHashCode.Include
private final double y0;
@EqualsAndHashCode.Include
private final double x1;
@EqualsAndHashCode.Include
private final double y1;
private final double height;

View File

@ -46,32 +46,40 @@ public class NearestNeighbourService {
while (start > 0 && characters.get(i).getX() - characters.get(start - 1).getX() < searchDistance) {
start--;
candidates.add(new Neighbor(characters.get(start), characters.get(i)));
clearLeastDistant(candidates, maxNeighborCount);
clearMostDistant(candidates, maxNeighborCount);
newCandidatesFound = true;
}
while (end < characters.size() && characters.get(end).getX() - characters.get(i).getX() < searchDistance) {
candidates.add(new Neighbor(characters.get(end), characters.get(i)));
clearLeastDistant(candidates, maxNeighborCount);
clearMostDistant(candidates, maxNeighborCount);
end++;
newCandidatesFound = true;
}
if (newCandidatesFound && candidates.size() >= maxNeighborCount) {
distance = candidates.get(maxNeighborCount - 1).getDistance();
distance = candidates.stream().mapToDouble(Neighbor::getDistance).max().orElse(Double.POSITIVE_INFINITY);
}
}
clearLeastDistant(candidates, maxNeighborCount);
clearMostDistant(candidates, maxNeighborCount);
characters.get(i).setNeighbors(new ArrayList<>(candidates));
}
}
private void clearLeastDistant(List<Neighbor> candidates, int maxNeighborCount) {
private void clearMostDistant(List<Neighbor> candidates, int maxNeighborCount) {
if (candidates.size() > maxNeighborCount) {
candidates.sort(Comparator.comparingDouble(Neighbor::getDistance));
candidates.remove(candidates.remove(candidates.size() - 1));
double maxDistance = 0;
int maxIndex = 0;
for (int i = 0; i < candidates.size(); i++) {
Neighbor candidate = candidates.get(i);
if (candidate.getDistance() > maxDistance) {
maxDistance = candidate.getDistance();
maxIndex = i;
}
}
candidates.remove(maxIndex);
}
}

View File

@ -0,0 +1,76 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.utils;
public class FastAtan2 {
static final private int Size_Ac = 1000;
static final private int Size_Ar = Size_Ac + 1;
static final private double Pi = (float) Math.PI;
static final private double Pi_H = Pi / 2;
static final private double[] Atan2 = new double[Size_Ar];
static final private double[] Atan2_PM = new double[Size_Ar];
static final private double[] Atan2_MP = new double[Size_Ar];
static final private double[] Atan2_MM = new double[Size_Ar];
static final private double[] Atan2_R = new double[Size_Ar];
static final private double[] Atan2_RPM = new double[Size_Ar];
static final private double[] Atan2_RMP = new double[Size_Ar];
static final private double[] Atan2_RMM = new double[Size_Ar];
static {
for (int i = 0; i <= Size_Ac; i++) {
double d = (double) i / Size_Ac;
double x = 1;
double y = x * d;
double v = Math.atan2(y, x);
Atan2[i] = v;
Atan2_PM[i] = Pi - v;
Atan2_MP[i] = -v;
Atan2_MM[i] = -Pi + v;
Atan2_R[i] = Pi_H - v;
Atan2_RPM[i] = Pi_H + v;
Atan2_RMP[i] = -Pi_H + v;
Atan2_RMM[i] = -Pi_H - v;
}
}
@SuppressWarnings("ParameterAssignment")
static public double atan2(double y, double x) {
if (y < 0) {
if (x < 0) {
//(y < x) because == (-y > -x)
if (y < x) {
return Atan2_RMM[(int) (x / y * Size_Ac)];
} else {
return Atan2_MM[(int) (y / x * Size_Ac)];
}
} else {
y = -y;
if (y > x) {
return Atan2_RMP[(int) (x / y * Size_Ac)];
} else {
return Atan2_MP[(int) (y / x * Size_Ac)];
}
}
} else {
if (x < 0) {
x = -x;
if (y > x) {
return Atan2_RPM[(int) (x / y * Size_Ac)];
} else {
return Atan2_PM[(int) (y / x * Size_Ac)];
}
} else {
if (y > x) {
return Atan2_R[(int) (x / y * Size_Ac)];
} else {
return Atan2[(int) (y / x * Size_Ac)];
}
}
}
}
}

View File

@ -2,7 +2,6 @@ package com.knecon.fforesight.service.layoutparser.processor.utils;
import static java.lang.String.format;
import java.awt.geom.Area;
import java.awt.geom.Rectangle2D;
import java.awt.geom.RectangularShape;
import java.util.Collections;
@ -40,11 +39,10 @@ public class RectangleTransformations {
public static double calculateIntersectedArea(Rectangle2D r1, Rectangle2D r2) {
Area a1 = new Area(r1);
Area a2 = new Area(r2);
a1.intersect(a2);
Rectangle2D intersection = a1.getBounds2D();
return intersection.getWidth() * intersection.getHeight();
double xOverlap = Math.max(0, Math.min(r1.getMaxX(), r2.getMaxX()) - Math.max(r1.getMinX(), r2.getMinX()));
double yOverlap = Math.max(0, Math.min(r1.getMaxY(), r2.getMaxY()) - Math.max(r1.getY(), r2.getY()));
return xOverlap * yOverlap;
}