RED-8666
This commit is contained in:
parent
aef1146e8f
commit
2567d89fbb
@ -6,5 +6,5 @@ public enum LayoutParsingType {
|
||||
DOCUMINE,
|
||||
|
||||
DOCSTRUM,
|
||||
DOCSTRUM_XY
|
||||
DOCSTRUM_ROW_WISE
|
||||
}
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor;
|
||||
|
||||
import static com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType.DOCSTRUM;
|
||||
import static com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType.DOCSTRUM_XY;
|
||||
import static com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType.DOCSTRUM_ROW_WISE;
|
||||
import static java.lang.String.format;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
@ -266,8 +266,8 @@ public class LayoutParsingPipeline {
|
||||
case REDACT_MANAGER -> redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
case TAAS -> taasBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
case DOCUMINE -> docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
case DOCSTRUM -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical(), false);
|
||||
case DOCSTRUM_XY -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical(), true);
|
||||
case DOCSTRUM -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical(), true);
|
||||
case DOCSTRUM_ROW_WISE -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical(), false);
|
||||
};
|
||||
classificationPage.setCleanRulings(cleanRulings);
|
||||
classificationPage.setRotation(rotation);
|
||||
@ -291,7 +291,7 @@ public class LayoutParsingPipeline {
|
||||
|
||||
tableExtractionService.extractTables(cleanRulings, classificationPage);
|
||||
|
||||
if (layoutParsingType == DOCSTRUM || layoutParsingType == DOCSTRUM_XY) {
|
||||
if (layoutParsingType == DOCSTRUM || layoutParsingType == DOCSTRUM_ROW_WISE) {
|
||||
// docstrumBlockificationService.combineBlocks(classificationPage); //todo 8666
|
||||
}
|
||||
|
||||
@ -310,12 +310,12 @@ public class LayoutParsingPipeline {
|
||||
case TAAS -> taasClassificationService.classifyDocument(classificationDocument);
|
||||
case DOCUMINE -> docuMineClassificationService.classifyDocument(classificationDocument);
|
||||
case REDACT_MANAGER -> redactManagerClassificationService.classifyDocument(classificationDocument);
|
||||
case DOCSTRUM_XY -> redactManagerClassificationService.classifyDocument(classificationDocument);
|
||||
case DOCSTRUM_ROW_WISE -> redactManagerClassificationService.classifyDocument(classificationDocument);
|
||||
}
|
||||
|
||||
log.info("Building Sections for {}", identifier);
|
||||
|
||||
if (layoutParsingType == DOCSTRUM || layoutParsingType == DOCSTRUM_XY) {
|
||||
if (layoutParsingType == DOCSTRUM || layoutParsingType == DOCSTRUM_ROW_WISE) {
|
||||
// For debugging, paragraphs are currently returned as sections, because sectionBuilder contains merging logic
|
||||
List<ClassificationSection> sections = new ArrayList<>();
|
||||
for (var page : classificationPages) {
|
||||
|
||||
@ -35,10 +35,10 @@ public class DocstrumBlockificationService {
|
||||
static final float THRESHOLD = 2f;
|
||||
|
||||
|
||||
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines, boolean xyOder) {
|
||||
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines, boolean columnWise) {
|
||||
|
||||
List<AbstractPageBlock> abstractPageBlocks = new ArrayList<>();
|
||||
var zones = docstrumSegmentationService.segmentPage(textPositions, xyOder);
|
||||
var zones = docstrumSegmentationService.segmentPage(textPositions, columnWise);
|
||||
zones.forEach(zone -> {
|
||||
|
||||
List<TextPositionSequence> textPositionSequences = new ArrayList<>();
|
||||
|
||||
@ -4,8 +4,12 @@ import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.lang3.tuple.ImmutablePair;
|
||||
import org.apache.commons.lang3.tuple.Pair;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Zone;
|
||||
|
||||
import lombok.Getter;
|
||||
@ -47,17 +51,17 @@ public class UnsupervisedReadingOrderDetector {
|
||||
switch (spatialReasoningRule) {
|
||||
case COLUMN_WISE:
|
||||
if (useRenderingOrder) {
|
||||
zoneComparator = (Zone z1, Zone z2, double t) -> getBeforeInReadingVertical(z1, z2, t) || getBeforeInRendering(z1, z2);
|
||||
zoneComparator = (Zone z1, Zone z2, double t) -> getBeforeInReadingColumnWise(z1, z2, t) || getBeforeInRendering(z1, z2);
|
||||
} else {
|
||||
zoneComparator = this::getBeforeInReadingVertical;
|
||||
zoneComparator = this::getBeforeInReadingColumnWise;
|
||||
}
|
||||
break;
|
||||
|
||||
case ROW_WISE:
|
||||
if (useRenderingOrder) {
|
||||
zoneComparator = (Zone z1, Zone z2, double t) -> getBeforeInReadingHorizontal(z1, z2, t) || getBeforeInRendering(z1, z2);
|
||||
zoneComparator = (Zone z1, Zone z2, double t) -> getBeforeInReadingRowWise(z1, z2, t) || getBeforeInRendering(z1, z2);
|
||||
} else {
|
||||
zoneComparator = this::getBeforeInReadingHorizontal;
|
||||
zoneComparator = this::getBeforeInReadingRowWise;
|
||||
}
|
||||
break;
|
||||
|
||||
@ -159,8 +163,8 @@ public class UnsupervisedReadingOrderDetector {
|
||||
|
||||
private boolean getBeforeInReading(Zone z1, Zone z2, double tolerance) {
|
||||
|
||||
IntervalRelations xRelation = getIntervalRelationX(z1, z2, tolerance);
|
||||
IntervalRelations yRelation = getIntervalRelationY(z1, z2, tolerance);
|
||||
IntervalRelations xRelation = getIntervalRelationX(z1, z2, tolerance).get(0);
|
||||
IntervalRelations yRelation = getIntervalRelationY(z1, z2, tolerance).get(0);
|
||||
|
||||
return xRelation == IntervalRelations.PRECEDES
|
||||
|| yRelation == IntervalRelations.PRECEDES
|
||||
@ -171,119 +175,87 @@ public class UnsupervisedReadingOrderDetector {
|
||||
}
|
||||
|
||||
|
||||
private boolean getBeforeInReadingVertical(Zone z1, Zone z2, double tolerance) {
|
||||
private boolean getBeforeInReadingColumnWise(Zone z1, Zone z2, double tolerance) {
|
||||
|
||||
IntervalRelations xRelation = getIntervalRelationX(z1, z2, tolerance);
|
||||
IntervalRelations yRelation = getIntervalRelationY(z1, z2, tolerance);
|
||||
IntervalRelations xRelation = getIntervalRelationX(z1, z2, tolerance).get(0);
|
||||
IntervalRelations yRelation = getIntervalRelationY(z1, z2, tolerance).get(0);
|
||||
|
||||
return getIntervalRelations(xRelation, yRelation);
|
||||
}
|
||||
|
||||
|
||||
private static boolean getIntervalRelations(IntervalRelations relation1, IntervalRelations relation2) {
|
||||
|
||||
return relation1 == IntervalRelations.PRECEDES
|
||||
|| relation1 == IntervalRelations.MEETS
|
||||
|| (relation2 == IntervalRelations.PRECEDES || relation2 == IntervalRelations.MEETS || relation2 == IntervalRelations.OVERLAPS) && //
|
||||
(relation1 == IntervalRelations.OVERLAPS
|
||||
|| relation1 == IntervalRelations.STARTS
|
||||
|| relation1 == IntervalRelations.FINISHES_INVERSE
|
||||
|| relation1 == IntervalRelations.EQUALS
|
||||
|| relation1 == IntervalRelations.DURING
|
||||
|| relation1 == IntervalRelations.DURING_INVERSE
|
||||
|| relation1 == IntervalRelations.FINISHES
|
||||
|| relation1 == IntervalRelations.STARTS_INVERSE
|
||||
|| relation1 == IntervalRelations.OVERLAPS_INVERSE);
|
||||
return relation1 == IntervalRelations.PRECEDES //
|
||||
|| relation1 == IntervalRelations.MEETS //
|
||||
|| relation1 == IntervalRelations.OVERLAPS && //
|
||||
(relation2 == IntervalRelations.PRECEDES //
|
||||
|| relation2 == IntervalRelations.MEETS //
|
||||
|| relation2 == IntervalRelations.OVERLAPS) //
|
||||
|| ((relation2 == IntervalRelations.PRECEDES || relation2 == IntervalRelations.MEETS || relation2 == IntervalRelations.OVERLAPS) && //
|
||||
(relation1 == IntervalRelations.STARTS //
|
||||
|| relation1 == IntervalRelations.FINISHES_INVERSE //
|
||||
|| relation1 == IntervalRelations.EQUALS //
|
||||
|| relation1 == IntervalRelations.DURING //
|
||||
|| relation1 == IntervalRelations.DURING_INVERSE //
|
||||
|| relation1 == IntervalRelations.FINISHES //
|
||||
|| relation1 == IntervalRelations.STARTS_INVERSE //
|
||||
|| relation1 == IntervalRelations.OVERLAPS_INVERSE));
|
||||
}
|
||||
|
||||
|
||||
private boolean getBeforeInReadingHorizontal(Zone z1, Zone z2, double tolerance) {
|
||||
|
||||
IntervalRelations xRelation = getIntervalRelationX(z1, z2, tolerance);
|
||||
IntervalRelations yRelation = getIntervalRelationY(z1, z2, tolerance);
|
||||
private boolean getBeforeInReadingRowWise(Zone z1, Zone z2, double tolerance) {
|
||||
|
||||
return getIntervalRelations(yRelation, xRelation);
|
||||
IntervalRelations xRelations = getIntervalRelationX(z1, z2, tolerance).get(0);
|
||||
IntervalRelations yRelations = getIntervalRelationY(z1, z2, tolerance).get(0);
|
||||
|
||||
return getIntervalRelations(yRelations, xRelations);
|
||||
}
|
||||
|
||||
|
||||
private static IntervalRelations getIntervalRelationX(Zone z1, Zone z2, double t) {
|
||||
private static List<IntervalRelations> getIntervalRelationX(Zone z1, Zone z2, double t) {
|
||||
|
||||
double z1_minX = z1.getX();
|
||||
double z1_maxX = z1_minX + z1.getWidth();
|
||||
double z2_minX = z2.getX();
|
||||
double z2_maxX = z2_minX + z2.getWidth();
|
||||
return getIntervalRelation(new ImmutablePair<>(z1.getX(), z1.getX() + z1.getWidth()), new ImmutablePair<>(z2.getX(), z2.getX() + z2.getWidth()), t);
|
||||
}
|
||||
|
||||
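// FIXME: this is very wrong - the first two branches are complementary, so every later branch is unreachable for non-NaN coordinates; check https://www.cs.rug.nl/~aiellom/publications/ijdarNoi.pdf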
|
||||
if (z1_maxX < z2_minX - t) {
|
||||
return IntervalRelations.PRECEDES;
|
||||
} else if (z1_maxX >= z2_minX - t) {
|
||||
return IntervalRelations.PRECEDES_INVERSE;
|
||||
} else if (z2_minX - t <= z1_maxX && z1_maxX <= z2_minX + t) {
|
||||
return IntervalRelations.MEETS;
|
||||
} else if (z2_minX - t > z1_maxX && z1_maxX > z2_minX + t) {
|
||||
return IntervalRelations.MEETS_INVERSE;
|
||||
} else if (z1_minX < z2_minX - t && (z2_minX + t < z1_maxX && z1_maxX < z2_maxX - t)) {
|
||||
return IntervalRelations.OVERLAPS;
|
||||
} else if (z1_minX >= z2_minX - t && (z2_minX + t >= z1_maxX && z1_maxX >= z2_maxX - t)) {
|
||||
return IntervalRelations.OVERLAPS_INVERSE;
|
||||
} else if (z2_minX - t <= z1_minX && z1_minX <= z2_minX + t && z1_maxX < z2_maxX - t) {
|
||||
return IntervalRelations.STARTS;
|
||||
} else if (z2_minX - t > z1_minX && z1_minX > z2_minX + t && z1_maxX >= z2_maxX - t) {
|
||||
return IntervalRelations.STARTS_INVERSE;
|
||||
} else if (z1_minX > z2_minX + t && z1_maxX < z2_maxX - t) {
|
||||
return IntervalRelations.DURING;
|
||||
} else if (z1_minX <= z2_minX + t && z1_maxX >= z2_maxX - t) {
|
||||
return IntervalRelations.DURING_INVERSE;
|
||||
} else if (z1_minX > z2_minX + t && (z2_maxX - t <= z1_maxX && z1_maxX <= z2_maxX + t)) {
|
||||
return IntervalRelations.FINISHES;
|
||||
} else if (z1_minX <= z2_minX + t && (z2_maxX - t > z1_maxX && z1_maxX > z2_maxX + t)) {
|
||||
return IntervalRelations.FINISHES_INVERSE;
|
||||
} else if (z2_minX - t <= z1_minX && z1_minX <= z2_minX + t && (z2_maxX - t <= z1_maxX && z1_maxX <= z2_maxX + t)) {
|
||||
return IntervalRelations.EQUALS;
|
||||
|
||||
private static List<IntervalRelations> getIntervalRelationY(Zone z1, Zone z2, double t) {
|
||||
|
||||
return getIntervalRelation(new ImmutablePair<>(z1.getY(), z1.getY() + z1.getHeight()), new ImmutablePair<>(z2.getY(), z2.getY() + z2.getHeight()), t);
|
||||
|
||||
}
|
||||
|
||||
|
||||
private static List<IntervalRelations> getIntervalRelation(Pair<Double, Double> a, Pair<Double, Double> b, double t) {
|
||||
|
||||
var intervalRelations = getIntervalRelation(a, b, t, false);
|
||||
intervalRelations.addAll(getIntervalRelation(b, a, t, true));
|
||||
if ((b.getLeft() - t <= a.getLeft() && a.getLeft() <= b.getLeft() + t) && (b.getRight() - t <= a.getRight() && a.getRight() <= b.getRight() + t)) {
|
||||
intervalRelations.add(IntervalRelations.EQUALS);
|
||||
}
|
||||
return intervalRelations;
|
||||
|
||||
|
||||
return IntervalRelations.UNKNOWN;
|
||||
}
|
||||
|
||||
|
||||
private static IntervalRelations getIntervalRelationY(Zone z1, Zone z2, double t) {
|
||||
private static List<IntervalRelations> getIntervalRelation(Pair<Double, Double> a, Pair<Double, Double> b, double t, boolean inverse) {
|
||||
|
||||
double z1_minY = z1.getY();
|
||||
double z1_maxY = z1_minY + z1.getHeight();
|
||||
double z2_minY = z2.getY();
|
||||
double z2_maxY = z2_minY + z2.getHeight();
|
||||
|
||||
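// FIXME: this is very wrong - the first two branches are complementary, so every later branch is unreachable for non-NaN coordinates; check https://www.cs.rug.nl/~aiellom/publications/ijdarNoi.pdf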
|
||||
if (z1_minY < z2_maxY - t) {
|
||||
return IntervalRelations.PRECEDES_INVERSE;
|
||||
} else if (z1_minY >= z2_maxY - t) {
|
||||
return IntervalRelations.PRECEDES;
|
||||
} else if (z2_maxY - t <= z1_minY && z1_minY <= z2_maxY + t) {
|
||||
return IntervalRelations.MEETS_INVERSE;
|
||||
} else if (z2_maxY - t > z1_minY && z1_minY > z2_maxY + t) {
|
||||
return IntervalRelations.MEETS;
|
||||
} else if (z1_maxY < z2_maxY - t && (z2_maxY + t < z1_minY && z1_minY < z2_minY - t)) {
|
||||
return IntervalRelations.OVERLAPS_INVERSE;
|
||||
} else if (z1_maxY >= z2_maxY - t && (z2_maxY + t >= z1_minY && z1_minY >= z2_minY - t)) {
|
||||
return IntervalRelations.OVERLAPS;
|
||||
} else if (z2_maxY - t <= z1_maxY && z1_maxY <= z2_maxY + t && z1_minY < z2_minY - t) {
|
||||
return IntervalRelations.STARTS_INVERSE;
|
||||
} else if (z2_maxY - t > z1_maxY && z1_maxY > z2_maxY + t && z1_minY >= z2_minY - t) {
|
||||
return IntervalRelations.STARTS;
|
||||
} else if (z1_maxY > z2_maxY + t && z1_minY < z2_minY - t) {
|
||||
return IntervalRelations.DURING_INVERSE;
|
||||
} else if (z1_maxY <= z2_maxY + t && z1_minY >= z2_minY - t) {
|
||||
return IntervalRelations.DURING;
|
||||
} else if (z1_maxY > z2_maxY + t && (z2_minY - t <= z1_minY && z1_minY <= z2_minY + t)) {
|
||||
return IntervalRelations.FINISHES_INVERSE;
|
||||
} else if (z1_maxY <= z2_maxY + t && (z2_minY - t > z1_minY && z1_minY > z2_minY + t)) {
|
||||
return IntervalRelations.FINISHES;
|
||||
} else if (z2_maxY - t <= z1_maxY && z1_maxY <= z2_maxY + t && (z2_minY - t <= z1_minY && z1_minY <= z2_minY + t)) {
|
||||
return IntervalRelations.EQUALS;
|
||||
List<IntervalRelations> intervalRelations = new ArrayList<>();
|
||||
if (a.getRight() < b.getLeft() - t) {
|
||||
intervalRelations.add(inverse ? IntervalRelations.PRECEDES_INVERSE : IntervalRelations.PRECEDES);
|
||||
} if (b.getLeft() - t <= a.getRight() && a.getRight() <= b.getLeft() + t) {
|
||||
intervalRelations.add(inverse ? IntervalRelations.MEETS_INVERSE : IntervalRelations.MEETS);
|
||||
} if (a.getLeft() < b.getLeft() - t && (b.getLeft() + t < a.getRight() && a.getRight() < b.getRight() - t)) {
|
||||
intervalRelations.add(inverse ? IntervalRelations.OVERLAPS_INVERSE : IntervalRelations.OVERLAPS);
|
||||
} if ((b.getLeft() - t <= a.getLeft() && a.getLeft() <= b.getLeft() + t) && a.getRight() < b.getRight() - t) {
|
||||
intervalRelations.add(inverse ? IntervalRelations.STARTS_INVERSE : IntervalRelations.STARTS);
|
||||
} if (a.getLeft() > b.getLeft() + t && a.getRight() < b.getRight() + t) {
|
||||
intervalRelations.add(inverse ? IntervalRelations.DURING_INVERSE : IntervalRelations.DURING);
|
||||
} if (a.getLeft() > b.getLeft() + t && (b.getRight() - t <= a.getRight() && a.getRight() <= b.getRight() + t)) {
|
||||
intervalRelations.add(inverse ? IntervalRelations.FINISHES_INVERSE : IntervalRelations.FINISHES);
|
||||
}
|
||||
|
||||
return IntervalRelations.UNKNOWN;
|
||||
|
||||
return intervalRelations;
|
||||
}
|
||||
|
||||
}
|
||||
@ -27,7 +27,7 @@ public class ReadingOrderService {
|
||||
|
||||
SpatialReasoningRules spatialReasoningRules = columnWise ? SpatialReasoningRules.COLUMN_WISE : SpatialReasoningRules.ROW_WISE;
|
||||
|
||||
var unsupervisedReadingOrderDetector = new UnsupervisedReadingOrderDetector(5, spatialReasoningRules, false);
|
||||
var unsupervisedReadingOrderDetector = new UnsupervisedReadingOrderDetector(1, spatialReasoningRules, false);
|
||||
|
||||
return unsupervisedReadingOrderDetector.get(zones);
|
||||
}
|
||||
|
||||
@ -33,7 +33,7 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
||||
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null);
|
||||
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
|
||||
|
||||
Document document = buildGraph(fileName, LayoutParsingType.DOCSTRUM_XY);
|
||||
Document document = buildGraph(fileName, LayoutParsingType.DOCSTRUM);
|
||||
long start = System.currentTimeMillis();
|
||||
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true);
|
||||
System.out.printf("Total time: %.2fs%n", ((float) (System.currentTimeMillis() - start)) / 1000);
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user