RED-8666
This commit is contained in:
parent
91401361e9
commit
7f56ed15c8
@ -45,12 +45,13 @@ public class RedTextPosition {
|
|||||||
@JsonIgnore
|
@JsonIgnore
|
||||||
private String fontName;
|
private String fontName;
|
||||||
|
|
||||||
|
|
||||||
@JsonIgnore
|
@JsonIgnore
|
||||||
TextPosition textPosition;
|
private int textSequence;
|
||||||
|
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public static RedTextPosition fromTextPosition(TextPosition textPosition) {
|
public static RedTextPosition fromTextPosition(TextPosition textPosition, int textSequence) {
|
||||||
|
|
||||||
var pos = new RedTextPosition();
|
var pos = new RedTextPosition();
|
||||||
BeanUtils.copyProperties(textPosition, pos);
|
BeanUtils.copyProperties(textPosition, pos);
|
||||||
@ -66,7 +67,7 @@ public class RedTextPosition {
|
|||||||
position[3] = textPosition.getHeightDir();
|
position[3] = textPosition.getHeightDir();
|
||||||
|
|
||||||
pos.setPosition(position);
|
pos.setPosition(position);
|
||||||
pos.textPosition = textPosition;
|
pos.setTextSequence(textSequence);
|
||||||
return pos;
|
return pos;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -43,9 +43,9 @@ public class TextPositionSequence implements CharSequence {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public TextPositionSequence(List<TextPosition> textPositions, int page, boolean isParagraphStart) {
|
public TextPositionSequence(List<TextPosition> textPositions, int page, boolean isParagraphStart, int textSequence) {
|
||||||
|
|
||||||
this.textPositions = textPositions.stream().map(RedTextPosition::fromTextPosition).collect(Collectors.toList());
|
this.textPositions = textPositions.stream().map(textPosition -> RedTextPosition.fromTextPosition(textPosition, textSequence)).collect(Collectors.toList());
|
||||||
this.page = page;
|
this.page = page;
|
||||||
this.dir = TextDirection.fromDegrees(textPositions.get(0).getDir());
|
this.dir = TextDirection.fromDegrees(textPositions.get(0).getDir());
|
||||||
this.rotation = textPositions.get(0).getRotation();
|
this.rotation = textPositions.get(0).getRotation();
|
||||||
@ -133,9 +133,9 @@ public class TextPositionSequence implements CharSequence {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public void add(TextPosition textPosition) {
|
public void add(TextPosition textPosition, int textSequence) {
|
||||||
|
|
||||||
this.textPositions.add(RedTextPosition.fromTextPosition(textPosition));
|
this.textPositions.add(RedTextPosition.fromTextPosition(textPosition, textSequence));
|
||||||
|
|
||||||
this.dir = TextDirection.fromDegrees(textPositions.get(0).getDir());
|
this.dir = TextDirection.fromDegrees(textPositions.get(0).getDir());
|
||||||
this.rotation = textPositions.get(0).getRotation();
|
this.rotation = textPositions.get(0).getRotation();
|
||||||
|
|||||||
@ -29,7 +29,7 @@ public class DocstrumSegmentationService {
|
|||||||
private final ReadingOrderService readingOrderService;
|
private final ReadingOrderService readingOrderService;
|
||||||
|
|
||||||
|
|
||||||
public List<Zone> segmentPage(List<TextPositionSequence> textPositions, boolean xyOder) {
|
public List<Zone> segmentPage(List<TextPositionSequence> textPositions, boolean columnWise) {
|
||||||
|
|
||||||
List<Zone> zones = new ArrayList<>();
|
List<Zone> zones = new ArrayList<>();
|
||||||
zones.addAll(computeZones(textPositions, TextDirection.ZERO));
|
zones.addAll(computeZones(textPositions, TextDirection.ZERO));
|
||||||
@ -37,7 +37,7 @@ public class DocstrumSegmentationService {
|
|||||||
zones.addAll(computeZones(textPositions, TextDirection.HALF_CIRCLE));
|
zones.addAll(computeZones(textPositions, TextDirection.HALF_CIRCLE));
|
||||||
zones.addAll(computeZones(textPositions, TextDirection.THREE_QUARTER_CIRCLE));
|
zones.addAll(computeZones(textPositions, TextDirection.THREE_QUARTER_CIRCLE));
|
||||||
|
|
||||||
return readingOrderService.resolve(zones, xyOder);
|
return readingOrderService.resolve(zones, columnWise);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -11,6 +11,7 @@ public class Zone extends BoundingBox {
|
|||||||
|
|
||||||
private List<Line> lines;
|
private List<Line> lines;
|
||||||
|
|
||||||
|
private int readingOrder = -1;
|
||||||
|
|
||||||
public Zone(List<Line> lines) {
|
public Zone(List<Line> lines) {
|
||||||
|
|
||||||
|
|||||||
@ -1,8 +1,13 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.readingorder;
|
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.readingorder;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Zone;
|
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Zone;
|
||||||
|
|
||||||
import lombok.AllArgsConstructor;
|
|
||||||
import lombok.Getter;
|
import lombok.Getter;
|
||||||
|
|
||||||
public class UnsupervisedReadingOrderDetector {
|
public class UnsupervisedReadingOrderDetector {
|
||||||
@ -13,17 +18,21 @@ public class UnsupervisedReadingOrderDetector {
|
|||||||
private double tolerance = 5;
|
private double tolerance = 5;
|
||||||
private ZoneComparator zoneComparator;
|
private ZoneComparator zoneComparator;
|
||||||
|
|
||||||
|
|
||||||
public boolean useRenderingOrder() {
|
public boolean useRenderingOrder() {
|
||||||
|
|
||||||
return useRenderingOrder;
|
return useRenderingOrder;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public UnsupervisedReadingOrderDetector() {
|
public UnsupervisedReadingOrderDetector() {
|
||||||
|
|
||||||
configureComparator();
|
configureComparator();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public UnsupervisedReadingOrderDetector(double tolerance, SpatialReasoningRules spatialReasoningRule, boolean useRenderingOrder) {
|
public UnsupervisedReadingOrderDetector(double tolerance, SpatialReasoningRules spatialReasoningRule, boolean useRenderingOrder) {
|
||||||
|
|
||||||
this.tolerance = tolerance;
|
this.tolerance = tolerance;
|
||||||
this.spatialReasoningRule = spatialReasoningRule;
|
this.spatialReasoningRule = spatialReasoningRule;
|
||||||
this.useRenderingOrder = useRenderingOrder;
|
this.useRenderingOrder = useRenderingOrder;
|
||||||
@ -33,12 +42,12 @@ public class UnsupervisedReadingOrderDetector {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public void configureComparator() { // Or a suitable method name
|
public void configureComparator() {
|
||||||
|
|
||||||
switch (spatialReasoningRule) {
|
switch (spatialReasoningRule) {
|
||||||
case COLUMN_WISE:
|
case COLUMN_WISE:
|
||||||
if (useRenderingOrder) {
|
if (useRenderingOrder) {
|
||||||
zoneComparator = (Zone z1, Zone z2, double t) ->
|
zoneComparator = (Zone z1, Zone z2, double t) -> getBeforeInReadingVertical(z1, z2, t) || getBeforeInRendering(z1, z2);
|
||||||
getBeforeInReadingVertical(z1, z2, t) || getBeforeInRendering(z1, z2);
|
|
||||||
} else {
|
} else {
|
||||||
zoneComparator = this::getBeforeInReadingVertical;
|
zoneComparator = this::getBeforeInReadingVertical;
|
||||||
}
|
}
|
||||||
@ -46,8 +55,7 @@ public class UnsupervisedReadingOrderDetector {
|
|||||||
|
|
||||||
case ROW_WISE:
|
case ROW_WISE:
|
||||||
if (useRenderingOrder) {
|
if (useRenderingOrder) {
|
||||||
zoneComparator = (Zone z1, Zone z2, double t) ->
|
zoneComparator = (Zone z1, Zone z2, double t) -> getBeforeInReadingHorizontal(z1, z2, t) || getBeforeInRendering(z1, z2);
|
||||||
getBeforeInReadingHorizontal(z1, z2, t) || getBeforeInRendering(z1, z2);
|
|
||||||
} else {
|
} else {
|
||||||
zoneComparator = this::getBeforeInReadingHorizontal;
|
zoneComparator = this::getBeforeInReadingHorizontal;
|
||||||
}
|
}
|
||||||
@ -56,8 +64,7 @@ public class UnsupervisedReadingOrderDetector {
|
|||||||
case BASIC:
|
case BASIC:
|
||||||
default:
|
default:
|
||||||
if (useRenderingOrder) {
|
if (useRenderingOrder) {
|
||||||
zoneComparator = (Zone z1, Zone z2, double t) ->
|
zoneComparator = (Zone z1, Zone z2, double t) -> getBeforeInReading(z1, z2, t) || getBeforeInRendering(z1, z2);
|
||||||
getBeforeInReading(z1, z2, t) || getBeforeInRendering(z1, z2);
|
|
||||||
} else {
|
} else {
|
||||||
zoneComparator = this::getBeforeInReading;
|
zoneComparator = this::getBeforeInReading;
|
||||||
}
|
}
|
||||||
@ -65,171 +72,207 @@ public class UnsupervisedReadingOrderDetector {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean getBeforeInRendering(Zone z1, Zone z2) {
|
|
||||||
double avgTextSequenceZ1 = z1.getTextLines().stream()
|
|
||||||
.flatMap(tl -> tl.getWords().stream())
|
|
||||||
.flatMap(w -> w.getLetters().stream())
|
|
||||||
.mapToDouble(l -> l.getTextSequence())
|
|
||||||
.average().orElse(0);
|
|
||||||
|
|
||||||
double avgTextSequenceZ2 = z2.getTextLines().stream()
|
public List<Zone> get(List<Zone> zones) {
|
||||||
.flatMap(tl -> tl.getWords().stream())
|
|
||||||
.flatMap(w -> w.getLetters().stream())
|
int readingOrder = 0;
|
||||||
.mapToDouble(l -> l.getTextSequence())
|
Map<Integer, List<Integer>> graph = buildGraph(zones);
|
||||||
.average().orElse(0);
|
|
||||||
|
List<Zone> orderedZones = new ArrayList<>();
|
||||||
|
|
||||||
|
while (!graph.isEmpty()) {
|
||||||
|
int maxCount = graph.values()
|
||||||
|
.stream()
|
||||||
|
.mapToInt(List::size)
|
||||||
|
.max()
|
||||||
|
.orElse(0);
|
||||||
|
|
||||||
|
Map.Entry<Integer, List<Integer>> current = graph.entrySet()
|
||||||
|
.stream()
|
||||||
|
.filter(entry -> entry.getValue().size() == maxCount)
|
||||||
|
.findFirst()
|
||||||
|
.orElse(null);
|
||||||
|
|
||||||
|
if (current != null) {
|
||||||
|
int index = current.getKey();
|
||||||
|
graph.remove(index);
|
||||||
|
|
||||||
|
for (List<Integer> valueList : graph.values()) {
|
||||||
|
valueList.remove(Integer.valueOf(index));
|
||||||
|
}
|
||||||
|
|
||||||
|
Zone zone = zones.get(index);
|
||||||
|
zone.setReadingOrder(readingOrder++);
|
||||||
|
orderedZones.add(zone);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return orderedZones;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private Map<Integer, List<Integer>> buildGraph(List<Zone> zones) {
|
||||||
|
|
||||||
|
Map<Integer, List<Integer>> graph = new HashMap<>();
|
||||||
|
|
||||||
|
for (int i = 0; i < zones.size(); i++) {
|
||||||
|
graph.put(i, new ArrayList<>());
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int i = 0; i < zones.size(); i++) {
|
||||||
|
Zone zone1 = zones.get(i);
|
||||||
|
for (int j = 0; j < zones.size(); j++) {
|
||||||
|
if (i == j) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
Zone zone2 = zones.get(j);
|
||||||
|
|
||||||
|
if (zoneComparator.isBefore(zone1, zone2, tolerance)) {
|
||||||
|
graph.get(i).add(j);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return graph;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private boolean getBeforeInRendering(Zone z1, Zone z2) {
|
||||||
|
|
||||||
|
double avgTextSequenceZ1 = z1.getLines()
|
||||||
|
.stream()
|
||||||
|
.flatMap(line -> line.getCharacters()
|
||||||
|
.stream())
|
||||||
|
.map(character -> character.getTextPosition().getTextSequence())
|
||||||
|
.collect(Collectors.averagingDouble(Integer::intValue));
|
||||||
|
|
||||||
|
double avgTextSequenceZ2 = z2.getLines()
|
||||||
|
.stream()
|
||||||
|
.flatMap(line -> line.getCharacters()
|
||||||
|
.stream())
|
||||||
|
.map(character -> character.getTextPosition().getTextSequence())
|
||||||
|
.collect(Collectors.averagingDouble(Integer::intValue));
|
||||||
|
|
||||||
return avgTextSequenceZ1 < avgTextSequenceZ2;
|
return avgTextSequenceZ1 < avgTextSequenceZ2;
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean getBeforeInReading(Zone z1, Zone z2, double T) {
|
|
||||||
IntervalRelations xRelation = getIntervalRelationX(z1, z2, T);
|
|
||||||
IntervalRelations yRelation = getIntervalRelationY(z1, z2, T);
|
|
||||||
|
|
||||||
return xRelation == IntervalRelations.PRECEDES ||
|
private boolean getBeforeInReading(Zone z1, Zone z2, double tolerance) {
|
||||||
yRelation == IntervalRelations.PRECEDES ||
|
|
||||||
xRelation == IntervalRelations.MEETS ||
|
|
||||||
yRelation == IntervalRelations.MEETS ||
|
|
||||||
xRelation == IntervalRelations.OVERLAPS ||
|
|
||||||
yRelation == IntervalRelations.OVERLAPS;
|
|
||||||
}
|
|
||||||
|
|
||||||
private boolean getBeforeInReadingVertical(Zone z1, Zone z2, double T) {
|
IntervalRelations xRelation = getIntervalRelationX(z1, z2, tolerance);
|
||||||
IntervalRelations xRelation = getIntervalRelationX(z1, z2, T);
|
IntervalRelations yRelation = getIntervalRelationY(z1, z2, tolerance);
|
||||||
IntervalRelations yRelation = getIntervalRelationY(z1, z2, T);
|
|
||||||
|
|
||||||
return xRelation == IntervalRelations.PRECEDES ||
|
return xRelation == IntervalRelations.PRECEDES
|
||||||
xRelation == IntervalRelations.MEETS ||
|
|| yRelation == IntervalRelations.PRECEDES
|
||||||
(xRelation == IntervalRelations.OVERLAPS && (yRelation == IntervalRelations.PRECEDES ||
|
|| xRelation == IntervalRelations.MEETS
|
||||||
yRelation == IntervalRelations.MEETS ||
|
|| yRelation == IntervalRelations.MEETS
|
||||||
yRelation == IntervalRelations.OVERLAPS)) ||
|
|| xRelation == IntervalRelations.OVERLAPS
|
||||||
((yRelation == IntervalRelations.PRECEDES || yRelation == IntervalRelations.MEETS ||
|
|| yRelation == IntervalRelations.OVERLAPS;
|
||||||
yRelation == IntervalRelations.OVERLAPS) &&
|
|
||||||
(xRelation == IntervalRelations.PRECEDES ||
|
|
||||||
xRelation == IntervalRelations.MEETS ||
|
|
||||||
xRelation == IntervalRelations.OVERLAPS ||
|
|
||||||
xRelation == IntervalRelations.STARTS ||
|
|
||||||
xRelation == IntervalRelations.FINISHES_INVERSE ||
|
|
||||||
xRelation == IntervalRelations.EQUALS ||
|
|
||||||
xRelation == IntervalRelations.DURING ||
|
|
||||||
xRelation == IntervalRelations.DURING_INVERSE ||
|
|
||||||
xRelation == IntervalRelations.FINISHES ||
|
|
||||||
xRelation == IntervalRelations.STARTS_INVERSE ||
|
|
||||||
xRelation == IntervalRelations.OVERLAPS_INVERSE));
|
|
||||||
}
|
|
||||||
|
|
||||||
private boolean getBeforeInReadingHorizontal(Zone z1, Zone z2, double T) {
|
|
||||||
IntervalRelations xRelation = getIntervalRelationX(z1, z2, T);
|
|
||||||
IntervalRelations yRelation = getIntervalRelationY(z1, z2, T);
|
|
||||||
|
|
||||||
return yRelation == IntervalRelations.PRECEDES ||
|
|
||||||
yRelation == IntervalRelations.MEETS ||
|
|
||||||
(yRelation == IntervalRelations.OVERLAPS && (xRelation == IntervalRelations.PRECEDES ||
|
|
||||||
xRelation == IntervalRelations.MEETS ||
|
|
||||||
xRelation == IntervalRelations.OVERLAPS)) ||
|
|
||||||
((xRelation == IntervalRelations.PRECEDES || xRelation == IntervalRelations.MEETS ||
|
|
||||||
xRelation == IntervalRelations.OVERLAPS) &&
|
|
||||||
(yRelation == IntervalRelations.PRECEDES ||
|
|
||||||
yRelation == IntervalRelations.MEETS ||
|
|
||||||
yRelation == IntervalRelations.OVERLAPS ||
|
|
||||||
yRelation == IntervalRelations.STARTS ||
|
|
||||||
yRelation == IntervalRelations.FINISHES_INVERSE ||
|
|
||||||
yRelation == IntervalRelations.EQUALS ||
|
|
||||||
yRelation == IntervalRelations.DURING ||
|
|
||||||
yRelation == IntervalRelations.DURING_INVERSE ||
|
|
||||||
yRelation == IntervalRelations.FINISHES ||
|
|
||||||
yRelation == IntervalRelations.STARTS_INVERSE ||
|
|
||||||
yRelation == IntervalRelations.OVERLAPS_INVERSE));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public class ZoneComparator {
|
private boolean getBeforeInReadingVertical(Zone z1, Zone z2, double tolerance) {
|
||||||
|
|
||||||
// Other methods and classes...
|
IntervalRelations xRelation = getIntervalRelationX(z1, z2, tolerance);
|
||||||
|
IntervalRelations yRelation = getIntervalRelationY(z1, z2, tolerance);
|
||||||
|
|
||||||
private static IntervalRelations getIntervalRelationX(Zone z1, Zone z2, double T) {
|
return getIntervalRelations(xRelation, yRelation);
|
||||||
if (z1.getX() < z2.getX() + z2.getWidth() - T) {
|
}
|
||||||
return IntervalRelations.PRECEDES;
|
|
||||||
} else if (z1.getX() >= z2.getX() + z2.getWidth() - T) {
|
|
||||||
return IntervalRelations.PRECEDESI;
|
|
||||||
} else if (z2.getX() + z2.getWidth() - T <= z1.getX()
|
|
||||||
&& z1.getX() <= z2.getX() + z2.getWidth() + T) {
|
|
||||||
return IntervalRelations.MEETS;
|
|
||||||
} else if (z2.getX() + z2.getWidth() - T > z1.getX()
|
|
||||||
&& z1.getX() > z2.getX() + z2.getWidth() + T) {
|
|
||||||
return IntervalRelations.MEETSI;
|
|
||||||
} else if (z1.getX() + z1.getWidth() < z2.getX() + z2.getWidth() - T
|
|
||||||
&& (z2.getX() + z2.getWidth() + T < z1.getX()
|
|
||||||
&& z1.getX() < z2.getX() - T)) {
|
|
||||||
return IntervalRelations.OVERLAPSI;
|
|
||||||
} else if (z1.getX() + z1.getWidth() >= z2.getX() + z2.getWidth() - T
|
|
||||||
&& (z2.getX() + z2.getWidth() + T >= z1.getX()
|
|
||||||
&& z1.getX() >= z2.getX() - T)) {
|
|
||||||
return IntervalRelations.OVERLAPS;
|
|
||||||
} else if (z2.getX() + z2.getWidth() - T <= z1.getX() + z1.getWidth()
|
|
||||||
&& z1.getX() + z1.getWidth() <= z2.getX() + z2.getWidth() + T
|
|
||||||
&& z1.getX() < z2.getX() - T) {
|
|
||||||
return IntervalRelations.STARTSI;
|
|
||||||
} else if (z2.getX() + z2.getWidth() - T > z1.getX() + z1.getWidth()
|
|
||||||
&& z1.getX() + z1.getWidth() > z2.getX() + z2.getWidth() + T
|
|
||||||
&& z1.getX() >= z2.getX() - T) {
|
|
||||||
return IntervalRelations.STARTS;
|
|
||||||
} else if (z1.getX() + z1.getWidth() > z2.getX() + z2.getWidth() + T
|
|
||||||
&& z1.getX() < z2.getX() - T) {
|
|
||||||
return IntervalRelations.DURINGI;
|
|
||||||
} else if (z1.getX() + z1.getWidth() <= z2.getX() + z2.getWidth() + T
|
|
||||||
&& z1.getX() >= z2.getX() - T) {
|
|
||||||
return IntervalRelations.DURING;
|
|
||||||
} else if (z1.getX() + z1.getWidth() > z2.getX() + z2.getWidth() + T
|
|
||||||
&& (z2.getX() - T <= z1.getX()
|
|
||||||
&& z1.getX() <= z2.getX() + T)) {
|
|
||||||
return IntervalRelations.FINISHESI;
|
|
||||||
} else if (z1.getX() + z1.getWidth() <= z2.getX() + z2.getWidth() + T
|
|
||||||
&& (z2.getX() - T > z1.getX()
|
|
||||||
&& z1.getX() > z2.getX() + T)) {
|
|
||||||
return IntervalRelations.FINISHES;
|
|
||||||
} else if (z2.getX() + z2.getWidth() - T <= z1.getX() + z1.getWidth()
|
|
||||||
&& z1.getX() + z1.getWidth() <= z2.getX() + z2.getWidth() + T
|
|
||||||
&& (z2.getX() - T <= z1.getX()
|
|
||||||
&& z1.getX() <= z2.getX() + T)) {
|
|
||||||
return IntervalRelations.EQUALS;
|
|
||||||
}
|
|
||||||
|
|
||||||
return IntervalRelations.UNKNOWN;
|
|
||||||
|
private static boolean getIntervalRelations(IntervalRelations relation1, IntervalRelations relation2) {
|
||||||
|
|
||||||
|
return relation1 == IntervalRelations.PRECEDES
|
||||||
|
|| relation1 == IntervalRelations.MEETS
|
||||||
|
|| (relation2 == IntervalRelations.PRECEDES || relation2 == IntervalRelations.MEETS || relation2 == IntervalRelations.OVERLAPS) && //
|
||||||
|
(relation1 == IntervalRelations.OVERLAPS
|
||||||
|
|| relation1 == IntervalRelations.STARTS
|
||||||
|
|| relation1 == IntervalRelations.FINISHES_INVERSE
|
||||||
|
|| relation1 == IntervalRelations.EQUALS
|
||||||
|
|| relation1 == IntervalRelations.DURING
|
||||||
|
|| relation1 == IntervalRelations.DURING_INVERSE
|
||||||
|
|| relation1 == IntervalRelations.FINISHES
|
||||||
|
|| relation1 == IntervalRelations.STARTS_INVERSE
|
||||||
|
|| relation1 == IntervalRelations.OVERLAPS_INVERSE);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private boolean getBeforeInReadingHorizontal(Zone z1, Zone z2, double tolerance) {
|
||||||
|
|
||||||
|
IntervalRelations xRelation = getIntervalRelationX(z1, z2, tolerance);
|
||||||
|
IntervalRelations yRelation = getIntervalRelationY(z1, z2, tolerance);
|
||||||
|
|
||||||
|
return getIntervalRelations(yRelation, xRelation);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static IntervalRelations getIntervalRelationX(Zone z1, Zone z2, double t) {
|
||||||
|
|
||||||
|
if (z1.getX() + z1.getWidth() < z2.getX() - t) {
|
||||||
|
return IntervalRelations.PRECEDES;
|
||||||
|
} else if (z1.getX() + z1.getWidth() >= z2.getX() - t) {
|
||||||
|
return IntervalRelations.PRECEDES_INVERSE;
|
||||||
|
} else if (z2.getX() - t <= z1.getX() + z1.getWidth() && z1.getX() + z1.getWidth() <= z2.getX() + t) {
|
||||||
|
return IntervalRelations.MEETS;
|
||||||
|
} else if (z2.getX() - t > z1.getX() + z1.getWidth() && z1.getX() + z1.getWidth() > z2.getX() + t) {
|
||||||
|
return IntervalRelations.MEETS_INVERSE;
|
||||||
|
} else if (z1.getX() < z2.getX() - t && (z2.getX() + t < z1.getX() + z1.getWidth() && z1.getX() + z1.getWidth() < z2.getX() + z2.getWidth() - t)) {
|
||||||
|
return IntervalRelations.OVERLAPS;
|
||||||
|
} else if (z1.getX() >= z2.getX() - t && (z2.getX() + t >= z1.getX() + z1.getWidth() && z1.getX() + z1.getWidth() >= z2.getX() + z2.getWidth() - t)) {
|
||||||
|
return IntervalRelations.OVERLAPS_INVERSE;
|
||||||
|
} else if (z2.getX() - t <= z1.getX() && z1.getX() <= z2.getX() + t && z1.getX() + z1.getWidth() < z2.getX() + z2.getWidth() - t) {
|
||||||
|
return IntervalRelations.STARTS;
|
||||||
|
} else if (z2.getX() - t > z1.getX() && z1.getX() > z2.getX() + t && z1.getX() + z1.getWidth() >= z2.getX() + z2.getWidth() - t) {
|
||||||
|
return IntervalRelations.STARTS_INVERSE;
|
||||||
|
} else if (z1.getX() > z2.getX() + t && z1.getX() + z1.getWidth() < z2.getX() + z2.getWidth() - t) {
|
||||||
|
return IntervalRelations.DURING;
|
||||||
|
} else if (z1.getX() <= z2.getX() + t && z1.getX() + z1.getWidth() >= z2.getX() + z2.getWidth() - t) {
|
||||||
|
return IntervalRelations.DURING_INVERSE;
|
||||||
|
} else if (z1.getX() > z2.getX() + t && (z2.getX() + z2.getWidth() - t <= z1.getX() + z1.getWidth() && z1.getX() + z1.getWidth() <= z2.getX() + z2.getWidth() + t)) {
|
||||||
|
return IntervalRelations.FINISHES;
|
||||||
|
} else if (z1.getX() <= z2.getX() + t && (z2.getX() + z2.getWidth() - t > z1.getX() + z1.getWidth() && z1.getX() + z1.getWidth() > z2.getX() + z2.getWidth() + t)) {
|
||||||
|
return IntervalRelations.FINISHES_INVERSE;
|
||||||
|
} else if (z2.getX() - t <= z1.getX() && z1.getX() <= z2.getX() + t && (z2.getX() + z2.getWidth() - t <= z1.getX() + z1.getWidth()
|
||||||
|
&& z1.getX() + z1.getWidth() <= z2.getX() + z2.getWidth() + t)) {
|
||||||
|
return IntervalRelations.EQUALS;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static IntervalRelations getIntervalRelationY(Zone z1, Zone z2, double T) {
|
return IntervalRelations.UNKNOWN;
|
||||||
if (z1.getBottom() < z2.getTop() - T) {
|
}
|
||||||
return IntervalRelations.PRECEDESI;
|
|
||||||
} else if (z1.getBottom() >= z2.getTop() - T) {
|
|
||||||
return IntervalRelations.PRECEDES;
|
|
||||||
} else if (z2.getTop() - T <= z1.getBottom()
|
|
||||||
&& z1.getBottom() <= z2.getTop() + T) {
|
|
||||||
return IntervalRelations.MEETSI;
|
|
||||||
} else if (z2.getTop() - T > z1.getBottom()
|
|
||||||
&& z1.getBottom() > z2.getTop() + T) {
|
|
||||||
return IntervalRelations.MEETS;
|
|
||||||
} else if (z1.getTop() < z2.getTop() - T
|
|
||||||
&& (z2.getTop() + T < z1.getBottom()
|
|
||||||
&& z1.getBottom() < z2.getBottom() - T)) {
|
|
||||||
return IntervalRelations.OVERLAPSI;
|
|
||||||
} else if (z1.getTop() >= z2.getTop() - T
|
|
||||||
&& (z2.getTop() + T >= z1.getBottom()
|
|
||||||
&& z1.getBottom() >= z2.getBottom() - T)) {
|
|
||||||
return IntervalRelations.OVERLAPS;
|
|
||||||
} else if (z2.getTop() - T <= z1.getTop()
|
|
||||||
&& z1.getTop() <= z2.getTop() + T
|
|
||||||
&& z1.getBottom() < z2.getBottom() - T) {
|
|
||||||
return IntervalRelations.STARTSI;
|
|
||||||
} else if (z2.getTop() - T > z1.getTop()
|
|
||||||
&& z1.getTop() > z2.getTop() + T
|
|
||||||
&& z1.getBottom() >= z2.getBottom() - T) {
|
|
||||||
return IntervalRelations.STARTS;
|
|
||||||
} else if (z1.getTop() > z2.getTop() + T
|
|
||||||
&& z1.getBottom() < z2.getBottom() - T) {
|
|
||||||
return IntervalRelations.DURINGI;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
private static IntervalRelations getIntervalRelationY(Zone z1, Zone z2, double t) {
|
||||||
|
|
||||||
|
if (z1.getY() + z1.getWidth() < z2.getY() - t) {
|
||||||
|
return IntervalRelations.PRECEDES;
|
||||||
|
} else if (z1.getY() + z1.getWidth() >= z2.getY() - t) {
|
||||||
|
return IntervalRelations.PRECEDES_INVERSE;
|
||||||
|
} else if (z2.getY() - t <= z1.getY() + z1.getWidth() && z1.getY() + z1.getWidth() <= z2.getY() + t) {
|
||||||
|
return IntervalRelations.MEETS;
|
||||||
|
} else if (z2.getY() - t > z1.getY() + z1.getWidth() && z1.getY() + z1.getWidth() > z2.getY() + t) {
|
||||||
|
return IntervalRelations.MEETS_INVERSE;
|
||||||
|
} else if (z1.getY() < z2.getY() - t && (z2.getY() + t < z1.getY() + z1.getWidth() && z1.getY() + z1.getWidth() < z2.getY() + z2.getWidth() - t)) {
|
||||||
|
return IntervalRelations.OVERLAPS;
|
||||||
|
} else if (z1.getY() >= z2.getY() - t && (z2.getY() + t >= z1.getY() + z1.getWidth() && z1.getY() + z1.getWidth() >= z2.getY() + z2.getWidth() - t)) {
|
||||||
|
return IntervalRelations.OVERLAPS_INVERSE;
|
||||||
|
} else if (z2.getY() - t <= z1.getY() && z1.getY() <= z2.getY() + t && z1.getY() + z1.getWidth() < z2.getY() + z2.getWidth() - t) {
|
||||||
|
return IntervalRelations.STARTS;
|
||||||
|
} else if (z2.getY() - t > z1.getY() && z1.getY() > z2.getY() + t && z1.getY() + z1.getWidth() >= z2.getY() + z2.getWidth() - t) {
|
||||||
|
return IntervalRelations.STARTS_INVERSE;
|
||||||
|
} else if (z1.getY() > z2.getY() + t && z1.getY() + z1.getWidth() < z2.getY() + z2.getWidth() - t) {
|
||||||
|
return IntervalRelations.DURING;
|
||||||
|
} else if (z1.getY() <= z2.getY() + t && z1.getY() + z1.getWidth() >= z2.getY() + z2.getWidth() - t) {
|
||||||
|
return IntervalRelations.DURING_INVERSE;
|
||||||
|
} else if (z1.getY() > z2.getY() + t && (z2.getY() + z2.getWidth() - t <= z1.getY() + z1.getWidth() && z1.getY() + z1.getWidth() <= z2.getY() + z2.getWidth() + t)) {
|
||||||
|
return IntervalRelations.FINISHES;
|
||||||
|
} else if (z1.getY() <= z2.getY() + t && (z2.getY() + z2.getWidth() - t > z1.getY() + z1.getWidth() && z1.getY() + z1.getWidth() > z2.getY() + z2.getWidth() + t)) {
|
||||||
|
return IntervalRelations.FINISHES_INVERSE;
|
||||||
|
} else if (z2.getY() - t <= z1.getY() && z1.getY() <= z2.getY() + t && (z2.getY() + z2.getWidth() - t <= z1.getY() + z1.getWidth()
|
||||||
|
&& z1.getY() + z1.getWidth() <= z2.getY() + z2.getWidth() + t)) {
|
||||||
|
return IntervalRelations.EQUALS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return IntervalRelations.UNKNOWN;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -9,6 +9,8 @@ import org.springframework.stereotype.Service;
|
|||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.BoundingBox;
|
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.BoundingBox;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Zone;
|
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Zone;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.readingorder.SpatialReasoningRules;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.readingorder.UnsupervisedReadingOrderDetector;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.utils.DoubleUtils;
|
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.utils.DoubleUtils;
|
||||||
|
|
||||||
@Service
|
@Service
|
||||||
@ -17,7 +19,20 @@ public class ReadingOrderService {
|
|||||||
private static final double THRESHOLD = 1;
|
private static final double THRESHOLD = 1;
|
||||||
|
|
||||||
|
|
||||||
public List<Zone> resolve(List<Zone> zones, boolean xyOrder) {
|
public List<Zone> resolve(List<Zone> zones, boolean columnWise) {
|
||||||
|
|
||||||
|
if (zones.isEmpty() || zones.size() == 1) {
|
||||||
|
return zones;
|
||||||
|
}
|
||||||
|
|
||||||
|
SpatialReasoningRules spatialReasoningRules = columnWise ? SpatialReasoningRules.COLUMN_WISE : SpatialReasoningRules.ROW_WISE;
|
||||||
|
|
||||||
|
var unsupervisedReadingOrderDetector = new UnsupervisedReadingOrderDetector(5, spatialReasoningRules, true);
|
||||||
|
|
||||||
|
return unsupervisedReadingOrderDetector.get(zones);
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<Zone> resolveOld(List<Zone> zones, boolean xyOrder) {
|
||||||
|
|
||||||
if (zones.isEmpty() || zones.size() == 1) {
|
if (zones.isEmpty() || zones.size() == 1) {
|
||||||
return zones;
|
return zones;
|
||||||
|
|||||||
@ -257,7 +257,7 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
|||||||
|
|
||||||
if (textPositions.get(i).getDir() != direction && startIndex != i) {
|
if (textPositions.get(i).getDir() != direction && startIndex != i) {
|
||||||
List<TextPosition> sublist = textPositions.subList(startIndex, i);
|
List<TextPosition> sublist = textPositions.subList(startIndex, i);
|
||||||
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart));
|
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart, textPositionSequences.size()));
|
||||||
startIndex = i;
|
startIndex = i;
|
||||||
direction = textPositions.get(i).getDir();
|
direction = textPositions.get(i).getDir();
|
||||||
}
|
}
|
||||||
@ -266,7 +266,7 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
|||||||
if (checkIfCurrentPositionIsToTheRightOfPreviousPosition(i, textPositions)) {
|
if (checkIfCurrentPositionIsToTheRightOfPreviousPosition(i, textPositions)) {
|
||||||
List<TextPosition> sublist = textPositions.subList(startIndex, i);
|
List<TextPosition> sublist = textPositions.subList(startIndex, i);
|
||||||
if (checkIfSequenceContainsOnlyWhitespaces(sublist)) {
|
if (checkIfSequenceContainsOnlyWhitespaces(sublist)) {
|
||||||
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart));
|
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart, textPositionSequences.size()));
|
||||||
}
|
}
|
||||||
startIndex = i;
|
startIndex = i;
|
||||||
}
|
}
|
||||||
@ -274,7 +274,7 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
|||||||
if (textPositions.get(i).getRotation() == 0 && i > 0 && textPositions.get(i).getX() > textPositions.get(i - 1).getEndX() + 1) {
|
if (textPositions.get(i).getRotation() == 0 && i > 0 && textPositions.get(i).getX() > textPositions.get(i - 1).getEndX() + 1) {
|
||||||
List<TextPosition> sublist = textPositions.subList(startIndex, i);
|
List<TextPosition> sublist = textPositions.subList(startIndex, i);
|
||||||
if (checkIfSequenceContainsOnlyWhitespaces(sublist)) {
|
if (checkIfSequenceContainsOnlyWhitespaces(sublist)) {
|
||||||
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart));
|
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart, textPositionSequences.size()));
|
||||||
}
|
}
|
||||||
startIndex = i;
|
startIndex = i;
|
||||||
}
|
}
|
||||||
@ -288,10 +288,10 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
|||||||
// Remove false sequence ends (whitespaces)
|
// Remove false sequence ends (whitespaces)
|
||||||
if (checkIfGapSizeBetweenCharactersSmallerThanMaximum(previous, sublist, 0.01f)) {
|
if (checkIfGapSizeBetweenCharactersSmallerThanMaximum(previous, sublist, 0.01f)) {
|
||||||
for (TextPosition t : sublist) {
|
for (TextPosition t : sublist) {
|
||||||
textPositionSequences.get(textPositionSequences.size() - 1).add(t);
|
textPositionSequences.get(textPositionSequences.size() - 1).add(t, textPositionSequences.size());
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart));
|
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart, textPositionSequences.size()));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
startIndex = i + 1;
|
startIndex = i + 1;
|
||||||
@ -311,10 +311,10 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
|||||||
if (previous != null && sublist.get(0).getYDirAdj() == previous.getYDirAdj() && sublist.get(0)
|
if (previous != null && sublist.get(0).getYDirAdj() == previous.getYDirAdj() && sublist.get(0)
|
||||||
.getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()) < 0.01) {
|
.getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()) < 0.01) {
|
||||||
for (TextPosition t : sublist) {
|
for (TextPosition t : sublist) {
|
||||||
textPositionSequences.get(textPositionSequences.size() - 1).add(t);
|
textPositionSequences.get(textPositionSequences.size() - 1).add(t, textPositionSequences.size());
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, isParagraphStart));
|
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, isParagraphStart, textPositionSequences.size()));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
super.writeString(text);
|
super.writeString(text);
|
||||||
|
|||||||
@ -43,7 +43,7 @@ public class MarkedContentUtils {
|
|||||||
|
|
||||||
return markedContentByYPosition.values().stream()
|
return markedContentByYPosition.values().stream()
|
||||||
.map(textPositions -> new TextPositionSequence(textPositions.stream()
|
.map(textPositions -> new TextPositionSequence(textPositions.stream()
|
||||||
.toList(), 0, true)
|
.toList(), 0, true, 0)
|
||||||
.getRectangle())
|
.getRectangle())
|
||||||
.map(t -> new Rectangle2D.Float(t.getTopLeft().getX(), t.getTopLeft().getY() - Math.abs(t.getHeight()), t.getWidth(), Math.abs(t.getHeight()))).collect(Collectors.toList());
|
.map(t -> new Rectangle2D.Float(t.getTopLeft().getX(), t.getTopLeft().getY() - Math.abs(t.getHeight()), t.getWidth(), Math.abs(t.getHeight()))).collect(Collectors.toList());
|
||||||
}
|
}
|
||||||
|
|||||||
@ -26,14 +26,14 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
|||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public void testViewerDocument() {
|
public void testViewerDocument() {
|
||||||
|
|
||||||
String fileName = "files/Plenarprotokoll 1 (keine Druchsache!) (1).pdf";
|
String fileName = "files/new/A8240D TRESO dRR Part B Section 9 core - Germany Commenting - 07_05_2019.pdf";
|
||||||
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
|
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
|
||||||
|
|
||||||
var documentFile = new ClassPathResource(fileName).getFile();
|
var documentFile = new ClassPathResource(fileName).getFile();
|
||||||
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null);
|
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null);
|
||||||
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
|
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
|
||||||
|
|
||||||
Document document = buildGraph(fileName, LayoutParsingType.DOCSTRUM);
|
Document document = buildGraph(fileName, LayoutParsingType.DOCSTRUM_XY);
|
||||||
long start = System.currentTimeMillis();
|
long start = System.currentTimeMillis();
|
||||||
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true);
|
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true);
|
||||||
System.out.printf("Total time: %.2fs%n", ((float) (System.currentTimeMillis() - start)) / 1000);
|
System.out.printf("Total time: %.2fs%n", ((float) (System.currentTimeMillis() - start)) / 1000);
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user