diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/DocstrumSegmenter.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/DocstrumSegmenter.java
index 896658f..f660f6d 100644
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/DocstrumSegmenter.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/DocstrumSegmenter.java
@@ -1,22 +1,14 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.Comparator;
import java.util.List;
import java.util.stream.Collectors;
import org.springframework.stereotype.Service;
-import com.google.common.collect.Lists;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
-import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.Line;
-import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.Zone;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.docstrum.Character;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.docstrum.CharacterLine;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.docstrum.CharacterZone;
-import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.utils.BoundingBoxBuilder;
-import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.utils.ZoneUtils;
import lombok.RequiredArgsConstructor;
@@ -29,21 +21,12 @@ public class DocstrumSegmenter {
private final LineBuilderService lineBuilderService;
private final ZoneBuilderService zoneBuilderService;
- public static final int MAX_ZONES_PER_PAGE = 300;
- /**
- * Word distance multiplier.
- *
- * Maximum distance between components that belong to the same word is
- * equal to the product of this value and estimated within-line spacing.
- */
- private static final double WORD_DIST_MULT = 0.2;
+ public List segmentPage(List textPositions) {
- public List segmentPage(List textPositions) {
+ var positions = textPositions.stream().map(TextPositionSequence::getTextPositions).flatMap(List::stream).toList();
- var positions = textPositions.stream().map(t -> t.getTextPositions()).flatMap(List::stream).collect(Collectors.toList());
-
- var components = positions.stream().map(chunk -> new Character(chunk)).collect(Collectors.toList());
+ var components = positions.stream().map(Character::new).collect(Collectors.toList());
nearestNeighbourService.findNearestNeighbors(components);
@@ -52,44 +35,8 @@ public class DocstrumSegmenter {
List lines = lineBuilderService.buildLines(components, characterSpacing, lineSpacing);
- List zones = zoneBuilderService.buildZones(lines, characterSpacing, lineSpacing);
- return convertToBxModel(zones, WORD_DIST_MULT * characterSpacing);
- }
+ return zoneBuilderService.buildZones(lines, characterSpacing, lineSpacing);
-
- private List convertToBxModel(List zones, double wordSpacing) {
-
- List zoneList = new ArrayList<>();
- if (zones.size() > MAX_ZONES_PER_PAGE) {
- CharacterZone oneZone = new CharacterZone();
- for (CharacterZone zone : zones) {
- oneZone.getLines().addAll(zone.getLines());
- }
- zones = new ArrayList<>();
- zones.add(oneZone);
- }
-
- for (CharacterZone characterZone : zones) {
- Zone zone = new Zone();
- for (CharacterLine line : characterZone.getLines()) {
- zone.addLine(line.convertToBxLine(wordSpacing));
- }
- List zLines = Lists.newArrayList(zone.getLines());
- Collections.sort(zLines, new Comparator() {
-
- @Override
- public int compare(Line o1, Line o2) {
-
- return Double.compare(o1.getbBox().getY(), o2.getbBox().getY());
- }
-
- });
- zone.setLines(zLines);
- BoundingBoxBuilder.setBounds(zone);
- zoneList.add(zone);
- }
- ZoneUtils.sortZonesYX(zoneList);
- return zoneList;
}
}
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/HierarchicalReadingOrderResolver.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/HierarchicalReadingOrderResolver.java
index 7b6fa43..0521dc5 100644
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/HierarchicalReadingOrderResolver.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/HierarchicalReadingOrderResolver.java
@@ -9,7 +9,7 @@ import java.util.List;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.BBoxObject;
-import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.Zone;
+import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.docstrum.CharacterZone;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.utils.DoubleUtils;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.readingorder.BBoxZoneGroup;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.readingorder.DistElem;
@@ -51,11 +51,11 @@ public class HierarchicalReadingOrderResolver {
};
- public List resolve(List zones) {
+ public List resolve(List zones) {
- List orderedZones;
+ List orderedZones;
if (zones.size() > MAX_ZONES) {
- orderedZones = new ArrayList(zones);
+ orderedZones = new ArrayList<>(zones);
Collections.sort(orderedZones, YX_ASCENDING_ORDER);
} else {
orderedZones = reorderZones(zones);
@@ -64,19 +64,19 @@ public class HierarchicalReadingOrderResolver {
}
- private List reorderZones(List unorderedZones) {
+ private List reorderZones(List unorderedZones) {
if (unorderedZones.isEmpty()) {
- return new ArrayList();
+ return new ArrayList<>();
} else if (unorderedZones.size() == 1) {
- List ret = new ArrayList(1);
+ List ret = new ArrayList<>(1);
ret.add(unorderedZones.get(0));
return ret;
} else {
BBoxZoneGroup bxZonesTree = groupZonesHierarchically(unorderedZones);
sortGroupedZones(bxZonesTree);
TreeToListConverter treeConverter = new TreeToListConverter();
- List orderedZones = treeConverter.convertToList(bxZonesTree);
+ List orderedZones = treeConverter.convertToList(bxZonesTree);
assert unorderedZones.size() == orderedZones.size();
return orderedZones;
}
@@ -90,15 +90,15 @@ public class HierarchicalReadingOrderResolver {
* @param zones is a list of unordered zones
* @return root of the zones clustered in a tree
*/
- private BBoxZoneGroup groupZonesHierarchically(List zones) {
+ private BBoxZoneGroup groupZonesHierarchically(List zones) {
/*
* Distance tuples are stored sorted by ascending distance value
*/
List> dists = new ArrayList>(zones.size() * zones.size() / 2);
for (int idx1 = 0; idx1 < zones.size(); ++idx1) {
for (int idx2 = idx1 + 1; idx2 < zones.size(); ++idx2) {
- Zone zone1 = zones.get(idx1);
- Zone zone2 = zones.get(idx2);
+ CharacterZone zone1 = zones.get(idx1);
+ CharacterZone zone2 = zones.get(idx2);
dists.add(new DistElem(false, distance(zone1, zone2), zone1, zone2));
}
}
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/LineBuilderService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/LineBuilderService.java
index 59a03fa..8f32a9e 100644
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/LineBuilderService.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/LineBuilderService.java
@@ -40,9 +40,9 @@ public class LineBuilderService {
sets.forEach(group -> {
List lineComponents = new ArrayList<>(group);
lineComponents.sort(Comparator.comparingDouble(Character::getX));
- lines.add(new CharacterLine(lineComponents));
+ lines.add(new CharacterLine(lineComponents, characterSpacing));
});
-
+
return lines;
}
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/ZoneBuilderService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/ZoneBuilderService.java
index a560aa4..ed30f34 100644
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/ZoneBuilderService.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/ZoneBuilderService.java
@@ -25,6 +25,8 @@ public class ZoneBuilderService {
private static final double ANGLE_TOLERANCE = Math.PI / 6;
+ public static final int MAX_ZONES = 300;
+
public List buildZones(List lines, double characterSpacing, double lineSpacing) {
@@ -64,6 +66,14 @@ public class ZoneBuilderService {
zones.add(new CharacterZone(new ArrayList<>(group)));
});
+ if (zones.size() > MAX_ZONES) {
+ List oneZoneLines = new ArrayList<>();
+ for (CharacterZone zone : zones) {
+ oneZoneLines.addAll(zone.getLines());
+ }
+ return List.of(new CharacterZone(oneZoneLines));
+ }
+
return zones;
}
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/refactor/docstrum/CharacterLine.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/refactor/docstrum/CharacterLine.java
index f6bc990..1d4c719 100644
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/refactor/docstrum/CharacterLine.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/refactor/docstrum/CharacterLine.java
@@ -1,8 +1,12 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.docstrum;
+import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
+import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
+import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.BBoxObject;
+import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.BoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.Line;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.Word;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.utils.BoundingBoxBuilder;
@@ -10,7 +14,9 @@ import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.mo
import lombok.Data;
@Data
-public class CharacterLine {
+public class CharacterLine extends BBoxObject {
+
+ private static final double WORD_DISTANCE_MULTIPLIER = 0.2;
private final double x0;
private final double y0;
@@ -21,9 +27,10 @@ public class CharacterLine {
private final double height;
private final List characters;
+ private final List words = new ArrayList<>();
- public CharacterLine(List characters) {
+ public CharacterLine(List characters, double wordSpacing) {
this.characters = characters;
@@ -55,6 +62,8 @@ public class CharacterLine {
throw new IllegalArgumentException("Component list must not be empty");
}
height = computeHeight();
+ computeWords(wordSpacing * WORD_DISTANCE_MULTIPLIER);
+ buildBox();
}
@@ -112,6 +121,25 @@ public class CharacterLine {
}
+    /**
+     * Splits the characters of this line into words and stores them in {@code words}.
+     *
+     * Characters are visited in list order. A new word is started whenever the horizontal gap between
+     * the end of the previous character (its direction-adjusted x plus width) and the start of the
+     * current character is strictly greater than {@code wordSpacing}.
+     *
+     * The constructor already invokes this method once. Because the method is public, previously
+     * computed words are discarded first; without that, a second call would append a duplicate of
+     * every word to the list.
+     *
+     * @param wordSpacing largest gap that may separate two characters of the same word
+     */
+    public void computeWords(double wordSpacing) {
+
+        // Start from a clean slate so repeated calls replace the result instead of accumulating it.
+        words.clear();
+
+        TextPositionSequence word = new TextPositionSequence();
+        Character previous = null;
+        for (Character current : characters) {
+            if (previous != null) {
+                double dist = current.getTextPosition().getXDirAdj() - previous.getTextPosition().getXDirAdj() - previous.getTextPosition().getWidthDirAdj();
+                if (dist > wordSpacing) {
+                    // Gap too wide: close the current word and open a new one.
+                    words.add(word);
+                    word = new TextPositionSequence();
+                }
+            }
+            word.getTextPositions().add(current.getTextPosition());
+            previous = current;
+        }
+        // The last word is still open when the loop ends.
+        words.add(word);
+    }
+
+
public Line convertToBxLine(double wordSpacing) {
Line line = new Line();
@@ -135,5 +163,25 @@ public class CharacterLine {
return line;
}
+
+    /**
+     * Computes the rectangle spanned by all characters of this line and stores it via {@code setbBox}.
+     *
+     * For every character the direction-adjusted x/y of its text position is taken as one corner and
+     * x + width / y + height as the opposite corner; the box is the min/max envelope of those values.
+     */
+    public void buildBox() {
+
+        double lowX = Double.POSITIVE_INFINITY;
+        double lowY = Double.POSITIVE_INFINITY;
+        double highX = Double.NEGATIVE_INFINITY;
+        double highY = Double.NEGATIVE_INFINITY;
+
+        for (Character character : characters) {
+            // 'var' keeps the getters' own numeric types, so the sums below are evaluated exactly as before.
+            var position = character.getTextPosition();
+            var x = position.getXDirAdj();
+            var y = position.getYDirAdj();
+
+            lowX = Math.min(lowX, x);
+            lowY = Math.min(lowY, y);
+            highX = Math.max(highX, x + position.getWidthDirAdj());
+            highY = Math.max(highY, y + position.getHeightDir());
+        }
+
+        this.setbBox(new BoundingBox(lowX, lowY, highX - lowX, highY - lowY));
+    }
+
}
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/refactor/docstrum/CharacterZone.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/refactor/docstrum/CharacterZone.java
index 97255d6..d47f89b 100644
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/refactor/docstrum/CharacterZone.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/refactor/docstrum/CharacterZone.java
@@ -1,24 +1,44 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.docstrum;
-import java.util.ArrayList;
+import java.util.Comparator;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.BBoxObject;
+import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.BoundingBox;
-import lombok.AllArgsConstructor;
import lombok.Data;
-import lombok.NoArgsConstructor;
@Data
-@NoArgsConstructor
-@AllArgsConstructor
public class CharacterZone extends BBoxObject {
- private List lines = new ArrayList<>();
+ private List lines;
+
+
+    /**
+     * Creates a zone from the given lines, ordered by ascending {@code getY()}, and computes its
+     * bounding box.
+     *
+     * The lines are copied before sorting, so the caller's list is neither reordered as a side effect
+     * nor required to be modifiable (sorting e.g. a {@code List.of(...)} in place would throw
+     * {@link UnsupportedOperationException}).
+     *
+     * @param lines the lines that make up this zone; must not be {@code null}
+     */
+    public CharacterZone(List lines) {
+
+        // Fully qualified on purpose: this file no longer imports java.util.ArrayList.
+        this.lines = new java.util.ArrayList<>(lines);
+        this.lines.sort(Comparator.comparingDouble(CharacterLine::getY));
+        buildBox();
+    }
+    /**
+     * Computes the rectangle spanned by all lines of this zone and stores it via {@code setbBox}.
+     *
+     * Each line contributes {@code getX()}/{@code getY()} as one corner and x + width / y + height as
+     * the opposite corner; the box is the min/max envelope of those values.
+     */
     public void buildBox() {
+
+        double lowX = Double.POSITIVE_INFINITY;
+        double lowY = Double.POSITIVE_INFINITY;
+        double highX = Double.NEGATIVE_INFINITY;
+        double highY = Double.NEGATIVE_INFINITY;
+
+        for (CharacterLine line : lines) {
+            // 'var' keeps the getters' own numeric types, so the sums below are evaluated exactly as before.
+            var x = line.getX();
+            var y = line.getY();
+
+            lowX = Math.min(lowX, x);
+            lowY = Math.min(lowY, y);
+            highX = Math.max(highX, x + line.getWidth());
+            highY = Math.max(highY, y + line.getHeight());
+        }
+
+        this.setbBox(new BoundingBox(lowX, lowY, highX - lowX, highY - lowY));
     }
}
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/readingorder/DocumentPlane.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/readingorder/DocumentPlane.java
index cf6a6ce..adcf323 100644
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/readingorder/DocumentPlane.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/readingorder/DocumentPlane.java
@@ -7,7 +7,7 @@ import java.util.Map;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.BBoxObject;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.BoundingBox;
-import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.Zone;
+import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.docstrum.CharacterZone;
/**
* A set-like data structure for objects placed on a plane. Can efficiently find objects in a certain rectangular area.
@@ -82,12 +82,12 @@ public class DocumentPlane {
}
- public DocumentPlane(List objectList, int gridSize) {
+ public DocumentPlane(List objectList, int gridSize) { // creates empty grid/objs containers, stores gridSize, then add()s every element of objectList
this.grid = new HashMap>();
this.objs = new ArrayList();
this.gridSize = gridSize;
- for (Zone obj : objectList) {
+ for (CharacterZone obj : objectList) { // only the element type changes here (Zone -> CharacterZone); each zone is still registered through add(obj)
add(obj);
}
}
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/readingorder/TreeToListConverter.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/readingorder/TreeToListConverter.java
index 7719246..81cdaf1 100644
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/readingorder/TreeToListConverter.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/readingorder/TreeToListConverter.java
@@ -3,25 +3,22 @@ package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.r
import java.util.ArrayList;
import java.util.List;
-import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.Zone;
+import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.docstrum.CharacterZone;
-/**
- * @author Pawel Szostek
- */
public class TreeToListConverter {
- public List convertToList(BBoxZoneGroup obj) {
+ public List convertToList(BBoxZoneGroup obj) {
- List ret = new ArrayList();
- if (obj.getLeftChild() instanceof Zone) {
- Zone zone = (Zone) obj.getLeftChild();
+ List ret = new ArrayList<>();
+ if (obj.getLeftChild() instanceof CharacterZone) {
+ CharacterZone zone = (CharacterZone) obj.getLeftChild();
ret.add(zone);
} else { // obj.getLeftChild() instanceof BxZoneGroup
ret.addAll(convertToList((BBoxZoneGroup) obj.getLeftChild()));
}
- if (obj.getRightChild() instanceof Zone) {
- Zone zone = (Zone) obj.getRightChild();
+ if (obj.getRightChild() instanceof CharacterZone) {
+ CharacterZone zone = (CharacterZone) obj.getRightChild();
ret.add(zone);
} else { // obj.getRightChild() instanceof BxZoneGroup
ret.addAll(convertToList((BBoxZoneGroup) obj.getRightChild()));