diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/DocstrumSegmenter.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/DocstrumSegmenter.java index 896658f..f660f6d 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/DocstrumSegmenter.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/DocstrumSegmenter.java @@ -1,22 +1,14 @@ package com.knecon.fforesight.service.layoutparser.processor.services.docstrum; -import java.util.ArrayList; -import java.util.Collections; -import java.util.Comparator; import java.util.List; import java.util.stream.Collectors; import org.springframework.stereotype.Service; -import com.google.common.collect.Lists; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; -import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.Line; -import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.Zone; import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.docstrum.Character; import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.docstrum.CharacterLine; import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.docstrum.CharacterZone; -import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.utils.BoundingBoxBuilder; -import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.utils.ZoneUtils; import lombok.RequiredArgsConstructor; @@ -29,21 +21,12 @@ public class DocstrumSegmenter { private final LineBuilderService lineBuilderService; private final ZoneBuilderService zoneBuilderService; - public static final int MAX_ZONES_PER_PAGE = 300; - /** - * Word distance multiplier. - *

- * Maximum distance between components that belong to the same word is - * equal to the product of this value and estimated within-line spacing. - */ - private static final double WORD_DIST_MULT = 0.2; + public List segmentPage(List textPositions) { - public List segmentPage(List textPositions) { + var positions = textPositions.stream().map(TextPositionSequence::getTextPositions).flatMap(List::stream).toList(); - var positions = textPositions.stream().map(t -> t.getTextPositions()).flatMap(List::stream).collect(Collectors.toList()); - - var components = positions.stream().map(chunk -> new Character(chunk)).collect(Collectors.toList()); + var components = positions.stream().map(Character::new).collect(Collectors.toList()); nearestNeighbourService.findNearestNeighbors(components); @@ -52,44 +35,8 @@ public class DocstrumSegmenter { List lines = lineBuilderService.buildLines(components, characterSpacing, lineSpacing); - List zones = zoneBuilderService.buildZones(lines, characterSpacing, lineSpacing); - return convertToBxModel(zones, WORD_DIST_MULT * characterSpacing); - } + return zoneBuilderService.buildZones(lines, characterSpacing, lineSpacing); - - private List convertToBxModel(List zones, double wordSpacing) { - - List zoneList = new ArrayList<>(); - if (zones.size() > MAX_ZONES_PER_PAGE) { - CharacterZone oneZone = new CharacterZone(); - for (CharacterZone zone : zones) { - oneZone.getLines().addAll(zone.getLines()); - } - zones = new ArrayList<>(); - zones.add(oneZone); - } - - for (CharacterZone characterZone : zones) { - Zone zone = new Zone(); - for (CharacterLine line : characterZone.getLines()) { - zone.addLine(line.convertToBxLine(wordSpacing)); - } - List zLines = Lists.newArrayList(zone.getLines()); - Collections.sort(zLines, new Comparator() { - - @Override - public int compare(Line o1, Line o2) { - - return Double.compare(o1.getbBox().getY(), o2.getbBox().getY()); - } - - }); - zone.setLines(zLines); - BoundingBoxBuilder.setBounds(zone); - zoneList.add(zone); - } - ZoneUtils.sortZonesYX(zoneList); - return zoneList; } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/HierarchicalReadingOrderResolver.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/HierarchicalReadingOrderResolver.java index 7b6fa43..0521dc5 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/HierarchicalReadingOrderResolver.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/HierarchicalReadingOrderResolver.java @@ -9,7 +9,7 @@ import java.util.List; import org.springframework.stereotype.Service; import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.BBoxObject; -import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.Zone; +import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.docstrum.CharacterZone; import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.utils.DoubleUtils; import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.readingorder.BBoxZoneGroup; import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.readingorder.DistElem; @@ -51,11 +51,11 @@ public class HierarchicalReadingOrderResolver { }; - public List resolve(List zones) { + public List resolve(List zones) { - List orderedZones; + List orderedZones; if (zones.size() > MAX_ZONES) { - orderedZones = new ArrayList(zones); + orderedZones = new ArrayList<>(zones); Collections.sort(orderedZones, YX_ASCENDING_ORDER); } else { orderedZones = reorderZones(zones); @@ -64,19 +64,19 @@ public class HierarchicalReadingOrderResolver { } - private List reorderZones(List unorderedZones) { + private List reorderZones(List unorderedZones) { if (unorderedZones.isEmpty()) { - return new ArrayList(); + return new ArrayList<>(); } else if (unorderedZones.size() == 1) { - List ret = new ArrayList(1); + List ret = new ArrayList<>(1); ret.add(unorderedZones.get(0)); return ret; } else { BBoxZoneGroup bxZonesTree = groupZonesHierarchically(unorderedZones); sortGroupedZones(bxZonesTree); TreeToListConverter treeConverter = new TreeToListConverter(); - List orderedZones = treeConverter.convertToList(bxZonesTree); + List orderedZones = treeConverter.convertToList(bxZonesTree); assert unorderedZones.size() == orderedZones.size(); return orderedZones; } @@ -90,15 +90,15 @@ public class HierarchicalReadingOrderResolver { * @param zones is a list of unordered zones * @return root of the zones clustered in a tree */ - private BBoxZoneGroup groupZonesHierarchically(List zones) { + private BBoxZoneGroup groupZonesHierarchically(List zones) { /* * Distance tuples are stored sorted by ascending distance value */ List> dists = new ArrayList>(zones.size() * zones.size() / 2); for (int idx1 = 0; idx1 < zones.size(); ++idx1) { for (int idx2 = idx1 + 1; idx2 < zones.size(); ++idx2) { - Zone zone1 = zones.get(idx1); - Zone zone2 = zones.get(idx2); + CharacterZone zone1 = zones.get(idx1); + CharacterZone zone2 = zones.get(idx2); dists.add(new DistElem(false, distance(zone1, zone2), zone1, zone2)); } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/LineBuilderService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/LineBuilderService.java index 59a03fa..8f32a9e 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/LineBuilderService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/LineBuilderService.java @@ -40,9 +40,9 @@ public class LineBuilderService { sets.forEach(group -> { List lineComponents = new ArrayList<>(group); lineComponents.sort(Comparator.comparingDouble(Character::getX)); - lines.add(new CharacterLine(lineComponents)); + lines.add(new CharacterLine(lineComponents, characterSpacing)); }); - + return lines; } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/ZoneBuilderService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/ZoneBuilderService.java index a560aa4..ed30f34 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/ZoneBuilderService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/ZoneBuilderService.java @@ -25,6 +25,8 @@ public class ZoneBuilderService { private static final double ANGLE_TOLERANCE = Math.PI / 6; + public static final int MAX_ZONES = 300; + public List buildZones(List lines, double characterSpacing, double lineSpacing) { @@ -64,6 +66,14 @@ public class ZoneBuilderService { zones.add(new CharacterZone(new ArrayList<>(group))); }); + if (zones.size() > MAX_ZONES) { + List oneZoneLines = new ArrayList<>(); + for (CharacterZone zone : zones) { + oneZoneLines.addAll(zone.getLines()); + } + return List.of(new CharacterZone(oneZoneLines)); + } + return zones; } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/refactor/docstrum/CharacterLine.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/refactor/docstrum/CharacterLine.java index f6bc990..1d4c719 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/refactor/docstrum/CharacterLine.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/refactor/docstrum/CharacterLine.java @@ -1,8 +1,12 @@ package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.docstrum; +import java.util.ArrayList; import java.util.Arrays; import java.util.List; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.BBoxObject; +import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.BoundingBox; import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.Line; import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.Word; import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.utils.BoundingBoxBuilder; @@ -10,7 +14,9 @@ import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.mo import lombok.Data; @Data -public class CharacterLine { +public class CharacterLine extends BBoxObject { + + private static final double WORD_DISTANCE_MULTIPLIER = 0.2; private final double x0; private final double y0; @@ -21,9 +27,10 @@ public class CharacterLine { private final double height; private final List characters; + private final List words = new ArrayList<>(); - public CharacterLine(List characters) { + public CharacterLine(List characters, double wordSpacing) { this.characters = characters; @@ -55,6 +62,8 @@ public class CharacterLine { throw new IllegalArgumentException("Component list must not be empty"); } height = computeHeight(); + computeWords(wordSpacing * WORD_DISTANCE_MULTIPLIER); + buildBox(); } @@ -112,6 +121,25 @@ public class CharacterLine { } + public void computeWords(double wordSpacing) { + + TextPositionSequence word = new TextPositionSequence(); + Character previous = null; + for (Character current : characters) { + if (previous != null) { + double dist = current.getTextPosition().getXDirAdj() - previous.getTextPosition().getXDirAdj() - previous.getTextPosition().getWidthDirAdj(); + if (dist > wordSpacing) { + words.add(word); + word = new TextPositionSequence(); + } + } + word.getTextPositions().add(current.getTextPosition()); + previous = current; + } + words.add(word); + } + + public Line convertToBxLine(double wordSpacing) { Line line = new Line(); @@ -135,5 +163,25 @@ public class CharacterLine { return line; } + + public void buildBox() { + + double minX = Double.POSITIVE_INFINITY; + double minY = Double.POSITIVE_INFINITY; + double maxX = Double.NEGATIVE_INFINITY; + double maxY = Double.NEGATIVE_INFINITY; + + for (Character character : characters) { + + minX = Math.min(minX, character.getTextPosition().getXDirAdj()); + minY = Math.min(minY, character.getTextPosition().getYDirAdj()); + maxX = Math.max(maxX, character.getTextPosition().getXDirAdj() + character.getTextPosition().getWidthDirAdj()); + maxY = Math.max(maxY, character.getTextPosition().getYDirAdj() + character.getTextPosition().getHeightDir()); + + } + + this.setbBox(new BoundingBox(minX, minY, maxX - minX, maxY - minY)); + } + } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/refactor/docstrum/CharacterZone.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/refactor/docstrum/CharacterZone.java index 97255d6..d47f89b 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/refactor/docstrum/CharacterZone.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/refactor/docstrum/CharacterZone.java @@ -1,24 +1,44 @@ package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.docstrum; -import java.util.ArrayList; +import java.util.Comparator; import java.util.List; import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.BBoxObject; +import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.BoundingBox; -import lombok.AllArgsConstructor; import lombok.Data; -import lombok.NoArgsConstructor; @Data -@NoArgsConstructor -@AllArgsConstructor public class CharacterZone extends BBoxObject { - private List lines = new ArrayList<>(); + private List lines; + + + public CharacterZone(List lines) { + + lines.sort(Comparator.comparingDouble(CharacterLine::getY)); + this.lines = lines; + buildBox(); + } public void buildBox() { + double minX = Double.POSITIVE_INFINITY; + double minY = Double.POSITIVE_INFINITY; + double maxX = Double.NEGATIVE_INFINITY; + double maxY = Double.NEGATIVE_INFINITY; + + for (CharacterLine line : lines) { + + minX = Math.min(minX, line.getX()); + minY = Math.min(minY, line.getY()); + maxX = Math.max(maxX, line.getX() + line.getWidth()); + maxY = Math.max(maxY, line.getY() + line.getHeight()); + + } + + this.setbBox(new BoundingBox(minX, minY, maxX - minX, maxY - minY)); } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/readingorder/DocumentPlane.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/readingorder/DocumentPlane.java index cf6a6ce..adcf323 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/readingorder/DocumentPlane.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/readingorder/DocumentPlane.java @@ -7,7 +7,7 @@ import java.util.Map; import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.BBoxObject; import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.BoundingBox; -import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.Zone; +import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.docstrum.CharacterZone; /** * A set-like data structure for objects placed on a plane. Can efficiently find objects in a certain rectangular area. @@ -82,12 +82,12 @@ public class DocumentPlane { } - public DocumentPlane(List objectList, int gridSize) { + public DocumentPlane(List objectList, int gridSize) { this.grid = new HashMap>(); this.objs = new ArrayList(); this.gridSize = gridSize; - for (Zone obj : objectList) { + for (CharacterZone obj : objectList) { add(obj); } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/readingorder/TreeToListConverter.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/readingorder/TreeToListConverter.java index 7719246..81cdaf1 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/readingorder/TreeToListConverter.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/readingorder/TreeToListConverter.java @@ -3,25 +3,22 @@ package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.r import java.util.ArrayList; import java.util.List; -import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.Zone; +import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.docstrum.CharacterZone; -/** - * @author Pawel Szostek - */ public class TreeToListConverter { - public List convertToList(BBoxZoneGroup obj) { + public List convertToList(BBoxZoneGroup obj) { - List ret = new ArrayList(); - if (obj.getLeftChild() instanceof Zone) { - Zone zone = (Zone) obj.getLeftChild(); + List ret = new ArrayList<>(); + if (obj.getLeftChild() instanceof CharacterZone) { + CharacterZone zone = (CharacterZone) obj.getLeftChild(); ret.add(zone); } else { // obj.getLeftChild() instanceof BxZoneGroup ret.addAll(convertToList((BBoxZoneGroup) obj.getLeftChild())); } - if (obj.getRightChild() instanceof Zone) { - Zone zone = (Zone) obj.getRightChild(); + if (obj.getRightChild() instanceof CharacterZone) { + CharacterZone zone = (CharacterZone) obj.getRightChild(); ret.add(zone); } else { // obj.getRightChild() instanceof BxZoneGroup ret.addAll(convertToList((BBoxZoneGroup) obj.getRightChild()));