More refactoring

This commit is contained in:
Dominique Eifländer 2024-02-16 13:48:03 +01:00
parent b2fb6829cb
commit e394f2fa7c
8 changed files with 113 additions and 91 deletions

View File

@ -1,22 +1,14 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum; package com.knecon.fforesight.service.layoutparser.processor.services.docstrum;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List; import java.util.List;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import org.springframework.stereotype.Service; import org.springframework.stereotype.Service;
import com.google.common.collect.Lists;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.Line;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.Zone;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.docstrum.Character; import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.docstrum.Character;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.docstrum.CharacterLine; import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.docstrum.CharacterLine;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.docstrum.CharacterZone; import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.docstrum.CharacterZone;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.utils.BoundingBoxBuilder;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.utils.ZoneUtils;
import lombok.RequiredArgsConstructor; import lombok.RequiredArgsConstructor;
@ -29,21 +21,12 @@ public class DocstrumSegmenter {
private final LineBuilderService lineBuilderService; private final LineBuilderService lineBuilderService;
private final ZoneBuilderService zoneBuilderService; private final ZoneBuilderService zoneBuilderService;
public static final int MAX_ZONES_PER_PAGE = 300;
/**
* Word distance multiplier.
* <p>
* Maximum distance between components that belong to the same word is
* equal to the product of this value and estimated within-line spacing.
*/
private static final double WORD_DIST_MULT = 0.2;
public List<CharacterZone> segmentPage(List<TextPositionSequence> textPositions) {
public List<Zone> segmentPage(List<TextPositionSequence> textPositions) { var positions = textPositions.stream().map(TextPositionSequence::getTextPositions).flatMap(List::stream).toList();
var positions = textPositions.stream().map(t -> t.getTextPositions()).flatMap(List::stream).collect(Collectors.toList()); var components = positions.stream().map(Character::new).collect(Collectors.toList());
var components = positions.stream().map(chunk -> new Character(chunk)).collect(Collectors.toList());
nearestNeighbourService.findNearestNeighbors(components); nearestNeighbourService.findNearestNeighbors(components);
@ -52,44 +35,8 @@ public class DocstrumSegmenter {
List<CharacterLine> lines = lineBuilderService.buildLines(components, characterSpacing, lineSpacing); List<CharacterLine> lines = lineBuilderService.buildLines(components, characterSpacing, lineSpacing);
List<CharacterZone> zones = zoneBuilderService.buildZones(lines, characterSpacing, lineSpacing); return zoneBuilderService.buildZones(lines, characterSpacing, lineSpacing);
return convertToBxModel(zones, WORD_DIST_MULT * characterSpacing);
}
private List<Zone> convertToBxModel(List<CharacterZone> zones, double wordSpacing) {
List<Zone> zoneList = new ArrayList<>();
if (zones.size() > MAX_ZONES_PER_PAGE) {
CharacterZone oneZone = new CharacterZone();
for (CharacterZone zone : zones) {
oneZone.getLines().addAll(zone.getLines());
}
zones = new ArrayList<>();
zones.add(oneZone);
}
for (CharacterZone characterZone : zones) {
Zone zone = new Zone();
for (CharacterLine line : characterZone.getLines()) {
zone.addLine(line.convertToBxLine(wordSpacing));
}
List<Line> zLines = Lists.newArrayList(zone.getLines());
Collections.sort(zLines, new Comparator<Line>() {
@Override
public int compare(Line o1, Line o2) {
return Double.compare(o1.getbBox().getY(), o2.getbBox().getY());
}
});
zone.setLines(zLines);
BoundingBoxBuilder.setBounds(zone);
zoneList.add(zone);
}
ZoneUtils.sortZonesYX(zoneList);
return zoneList;
} }
} }

View File

@ -9,7 +9,7 @@ import java.util.List;
import org.springframework.stereotype.Service; import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.BBoxObject; import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.BBoxObject;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.Zone; import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.docstrum.CharacterZone;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.utils.DoubleUtils; import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.utils.DoubleUtils;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.readingorder.BBoxZoneGroup; import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.readingorder.BBoxZoneGroup;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.readingorder.DistElem; import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.readingorder.DistElem;
@ -51,11 +51,11 @@ public class HierarchicalReadingOrderResolver {
}; };
public List<Zone> resolve(List<Zone> zones) { public List<CharacterZone> resolve(List<CharacterZone> zones) {
List<Zone> orderedZones; List<CharacterZone> orderedZones;
if (zones.size() > MAX_ZONES) { if (zones.size() > MAX_ZONES) {
orderedZones = new ArrayList<Zone>(zones); orderedZones = new ArrayList<>(zones);
Collections.sort(orderedZones, YX_ASCENDING_ORDER); Collections.sort(orderedZones, YX_ASCENDING_ORDER);
} else { } else {
orderedZones = reorderZones(zones); orderedZones = reorderZones(zones);
@ -64,19 +64,19 @@ public class HierarchicalReadingOrderResolver {
} }
private List<Zone> reorderZones(List<Zone> unorderedZones) { private List<CharacterZone> reorderZones(List<CharacterZone> unorderedZones) {
if (unorderedZones.isEmpty()) { if (unorderedZones.isEmpty()) {
return new ArrayList<Zone>(); return new ArrayList<>();
} else if (unorderedZones.size() == 1) { } else if (unorderedZones.size() == 1) {
List<Zone> ret = new ArrayList<Zone>(1); List<CharacterZone> ret = new ArrayList<>(1);
ret.add(unorderedZones.get(0)); ret.add(unorderedZones.get(0));
return ret; return ret;
} else { } else {
BBoxZoneGroup bxZonesTree = groupZonesHierarchically(unorderedZones); BBoxZoneGroup bxZonesTree = groupZonesHierarchically(unorderedZones);
sortGroupedZones(bxZonesTree); sortGroupedZones(bxZonesTree);
TreeToListConverter treeConverter = new TreeToListConverter(); TreeToListConverter treeConverter = new TreeToListConverter();
List<Zone> orderedZones = treeConverter.convertToList(bxZonesTree); List<CharacterZone> orderedZones = treeConverter.convertToList(bxZonesTree);
assert unorderedZones.size() == orderedZones.size(); assert unorderedZones.size() == orderedZones.size();
return orderedZones; return orderedZones;
} }
@ -90,15 +90,15 @@ public class HierarchicalReadingOrderResolver {
* @param zones is a list of unordered zones * @param zones is a list of unordered zones
* @return root of the zones clustered in a tree * @return root of the zones clustered in a tree
*/ */
private BBoxZoneGroup groupZonesHierarchically(List<Zone> zones) { private BBoxZoneGroup groupZonesHierarchically(List<CharacterZone> zones) {
/* /*
* Distance tuples are stored sorted by ascending distance value * Distance tuples are stored sorted by ascending distance value
*/ */
List<DistElem<BBoxObject>> dists = new ArrayList<DistElem<BBoxObject>>(zones.size() * zones.size() / 2); List<DistElem<BBoxObject>> dists = new ArrayList<DistElem<BBoxObject>>(zones.size() * zones.size() / 2);
for (int idx1 = 0; idx1 < zones.size(); ++idx1) { for (int idx1 = 0; idx1 < zones.size(); ++idx1) {
for (int idx2 = idx1 + 1; idx2 < zones.size(); ++idx2) { for (int idx2 = idx1 + 1; idx2 < zones.size(); ++idx2) {
Zone zone1 = zones.get(idx1); CharacterZone zone1 = zones.get(idx1);
Zone zone2 = zones.get(idx2); CharacterZone zone2 = zones.get(idx2);
dists.add(new DistElem<BBoxObject>(false, distance(zone1, zone2), zone1, zone2)); dists.add(new DistElem<BBoxObject>(false, distance(zone1, zone2), zone1, zone2));
} }
} }

View File

@ -40,9 +40,9 @@ public class LineBuilderService {
sets.forEach(group -> { sets.forEach(group -> {
List<Character> lineComponents = new ArrayList<>(group); List<Character> lineComponents = new ArrayList<>(group);
lineComponents.sort(Comparator.comparingDouble(Character::getX)); lineComponents.sort(Comparator.comparingDouble(Character::getX));
lines.add(new CharacterLine(lineComponents)); lines.add(new CharacterLine(lineComponents, characterSpacing));
}); });
return lines; return lines;
} }

View File

@ -25,6 +25,8 @@ public class ZoneBuilderService {
private static final double ANGLE_TOLERANCE = Math.PI / 6; private static final double ANGLE_TOLERANCE = Math.PI / 6;
public static final int MAX_ZONES = 300;
public List<CharacterZone> buildZones(List<CharacterLine> lines, double characterSpacing, double lineSpacing) { public List<CharacterZone> buildZones(List<CharacterLine> lines, double characterSpacing, double lineSpacing) {
@ -64,6 +66,14 @@ public class ZoneBuilderService {
zones.add(new CharacterZone(new ArrayList<>(group))); zones.add(new CharacterZone(new ArrayList<>(group)));
}); });
if (zones.size() > MAX_ZONES) {
List<CharacterLine> oneZoneLines = new ArrayList<>();
for (CharacterZone zone : zones) {
oneZoneLines.addAll(zone.getLines());
}
return List.of(new CharacterZone(oneZoneLines));
}
return zones; return zones;
} }

View File

@ -1,8 +1,12 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.docstrum; package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.docstrum;
import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
import java.util.List; import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.BBoxObject;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.BoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.Line; import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.Line;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.Word; import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.Word;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.utils.BoundingBoxBuilder; import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.utils.BoundingBoxBuilder;
@ -10,7 +14,9 @@ import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.mo
import lombok.Data; import lombok.Data;
@Data @Data
public class CharacterLine { public class CharacterLine extends BBoxObject {
private static final double WORD_DISTANCE_MULTIPLIER = 0.2;
private final double x0; private final double x0;
private final double y0; private final double y0;
@ -21,9 +27,10 @@ public class CharacterLine {
private final double height; private final double height;
private final List<Character> characters; private final List<Character> characters;
private final List<TextPositionSequence> words = new ArrayList<>();
public CharacterLine(List<Character> characters) { public CharacterLine(List<Character> characters, double wordSpacing) {
this.characters = characters; this.characters = characters;
@ -55,6 +62,8 @@ public class CharacterLine {
throw new IllegalArgumentException("Component list must not be empty"); throw new IllegalArgumentException("Component list must not be empty");
} }
height = computeHeight(); height = computeHeight();
computeWords(wordSpacing * WORD_DISTANCE_MULTIPLIER);
buildBox();
} }
@ -112,6 +121,25 @@ public class CharacterLine {
} }
/**
 * Groups the characters of this line into words and stores them in {@code words}.
 * <p>
 * The characters are visited in the order of the {@code characters} list. For each pair of
 * consecutive characters the gap is computed as the current character's {@code getXDirAdj()}
 * minus the previous character's {@code getXDirAdj()} and {@code getWidthDirAdj()}. A new word
 * is started whenever that gap is strictly greater than {@code wordSpacing}.
 * <p>
 * Any words from an earlier invocation are discarded first, so calling this method again
 * replaces the result instead of appending duplicates.
 *
 * @param wordSpacing largest gap between two consecutive characters that still belong to the same word
 */
public void computeWords(double wordSpacing) {

    // The method is public and 'words' is only ever appended to; reset it so repeated calls stay consistent.
    words.clear();

    TextPositionSequence word = new TextPositionSequence();
    Character previous = null;
    for (Character current : characters) {
        if (previous != null) {
            double dist = current.getTextPosition().getXDirAdj() - previous.getTextPosition().getXDirAdj() - previous.getTextPosition().getWidthDirAdj();
            if (dist > wordSpacing) {
                // Gap too large: close the current word and begin a new one.
                words.add(word);
                word = new TextPositionSequence();
            }
        }
        word.getTextPositions().add(current.getTextPosition());
        previous = current;
    }
    // The word still being built when the loop ends has not been stored yet.
    words.add(word);
}
public Line convertToBxLine(double wordSpacing) { public Line convertToBxLine(double wordSpacing) {
Line line = new Line(); Line line = new Line();
@ -135,5 +163,25 @@ public class CharacterLine {
return line; return line;
} }
/**
 * Recomputes the bounding box of this line from its characters and stores it via {@code setbBox}.
 * <p>
 * The box spans from the smallest {@code getXDirAdj()} / {@code getYDirAdj()} of any character to the
 * largest {@code getXDirAdj() + getWidthDirAdj()} / {@code getYDirAdj() + getHeightDir()}.
 */
public void buildBox() {

    double lowX = Double.POSITIVE_INFINITY;
    double lowY = Double.POSITIVE_INFINITY;
    double highX = Double.NEGATIVE_INFINITY;
    double highY = Double.NEGATIVE_INFINITY;

    for (Character character : characters) {
        var position = character.getTextPosition();

        lowX = Math.min(lowX, position.getXDirAdj());
        lowY = Math.min(lowY, position.getYDirAdj());
        highX = Math.max(highX, position.getXDirAdj() + position.getWidthDirAdj());
        highY = Math.max(highY, position.getYDirAdj() + position.getHeightDir());
    }

    double boxWidth = highX - lowX;
    double boxHeight = highY - lowY;
    this.setbBox(new BoundingBox(lowX, lowY, boxWidth, boxHeight));
}
} }

View File

@ -1,24 +1,44 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.docstrum; package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.docstrum;
import java.util.ArrayList; import java.util.Comparator;
import java.util.List; import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.BBoxObject; import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.BBoxObject;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.BoundingBox;
import lombok.AllArgsConstructor;
import lombok.Data; import lombok.Data;
import lombok.NoArgsConstructor;
@Data @Data
@NoArgsConstructor
@AllArgsConstructor
public class CharacterZone extends BBoxObject { public class CharacterZone extends BBoxObject {
private List<CharacterLine> lines = new ArrayList<>(); private List<CharacterLine> lines;
/**
 * Creates a zone from the given lines, ordered by ascending {@code CharacterLine::getY}, and computes
 * the zone's bounding box.
 * <p>
 * The lines are copied before sorting, so the caller's list is neither reordered nor required to be
 * modifiable (e.g. {@code List.of(...)} or {@code Stream.toList()} results are accepted).
 *
 * @param lines the lines that make up this zone; must not be {@code null}
 */
public CharacterZone(List<CharacterLine> lines) {

    // Fully qualified on purpose: this file does not import java.util.ArrayList.
    List<CharacterLine> sortedLines = new java.util.ArrayList<>(lines);
    sortedLines.sort(Comparator.comparingDouble(CharacterLine::getY));
    this.lines = sortedLines;
    buildBox();
}
public void buildBox() { public void buildBox() {
double minX = Double.POSITIVE_INFINITY;
double minY = Double.POSITIVE_INFINITY;
double maxX = Double.NEGATIVE_INFINITY;
double maxY = Double.NEGATIVE_INFINITY;
for (CharacterLine line : lines) {
minX = Math.min(minX, line.getX());
minY = Math.min(minY, line.getY());
maxX = Math.max(maxX, line.getX() + line.getWidth());
maxY = Math.max(maxY, line.getY() + line.getHeight());
}
this.setbBox(new BoundingBox(minX, minY, maxX - minX, maxY - minY));
} }
} }

View File

@ -7,7 +7,7 @@ import java.util.Map;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.BBoxObject; import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.BBoxObject;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.BoundingBox; import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.BoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.Zone; import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.docstrum.CharacterZone;
/** /**
* A set-like data structure for objects placed on a plane. Can efficiently find objects in a certain rectangular area. * A set-like data structure for objects placed on a plane. Can efficiently find objects in a certain rectangular area.
@ -82,12 +82,12 @@ public class DocumentPlane {
} }
public DocumentPlane(List<Zone> objectList, int gridSize) { public DocumentPlane(List<CharacterZone> objectList, int gridSize) {
this.grid = new HashMap<GridXY, List<BBoxObject>>(); this.grid = new HashMap<GridXY, List<BBoxObject>>();
this.objs = new ArrayList<BBoxObject>(); this.objs = new ArrayList<BBoxObject>();
this.gridSize = gridSize; this.gridSize = gridSize;
for (Zone obj : objectList) { for (CharacterZone obj : objectList) {
add(obj); add(obj);
} }
} }

View File

@ -3,25 +3,22 @@ package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.r
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.Zone; import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.docstrum.CharacterZone;
/**
* @author Pawel Szostek
*/
public class TreeToListConverter { public class TreeToListConverter {
public List<Zone> convertToList(BBoxZoneGroup obj) { public List<CharacterZone> convertToList(BBoxZoneGroup obj) {
List<Zone> ret = new ArrayList<Zone>(); List<CharacterZone> ret = new ArrayList<>();
if (obj.getLeftChild() instanceof Zone) { if (obj.getLeftChild() instanceof CharacterZone) {
Zone zone = (Zone) obj.getLeftChild(); CharacterZone zone = (CharacterZone) obj.getLeftChild();
ret.add(zone); ret.add(zone);
} else { // obj.getLeftChild() instanceof BxZoneGroup } else { // obj.getLeftChild() instanceof BxZoneGroup
ret.addAll(convertToList((BBoxZoneGroup) obj.getLeftChild())); ret.addAll(convertToList((BBoxZoneGroup) obj.getLeftChild()));
} }
if (obj.getRightChild() instanceof Zone) { if (obj.getRightChild() instanceof CharacterZone) {
Zone zone = (Zone) obj.getRightChild(); CharacterZone zone = (CharacterZone) obj.getRightChild();
ret.add(zone); ret.add(zone);
} else { // obj.getRightChild() instanceof BxZoneGroup } else { // obj.getRightChild() instanceof BxZoneGroup
ret.addAll(convertToList((BBoxZoneGroup) obj.getRightChild())); ret.addAll(convertToList((BBoxZoneGroup) obj.getRightChild()));