More refactoring

This commit is contained in:
Dominique Eifländer 2024-02-16 13:48:03 +01:00
parent b2fb6829cb
commit e394f2fa7c
8 changed files with 113 additions and 91 deletions

View File

@ -1,22 +1,14 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.stream.Collectors;
import org.springframework.stereotype.Service;
import com.google.common.collect.Lists;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.Line;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.Zone;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.docstrum.Character;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.docstrum.CharacterLine;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.docstrum.CharacterZone;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.utils.BoundingBoxBuilder;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.utils.ZoneUtils;
import lombok.RequiredArgsConstructor;
@ -29,21 +21,12 @@ public class DocstrumSegmenter {
private final LineBuilderService lineBuilderService;
private final ZoneBuilderService zoneBuilderService;
public static final int MAX_ZONES_PER_PAGE = 300;
/**
* Word distance multiplier.
* <p>
* Maximum distance between components that belong to the same word is
* equal to the product of this value and estimated within-line spacing.
*/
private static final double WORD_DIST_MULT = 0.2;
public List<CharacterZone> segmentPage(List<TextPositionSequence> textPositions) {
public List<Zone> segmentPage(List<TextPositionSequence> textPositions) {
var positions = textPositions.stream().map(TextPositionSequence::getTextPositions).flatMap(List::stream).toList();
var positions = textPositions.stream().map(t -> t.getTextPositions()).flatMap(List::stream).collect(Collectors.toList());
var components = positions.stream().map(chunk -> new Character(chunk)).collect(Collectors.toList());
var components = positions.stream().map(Character::new).collect(Collectors.toList());
nearestNeighbourService.findNearestNeighbors(components);
@ -52,44 +35,8 @@ public class DocstrumSegmenter {
List<CharacterLine> lines = lineBuilderService.buildLines(components, characterSpacing, lineSpacing);
List<CharacterZone> zones = zoneBuilderService.buildZones(lines, characterSpacing, lineSpacing);
return convertToBxModel(zones, WORD_DIST_MULT * characterSpacing);
}
return zoneBuilderService.buildZones(lines, characterSpacing, lineSpacing);
/**
 * Converts docstrum character zones into {@link Zone} objects.
 * <p>
 * When the page holds more than {@code MAX_ZONES_PER_PAGE} zones, the lines of all zones are first
 * merged into one single zone. Every resulting zone gets its lines ordered by the Y coordinate of
 * their bounding box and its bounds set through {@code BoundingBoxBuilder.setBounds}; the final
 * list is sorted with {@code ZoneUtils.sortZonesYX}.
 *
 * @param zones       character zones detected on the page
 * @param wordSpacing value handed to {@code CharacterLine.convertToBxLine} for every line
 * @return the converted zones
 */
private List<Zone> convertToBxModel(List<CharacterZone> zones, double wordSpacing) {

    List<CharacterZone> sourceZones = zones;
    if (sourceZones.size() > MAX_ZONES_PER_PAGE) {
        // Too many zones on this page: fold every line into one zone.
        CharacterZone merged = new CharacterZone();
        zones.forEach(z -> merged.getLines().addAll(z.getLines()));
        sourceZones = new ArrayList<>();
        sourceZones.add(merged);
    }

    List<Zone> converted = new ArrayList<>();
    for (CharacterZone sourceZone : sourceZones) {
        Zone target = new Zone();
        sourceZone.getLines().forEach(characterLine -> target.addLine(characterLine.convertToBxLine(wordSpacing)));

        // Sort a copy of the converted lines top-to-bottom by bounding-box Y, then store it back.
        List<Line> ordered = Lists.newArrayList(target.getLines());
        ordered.sort(Comparator.comparingDouble((Line l) -> l.getbBox().getY()));
        target.setLines(ordered);

        BoundingBoxBuilder.setBounds(target);
        converted.add(target);
    }

    ZoneUtils.sortZonesYX(converted);
    return converted;
}
}

View File

@ -9,7 +9,7 @@ import java.util.List;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.BBoxObject;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.Zone;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.docstrum.CharacterZone;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.utils.DoubleUtils;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.readingorder.BBoxZoneGroup;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.readingorder.DistElem;
@ -51,11 +51,11 @@ public class HierarchicalReadingOrderResolver {
};
public List<Zone> resolve(List<Zone> zones) {
public List<CharacterZone> resolve(List<CharacterZone> zones) {
List<Zone> orderedZones;
List<CharacterZone> orderedZones;
if (zones.size() > MAX_ZONES) {
orderedZones = new ArrayList<Zone>(zones);
orderedZones = new ArrayList<>(zones);
Collections.sort(orderedZones, YX_ASCENDING_ORDER);
} else {
orderedZones = reorderZones(zones);
@ -64,19 +64,19 @@ public class HierarchicalReadingOrderResolver {
}
private List<Zone> reorderZones(List<Zone> unorderedZones) {
private List<CharacterZone> reorderZones(List<CharacterZone> unorderedZones) {
if (unorderedZones.isEmpty()) {
return new ArrayList<Zone>();
return new ArrayList<>();
} else if (unorderedZones.size() == 1) {
List<Zone> ret = new ArrayList<Zone>(1);
List<CharacterZone> ret = new ArrayList<>(1);
ret.add(unorderedZones.get(0));
return ret;
} else {
BBoxZoneGroup bxZonesTree = groupZonesHierarchically(unorderedZones);
sortGroupedZones(bxZonesTree);
TreeToListConverter treeConverter = new TreeToListConverter();
List<Zone> orderedZones = treeConverter.convertToList(bxZonesTree);
List<CharacterZone> orderedZones = treeConverter.convertToList(bxZonesTree);
assert unorderedZones.size() == orderedZones.size();
return orderedZones;
}
@ -90,15 +90,15 @@ public class HierarchicalReadingOrderResolver {
* @param zones is a list of unordered zones
* @return root of the zones clustered in a tree
*/
private BBoxZoneGroup groupZonesHierarchically(List<Zone> zones) {
private BBoxZoneGroup groupZonesHierarchically(List<CharacterZone> zones) {
/*
* Distance tuples are stored sorted by ascending distance value
*/
List<DistElem<BBoxObject>> dists = new ArrayList<DistElem<BBoxObject>>(zones.size() * zones.size() / 2);
for (int idx1 = 0; idx1 < zones.size(); ++idx1) {
for (int idx2 = idx1 + 1; idx2 < zones.size(); ++idx2) {
Zone zone1 = zones.get(idx1);
Zone zone2 = zones.get(idx2);
CharacterZone zone1 = zones.get(idx1);
CharacterZone zone2 = zones.get(idx2);
dists.add(new DistElem<BBoxObject>(false, distance(zone1, zone2), zone1, zone2));
}
}

View File

@ -40,9 +40,9 @@ public class LineBuilderService {
sets.forEach(group -> {
List<Character> lineComponents = new ArrayList<>(group);
lineComponents.sort(Comparator.comparingDouble(Character::getX));
lines.add(new CharacterLine(lineComponents));
lines.add(new CharacterLine(lineComponents, characterSpacing));
});
return lines;
}

View File

@ -25,6 +25,8 @@ public class ZoneBuilderService {
private static final double ANGLE_TOLERANCE = Math.PI / 6;
public static final int MAX_ZONES = 300;
public List<CharacterZone> buildZones(List<CharacterLine> lines, double characterSpacing, double lineSpacing) {
@ -64,6 +66,14 @@ public class ZoneBuilderService {
zones.add(new CharacterZone(new ArrayList<>(group)));
});
if (zones.size() > MAX_ZONES) {
List<CharacterLine> oneZoneLines = new ArrayList<>();
for (CharacterZone zone : zones) {
oneZoneLines.addAll(zone.getLines());
}
return List.of(new CharacterZone(oneZoneLines));
}
return zones;
}

View File

@ -1,8 +1,12 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.docstrum;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.BBoxObject;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.BoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.Line;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.Word;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.utils.BoundingBoxBuilder;
@ -10,7 +14,9 @@ import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.mo
import lombok.Data;
@Data
public class CharacterLine {
public class CharacterLine extends BBoxObject {
private static final double WORD_DISTANCE_MULTIPLIER = 0.2;
private final double x0;
private final double y0;
@ -21,9 +27,10 @@ public class CharacterLine {
private final double height;
private final List<Character> characters;
private final List<TextPositionSequence> words = new ArrayList<>();
public CharacterLine(List<Character> characters) {
public CharacterLine(List<Character> characters, double wordSpacing) {
this.characters = characters;
@ -55,6 +62,8 @@ public class CharacterLine {
throw new IllegalArgumentException("Component list must not be empty");
}
height = computeHeight();
computeWords(wordSpacing * WORD_DISTANCE_MULTIPLIER);
buildBox();
}
@ -112,6 +121,25 @@ public class CharacterLine {
}
/**
 * Splits this line's characters into words and stores them in {@code words}.
 * <p>
 * Characters are visited in list order. A new word is started whenever the horizontal gap between
 * the end of the previous character ({@code XDirAdj + WidthDirAdj}) and the start of the current
 * character exceeds {@code wordSpacing}.
 * <p>
 * Any previously computed words are discarded first, so calling this method more than once does
 * not append duplicates. An empty character list yields no words rather than one empty word.
 *
 * @param wordSpacing maximum horizontal gap between two characters of the same word
 */
public void computeWords(double wordSpacing) {

    // Reset so the method is safe to call again on the same instance.
    words.clear();
    if (characters.isEmpty()) {
        return;
    }

    TextPositionSequence word = new TextPositionSequence();
    Character previous = null;
    for (Character current : characters) {
        if (previous != null) {
            var previousPosition = previous.getTextPosition();
            double gap = current.getTextPosition().getXDirAdj() - previousPosition.getXDirAdj() - previousPosition.getWidthDirAdj();
            if (gap > wordSpacing) {
                // Gap is too wide for one word: close the current word and open a new one.
                words.add(word);
                word = new TextPositionSequence();
            }
        }
        word.getTextPositions().add(current.getTextPosition());
        previous = current;
    }
    // The loop never closes the last word, so add it here.
    words.add(word);
}
public Line convertToBxLine(double wordSpacing) {
Line line = new Line();
@ -135,5 +163,25 @@ public class CharacterLine {
return line;
}
/**
 * Computes the bounding box enclosing all characters of this line and stores it via
 * {@code setbBox}.
 * <p>
 * The box spans from the smallest {@code XDirAdj}/{@code YDirAdj} to the largest
 * {@code XDirAdj + WidthDirAdj} / {@code YDirAdj + HeightDir} found among the characters.
 */
public void buildBox() {

    double left = Double.POSITIVE_INFINITY;
    double top = Double.POSITIVE_INFINITY;
    double right = Double.NEGATIVE_INFINITY;
    double bottom = Double.NEGATIVE_INFINITY;

    for (Character c : characters) {
        var position = c.getTextPosition();
        double x = position.getXDirAdj();
        double y = position.getYDirAdj();

        left = Math.min(left, x);
        top = Math.min(top, y);
        right = Math.max(right, x + position.getWidthDirAdj());
        bottom = Math.max(bottom, y + position.getHeightDir());
    }

    double boxWidth = right - left;
    double boxHeight = bottom - top;
    this.setbBox(new BoundingBox(left, top, boxWidth, boxHeight));
}
}

View File

@ -1,24 +1,44 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.docstrum;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.BBoxObject;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.BoundingBox;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
@Data
@NoArgsConstructor
@AllArgsConstructor
public class CharacterZone extends BBoxObject {
private List<CharacterLine> lines = new ArrayList<>();
private List<CharacterLine> lines;
/**
 * Creates a zone from the given lines.
 * <p>
 * The lines are copied into a new list before sorting, so the caller's list is neither reordered
 * nor required to be modifiable (for example a list from {@code List.of(...)}). The copy is
 * ordered by ascending {@code CharacterLine::getY}, and the zone's bounding box is then computed
 * from it.
 *
 * @param lines lines belonging to this zone
 */
public CharacterZone(List<CharacterLine> lines) {

    // Sort a defensive copy: List.sort would mutate the argument and fails on unmodifiable lists.
    List<CharacterLine> sortedLines = new ArrayList<>(lines);
    sortedLines.sort(Comparator.comparingDouble(CharacterLine::getY));
    this.lines = sortedLines;
    buildBox();
}
/**
 * Computes the bounding box enclosing all lines of this zone and stores it via {@code setbBox}.
 * <p>
 * The box spans from the smallest line X/Y to the largest {@code X + width} / {@code Y + height}
 * found among the lines.
 */
public void buildBox() {

    double left = Double.POSITIVE_INFINITY;
    double top = Double.POSITIVE_INFINITY;
    double right = Double.NEGATIVE_INFINITY;
    double bottom = Double.NEGATIVE_INFINITY;

    for (CharacterLine current : lines) {
        double lineX = current.getX();
        double lineY = current.getY();

        left = Math.min(left, lineX);
        top = Math.min(top, lineY);
        right = Math.max(right, lineX + current.getWidth());
        bottom = Math.max(bottom, lineY + current.getHeight());
    }

    double boxWidth = right - left;
    double boxHeight = bottom - top;
    this.setbBox(new BoundingBox(left, top, boxWidth, boxHeight));
}
}

View File

@ -7,7 +7,7 @@ import java.util.Map;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.BBoxObject;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.BoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.Zone;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.docstrum.CharacterZone;
/**
* A set-like data structure for objects placed on a plane. Can efficiently find objects in a certain rectangular area.
@ -82,12 +82,12 @@ public class DocumentPlane {
}
public DocumentPlane(List<Zone> objectList, int gridSize) {
public DocumentPlane(List<CharacterZone> objectList, int gridSize) {
this.grid = new HashMap<GridXY, List<BBoxObject>>();
this.objs = new ArrayList<BBoxObject>();
this.gridSize = gridSize;
for (Zone obj : objectList) {
for (CharacterZone obj : objectList) {
add(obj);
}
}

View File

@ -3,25 +3,22 @@ package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.r
import java.util.ArrayList;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.Zone;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.docstrum.CharacterZone;
/**
* @author Pawel Szostek
*/
public class TreeToListConverter {
public List<Zone> convertToList(BBoxZoneGroup obj) {
public List<CharacterZone> convertToList(BBoxZoneGroup obj) {
List<Zone> ret = new ArrayList<Zone>();
if (obj.getLeftChild() instanceof Zone) {
Zone zone = (Zone) obj.getLeftChild();
List<CharacterZone> ret = new ArrayList<>();
if (obj.getLeftChild() instanceof CharacterZone) {
CharacterZone zone = (CharacterZone) obj.getLeftChild();
ret.add(zone);
} else { // obj.getLeftChild() instanceof BxZoneGroup
ret.addAll(convertToList((BBoxZoneGroup) obj.getLeftChild()));
}
if (obj.getRightChild() instanceof Zone) {
Zone zone = (Zone) obj.getRightChild();
if (obj.getRightChild() instanceof CharacterZone) {
CharacterZone zone = (CharacterZone) obj.getRightChild();
ret.add(zone);
} else { // obj.getRightChild() instanceof BxZoneGroup
ret.addAll(convertToList((BBoxZoneGroup) obj.getRightChild()));