More refactoring
This commit is contained in:
parent
b2fb6829cb
commit
e394f2fa7c
@ -1,22 +1,14 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.Line;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.Zone;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.docstrum.Character;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.docstrum.CharacterLine;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.docstrum.CharacterZone;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.utils.BoundingBoxBuilder;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.utils.ZoneUtils;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
|
||||
@ -29,21 +21,12 @@ public class DocstrumSegmenter {
|
||||
private final LineBuilderService lineBuilderService;
|
||||
private final ZoneBuilderService zoneBuilderService;
|
||||
|
||||
public static final int MAX_ZONES_PER_PAGE = 300;
|
||||
/**
|
||||
* Word distance multiplier.
|
||||
* <p>
|
||||
* Maximum distance between components that belong to the same word is
|
||||
* equal to the product of this value and estimated within-line spacing.
|
||||
*/
|
||||
private static final double WORD_DIST_MULT = 0.2;
|
||||
|
||||
public List<CharacterZone> segmentPage(List<TextPositionSequence> textPositions) {
|
||||
|
||||
public List<Zone> segmentPage(List<TextPositionSequence> textPositions) {
|
||||
var positions = textPositions.stream().map(TextPositionSequence::getTextPositions).flatMap(List::stream).toList();
|
||||
|
||||
var positions = textPositions.stream().map(t -> t.getTextPositions()).flatMap(List::stream).collect(Collectors.toList());
|
||||
|
||||
var components = positions.stream().map(chunk -> new Character(chunk)).collect(Collectors.toList());
|
||||
var components = positions.stream().map(Character::new).collect(Collectors.toList());
|
||||
|
||||
nearestNeighbourService.findNearestNeighbors(components);
|
||||
|
||||
@ -52,44 +35,8 @@ public class DocstrumSegmenter {
|
||||
|
||||
List<CharacterLine> lines = lineBuilderService.buildLines(components, characterSpacing, lineSpacing);
|
||||
|
||||
List<CharacterZone> zones = zoneBuilderService.buildZones(lines, characterSpacing, lineSpacing);
|
||||
return convertToBxModel(zones, WORD_DIST_MULT * characterSpacing);
|
||||
}
|
||||
return zoneBuilderService.buildZones(lines, characterSpacing, lineSpacing);
|
||||
|
||||
|
||||
private List<Zone> convertToBxModel(List<CharacterZone> zones, double wordSpacing) {
|
||||
|
||||
List<Zone> zoneList = new ArrayList<>();
|
||||
if (zones.size() > MAX_ZONES_PER_PAGE) {
|
||||
CharacterZone oneZone = new CharacterZone();
|
||||
for (CharacterZone zone : zones) {
|
||||
oneZone.getLines().addAll(zone.getLines());
|
||||
}
|
||||
zones = new ArrayList<>();
|
||||
zones.add(oneZone);
|
||||
}
|
||||
|
||||
for (CharacterZone characterZone : zones) {
|
||||
Zone zone = new Zone();
|
||||
for (CharacterLine line : characterZone.getLines()) {
|
||||
zone.addLine(line.convertToBxLine(wordSpacing));
|
||||
}
|
||||
List<Line> zLines = Lists.newArrayList(zone.getLines());
|
||||
Collections.sort(zLines, new Comparator<Line>() {
|
||||
|
||||
@Override
|
||||
public int compare(Line o1, Line o2) {
|
||||
|
||||
return Double.compare(o1.getbBox().getY(), o2.getbBox().getY());
|
||||
}
|
||||
|
||||
});
|
||||
zone.setLines(zLines);
|
||||
BoundingBoxBuilder.setBounds(zone);
|
||||
zoneList.add(zone);
|
||||
}
|
||||
ZoneUtils.sortZonesYX(zoneList);
|
||||
return zoneList;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -9,7 +9,7 @@ import java.util.List;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.BBoxObject;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.Zone;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.docstrum.CharacterZone;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.utils.DoubleUtils;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.readingorder.BBoxZoneGroup;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.readingorder.DistElem;
|
||||
@ -51,11 +51,11 @@ public class HierarchicalReadingOrderResolver {
|
||||
};
|
||||
|
||||
|
||||
public List<Zone> resolve(List<Zone> zones) {
|
||||
public List<CharacterZone> resolve(List<CharacterZone> zones) {
|
||||
|
||||
List<Zone> orderedZones;
|
||||
List<CharacterZone> orderedZones;
|
||||
if (zones.size() > MAX_ZONES) {
|
||||
orderedZones = new ArrayList<Zone>(zones);
|
||||
orderedZones = new ArrayList<>(zones);
|
||||
Collections.sort(orderedZones, YX_ASCENDING_ORDER);
|
||||
} else {
|
||||
orderedZones = reorderZones(zones);
|
||||
@ -64,19 +64,19 @@ public class HierarchicalReadingOrderResolver {
|
||||
}
|
||||
|
||||
|
||||
private List<Zone> reorderZones(List<Zone> unorderedZones) {
|
||||
private List<CharacterZone> reorderZones(List<CharacterZone> unorderedZones) {
|
||||
|
||||
if (unorderedZones.isEmpty()) {
|
||||
return new ArrayList<Zone>();
|
||||
return new ArrayList<>();
|
||||
} else if (unorderedZones.size() == 1) {
|
||||
List<Zone> ret = new ArrayList<Zone>(1);
|
||||
List<CharacterZone> ret = new ArrayList<>(1);
|
||||
ret.add(unorderedZones.get(0));
|
||||
return ret;
|
||||
} else {
|
||||
BBoxZoneGroup bxZonesTree = groupZonesHierarchically(unorderedZones);
|
||||
sortGroupedZones(bxZonesTree);
|
||||
TreeToListConverter treeConverter = new TreeToListConverter();
|
||||
List<Zone> orderedZones = treeConverter.convertToList(bxZonesTree);
|
||||
List<CharacterZone> orderedZones = treeConverter.convertToList(bxZonesTree);
|
||||
assert unorderedZones.size() == orderedZones.size();
|
||||
return orderedZones;
|
||||
}
|
||||
@ -90,15 +90,15 @@ public class HierarchicalReadingOrderResolver {
|
||||
* @param zones is a list of unordered zones
|
||||
* @return root of the zones clustered in a tree
|
||||
*/
|
||||
private BBoxZoneGroup groupZonesHierarchically(List<Zone> zones) {
|
||||
private BBoxZoneGroup groupZonesHierarchically(List<CharacterZone> zones) {
|
||||
/*
|
||||
* Distance tuples are stored sorted by ascending distance value
|
||||
*/
|
||||
List<DistElem<BBoxObject>> dists = new ArrayList<DistElem<BBoxObject>>(zones.size() * zones.size() / 2);
|
||||
for (int idx1 = 0; idx1 < zones.size(); ++idx1) {
|
||||
for (int idx2 = idx1 + 1; idx2 < zones.size(); ++idx2) {
|
||||
Zone zone1 = zones.get(idx1);
|
||||
Zone zone2 = zones.get(idx2);
|
||||
CharacterZone zone1 = zones.get(idx1);
|
||||
CharacterZone zone2 = zones.get(idx2);
|
||||
dists.add(new DistElem<BBoxObject>(false, distance(zone1, zone2), zone1, zone2));
|
||||
}
|
||||
}
|
||||
|
||||
@ -40,9 +40,9 @@ public class LineBuilderService {
|
||||
sets.forEach(group -> {
|
||||
List<Character> lineComponents = new ArrayList<>(group);
|
||||
lineComponents.sort(Comparator.comparingDouble(Character::getX));
|
||||
lines.add(new CharacterLine(lineComponents));
|
||||
lines.add(new CharacterLine(lineComponents, characterSpacing));
|
||||
});
|
||||
|
||||
|
||||
return lines;
|
||||
}
|
||||
|
||||
|
||||
@ -25,6 +25,8 @@ public class ZoneBuilderService {
|
||||
|
||||
private static final double ANGLE_TOLERANCE = Math.PI / 6;
|
||||
|
||||
public static final int MAX_ZONES = 300;
|
||||
|
||||
|
||||
public List<CharacterZone> buildZones(List<CharacterLine> lines, double characterSpacing, double lineSpacing) {
|
||||
|
||||
@ -64,6 +66,14 @@ public class ZoneBuilderService {
|
||||
zones.add(new CharacterZone(new ArrayList<>(group)));
|
||||
});
|
||||
|
||||
if (zones.size() > MAX_ZONES) {
|
||||
List<CharacterLine> oneZoneLines = new ArrayList<>();
|
||||
for (CharacterZone zone : zones) {
|
||||
oneZoneLines.addAll(zone.getLines());
|
||||
}
|
||||
return List.of(new CharacterZone(oneZoneLines));
|
||||
}
|
||||
|
||||
return zones;
|
||||
}
|
||||
|
||||
|
||||
@ -1,8 +1,12 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.docstrum;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.BBoxObject;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.BoundingBox;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.Line;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.Word;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.utils.BoundingBoxBuilder;
|
||||
@ -10,7 +14,9 @@ import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.mo
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
public class CharacterLine {
|
||||
public class CharacterLine extends BBoxObject {
|
||||
|
||||
private static final double WORD_DISTANCE_MULTIPLIER = 0.2;
|
||||
|
||||
private final double x0;
|
||||
private final double y0;
|
||||
@ -21,9 +27,10 @@ public class CharacterLine {
|
||||
private final double height;
|
||||
|
||||
private final List<Character> characters;
|
||||
private final List<TextPositionSequence> words = new ArrayList<>();
|
||||
|
||||
|
||||
public CharacterLine(List<Character> characters) {
|
||||
public CharacterLine(List<Character> characters, double wordSpacing) {
|
||||
|
||||
this.characters = characters;
|
||||
|
||||
@ -55,6 +62,8 @@ public class CharacterLine {
|
||||
throw new IllegalArgumentException("Component list must not be empty");
|
||||
}
|
||||
height = computeHeight();
|
||||
computeWords(wordSpacing * WORD_DISTANCE_MULTIPLIER);
|
||||
buildBox();
|
||||
}
|
||||
|
||||
|
||||
@ -112,6 +121,25 @@ public class CharacterLine {
|
||||
}
|
||||
|
||||
|
||||
public void computeWords(double wordSpacing) {
|
||||
|
||||
TextPositionSequence word = new TextPositionSequence();
|
||||
Character previous = null;
|
||||
for (Character current : characters) {
|
||||
if (previous != null) {
|
||||
double dist = current.getTextPosition().getXDirAdj() - previous.getTextPosition().getXDirAdj() - previous.getTextPosition().getWidthDirAdj();
|
||||
if (dist > wordSpacing) {
|
||||
words.add(word);
|
||||
word = new TextPositionSequence();
|
||||
}
|
||||
}
|
||||
word.getTextPositions().add(current.getTextPosition());
|
||||
previous = current;
|
||||
}
|
||||
words.add(word);
|
||||
}
|
||||
|
||||
|
||||
public Line convertToBxLine(double wordSpacing) {
|
||||
|
||||
Line line = new Line();
|
||||
@ -135,5 +163,25 @@ public class CharacterLine {
|
||||
return line;
|
||||
}
|
||||
|
||||
|
||||
public void buildBox() {
|
||||
|
||||
double minX = Double.POSITIVE_INFINITY;
|
||||
double minY = Double.POSITIVE_INFINITY;
|
||||
double maxX = Double.NEGATIVE_INFINITY;
|
||||
double maxY = Double.NEGATIVE_INFINITY;
|
||||
|
||||
for (Character character : characters) {
|
||||
|
||||
minX = Math.min(minX, character.getTextPosition().getXDirAdj());
|
||||
minY = Math.min(minY, character.getTextPosition().getYDirAdj());
|
||||
maxX = Math.max(maxX, character.getTextPosition().getXDirAdj() + character.getTextPosition().getWidthDirAdj());
|
||||
maxY = Math.max(maxY, character.getTextPosition().getYDirAdj() + character.getTextPosition().getHeightDir());
|
||||
|
||||
}
|
||||
|
||||
this.setbBox(new BoundingBox(minX, minY, maxX - minX, maxY - minY));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
@ -1,24 +1,44 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.docstrum;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.BBoxObject;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.BoundingBox;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
@Data
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
public class CharacterZone extends BBoxObject {
|
||||
|
||||
private List<CharacterLine> lines = new ArrayList<>();
|
||||
private List<CharacterLine> lines;
|
||||
|
||||
|
||||
/**
 * Creates a zone from the given lines, ordered by ascending {@code CharacterLine::getY}, and
 * computes the zone's bounding box.
 * <p>
 * The lines are copied before sorting, so the caller's list is neither reordered nor required
 * to be modifiable (e.g. {@code List.of(...)} or {@code Stream.toList()} results are accepted).
 *
 * @param lines the lines that make up this zone; must not be {@code null}
 */
public CharacterZone(List<CharacterLine> lines) {

    // Sort a private copy: sorting the argument in place would mutate caller state and
    // throw UnsupportedOperationException for unmodifiable lists.
    List<CharacterLine> sortedLines = new ArrayList<>(lines);
    sortedLines.sort(Comparator.comparingDouble(CharacterLine::getY));
    this.lines = sortedLines;
    buildBox();
}
|
||||
|
||||
|
||||
public void buildBox() {
|
||||
|
||||
double minX = Double.POSITIVE_INFINITY;
|
||||
double minY = Double.POSITIVE_INFINITY;
|
||||
double maxX = Double.NEGATIVE_INFINITY;
|
||||
double maxY = Double.NEGATIVE_INFINITY;
|
||||
|
||||
for (CharacterLine line : lines) {
|
||||
|
||||
minX = Math.min(minX, line.getX());
|
||||
minY = Math.min(minY, line.getY());
|
||||
maxX = Math.max(maxX, line.getX() + line.getWidth());
|
||||
maxY = Math.max(maxY, line.getY() + line.getHeight());
|
||||
|
||||
}
|
||||
|
||||
this.setbBox(new BoundingBox(minX, minY, maxX - minX, maxY - minY));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -7,7 +7,7 @@ import java.util.Map;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.BBoxObject;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.BoundingBox;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.Zone;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.docstrum.CharacterZone;
|
||||
|
||||
/**
|
||||
* A set-like data structure for objects placed on a plane. Can efficiently find objects in a certain rectangular area.
|
||||
@ -82,12 +82,12 @@ public class DocumentPlane {
|
||||
}
|
||||
|
||||
|
||||
public DocumentPlane(List<Zone> objectList, int gridSize) {
|
||||
public DocumentPlane(List<CharacterZone> objectList, int gridSize) {
|
||||
|
||||
this.grid = new HashMap<GridXY, List<BBoxObject>>();
|
||||
this.objs = new ArrayList<BBoxObject>();
|
||||
this.gridSize = gridSize;
|
||||
for (Zone obj : objectList) {
|
||||
for (CharacterZone obj : objectList) {
|
||||
add(obj);
|
||||
}
|
||||
}
|
||||
|
||||
@ -3,25 +3,22 @@ package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.r
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.Zone;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.docstrum.CharacterZone;
|
||||
|
||||
/**
|
||||
* @author Pawel Szostek
|
||||
*/
|
||||
public class TreeToListConverter {
|
||||
|
||||
public List<Zone> convertToList(BBoxZoneGroup obj) {
|
||||
public List<CharacterZone> convertToList(BBoxZoneGroup obj) {
|
||||
|
||||
List<Zone> ret = new ArrayList<Zone>();
|
||||
if (obj.getLeftChild() instanceof Zone) {
|
||||
Zone zone = (Zone) obj.getLeftChild();
|
||||
List<CharacterZone> ret = new ArrayList<>();
|
||||
if (obj.getLeftChild() instanceof CharacterZone) {
|
||||
CharacterZone zone = (CharacterZone) obj.getLeftChild();
|
||||
ret.add(zone);
|
||||
} else { // obj.getLeftChild() instanceof BxZoneGroup
|
||||
ret.addAll(convertToList((BBoxZoneGroup) obj.getLeftChild()));
|
||||
}
|
||||
|
||||
if (obj.getRightChild() instanceof Zone) {
|
||||
Zone zone = (Zone) obj.getRightChild();
|
||||
if (obj.getRightChild() instanceof CharacterZone) {
|
||||
CharacterZone zone = (CharacterZone) obj.getRightChild();
|
||||
ret.add(zone);
|
||||
} else { // obj.getRightChild() instanceof BxZoneGroup
|
||||
ret.addAll(convertToList((BBoxZoneGroup) obj.getRightChild()));
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user