Remove more
This commit is contained in:
parent
4afa8daafa
commit
4de6c12aec
@ -14,7 +14,6 @@ import com.google.common.collect.Lists;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.DisjointSets;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Histogram;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.BoundingBox;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.Line;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.Word;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.Zone;
|
||||
@ -27,8 +26,6 @@ import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.mo
|
||||
public class DocstrumSegmenter {
|
||||
|
||||
public static final int MAX_ZONES_PER_PAGE = 300;
|
||||
public static final double ORIENTATION_MARGIN = 0.2;
|
||||
public static final int LINES_PER_PAGE_MARGIN = 100;
|
||||
|
||||
private static final double DISTANCE_STEP = 16.0;
|
||||
|
||||
@ -181,21 +178,7 @@ public class DocstrumSegmenter {
|
||||
double characterSpacing = computeCharacterSpacing(components, orientation);
|
||||
double lineSpacing = computeLineSpacing(components, orientation);
|
||||
|
||||
List<ComponentLine> lines = determineLines(components, orientation, characterSpacing * COMP_DIST_CHAR, lineSpacing * MAX_VERTICAL_COMP_DIST);
|
||||
|
||||
if (Math.abs(orientation) > ORIENTATION_MARGIN) {
|
||||
List<ComponentLine> linesZero = determineLines(components, 0, characterSpacing * COMP_DIST_CHAR, lineSpacing * MAX_VERTICAL_COMP_DIST);
|
||||
|
||||
if (Math.abs(lines.size() - LINES_PER_PAGE_MARGIN) > Math.abs(linesZero.size() - LINES_PER_PAGE_MARGIN)) {
|
||||
orientation = 0;
|
||||
lines = linesZero;
|
||||
}
|
||||
}
|
||||
|
||||
double lineOrientation = computeOrientation(lines);
|
||||
if (!Double.isNaN(lineOrientation)) {
|
||||
orientation = lineOrientation;
|
||||
}
|
||||
List<ComponentLine> lines = determineLines(components, characterSpacing * COMP_DIST_CHAR, lineSpacing * MAX_VERTICAL_COMP_DIST);
|
||||
|
||||
List<List<ComponentLine>> zones = determineZones(lines,
|
||||
orientation,
|
||||
@ -207,19 +190,10 @@ public class DocstrumSegmenter {
|
||||
0.0,
|
||||
0.0,
|
||||
lineSpacing * MAX_VERTICAL_MERGE_DIST);
|
||||
zones = mergeZones(zones, characterSpacing * 0.5);
|
||||
zones = mergeLines(zones, orientation, Double.NEGATIVE_INFINITY, 0.0, 0.0, lineSpacing * MAX_VERTICAL_MERGE_DIST);
|
||||
return convertToBxModel(zones, WORD_DIST_MULT * characterSpacing);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* For each component, searches for its nearest neighbors and stores the
|
||||
* result in component's neighbors attribute.
|
||||
*
|
||||
* @param components array of components
|
||||
* equal to the number of nearest-neighbors per component.
|
||||
*/
|
||||
private void findNeighbors(Character[] components) {
|
||||
|
||||
if (components.length == 0) {
|
||||
@ -352,15 +326,14 @@ public class DocstrumSegmenter {
|
||||
* Groups components into text lines.
|
||||
*
|
||||
* @param components component list
|
||||
* @param orientation - estimated text orientation
|
||||
* @param maxHorizontalDistance - maximum horizontal distance between components
|
||||
* @param maxVerticalDistance - maximum vertical distance between components
|
||||
* @return lines of components
|
||||
*/
|
||||
private List<ComponentLine> determineLines(List<Character> components, double orientation, double maxHorizontalDistance, double maxVerticalDistance) {
|
||||
private List<ComponentLine> determineLines(List<Character> components, double maxHorizontalDistance, double maxVerticalDistance) {
|
||||
|
||||
DisjointSets<Character> sets = new DisjointSets<Character>(components);
|
||||
AngleFilter filter = AngleFilter.newInstance(orientation - ANGLE_TOLERANCE, orientation + ANGLE_TOLERANCE);
|
||||
AngleFilter filter = AngleFilter.newInstance(-ANGLE_TOLERANCE, ANGLE_TOLERANCE);
|
||||
for (Character component : components) {
|
||||
for (Neighbor neighbor : component.getNeighbors()) {
|
||||
double x = neighbor.getHorizontalDistance() / maxHorizontalDistance;
|
||||
@ -374,24 +347,12 @@ public class DocstrumSegmenter {
|
||||
for (Set<Character> group : sets) {
|
||||
List<Character> lineComponents = new ArrayList<Character>(group);
|
||||
Collections.sort(lineComponents, Character.CharacterXComparator.getInstance());
|
||||
lines.add(new ComponentLine(lineComponents, orientation));
|
||||
lines.add(new ComponentLine(lineComponents));
|
||||
}
|
||||
return lines;
|
||||
}
|
||||
|
||||
|
||||
private double computeOrientation(List<ComponentLine> lines) {
|
||||
// Compute weighted mean of line angles
|
||||
double valueSum = 0.0;
|
||||
double weightSum = 0.0;
|
||||
for (ComponentLine line : lines) {
|
||||
valueSum += line.getAngle() * line.getLength();
|
||||
weightSum += line.getLength();
|
||||
}
|
||||
return valueSum / weightSum;
|
||||
}
|
||||
|
||||
|
||||
private List<List<ComponentLine>> determineZones(List<ComponentLine> lines,
|
||||
double orientation,
|
||||
double minHorizontalDistance,
|
||||
@ -444,111 +405,6 @@ public class DocstrumSegmenter {
|
||||
}
|
||||
|
||||
|
||||
private List<List<ComponentLine>> mergeZones(List<List<ComponentLine>> zones, double tolerance) {
|
||||
|
||||
List<BoundingBox> bounds = new ArrayList<BoundingBox>(zones.size());
|
||||
for (List<ComponentLine> zone : zones) {
|
||||
BoundingBoxBuilder builder = new BoundingBoxBuilder();
|
||||
for (ComponentLine line : zone) {
|
||||
for (Character component : line.getComponents()) {
|
||||
builder.expand(component.getChunk());
|
||||
}
|
||||
}
|
||||
bounds.add(builder.getBounds());
|
||||
}
|
||||
|
||||
List<List<ComponentLine>> outputZones = new ArrayList<List<ComponentLine>>();
|
||||
mainFor:
|
||||
for (int i = 0; i < zones.size(); i++) {
|
||||
for (int j = 0; j < zones.size(); j++) {
|
||||
if (i == j || bounds.get(j) == null || bounds.get(i) == null) {
|
||||
continue;
|
||||
}
|
||||
if (bounds.get(j).contains(bounds.get(i), tolerance)) {
|
||||
zones.get(j).addAll(zones.get(i));
|
||||
bounds.set(i, null);
|
||||
continue mainFor;
|
||||
}
|
||||
}
|
||||
outputZones.add(zones.get(i));
|
||||
}
|
||||
return outputZones;
|
||||
}
|
||||
|
||||
|
||||
private List<List<ComponentLine>> mergeLines(List<List<ComponentLine>> zones,
|
||||
double orientation,
|
||||
double minHorizontalDistance,
|
||||
double maxHorizontalDistance,
|
||||
double minVerticalDistance,
|
||||
double maxVerticalDistance) {
|
||||
|
||||
List<List<ComponentLine>> outputZones = new ArrayList<List<ComponentLine>>(zones.size());
|
||||
for (List<ComponentLine> zone : zones) {
|
||||
outputZones.add(mergeLinesInZone(zone, orientation, minHorizontalDistance, maxHorizontalDistance, minVerticalDistance, maxVerticalDistance));
|
||||
}
|
||||
return outputZones;
|
||||
}
|
||||
|
||||
|
||||
private List<ComponentLine> mergeLinesInZone(List<ComponentLine> lines,
|
||||
double orientation,
|
||||
double minHorizontalDistance,
|
||||
double maxHorizontalDistance,
|
||||
double minVerticalDistance,
|
||||
double maxVerticalDistance) {
|
||||
|
||||
DisjointSets<ComponentLine> sets = new DisjointSets<ComponentLine>(lines);
|
||||
for (int i = 0; i < lines.size(); i++) {
|
||||
ComponentLine li = lines.get(i);
|
||||
for (int j = i + 1; j < lines.size(); j++) {
|
||||
ComponentLine lj = lines.get(j);
|
||||
double hDist = li.horizontalDistance(lj, orientation);
|
||||
double vDist = li.verticalDistance(lj, orientation);
|
||||
if (minHorizontalDistance <= hDist && hDist <= maxHorizontalDistance && minVerticalDistance <= vDist && vDist <= maxVerticalDistance) {
|
||||
sets.union(li, lj);
|
||||
} else if (minVerticalDistance <= vDist && vDist <= maxVerticalDistance && Math.abs(hDist - Math.min(li.getLength(), lj.getLength())) < 0.1) {
|
||||
boolean componentOverlap = false;
|
||||
int overlappingCount = 0;
|
||||
for (Character ci : li.getComponents()) {
|
||||
for (Character cj : lj.getComponents()) {
|
||||
double dist = ci.overlappingDistance(cj, orientation);
|
||||
if (dist > 2) {
|
||||
componentOverlap = true;
|
||||
}
|
||||
if (dist > 0) {
|
||||
overlappingCount++;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!componentOverlap && overlappingCount <= 2) {
|
||||
sets.union(li, lj);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
List<ComponentLine> outputZone = new ArrayList<ComponentLine>();
|
||||
for (Set<ComponentLine> group : sets) {
|
||||
List<Character> components = new ArrayList<Character>();
|
||||
for (ComponentLine line : group) {
|
||||
components.addAll(line.getComponents());
|
||||
}
|
||||
Collections.sort(components, Character.CharacterXComparator.getInstance());
|
||||
outputZone.add(new ComponentLine(components, orientation));
|
||||
}
|
||||
return outputZone;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Converts list of zones from internal format (using components and
|
||||
* component lines) to BxModel.
|
||||
*
|
||||
* @param zones zones in internal format
|
||||
* @param wordSpacing - maximum allowed distance between components that
|
||||
* belong to one word
|
||||
* @return BxModel page
|
||||
*/
|
||||
private List<Zone> convertToBxModel(List<List<ComponentLine>> zones, double wordSpacing) {
|
||||
|
||||
List<Zone> zoneList = new ArrayList<>();
|
||||
@ -630,7 +486,7 @@ public class DocstrumSegmenter {
|
||||
private final List<Character> components;
|
||||
|
||||
|
||||
public ComponentLine(List<Character> components, double orientation) {
|
||||
public ComponentLine(List<Character> components) {
|
||||
|
||||
this.components = components;
|
||||
|
||||
@ -653,7 +509,7 @@ public class DocstrumSegmenter {
|
||||
} else if (!components.isEmpty()) {
|
||||
Character component = components.get(0);
|
||||
double dx = component.getChunk().getWidthDirAdj() / 3;
|
||||
double dy = dx * Math.tan(orientation);
|
||||
double dy = dx * Math.tan(0);
|
||||
this.x0 = component.getX() - dx;
|
||||
this.x1 = component.getX() + dx;
|
||||
this.y0 = component.getY() - dy;
|
||||
@ -671,12 +527,6 @@ public class DocstrumSegmenter {
|
||||
}
|
||||
|
||||
|
||||
public double getSlope() {
|
||||
|
||||
return (y1 - y0) / (x1 - x0);
|
||||
}
|
||||
|
||||
|
||||
public double getLength() {
|
||||
|
||||
return Math.sqrt((x0 - x1) * (x0 - x1) + (y0 - y1) * (y0 - y1));
|
||||
@ -699,12 +549,6 @@ public class DocstrumSegmenter {
|
||||
}
|
||||
|
||||
|
||||
public List<Character> getComponents() {
|
||||
|
||||
return components;
|
||||
}
|
||||
|
||||
|
||||
public double angularDifference(ComponentLine j) {
|
||||
|
||||
double diff = Math.abs(getAngle() - j.getAngle());
|
||||
|
||||
@ -8,11 +8,7 @@ import java.util.List;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.BBoxObject;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.Line;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.Word;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.Zone;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.utils.DoubleUtils;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.readingorder.BBoxZoneGroup;
|
||||
@ -24,7 +20,6 @@ import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.re
|
||||
public class HierarchicalReadingOrderResolver {
|
||||
|
||||
static final int GRIDSIZE = 50;
|
||||
static final double BOXES_FLOW = 0.5;
|
||||
static final double EPS = 0.01;
|
||||
static final int MAX_ZONES = 1000;
|
||||
static final Comparator<BBoxObject> Y_ASCENDING_ORDER = new Comparator<BBoxObject>() {
|
||||
@ -45,15 +40,6 @@ public class HierarchicalReadingOrderResolver {
|
||||
}
|
||||
};
|
||||
|
||||
static final Comparator<RedTextPosition> TP_X_ASCENDING_ORDER = new Comparator<RedTextPosition>() {
|
||||
|
||||
@Override
|
||||
public int compare(RedTextPosition o1, RedTextPosition o2) {
|
||||
|
||||
return DoubleUtils.compareDouble(o1.getXDirAdj(), o2.getXDirAdj(), EPS);
|
||||
}
|
||||
};
|
||||
|
||||
static final Comparator<BBoxObject> YX_ASCENDING_ORDER = new Comparator<BBoxObject>() {
|
||||
|
||||
@Override
|
||||
@ -67,21 +53,6 @@ public class HierarchicalReadingOrderResolver {
|
||||
|
||||
public List<Zone> resolve(List<Zone> zones) {
|
||||
|
||||
for (Zone zone : zones) {
|
||||
List<Line> lines = Lists.newArrayList(zone.getLines());
|
||||
for (Line line : lines) {
|
||||
List<Word> words = Lists.newArrayList(line.getWords());
|
||||
for (Word word : words) {
|
||||
List<RedTextPosition> chunks = Lists.newArrayList(word.getTextPositions());
|
||||
Collections.sort(chunks, TP_X_ASCENDING_ORDER);
|
||||
word.setTextPositions(chunks);
|
||||
}
|
||||
Collections.sort(words, X_ASCENDING_ORDER);
|
||||
line.setWords(words);
|
||||
}
|
||||
Collections.sort(lines, YX_ASCENDING_ORDER);
|
||||
zone.setLines(lines);
|
||||
}
|
||||
List<Zone> orderedZones;
|
||||
if (zones.size() > MAX_ZONES) {
|
||||
orderedZones = new ArrayList<Zone>(zones);
|
||||
|
||||
@ -255,26 +255,4 @@ public class DocumentPlane {
|
||||
return objs_.size();
|
||||
}
|
||||
|
||||
|
||||
public String dump() {
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
for (GridXY iter : grid.keySet()) {
|
||||
sb.append(iter.toString()).append(" [");
|
||||
for (BBoxObject obj : grid.get(iter)) {
|
||||
if (obj instanceof BBoxZoneGroup) {
|
||||
BBoxZoneGroup group = (BBoxZoneGroup) obj;
|
||||
sb.append(group.getLeftChild());
|
||||
sb.append(group.getRightChild());
|
||||
} else if (obj instanceof Zone) {
|
||||
Zone zone = (Zone) obj;
|
||||
sb.append(zone);
|
||||
}
|
||||
sb.append("\n");
|
||||
}
|
||||
sb.append("]\n");
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -25,7 +25,7 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
||||
@SneakyThrows
|
||||
public void testViewerDocument() {
|
||||
|
||||
String fileName = "files/new/agb1.pdf";
|
||||
String fileName = "files/Plenarprotokoll 1 (keine Druchsache!) (1).pdf";
|
||||
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
|
||||
|
||||
var documentFile = new ClassPathResource(fileName).getFile();
|
||||
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user