diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/DocstrumSegmenter.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/DocstrumSegmenter.java index 12ed4a7..7c3c2fe 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/DocstrumSegmenter.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/DocstrumSegmenter.java @@ -14,7 +14,6 @@ import com.google.common.collect.Lists; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.DisjointSets; import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Histogram; -import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.BoundingBox; import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.Line; import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.Word; import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.Zone; @@ -27,8 +26,6 @@ import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.mo public class DocstrumSegmenter { public static final int MAX_ZONES_PER_PAGE = 300; - public static final double ORIENTATION_MARGIN = 0.2; - public static final int LINES_PER_PAGE_MARGIN = 100; private static final double DISTANCE_STEP = 16.0; @@ -181,21 +178,7 @@ public class DocstrumSegmenter { double characterSpacing = computeCharacterSpacing(components, orientation); double lineSpacing = computeLineSpacing(components, orientation); - List lines = determineLines(components, orientation, characterSpacing * COMP_DIST_CHAR, lineSpacing * MAX_VERTICAL_COMP_DIST); - - if (Math.abs(orientation) > ORIENTATION_MARGIN) { - List linesZero = determineLines(components, 0, characterSpacing * COMP_DIST_CHAR, lineSpacing * MAX_VERTICAL_COMP_DIST); - - if (Math.abs(lines.size() - LINES_PER_PAGE_MARGIN) > Math.abs(linesZero.size() - LINES_PER_PAGE_MARGIN)) { - orientation = 0; - lines = linesZero; - } - } - - double lineOrientation = computeOrientation(lines); - if (!Double.isNaN(lineOrientation)) { - orientation = lineOrientation; - } + List lines = determineLines(components, characterSpacing * COMP_DIST_CHAR, lineSpacing * MAX_VERTICAL_COMP_DIST); List> zones = determineZones(lines, orientation, @@ -207,19 +190,10 @@ public class DocstrumSegmenter { 0.0, 0.0, lineSpacing * MAX_VERTICAL_MERGE_DIST); - zones = mergeZones(zones, characterSpacing * 0.5); - zones = mergeLines(zones, orientation, Double.NEGATIVE_INFINITY, 0.0, 0.0, lineSpacing * MAX_VERTICAL_MERGE_DIST); return convertToBxModel(zones, WORD_DIST_MULT * characterSpacing); } - /** - * Performs for each component search for nearest-neighbors and stores the - * result in component's neighbors attribute. - * - * @param components array of components - * equal to the number of nearest-neighbors per component. - */ private void findNeighbors(Character[] components) { if (components.length == 0) { @@ -352,15 +326,14 @@ public class DocstrumSegmenter { * Groups components into text lines. * * @param components component list - * @param orientation - estimated text orientation * @param maxHorizontalDistance - maximum horizontal distance between components * @param maxVerticalDistance - maximum vertical distance between components * @return lines of components */ - private List determineLines(List components, double orientation, double maxHorizontalDistance, double maxVerticalDistance) { + private List determineLines(List components, double maxHorizontalDistance, double maxVerticalDistance) { DisjointSets sets = new DisjointSets(components); - AngleFilter filter = AngleFilter.newInstance(orientation - ANGLE_TOLERANCE, orientation + ANGLE_TOLERANCE); + AngleFilter filter = AngleFilter.newInstance(-ANGLE_TOLERANCE, ANGLE_TOLERANCE); for (Character component : components) { for (Neighbor neighbor : component.getNeighbors()) { double x = neighbor.getHorizontalDistance() / maxHorizontalDistance; @@ -374,24 +347,12 @@ public class DocstrumSegmenter { for (Set group : sets) { List lineComponents = new ArrayList(group); Collections.sort(lineComponents, Character.CharacterXComparator.getInstance()); - lines.add(new ComponentLine(lineComponents, orientation)); + lines.add(new ComponentLine(lineComponents)); } return lines; } - private double computeOrientation(List lines) { - // Compute weighted mean of line angles - double valueSum = 0.0; - double weightSum = 0.0; - for (ComponentLine line : lines) { - valueSum += line.getAngle() * line.getLength(); - weightSum += line.getLength(); - } - return valueSum / weightSum; - } - - private List> determineZones(List lines, double orientation, double minHorizontalDistance, @@ -444,111 +405,6 @@ public class DocstrumSegmenter { } - private List> mergeZones(List> zones, double tolerance) { - - List bounds = new ArrayList(zones.size()); - for (List zone : zones) { - BoundingBoxBuilder builder = new BoundingBoxBuilder(); - for (ComponentLine line : zone) { - for (Character component : line.getComponents()) { - builder.expand(component.getChunk()); - } - } - bounds.add(builder.getBounds()); - } - - List> outputZones = new ArrayList>(); - mainFor: - for (int i = 0; i < zones.size(); i++) { - for (int j = 0; j < zones.size(); j++) { - if (i == j || bounds.get(j) == null || bounds.get(i) == null) { - continue; - } - if (bounds.get(j).contains(bounds.get(i), tolerance)) { - zones.get(j).addAll(zones.get(i)); - bounds.set(i, null); - continue mainFor; - } - } - outputZones.add(zones.get(i)); - } - return outputZones; - } - - - private List> mergeLines(List> zones, - double orientation, - double minHorizontalDistance, - double maxHorizontalDistance, - double minVerticalDistance, - double maxVerticalDistance) { - - List> outputZones = new ArrayList>(zones.size()); - for (List zone : zones) { - outputZones.add(mergeLinesInZone(zone, orientation, minHorizontalDistance, maxHorizontalDistance, minVerticalDistance, maxVerticalDistance)); - } - return outputZones; - } - - - private List mergeLinesInZone(List lines, - double orientation, - double minHorizontalDistance, - double maxHorizontalDistance, - double minVerticalDistance, - double maxVerticalDistance) { - - DisjointSets sets = new DisjointSets(lines); - for (int i = 0; i < lines.size(); i++) { - ComponentLine li = lines.get(i); - for (int j = i + 1; j < lines.size(); j++) { - ComponentLine lj = lines.get(j); - double hDist = li.horizontalDistance(lj, orientation); - double vDist = li.verticalDistance(lj, orientation); - if (minHorizontalDistance <= hDist && hDist <= maxHorizontalDistance && minVerticalDistance <= vDist && vDist <= maxVerticalDistance) { - sets.union(li, lj); - } else if (minVerticalDistance <= vDist && vDist <= maxVerticalDistance && Math.abs(hDist - Math.min(li.getLength(), lj.getLength())) < 0.1) { - boolean componentOverlap = false; - int overlappingCount = 0; - for (Character ci : li.getComponents()) { - for (Character cj : lj.getComponents()) { - double dist = ci.overlappingDistance(cj, orientation); - if (dist > 2) { - componentOverlap = true; - } - if (dist > 0) { - overlappingCount++; - } - } - } - if (!componentOverlap && overlappingCount <= 2) { - sets.union(li, lj); - } - } - } - } - List outputZone = new ArrayList(); - for (Set group : sets) { - List components = new ArrayList(); - for (ComponentLine line : group) { - components.addAll(line.getComponents()); - } - Collections.sort(components, Character.CharacterXComparator.getInstance()); - outputZone.add(new ComponentLine(components, orientation)); - } - return outputZone; - } - - - /** - * Converts list of zones from internal format (using components and - * component lines) to BxModel. - * - * @param zones zones in internal format - * @param wordSpacing - maximum allowed distance between components that - * belong to one word - * @return BxModel page - */ private List convertToBxModel(List> zones, double wordSpacing) { List zoneList = new ArrayList<>(); @@ -630,7 +486,7 @@ public class DocstrumSegmenter { private final List components; - public ComponentLine(List components, double orientation) { + public ComponentLine(List components) { this.components = components; @@ -653,7 +509,7 @@ public class DocstrumSegmenter { } else if (!components.isEmpty()) { Character component = components.get(0); double dx = component.getChunk().getWidthDirAdj() / 3; - double dy = dx * Math.tan(orientation); + double dy = dx * Math.tan(0); this.x0 = component.getX() - dx; this.x1 = component.getX() + dx; this.y0 = component.getY() - dy; @@ -671,12 +527,6 @@ public class DocstrumSegmenter { } - public double getSlope() { - - return (y1 - y0) / (x1 - x0); - } - - public double getLength() { return Math.sqrt((x0 - x1) * (x0 - x1) + (y0 - y1) * (y0 - y1)); @@ -699,12 +549,6 @@ public class DocstrumSegmenter { } - public List getComponents() { - - return components; - } - - public double angularDifference(ComponentLine j) { double diff = Math.abs(getAngle() - j.getAngle()); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/HierarchicalReadingOrderResolver.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/HierarchicalReadingOrderResolver.java index ff5bc14..7b6fa43 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/HierarchicalReadingOrderResolver.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/HierarchicalReadingOrderResolver.java @@ -8,11 +8,7 @@ import java.util.List; import org.springframework.stereotype.Service; -import com.google.common.collect.Lists; -import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition; import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.BBoxObject; -import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.Line; -import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.Word; import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.Zone; import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.utils.DoubleUtils; import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.readingorder.BBoxZoneGroup; @@ -24,7 +20,6 @@ import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.re public class HierarchicalReadingOrderResolver { static final int GRIDSIZE = 50; - static final double BOXES_FLOW = 0.5; static final double EPS = 0.01; static final int MAX_ZONES = 1000; static final Comparator Y_ASCENDING_ORDER = new Comparator() { @@ -45,15 +40,6 @@ public class HierarchicalReadingOrderResolver { } }; - static final Comparator TP_X_ASCENDING_ORDER = new Comparator() { - - @Override - public int compare(RedTextPosition o1, RedTextPosition o2) { - - return DoubleUtils.compareDouble(o1.getXDirAdj(), o2.getXDirAdj(), EPS); - } - }; - static final Comparator YX_ASCENDING_ORDER = new Comparator() { @Override @@ -67,21 +53,6 @@ public class HierarchicalReadingOrderResolver { public List resolve(List zones) { - for (Zone zone : zones) { - List lines = Lists.newArrayList(zone.getLines()); - for (Line line : lines) { - List words = Lists.newArrayList(line.getWords()); - for (Word word : words) { - List chunks = Lists.newArrayList(word.getTextPositions()); - Collections.sort(chunks, TP_X_ASCENDING_ORDER); - word.setTextPositions(chunks); - } - Collections.sort(words, X_ASCENDING_ORDER); - line.setWords(words); - } - Collections.sort(lines, YX_ASCENDING_ORDER); - zone.setLines(lines); - } List orderedZones; if (zones.size() > MAX_ZONES) { orderedZones = new ArrayList(zones); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/readingorder/DocumentPlane.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/readingorder/DocumentPlane.java index 71699be..cf6a6ce 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/readingorder/DocumentPlane.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/readingorder/DocumentPlane.java @@ -255,26 +255,4 @@ public class DocumentPlane { return objs_.size(); } - - public String dump() { - - StringBuilder sb = new StringBuilder(); - for (GridXY iter : grid.keySet()) { - sb.append(iter.toString()).append(" ["); - for (BBoxObject obj : grid.get(iter)) { - if (obj instanceof BBoxZoneGroup) { - BBoxZoneGroup group = (BBoxZoneGroup) obj; - sb.append(group.getLeftChild()); - sb.append(group.getRightChild()); - } else if (obj instanceof Zone) { - Zone zone = (Zone) obj; - sb.append(zone); - } - sb.append("\n"); - } - sb.append("]\n"); - } - return sb.toString(); - } - } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java index 3755838..8874153 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java @@ -25,7 +25,7 @@ public class ViewerDocumentTest extends BuildDocumentTest { @SneakyThrows public void testViewerDocument() { - String fileName = "files/new/agb1.pdf"; + String fileName = "files/Plenarprotokoll 1 (keine Druchsache!) (1).pdf"; String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf"; var documentFile = new ClassPathResource(fileName).getFile(); diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/files/Cyberport__SD-Faktura-Kopie_(ZRG2)_-_03.09.2021.pdf b/layoutparser-service/layoutparser-service-server/src/test/resources/files/Cyberport__SD-Faktura-Kopie_(ZRG2)_-_03.09.2021.pdf new file mode 100644 index 0000000..2a1ebf3 Binary files /dev/null and b/layoutparser-service/layoutparser-service-server/src/test/resources/files/Cyberport__SD-Faktura-Kopie_(ZRG2)_-_03.09.2021.pdf differ diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/files/brokenTableOnOcr_ocred.pdf b/layoutparser-service/layoutparser-service-server/src/test/resources/files/brokenTableOnOcr_ocred.pdf new file mode 100644 index 0000000..1a00988 Binary files /dev/null and b/layoutparser-service/layoutparser-service-server/src/test/resources/files/brokenTableOnOcr_ocred.pdf differ diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/files/cyberport Rechnung.pdf b/layoutparser-service/layoutparser-service-server/src/test/resources/files/cyberport Rechnung.pdf new file mode 100644 index 0000000..4178131 Binary files /dev/null and b/layoutparser-service/layoutparser-service-server/src/test/resources/files/cyberport Rechnung.pdf differ