higher = new ArrayList<>();
+ for (Float value : countPerValue.keySet()) {
+ if (value > mostPopular) {
+ higher.add(value);
+ }
+ }
+
+ return higher.stream().sorted(Collections.reverseOrder()).collect(Collectors.toList());
+ }
+
+
+ public Float getHighest() {
+
+ Float highest = null;
+ for (Float value : countPerValue.keySet()) {
+ if (highest == null || value > highest) {
+ highest = value;
+ }
+ }
+ return highest;
+ }
+
+}
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/Rectangle.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/Rectangle.java
new file mode 100644
index 0000000..c3323fd
--- /dev/null
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/Rectangle.java
@@ -0,0 +1,218 @@
+package com.knecon.fforesight.service.layoutparser.processor.classification.dto;
+
+import java.awt.geom.Point2D;
+import java.awt.geom.Rectangle2D;
+import java.util.Comparator;
+import java.util.List;
+
+@SuppressWarnings("all")
+public class Rectangle extends Rectangle2D.Float {
+
+ protected static final float VERTICAL_COMPARISON_THRESHOLD = 0.4f;
+ /**
+ * Ill-defined comparator, from when Rectangle was Comparable.
+ *
+ * see https://github.com/tabulapdf/tabula-java/issues/116
+ *
+ * @deprecated with no replacement
+ */
+ @Deprecated
+ public static final Comparator ILL_DEFINED_ORDER = new Comparator() {
+ @Override
+ public int compare(Rectangle o1, Rectangle o2) {
+
+ if (o1.equals(o2)) {
+ return 0;
+ }
+ if (o1.verticalOverlap(o2) > VERTICAL_COMPARISON_THRESHOLD) {
+ return o1.isLtrDominant() == -1 && o2.isLtrDominant() == -1 ? -java.lang.Double.compare(o1.getX(), o2.getX()) : java.lang.Double.compare(o1.getX(), o2.getX());
+ } else {
+ return java.lang.Float.compare(o1.getBottom(), o2.getBottom());
+ }
+ }
+ };
+
+
+ public Rectangle() {
+
+ super();
+ }
+
+
+ public Rectangle(float top, float left, float width, float height) {
+
+ super();
+ this.setRect(left, top, width, height);
+ }
+
+
+    /**
+     * Computes the smallest axis-aligned rectangle that encloses every rectangle in the list.
+     * @param rectangles rectangles to enclose; not guarded against an empty list, which yields a non-finite size
+     * @return minimum bounding box that contains all the rectangles
+     */
+    public static Rectangle boundingBoxOf(List<? extends Rectangle> rectangles) {
+        // Float.MIN_VALUE is the smallest positive float, not the most negative one, so the maxima start at -MAX_VALUE.
+        float minx = java.lang.Float.MAX_VALUE;
+        float miny = java.lang.Float.MAX_VALUE;
+        float maxx = -java.lang.Float.MAX_VALUE;
+        float maxy = -java.lang.Float.MAX_VALUE;
+        for (Rectangle r : rectangles) {
+            minx = (float) Math.min(r.getMinX(), minx);
+            miny = (float) Math.min(r.getMinY(), miny);
+            maxx = (float) Math.max(r.getMaxX(), maxx);
+            maxy = (float) Math.max(r.getMaxY(), maxy);
+        }
+        return new Rectangle(miny, minx, maxx - minx, maxy - miny);
+    }
+
+
+ public int compareTo(Rectangle other) {
+
+ return ILL_DEFINED_ORDER.compare(this, other);
+ }
+
+
+ // I'm bad at Java and need this for fancy sorting in
+ // technology.tabula.TextChunk.
+ public int isLtrDominant() {
+
+ return 0;
+ }
+
+
+ public float getArea() {
+
+ return this.width * this.height;
+ }
+
+
+ public float verticalOverlap(Rectangle other) {
+
+ return Math.max(0, Math.min(this.getBottom(), other.getBottom()) - Math.max(this.getTop(), other.getTop()));
+ }
+
+
+ public boolean verticallyOverlaps(Rectangle other) {
+
+ return verticalOverlap(other) > 0;
+ }
+
+
+ public float horizontalOverlap(Rectangle other) {
+
+ return Math.max(0, Math.min(this.getRight(), other.getRight()) - Math.max(this.getLeft(), other.getLeft()));
+ }
+
+
+ public boolean horizontallyOverlaps(Rectangle other) {
+
+ return horizontalOverlap(other) > 0;
+ }
+
+
+ public float verticalOverlapRatio(Rectangle other) {
+
+ float rv = 0, delta = Math.min(this.getBottom() - this.getTop(), other.getBottom() - other.getTop());
+
+ if (other.getTop() <= this.getTop() && this.getTop() <= other.getBottom() && other.getBottom() <= this.getBottom()) {
+ rv = (other.getBottom() - this.getTop()) / delta;
+ } else if (this.getTop() <= other.getTop() && other.getTop() <= this.getBottom() && this.getBottom() <= other.getBottom()) {
+ rv = (this.getBottom() - other.getTop()) / delta;
+ } else if (this.getTop() <= other.getTop() && other.getTop() <= other.getBottom() && other.getBottom() <= this.getBottom()) {
+ rv = (other.getBottom() - other.getTop()) / delta;
+ } else if (other.getTop() <= this.getTop() && this.getTop() <= this.getBottom() && this.getBottom() <= other.getBottom()) {
+ rv = (this.getBottom() - this.getTop()) / delta;
+ }
+
+ return rv;
+
+ }
+
+
+ public float overlapRatio(Rectangle other) {
+
+ double intersectionWidth = Math.max(0, Math.min(this.getRight(), other.getRight()) - Math.max(this.getLeft(), other.getLeft()));
+ double intersectionHeight = Math.max(0, Math.min(this.getBottom(), other.getBottom()) - Math.max(this.getTop(), other.getTop()));
+ double intersectionArea = Math.max(0, intersectionWidth * intersectionHeight);
+ double unionArea = this.getArea() + other.getArea() - intersectionArea;
+
+ return (float) (intersectionArea / unionArea);
+ }
+
+
+ public Rectangle merge(Rectangle other) {
+
+ this.setRect(this.createUnion(other));
+ return this;
+ }
+
+
+ public float getTop() {
+
+ return (float) this.getMinY();
+ }
+
+
+ public void setTop(float top) {
+
+ float deltaHeight = top - this.y;
+ this.setRect(this.x, top, this.width, this.height - deltaHeight);
+ }
+
+
+ public float getRight() {
+
+ return (float) this.getMaxX();
+ }
+
+
+ public void setRight(float right) {
+
+ this.setRect(this.x, this.y, right - this.x, this.height);
+ }
+
+
+ public float getLeft() {
+
+ return (float) this.getMinX();
+ }
+
+
+ public void setLeft(float left) {
+
+ float deltaWidth = left - this.x;
+ this.setRect(left, this.y, this.width - deltaWidth, this.height);
+ }
+
+
+ public float getBottom() {
+
+ return (float) this.getMaxY();
+ }
+
+
+ public void setBottom(float bottom) {
+
+ this.setRect(this.x, this.y, this.width, bottom - this.y);
+ }
+
+
+ public Point2D[] getPoints() {
+
+ return new Point2D[]{new Point2D.Float(this.getLeft(), this.getTop()), new Point2D.Float(this.getRight(), this.getTop()), new Point2D.Float(this.getRight(),
+ this.getBottom()), new Point2D.Float(this.getLeft(), this.getBottom())};
+ }
+
+
+    @Override
+    public String toString() {
+        // Reuse the superclass rendering, drop its last character (presumably the closing
+        // bracket) and append the derived bottom/right edges before closing it again.
+        String base = super.toString();
+        String head = base.substring(0, base.length() - 1);
+        String tail = String.format(",bottom=%f,right=%f]", this.getBottom(), this.getRight());
+        return head + tail;
+    }
+
+}
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/image/ClassifiedImage.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/image/ClassifiedImage.java
new file mode 100644
index 0000000..0ed4851
--- /dev/null
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/image/ClassifiedImage.java
@@ -0,0 +1,25 @@
+package com.knecon.fforesight.service.layoutparser.processor.classification.dto.image;
+
+import java.awt.geom.Rectangle2D;
+
+import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.ImageType;
+
+import lombok.Data;
+import lombok.NonNull;
+import lombok.RequiredArgsConstructor;
+
+@Data
+@RequiredArgsConstructor
+public class ClassifiedImage {
+
+ @NonNull
+ private Rectangle2D position;
+ @NonNull
+ private ImageType imageType;
+ private boolean isAppendedToSection;
+ @NonNull
+ private boolean hasTransparency;
+ @NonNull
+ private int page;
+
+}
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/table/CleanRulings.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/table/CleanRulings.java
new file mode 100644
index 0000000..b09bf5b
--- /dev/null
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/table/CleanRulings.java
@@ -0,0 +1,17 @@
+package com.knecon.fforesight.service.layoutparser.processor.classification.dto.table;
+
+import java.util.List;
+
+import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.Ruling;
+
+import lombok.Builder;
+import lombok.Data;
+
+@Data
+@Builder
+public class CleanRulings {
+
+ List horizontal;
+ List vertical;
+
+}
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/table/CvParsedTableCell.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/table/CvParsedTableCell.java
new file mode 100644
index 0000000..0b11042
--- /dev/null
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/table/CvParsedTableCell.java
@@ -0,0 +1,21 @@
+package com.knecon.fforesight.service.layoutparser.processor.classification.dto.table;
+
+import lombok.AllArgsConstructor;
+import lombok.Builder;
+import lombok.Data;
+import lombok.RequiredArgsConstructor;
+
+@Data
+@Builder
+@AllArgsConstructor
+@RequiredArgsConstructor
+public class CvParsedTableCell {
+
+ private float x0;
+ private float y0;
+ private float x1;
+ private float y1;
+ private float width;
+ private float height;
+
+}
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/table/Ruling.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/table/Ruling.java
new file mode 100644
index 0000000..109a06f
--- /dev/null
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/table/Ruling.java
@@ -0,0 +1,437 @@
+package com.knecon.fforesight.service.layoutparser.processor.classification.dto.table;
+
+import java.awt.geom.Line2D;
+import java.awt.geom.Point2D;
+import java.awt.geom.Rectangle2D;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.Formatter;
+import java.util.List;
+import java.util.Map;
+import java.util.TreeMap;
+
+import com.knecon.fforesight.service.layoutparser.processor.classification.utils.CohenSutherlandClipping;
+import com.knecon.fforesight.service.layoutparser.processor.classification.utils.DoubleComparisons;
+
+import lombok.extern.slf4j.Slf4j;
+
+@Slf4j
+@SuppressWarnings("all")
+public class Ruling extends Line2D.Float {
+
+ private static int PERPENDICULAR_PIXEL_EXPAND_AMOUNT = 2;
+
+
+ public Ruling(Point2D p1, Point2D p2) {
+
+ super(p1, p2);
+ }
+
+
+ public static List cropRulingsToArea(List rulings, Rectangle2D area) {
+
+ ArrayList rv = new ArrayList<>();
+ for (Ruling r : rulings) {
+ if (r.intersects(area)) {
+ rv.add(r.intersect(area));
+ }
+ }
+ return rv;
+ }
+
+
+ // log(n) implementation of find_intersections
+ // based on http://people.csail.mit.edu/indyk/6.838-old/handouts/lec2.pdf
+ public static Map findIntersections(List horizontals, List verticals) {
+
+ class SortObject {
+
+ protected SOType type;
+ protected float position;
+ protected Ruling ruling;
+
+
+ public SortObject(SOType type, float position, Ruling ruling) {
+
+ this.type = type;
+ this.position = position;
+ this.ruling = ruling;
+ }
+
+ }
+
+ List sos = new ArrayList<>();
+
+ TreeMap tree = new TreeMap<>(new Comparator() {
+ @Override
+ public int compare(Ruling o1, Ruling o2) {
+
+ return java.lang.Double.compare(o1.getTop(), o2.getTop());
+ }
+ });
+
+ TreeMap rv = new TreeMap<>(new Comparator() {
+ @Override
+ public int compare(Point2D o1, Point2D o2) {
+
+ if (o1.getY() > o2.getY()) {
+ return 1;
+ }
+ if (o1.getY() < o2.getY()) {
+ return -1;
+ }
+ if (o1.getX() > o2.getX()) {
+ return 1;
+ }
+ if (o1.getX() < o2.getX()) {
+ return -1;
+ }
+ return 0;
+ }
+ });
+
+ for (Ruling h : horizontals) {
+ sos.add(new SortObject(SOType.HLEFT, h.getLeft() - PERPENDICULAR_PIXEL_EXPAND_AMOUNT, h));
+ sos.add(new SortObject(SOType.HRIGHT, h.getRight() + PERPENDICULAR_PIXEL_EXPAND_AMOUNT, h));
+ }
+
+ for (Ruling v : verticals) {
+ sos.add(new SortObject(SOType.VERTICAL, v.getLeft(), v));
+ }
+
+ Collections.sort(sos, new Comparator() {
+ @Override
+ public int compare(SortObject a, SortObject b) {
+
+ int rv;
+ if (DoubleComparisons.feq(a.position, b.position)) {
+ if (a.type == SOType.VERTICAL && b.type == SOType.HLEFT) {
+ rv = 1;
+ } else if (a.type == SOType.VERTICAL && b.type == SOType.HRIGHT) {
+ rv = -1;
+ } else if (a.type == SOType.HLEFT && b.type == SOType.VERTICAL) {
+ rv = -1;
+ } else if (a.type == SOType.HRIGHT && b.type == SOType.VERTICAL) {
+ rv = 1;
+ } else {
+ rv = java.lang.Double.compare(a.position, b.position);
+ }
+ } else {
+ return java.lang.Double.compare(a.position, b.position);
+ }
+ return rv;
+ }
+ });
+
+ for (SortObject so : sos) {
+ switch (so.type) {
+ case VERTICAL:
+ for (Map.Entry h : tree.entrySet()) {
+ try {
+ Point2D i = h.getKey().intersectionPoint(so.ruling);
+ if (i == null) {
+ continue;
+ }
+ rv.put(i, new Ruling[]{h.getKey().expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT), so.ruling.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT)});
+ } catch (UnsupportedOperationException e) {
+ log.info("Some line are oblique, ignoring...");
+ continue;
+ }
+ }
+ break;
+ case HRIGHT:
+ tree.remove(so.ruling);
+ break;
+ case HLEFT:
+ tree.put(so.ruling, true);
+ break;
+ }
+ }
+
+ return rv;
+
+ }
+
+
+ public boolean vertical() {
+
+ return this.length() > 0 && DoubleComparisons.feq(this.x1, this.x2); //diff < ORIENTATION_CHECK_THRESHOLD;
+ }
+
+
+ public boolean horizontal() {
+
+ return this.length() > 0 && DoubleComparisons.feq(this.y1, this.y2); //diff < ORIENTATION_CHECK_THRESHOLD;
+ }
+
+ // attributes that make sense only for non-oblique lines
+ // these are used to have a single collapse method (in page, currently)
+
+
+ public boolean oblique() {
+
+ return !(this.vertical() || this.horizontal());
+ }
+
+
+ public float getPosition() {
+
+ if (this.oblique()) {
+ throw new UnsupportedOperationException();
+ }
+ return this.vertical() ? this.getLeft() : this.getTop();
+ }
+
+
+ public float getStart() {
+
+ if (this.oblique()) {
+ throw new UnsupportedOperationException();
+ }
+ return this.vertical() ? this.getTop() : this.getLeft();
+ }
+
+
+ public void setStart(float v) {
+
+ if (this.oblique()) {
+ throw new UnsupportedOperationException();
+ }
+ if (this.vertical()) {
+ this.setTop(v);
+ } else {
+ this.setLeft(v);
+ }
+ }
+
+
+ public float getEnd() {
+
+ if (this.oblique()) {
+ throw new UnsupportedOperationException();
+ }
+ return this.vertical() ? this.getBottom() : this.getRight();
+ }
+
+
+ public void setEnd(float v) {
+
+ if (this.oblique()) {
+ throw new UnsupportedOperationException();
+ }
+ if (this.vertical()) {
+ this.setBottom(v);
+ } else {
+ this.setRight(v);
+ }
+ }
+
+
+ public void setStartEnd(float start, float end) {
+
+ if (this.oblique()) {
+ throw new UnsupportedOperationException();
+ }
+ if (this.vertical()) {
+ this.setTop(start);
+ this.setBottom(end);
+ } else {
+ this.setLeft(start);
+ this.setRight(end);
+ }
+ }
+
+
+ public boolean perpendicularTo(Ruling other) {
+
+ return this.vertical() == other.horizontal();
+ }
+
+
+ public boolean nearlyIntersects(Ruling another, int colinearOrParallelExpandAmount) {
+
+ if (this.intersectsLine(another)) {
+ return true;
+ }
+
+ boolean rv = false;
+
+ if (this.perpendicularTo(another)) {
+ rv = this.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT).intersectsLine(another);
+ } else {
+ rv = this.expand(colinearOrParallelExpandAmount).intersectsLine(another.expand(colinearOrParallelExpandAmount));
+ }
+
+ return rv;
+ }
+
+
+ public double length() {
+
+ return Math.sqrt(Math.pow(this.x1 - this.x2, 2) + Math.pow(this.y1 - this.y2, 2));
+ }
+
+
+ public Ruling intersect(Rectangle2D clip) {
+
+ Float clipee = (Float) this.clone();
+ boolean clipped = new CohenSutherlandClipping(clip).clip(clipee);
+
+ if (clipped) {
+ return new Ruling(clipee.getP1(), clipee.getP2());
+ } else {
+ return this;
+ }
+ }
+
+
+ public Ruling expand(float amount) {
+
+ Ruling r = (Ruling) this.clone();
+ try {
+ r.setStart(this.getStart() - amount);
+ r.setEnd(this.getEnd() + amount);
+ } catch (UnsupportedOperationException e) {
+ log.warn("Could not expand ruling!");
+ }
+ return r;
+ }
+
+
+ public Point2D intersectionPoint(Ruling other) {
+
+ Ruling this_l = this.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT);
+ Ruling other_l = other.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT);
+ Ruling horizontal, vertical;
+
+ if (!this_l.intersectsLine(other_l)) {
+ return null;
+ }
+
+ if (this_l.horizontal() && other_l.vertical()) {
+ horizontal = this_l;
+ vertical = other_l;
+ } else if (this_l.vertical() && other_l.horizontal()) {
+ vertical = this_l;
+ horizontal = other_l;
+ } else {
+ log.warn("lines must be orthogonal, vertical and horizontal");
+ return null;
+ }
+ return new Point2D.Float(vertical.getLeft(), horizontal.getTop());
+ }
+
+
+ @Override
+ public boolean equals(Object other) {
+
+ if (this == other) {
+ return true;
+ }
+
+ if (!(other instanceof Ruling)) {
+ return false;
+ }
+
+ Ruling o = (Ruling) other;
+ return this.getP1().equals(o.getP1()) && this.getP2().equals(o.getP2());
+ }
+
+
+    @Override
+    public int hashCode() {
+        // Hash the same endpoints equals() compares; "+ 0.0f" folds -0.0f into 0.0f, which == treats as equal.
+        return java.util.Objects.hash(this.x1 + 0.0f, this.y1 + 0.0f, this.x2 + 0.0f, this.y2 + 0.0f);
+    }
+
+
+ public float getTop() {
+
+ return this.y1;
+ }
+
+
+ public void setTop(float v) {
+
+ setLine(this.getLeft(), v, this.getRight(), this.getBottom());
+ }
+
+
+ public float getLeft() {
+
+ return this.x1;
+ }
+
+
+ public void setLeft(float v) {
+
+ setLine(v, this.getTop(), this.getRight(), this.getBottom());
+ }
+
+
+ public float getBottom() {
+
+ return this.y2;
+ }
+
+
+ public void setBottom(float v) {
+
+ setLine(this.getLeft(), this.getTop(), this.getRight(), v);
+ }
+
+
+ public float getRight() {
+
+ return this.x2;
+ }
+
+
+ public void setRight(float v) {
+
+ setLine(this.getLeft(), this.getTop(), v, this.getBottom());
+ }
+
+
+ public float getWidth() {
+
+ return this.getRight() - this.getLeft();
+ }
+
+
+ public float getHeight() {
+
+ return this.getBottom() - this.getTop();
+ }
+
+
+ public double getAngle() {
+
+ double angle = Math.toDegrees(Math.atan2(this.getP2().getY() - this.getP1().getY(), this.getP2().getX() - this.getP1().getX()));
+
+ if (angle < 0) {
+ angle += 360;
+ }
+ return angle;
+ }
+
+
+    @Override
+    public String toString() {
+        // Note: the labels say min/max, but the values printed are the raw endpoint
+        // coordinates x1, y1, x2, y2 in that order.
+        String template = "%s[minX=%f minY=%f maxX=%f maxY=%f]";
+        try (Formatter out = new Formatter(new StringBuilder())) {
+            return out.format(template, this.getClass().toString(), this.x1, this.y1, this.x2, this.y2).toString();
+        }
+    }
+
+
+ private enum SOType {
+ VERTICAL,
+ HRIGHT,
+ HLEFT
+ }
+
+}
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/table/Table.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/table/Table.java
new file mode 100644
index 0000000..cee62ef
--- /dev/null
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/table/Table.java
@@ -0,0 +1,350 @@
+package com.knecon.fforesight.service.layoutparser.processor.classification.dto.table;
+
+import java.awt.geom.Point2D;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+import java.util.TreeMap;
+import java.util.stream.Collectors;
+
+import com.knecon.fforesight.service.layoutparser.processor.classification.dto.AbstractTextContainer;
+import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock;
+import com.knecon.fforesight.service.layoutparser.processor.classification.dto.Rectangle;
+
+import lombok.Getter;
+import lombok.Setter;
+import lombok.extern.slf4j.Slf4j;
+
+@Slf4j
+public class Table extends AbstractTextContainer {
+
+ private final TreeMap cells = new TreeMap<>();
+
+ private final int rotation;
+ @Getter
+ @Setter
+ private String headline;
+ private int unrotatedRowCount;
+ private int unrotatedColCount;
+ private int rowCount = -1;
+ private int colCount = -1;
+ private List> rows;
+
+
+ public Table(List cells, Rectangle area, int rotation) {
+
+ addCells(cells);
+ minX = area.getLeft();
+ minY = area.getBottom();
+ maxX = area.getRight();
+ maxY = area.getTop();
+ classification = "Table";
+ this.rotation = rotation;
+
+ }
+
+
+ public List> getRows() {
+
+ if (rows == null) {
+ rows = computeRows();
+
+ // Ignore rows that does not contain any cells and values.
+ List> rowsToRemove = new ArrayList<>();
+ for (List row : rows) {
+ if (row.size() == 1 && row.get(0).getTextBlocks().isEmpty()) {
+ rowsToRemove.add(row);
+ }
+ }
+ rows.removeAll(rowsToRemove);
+
+ computeHeaders();
+ }
+
+ return rows;
+
+ }
+
+
+ public int getRowCount() {
+
+ if (rowCount == -1) {
+ rowCount = getRows().size();
+ }
+ return rowCount;
+ }
+
+
+ public int getColCount() {
+
+ if (colCount == -1) {
+ colCount = getRows().stream().mapToInt(List::size).max().orElse(0);
+ }
+ return colCount;
+
+ }
+
+
+    /**
+     * Links every cell to its header cells and flags header cells.
+     * A cell is flagged when the most popular word style of its first text block is "bold". Each cell records the last
+     * header cell of the unbroken header run starting at the left of its row and at the top of its column.
+     */
+    private void computeHeaders() {
+        if (rows == null) {
+            rows = computeRows();
+        }
+        // Cells are visited left to right, top to bottom, so every cell to the left of or above the
+        // current one has already been flagged (single-cell rows are skipped and never flagged here).
+        for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) {
+            List<TableCell> rowCells = rows.get(rowIndex);
+            if (rowCells.size() == 1) {
+                continue;
+            }
+            for (int colIndex = 0; colIndex < rowCells.size(); colIndex++) {
+                TableCell cell = rowCells.get(colIndex);
+                // Last header cell of the header run that starts at the left edge of this row.
+                TableCell leftHeader = null;
+                for (TableCell leftCell : rowCells.subList(0, colIndex)) {
+                    if (!leftCell.isHeaderCell()) {
+                        break;
+                    }
+                    leftHeader = leftCell;
+                }
+                if (leftHeader != null) {
+                    cell.getHeaderCells().add(leftHeader);
+                }
+
+                // Same for the column. Tracked in its own variable: sharing one with the row scan
+                // added the left header a second time whenever the column had no header.
+                TableCell topHeader = null;
+                for (int i = 0; i < rowIndex; i++) {
+                    List<TableCell> upperRow = rows.get(i);
+                    if (colIndex >= upperRow.size()) {
+                        // Shorter row without a cell in this column: skipped, it does not end the run.
+                        log.debug("No cell {} in row {}, ignoring.", colIndex, i);
+                        continue;
+                    }
+                    TableCell topCell = upperRow.get(colIndex);
+                    if (!topCell.isHeaderCell()) {
+                        break;
+                    }
+                    topHeader = topCell;
+                }
+                if (topHeader != null) {
+                    cell.getHeaderCells().add(topHeader);
+                }
+
+                // Constant-first equals so a text block without a style cannot cause a NullPointerException.
+                if (!cell.getTextBlocks().isEmpty() && "bold".equals(cell.getTextBlocks().get(0).getMostPopularWordStyle())) {
+                    cell.setHeaderCell(true);
+                }
+            }
+        }
+    }
+
+
+ private List> computeRows() {
+
+ List> rows = new ArrayList<>();
+ if (rotation == 90) {
+ for (int i = 0; i < unrotatedColCount; i++) { // rows
+ List lastRow = new ArrayList<>();
+ for (int j = unrotatedRowCount - 1; j >= 0; j--) { // cols
+ TableCell cell = cells.get(new TableCellPosition(j, i));
+ if (cell != null) {
+ lastRow.add(cell);
+ }
+ }
+ rows.add(lastRow);
+ }
+ } else if (rotation == 270) {
+ for (int i = unrotatedColCount - 1; i >= 0; i--) { // rows
+ List lastRow = new ArrayList<>();
+ for (int j = 0; j < unrotatedRowCount; j++) { // cols
+ TableCell cell = cells.get(new TableCellPosition(j, i));
+ if (cell != null) {
+ lastRow.add(cell);
+ }
+ }
+ rows.add(lastRow);
+ }
+ } else {
+ for (int i = 0; i < unrotatedRowCount; i++) {
+ List lastRow = new ArrayList<>();
+ for (int j = 0; j < unrotatedColCount; j++) {
+ TableCell cell = cells.get(new TableCellPosition(i, j)); // JAVA_8 use getOrDefault()
+ if (cell != null) {
+ lastRow.add(cell);
+ }
+ }
+ rows.add(lastRow);
+ }
+ }
+
+ return rows;
+
+ }
+
+
+ private void add(TableCell chunk, int row, int col) {
+
+ unrotatedRowCount = Math.max(unrotatedRowCount, row + 1);
+ unrotatedColCount = Math.max(unrotatedColCount, col + 1);
+
+ TableCellPosition cp = new TableCellPosition(row, col);
+ cells.put(cp, chunk);
+
+ }
+
+
+ private void addCells(List cells) {
+
+ if (cells.isEmpty()) {
+ return;
+ }
+
+ cells.removeIf(cell -> cell.getWidth() < 1.1 || cell.getHeight() < 1.1);
+
+ List> rowsOfCells = calculateStructure(cells);
+
+ for (int i = 0; i < rowsOfCells.size(); i++) {
+ for (int j = 0; j < rowsOfCells.get(i).size(); j++) {
+ add(rowsOfCells.get(i).get(j), i, j);
+ }
+ }
+
+ }
+
+
+ /**
+ * Calculates the structure of the table. For spanning rows and columns multiple cells with the same values will be inserted.
+ *
+ * @param cells The found cells
+ * @return Table Structure
+ */
+ private List> calculateStructure(List cells) {
+
+ List> matrix = new ArrayList<>();
+
+ if (cells.isEmpty()) {
+ return matrix;
+ }
+
+ Set uniqueX = new HashSet<>();
+ Set uniqueY = new HashSet<>();
+ cells.stream().filter(c -> !c.getTextBlocks().isEmpty() || c.getHeight() > 3 && c.getWidth() > 3).forEach(c -> {
+ uniqueX.add(c.getLeft());
+ uniqueX.add(c.getRight());
+ uniqueY.add(c.getBottom());
+ uniqueY.add(c.getTop());
+ });
+
+ var sortedUniqueX = uniqueX.stream().sorted().collect(Collectors.toList());
+ var sortedUniqueY = uniqueY.stream().sorted().collect(Collectors.toList());
+
+ Float prevY = null;
+ for (Float y : sortedUniqueY) {
+
+ List row = new ArrayList<>();
+
+ Float prevX = null;
+ for (Float x : sortedUniqueX) {
+
+ if (prevY != null && prevX != null) {
+ var cell = new TableCell(new Point2D.Float(prevX, prevY), new Point2D.Float(x, y));
+
+ var intersectionCell = cells.stream().filter(c -> cell.intersects(c) && cell.overlapRatio(c) > 0.1f).findFirst();
+ if (intersectionCell.isPresent()) {
+ cell.getTextBlocks().addAll(intersectionCell.get().getTextBlocks());
+ }
+ row.add(cell);
+ }
+ prevX = x;
+ }
+
+ if (prevY != null && prevX != null) {
+ matrix.add(row);
+ }
+ prevY = y;
+ }
+
+ Collections.reverse(matrix);
+
+ return matrix;
+ }
+
+
+    /**
+     * Renders the rows as CSV-like text: rows are separated by a newline, cells by a comma, and every
+     * text block is wrapped in double quotes with embedded double quotes escaped as {@code \"}.
+     */
+    @Override
+    public String getText() {
+        StringBuilder sb = new StringBuilder();
+        List<List<TableCell>> rows = getRows();
+        int i = 0;
+        for (List<TableCell> row : rows) {
+            if (i != 0) {
+                sb.append("\n");
+            }
+            boolean firstColumn = true;
+            for (TableCell column : row) {
+                if (!firstColumn) {
+                    sb.append(",");
+                }
+                if (column != null && column.getTextBlocks() != null) {
+                    boolean first = true;
+                    for (ClassificationTextBlock textBlock : column.getTextBlocks()) {
+                        if (!first) {
+                            sb.append("\n");
+                        }
+                        // replace(), not replaceAll(): as a regex replacement "\\\"" un-escapes to a bare quote (a no-op).
+                        sb.append('\"').append(textBlock.getText().replace("\"", "\\\"")).append('\"');
+                        first = false;
+                    }
+                }
+                firstColumn = false;
+            }
+            i++;
+        }
+        return sb.toString();
+    }
+
+
+ public String getTextAsHtml() {
+
+ StringBuilder sb = new StringBuilder();
+ List> rows = getRows();
+
+ sb.append("");
+ int i = 0;
+ for (List row : rows) {
+ sb.append("\n");
+ if (!row.isEmpty()) {
+ for (TableCell column : row) {
+ sb.append(i == 0 ? "\n| " : "\n | ");
+ if (column != null && column.getTextBlocks() != null) {
+ boolean first = true;
+ for (ClassificationTextBlock textBlock : column.getTextBlocks()) {
+ if (!first) {
+ sb.append(" ");
+ }
+ sb.append(textBlock.getText().replaceAll("\\n", " "));
+ first = false;
+ }
+ }
+ sb.append(i == 0 ? "" : " | ");
+ }
+ }
+ sb.append("
");
+ i++;
+ }
+ sb.append("
");
+
+ return sb.toString();
+ }
+
+}
\ No newline at end of file
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/table/TableCell.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/table/TableCell.java
new file mode 100644
index 0000000..578371f
--- /dev/null
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/table/TableCell.java
@@ -0,0 +1,38 @@
+package com.knecon.fforesight.service.layoutparser.processor.classification.dto.table;
+
+import java.awt.geom.Point2D;
+import java.util.ArrayList;
+import java.util.List;
+
+import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock;
+import com.knecon.fforesight.service.layoutparser.processor.classification.dto.Rectangle;
+
+import lombok.Data;
+import lombok.EqualsAndHashCode;
+import lombok.NoArgsConstructor;
+
+@SuppressWarnings("serial")
+@Data
+@EqualsAndHashCode(callSuper = true)
+@NoArgsConstructor
+public class TableCell extends Rectangle {
+
+ private List textBlocks = new ArrayList<>();
+
+ private List headerCells = new ArrayList<>();
+
+ private boolean isHeaderCell;
+
+
+ public TableCell(Point2D topLeft, Point2D bottomRight) {
+
+ super((float) topLeft.getY(), (float) topLeft.getX(), (float) (bottomRight.getX() - topLeft.getX()), (float) (bottomRight.getY() - topLeft.getY()));
+ }
+
+
+ public void addTextBlock(ClassificationTextBlock textBlock) {
+
+ textBlocks.add(textBlock);
+ }
+
+}
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/table/TableCellPosition.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/table/TableCellPosition.java
new file mode 100644
index 0000000..42cb649
--- /dev/null
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/table/TableCellPosition.java
@@ -0,0 +1,22 @@
+package com.knecon.fforesight.service.layoutparser.processor.classification.dto.table;
+
+import lombok.RequiredArgsConstructor;
+import lombok.Value;
+
+/** Immutable (row, col) key of a table cell, ordered row-major: by row first, then by column. */
+@Value
+@RequiredArgsConstructor
+public class TableCellPosition implements Comparable<TableCellPosition> {
+
+    int row;
+    int col;
+
+    /** Compares via Integer.compare so extreme values cannot overflow the way subtraction would. */
+    @Override
+    public int compareTo(TableCellPosition other) {
+
+        int byRow = Integer.compare(row, other.row);
+        return byRow != 0 ? byRow : Integer.compare(col, other.col);
+    }
+
+}
\ No newline at end of file
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/text/ClassificationTextBlock.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/text/ClassificationTextBlock.java
new file mode 100644
index 0000000..1076cf8
--- /dev/null
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/text/ClassificationTextBlock.java
@@ -0,0 +1,286 @@
+package com.knecon.fforesight.service.layoutparser.processor.classification.dto.text;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import com.knecon.fforesight.service.layoutparser.processor.classification.dto.AbstractTextContainer;
+import com.knecon.fforesight.service.layoutparser.processor.classification.utils.TextNormalizationUtilities;
+
+import lombok.AllArgsConstructor;
+import lombok.Builder;
+import lombok.Data;
+import lombok.EqualsAndHashCode;
+import lombok.NoArgsConstructor;
+
+@EqualsAndHashCode(callSuper = true)
+@AllArgsConstructor
+@Builder
+@Data
+@NoArgsConstructor
+public class ClassificationTextBlock extends AbstractTextContainer {
+
+ @Builder.Default
+ private List sequences = new ArrayList<>();
+
+ private int rotation;
+
+ private int indexOnPage;
+
+ private String mostPopularWordFont;
+
+ private String mostPopularWordStyle;
+
+ private float mostPopularWordFontSize;
+
+ private float mostPopularWordHeight;
+
+ private float mostPopularWordSpaceWidth;
+
+ private float highestFontSize;
+
+ private String classification;
+
+
+ public TextDirection getDir() {
+
+ return sequences.get(0).getDir();
+ }
+
+ /**
+  * Returns the page height reported by the first entry of {@code sequences}.
+  */
+ private float getPageHeight() {
+
+     TextPositionSequence firstSequence = sequences.get(0);
+     return firstSequence.getPageHeight();
+ }
+
+
+ /**
+  * Returns the page width reported by the first entry of {@code sequences}.
+  */
+ private float getPageWidth() {
+
+     TextPositionSequence firstSequence = sequences.get(0);
+     return firstSequence.getPageWidth();
+ }
+
+
+ /**
+ * Returns the minX value in pdf coordinate system.
+ * Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation.
+ * 0 -> LowerLeft
+ * 90 -> UpperLeft
+ * 180 -> UpperRight
+ * 270 -> LowerRight
+ *
+ * @return the minX value in pdf coordinate system
+ */
+ public float getPdfMinX() {
+
+ if (getDir().getDegrees() == 90) {
+ return minY;
+ } else if (getDir().getDegrees() == 180) {
+ return getPageWidth() - maxX;
+
+ } else if (getDir().getDegrees() == 270) {
+
+ return getPageWidth() - maxY;
+ } else {
+ return minX;
+ }
+ }
+
+ /**
+ * Returns the maxX value in pdf coordinate system.
+ * Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation.
+ * 0 -> LowerLeft
+ * 90 -> UpperLeft
+ * 180 -> UpperRight
+ * 270 -> LowerRight
+ *
+ * @return the maxX value in pdf coordinate system
+ */
+ public float getPdfMaxX() {
+
+ if (getDir().getDegrees() == 90) {
+ return maxY;
+ } else if (getDir().getDegrees() == 180) {
+ return getPageWidth() - minX;
+ } else if (getDir().getDegrees() == 270) {
+ return getPageWidth() - minY;
+
+ } else {
+ return maxX;
+ }
+ }
+
+
+ /**
+ * Returns the minY value in pdf coordinate system.
+ * Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation.
+ * 0 -> LowerLeft
+ * 90 -> UpperLeft
+ * 180 -> UpperRight
+ * 270 -> LowerRight
+ *
+ * @return the minY value in pdf coordinate system
+ */
+ public float getPdfMinY() {
+
+ if (getDir().getDegrees() == 90) {
+ return minX;
+ } else if (getDir().getDegrees() == 180) {
+ return maxY;
+
+ } else if (getDir().getDegrees() == 270) {
+ return getPageHeight() - maxX;
+
+ } else {
+ return getPageHeight() - maxY;
+ }
+ }
+
+
+ /**
+ * Returns the maxY value in pdf coordinate system.
+ * Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation.
+ * 0 -> LowerLeft
+ * 90 -> UpperLeft
+ * 180 -> UpperRight
+ * 270 -> LowerRight
+ *
+ * @return the maxY value in pdf coordinate system
+ */
+ public float getPdfMaxY() {
+
+ if (getDir().getDegrees() == 90) {
+ return maxX;
+ } else if (getDir().getDegrees() == 180) {
+
+ return minY;
+ } else if (getDir().getDegrees() == 270) {
+ return getPageHeight() - minX;
+ } else {
+ return getPageHeight() - minY;
+ }
+ }
+
+
+ /**
+  * Creates a text block with the given bounds, sequences, rotation and page index.
+  * The remaining fields (font statistics, classification) are left at their Java defaults.
+  *
+  * @param minX        lower x bound
+  * @param maxX        upper x bound
+  * @param minY        lower y bound
+  * @param maxY        upper y bound
+  * @param sequences   sequences of this block; the list is stored as passed, not copied
+  * @param rotation    rotation value stored on the block
+  * @param indexOnPage index of the block on its page
+  */
+ public ClassificationTextBlock(float minX, float maxX, float minY, float maxY, List sequences, int rotation, int indexOnPage) {
+     super();
+     // bounds live in the superclass
+     super.minX = minX;
+     super.minY = minY;
+     super.maxX = maxX;
+     super.maxY = maxY;
+     this.sequences = sequences;
+     this.rotation = rotation;
+     this.indexOnPage = indexOnPage;
+ }
+
+
+ /**
+  * Returns a {@link #copy()} of this block whose bounds have been extended via {@link #add(TextPositionSequence)}.
+  *
+  * @param r the sequence to include in the bounds
+  * @return the extended copy
+  */
+ public ClassificationTextBlock union(TextPositionSequence r) {
+
+     ClassificationTextBlock merged = copy();
+     merged.add(r);
+     return merged;
+ }
+
+
+ /**
+  * Returns a {@link #copy()} of this block to which the given block has been added via {@link #add(ClassificationTextBlock)}.
+  *
+  * @param r the block to merge in
+  * @return the merged copy
+  */
+ public ClassificationTextBlock union(ClassificationTextBlock r) {
+
+     ClassificationTextBlock merged = copy();
+     merged.add(r);
+     return merged;
+ }
+
+
+ /**
+  * Grows the bounds of this block so that they enclose the given block and appends all of its sequences.
+  *
+  * @param r the block to merge into this one
+  */
+ public void add(ClassificationTextBlock r) {
+
+     // each bound only moves outwards; otherwise it keeps its current value
+     minX = r.getMinX() < minX ? r.getMinX() : minX;
+     maxX = r.getMaxX() > maxX ? r.getMaxX() : maxX;
+     minY = r.getMinY() < minY ? r.getMinY() : minY;
+     maxY = r.getMaxY() > maxY ? r.getMaxY() : maxY;
+
+     sequences.addAll(r.getSequences());
+ }
+
+
+ /**
+  * Grows the bounds of this block so that they enclose the direction-adjusted bounds of the given sequence.
+  * Only the bounds are updated; the sequence itself is not appended to {@code sequences}.
+  *
+  * @param r the sequence whose bounds are merged in
+  */
+ public void add(TextPositionSequence r) {
+
+     // each bound only moves outwards; otherwise it keeps its current value
+     minX = r.getMinXDirAdj() < minX ? r.getMinXDirAdj() : minX;
+     maxX = r.getMaxXDirAdj() > maxX ? r.getMaxXDirAdj() : maxX;
+     minY = r.getMinYDirAdj() < minY ? r.getMinYDirAdj() : minY;
+     maxY = r.getMaxYDirAdj() > maxY ? r.getMaxYDirAdj() : maxY;
+ }
+
+
+ /**
+  * Creates a copy of this block carrying the same bounds, rotation and page index and its own list of sequences.
+  * Font statistics and the classification are not carried over.
+  *
+  * @return the copy
+  */
+ public ClassificationTextBlock copy() {
+
+     // Own list instance: add(ClassificationTextBlock) on the copy (e.g. via union) must not
+     // append to the sequences of the block it was copied from.
+     return new ClassificationTextBlock(minX, maxX, minY, maxY, new ArrayList<>(sequences), rotation, indexOnPage);
+ }
+
+
+ public void resize(float x1, float y1, float width, float height) {
+
+ set(x1, y1, x1 + width, y1 + height);
+ }
+
+
+ public void set(float x1, float y1, float x2, float y2) {
+
+ this.minX = Math.min(x1, x2);
+ this.maxX = Math.max(x1, x2);
+ this.minY = Math.min(y1, y2);
+ this.maxY = Math.max(y1, y2);
+ }
+
+
+ @Override
+ public String toString() {
+
+ StringBuilder builder = new StringBuilder();
+
+ for (int i = 0; i < sequences.size(); i++) {
+ String sequenceAsString = sequences.get(i).toString();
+ // Fix for missing Whitespace. This is recognized in getSequences method. See PDFTextStripper Line 1730.
+ if (i != 0 && sequences.get(i - 1).charAt(sequences.get(i - 1).length() - 1) != ' ' && sequenceAsString.charAt(0) != ' ') {
+ builder.append(' ');
+ }
+ builder.append(sequenceAsString);
+ }
+
+ return builder.toString();
+
+ }
+
+
+ @Override
+ public String getText() {
+
+ StringBuilder sb = new StringBuilder();
+
+ TextPositionSequence previous = null;
+ for (TextPositionSequence word : sequences) {
+ if (previous != null) {
+ if (Math.abs(previous.getMaxYDirAdj() - word.getMaxYDirAdj()) > word.getTextHeight()) {
+ sb.append('\n');
+ } else {
+ sb.append(' ');
+ }
+ }
+ sb.append(word.toString());
+ previous = word;
+ }
+
+ return TextNormalizationUtilities.removeHyphenLineBreaks(sb.toString());
+
+ }
+}
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/text/RedTextPosition.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/text/RedTextPosition.java
new file mode 100644
index 0000000..1266286
--- /dev/null
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/text/RedTextPosition.java
@@ -0,0 +1,106 @@
+package com.knecon.fforesight.service.layoutparser.processor.classification.dto.text;
+
+import org.apache.pdfbox.text.TextPosition;
+import org.springframework.beans.BeanUtils;
+
+import com.dslplatform.json.CompiledJson;
+import com.dslplatform.json.JsonAttribute;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+
+import lombok.AllArgsConstructor;
+import lombok.Builder;
+import lombok.Data;
+import lombok.NoArgsConstructor;
+import lombok.SneakyThrows;
+
+@Data
+@Builder
+@NoArgsConstructor
+@AllArgsConstructor
+@CompiledJson
+public class RedTextPosition {
+
+ private String textMatrix;
+ private float[] position;
+
+ @JsonIgnore
+ private int rotation;
+
+ @JsonIgnore
+ private float pageHeight;
+
+ @JsonIgnore
+ private float pageWidth;
+
+ private String unicode;
+
+ @JsonIgnore
+ private float dir;
+
+ // not used in reanalysis
+ @JsonIgnore
+ @JsonAttribute(ignore = true)
+ private float widthOfSpace;
+
+ // not used in reanalysis
+ @JsonIgnore
+ @JsonAttribute(ignore = true)
+ private float fontSizeInPt;
+
+ // not used in reanalysis
+ @JsonIgnore
+ @JsonAttribute(ignore = true)
+ private String fontName;
+
+
+ @SneakyThrows
+ public static RedTextPosition fromTextPosition(TextPosition textPosition) {
+
+ var pos = new RedTextPosition();
+ BeanUtils.copyProperties(textPosition, pos);
+ pos.setFontName(textPosition.getFont().getName());
+
+ pos.setFontSizeInPt(textPosition.getFontSizeInPt());
+
+ pos.setTextMatrix(textPosition.getTextMatrix().toString());
+
+ var position = new float[4];
+
+ position[0] = textPosition.getXDirAdj();
+ position[1] = textPosition.getYDirAdj();
+ position[2] = textPosition.getWidthDirAdj();
+ position[3] = textPosition.getHeightDir();
+
+ pos.setPosition(position);
+ return pos;
+ }
+
+
+ @JsonIgnore
+ public float getXDirAdj() {
+
+ return position[0];
+ }
+
+
+ @JsonIgnore
+ public float getYDirAdj() {
+
+ return position[1];
+ }
+
+
+ @JsonIgnore
+ public float getWidthDirAdj() {
+
+ return position[2];
+ }
+
+
+ @JsonIgnore
+ public float getHeightDir() {
+
+ return position[3];
+ }
+
+}
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/text/StringFrequencyCounter.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/text/StringFrequencyCounter.java
new file mode 100644
index 0000000..4c6d3d3
--- /dev/null
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/text/StringFrequencyCounter.java
@@ -0,0 +1,47 @@
+package com.knecon.fforesight.service.layoutparser.processor.classification.dto.text;
+
+import java.util.HashMap;
+import java.util.Map;
+
+import lombok.Getter;
+
+ /**
+  * Counts how often each string value has been added.
+  */
+ public class StringFrequencyCounter {
+
+     @Getter
+     private final Map<String, Integer> countPerValue = new HashMap<>();
+
+
+     /**
+      * Increments the count of the given value by one, starting at one for a value not seen before.
+      *
+      * @param value the value to count
+      */
+     public void add(String value) {
+
+         countPerValue.merge(value, 1, Integer::sum);
+     }
+
+
+     /**
+      * Adds the counts of another counter map to this counter.
+      *
+      * @param otherCounter counts per value to add
+      * @throws NullPointerException if {@code otherCounter} is null or contains a null count
+      */
+     public void addAll(Map<String, Integer> otherCounter) {
+
+         for (Map.Entry<String, Integer> entry : otherCounter.entrySet()) {
+             countPerValue.merge(entry.getKey(), entry.getValue(), Integer::sum);
+         }
+     }
+
+
+     /**
+      * Returns the value with the highest count.
+      * If several values share the highest count, the one encountered first while iterating the map wins.
+      *
+      * @return the most frequent value, or {@code null} if nothing has been counted
+      */
+     public String getMostPopular() {
+
+         Map.Entry<String, Integer> mostPopular = null;
+         for (Map.Entry<String, Integer> entry : countPerValue.entrySet()) {
+             if (mostPopular == null || entry.getValue() > mostPopular.getValue()) {
+                 mostPopular = entry;
+             }
+         }
+         return mostPopular != null ? mostPopular.getKey() : null;
+     }
+
+ }
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/text/TextBlockOrientation.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/text/TextBlockOrientation.java
new file mode 100644
index 0000000..5ff10a4
--- /dev/null
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/text/TextBlockOrientation.java
@@ -0,0 +1,8 @@
+package com.knecon.fforesight.service.layoutparser.processor.classification.dto.text;
+
+ /**
+  * Orientation marker for a text block: {@code NONE}, {@code LEFT} or {@code RIGHT}.
+  * NOTE(review): where and how these values are assigned is not visible here - confirm against the callers.
+  */
+ public enum TextBlockOrientation {
+
+ NONE,
+ LEFT,
+ RIGHT
+ }
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/text/TextDirection.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/text/TextDirection.java
new file mode 100644
index 0000000..ef31669
--- /dev/null
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/text/TextDirection.java
@@ -0,0 +1,54 @@
+package com.knecon.fforesight.service.layoutparser.processor.classification.dto.text;
+
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonValue;
+
+import lombok.Getter;
+
+ /**
+  * Text direction as one of four fixed angles. Each constant carries the angle in degrees and in radians.
+  * Both JSON annotations ({@code @JsonValue} on the field, dsl-json's on {@link #jsonValue()}) point at the degree value.
+  */
+ @Getter
+ public enum TextDirection {
+     ZERO(0f),
+     QUARTER_CIRCLE(90f),
+     HALF_CIRCLE(180f),
+     THREE_QUARTER_CIRCLE(270f);
+
+     public static final String VALUE_STRING_SUFFIX = "°";
+
+     @JsonValue
+     private final float degrees;
+     private final float radians;
+
+
+     TextDirection(float degreeValue) {
+
+         this.degrees = degreeValue;
+         this.radians = (float) Math.toRadians(degreeValue);
+     }
+
+
+     /**
+      * @return the degree value followed by {@link #VALUE_STRING_SUFFIX}
+      */
+     @Override
+     public String toString() {
+
+         return String.valueOf(degrees) + VALUE_STRING_SUFFIX;
+     }
+
+
+     /**
+      * @return the degree value
+      */
+     @com.dslplatform.json.JsonValue
+     public float jsonValue() {
+
+         return degrees;
+     }
+
+
+     /**
+      * Looks up the constant whose degree value equals the given value exactly.
+      *
+      * @param degrees the angle in degrees
+      * @return the matching constant
+      * @throws IllegalArgumentException if no constant has this degree value
+      */
+     @JsonCreator(mode = JsonCreator.Mode.DELEGATING)
+     public static TextDirection fromDegrees(float degrees) {
+
+         for (TextDirection candidate : values()) {
+             if (candidate.degrees == degrees) {
+                 return candidate;
+             }
+         }
+
+         throw new IllegalArgumentException(String.format("A value of %f is not supported by TextDirection", degrees));
+     }
+ }
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/text/TextPositionSequence.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/text/TextPositionSequence.java
new file mode 100644
index 0000000..ac525d5
--- /dev/null
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/text/TextPositionSequence.java
@@ -0,0 +1,298 @@
+package com.knecon.fforesight.service.layoutparser.processor.classification.dto.text;
+
+import java.awt.geom.AffineTransform;
+import java.awt.geom.Point2D;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.stream.Collectors;
+
+import org.apache.pdfbox.text.TextPosition;
+
+import com.dslplatform.json.JsonAttribute;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point;
+import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
+
+import lombok.AllArgsConstructor;
+import lombok.Builder;
+import lombok.Data;
+import lombok.NoArgsConstructor;
+import lombok.SneakyThrows;
+import lombok.extern.slf4j.Slf4j;
+
+@Slf4j
+@Data
+@Builder
+@NoArgsConstructor
+@AllArgsConstructor
+public class TextPositionSequence implements CharSequence {
+
+ public static final int HEIGHT_PADDING = 2;
+ private int page;
+ // @Builder.Default: without it Lombok's builder ignores the initializer and builds instances with a null list.
+ @Builder.Default
+ private List textPositions = new ArrayList<>();
+
+ private TextDirection dir;
+ private int rotation;
+ private float pageHeight;
+ private float pageWidth;
+
+
+ /**
+  * Creates a sequence from PDFBox text positions. Every position is converted with
+  * {@code RedTextPosition.fromTextPosition}; direction, rotation and page size are taken from the first position.
+  *
+  * @param textPositions the PDFBox text positions of the sequence; element 0 is read, so the list must not be empty
+  * @param page          the page number stored on the sequence
+  */
+ public TextPositionSequence(List textPositions, int page) {
+
+     this.textPositions = textPositions.stream().map(RedTextPosition::fromTextPosition).collect(Collectors.toList());
+     this.page = page;
+
+     TextPosition first = textPositions.get(0);
+     this.dir = TextDirection.fromDegrees(first.getDir());
+     this.rotation = first.getRotation();
+     this.pageHeight = first.getPageHeight();
+     this.pageWidth = first.getPageWidth();
+ }
+
+
+ /**
+  * @return the number of text positions in this sequence
+  */
+ @Override
+ public int length() {
+
+     return this.textPositions.size();
+ }
+
+
+ /**
+  * Returns the first character of the unicode string of the text position at the given index.
+  *
+  * @param index index of the text position
+  * @return the first character of that position's unicode string
+  */
+ @Override
+ public char charAt(int index) {
+
+     return textPositionAt(index).getUnicode().charAt(0);
+ }
+
+
+ /**
+  * Returns a new sequence over the text positions in {@code [start, end)} that carries the same
+  * page, direction, rotation and page size as this one. The positions are a {@code subList} view of this sequence's list.
+  *
+  * @param start index of the first position, inclusive
+  * @param end   index after the last position, exclusive
+  * @return the sub-sequence
+  */
+ @Override
+ public TextPositionSequence subSequence(int start, int end) {
+
+     TextPositionSequence part = new TextPositionSequence();
+     part.textPositions = textPositions.subList(start, end);
+
+     part.page = page;
+     part.dir = dir;
+     part.rotation = rotation;
+     part.pageWidth = pageWidth;
+     part.pageHeight = pageHeight;
+
+     return part;
+ }
+
+
+ @Override
+ public String toString() {
+
+ StringBuilder builder = new StringBuilder(length());
+ for (int i = 0; i < length(); i++) {
+ builder.append(charAt(i));
+ }
+ return builder.toString();
+ }
+
+
+ /**
+  * @param index index into the text positions of this sequence
+  * @return the text position at that index
+  */
+ public RedTextPosition textPositionAt(int index) {
+
+     return this.textPositions.get(index);
+ }
+
+
+ /**
+  * Appends a text position and overwrites page, direction, rotation and page size with those of the given sequence.
+  *
+  * @param textPositionSequence the sequence whose page attributes are taken over
+  * @param textPosition         the text position to append
+  */
+ public void add(TextPositionSequence textPositionSequence, RedTextPosition textPosition) {
+
+     this.textPositions.add(textPosition);
+
+     this.page = textPositionSequence.getPage();
+     this.rotation = textPositionSequence.getRotation();
+     this.dir = textPositionSequence.getDir();
+     this.pageWidth = textPositionSequence.getPageWidth();
+     this.pageHeight = textPositionSequence.getPageHeight();
+ }
+
+
+ public void add(TextPosition textPosition) {
+
+ this.textPositions.add(RedTextPosition.fromTextPosition(textPosition));
+
+ this.dir = TextDirection.fromDegrees(textPositions.get(0).getDir());
+ this.rotation = textPositions.get(0).getRotation();
+ this.pageHeight = textPositions.get(0).getPageHeight();
+ this.pageWidth = textPositions.get(0).getPageWidth();
+
+ }
+
+
+ /**
+ * This value is adjusted so that 0,0 is upper left and it is adjusted based on the text direction.
+ * This method ignores the page rotation but takes the text rotation and adjusts the coordinates to awt.
+ *
+ * @return the text direction adjusted minX value
+ */
+ @JsonIgnore
+ @JsonAttribute(ignore = true)
+ public float getMinXDirAdj() {
+
+ return textPositions.get(0).getXDirAdj();
+
+ }
+
+
+ /**
+ * This value is adjusted so that 0,0 is upper left and it is adjusted based on the text direction.
+ * This method ignores the page rotation but takes the text rotation and adjusts the coordinates to awt.
+ *
+ * @return the text direction adjusted maxX value
+ */
+ @JsonIgnore
+ @JsonAttribute(ignore = true)
+ public float getMaxXDirAdj() {
+
+ return textPositions.get(textPositions.size() - 1).getXDirAdj() + textPositions.get(textPositions.size() - 1).getWidthDirAdj() + HEIGHT_PADDING;
+
+ }
+
+
+ /**
+ * This value is adjusted so that 0,0 is upper left and it is adjusted based on the text direction.
+ * This method ignores the page rotation but takes the text rotation and adjusts the coordinates to awt.
+ *
+ * @return the text direction adjusted minY value. The upper border of the bounding box of the word.
+ */
+ @JsonIgnore
+ @JsonAttribute(ignore = true)
+ public float getMinYDirAdj() {
+
+ return textPositions.get(0).getYDirAdj() - getTextHeight();
+
+ }
+
+
+ /**
+ * This value is adjusted so that 0,0 is upper left and it is adjusted based on the text direction.
+ * This method ignores the page rotation but takes the text rotation and adjusts the coordinates to awt.
+ *
+ * @return the text direction adjusted maxY value. The lower border of the bounding box of the word.
+ */
+ @JsonIgnore
+ @JsonAttribute(ignore = true)
+ public float getMaxYDirAdj() {
+
+ return textPositions.get(0).getYDirAdj();
+
+ }
+
+
+ @JsonIgnore
+ @JsonAttribute(ignore = true)
+ public float getTextHeight() {
+
+ return textPositions.get(0).getHeightDir() + HEIGHT_PADDING;
+ }
+
+
+ @JsonIgnore
+ @JsonAttribute(ignore = true)
+ public float getHeight() {
+
+ return getMaxYDirAdj() - getMinYDirAdj();
+ }
+
+
+ @JsonIgnore
+ @JsonAttribute(ignore = true)
+ public float getWidth() {
+
+ return getMaxXDirAdj() - getMinXDirAdj();
+ }
+
+
+ @JsonIgnore
+ @JsonAttribute(ignore = true)
+ public String getFont() {
+
+ return textPositions.get(0).getFontName().toLowerCase().replaceAll(",bold", "").replaceAll(",italic", "");
+ }
+
+
+ @JsonIgnore
+ @JsonAttribute(ignore = true)
+ public String getFontStyle() {
+
+ String lowercaseFontName = textPositions.get(0).getFontName().toLowerCase();
+
+ if (lowercaseFontName.contains("bold") && lowercaseFontName.contains("italic")) {
+ return "bold, italic";
+ } else if (lowercaseFontName.contains("bold")) {
+ return "bold";
+ } else if (lowercaseFontName.contains("italic")) {
+ return "italic";
+ } else {
+ return "standard";
+ }
+
+ }
+
+
+ @JsonIgnore
+ @JsonAttribute(ignore = true)
+ public float getFontSize() {
+
+ return textPositions.get(0).getFontSizeInPt();
+ }
+
+
+ @JsonIgnore
+ @JsonAttribute(ignore = true)
+ public float getSpaceWidth() {
+
+ return textPositions.get(0).getWidthOfSpace();
+ }
+
+
+ /**
+ * Returns the bounding box of the word in the PDF coordinate system, in which the origin {0,0} rotates with the page rotation.
+ * 0 -> LowerLeft
+ * 90 -> UpperLeft
+ * 180 -> UpperRight
+ * 270 -> LowerRight
+ *
+ * Two corner points are built from the first and the last text position, mapped through an affine
+ * transform chosen by the text direction, and the result is returned as origin point, width, height and page.
+ *
+ * @return bounding box of the word in Pdf Coordinate System
+ */
+ @JsonIgnore
+ @JsonAttribute(ignore = true)
+ @SneakyThrows
+ public Rectangle getRectangle() {
+
+ log.debug("ClassificationPage: '{}', Word: '{}', Rotation: '{}', textRotation {}", page, this, rotation, dir);
+
+ float textHeight = getTextHeight();
+
+ RedTextPosition firstTextPos = textPositions.get(0);
+ RedTextPosition lastTextPos = textPositions.get(textPositions.size() - 1);
+
+ // Corner points in direction-adjusted coordinates, each widened by HEIGHT_PADDING in y.
+ Point2D bottomLeft = new Point2D.Double(firstTextPos.getXDirAdj(), firstTextPos.getYDirAdj() - HEIGHT_PADDING);
+ Point2D topRight = new Point2D.Double(lastTextPos.getXDirAdj() + lastTextPos.getWidthDirAdj(), lastTextPos.getYDirAdj() + textHeight + HEIGHT_PADDING);
+
+ // All three branches concatenate rotate, translate and a y-flip (scale 1, -1); they differ only in
+ // the rotation anchor and in the page dimension used for the y translation.
+ AffineTransform transform = new AffineTransform();
+ if (dir == TextDirection.ZERO || dir == TextDirection.HALF_CIRCLE) {
+ // rotation around the page centre, translation by the page height
+ transform.rotate(dir.getRadians(), pageWidth / 2f, pageHeight / 2f);
+ transform.translate(0f, pageHeight + textHeight);
+ transform.scale(1., -1.);
+ } else if (dir == TextDirection.QUARTER_CIRCLE) {
+ // rotation around (pageWidth / 2, pageWidth / 2), translation by the page width
+ transform.rotate(dir.getRadians(), pageWidth / 2f, pageWidth / 2f);
+ transform.translate(0f, pageWidth + textHeight);
+ transform.scale(1., -1.);
+ } else {
+ // remaining direction (THREE_QUARTER_CIRCLE, or a null dir): rotation around (pageHeight / 2, pageHeight / 2), translation by the page width
+ transform.rotate(dir.getRadians(), pageHeight / 2f, pageHeight / 2f);
+ transform.translate(0f, pageWidth + textHeight);
+ transform.scale(1., -1.);
+ }
+
+ bottomLeft = transform.transform(bottomLeft, null);
+ topRight = transform.transform(topRight, null);
+
+ // Width and height are the coordinate differences of the transformed corners.
+ return new Rectangle( //
+ new Point((float) bottomLeft.getX(), (float) bottomLeft.getY()),
+ (float) (topRight.getX() - bottomLeft.getX()),
+ (float) (topRight.getY() - bottomLeft.getY()),
+ page);
+ }
+
+}
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/text/UnclassifiedText.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/text/UnclassifiedText.java
new file mode 100644
index 0000000..16be334
--- /dev/null
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/text/UnclassifiedText.java
@@ -0,0 +1,14 @@
+package com.knecon.fforesight.service.layoutparser.processor.classification.dto.text;
+
+import java.util.List;
+
+import lombok.AllArgsConstructor;
+import lombok.Data;
+
+ /**
+  * Holder for a list of text blocks.
+  * NOTE(review): the element type of {@code textBlocks} and what makes the text "unclassified" are not visible here - confirm against the callers.
+  */
+ @Data
+ @AllArgsConstructor
+ public class UnclassifiedText {
+
+ private List textBlocks;
+
+ }
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/parsing/LegacyPDFStreamEngine.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/parsing/LegacyPDFStreamEngine.java
new file mode 100644
index 0000000..5aa1439
--- /dev/null
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/parsing/LegacyPDFStreamEngine.java
@@ -0,0 +1,384 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.knecon.fforesight.service.layoutparser.processor.classification.parsing;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Map;
+import java.util.WeakHashMap;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.fontbox.ttf.TrueTypeFont;
+import org.apache.fontbox.util.BoundingBox;
+import org.apache.pdfbox.contentstream.PDFStreamEngine;
+import org.apache.pdfbox.contentstream.operator.DrawObject;
+import org.apache.pdfbox.contentstream.operator.state.Concatenate;
+import org.apache.pdfbox.contentstream.operator.state.Restore;
+import org.apache.pdfbox.contentstream.operator.state.Save;
+import org.apache.pdfbox.contentstream.operator.state.SetGraphicsStateParameters;
+import org.apache.pdfbox.contentstream.operator.state.SetMatrix;
+import org.apache.pdfbox.contentstream.operator.text.BeginText;
+import org.apache.pdfbox.contentstream.operator.text.EndText;
+import org.apache.pdfbox.contentstream.operator.text.MoveText;
+import org.apache.pdfbox.contentstream.operator.text.MoveTextSetLeading;
+import org.apache.pdfbox.contentstream.operator.text.NextLine;
+import org.apache.pdfbox.contentstream.operator.text.SetCharSpacing;
+import org.apache.pdfbox.contentstream.operator.text.SetFontAndSize;
+import org.apache.pdfbox.contentstream.operator.text.SetTextHorizontalScaling;
+import org.apache.pdfbox.contentstream.operator.text.SetTextLeading;
+import org.apache.pdfbox.contentstream.operator.text.SetTextRenderingMode;
+import org.apache.pdfbox.contentstream.operator.text.SetTextRise;
+import org.apache.pdfbox.contentstream.operator.text.SetWordSpacing;
+import org.apache.pdfbox.contentstream.operator.text.ShowText;
+import org.apache.pdfbox.contentstream.operator.text.ShowTextAdjusted;
+import org.apache.pdfbox.contentstream.operator.text.ShowTextLine;
+import org.apache.pdfbox.contentstream.operator.text.ShowTextLineAndSpace;
+import org.apache.pdfbox.cos.COSDictionary;
+import org.apache.pdfbox.pdmodel.PDPage;
+import org.apache.pdfbox.pdmodel.common.PDRectangle;
+import org.apache.pdfbox.pdmodel.font.PDCIDFont;
+import org.apache.pdfbox.pdmodel.font.PDCIDFontType2;
+import org.apache.pdfbox.pdmodel.font.PDFont;
+import org.apache.pdfbox.pdmodel.font.PDFontDescriptor;
+import org.apache.pdfbox.pdmodel.font.PDSimpleFont;
+import org.apache.pdfbox.pdmodel.font.PDTrueTypeFont;
+import org.apache.pdfbox.pdmodel.font.PDType0Font;
+import org.apache.pdfbox.pdmodel.font.PDType3Font;
+import org.apache.pdfbox.pdmodel.font.encoding.GlyphList;
+import org.apache.pdfbox.pdmodel.graphics.state.PDGraphicsState;
+import org.apache.pdfbox.text.TextPosition;
+import org.apache.pdfbox.util.Matrix;
+import org.apache.pdfbox.util.Vector;
+
+/**
+ * LEGACY text calculations which are known to be incorrect but are depended on by PDFTextStripper.
+ *
+ * This class exists only so that we don't break the code of users who have their own subclasses of
+ * PDFTextStripper. It replaces the mostly empty implementation of showGlyph() in PDFStreamEngine
+ * with a heuristic implementation which is backwards compatible.
+ *
+ * DO NOT USE THIS CODE UNLESS YOU ARE WORKING WITH PDFTextStripper.
+ * THIS CODE IS DELIBERATELY INCORRECT, USE PDFStreamEngine INSTEAD.
+ */
+@SuppressWarnings({"PMD", "checkstyle:all"})
+class LegacyPDFStreamEngine extends PDFStreamEngine {
+
+ private static final Log LOG = LogFactory.getLog(LegacyPDFStreamEngine.class);
+
+ private int pageRotation;
+ private PDRectangle pageSize;
+ private Matrix translateMatrix;
+ private final GlyphList glyphList;
+ private final Map fontHeightMap = new WeakHashMap();
+
+
+ /**
+ * Constructor.
+ */
+ LegacyPDFStreamEngine() throws IOException {
+
+ addOperator(new BeginText());
+ addOperator(new Concatenate());
+ addOperator(new DrawObject()); // special text version
+ addOperator(new EndText());
+ addOperator(new SetGraphicsStateParameters());
+ addOperator(new Save());
+ addOperator(new Restore());
+ addOperator(new NextLine());
+ addOperator(new SetCharSpacing());
+ addOperator(new MoveText());
+ addOperator(new MoveTextSetLeading());
+ addOperator(new SetFontAndSize());
+ addOperator(new ShowText());
+ addOperator(new ShowTextAdjusted());
+ addOperator(new SetTextLeading());
+ addOperator(new SetMatrix());
+ addOperator(new SetTextRenderingMode());
+ addOperator(new SetTextRise());
+ addOperator(new SetWordSpacing());
+ addOperator(new SetTextHorizontalScaling());
+ addOperator(new ShowTextLine());
+ addOperator(new ShowTextLineAndSpace());
+
+ // load additional glyph list for Unicode mapping
+ String path = "/org/apache/pdfbox/resources/glyphlist/additional.txt";
+ InputStream input = GlyphList.class.getResourceAsStream(path);
+ glyphList = new GlyphList(GlyphList.getAdobeGlyphList(), input);
+ }
+
+
+ /**
+ * This will initialize and process the contents of the stream.
+ *
+ * @param page the page to process
+ * @throws IOException if there is an error accessing the stream.
+ */
+ @Override
+ public void processPage(PDPage page) throws IOException {
+
+ this.pageRotation = page.getRotation();
+ this.pageSize = page.getCropBox();
+
+ if (pageSize.getLowerLeftX() == 0 && pageSize.getLowerLeftY() == 0) {
+ translateMatrix = null;
+ } else {
+ // translation matrix for cropbox
+ translateMatrix = Matrix.getTranslateInstance(-pageSize.getLowerLeftX(), -pageSize.getLowerLeftY());
+ }
+ super.processPage(page);
+ }
+
+
+ /**
+ * Called when a glyph is to be processed. The heuristic calculations here were originally
+ * written by Ben Litchfield for PDFStreamEngine.
+ */
+ @Override
+ protected void showGlyph(Matrix textRenderingMatrix, PDFont font, int code, Vector displacement) throws IOException {
+     // Turns the glyph being shown into one or two TextPosition objects and passes each of
+     // them to processTextPosition(TextPosition). Nothing is emitted when a non-simple font
+     // has no Unicode mapping for the character code.
+
+     //
+     // legacy calculations which were previously in PDFStreamEngine
+     //
+     // DO NOT USE THIS CODE UNLESS YOU ARE WORKING WITH PDFTextStripper.
+     // THIS CODE IS DELIBERATELY INCORRECT
+     //
+
+     PDGraphicsState state = getGraphicsState();
+     Matrix ctm = state.getCurrentTransformationMatrix();
+     float fontSize = state.getTextState().getFontSize();
+     float horizontalScaling = state.getTextState().getHorizontalScaling() / 100f;
+     Matrix textMatrix = getTextMatrix();
+
+     float displacementX = displacement.getX();
+     // the sorting algorithm is based on the width of the character. As the displacement
+     // for vertical characters doesn't provide any suitable value for it, we have to
+     // calculate our own
+     if (font.isVertical()) {
+         displacementX = font.getWidth(code) / 1000;
+         // there may be an additional scaling factor for true type fonts
+         TrueTypeFont ttf = null;
+         if (font instanceof PDTrueTypeFont) {
+             ttf = ((PDTrueTypeFont) font).getTrueTypeFont();
+         } else if (font instanceof PDType0Font) {
+             PDCIDFont cidFont = ((PDType0Font) font).getDescendantFont();
+             if (cidFont instanceof PDCIDFontType2) {
+                 ttf = ((PDCIDFontType2) cidFont).getTrueTypeFont();
+             }
+         }
+         if (ttf != null && ttf.getUnitsPerEm() != 1000) {
+             displacementX *= 1000f / ttf.getUnitsPerEm();
+         }
+     }
+
+     // (modified) combined displacement, this is calculated *without* taking the character
+     // spacing and word spacing into account, due to legacy code in TextStripper
+     float tx = displacementX * fontSize * horizontalScaling;
+     float ty = displacement.getY() * fontSize;
+
+     // (modified) combined displacement matrix
+     Matrix td = Matrix.getTranslateInstance(tx, ty);
+
+     // (modified) text rendering matrix
+     Matrix nextTextRenderingMatrix = td.multiply(textMatrix).multiply(ctm); // text space -> device space
+     float nextX = nextTextRenderingMatrix.getTranslateX();
+     float nextY = nextTextRenderingMatrix.getTranslateY();
+
+     // (modified) width and height calculations
+     float dxDisplay = nextX - textRenderingMatrix.getTranslateX();
+     // the font height is computed once per font and cached under the font's COS object
+     Float fontHeight = fontHeightMap.get(font.getCOSObject());
+     if (fontHeight == null) {
+         fontHeight = computeFontHeight(font);
+         fontHeightMap.put(font.getCOSObject(), fontHeight);
+     }
+     float dyDisplay = fontHeight * textRenderingMatrix.getScalingFactorY();
+
+     //
+     // start of the original method
+     //
+
+     // Note on variable names. There are three different units being used in this code.
+     // Character sizes are given in glyph units, text locations are initially given in text
+     // units, and we want to save the data in display units. The variable names should end with
+     // Text or Disp to represent if the values are in text or disp units (no glyph units are
+     // saved).
+
+     float glyphSpaceToTextSpaceFactor = 1 / 1000f;
+     if (font instanceof PDType3Font) {
+         glyphSpaceToTextSpaceFactor = font.getFontMatrix().getScaleX();
+     }
+
+     float spaceWidthText = 0;
+     try {
+         // to avoid crash as described in PDFBOX-614, see what the space displacement should be
+         spaceWidthText = font.getSpaceWidth() * glyphSpaceToTextSpaceFactor;
+     } catch (Throwable exception) {
+         // deliberately best-effort: a failure here only means the fallbacks below are used
+         LOG.warn(exception, exception);
+     }
+
+     if (spaceWidthText == 0) {
+         spaceWidthText = font.getAverageFontWidth() * glyphSpaceToTextSpaceFactor;
+         // the average space width appears to be higher than necessary so make it smaller
+         spaceWidthText *= .80f;
+     }
+     if (spaceWidthText == 0) {
+         spaceWidthText = 1.0f; // if could not find font, use a generic value
+     }
+
+     // the space width has to be transformed into display units
+     float spaceWidthDisplay = spaceWidthText * textRenderingMatrix.getScalingFactorX();
+
+     // use our additional glyph list for Unicode mapping
+     String unicodeMapping = font.toUnicode(code, glyphList);
+
+     // when there is no Unicode mapping available, Acrobat simply coerces the character code
+     // into Unicode, so we do the same. Subclasses of PDFStreamEngine don't necessarily want
+     // this, which is why we leave it until this point in PDFTextStreamEngine.
+     if (unicodeMapping == null) {
+         if (font instanceof PDSimpleFont) {
+             char c = (char) code;
+             unicodeMapping = new String(new char[]{c});
+         } else {
+             // Acrobat doesn't seem to coerce composite font's character codes, instead it
+             // skips them. See the "allah2.pdf" TestTextStripper file.
+             return;
+         }
+     }
+
+     // adjust for cropbox if needed
+     Matrix translatedTextRenderingMatrix;
+     if (translateMatrix == null) {
+         translatedTextRenderingMatrix = textRenderingMatrix;
+     } else {
+         translatedTextRenderingMatrix = Matrix.concatenate(translateMatrix, textRenderingMatrix);
+         nextX -= pageSize.getLowerLeftX();
+         nextY -= pageSize.getLowerLeftY();
+     }
+
+     // This is a hack for Unicode mappings made of 2 chars, see unicodeProblem.pdf: such a
+     // mapping is emitted as two TextPositions, one per char, that share the same geometry
+     // and the same character code. Every other mapping is emitted as a single TextPosition.
+     // NOTE(review): a surrogate pair also has length 2 and is therefore split into two lone
+     // surrogates here - confirm that this is intended.
+     String[] emittedTexts;
+     if (unicodeMapping.length() == 2) {
+         emittedTexts = new String[]{Character.toString(unicodeMapping.charAt(0)), Character.toString(unicodeMapping.charAt(1))};
+     } else {
+         emittedTexts = new String[]{unicodeMapping};
+     }
+
+     for (String emittedText : emittedTexts) {
+         processTextPosition(new TextPosition(pageRotation,
+                 pageSize.getWidth(),
+                 pageSize.getHeight(),
+                 translatedTextRenderingMatrix,
+                 nextX,
+                 nextY,
+                 Math.abs(dyDisplay),
+                 dxDisplay,
+                 Math.abs(spaceWidthDisplay),
+                 emittedText,
+                 new int[]{code},
+                 font,
+                 fontSize,
+                 (int) (fontSize * textMatrix.getScalingFactorX())));
+     }
+ }
+
+
+ /**
+  * Derives the height used for glyphs of the given font. Subclasses may override this
+  * method to supply their own calculation.
+  *
+  * <p>The starting value is half of the font's bounding box height. When a font descriptor
+  * is available, the cap height or half of (ascent - descent) replaces it under the
+  * conditions checked below. The result is then converted from glyph space to text space.
+  * The bounding box object returned by the font is adjusted in place when its lower-left Y
+  * lies below {@code Short.MIN_VALUE}.
+  *
+  * @param font the font.
+  * @return the font height.
+  * @throws IOException if there is an error while getting the font bounding box.
+  */
+ protected float computeFontHeight(PDFont font) throws IOException {
+
+     BoundingBox box = font.getBoundingBox();
+     float lowerLeftY = box.getLowerLeftY();
+     if (lowerLeftY < Short.MIN_VALUE) {
+         // PDFBOX-2158 and PDFBOX-3130
+         // files by Salmat eSolutions / ClibPDF Library
+         box.setLowerLeftY(-(lowerLeftY + 65536));
+     }
+     // half of the bounding box height serves as the initial height (reason not documented)
+     float heightInGlyphSpace = box.getHeight() / 2;
+
+     PDFontDescriptor descriptor = font.getFontDescriptor();
+     if (descriptor != null) {
+         // the bounding box is sometimes far too large while CapHeight is fine
+         float capHeight = descriptor.getCapHeight();
+         boolean capHeightIsSet = Float.compare(capHeight, 0) != 0;
+         if (capHeightIsSet && (Float.compare(heightInGlyphSpace, 0) == 0 || capHeight < heightInGlyphSpace)) {
+             heightInGlyphSpace = capHeight;
+         }
+
+         // PDFBOX-3464, PDFBOX-4480, PDFBOX-4553:
+         // CapHeight can be far too large as well while Ascent and Descent are fine
+         float ascent = descriptor.getAscent();
+         float descent = descriptor.getDescent();
+         if (capHeight > ascent && ascent > 0 && descent < 0) {
+             float halfExtent = (ascent - descent) / 2;
+             if (Float.compare(heightInGlyphSpace, 0) == 0 || halfExtent < heightInGlyphSpace) {
+                 heightInGlyphSpace = halfExtent;
+             }
+         }
+     }
+
+     // glyph space -> text space
+     if (font instanceof PDType3Font) {
+         return font.getFontMatrix().transformPoint(0, heightInGlyphSpace).y;
+     }
+     return heightInGlyphSpace / 1000;
+ }
+
+
+ /**
+ * Hook that receives every {@link TextPosition} built by
+ * {@code showGlyph}; it is called once per emitted text position.
+ * The implementation in this class is empty, so subclasses override it to
+ * collect or otherwise process the text.
+ *
+ * @param text the text position created for the glyph that is being shown.
+ */
+ protected void processTextPosition(TextPosition text) {
+ // intentionally empty: subclasses can override to provide specific functionality
+ }
+
+}
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/parsing/PDFAreaTextStripper.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/parsing/PDFAreaTextStripper.java
new file mode 100644
index 0000000..b799434
--- /dev/null
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/parsing/PDFAreaTextStripper.java
@@ -0,0 +1,82 @@
+package com.knecon.fforesight.service.layoutparser.processor.classification.parsing;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.pdfbox.text.PDFTextStripperByArea;
+import org.apache.pdfbox.text.TextPosition;
+
+import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.TextPositionSequence;
+
+import lombok.Getter;
+import lombok.Setter;
+
+/**
+ * Area text stripper that, besides forwarding the written text to the superclass, records
+ * the text positions of every written string as {@link TextPositionSequence}s. A string is
+ * split into several sequences at blanks, at horizontal gaps and wherever a glyph starts
+ * to the left of its predecessor.
+ */
+public class PDFAreaTextStripper extends PDFTextStripperByArea {
+
+    /** Sequences collected by {@link #writeString(String, List)} since the last {@link #clearPositions()}. */
+    @Getter
+    private List<TextPositionSequence> textPositionSequences = new ArrayList<>();
+
+    /** Page number handed to every {@link TextPositionSequence} this stripper creates. */
+    @Setter
+    private int pageNumber;
+
+
+    /**
+     * Creates the stripper without any additional setup.
+     *
+     * @throws IOException if the superclass constructor throws it.
+     */
+    public PDFAreaTextStripper() throws IOException {
+
+    }
+
+
+    /**
+     * Splits the text positions of one written string into sequences, stores them in
+     * {@code textPositionSequences} and then passes the text on to the superclass.
+     *
+     * <p>A new sequence starts when a glyph lies to the left of its predecessor, when an
+     * unrotated glyph starts more than one unit right of its predecessor's end, or after a
+     * blank. Blanks at the very start or end are dropped, and sequences that are empty or
+     * consist of a single blank are not stored.
+     *
+     * @param text the text to write.
+     * @param textPositions the positions of the characters of {@code text}.
+     * @throws IOException if the superclass fails to write the text.
+     */
+    @Override
+    public void writeString(String text, List<TextPosition> textPositions) throws IOException {
+
+        int startIndex = 0;
+        for (int i = 0; i < textPositions.size(); i++) {
+            TextPosition current = textPositions.get(i);
+
+            if (i == 0) {
+                // the first position has no predecessor; it is only skipped if it is a blank
+                if (isBlank(current)) {
+                    startIndex++;
+                }
+                continue;
+            }
+            TextPosition previous = textPositions.get(i - 1);
+
+            // Strange but sometimes this is happening, for example: Metolachlor2.pdf
+            if (current.getX() < previous.getX()) {
+                addSequence(textPositions.subList(startIndex, i));
+                startIndex = i;
+            }
+
+            // horizontal gap between two unrotated glyphs
+            if (current.getRotation() == 0 && current.getX() > previous.getEndX() + 1) {
+                addSequence(textPositions.subList(startIndex, i));
+                startIndex = i;
+            }
+
+            // a blank that is not the last position ends the sequence and is itself skipped
+            if (isBlank(current) && i <= textPositions.size() - 2) {
+                addSequence(textPositions.subList(startIndex, i));
+                startIndex = i + 1;
+            }
+        }
+
+        // remainder after the last split, without a trailing blank
+        List<TextPosition> remainder = textPositions.subList(startIndex, textPositions.size());
+        if (!remainder.isEmpty() && isBlank(remainder.get(remainder.size() - 1))) {
+            remainder = remainder.subList(0, remainder.size() - 1);
+        }
+        addSequence(remainder);
+
+        super.writeString(text);
+    }
+
+
+    /**
+     * Discards the collected sequences. A new list is assigned instead of clearing the old
+     * one, so a list obtained earlier through the getter keeps its content.
+     */
+    public void clearPositions() {
+
+        textPositionSequences = new ArrayList<>();
+    }
+
+
+    /** Stores the given positions as a sequence unless they are empty or a single blank. */
+    private void addSequence(List<TextPosition> positions) {
+
+        if (positions.isEmpty() || positions.size() == 1 && isBlank(positions.get(0))) {
+            return;
+        }
+        textPositionSequences.add(new TextPositionSequence(positions, pageNumber));
+    }
+
+
+    /** Tells whether the position's text is a space or a no-break space (U+00A0). */
+    private static boolean isBlank(TextPosition position) {
+
+        String unicode = position.getUnicode();
+        return " ".equals(unicode) || "\u00A0".equals(unicode);
+    }
+
+}
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/parsing/PDFLinesTextStripper.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/parsing/PDFLinesTextStripper.java
new file mode 100644
index 0000000..ae5c958
--- /dev/null
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/parsing/PDFLinesTextStripper.java
@@ -0,0 +1,335 @@
+package com.knecon.fforesight.service.layoutparser.processor.classification.parsing;
+
+import java.awt.geom.Point2D;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.List;
+
+import org.apache.pdfbox.contentstream.operator.Operator;
+import org.apache.pdfbox.contentstream.operator.OperatorName;
+import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingColor;
+import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingColorN;
+import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingColorSpace;
+import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingDeviceCMYKColor;
+import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingDeviceGrayColor;
+import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingDeviceRGBColor;
+import org.apache.pdfbox.contentstream.operator.color.SetStrokingColor;
+import org.apache.pdfbox.contentstream.operator.color.SetStrokingColorN;
+import org.apache.pdfbox.contentstream.operator.color.SetStrokingColorSpace;
+import org.apache.pdfbox.contentstream.operator.color.SetStrokingDeviceCMYKColor;
+import org.apache.pdfbox.contentstream.operator.color.SetStrokingDeviceGrayColor;
+import org.apache.pdfbox.contentstream.operator.color.SetStrokingDeviceRGBColor;
+import org.apache.pdfbox.contentstream.operator.state.SetFlatness;
+import org.apache.pdfbox.contentstream.operator.state.SetLineCapStyle;
+import org.apache.pdfbox.contentstream.operator.state.SetLineDashPattern;
+import org.apache.pdfbox.contentstream.operator.state.SetLineJoinStyle;
+import org.apache.pdfbox.contentstream.operator.state.SetLineMiterLimit;
+import org.apache.pdfbox.contentstream.operator.state.SetLineWidth;
+import org.apache.pdfbox.contentstream.operator.state.SetRenderingIntent;
+import org.apache.pdfbox.contentstream.operator.text.SetFontAndSize;
+import org.apache.pdfbox.cos.COSBase;
+import org.apache.pdfbox.cos.COSNumber;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDPage;
+import org.apache.pdfbox.text.TextPosition;
+
+import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.RedTextPosition;
+import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.TextPositionSequence;
+import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.Ruling;
+
+import lombok.Getter;
+import lombok.Setter;
+import lombok.extern.slf4j.Slf4j;
+
+@Slf4j
+public class PDFLinesTextStripper extends PDFTextStripper {
+
+ @Getter
+ private final List