From 01493dc033e9191b985ed82862890d78a000adb3 Mon Sep 17 00:00:00 2001
From: yhampe <yannik.hampe@knecon.com>
Date: Tue, 7 Nov 2023 08:47:28 +0100
Subject: [PATCH] TAAS-103: Table Detection and rotated text

* added page property to DocumentStructure so that the page of a found table can be retrieved

* added a method to TableExtractionService to get the table area

* added calculateMinCharWidthAndMaxCharHeightInsideTable to LayoutParsingPipeline to calculate the values based upon table area

* refactored PDFLinesTextStripper for better readability

* removed textMatrix from RedTextPosition as it is no longer needed
---
 .../processor/LayoutParsingPipeline.java      | 51 +++++++++++++++++--
 .../processor/model/graph/nodes/Table.java    |  2 +-
 .../processor/model/text/RedTextPosition.java |  3 --
 .../services/TableExtractionService.java      | 12 ++++-
 .../services/mapper/PropertiesMapper.java     |  5 ++
 .../parsing/PDFLinesTextStripper.java         | 36 ++++++++-----
 6 files changed, 89 insertions(+), 20 deletions(-)
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java
index 3ec8d47..5b82f93 100644
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java
@@ -26,6 +26,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification
 import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
 import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
 import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
+import com.knecon.fforesight.service.layoutparser.processor.model.table.Rectangle;
 import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
 import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
 import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.CvTableParsingAdapter;
@@ -189,8 +190,15 @@ public class LayoutParsingPipeline {
             PDRectangle cropbox = pdPage.getCropBox();
             CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(pdfTableCells.get(pageNumber),
                     stripper.getRulings(),
-                    stripper.getMinCharWidth(),
-                    stripper.getMaxCharHeight());
+                    1,
+                   1);
+
+            List<Rectangle> spreedSheetArea = tableExtractionService.getSpreadSheetArea(cleanRulings, layoutParsingType);
+
+            Map<String,Float> newValues = calculateMinCharWidthAndMaxCharHeightInsideTable(stripper,spreedSheetArea,10f,1f);
+
+            cleanRulings = rulingCleaningService.getCleanRulings(pdfTableCells.get(pageNumber), stripper.getRulings(), newValues.get("minCharWidth"), newValues.get("maxCharHeight"));
+
 
             ClassificationPage classificationPage = switch (layoutParsingType) {
                 case REDACT_MANAGER -> redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
@@ -234,6 +242,43 @@ public class LayoutParsingPipeline {
         return classificationDocument;
     }
 
+    /**
+     * Determines the smallest character width and the largest character height among the
+     * characters positioned inside one of the given table areas, starting from the initial values.
+     *
+     * @param stripper the stripper holding the text position sequences to inspect
+     * @param spreedSheetArea the table areas
+     * @param initialMinCharWidth starting value for the minimum char width
+     * @param initialMaxCharHeight starting value for the maximum char height
+     *
+     * @return map with the entries "minCharWidth" and "maxCharHeight"
+     */
+    private Map<String, Float> calculateMinCharWidthAndMaxCharHeightInsideTable(PDFLinesTextStripper stripper, List<Rectangle> spreedSheetArea, float initialMinCharWidth, float initialMaxCharHeight) {
+
+        float minWidth = initialMinCharWidth;
+        float maxHeight = initialMaxCharHeight;
+        for (var sequence : stripper.getTextPositionSequences()) {
+            for (var character : sequence.getTextPositions()) {
+                float[] box = character.getPosition();
+                for (var tableArea : spreedSheetArea) {
+                    if (!tableArea.contains(box[0], box[1], box[2], box[3])) {
+                        continue;
+                    }
+                    if (maxHeight < character.getHeightDir()) {
+                        maxHeight = character.getHeightDir();
+                    }
+                    if (minWidth > character.getWidthDirAdj()) {
+                        minWidth = character.getWidthDirAdj();
+                    }
+                }
+            }
+        }
+        Map<String, Float> bounds = new HashMap<>();
+        bounds.put("minCharWidth", minWidth);
+        bounds.put("maxCharHeight", maxHeight);
+        return bounds;
+    }
+
 
     private Map<String, List<Rectangle2D>> convertMarkedContents(List<PDMarkedContent> pdMarkedContents) {
 
@@ -246,7 +291,7 @@ public class LayoutParsingPipeline {
 
     private void increaseDocumentStatistics(ClassificationPage classificationPage, ClassificationDocument document) {
 
-//        if (!classificationPage.isLandscape()) {
+ //        if (!classificationPage.isLandscape()) {
             document.getFontSizeCounter().addAll(classificationPage.getFontSizeCounter().getCountPerValue());
 //        }
         document.getFontCounter().addAll(classificationPage.getFontCounter().getCountPerValue());
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Table.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Table.java
index bd33f6d..b08e2b5 100644
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Table.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Table.java
@@ -34,7 +34,7 @@ public class Table implements SemanticNode {
 
     int numberOfRows;
     int numberOfCols;
-
+    int page;
     TextBlock textBlock;
 
     @Builder.Default
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/RedTextPosition.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/RedTextPosition.java
index 92059ae..ccea113 100644
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/RedTextPosition.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/RedTextPosition.java
@@ -17,7 +17,6 @@ import lombok.SneakyThrows;
 @AllArgsConstructor
 public class RedTextPosition {
 
-    private String textMatrix;
     private float[] position;
 
     @JsonIgnore
@@ -56,8 +55,6 @@ public class RedTextPosition {
 
         pos.setFontSizeInPt(textPosition.getFontSizeInPt());
 
-        pos.setTextMatrix(textPosition.getTextMatrix().toString());
-
         var position = new float[4];
 
         position[0] = textPosition.getXDirAdj();
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TableExtractionService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TableExtractionService.java
index 4fe5b10..1d486c9 100644
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TableExtractionService.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TableExtractionService.java
@@ -79,10 +79,12 @@ public class TableExtractionService {
      * @param cleanRulings The lines used to build the table.
      * @param page         Page object that contains textblocks and statistics.
      */
-    public void extractTables(CleanRulings cleanRulings, ClassificationPage page, LayoutParsingType layoutParsingType) {
+        public void extractTables(CleanRulings cleanRulings, ClassificationPage page, LayoutParsingType layoutParsingType) {
 
         List<Cell> cells = findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical(), layoutParsingType);
 
+
+
         List<TextPageBlock> toBeRemoved = new ArrayList<>();
 
         for (AbstractPageBlock abstractPageBlock : page.getTextBlocks()) {
@@ -134,6 +136,14 @@ public class TableExtractionService {
         page.getTextBlocks().removeAll(toBeRemoved);
     }
 
+    /** Returns the spreadsheet areas with positive width and height found from the cells formed by the given rulings. */
+    public List<Rectangle> getSpreadSheetArea(CleanRulings cleanRulings, LayoutParsingType layoutParsingType) {
+
+        return findSpreadsheetsFromCells(findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical(), layoutParsingType)).stream()
+                .filter(area -> area.getWidth() > 0f && area.getHeight() > 0f)
+                .toList();
+    }
+
 
     public List<Cell> findCells(List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines, LayoutParsingType layoutParsingType) {
 
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/mapper/PropertiesMapper.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/mapper/PropertiesMapper.java
index f6c66cb..d82a257 100644
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/mapper/PropertiesMapper.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/mapper/PropertiesMapper.java
@@ -1,6 +1,7 @@
 package com.knecon.fforesight.service.layoutparser.processor.services.mapper;
 
 import java.awt.geom.Rectangle2D;
+import java.util.Collections;
 import java.util.HashMap;
 import java.util.Locale;
 import java.util.Map;
@@ -8,6 +9,7 @@ import java.util.Map;
 import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
 import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Image;
 import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType;
+import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
 import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
 import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell;
 
@@ -44,6 +46,8 @@ public class PropertiesMapper {
     public static Map<String, String> buildTableProperties(Table table) {
 
         Map<String, String> properties = new HashMap<>();
+        Page page = table.getFirstPage();
+        properties.put(DocumentStructure.TableProperties.PAGE, String.valueOf(page.getNumber()));
         properties.put(DocumentStructure.TableProperties.NUMBER_OF_ROWS, String.valueOf(table.getNumberOfRows()));
         properties.put(DocumentStructure.TableProperties.NUMBER_OF_COLS, String.valueOf(table.getNumberOfCols()));
         return properties;
@@ -69,6 +73,7 @@ public class PropertiesMapper {
 
     public static void parseTableProperties(Map<String, String> properties, Table.TableBuilder builder) {
 
+        builder.page(Integer.parseInt(properties.getOrDefault(DocumentStructure.TableProperties.PAGE, "0"))); // a map without PAGE falls back to 0 instead of failing in parseInt
         builder.numberOfRows(Integer.parseInt(properties.get(DocumentStructure.TableProperties.NUMBER_OF_ROWS)));
         builder.numberOfCols(Integer.parseInt(properties.get(DocumentStructure.TableProperties.NUMBER_OF_COLS)));
     }
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFLinesTextStripper.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFLinesTextStripper.java
index d3309bd..f92add4 100644
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFLinesTextStripper.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFLinesTextStripper.java
@@ -254,11 +254,9 @@ public class PDFLinesTextStripper extends PDFTextStripper {
             }
 
             // Strange but sometimes this is happening, for example: Metolachlor2.pdf
-            if (i > 0 && textPositions.get(i).getXDirAdj() < textPositions.get(i - 1).getXDirAdj()) {
+            if (checkIfCurrentPositionIsToTheRightOfPreviousPosition(i,textPositions)) {
                 List<TextPosition> sublist = textPositions.subList(startIndex, i);
-                if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0)
-                        .getUnicode()
-                        .equals("\t")))) {
+                if (checkIfSequenceContainsOnlyWhitespaces(sublist)) {
                     textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart));
                 }
                 startIndex = i;
@@ -266,9 +264,7 @@ public class PDFLinesTextStripper extends PDFTextStripper {
 
             if (textPositions.get(i).getRotation() == 0 && i > 0 && textPositions.get(i).getX() > textPositions.get(i - 1).getEndX() + 1) {
                 List<TextPosition> sublist = textPositions.subList(startIndex, i);
-                if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0)
-                        .getUnicode()
-                        .equals("\t")))) {
+                if (checkIfSequenceContainsOnlyWhitespaces(sublist)) {
                     textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart));
                 }
                 startIndex = i;
@@ -278,13 +274,10 @@ public class PDFLinesTextStripper extends PDFTextStripper {
                     .getUnicode()
                     .equals("\t")) && i <= textPositions.size() - 2) {
                 List<TextPosition> sublist = textPositions.subList(startIndex, i);
-                if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0)
-                        .getUnicode()
-                        .equals("\t")))) {
+                if (checkIfSequenceContainsOnlyWhitespaces(sublist)) {
 
                     // Remove false sequence ends (whitespaces)
-                    if (previous != null && sublist.get(0).getYDirAdj() == previous.getYDirAdj() && sublist.get(0)
-                            .getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()) < 0.01) {
+                    if (checkIfGapSizeBetweenCharactersSmallerThanMaximum(previous,sublist,0.01f)) {
                         for (TextPosition t : sublist) {
                             textPositionSequences.get(textPositionSequences.size() - 1).add(t);
                         }
@@ -318,6 +311,25 @@ public class PDFLinesTextStripper extends PDFTextStripper {
         super.writeString(text);
     }
 
+    public boolean checkIfCurrentPositionIsToTheRightOfPreviousPosition(int i, List<TextPosition> textPositions) {
+        return i > 0 && textPositions.get(i - 1).getXDirAdj() > textPositions.get(i).getXDirAdj(); // NOTE: contrary to the name, true means the current position starts LEFT of the previous one
+    }
+
+    public boolean checkIfSequenceContainsOnlyWhitespaces(List<TextPosition> sublist) {
+        if (sublist.size() != 1) { return !sublist.isEmpty(); } // NOTE: inverse of the name - true means the sequence is not empty and not a lone whitespace
+        String glyph = sublist.get(0).getUnicode();
+        return !(glyph.equals(" ") || glyph.equals("\u00A0") || glyph.equals("\t")); // a single space, no-break space or tab yields false
+    }
+
+    public boolean checkIfGapSizeBetweenCharactersSmallerThanMaximum(RedTextPosition previous, List<TextPosition> sublist, float maximumGapSize) {
+        if (previous == null || sublist.get(0).getYDirAdj() != previous.getYDirAdj()) { return false; } // needs a predecessor with the same adjusted y value
+        return sublist.get(0).getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()) < maximumGapSize; // distance from the end of previous to the start of the first element
+    }
+
+    // !(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0)
+    //                        .getUnicode()
+    //                        .equals("\t")))
+
 
     @Override
     public String getText(PDDocument doc) throws IOException {