From 01493dc033e9191b985ed82862890d78a000adb3 Mon Sep 17 00:00:00 2001 From: yhampe Date: Tue, 7 Nov 2023 08:47:28 +0100 Subject: [PATCH] TAAS-103: Table Detection and rotated text * added page property to DocumentStructure to be able to get the page of found tables * added a method to TableExtractionService to get the table area * added calculateMinCharWidthAndMaxCharHeightInsideTable to LayoutParsingPipeline to calculate the values based upon table area * refactored PDFLinesTextStripper for better readability * removed textMatrix from RedTextPosition as it is no longer needed --- .../processor/LayoutParsingPipeline.java | 51 +++++++++++++++++-- .../processor/model/graph/nodes/Table.java | 2 +- .../processor/model/text/RedTextPosition.java | 3 -- .../services/TableExtractionService.java | 12 ++++- .../services/mapper/PropertiesMapper.java | 5 ++ .../parsing/PDFLinesTextStripper.java | 36 ++++++++----- 6 files changed, 89 insertions(+), 20 deletions(-) diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java index 3ec8d47..5b82f93 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java @@ -26,6 +26,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage; import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; +import
com.knecon.fforesight.service.layoutparser.processor.model.table.Rectangle; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.CvTableParsingAdapter; @@ -189,8 +190,15 @@ public class LayoutParsingPipeline { PDRectangle cropbox = pdPage.getCropBox(); CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(pdfTableCells.get(pageNumber), stripper.getRulings(), - stripper.getMinCharWidth(), - stripper.getMaxCharHeight()); + 1, + 1); + + List spreedSheetArea = tableExtractionService.getSpreadSheetArea(cleanRulings, layoutParsingType); + + Map newValues = calculateMinCharWidthAndMaxCharHeightInsideTable(stripper,spreedSheetArea,10f,1f); + + cleanRulings = rulingCleaningService.getCleanRulings(pdfTableCells.get(pageNumber), stripper.getRulings(), newValues.get("minCharWidth"), newValues.get("maxCharHeight")); + ClassificationPage classificationPage = switch (layoutParsingType) { case REDACT_MANAGER -> redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical()); @@ -234,6 +242,43 @@ public class LayoutParsingPipeline { return classificationDocument; } + /** + * Finds the smallest character by width + * and the largest character by height + * inside a table area + * + * @param stripper the stripper containing the words + * @param spreedSheetArea the table area + * @param initialMinCharWidth an initial value for a minimum char width + * @param initialMaxCharHeight an initial value for a maximum char height + * + * @return Map with both values + */ + + private Map calculateMinCharWidthAndMaxCharHeightInsideTable(PDFLinesTextStripper stripper, List spreedSheetArea, float initialMinCharWidth, float initialMaxCharHeight) { + + float newMinCharWidth = initialMinCharWidth; + float
newMaxCharHeight = initialMaxCharHeight; + Map result = new HashMap<>(); + for(var textPositionSequence: stripper.getTextPositionSequences() ) { + for(var redTextPosition: textPositionSequence.getTextPositions()) { + for(var area: spreedSheetArea) { + if(area.contains(redTextPosition.getPosition()[0], redTextPosition.getPosition()[1], redTextPosition.getPosition()[2], redTextPosition.getPosition()[3])) { + if(redTextPosition.getHeightDir() > newMaxCharHeight) { + newMaxCharHeight = redTextPosition.getHeightDir(); + } + if(redTextPosition.getWidthDirAdj() < newMinCharWidth) { + newMinCharWidth = redTextPosition.getWidthDirAdj(); + } + } + } + } + } + result.put("minCharWidth",newMinCharWidth); + result.put("maxCharHeight",newMaxCharHeight); + return result; + } + private Map> convertMarkedContents(List pdMarkedContents) { @@ -246,7 +291,7 @@ public class LayoutParsingPipeline { private void increaseDocumentStatistics(ClassificationPage classificationPage, ClassificationDocument document) { -// if (!classificationPage.isLandscape()) { + // if (!classificationPage.isLandscape()) { document.getFontSizeCounter().addAll(classificationPage.getFontSizeCounter().getCountPerValue()); // } document.getFontCounter().addAll(classificationPage.getFontCounter().getCountPerValue()); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Table.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Table.java index bd33f6d..b08e2b5 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Table.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Table.java @@ -34,7 +34,7 @@ public class Table implements SemanticNode { 
int numberOfRows; int numberOfCols; - + int page; TextBlock textBlock; @Builder.Default diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/RedTextPosition.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/RedTextPosition.java index 92059ae..ccea113 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/RedTextPosition.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/RedTextPosition.java @@ -17,7 +17,6 @@ import lombok.SneakyThrows; @AllArgsConstructor public class RedTextPosition { - private String textMatrix; private float[] position; @JsonIgnore @@ -56,8 +55,6 @@ public class RedTextPosition { pos.setFontSizeInPt(textPosition.getFontSizeInPt()); - pos.setTextMatrix(textPosition.getTextMatrix().toString()); - var position = new float[4]; position[0] = textPosition.getXDirAdj(); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TableExtractionService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TableExtractionService.java index 4fe5b10..1d486c9 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TableExtractionService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TableExtractionService.java @@ -79,10 +79,12 @@ public class TableExtractionService { * @param cleanRulings The lines used to build the table. 
* @param page Page object that contains textblocks and statistics. */ - public void extractTables(CleanRulings cleanRulings, ClassificationPage page, LayoutParsingType layoutParsingType) { + public void extractTables(CleanRulings cleanRulings, ClassificationPage page, LayoutParsingType layoutParsingType) { List cells = findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical(), layoutParsingType); + + List toBeRemoved = new ArrayList<>(); for (AbstractPageBlock abstractPageBlock : page.getTextBlocks()) { @@ -134,6 +136,14 @@ public class TableExtractionService { page.getTextBlocks().removeAll(toBeRemoved); } + public List getSpreadSheetArea(CleanRulings cleanRulings, LayoutParsingType layoutParsingType) { + + List cells = findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical(), layoutParsingType); + List spreadsheetAreas = findSpreadsheetsFromCells(cells).stream().filter(r -> r.getWidth() > 0f && r.getHeight() > 0f).toList(); + return spreadsheetAreas; + + } + public List findCells(List horizontalRulingLines, List verticalRulingLines, LayoutParsingType layoutParsingType) { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/mapper/PropertiesMapper.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/mapper/PropertiesMapper.java index f6c66cb..d82a257 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/mapper/PropertiesMapper.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/mapper/PropertiesMapper.java @@ -1,6 +1,7 @@ package com.knecon.fforesight.service.layoutparser.processor.services.mapper; import java.awt.geom.Rectangle2D; +import java.util.Collections; import java.util.HashMap; import 
java.util.Locale; import java.util.Map; @@ -8,6 +9,7 @@ import java.util.Map; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Image; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell; @@ -44,6 +46,8 @@ public class PropertiesMapper { public static Map buildTableProperties(Table table) { Map properties = new HashMap<>(); + Page page = table.getFirstPage(); + properties.put(DocumentStructure.TableProperties.PAGE, String.valueOf(page.getNumber())); properties.put(DocumentStructure.TableProperties.NUMBER_OF_ROWS, String.valueOf(table.getNumberOfRows())); properties.put(DocumentStructure.TableProperties.NUMBER_OF_COLS, String.valueOf(table.getNumberOfCols())); return properties; @@ -69,6 +73,7 @@ public class PropertiesMapper { public static void parseTableProperties(Map properties, Table.TableBuilder builder) { + builder.page(Integer.parseInt(properties.get(DocumentStructure.TableProperties.PAGE))); builder.numberOfRows(Integer.parseInt(properties.get(DocumentStructure.TableProperties.NUMBER_OF_ROWS))); builder.numberOfCols(Integer.parseInt(properties.get(DocumentStructure.TableProperties.NUMBER_OF_COLS))); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFLinesTextStripper.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFLinesTextStripper.java index d3309bd..f92add4 100644 --- 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFLinesTextStripper.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFLinesTextStripper.java @@ -254,11 +254,9 @@ public class PDFLinesTextStripper extends PDFTextStripper { } // Strange but sometimes this is happening, for example: Metolachlor2.pdf - if (i > 0 && textPositions.get(i).getXDirAdj() < textPositions.get(i - 1).getXDirAdj()) { + if (checkIfCurrentPositionIsToTheRightOfPreviousPosition(i,textPositions)) { List sublist = textPositions.subList(startIndex, i); - if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0) - .getUnicode() - .equals("\t")))) { + if (checkIfSequenceContainsOnlyWhitespaces(sublist)) { textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart)); } startIndex = i; @@ -266,9 +264,7 @@ public class PDFLinesTextStripper extends PDFTextStripper { if (textPositions.get(i).getRotation() == 0 && i > 0 && textPositions.get(i).getX() > textPositions.get(i - 1).getEndX() + 1) { List sublist = textPositions.subList(startIndex, i); - if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0) - .getUnicode() - .equals("\t")))) { + if (checkIfSequenceContainsOnlyWhitespaces(sublist)) { textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart)); } startIndex = i; @@ -278,13 +274,10 @@ public class PDFLinesTextStripper extends PDFTextStripper { .getUnicode() .equals("\t")) && i <= textPositions.size() - 2) { List sublist = textPositions.subList(startIndex, i); - if (!(sublist.isEmpty() || 
sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0) - .getUnicode() - .equals("\t")))) { + if (checkIfSequenceContainsOnlyWhitespaces(sublist)) { // Remove false sequence ends (whitespaces) - if (previous != null && sublist.get(0).getYDirAdj() == previous.getYDirAdj() && sublist.get(0) - .getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()) < 0.01) { + if (checkIfGapSizeBetweenCharactersSmallerThanMaximum(previous,sublist,0.01f)) { for (TextPosition t : sublist) { textPositionSequences.get(textPositionSequences.size() - 1).add(t); } @@ -318,6 +311,25 @@ public class PDFLinesTextStripper extends PDFTextStripper { super.writeString(text); } + public boolean checkIfCurrentPositionIsToTheRightOfPreviousPosition(int i, List textPositions) { + return i > 0 && textPositions.get(i).getXDirAdj() < textPositions.get(i - 1).getXDirAdj(); + } + + public boolean checkIfSequenceContainsOnlyWhitespaces(List sublist) { + return !(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0) + .getUnicode() + .equals("\t"))); + } + + public boolean checkIfGapSizeBetweenCharactersSmallerThanMaximum(RedTextPosition previous, List sublist, float maximumGapSize) { + return previous != null && sublist.get(0).getYDirAdj() == previous.getYDirAdj() && sublist.get(0) + .getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()) < maximumGapSize; + } + + // !(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0) + // .getUnicode() + // .equals("\t"))) + @Override public String getText(PDDocument doc) throws IOException {