From 270129cd7358b47349d3de0df41c3fdd0905ab1e Mon Sep 17 00:00:00 2001 From: Kilian Schuettler Date: Tue, 25 Jul 2023 18:12:57 +0200 Subject: [PATCH] outputs almost equal current redaction-service in regards to RedactManager * 3/200 files have minimal whitespace/sorting errors, most likely rounding errors --- .../processor/LayoutParsingPipeline.java | 1 + .../processor/model/table/Ruling.java | 10 +++++++--- .../processor/services/PdfParsingService.java | 2 +- .../services/RulingCleaningService.java | 5 ++++- .../services/TableExtractionService.java | 20 +++++++++++-------- 5 files changed, 25 insertions(+), 13 deletions(-) diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java index f945b6e..5a61430 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java @@ -102,6 +102,7 @@ public class LayoutParsingPipeline { } sectionsBuilderService.buildSections(classificationDocument); + sectionsBuilderService.addImagesToSections(classificationDocument); return DocumentGraphFactory.buildDocumentGraph(classificationDocument); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/Ruling.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/Ruling.java index 39240aa..53f6bde 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/Ruling.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/Ruling.java @@ -30,14 +30,18 @@ public class Ruling extends Line2D.Float { public Ruling straightenVertical() { + double y1 = Math.min(getY1(), getY2()); + double y2 = Math.max(getY1(), getY2()); double x = (getX1() + getX2()) / 2; - return new Ruling(new Point2D.Double(x, getY1()), new Point2D.Double(x, getY2())); + return new Ruling(new Point2D.Double(x, y1), new Point2D.Double(x, y2)); } - public Ruling straightenHorizonatl() { + public Ruling straightenHorizontal() { + double x1 = Math.min(getX1(), getX2()); + double x2 = Math.max(getX1(), getX2()); double y = (getY1() + getY2()) / 2; - return new Ruling(new Point2D.Double(getX1(), y), new Point2D.Double(getX2(), y)); + return new Ruling(new Point2D.Double(x1, y), new Point2D.Double(x2, y)); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/PdfParsingService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/PdfParsingService.java index 5b7fb23..aa00c21 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/PdfParsingService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/PdfParsingService.java @@ -108,7 +108,7 @@ public class PdfParsingService { imageServiceResponseAdapter.findOcr(classificationPage); } - tableExtractionService.extractTables(cleanRulings, classificationPage); + tableExtractionService.extractTables(cleanRulings, classificationPage, layoutParsingType); buildPageStatistics(classificationPage); increaseDocumentStatistics(classificationPage, document); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/RulingCleaningService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/RulingCleaningService.java index bb102c9..e949c52 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/RulingCleaningService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/RulingCleaningService.java @@ -53,7 +53,10 @@ public class RulingCleaningService { } List horizontalRulingLines = collapseOrientedRulings(hrs); - return CleanRulings.builder().vertical(verticalRulingLines).horizontal(horizontalRulingLines).build(); + return CleanRulings.builder() + .vertical(verticalRulingLines) + .horizontal(horizontalRulingLines) + .build(); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TableExtractionService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TableExtractionService.java index c89db54..4fe5b10 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TableExtractionService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TableExtractionService.java @@ -12,6 +12,7 @@ import java.util.Set; import org.springframework.stereotype.Service; +import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; @@ -78,9 +79,9 @@ public class TableExtractionService { * @param cleanRulings The lines used to build the table. * @param page Page object that contains textblocks and statistics. */ - public void extractTables(CleanRulings cleanRulings, ClassificationPage page) { + public void extractTables(CleanRulings cleanRulings, ClassificationPage page, LayoutParsingType layoutParsingType) { - List cells = findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical()); + List cells = findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical(), layoutParsingType); List toBeRemoved = new ArrayList<>(); @@ -134,13 +135,16 @@ public class TableExtractionService { } - public List findCells(List horizontalRulingLines, List verticalRulingLines) { + public List findCells(List horizontalRulingLines, List verticalRulingLines, LayoutParsingType layoutParsingType) { - for (Ruling r : horizontalRulingLines) { - if (r.getX2() < r.getX1()) { - double a = r.getX2(); - r.x2 = (float) r.getX1(); - r.x1 = (float) a; + if (layoutParsingType.equals(LayoutParsingType.TAAS)) { + // TODO: breaks some tables, for example "1 Abamectin Prr.pdf" try to fix this upstream in RulingCleaningService + for (Ruling r : horizontalRulingLines) { + if (r.getX2() < r.getX1()) { + double a = r.getX2(); + r.x2 = (float) r.getX1(); + r.x1 = (float) a; + } } }