From a101b98a400b2a35635f2c5e0e894ac849288c3b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dominique=20Eifl=C3=A4nder?= Date: Thu, 4 Feb 2021 09:55:17 +0100 Subject: [PATCH] Fixed several table extraction problems --- .../server/parsing/PDFLinesTextStripper.java | 31 ++++++++++++++++--- .../server/parsing/model/ParsedElements.java | 1 + .../service/EntityRedactionService.java | 10 +++--- .../segmentation/PdfSegmentationService.java | 6 ++-- .../segmentation/SectionsBuilderService.java | 3 ++ .../server/tableextraction/model/Table.java | 4 +++ .../service/RulingCleaningService.java | 27 ++++++++-------- .../v1/server/RedactionIntegrationTest.java | 4 +-- 8 files changed, 59 insertions(+), 27 deletions(-) diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/PDFLinesTextStripper.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/PDFLinesTextStripper.java index ad828d0c..5a4265bd 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/PDFLinesTextStripper.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/PDFLinesTextStripper.java @@ -54,7 +54,16 @@ public class PDFLinesTextStripper extends PDFTextStripper { protected PDPage pdpage; @Getter - private int minCharWidths; + private int minCharWidth; + + @Getter + private int maxCharWidth; + + @Getter + private int minCharHeight; + + @Getter + private int maxCharHeight; @Getter private final List textPositionSequences = new ArrayList<>(); @@ -280,8 +289,19 @@ public class PDFLinesTextStripper extends PDFTextStripper { for (int i = 0; i <= textPositions.size() - 1; i++) { int charWidth = (int) textPositions.get(i).getWidthDirAdj(); - if (charWidth < minCharWidths) { - minCharWidths = charWidth; + if (charWidth < minCharWidth) { + minCharWidth = 
charWidth; + } + if (charWidth > maxCharWidth) { + maxCharWidth = charWidth; + } + + int charHeight = (int) textPositions.get(i).getHeightDir(); + if (charHeight < minCharHeight) { + minCharHeight = charHeight; + } + if (charHeight > maxCharHeight) { + maxCharHeight = charHeight; } if (i == 0 && (textPositions.get(i).getUnicode().equals(" ") || textPositions.get(i) @@ -333,7 +353,10 @@ public class PDFLinesTextStripper extends PDFTextStripper { @Override public String getText(PDDocument doc) throws IOException { - minCharWidths = Integer.MAX_VALUE; + minCharWidth = Integer.MAX_VALUE; + maxCharWidth = 0; + minCharHeight = Integer.MAX_VALUE; + maxCharHeight = 0; textPositionSequences.clear(); imageBounds = new ArrayList<>(); rulings.clear(); diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/ParsedElements.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/ParsedElements.java index 03781dc6..479ac2d5 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/ParsedElements.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/ParsedElements.java @@ -20,4 +20,5 @@ public class ParsedElements { private boolean rotated; private float minCharWidth; + private float maxCharWidth; } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java index 70e0088a..a649c356 100644 --- 
a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java @@ -157,7 +157,6 @@ public class EntityRedactionService { List sectionSearchableTextPairs = new ArrayList<>(); - int rowNumber = 0; for (List row : table.getRows()) { SearchableText searchableRow = new SearchableText(); Map tabularData = new HashMap<>(); @@ -170,11 +169,14 @@ public class EntityRedactionService { addSectionToManualRedactions(cell.getTextBlocks(), manualRedactions, table.getHeadline(), sectionNumber.intValue()); int cellStart = start; - if(rowNumber != 0) { + if (!cell.isHeaderCell()) { cell.getHeaderCells().forEach(headerCell -> { StringBuilder headerBuilder = new StringBuilder(); headerCell.getTextBlocks().forEach(textBlock -> headerBuilder.append(textBlock.getText())); - String headerName = headerBuilder.toString().replaceAll("\n", "").replaceAll(" ", "").replaceAll("-", ""); + String headerName = headerBuilder.toString() + .replaceAll("\n", "") + .replaceAll(" ", "") + .replaceAll("-", ""); tabularData.put(headerName, new CellValue(cell.getTextBlocks(), cellStart)); }); } @@ -205,8 +207,8 @@ public class EntityRedactionService { .build(), searchableRow)); sectionNumber.incrementAndGet(); - rowNumber++; } + return sectionSearchableTextPairs; } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationService.java index f1478344..0e6f43f3 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationService.java +++ 
b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationService.java @@ -61,13 +61,13 @@ public class PdfSegmentationService { .rulings(stripper.getRulings()) .sequences(stripper.getTextPositionSequences()) .imageBounds(stripper.getImageBounds()) - .minCharWidth(stripper.getMinCharWidths()) + .minCharWidth(stripper.getMinCharWidth()) + .maxCharWidth(stripper.getMaxCharWidth()) .landscape(isLandscape) .rotated(isRotated) .build(); - CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(parsedElements.getRulings(), parsedElements - .getMinCharWidth()); + CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(parsedElements.getRulings(), stripper.getMinCharWidth(), stripper.getMaxCharHeight()); Page page = blockificationService.blockify(parsedElements.getSequences(), cleanRulings.getHorizontal(), cleanRulings .getVertical()); diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/SectionsBuilderService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/SectionsBuilderService.java index 75ed5542..1b824ea9 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/SectionsBuilderService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/SectionsBuilderService.java @@ -188,6 +188,9 @@ public class SectionsBuilderService { for (int i = table.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table List row = table.getRows().get(i); + if(row.size() == 1){ + continue; + } boolean allNonHeader = true; for (Cell cell : row) { if (cell.isHeaderCell()) { diff --git 
a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Table.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Table.java index 67a4cc72..8f55b482 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Table.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Table.java @@ -110,6 +110,10 @@ public class Table extends AbstractTextContainer { // we move from left to right and top to bottom for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) { List rowCells = rows.get(rowIndex); + if(rowCells.size() == 1){ + continue; + } + for (int colIndex = 0; colIndex < rowCells.size(); colIndex++) { Cell cell = rowCells.get(colIndex); List cellsToTheLeft = rowCells.subList(0, colIndex); diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/service/RulingCleaningService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/service/RulingCleaningService.java index 5bbc29c0..82ca3bb7 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/service/RulingCleaningService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/service/RulingCleaningService.java @@ -18,9 +18,10 @@ import com.iqser.red.service.redaction.v1.server.tableextraction.utils.Utils; @Service public class RulingCleaningService { - public CleanRulings getCleanRulings(List rulings, float minCharWidth){ + public CleanRulings getCleanRulings(List rulings, float minCharWidth, float 
maxCharHeight) { + if (!rulings.isEmpty()) { - snapPoints(rulings, minCharWidth , minCharWidth); + snapPoints(rulings, minCharWidth, maxCharHeight); } List vrs = new ArrayList<>(); @@ -39,13 +40,10 @@ public class RulingCleaningService { } List horizontalRulingLines = collapseOrientedRulings(hrs); - return CleanRulings - .builder() - .vertical(verticalRulingLines) - .horizontal(horizontalRulingLines) - .build(); + return CleanRulings.builder().vertical(verticalRulingLines).horizontal(horizontalRulingLines).build(); } + public void snapPoints(List rulings, float xThreshold, float yThreshold) { // collect points and keep a Line -> p1,p2 map @@ -122,12 +120,14 @@ public class RulingCleaningService { private List collapseOrientedRulings(List lines) { + int COLINEAR_OR_PARALLEL_PIXEL_EXPAND_AMOUNT = 1; return collapseOrientedRulings(lines, COLINEAR_OR_PARALLEL_PIXEL_EXPAND_AMOUNT); } private List collapseOrientedRulings(List lines, int expandAmount) { + ArrayList rv = new ArrayList<>(); lines.sort((a, b) -> { final float diff = a.getPosition() - b.getPosition(); @@ -141,25 +141,24 @@ public class RulingCleaningService { final float lastStart = last.getStart(); final float lastEnd = last.getEnd(); - final boolean lastFlipped = lastStart > lastEnd; + final boolean lastFlipped = lastStart > lastEnd; final boolean nextFlipped = next_line.getStart() > next_line.getEnd(); boolean differentDirections = nextFlipped != lastFlipped; - float nextS = differentDirections ? next_line.getEnd() : next_line.getStart(); + float nextS = differentDirections ? next_line.getEnd() : next_line.getStart(); float nextE = differentDirections ? next_line.getStart() : next_line.getEnd(); final float newStart = lastFlipped ? Math.max(nextS, lastStart) : Math.min(nextS, lastStart); - final float newEnd = lastFlipped ? Math.min(nextE, lastEnd) : Math.max(nextE, lastEnd); + final float newEnd = lastFlipped ? 
Math.min(nextE, lastEnd) : Math.max(nextE, lastEnd); last.setStartEnd(newStart, newEnd); assert !last.oblique(); - } - else if (next_line.length() == 0) { + } else if (next_line.length() == 0) { continue; - } - else { + } else { rv.add(next_line); } } return rv; } + } diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java index e6eef8d8..eb34a1dd 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java @@ -441,7 +441,7 @@ public class RedactionIntegrationTest { System.out.println("redactionTest"); long start = System.currentTimeMillis(); - ClassPathResource pdfFileResource = new ClassPathResource("files/Metolachlor/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf"); + ClassPathResource pdfFileResource = new ClassPathResource("files/Metolachlor/S-Metolachlor_RAR_02_Volume_2_2018-09-06.pdf"); AnalyzeRequest request = AnalyzeRequest.builder() .ruleSetId(TEST_RULESET_ID) @@ -590,7 +590,7 @@ public class RedactionIntegrationTest { public void htmlTablesTest() throws IOException { System.out.println("htmlTablesTest"); - ClassPathResource pdfFileResource = new ClassPathResource("files/Fludioxonil/52 Fludioxonil_RAR_07_Volume_3CA_B-5_2018-02-21.pdf"); + ClassPathResource pdfFileResource = new ClassPathResource("files/Metolachlor/S-Metolachlor_RAR_02_Volume_2_2018-09-06.pdf"); RedactionRequest request = RedactionRequest.builder() .document(IOUtils.toByteArray(pdfFileResource.getInputStream()))