diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/PDFLinesTextStripper.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/PDFLinesTextStripper.java index ed69fecd..d6d7e102 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/PDFLinesTextStripper.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/PDFLinesTextStripper.java @@ -44,10 +44,10 @@ import lombok.extern.slf4j.Slf4j; public class PDFLinesTextStripper extends PDFTextStripper { @Getter - private float minCharWidth = Float.MAX_VALUE; + private int maxCharWidths; @Getter - private float minCharHeight = Float.MAX_VALUE; + private int maxCharHeight; @Getter private final List textPositionSequences = new ArrayList<>(); @@ -201,8 +201,16 @@ public class PDFLinesTextStripper extends PDFTextStripper { int startIndex = 0; for (int i = 0; i <= textPositions.size() - 1; i++) { - minCharWidth = Math.min(minCharWidth, textPositions.get(i).getWidthDirAdj()); - minCharHeight = Math.min(minCharHeight, textPositions.get(i).getHeightDir()); + + int charHeight = (int) textPositions.get(i).getHeightDir(); + if(charHeight > maxCharHeight){ + maxCharHeight = charHeight; + } + + int charWidth = (int) textPositions.get(i).getWidthDirAdj(); + if(charWidth > maxCharWidths){ + maxCharWidths = charWidth; + } if (i == 0 && textPositions.get(i).getUnicode().equals(" ")) { startIndex++; @@ -241,8 +249,8 @@ public class PDFLinesTextStripper extends PDFTextStripper { @Override public String getText(PDDocument doc) throws IOException { - minCharWidth = Float.MAX_VALUE; - minCharHeight = Float.MAX_VALUE; + maxCharWidths = 0; + maxCharHeight = 0; textPositionSequences.clear(); rulings.clear(); graphicsPath.clear(); diff --git 
a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/ParsedElements.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/ParsedElements.java index 5fc5717f..28983e69 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/ParsedElements.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/ParsedElements.java @@ -17,6 +17,6 @@ public class ParsedElements { private boolean landscape; private boolean rotated; - private float minCharWidth; - private float minCharHeight; + private float maxCharWidth; + private float maxCharHeight; } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationService.java index 9caff395..8106d5eb 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationService.java @@ -21,7 +21,6 @@ import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractT import com.iqser.red.service.redaction.v1.server.tableextraction.model.CleanRulings; import com.iqser.red.service.redaction.v1.server.tableextraction.service.RulingCleaningService; import com.iqser.red.service.redaction.v1.server.tableextraction.service.TableExtractionService; -import com.iqser.red.service.redaction.v1.server.tableextraction.utils.Utils; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; 
@@ -37,6 +36,7 @@ public class PdfSegmentationService { private final ClassificationService classificationService; private final SectionsBuilderService sectionsBuilderService; + public Document parseDocument(PDDocument pdDocument) throws IOException { Document document = new Document(); @@ -56,19 +56,21 @@ public class PdfSegmentationService { int rotation = pdPage.getRotation(); boolean isRotated = rotation != 0 && rotation != 360; - ParsedElements parsedElements = ParsedElements - .builder() + + ParsedElements parsedElements = ParsedElements.builder() .rulings(stripper.getRulings()) .sequences(stripper.getTextPositionSequences()) - .minCharWidth(Utils.round(stripper.getMinCharWidth(), 2)) - .minCharHeight(Utils.round(stripper.getMinCharHeight(), 2)) + .maxCharWidth(stripper.getMaxCharWidths()) + .maxCharHeight(stripper.getMaxCharHeight()) .landscape(isLandscape) .rotated(isRotated) .build(); - CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(parsedElements.getRulings(), parsedElements.getMinCharWidth(), parsedElements.getMinCharHeight()); + CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(parsedElements.getRulings(), parsedElements + .getMaxCharWidth(), parsedElements.getMaxCharHeight()); - Page page = blockificationService.blockify(parsedElements.getSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical()); + Page page = blockificationService.blockify(parsedElements.getSequences(), cleanRulings.getHorizontal(), cleanRulings + .getVertical()); page.setRotation(rotation); tableExtractionService.extractTables(cleanRulings, page); @@ -91,7 +93,10 @@ public class PdfSegmentationService { } + + private void increaseDocumentStatistics(Page page, Document document) { + if (!page.isLandscape()) { document.getFontSizeCounter().addAll(page.getFontSizeCounter().getCountPerValue()); } @@ -100,6 +105,7 @@ public class PdfSegmentationService { document.getFontStyleCounter().addAll(page.getFontStyleCounter().getCountPerValue()); } 
+ private void buildPageStatistics(Page page) { // Collect all statistics for the page, except from blocks inside tables, as tables will always be added to BodyTextFrame. diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/service/RulingCleaningService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/service/RulingCleaningService.java index 1dbe435d..1d3d81f2 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/service/RulingCleaningService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/service/RulingCleaningService.java @@ -18,9 +18,9 @@ import com.iqser.red.service.redaction.v1.server.tableextraction.utils.Utils; @Service public class RulingCleaningService { - public CleanRulings getCleanRulings(List rulings, float minCharWidth, float minCharHeight){ + public CleanRulings getCleanRulings(List rulings, float maxCharWidth, float maxCharHeight){ if (!rulings.isEmpty()) { - snapPoints(rulings, minCharWidth , minCharHeight); + snapPoints(rulings, maxCharWidth , maxCharHeight); } List vrs = new ArrayList<>(); diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java index 6879382d..4a6a3491 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java @@ -1,5 +1,6 @@ package 
com.iqser.red.service.redaction.v1.server; +import static org.assertj.core.api.Assertions.assertThat; import static org.mockito.Mockito.when; import static org.springframework.boot.test.context.SpringBootTest.WebEnvironment.DEFINED_PORT; @@ -269,7 +270,7 @@ public class RedactionIntegrationTest { System.out.println("redactionTest"); long start = System.currentTimeMillis(); - ClassPathResource pdfFileResource = new ClassPathResource("files/new/Thiabendazole DAR Addendum for ED_April_2020.pdf"); + ClassPathResource pdfFileResource = new ClassPathResource("files/Fludioxonil/50 Fludioxonil_RAR_01_Volume_1_2018-02-21.pdf"); RedactionRequest request = RedactionRequest.builder() .document(IOUtils.toByteArray(pdfFileResource.getInputStream())) @@ -388,8 +389,7 @@ public class RedactionIntegrationTest { public void htmlTablesTest() throws IOException { System.out.println("htmlTablesTest"); - ClassPathResource pdfFileResource = new ClassPathResource("files/Fludioxonil/51 " + - "Fludioxonil_RAR_02_Volume_2_2018-02-21.pdf"); + ClassPathResource pdfFileResource = new ClassPathResource("files/Metolachlor/S-Metolachlor_RAR_02_Volume_2_2018-09-06.pdf"); RedactionRequest request = RedactionRequest.builder() .document(IOUtils.toByteArray(pdfFileResource.getInputStream())) @@ -422,6 +422,27 @@ public class RedactionIntegrationTest { } + @Test + public void phantomCellsDocumentTest() throws IOException { + + ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Phantom Cells.pdf"); + + RedactionRequest request = RedactionRequest.builder() + .document(IOUtils.toByteArray(pdfFileResource.getInputStream())) + .build(); + request.setFlatRedaction(false); + + RedactionResult result = redactionController.redact(request); + + result.getRedactionLog().getRedactionLogEntry().forEach(entry -> { + if(!entry.isHint()){ + assertThat(entry.getReason()).isEqualTo("Not redacted because row is not a vertebrate study"); + } + }); + } + + + private static String 
loadFromClassPath(String path) { URL resource = ResourceLoader.class.getClassLoader().getResource(path); diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Minimal Examples/Phantom Cells.pdf b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Minimal Examples/Phantom Cells.pdf new file mode 100644 index 00000000..441a6af5 Binary files /dev/null and b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Minimal Examples/Phantom Cells.pdf differ