Pull request #43: RED-264: Avoid phantom cells, by merging line till rounded value of biggest char height/width

Merge in RED/redaction-service from RED-264 to master

* commit 'e1adcfb02c4c7d46140884b3d38f8daf1e1aae92':
  Added missing unittest
  RED-264: Avoid phantom cells, by merging line till rounded value of biggest char height/width
This commit is contained in:
Dominique Eiflaender 2020-09-29 15:04:20 +02:00
commit 8b1574845c
6 changed files with 55 additions and 20 deletions

View File

@ -44,10 +44,10 @@ import lombok.extern.slf4j.Slf4j;
public class PDFLinesTextStripper extends PDFTextStripper {
@Getter
private float minCharWidth = Float.MAX_VALUE;
private int maxCharWidths;
@Getter
private float minCharHeight = Float.MAX_VALUE;
private int maxCharHeight;
@Getter
private final List<TextPositionSequence> textPositionSequences = new ArrayList<>();
@ -201,8 +201,16 @@ public class PDFLinesTextStripper extends PDFTextStripper {
int startIndex = 0;
for (int i = 0; i <= textPositions.size() - 1; i++) {
minCharWidth = Math.min(minCharWidth, textPositions.get(i).getWidthDirAdj());
minCharHeight = Math.min(minCharHeight, textPositions.get(i).getHeightDir());
int charHeight = (int) textPositions.get(i).getHeightDir();
if(charHeight > maxCharHeight){
maxCharHeight = charHeight;
}
int charWidth = (int) textPositions.get(i).getWidthDirAdj();
if(charWidth > maxCharWidths){
maxCharWidths = charWidth;
}
if (i == 0 && textPositions.get(i).getUnicode().equals(" ")) {
startIndex++;
@ -241,8 +249,8 @@ public class PDFLinesTextStripper extends PDFTextStripper {
@Override
public String getText(PDDocument doc) throws IOException {
minCharWidth = Float.MAX_VALUE;
minCharHeight = Float.MAX_VALUE;
maxCharWidths = 0;
// Reset the height maximum as well; the width reset was duplicated and the height was never cleared.
maxCharHeight = 0;
textPositionSequences.clear();
rulings.clear();
graphicsPath.clear();

View File

@ -17,6 +17,6 @@ public class ParsedElements {
private boolean landscape;
private boolean rotated;
private float minCharWidth;
private float minCharHeight;
private float maxCharWidth;
private float maxCharHeight;
}

View File

@ -21,7 +21,6 @@ import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractT
import com.iqser.red.service.redaction.v1.server.tableextraction.model.CleanRulings;
import com.iqser.red.service.redaction.v1.server.tableextraction.service.RulingCleaningService;
import com.iqser.red.service.redaction.v1.server.tableextraction.service.TableExtractionService;
import com.iqser.red.service.redaction.v1.server.tableextraction.utils.Utils;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
@ -37,6 +36,7 @@ public class PdfSegmentationService {
private final ClassificationService classificationService;
private final SectionsBuilderService sectionsBuilderService;
public Document parseDocument(PDDocument pdDocument) throws IOException {
Document document = new Document();
@ -56,19 +56,21 @@ public class PdfSegmentationService {
int rotation = pdPage.getRotation();
boolean isRotated = rotation != 0 && rotation != 360;
ParsedElements parsedElements = ParsedElements
.builder()
ParsedElements parsedElements = ParsedElements.builder()
.rulings(stripper.getRulings())
.sequences(stripper.getTextPositionSequences())
.minCharWidth(Utils.round(stripper.getMinCharWidth(), 2))
.minCharHeight(Utils.round(stripper.getMinCharHeight(), 2))
.maxCharWidth(stripper.getMaxCharWidths())
// Height must come from the height getter; the width getter was passed here by mistake.
.maxCharHeight(stripper.getMaxCharHeight())
.landscape(isLandscape)
.rotated(isRotated)
.build();
CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(parsedElements.getRulings(), parsedElements.getMinCharWidth(), parsedElements.getMinCharHeight());
CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(parsedElements.getRulings(), parsedElements
.getMaxCharWidth(), parsedElements.getMaxCharHeight());
Page page = blockificationService.blockify(parsedElements.getSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
Page page = blockificationService.blockify(parsedElements.getSequences(), cleanRulings.getHorizontal(), cleanRulings
.getVertical());
page.setRotation(rotation);
tableExtractionService.extractTables(cleanRulings, page);
@ -91,7 +93,10 @@ public class PdfSegmentationService {
}
private void increaseDocumentStatistics(Page page, Document document) {
if (!page.isLandscape()) {
document.getFontSizeCounter().addAll(page.getFontSizeCounter().getCountPerValue());
}
@ -100,6 +105,7 @@ public class PdfSegmentationService {
document.getFontStyleCounter().addAll(page.getFontStyleCounter().getCountPerValue());
}
private void buildPageStatistics(Page page) {
// Collect all statistics for the page, except from blocks inside tables, as tables will always be added to BodyTextFrame.

View File

@ -18,9 +18,9 @@ import com.iqser.red.service.redaction.v1.server.tableextraction.utils.Utils;
@Service
public class RulingCleaningService {
public CleanRulings getCleanRulings(List<Ruling> rulings, float minCharWidth, float minCharHeight){
public CleanRulings getCleanRulings(List<Ruling> rulings, float maxCharWidth, float maxCharHeight){
if (!rulings.isEmpty()) {
snapPoints(rulings, minCharWidth , minCharHeight);
snapPoints(rulings, maxCharWidth , maxCharHeight);
}
List<Ruling> vrs = new ArrayList<>();

View File

@ -1,5 +1,6 @@
package com.iqser.red.service.redaction.v1.server;
import static org.assertj.core.api.Assertions.assertThat;
import static org.mockito.Mockito.when;
import static org.springframework.boot.test.context.SpringBootTest.WebEnvironment.DEFINED_PORT;
@ -269,7 +270,7 @@ public class RedactionIntegrationTest {
System.out.println("redactionTest");
long start = System.currentTimeMillis();
ClassPathResource pdfFileResource = new ClassPathResource("files/new/Thiabendazole DAR Addendum for ED_April_2020.pdf");
ClassPathResource pdfFileResource = new ClassPathResource("files/Fludioxonil/50 Fludioxonil_RAR_01_Volume_1_2018-02-21.pdf");
RedactionRequest request = RedactionRequest.builder()
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
@ -388,8 +389,7 @@ public class RedactionIntegrationTest {
public void htmlTablesTest() throws IOException {
System.out.println("htmlTablesTest");
ClassPathResource pdfFileResource = new ClassPathResource("files/Fludioxonil/51 " +
"Fludioxonil_RAR_02_Volume_2_2018-02-21.pdf");
ClassPathResource pdfFileResource = new ClassPathResource("files/Metolachlor/S-Metolachlor_RAR_02_Volume_2_2018-09-06.pdf");
RedactionRequest request = RedactionRequest.builder()
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
@ -422,6 +422,27 @@ public class RedactionIntegrationTest {
}
/**
 * Redacts the "Phantom Cells" minimal example with flat redaction disabled and
 * verifies that every redaction log entry which is not a hint carries the
 * "not a vertebrate study" reason.
 *
 * @throws IOException if the PDF resource cannot be read
 */
@Test
public void phantomCellsDocumentTest() throws IOException {

    ClassPathResource phantomCellsPdf = new ClassPathResource("files/Minimal Examples/Phantom Cells.pdf");
    byte[] pdfBytes = IOUtils.toByteArray(phantomCellsPdf.getInputStream());

    RedactionRequest request = RedactionRequest.builder().document(pdfBytes).build();
    request.setFlatRedaction(false);

    RedactionResult result = redactionController.redact(request);

    result.getRedactionLog().getRedactionLogEntry().forEach(logEntry -> {
        // Hints are not checked; only the remaining entries must carry the expected reason.
        if (logEntry.isHint()) {
            return;
        }
        assertThat(logEntry.getReason()).isEqualTo("Not redacted because row is not a vertebrate study");
    });
}
private static String loadFromClassPath(String path) {
URL resource = ResourceLoader.class.getClassLoader().getResource(path);