Pull request #43: RED-264: Avoid phantom cells by merging lines up to the rounded value of the largest character height/width
Merge in RED/redaction-service from RED-264 to master. * commit 'e1adcfb02c4c7d46140884b3d38f8daf1e1aae92': Added missing unit test; RED-264: Avoid phantom cells by merging lines up to the rounded value of the largest character height/width
This commit is contained in:
commit
8b1574845c
@ -44,10 +44,10 @@ import lombok.extern.slf4j.Slf4j;
|
||||
public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
|
||||
@Getter
|
||||
private float minCharWidth = Float.MAX_VALUE;
|
||||
private int maxCharWidths;
|
||||
|
||||
@Getter
|
||||
private float minCharHeight = Float.MAX_VALUE;
|
||||
private int maxCharHeight;
|
||||
|
||||
@Getter
|
||||
private final List<TextPositionSequence> textPositionSequences = new ArrayList<>();
|
||||
@ -201,8 +201,16 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
|
||||
int startIndex = 0;
|
||||
for (int i = 0; i <= textPositions.size() - 1; i++) {
|
||||
minCharWidth = Math.min(minCharWidth, textPositions.get(i).getWidthDirAdj());
|
||||
minCharHeight = Math.min(minCharHeight, textPositions.get(i).getHeightDir());
|
||||
|
||||
int charHeight = (int) textPositions.get(i).getHeightDir();
|
||||
if(charHeight > maxCharHeight){
|
||||
maxCharHeight = charHeight;
|
||||
}
|
||||
|
||||
int charWidth = (int) textPositions.get(i).getWidthDirAdj();
|
||||
if(charWidth > maxCharWidths){
|
||||
maxCharWidths = charWidth;
|
||||
}
|
||||
|
||||
if (i == 0 && textPositions.get(i).getUnicode().equals(" ")) {
|
||||
startIndex++;
|
||||
@ -241,8 +249,8 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
@Override
|
||||
public String getText(PDDocument doc) throws IOException {
|
||||
|
||||
minCharWidth = Float.MAX_VALUE;
|
||||
minCharHeight = Float.MAX_VALUE;
|
||||
maxCharWidths = 0;
|
||||
// Reset the maximum character height too, so it does not carry over from a previous getText call.
maxCharHeight = 0;
|
||||
textPositionSequences.clear();
|
||||
rulings.clear();
|
||||
graphicsPath.clear();
|
||||
|
||||
@ -17,6 +17,6 @@ public class ParsedElements {
|
||||
private boolean landscape;
|
||||
private boolean rotated;
|
||||
|
||||
private float minCharWidth;
|
||||
private float minCharHeight;
|
||||
private float maxCharWidth;
|
||||
private float maxCharHeight;
|
||||
}
|
||||
|
||||
@ -21,7 +21,6 @@ import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractT
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.CleanRulings;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.service.RulingCleaningService;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.service.TableExtractionService;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.utils.Utils;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
@ -37,6 +36,7 @@ public class PdfSegmentationService {
|
||||
private final ClassificationService classificationService;
|
||||
private final SectionsBuilderService sectionsBuilderService;
|
||||
|
||||
|
||||
public Document parseDocument(PDDocument pdDocument) throws IOException {
|
||||
|
||||
Document document = new Document();
|
||||
@ -56,19 +56,21 @@ public class PdfSegmentationService {
|
||||
int rotation = pdPage.getRotation();
|
||||
boolean isRotated = rotation != 0 && rotation != 360;
|
||||
|
||||
ParsedElements parsedElements = ParsedElements
|
||||
.builder()
|
||||
|
||||
ParsedElements parsedElements = ParsedElements.builder()
|
||||
.rulings(stripper.getRulings())
|
||||
.sequences(stripper.getTextPositionSequences())
|
||||
.minCharWidth(Utils.round(stripper.getMinCharWidth(), 2))
|
||||
.minCharHeight(Utils.round(stripper.getMinCharHeight(), 2))
|
||||
.maxCharWidth(stripper.getMaxCharWidths())
|
||||
// The height value must come from the height getter, not the width getter.
.maxCharHeight(stripper.getMaxCharHeight())
|
||||
.landscape(isLandscape)
|
||||
.rotated(isRotated)
|
||||
.build();
|
||||
|
||||
CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(parsedElements.getRulings(), parsedElements.getMinCharWidth(), parsedElements.getMinCharHeight());
|
||||
CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(parsedElements.getRulings(), parsedElements
|
||||
.getMaxCharWidth(), parsedElements.getMaxCharHeight());
|
||||
|
||||
Page page = blockificationService.blockify(parsedElements.getSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
Page page = blockificationService.blockify(parsedElements.getSequences(), cleanRulings.getHorizontal(), cleanRulings
|
||||
.getVertical());
|
||||
page.setRotation(rotation);
|
||||
|
||||
tableExtractionService.extractTables(cleanRulings, page);
|
||||
@ -91,7 +93,10 @@ public class PdfSegmentationService {
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
private void increaseDocumentStatistics(Page page, Document document) {
|
||||
|
||||
if (!page.isLandscape()) {
|
||||
document.getFontSizeCounter().addAll(page.getFontSizeCounter().getCountPerValue());
|
||||
}
|
||||
@ -100,6 +105,7 @@ public class PdfSegmentationService {
|
||||
document.getFontStyleCounter().addAll(page.getFontStyleCounter().getCountPerValue());
|
||||
}
|
||||
|
||||
|
||||
private void buildPageStatistics(Page page) {
|
||||
|
||||
// Collect all statistics for the page, except from blocks inside tables, as tables will always be added to BodyTextFrame.
|
||||
|
||||
@ -18,9 +18,9 @@ import com.iqser.red.service.redaction.v1.server.tableextraction.utils.Utils;
|
||||
@Service
|
||||
public class RulingCleaningService {
|
||||
|
||||
public CleanRulings getCleanRulings(List<Ruling> rulings, float minCharWidth, float minCharHeight){
|
||||
public CleanRulings getCleanRulings(List<Ruling> rulings, float maxCharWidth, float maxCharHeight){
|
||||
if (!rulings.isEmpty()) {
|
||||
snapPoints(rulings, minCharWidth , minCharHeight);
|
||||
snapPoints(rulings, maxCharWidth , maxCharHeight);
|
||||
}
|
||||
|
||||
List<Ruling> vrs = new ArrayList<>();
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
package com.iqser.red.service.redaction.v1.server;
|
||||
|
||||
import static org.assertj.core.api.Assertions.assertThat;
|
||||
import static org.mockito.Mockito.when;
|
||||
import static org.springframework.boot.test.context.SpringBootTest.WebEnvironment.DEFINED_PORT;
|
||||
|
||||
@ -269,7 +270,7 @@ public class RedactionIntegrationTest {
|
||||
|
||||
System.out.println("redactionTest");
|
||||
long start = System.currentTimeMillis();
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/new/Thiabendazole DAR Addendum for ED_April_2020.pdf");
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Fludioxonil/50 Fludioxonil_RAR_01_Volume_1_2018-02-21.pdf");
|
||||
|
||||
RedactionRequest request = RedactionRequest.builder()
|
||||
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
|
||||
@ -388,8 +389,7 @@ public class RedactionIntegrationTest {
|
||||
public void htmlTablesTest() throws IOException {
|
||||
|
||||
System.out.println("htmlTablesTest");
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Fludioxonil/51 " +
|
||||
"Fludioxonil_RAR_02_Volume_2_2018-02-21.pdf");
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Metolachlor/S-Metolachlor_RAR_02_Volume_2_2018-09-06.pdf");
|
||||
|
||||
RedactionRequest request = RedactionRequest.builder()
|
||||
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
|
||||
@ -422,6 +422,27 @@ public class RedactionIntegrationTest {
|
||||
}
|
||||
|
||||
|
||||
/**
 * Redacts the "Phantom Cells" minimal example PDF from the classpath and checks the
 * reason recorded on each redaction log entry that is not a hint.
 *
 * Every such entry must carry the reason
 * "Not redacted because row is not a vertebrate study".
 *
 * NOTE(review): the assertion runs inside a forEach, so the test also passes when the
 * log contains no non-hint entries at all; confirm whether that is intended.
 *
 * @throws IOException if the PDF resource cannot be read
 */
@Test
|
||||
public void phantomCellsDocumentTest() throws IOException {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Phantom Cells.pdf");
|
||||
|
||||
RedactionRequest request = RedactionRequest.builder()
|
||||
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
|
||||
.build();
|
||||
// Flat redaction is explicitly switched off for this request.
request.setFlatRedaction(false);
|
||||
|
||||
RedactionResult result = redactionController.redact(request);
|
||||
|
||||
// Only entries that are not hints are checked; hint entries are ignored.
result.getRedactionLog().getRedactionLogEntry().forEach(entry -> {
|
||||
if(!entry.isHint()){
|
||||
assertThat(entry.getReason()).isEqualTo("Not redacted because row is not a vertebrate study");
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
|
||||
private static String loadFromClassPath(String path) {
|
||||
|
||||
URL resource = ResourceLoader.class.getClassLoader().getResource(path);
|
||||
|
||||
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user