RED-1535: Improved document parsing, added orientation to textblock

This commit is contained in:
Dominique Eifländer 2021-06-01 10:47:55 +02:00
parent 837f97c157
commit 53ce6cb47c
7 changed files with 101 additions and 12 deletions

View File

@ -0,0 +1,6 @@
package com.iqser.red.service.redaction.v1.server.classification.model;
/**
 * Horizontal placement tag that can be attached to a text container.
 *
 * <p>The declaration order (and therefore each constant's ordinal) is
 * {@code NONE}, {@code LEFT}, {@code RIGHT}.
 */
public enum Orientation {

    /** No particular placement; used as the initial value of a container's orientation. */
    NONE,

    /**
     * Marks a block that ends where a wide horizontal gap starts on the same line.
     * NOTE(review): presumably the left-hand column of a two-column layout - confirm.
     */
    LEFT,

    /**
     * Marks a block that is closed when a new line begins after such a gap.
     * NOTE(review): presumably the right-hand column of a two-column layout - confirm.
     */
    RIGHT
}

View File

@ -32,6 +32,7 @@ public class TextBlock extends AbstractTextContainer {
private String classification;
public TextBlock(float minX, float maxX, float minY, float maxY, List<TextPositionSequence> sequences, int rotation) {
this.minX = minX;
this.maxX = maxX;

View File

@ -1,6 +1,7 @@
package com.iqser.red.service.redaction.v1.server.classification.service;
import com.iqser.red.service.redaction.v1.server.classification.model.FloatFrequencyCounter;
import com.iqser.red.service.redaction.v1.server.classification.model.Orientation;
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
import com.iqser.red.service.redaction.v1.server.classification.model.StringFrequencyCounter;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
@ -11,16 +12,21 @@ import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Rectangle;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Ruling;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
import org.springframework.stereotype.Service;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
@Service
@SuppressWarnings("all")
public class BlockificationService {
public Page blockify(List<TextPositionSequence> textPositions, List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
static final float THRESHOLD = 1f;
public Page blockify(List<TextPositionSequence> textPositions, List<Ruling> horizontalRulingLines,
List<Ruling> verticalRulingLines) {
List<TextPositionSequence> chunkWords = new ArrayList<>();
List<AbstractTextContainer> chunkBlockList1 = new ArrayList<>();
@ -28,21 +34,37 @@ public class BlockificationService {
float minX = 1000, maxX = 0, minY = 1000, maxY = 0;
TextPositionSequence prev = null;
boolean wasSplitted = false;
Float splitX1 = null;
for (TextPositionSequence word : textPositions) {
boolean lineSeparation = minY - word.getY2() > word.getHeight() * 1.25;
boolean startFromTop = word.getY1() > maxY + word.getHeight();
boolean splitByX = prev != null && maxX + 50 < word.getX1() && prev.getY1() == word.getY1();
boolean newLineAfterSplit = prev != null && word.getY1() != prev.getY1() && wasSplitted && splitX1 != word.getX1();
boolean splittedByRuling = word.getRotation() == 0 && isSplittedByRuling(maxX, minY, word.getX1(), word.getY1(), verticalRulingLines) || word
.getRotation() == 0 && isSplittedByRuling(minX, minY, word.getX1(), word.getY2(), horizontalRulingLines) || word
.getRotation() == 90 && isSplittedByRuling(maxX, minY, word.getX1(), word.getY1(), horizontalRulingLines) || word
.getRotation() == 90 && isSplittedByRuling(minX, minY, word.getX1(), word.getY2(), verticalRulingLines);
if (prev != null && (lineSeparation || startFromTop || word.getRotation() == 0 && isSplittedByRuling(maxX, minY, word
.getX1(), word.getY1(), verticalRulingLines) || word.getRotation() == 0 && isSplittedByRuling(minX, minY, word
.getX1(), word.getY2(), horizontalRulingLines) || word.getRotation() == 90 && isSplittedByRuling(maxX, minY, word
.getX1(), word.getY1(), horizontalRulingLines) || word.getRotation() == 90 && isSplittedByRuling(minX, minY, word
.getX1(), word.getY2(), verticalRulingLines))) {
if (prev != null && (lineSeparation || startFromTop || splitByX || newLineAfterSplit || splittedByRuling)) {
TextBlock cb1 = buildTextBlock(chunkWords);
chunkBlockList1.add(cb1);
chunkWords = new ArrayList<>();
if (splitByX && !splittedByRuling) {
wasSplitted = true;
cb1.setOrientation(Orientation.LEFT);
splitX1 = word.getX1();
}
if (newLineAfterSplit && !splittedByRuling) {
wasSplitted = false;
cb1.setOrientation(Orientation.RIGHT);
splitX1 = null;
}
minX = 1000;
maxX = 0;
minY = 1000;
@ -72,9 +94,62 @@ public class BlockificationService {
chunkBlockList1.add(cb1);
}
Iterator<AbstractTextContainer> itty = chunkBlockList1.iterator();
TextBlock previousLeft = null;
TextBlock previousRight = null;
while (itty.hasNext()) {
TextBlock block = (TextBlock) itty.next();
if(previousLeft != null && block.getOrientation().equals(Orientation.LEFT)){
if (previousLeft.getMinY() > block.getMinY() && block.getMaxY() + block.getMostPopularWordHeight() > previousLeft.getMinY()){
previousLeft.add(block);
itty.remove();
continue;
}
}
if(previousRight != null && block.getOrientation().equals(Orientation.RIGHT)){
if (previousRight.getMinY() > block.getMinY() && block.getMaxY() + block.getMostPopularWordHeight() > previousRight.getMinY()){
previousRight.add(block);
itty.remove();
continue;
}
}
if (block.getOrientation().equals(Orientation.LEFT)) {
previousLeft = block;
} else if (block.getOrientation().equals(Orientation.RIGHT)) {
previousRight = block;
}
}
itty = chunkBlockList1.iterator();
TextBlock previous = null;
while (itty.hasNext()) {
TextBlock block = (TextBlock) itty.next();
if(previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation().equals(Orientation.LEFT) && equalsWithThreshold(block.getMaxY(), previous
.getMaxY())||
previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation().equals(Orientation.RIGHT) && equalsWithThreshold(block.getMaxY(), previous
.getMaxY())){
previous.add(block);
itty.remove();
continue;
}
previous = block;
}
return new Page(chunkBlockList1);
}
/**
 * Checks whether two coordinates are close enough to be treated as equal.
 *
 * @param f1 first value
 * @param f2 second value
 * @return {@code true} when the absolute difference between {@code f1} and
 *         {@code f2} is strictly smaller than {@code THRESHOLD}; {@code false}
 *         otherwise (including when the difference is NaN)
 */
private boolean equalsWithThreshold(float f1, float f2) {
    final float distance = Math.abs(f1 - f2);
    return distance < THRESHOLD;
}
private TextBlock buildTextBlock(List<TextPositionSequence> wordBlockList) {
@ -117,7 +192,8 @@ public class BlockificationService {
}
private boolean isSplittedByRuling(float previousX2, float previousY1, float currentX1, float currentY1, List<Ruling> rulingLines) {
private boolean isSplittedByRuling(float previousX2, float previousY1, float currentX1, float currentY1,
List<Ruling> rulingLines) {
for (Ruling ruling : rulingLines) {
if (ruling.intersectsLine(previousX2, previousY1, currentX1, currentY1)) {
@ -128,7 +204,8 @@ public class BlockificationService {
}
public Rectangle calculateBodyTextFrame(List<Page> pages, FloatFrequencyCounter documentFontSizeCounter, boolean landscape) {
public Rectangle calculateBodyTextFrame(List<Page> pages, FloatFrequencyCounter documentFontSizeCounter,
boolean landscape) {
float minX = 10000;
float maxX = -100;

View File

@ -2,6 +2,8 @@ package com.iqser.red.service.redaction.v1.server.tableextraction.model;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.iqser.red.service.redaction.v1.model.Rectangle;
import com.iqser.red.service.redaction.v1.server.classification.model.Orientation;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
@ -18,6 +20,8 @@ public abstract class AbstractTextContainer {
protected String classification;
protected int page;
private Orientation orientation = Orientation.NONE;
public abstract String getText();
public boolean contains(AbstractTextContainer other) {

View File

@ -102,7 +102,7 @@ public class PdfVisualisationService {
contentStream.newLineAtOffset(textBlock.getMinX(), textBlock.getMaxY());
contentStream.showText(textBlock.getClassification());
contentStream.showText(textBlock.getClassification() + textBlock.getOrientation());
contentStream.endText();
}

View File

@ -680,8 +680,8 @@ public class RedactionIntegrationTest {
dictionary.get(AUTHOR).add("physical");
reanlysisVersions.put("physical", 2L);
dictionary.get(VERTEBRATE).add("s-metolachlor");
reanlysisVersions.put("s-metolachlor", 3L);
// dictionary.get(VERTEBRATE).add("s-metolachlor");
// reanlysisVersions.put("s-metolachlor", 3L);
when(dictionaryClient.getVersion(TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(3L);
@ -805,7 +805,7 @@ public class RedactionIntegrationTest {
public void classificationTest() throws IOException {
System.out.println("classificationTest");
ClassPathResource pdfFileResource = new ClassPathResource("files/Trinexapac/93 Trinexapac-ethyl_RAR_03_Volume_3CA_B-1_2017-03-31.pdf");
ClassPathResource pdfFileResource = new ClassPathResource("files/new/Single Study - Oral (Gavage) Mouse.pdf");
AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream());

View File

@ -1652,3 +1652,4 @@ Zoecon Corp.
Zoecon Corp., Palo Alto, USA
Zyma SA
Zyma SA, Nyon, Switzerland
Mambo-Tox Ltd. Biomedical Sciences Building Bassett Crescent East Southampton SO16 7PX UK