RED-1535: Improved document parsing, added orientation to textblock
This commit is contained in:
parent
837f97c157
commit
53ce6cb47c
@ -0,0 +1,6 @@
|
||||
package com.iqser.red.service.redaction.v1.server.classification.model;
|
||||
|
||||
/**
 * Horizontal side marker for a text container.
 *
 * <p>As used by {@code BlockificationService.blockify} in this change:
 * <ul>
 *   <li>{@code NONE} - the default value of {@code AbstractTextContainer.orientation};
 *       the block was not produced by a horizontal split.</li>
 *   <li>{@code LEFT} - set on a block that was closed because the next word sits on the
 *       same line but starts far to the right of the block (a split by X that is not
 *       caused by a ruling line).</li>
 *   <li>{@code RIGHT} - set on a block that was closed because a new line started after
 *       such a split (again only when no ruling line caused the break).</li>
 * </ul>
 *
 * <p>NOTE(review): the declaration order is kept as-is; whether any caller depends on
 * {@code ordinal()} cannot be seen from here - confirm before reordering.
 */
public enum Orientation {
|
||||
|
||||
NONE, LEFT, RIGHT
|
||||
}
|
||||
@ -32,6 +32,7 @@ public class TextBlock extends AbstractTextContainer {
|
||||
|
||||
private String classification;
|
||||
|
||||
|
||||
public TextBlock(float minX, float maxX, float minY, float maxY, List<TextPositionSequence> sequences, int rotation) {
|
||||
this.minX = minX;
|
||||
this.maxX = maxX;
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
package com.iqser.red.service.redaction.v1.server.classification.service;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.FloatFrequencyCounter;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Orientation;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.StringFrequencyCounter;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
||||
@ -11,16 +12,21 @@ import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Rectangle;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Ruling;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
||||
@Service
|
||||
@SuppressWarnings("all")
|
||||
public class BlockificationService {
|
||||
|
||||
public Page blockify(List<TextPositionSequence> textPositions, List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
|
||||
static final float THRESHOLD = 1f;
|
||||
|
||||
public Page blockify(List<TextPositionSequence> textPositions, List<Ruling> horizontalRulingLines,
|
||||
List<Ruling> verticalRulingLines) {
|
||||
|
||||
List<TextPositionSequence> chunkWords = new ArrayList<>();
|
||||
List<AbstractTextContainer> chunkBlockList1 = new ArrayList<>();
|
||||
@ -28,21 +34,37 @@ public class BlockificationService {
|
||||
float minX = 1000, maxX = 0, minY = 1000, maxY = 0;
|
||||
TextPositionSequence prev = null;
|
||||
|
||||
boolean wasSplitted = false;
|
||||
Float splitX1 = null;
|
||||
for (TextPositionSequence word : textPositions) {
|
||||
|
||||
boolean lineSeparation = minY - word.getY2() > word.getHeight() * 1.25;
|
||||
boolean startFromTop = word.getY1() > maxY + word.getHeight();
|
||||
boolean splitByX = prev != null && maxX + 50 < word.getX1() && prev.getY1() == word.getY1();
|
||||
boolean newLineAfterSplit = prev != null && word.getY1() != prev.getY1() && wasSplitted && splitX1 != word.getX1();
|
||||
boolean splittedByRuling = word.getRotation() == 0 && isSplittedByRuling(maxX, minY, word.getX1(), word.getY1(), verticalRulingLines) || word
|
||||
.getRotation() == 0 && isSplittedByRuling(minX, minY, word.getX1(), word.getY2(), horizontalRulingLines) || word
|
||||
.getRotation() == 90 && isSplittedByRuling(maxX, minY, word.getX1(), word.getY1(), horizontalRulingLines) || word
|
||||
.getRotation() == 90 && isSplittedByRuling(minX, minY, word.getX1(), word.getY2(), verticalRulingLines);
|
||||
|
||||
if (prev != null && (lineSeparation || startFromTop || word.getRotation() == 0 && isSplittedByRuling(maxX, minY, word
|
||||
.getX1(), word.getY1(), verticalRulingLines) || word.getRotation() == 0 && isSplittedByRuling(minX, minY, word
|
||||
.getX1(), word.getY2(), horizontalRulingLines) || word.getRotation() == 90 && isSplittedByRuling(maxX, minY, word
|
||||
.getX1(), word.getY1(), horizontalRulingLines) || word.getRotation() == 90 && isSplittedByRuling(minX, minY, word
|
||||
.getX1(), word.getY2(), verticalRulingLines))) {
|
||||
if (prev != null && (lineSeparation || startFromTop || splitByX || newLineAfterSplit || splittedByRuling)) {
|
||||
|
||||
TextBlock cb1 = buildTextBlock(chunkWords);
|
||||
chunkBlockList1.add(cb1);
|
||||
chunkWords = new ArrayList<>();
|
||||
|
||||
if (splitByX && !splittedByRuling) {
|
||||
wasSplitted = true;
|
||||
cb1.setOrientation(Orientation.LEFT);
|
||||
splitX1 = word.getX1();
|
||||
}
|
||||
|
||||
if (newLineAfterSplit && !splittedByRuling) {
|
||||
wasSplitted = false;
|
||||
cb1.setOrientation(Orientation.RIGHT);
|
||||
splitX1 = null;
|
||||
}
|
||||
|
||||
minX = 1000;
|
||||
maxX = 0;
|
||||
minY = 1000;
|
||||
@ -72,9 +94,62 @@ public class BlockificationService {
|
||||
chunkBlockList1.add(cb1);
|
||||
}
|
||||
|
||||
Iterator<AbstractTextContainer> itty = chunkBlockList1.iterator();
|
||||
|
||||
TextBlock previousLeft = null;
|
||||
TextBlock previousRight = null;
|
||||
while (itty.hasNext()) {
|
||||
TextBlock block = (TextBlock) itty.next();
|
||||
|
||||
if(previousLeft != null && block.getOrientation().equals(Orientation.LEFT)){
|
||||
if (previousLeft.getMinY() > block.getMinY() && block.getMaxY() + block.getMostPopularWordHeight() > previousLeft.getMinY()){
|
||||
previousLeft.add(block);
|
||||
itty.remove();
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
if(previousRight != null && block.getOrientation().equals(Orientation.RIGHT)){
|
||||
if (previousRight.getMinY() > block.getMinY() && block.getMaxY() + block.getMostPopularWordHeight() > previousRight.getMinY()){
|
||||
previousRight.add(block);
|
||||
itty.remove();
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
if (block.getOrientation().equals(Orientation.LEFT)) {
|
||||
previousLeft = block;
|
||||
} else if (block.getOrientation().equals(Orientation.RIGHT)) {
|
||||
previousRight = block;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
itty = chunkBlockList1.iterator();
|
||||
TextBlock previous = null;
|
||||
while (itty.hasNext()) {
|
||||
TextBlock block = (TextBlock) itty.next();
|
||||
|
||||
if(previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation().equals(Orientation.LEFT) && equalsWithThreshold(block.getMaxY(), previous
|
||||
.getMaxY())||
|
||||
previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation().equals(Orientation.RIGHT) && equalsWithThreshold(block.getMaxY(), previous
|
||||
.getMaxY())){
|
||||
previous.add(block);
|
||||
itty.remove();
|
||||
continue;
|
||||
}
|
||||
|
||||
previous = block;
|
||||
}
|
||||
|
||||
|
||||
return new Page(chunkBlockList1);
|
||||
}
|
||||
|
||||
/**
 * Tolerant equality check for two float coordinates.
 *
 * <p>In {@code blockify} it is used to compare the {@code maxY} of two consecutive
 * blocks before merging them.
 *
 * @param f1 first value
 * @param f2 second value
 * @return {@code true} if the absolute difference of {@code f1} and {@code f2} is
 *         strictly smaller than {@code THRESHOLD}; {@code false} otherwise (including
 *         when either argument is NaN, because the comparison with NaN is false)
 */
private boolean equalsWithThreshold(float f1, float f2){
|
||||
// Strict '<': a difference of exactly THRESHOLD is treated as "not equal".
return Math.abs(f1 - f2) < THRESHOLD;
|
||||
}
|
||||
|
||||
|
||||
private TextBlock buildTextBlock(List<TextPositionSequence> wordBlockList) {
|
||||
|
||||
@ -117,7 +192,8 @@ public class BlockificationService {
|
||||
}
|
||||
|
||||
|
||||
private boolean isSplittedByRuling(float previousX2, float previousY1, float currentX1, float currentY1, List<Ruling> rulingLines) {
|
||||
private boolean isSplittedByRuling(float previousX2, float previousY1, float currentX1, float currentY1,
|
||||
List<Ruling> rulingLines) {
|
||||
|
||||
for (Ruling ruling : rulingLines) {
|
||||
if (ruling.intersectsLine(previousX2, previousY1, currentX1, currentY1)) {
|
||||
@ -128,7 +204,8 @@ public class BlockificationService {
|
||||
}
|
||||
|
||||
|
||||
public Rectangle calculateBodyTextFrame(List<Page> pages, FloatFrequencyCounter documentFontSizeCounter, boolean landscape) {
|
||||
public Rectangle calculateBodyTextFrame(List<Page> pages, FloatFrequencyCounter documentFontSizeCounter,
|
||||
boolean landscape) {
|
||||
|
||||
float minX = 10000;
|
||||
float maxX = -100;
|
||||
|
||||
@ -2,6 +2,8 @@ package com.iqser.red.service.redaction.v1.server.tableextraction.model;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import com.iqser.red.service.redaction.v1.model.Rectangle;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Orientation;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
@ -18,6 +20,8 @@ public abstract class AbstractTextContainer {
|
||||
protected String classification;
|
||||
protected int page;
|
||||
|
||||
private Orientation orientation = Orientation.NONE;
|
||||
|
||||
public abstract String getText();
|
||||
|
||||
public boolean contains(AbstractTextContainer other) {
|
||||
|
||||
@ -102,7 +102,7 @@ public class PdfVisualisationService {
|
||||
|
||||
contentStream.newLineAtOffset(textBlock.getMinX(), textBlock.getMaxY());
|
||||
|
||||
contentStream.showText(textBlock.getClassification());
|
||||
contentStream.showText(textBlock.getClassification() + textBlock.getOrientation());
|
||||
|
||||
contentStream.endText();
|
||||
}
|
||||
|
||||
@ -680,8 +680,8 @@ public class RedactionIntegrationTest {
|
||||
dictionary.get(AUTHOR).add("physical");
|
||||
reanlysisVersions.put("physical", 2L);
|
||||
|
||||
dictionary.get(VERTEBRATE).add("s-metolachlor");
|
||||
reanlysisVersions.put("s-metolachlor", 3L);
|
||||
// dictionary.get(VERTEBRATE).add("s-metolachlor");
|
||||
// reanlysisVersions.put("s-metolachlor", 3L);
|
||||
|
||||
when(dictionaryClient.getVersion(TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(3L);
|
||||
|
||||
@ -805,7 +805,7 @@ public class RedactionIntegrationTest {
|
||||
public void classificationTest() throws IOException {
|
||||
|
||||
System.out.println("classificationTest");
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Trinexapac/93 Trinexapac-ethyl_RAR_03_Volume_3CA_B-1_2017-03-31.pdf");
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/new/Single Study - Oral (Gavage) Mouse.pdf");
|
||||
|
||||
|
||||
AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream());
|
||||
|
||||
@ -1652,3 +1652,4 @@ Zoecon Corp.
|
||||
Zoecon Corp., Palo Alto, USA
|
||||
Zyma SA
|
||||
Zyma SA, Nyon, Switzerland
|
||||
Mambo-Tox Ltd. Biomedical Sciences Building Bassett Crescent East Southampton SO16 7PX UK
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user