Pull request #259: RED-2536

Merge in RED/redaction-service from RED-2536 to master

* commit 'd892d6e81ee12927e57dc46c6439a65276824896':
  RED-2536: Treat \t same as whitespace
  RED-2223: Fixed strange ends of text position sequences that led to wrong whitespaces
This commit is contained in:
Dominique Eiflaender 2021-10-28 12:52:47 +02:00
commit 65b186be28
11 changed files with 175 additions and 62 deletions

View File

@ -1,5 +1,7 @@
package com.iqser.red.service.redaction.v1.server.classification.service;
import static java.util.stream.Collectors.toSet;
import com.iqser.red.service.redaction.v1.server.classification.model.FloatFrequencyCounter;
import com.iqser.red.service.redaction.v1.server.classification.model.Orientation;
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
@ -16,6 +18,7 @@ import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
import org.springframework.stereotype.Service;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
@ -25,9 +28,12 @@ public class BlockificationService {
static final float THRESHOLD = 1f;
public Page blockify(List<TextPositionSequence> textPositions, List<Ruling> horizontalRulingLines,
List<Ruling> verticalRulingLines) {
sortRotatedSequences(textPositions);
List<TextPositionSequence> chunkWords = new ArrayList<>();
List<AbstractTextContainer> chunkBlockList1 = new ArrayList<>();
@ -50,7 +56,7 @@ public class BlockificationService {
if (prev != null && (lineSeparation || startFromTop || splitByX || newLineAfterSplit || splittedByRuling)) {
Orientation prevOrientation = null;
if(!chunkBlockList1.isEmpty()) {
if (!chunkBlockList1.isEmpty()) {
prevOrientation = chunkBlockList1.get(chunkBlockList1.size() - 1).getOrientation();
}
@ -62,15 +68,11 @@ public class BlockificationService {
wasSplitted = true;
cb1.setOrientation(Orientation.LEFT);
splitX1 = word.getX1();
} else
if (newLineAfterSplit && !splittedByRuling) {
} else if (newLineAfterSplit && !splittedByRuling) {
wasSplitted = false;
cb1.setOrientation(Orientation.RIGHT);
splitX1 = null;
} else
if(prevOrientation != null && prevOrientation.equals(Orientation.RIGHT) && (lineSeparation || !startFromTop || !splitByX || !newLineAfterSplit || !splittedByRuling)){
} else if (prevOrientation != null && prevOrientation.equals(Orientation.RIGHT) && (lineSeparation || !startFromTop || !splitByX || !newLineAfterSplit || !splittedByRuling)) {
cb1.setOrientation(Orientation.LEFT);
}
@ -110,16 +112,18 @@ public class BlockificationService {
while (itty.hasNext()) {
TextBlock block = (TextBlock) itty.next();
if(previousLeft != null && block.getOrientation().equals(Orientation.LEFT)){
if (previousLeft.getMinY() > block.getMinY() && block.getMaxY() + block.getMostPopularWordHeight() > previousLeft.getMinY()){
if (previousLeft != null && block.getOrientation().equals(Orientation.LEFT)) {
if (previousLeft.getMinY() > block.getMinY() && block.getMaxY() + block.getMostPopularWordHeight() > previousLeft
.getMinY()) {
previousLeft.add(block);
itty.remove();
continue;
}
}
if(previousRight != null && block.getOrientation().equals(Orientation.RIGHT)){
if (previousRight.getMinY() > block.getMinY() && block.getMaxY() + block.getMostPopularWordHeight() > previousRight.getMinY()){
if (previousRight != null && block.getOrientation().equals(Orientation.RIGHT)) {
if (previousRight.getMinY() > block.getMinY() && block.getMaxY() + block.getMostPopularWordHeight() > previousRight
.getMinY()) {
previousRight.add(block);
itty.remove();
continue;
@ -133,16 +137,16 @@ public class BlockificationService {
}
}
itty = chunkBlockList1.iterator();
TextBlock previous = null;
while (itty.hasNext()) {
TextBlock block = (TextBlock) itty.next();
if(previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation().equals(Orientation.LEFT) && equalsWithThreshold(block.getMaxY(), previous
.getMaxY())||
previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation().equals(Orientation.RIGHT) && equalsWithThreshold(block.getMaxY(), previous
.getMaxY())){
if (previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation()
.equals(Orientation.LEFT) && equalsWithThreshold(block.getMaxY(), previous.getMaxY()) || previous != null && previous
.getOrientation()
.equals(Orientation.LEFT) && block.getOrientation()
.equals(Orientation.RIGHT) && equalsWithThreshold(block.getMaxY(), previous.getMaxY())) {
previous.add(block);
itty.remove();
continue;
@ -151,11 +155,12 @@ public class BlockificationService {
previous = block;
}
return new Page(chunkBlockList1);
}
private boolean equalsWithThreshold(float f1, float f2){
/**
 * Checks whether two float values are equal within the class-level tolerance.
 *
 * @param f1 first value
 * @param f2 second value
 * @return {@code true} if the absolute difference of both values is strictly below {@code THRESHOLD}
 */
private boolean equalsWithThreshold(float f1, float f2) {
    final float distance = Math.abs(f1 - f2);
    return distance < THRESHOLD;
}
@ -197,6 +202,13 @@ public class BlockificationService {
textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest());
}
if (textBlock != null && textBlock.getSequences() != null && textBlock.getSequences()
.stream()
.map(t -> round(t.getY1(), 3))
.collect(toSet())
.size() == 1) {
textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getX1));
}
return textBlock;
}
@ -291,4 +303,30 @@ public class BlockificationService {
return new Rectangle(minY, minX, maxX - minX, maxY - minY);
}
/**
 * Moves every sequence whose first text position reports a direction of 270 to the end of
 * the given list. The list is modified in place.
 * <p>
 * The moved sequences are ordered by their X1 value, but only if at least one sequence with
 * another direction remains in the list; otherwise they keep their original relative order.
 *
 * @param sequences the sequences to reorder in place
 */
private void sortRotatedSequences(List<TextPositionSequence> sequences) {

    List<TextPositionSequence> rotated = new ArrayList<>();
    for (Iterator<TextPositionSequence> cursor = sequences.iterator(); cursor.hasNext(); ) {
        TextPositionSequence candidate = cursor.next();
        if (candidate.getTextPositions().get(0).getDir() != 270) {
            continue;
        }
        rotated.add(candidate);
        cursor.remove();
    }

    // Sorting is skipped when either group is empty.
    boolean bothGroupsPresent = !(rotated.isEmpty() || sequences.isEmpty());
    if (bothGroupsPresent) {
        rotated.sort(Comparator.comparing(TextPositionSequence::getX1));
    }

    sequences.addAll(rotated);
}
/**
 * Rounds a float to the given number of decimal places.
 *
 * @param value         the value to round
 * @param decimalPoints number of decimal places to keep
 * @return the value scaled by {@code 10^decimalPoints}, rounded with {@link Math#round(double)}
 *         and scaled back
 */
private double round(float value, int decimalPoints) {
    final double scale = Math.pow(10, decimalPoints);
    final long scaled = Math.round(value * scale);
    return scaled / scale;
}
}

View File

@ -1,16 +1,33 @@
package com.iqser.red.service.redaction.v1.server.parsing;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Ruling;
import lombok.Getter;
import lombok.Setter;
import lombok.extern.slf4j.Slf4j;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.lang3.reflect.FieldUtils;
import org.apache.pdfbox.contentstream.operator.Operator;
import org.apache.pdfbox.contentstream.operator.OperatorName;
import org.apache.pdfbox.contentstream.operator.color.*;
import org.apache.pdfbox.contentstream.operator.state.*;
import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingColor;
import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingColorN;
import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingColorSpace;
import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingDeviceCMYKColor;
import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingDeviceGrayColor;
import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingDeviceRGBColor;
import org.apache.pdfbox.contentstream.operator.color.SetStrokingColor;
import org.apache.pdfbox.contentstream.operator.color.SetStrokingColorN;
import org.apache.pdfbox.contentstream.operator.color.SetStrokingColorSpace;
import org.apache.pdfbox.contentstream.operator.color.SetStrokingDeviceCMYKColor;
import org.apache.pdfbox.contentstream.operator.color.SetStrokingDeviceGrayColor;
import org.apache.pdfbox.contentstream.operator.color.SetStrokingDeviceRGBColor;
import org.apache.pdfbox.contentstream.operator.state.SetFlatness;
import org.apache.pdfbox.contentstream.operator.state.SetLineCapStyle;
import org.apache.pdfbox.contentstream.operator.state.SetLineDashPattern;
import org.apache.pdfbox.contentstream.operator.state.SetLineJoinStyle;
import org.apache.pdfbox.contentstream.operator.state.SetLineMiterLimit;
import org.apache.pdfbox.contentstream.operator.state.SetLineWidth;
import org.apache.pdfbox.contentstream.operator.state.SetRenderingIntent;
import org.apache.pdfbox.contentstream.operator.text.SetFontAndSize;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSName;
@ -23,12 +40,14 @@ import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
import org.apache.pdfbox.util.Matrix;
import java.awt.geom.AffineTransform;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import com.iqser.red.service.redaction.v1.server.parsing.model.RedTextPosition;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Ruling;
import lombok.Getter;
import lombok.Setter;
import lombok.extern.slf4j.Slf4j;
@Slf4j
public class PDFLinesTextStripper extends PDFTextStripper {
@ -189,16 +208,19 @@ public class PDFLinesTextStripper extends PDFTextStripper {
COSName objectName = (COSName) arguments.get(0);
PDXObject xobject = getResources().getXObject(objectName);
if (xobject instanceof PDImageXObject) {
PDImageXObject image = (PDImageXObject)xobject;
PDImageXObject image = (PDImageXObject) xobject;
Matrix ctmNew = getGraphicsState().getCurrentTransformationMatrix();
Rectangle2D rect = new Rectangle2D.Float(ctmNew.getTranslateX(), ctmNew.getTranslateY(), ctmNew.getScaleX(), ctmNew.getScaleY());
Rectangle2D rect = new Rectangle2D.Float(ctmNew.getTranslateX(), ctmNew.getTranslateY(), ctmNew.getScaleX(), ctmNew
.getScaleY());
// Memory hack - SoftReference kills me, so cachedImageSubsampling is overwritten with -1 via reflection
FieldUtils.writeField(image, "cachedImageSubsampling", -1, true);
if (rect.getHeight() > 2 && rect.getWidth() > 2) {
this.images.add(new PdfImage(image.getImage(), rect, pageNumber, image.getImage().getColorModel().hasAlpha()));
this.images.add(new PdfImage(image.getImage(), rect, pageNumber, image.getImage()
.getColorModel()
.hasAlpha()));
}
}
} catch (Exception e) {
@ -207,8 +229,6 @@ public class PDFLinesTextStripper extends PDFTextStripper {
}
private float floatValue(COSBase value) {
if (value instanceof COSNumber) {
@ -247,8 +267,16 @@ public class PDFLinesTextStripper extends PDFTextStripper {
public void writeString(String text, List<TextPosition> textPositions) throws IOException {
int startIndex = 0;
RedTextPosition previous = null;
for (int i = 0; i <= textPositions.size() - 1; i++) {
if (!textPositionSequences.isEmpty()) {
previous = textPositionSequences.get(textPositionSequences.size() - 1)
.getTextPositions()
.get(textPositionSequences.get(textPositionSequences.size() - 1).getTextPositions().size() - 1);
}
int charWidth = (int) textPositions.get(i).getWidthDirAdj();
if (charWidth < minCharWidth) {
minCharWidth = charWidth;
@ -267,42 +295,54 @@ public class PDFLinesTextStripper extends PDFTextStripper {
if (i == 0 && (textPositions.get(i).getUnicode().equals(" ") || textPositions.get(i)
.getUnicode()
.equals("\u00A0"))) {
.equals("\u00A0") || textPositions.get(i)
.getUnicode()
.equals("\t"))) {
startIndex++;
continue;
}
// Strange but sometimes this is happening, for example: Metolachlor2.pdf
if (i > 0 && textPositions.get(i).getX() < textPositions.get(i - 1).getX()) {
if (i > 0 && textPositions.get(i).getXDirAdj() < textPositions.get(i - 1).getXDirAdj()) {
List<TextPosition> sublist = textPositions.subList(startIndex, i);
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0)
.getUnicode()
.equals(" ") || sublist.get(0).getUnicode().equals("\u00A0")))) {
.equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0).getUnicode().equals("\t")))) {
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber));
}
startIndex = i;
}
if (textPositions.get(i).getRotation() == 0 && i > 0 && textPositions.get(i).getX() > textPositions.get(i - 1).getEndX() + 1) {
if (textPositions.get(i).getRotation() == 0 && i > 0 && textPositions.get(i)
.getX() > textPositions.get(i - 1).getEndX() + 1) {
List<TextPosition> sublist = textPositions.subList(startIndex, i);
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0)
.getUnicode()
.equals(" ") || sublist.get(0).getUnicode().equals("\u00A0")))) {
.equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0).getUnicode().equals("\t")))) {
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber));
}
startIndex = i;
}
if (i > 0 && (textPositions.get(i).getUnicode().equals(" ") || textPositions.get(i)
.getUnicode()
.equals("\u00A0")) && i <= textPositions.size() - 2) {
.equals("\u00A0") || textPositions.get(i)
.getUnicode()
.equals("\t")) && i <= textPositions.size() - 2) {
List<TextPosition> sublist = textPositions.subList(startIndex, i);
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0)
.getUnicode()
.equals(" ") || sublist.get(0).getUnicode().equals("\u00A0")))) {
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber));
.equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0).getUnicode().equals("\t")))) {
// Remove false sequence ends (whitespaces)
if (previous != null && sublist.get(0).getYDirAdj() == previous.getYDirAdj() && sublist.get(0)
.getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()) < 0.01) {
for (TextPosition t : sublist) {
textPositionSequences.get(textPositionSequences.size() - 1).add(t);
}
} else {
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber));
}
}
startIndex = i + 1;
}
@ -311,13 +351,23 @@ public class PDFLinesTextStripper extends PDFTextStripper {
List<TextPosition> sublist = textPositions.subList(startIndex, textPositions.size());
if (!sublist.isEmpty() && (sublist.get(sublist.size() - 1)
.getUnicode()
.equals(" ") || sublist.get(sublist.size() - 1).getUnicode().equals("\u00A0"))) {
.equals(" ") || sublist.get(sublist.size() - 1).getUnicode().equals("\u00A0") || sublist.get(sublist.size() - 1).getUnicode().equals("\t"))) {
sublist = sublist.subList(0, sublist.size() - 1);
}
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0)
.getUnicode()
.equals("\u00A0")))) {
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber));
.equals("\u00A0") || sublist.get(0)
.getUnicode()
.equals("\t")))) {
if (previous != null && sublist.get(0).getYDirAdj() == previous.getYDirAdj() && sublist.get(0)
.getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()) < 0.01) {
for (TextPosition t : sublist) {
textPositionSequences.get(textPositionSequences.size() - 1).add(t);
}
} else {
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber));
}
}
super.writeString(text);
}

View File

@ -22,6 +22,7 @@ public class RedTextPosition {
private float width;
private float heightDir;
private float widthDirAdj;
private float dir;
// not used in reanalysis
@JsonIgnore

View File

@ -138,6 +138,13 @@ public class TextPositionSequence implements CharSequence {
}
/**
 * Returns the {@code getXDirAdj()} value of the first text position of this sequence.
 *
 * @return the X value of the first text position as reported by {@code getXDirAdj()}
 */
@JsonIgnore
public float getRotationAdjustedX() {
    var firstPosition = textPositions.get(0);
    return firstPosition.getXDirAdj();
}
@JsonIgnore
public float getY1() {
@ -235,7 +242,7 @@ public class TextPositionSequence implements CharSequence {
float posYInit;
float posYEnd;
if (textPositions.get(0).getRotation() == 90) {
if (textPositions.get(0).getRotation() == 90 && textPositions.get(0).getDir() != 0.0f) {
posXEnd = textPositions.get(0).getYDirAdj() + 2;
posYInit = getY1();
posYEnd = textPositions.get(textPositions.size() - 1).getXDirAdj() - height + 4;
@ -246,17 +253,24 @@ public class TextPositionSequence implements CharSequence {
posXInit = textPositions.get(0).getPageWidth() - textPositions.get(0).getYDirAdj() - 2;
posXEnd = textPositions.get(0).getPageWidth() - textPositions.get(textPositions.size() - 1)
.getYDirAdj() + height;
} else if(textPositions.get(0).getRotation() == 0 && textPositions.stream().map(t -> t.getY()).collect(toSet()).size() > 1) {
} else if(textPositions.get(0).getRotation() == 0 && textPositions.get(0).getDir() == 270f) {
posYInit = textPositions.get(0).getPageHeight() - getX1();
posYEnd = textPositions.get(0).getPageHeight() - getX2() - textPositions.get(0)
.getWidthDirAdj() - textPositions.get(textPositions.size() - 1).getWidthDirAdj() - 3;
posXInit = textPositions.get(0).getPageWidth() - textPositions.get(0).getYDirAdj() - 2;
posXEnd = textPositions.get(0).getPageWidth() - textPositions.get(textPositions.size() - 1)
.getYDirAdj() + height;
} else if(textPositions.get(0).getRotation() == 90 && textPositions.get(0).getDir() == 0.0f){
posXInit = textPositions.get(textPositions.size() - 1)
.getXDirAdj() + textPositions.get(textPositions.size() - 1).getHeightDir();
posXEnd = textPositions.get(0).getXDirAdj();
posYInit = textPositions.get(0).getPageHeight() - textPositions.get(0).getYDirAdj() - 2;
posYEnd = textPositions.get(0).getPageHeight() - textPositions.get(textPositions.size() - 1)
.getYDirAdj() + 2;
}
else {
posXEnd = textPositions.get(textPositions.size() - 1)
.getXDirAdj() + textPositions.get(textPositions.size() - 1).getWidth() + 1;
.getXDirAdj() + textPositions.get(textPositions.size() - 1).getWidthDirAdj() + 1;
posYInit = textPositions.get(0).getPageHeight() - textPositions.get(0).getYDirAdj() - 2;
posYEnd = textPositions.get(0).getPageHeight() - textPositions.get(textPositions.size() - 1)
.getYDirAdj() + 2;

View File

@ -133,7 +133,7 @@ public class RedactionLogCreatorService {
int startIndex = 0;
for (int i = 1; i < textPositions.size(); i++) {
float yDirAdj = textPositions.get(i).getYDirAdj();
if (yDirAdj != y) {
if (round(yDirAdj,3) != round(y, 3)) {
rectangles.add(TextPositionSequence.fromData(textPositions.subList(startIndex, i), page)
.getRectangle());
y = yDirAdj;
@ -149,6 +149,11 @@ public class RedactionLogCreatorService {
return rectangles;
}
/**
 * Rounds a float to the given number of decimal places.
 *
 * @param value         the value to round
 * @param decimalPoints number of decimal places to keep
 * @return the value multiplied by {@code 10^decimalPoints}, rounded with
 *         {@link Math#round(double)} and divided by the same factor
 */
private double round(float value, int decimalPoints) {
    double factor = Math.pow(10, decimalPoints);
    long shifted = Math.round(value * factor);
    return shifted / factor;
}
private RedactionLogEntry createRedactionLogEntry(Entity entity, String dossierTemplateId) {

View File

@ -653,7 +653,7 @@ public class RedactionIntegrationTest {
public void redactionTest() throws IOException {
long start = System.currentTimeMillis();
ClassPathResource pdfFileResource = new ClassPathResource("files/S11.pdf");
ClassPathResource pdfFileResource = new ClassPathResource("files/new/VV-919901.pdf");
AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream());
request.setExcludedPages(Set.of(1));
@ -886,7 +886,7 @@ public class RedactionIntegrationTest {
public void classificationTest() throws IOException {
System.out.println("classificationTest");
ClassPathResource pdfFileResource = new ClassPathResource("files/S11.pdf");
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Single Table.pdf");
AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream());
@ -908,7 +908,7 @@ public class RedactionIntegrationTest {
public void sectionsTest() throws IOException {
System.out.println("sectionsTest");
ClassPathResource pdfFileResource = new ClassPathResource("files/S11.pdf");
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Single Table.pdf");
AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream());
@ -930,7 +930,7 @@ public class RedactionIntegrationTest {
public void htmlTablesTest() throws IOException {
System.out.println("htmlTablesTest");
ClassPathResource pdfFileResource = new ClassPathResource("files/S11.pdf");
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Single Table.pdf");
AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream());
@ -1195,10 +1195,10 @@ public class RedactionIntegrationTest {
private static String getTemporaryDirectory() {
String tmpdir = System.getProperty("java.io.tmpdir");
if (StringUtils.isNotBlank(tmpdir)) {
return tmpdir;
}
// String tmpdir = System.getProperty("java.io.tmpdir");
// if (StringUtils.isNotBlank(tmpdir)) {
// return tmpdir;
// }
return "/tmp";
}

View File

@ -0,0 +1,5 @@
Amendment 1
Report Number: 33168
Page
Report Number: BFI0714
Tesh Consultants International