Pull request #112: Annotate images
Merge in RED/redaction-service from annotateImages to master * commit 'acddfafa5b60d0120a48bdf47fe218bf59e359d2': Annotate images
This commit is contained in:
commit
0cec88f1b4
@ -1,5 +1,6 @@
|
|||||||
package com.iqser.red.service.redaction.v1.server.classification.model;
|
package com.iqser.red.service.redaction.v1.server.classification.model;
|
||||||
|
|
||||||
|
import java.awt.geom.Rectangle2D;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
|
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
|
||||||
@ -16,6 +17,8 @@ public class Page {
|
|||||||
@NonNull
|
@NonNull
|
||||||
private List<AbstractTextContainer> textBlocks;
|
private List<AbstractTextContainer> textBlocks;
|
||||||
|
|
||||||
|
private List<Rectangle2D> imageBounds;
|
||||||
|
|
||||||
private Rectangle bodyTextFrame;
|
private Rectangle bodyTextFrame;
|
||||||
|
|
||||||
private boolean landscape;
|
private boolean landscape;
|
||||||
|
|||||||
@ -1,6 +1,8 @@
|
|||||||
package com.iqser.red.service.redaction.v1.server.parsing;
|
package com.iqser.red.service.redaction.v1.server.parsing;
|
||||||
|
|
||||||
|
import java.awt.geom.AffineTransform;
|
||||||
import java.awt.geom.Point2D;
|
import java.awt.geom.Point2D;
|
||||||
|
import java.awt.geom.Rectangle2D;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
@ -28,10 +30,16 @@ import org.apache.pdfbox.contentstream.operator.state.SetLineWidth;
|
|||||||
import org.apache.pdfbox.contentstream.operator.state.SetRenderingIntent;
|
import org.apache.pdfbox.contentstream.operator.state.SetRenderingIntent;
|
||||||
import org.apache.pdfbox.contentstream.operator.text.SetFontAndSize;
|
import org.apache.pdfbox.contentstream.operator.text.SetFontAndSize;
|
||||||
import org.apache.pdfbox.cos.COSBase;
|
import org.apache.pdfbox.cos.COSBase;
|
||||||
|
import org.apache.pdfbox.cos.COSName;
|
||||||
import org.apache.pdfbox.cos.COSNumber;
|
import org.apache.pdfbox.cos.COSNumber;
|
||||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||||
|
import org.apache.pdfbox.pdmodel.PDPage;
|
||||||
|
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
||||||
|
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
|
||||||
|
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
|
||||||
import org.apache.pdfbox.text.PDFTextStripper;
|
import org.apache.pdfbox.text.PDFTextStripper;
|
||||||
import org.apache.pdfbox.text.TextPosition;
|
import org.apache.pdfbox.text.TextPosition;
|
||||||
|
import org.apache.pdfbox.util.Matrix;
|
||||||
|
|
||||||
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
|
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
|
||||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Ruling;
|
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Ruling;
|
||||||
@ -43,6 +51,9 @@ import lombok.extern.slf4j.Slf4j;
|
|||||||
@Slf4j
|
@Slf4j
|
||||||
public class PDFLinesTextStripper extends PDFTextStripper {
|
public class PDFLinesTextStripper extends PDFTextStripper {
|
||||||
|
|
||||||
|
@Setter
|
||||||
|
protected PDPage pdpage;
|
||||||
|
|
||||||
@Getter
|
@Getter
|
||||||
private int maxCharWidths;
|
private int maxCharWidths;
|
||||||
|
|
||||||
@ -57,13 +68,18 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
|||||||
|
|
||||||
private final List<Ruling> graphicsPath = new ArrayList<>();
|
private final List<Ruling> graphicsPath = new ArrayList<>();
|
||||||
|
|
||||||
|
@Getter
|
||||||
|
private List<Rectangle2D> imageBounds = new ArrayList<>();
|
||||||
|
|
||||||
private float path_x;
|
private float path_x;
|
||||||
private float path_y;
|
private float path_y;
|
||||||
|
|
||||||
@Setter
|
@Setter
|
||||||
private int pageNumber;
|
private int pageNumber;
|
||||||
|
|
||||||
|
|
||||||
public PDFLinesTextStripper() throws IOException {
|
public PDFLinesTextStripper() throws IOException {
|
||||||
|
|
||||||
super();
|
super();
|
||||||
this.addOperator(new SetStrokingColorSpace());
|
this.addOperator(new SetStrokingColorSpace());
|
||||||
this.addOperator(new SetNonStrokingColorSpace());
|
this.addOperator(new SetNonStrokingColorSpace());
|
||||||
@ -87,9 +103,9 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
|||||||
this.addOperator(new SetLineWidth());
|
this.addOperator(new SetLineWidth());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected void processOperator(Operator operator, List<COSBase> arguments)
|
protected void processOperator(Operator operator, List<COSBase> arguments) throws IOException {
|
||||||
throws IOException {
|
|
||||||
|
|
||||||
String operation = operator.getName();
|
String operation = operator.getName();
|
||||||
|
|
||||||
@ -110,9 +126,11 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
|||||||
|
|
||||||
// The direction of vertical lines must always be from bottom to top for the table extraction algorithm.
|
// The direction of vertical lines must always be from bottom to top for the table extraction algorithm.
|
||||||
if (pos.getY() > path_y) {
|
if (pos.getY() > path_y) {
|
||||||
graphicsPath.add(new Ruling(new Point2D.Float(path_x, path_y), new Point2D.Float((float) pos.getX(), (float) pos.getY())));
|
graphicsPath.add(new Ruling(new Point2D.Float(path_x, path_y), new Point2D.Float((float) pos.getX(), (float) pos
|
||||||
|
.getY())));
|
||||||
} else {
|
} else {
|
||||||
graphicsPath.add(new Ruling(new Point2D.Float(path_x, (float) pos.getY()), new Point2D.Float((float) pos.getX(), path_y)));
|
graphicsPath.add(new Ruling(new Point2D.Float(path_x, (float) pos.getY()), new Point2D.Float((float) pos
|
||||||
|
.getX(), path_y)));
|
||||||
}
|
}
|
||||||
|
|
||||||
path_x = (float) pos.getX();
|
path_x = (float) pos.getX();
|
||||||
@ -133,19 +151,25 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
|||||||
Point2D p2 = transformPosition(x + width, y + height);
|
Point2D p2 = transformPosition(x + width, y + height);
|
||||||
|
|
||||||
// Horizontal lines
|
// Horizontal lines
|
||||||
graphicsPath.add(new Ruling(new Point2D.Float((float) p1.getX(), (float) p1.getY()), new Point2D.Float((float) p2.getX(), (float) p1.getY())));
|
graphicsPath.add(new Ruling(new Point2D.Float((float) p1.getX(), (float) p1.getY()), new Point2D.Float((float) p2
|
||||||
graphicsPath.add(new Ruling(new Point2D.Float((float) p1.getX(), (float) p2.getY()), new Point2D.Float((float) p2.getX(), (float) p2.getY())));
|
.getX(), (float) p1.getY())));
|
||||||
|
graphicsPath.add(new Ruling(new Point2D.Float((float) p1.getX(), (float) p2.getY()), new Point2D.Float((float) p2
|
||||||
|
.getX(), (float) p2.getY())));
|
||||||
|
|
||||||
// Vertical lines, direction must always be from bottom to top for the table extraction algorithm.
|
// Vertical lines, direction must always be from bottom to top for the table extraction algorithm.
|
||||||
if (p2.getY() > p1.getY()) {
|
if (p2.getY() > p1.getY()) {
|
||||||
graphicsPath.add(new Ruling(new Point2D.Float((float) p2.getX(), (float) p1.getY()), new Point2D.Float((float) p2.getX(), (float) p2.getY())));
|
graphicsPath.add(new Ruling(new Point2D.Float((float) p2.getX(), (float) p1.getY()), new Point2D.Float((float) p2
|
||||||
|
.getX(), (float) p2.getY())));
|
||||||
} else {
|
} else {
|
||||||
graphicsPath.add(new Ruling(new Point2D.Float((float) p2.getX(), (float) p2.getY()), new Point2D.Float((float) p2.getX(), (float) p1.getY())));
|
graphicsPath.add(new Ruling(new Point2D.Float((float) p2.getX(), (float) p2.getY()), new Point2D.Float((float) p2
|
||||||
|
.getX(), (float) p1.getY())));
|
||||||
}
|
}
|
||||||
if (p2.getY() > p1.getY()) {
|
if (p2.getY() > p1.getY()) {
|
||||||
graphicsPath.add(new Ruling(new Point2D.Float((float) p1.getX(), (float) p1.getY()), new Point2D.Float((float) p1.getX(), (float) p2.getY())));
|
graphicsPath.add(new Ruling(new Point2D.Float((float) p1.getX(), (float) p1.getY()), new Point2D.Float((float) p1
|
||||||
|
.getX(), (float) p2.getY())));
|
||||||
} else {
|
} else {
|
||||||
graphicsPath.add(new Ruling(new Point2D.Float((float) p1.getX(), (float) p2.getY()), new Point2D.Float((float) p1.getX(), (float) p1.getY())));
|
graphicsPath.add(new Ruling(new Point2D.Float((float) p1.getX(), (float) p2.getY()), new Point2D.Float((float) p1
|
||||||
|
.getX(), (float) p1.getY())));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
@ -168,12 +192,80 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
|||||||
case OperatorName.ENDPATH:
|
case OperatorName.ENDPATH:
|
||||||
graphicsPath.clear();
|
graphicsPath.clear();
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
case OperatorName.DRAW_OBJECT:
|
||||||
|
processImageOperation(arguments);
|
||||||
|
break;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
super.processOperator(operator, arguments);
|
super.processOperator(operator, arguments);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
protected void processImageOperation(List<COSBase> arguments) {
|
||||||
|
|
||||||
|
try {
|
||||||
|
COSName objectName = (COSName) arguments.get(0);
|
||||||
|
PDXObject xobject = getResources().getXObject(objectName);
|
||||||
|
if (xobject instanceof PDImageXObject) {
|
||||||
|
PDImageXObject pdfImage = (PDImageXObject) xobject;
|
||||||
|
|
||||||
|
Rectangle2D imageBounds = calculateImagePosition(pdfImage);
|
||||||
|
|
||||||
|
Rectangle2D rect = new Rectangle2D.Float((float) imageBounds.getX(), (float) imageBounds.getY(), (float) imageBounds
|
||||||
|
.getWidth(), (float) imageBounds.getHeight());
|
||||||
|
|
||||||
|
this.imageBounds.add(rect);
|
||||||
|
}
|
||||||
|
} catch (Exception e) {
|
||||||
|
log.warn("Problem during image extraction: {}", e.getMessage());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private Rectangle2D calculateImagePosition(PDImageXObject pdfImage) throws IOException {
|
||||||
|
|
||||||
|
Matrix ctm = getGraphicsState().getCurrentTransformationMatrix();
|
||||||
|
|
||||||
|
Rectangle2D imageBounds = pdfImage.getImage().getRaster().getBounds();
|
||||||
|
|
||||||
|
AffineTransform imageTransform = new AffineTransform(ctm.createAffineTransform());
|
||||||
|
imageTransform.scale(1.0 / pdfImage.getWidth(), -1.0 / pdfImage.getHeight());
|
||||||
|
imageTransform.translate(0, -pdfImage.getHeight());
|
||||||
|
|
||||||
|
AffineTransform pageTransform = createCurrentPageTransformation();
|
||||||
|
pageTransform.concatenate(imageTransform);
|
||||||
|
|
||||||
|
return pageTransform.createTransformedShape(imageBounds).getBounds2D();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
protected AffineTransform createCurrentPageTransformation() {
|
||||||
|
|
||||||
|
PDRectangle cb = pdpage.getCropBox();
|
||||||
|
AffineTransform pageTransform = new AffineTransform();
|
||||||
|
|
||||||
|
switch (pdpage.getRotation()) {
|
||||||
|
case 90:
|
||||||
|
pageTransform.translate(cb.getHeight(), 0);
|
||||||
|
break;
|
||||||
|
case 180:
|
||||||
|
pageTransform.translate(cb.getWidth(), cb.getHeight());
|
||||||
|
break;
|
||||||
|
case 270:
|
||||||
|
pageTransform.translate(0, cb.getWidth());
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
pageTransform.rotate(Math.toRadians(pdpage.getRotation()));
|
||||||
|
|
||||||
|
return pageTransform;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
private float floatValue(COSBase value) {
|
private float floatValue(COSBase value) {
|
||||||
|
|
||||||
if (value instanceof COSNumber) {
|
if (value instanceof COSNumber) {
|
||||||
return ((COSNumber) value).floatValue();
|
return ((COSNumber) value).floatValue();
|
||||||
} else {
|
} else {
|
||||||
@ -181,21 +273,31 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private Point2D.Float transformPosition(float x, float y) {
|
private Point2D.Float transformPosition(float x, float y) {
|
||||||
|
|
||||||
return super.transformedPoint(x, y);
|
return super.transformedPoint(x, y);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private void addVisibleRulings(List<Ruling> path, boolean stroke) throws IOException {
|
private void addVisibleRulings(List<Ruling> path, boolean stroke) throws IOException {
|
||||||
|
|
||||||
try {
|
try {
|
||||||
if (stroke && !getGraphicsState().getStrokingColor().isPattern() && getGraphicsState().getStrokingColor().toRGB() == 0 || !stroke && !getGraphicsState().getNonStrokingColor().isPattern() && getGraphicsState().getNonStrokingColor().toRGB() == 0) {
|
if (stroke && !getGraphicsState().getStrokingColor().isPattern() && getGraphicsState().getStrokingColor()
|
||||||
|
.toRGB() == 0 || !stroke && !getGraphicsState().getNonStrokingColor()
|
||||||
|
.isPattern() && getGraphicsState().getNonStrokingColor().toRGB() == 0) {
|
||||||
rulings.addAll(path);
|
rulings.addAll(path);
|
||||||
}
|
}
|
||||||
} catch (UnsupportedOperationException e) {
|
} catch (UnsupportedOperationException e) {
|
||||||
log.error("UnsupportedOperationException: " + getGraphicsState().getStrokingColor().getColorSpace().getName() + " or " + getGraphicsState().getNonStrokingColor().getColorSpace().getName() + " does not support toRGB");
|
log.error("UnsupportedOperationException: " + getGraphicsState().getStrokingColor()
|
||||||
|
.getColorSpace()
|
||||||
|
.getName() + " or " + getGraphicsState().getNonStrokingColor()
|
||||||
|
.getColorSpace()
|
||||||
|
.getName() + " does not support toRGB");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void writeString(String text, List<TextPosition> textPositions) throws IOException {
|
public void writeString(String text, List<TextPosition> textPositions) throws IOException {
|
||||||
|
|
||||||
@ -203,16 +305,18 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
|||||||
for (int i = 0; i <= textPositions.size() - 1; i++) {
|
for (int i = 0; i <= textPositions.size() - 1; i++) {
|
||||||
|
|
||||||
int charHeight = (int) textPositions.get(i).getHeightDir();
|
int charHeight = (int) textPositions.get(i).getHeightDir();
|
||||||
if(charHeight > maxCharHeight){
|
if (charHeight > maxCharHeight) {
|
||||||
maxCharHeight = charHeight;
|
maxCharHeight = charHeight;
|
||||||
}
|
}
|
||||||
|
|
||||||
int charWidth = (int) textPositions.get(i).getWidthDirAdj();
|
int charWidth = (int) textPositions.get(i).getWidthDirAdj();
|
||||||
if(charWidth > maxCharWidths){
|
if (charWidth > maxCharWidths) {
|
||||||
maxCharWidths = charWidth;
|
maxCharWidths = charWidth;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (i == 0 && (textPositions.get(i).getUnicode().equals(" ") || textPositions.get(i).getUnicode().equals("\u00A0"))) {
|
if (i == 0 && (textPositions.get(i).getUnicode().equals(" ") || textPositions.get(i)
|
||||||
|
.getUnicode()
|
||||||
|
.equals("\u00A0"))) {
|
||||||
startIndex++;
|
startIndex++;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@ -220,15 +324,21 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
|||||||
// Strange but sometimes this is happening, for example: Metolachlor2.pdf
|
// Strange but sometimes this is happening, for example: Metolachlor2.pdf
|
||||||
if (i > 0 && textPositions.get(i).getX() < textPositions.get(i - 1).getX()) {
|
if (i > 0 && textPositions.get(i).getX() < textPositions.get(i - 1).getX()) {
|
||||||
List<TextPosition> sublist = textPositions.subList(startIndex, i);
|
List<TextPosition> sublist = textPositions.subList(startIndex, i);
|
||||||
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0")))) {
|
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0)
|
||||||
|
.getUnicode()
|
||||||
|
.equals(" ") || sublist.get(0).getUnicode().equals("\u00A0")))) {
|
||||||
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber));
|
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber));
|
||||||
}
|
}
|
||||||
startIndex = i;
|
startIndex = i;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (i > 0 && (textPositions.get(i).getUnicode().equals(" ") || textPositions.get(i).getUnicode().equals("\u00A0")) && i <= textPositions.size() - 2) {
|
if (i > 0 && (textPositions.get(i).getUnicode().equals(" ") || textPositions.get(i)
|
||||||
|
.getUnicode()
|
||||||
|
.equals("\u00A0")) && i <= textPositions.size() - 2) {
|
||||||
List<TextPosition> sublist = textPositions.subList(startIndex, i);
|
List<TextPosition> sublist = textPositions.subList(startIndex, i);
|
||||||
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0")))) {
|
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0)
|
||||||
|
.getUnicode()
|
||||||
|
.equals(" ") || sublist.get(0).getUnicode().equals("\u00A0")))) {
|
||||||
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber));
|
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber));
|
||||||
}
|
}
|
||||||
startIndex = i + 1;
|
startIndex = i + 1;
|
||||||
@ -236,21 +346,27 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
|||||||
}
|
}
|
||||||
|
|
||||||
List<TextPosition> sublist = textPositions.subList(startIndex, textPositions.size());
|
List<TextPosition> sublist = textPositions.subList(startIndex, textPositions.size());
|
||||||
if (!sublist.isEmpty() && (sublist.get(sublist.size() - 1).getUnicode().equals(" ") || sublist.get(sublist.size() - 1).getUnicode().equals("\u00A0"))) {
|
if (!sublist.isEmpty() && (sublist.get(sublist.size() - 1)
|
||||||
|
.getUnicode()
|
||||||
|
.equals(" ") || sublist.get(sublist.size() - 1).getUnicode().equals("\u00A0"))) {
|
||||||
sublist = sublist.subList(0, sublist.size() - 1);
|
sublist = sublist.subList(0, sublist.size() - 1);
|
||||||
}
|
}
|
||||||
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0")))) {
|
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0)
|
||||||
|
.getUnicode()
|
||||||
|
.equals("\u00A0")))) {
|
||||||
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber));
|
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber));
|
||||||
}
|
}
|
||||||
super.writeString(text);
|
super.writeString(text);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String getText(PDDocument doc) throws IOException {
|
public String getText(PDDocument doc) throws IOException {
|
||||||
|
|
||||||
maxCharWidths = 0;
|
maxCharWidths = 0;
|
||||||
maxCharWidths = 0;
|
maxCharWidths = 0;
|
||||||
textPositionSequences.clear();
|
textPositionSequences.clear();
|
||||||
|
imageBounds = new ArrayList<>();
|
||||||
rulings.clear();
|
rulings.clear();
|
||||||
graphicsPath.clear();
|
graphicsPath.clear();
|
||||||
path_x = 0.0f;
|
path_x = 0.0f;
|
||||||
|
|||||||
@ -1,5 +1,6 @@
|
|||||||
package com.iqser.red.service.redaction.v1.server.parsing.model;
|
package com.iqser.red.service.redaction.v1.server.parsing.model;
|
||||||
|
|
||||||
|
import java.awt.geom.Rectangle2D;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Ruling;
|
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Ruling;
|
||||||
@ -13,6 +14,7 @@ public class ParsedElements {
|
|||||||
|
|
||||||
private List<TextPositionSequence> sequences;
|
private List<TextPositionSequence> sequences;
|
||||||
private List<Ruling> rulings;
|
private List<Ruling> rulings;
|
||||||
|
private List<Rectangle2D> imageBounds;
|
||||||
|
|
||||||
private boolean landscape;
|
private boolean landscape;
|
||||||
private boolean rotated;
|
private boolean rotated;
|
||||||
|
|||||||
@ -53,25 +53,24 @@ public class AnnotationService {
|
|||||||
|
|
||||||
List<RedactionLogEntry> logEntries = redactionLogPerPage.get(page);
|
List<RedactionLogEntry> logEntries = redactionLogPerPage.get(page);
|
||||||
if (logEntries != null && !logEntries.isEmpty()) {
|
if (logEntries != null && !logEntries.isEmpty()) {
|
||||||
addAnnotations(logEntries, pdPage, page, redactionLog.getRuleSetId());
|
addAnnotations(logEntries, pdPage, page);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private void addAnnotations(List<RedactionLogEntry> logEntries, PDPage pdPage, int page,
|
private void addAnnotations(List<RedactionLogEntry> logEntries, PDPage pdPage, int page) throws IOException {
|
||||||
String ruleSetId) throws IOException {
|
|
||||||
|
|
||||||
List<PDAnnotation> annotations = pdPage.getAnnotations();
|
List<PDAnnotation> annotations = pdPage.getAnnotations();
|
||||||
|
|
||||||
for (RedactionLogEntry entry : logEntries) {
|
for (RedactionLogEntry entry : logEntries) {
|
||||||
annotations.addAll(createAnnotation(entry, page, ruleSetId, pdPage.getMediaBox(), pdPage.getCropBox()));
|
annotations.addAll(createAnnotation(entry, page, pdPage.getMediaBox(), pdPage.getCropBox()));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private List<PDAnnotation> createAnnotation(RedactionLogEntry redactionLogEntry, int page, String ruleSetId,
|
private List<PDAnnotation> createAnnotation(RedactionLogEntry redactionLogEntry, int page, PDRectangle mediaBox,
|
||||||
PDRectangle mediaBox, PDRectangle cropBox) {
|
PDRectangle cropBox) {
|
||||||
|
|
||||||
List<PDAnnotation> annotations = new ArrayList<>();
|
List<PDAnnotation> annotations = new ArrayList<>();
|
||||||
|
|
||||||
@ -89,7 +88,7 @@ public class AnnotationService {
|
|||||||
PDRectangle pdRectangle = toPDRectangle(rectangles, mediaBox, cropBox);
|
PDRectangle pdRectangle = toPDRectangle(rectangles, mediaBox, cropBox);
|
||||||
annotation.setRectangle(pdRectangle);
|
annotation.setRectangle(pdRectangle);
|
||||||
annotation.setQuadPoints(toQuadPoints(rectangles, mediaBox, cropBox));
|
annotation.setQuadPoints(toQuadPoints(rectangles, mediaBox, cropBox));
|
||||||
if (!dictionaryService.isHint(redactionLogEntry.getType(), ruleSetId)) {
|
if (!redactionLogEntry.isHint()) {
|
||||||
annotation.setContents(createAnnotationContent(redactionLogEntry));
|
annotation.setContents(createAnnotationContent(redactionLogEntry));
|
||||||
}
|
}
|
||||||
annotation.setTitlePopup(redactionLogEntry.getId());
|
annotation.setTitlePopup(redactionLogEntry.getId());
|
||||||
|
|||||||
@ -1,5 +1,6 @@
|
|||||||
package com.iqser.red.service.redaction.v1.server.redaction.service;
|
package com.iqser.red.service.redaction.v1.server.redaction.service;
|
||||||
|
|
||||||
|
import java.awt.geom.Rectangle2D;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
@ -27,6 +28,7 @@ import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
|||||||
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
|
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
|
||||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
|
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
|
||||||
import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionSequence;
|
import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionSequence;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.redaction.utils.IdBuilder;
|
||||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
|
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
|
||||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
|
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
|
||||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
|
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
|
||||||
@ -37,6 +39,8 @@ import lombok.RequiredArgsConstructor;
|
|||||||
@RequiredArgsConstructor
|
@RequiredArgsConstructor
|
||||||
public class RedactionLogCreatorService {
|
public class RedactionLogCreatorService {
|
||||||
|
|
||||||
|
private static final String IMAGE = "image";
|
||||||
|
|
||||||
private final DictionaryService dictionaryService;
|
private final DictionaryService dictionaryService;
|
||||||
|
|
||||||
|
|
||||||
@ -56,6 +60,30 @@ public class RedactionLogCreatorService {
|
|||||||
if (manualRedactionPages.contains(page)) {
|
if (manualRedactionPages.contains(page)) {
|
||||||
addManualEntries(classifiedDoc, manualRedactions, page, ruleSetId);
|
addManualEntries(classifiedDoc, manualRedactions, page, ruleSetId);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (!classifiedDoc.getPages().get(page - 1).getImageBounds().isEmpty()) {
|
||||||
|
addImageEntries(classifiedDoc, page, ruleSetId);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private void addImageEntries(Document classifiedDoc, int pageNumber, String ruleSetId) {
|
||||||
|
|
||||||
|
for (Rectangle2D imageBounds : classifiedDoc.getPages().get(pageNumber - 1).getImageBounds()) {
|
||||||
|
RedactionLogEntry redactionLogEntry = RedactionLogEntry.builder()
|
||||||
|
.id(IdBuilder.buildId(imageBounds, pageNumber))
|
||||||
|
.color(getColor(IMAGE, ruleSetId))
|
||||||
|
.type(IMAGE)
|
||||||
|
.redacted(false)
|
||||||
|
.isHint(true)
|
||||||
|
.manual(false)
|
||||||
|
.isDictionaryEntry(false)
|
||||||
|
.isRecommendation(false)
|
||||||
|
.positions(List.of(new Rectangle(new Point((float) imageBounds.getX(), (float) imageBounds.getY()), (float) imageBounds
|
||||||
|
.getWidth(), (float) imageBounds.getHeight(), pageNumber)))
|
||||||
|
.build();
|
||||||
|
classifiedDoc.getRedactionLogEntities().add(redactionLogEntry);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -1,5 +1,6 @@
|
|||||||
package com.iqser.red.service.redaction.v1.server.redaction.utils;
|
package com.iqser.red.service.redaction.v1.server.redaction.utils;
|
||||||
|
|
||||||
|
import java.awt.geom.Rectangle2D;
|
||||||
import java.nio.charset.StandardCharsets;
|
import java.nio.charset.StandardCharsets;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
@ -23,4 +24,16 @@ public class IdBuilder {
|
|||||||
|
|
||||||
return hashFunction.hashString(sb.toString(), StandardCharsets.UTF_8).toString();
|
return hashFunction.hashString(sb.toString(), StandardCharsets.UTF_8).toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public String buildId(Rectangle2D rectangle2D, int page){
|
||||||
|
|
||||||
|
StringBuilder sb = new StringBuilder();
|
||||||
|
sb.append("x").append(rectangle2D.getX()).append("y").append(rectangle2D.getY()).append("h").append(rectangle2D.getHeight()).append("w").append(rectangle2D.getWidth()).append("p").append(page);
|
||||||
|
|
||||||
|
return hashFunction.hashString(sb.toString(), StandardCharsets.UTF_8).toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -48,6 +48,7 @@ public class PdfSegmentationService {
|
|||||||
stripper.setPageNumber(pageNumber);
|
stripper.setPageNumber(pageNumber);
|
||||||
stripper.setStartPage(pageNumber);
|
stripper.setStartPage(pageNumber);
|
||||||
stripper.setEndPage(pageNumber);
|
stripper.setEndPage(pageNumber);
|
||||||
|
stripper.setPdpage(pdPage);
|
||||||
stripper.getText(pdDocument);
|
stripper.getText(pdDocument);
|
||||||
|
|
||||||
PDRectangle pdr = pdPage.getMediaBox();
|
PDRectangle pdr = pdPage.getMediaBox();
|
||||||
@ -56,10 +57,10 @@ public class PdfSegmentationService {
|
|||||||
int rotation = pdPage.getRotation();
|
int rotation = pdPage.getRotation();
|
||||||
boolean isRotated = rotation != 0 && rotation != 360;
|
boolean isRotated = rotation != 0 && rotation != 360;
|
||||||
|
|
||||||
|
|
||||||
ParsedElements parsedElements = ParsedElements.builder()
|
ParsedElements parsedElements = ParsedElements.builder()
|
||||||
.rulings(stripper.getRulings())
|
.rulings(stripper.getRulings())
|
||||||
.sequences(stripper.getTextPositionSequences())
|
.sequences(stripper.getTextPositionSequences())
|
||||||
|
.imageBounds(stripper.getImageBounds())
|
||||||
.maxCharWidth(stripper.getMaxCharWidths())
|
.maxCharWidth(stripper.getMaxCharWidths())
|
||||||
.maxCharHeight(stripper.getMaxCharWidths())
|
.maxCharHeight(stripper.getMaxCharWidths())
|
||||||
.landscape(isLandscape)
|
.landscape(isLandscape)
|
||||||
@ -81,8 +82,10 @@ public class PdfSegmentationService {
|
|||||||
|
|
||||||
page.setPageNumber(pageNumber);
|
page.setPageNumber(pageNumber);
|
||||||
increaseDocumentStatistics(page, document);
|
increaseDocumentStatistics(page, document);
|
||||||
|
page.setImageBounds(parsedElements.getImageBounds());
|
||||||
pages.add(page);
|
pages.add(page);
|
||||||
}
|
}
|
||||||
|
|
||||||
document.setPages(pages);
|
document.setPages(pages);
|
||||||
|
|
||||||
classificationService.classifyDocument(document);
|
classificationService.classifyDocument(document);
|
||||||
@ -90,11 +93,9 @@ public class PdfSegmentationService {
|
|||||||
sectionsBuilderService.buildSections(document);
|
sectionsBuilderService.buildSections(document);
|
||||||
|
|
||||||
return document;
|
return document;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
private void increaseDocumentStatistics(Page page, Document document) {
|
private void increaseDocumentStatistics(Page page, Document document) {
|
||||||
|
|
||||||
if (!page.isLandscape()) {
|
if (!page.isLandscape()) {
|
||||||
|
|||||||
@ -81,6 +81,7 @@ public class RedactionIntegrationTest {
|
|||||||
private static final String PUBLISHED_INFORMATION = "published_information";
|
private static final String PUBLISHED_INFORMATION = "published_information";
|
||||||
private static final String TEST_METHOD = "test_method";
|
private static final String TEST_METHOD = "test_method";
|
||||||
private static final String PURITY = "purity";
|
private static final String PURITY = "purity";
|
||||||
|
private static final String IMAGE = "image";
|
||||||
|
|
||||||
private static final String RECOMMENDATION_AUTHOR = "recommendation_CBI_author";
|
private static final String RECOMMENDATION_AUTHOR = "recommendation_CBI_author";
|
||||||
private static final String RECOMMENDATION_ADDRESS = "recommendation_CBI_address";
|
private static final String RECOMMENDATION_ADDRESS = "recommendation_CBI_address";
|
||||||
@ -157,6 +158,7 @@ public class RedactionIntegrationTest {
|
|||||||
when(dictionaryClient.getDictionaryForType(RECOMMENDATION_ADDRESS, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(RECOMMENDATION_ADDRESS));
|
when(dictionaryClient.getDictionaryForType(RECOMMENDATION_ADDRESS, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(RECOMMENDATION_ADDRESS));
|
||||||
when(dictionaryClient.getDictionaryForType(FALSE_POSITIVE, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(FALSE_POSITIVE));
|
when(dictionaryClient.getDictionaryForType(FALSE_POSITIVE, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(FALSE_POSITIVE));
|
||||||
when(dictionaryClient.getDictionaryForType(PURITY, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(PURITY));
|
when(dictionaryClient.getDictionaryForType(PURITY, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(PURITY));
|
||||||
|
when(dictionaryClient.getDictionaryForType(IMAGE, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(IMAGE));
|
||||||
when(dictionaryClient.getColors(TEST_RULESET_ID)).thenReturn(colors);
|
when(dictionaryClient.getColors(TEST_RULESET_ID)).thenReturn(colors);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -238,6 +240,11 @@ public class RedactionIntegrationTest {
|
|||||||
.stream()
|
.stream()
|
||||||
.map(this::cleanDictionaryEntry)
|
.map(this::cleanDictionaryEntry)
|
||||||
.collect(Collectors.toSet()));
|
.collect(Collectors.toSet()));
|
||||||
|
dictionary.computeIfAbsent(IMAGE, v -> new ArrayList<>())
|
||||||
|
.addAll(ResourceLoader.load("dictionaries/image.txt")
|
||||||
|
.stream()
|
||||||
|
.map(this::cleanDictionaryEntry)
|
||||||
|
.collect(Collectors.toSet()));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -264,6 +271,7 @@ public class RedactionIntegrationTest {
|
|||||||
typeColorMap.put(RECOMMENDATION_ADDRESS, "#8df06c");
|
typeColorMap.put(RECOMMENDATION_ADDRESS, "#8df06c");
|
||||||
typeColorMap.put(FALSE_POSITIVE, "#ffffff");
|
typeColorMap.put(FALSE_POSITIVE, "#ffffff");
|
||||||
typeColorMap.put(PURITY, "#ffe187");
|
typeColorMap.put(PURITY, "#ffe187");
|
||||||
|
typeColorMap.put(IMAGE, "#fcc5fb");
|
||||||
|
|
||||||
hintTypeMap.put(VERTEBRATE, true);
|
hintTypeMap.put(VERTEBRATE, true);
|
||||||
hintTypeMap.put(ADDRESS, false);
|
hintTypeMap.put(ADDRESS, false);
|
||||||
@ -280,6 +288,7 @@ public class RedactionIntegrationTest {
|
|||||||
hintTypeMap.put(RECOMMENDATION_ADDRESS, false);
|
hintTypeMap.put(RECOMMENDATION_ADDRESS, false);
|
||||||
hintTypeMap.put(FALSE_POSITIVE, true);
|
hintTypeMap.put(FALSE_POSITIVE, true);
|
||||||
hintTypeMap.put(PURITY, false);
|
hintTypeMap.put(PURITY, false);
|
||||||
|
hintTypeMap.put(IMAGE, true);
|
||||||
|
|
||||||
caseInSensitiveMap.put(VERTEBRATE, true);
|
caseInSensitiveMap.put(VERTEBRATE, true);
|
||||||
caseInSensitiveMap.put(ADDRESS, false);
|
caseInSensitiveMap.put(ADDRESS, false);
|
||||||
@ -296,6 +305,7 @@ public class RedactionIntegrationTest {
|
|||||||
caseInSensitiveMap.put(RECOMMENDATION_ADDRESS, false);
|
caseInSensitiveMap.put(RECOMMENDATION_ADDRESS, false);
|
||||||
caseInSensitiveMap.put(FALSE_POSITIVE, false);
|
caseInSensitiveMap.put(FALSE_POSITIVE, false);
|
||||||
caseInSensitiveMap.put(PURITY, false);
|
caseInSensitiveMap.put(PURITY, false);
|
||||||
|
caseInSensitiveMap.put(IMAGE, true);
|
||||||
|
|
||||||
recommendationTypeMap.put(VERTEBRATE, false);
|
recommendationTypeMap.put(VERTEBRATE, false);
|
||||||
recommendationTypeMap.put(ADDRESS, false);
|
recommendationTypeMap.put(ADDRESS, false);
|
||||||
@ -312,6 +322,8 @@ public class RedactionIntegrationTest {
|
|||||||
recommendationTypeMap.put(RECOMMENDATION_ADDRESS, true);
|
recommendationTypeMap.put(RECOMMENDATION_ADDRESS, true);
|
||||||
recommendationTypeMap.put(FALSE_POSITIVE, false);
|
recommendationTypeMap.put(FALSE_POSITIVE, false);
|
||||||
recommendationTypeMap.put(PURITY, false);
|
recommendationTypeMap.put(PURITY, false);
|
||||||
|
recommendationTypeMap.put(IMAGE, false);
|
||||||
|
|
||||||
|
|
||||||
rankTypeMap.put(FALSE_POSITIVE, 160);
|
rankTypeMap.put(FALSE_POSITIVE, 160);
|
||||||
rankTypeMap.put(PURITY, 155);
|
rankTypeMap.put(PURITY, 155);
|
||||||
@ -328,6 +340,8 @@ public class RedactionIntegrationTest {
|
|||||||
rankTypeMap.put(HINT_ONLY, 50);
|
rankTypeMap.put(HINT_ONLY, 50);
|
||||||
rankTypeMap.put(RECOMMENDATION_AUTHOR, 40);
|
rankTypeMap.put(RECOMMENDATION_AUTHOR, 40);
|
||||||
rankTypeMap.put(RECOMMENDATION_ADDRESS, 30);
|
rankTypeMap.put(RECOMMENDATION_ADDRESS, 30);
|
||||||
|
rankTypeMap.put(IMAGE, 30);
|
||||||
|
|
||||||
|
|
||||||
colors.setDefaultColor("#acfc00");
|
colors.setDefaultColor("#acfc00");
|
||||||
colors.setNotRedacted("#cccccc");
|
colors.setNotRedacted("#cccccc");
|
||||||
@ -427,7 +441,7 @@ public class RedactionIntegrationTest {
|
|||||||
|
|
||||||
System.out.println("redactionTest");
|
System.out.println("redactionTest");
|
||||||
long start = System.currentTimeMillis();
|
long start = System.currentTimeMillis();
|
||||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Metolachlor/S-Metolachlor_RAR_02_Volume_2_2018-09-06.pdf");
|
ClassPathResource pdfFileResource = new ClassPathResource("files/Metolachlor/S-Metolachlor_RAR_08_Volume_3CA_B-6_2018-09-06.pdf");
|
||||||
|
|
||||||
AnalyzeRequest request = AnalyzeRequest.builder()
|
AnalyzeRequest request = AnalyzeRequest.builder()
|
||||||
.ruleSetId(TEST_RULESET_ID)
|
.ruleSetId(TEST_RULESET_ID)
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user