Pull request #112: Annotate images

Merge in RED/redaction-service from annotateImages to master

* commit 'acddfafa5b60d0120a48bdf47fe218bf59e359d2':
  Annotate images
This commit is contained in:
Dominique Eiflaender 2021-02-01 14:12:28 +01:00
commit 0cec88f1b4
9 changed files with 207 additions and 31 deletions

View File

@ -1,5 +1,6 @@
package com.iqser.red.service.redaction.v1.server.classification.model; package com.iqser.red.service.redaction.v1.server.classification.model;
import java.awt.geom.Rectangle2D;
import java.util.List; import java.util.List;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer; import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
@ -16,6 +17,8 @@ public class Page {
@NonNull @NonNull
private List<AbstractTextContainer> textBlocks; private List<AbstractTextContainer> textBlocks;
private List<Rectangle2D> imageBounds;
private Rectangle bodyTextFrame; private Rectangle bodyTextFrame;
private boolean landscape; private boolean landscape;

View File

@ -1,6 +1,8 @@
package com.iqser.red.service.redaction.v1.server.parsing; package com.iqser.red.service.redaction.v1.server.parsing;
import java.awt.geom.AffineTransform;
import java.awt.geom.Point2D; import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
@ -28,10 +30,16 @@ import org.apache.pdfbox.contentstream.operator.state.SetLineWidth;
import org.apache.pdfbox.contentstream.operator.state.SetRenderingIntent; import org.apache.pdfbox.contentstream.operator.state.SetRenderingIntent;
import org.apache.pdfbox.contentstream.operator.text.SetFontAndSize; import org.apache.pdfbox.contentstream.operator.text.SetFontAndSize;
import org.apache.pdfbox.cos.COSBase; import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.cos.COSNumber; import org.apache.pdfbox.cos.COSNumber;
import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.pdfbox.text.PDFTextStripper; import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition; import org.apache.pdfbox.text.TextPosition;
import org.apache.pdfbox.util.Matrix;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence; import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Ruling; import com.iqser.red.service.redaction.v1.server.tableextraction.model.Ruling;
@ -43,6 +51,9 @@ import lombok.extern.slf4j.Slf4j;
@Slf4j @Slf4j
public class PDFLinesTextStripper extends PDFTextStripper { public class PDFLinesTextStripper extends PDFTextStripper {
@Setter
protected PDPage pdpage;
@Getter @Getter
private int maxCharWidths; private int maxCharWidths;
@ -57,13 +68,18 @@ public class PDFLinesTextStripper extends PDFTextStripper {
private final List<Ruling> graphicsPath = new ArrayList<>(); private final List<Ruling> graphicsPath = new ArrayList<>();
@Getter
private List<Rectangle2D> imageBounds = new ArrayList<>();
private float path_x; private float path_x;
private float path_y; private float path_y;
@Setter @Setter
private int pageNumber; private int pageNumber;
public PDFLinesTextStripper() throws IOException { public PDFLinesTextStripper() throws IOException {
super(); super();
this.addOperator(new SetStrokingColorSpace()); this.addOperator(new SetStrokingColorSpace());
this.addOperator(new SetNonStrokingColorSpace()); this.addOperator(new SetNonStrokingColorSpace());
@ -87,9 +103,9 @@ public class PDFLinesTextStripper extends PDFTextStripper {
this.addOperator(new SetLineWidth()); this.addOperator(new SetLineWidth());
} }
@Override @Override
protected void processOperator(Operator operator, List<COSBase> arguments) protected void processOperator(Operator operator, List<COSBase> arguments) throws IOException {
throws IOException {
String operation = operator.getName(); String operation = operator.getName();
@ -110,9 +126,11 @@ public class PDFLinesTextStripper extends PDFTextStripper {
// The direction of vertical lines must always be from bottom to top for the table extraction algorithm. // The direction of vertical lines must always be from bottom to top for the table extraction algorithm.
if (pos.getY() > path_y) { if (pos.getY() > path_y) {
graphicsPath.add(new Ruling(new Point2D.Float(path_x, path_y), new Point2D.Float((float) pos.getX(), (float) pos.getY()))); graphicsPath.add(new Ruling(new Point2D.Float(path_x, path_y), new Point2D.Float((float) pos.getX(), (float) pos
.getY())));
} else { } else {
graphicsPath.add(new Ruling(new Point2D.Float(path_x, (float) pos.getY()), new Point2D.Float((float) pos.getX(), path_y))); graphicsPath.add(new Ruling(new Point2D.Float(path_x, (float) pos.getY()), new Point2D.Float((float) pos
.getX(), path_y)));
} }
path_x = (float) pos.getX(); path_x = (float) pos.getX();
@ -133,19 +151,25 @@ public class PDFLinesTextStripper extends PDFTextStripper {
Point2D p2 = transformPosition(x + width, y + height); Point2D p2 = transformPosition(x + width, y + height);
// Horizontal lines // Horizontal lines
graphicsPath.add(new Ruling(new Point2D.Float((float) p1.getX(), (float) p1.getY()), new Point2D.Float((float) p2.getX(), (float) p1.getY()))); graphicsPath.add(new Ruling(new Point2D.Float((float) p1.getX(), (float) p1.getY()), new Point2D.Float((float) p2
graphicsPath.add(new Ruling(new Point2D.Float((float) p1.getX(), (float) p2.getY()), new Point2D.Float((float) p2.getX(), (float) p2.getY()))); .getX(), (float) p1.getY())));
graphicsPath.add(new Ruling(new Point2D.Float((float) p1.getX(), (float) p2.getY()), new Point2D.Float((float) p2
.getX(), (float) p2.getY())));
// Vertical lines, direction must always be from bottom to top for the table extraction algorithm. // Vertical lines, direction must always be from bottom to top for the table extraction algorithm.
if (p2.getY() > p1.getY()) { if (p2.getY() > p1.getY()) {
graphicsPath.add(new Ruling(new Point2D.Float((float) p2.getX(), (float) p1.getY()), new Point2D.Float((float) p2.getX(), (float) p2.getY()))); graphicsPath.add(new Ruling(new Point2D.Float((float) p2.getX(), (float) p1.getY()), new Point2D.Float((float) p2
.getX(), (float) p2.getY())));
} else { } else {
graphicsPath.add(new Ruling(new Point2D.Float((float) p2.getX(), (float) p2.getY()), new Point2D.Float((float) p2.getX(), (float) p1.getY()))); graphicsPath.add(new Ruling(new Point2D.Float((float) p2.getX(), (float) p2.getY()), new Point2D.Float((float) p2
.getX(), (float) p1.getY())));
} }
if (p2.getY() > p1.getY()) { if (p2.getY() > p1.getY()) {
graphicsPath.add(new Ruling(new Point2D.Float((float) p1.getX(), (float) p1.getY()), new Point2D.Float((float) p1.getX(), (float) p2.getY()))); graphicsPath.add(new Ruling(new Point2D.Float((float) p1.getX(), (float) p1.getY()), new Point2D.Float((float) p1
.getX(), (float) p2.getY())));
} else { } else {
graphicsPath.add(new Ruling(new Point2D.Float((float) p1.getX(), (float) p2.getY()), new Point2D.Float((float) p1.getX(), (float) p1.getY()))); graphicsPath.add(new Ruling(new Point2D.Float((float) p1.getX(), (float) p2.getY()), new Point2D.Float((float) p1
.getX(), (float) p1.getY())));
} }
} }
break; break;
@ -168,12 +192,80 @@ public class PDFLinesTextStripper extends PDFTextStripper {
case OperatorName.ENDPATH: case OperatorName.ENDPATH:
graphicsPath.clear(); graphicsPath.clear();
break; break;
case OperatorName.DRAW_OBJECT:
processImageOperation(arguments);
break;
} }
super.processOperator(operator, arguments); super.processOperator(operator, arguments);
} }
/**
 * Records the bounding box of the image referenced by a draw-object operator.
 *
 * <p>The first operand is resolved as an XObject name against the current resources. Only
 * {@link PDImageXObject} instances are recorded; any other XObject type is ignored. This is a
 * best-effort step: a failure is logged and never propagated to the caller.
 *
 * @param arguments the operands of the draw-object operator; the first one is expected to be
 *                  the {@link COSName} of the XObject
 */
protected void processImageOperation(List<COSBase> arguments) {
    // Check the operand up front instead of using the catch-all below for control flow.
    if (arguments == null || arguments.isEmpty() || !(arguments.get(0) instanceof COSName)) {
        log.warn("Problem during image extraction: {}", "missing or invalid XObject name operand");
        return;
    }
    try {
        PDXObject xobject = getResources().getXObject((COSName) arguments.get(0));
        if (xobject instanceof PDImageXObject) {
            Rectangle2D position = calculateImagePosition((PDImageXObject) xobject);
            // Stored with float precision, exactly as before, so downstream consumers see the same values.
            this.imageBounds.add(new Rectangle2D.Float((float) position.getX(), (float) position.getY(),
                    (float) position.getWidth(), (float) position.getHeight()));
        }
    } catch (Exception e) {
        // Keep going on failure, but pass the throwable so the stack trace is not lost.
        log.warn("Problem during image extraction: {}", e.getMessage(), e);
    }
}
/**
 * Computes the bounding box of an image after applying the current transformation matrix and
 * the page transformation from {@link #createCurrentPageTransformation()}.
 *
 * <p>The pixel rectangle is built from the image's declared width and height, which are the
 * same values used below to normalise it. The rectangle therefore always maps onto the unit
 * square before the current transformation matrix is applied, and the image data does not have
 * to be decoded.
 * NOTE(review): the decoded image used previously may differ in size from the declared
 * dimensions (e.g. when a larger mask is applied) - confirm against the PDFBox version in use.
 *
 * @param pdfImage the image XObject being drawn
 * @return the axis-aligned bounds of the transformed image rectangle
 * @throws IOException kept for signature compatibility; this implementation no longer decodes
 *                     the image
 */
private Rectangle2D calculateImagePosition(PDImageXObject pdfImage) throws IOException {
    int width = pdfImage.getWidth();
    int height = pdfImage.getHeight();
    Rectangle2D pixelBounds = new Rectangle2D.Double(0, 0, width, height);

    Matrix ctm = getGraphicsState().getCurrentTransformationMatrix();
    AffineTransform imageTransform = new AffineTransform(ctm.createAffineTransform());
    // Normalise pixel coordinates to the unit square and flip the vertical axis.
    imageTransform.scale(1.0 / width, -1.0 / height);
    imageTransform.translate(0, -height);

    AffineTransform pageTransform = createCurrentPageTransformation();
    pageTransform.concatenate(imageTransform);
    return pageTransform.createTransformedShape(pixelBounds).getBounds2D();
}
/**
 * Builds the transformation for the current page's rotation.
 *
 * <p>For rotations of 90, 180 and 270 degrees a translation derived from the crop box
 * dimensions is applied first; any other rotation value gets no translation. The rotation
 * itself is then appended.
 *
 * @return a new transform consisting of the rotation-dependent translation followed by the
 *         page rotation
 */
protected AffineTransform createCurrentPageTransformation() {
    PDRectangle cropBox = pdpage.getCropBox();
    AffineTransform transform = new AffineTransform();
    int rotation = pdpage.getRotation();

    if (rotation == 90) {
        transform.translate(cropBox.getHeight(), 0);
    } else if (rotation == 180) {
        transform.translate(cropBox.getWidth(), cropBox.getHeight());
    } else if (rotation == 270) {
        transform.translate(0, cropBox.getWidth());
    }

    transform.rotate(Math.toRadians(rotation));
    return transform;
}
private float floatValue(COSBase value) { private float floatValue(COSBase value) {
if (value instanceof COSNumber) { if (value instanceof COSNumber) {
return ((COSNumber) value).floatValue(); return ((COSNumber) value).floatValue();
} else { } else {
@ -181,21 +273,31 @@ public class PDFLinesTextStripper extends PDFTextStripper {
} }
} }
private Point2D.Float transformPosition(float x, float y) { private Point2D.Float transformPosition(float x, float y) {
return super.transformedPoint(x, y); return super.transformedPoint(x, y);
} }
private void addVisibleRulings(List<Ruling> path, boolean stroke) throws IOException { private void addVisibleRulings(List<Ruling> path, boolean stroke) throws IOException {
try { try {
if (stroke && !getGraphicsState().getStrokingColor().isPattern() && getGraphicsState().getStrokingColor().toRGB() == 0 || !stroke && !getGraphicsState().getNonStrokingColor().isPattern() && getGraphicsState().getNonStrokingColor().toRGB() == 0) { if (stroke && !getGraphicsState().getStrokingColor().isPattern() && getGraphicsState().getStrokingColor()
.toRGB() == 0 || !stroke && !getGraphicsState().getNonStrokingColor()
.isPattern() && getGraphicsState().getNonStrokingColor().toRGB() == 0) {
rulings.addAll(path); rulings.addAll(path);
} }
} catch (UnsupportedOperationException e) { } catch (UnsupportedOperationException e) {
log.error("UnsupportedOperationException: " + getGraphicsState().getStrokingColor().getColorSpace().getName() + " or " + getGraphicsState().getNonStrokingColor().getColorSpace().getName() + " does not support toRGB"); log.error("UnsupportedOperationException: " + getGraphicsState().getStrokingColor()
.getColorSpace()
.getName() + " or " + getGraphicsState().getNonStrokingColor()
.getColorSpace()
.getName() + " does not support toRGB");
} }
} }
@Override @Override
public void writeString(String text, List<TextPosition> textPositions) throws IOException { public void writeString(String text, List<TextPosition> textPositions) throws IOException {
@ -203,16 +305,18 @@ public class PDFLinesTextStripper extends PDFTextStripper {
for (int i = 0; i <= textPositions.size() - 1; i++) { for (int i = 0; i <= textPositions.size() - 1; i++) {
int charHeight = (int) textPositions.get(i).getHeightDir(); int charHeight = (int) textPositions.get(i).getHeightDir();
if(charHeight > maxCharHeight){ if (charHeight > maxCharHeight) {
maxCharHeight = charHeight; maxCharHeight = charHeight;
} }
int charWidth = (int) textPositions.get(i).getWidthDirAdj(); int charWidth = (int) textPositions.get(i).getWidthDirAdj();
if(charWidth > maxCharWidths){ if (charWidth > maxCharWidths) {
maxCharWidths = charWidth; maxCharWidths = charWidth;
} }
if (i == 0 && (textPositions.get(i).getUnicode().equals(" ") || textPositions.get(i).getUnicode().equals("\u00A0"))) { if (i == 0 && (textPositions.get(i).getUnicode().equals(" ") || textPositions.get(i)
.getUnicode()
.equals("\u00A0"))) {
startIndex++; startIndex++;
continue; continue;
} }
@ -220,15 +324,21 @@ public class PDFLinesTextStripper extends PDFTextStripper {
// Strange but sometimes this is happening, for example: Metolachlor2.pdf // Strange but sometimes this is happening, for example: Metolachlor2.pdf
if (i > 0 && textPositions.get(i).getX() < textPositions.get(i - 1).getX()) { if (i > 0 && textPositions.get(i).getX() < textPositions.get(i - 1).getX()) {
List<TextPosition> sublist = textPositions.subList(startIndex, i); List<TextPosition> sublist = textPositions.subList(startIndex, i);
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0")))) { if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0)
.getUnicode()
.equals(" ") || sublist.get(0).getUnicode().equals("\u00A0")))) {
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber)); textPositionSequences.add(new TextPositionSequence(sublist, pageNumber));
} }
startIndex = i; startIndex = i;
} }
if (i > 0 && (textPositions.get(i).getUnicode().equals(" ") || textPositions.get(i).getUnicode().equals("\u00A0")) && i <= textPositions.size() - 2) { if (i > 0 && (textPositions.get(i).getUnicode().equals(" ") || textPositions.get(i)
.getUnicode()
.equals("\u00A0")) && i <= textPositions.size() - 2) {
List<TextPosition> sublist = textPositions.subList(startIndex, i); List<TextPosition> sublist = textPositions.subList(startIndex, i);
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0")))) { if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0)
.getUnicode()
.equals(" ") || sublist.get(0).getUnicode().equals("\u00A0")))) {
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber)); textPositionSequences.add(new TextPositionSequence(sublist, pageNumber));
} }
startIndex = i + 1; startIndex = i + 1;
@ -236,21 +346,27 @@ public class PDFLinesTextStripper extends PDFTextStripper {
} }
List<TextPosition> sublist = textPositions.subList(startIndex, textPositions.size()); List<TextPosition> sublist = textPositions.subList(startIndex, textPositions.size());
if (!sublist.isEmpty() && (sublist.get(sublist.size() - 1).getUnicode().equals(" ") || sublist.get(sublist.size() - 1).getUnicode().equals("\u00A0"))) { if (!sublist.isEmpty() && (sublist.get(sublist.size() - 1)
.getUnicode()
.equals(" ") || sublist.get(sublist.size() - 1).getUnicode().equals("\u00A0"))) {
sublist = sublist.subList(0, sublist.size() - 1); sublist = sublist.subList(0, sublist.size() - 1);
} }
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0")))) { if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0)
.getUnicode()
.equals("\u00A0")))) {
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber)); textPositionSequences.add(new TextPositionSequence(sublist, pageNumber));
} }
super.writeString(text); super.writeString(text);
} }
@Override @Override
public String getText(PDDocument doc) throws IOException { public String getText(PDDocument doc) throws IOException {
maxCharWidths = 0; maxCharWidths = 0;
maxCharWidths = 0; maxCharWidths = 0;
textPositionSequences.clear(); textPositionSequences.clear();
imageBounds = new ArrayList<>();
rulings.clear(); rulings.clear();
graphicsPath.clear(); graphicsPath.clear();
path_x = 0.0f; path_x = 0.0f;

View File

@ -1,5 +1,6 @@
package com.iqser.red.service.redaction.v1.server.parsing.model; package com.iqser.red.service.redaction.v1.server.parsing.model;
import java.awt.geom.Rectangle2D;
import java.util.List; import java.util.List;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Ruling; import com.iqser.red.service.redaction.v1.server.tableextraction.model.Ruling;
@ -13,6 +14,7 @@ public class ParsedElements {
private List<TextPositionSequence> sequences; private List<TextPositionSequence> sequences;
private List<Ruling> rulings; private List<Ruling> rulings;
private List<Rectangle2D> imageBounds;
private boolean landscape; private boolean landscape;
private boolean rotated; private boolean rotated;

View File

@ -53,25 +53,24 @@ public class AnnotationService {
List<RedactionLogEntry> logEntries = redactionLogPerPage.get(page); List<RedactionLogEntry> logEntries = redactionLogPerPage.get(page);
if (logEntries != null && !logEntries.isEmpty()) { if (logEntries != null && !logEntries.isEmpty()) {
addAnnotations(logEntries, pdPage, page, redactionLog.getRuleSetId()); addAnnotations(logEntries, pdPage, page);
} }
} }
} }
private void addAnnotations(List<RedactionLogEntry> logEntries, PDPage pdPage, int page, private void addAnnotations(List<RedactionLogEntry> logEntries, PDPage pdPage, int page) throws IOException {
String ruleSetId) throws IOException {
List<PDAnnotation> annotations = pdPage.getAnnotations(); List<PDAnnotation> annotations = pdPage.getAnnotations();
for (RedactionLogEntry entry : logEntries) { for (RedactionLogEntry entry : logEntries) {
annotations.addAll(createAnnotation(entry, page, ruleSetId, pdPage.getMediaBox(), pdPage.getCropBox())); annotations.addAll(createAnnotation(entry, page, pdPage.getMediaBox(), pdPage.getCropBox()));
} }
} }
private List<PDAnnotation> createAnnotation(RedactionLogEntry redactionLogEntry, int page, String ruleSetId, private List<PDAnnotation> createAnnotation(RedactionLogEntry redactionLogEntry, int page, PDRectangle mediaBox,
PDRectangle mediaBox, PDRectangle cropBox) { PDRectangle cropBox) {
List<PDAnnotation> annotations = new ArrayList<>(); List<PDAnnotation> annotations = new ArrayList<>();
@ -89,7 +88,7 @@ public class AnnotationService {
PDRectangle pdRectangle = toPDRectangle(rectangles, mediaBox, cropBox); PDRectangle pdRectangle = toPDRectangle(rectangles, mediaBox, cropBox);
annotation.setRectangle(pdRectangle); annotation.setRectangle(pdRectangle);
annotation.setQuadPoints(toQuadPoints(rectangles, mediaBox, cropBox)); annotation.setQuadPoints(toQuadPoints(rectangles, mediaBox, cropBox));
if (!dictionaryService.isHint(redactionLogEntry.getType(), ruleSetId)) { if (!redactionLogEntry.isHint()) {
annotation.setContents(createAnnotationContent(redactionLogEntry)); annotation.setContents(createAnnotationContent(redactionLogEntry));
} }
annotation.setTitlePopup(redactionLogEntry.getId()); annotation.setTitlePopup(redactionLogEntry.getId());

View File

@ -1,5 +1,6 @@
package com.iqser.red.service.redaction.v1.server.redaction.service; package com.iqser.red.service.redaction.v1.server.redaction.service;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.HashSet; import java.util.HashSet;
import java.util.List; import java.util.List;
@ -27,6 +28,7 @@ import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence; import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity; import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionSequence; import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.utils.IdBuilder;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer; import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell; import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table; import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
@ -37,6 +39,8 @@ import lombok.RequiredArgsConstructor;
@RequiredArgsConstructor @RequiredArgsConstructor
public class RedactionLogCreatorService { public class RedactionLogCreatorService {
private static final String IMAGE = "image";
private final DictionaryService dictionaryService; private final DictionaryService dictionaryService;
@ -56,6 +60,30 @@ public class RedactionLogCreatorService {
if (manualRedactionPages.contains(page)) { if (manualRedactionPages.contains(page)) {
addManualEntries(classifiedDoc, manualRedactions, page, ruleSetId); addManualEntries(classifiedDoc, manualRedactions, page, ruleSetId);
} }
if (!classifiedDoc.getPages().get(page - 1).getImageBounds().isEmpty()) {
addImageEntries(classifiedDoc, page, ruleSetId);
}
}
}
private void addImageEntries(Document classifiedDoc, int pageNumber, String ruleSetId) {
for (Rectangle2D imageBounds : classifiedDoc.getPages().get(pageNumber - 1).getImageBounds()) {
RedactionLogEntry redactionLogEntry = RedactionLogEntry.builder()
.id(IdBuilder.buildId(imageBounds, pageNumber))
.color(getColor(IMAGE, ruleSetId))
.type(IMAGE)
.redacted(false)
.isHint(true)
.manual(false)
.isDictionaryEntry(false)
.isRecommendation(false)
.positions(List.of(new Rectangle(new Point((float) imageBounds.getX(), (float) imageBounds.getY()), (float) imageBounds
.getWidth(), (float) imageBounds.getHeight(), pageNumber)))
.build();
classifiedDoc.getRedactionLogEntities().add(redactionLogEntry);
} }
} }

View File

@ -1,5 +1,6 @@
package com.iqser.red.service.redaction.v1.server.redaction.utils; package com.iqser.red.service.redaction.v1.server.redaction.utils;
import java.awt.geom.Rectangle2D;
import java.nio.charset.StandardCharsets; import java.nio.charset.StandardCharsets;
import java.util.List; import java.util.List;
@ -23,4 +24,16 @@ public class IdBuilder {
return hashFunction.hashString(sb.toString(), StandardCharsets.UTF_8).toString(); return hashFunction.hashString(sb.toString(), StandardCharsets.UTF_8).toString();
} }
/**
 * Derives an identifier for a rectangle on a page by hashing its coordinates.
 *
 * <p>The hashed key is the concatenation of {@code x}, {@code y}, height, width and page
 * number, each prefixed by a one-letter tag, encoded as UTF-8.
 *
 * @param rectangle2D the rectangle whose position and size go into the key
 * @param page        the page number that goes into the key
 * @return the string form of the hash of the key
 */
public String buildId(Rectangle2D rectangle2D, int page){
    String key = "x" + rectangle2D.getX()
            + "y" + rectangle2D.getY()
            + "h" + rectangle2D.getHeight()
            + "w" + rectangle2D.getWidth()
            + "p" + page;
    return hashFunction.hashString(key, StandardCharsets.UTF_8).toString();
}
} }

View File

@ -48,6 +48,7 @@ public class PdfSegmentationService {
stripper.setPageNumber(pageNumber); stripper.setPageNumber(pageNumber);
stripper.setStartPage(pageNumber); stripper.setStartPage(pageNumber);
stripper.setEndPage(pageNumber); stripper.setEndPage(pageNumber);
stripper.setPdpage(pdPage);
stripper.getText(pdDocument); stripper.getText(pdDocument);
PDRectangle pdr = pdPage.getMediaBox(); PDRectangle pdr = pdPage.getMediaBox();
@ -56,10 +57,10 @@ public class PdfSegmentationService {
int rotation = pdPage.getRotation(); int rotation = pdPage.getRotation();
boolean isRotated = rotation != 0 && rotation != 360; boolean isRotated = rotation != 0 && rotation != 360;
ParsedElements parsedElements = ParsedElements.builder() ParsedElements parsedElements = ParsedElements.builder()
.rulings(stripper.getRulings()) .rulings(stripper.getRulings())
.sequences(stripper.getTextPositionSequences()) .sequences(stripper.getTextPositionSequences())
.imageBounds(stripper.getImageBounds())
.maxCharWidth(stripper.getMaxCharWidths()) .maxCharWidth(stripper.getMaxCharWidths())
.maxCharHeight(stripper.getMaxCharWidths()) .maxCharHeight(stripper.getMaxCharWidths())
.landscape(isLandscape) .landscape(isLandscape)
@ -81,8 +82,10 @@ public class PdfSegmentationService {
page.setPageNumber(pageNumber); page.setPageNumber(pageNumber);
increaseDocumentStatistics(page, document); increaseDocumentStatistics(page, document);
page.setImageBounds(parsedElements.getImageBounds());
pages.add(page); pages.add(page);
} }
document.setPages(pages); document.setPages(pages);
classificationService.classifyDocument(document); classificationService.classifyDocument(document);
@ -90,11 +93,9 @@ public class PdfSegmentationService {
sectionsBuilderService.buildSections(document); sectionsBuilderService.buildSections(document);
return document; return document;
} }
private void increaseDocumentStatistics(Page page, Document document) { private void increaseDocumentStatistics(Page page, Document document) {
if (!page.isLandscape()) { if (!page.isLandscape()) {

View File

@ -81,6 +81,7 @@ public class RedactionIntegrationTest {
private static final String PUBLISHED_INFORMATION = "published_information"; private static final String PUBLISHED_INFORMATION = "published_information";
private static final String TEST_METHOD = "test_method"; private static final String TEST_METHOD = "test_method";
private static final String PURITY = "purity"; private static final String PURITY = "purity";
private static final String IMAGE = "image";
private static final String RECOMMENDATION_AUTHOR = "recommendation_CBI_author"; private static final String RECOMMENDATION_AUTHOR = "recommendation_CBI_author";
private static final String RECOMMENDATION_ADDRESS = "recommendation_CBI_address"; private static final String RECOMMENDATION_ADDRESS = "recommendation_CBI_address";
@ -157,6 +158,7 @@ public class RedactionIntegrationTest {
when(dictionaryClient.getDictionaryForType(RECOMMENDATION_ADDRESS, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(RECOMMENDATION_ADDRESS)); when(dictionaryClient.getDictionaryForType(RECOMMENDATION_ADDRESS, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(RECOMMENDATION_ADDRESS));
when(dictionaryClient.getDictionaryForType(FALSE_POSITIVE, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(FALSE_POSITIVE)); when(dictionaryClient.getDictionaryForType(FALSE_POSITIVE, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(FALSE_POSITIVE));
when(dictionaryClient.getDictionaryForType(PURITY, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(PURITY)); when(dictionaryClient.getDictionaryForType(PURITY, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(PURITY));
when(dictionaryClient.getDictionaryForType(IMAGE, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(IMAGE));
when(dictionaryClient.getColors(TEST_RULESET_ID)).thenReturn(colors); when(dictionaryClient.getColors(TEST_RULESET_ID)).thenReturn(colors);
} }
@ -238,6 +240,11 @@ public class RedactionIntegrationTest {
.stream() .stream()
.map(this::cleanDictionaryEntry) .map(this::cleanDictionaryEntry)
.collect(Collectors.toSet())); .collect(Collectors.toSet()));
dictionary.computeIfAbsent(IMAGE, v -> new ArrayList<>())
.addAll(ResourceLoader.load("dictionaries/image.txt")
.stream()
.map(this::cleanDictionaryEntry)
.collect(Collectors.toSet()));
} }
@ -264,6 +271,7 @@ public class RedactionIntegrationTest {
typeColorMap.put(RECOMMENDATION_ADDRESS, "#8df06c"); typeColorMap.put(RECOMMENDATION_ADDRESS, "#8df06c");
typeColorMap.put(FALSE_POSITIVE, "#ffffff"); typeColorMap.put(FALSE_POSITIVE, "#ffffff");
typeColorMap.put(PURITY, "#ffe187"); typeColorMap.put(PURITY, "#ffe187");
typeColorMap.put(IMAGE, "#fcc5fb");
hintTypeMap.put(VERTEBRATE, true); hintTypeMap.put(VERTEBRATE, true);
hintTypeMap.put(ADDRESS, false); hintTypeMap.put(ADDRESS, false);
@ -280,6 +288,7 @@ public class RedactionIntegrationTest {
hintTypeMap.put(RECOMMENDATION_ADDRESS, false); hintTypeMap.put(RECOMMENDATION_ADDRESS, false);
hintTypeMap.put(FALSE_POSITIVE, true); hintTypeMap.put(FALSE_POSITIVE, true);
hintTypeMap.put(PURITY, false); hintTypeMap.put(PURITY, false);
hintTypeMap.put(IMAGE, true);
caseInSensitiveMap.put(VERTEBRATE, true); caseInSensitiveMap.put(VERTEBRATE, true);
caseInSensitiveMap.put(ADDRESS, false); caseInSensitiveMap.put(ADDRESS, false);
@ -296,6 +305,7 @@ public class RedactionIntegrationTest {
caseInSensitiveMap.put(RECOMMENDATION_ADDRESS, false); caseInSensitiveMap.put(RECOMMENDATION_ADDRESS, false);
caseInSensitiveMap.put(FALSE_POSITIVE, false); caseInSensitiveMap.put(FALSE_POSITIVE, false);
caseInSensitiveMap.put(PURITY, false); caseInSensitiveMap.put(PURITY, false);
caseInSensitiveMap.put(IMAGE, true);
recommendationTypeMap.put(VERTEBRATE, false); recommendationTypeMap.put(VERTEBRATE, false);
recommendationTypeMap.put(ADDRESS, false); recommendationTypeMap.put(ADDRESS, false);
@ -312,6 +322,8 @@ public class RedactionIntegrationTest {
recommendationTypeMap.put(RECOMMENDATION_ADDRESS, true); recommendationTypeMap.put(RECOMMENDATION_ADDRESS, true);
recommendationTypeMap.put(FALSE_POSITIVE, false); recommendationTypeMap.put(FALSE_POSITIVE, false);
recommendationTypeMap.put(PURITY, false); recommendationTypeMap.put(PURITY, false);
recommendationTypeMap.put(IMAGE, false);
rankTypeMap.put(FALSE_POSITIVE, 160); rankTypeMap.put(FALSE_POSITIVE, 160);
rankTypeMap.put(PURITY, 155); rankTypeMap.put(PURITY, 155);
@ -328,6 +340,8 @@ public class RedactionIntegrationTest {
rankTypeMap.put(HINT_ONLY, 50); rankTypeMap.put(HINT_ONLY, 50);
rankTypeMap.put(RECOMMENDATION_AUTHOR, 40); rankTypeMap.put(RECOMMENDATION_AUTHOR, 40);
rankTypeMap.put(RECOMMENDATION_ADDRESS, 30); rankTypeMap.put(RECOMMENDATION_ADDRESS, 30);
rankTypeMap.put(IMAGE, 30);
colors.setDefaultColor("#acfc00"); colors.setDefaultColor("#acfc00");
colors.setNotRedacted("#cccccc"); colors.setNotRedacted("#cccccc");
@ -427,7 +441,7 @@ public class RedactionIntegrationTest {
System.out.println("redactionTest"); System.out.println("redactionTest");
long start = System.currentTimeMillis(); long start = System.currentTimeMillis();
ClassPathResource pdfFileResource = new ClassPathResource("files/Metolachlor/S-Metolachlor_RAR_02_Volume_2_2018-09-06.pdf"); ClassPathResource pdfFileResource = new ClassPathResource("files/Metolachlor/S-Metolachlor_RAR_08_Volume_3CA_B-6_2018-09-06.pdf");
AnalyzeRequest request = AnalyzeRequest.builder() AnalyzeRequest request = AnalyzeRequest.builder()
.ruleSetId(TEST_RULESET_ID) .ruleSetId(TEST_RULESET_ID)