Annotate images
This commit is contained in:
parent
6b68890fcf
commit
acddfafa5b
@ -1,5 +1,6 @@
|
||||
package com.iqser.red.service.redaction.v1.server.classification.model;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.List;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
|
||||
@ -16,6 +17,8 @@ public class Page {
|
||||
@NonNull
|
||||
private List<AbstractTextContainer> textBlocks;
|
||||
|
||||
private List<Rectangle2D> imageBounds;
|
||||
|
||||
private Rectangle bodyTextFrame;
|
||||
|
||||
private boolean landscape;
|
||||
|
||||
@ -1,6 +1,8 @@
|
||||
package com.iqser.red.service.redaction.v1.server.parsing;
|
||||
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
@ -28,10 +30,16 @@ import org.apache.pdfbox.contentstream.operator.state.SetLineWidth;
|
||||
import org.apache.pdfbox.contentstream.operator.state.SetRenderingIntent;
|
||||
import org.apache.pdfbox.contentstream.operator.text.SetFontAndSize;
|
||||
import org.apache.pdfbox.cos.COSBase;
|
||||
import org.apache.pdfbox.cos.COSName;
|
||||
import org.apache.pdfbox.cos.COSNumber;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
||||
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
|
||||
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
|
||||
import org.apache.pdfbox.text.PDFTextStripper;
|
||||
import org.apache.pdfbox.text.TextPosition;
|
||||
import org.apache.pdfbox.util.Matrix;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Ruling;
|
||||
@ -43,6 +51,9 @@ import lombok.extern.slf4j.Slf4j;
|
||||
@Slf4j
|
||||
public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
|
||||
@Setter
|
||||
protected PDPage pdpage;
|
||||
|
||||
@Getter
|
||||
private int maxCharWidths;
|
||||
|
||||
@ -57,13 +68,18 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
|
||||
private final List<Ruling> graphicsPath = new ArrayList<>();
|
||||
|
||||
@Getter
|
||||
private List<Rectangle2D> imageBounds = new ArrayList<>();
|
||||
|
||||
private float path_x;
|
||||
private float path_y;
|
||||
|
||||
@Setter
|
||||
private int pageNumber;
|
||||
|
||||
|
||||
public PDFLinesTextStripper() throws IOException {
|
||||
|
||||
super();
|
||||
this.addOperator(new SetStrokingColorSpace());
|
||||
this.addOperator(new SetNonStrokingColorSpace());
|
||||
@ -87,9 +103,9 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
this.addOperator(new SetLineWidth());
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
protected void processOperator(Operator operator, List<COSBase> arguments)
|
||||
throws IOException {
|
||||
protected void processOperator(Operator operator, List<COSBase> arguments) throws IOException {
|
||||
|
||||
String operation = operator.getName();
|
||||
|
||||
@ -110,9 +126,11 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
|
||||
// The direction of vertical lines must always be from bottom to top for the table extraction algorithm.
|
||||
if (pos.getY() > path_y) {
|
||||
graphicsPath.add(new Ruling(new Point2D.Float(path_x, path_y), new Point2D.Float((float) pos.getX(), (float) pos.getY())));
|
||||
graphicsPath.add(new Ruling(new Point2D.Float(path_x, path_y), new Point2D.Float((float) pos.getX(), (float) pos
|
||||
.getY())));
|
||||
} else {
|
||||
graphicsPath.add(new Ruling(new Point2D.Float(path_x, (float) pos.getY()), new Point2D.Float((float) pos.getX(), path_y)));
|
||||
graphicsPath.add(new Ruling(new Point2D.Float(path_x, (float) pos.getY()), new Point2D.Float((float) pos
|
||||
.getX(), path_y)));
|
||||
}
|
||||
|
||||
path_x = (float) pos.getX();
|
||||
@ -133,19 +151,25 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
Point2D p2 = transformPosition(x + width, y + height);
|
||||
|
||||
// Horizontal lines
|
||||
graphicsPath.add(new Ruling(new Point2D.Float((float) p1.getX(), (float) p1.getY()), new Point2D.Float((float) p2.getX(), (float) p1.getY())));
|
||||
graphicsPath.add(new Ruling(new Point2D.Float((float) p1.getX(), (float) p2.getY()), new Point2D.Float((float) p2.getX(), (float) p2.getY())));
|
||||
graphicsPath.add(new Ruling(new Point2D.Float((float) p1.getX(), (float) p1.getY()), new Point2D.Float((float) p2
|
||||
.getX(), (float) p1.getY())));
|
||||
graphicsPath.add(new Ruling(new Point2D.Float((float) p1.getX(), (float) p2.getY()), new Point2D.Float((float) p2
|
||||
.getX(), (float) p2.getY())));
|
||||
|
||||
// Vertical lines, direction must always be from bottom to top for the table extraction algorithm.
|
||||
if (p2.getY() > p1.getY()) {
|
||||
graphicsPath.add(new Ruling(new Point2D.Float((float) p2.getX(), (float) p1.getY()), new Point2D.Float((float) p2.getX(), (float) p2.getY())));
|
||||
graphicsPath.add(new Ruling(new Point2D.Float((float) p2.getX(), (float) p1.getY()), new Point2D.Float((float) p2
|
||||
.getX(), (float) p2.getY())));
|
||||
} else {
|
||||
graphicsPath.add(new Ruling(new Point2D.Float((float) p2.getX(), (float) p2.getY()), new Point2D.Float((float) p2.getX(), (float) p1.getY())));
|
||||
graphicsPath.add(new Ruling(new Point2D.Float((float) p2.getX(), (float) p2.getY()), new Point2D.Float((float) p2
|
||||
.getX(), (float) p1.getY())));
|
||||
}
|
||||
if (p2.getY() > p1.getY()) {
|
||||
graphicsPath.add(new Ruling(new Point2D.Float((float) p1.getX(), (float) p1.getY()), new Point2D.Float((float) p1.getX(), (float) p2.getY())));
|
||||
graphicsPath.add(new Ruling(new Point2D.Float((float) p1.getX(), (float) p1.getY()), new Point2D.Float((float) p1
|
||||
.getX(), (float) p2.getY())));
|
||||
} else {
|
||||
graphicsPath.add(new Ruling(new Point2D.Float((float) p1.getX(), (float) p2.getY()), new Point2D.Float((float) p1.getX(), (float) p1.getY())));
|
||||
graphicsPath.add(new Ruling(new Point2D.Float((float) p1.getX(), (float) p2.getY()), new Point2D.Float((float) p1
|
||||
.getX(), (float) p1.getY())));
|
||||
}
|
||||
}
|
||||
break;
|
||||
@ -168,12 +192,80 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
case OperatorName.ENDPATH:
|
||||
graphicsPath.clear();
|
||||
break;
|
||||
|
||||
case OperatorName.DRAW_OBJECT:
|
||||
processImageOperation(arguments);
|
||||
break;
|
||||
|
||||
}
|
||||
|
||||
super.processOperator(operator, arguments);
|
||||
}
|
||||
|
||||
|
||||
protected void processImageOperation(List<COSBase> arguments) {
|
||||
|
||||
try {
|
||||
COSName objectName = (COSName) arguments.get(0);
|
||||
PDXObject xobject = getResources().getXObject(objectName);
|
||||
if (xobject instanceof PDImageXObject) {
|
||||
PDImageXObject pdfImage = (PDImageXObject) xobject;
|
||||
|
||||
Rectangle2D imageBounds = calculateImagePosition(pdfImage);
|
||||
|
||||
Rectangle2D rect = new Rectangle2D.Float((float) imageBounds.getX(), (float) imageBounds.getY(), (float) imageBounds
|
||||
.getWidth(), (float) imageBounds.getHeight());
|
||||
|
||||
this.imageBounds.add(rect);
|
||||
}
|
||||
} catch (Exception e) {
|
||||
log.warn("Problem during image extraction: {}", e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private Rectangle2D calculateImagePosition(PDImageXObject pdfImage) throws IOException {
|
||||
|
||||
Matrix ctm = getGraphicsState().getCurrentTransformationMatrix();
|
||||
|
||||
Rectangle2D imageBounds = pdfImage.getImage().getRaster().getBounds();
|
||||
|
||||
AffineTransform imageTransform = new AffineTransform(ctm.createAffineTransform());
|
||||
imageTransform.scale(1.0 / pdfImage.getWidth(), -1.0 / pdfImage.getHeight());
|
||||
imageTransform.translate(0, -pdfImage.getHeight());
|
||||
|
||||
AffineTransform pageTransform = createCurrentPageTransformation();
|
||||
pageTransform.concatenate(imageTransform);
|
||||
|
||||
return pageTransform.createTransformedShape(imageBounds).getBounds2D();
|
||||
}
|
||||
|
||||
|
||||
protected AffineTransform createCurrentPageTransformation() {
|
||||
|
||||
PDRectangle cb = pdpage.getCropBox();
|
||||
AffineTransform pageTransform = new AffineTransform();
|
||||
|
||||
switch (pdpage.getRotation()) {
|
||||
case 90:
|
||||
pageTransform.translate(cb.getHeight(), 0);
|
||||
break;
|
||||
case 180:
|
||||
pageTransform.translate(cb.getWidth(), cb.getHeight());
|
||||
break;
|
||||
case 270:
|
||||
pageTransform.translate(0, cb.getWidth());
|
||||
break;
|
||||
}
|
||||
|
||||
pageTransform.rotate(Math.toRadians(pdpage.getRotation()));
|
||||
|
||||
return pageTransform;
|
||||
}
|
||||
|
||||
|
||||
private float floatValue(COSBase value) {
|
||||
|
||||
if (value instanceof COSNumber) {
|
||||
return ((COSNumber) value).floatValue();
|
||||
} else {
|
||||
@ -181,21 +273,31 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private Point2D.Float transformPosition(float x, float y) {
|
||||
|
||||
return super.transformedPoint(x, y);
|
||||
}
|
||||
|
||||
|
||||
private void addVisibleRulings(List<Ruling> path, boolean stroke) throws IOException {
|
||||
|
||||
try {
|
||||
if (stroke && !getGraphicsState().getStrokingColor().isPattern() && getGraphicsState().getStrokingColor().toRGB() == 0 || !stroke && !getGraphicsState().getNonStrokingColor().isPattern() && getGraphicsState().getNonStrokingColor().toRGB() == 0) {
|
||||
if (stroke && !getGraphicsState().getStrokingColor().isPattern() && getGraphicsState().getStrokingColor()
|
||||
.toRGB() == 0 || !stroke && !getGraphicsState().getNonStrokingColor()
|
||||
.isPattern() && getGraphicsState().getNonStrokingColor().toRGB() == 0) {
|
||||
rulings.addAll(path);
|
||||
}
|
||||
} catch (UnsupportedOperationException e) {
|
||||
log.error("UnsupportedOperationException: " + getGraphicsState().getStrokingColor().getColorSpace().getName() + " or " + getGraphicsState().getNonStrokingColor().getColorSpace().getName() + " does not support toRGB");
|
||||
log.error("UnsupportedOperationException: " + getGraphicsState().getStrokingColor()
|
||||
.getColorSpace()
|
||||
.getName() + " or " + getGraphicsState().getNonStrokingColor()
|
||||
.getColorSpace()
|
||||
.getName() + " does not support toRGB");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void writeString(String text, List<TextPosition> textPositions) throws IOException {
|
||||
|
||||
@ -203,16 +305,18 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
for (int i = 0; i <= textPositions.size() - 1; i++) {
|
||||
|
||||
int charHeight = (int) textPositions.get(i).getHeightDir();
|
||||
if(charHeight > maxCharHeight){
|
||||
if (charHeight > maxCharHeight) {
|
||||
maxCharHeight = charHeight;
|
||||
}
|
||||
|
||||
int charWidth = (int) textPositions.get(i).getWidthDirAdj();
|
||||
if(charWidth > maxCharWidths){
|
||||
if (charWidth > maxCharWidths) {
|
||||
maxCharWidths = charWidth;
|
||||
}
|
||||
|
||||
if (i == 0 && (textPositions.get(i).getUnicode().equals(" ") || textPositions.get(i).getUnicode().equals("\u00A0"))) {
|
||||
if (i == 0 && (textPositions.get(i).getUnicode().equals(" ") || textPositions.get(i)
|
||||
.getUnicode()
|
||||
.equals("\u00A0"))) {
|
||||
startIndex++;
|
||||
continue;
|
||||
}
|
||||
@ -220,15 +324,21 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
// Occasionally a glyph is positioned to the left of its predecessor (seen in Metolachlor2.pdf, for example); the sequence is split at that glyph.
|
||||
if (i > 0 && textPositions.get(i).getX() < textPositions.get(i - 1).getX()) {
|
||||
List<TextPosition> sublist = textPositions.subList(startIndex, i);
|
||||
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0")))) {
|
||||
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0)
|
||||
.getUnicode()
|
||||
.equals(" ") || sublist.get(0).getUnicode().equals("\u00A0")))) {
|
||||
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber));
|
||||
}
|
||||
startIndex = i;
|
||||
}
|
||||
|
||||
if (i > 0 && (textPositions.get(i).getUnicode().equals(" ") || textPositions.get(i).getUnicode().equals("\u00A0")) && i <= textPositions.size() - 2) {
|
||||
if (i > 0 && (textPositions.get(i).getUnicode().equals(" ") || textPositions.get(i)
|
||||
.getUnicode()
|
||||
.equals("\u00A0")) && i <= textPositions.size() - 2) {
|
||||
List<TextPosition> sublist = textPositions.subList(startIndex, i);
|
||||
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0")))) {
|
||||
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0)
|
||||
.getUnicode()
|
||||
.equals(" ") || sublist.get(0).getUnicode().equals("\u00A0")))) {
|
||||
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber));
|
||||
}
|
||||
startIndex = i + 1;
|
||||
@ -236,21 +346,27 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
}
|
||||
|
||||
List<TextPosition> sublist = textPositions.subList(startIndex, textPositions.size());
|
||||
if (!sublist.isEmpty() && (sublist.get(sublist.size() - 1).getUnicode().equals(" ") || sublist.get(sublist.size() - 1).getUnicode().equals("\u00A0"))) {
|
||||
if (!sublist.isEmpty() && (sublist.get(sublist.size() - 1)
|
||||
.getUnicode()
|
||||
.equals(" ") || sublist.get(sublist.size() - 1).getUnicode().equals("\u00A0"))) {
|
||||
sublist = sublist.subList(0, sublist.size() - 1);
|
||||
}
|
||||
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0")))) {
|
||||
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0)
|
||||
.getUnicode()
|
||||
.equals("\u00A0")))) {
|
||||
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber));
|
||||
}
|
||||
super.writeString(text);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String getText(PDDocument doc) throws IOException {
|
||||
|
||||
maxCharWidths = 0;
|
||||
maxCharWidths = 0;
|
||||
textPositionSequences.clear();
|
||||
imageBounds = new ArrayList<>();
|
||||
rulings.clear();
|
||||
graphicsPath.clear();
|
||||
path_x = 0.0f;
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
package com.iqser.red.service.redaction.v1.server.parsing.model;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.List;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Ruling;
|
||||
@ -13,6 +14,7 @@ public class ParsedElements {
|
||||
|
||||
private List<TextPositionSequence> sequences;
|
||||
private List<Ruling> rulings;
|
||||
private List<Rectangle2D> imageBounds;
|
||||
|
||||
private boolean landscape;
|
||||
private boolean rotated;
|
||||
|
||||
@ -53,25 +53,24 @@ public class AnnotationService {
|
||||
|
||||
List<RedactionLogEntry> logEntries = redactionLogPerPage.get(page);
|
||||
if (logEntries != null && !logEntries.isEmpty()) {
|
||||
addAnnotations(logEntries, pdPage, page, redactionLog.getRuleSetId());
|
||||
addAnnotations(logEntries, pdPage, page);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void addAnnotations(List<RedactionLogEntry> logEntries, PDPage pdPage, int page,
|
||||
String ruleSetId) throws IOException {
|
||||
private void addAnnotations(List<RedactionLogEntry> logEntries, PDPage pdPage, int page) throws IOException {
|
||||
|
||||
List<PDAnnotation> annotations = pdPage.getAnnotations();
|
||||
|
||||
for (RedactionLogEntry entry : logEntries) {
|
||||
annotations.addAll(createAnnotation(entry, page, ruleSetId, pdPage.getMediaBox(), pdPage.getCropBox()));
|
||||
annotations.addAll(createAnnotation(entry, page, pdPage.getMediaBox(), pdPage.getCropBox()));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private List<PDAnnotation> createAnnotation(RedactionLogEntry redactionLogEntry, int page, String ruleSetId,
|
||||
PDRectangle mediaBox, PDRectangle cropBox) {
|
||||
private List<PDAnnotation> createAnnotation(RedactionLogEntry redactionLogEntry, int page, PDRectangle mediaBox,
|
||||
PDRectangle cropBox) {
|
||||
|
||||
List<PDAnnotation> annotations = new ArrayList<>();
|
||||
|
||||
@ -89,7 +88,7 @@ public class AnnotationService {
|
||||
PDRectangle pdRectangle = toPDRectangle(rectangles, mediaBox, cropBox);
|
||||
annotation.setRectangle(pdRectangle);
|
||||
annotation.setQuadPoints(toQuadPoints(rectangles, mediaBox, cropBox));
|
||||
if (!dictionaryService.isHint(redactionLogEntry.getType(), ruleSetId)) {
|
||||
if (!redactionLogEntry.isHint()) {
|
||||
annotation.setContents(createAnnotationContent(redactionLogEntry));
|
||||
}
|
||||
annotation.setTitlePopup(redactionLogEntry.getId());
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.service;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
@ -27,6 +28,7 @@ import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionSequence;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.IdBuilder;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
|
||||
@ -37,6 +39,8 @@ import lombok.RequiredArgsConstructor;
|
||||
@RequiredArgsConstructor
|
||||
public class RedactionLogCreatorService {
|
||||
|
||||
private static final String IMAGE = "image";
|
||||
|
||||
private final DictionaryService dictionaryService;
|
||||
|
||||
|
||||
@ -56,6 +60,30 @@ public class RedactionLogCreatorService {
|
||||
if (manualRedactionPages.contains(page)) {
|
||||
addManualEntries(classifiedDoc, manualRedactions, page, ruleSetId);
|
||||
}
|
||||
|
||||
if (!classifiedDoc.getPages().get(page - 1).getImageBounds().isEmpty()) {
|
||||
addImageEntries(classifiedDoc, page, ruleSetId);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void addImageEntries(Document classifiedDoc, int pageNumber, String ruleSetId) {
|
||||
|
||||
for (Rectangle2D imageBounds : classifiedDoc.getPages().get(pageNumber - 1).getImageBounds()) {
|
||||
RedactionLogEntry redactionLogEntry = RedactionLogEntry.builder()
|
||||
.id(IdBuilder.buildId(imageBounds, pageNumber))
|
||||
.color(getColor(IMAGE, ruleSetId))
|
||||
.type(IMAGE)
|
||||
.redacted(false)
|
||||
.isHint(true)
|
||||
.manual(false)
|
||||
.isDictionaryEntry(false)
|
||||
.isRecommendation(false)
|
||||
.positions(List.of(new Rectangle(new Point((float) imageBounds.getX(), (float) imageBounds.getY()), (float) imageBounds
|
||||
.getWidth(), (float) imageBounds.getHeight(), pageNumber)))
|
||||
.build();
|
||||
classifiedDoc.getRedactionLogEntities().add(redactionLogEntry);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.utils;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.List;
|
||||
|
||||
@ -23,4 +24,16 @@ public class IdBuilder {
|
||||
|
||||
return hashFunction.hashString(sb.toString(), StandardCharsets.UTF_8).toString();
|
||||
}
|
||||
|
||||
|
||||
public String buildId(Rectangle2D rectangle2D, int page){
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
sb.append("x").append(rectangle2D.getX()).append("y").append(rectangle2D.getY()).append("h").append(rectangle2D.getHeight()).append("w").append(rectangle2D.getWidth()).append("p").append(page);
|
||||
|
||||
return hashFunction.hashString(sb.toString(), StandardCharsets.UTF_8).toString();
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
@ -48,6 +48,7 @@ public class PdfSegmentationService {
|
||||
stripper.setPageNumber(pageNumber);
|
||||
stripper.setStartPage(pageNumber);
|
||||
stripper.setEndPage(pageNumber);
|
||||
stripper.setPdpage(pdPage);
|
||||
stripper.getText(pdDocument);
|
||||
|
||||
PDRectangle pdr = pdPage.getMediaBox();
|
||||
@ -56,10 +57,10 @@ public class PdfSegmentationService {
|
||||
int rotation = pdPage.getRotation();
|
||||
boolean isRotated = rotation != 0 && rotation != 360;
|
||||
|
||||
|
||||
ParsedElements parsedElements = ParsedElements.builder()
|
||||
.rulings(stripper.getRulings())
|
||||
.sequences(stripper.getTextPositionSequences())
|
||||
.imageBounds(stripper.getImageBounds())
|
||||
.maxCharWidth(stripper.getMaxCharWidths())
|
||||
.maxCharHeight(stripper.getMaxCharWidths())
|
||||
.landscape(isLandscape)
|
||||
@ -81,8 +82,10 @@ public class PdfSegmentationService {
|
||||
|
||||
page.setPageNumber(pageNumber);
|
||||
increaseDocumentStatistics(page, document);
|
||||
page.setImageBounds(parsedElements.getImageBounds());
|
||||
pages.add(page);
|
||||
}
|
||||
|
||||
document.setPages(pages);
|
||||
|
||||
classificationService.classifyDocument(document);
|
||||
@ -90,11 +93,9 @@ public class PdfSegmentationService {
|
||||
sectionsBuilderService.buildSections(document);
|
||||
|
||||
return document;
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
private void increaseDocumentStatistics(Page page, Document document) {
|
||||
|
||||
if (!page.isLandscape()) {
|
||||
|
||||
@ -81,6 +81,7 @@ public class RedactionIntegrationTest {
|
||||
private static final String PUBLISHED_INFORMATION = "published_information";
|
||||
private static final String TEST_METHOD = "test_method";
|
||||
private static final String PURITY = "purity";
|
||||
private static final String IMAGE = "image";
|
||||
|
||||
private static final String RECOMMENDATION_AUTHOR = "recommendation_CBI_author";
|
||||
private static final String RECOMMENDATION_ADDRESS = "recommendation_CBI_address";
|
||||
@ -157,6 +158,7 @@ public class RedactionIntegrationTest {
|
||||
when(dictionaryClient.getDictionaryForType(RECOMMENDATION_ADDRESS, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(RECOMMENDATION_ADDRESS));
|
||||
when(dictionaryClient.getDictionaryForType(FALSE_POSITIVE, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(FALSE_POSITIVE));
|
||||
when(dictionaryClient.getDictionaryForType(PURITY, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(PURITY));
|
||||
when(dictionaryClient.getDictionaryForType(IMAGE, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(IMAGE));
|
||||
when(dictionaryClient.getColors(TEST_RULESET_ID)).thenReturn(colors);
|
||||
}
|
||||
|
||||
@ -238,6 +240,11 @@ public class RedactionIntegrationTest {
|
||||
.stream()
|
||||
.map(this::cleanDictionaryEntry)
|
||||
.collect(Collectors.toSet()));
|
||||
dictionary.computeIfAbsent(IMAGE, v -> new ArrayList<>())
|
||||
.addAll(ResourceLoader.load("dictionaries/image.txt")
|
||||
.stream()
|
||||
.map(this::cleanDictionaryEntry)
|
||||
.collect(Collectors.toSet()));
|
||||
}
|
||||
|
||||
|
||||
@ -264,6 +271,7 @@ public class RedactionIntegrationTest {
|
||||
typeColorMap.put(RECOMMENDATION_ADDRESS, "#8df06c");
|
||||
typeColorMap.put(FALSE_POSITIVE, "#ffffff");
|
||||
typeColorMap.put(PURITY, "#ffe187");
|
||||
typeColorMap.put(IMAGE, "#fcc5fb");
|
||||
|
||||
hintTypeMap.put(VERTEBRATE, true);
|
||||
hintTypeMap.put(ADDRESS, false);
|
||||
@ -280,6 +288,7 @@ public class RedactionIntegrationTest {
|
||||
hintTypeMap.put(RECOMMENDATION_ADDRESS, false);
|
||||
hintTypeMap.put(FALSE_POSITIVE, true);
|
||||
hintTypeMap.put(PURITY, false);
|
||||
hintTypeMap.put(IMAGE, true);
|
||||
|
||||
caseInSensitiveMap.put(VERTEBRATE, true);
|
||||
caseInSensitiveMap.put(ADDRESS, false);
|
||||
@ -296,6 +305,7 @@ public class RedactionIntegrationTest {
|
||||
caseInSensitiveMap.put(RECOMMENDATION_ADDRESS, false);
|
||||
caseInSensitiveMap.put(FALSE_POSITIVE, false);
|
||||
caseInSensitiveMap.put(PURITY, false);
|
||||
caseInSensitiveMap.put(IMAGE, true);
|
||||
|
||||
recommendationTypeMap.put(VERTEBRATE, false);
|
||||
recommendationTypeMap.put(ADDRESS, false);
|
||||
@ -312,6 +322,8 @@ public class RedactionIntegrationTest {
|
||||
recommendationTypeMap.put(RECOMMENDATION_ADDRESS, true);
|
||||
recommendationTypeMap.put(FALSE_POSITIVE, false);
|
||||
recommendationTypeMap.put(PURITY, false);
|
||||
recommendationTypeMap.put(IMAGE, false);
|
||||
|
||||
|
||||
rankTypeMap.put(FALSE_POSITIVE, 160);
|
||||
rankTypeMap.put(PURITY, 155);
|
||||
@ -328,6 +340,8 @@ public class RedactionIntegrationTest {
|
||||
rankTypeMap.put(HINT_ONLY, 50);
|
||||
rankTypeMap.put(RECOMMENDATION_AUTHOR, 40);
|
||||
rankTypeMap.put(RECOMMENDATION_ADDRESS, 30);
|
||||
rankTypeMap.put(IMAGE, 30);
|
||||
|
||||
|
||||
colors.setDefaultColor("#acfc00");
|
||||
colors.setNotRedacted("#cccccc");
|
||||
@ -427,7 +441,7 @@ public class RedactionIntegrationTest {
|
||||
|
||||
System.out.println("redactionTest");
|
||||
long start = System.currentTimeMillis();
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Metolachlor/S-Metolachlor_RAR_02_Volume_2_2018-09-06.pdf");
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Metolachlor/S-Metolachlor_RAR_08_Volume_3CA_B-6_2018-09-06.pdf");
|
||||
|
||||
AnalyzeRequest request = AnalyzeRequest.builder()
|
||||
.ruleSetId(TEST_RULESET_ID)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user