Pull request #134: RED-1260
Merge in RED/redaction-service from RED-1260 to master * commit 'ae28555bf4c740d9872e26fb27615fbf7402f002': RED-1260: First steps for image classification Integrate image classification
This commit is contained in:
commit
9696a421fc
@ -43,4 +43,6 @@ public class RedactionLogEntry {
|
||||
private int startOffset;
|
||||
private int endOffset;
|
||||
|
||||
private boolean isImage;
|
||||
|
||||
}
|
||||
|
||||
@ -1,8 +1,8 @@
|
||||
package com.iqser.red.service.redaction.v1.server.classification.model;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.List;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Rectangle;
|
||||
|
||||
@ -17,7 +17,7 @@ public class Page {
|
||||
@NonNull
|
||||
private List<AbstractTextContainer> textBlocks;
|
||||
|
||||
private List<Rectangle2D> imageBounds;
|
||||
private List<PdfImage> images;
|
||||
|
||||
private Rectangle bodyTextFrame;
|
||||
|
||||
@ -31,7 +31,9 @@ public class Page {
|
||||
private StringFrequencyCounter fontCounter = new StringFrequencyCounter();
|
||||
private StringFrequencyCounter fontStyleCounter = new StringFrequencyCounter();
|
||||
|
||||
|
||||
public boolean isRotated() {
|
||||
|
||||
return rotation != 0;
|
||||
}
|
||||
|
||||
|
||||
@ -0,0 +1,15 @@
|
||||
package com.iqser.red.service.redaction.v1.server.client;
|
||||
|
||||
import org.springframework.cloud.openfeign.FeignClient;
|
||||
import org.springframework.http.MediaType;
|
||||
import org.springframework.web.bind.annotation.PostMapping;
|
||||
import org.springframework.web.bind.annotation.RequestBody;
|
||||
import org.springframework.web.multipart.MultipartFile;
|
||||
|
||||
@FeignClient(name = "ImageClassificationResource", url = "${image-service.url}")
|
||||
public interface ImageClassificationClient {
|
||||
|
||||
@PostMapping(value = "/process_full_img", consumes = MediaType.MULTIPART_FORM_DATA_VALUE, produces = MediaType.APPLICATION_JSON_VALUE)
|
||||
ImageClassificationResponse classify(@RequestBody MultipartFile file);
|
||||
|
||||
}
|
||||
@ -0,0 +1,13 @@
|
||||
package com.iqser.red.service.redaction.v1.server.client;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
@Data
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
public class ImageClassificationResponse {
|
||||
|
||||
private String category;
|
||||
}
|
||||
@ -0,0 +1,102 @@
|
||||
package com.iqser.red.service.redaction.v1.server.client;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
|
||||
import org.springframework.lang.NonNull;
|
||||
import org.springframework.lang.Nullable;
|
||||
import org.springframework.util.Assert;
|
||||
import org.springframework.util.FileCopyUtils;
|
||||
import org.springframework.web.multipart.MultipartFile;
|
||||
|
||||
public class MockMultipartFile implements MultipartFile {
|
||||
|
||||
private final String name;
|
||||
private final String originalFilename;
|
||||
@Nullable
|
||||
private final String contentType;
|
||||
private final byte[] content;
|
||||
|
||||
|
||||
public MockMultipartFile(String name, @Nullable byte[] content) {
|
||||
|
||||
this(name, "", (String) null, (byte[]) content);
|
||||
}
|
||||
|
||||
|
||||
public MockMultipartFile(String name, InputStream contentStream) throws IOException {
|
||||
|
||||
this(name, "", (String) null, (byte[]) FileCopyUtils.copyToByteArray(contentStream));
|
||||
}
|
||||
|
||||
|
||||
public MockMultipartFile(String name, @Nullable String originalFilename, @Nullable String contentType,
|
||||
@Nullable byte[] content) {
|
||||
|
||||
Assert.hasLength(name, "Name must not be empty");
|
||||
this.name = name;
|
||||
this.originalFilename = originalFilename != null ? originalFilename : "";
|
||||
this.contentType = contentType;
|
||||
this.content = content != null ? content : new byte[0];
|
||||
}
|
||||
|
||||
|
||||
public MockMultipartFile(String name, @Nullable String originalFilename, @Nullable String contentType,
|
||||
InputStream contentStream) throws IOException {
|
||||
|
||||
this(name, originalFilename, contentType, FileCopyUtils.copyToByteArray(contentStream));
|
||||
}
|
||||
|
||||
|
||||
public String getName() {
|
||||
|
||||
return this.name;
|
||||
}
|
||||
|
||||
|
||||
@NonNull
|
||||
public String getOriginalFilename() {
|
||||
|
||||
return this.originalFilename;
|
||||
}
|
||||
|
||||
|
||||
@Nullable
|
||||
public String getContentType() {
|
||||
|
||||
return this.contentType;
|
||||
}
|
||||
|
||||
|
||||
public boolean isEmpty() {
|
||||
|
||||
return this.content.length == 0;
|
||||
}
|
||||
|
||||
|
||||
public long getSize() {
|
||||
|
||||
return (long) this.content.length;
|
||||
}
|
||||
|
||||
|
||||
public byte[] getBytes() throws IOException {
|
||||
|
||||
return this.content;
|
||||
}
|
||||
|
||||
|
||||
public InputStream getInputStream() throws IOException {
|
||||
|
||||
return new ByteArrayInputStream(this.content);
|
||||
}
|
||||
|
||||
|
||||
public void transferTo(File dest) throws IOException, IllegalStateException {
|
||||
|
||||
FileCopyUtils.copy(this.content, dest);
|
||||
}
|
||||
|
||||
}
|
||||
@ -20,6 +20,7 @@ import com.iqser.red.service.redaction.v1.server.redaction.service.AnnotationSer
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.service.DictionaryService;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.service.DroolsExecutionService;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.service.EntityRedactionService;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.service.ImageClassificationService;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.service.ReanalyzeService;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.service.RedactionLogCreatorService;
|
||||
import com.iqser.red.service.redaction.v1.server.segmentation.PdfSegmentationService;
|
||||
@ -53,6 +54,7 @@ public class RedactionController implements RedactionResource {
|
||||
private final DictionaryService dictionaryService;
|
||||
private final AnnotationService annotationService;
|
||||
private final ReanalyzeService reanalyzeService;
|
||||
private final ImageClassificationService imageClassificationService;
|
||||
|
||||
|
||||
@Override
|
||||
@ -66,6 +68,7 @@ public class RedactionController implements RedactionResource {
|
||||
log.info("Document structure analysis successful, starting redaction analysis...");
|
||||
|
||||
entityRedactionService.processDocument(classifiedDoc, analyzeRequest.getRuleSetId(), analyzeRequest.getManualRedactions());
|
||||
imageClassificationService.classifyImages(classifiedDoc);
|
||||
redactionLogCreatorService.createRedactionLog(classifiedDoc, pdDocument.getNumberOfPages(), analyzeRequest.getManualRedactions(), analyzeRequest
|
||||
.getRuleSetId());
|
||||
|
||||
|
||||
@ -41,6 +41,7 @@ import org.apache.pdfbox.text.TextPosition;
|
||||
import org.apache.pdfbox.util.Matrix;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Ruling;
|
||||
|
||||
import lombok.Getter;
|
||||
@ -58,7 +59,7 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
|
||||
@Getter
|
||||
private int maxCharWidth;
|
||||
|
||||
|
||||
@Getter
|
||||
private int minCharHeight;
|
||||
|
||||
@ -74,7 +75,7 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
private final List<Ruling> graphicsPath = new ArrayList<>();
|
||||
|
||||
@Getter
|
||||
private List<Rectangle2D> imageBounds = new ArrayList<>();
|
||||
private List<PdfImage> images = new ArrayList<>();
|
||||
|
||||
private float path_x;
|
||||
private float path_y;
|
||||
@ -222,7 +223,7 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
.getWidth(), (float) imageBounds.getHeight());
|
||||
|
||||
if (rect.getHeight() > 2 && rect.getWidth() > 2) {
|
||||
this.imageBounds.add(rect);
|
||||
this.images.add(new PdfImage(pdfImage.getImage(), rect));
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
@ -358,7 +359,7 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
minCharHeight = Integer.MAX_VALUE;
|
||||
maxCharHeight = 0;
|
||||
textPositionSequences.clear();
|
||||
imageBounds = new ArrayList<>();
|
||||
images = new ArrayList<>();
|
||||
rulings.clear();
|
||||
graphicsPath.clear();
|
||||
path_x = 0.0f;
|
||||
|
||||
@ -1,24 +0,0 @@
|
||||
package com.iqser.red.service.redaction.v1.server.parsing.model;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.List;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Ruling;
|
||||
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
public class ParsedElements {
|
||||
|
||||
private List<TextPositionSequence> sequences;
|
||||
private List<Ruling> rulings;
|
||||
private List<Rectangle2D> imageBounds;
|
||||
|
||||
private boolean landscape;
|
||||
private boolean rotated;
|
||||
|
||||
private float minCharWidth;
|
||||
private float maxCharWidth;
|
||||
}
|
||||
@ -0,0 +1,5 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.model;
|
||||
|
||||
/**
 * Categories an image found in a PDF can be assigned to.
 */
public enum ImageType {

    /** Image classified as a logo. */
    LOGO,

    /** Image classified as a formula. */
    FORMULA,

    /** Image classified as a signature. */
    SIGNATURE,

    /** Image that falls into none of the specific categories. */
    OTHER,

    /** Image whose area contains extracted text. */
    OCR

}
|
||||
@ -0,0 +1,24 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.model;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.awt.image.BufferedImage;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.NonNull;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
|
||||
@Data
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
@RequiredArgsConstructor
|
||||
public class PdfImage {
|
||||
|
||||
@NonNull
|
||||
private BufferedImage image;
|
||||
@NonNull
|
||||
private Rectangle2D position;
|
||||
private ImageType imageType;
|
||||
|
||||
}
|
||||
@ -0,0 +1,62 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.service;
|
||||
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.IOException;
|
||||
|
||||
import javax.imageio.ImageIO;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
|
||||
import com.iqser.red.service.redaction.v1.server.client.ImageClassificationClient;
|
||||
import com.iqser.red.service.redaction.v1.server.client.ImageClassificationResponse;
|
||||
import com.iqser.red.service.redaction.v1.server.client.MockMultipartFile;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.ImageType;
|
||||
import com.iqser.red.service.redaction.v1.server.settings.RedactionServiceSettings;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
public class ImageClassificationService {
|
||||
|
||||
private final ImageClassificationClient imageClassificationClient;
|
||||
private final RedactionServiceSettings settings;
|
||||
|
||||
|
||||
public void classifyImages(Document classifiedDoc) {
|
||||
|
||||
long start = System.currentTimeMillis();
|
||||
classifiedDoc.getPages().forEach(page -> {
|
||||
page.getImages().forEach(image -> {
|
||||
|
||||
if(settings.isEnableImageClassification()) {
|
||||
try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
|
||||
ImageIO.write(image.getImage(), "png", baos);
|
||||
ImageClassificationResponse response = imageClassificationClient.classify(new MockMultipartFile("file", "Image.png", "image/png", baos
|
||||
.toByteArray()));
|
||||
image.setImageType(ImageType.valueOf(response.getCategory()));
|
||||
|
||||
} catch (IOException e) {
|
||||
log.error("Could not classify image", e);
|
||||
}
|
||||
} else {
|
||||
image.setImageType(ImageType.OTHER);
|
||||
}
|
||||
|
||||
if (image.getImageType().equals(ImageType.OTHER)) {
|
||||
page.getTextBlocks().forEach(textblock -> {
|
||||
if (image.getPosition()
|
||||
.contains(textblock.getMinX(), textblock.getMinY(), textblock.getWidth(), textblock.getHeight())) {
|
||||
image.setImageType(ImageType.OCR);
|
||||
}
|
||||
});
|
||||
}
|
||||
});
|
||||
});
|
||||
log.info("Image classification took: " + (System.currentTimeMillis() - start));
|
||||
}
|
||||
|
||||
}
|
||||
@ -248,7 +248,7 @@ public class ReanalyzeService {
|
||||
Iterator<RedactionLogEntry> itty = renalyzeRequest.getRedactionLog().getRedactionLogEntry().iterator();
|
||||
while (itty.hasNext()) {
|
||||
RedactionLogEntry entry = itty.next();
|
||||
if (sectionsToReanaylse.contains(entry.getSectionNumber()) && !entry.getType().equals("image") || entry.getSectionNumber() == 0 && !entry.getType().equals("image")) {
|
||||
if (sectionsToReanaylse.contains(entry.getSectionNumber()) && !entry.isImage() || entry.getSectionNumber() == 0 && !entry.isImage()) {
|
||||
itty.remove();
|
||||
}
|
||||
}
|
||||
|
||||
@ -1,9 +1,9 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.service;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
@ -30,6 +30,8 @@ import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionSequence;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.ImageType;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.IdBuilder;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
|
||||
@ -41,8 +43,6 @@ import lombok.RequiredArgsConstructor;
|
||||
@RequiredArgsConstructor
|
||||
public class RedactionLogCreatorService {
|
||||
|
||||
private static final String IMAGE = "image";
|
||||
|
||||
private final DictionaryService dictionaryService;
|
||||
|
||||
|
||||
@ -56,14 +56,16 @@ public class RedactionLogCreatorService {
|
||||
addSectionGrid(classifiedDoc, page);
|
||||
|
||||
if (classifiedDoc.getEntities().get(page) != null) {
|
||||
classifiedDoc.getRedactionLogEntities().addAll(addEntries(classifiedDoc.getEntities(), manualRedactions, page, ruleSetId));
|
||||
classifiedDoc.getRedactionLogEntities()
|
||||
.addAll(addEntries(classifiedDoc.getEntities(), manualRedactions, page, ruleSetId));
|
||||
}
|
||||
|
||||
if (manualRedactionPages.contains(page)) {
|
||||
classifiedDoc.getRedactionLogEntities().addAll(addManualAddEntries(manualRedactions.getEntriesToAdd(), manualRedactions.getComments(), page, ruleSetId));
|
||||
classifiedDoc.getRedactionLogEntities()
|
||||
.addAll(addManualAddEntries(manualRedactions.getEntriesToAdd(), manualRedactions.getComments(), page, ruleSetId));
|
||||
}
|
||||
|
||||
if (!classifiedDoc.getPages().get(page - 1).getImageBounds().isEmpty()) {
|
||||
if (!classifiedDoc.getPages().get(page - 1).getImages().isEmpty()) {
|
||||
addImageEntries(classifiedDoc, page, ruleSetId);
|
||||
}
|
||||
}
|
||||
@ -72,24 +74,41 @@ public class RedactionLogCreatorService {
|
||||
|
||||
private void addImageEntries(Document classifiedDoc, int pageNumber, String ruleSetId) {
|
||||
|
||||
for (Rectangle2D imageBounds : classifiedDoc.getPages().get(pageNumber - 1).getImageBounds()) {
|
||||
for (PdfImage image : classifiedDoc.getPages().get(pageNumber - 1).getImages()) {
|
||||
RedactionLogEntry redactionLogEntry = RedactionLogEntry.builder()
|
||||
.id(IdBuilder.buildId(imageBounds, pageNumber))
|
||||
.color(getColor(IMAGE, ruleSetId))
|
||||
.type(IMAGE)
|
||||
.redacted(false)
|
||||
.isHint(true)
|
||||
.id(IdBuilder.buildId(image.getPosition(), pageNumber))
|
||||
.color(getColor(image.getImageType().name().toLowerCase(Locale.ROOT), ruleSetId))
|
||||
.isImage(true)
|
||||
.type(image.getImageType().equals(ImageType.OTHER) ? "image" : image.getImageType().name().toLowerCase(Locale.ROOT))
|
||||
.redacted(isImageRedactionType(image.getImageType()))
|
||||
.isHint(!isImageRedactionType(image.getImageType()))
|
||||
.manual(false)
|
||||
.isDictionaryEntry(false)
|
||||
.isRecommendation(false)
|
||||
.positions(List.of(new Rectangle(new Point((float) imageBounds.getX(), (float) imageBounds.getY()), (float) imageBounds
|
||||
.getWidth(), (float) imageBounds.getHeight(), pageNumber)))
|
||||
.positions(List.of(new Rectangle(new Point((float) image.getPosition()
|
||||
.getX(), (float) image.getPosition().getY()), (float) image.getPosition()
|
||||
.getWidth(), (float) image.getPosition().getHeight(), pageNumber)))
|
||||
.build();
|
||||
classifiedDoc.getRedactionLogEntities().add(redactionLogEntry);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private boolean isImageRedactionType(ImageType imageType) {
|
||||
|
||||
if (imageType.equals(ImageType.LOGO)) {
|
||||
return true;
|
||||
}
|
||||
if (imageType.equals(ImageType.FORMULA)) {
|
||||
return true;
|
||||
}
|
||||
if (imageType.equals(ImageType.SIGNATURE)) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
private Set<Integer> getManualRedactionPages(ManualRedactions manualRedactions) {
|
||||
|
||||
Set<Integer> manualRedactionPages = new HashSet<>();
|
||||
@ -107,7 +126,8 @@ public class RedactionLogCreatorService {
|
||||
}
|
||||
|
||||
|
||||
public List<RedactionLogEntry> addEntries(Map<Integer, List<Entity>> entities, ManualRedactions manualRedactions, int page, String ruleSetId) {
|
||||
public List<RedactionLogEntry> addEntries(Map<Integer, List<Entity>> entities, ManualRedactions manualRedactions,
|
||||
int page, String ruleSetId) {
|
||||
|
||||
List<RedactionLogEntry> redactionLogEntities = new ArrayList<>();
|
||||
|
||||
@ -238,8 +258,9 @@ public class RedactionLogCreatorService {
|
||||
}
|
||||
|
||||
|
||||
public List<RedactionLogEntry> addManualAddEntries(Set<ManualRedactionEntry> manualAdds, Map<String, List<Comment>> comments, int page,
|
||||
String ruleSetId) {
|
||||
public List<RedactionLogEntry> addManualAddEntries(Set<ManualRedactionEntry> manualAdds,
|
||||
Map<String, List<Comment>> comments, int page,
|
||||
String ruleSetId) {
|
||||
|
||||
List<RedactionLogEntry> redactionLogEntities = new ArrayList<>();
|
||||
|
||||
|
||||
@ -15,7 +15,6 @@ import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.service.BlockificationService;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.service.ClassificationService;
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.PDFLinesTextStripper;
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.model.ParsedElements;
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.CleanRulings;
|
||||
@ -57,19 +56,10 @@ public class PdfSegmentationService {
|
||||
int rotation = pdPage.getRotation();
|
||||
boolean isRotated = rotation != 0 && rotation != 360;
|
||||
|
||||
ParsedElements parsedElements = ParsedElements.builder()
|
||||
.rulings(stripper.getRulings())
|
||||
.sequences(stripper.getTextPositionSequences())
|
||||
.imageBounds(stripper.getImageBounds())
|
||||
.minCharWidth(stripper.getMinCharWidth())
|
||||
.maxCharWidth(stripper.getMaxCharWidth())
|
||||
.landscape(isLandscape)
|
||||
.rotated(isRotated)
|
||||
.build();
|
||||
CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(stripper.getRulings(), stripper.getMinCharWidth(), stripper
|
||||
.getMaxCharHeight());
|
||||
|
||||
CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(parsedElements.getRulings(), stripper.getMinCharWidth(), stripper.getMaxCharHeight());
|
||||
|
||||
Page page = blockificationService.blockify(parsedElements.getSequences(), cleanRulings.getHorizontal(), cleanRulings
|
||||
Page page = blockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings
|
||||
.getVertical());
|
||||
page.setRotation(rotation);
|
||||
|
||||
@ -77,11 +67,11 @@ public class PdfSegmentationService {
|
||||
|
||||
buildPageStatistics(page);
|
||||
|
||||
page.setLandscape(parsedElements.isLandscape() || parsedElements.isRotated());
|
||||
page.setLandscape(isLandscape || isRotated);
|
||||
|
||||
page.setPageNumber(pageNumber);
|
||||
increaseDocumentStatistics(page, document);
|
||||
page.setImageBounds(parsedElements.getImageBounds());
|
||||
page.setImages(stripper.getImages());
|
||||
pages.add(page);
|
||||
}
|
||||
|
||||
|
||||
@ -12,4 +12,6 @@ public class RedactionServiceSettings {
|
||||
|
||||
private int surroundingWordsOffsetWindow = 100;
|
||||
|
||||
private boolean enableImageClassification = true;
|
||||
|
||||
}
|
||||
@ -2,6 +2,7 @@ info:
|
||||
description: Redaction Service Server V1
|
||||
|
||||
configuration-service.url: "http://configuration-service-v1:8080"
|
||||
image-service.url: "http://image-service-v1:8080"
|
||||
|
||||
server:
|
||||
port: 8080
|
||||
|
||||
@ -83,6 +83,7 @@ import com.iqser.red.service.redaction.v1.model.Status;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Paragraph;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.client.DictionaryClient;
|
||||
import com.iqser.red.service.redaction.v1.server.client.ImageClassificationClient;
|
||||
import com.iqser.red.service.redaction.v1.server.client.RulesClient;
|
||||
import com.iqser.red.service.redaction.v1.server.controller.RedactionController;
|
||||
import com.iqser.red.service.redaction.v1.server.exception.RedactionException;
|
||||
@ -131,6 +132,9 @@ public class RedactionIntegrationTest {
|
||||
@MockBean
|
||||
private DictionaryClient dictionaryClient;
|
||||
|
||||
@MockBean
|
||||
private ImageClassificationClient imageClassificationClient;
|
||||
|
||||
private final Map<String, List<String>> dictionary = new HashMap<>();
|
||||
private final Map<String, String> typeColorMap = new HashMap<>();
|
||||
private final Map<String, Boolean> hintTypeMap = new HashMap<>();
|
||||
@ -412,7 +416,8 @@ public class RedactionIntegrationTest {
|
||||
}
|
||||
|
||||
|
||||
private List<DictionaryEntry> toDictionaryEntry(List<String> entries){
|
||||
private List<DictionaryEntry> toDictionaryEntry(List<String> entries) {
|
||||
|
||||
List<DictionaryEntry> dictionaryEntries = new ArrayList<>();
|
||||
entries.forEach(entry -> {
|
||||
dictionaryEntries.add(new DictionaryEntry(entry, reanlysisVersions.containsKey(entry) ? reanlysisVersions.get(entry) : 0L, false));
|
||||
@ -450,7 +455,6 @@ public class RedactionIntegrationTest {
|
||||
assertThat(entry.getValue().size()).isEqualTo(1);
|
||||
});
|
||||
|
||||
|
||||
dictionary.get(AUTHOR).add("Drinking water");
|
||||
when(dictionaryClient.getVersion(TEST_RULESET_ID)).thenReturn(1L);
|
||||
|
||||
@ -498,7 +502,7 @@ public class RedactionIntegrationTest {
|
||||
|
||||
System.out.println("redactionTest");
|
||||
long start = System.currentTimeMillis();
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Metolachlor/S-Metolachlor_RAR_13_Volume_3CP_A9396G_B-1_2018-09-06.pdf");
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/new/Single Study - Oral (Gavage) Mouse.pdf");
|
||||
|
||||
AnalyzeRequest request = AnalyzeRequest.builder()
|
||||
.ruleSetId(TEST_RULESET_ID)
|
||||
@ -507,6 +511,12 @@ public class RedactionIntegrationTest {
|
||||
|
||||
AnalyzeResult result = redactionController.analyze(request);
|
||||
|
||||
result.getRedactionLog().getRedactionLogEntry().forEach(entry -> {
|
||||
if (entry.isImage()) {
|
||||
System.out.println("---->" + entry.getType());
|
||||
}
|
||||
});
|
||||
|
||||
long end = System.currentTimeMillis();
|
||||
|
||||
System.out.println("first analysis duration: " + (end - start));
|
||||
@ -519,7 +529,7 @@ public class RedactionIntegrationTest {
|
||||
loop:
|
||||
for (RedactionLogEntry redactionLogEntry : result.getRedactionLog().getRedactionLogEntry()) {
|
||||
for (SectionText sectionText : result.getText().getSectionTexts()) {
|
||||
if (redactionLogEntry.getType().equals("image")) {
|
||||
if (redactionLogEntry.isImage()) {
|
||||
correctFound++;
|
||||
continue loop;
|
||||
}
|
||||
@ -536,7 +546,6 @@ public class RedactionIntegrationTest {
|
||||
}
|
||||
assertThat(correctFound).isEqualTo(result.getRedactionLog().getRedactionLogEntry().size());
|
||||
|
||||
|
||||
dictionary.get(AUTHOR).add("properties");
|
||||
reanlysisVersions.put("properties", 1L);
|
||||
|
||||
@ -575,127 +584,6 @@ public class RedactionIntegrationTest {
|
||||
}
|
||||
|
||||
|
||||
|
||||
@Test
|
||||
@Ignore
|
||||
public void fillRecanTest() throws IOException {
|
||||
|
||||
System.out.println("redactionTest");
|
||||
long start = System.currentTimeMillis();
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/S5.pdf");
|
||||
|
||||
AnalyzeRequest request = AnalyzeRequest.builder()
|
||||
.ruleSetId(TEST_RULESET_ID)
|
||||
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
|
||||
.build();
|
||||
|
||||
AnalyzeResult result = redactionController.analyze(request);
|
||||
|
||||
AnnotateResponse annotateResponse = redactionController.annotate(AnnotateRequest.builder()
|
||||
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
|
||||
.redactionLog(result.getRedactionLog())
|
||||
.sectionGrid(result.getSectionGrid())
|
||||
.build());
|
||||
|
||||
try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Annotated.pdf")) {
|
||||
fileOutputStream.write(annotateResponse.getDocument());
|
||||
}
|
||||
|
||||
try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Test.json")) {
|
||||
fileOutputStream.write(objectMapper.writeValueAsBytes(result.getText()));
|
||||
}
|
||||
|
||||
int correctFound = 0;
|
||||
loop:
|
||||
for (RedactionLogEntry redactionLogEntry : result.getRedactionLog().getRedactionLogEntry()) {
|
||||
for (SectionText sectionText : result.getText().getSectionTexts()) {
|
||||
if (redactionLogEntry.getType().equals("image")) {
|
||||
correctFound++;
|
||||
continue loop;
|
||||
}
|
||||
if (redactionLogEntry.getSectionNumber() == sectionText.getSectionNumber()) {
|
||||
String value = sectionText.getText()
|
||||
.substring(redactionLogEntry.getStartOffset(), redactionLogEntry.getEndOffset());
|
||||
if (redactionLogEntry.getValue().equalsIgnoreCase(value)) {
|
||||
correctFound++;
|
||||
} else {
|
||||
throw new RuntimeException("WTF");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
assertThat(correctFound).isEqualTo(result.getRedactionLog().getRedactionLogEntry().size());
|
||||
|
||||
System.out.println("correctFound " + correctFound);
|
||||
|
||||
long end = System.currentTimeMillis();
|
||||
|
||||
System.out.println("duration: " + (end - start));
|
||||
System.out.println("numberOfPages: " + result.getNumberOfPages());
|
||||
|
||||
SectionArea sectionArea = result.getText().getSectionTexts().get(3).getSectionAreas().get(5);
|
||||
|
||||
try (PDDocument pdDocument = PDDocument.load(new ByteArrayInputStream(IOUtils.toByteArray(pdfFileResource.getInputStream())))) {
|
||||
|
||||
PDPage docPage = pdDocument.getPage(0);
|
||||
|
||||
PDFTextStripperByArea textStripper = new PDFTextStripperByArea();
|
||||
|
||||
PDRectangle cropBox = docPage.getCropBox();
|
||||
PDRectangle mediaBox = docPage.getMediaBox();
|
||||
|
||||
|
||||
// if (textPositions.get(0).getRotation() == 90) {
|
||||
// posXEnd = textPositions.get(0).getYDirAdj() + 2;
|
||||
// posYInit = getY1();
|
||||
// posYEnd = textPositions.get(textPositions.size() - 1).getXDirAdj() - height + 4;
|
||||
// } else {
|
||||
// posXEnd = textPositions.get(textPositions.size() - 1)
|
||||
// .getXDirAdj() + textPositions.get(textPositions.size() - 1).getWidth() + 1;
|
||||
// posYInit = textPositions.get(0).getPageHeight() - textPositions.get(0).getYDirAdj() - 2;
|
||||
// posYEnd = textPositions.get(0).getPageHeight() - textPositions.get(textPositions.size() - 1)
|
||||
// .getYDirAdj() + 2;
|
||||
// }
|
||||
|
||||
|
||||
Rectangle2D rect = new Rectangle2D.Float(sectionArea.getTopLeft()
|
||||
.getY(), sectionArea.getTopLeft()
|
||||
.getX() , sectionArea.getHeight(), sectionArea
|
||||
.getWidth() + 0.001f);
|
||||
|
||||
textStripper.addRegion("region", rect);
|
||||
|
||||
|
||||
|
||||
textStripper.extractRegions(docPage);
|
||||
|
||||
String textForRegion = textStripper.getTextForRegion("region");
|
||||
|
||||
System.out.println(textForRegion);
|
||||
|
||||
// fill a rectangle
|
||||
PDPageContentStream contents = new PDPageContentStream (pdDocument, docPage, PDPageContentStream.AppendMode.APPEND, false, false);
|
||||
contents.setNonStrokingColor (Color.RED);
|
||||
contents.addRect (sectionArea.getTopLeft().getX(), sectionArea.getTopLeft().getY(), sectionArea.getWidth(), sectionArea.getHeight());
|
||||
contents.fill ();
|
||||
contents.close ();
|
||||
try (ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream()) {
|
||||
pdDocument.save(byteArrayOutputStream);
|
||||
try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Annotated2.pdf")) {
|
||||
fileOutputStream.write(byteArrayOutputStream.toByteArray());
|
||||
}
|
||||
}
|
||||
|
||||
} catch (Exception e) {
|
||||
throw new RedactionException(e);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@Test
|
||||
public void testTableRedaction() throws IOException {
|
||||
|
||||
@ -782,7 +670,6 @@ public class RedactionIntegrationTest {
|
||||
.status(Status.APPROVED)
|
||||
.build()));
|
||||
|
||||
|
||||
ReanalyzeResult reanalyzeResult = redactionController.reanalyze(RenalyzeRequest.builder()
|
||||
.redactionLog(result.getRedactionLog())
|
||||
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
|
||||
@ -791,7 +678,6 @@ public class RedactionIntegrationTest {
|
||||
.ruleSetId(TEST_RULESET_ID)
|
||||
.build());
|
||||
|
||||
|
||||
AnnotateResponse annotateResponse = redactionController.annotate(AnnotateRequest.builder()
|
||||
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
|
||||
.redactionLog(reanalyzeResult.getRedactionLog())
|
||||
|
||||
@ -1,4 +1,5 @@
|
||||
configuration-service.url: "http://configuration-service-v1:8080"
|
||||
image-service.url: "http://image-service-v1:8080"
|
||||
|
||||
ribbon:
|
||||
ConnectTimeout: 600000
|
||||
@ -12,3 +13,6 @@ processing.kafkastreams: false
|
||||
|
||||
platform.multi-tenancy:
|
||||
enabled: false
|
||||
|
||||
redaction-service:
|
||||
enable-image-classification: false
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user