reworked reanalysis and text storage

This commit is contained in:
Timo 2021-04-20 09:51:50 +03:00
parent 5c2596e268
commit 1d4708ad13
18 changed files with 228 additions and 267 deletions

View File

@ -1,5 +1,6 @@
package com.iqser.red.service.redaction.v1.server.classification.model; package com.iqser.red.service.redaction.v1.server.classification.model;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText; import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
import lombok.AllArgsConstructor; import lombok.AllArgsConstructor;
import lombok.Data; import lombok.Data;
@ -12,7 +13,7 @@ public class Footer {
private List<TextBlock> textBlocks; private List<TextBlock> textBlocks;
@JsonIgnore
public SearchableText getSearchableText() { public SearchableText getSearchableText() {
SearchableText searchableText = new SearchableText(); SearchableText searchableText = new SearchableText();

View File

@ -1,5 +1,6 @@
package com.iqser.red.service.redaction.v1.server.classification.model; package com.iqser.red.service.redaction.v1.server.classification.model;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText; import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
import lombok.AllArgsConstructor; import lombok.AllArgsConstructor;
import lombok.Data; import lombok.Data;
@ -12,7 +13,7 @@ public class Header {
private List<TextBlock> textBlocks; private List<TextBlock> textBlocks;
@JsonIgnore
public SearchableText getSearchableText() { public SearchableText getSearchableText() {
SearchableText searchableText = new SearchableText(); SearchableText searchableText = new SearchableText();

View File

@ -1,5 +1,6 @@
package com.iqser.red.service.redaction.v1.server.classification.model; package com.iqser.red.service.redaction.v1.server.classification.model;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.iqser.red.service.redaction.v1.model.SectionArea; import com.iqser.red.service.redaction.v1.model.SectionArea;
import com.iqser.red.service.redaction.v1.server.redaction.model.CellValue; import com.iqser.red.service.redaction.v1.server.redaction.model.CellValue;
import com.iqser.red.service.redaction.v1.server.redaction.model.Image; import com.iqser.red.service.redaction.v1.server.redaction.model.Image;
@ -36,7 +37,7 @@ public class SectionText {
this.tabularData = tabularData; this.tabularData = tabularData;
} }
@JsonIgnore
public SearchableText getSearchableText() { public SearchableText getSearchableText() {
SearchableText searchableText = new SearchableText(); SearchableText searchableText = new SearchableText();

View File

@ -1,5 +1,6 @@
package com.iqser.red.service.redaction.v1.server.classification.model; package com.iqser.red.service.redaction.v1.server.classification.model;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence; import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities; import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer; import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
@ -118,6 +119,7 @@ public class TextBlock extends AbstractTextContainer {
} }
@Override @Override
@JsonIgnore
public String getText() { public String getText() {
StringBuilder sb = new StringBuilder(); StringBuilder sb = new StringBuilder();

View File

@ -1,5 +1,6 @@
package com.iqser.red.service.redaction.v1.server.classification.model; package com.iqser.red.service.redaction.v1.server.classification.model;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText; import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
import lombok.AllArgsConstructor; import lombok.AllArgsConstructor;
import lombok.Data; import lombok.Data;
@ -12,7 +13,7 @@ public class UnclassifiedText {
private List<TextBlock> textBlocks; private List<TextBlock> textBlocks;
@JsonIgnore
public SearchableText getSearchableText() { public SearchableText getSearchableText() {
SearchableText searchableText = new SearchableText(); SearchableText searchableText = new SearchableText();

View File

@ -1,10 +0,0 @@
package com.iqser.red.service.redaction.v1.server.parsing.model;
import lombok.Data;
/**
 * Minimal data holder wrapping a single {@code float[]}.
 *
 * <p>Accessors, {@code equals}/{@code hashCode} and {@code toString} are generated by Lombok's
 * {@code @Data}.
 */
@Data
public class RedMatrix {

    // NOTE(review): presumably the flattened values of a matrix — confirm against the producer.
    private float[] single;

}

View File

@ -1,35 +1,35 @@
package com.iqser.red.service.redaction.v1.server.parsing.model; package com.iqser.red.service.redaction.v1.server.parsing.model;
import com.fasterxml.jackson.annotation.JsonIgnore;
import lombok.Data; import lombok.Data;
import lombok.SneakyThrows; import lombok.SneakyThrows;
import org.apache.pdfbox.text.TextPosition; import org.apache.pdfbox.text.TextPosition;
import org.apache.pdfbox.util.Matrix;
import org.springframework.beans.BeanUtils; import org.springframework.beans.BeanUtils;
@Data @Data
public class RedTextPosition { public class RedTextPosition {
private Matrix textMatrix; private String textMatrix;
private float endX;
private float endY;
private float maxHeight;
private int rotation; private int rotation;
private float x;
private float y; private float y;
private float pageHeight; private float pageHeight;
private float pageWidth; private float pageWidth;
private float widthOfSpace;
private int[] charCodes;
private float fontSize;
private float fontSizePt;
private float[] widths;
private String unicode; private String unicode;
private float direction = -1.0F;
private float XDirAdj; private float XDirAdj;
private float YDirAdj; private float YDirAdj;
private float width; private float width;
private float heightDir; private float heightDir;
// not used in reanalysis
@JsonIgnore
private float widthOfSpace;
// not used in reanalysis
@JsonIgnore
private float fontSizeInPt; private float fontSizeInPt;
// not used in reanalysis
@JsonIgnore
private String fontName; private String fontName;
@ -39,10 +39,12 @@ public class RedTextPosition {
BeanUtils.copyProperties(textPosition, pos); BeanUtils.copyProperties(textPosition, pos);
pos.setFontName(textPosition.getFont().getName()); pos.setFontName(textPosition.getFont().getName());
pos.setCharCodes(textPosition.getCharacterCodes()); pos.setFontSizeInPt(textPosition.getFontSizeInPt());
pos.setWidths(textPosition.getIndividualWidths());
pos.setFontSizePt(textPosition.getFontSizeInPt()); pos.setTextMatrix(textPosition.getTextMatrix().toString());
return pos; return pos;
} }
} }

View File

@ -1,5 +1,7 @@
package com.iqser.red.service.redaction.v1.server.parsing.model; package com.iqser.red.service.redaction.v1.server.parsing.model;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
import com.iqser.red.service.redaction.v1.model.Point; import com.iqser.red.service.redaction.v1.model.Point;
import com.iqser.red.service.redaction.v1.model.Rectangle; import com.iqser.red.service.redaction.v1.model.Rectangle;
import lombok.Data; import lombok.Data;
@ -12,11 +14,14 @@ import java.util.stream.Collectors;
@Data @Data
@NoArgsConstructor @NoArgsConstructor
@JsonIgnoreProperties({ "empty" })
public class TextPositionSequence implements CharSequence { public class TextPositionSequence implements CharSequence {
private int page; private int page;
private List<RedTextPosition> textPositions = new ArrayList<>(); private List<RedTextPosition> textPositions = new ArrayList<>();
private float x1;
private float x2;
public TextPositionSequence(int page) { public TextPositionSequence(int page) {
this.page = page; this.page = page;
@ -38,9 +43,8 @@ public class TextPositionSequence implements CharSequence {
this.page = page; this.page = page;
} }
public void setTextPositions(List<TextPosition> textPositions) {
this.textPositions = textPositions.stream().map(RedTextPosition::fromTextPosition).collect(Collectors.toList());
}
@Override @Override
@ -103,6 +107,7 @@ public class TextPositionSequence implements CharSequence {
} }
@JsonIgnore
public float getX1() { public float getX1() {
if (textPositions.get(0).getRotation() == 90) { if (textPositions.get(0).getRotation() == 90) {
@ -113,6 +118,7 @@ public class TextPositionSequence implements CharSequence {
} }
@JsonIgnore
public float getX2() { public float getX2() {
if (textPositions.get(0).getRotation() == 90) { if (textPositions.get(0).getRotation() == 90) {
@ -123,13 +129,14 @@ public class TextPositionSequence implements CharSequence {
} }
} }
@JsonIgnore
public float getRotationAdjustedY() { public float getRotationAdjustedY() {
return textPositions.get(0).getY(); return textPositions.get(0).getY();
} }
@JsonIgnore
public float getY1() { public float getY1() {
if (textPositions.get(0).getRotation() == 90) { if (textPositions.get(0).getRotation() == 90) {
@ -140,6 +147,7 @@ public class TextPositionSequence implements CharSequence {
} }
@JsonIgnore
public float getY2() { public float getY2() {
if (textPositions.get(0).getRotation() == 90) { if (textPositions.get(0).getRotation() == 90) {
@ -150,26 +158,29 @@ public class TextPositionSequence implements CharSequence {
} }
@JsonIgnore
public float getTextHeight() { public float getTextHeight() {
return textPositions.get(0).getHeightDir() + 2; return textPositions.get(0).getHeightDir() + 2;
} }
@JsonIgnore
public float getHeight() { public float getHeight() {
return getY2() - getY1(); return getY2() - getY1();
} }
@JsonIgnore
public float getWidth() { public float getWidth() {
return getX2() - getX1(); return getX2() - getX1();
} }
@JsonIgnore
public String getFont() { public String getFont() {
return textPositions.get(0).getFontName() return textPositions.get(0).getFontName()
.toLowerCase() .toLowerCase()
.replaceAll(",bold", "") .replaceAll(",bold", "")
@ -177,6 +188,7 @@ public class TextPositionSequence implements CharSequence {
} }
@JsonIgnore
public String getFontStyle() { public String getFontStyle() {
String lowercaseFontName = textPositions.get(0).getFontName().toLowerCase(); String lowercaseFontName = textPositions.get(0).getFontName().toLowerCase();
@ -193,25 +205,25 @@ public class TextPositionSequence implements CharSequence {
} }
@JsonIgnore
public float getFontSize() { public float getFontSize() {
return textPositions.get(0).getFontSizeInPt(); return textPositions.get(0).getFontSizeInPt();
} }
@JsonIgnore
public float getSpaceWidth() { public float getSpaceWidth() {
return textPositions.get(0).getWidthOfSpace(); return textPositions.get(0).getWidthOfSpace();
} }
@JsonIgnore
public int getRotation() { public int getRotation() {
return textPositions.get(0).getRotation(); return textPositions.get(0).getRotation();
} }
@JsonIgnore
public Rectangle getRectangle() { public Rectangle getRectangle() {
float height = getTextHeight(); float height = getTextHeight();

View File

@ -5,8 +5,6 @@ import lombok.Builder;
import lombok.Data; import lombok.Data;
import lombok.NoArgsConstructor; import lombok.NoArgsConstructor;
import java.awt.geom.Rectangle2D;
@Data @Data
@Builder @Builder
@NoArgsConstructor @NoArgsConstructor
@ -14,7 +12,7 @@ import java.awt.geom.Rectangle2D;
public class Image { public class Image {
private String type; private String type;
private Rectangle2D position; private RedRectangle2D position;
private boolean redaction; private boolean redaction;
private String redactionReason; private String redactionReason;
private String legalBasis; private String legalBasis;

View File

@ -1,5 +1,6 @@
package com.iqser.red.service.redaction.v1.server.redaction.model; package com.iqser.red.service.redaction.v1.server.redaction.model;
import com.fasterxml.jackson.annotation.JsonIgnore;
import lombok.Data; import lombok.Data;
import lombok.NonNull; import lombok.NonNull;
import lombok.RequiredArgsConstructor; import lombok.RequiredArgsConstructor;
@ -11,9 +12,10 @@ import java.awt.image.BufferedImage;
@RequiredArgsConstructor @RequiredArgsConstructor
public class PdfImage { public class PdfImage {
@JsonIgnore
private BufferedImage image; private BufferedImage image;
@NonNull @NonNull
private Rectangle2D position; private RedRectangle2D position;
private ImageType imageType; private ImageType imageType;
private boolean isAppendedToParagraph; private boolean isAppendedToParagraph;
@ -22,7 +24,7 @@ public class PdfImage {
public PdfImage(BufferedImage image, Rectangle2D position, int page) { public PdfImage(BufferedImage image, Rectangle2D position, int page) {
this.image = image; this.image = image;
this.position = position; this.position = new RedRectangle2D(position.getX(), position.getY(), position.getWidth(), position.getHeight());
this.page = page; this.page = page;
} }

View File

@ -0,0 +1,35 @@
package com.iqser.red.service.redaction.v1.server.redaction.model;
import com.fasterxml.jackson.annotation.JsonIgnore;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * Plain rectangle value object described by its origin ({@code x}, {@code y}) and its
 * {@code width} and {@code height}.
 *
 * <p>Getters, setters, {@code equals}/{@code hashCode} and {@code toString} are generated by
 * Lombok; a no-args and an all-args constructor are generated as well.
 */
@Data
@NoArgsConstructor
@AllArgsConstructor
public class RedRectangle2D {

    private double x;
    private double y;
    private double width;
    private double height;


    /**
     * Tells whether this rectangle encloses no area.
     *
     * <p>Marked {@code @JsonIgnore} so that this derived value is not written as a JSON property.
     *
     * @return {@code true} if the width or the height is zero or negative
     */
    @JsonIgnore
    public boolean isEmpty() {

        return width <= 0.0 || height <= 0.0;
    }


    /**
     * Checks whether the given rectangular area lies completely inside this rectangle.
     *
     * @param x x coordinate of the origin of the area to test
     * @param y y coordinate of the origin of the area to test
     * @param w width of the area to test
     * @param h height of the area to test
     * @return {@code false} if this rectangle is empty or the given area has a non-positive
     *         width or height; otherwise {@code true} exactly when the area is fully enclosed
     */
    public boolean contains(double x, double y, double w, double h) {

        if (isEmpty()) {
            return false;
        }
        if (w <= 0 || h <= 0) {
            return false;
        }

        // Edges of this rectangle; the right and bottom edges are inclusive bounds.
        double left = getX();
        double top = getY();
        double right = left + getWidth();
        double bottom = top + getHeight();

        boolean originInside = x >= left && y >= top;
        boolean farCornerInside = (x + w) <= right && (y + h) <= bottom;
        return originInside && farCornerInside;
    }

}

View File

@ -187,6 +187,7 @@ public class EntityRedactionService {
.get(0) .get(0)
.getPage()); .getPage());
sectionText.getSectionAreas().add(sectionArea); sectionText.getSectionAreas().add(sectionArea);
sectionText.getTextBlocks().addAll(cell.getTextBlocks());
addSectionToManualRedactions(cell.getTextBlocks(), manualRedactions, table.getHeadline(), sectionNumber.intValue()); addSectionToManualRedactions(cell.getTextBlocks(), manualRedactions, table.getHeadline(), sectionNumber.intValue());
int cellStart = start; int cellStart = start;
@ -235,6 +236,8 @@ public class EntityRedactionService {
sectionText.setHeadline(table.getHeadline()); sectionText.setHeadline(table.getHeadline());
sectionText.setSectionNumber(sectionNumber.intValue()); sectionText.setSectionNumber(sectionNumber.intValue());
sectionText.setTable(true); sectionText.setTable(true);
sectionText.setTabularData(tabularData);
sectionText.setCellStarts(cellStarts);
classifiedDoc.getSectionText().add(sectionText); classifiedDoc.getSectionText().add(sectionText);
} }
@ -267,6 +270,7 @@ public class EntityRedactionService {
.getSequences() .getSequences()
.get(0) .get(0)
.getPage()); .getPage());
sectionText.getTextBlocks().addAll(cell.getTextBlocks());
sectionText.getSectionAreas().add(sectionArea); sectionText.getSectionAreas().add(sectionArea);
} }
@ -325,6 +329,10 @@ public class EntityRedactionService {
sectionText.setHeadline(headline); sectionText.setHeadline(headline);
sectionText.setSectionNumber(sectionNumber.intValue()); sectionText.setSectionNumber(sectionNumber.intValue());
sectionText.setTable(false); sectionText.setTable(false);
sectionText.setImages(images.stream()
.map(image -> convert(image, sectionNumber.intValue(), headline))
.collect(Collectors.toSet()));
sectionText.setTextBlocks(paragraphTextBlocks);
classifiedDoc.getSectionText().add(sectionText); classifiedDoc.getSectionText().add(sectionText);
} }

View File

@ -12,15 +12,12 @@ import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUti
import com.iqser.red.service.redaction.v1.server.segmentation.PdfSegmentationService; import com.iqser.red.service.redaction.v1.server.segmentation.PdfSegmentationService;
import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService; import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService;
import lombok.RequiredArgsConstructor; import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
import org.apache.pdfbox.io.MemoryUsageSetting;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.kie.api.runtime.KieContainer; import org.kie.api.runtime.KieContainer;
import org.springframework.stereotype.Service; import org.springframework.stereotype.Service;
import org.springframework.web.bind.annotation.RequestBody; import org.springframework.web.bind.annotation.RequestBody;
import java.awt.geom.Rectangle2D;
import java.io.IOException;
import java.util.*; import java.util.*;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import java.util.stream.Stream; import java.util.stream.Stream;
@ -42,7 +39,6 @@ public class ReanalyzeService {
public AnalyzeResult analyze(AnalyzeRequest analyzeRequest) { public AnalyzeResult analyze(AnalyzeRequest analyzeRequest) {
var pageCount = 0; var pageCount = 0;
Document classifiedDoc; Document classifiedDoc;
@ -56,18 +52,6 @@ public class ReanalyzeService {
log.info("Document structure analysis successful, starting redaction analysis..."); log.info("Document structure analysis successful, starting redaction analysis...");
entityRedactionService.processDocument(classifiedDoc, analyzeRequest.getRuleSetId(), analyzeRequest.getManualRedactions()); entityRedactionService.processDocument(classifiedDoc, analyzeRequest.getRuleSetId(), analyzeRequest.getManualRedactions());
var storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), FileType.ORIGIN));
// TODO move this to where it makes sense - or remove completely
try (PDDocument pdDocument = PDDocument.load(storedObjectStream, MemoryUsageSetting.setupTempFileOnly())) {
pdDocument.setAllSecurityToBeRemoved(true);
pdfSegmentationService.postProcessSections(pdDocument, classifiedDoc.getSectionText());
} catch (IOException e) {
e.printStackTrace();
}
redactionLogCreatorService.createRedactionLog(classifiedDoc, pageCount, analyzeRequest.getManualRedactions(), analyzeRequest redactionLogCreatorService.createRedactionLog(classifiedDoc, pageCount, analyzeRequest.getManualRedactions(), analyzeRequest
.getRuleSetId()); .getRuleSetId());
@ -89,30 +73,28 @@ public class ReanalyzeService {
return analyzeResponseService.createAnalyzeResponse(pageCount, redactionLog, changeLog); return analyzeResponseService.createAnalyzeResponse(pageCount, redactionLog, changeLog);
} }
public AnalyzeResult reanalyze(@RequestBody AnalyzeRequest renalyzeRequest) {
var text = redactionStorageService.getText(renalyzeRequest.getProjectId(), renalyzeRequest.getFileId()); @SneakyThrows
// new procedure was not applied, we need a complete analysis public AnalyzeResult reanalyze(@RequestBody AnalyzeRequest analyzeRequest) {
var redactionLog = redactionStorageService.getRedactionLog(analyzeRequest.getProjectId(), analyzeRequest.getFileId());
var text = redactionStorageService.getText(analyzeRequest.getProjectId(), analyzeRequest.getFileId());
// not yet ready for reanalysis
if (text.getNumberOfPages() == 0) { if (text.getNumberOfPages() == 0) {
return analyze(AnalyzeRequest.builder() return analyze(analyzeRequest);
.ruleSetId(renalyzeRequest.getRuleSetId())
.manualRedactions(renalyzeRequest.getManualRedactions())
.projectId(renalyzeRequest.getProjectId())
.fileId(renalyzeRequest.getFileId())
.build());
} }
var redactionLog = redactionStorageService.getRedactionLog(renalyzeRequest.getProjectId(), renalyzeRequest.getFileId());
DictionaryIncrement dictionaryIncrement = dictionaryService.getDictionaryIncrements(analyzeRequest.getRuleSetId(), redactionLog.getDictionaryVersion());
DictionaryIncrement dictionaryIncrement = dictionaryService.getDictionaryIncrements(renalyzeRequest.getRuleSetId(), redactionLog.getDictionaryVersion()); Set<String> manualForceAndRemoveIds = getForceAndRemoveIds(analyzeRequest.getManualRedactions());
Set<String> manualForceAndRemoveIds = getForceAndRemoveIds(renalyzeRequest.getManualRedactions());
Map<String, List<Comment>> comments = null; Map<String, List<Comment>> comments = null;
Set<ManualRedactionEntry> manualAdds = null; Set<ManualRedactionEntry> manualAdds = null;
if (renalyzeRequest.getManualRedactions() != null) { if (analyzeRequest.getManualRedactions() != null) {
// TODO comments will be removed from redactionLog, so we ignore this first. // TODO comments will be removed from redactionLog, so we ignore this first.
comments = renalyzeRequest.getManualRedactions().getComments(); comments = analyzeRequest.getManualRedactions().getComments();
manualAdds = renalyzeRequest.getManualRedactions().getEntriesToAdd(); manualAdds = analyzeRequest.getManualRedactions().getEntriesToAdd();
} }
Set<Integer> sectionsToReanalyse = new HashSet<>(); Set<Integer> sectionsToReanalyse = new HashSet<>();
@ -146,27 +128,29 @@ public class ReanalyzeService {
} }
} }
if (sectionsToReanalyse.isEmpty() && (manualAdds == null || manualAdds.isEmpty())) { if (sectionsToReanalyse.isEmpty() && (manualAdds == null || manualAdds.isEmpty())) {
redactionLog.setDictionaryVersion(dictionaryIncrement.getDictionaryVersion()); redactionLog.setDictionaryVersion(dictionaryIncrement.getDictionaryVersion());
var changeLog = redactionChangeLogService.createAndStoreChangeLog(renalyzeRequest.getProjectId(), renalyzeRequest.getFileId(), redactionLog); var changeLog = redactionChangeLogService.createAndStoreChangeLog(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), redactionLog);
redactionStorageService.storeObject(renalyzeRequest.getProjectId(), renalyzeRequest.getFileId(), FileType.REDACTION_LOG, redactionLog); redactionStorageService.storeObject(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), FileType.REDACTION_LOG, redactionLog);
return analyzeResponseService.createAnalyzeResponse(text.getNumberOfPages(), redactionLog, changeLog); return analyzeResponseService.createAnalyzeResponse(text.getNumberOfPages(), redactionLog, changeLog);
} }
try {
List<SectionText> reanalysisSections = new ArrayList<>(); List<SectionText> reanalysisSections = new ArrayList<>();
for (SectionText sectionText : text.getSectionTexts()) { for (SectionText sectionText : text.getSectionTexts()) {
if (sectionsToReanalyse.contains(sectionText.getSectionNumber())) { if (sectionsToReanalyse.contains(sectionText.getSectionNumber())) {
reanalysisSections.add(sectionText); reanalysisSections.add(sectionText);
} }
} }
KieContainer kieContainer = droolsExecutionService.updateRules(renalyzeRequest.getRuleSetId());
Dictionary dictionary = dictionaryService.getDeepCopyDictionary(renalyzeRequest.getRuleSetId()); //--
KieContainer kieContainer = droolsExecutionService.updateRules(analyzeRequest.getRuleSetId());
Dictionary dictionary = dictionaryService.getDeepCopyDictionary(analyzeRequest.getRuleSetId());
List<SectionSearchableTextPair> sectionSearchableTextPairs = new ArrayList<>(); List<SectionSearchableTextPair> sectionSearchableTextPairs = new ArrayList<>();
for (SectionText reanalysisSection : reanalysisSections) { for (SectionText reanalysisSection : reanalysisSections) {
@ -229,32 +213,28 @@ public class ReanalyzeService {
List<RedactionLogEntry> newRedactionLogEntries = new ArrayList<>(); List<RedactionLogEntry> newRedactionLogEntries = new ArrayList<>();
for (int page = 1; page <= text.getNumberOfPages(); page++) { for (int page = 1; page <= text.getNumberOfPages(); page++) {
if (entitiesPerPage.get(page) != null) { if (entitiesPerPage.get(page) != null) {
newRedactionLogEntries.addAll(redactionLogCreatorService.addEntries(entitiesPerPage, renalyzeRequest newRedactionLogEntries.addAll(redactionLogCreatorService.addEntries(entitiesPerPage, analyzeRequest
.getManualRedactions(), page, renalyzeRequest.getRuleSetId())); .getManualRedactions(), page, analyzeRequest.getRuleSetId()));
} }
if (imagesPerPage.get(page) != null) { if (imagesPerPage.get(page) != null) {
newRedactionLogEntries.addAll(redactionLogCreatorService.addImageEntries(imagesPerPage, renalyzeRequest newRedactionLogEntries.addAll(redactionLogCreatorService.addImageEntries(imagesPerPage, analyzeRequest
.getManualRedactions(), page, renalyzeRequest.getRuleSetId())); .getManualRedactions(), page, analyzeRequest.getRuleSetId()));
} }
newRedactionLogEntries.addAll(redactionLogCreatorService.addManualAddEntries(manualAdds, comments, page, renalyzeRequest newRedactionLogEntries.addAll(redactionLogCreatorService.addManualAddEntries(manualAdds, comments, page, analyzeRequest
.getRuleSetId())); .getRuleSetId()));
} }
redactionLog.getRedactionLogEntry().removeIf(entry -> sectionsToReanalyse.contains(entry.getSectionNumber()) && !entry.isImage() || entry.getSectionNumber() == 0 && !entry.isImage()); redactionLog.getRedactionLogEntry().removeIf(entry -> sectionsToReanalyse.contains(entry.getSectionNumber()) && !entry.isImage() || entry.getSectionNumber() == 0 && !entry.isImage());
redactionLog.getRedactionLogEntry().addAll(newRedactionLogEntries); redactionLog.getRedactionLogEntry().addAll(newRedactionLogEntries);
redactionLog.setDictionaryVersion(dictionaryIncrement.getDictionaryVersion()); redactionLog.setDictionaryVersion(dictionaryIncrement.getDictionaryVersion());
var changeLog = redactionChangeLogService.createAndStoreChangeLog(renalyzeRequest.getProjectId(), renalyzeRequest.getFileId(), redactionLog); var changeLog = redactionChangeLogService.createAndStoreChangeLog(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), redactionLog);
redactionStorageService.storeObject(renalyzeRequest.getProjectId(), renalyzeRequest.getFileId(), FileType.REDACTION_LOG, redactionLog); redactionStorageService.storeObject(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), FileType.REDACTION_LOG, redactionLog);
return analyzeResponseService.createAnalyzeResponse(text.getNumberOfPages(), redactionLog, changeLog); return analyzeResponseService.createAnalyzeResponse(text.getNumberOfPages(), redactionLog, changeLog);
} catch (Exception e) {
throw new RedactionException(e);
}
} }
@ -277,7 +257,7 @@ public class ReanalyzeService {
return Image.builder() return Image.builder()
.type(entry.getType()) .type(entry.getType())
.position(new Rectangle2D.Float(position.getTopLeft().getX(), position.getTopLeft() .position(new RedRectangle2D(position.getTopLeft().getX(), position.getTopLeft()
.getY(), position.getWidth(), position.getHeight())) .getY(), position.getWidth(), position.getHeight()))
.sectionNumber(entry.getSectionNumber()) .sectionNumber(entry.getSectionNumber())
.section(entry.getSection()) .section(entry.getSection())

View File

@ -3,9 +3,9 @@ package com.iqser.red.service.redaction.v1.server.redaction.utils;
import com.google.common.hash.HashFunction; import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing; import com.google.common.hash.Hashing;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence; import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.model.RedRectangle2D;
import lombok.experimental.UtilityClass; import lombok.experimental.UtilityClass;
import java.awt.geom.Rectangle2D;
import java.nio.charset.StandardCharsets; import java.nio.charset.StandardCharsets;
import java.util.List; import java.util.List;
@ -25,12 +25,8 @@ public class IdBuilder {
} }
public String buildId(Rectangle2D rectangle2D, int page) { public String buildId(RedRectangle2D rectangle2D, int page) {
return hashFunction.hashString("x" + rectangle2D.getX() + "y" + rectangle2D.getY() + "h" + rectangle2D.getHeight() + "w" + rectangle2D.getWidth() + "p" + page, StandardCharsets.UTF_8).toString();
StringBuilder sb = new StringBuilder();
sb.append("x").append(rectangle2D.getX()).append("y").append(rectangle2D.getY()).append("h").append(rectangle2D.getHeight()).append("w").append(rectangle2D.getWidth()).append("p").append(page);
return hashFunction.hashString(sb.toString(), StandardCharsets.UTF_8).toString();
} }

View File

@ -1,21 +1,15 @@
package com.iqser.red.service.redaction.v1.server.segmentation; package com.iqser.red.service.redaction.v1.server.segmentation;
import com.iqser.red.service.redaction.v1.model.SectionArea;
import com.iqser.red.service.redaction.v1.server.classification.model.Document; import com.iqser.red.service.redaction.v1.server.classification.model.Document;
import com.iqser.red.service.redaction.v1.server.classification.model.Page; import com.iqser.red.service.redaction.v1.server.classification.model.Page;
import com.iqser.red.service.redaction.v1.server.classification.model.SectionText;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock; import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.classification.service.BlockificationService; import com.iqser.red.service.redaction.v1.server.classification.service.BlockificationService;
import com.iqser.red.service.redaction.v1.server.classification.service.ClassificationService; import com.iqser.red.service.redaction.v1.server.classification.service.ClassificationService;
import com.iqser.red.service.redaction.v1.server.exception.RedactionException;
import com.iqser.red.service.redaction.v1.server.memory.MemoryStats; import com.iqser.red.service.redaction.v1.server.memory.MemoryStats;
import com.iqser.red.service.redaction.v1.server.parsing.PDFAreaTextStripper;
import com.iqser.red.service.redaction.v1.server.parsing.PDFLinesTextStripper; import com.iqser.red.service.redaction.v1.server.parsing.PDFLinesTextStripper;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence; import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.model.CellValue;
import com.iqser.red.service.redaction.v1.server.redaction.service.ImageClassificationService; import com.iqser.red.service.redaction.v1.server.redaction.service.ImageClassificationService;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer; import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.CleanRulings; import com.iqser.red.service.redaction.v1.server.tableextraction.model.CleanRulings;
import com.iqser.red.service.redaction.v1.server.tableextraction.service.RulingCleaningService; import com.iqser.red.service.redaction.v1.server.tableextraction.service.RulingCleaningService;
import com.iqser.red.service.redaction.v1.server.tableextraction.service.TableExtractionService; import com.iqser.red.service.redaction.v1.server.tableextraction.service.TableExtractionService;
@ -28,15 +22,12 @@ import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle; import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.springframework.stereotype.Service; import org.springframework.stereotype.Service;
import java.awt.geom.Rectangle2D;
import java.io.File; import java.io.File;
import java.io.FileOutputStream; import java.io.FileOutputStream;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map;
@Slf4j @Slf4j
@Service @Service
@ -53,79 +44,6 @@ public class PdfSegmentationService {
private final ImageClassificationService imageClassificationService; private final ImageClassificationService imageClassificationService;
/**
 * Re-extracts the text positions for every area of every given section from the PDF and stores
 * the resulting text blocks (and, for table sections, the tabular data and cell start offsets)
 * back on the {@link SectionText}.
 *
 * <p>Areas are grouped by page and the pages are processed in ascending page order, so the
 * resulting text blocks and cell starts are deterministic regardless of how many pages a
 * section spans.
 *
 * @param pdDocument the opened PDF the section areas refer to; pages are addressed 1-based by
 *                   the section areas and translated to PDFBox' 0-based index here
 * @param texts      the sections to post-process; each one is mutated in place
 * @throws RedactionException wrapping any exception raised while extracting text
 */
public void postProcessSections(PDDocument pdDocument, List<SectionText> texts) {

    try {
        for (SectionText sectionText : texts) {

            // A sorted map keeps pages in ascending order. A plain HashMap only happens to do so
            // while all page numbers fall into the same bucket window, which silently reorders
            // the text blocks of sections that span e.g. pages 15-17.
            Map<Integer, List<SectionArea>> sectionAreasPerPage = new java.util.TreeMap<>();
            for (SectionArea sectionArea : sectionText.getSectionAreas()) {
                sectionAreasPerPage.computeIfAbsent(sectionArea.getPage(), (x) -> new ArrayList<>())
                        .add(sectionArea);
            }

            List<TextBlock> textBlocks = new ArrayList<>();
            Map<String, CellValue> tabularData = new HashMap<>();
            List<Integer> cellStarts = new ArrayList<>();

            for (Map.Entry<Integer, List<SectionArea>> pageEntry : sectionAreasPerPage.entrySet()) {

                int page = pageEntry.getKey();
                PDPage pdPage = pdDocument.getPage(page - 1);
                PDRectangle cropBox = pdPage.getCropBox();

                PDFAreaTextStripper textStripper = new PDFAreaTextStripper();
                textStripper.setPageNumber(page);

                // NOTE(review): the running cell offset restarts at 0 for every page of the
                // section; confirm this is intended for tables that span several pages.
                int cellStart = 0;

                for (SectionArea sectionArea : pageEntry.getValue()) {

                    // The same region name is reused for every area, so each addRegion call
                    // replaces the previous region before extraction.
                    textStripper.addRegion(String.valueOf(1), toExtractionRegion(pdPage, cropBox, sectionArea));
                    textStripper.extractRegions(pdPage);
                    // Return value is unused; the call is kept as in the original in case the
                    // project's stripper relies on it — NOTE(review): confirm and remove if not.
                    textStripper.getTextForRegion(String.valueOf(1));

                    List<TextPositionSequence> positions = textStripper.getTextPositionSequences();

                    TextBlock textBlock = new TextBlock(sectionArea.getTopLeft().getX(), sectionArea.getTopLeft()
                            .getX() + sectionArea.getWidth(), sectionArea.getTopLeft()
                            .getY(), sectionArea.getTopLeft().getY() + sectionArea.getHeight(), positions, 0);

                    if (sectionText.isTable()) {
                        Cell cell = new Cell();
                        cell.addTextBlock(textBlock);
                        // Entries are keyed by the area's header; a later area with the same
                        // header overwrites the earlier one.
                        tabularData.put(sectionArea.getHeader(), new CellValue(cell.getTextBlocks(), cellStart));
                        cellStarts.add(cellStart);
                        cellStart = cellStart + cell.toString().trim().length() + 1;
                    }

                    textBlocks.add(textBlock);
                    // Reset the collected positions so the next area starts from a clean state.
                    textStripper.clearPositions();
                }
            }

            sectionText.setTextBlocks(textBlocks);
            sectionText.setTabularData(tabularData);
            if (sectionText.isTable()) {
                sectionText.setCellStarts(cellStarts);
            }
        }
    } catch (Exception e) {
        throw new RedactionException(e);
    }
}


/**
 * Builds the rectangle handed to the text stripper for one section area.
 *
 * <p>For pages rotated by 90 degrees the x/y coordinates and width/height are swapped. For all
 * other pages the y coordinate is derived from the crop box' upper-right y minus the area's
 * top-left y and height. In both cases 0.001 is added to the last dimension (presumably so
 * glyphs sitting exactly on the boundary are not dropped — TODO confirm).
 */
private Rectangle2D toExtractionRegion(PDPage pdPage, PDRectangle cropBox, SectionArea sectionArea) {

    if (pdPage.getRotation() == 90) {
        return new Rectangle2D.Float(sectionArea.getTopLeft().getY(), sectionArea.getTopLeft()
                .getX(), sectionArea.getHeight(), sectionArea.getWidth() + 0.001f);
    }
    return new Rectangle2D.Float(sectionArea.getTopLeft().getX(), -sectionArea.getTopLeft()
            .getY() + cropBox.getUpperRightY() - sectionArea.getHeight(), sectionArea.getWidth(), sectionArea
            .getHeight() + 0.001f);
}
public Document parseDocument(InputStream documentInputStream) throws IOException { public Document parseDocument(InputStream documentInputStream) throws IOException {
PDDocument pdDocument = null; PDDocument pdDocument = null;
try { try {
@ -141,6 +59,7 @@ public class PdfSegmentationService {
pdDocument = reinitializePDDocument(tempFile, null); pdDocument = reinitializePDDocument(tempFile, null);
long pageCount = pdDocument.getNumberOfPages(); long pageCount = pdDocument.getNumberOfPages();
long t1= System.currentTimeMillis();
for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++) { for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++) {
if (pageNumber % MAX_PAGES_BEFORE_GC == 0) { if (pageNumber % MAX_PAGES_BEFORE_GC == 0) {

View File

@ -1,5 +1,6 @@
package com.iqser.red.service.redaction.v1.server.tableextraction.model; package com.iqser.red.service.redaction.v1.server.tableextraction.model;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.iqser.red.service.redaction.v1.model.Rectangle; import com.iqser.red.service.redaction.v1.model.Rectangle;
import lombok.AllArgsConstructor; import lombok.AllArgsConstructor;
import lombok.Data; import lombok.Data;
@ -27,10 +28,12 @@ public abstract class AbstractTextContainer {
return page == other.getPage() && this.minX <= other.getTopLeft().getX() && this.maxX >= other.getTopLeft().getX() + other.getWidth() && this.minY <= other.getTopLeft().getY() && this.maxY >= other.getTopLeft().getY() + other.getHeight(); return page == other.getPage() && this.minX <= other.getTopLeft().getX() && this.maxX >= other.getTopLeft().getX() + other.getWidth() && this.minY <= other.getTopLeft().getY() && this.maxY >= other.getTopLeft().getY() + other.getHeight();
} }
/**
 * Returns the height of this container, computed as {@code maxY - minY}.
 * Marked {@code @JsonIgnore} so this derived value is not written by Jackson.
 */
@JsonIgnore
public float getHeight() { public float getHeight() {
return maxY - minY; return maxY - minY;
} }
/**
 * Returns the width of this container, computed as {@code maxX - minX}.
 * Marked {@code @JsonIgnore} so this derived value is not written by Jackson.
 */
@JsonIgnore
public float getWidth() { public float getWidth() {
return maxX - minX; return maxX - minX;
} }

View File

@ -44,7 +44,7 @@ public class FilySystemBackedStorageService extends StorageService {
/**
 * Visits every entry of {@code dataMap} and then empties the map.
 */
public void clearStorage() { public void clearStorage() {
this.dataMap.forEach((k, v) -> { this.dataMap.forEach((k, v) -> {
// NOTE(review): this change turns the active v.delete() into a commented-out call, so the
// values are no longer deleted before the map is cleared. If they are backing files they
// will be left behind — confirm this is intended and not a debugging leftover.
v.delete(); // v.delete();
}); });
this.dataMap.clear(); this.dataMap.clear();
} }

View File

@ -458,6 +458,16 @@ public class RedactionIntegrationTest {
assertThat(result).isNotNull(); assertThat(result).isNotNull();
} }
/**
 * Analyzes the S-Metolachlor RAR Volume 3CA B-6 document end to end: the file is put into
 * storage, memory statistics are printed before the run, and the analysis must yield a result.
 */
@Test
public void testXXX() {

    AnalyzeRequest analyzeRequest = prepareStorage("files/Metolachlor/S-Metolachlor_RAR_08_Volume_3CA_B-6_2018-09-06.pdf");

    // Print the memory baseline before the analysis is started.
    MemoryStats.printMemoryStats();

    assertThat(redactionController.analyze(analyzeRequest)).isNotNull();
}
@Test @Test
public void noExceptionShouldBeThrownForAnyFiles() throws IOException { public void noExceptionShouldBeThrownForAnyFiles() throws IOException {