reworked reanalysis and text storage

This commit is contained in:
Timo 2021-04-20 09:51:50 +03:00
parent 5c2596e268
commit 1d4708ad13
18 changed files with 228 additions and 267 deletions

View File

@ -1,5 +1,6 @@
package com.iqser.red.service.redaction.v1.server.classification.model;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
import lombok.AllArgsConstructor;
import lombok.Data;
@ -12,7 +13,7 @@ public class Footer {
private List<TextBlock> textBlocks;
@JsonIgnore
public SearchableText getSearchableText() {
SearchableText searchableText = new SearchableText();

View File

@ -1,5 +1,6 @@
package com.iqser.red.service.redaction.v1.server.classification.model;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
import lombok.AllArgsConstructor;
import lombok.Data;
@ -12,7 +13,7 @@ public class Header {
private List<TextBlock> textBlocks;
@JsonIgnore
public SearchableText getSearchableText() {
SearchableText searchableText = new SearchableText();

View File

@ -1,5 +1,6 @@
package com.iqser.red.service.redaction.v1.server.classification.model;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.iqser.red.service.redaction.v1.model.SectionArea;
import com.iqser.red.service.redaction.v1.server.redaction.model.CellValue;
import com.iqser.red.service.redaction.v1.server.redaction.model.Image;
@ -36,7 +37,7 @@ public class SectionText {
this.tabularData = tabularData;
}
@JsonIgnore
public SearchableText getSearchableText() {
SearchableText searchableText = new SearchableText();

View File

@ -1,5 +1,6 @@
package com.iqser.red.service.redaction.v1.server.classification.model;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
@ -118,6 +119,7 @@ public class TextBlock extends AbstractTextContainer {
}
@Override
@JsonIgnore
public String getText() {
StringBuilder sb = new StringBuilder();

View File

@ -1,5 +1,6 @@
package com.iqser.red.service.redaction.v1.server.classification.model;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
import lombok.AllArgsConstructor;
import lombok.Data;
@ -12,7 +13,7 @@ public class UnclassifiedText {
private List<TextBlock> textBlocks;
@JsonIgnore
public SearchableText getSearchableText() {
SearchableText searchableText = new SearchableText();

View File

@ -1,10 +0,0 @@
package com.iqser.red.service.redaction.v1.server.parsing.model;
import lombok.Data;
/**
 * Simple data holder wrapping a single float array.
 * Accessors, equals/hashCode and toString are generated by Lombok's {@code @Data}.
 */
@Data
public class RedMatrix {
// NOTE(review): presumably mirrors the backing "single" array of a PDFBox Matrix - confirm.
private float[] single;
}

View File

@ -1,35 +1,35 @@
package com.iqser.red.service.redaction.v1.server.parsing.model;
import com.fasterxml.jackson.annotation.JsonIgnore;
import lombok.Data;
import lombok.SneakyThrows;
import org.apache.pdfbox.text.TextPosition;
import org.apache.pdfbox.util.Matrix;
import org.springframework.beans.BeanUtils;
@Data
public class RedTextPosition {
private Matrix textMatrix;
private float endX;
private float endY;
private float maxHeight;
private String textMatrix;
private int rotation;
private float x;
private float y;
private float pageHeight;
private float pageWidth;
private float widthOfSpace;
private int[] charCodes;
private float fontSize;
private float fontSizePt;
private float[] widths;
private String unicode;
private float direction = -1.0F;
private float XDirAdj;
private float YDirAdj;
private float width;
private float heightDir;
// not used in reanalysis
@JsonIgnore
private float widthOfSpace;
// not used in reanalysis
@JsonIgnore
private float fontSizeInPt;
// not used in reanalysis
@JsonIgnore
private String fontName;
@ -39,10 +39,12 @@ public class RedTextPosition {
BeanUtils.copyProperties(textPosition, pos);
pos.setFontName(textPosition.getFont().getName());
pos.setCharCodes(textPosition.getCharacterCodes());
pos.setWidths(textPosition.getIndividualWidths());
pos.setFontSizePt(textPosition.getFontSizeInPt());
pos.setFontSizeInPt(textPosition.getFontSizeInPt());
pos.setTextMatrix(textPosition.getTextMatrix().toString());
return pos;
}
}

View File

@ -1,5 +1,7 @@
package com.iqser.red.service.redaction.v1.server.parsing.model;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
import com.iqser.red.service.redaction.v1.model.Point;
import com.iqser.red.service.redaction.v1.model.Rectangle;
import lombok.Data;
@ -12,11 +14,14 @@ import java.util.stream.Collectors;
@Data
@NoArgsConstructor
@JsonIgnoreProperties({ "empty" })
public class TextPositionSequence implements CharSequence {
private int page;
private List<RedTextPosition> textPositions = new ArrayList<>();
private float x1;
private float x2;
public TextPositionSequence(int page) {
this.page = page;
@ -38,9 +43,8 @@ public class TextPositionSequence implements CharSequence {
this.page = page;
}
/**
 * Replaces the stored text positions with converted copies of the given ones.
 * Every element is converted through {@code RedTextPosition.fromTextPosition}.
 *
 * @param textPositions the source text positions to convert, in order
 */
public void setTextPositions(List<TextPosition> textPositions) {
    List<RedTextPosition> converted = new ArrayList<>();
    for (TextPosition source : textPositions) {
        converted.add(RedTextPosition.fromTextPosition(source));
    }
    this.textPositions = converted;
}
@Override
@ -103,6 +107,7 @@ public class TextPositionSequence implements CharSequence {
}
@JsonIgnore
public float getX1() {
if (textPositions.get(0).getRotation() == 90) {
@ -113,6 +118,7 @@ public class TextPositionSequence implements CharSequence {
}
@JsonIgnore
public float getX2() {
if (textPositions.get(0).getRotation() == 90) {
@ -123,13 +129,14 @@ public class TextPositionSequence implements CharSequence {
}
}
/**
 * Returns the y value of the first text position of this sequence.
 * Requires at least one text position; excluded from JSON output.
 */
@JsonIgnore
public float getRotationAdjustedY() {
    final float firstY = textPositions.get(0).getY();
    return firstY;
}
@JsonIgnore
public float getY1() {
if (textPositions.get(0).getRotation() == 90) {
@ -140,6 +147,7 @@ public class TextPositionSequence implements CharSequence {
}
@JsonIgnore
public float getY2() {
if (textPositions.get(0).getRotation() == 90) {
@ -150,26 +158,29 @@ public class TextPositionSequence implements CharSequence {
}
/**
 * Returns the {@code heightDir} of the first text position increased by 2.
 * Requires at least one text position; excluded from JSON output.
 */
@JsonIgnore
public float getTextHeight() {
    // NOTE(review): the +2 looks like a fixed padding - confirm its intent.
    final float firstHeightDir = textPositions.get(0).getHeightDir();
    return firstHeightDir + 2;
}
/**
 * Returns the vertical extent of this sequence, i.e. {@code getY2() - getY1()}.
 * Excluded from JSON output.
 */
@JsonIgnore
public float getHeight() {
    final float y2 = getY2();
    final float y1 = getY1();
    return y2 - y1;
}
/**
 * Returns the horizontal extent of this sequence, i.e. {@code getX2() - getX1()}.
 * Excluded from JSON output.
 */
@JsonIgnore
public float getWidth() {
    final float x2 = getX2();
    final float x1 = getX1();
    return x2 - x1;
}
@JsonIgnore
public String getFont() {
return textPositions.get(0).getFontName()
.toLowerCase()
.replaceAll(",bold", "")
@ -177,6 +188,7 @@ public class TextPositionSequence implements CharSequence {
}
@JsonIgnore
public String getFontStyle() {
String lowercaseFontName = textPositions.get(0).getFontName().toLowerCase();
@ -193,25 +205,25 @@ public class TextPositionSequence implements CharSequence {
}
/**
 * Returns the font size in points reported by the first text position.
 * Requires at least one text position; excluded from JSON output.
 */
@JsonIgnore
public float getFontSize() {
    final float sizeInPt = textPositions.get(0).getFontSizeInPt();
    return sizeInPt;
}
/**
 * Returns the width-of-space value reported by the first text position.
 * Requires at least one text position; excluded from JSON output.
 */
@JsonIgnore
public float getSpaceWidth() {
    final float spaceWidth = textPositions.get(0).getWidthOfSpace();
    return spaceWidth;
}
/**
 * Returns the rotation reported by the first text position.
 * Requires at least one text position; excluded from JSON output.
 */
@JsonIgnore
public int getRotation() {
    final int firstRotation = textPositions.get(0).getRotation();
    return firstRotation;
}
@JsonIgnore
public Rectangle getRectangle() {
float height = getTextHeight();

View File

@ -5,8 +5,6 @@ import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
import java.awt.geom.Rectangle2D;
@Data
@Builder
@NoArgsConstructor
@ -14,7 +12,7 @@ import java.awt.geom.Rectangle2D;
public class Image {
private String type;
private Rectangle2D position;
private RedRectangle2D position;
private boolean redaction;
private String redactionReason;
private String legalBasis;

View File

@ -1,5 +1,6 @@
package com.iqser.red.service.redaction.v1.server.redaction.model;
import com.fasterxml.jackson.annotation.JsonIgnore;
import lombok.Data;
import lombok.NonNull;
import lombok.RequiredArgsConstructor;
@ -11,9 +12,10 @@ import java.awt.image.BufferedImage;
@RequiredArgsConstructor
public class PdfImage {
@JsonIgnore
private BufferedImage image;
@NonNull
private Rectangle2D position;
private RedRectangle2D position;
private ImageType imageType;
private boolean isAppendedToParagraph;
@ -22,7 +24,7 @@ public class PdfImage {
public PdfImage(BufferedImage image, Rectangle2D position, int page) {
this.image = image;
this.position = position;
this.position = new RedRectangle2D(position.getX(), position.getY(), position.getWidth(), position.getHeight());
this.page = page;
}

View File

@ -0,0 +1,35 @@
package com.iqser.red.service.redaction.v1.server.redaction.model;
import com.fasterxml.jackson.annotation.JsonIgnore;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * Plain rectangle value object described by an origin ({@code x}, {@code y})
 * and a size ({@code width}, {@code height}).
 * Accessors and constructors are generated by Lombok.
 */
@Data
@NoArgsConstructor
@AllArgsConstructor
public class RedRectangle2D {

    private double x;
    private double y;
    private double width;
    private double height;

    /**
     * Tells whether this rectangle encloses no area.
     * Annotated with {@code @JsonIgnore} so it is not handled as a JSON property.
     *
     * @return {@code true} when the width or the height is less than or equal to zero
     */
    @JsonIgnore
    public boolean isEmpty() {
        if (width <= 0.0f) {
            return true;
        }
        return height <= 0.0f;
    }

    /**
     * Checks whether the given rectangle lies completely inside this one.
     *
     * @param x left coordinate of the candidate rectangle
     * @param y top coordinate of the candidate rectangle
     * @param w width of the candidate rectangle
     * @param h height of the candidate rectangle
     * @return {@code false} if this rectangle is empty or the candidate has a
     *         non-positive width or height; otherwise whether the candidate's
     *         bounds stay within this rectangle's bounds (edges inclusive)
     */
    public boolean contains(double x, double y, double w, double h) {
        if (isEmpty()) {
            return false;
        }
        if (w <= 0 || h <= 0) {
            return false;
        }

        final double originX = getX();
        final double originY = getY();

        // Kept as one short-circuit conjunction so comparisons behave exactly as before (also for NaN).
        return x >= originX
                && y >= originY
                && (x + w) <= originX + getWidth()
                && (y + h) <= originY + getHeight();
    }
}

View File

@ -187,6 +187,7 @@ public class EntityRedactionService {
.get(0)
.getPage());
sectionText.getSectionAreas().add(sectionArea);
sectionText.getTextBlocks().addAll(cell.getTextBlocks());
addSectionToManualRedactions(cell.getTextBlocks(), manualRedactions, table.getHeadline(), sectionNumber.intValue());
int cellStart = start;
@ -235,6 +236,8 @@ public class EntityRedactionService {
sectionText.setHeadline(table.getHeadline());
sectionText.setSectionNumber(sectionNumber.intValue());
sectionText.setTable(true);
sectionText.setTabularData(tabularData);
sectionText.setCellStarts(cellStarts);
classifiedDoc.getSectionText().add(sectionText);
}
@ -267,6 +270,7 @@ public class EntityRedactionService {
.getSequences()
.get(0)
.getPage());
sectionText.getTextBlocks().addAll(cell.getTextBlocks());
sectionText.getSectionAreas().add(sectionArea);
}
@ -325,6 +329,10 @@ public class EntityRedactionService {
sectionText.setHeadline(headline);
sectionText.setSectionNumber(sectionNumber.intValue());
sectionText.setTable(false);
sectionText.setImages(images.stream()
.map(image -> convert(image, sectionNumber.intValue(), headline))
.collect(Collectors.toSet()));
sectionText.setTextBlocks(paragraphTextBlocks);
classifiedDoc.getSectionText().add(sectionText);
}

View File

@ -12,15 +12,12 @@ import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUti
import com.iqser.red.service.redaction.v1.server.segmentation.PdfSegmentationService;
import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
import org.apache.pdfbox.io.MemoryUsageSetting;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.kie.api.runtime.KieContainer;
import org.springframework.stereotype.Service;
import org.springframework.web.bind.annotation.RequestBody;
import java.awt.geom.Rectangle2D;
import java.io.IOException;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.Stream;
@ -42,7 +39,6 @@ public class ReanalyzeService {
public AnalyzeResult analyze(AnalyzeRequest analyzeRequest) {
var pageCount = 0;
Document classifiedDoc;
@ -56,18 +52,6 @@ public class ReanalyzeService {
log.info("Document structure analysis successful, starting redaction analysis...");
entityRedactionService.processDocument(classifiedDoc, analyzeRequest.getRuleSetId(), analyzeRequest.getManualRedactions());
var storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), FileType.ORIGIN));
// TODO move this to where it makes sense - or remove completely
try (PDDocument pdDocument = PDDocument.load(storedObjectStream, MemoryUsageSetting.setupTempFileOnly())) {
pdDocument.setAllSecurityToBeRemoved(true);
pdfSegmentationService.postProcessSections(pdDocument, classifiedDoc.getSectionText());
} catch (IOException e) {
e.printStackTrace();
}
redactionLogCreatorService.createRedactionLog(classifiedDoc, pageCount, analyzeRequest.getManualRedactions(), analyzeRequest
.getRuleSetId());
@ -89,30 +73,28 @@ public class ReanalyzeService {
return analyzeResponseService.createAnalyzeResponse(pageCount, redactionLog, changeLog);
}
public AnalyzeResult reanalyze(@RequestBody AnalyzeRequest renalyzeRequest) {
var text = redactionStorageService.getText(renalyzeRequest.getProjectId(), renalyzeRequest.getFileId());
// new procedure was not applied, we need a complete analysis
@SneakyThrows
public AnalyzeResult reanalyze(@RequestBody AnalyzeRequest analyzeRequest) {
var redactionLog = redactionStorageService.getRedactionLog(analyzeRequest.getProjectId(), analyzeRequest.getFileId());
var text = redactionStorageService.getText(analyzeRequest.getProjectId(), analyzeRequest.getFileId());
// not yet ready for reanalysis
if (text.getNumberOfPages() == 0) {
return analyze(AnalyzeRequest.builder()
.ruleSetId(renalyzeRequest.getRuleSetId())
.manualRedactions(renalyzeRequest.getManualRedactions())
.projectId(renalyzeRequest.getProjectId())
.fileId(renalyzeRequest.getFileId())
.build());
return analyze(analyzeRequest);
}
var redactionLog = redactionStorageService.getRedactionLog(renalyzeRequest.getProjectId(), renalyzeRequest.getFileId());
DictionaryIncrement dictionaryIncrement = dictionaryService.getDictionaryIncrements(analyzeRequest.getRuleSetId(), redactionLog.getDictionaryVersion());
DictionaryIncrement dictionaryIncrement = dictionaryService.getDictionaryIncrements(renalyzeRequest.getRuleSetId(), redactionLog.getDictionaryVersion());
Set<String> manualForceAndRemoveIds = getForceAndRemoveIds(renalyzeRequest.getManualRedactions());
Set<String> manualForceAndRemoveIds = getForceAndRemoveIds(analyzeRequest.getManualRedactions());
Map<String, List<Comment>> comments = null;
Set<ManualRedactionEntry> manualAdds = null;
if (renalyzeRequest.getManualRedactions() != null) {
if (analyzeRequest.getManualRedactions() != null) {
// TODO comments will be removed from redactionLog, so we ignore this first.
comments = renalyzeRequest.getManualRedactions().getComments();
manualAdds = renalyzeRequest.getManualRedactions().getEntriesToAdd();
comments = analyzeRequest.getManualRedactions().getComments();
manualAdds = analyzeRequest.getManualRedactions().getEntriesToAdd();
}
Set<Integer> sectionsToReanalyse = new HashSet<>();
@ -146,115 +128,113 @@ public class ReanalyzeService {
}
}
if (sectionsToReanalyse.isEmpty() && (manualAdds == null || manualAdds.isEmpty())) {
redactionLog.setDictionaryVersion(dictionaryIncrement.getDictionaryVersion());
var changeLog = redactionChangeLogService.createAndStoreChangeLog(renalyzeRequest.getProjectId(), renalyzeRequest.getFileId(), redactionLog);
redactionStorageService.storeObject(renalyzeRequest.getProjectId(), renalyzeRequest.getFileId(), FileType.REDACTION_LOG, redactionLog);
var changeLog = redactionChangeLogService.createAndStoreChangeLog(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), redactionLog);
redactionStorageService.storeObject(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), FileType.REDACTION_LOG, redactionLog);
return analyzeResponseService.createAnalyzeResponse(text.getNumberOfPages(), redactionLog, changeLog);
}
try {
List<SectionText> reanalysisSections = new ArrayList<>();
List<SectionText> reanalysisSections = new ArrayList<>();
for (SectionText sectionText : text.getSectionTexts()) {
if (sectionsToReanalyse.contains(sectionText.getSectionNumber())) {
reanalysisSections.add(sectionText);
}
for (SectionText sectionText : text.getSectionTexts()) {
if (sectionsToReanalyse.contains(sectionText.getSectionNumber())) {
reanalysisSections.add(sectionText);
}
KieContainer kieContainer = droolsExecutionService.updateRules(renalyzeRequest.getRuleSetId());
Dictionary dictionary = dictionaryService.getDeepCopyDictionary(renalyzeRequest.getRuleSetId());
List<SectionSearchableTextPair> sectionSearchableTextPairs = new ArrayList<>();
for (SectionText reanalysisSection : reanalysisSections) {
Set<Entity> entities = entityRedactionService.findEntities(reanalysisSection.getSearchableText(), reanalysisSection
.getHeadline(), reanalysisSection.getSectionNumber(), dictionary, false);
if (reanalysisSection.getCellStarts() != null) {
surroundingWordsService.addSurroundingText(entities, reanalysisSection.getSearchableText(), dictionary, reanalysisSection
.getCellStarts());
} else {
surroundingWordsService.addSurroundingText(entities, reanalysisSection.getSearchableText(), dictionary);
}
sectionSearchableTextPairs.add(new SectionSearchableTextPair(Section.builder()
.isLocal(false)
.dictionaryTypes(dictionary.getTypes())
.entities(entities)
.text(reanalysisSection.getSearchableText().getAsStringWithLinebreaks())
.searchText(reanalysisSection.getSearchableText().toString())
.headline(reanalysisSection.getHeadline())
.sectionNumber(reanalysisSection.getSectionNumber())
.tabularData(reanalysisSection.getTabularData())
.searchableText(reanalysisSection.getSearchableText())
.dictionary(dictionary)
.images(reanalysisSection.getImages())
.build(), reanalysisSection.getSearchableText()));
}
Set<Entity> entities = new HashSet<>();
Map<Integer, Set<Image>> imagesPerPage = new HashMap<>();
sectionSearchableTextPairs.forEach(sectionSearchableTextPair -> {
Section analysedRowSection = droolsExecutionService.executeRules(kieContainer, sectionSearchableTextPair
.getSection());
entities.addAll(analysedRowSection.getEntities());
EntitySearchUtils.removeEntitiesContainedInLarger(entities);
for (Image image : analysedRowSection.getImages()) {
imagesPerPage.computeIfAbsent(image.getPage(), (a) -> new HashSet<>()).add(image);
}
});
Map<Integer, List<Entity>> entitiesPerPage = new HashMap<>();
for (Entity entity : entities) {
Map<Integer, List<EntityPositionSequence>> sequenceOnPage = new HashMap<>();
for (EntityPositionSequence entityPositionSequence : entity.getPositionSequences()) {
sequenceOnPage.computeIfAbsent(entityPositionSequence.getPageNumber(), (x) -> new ArrayList<>())
.add(entityPositionSequence);
}
for (Map.Entry<Integer, List<EntityPositionSequence>> entry : sequenceOnPage.entrySet()) {
entitiesPerPage.computeIfAbsent(entry.getKey(), (x) -> new ArrayList<>())
.add(new Entity(entity.getWord(), entity.getType(), entity.isRedaction(), entity.getRedactionReason(), entry
.getValue(), entity.getHeadline(), entity.getMatchedRule(), entity.getSectionNumber(), entity
.getLegalBasis(), entity.isDictionaryEntry(), entity.getTextBefore(), entity.getTextAfter(), entity
.getStart(), entity.getEnd()));
}
}
List<RedactionLogEntry> newRedactionLogEntries = new ArrayList<>();
for (int page = 1; page <= text.getNumberOfPages(); page++) {
if (entitiesPerPage.get(page) != null) {
newRedactionLogEntries.addAll(redactionLogCreatorService.addEntries(entitiesPerPage, renalyzeRequest
.getManualRedactions(), page, renalyzeRequest.getRuleSetId()));
}
if (imagesPerPage.get(page) != null) {
newRedactionLogEntries.addAll(redactionLogCreatorService.addImageEntries(imagesPerPage, renalyzeRequest
.getManualRedactions(), page, renalyzeRequest.getRuleSetId()));
}
newRedactionLogEntries.addAll(redactionLogCreatorService.addManualAddEntries(manualAdds, comments, page, renalyzeRequest
.getRuleSetId()));
}
redactionLog.getRedactionLogEntry().removeIf(entry -> sectionsToReanalyse.contains(entry.getSectionNumber()) && !entry.isImage() || entry.getSectionNumber() == 0 && !entry.isImage());
redactionLog.getRedactionLogEntry().addAll(newRedactionLogEntries);
redactionLog.setDictionaryVersion(dictionaryIncrement.getDictionaryVersion());
var changeLog = redactionChangeLogService.createAndStoreChangeLog(renalyzeRequest.getProjectId(), renalyzeRequest.getFileId(), redactionLog);
redactionStorageService.storeObject(renalyzeRequest.getProjectId(), renalyzeRequest.getFileId(), FileType.REDACTION_LOG, redactionLog);
return analyzeResponseService.createAnalyzeResponse(text.getNumberOfPages(), redactionLog, changeLog);
} catch (Exception e) {
throw new RedactionException(e);
}
//--
KieContainer kieContainer = droolsExecutionService.updateRules(analyzeRequest.getRuleSetId());
Dictionary dictionary = dictionaryService.getDeepCopyDictionary(analyzeRequest.getRuleSetId());
List<SectionSearchableTextPair> sectionSearchableTextPairs = new ArrayList<>();
for (SectionText reanalysisSection : reanalysisSections) {
Set<Entity> entities = entityRedactionService.findEntities(reanalysisSection.getSearchableText(), reanalysisSection
.getHeadline(), reanalysisSection.getSectionNumber(), dictionary, false);
if (reanalysisSection.getCellStarts() != null) {
surroundingWordsService.addSurroundingText(entities, reanalysisSection.getSearchableText(), dictionary, reanalysisSection
.getCellStarts());
} else {
surroundingWordsService.addSurroundingText(entities, reanalysisSection.getSearchableText(), dictionary);
}
sectionSearchableTextPairs.add(new SectionSearchableTextPair(Section.builder()
.isLocal(false)
.dictionaryTypes(dictionary.getTypes())
.entities(entities)
.text(reanalysisSection.getSearchableText().getAsStringWithLinebreaks())
.searchText(reanalysisSection.getSearchableText().toString())
.headline(reanalysisSection.getHeadline())
.sectionNumber(reanalysisSection.getSectionNumber())
.tabularData(reanalysisSection.getTabularData())
.searchableText(reanalysisSection.getSearchableText())
.dictionary(dictionary)
.images(reanalysisSection.getImages())
.build(), reanalysisSection.getSearchableText()));
}
Set<Entity> entities = new HashSet<>();
Map<Integer, Set<Image>> imagesPerPage = new HashMap<>();
sectionSearchableTextPairs.forEach(sectionSearchableTextPair -> {
Section analysedRowSection = droolsExecutionService.executeRules(kieContainer, sectionSearchableTextPair
.getSection());
entities.addAll(analysedRowSection.getEntities());
EntitySearchUtils.removeEntitiesContainedInLarger(entities);
for (Image image : analysedRowSection.getImages()) {
imagesPerPage.computeIfAbsent(image.getPage(), (a) -> new HashSet<>()).add(image);
}
});
Map<Integer, List<Entity>> entitiesPerPage = new HashMap<>();
for (Entity entity : entities) {
Map<Integer, List<EntityPositionSequence>> sequenceOnPage = new HashMap<>();
for (EntityPositionSequence entityPositionSequence : entity.getPositionSequences()) {
sequenceOnPage.computeIfAbsent(entityPositionSequence.getPageNumber(), (x) -> new ArrayList<>())
.add(entityPositionSequence);
}
for (Map.Entry<Integer, List<EntityPositionSequence>> entry : sequenceOnPage.entrySet()) {
entitiesPerPage.computeIfAbsent(entry.getKey(), (x) -> new ArrayList<>())
.add(new Entity(entity.getWord(), entity.getType(), entity.isRedaction(), entity.getRedactionReason(), entry
.getValue(), entity.getHeadline(), entity.getMatchedRule(), entity.getSectionNumber(), entity
.getLegalBasis(), entity.isDictionaryEntry(), entity.getTextBefore(), entity.getTextAfter(), entity
.getStart(), entity.getEnd()));
}
}
List<RedactionLogEntry> newRedactionLogEntries = new ArrayList<>();
for (int page = 1; page <= text.getNumberOfPages(); page++) {
if (entitiesPerPage.get(page) != null) {
newRedactionLogEntries.addAll(redactionLogCreatorService.addEntries(entitiesPerPage, analyzeRequest
.getManualRedactions(), page, analyzeRequest.getRuleSetId()));
}
if (imagesPerPage.get(page) != null) {
newRedactionLogEntries.addAll(redactionLogCreatorService.addImageEntries(imagesPerPage, analyzeRequest
.getManualRedactions(), page, analyzeRequest.getRuleSetId()));
}
newRedactionLogEntries.addAll(redactionLogCreatorService.addManualAddEntries(manualAdds, comments, page, analyzeRequest
.getRuleSetId()));
}
redactionLog.getRedactionLogEntry().removeIf(entry -> sectionsToReanalyse.contains(entry.getSectionNumber()) && !entry.isImage() || entry.getSectionNumber() == 0 && !entry.isImage());
redactionLog.getRedactionLogEntry().addAll(newRedactionLogEntries);
redactionLog.setDictionaryVersion(dictionaryIncrement.getDictionaryVersion());
var changeLog = redactionChangeLogService.createAndStoreChangeLog(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), redactionLog);
redactionStorageService.storeObject(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), FileType.REDACTION_LOG, redactionLog);
return analyzeResponseService.createAnalyzeResponse(text.getNumberOfPages(), redactionLog, changeLog);
}
@ -277,7 +257,7 @@ public class ReanalyzeService {
return Image.builder()
.type(entry.getType())
.position(new Rectangle2D.Float(position.getTopLeft().getX(), position.getTopLeft()
.position(new RedRectangle2D(position.getTopLeft().getX(), position.getTopLeft()
.getY(), position.getWidth(), position.getHeight()))
.sectionNumber(entry.getSectionNumber())
.section(entry.getSection())

View File

@ -3,9 +3,9 @@ package com.iqser.red.service.redaction.v1.server.redaction.utils;
import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.model.RedRectangle2D;
import lombok.experimental.UtilityClass;
import java.awt.geom.Rectangle2D;
import java.nio.charset.StandardCharsets;
import java.util.List;
@ -25,12 +25,8 @@ public class IdBuilder {
}
public String buildId(Rectangle2D rectangle2D, int page) {
StringBuilder sb = new StringBuilder();
sb.append("x").append(rectangle2D.getX()).append("y").append(rectangle2D.getY()).append("h").append(rectangle2D.getHeight()).append("w").append(rectangle2D.getWidth()).append("p").append(page);
return hashFunction.hashString(sb.toString(), StandardCharsets.UTF_8).toString();
public String buildId(RedRectangle2D rectangle2D, int page) {
return hashFunction.hashString("x" + rectangle2D.getX() + "y" + rectangle2D.getY() + "h" + rectangle2D.getHeight() + "w" + rectangle2D.getWidth() + "p" + page, StandardCharsets.UTF_8).toString();
}

View File

@ -1,21 +1,15 @@
package com.iqser.red.service.redaction.v1.server.segmentation;
import com.iqser.red.service.redaction.v1.model.SectionArea;
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
import com.iqser.red.service.redaction.v1.server.classification.model.SectionText;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.classification.service.BlockificationService;
import com.iqser.red.service.redaction.v1.server.classification.service.ClassificationService;
import com.iqser.red.service.redaction.v1.server.exception.RedactionException;
import com.iqser.red.service.redaction.v1.server.memory.MemoryStats;
import com.iqser.red.service.redaction.v1.server.parsing.PDFAreaTextStripper;
import com.iqser.red.service.redaction.v1.server.parsing.PDFLinesTextStripper;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.model.CellValue;
import com.iqser.red.service.redaction.v1.server.redaction.service.ImageClassificationService;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.CleanRulings;
import com.iqser.red.service.redaction.v1.server.tableextraction.service.RulingCleaningService;
import com.iqser.red.service.redaction.v1.server.tableextraction.service.TableExtractionService;
@ -28,15 +22,12 @@ import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.springframework.stereotype.Service;
import java.awt.geom.Rectangle2D;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
@Slf4j
@Service
@ -53,79 +44,6 @@ public class PdfSegmentationService {
private final ImageClassificationService imageClassificationService;
/**
 * Rebuilds the text content of every given section from the PDF document.
 * <p>
 * For each {@link SectionText} the section areas are grouped by page, the text inside
 * each area is extracted with a {@link PDFAreaTextStripper}, and one {@link TextBlock}
 * per area is created. The collected text blocks and tabular data are stored on the
 * section; the cell start offsets are stored only for sections flagged as table.
 *
 * @param pdDocument document whose pages are read; a section area's page number minus one
 *                   is used as the page index
 * @param texts      sections that are updated in place
 * @throws RedactionException wrapping any exception raised during processing
 */
public void postProcessSections(PDDocument pdDocument, List<SectionText> texts) {
try {
for (SectionText sectionText : texts) {
List<TextBlock> textBlocks = new ArrayList<>();
// Group the areas of this section by the page they are located on.
Map<Integer, List<SectionArea>> sectionAreasPerPage = new HashMap<>();
for (SectionArea sectionArea : sectionText.getSectionAreas()) {
sectionAreasPerPage.computeIfAbsent(sectionArea.getPage(), (x) -> new ArrayList<>())
.add(sectionArea);
}
Map<String, CellValue> tabularData = new HashMap<>();
List<Integer> cellStarts = new ArrayList<>();
// NOTE(review): HashMap key iteration order is not guaranteed, so pages are not
// necessarily visited in ascending order - confirm that block order does not matter.
for (Integer page : sectionAreasPerPage.keySet()) {
List<SectionArea> areasOnPage = sectionAreasPerPage.get(page);
PDPage pdPage = pdDocument.getPage(page - 1);
PDRectangle cropBox = pdPage.getCropBox();
PDFAreaTextStripper textStripper = new PDFAreaTextStripper();
textStripper.setPageNumber(page);
// NOTE(review): cellStart restarts at 0 for every page while cellStarts accumulates
// across pages - confirm this is intended for tables spanning several pages.
int cellStart = 0;
for (SectionArea sectionArea : areasOnPage) {
Rectangle2D rect = null;
if (pdPage.getRotation() == 90) {
// Page rotated by 90: x/y and width/height of the area are swapped.
// NOTE(review): the 0.001f enlargement presumably avoids missing text on the border - confirm.
rect = new Rectangle2D.Float(sectionArea.getTopLeft().getY(), sectionArea.getTopLeft()
.getX(), sectionArea.getHeight(), sectionArea.getWidth() + 0.001f);
} else {
// Otherwise the y coordinate is mirrored against the crop box's upper-right y.
rect = new Rectangle2D.Float(sectionArea.getTopLeft().getX(), -sectionArea.getTopLeft()
.getY() + cropBox.getUpperRightY() - sectionArea.getHeight(), sectionArea.getWidth(), sectionArea
.getHeight() + 0.001f);
}
// The same region name "1" is used for every area.
// NOTE(review): presumably addRegion replaces a previously added region of that name - confirm.
textStripper.addRegion(String.valueOf(1), rect);
textStripper.extractRegions(pdPage);
// The returned region text is not used; the positions are read from the stripper below.
textStripper.getTextForRegion(String.valueOf(1));
List<TextPositionSequence> positions = textStripper.getTextPositionSequences();
// Text block spanning the area's bounds: x, x + width, y, y + height.
TextBlock textBlock = new TextBlock(sectionArea.getTopLeft().getX(), sectionArea.getTopLeft()
.getX() + sectionArea.getWidth(), sectionArea.getTopLeft()
.getY(), sectionArea.getTopLeft().getY() + sectionArea.getHeight(), positions, 0);
if (sectionText.isTable()) {
// For tables each area is handled as one cell, keyed by the area's header.
Cell cell = new Cell();
cell.addTextBlock(textBlock);
tabularData.put(sectionArea.getHeader(), new CellValue(cell.getTextBlocks(), cellStart));
cellStarts.add(cellStart);
// The next cell starts after the trimmed cell text plus one further character.
cellStart = cellStart + cell.toString().trim().length() + 1;
}
textBlocks.add(textBlock);
// Reset the stripper's collected positions before the next area is extracted.
textStripper.clearPositions();
}
}
sectionText.setTextBlocks(textBlocks);
// Tabular data is set for every section; it stays empty for non-table sections.
sectionText.setTabularData(tabularData);
if (sectionText.isTable()) {
sectionText.setCellStarts(cellStarts);
}
}
} catch (Exception e) {
throw new RedactionException(e);
}
}
public Document parseDocument(InputStream documentInputStream) throws IOException {
PDDocument pdDocument = null;
try {
@ -141,6 +59,7 @@ public class PdfSegmentationService {
pdDocument = reinitializePDDocument(tempFile, null);
long pageCount = pdDocument.getNumberOfPages();
long t1= System.currentTimeMillis();
for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++) {
if (pageNumber % MAX_PAGES_BEFORE_GC == 0) {

View File

@ -1,5 +1,6 @@
package com.iqser.red.service.redaction.v1.server.tableextraction.model;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.iqser.red.service.redaction.v1.model.Rectangle;
import lombok.AllArgsConstructor;
import lombok.Data;
@ -27,10 +28,12 @@ public abstract class AbstractTextContainer {
return page == other.getPage() && this.minX <= other.getTopLeft().getX() && this.maxX >= other.getTopLeft().getX() + other.getWidth() && this.minY <= other.getTopLeft().getY() && this.maxY >= other.getTopLeft().getY() + other.getHeight();
}
/**
 * Returns the vertical extent of this container, i.e. {@code maxY - minY}.
 * Excluded from JSON output.
 */
@JsonIgnore
public float getHeight() {
    final float verticalExtent = maxY - minY;
    return verticalExtent;
}
/**
 * Returns the horizontal extent of this container, i.e. {@code maxX - minX}.
 * Excluded from JSON output.
 */
@JsonIgnore
public float getWidth() {
    final float horizontalExtent = maxX - minX;
    return horizontalExtent;
}

View File

@ -44,7 +44,7 @@ public class FilySystemBackedStorageService extends StorageService {
public void clearStorage() {
this.dataMap.forEach((k, v) -> {
v.delete();
// v.delete();
});
this.dataMap.clear();
}

View File

@ -458,6 +458,16 @@ public class RedactionIntegrationTest {
assertThat(result).isNotNull();
}
/**
 * Runs an analysis for the S-Metolachlor sample PDF and verifies that a result is returned.
 * Memory statistics are printed after the storage is prepared and before the analysis starts.
 */
@Test
public void testXXX() {
    final AnalyzeRequest analyzeRequest =
            prepareStorage("files/Metolachlor/S-Metolachlor_RAR_08_Volume_3CA_B-6_2018-09-06.pdf");

    MemoryStats.printMemoryStats();

    final AnalyzeResult analyzeResult = redactionController.analyze(analyzeRequest);

    assertThat(analyzeResult).isNotNull();
}
@Test
public void noExceptionShouldBeThrownForAnyFiles() throws IOException {