reworked reanalysis and text storage
This commit is contained in:
parent
5c2596e268
commit
1d4708ad13
@ -1,5 +1,6 @@
|
||||
package com.iqser.red.service.redaction.v1.server.classification.model;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
@ -12,7 +13,7 @@ public class Footer {
|
||||
|
||||
private List<TextBlock> textBlocks;
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public SearchableText getSearchableText() {
|
||||
|
||||
SearchableText searchableText = new SearchableText();
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
package com.iqser.red.service.redaction.v1.server.classification.model;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
@ -12,7 +13,7 @@ public class Header {
|
||||
|
||||
private List<TextBlock> textBlocks;
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public SearchableText getSearchableText() {
|
||||
|
||||
SearchableText searchableText = new SearchableText();
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
package com.iqser.red.service.redaction.v1.server.classification.model;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import com.iqser.red.service.redaction.v1.model.SectionArea;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.CellValue;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Image;
|
||||
@ -36,7 +37,7 @@ public class SectionText {
|
||||
this.tabularData = tabularData;
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public SearchableText getSearchableText() {
|
||||
|
||||
SearchableText searchableText = new SearchableText();
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
package com.iqser.red.service.redaction.v1.server.classification.model;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
|
||||
@ -118,6 +119,7 @@ public class TextBlock extends AbstractTextContainer {
|
||||
}
|
||||
|
||||
@Override
|
||||
@JsonIgnore
|
||||
public String getText() {
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
package com.iqser.red.service.redaction.v1.server.classification.model;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
@ -12,7 +13,7 @@ public class UnclassifiedText {
|
||||
|
||||
private List<TextBlock> textBlocks;
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public SearchableText getSearchableText() {
|
||||
|
||||
SearchableText searchableText = new SearchableText();
|
||||
|
||||
@ -1,10 +0,0 @@
|
||||
package com.iqser.red.service.redaction.v1.server.parsing.model;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
|
||||
/**
 * Lombok {@code @Data} holder with a single field, {@code single}, of type {@code float[]}.
 * NOTE(review): presumably the flattened values of a text matrix - confirm against callers.
 */
@Data
|
||||
public class RedMatrix {
|
||||
|
||||
private float[] single;
|
||||
}
|
||||
@ -1,35 +1,35 @@
|
||||
package com.iqser.red.service.redaction.v1.server.parsing.model;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import lombok.Data;
|
||||
import lombok.SneakyThrows;
|
||||
import org.apache.pdfbox.text.TextPosition;
|
||||
import org.apache.pdfbox.util.Matrix;
|
||||
import org.springframework.beans.BeanUtils;
|
||||
|
||||
@Data
|
||||
public class RedTextPosition {
|
||||
|
||||
private Matrix textMatrix;
|
||||
private float endX;
|
||||
private float endY;
|
||||
private float maxHeight;
|
||||
private String textMatrix;
|
||||
private int rotation;
|
||||
private float x;
|
||||
private float y;
|
||||
private float pageHeight;
|
||||
private float pageWidth;
|
||||
private float widthOfSpace;
|
||||
private int[] charCodes;
|
||||
private float fontSize;
|
||||
private float fontSizePt;
|
||||
private float[] widths;
|
||||
private String unicode;
|
||||
private float direction = -1.0F;
|
||||
private float XDirAdj;
|
||||
private float YDirAdj;
|
||||
private float width;
|
||||
private float heightDir;
|
||||
|
||||
// not used in reanalysis
|
||||
@JsonIgnore
|
||||
private float widthOfSpace;
|
||||
|
||||
// not used in reanalysis
|
||||
@JsonIgnore
|
||||
private float fontSizeInPt;
|
||||
|
||||
// not used in reanalysis
|
||||
@JsonIgnore
|
||||
private String fontName;
|
||||
|
||||
|
||||
@ -39,10 +39,12 @@ public class RedTextPosition {
|
||||
BeanUtils.copyProperties(textPosition, pos);
|
||||
pos.setFontName(textPosition.getFont().getName());
|
||||
|
||||
pos.setCharCodes(textPosition.getCharacterCodes());
|
||||
pos.setWidths(textPosition.getIndividualWidths());
|
||||
pos.setFontSizePt(textPosition.getFontSizeInPt());
|
||||
pos.setFontSizeInPt(textPosition.getFontSizeInPt());
|
||||
|
||||
pos.setTextMatrix(textPosition.getTextMatrix().toString());
|
||||
|
||||
return pos;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
@ -1,5 +1,7 @@
|
||||
package com.iqser.red.service.redaction.v1.server.parsing.model;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
|
||||
import com.iqser.red.service.redaction.v1.model.Point;
|
||||
import com.iqser.red.service.redaction.v1.model.Rectangle;
|
||||
import lombok.Data;
|
||||
@ -12,11 +14,14 @@ import java.util.stream.Collectors;
|
||||
|
||||
@Data
|
||||
@NoArgsConstructor
|
||||
@JsonIgnoreProperties({ "empty" })
|
||||
public class TextPositionSequence implements CharSequence {
|
||||
|
||||
private int page;
|
||||
private List<RedTextPosition> textPositions = new ArrayList<>();
|
||||
|
||||
private float x1;
|
||||
private float x2;
|
||||
|
||||
public TextPositionSequence(int page) {
|
||||
this.page = page;
|
||||
@ -38,9 +43,8 @@ public class TextPositionSequence implements CharSequence {
|
||||
this.page = page;
|
||||
}
|
||||
|
||||
/**
 * Replaces the stored positions with the given {@code TextPosition} list, mapping every
 * element through {@code RedTextPosition.fromTextPosition} and collecting the results
 * into a new list.
 *
 * @param textPositions the positions to convert and store
 */
public void setTextPositions(List<TextPosition> textPositions) {
|
||||
this.textPositions = textPositions.stream().map(RedTextPosition::fromTextPosition).collect(Collectors.toList());
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
@Override
|
||||
@ -103,6 +107,7 @@ public class TextPositionSequence implements CharSequence {
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public float getX1() {
|
||||
|
||||
if (textPositions.get(0).getRotation() == 90) {
|
||||
@ -113,6 +118,7 @@ public class TextPositionSequence implements CharSequence {
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public float getX2() {
|
||||
|
||||
if (textPositions.get(0).getRotation() == 90) {
|
||||
@ -123,13 +129,14 @@ public class TextPositionSequence implements CharSequence {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
 * Returns the {@code y} value of the first text position in this sequence.
 * Annotated with {@code @JsonIgnore}. Calls {@code textPositions.get(0)}, so it fails
 * when the sequence holds no text positions.
 */
@JsonIgnore
|
||||
public float getRotationAdjustedY() {
|
||||
|
||||
return textPositions.get(0).getY();
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public float getY1() {
|
||||
|
||||
if (textPositions.get(0).getRotation() == 90) {
|
||||
@ -140,6 +147,7 @@ public class TextPositionSequence implements CharSequence {
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public float getY2() {
|
||||
|
||||
if (textPositions.get(0).getRotation() == 90) {
|
||||
@ -150,26 +158,29 @@ public class TextPositionSequence implements CharSequence {
|
||||
}
|
||||
|
||||
|
||||
/**
 * Returns the {@code heightDir} of the first text position plus a constant of 2.
 * NOTE(review): the +2 looks like padding around the glyph height - confirm.
 * Annotated with {@code @JsonIgnore}; fails when the sequence holds no text positions.
 */
@JsonIgnore
|
||||
public float getTextHeight() {
|
||||
|
||||
return textPositions.get(0).getHeightDir() + 2;
|
||||
}
|
||||
|
||||
|
||||
/**
 * Returns the difference {@code getY2() - getY1()}.
 * Annotated with {@code @JsonIgnore}.
 */
@JsonIgnore
|
||||
public float getHeight() {
|
||||
|
||||
return getY2() - getY1();
|
||||
}
|
||||
|
||||
|
||||
/**
 * Returns the difference {@code getX2() - getX1()}.
 * Annotated with {@code @JsonIgnore}.
 */
@JsonIgnore
|
||||
public float getWidth() {
|
||||
|
||||
return getX2() - getX1();
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public String getFont() {
|
||||
|
||||
return textPositions.get(0).getFontName()
|
||||
.toLowerCase()
|
||||
.replaceAll(",bold", "")
|
||||
@ -177,6 +188,7 @@ public class TextPositionSequence implements CharSequence {
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public String getFontStyle() {
|
||||
|
||||
String lowercaseFontName = textPositions.get(0).getFontName().toLowerCase();
|
||||
@ -193,25 +205,25 @@ public class TextPositionSequence implements CharSequence {
|
||||
|
||||
}
|
||||
|
||||
|
||||
/**
 * Returns the {@code fontSizeInPt} of the first text position in this sequence.
 * Annotated with {@code @JsonIgnore}; fails when the sequence holds no text positions.
 */
@JsonIgnore
|
||||
public float getFontSize() {
|
||||
|
||||
return textPositions.get(0).getFontSizeInPt();
|
||||
}
|
||||
|
||||
|
||||
/**
 * Returns the {@code widthOfSpace} of the first text position in this sequence.
 * Annotated with {@code @JsonIgnore}; fails when the sequence holds no text positions.
 */
@JsonIgnore
|
||||
public float getSpaceWidth() {
|
||||
|
||||
return textPositions.get(0).getWidthOfSpace();
|
||||
}
|
||||
|
||||
|
||||
/**
 * Returns the {@code rotation} of the first text position in this sequence.
 * Annotated with {@code @JsonIgnore}; fails when the sequence holds no text positions.
 */
@JsonIgnore
|
||||
public int getRotation() {
|
||||
|
||||
return textPositions.get(0).getRotation();
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public Rectangle getRectangle() {
|
||||
|
||||
float height = getTextHeight();
|
||||
|
||||
@ -5,8 +5,6 @@ import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@NoArgsConstructor
|
||||
@ -14,7 +12,7 @@ import java.awt.geom.Rectangle2D;
|
||||
public class Image {
|
||||
|
||||
private String type;
|
||||
private Rectangle2D position;
|
||||
private RedRectangle2D position;
|
||||
private boolean redaction;
|
||||
private String redactionReason;
|
||||
private String legalBasis;
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.model;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import lombok.Data;
|
||||
import lombok.NonNull;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
@ -11,9 +12,10 @@ import java.awt.image.BufferedImage;
|
||||
@RequiredArgsConstructor
|
||||
public class PdfImage {
|
||||
|
||||
@JsonIgnore
|
||||
private BufferedImage image;
|
||||
@NonNull
|
||||
private Rectangle2D position;
|
||||
private RedRectangle2D position;
|
||||
private ImageType imageType;
|
||||
private boolean isAppendedToParagraph;
|
||||
|
||||
@ -22,7 +24,7 @@ public class PdfImage {
|
||||
|
||||
public PdfImage(BufferedImage image, Rectangle2D position, int page) {
|
||||
this.image = image;
|
||||
this.position = position;
|
||||
this.position = new RedRectangle2D(position.getX(), position.getY(), position.getWidth(), position.getHeight());
|
||||
this.page = page;
|
||||
}
|
||||
|
||||
|
||||
@ -0,0 +1,35 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.model;
|
||||
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
@Data
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
public class RedRectangle2D {
|
||||
|
||||
private double x;
|
||||
private double y;
|
||||
private double width;
|
||||
private double height;
|
||||
|
||||
@JsonIgnore
|
||||
public boolean isEmpty() {
|
||||
return (width <= 0.0f) || (height <= 0.0f);
|
||||
}
|
||||
|
||||
public boolean contains(double x, double y, double w, double h) {
|
||||
if (isEmpty() || w <= 0 || h <= 0) {
|
||||
return false;
|
||||
}
|
||||
double x0 = getX();
|
||||
double y0 = getY();
|
||||
return (x >= x0 &&
|
||||
y >= y0 &&
|
||||
(x + w) <= x0 + getWidth() &&
|
||||
(y + h) <= y0 + getHeight());
|
||||
}
|
||||
}
|
||||
@ -187,6 +187,7 @@ public class EntityRedactionService {
|
||||
.get(0)
|
||||
.getPage());
|
||||
sectionText.getSectionAreas().add(sectionArea);
|
||||
sectionText.getTextBlocks().addAll(cell.getTextBlocks());
|
||||
|
||||
addSectionToManualRedactions(cell.getTextBlocks(), manualRedactions, table.getHeadline(), sectionNumber.intValue());
|
||||
int cellStart = start;
|
||||
@ -235,6 +236,8 @@ public class EntityRedactionService {
|
||||
sectionText.setHeadline(table.getHeadline());
|
||||
sectionText.setSectionNumber(sectionNumber.intValue());
|
||||
sectionText.setTable(true);
|
||||
sectionText.setTabularData(tabularData);
|
||||
sectionText.setCellStarts(cellStarts);
|
||||
classifiedDoc.getSectionText().add(sectionText);
|
||||
}
|
||||
|
||||
@ -267,6 +270,7 @@ public class EntityRedactionService {
|
||||
.getSequences()
|
||||
.get(0)
|
||||
.getPage());
|
||||
sectionText.getTextBlocks().addAll(cell.getTextBlocks());
|
||||
sectionText.getSectionAreas().add(sectionArea);
|
||||
}
|
||||
|
||||
@ -325,6 +329,10 @@ public class EntityRedactionService {
|
||||
sectionText.setHeadline(headline);
|
||||
sectionText.setSectionNumber(sectionNumber.intValue());
|
||||
sectionText.setTable(false);
|
||||
sectionText.setImages(images.stream()
|
||||
.map(image -> convert(image, sectionNumber.intValue(), headline))
|
||||
.collect(Collectors.toSet()));
|
||||
sectionText.setTextBlocks(paragraphTextBlocks);
|
||||
classifiedDoc.getSectionText().add(sectionText);
|
||||
}
|
||||
|
||||
|
||||
@ -12,15 +12,12 @@ import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUti
|
||||
import com.iqser.red.service.redaction.v1.server.segmentation.PdfSegmentationService;
|
||||
import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.apache.pdfbox.io.MemoryUsageSetting;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.kie.api.runtime.KieContainer;
|
||||
import org.springframework.stereotype.Service;
|
||||
import org.springframework.web.bind.annotation.RequestBody;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.io.IOException;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
@ -42,7 +39,6 @@ public class ReanalyzeService {
|
||||
|
||||
public AnalyzeResult analyze(AnalyzeRequest analyzeRequest) {
|
||||
|
||||
|
||||
var pageCount = 0;
|
||||
Document classifiedDoc;
|
||||
|
||||
@ -56,18 +52,6 @@ public class ReanalyzeService {
|
||||
log.info("Document structure analysis successful, starting redaction analysis...");
|
||||
|
||||
entityRedactionService.processDocument(classifiedDoc, analyzeRequest.getRuleSetId(), analyzeRequest.getManualRedactions());
|
||||
|
||||
var storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), FileType.ORIGIN));
|
||||
|
||||
// TODO move this to where it makes sense - or remove completely
|
||||
try (PDDocument pdDocument = PDDocument.load(storedObjectStream, MemoryUsageSetting.setupTempFileOnly())) {
|
||||
pdDocument.setAllSecurityToBeRemoved(true);
|
||||
pdfSegmentationService.postProcessSections(pdDocument, classifiedDoc.getSectionText());
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
|
||||
|
||||
redactionLogCreatorService.createRedactionLog(classifiedDoc, pageCount, analyzeRequest.getManualRedactions(), analyzeRequest
|
||||
.getRuleSetId());
|
||||
|
||||
@ -89,30 +73,28 @@ public class ReanalyzeService {
|
||||
return analyzeResponseService.createAnalyzeResponse(pageCount, redactionLog, changeLog);
|
||||
}
|
||||
|
||||
public AnalyzeResult reanalyze(@RequestBody AnalyzeRequest renalyzeRequest) {
|
||||
var text = redactionStorageService.getText(renalyzeRequest.getProjectId(), renalyzeRequest.getFileId());
|
||||
// new procedure was not applied, we need a complete analysis
|
||||
|
||||
@SneakyThrows
|
||||
public AnalyzeResult reanalyze(@RequestBody AnalyzeRequest analyzeRequest) {
|
||||
|
||||
var redactionLog = redactionStorageService.getRedactionLog(analyzeRequest.getProjectId(), analyzeRequest.getFileId());
|
||||
var text = redactionStorageService.getText(analyzeRequest.getProjectId(), analyzeRequest.getFileId());
|
||||
|
||||
// not yet ready for reanalysis
|
||||
if (text.getNumberOfPages() == 0) {
|
||||
return analyze(AnalyzeRequest.builder()
|
||||
.ruleSetId(renalyzeRequest.getRuleSetId())
|
||||
.manualRedactions(renalyzeRequest.getManualRedactions())
|
||||
.projectId(renalyzeRequest.getProjectId())
|
||||
.fileId(renalyzeRequest.getFileId())
|
||||
.build());
|
||||
return analyze(analyzeRequest);
|
||||
}
|
||||
var redactionLog = redactionStorageService.getRedactionLog(renalyzeRequest.getProjectId(), renalyzeRequest.getFileId());
|
||||
|
||||
DictionaryIncrement dictionaryIncrement = dictionaryService.getDictionaryIncrements(analyzeRequest.getRuleSetId(), redactionLog.getDictionaryVersion());
|
||||
|
||||
DictionaryIncrement dictionaryIncrement = dictionaryService.getDictionaryIncrements(renalyzeRequest.getRuleSetId(), redactionLog.getDictionaryVersion());
|
||||
|
||||
Set<String> manualForceAndRemoveIds = getForceAndRemoveIds(renalyzeRequest.getManualRedactions());
|
||||
Set<String> manualForceAndRemoveIds = getForceAndRemoveIds(analyzeRequest.getManualRedactions());
|
||||
Map<String, List<Comment>> comments = null;
|
||||
Set<ManualRedactionEntry> manualAdds = null;
|
||||
|
||||
if (renalyzeRequest.getManualRedactions() != null) {
|
||||
if (analyzeRequest.getManualRedactions() != null) {
|
||||
// TODO comments will be removed from redactionLog, so we ignore this first.
|
||||
comments = renalyzeRequest.getManualRedactions().getComments();
|
||||
manualAdds = renalyzeRequest.getManualRedactions().getEntriesToAdd();
|
||||
comments = analyzeRequest.getManualRedactions().getComments();
|
||||
manualAdds = analyzeRequest.getManualRedactions().getEntriesToAdd();
|
||||
}
|
||||
|
||||
Set<Integer> sectionsToReanalyse = new HashSet<>();
|
||||
@ -146,115 +128,113 @@ public class ReanalyzeService {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if (sectionsToReanalyse.isEmpty() && (manualAdds == null || manualAdds.isEmpty())) {
|
||||
redactionLog.setDictionaryVersion(dictionaryIncrement.getDictionaryVersion());
|
||||
var changeLog = redactionChangeLogService.createAndStoreChangeLog(renalyzeRequest.getProjectId(), renalyzeRequest.getFileId(), redactionLog);
|
||||
redactionStorageService.storeObject(renalyzeRequest.getProjectId(), renalyzeRequest.getFileId(), FileType.REDACTION_LOG, redactionLog);
|
||||
var changeLog = redactionChangeLogService.createAndStoreChangeLog(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), redactionLog);
|
||||
redactionStorageService.storeObject(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), FileType.REDACTION_LOG, redactionLog);
|
||||
return analyzeResponseService.createAnalyzeResponse(text.getNumberOfPages(), redactionLog, changeLog);
|
||||
}
|
||||
|
||||
try {
|
||||
List<SectionText> reanalysisSections = new ArrayList<>();
|
||||
|
||||
List<SectionText> reanalysisSections = new ArrayList<>();
|
||||
for (SectionText sectionText : text.getSectionTexts()) {
|
||||
|
||||
if (sectionsToReanalyse.contains(sectionText.getSectionNumber())) {
|
||||
reanalysisSections.add(sectionText);
|
||||
}
|
||||
for (SectionText sectionText : text.getSectionTexts()) {
|
||||
|
||||
if (sectionsToReanalyse.contains(sectionText.getSectionNumber())) {
|
||||
reanalysisSections.add(sectionText);
|
||||
}
|
||||
|
||||
KieContainer kieContainer = droolsExecutionService.updateRules(renalyzeRequest.getRuleSetId());
|
||||
|
||||
Dictionary dictionary = dictionaryService.getDeepCopyDictionary(renalyzeRequest.getRuleSetId());
|
||||
|
||||
List<SectionSearchableTextPair> sectionSearchableTextPairs = new ArrayList<>();
|
||||
for (SectionText reanalysisSection : reanalysisSections) {
|
||||
|
||||
Set<Entity> entities = entityRedactionService.findEntities(reanalysisSection.getSearchableText(), reanalysisSection
|
||||
.getHeadline(), reanalysisSection.getSectionNumber(), dictionary, false);
|
||||
if (reanalysisSection.getCellStarts() != null) {
|
||||
surroundingWordsService.addSurroundingText(entities, reanalysisSection.getSearchableText(), dictionary, reanalysisSection
|
||||
.getCellStarts());
|
||||
} else {
|
||||
surroundingWordsService.addSurroundingText(entities, reanalysisSection.getSearchableText(), dictionary);
|
||||
}
|
||||
|
||||
sectionSearchableTextPairs.add(new SectionSearchableTextPair(Section.builder()
|
||||
.isLocal(false)
|
||||
.dictionaryTypes(dictionary.getTypes())
|
||||
.entities(entities)
|
||||
.text(reanalysisSection.getSearchableText().getAsStringWithLinebreaks())
|
||||
.searchText(reanalysisSection.getSearchableText().toString())
|
||||
.headline(reanalysisSection.getHeadline())
|
||||
.sectionNumber(reanalysisSection.getSectionNumber())
|
||||
.tabularData(reanalysisSection.getTabularData())
|
||||
.searchableText(reanalysisSection.getSearchableText())
|
||||
.dictionary(dictionary)
|
||||
.images(reanalysisSection.getImages())
|
||||
.build(), reanalysisSection.getSearchableText()));
|
||||
}
|
||||
|
||||
Set<Entity> entities = new HashSet<>();
|
||||
Map<Integer, Set<Image>> imagesPerPage = new HashMap<>();
|
||||
sectionSearchableTextPairs.forEach(sectionSearchableTextPair -> {
|
||||
Section analysedRowSection = droolsExecutionService.executeRules(kieContainer, sectionSearchableTextPair
|
||||
.getSection());
|
||||
entities.addAll(analysedRowSection.getEntities());
|
||||
EntitySearchUtils.removeEntitiesContainedInLarger(entities);
|
||||
|
||||
for (Image image : analysedRowSection.getImages()) {
|
||||
imagesPerPage.computeIfAbsent(image.getPage(), (a) -> new HashSet<>()).add(image);
|
||||
}
|
||||
|
||||
});
|
||||
|
||||
Map<Integer, List<Entity>> entitiesPerPage = new HashMap<>();
|
||||
for (Entity entity : entities) {
|
||||
Map<Integer, List<EntityPositionSequence>> sequenceOnPage = new HashMap<>();
|
||||
for (EntityPositionSequence entityPositionSequence : entity.getPositionSequences()) {
|
||||
sequenceOnPage.computeIfAbsent(entityPositionSequence.getPageNumber(), (x) -> new ArrayList<>())
|
||||
.add(entityPositionSequence);
|
||||
}
|
||||
|
||||
for (Map.Entry<Integer, List<EntityPositionSequence>> entry : sequenceOnPage.entrySet()) {
|
||||
entitiesPerPage.computeIfAbsent(entry.getKey(), (x) -> new ArrayList<>())
|
||||
.add(new Entity(entity.getWord(), entity.getType(), entity.isRedaction(), entity.getRedactionReason(), entry
|
||||
.getValue(), entity.getHeadline(), entity.getMatchedRule(), entity.getSectionNumber(), entity
|
||||
.getLegalBasis(), entity.isDictionaryEntry(), entity.getTextBefore(), entity.getTextAfter(), entity
|
||||
.getStart(), entity.getEnd()));
|
||||
}
|
||||
}
|
||||
|
||||
List<RedactionLogEntry> newRedactionLogEntries = new ArrayList<>();
|
||||
for (int page = 1; page <= text.getNumberOfPages(); page++) {
|
||||
if (entitiesPerPage.get(page) != null) {
|
||||
newRedactionLogEntries.addAll(redactionLogCreatorService.addEntries(entitiesPerPage, renalyzeRequest
|
||||
.getManualRedactions(), page, renalyzeRequest.getRuleSetId()));
|
||||
}
|
||||
|
||||
if (imagesPerPage.get(page) != null) {
|
||||
newRedactionLogEntries.addAll(redactionLogCreatorService.addImageEntries(imagesPerPage, renalyzeRequest
|
||||
.getManualRedactions(), page, renalyzeRequest.getRuleSetId()));
|
||||
}
|
||||
|
||||
newRedactionLogEntries.addAll(redactionLogCreatorService.addManualAddEntries(manualAdds, comments, page, renalyzeRequest
|
||||
.getRuleSetId()));
|
||||
}
|
||||
|
||||
redactionLog.getRedactionLogEntry().removeIf(entry -> sectionsToReanalyse.contains(entry.getSectionNumber()) && !entry.isImage() || entry.getSectionNumber() == 0 && !entry.isImage());
|
||||
redactionLog.getRedactionLogEntry().addAll(newRedactionLogEntries);
|
||||
redactionLog.setDictionaryVersion(dictionaryIncrement.getDictionaryVersion());
|
||||
|
||||
var changeLog = redactionChangeLogService.createAndStoreChangeLog(renalyzeRequest.getProjectId(), renalyzeRequest.getFileId(), redactionLog);
|
||||
redactionStorageService.storeObject(renalyzeRequest.getProjectId(), renalyzeRequest.getFileId(), FileType.REDACTION_LOG, redactionLog);
|
||||
return analyzeResponseService.createAnalyzeResponse(text.getNumberOfPages(), redactionLog, changeLog);
|
||||
|
||||
|
||||
} catch (Exception e) {
|
||||
throw new RedactionException(e);
|
||||
}
|
||||
|
||||
|
||||
//--
|
||||
|
||||
KieContainer kieContainer = droolsExecutionService.updateRules(analyzeRequest.getRuleSetId());
|
||||
|
||||
Dictionary dictionary = dictionaryService.getDeepCopyDictionary(analyzeRequest.getRuleSetId());
|
||||
|
||||
List<SectionSearchableTextPair> sectionSearchableTextPairs = new ArrayList<>();
|
||||
for (SectionText reanalysisSection : reanalysisSections) {
|
||||
|
||||
Set<Entity> entities = entityRedactionService.findEntities(reanalysisSection.getSearchableText(), reanalysisSection
|
||||
.getHeadline(), reanalysisSection.getSectionNumber(), dictionary, false);
|
||||
if (reanalysisSection.getCellStarts() != null) {
|
||||
surroundingWordsService.addSurroundingText(entities, reanalysisSection.getSearchableText(), dictionary, reanalysisSection
|
||||
.getCellStarts());
|
||||
} else {
|
||||
surroundingWordsService.addSurroundingText(entities, reanalysisSection.getSearchableText(), dictionary);
|
||||
}
|
||||
|
||||
sectionSearchableTextPairs.add(new SectionSearchableTextPair(Section.builder()
|
||||
.isLocal(false)
|
||||
.dictionaryTypes(dictionary.getTypes())
|
||||
.entities(entities)
|
||||
.text(reanalysisSection.getSearchableText().getAsStringWithLinebreaks())
|
||||
.searchText(reanalysisSection.getSearchableText().toString())
|
||||
.headline(reanalysisSection.getHeadline())
|
||||
.sectionNumber(reanalysisSection.getSectionNumber())
|
||||
.tabularData(reanalysisSection.getTabularData())
|
||||
.searchableText(reanalysisSection.getSearchableText())
|
||||
.dictionary(dictionary)
|
||||
.images(reanalysisSection.getImages())
|
||||
.build(), reanalysisSection.getSearchableText()));
|
||||
}
|
||||
|
||||
Set<Entity> entities = new HashSet<>();
|
||||
Map<Integer, Set<Image>> imagesPerPage = new HashMap<>();
|
||||
sectionSearchableTextPairs.forEach(sectionSearchableTextPair -> {
|
||||
Section analysedRowSection = droolsExecutionService.executeRules(kieContainer, sectionSearchableTextPair
|
||||
.getSection());
|
||||
entities.addAll(analysedRowSection.getEntities());
|
||||
EntitySearchUtils.removeEntitiesContainedInLarger(entities);
|
||||
|
||||
for (Image image : analysedRowSection.getImages()) {
|
||||
imagesPerPage.computeIfAbsent(image.getPage(), (a) -> new HashSet<>()).add(image);
|
||||
}
|
||||
|
||||
});
|
||||
|
||||
Map<Integer, List<Entity>> entitiesPerPage = new HashMap<>();
|
||||
for (Entity entity : entities) {
|
||||
Map<Integer, List<EntityPositionSequence>> sequenceOnPage = new HashMap<>();
|
||||
for (EntityPositionSequence entityPositionSequence : entity.getPositionSequences()) {
|
||||
sequenceOnPage.computeIfAbsent(entityPositionSequence.getPageNumber(), (x) -> new ArrayList<>())
|
||||
.add(entityPositionSequence);
|
||||
}
|
||||
|
||||
for (Map.Entry<Integer, List<EntityPositionSequence>> entry : sequenceOnPage.entrySet()) {
|
||||
entitiesPerPage.computeIfAbsent(entry.getKey(), (x) -> new ArrayList<>())
|
||||
.add(new Entity(entity.getWord(), entity.getType(), entity.isRedaction(), entity.getRedactionReason(), entry
|
||||
.getValue(), entity.getHeadline(), entity.getMatchedRule(), entity.getSectionNumber(), entity
|
||||
.getLegalBasis(), entity.isDictionaryEntry(), entity.getTextBefore(), entity.getTextAfter(), entity
|
||||
.getStart(), entity.getEnd()));
|
||||
}
|
||||
}
|
||||
|
||||
List<RedactionLogEntry> newRedactionLogEntries = new ArrayList<>();
|
||||
for (int page = 1; page <= text.getNumberOfPages(); page++) {
|
||||
if (entitiesPerPage.get(page) != null) {
|
||||
newRedactionLogEntries.addAll(redactionLogCreatorService.addEntries(entitiesPerPage, analyzeRequest
|
||||
.getManualRedactions(), page, analyzeRequest.getRuleSetId()));
|
||||
}
|
||||
|
||||
if (imagesPerPage.get(page) != null) {
|
||||
newRedactionLogEntries.addAll(redactionLogCreatorService.addImageEntries(imagesPerPage, analyzeRequest
|
||||
.getManualRedactions(), page, analyzeRequest.getRuleSetId()));
|
||||
}
|
||||
|
||||
newRedactionLogEntries.addAll(redactionLogCreatorService.addManualAddEntries(manualAdds, comments, page, analyzeRequest
|
||||
.getRuleSetId()));
|
||||
}
|
||||
|
||||
|
||||
redactionLog.getRedactionLogEntry().removeIf(entry -> sectionsToReanalyse.contains(entry.getSectionNumber()) && !entry.isImage() || entry.getSectionNumber() == 0 && !entry.isImage());
|
||||
redactionLog.getRedactionLogEntry().addAll(newRedactionLogEntries);
|
||||
redactionLog.setDictionaryVersion(dictionaryIncrement.getDictionaryVersion());
|
||||
|
||||
var changeLog = redactionChangeLogService.createAndStoreChangeLog(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), redactionLog);
|
||||
redactionStorageService.storeObject(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), FileType.REDACTION_LOG, redactionLog);
|
||||
return analyzeResponseService.createAnalyzeResponse(text.getNumberOfPages(), redactionLog, changeLog);
|
||||
|
||||
}
|
||||
|
||||
|
||||
@ -277,7 +257,7 @@ public class ReanalyzeService {
|
||||
|
||||
return Image.builder()
|
||||
.type(entry.getType())
|
||||
.position(new Rectangle2D.Float(position.getTopLeft().getX(), position.getTopLeft()
|
||||
.position(new RedRectangle2D(position.getTopLeft().getX(), position.getTopLeft()
|
||||
.getY(), position.getWidth(), position.getHeight()))
|
||||
.sectionNumber(entry.getSectionNumber())
|
||||
.section(entry.getSection())
|
||||
|
||||
@ -3,9 +3,9 @@ package com.iqser.red.service.redaction.v1.server.redaction.utils;
|
||||
import com.google.common.hash.HashFunction;
|
||||
import com.google.common.hash.Hashing;
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.RedRectangle2D;
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.List;
|
||||
|
||||
@ -25,12 +25,8 @@ public class IdBuilder {
|
||||
}
|
||||
|
||||
|
||||
public String buildId(Rectangle2D rectangle2D, int page) {
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
sb.append("x").append(rectangle2D.getX()).append("y").append(rectangle2D.getY()).append("h").append(rectangle2D.getHeight()).append("w").append(rectangle2D.getWidth()).append("p").append(page);
|
||||
|
||||
return hashFunction.hashString(sb.toString(), StandardCharsets.UTF_8).toString();
|
||||
public String buildId(RedRectangle2D rectangle2D, int page) {
|
||||
return hashFunction.hashString("x" + rectangle2D.getX() + "y" + rectangle2D.getY() + "h" + rectangle2D.getHeight() + "w" + rectangle2D.getWidth() + "p" + page, StandardCharsets.UTF_8).toString();
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -1,21 +1,15 @@
|
||||
package com.iqser.red.service.redaction.v1.server.segmentation;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.model.SectionArea;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.SectionText;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.service.BlockificationService;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.service.ClassificationService;
|
||||
import com.iqser.red.service.redaction.v1.server.exception.RedactionException;
|
||||
import com.iqser.red.service.redaction.v1.server.memory.MemoryStats;
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.PDFAreaTextStripper;
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.PDFLinesTextStripper;
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.CellValue;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.service.ImageClassificationService;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.CleanRulings;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.service.RulingCleaningService;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.service.TableExtractionService;
|
||||
@ -28,15 +22,12 @@ import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.io.File;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
@ -53,79 +44,6 @@ public class PdfSegmentationService {
|
||||
private final ImageClassificationService imageClassificationService;
|
||||
|
||||
|
||||
public void postProcessSections(PDDocument pdDocument, List<SectionText> texts) {
|
||||
|
||||
try {
|
||||
for (SectionText sectionText : texts) {
|
||||
|
||||
List<TextBlock> textBlocks = new ArrayList<>();
|
||||
|
||||
Map<Integer, List<SectionArea>> sectionAreasPerPage = new HashMap<>();
|
||||
for (SectionArea sectionArea : sectionText.getSectionAreas()) {
|
||||
sectionAreasPerPage.computeIfAbsent(sectionArea.getPage(), (x) -> new ArrayList<>())
|
||||
.add(sectionArea);
|
||||
}
|
||||
|
||||
Map<String, CellValue> tabularData = new HashMap<>();
|
||||
List<Integer> cellStarts = new ArrayList<>();
|
||||
for (Integer page : sectionAreasPerPage.keySet()) {
|
||||
List<SectionArea> areasOnPage = sectionAreasPerPage.get(page);
|
||||
|
||||
PDPage pdPage = pdDocument.getPage(page - 1);
|
||||
PDRectangle cropBox = pdPage.getCropBox();
|
||||
PDFAreaTextStripper textStripper = new PDFAreaTextStripper();
|
||||
textStripper.setPageNumber(page);
|
||||
|
||||
int cellStart = 0;
|
||||
for (SectionArea sectionArea : areasOnPage) {
|
||||
|
||||
Rectangle2D rect = null;
|
||||
if (pdPage.getRotation() == 90) {
|
||||
rect = new Rectangle2D.Float(sectionArea.getTopLeft().getY(), sectionArea.getTopLeft()
|
||||
.getX(), sectionArea.getHeight(), sectionArea.getWidth() + 0.001f);
|
||||
} else {
|
||||
rect = new Rectangle2D.Float(sectionArea.getTopLeft().getX(), -sectionArea.getTopLeft()
|
||||
.getY() + cropBox.getUpperRightY() - sectionArea.getHeight(), sectionArea.getWidth(), sectionArea
|
||||
.getHeight() + 0.001f);
|
||||
}
|
||||
|
||||
textStripper.addRegion(String.valueOf(1), rect);
|
||||
textStripper.extractRegions(pdPage);
|
||||
textStripper.getTextForRegion(String.valueOf(1));
|
||||
List<TextPositionSequence> positions = textStripper.getTextPositionSequences();
|
||||
|
||||
TextBlock textBlock = new TextBlock(sectionArea.getTopLeft().getX(), sectionArea.getTopLeft()
|
||||
.getX() + sectionArea.getWidth(), sectionArea.getTopLeft()
|
||||
.getY(), sectionArea.getTopLeft().getY() + sectionArea.getHeight(), positions, 0);
|
||||
|
||||
if (sectionText.isTable()) {
|
||||
Cell cell = new Cell();
|
||||
cell.addTextBlock(textBlock);
|
||||
tabularData.put(sectionArea.getHeader(), new CellValue(cell.getTextBlocks(), cellStart));
|
||||
cellStarts.add(cellStart);
|
||||
cellStart = cellStart + cell.toString().trim().length() + 1;
|
||||
}
|
||||
|
||||
textBlocks.add(textBlock);
|
||||
textStripper.clearPositions();
|
||||
}
|
||||
|
||||
}
|
||||
sectionText.setTextBlocks(textBlocks);
|
||||
sectionText.setTabularData(tabularData);
|
||||
if (sectionText.isTable()) {
|
||||
sectionText.setCellStarts(cellStarts);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
} catch (Exception e) {
|
||||
throw new RedactionException(e);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
public Document parseDocument(InputStream documentInputStream) throws IOException {
|
||||
PDDocument pdDocument = null;
|
||||
try {
|
||||
@ -141,6 +59,7 @@ public class PdfSegmentationService {
|
||||
pdDocument = reinitializePDDocument(tempFile, null);
|
||||
long pageCount = pdDocument.getNumberOfPages();
|
||||
|
||||
long t1= System.currentTimeMillis();
|
||||
for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++) {
|
||||
|
||||
if (pageNumber % MAX_PAGES_BEFORE_GC == 0) {
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
package com.iqser.red.service.redaction.v1.server.tableextraction.model;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import com.iqser.red.service.redaction.v1.model.Rectangle;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
@ -27,10 +28,12 @@ public abstract class AbstractTextContainer {
|
||||
return page == other.getPage() && this.minX <= other.getTopLeft().getX() && this.maxX >= other.getTopLeft().getX() + other.getWidth() && this.minY <= other.getTopLeft().getY() && this.maxY >= other.getTopLeft().getY() + other.getHeight();
|
||||
}
|
||||
|
||||
@JsonIgnore
|
||||
public float getHeight() {
|
||||
return maxY - minY;
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public float getWidth() {
|
||||
return maxX - minX;
|
||||
}
|
||||
|
||||
@ -44,7 +44,7 @@ public class FilySystemBackedStorageService extends StorageService {
|
||||
|
||||
/**
 * Calls {@code delete()} on every value held in {@code dataMap} and then empties the map.
 */
// NOTE(review): this span shows both an active and a commented-out {@code v.delete()} call,
// presumably the two sides of a diff view. With the call commented out the values would only
// be dropped from the map and never deleted -- confirm which variant is intended.
public void clearStorage() {
|
||||
this.dataMap.forEach((k, v) -> {
|
||||
v.delete();
|
||||
// v.delete();
|
||||
});
|
||||
this.dataMap.clear();
|
||||
}
|
||||
|
||||
@ -458,6 +458,16 @@ public class RedactionIntegrationTest {
|
||||
assertThat(result).isNotNull();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testXXX() {
|
||||
AnalyzeRequest request = prepareStorage("files/Metolachlor/S-Metolachlor_RAR_08_Volume_3CA_B-6_2018-09-06.pdf");
|
||||
MemoryStats.printMemoryStats();
|
||||
AnalyzeResult result = redactionController.analyze(request);
|
||||
assertThat(result).isNotNull();
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
@Test
|
||||
public void noExceptionShouldBeThrownForAnyFiles() throws IOException {
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user