Pull request #142: Improved redaction performance

Merge in RED/redaction-service from improved-redaction-performance to master

* commit 'b34fc673c4d5a4440d6e5f2391db4420c4d2acf9':
  bamboo-specs/src/main/java/buildjob/PlanSpec.java edited online with Bitbucket
  fixed some test issues
  fixed pmd
  updated planspec
  updated redrect
  set mvn opts
  run tests with real life jvm args to detect oom issues early
  code format, dependency and test update, logging for reanalysis
  reworked reanalysis and text storage
  Serialization of text
This commit is contained in:
Timo Bejan 2021-04-20 10:30:15 +02:00
commit 07b05b2d89
24 changed files with 337 additions and 271 deletions

View File

@ -1,7 +1,5 @@
package buildjob;
import static com.atlassian.bamboo.specs.builders.task.TestParserTask.createJUnitParserTask;
import com.atlassian.bamboo.specs.api.BambooSpec;
import com.atlassian.bamboo.specs.api.builders.BambooKey;
import com.atlassian.bamboo.specs.api.builders.docker.DockerConfiguration;
@ -24,6 +22,8 @@ import com.atlassian.bamboo.specs.builders.trigger.BitbucketServerTrigger;
import com.atlassian.bamboo.specs.model.task.InjectVariablesScope;
import com.atlassian.bamboo.specs.util.BambooServer;
import static com.atlassian.bamboo.specs.builders.task.TestParserTask.createJUnitParserTask;
/**
* Plan configuration for Bamboo.
* Learn more on: <a href="https://confluence.atlassian.com/display/BAMBOO/Bamboo+Specs">https://confluence.atlassian.com/display/BAMBOO/Bamboo+Specs</a>
@ -33,6 +33,8 @@ public class PlanSpec {
private static final String SERVICE_NAME = "redaction-service";
private static final String JVM_ARGS =" -Xmx4g -XX:+ExitOnOutOfMemoryError -XX:SurvivorRatio=2 -XX:NewRatio=1 -XX:InitialTenuringThreshold=16 -XX:MaxTenuringThreshold=16 -XX:InitiatingHeapOccupancyPercent=35 ";
private static final String SERVICE_KEY = SERVICE_NAME.toUpperCase().replaceAll("-", "");
/**
@ -82,9 +84,12 @@ public class PlanSpec {
.checkoutItems(new CheckoutItem().defaultRepository()),
new ScriptTask()
.description("Build")
.environmentVariables("MAVEN_OPTS="+JVM_ARGS)
.inlineBody("#!/bin/bash\n" +
"set -e\n" +
"export MAVEN_OPTS=\"$MAVEN_OPTS "+JVM_ARGS +"\"\n" +
"if [[ \"${bamboo.version_tag}\" != \"dev\" ]]; then ${bamboo_capability_system_builder_mvn3_Maven_3}/bin/mvn --no-transfer-progress -f ${bamboo_build_working_directory}/" + SERVICE_NAME + "-v1/pom.xml versions:set -DnewVersion=${bamboo.version_tag}; fi\n" +
"if [[ \"${bamboo.version_tag}\" != \"dev\" ]]; then ${bamboo_capability_system_builder_mvn3_Maven_3}/bin/mvn --no-transfer-progress -f ${bamboo_build_working_directory}/" + SERVICE_NAME + "-image-v1/pom.xml versions:set -DnewVersion=${bamboo.version_tag}; fi\n" +

View File

@ -32,7 +32,7 @@
<dependency>
<groupId>com.iqser.red</groupId>
<artifactId>platform-commons-dependency</artifactId>
<version>1.2.9</version>
<version>1.3.0</version>
<scope>import</scope>
<type>pom</type>
</dependency>

View File

@ -1,5 +1,6 @@
package com.iqser.red.service.redaction.v1.server.classification.model;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
import lombok.AllArgsConstructor;
import lombok.Data;
@ -12,7 +13,7 @@ public class Footer {
private List<TextBlock> textBlocks;
@JsonIgnore
public SearchableText getSearchableText() {
SearchableText searchableText = new SearchableText();

View File

@ -1,5 +1,6 @@
package com.iqser.red.service.redaction.v1.server.classification.model;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
import lombok.AllArgsConstructor;
import lombok.Data;
@ -12,7 +13,7 @@ public class Header {
private List<TextBlock> textBlocks;
@JsonIgnore
public SearchableText getSearchableText() {
SearchableText searchableText = new SearchableText();

View File

@ -1,5 +1,6 @@
package com.iqser.red.service.redaction.v1.server.classification.model;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.iqser.red.service.redaction.v1.model.SectionArea;
import com.iqser.red.service.redaction.v1.server.redaction.model.CellValue;
import com.iqser.red.service.redaction.v1.server.redaction.model.Image;
@ -31,6 +32,12 @@ public class SectionText {
private List<Integer> cellStarts = new ArrayList<>();
public void setTabularData(Map<String, CellValue> tabularData) {
tabularData.remove(null);
this.tabularData = tabularData;
}
@JsonIgnore
public SearchableText getSearchableText() {
SearchableText searchableText = new SearchableText();

View File

@ -1,11 +1,13 @@
package com.iqser.red.service.redaction.v1.server.classification.model;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
import java.util.ArrayList;
import java.util.List;
@ -13,6 +15,7 @@ import java.util.List;
@AllArgsConstructor
@Builder
@Data
@NoArgsConstructor
public class TextBlock extends AbstractTextContainer {
@Builder.Default
@ -116,6 +119,7 @@ public class TextBlock extends AbstractTextContainer {
}
@Override
@JsonIgnore
public String getText() {
StringBuilder sb = new StringBuilder();

View File

@ -1,5 +1,6 @@
package com.iqser.red.service.redaction.v1.server.classification.model;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
import lombok.AllArgsConstructor;
import lombok.Data;
@ -12,7 +13,7 @@ public class UnclassifiedText {
private List<TextBlock> textBlocks;
@JsonIgnore
public SearchableText getSearchableText() {
SearchableText searchableText = new SearchableText();

View File

@ -59,7 +59,7 @@ public class RedactionController implements RedactionResource {
try (PDDocument pdDocument = PDDocument.load(storedObjectStream, MemoryUsageSetting.setupTempFileOnly())) {
pdDocument.setAllSecurityToBeRemoved(true);
dictionaryService.updateDictionary(redactionLog.getRuleSetId());
annotationService.annotate(pdDocument, redactionLog, sectionsGrid);
@ -131,7 +131,7 @@ public class RedactionController implements RedactionResource {
try {
var storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(redactionRequest.getProjectId(), redactionRequest.getFileId(), FileType.ORIGIN));
classifiedDoc = pdfSegmentationService.parseDocument(storedObjectStream);
classifiedDoc = pdfSegmentationService.parseDocument(storedObjectStream, true);
} catch (Exception e) {
throw new RedactionException(e);
}

View File

@ -0,0 +1,52 @@
package com.iqser.red.service.redaction.v1.server.parsing.model;
import com.fasterxml.jackson.annotation.JsonIgnore;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.SneakyThrows;
import org.apache.pdfbox.text.TextPosition;
import org.springframework.beans.BeanUtils;
/**
 * Serializable snapshot of the values read from a PDFBox {@link TextPosition}.
 *
 * <p>Instances are plain Lombok data beans (getters, setters, no-args constructor). Fields
 * annotated with {@link JsonIgnore} are left out of the JSON form; per the original author's
 * notes they are not needed for reanalysis.
 */
@Data
@NoArgsConstructor
public class RedTextPosition {

    /** String form of the source text matrix, i.e. {@code getTextMatrix().toString()}. */
    private String textMatrix;
    private int rotation;
    private float y;
    private float pageHeight;
    private float pageWidth;
    private String unicode;
    private float XDirAdj;
    private float YDirAdj;
    private float width;
    private float heightDir;

    /** Excluded from JSON: not used in reanalysis. */
    @JsonIgnore
    private float widthOfSpace;

    /** Excluded from JSON: not used in reanalysis. */
    @JsonIgnore
    private float fontSizeInPt;

    /** Name of the source font ({@code getFont().getName()}). Excluded from JSON: not used in reanalysis. */
    @JsonIgnore
    private String fontName;

    /**
     * Creates a {@code RedTextPosition} from a PDFBox text position.
     *
     * <p>Bean properties are first copied with Spring's {@code BeanUtils.copyProperties}
     * (presumably every property whose name and type match on both sides - confirm against the
     * Spring version in use). The font name, the font size in points and the string form of the
     * text matrix are then set explicitly.
     *
     * @param textPosition the PDFBox text position to convert; its font and text matrix are
     *                     dereferenced, so they must not be {@code null}
     * @return a new instance populated from {@code textPosition}
     */
    @SneakyThrows
    public static RedTextPosition fromTextPosition(TextPosition textPosition) {

        RedTextPosition converted = new RedTextPosition();
        BeanUtils.copyProperties(textPosition, converted);

        // Values that need an explicit conversion or come from a nested object.
        String nameOfFont = textPosition.getFont().getName();
        converted.setFontName(nameOfFont);
        converted.setFontSizeInPt(textPosition.getFontSizeInPt());
        converted.setTextMatrix(textPosition.getTextMatrix().toString());

        return converted;
    }

}

View File

@ -1,29 +1,52 @@
package com.iqser.red.service.redaction.v1.server.parsing.model;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
import com.iqser.red.service.redaction.v1.model.Point;
import com.iqser.red.service.redaction.v1.model.Rectangle;
import lombok.Data;
import lombok.RequiredArgsConstructor;
import lombok.NoArgsConstructor;
import org.apache.pdfbox.text.TextPosition;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
@Data
@RequiredArgsConstructor
@NoArgsConstructor
@JsonIgnoreProperties({ "empty" })
public class TextPositionSequence implements CharSequence {
private final int page;
private List<TextPosition> textPositions = new ArrayList<>();
private int page;
private List<RedTextPosition> textPositions = new ArrayList<>();
private float x1;
private float x2;
public TextPositionSequence(int page) {
this.page = page;
}
public static TextPositionSequence fromData(List<RedTextPosition> textPositions, int page) {
var textPositionSequence = new TextPositionSequence();
textPositionSequence.textPositions = textPositions;
textPositionSequence.page = page;
return textPositionSequence;
}
public TextPositionSequence(List<TextPosition> textPositions, int page) {
this.textPositions = textPositions;
this.textPositions = textPositions.stream().map(RedTextPosition::fromTextPosition).collect(Collectors.toList());
this.page = page;
}
@Override
public int length() {
@ -34,7 +57,7 @@ public class TextPositionSequence implements CharSequence {
@Override
public char charAt(int index) {
TextPosition textPosition = textPositionAt(index);
RedTextPosition textPosition = textPositionAt(index);
String text = textPosition.getUnicode();
return text.charAt(0);
}
@ -42,7 +65,7 @@ public class TextPositionSequence implements CharSequence {
public char charAt(int index, boolean caseInSensitive) {
TextPosition textPosition = textPositionAt(index);
RedTextPosition textPosition = textPositionAt(index);
String text = textPosition.getUnicode();
return caseInSensitive ? text.toLowerCase().charAt(0) : text.charAt(0);
}
@ -51,7 +74,7 @@ public class TextPositionSequence implements CharSequence {
@Override
public TextPositionSequence subSequence(int start, int end) {
return new TextPositionSequence(textPositions.subList(start, end), page);
return fromData(textPositions.subList(start, end), page);
}
@ -66,18 +89,25 @@ public class TextPositionSequence implements CharSequence {
}
public TextPosition textPositionAt(int index) {
public RedTextPosition textPositionAt(int index) {
return textPositions.get(index);
}
public void add(TextPosition textPosition) {
public void add(RedTextPosition textPosition) {
this.textPositions.add(textPosition);
}
public void add(TextPosition textPosition) {
this.textPositions.add(RedTextPosition.fromTextPosition(textPosition));
}
@JsonIgnore
public float getX1() {
if (textPositions.get(0).getRotation() == 90) {
@ -88,6 +118,7 @@ public class TextPositionSequence implements CharSequence {
}
@JsonIgnore
public float getX2() {
if (textPositions.get(0).getRotation() == 90) {
@ -98,13 +129,14 @@ public class TextPositionSequence implements CharSequence {
}
}
@JsonIgnore
public float getRotationAdjustedY() {
return textPositions.get(0).getY();
}
@JsonIgnore
public float getY1() {
if (textPositions.get(0).getRotation() == 90) {
@ -115,6 +147,7 @@ public class TextPositionSequence implements CharSequence {
}
@JsonIgnore
public float getY2() {
if (textPositions.get(0).getRotation() == 90) {
@ -125,38 +158,40 @@ public class TextPositionSequence implements CharSequence {
}
@JsonIgnore
public float getTextHeight() {
return textPositions.get(0).getHeightDir() + 2;
}
@JsonIgnore
public float getHeight() {
return getY2() - getY1();
}
@JsonIgnore
public float getWidth() {
return getX2() - getX1();
}
@JsonIgnore
public String getFont() {
return textPositions.get(0)
.getFont()
.toString()
return textPositions.get(0).getFontName()
.toLowerCase()
.replaceAll(",bold", "")
.replaceAll(",italic", "");
}
@JsonIgnore
public String getFontStyle() {
String lowercaseFontName = textPositions.get(0).getFont().toString().toLowerCase();
String lowercaseFontName = textPositions.get(0).getFontName().toLowerCase();
if (lowercaseFontName.contains("bold") && lowercaseFontName.contains("italic")) {
return "bold, italic";
@ -170,25 +205,25 @@ public class TextPositionSequence implements CharSequence {
}
@JsonIgnore
public float getFontSize() {
return textPositions.get(0).getFontSizeInPt();
}
@JsonIgnore
public float getSpaceWidth() {
return textPositions.get(0).getWidthOfSpace();
}
@JsonIgnore
public int getRotation() {
return textPositions.get(0).getRotation();
}
@JsonIgnore
public Rectangle getRectangle() {
float height = getTextHeight();

View File

@ -3,19 +3,23 @@ package com.iqser.red.service.redaction.v1.server.redaction.model;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
import lombok.Value;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
@Value
@Data
@NoArgsConstructor
@AllArgsConstructor
public class CellValue {
private List<TextBlock> textBlocks;
private List<TextBlock> textBlocks = new ArrayList<>();
private int rowSpanStart;
@Override
public String toString() {

View File

@ -5,8 +5,6 @@ import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
import java.awt.geom.Rectangle2D;
@Data
@Builder
@NoArgsConstructor
@ -14,7 +12,7 @@ import java.awt.geom.Rectangle2D;
public class Image {
private String type;
private Rectangle2D position;
private RedRectangle2D position;
private boolean redaction;
private String redactionReason;
private String legalBasis;

View File

@ -1,5 +1,6 @@
package com.iqser.red.service.redaction.v1.server.redaction.model;
import com.fasterxml.jackson.annotation.JsonIgnore;
import lombok.Data;
import lombok.NonNull;
import lombok.RequiredArgsConstructor;
@ -11,9 +12,10 @@ import java.awt.image.BufferedImage;
@RequiredArgsConstructor
public class PdfImage {
@JsonIgnore
private BufferedImage image;
@NonNull
private Rectangle2D position;
private RedRectangle2D position;
private ImageType imageType;
private boolean isAppendedToParagraph;
@ -22,7 +24,7 @@ public class PdfImage {
public PdfImage(BufferedImage image, Rectangle2D position, int page) {
this.image = image;
this.position = position;
this.position = new RedRectangle2D(position.getX(), position.getY(), position.getWidth(), position.getHeight());
this.page = page;
}

View File

@ -0,0 +1,35 @@
package com.iqser.red.service.redaction.v1.server.redaction.model;
import com.fasterxml.jackson.annotation.JsonIgnore;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * Rectangle described by an origin ({@code x}, {@code y}) and a size ({@code width},
 * {@code height}), all stored as doubles. Accessors and constructors are generated by Lombok.
 */
@Data
@NoArgsConstructor
@AllArgsConstructor
public class RedRectangle2D {

    private double x;
    private double y;
    private double width;
    private double height;

    /**
     * Tells whether this rectangle has no area.
     *
     * @return {@code true} if the width or the height is less than or equal to zero
     */
    @JsonIgnore
    public boolean isEmpty() {
        if (width <= 0.0) {
            return true;
        }
        return height <= 0.0;
    }

    /**
     * Tests whether the rectangle given by the arguments lies completely inside this rectangle.
     *
     * @param x x coordinate of the origin of the rectangle to test
     * @param y y coordinate of the origin of the rectangle to test
     * @param w width of the rectangle to test
     * @param h height of the rectangle to test
     * @return {@code false} if this rectangle is empty or {@code w} / {@code h} is not positive;
     *         otherwise {@code true} exactly when both the origin and the opposite corner of the
     *         tested rectangle are within this rectangle's bounds (edges inclusive)
     */
    public boolean contains(double x, double y, double w, double h) {

        if (isEmpty()) {
            return false;
        }
        if (w <= 0 || h <= 0) {
            return false;
        }

        double left = getX();
        double top = getY();
        double right = left + getWidth();
        double bottom = top + getHeight();

        // Comparisons are kept in ">=" / "<=" form so that NaN arguments yield false.
        boolean originInside = x >= left && y >= top;
        boolean farCornerInside = (x + w) <= right && (y + h) <= bottom;
        return originInside && farCornerInside;
    }

}

View File

@ -187,6 +187,7 @@ public class EntityRedactionService {
.get(0)
.getPage());
sectionText.getSectionAreas().add(sectionArea);
sectionText.getTextBlocks().addAll(cell.getTextBlocks());
addSectionToManualRedactions(cell.getTextBlocks(), manualRedactions, table.getHeadline(), sectionNumber.intValue());
int cellStart = start;
@ -235,6 +236,8 @@ public class EntityRedactionService {
sectionText.setHeadline(table.getHeadline());
sectionText.setSectionNumber(sectionNumber.intValue());
sectionText.setTable(true);
sectionText.setTabularData(tabularData);
sectionText.setCellStarts(cellStarts);
classifiedDoc.getSectionText().add(sectionText);
}
@ -267,6 +270,7 @@ public class EntityRedactionService {
.getSequences()
.get(0)
.getPage());
sectionText.getTextBlocks().addAll(cell.getTextBlocks());
sectionText.getSectionAreas().add(sectionArea);
}
@ -325,6 +329,10 @@ public class EntityRedactionService {
sectionText.setHeadline(headline);
sectionText.setSectionNumber(sectionNumber.intValue());
sectionText.setTable(false);
sectionText.setImages(images.stream()
.map(image -> convert(image, sectionNumber.intValue(), headline))
.collect(Collectors.toSet()));
sectionText.setTextBlocks(paragraphTextBlocks);
classifiedDoc.getSectionText().add(sectionText);
}

View File

@ -12,12 +12,12 @@ import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUti
import com.iqser.red.service.redaction.v1.server.segmentation.PdfSegmentationService;
import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
import org.kie.api.runtime.KieContainer;
import org.springframework.stereotype.Service;
import org.springframework.web.bind.annotation.RequestBody;
import java.awt.geom.Rectangle2D;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.Stream;
@ -39,7 +39,6 @@ public class ReanalyzeService {
public AnalyzeResult analyze(AnalyzeRequest analyzeRequest) {
var pageCount = 0;
Document classifiedDoc;
@ -74,30 +73,28 @@ public class ReanalyzeService {
return analyzeResponseService.createAnalyzeResponse(pageCount, redactionLog, changeLog);
}
public AnalyzeResult reanalyze(@RequestBody AnalyzeRequest renalyzeRequest) {
var text = redactionStorageService.getText(renalyzeRequest.getProjectId(), renalyzeRequest.getFileId());
// new procedure was not applied, we need a complete analysis
@SneakyThrows
public AnalyzeResult reanalyze(@RequestBody AnalyzeRequest analyzeRequest) {
var redactionLog = redactionStorageService.getRedactionLog(analyzeRequest.getProjectId(), analyzeRequest.getFileId());
var text = redactionStorageService.getText(analyzeRequest.getProjectId(), analyzeRequest.getFileId());
// not yet ready for reanalysis
if (text.getNumberOfPages() == 0) {
return analyze(AnalyzeRequest.builder()
.ruleSetId(renalyzeRequest.getRuleSetId())
.manualRedactions(renalyzeRequest.getManualRedactions())
.projectId(renalyzeRequest.getProjectId())
.fileId(renalyzeRequest.getFileId())
.build());
return analyze(analyzeRequest);
}
var redactionLog = redactionStorageService.getRedactionLog(renalyzeRequest.getProjectId(), renalyzeRequest.getFileId());
DictionaryIncrement dictionaryIncrement = dictionaryService.getDictionaryIncrements(analyzeRequest.getRuleSetId(), redactionLog.getDictionaryVersion());
DictionaryIncrement dictionaryIncrement = dictionaryService.getDictionaryIncrements(renalyzeRequest.getRuleSetId(), redactionLog.getDictionaryVersion());
Set<String> manualForceAndRemoveIds = getForceAndRemoveIds(renalyzeRequest.getManualRedactions());
Set<String> manualForceAndRemoveIds = getForceAndRemoveIds(analyzeRequest.getManualRedactions());
Map<String, List<Comment>> comments = null;
Set<ManualRedactionEntry> manualAdds = null;
if (renalyzeRequest.getManualRedactions() != null) {
if (analyzeRequest.getManualRedactions() != null) {
// TODO comments will be removed from redactionLog, so we ignore this first.
comments = renalyzeRequest.getManualRedactions().getComments();
manualAdds = renalyzeRequest.getManualRedactions().getEntriesToAdd();
comments = analyzeRequest.getManualRedactions().getComments();
manualAdds = analyzeRequest.getManualRedactions().getEntriesToAdd();
}
Set<Integer> sectionsToReanalyse = new HashSet<>();
@ -131,115 +128,114 @@ public class ReanalyzeService {
}
}
log.info("Should reanalyze {} sections for request: {}", sectionsToReanalyse.size(), analyzeRequest);
if (sectionsToReanalyse.isEmpty() && (manualAdds == null || manualAdds.isEmpty())) {
redactionLog.setDictionaryVersion(dictionaryIncrement.getDictionaryVersion());
var changeLog = redactionChangeLogService.createAndStoreChangeLog(renalyzeRequest.getProjectId(), renalyzeRequest.getFileId(), redactionLog);
redactionStorageService.storeObject(renalyzeRequest.getProjectId(), renalyzeRequest.getFileId(), FileType.REDACTION_LOG, redactionLog);
var changeLog = redactionChangeLogService.createAndStoreChangeLog(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), redactionLog);
redactionStorageService.storeObject(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), FileType.REDACTION_LOG, redactionLog);
return analyzeResponseService.createAnalyzeResponse(text.getNumberOfPages(), redactionLog, changeLog);
}
try {
List<SectionText> reanalysisSections = new ArrayList<>();
List<SectionText> reanalysisSections = new ArrayList<>();
for (SectionText sectionText : text.getSectionTexts()) {
if (sectionsToReanalyse.contains(sectionText.getSectionNumber())) {
reanalysisSections.add(sectionText);
}
for (SectionText sectionText : text.getSectionTexts()) {
if (sectionsToReanalyse.contains(sectionText.getSectionNumber())) {
reanalysisSections.add(sectionText);
}
KieContainer kieContainer = droolsExecutionService.updateRules(renalyzeRequest.getRuleSetId());
Dictionary dictionary = dictionaryService.getDeepCopyDictionary(renalyzeRequest.getRuleSetId());
List<SectionSearchableTextPair> sectionSearchableTextPairs = new ArrayList<>();
for (SectionText reanalysisSection : reanalysisSections) {
Set<Entity> entities = entityRedactionService.findEntities(reanalysisSection.getSearchableText(), reanalysisSection
.getHeadline(), reanalysisSection.getSectionNumber(), dictionary, false);
if (reanalysisSection.getCellStarts() != null) {
surroundingWordsService.addSurroundingText(entities, reanalysisSection.getSearchableText(), dictionary, reanalysisSection
.getCellStarts());
} else {
surroundingWordsService.addSurroundingText(entities, reanalysisSection.getSearchableText(), dictionary);
}
sectionSearchableTextPairs.add(new SectionSearchableTextPair(Section.builder()
.isLocal(false)
.dictionaryTypes(dictionary.getTypes())
.entities(entities)
.text(reanalysisSection.getSearchableText().getAsStringWithLinebreaks())
.searchText(reanalysisSection.getSearchableText().toString())
.headline(reanalysisSection.getHeadline())
.sectionNumber(reanalysisSection.getSectionNumber())
.tabularData(reanalysisSection.getTabularData())
.searchableText(reanalysisSection.getSearchableText())
.dictionary(dictionary)
.images(reanalysisSection.getImages())
.build(), reanalysisSection.getSearchableText()));
}
Set<Entity> entities = new HashSet<>();
Map<Integer, Set<Image>> imagesPerPage = new HashMap<>();
sectionSearchableTextPairs.forEach(sectionSearchableTextPair -> {
Section analysedRowSection = droolsExecutionService.executeRules(kieContainer, sectionSearchableTextPair
.getSection());
entities.addAll(analysedRowSection.getEntities());
EntitySearchUtils.removeEntitiesContainedInLarger(entities);
for (Image image : analysedRowSection.getImages()) {
imagesPerPage.computeIfAbsent(image.getPage(), (a) -> new HashSet<>()).add(image);
}
});
Map<Integer, List<Entity>> entitiesPerPage = new HashMap<>();
for (Entity entity : entities) {
Map<Integer, List<EntityPositionSequence>> sequenceOnPage = new HashMap<>();
for (EntityPositionSequence entityPositionSequence : entity.getPositionSequences()) {
sequenceOnPage.computeIfAbsent(entityPositionSequence.getPageNumber(), (x) -> new ArrayList<>())
.add(entityPositionSequence);
}
for (Map.Entry<Integer, List<EntityPositionSequence>> entry : sequenceOnPage.entrySet()) {
entitiesPerPage.computeIfAbsent(entry.getKey(), (x) -> new ArrayList<>())
.add(new Entity(entity.getWord(), entity.getType(), entity.isRedaction(), entity.getRedactionReason(), entry
.getValue(), entity.getHeadline(), entity.getMatchedRule(), entity.getSectionNumber(), entity
.getLegalBasis(), entity.isDictionaryEntry(), entity.getTextBefore(), entity.getTextAfter(), entity
.getStart(), entity.getEnd()));
}
}
List<RedactionLogEntry> newRedactionLogEntries = new ArrayList<>();
for (int page = 1; page <= text.getNumberOfPages(); page++) {
if (entitiesPerPage.get(page) != null) {
newRedactionLogEntries.addAll(redactionLogCreatorService.addEntries(entitiesPerPage, renalyzeRequest
.getManualRedactions(), page, renalyzeRequest.getRuleSetId()));
}
if (imagesPerPage.get(page) != null) {
newRedactionLogEntries.addAll(redactionLogCreatorService.addImageEntries(imagesPerPage, renalyzeRequest
.getManualRedactions(), page, renalyzeRequest.getRuleSetId()));
}
newRedactionLogEntries.addAll(redactionLogCreatorService.addManualAddEntries(manualAdds, comments, page, renalyzeRequest
.getRuleSetId()));
}
redactionLog.getRedactionLogEntry().removeIf(entry -> sectionsToReanalyse.contains(entry.getSectionNumber()) && !entry.isImage() || entry.getSectionNumber() == 0 && !entry.isImage());
redactionLog.getRedactionLogEntry().addAll(newRedactionLogEntries);
redactionLog.setDictionaryVersion(dictionaryIncrement.getDictionaryVersion());
var changeLog = redactionChangeLogService.createAndStoreChangeLog(renalyzeRequest.getProjectId(), renalyzeRequest.getFileId(), redactionLog);
redactionStorageService.storeObject(renalyzeRequest.getProjectId(), renalyzeRequest.getFileId(), FileType.REDACTION_LOG, redactionLog);
return analyzeResponseService.createAnalyzeResponse(text.getNumberOfPages(), redactionLog, changeLog);
} catch (Exception e) {
throw new RedactionException(e);
}
//--
KieContainer kieContainer = droolsExecutionService.updateRules(analyzeRequest.getRuleSetId());
Dictionary dictionary = dictionaryService.getDeepCopyDictionary(analyzeRequest.getRuleSetId());
List<SectionSearchableTextPair> sectionSearchableTextPairs = new ArrayList<>();
for (SectionText reanalysisSection : reanalysisSections) {
Set<Entity> entities = entityRedactionService.findEntities(reanalysisSection.getSearchableText(), reanalysisSection
.getHeadline(), reanalysisSection.getSectionNumber(), dictionary, false);
if (reanalysisSection.getCellStarts() != null) {
surroundingWordsService.addSurroundingText(entities, reanalysisSection.getSearchableText(), dictionary, reanalysisSection
.getCellStarts());
} else {
surroundingWordsService.addSurroundingText(entities, reanalysisSection.getSearchableText(), dictionary);
}
sectionSearchableTextPairs.add(new SectionSearchableTextPair(Section.builder()
.isLocal(false)
.dictionaryTypes(dictionary.getTypes())
.entities(entities)
.text(reanalysisSection.getSearchableText().getAsStringWithLinebreaks())
.searchText(reanalysisSection.getSearchableText().toString())
.headline(reanalysisSection.getHeadline())
.sectionNumber(reanalysisSection.getSectionNumber())
.tabularData(reanalysisSection.getTabularData())
.searchableText(reanalysisSection.getSearchableText())
.dictionary(dictionary)
.images(reanalysisSection.getImages())
.build(), reanalysisSection.getSearchableText()));
}
Set<Entity> entities = new HashSet<>();
Map<Integer, Set<Image>> imagesPerPage = new HashMap<>();
sectionSearchableTextPairs.forEach(sectionSearchableTextPair -> {
Section analysedRowSection = droolsExecutionService.executeRules(kieContainer, sectionSearchableTextPair
.getSection());
entities.addAll(analysedRowSection.getEntities());
EntitySearchUtils.removeEntitiesContainedInLarger(entities);
for (Image image : analysedRowSection.getImages()) {
imagesPerPage.computeIfAbsent(image.getPage(), (a) -> new HashSet<>()).add(image);
}
});
Map<Integer, List<Entity>> entitiesPerPage = new HashMap<>();
for (Entity entity : entities) {
Map<Integer, List<EntityPositionSequence>> sequenceOnPage = new HashMap<>();
for (EntityPositionSequence entityPositionSequence : entity.getPositionSequences()) {
sequenceOnPage.computeIfAbsent(entityPositionSequence.getPageNumber(), (x) -> new ArrayList<>())
.add(entityPositionSequence);
}
for (Map.Entry<Integer, List<EntityPositionSequence>> entry : sequenceOnPage.entrySet()) {
entitiesPerPage.computeIfAbsent(entry.getKey(), (x) -> new ArrayList<>())
.add(new Entity(entity.getWord(), entity.getType(), entity.isRedaction(), entity.getRedactionReason(), entry
.getValue(), entity.getHeadline(), entity.getMatchedRule(), entity.getSectionNumber(), entity
.getLegalBasis(), entity.isDictionaryEntry(), entity.getTextBefore(), entity.getTextAfter(), entity
.getStart(), entity.getEnd()));
}
}
List<RedactionLogEntry> newRedactionLogEntries = new ArrayList<>();
for (int page = 1; page <= text.getNumberOfPages(); page++) {
if (entitiesPerPage.get(page) != null) {
newRedactionLogEntries.addAll(redactionLogCreatorService.addEntries(entitiesPerPage, analyzeRequest
.getManualRedactions(), page, analyzeRequest.getRuleSetId()));
}
if (imagesPerPage.get(page) != null) {
newRedactionLogEntries.addAll(redactionLogCreatorService.addImageEntries(imagesPerPage, analyzeRequest
.getManualRedactions(), page, analyzeRequest.getRuleSetId()));
}
newRedactionLogEntries.addAll(redactionLogCreatorService.addManualAddEntries(manualAdds, comments, page, analyzeRequest
.getRuleSetId()));
}
redactionLog.getRedactionLogEntry().removeIf(entry -> sectionsToReanalyse.contains(entry.getSectionNumber()) && !entry.isImage() || entry.getSectionNumber() == 0 && !entry.isImage());
redactionLog.getRedactionLogEntry().addAll(newRedactionLogEntries);
redactionLog.setDictionaryVersion(dictionaryIncrement.getDictionaryVersion());
var changeLog = redactionChangeLogService.createAndStoreChangeLog(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), redactionLog);
redactionStorageService.storeObject(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), FileType.REDACTION_LOG, redactionLog);
return analyzeResponseService.createAnalyzeResponse(text.getNumberOfPages(), redactionLog, changeLog);
}
@ -262,7 +258,7 @@ public class ReanalyzeService {
return Image.builder()
.type(entry.getType())
.position(new Rectangle2D.Float(position.getTopLeft().getX(), position.getTopLeft()
.position(new RedRectangle2D(position.getTopLeft().getX(), position.getTopLeft()
.getY(), position.getWidth(), position.getHeight()))
.sectionNumber(entry.getSectionNumber())
.section(entry.getSection())

View File

@ -4,6 +4,7 @@ import com.iqser.red.service.redaction.v1.model.*;
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
import com.iqser.red.service.redaction.v1.server.classification.model.Paragraph;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.parsing.model.RedTextPosition;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionSequence;
@ -14,7 +15,6 @@ import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
import lombok.RequiredArgsConstructor;
import org.apache.commons.collections4.CollectionUtils;
import org.apache.pdfbox.text.TextPosition;
import org.springframework.stereotype.Service;
import java.util.ArrayList;
@ -272,24 +272,24 @@ public class RedactionLogCreatorService {
}
private List<Rectangle> getRectanglesPerLine(List<TextPosition> textPositions, int page) {
private List<Rectangle> getRectanglesPerLine(List<RedTextPosition> textPositions, int page) {
List<Rectangle> rectangles = new ArrayList<>();
if (textPositions.size() == 1) {
rectangles.add(new TextPositionSequence(textPositions, page).getRectangle());
rectangles.add( TextPositionSequence.fromData(textPositions, page).getRectangle());
} else {
float y = textPositions.get(0).getYDirAdj();
int startIndex = 0;
for (int i = 1; i < textPositions.size(); i++) {
float yDirAdj = textPositions.get(i).getYDirAdj();
if (yDirAdj != y) {
rectangles.add(new TextPositionSequence(textPositions.subList(startIndex, i), page).getRectangle());
rectangles.add( TextPositionSequence.fromData(textPositions.subList(startIndex, i), page).getRectangle());
y = yDirAdj;
startIndex = i;
}
}
if (startIndex != textPositions.size()) {
rectangles.add(new TextPositionSequence(textPositions.subList(startIndex, textPositions.size()), page).getRectangle());
rectangles.add( TextPositionSequence.fromData(textPositions.subList(startIndex, textPositions.size()), page).getRectangle());
}
}

View File

@ -3,9 +3,9 @@ package com.iqser.red.service.redaction.v1.server.redaction.utils;
import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.model.RedRectangle2D;
import lombok.experimental.UtilityClass;
import java.awt.geom.Rectangle2D;
import java.nio.charset.StandardCharsets;
import java.util.List;
@ -25,12 +25,8 @@ public class IdBuilder {
}
public String buildId(Rectangle2D rectangle2D, int page) {
StringBuilder sb = new StringBuilder();
sb.append("x").append(rectangle2D.getX()).append("y").append(rectangle2D.getY()).append("h").append(rectangle2D.getHeight()).append("w").append(rectangle2D.getWidth()).append("p").append(page);
return hashFunction.hashString(sb.toString(), StandardCharsets.UTF_8).toString();
/**
 * Builds an identifier for a rectangle on a page by hashing its geometry together with the page number.
 *
 * @param rectangle2D rectangle whose x, y, height and width are written into the hashed key
 * @param page page number appended to the key
 * @return string representation of the hash of the UTF-8 encoded key
 */
public String buildId(RedRectangle2D rectangle2D, int page) {
    // Key layout: x<x>y<y>h<height>w<width>p<page>; the order must stay fixed so ids remain stable.
    String key = "x" + rectangle2D.getX()
            + "y" + rectangle2D.getY()
            + "h" + rectangle2D.getHeight()
            + "w" + rectangle2D.getWidth()
            + "p" + page;
    return hashFunction.hashString(key, StandardCharsets.UTF_8).toString();
}

View File

@ -1,21 +1,15 @@
package com.iqser.red.service.redaction.v1.server.segmentation;
import com.iqser.red.service.redaction.v1.model.SectionArea;
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
import com.iqser.red.service.redaction.v1.server.classification.model.SectionText;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.classification.service.BlockificationService;
import com.iqser.red.service.redaction.v1.server.classification.service.ClassificationService;
import com.iqser.red.service.redaction.v1.server.exception.RedactionException;
import com.iqser.red.service.redaction.v1.server.memory.MemoryStats;
import com.iqser.red.service.redaction.v1.server.parsing.PDFAreaTextStripper;
import com.iqser.red.service.redaction.v1.server.parsing.PDFLinesTextStripper;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.model.CellValue;
import com.iqser.red.service.redaction.v1.server.redaction.service.ImageClassificationService;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.CleanRulings;
import com.iqser.red.service.redaction.v1.server.tableextraction.service.RulingCleaningService;
import com.iqser.red.service.redaction.v1.server.tableextraction.service.TableExtractionService;
@ -28,15 +22,12 @@ import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.springframework.stereotype.Service;
import java.awt.geom.Rectangle2D;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
@Slf4j
@Service
@ -53,80 +44,11 @@ public class PdfSegmentationService {
private final ImageClassificationService imageClassificationService;
private void postProcessSections(PDDocument pdDocument, List<SectionText> texts) {
try {
for (SectionText sectionText : texts) {
List<TextBlock> textBlocks = new ArrayList<>();
Map<Integer, List<SectionArea>> sectionAreasPerPage = new HashMap<>();
for (SectionArea sectionArea : sectionText.getSectionAreas()) {
sectionAreasPerPage.computeIfAbsent(sectionArea.getPage(), (x) -> new ArrayList<>())
.add(sectionArea);
}
Map<String, CellValue> tabularData = new HashMap<>();
List<Integer> cellStarts = new ArrayList<>();
for (Integer page : sectionAreasPerPage.keySet()) {
List<SectionArea> areasOnPage = sectionAreasPerPage.get(page);
PDPage pdPage = pdDocument.getPage(page - 1);
PDRectangle cropBox = pdPage.getCropBox();
PDFAreaTextStripper textStripper = new PDFAreaTextStripper();
textStripper.setPageNumber(page);
int cellStart = 0;
for (SectionArea sectionArea : areasOnPage) {
Rectangle2D rect = null;
if (pdPage.getRotation() == 90) {
rect = new Rectangle2D.Float(sectionArea.getTopLeft().getY(), sectionArea.getTopLeft()
.getX(), sectionArea.getHeight(), sectionArea.getWidth() + 0.001f);
} else {
rect = new Rectangle2D.Float(sectionArea.getTopLeft().getX(), -sectionArea.getTopLeft()
.getY() + cropBox.getUpperRightY() - sectionArea.getHeight(), sectionArea.getWidth(), sectionArea
.getHeight() + 0.001f);
}
textStripper.addRegion(String.valueOf(1), rect);
textStripper.extractRegions(pdPage);
textStripper.getTextForRegion(String.valueOf(1));
List<TextPositionSequence> positions = textStripper.getTextPositionSequences();
TextBlock textBlock = new TextBlock(sectionArea.getTopLeft().getX(), sectionArea.getTopLeft()
.getX() + sectionArea.getWidth(), sectionArea.getTopLeft()
.getY(), sectionArea.getTopLeft().getY() + sectionArea.getHeight(), positions, 0);
if (sectionText.isTable()) {
Cell cell = new Cell();
cell.addTextBlock(textBlock);
tabularData.put(sectionArea.getHeader(), new CellValue(cell.getTextBlocks(), cellStart));
cellStarts.add(cellStart);
cellStart = cellStart + cell.toString().trim().length() + 1;
}
textBlocks.add(textBlock);
textStripper.clearPositions();
}
}
sectionText.setTextBlocks(textBlocks);
sectionText.setTabularData(tabularData);
if (sectionText.isTable()) {
sectionText.setCellStarts(cellStarts);
}
}
} catch (Exception e) {
throw new RedactionException(e);
}
/**
 * Parses the given PDF input stream by delegating to
 * {@code parseDocument(InputStream, boolean)} with {@code ignoreImages} set to {@code false}.
 *
 * @param documentInputStream stream providing the document to parse
 * @return the parsed document as returned by the two-argument overload
 * @throws IOException if the two-argument overload throws it
 */
public Document parseDocument(InputStream documentInputStream) throws IOException {
    final boolean ignoreImages = false;
    return parseDocument(documentInputStream, ignoreImages);
}
public Document parseDocument(InputStream documentInputStream) throws IOException {
public Document parseDocument(InputStream documentInputStream, boolean ignoreImages) throws IOException {
PDDocument pdDocument = null;
try {
//create tempFile
@ -166,24 +88,23 @@ public class PdfSegmentationService {
Page page = blockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings
.getVertical());
page.setRotation(rotation);
tableExtractionService.extractTables(cleanRulings, page);
buildPageStatistics(page);
page.setLandscape(isLandscape || isRotated);
page.setPageNumber(pageNumber);
increaseDocumentStatistics(page, document);
page.setImages(stripper.getImages());
imageClassificationService.classifyImages(page);
tableExtractionService.extractTables(cleanRulings, page);
buildPageStatistics(page);
increaseDocumentStatistics(page, document);
if (!ignoreImages) {
imageClassificationService.classifyImages(page);
}
pages.add(page);
}
document.setPages(pages);
@ -194,9 +115,6 @@ public class PdfSegmentationService {
pdDocument = reinitializePDDocument(tempFile, pdDocument);
// This can be improved an done in one pass, but it's complicated to do right away
postProcessSections(pdDocument, document.getSectionText());
IOUtils.close(pdDocument);
tempFile.delete();

View File

@ -50,7 +50,7 @@ public class RedactionStorageService {
try {
return objectMapper.readValue(inputStreamResource.getInputStream(), RedactionLog.class);
} catch (IOException e) {
throw new RuntimeException("Could not convert Text", e);
throw new RuntimeException("Could not convert RedactionLog", e);
}
}

View File

@ -1,5 +1,6 @@
package com.iqser.red.service.redaction.v1.server.tableextraction.model;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.iqser.red.service.redaction.v1.model.Rectangle;
import lombok.AllArgsConstructor;
import lombok.Data;
@ -27,10 +28,12 @@ public abstract class AbstractTextContainer {
return page == other.getPage() && this.minX <= other.getTopLeft().getX() && this.maxX >= other.getTopLeft().getX() + other.getWidth() && this.minY <= other.getTopLeft().getY() && this.maxY >= other.getTopLeft().getY() + other.getHeight();
}
/**
 * Returns the vertical extent of this container, computed as {@code maxY - minY}.
 * The value is derived from the stored bounds and is annotated with {@code @JsonIgnore}.
 *
 * @return difference between the maximum and minimum y bound
 */
@JsonIgnore
public float getHeight() {
    final float height = this.maxY - this.minY;
    return height;
}
/**
 * Returns the horizontal extent of this container, computed as {@code maxX - minX}.
 * The value is derived from the stored bounds and is annotated with {@code @JsonIgnore}.
 *
 * @return difference between the maximum and minimum x bound
 */
@JsonIgnore
public float getWidth() {
    final float width = this.maxX - this.minX;
    return width;
}

View File

@ -12,11 +12,11 @@ import java.io.FileOutputStream;
import java.util.HashMap;
import java.util.Map;
public class FilySystemBackedStorageService extends StorageService {
public class FileSystemBackedStorageService extends StorageService {
private Map<String, File> dataMap = new HashMap<>();
private final Map<String, File> dataMap = new HashMap<>();
public FilySystemBackedStorageService() {
public FileSystemBackedStorageService() {
super(null, null);
}

View File

@ -134,7 +134,7 @@ public class RedactionIntegrationTest {
@Bean
@Primary
public StorageService inmemoryStorage() {
return new FilySystemBackedStorageService();
return new FileSystemBackedStorageService();
}
}
@ -142,8 +142,8 @@ public class RedactionIntegrationTest {
@After
public void cleanupStorage() {
if (this.storageService instanceof FilySystemBackedStorageService) {
((FilySystemBackedStorageService) this.storageService).clearStorage();
if (this.storageService instanceof FileSystemBackedStorageService) {
((FileSystemBackedStorageService) this.storageService).clearStorage();
}
}

View File

@ -2,7 +2,7 @@ package com.iqser.red.service.redaction.v1.server.redaction.service;
import com.amazonaws.services.s3.AmazonS3;
import com.iqser.red.service.configuration.v1.api.model.*;
import com.iqser.red.service.redaction.v1.server.FilySystemBackedStorageService;
import com.iqser.red.service.redaction.v1.server.FileSystemBackedStorageService;
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
import com.iqser.red.service.redaction.v1.server.client.DictionaryClient;
import com.iqser.red.service.redaction.v1.server.client.RulesClient;
@ -97,7 +97,7 @@ public class EntityRedactionServiceTest {
@Bean
@Primary
public StorageService inmemoryStorage() {
return new FilySystemBackedStorageService();
return new FileSystemBackedStorageService();
}
}