Pull request #142: Improved redaction performance
Merge in RED/redaction-service from improved-redaction-performance to master * commit 'b34fc673c4d5a4440d6e5f2391db4420c4d2acf9': bamboo-specs/src/main/java/buildjob/PlanSpec.java edited online with Bitbucket; fixed some test issues; fixed pmd; updated planspec; updated redrect; set mvn opts; run tests with real life jvm args to detect oom issues early; code format, dependency and test update, logging for reanalysis; reworked reanalysis and text storage; Serialization of text
This commit is contained in:
commit
07b05b2d89
@ -1,7 +1,5 @@
|
||||
package buildjob;
|
||||
|
||||
import static com.atlassian.bamboo.specs.builders.task.TestParserTask.createJUnitParserTask;
|
||||
|
||||
import com.atlassian.bamboo.specs.api.BambooSpec;
|
||||
import com.atlassian.bamboo.specs.api.builders.BambooKey;
|
||||
import com.atlassian.bamboo.specs.api.builders.docker.DockerConfiguration;
|
||||
@ -24,6 +22,8 @@ import com.atlassian.bamboo.specs.builders.trigger.BitbucketServerTrigger;
|
||||
import com.atlassian.bamboo.specs.model.task.InjectVariablesScope;
|
||||
import com.atlassian.bamboo.specs.util.BambooServer;
|
||||
|
||||
import static com.atlassian.bamboo.specs.builders.task.TestParserTask.createJUnitParserTask;
|
||||
|
||||
/**
|
||||
* Plan configuration for Bamboo.
|
||||
* Learn more on: <a href="https://confluence.atlassian.com/display/BAMBOO/Bamboo+Specs">https://confluence.atlassian.com/display/BAMBOO/Bamboo+Specs</a>
|
||||
@ -33,6 +33,8 @@ public class PlanSpec {
|
||||
|
||||
private static final String SERVICE_NAME = "redaction-service";
|
||||
|
||||
private static final String JVM_ARGS =" -Xmx4g -XX:+ExitOnOutOfMemoryError -XX:SurvivorRatio=2 -XX:NewRatio=1 -XX:InitialTenuringThreshold=16 -XX:MaxTenuringThreshold=16 -XX:InitiatingHeapOccupancyPercent=35 ";
|
||||
|
||||
private static final String SERVICE_KEY = SERVICE_NAME.toUpperCase().replaceAll("-", "");
|
||||
|
||||
/**
|
||||
@ -82,9 +84,12 @@ public class PlanSpec {
|
||||
.checkoutItems(new CheckoutItem().defaultRepository()),
|
||||
new ScriptTask()
|
||||
.description("Build")
|
||||
.environmentVariables("MAVEN_OPTS="+JVM_ARGS)
|
||||
.inlineBody("#!/bin/bash\n" +
|
||||
"set -e\n" +
|
||||
|
||||
"export MAVEN_OPTS=\"$MAVEN_OPTS "+JVM_ARGS +"\"\n" +
|
||||
|
||||
"if [[ \"${bamboo.version_tag}\" != \"dev\" ]]; then ${bamboo_capability_system_builder_mvn3_Maven_3}/bin/mvn --no-transfer-progress -f ${bamboo_build_working_directory}/" + SERVICE_NAME + "-v1/pom.xml versions:set -DnewVersion=${bamboo.version_tag}; fi\n" +
|
||||
"if [[ \"${bamboo.version_tag}\" != \"dev\" ]]; then ${bamboo_capability_system_builder_mvn3_Maven_3}/bin/mvn --no-transfer-progress -f ${bamboo_build_working_directory}/" + SERVICE_NAME + "-image-v1/pom.xml versions:set -DnewVersion=${bamboo.version_tag}; fi\n" +
|
||||
|
||||
|
||||
@ -32,7 +32,7 @@
|
||||
<dependency>
|
||||
<groupId>com.iqser.red</groupId>
|
||||
<artifactId>platform-commons-dependency</artifactId>
|
||||
<version>1.2.9</version>
|
||||
<version>1.3.0</version>
|
||||
<scope>import</scope>
|
||||
<type>pom</type>
|
||||
</dependency>
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
package com.iqser.red.service.redaction.v1.server.classification.model;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
@ -12,7 +13,7 @@ public class Footer {
|
||||
|
||||
private List<TextBlock> textBlocks;
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public SearchableText getSearchableText() {
|
||||
|
||||
SearchableText searchableText = new SearchableText();
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
package com.iqser.red.service.redaction.v1.server.classification.model;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
@ -12,7 +13,7 @@ public class Header {
|
||||
|
||||
private List<TextBlock> textBlocks;
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public SearchableText getSearchableText() {
|
||||
|
||||
SearchableText searchableText = new SearchableText();
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
package com.iqser.red.service.redaction.v1.server.classification.model;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import com.iqser.red.service.redaction.v1.model.SectionArea;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.CellValue;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Image;
|
||||
@ -31,6 +32,12 @@ public class SectionText {
|
||||
private List<Integer> cellStarts = new ArrayList<>();
|
||||
|
||||
|
||||
public void setTabularData(Map<String, CellValue> tabularData) {
|
||||
tabularData.remove(null);
|
||||
this.tabularData = tabularData;
|
||||
}
|
||||
|
||||
@JsonIgnore
|
||||
public SearchableText getSearchableText() {
|
||||
|
||||
SearchableText searchableText = new SearchableText();
|
||||
|
||||
@ -1,11 +1,13 @@
|
||||
package com.iqser.red.service.redaction.v1.server.classification.model;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
@ -13,6 +15,7 @@ import java.util.List;
|
||||
@AllArgsConstructor
|
||||
@Builder
|
||||
@Data
|
||||
@NoArgsConstructor
|
||||
public class TextBlock extends AbstractTextContainer {
|
||||
|
||||
@Builder.Default
|
||||
@ -116,6 +119,7 @@ public class TextBlock extends AbstractTextContainer {
|
||||
}
|
||||
|
||||
@Override
|
||||
@JsonIgnore
|
||||
public String getText() {
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
package com.iqser.red.service.redaction.v1.server.classification.model;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
@ -12,7 +13,7 @@ public class UnclassifiedText {
|
||||
|
||||
private List<TextBlock> textBlocks;
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public SearchableText getSearchableText() {
|
||||
|
||||
SearchableText searchableText = new SearchableText();
|
||||
|
||||
@ -59,7 +59,7 @@ public class RedactionController implements RedactionResource {
|
||||
|
||||
try (PDDocument pdDocument = PDDocument.load(storedObjectStream, MemoryUsageSetting.setupTempFileOnly())) {
|
||||
pdDocument.setAllSecurityToBeRemoved(true);
|
||||
|
||||
|
||||
dictionaryService.updateDictionary(redactionLog.getRuleSetId());
|
||||
annotationService.annotate(pdDocument, redactionLog, sectionsGrid);
|
||||
|
||||
@ -131,7 +131,7 @@ public class RedactionController implements RedactionResource {
|
||||
|
||||
try {
|
||||
var storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(redactionRequest.getProjectId(), redactionRequest.getFileId(), FileType.ORIGIN));
|
||||
classifiedDoc = pdfSegmentationService.parseDocument(storedObjectStream);
|
||||
classifiedDoc = pdfSegmentationService.parseDocument(storedObjectStream, true);
|
||||
} catch (Exception e) {
|
||||
throw new RedactionException(e);
|
||||
}
|
||||
|
||||
@ -0,0 +1,52 @@
|
||||
package com.iqser.red.service.redaction.v1.server.parsing.model;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import org.apache.pdfbox.text.TextPosition;
|
||||
import org.springframework.beans.BeanUtils;
|
||||
|
||||
@Data
|
||||
@NoArgsConstructor
|
||||
public class RedTextPosition {
|
||||
|
||||
private String textMatrix;
|
||||
private int rotation;
|
||||
private float y;
|
||||
private float pageHeight;
|
||||
private float pageWidth;
|
||||
private String unicode;
|
||||
private float XDirAdj;
|
||||
private float YDirAdj;
|
||||
private float width;
|
||||
private float heightDir;
|
||||
|
||||
// not used in reanalysis
|
||||
@JsonIgnore
|
||||
private float widthOfSpace;
|
||||
|
||||
// not used in reanalysis
|
||||
@JsonIgnore
|
||||
private float fontSizeInPt;
|
||||
|
||||
// not used in reanalysis
|
||||
@JsonIgnore
|
||||
private String fontName;
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public static RedTextPosition fromTextPosition(TextPosition textPosition) {
|
||||
var pos = new RedTextPosition();
|
||||
BeanUtils.copyProperties(textPosition, pos);
|
||||
pos.setFontName(textPosition.getFont().getName());
|
||||
|
||||
pos.setFontSizeInPt(textPosition.getFontSizeInPt());
|
||||
|
||||
pos.setTextMatrix(textPosition.getTextMatrix().toString());
|
||||
|
||||
return pos;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
@ -1,29 +1,52 @@
|
||||
package com.iqser.red.service.redaction.v1.server.parsing.model;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
|
||||
import com.iqser.red.service.redaction.v1.model.Point;
|
||||
import com.iqser.red.service.redaction.v1.model.Rectangle;
|
||||
import lombok.Data;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.NoArgsConstructor;
|
||||
import org.apache.pdfbox.text.TextPosition;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
@Data
|
||||
@RequiredArgsConstructor
|
||||
@NoArgsConstructor
|
||||
@JsonIgnoreProperties({ "empty" })
|
||||
public class TextPositionSequence implements CharSequence {
|
||||
|
||||
private final int page;
|
||||
private List<TextPosition> textPositions = new ArrayList<>();
|
||||
private int page;
|
||||
private List<RedTextPosition> textPositions = new ArrayList<>();
|
||||
|
||||
private float x1;
|
||||
private float x2;
|
||||
|
||||
/**
 * Creates a sequence for the given page.
 *
 * @param page page value stored on this sequence
 */
public TextPositionSequence(int page) {
|
||||
this.page = page;
|
||||
}
|
||||
|
||||
|
||||
public static TextPositionSequence fromData(List<RedTextPosition> textPositions, int page) {
|
||||
var textPositionSequence = new TextPositionSequence();
|
||||
textPositionSequence.textPositions = textPositions;
|
||||
textPositionSequence.page = page;
|
||||
|
||||
return textPositionSequence;
|
||||
}
|
||||
|
||||
|
||||
public TextPositionSequence(List<TextPosition> textPositions, int page) {
|
||||
|
||||
this.textPositions = textPositions;
|
||||
this.textPositions = textPositions.stream().map(RedTextPosition::fromTextPosition).collect(Collectors.toList());
|
||||
this.page = page;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@Override
|
||||
public int length() {
|
||||
|
||||
@ -34,7 +57,7 @@ public class TextPositionSequence implements CharSequence {
|
||||
@Override
|
||||
public char charAt(int index) {
|
||||
|
||||
TextPosition textPosition = textPositionAt(index);
|
||||
RedTextPosition textPosition = textPositionAt(index);
|
||||
String text = textPosition.getUnicode();
|
||||
return text.charAt(0);
|
||||
}
|
||||
@ -42,7 +65,7 @@ public class TextPositionSequence implements CharSequence {
|
||||
|
||||
public char charAt(int index, boolean caseInSensitive) {
|
||||
|
||||
TextPosition textPosition = textPositionAt(index);
|
||||
RedTextPosition textPosition = textPositionAt(index);
|
||||
String text = textPosition.getUnicode();
|
||||
return caseInSensitive ? text.toLowerCase().charAt(0) : text.charAt(0);
|
||||
}
|
||||
@ -51,7 +74,7 @@ public class TextPositionSequence implements CharSequence {
|
||||
@Override
|
||||
public TextPositionSequence subSequence(int start, int end) {
|
||||
|
||||
return new TextPositionSequence(textPositions.subList(start, end), page);
|
||||
return fromData(textPositions.subList(start, end), page);
|
||||
}
|
||||
|
||||
|
||||
@ -66,18 +89,25 @@ public class TextPositionSequence implements CharSequence {
|
||||
}
|
||||
|
||||
|
||||
public TextPosition textPositionAt(int index) {
|
||||
public RedTextPosition textPositionAt(int index) {
|
||||
|
||||
return textPositions.get(index);
|
||||
}
|
||||
|
||||
|
||||
public void add(TextPosition textPosition) {
|
||||
public void add(RedTextPosition textPosition) {
|
||||
|
||||
this.textPositions.add(textPosition);
|
||||
}
|
||||
|
||||
|
||||
public void add(TextPosition textPosition) {
|
||||
|
||||
this.textPositions.add(RedTextPosition.fromTextPosition(textPosition));
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public float getX1() {
|
||||
|
||||
if (textPositions.get(0).getRotation() == 90) {
|
||||
@ -88,6 +118,7 @@ public class TextPositionSequence implements CharSequence {
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public float getX2() {
|
||||
|
||||
if (textPositions.get(0).getRotation() == 90) {
|
||||
@ -98,13 +129,14 @@ public class TextPositionSequence implements CharSequence {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public float getRotationAdjustedY() {
|
||||
|
||||
return textPositions.get(0).getY();
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public float getY1() {
|
||||
|
||||
if (textPositions.get(0).getRotation() == 90) {
|
||||
@ -115,6 +147,7 @@ public class TextPositionSequence implements CharSequence {
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public float getY2() {
|
||||
|
||||
if (textPositions.get(0).getRotation() == 90) {
|
||||
@ -125,38 +158,40 @@ public class TextPositionSequence implements CharSequence {
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public float getTextHeight() {
|
||||
|
||||
return textPositions.get(0).getHeightDir() + 2;
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public float getHeight() {
|
||||
|
||||
return getY2() - getY1();
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public float getWidth() {
|
||||
|
||||
return getX2() - getX1();
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public String getFont() {
|
||||
|
||||
return textPositions.get(0)
|
||||
.getFont()
|
||||
.toString()
|
||||
return textPositions.get(0).getFontName()
|
||||
.toLowerCase()
|
||||
.replaceAll(",bold", "")
|
||||
.replaceAll(",italic", "");
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public String getFontStyle() {
|
||||
|
||||
String lowercaseFontName = textPositions.get(0).getFont().toString().toLowerCase();
|
||||
String lowercaseFontName = textPositions.get(0).getFontName().toLowerCase();
|
||||
|
||||
if (lowercaseFontName.contains("bold") && lowercaseFontName.contains("italic")) {
|
||||
return "bold, italic";
|
||||
@ -170,25 +205,25 @@ public class TextPositionSequence implements CharSequence {
|
||||
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public float getFontSize() {
|
||||
|
||||
return textPositions.get(0).getFontSizeInPt();
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public float getSpaceWidth() {
|
||||
|
||||
return textPositions.get(0).getWidthOfSpace();
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public int getRotation() {
|
||||
|
||||
return textPositions.get(0).getRotation();
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public Rectangle getRectangle() {
|
||||
|
||||
float height = getTextHeight();
|
||||
|
||||
@ -3,19 +3,23 @@ package com.iqser.red.service.redaction.v1.server.redaction.model;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
|
||||
import lombok.Value;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
||||
@Value
|
||||
@Data
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
public class CellValue {
|
||||
|
||||
private List<TextBlock> textBlocks;
|
||||
private List<TextBlock> textBlocks = new ArrayList<>();
|
||||
|
||||
private int rowSpanStart;
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
|
||||
@ -5,8 +5,6 @@ import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@NoArgsConstructor
|
||||
@ -14,7 +12,7 @@ import java.awt.geom.Rectangle2D;
|
||||
public class Image {
|
||||
|
||||
private String type;
|
||||
private Rectangle2D position;
|
||||
private RedRectangle2D position;
|
||||
private boolean redaction;
|
||||
private String redactionReason;
|
||||
private String legalBasis;
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.model;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import lombok.Data;
|
||||
import lombok.NonNull;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
@ -11,9 +12,10 @@ import java.awt.image.BufferedImage;
|
||||
@RequiredArgsConstructor
|
||||
public class PdfImage {
|
||||
|
||||
@JsonIgnore
|
||||
private BufferedImage image;
|
||||
@NonNull
|
||||
private Rectangle2D position;
|
||||
private RedRectangle2D position;
|
||||
private ImageType imageType;
|
||||
private boolean isAppendedToParagraph;
|
||||
|
||||
@ -22,7 +24,7 @@ public class PdfImage {
|
||||
|
||||
public PdfImage(BufferedImage image, Rectangle2D position, int page) {
|
||||
this.image = image;
|
||||
this.position = position;
|
||||
this.position = new RedRectangle2D(position.getX(), position.getY(), position.getWidth(), position.getHeight());
|
||||
this.page = page;
|
||||
}
|
||||
|
||||
|
||||
@ -0,0 +1,35 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.model;
|
||||
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
@Data
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
public class RedRectangle2D {
|
||||
|
||||
private double x;
|
||||
private double y;
|
||||
private double width;
|
||||
private double height;
|
||||
|
||||
@JsonIgnore
|
||||
public boolean isEmpty() {
|
||||
return width <= 0.0f || height <= 0.0f;
|
||||
}
|
||||
|
||||
public boolean contains(double x, double y, double w, double h) {
|
||||
if (isEmpty() || w <= 0 || h <= 0) {
|
||||
return false;
|
||||
}
|
||||
double x0 = getX();
|
||||
double y0 = getY();
|
||||
return x >= x0 &&
|
||||
y >= y0 &&
|
||||
(x + w) <= x0 + getWidth() &&
|
||||
(y + h) <= y0 + getHeight();
|
||||
}
|
||||
}
|
||||
@ -187,6 +187,7 @@ public class EntityRedactionService {
|
||||
.get(0)
|
||||
.getPage());
|
||||
sectionText.getSectionAreas().add(sectionArea);
|
||||
sectionText.getTextBlocks().addAll(cell.getTextBlocks());
|
||||
|
||||
addSectionToManualRedactions(cell.getTextBlocks(), manualRedactions, table.getHeadline(), sectionNumber.intValue());
|
||||
int cellStart = start;
|
||||
@ -235,6 +236,8 @@ public class EntityRedactionService {
|
||||
sectionText.setHeadline(table.getHeadline());
|
||||
sectionText.setSectionNumber(sectionNumber.intValue());
|
||||
sectionText.setTable(true);
|
||||
sectionText.setTabularData(tabularData);
|
||||
sectionText.setCellStarts(cellStarts);
|
||||
classifiedDoc.getSectionText().add(sectionText);
|
||||
}
|
||||
|
||||
@ -267,6 +270,7 @@ public class EntityRedactionService {
|
||||
.getSequences()
|
||||
.get(0)
|
||||
.getPage());
|
||||
sectionText.getTextBlocks().addAll(cell.getTextBlocks());
|
||||
sectionText.getSectionAreas().add(sectionArea);
|
||||
}
|
||||
|
||||
@ -325,6 +329,10 @@ public class EntityRedactionService {
|
||||
sectionText.setHeadline(headline);
|
||||
sectionText.setSectionNumber(sectionNumber.intValue());
|
||||
sectionText.setTable(false);
|
||||
sectionText.setImages(images.stream()
|
||||
.map(image -> convert(image, sectionNumber.intValue(), headline))
|
||||
.collect(Collectors.toSet()));
|
||||
sectionText.setTextBlocks(paragraphTextBlocks);
|
||||
classifiedDoc.getSectionText().add(sectionText);
|
||||
}
|
||||
|
||||
|
||||
@ -12,12 +12,12 @@ import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUti
|
||||
import com.iqser.red.service.redaction.v1.server.segmentation.PdfSegmentationService;
|
||||
import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.kie.api.runtime.KieContainer;
|
||||
import org.springframework.stereotype.Service;
|
||||
import org.springframework.web.bind.annotation.RequestBody;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
@ -39,7 +39,6 @@ public class ReanalyzeService {
|
||||
|
||||
public AnalyzeResult analyze(AnalyzeRequest analyzeRequest) {
|
||||
|
||||
|
||||
var pageCount = 0;
|
||||
Document classifiedDoc;
|
||||
|
||||
@ -74,30 +73,28 @@ public class ReanalyzeService {
|
||||
return analyzeResponseService.createAnalyzeResponse(pageCount, redactionLog, changeLog);
|
||||
}
|
||||
|
||||
public AnalyzeResult reanalyze(@RequestBody AnalyzeRequest renalyzeRequest) {
|
||||
var text = redactionStorageService.getText(renalyzeRequest.getProjectId(), renalyzeRequest.getFileId());
|
||||
// new procedure was not applied, we need a complete analysis
|
||||
|
||||
@SneakyThrows
|
||||
public AnalyzeResult reanalyze(@RequestBody AnalyzeRequest analyzeRequest) {
|
||||
|
||||
var redactionLog = redactionStorageService.getRedactionLog(analyzeRequest.getProjectId(), analyzeRequest.getFileId());
|
||||
var text = redactionStorageService.getText(analyzeRequest.getProjectId(), analyzeRequest.getFileId());
|
||||
|
||||
// not yet ready for reanalysis
|
||||
if (text.getNumberOfPages() == 0) {
|
||||
return analyze(AnalyzeRequest.builder()
|
||||
.ruleSetId(renalyzeRequest.getRuleSetId())
|
||||
.manualRedactions(renalyzeRequest.getManualRedactions())
|
||||
.projectId(renalyzeRequest.getProjectId())
|
||||
.fileId(renalyzeRequest.getFileId())
|
||||
.build());
|
||||
return analyze(analyzeRequest);
|
||||
}
|
||||
var redactionLog = redactionStorageService.getRedactionLog(renalyzeRequest.getProjectId(), renalyzeRequest.getFileId());
|
||||
|
||||
DictionaryIncrement dictionaryIncrement = dictionaryService.getDictionaryIncrements(analyzeRequest.getRuleSetId(), redactionLog.getDictionaryVersion());
|
||||
|
||||
DictionaryIncrement dictionaryIncrement = dictionaryService.getDictionaryIncrements(renalyzeRequest.getRuleSetId(), redactionLog.getDictionaryVersion());
|
||||
|
||||
Set<String> manualForceAndRemoveIds = getForceAndRemoveIds(renalyzeRequest.getManualRedactions());
|
||||
Set<String> manualForceAndRemoveIds = getForceAndRemoveIds(analyzeRequest.getManualRedactions());
|
||||
Map<String, List<Comment>> comments = null;
|
||||
Set<ManualRedactionEntry> manualAdds = null;
|
||||
|
||||
if (renalyzeRequest.getManualRedactions() != null) {
|
||||
if (analyzeRequest.getManualRedactions() != null) {
|
||||
// TODO comments will be removed from redactionLog, so we ignore this first.
|
||||
comments = renalyzeRequest.getManualRedactions().getComments();
|
||||
manualAdds = renalyzeRequest.getManualRedactions().getEntriesToAdd();
|
||||
comments = analyzeRequest.getManualRedactions().getComments();
|
||||
manualAdds = analyzeRequest.getManualRedactions().getEntriesToAdd();
|
||||
}
|
||||
|
||||
Set<Integer> sectionsToReanalyse = new HashSet<>();
|
||||
@ -131,115 +128,114 @@ public class ReanalyzeService {
|
||||
}
|
||||
}
|
||||
|
||||
log.info("Should reanalyze {} sections for request: {}", sectionsToReanalyse.size(), analyzeRequest);
|
||||
|
||||
if (sectionsToReanalyse.isEmpty() && (manualAdds == null || manualAdds.isEmpty())) {
|
||||
redactionLog.setDictionaryVersion(dictionaryIncrement.getDictionaryVersion());
|
||||
var changeLog = redactionChangeLogService.createAndStoreChangeLog(renalyzeRequest.getProjectId(), renalyzeRequest.getFileId(), redactionLog);
|
||||
redactionStorageService.storeObject(renalyzeRequest.getProjectId(), renalyzeRequest.getFileId(), FileType.REDACTION_LOG, redactionLog);
|
||||
var changeLog = redactionChangeLogService.createAndStoreChangeLog(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), redactionLog);
|
||||
redactionStorageService.storeObject(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), FileType.REDACTION_LOG, redactionLog);
|
||||
return analyzeResponseService.createAnalyzeResponse(text.getNumberOfPages(), redactionLog, changeLog);
|
||||
}
|
||||
|
||||
try {
|
||||
List<SectionText> reanalysisSections = new ArrayList<>();
|
||||
|
||||
List<SectionText> reanalysisSections = new ArrayList<>();
|
||||
for (SectionText sectionText : text.getSectionTexts()) {
|
||||
|
||||
if (sectionsToReanalyse.contains(sectionText.getSectionNumber())) {
|
||||
reanalysisSections.add(sectionText);
|
||||
}
|
||||
for (SectionText sectionText : text.getSectionTexts()) {
|
||||
|
||||
if (sectionsToReanalyse.contains(sectionText.getSectionNumber())) {
|
||||
reanalysisSections.add(sectionText);
|
||||
}
|
||||
|
||||
KieContainer kieContainer = droolsExecutionService.updateRules(renalyzeRequest.getRuleSetId());
|
||||
|
||||
Dictionary dictionary = dictionaryService.getDeepCopyDictionary(renalyzeRequest.getRuleSetId());
|
||||
|
||||
List<SectionSearchableTextPair> sectionSearchableTextPairs = new ArrayList<>();
|
||||
for (SectionText reanalysisSection : reanalysisSections) {
|
||||
|
||||
Set<Entity> entities = entityRedactionService.findEntities(reanalysisSection.getSearchableText(), reanalysisSection
|
||||
.getHeadline(), reanalysisSection.getSectionNumber(), dictionary, false);
|
||||
if (reanalysisSection.getCellStarts() != null) {
|
||||
surroundingWordsService.addSurroundingText(entities, reanalysisSection.getSearchableText(), dictionary, reanalysisSection
|
||||
.getCellStarts());
|
||||
} else {
|
||||
surroundingWordsService.addSurroundingText(entities, reanalysisSection.getSearchableText(), dictionary);
|
||||
}
|
||||
|
||||
sectionSearchableTextPairs.add(new SectionSearchableTextPair(Section.builder()
|
||||
.isLocal(false)
|
||||
.dictionaryTypes(dictionary.getTypes())
|
||||
.entities(entities)
|
||||
.text(reanalysisSection.getSearchableText().getAsStringWithLinebreaks())
|
||||
.searchText(reanalysisSection.getSearchableText().toString())
|
||||
.headline(reanalysisSection.getHeadline())
|
||||
.sectionNumber(reanalysisSection.getSectionNumber())
|
||||
.tabularData(reanalysisSection.getTabularData())
|
||||
.searchableText(reanalysisSection.getSearchableText())
|
||||
.dictionary(dictionary)
|
||||
.images(reanalysisSection.getImages())
|
||||
.build(), reanalysisSection.getSearchableText()));
|
||||
}
|
||||
|
||||
Set<Entity> entities = new HashSet<>();
|
||||
Map<Integer, Set<Image>> imagesPerPage = new HashMap<>();
|
||||
sectionSearchableTextPairs.forEach(sectionSearchableTextPair -> {
|
||||
Section analysedRowSection = droolsExecutionService.executeRules(kieContainer, sectionSearchableTextPair
|
||||
.getSection());
|
||||
entities.addAll(analysedRowSection.getEntities());
|
||||
EntitySearchUtils.removeEntitiesContainedInLarger(entities);
|
||||
|
||||
for (Image image : analysedRowSection.getImages()) {
|
||||
imagesPerPage.computeIfAbsent(image.getPage(), (a) -> new HashSet<>()).add(image);
|
||||
}
|
||||
|
||||
});
|
||||
|
||||
Map<Integer, List<Entity>> entitiesPerPage = new HashMap<>();
|
||||
for (Entity entity : entities) {
|
||||
Map<Integer, List<EntityPositionSequence>> sequenceOnPage = new HashMap<>();
|
||||
for (EntityPositionSequence entityPositionSequence : entity.getPositionSequences()) {
|
||||
sequenceOnPage.computeIfAbsent(entityPositionSequence.getPageNumber(), (x) -> new ArrayList<>())
|
||||
.add(entityPositionSequence);
|
||||
}
|
||||
|
||||
for (Map.Entry<Integer, List<EntityPositionSequence>> entry : sequenceOnPage.entrySet()) {
|
||||
entitiesPerPage.computeIfAbsent(entry.getKey(), (x) -> new ArrayList<>())
|
||||
.add(new Entity(entity.getWord(), entity.getType(), entity.isRedaction(), entity.getRedactionReason(), entry
|
||||
.getValue(), entity.getHeadline(), entity.getMatchedRule(), entity.getSectionNumber(), entity
|
||||
.getLegalBasis(), entity.isDictionaryEntry(), entity.getTextBefore(), entity.getTextAfter(), entity
|
||||
.getStart(), entity.getEnd()));
|
||||
}
|
||||
}
|
||||
|
||||
List<RedactionLogEntry> newRedactionLogEntries = new ArrayList<>();
|
||||
for (int page = 1; page <= text.getNumberOfPages(); page++) {
|
||||
if (entitiesPerPage.get(page) != null) {
|
||||
newRedactionLogEntries.addAll(redactionLogCreatorService.addEntries(entitiesPerPage, renalyzeRequest
|
||||
.getManualRedactions(), page, renalyzeRequest.getRuleSetId()));
|
||||
}
|
||||
|
||||
if (imagesPerPage.get(page) != null) {
|
||||
newRedactionLogEntries.addAll(redactionLogCreatorService.addImageEntries(imagesPerPage, renalyzeRequest
|
||||
.getManualRedactions(), page, renalyzeRequest.getRuleSetId()));
|
||||
}
|
||||
|
||||
newRedactionLogEntries.addAll(redactionLogCreatorService.addManualAddEntries(manualAdds, comments, page, renalyzeRequest
|
||||
.getRuleSetId()));
|
||||
}
|
||||
|
||||
redactionLog.getRedactionLogEntry().removeIf(entry -> sectionsToReanalyse.contains(entry.getSectionNumber()) && !entry.isImage() || entry.getSectionNumber() == 0 && !entry.isImage());
|
||||
redactionLog.getRedactionLogEntry().addAll(newRedactionLogEntries);
|
||||
redactionLog.setDictionaryVersion(dictionaryIncrement.getDictionaryVersion());
|
||||
|
||||
var changeLog = redactionChangeLogService.createAndStoreChangeLog(renalyzeRequest.getProjectId(), renalyzeRequest.getFileId(), redactionLog);
|
||||
redactionStorageService.storeObject(renalyzeRequest.getProjectId(), renalyzeRequest.getFileId(), FileType.REDACTION_LOG, redactionLog);
|
||||
return analyzeResponseService.createAnalyzeResponse(text.getNumberOfPages(), redactionLog, changeLog);
|
||||
|
||||
|
||||
} catch (Exception e) {
|
||||
throw new RedactionException(e);
|
||||
}
|
||||
|
||||
|
||||
//--
|
||||
|
||||
KieContainer kieContainer = droolsExecutionService.updateRules(analyzeRequest.getRuleSetId());
|
||||
|
||||
Dictionary dictionary = dictionaryService.getDeepCopyDictionary(analyzeRequest.getRuleSetId());
|
||||
|
||||
List<SectionSearchableTextPair> sectionSearchableTextPairs = new ArrayList<>();
|
||||
for (SectionText reanalysisSection : reanalysisSections) {
|
||||
|
||||
Set<Entity> entities = entityRedactionService.findEntities(reanalysisSection.getSearchableText(), reanalysisSection
|
||||
.getHeadline(), reanalysisSection.getSectionNumber(), dictionary, false);
|
||||
if (reanalysisSection.getCellStarts() != null) {
|
||||
surroundingWordsService.addSurroundingText(entities, reanalysisSection.getSearchableText(), dictionary, reanalysisSection
|
||||
.getCellStarts());
|
||||
} else {
|
||||
surroundingWordsService.addSurroundingText(entities, reanalysisSection.getSearchableText(), dictionary);
|
||||
}
|
||||
|
||||
sectionSearchableTextPairs.add(new SectionSearchableTextPair(Section.builder()
|
||||
.isLocal(false)
|
||||
.dictionaryTypes(dictionary.getTypes())
|
||||
.entities(entities)
|
||||
.text(reanalysisSection.getSearchableText().getAsStringWithLinebreaks())
|
||||
.searchText(reanalysisSection.getSearchableText().toString())
|
||||
.headline(reanalysisSection.getHeadline())
|
||||
.sectionNumber(reanalysisSection.getSectionNumber())
|
||||
.tabularData(reanalysisSection.getTabularData())
|
||||
.searchableText(reanalysisSection.getSearchableText())
|
||||
.dictionary(dictionary)
|
||||
.images(reanalysisSection.getImages())
|
||||
.build(), reanalysisSection.getSearchableText()));
|
||||
}
|
||||
|
||||
Set<Entity> entities = new HashSet<>();
|
||||
Map<Integer, Set<Image>> imagesPerPage = new HashMap<>();
|
||||
sectionSearchableTextPairs.forEach(sectionSearchableTextPair -> {
|
||||
Section analysedRowSection = droolsExecutionService.executeRules(kieContainer, sectionSearchableTextPair
|
||||
.getSection());
|
||||
entities.addAll(analysedRowSection.getEntities());
|
||||
EntitySearchUtils.removeEntitiesContainedInLarger(entities);
|
||||
|
||||
for (Image image : analysedRowSection.getImages()) {
|
||||
imagesPerPage.computeIfAbsent(image.getPage(), (a) -> new HashSet<>()).add(image);
|
||||
}
|
||||
|
||||
});
|
||||
|
||||
Map<Integer, List<Entity>> entitiesPerPage = new HashMap<>();
|
||||
for (Entity entity : entities) {
|
||||
Map<Integer, List<EntityPositionSequence>> sequenceOnPage = new HashMap<>();
|
||||
for (EntityPositionSequence entityPositionSequence : entity.getPositionSequences()) {
|
||||
sequenceOnPage.computeIfAbsent(entityPositionSequence.getPageNumber(), (x) -> new ArrayList<>())
|
||||
.add(entityPositionSequence);
|
||||
}
|
||||
|
||||
for (Map.Entry<Integer, List<EntityPositionSequence>> entry : sequenceOnPage.entrySet()) {
|
||||
entitiesPerPage.computeIfAbsent(entry.getKey(), (x) -> new ArrayList<>())
|
||||
.add(new Entity(entity.getWord(), entity.getType(), entity.isRedaction(), entity.getRedactionReason(), entry
|
||||
.getValue(), entity.getHeadline(), entity.getMatchedRule(), entity.getSectionNumber(), entity
|
||||
.getLegalBasis(), entity.isDictionaryEntry(), entity.getTextBefore(), entity.getTextAfter(), entity
|
||||
.getStart(), entity.getEnd()));
|
||||
}
|
||||
}
|
||||
|
||||
List<RedactionLogEntry> newRedactionLogEntries = new ArrayList<>();
|
||||
for (int page = 1; page <= text.getNumberOfPages(); page++) {
|
||||
if (entitiesPerPage.get(page) != null) {
|
||||
newRedactionLogEntries.addAll(redactionLogCreatorService.addEntries(entitiesPerPage, analyzeRequest
|
||||
.getManualRedactions(), page, analyzeRequest.getRuleSetId()));
|
||||
}
|
||||
|
||||
if (imagesPerPage.get(page) != null) {
|
||||
newRedactionLogEntries.addAll(redactionLogCreatorService.addImageEntries(imagesPerPage, analyzeRequest
|
||||
.getManualRedactions(), page, analyzeRequest.getRuleSetId()));
|
||||
}
|
||||
|
||||
newRedactionLogEntries.addAll(redactionLogCreatorService.addManualAddEntries(manualAdds, comments, page, analyzeRequest
|
||||
.getRuleSetId()));
|
||||
}
|
||||
|
||||
|
||||
redactionLog.getRedactionLogEntry().removeIf(entry -> sectionsToReanalyse.contains(entry.getSectionNumber()) && !entry.isImage() || entry.getSectionNumber() == 0 && !entry.isImage());
|
||||
redactionLog.getRedactionLogEntry().addAll(newRedactionLogEntries);
|
||||
redactionLog.setDictionaryVersion(dictionaryIncrement.getDictionaryVersion());
|
||||
|
||||
var changeLog = redactionChangeLogService.createAndStoreChangeLog(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), redactionLog);
|
||||
redactionStorageService.storeObject(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), FileType.REDACTION_LOG, redactionLog);
|
||||
return analyzeResponseService.createAnalyzeResponse(text.getNumberOfPages(), redactionLog, changeLog);
|
||||
|
||||
}
|
||||
|
||||
|
||||
@ -262,7 +258,7 @@ public class ReanalyzeService {
|
||||
|
||||
return Image.builder()
|
||||
.type(entry.getType())
|
||||
.position(new Rectangle2D.Float(position.getTopLeft().getX(), position.getTopLeft()
|
||||
.position(new RedRectangle2D(position.getTopLeft().getX(), position.getTopLeft()
|
||||
.getY(), position.getWidth(), position.getHeight()))
|
||||
.sectionNumber(entry.getSectionNumber())
|
||||
.section(entry.getSection())
|
||||
|
||||
@ -4,6 +4,7 @@ import com.iqser.red.service.redaction.v1.model.*;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Paragraph;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.model.RedTextPosition;
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionSequence;
|
||||
@ -14,7 +15,6 @@ import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import org.apache.commons.collections4.CollectionUtils;
|
||||
import org.apache.pdfbox.text.TextPosition;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import java.util.ArrayList;
|
||||
@ -272,24 +272,24 @@ public class RedactionLogCreatorService {
|
||||
}
|
||||
|
||||
|
||||
private List<Rectangle> getRectanglesPerLine(List<TextPosition> textPositions, int page) {
|
||||
private List<Rectangle> getRectanglesPerLine(List<RedTextPosition> textPositions, int page) {
|
||||
|
||||
List<Rectangle> rectangles = new ArrayList<>();
|
||||
if (textPositions.size() == 1) {
|
||||
rectangles.add(new TextPositionSequence(textPositions, page).getRectangle());
|
||||
rectangles.add( TextPositionSequence.fromData(textPositions, page).getRectangle());
|
||||
} else {
|
||||
float y = textPositions.get(0).getYDirAdj();
|
||||
int startIndex = 0;
|
||||
for (int i = 1; i < textPositions.size(); i++) {
|
||||
float yDirAdj = textPositions.get(i).getYDirAdj();
|
||||
if (yDirAdj != y) {
|
||||
rectangles.add(new TextPositionSequence(textPositions.subList(startIndex, i), page).getRectangle());
|
||||
rectangles.add( TextPositionSequence.fromData(textPositions.subList(startIndex, i), page).getRectangle());
|
||||
y = yDirAdj;
|
||||
startIndex = i;
|
||||
}
|
||||
}
|
||||
if (startIndex != textPositions.size()) {
|
||||
rectangles.add(new TextPositionSequence(textPositions.subList(startIndex, textPositions.size()), page).getRectangle());
|
||||
rectangles.add( TextPositionSequence.fromData(textPositions.subList(startIndex, textPositions.size()), page).getRectangle());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -3,9 +3,9 @@ package com.iqser.red.service.redaction.v1.server.redaction.utils;
|
||||
import com.google.common.hash.HashFunction;
|
||||
import com.google.common.hash.Hashing;
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.RedRectangle2D;
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.List;
|
||||
|
||||
@ -25,12 +25,8 @@ public class IdBuilder {
|
||||
}
|
||||
|
||||
|
||||
public String buildId(Rectangle2D rectangle2D, int page) {
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
sb.append("x").append(rectangle2D.getX()).append("y").append(rectangle2D.getY()).append("h").append(rectangle2D.getHeight()).append("w").append(rectangle2D.getWidth()).append("p").append(page);
|
||||
|
||||
return hashFunction.hashString(sb.toString(), StandardCharsets.UTF_8).toString();
|
||||
public String buildId(RedRectangle2D rectangle2D, int page) {
|
||||
return hashFunction.hashString("x" + rectangle2D.getX() + "y" + rectangle2D.getY() + "h" + rectangle2D.getHeight() + "w" + rectangle2D.getWidth() + "p" + page, StandardCharsets.UTF_8).toString();
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -1,21 +1,15 @@
|
||||
package com.iqser.red.service.redaction.v1.server.segmentation;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.model.SectionArea;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.SectionText;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.service.BlockificationService;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.service.ClassificationService;
|
||||
import com.iqser.red.service.redaction.v1.server.exception.RedactionException;
|
||||
import com.iqser.red.service.redaction.v1.server.memory.MemoryStats;
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.PDFAreaTextStripper;
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.PDFLinesTextStripper;
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.CellValue;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.service.ImageClassificationService;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.CleanRulings;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.service.RulingCleaningService;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.service.TableExtractionService;
|
||||
@ -28,15 +22,12 @@ import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.io.File;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
@ -53,80 +44,11 @@ public class PdfSegmentationService {
|
||||
private final ImageClassificationService imageClassificationService;
|
||||
|
||||
|
||||
private void postProcessSections(PDDocument pdDocument, List<SectionText> texts) {
|
||||
|
||||
try {
|
||||
for (SectionText sectionText : texts) {
|
||||
|
||||
List<TextBlock> textBlocks = new ArrayList<>();
|
||||
|
||||
Map<Integer, List<SectionArea>> sectionAreasPerPage = new HashMap<>();
|
||||
for (SectionArea sectionArea : sectionText.getSectionAreas()) {
|
||||
sectionAreasPerPage.computeIfAbsent(sectionArea.getPage(), (x) -> new ArrayList<>())
|
||||
.add(sectionArea);
|
||||
}
|
||||
|
||||
Map<String, CellValue> tabularData = new HashMap<>();
|
||||
List<Integer> cellStarts = new ArrayList<>();
|
||||
for (Integer page : sectionAreasPerPage.keySet()) {
|
||||
List<SectionArea> areasOnPage = sectionAreasPerPage.get(page);
|
||||
|
||||
PDPage pdPage = pdDocument.getPage(page - 1);
|
||||
PDRectangle cropBox = pdPage.getCropBox();
|
||||
PDFAreaTextStripper textStripper = new PDFAreaTextStripper();
|
||||
textStripper.setPageNumber(page);
|
||||
|
||||
int cellStart = 0;
|
||||
for (SectionArea sectionArea : areasOnPage) {
|
||||
|
||||
Rectangle2D rect = null;
|
||||
if (pdPage.getRotation() == 90) {
|
||||
rect = new Rectangle2D.Float(sectionArea.getTopLeft().getY(), sectionArea.getTopLeft()
|
||||
.getX(), sectionArea.getHeight(), sectionArea.getWidth() + 0.001f);
|
||||
} else {
|
||||
rect = new Rectangle2D.Float(sectionArea.getTopLeft().getX(), -sectionArea.getTopLeft()
|
||||
.getY() + cropBox.getUpperRightY() - sectionArea.getHeight(), sectionArea.getWidth(), sectionArea
|
||||
.getHeight() + 0.001f);
|
||||
}
|
||||
|
||||
textStripper.addRegion(String.valueOf(1), rect);
|
||||
textStripper.extractRegions(pdPage);
|
||||
textStripper.getTextForRegion(String.valueOf(1));
|
||||
List<TextPositionSequence> positions = textStripper.getTextPositionSequences();
|
||||
|
||||
TextBlock textBlock = new TextBlock(sectionArea.getTopLeft().getX(), sectionArea.getTopLeft()
|
||||
.getX() + sectionArea.getWidth(), sectionArea.getTopLeft()
|
||||
.getY(), sectionArea.getTopLeft().getY() + sectionArea.getHeight(), positions, 0);
|
||||
|
||||
if (sectionText.isTable()) {
|
||||
Cell cell = new Cell();
|
||||
cell.addTextBlock(textBlock);
|
||||
tabularData.put(sectionArea.getHeader(), new CellValue(cell.getTextBlocks(), cellStart));
|
||||
cellStarts.add(cellStart);
|
||||
cellStart = cellStart + cell.toString().trim().length() + 1;
|
||||
}
|
||||
|
||||
textBlocks.add(textBlock);
|
||||
textStripper.clearPositions();
|
||||
}
|
||||
|
||||
}
|
||||
sectionText.setTextBlocks(textBlocks);
|
||||
sectionText.setTabularData(tabularData);
|
||||
if (sectionText.isTable()) {
|
||||
sectionText.setCellStarts(cellStarts);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
} catch (Exception e) {
|
||||
throw new RedactionException(e);
|
||||
}
|
||||
|
||||
public Document parseDocument(InputStream documentInputStream) throws IOException {
|
||||
return parseDocument(documentInputStream, false);
|
||||
}
|
||||
|
||||
|
||||
public Document parseDocument(InputStream documentInputStream) throws IOException {
|
||||
public Document parseDocument(InputStream documentInputStream, boolean ignoreImages) throws IOException {
|
||||
PDDocument pdDocument = null;
|
||||
try {
|
||||
//create tempFile
|
||||
@ -166,24 +88,23 @@ public class PdfSegmentationService {
|
||||
|
||||
Page page = blockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings
|
||||
.getVertical());
|
||||
|
||||
page.setRotation(rotation);
|
||||
|
||||
tableExtractionService.extractTables(cleanRulings, page);
|
||||
|
||||
buildPageStatistics(page);
|
||||
|
||||
page.setLandscape(isLandscape || isRotated);
|
||||
|
||||
page.setPageNumber(pageNumber);
|
||||
increaseDocumentStatistics(page, document);
|
||||
|
||||
page.setImages(stripper.getImages());
|
||||
|
||||
imageClassificationService.classifyImages(page);
|
||||
tableExtractionService.extractTables(cleanRulings, page);
|
||||
buildPageStatistics(page);
|
||||
increaseDocumentStatistics(page, document);
|
||||
|
||||
|
||||
if (!ignoreImages) {
|
||||
imageClassificationService.classifyImages(page);
|
||||
}
|
||||
|
||||
pages.add(page);
|
||||
|
||||
|
||||
}
|
||||
|
||||
document.setPages(pages);
|
||||
@ -194,9 +115,6 @@ public class PdfSegmentationService {
|
||||
|
||||
pdDocument = reinitializePDDocument(tempFile, pdDocument);
|
||||
|
||||
// This can be improved an done in one pass, but it's complicated to do right away
|
||||
postProcessSections(pdDocument, document.getSectionText());
|
||||
|
||||
IOUtils.close(pdDocument);
|
||||
|
||||
tempFile.delete();
|
||||
|
||||
@ -50,7 +50,7 @@ public class RedactionStorageService {
|
||||
try {
|
||||
return objectMapper.readValue(inputStreamResource.getInputStream(), RedactionLog.class);
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException("Could not convert Text", e);
|
||||
throw new RuntimeException("Could not convert RedactionLog", e);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
package com.iqser.red.service.redaction.v1.server.tableextraction.model;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import com.iqser.red.service.redaction.v1.model.Rectangle;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
@ -27,10 +28,12 @@ public abstract class AbstractTextContainer {
|
||||
return page == other.getPage() && this.minX <= other.getTopLeft().getX() && this.maxX >= other.getTopLeft().getX() + other.getWidth() && this.minY <= other.getTopLeft().getY() && this.maxY >= other.getTopLeft().getY() + other.getHeight();
|
||||
}
|
||||
|
||||
@JsonIgnore
|
||||
public float getHeight() {
|
||||
return maxY - minY;
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public float getWidth() {
|
||||
return maxX - minX;
|
||||
}
|
||||
|
||||
@ -12,11 +12,11 @@ import java.io.FileOutputStream;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
public class FilySystemBackedStorageService extends StorageService {
|
||||
public class FileSystemBackedStorageService extends StorageService {
|
||||
|
||||
private Map<String, File> dataMap = new HashMap<>();
|
||||
private final Map<String, File> dataMap = new HashMap<>();
|
||||
|
||||
public FilySystemBackedStorageService() {
|
||||
public FileSystemBackedStorageService() {
|
||||
super(null, null);
|
||||
}
|
||||
|
||||
@ -134,7 +134,7 @@ public class RedactionIntegrationTest {
|
||||
@Bean
|
||||
@Primary
|
||||
public StorageService inmemoryStorage() {
|
||||
return new FilySystemBackedStorageService();
|
||||
return new FileSystemBackedStorageService();
|
||||
}
|
||||
|
||||
}
|
||||
@ -142,8 +142,8 @@ public class RedactionIntegrationTest {
|
||||
|
||||
@After
|
||||
public void cleanupStorage() {
|
||||
if (this.storageService instanceof FilySystemBackedStorageService) {
|
||||
((FilySystemBackedStorageService) this.storageService).clearStorage();
|
||||
if (this.storageService instanceof FileSystemBackedStorageService) {
|
||||
((FileSystemBackedStorageService) this.storageService).clearStorage();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -2,7 +2,7 @@ package com.iqser.red.service.redaction.v1.server.redaction.service;
|
||||
|
||||
import com.amazonaws.services.s3.AmazonS3;
|
||||
import com.iqser.red.service.configuration.v1.api.model.*;
|
||||
import com.iqser.red.service.redaction.v1.server.FilySystemBackedStorageService;
|
||||
import com.iqser.red.service.redaction.v1.server.FileSystemBackedStorageService;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
|
||||
import com.iqser.red.service.redaction.v1.server.client.DictionaryClient;
|
||||
import com.iqser.red.service.redaction.v1.server.client.RulesClient;
|
||||
@ -97,7 +97,7 @@ public class EntityRedactionServiceTest {
|
||||
@Bean
|
||||
@Primary
|
||||
public StorageService inmemoryStorage() {
|
||||
return new FilySystemBackedStorageService();
|
||||
return new FileSystemBackedStorageService();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user