Merge branch 'master' of ssh://git.iqser.com:2222/red/redaction-service

This commit is contained in:
aoezyetimoglu 2021-12-17 11:52:52 +01:00
commit 3c1f4b4853
8 changed files with 161 additions and 79 deletions

View File

@ -1,22 +1,24 @@
package com.iqser.red.service.redaction.v1.server.parsing.model;
import static java.util.stream.Collectors.toSet;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
import com.iqser.red.service.redaction.v1.model.Point;
import com.iqser.red.service.redaction.v1.model.Rectangle;
import lombok.Data;
import lombok.NoArgsConstructor;
import org.apache.pdfbox.text.TextPosition;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.pdfbox.text.TextPosition;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
import com.iqser.red.service.redaction.v1.model.Point;
import com.iqser.red.service.redaction.v1.model.Rectangle;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@Data
@NoArgsConstructor
@JsonIgnoreProperties({ "empty" })
@JsonIgnoreProperties({"empty"})
public class TextPositionSequence implements CharSequence {
private int page;
@ -25,12 +27,15 @@ public class TextPositionSequence implements CharSequence {
private float x1;
private float x2;
/**
 * Creates a sequence bound to a page. Only the {@code page} field is assigned
 * by this constructor; every other field keeps its default value.
 *
 * @param page value stored in the {@code page} field
 */
public TextPositionSequence(int page) {
this.page = page;
}
public static TextPositionSequence fromData(List<RedTextPosition> textPositions, int page) {
var textPositionSequence = new TextPositionSequence();
textPositionSequence.textPositions = textPositions;
textPositionSequence.page = page;
@ -46,9 +51,6 @@ public class TextPositionSequence implements CharSequence {
}
@Override
public int length() {
@ -131,6 +133,7 @@ public class TextPositionSequence implements CharSequence {
}
}
@JsonIgnore
public float getRotationAdjustedY() {
@ -190,10 +193,8 @@ public class TextPositionSequence implements CharSequence {
/**
 * Returns the font name of the first text position, lower-cased and with every
 * {@code ",bold"} and {@code ",italic"} marker removed.
 *
 * @return the normalised font name of the first text position
 */
@JsonIgnore
public String getFont() {
    // Lower-case with a fixed locale: with the default locale a Turkish/Azeri JVM maps
    // 'I' to a dotless 'ı', so ",Italic" would no longer match the ",italic" marker below.
    // The markers are plain literals, hence String.replace instead of the regex-based replaceAll.
    return textPositions.get(0)
            .getFontName()
            .toLowerCase(java.util.Locale.ROOT)
            .replace(",bold", "")
            .replace(",italic", "");
}
@ -214,27 +215,33 @@ public class TextPositionSequence implements CharSequence {
}
/**
 * Returns the {@code getFontSizeInPt()} value of the first text position in this sequence.
 */
@JsonIgnore
public float getFontSize() {
    final var leadingPosition = textPositions.get(0);
    return leadingPosition.getFontSizeInPt();
}
/**
 * Returns the {@code getWidthOfSpace()} value of the first text position in this sequence.
 */
@JsonIgnore
public float getSpaceWidth() {
    final var leadingPosition = textPositions.get(0);
    return leadingPosition.getWidthOfSpace();
}
/**
 * Returns the {@code getRotation()} value of the first text position in this sequence.
 */
@JsonIgnore
public int getRotation() {
    final var leadingPosition = textPositions.get(0);
    return leadingPosition.getRotation();
}
@JsonIgnore
public Rectangle getRectangle() {
log.debug("Page: '{}', Word: '{}', Rotation: '{}'", page, toString(), textPositions.get(0).getRotation());
float height = getTextHeight();
float posXInit = getX1();
@ -246,36 +253,45 @@ public class TextPositionSequence implements CharSequence {
posXEnd = textPositions.get(0).getYDirAdj() + 2;
posYInit = getY1();
posYEnd = textPositions.get(textPositions.size() - 1).getXDirAdj() - height + 4;
} else if (textPositions.get(0).getRotation() == 270) {
} else if (textPositions.get(0).getRotation() == 270 && textPositions.get(0).getDir() == 270.0f) {
posYInit = textPositions.get(0).getPageHeight() - getX1();
posYEnd = textPositions.get(0).getPageHeight() - getX2() - textPositions.get(0)
.getWidth() - textPositions.get(textPositions.size() - 1).getWidth() - 1;
posXInit = textPositions.get(0).getPageWidth() - textPositions.get(0).getYDirAdj() - 2;
posXEnd = textPositions.get(0).getPageWidth() - textPositions.get(textPositions.size() - 1)
.getYDirAdj() + height;
} else if(textPositions.get(0).getRotation() == 0 && textPositions.get(0).getDir() == 270f) {
} else if (textPositions.get(0).getRotation() == 270 && textPositions.get(0).getDir() == 0.0f) {
posYInit = textPositions.get(0).getPageHeight() - textPositions.get(0).getYDirAdj() - 2;
posYEnd = posYInit + 1;
posXInit = textPositions.get(0).getXDirAdj();
posXEnd = textPositions.get(textPositions.size() - 1).getXDirAdj() + textPositions.get(textPositions.size() - 1).getWidthDirAdj() + 0.1f;
} else if (textPositions.get(0).getRotation() == 0 && textPositions.get(0).getDir() == 270f) {
posYInit = textPositions.get(0).getPageHeight() - getX1();
posYEnd = textPositions.get(0).getPageHeight() - getX2() - textPositions.get(0)
.getWidthDirAdj() - textPositions.get(textPositions.size() - 1).getWidthDirAdj() - 3;
posXInit = textPositions.get(0).getPageWidth() - textPositions.get(0).getYDirAdj() - 2;
posXEnd = textPositions.get(0).getPageWidth() - textPositions.get(textPositions.size() - 1)
.getYDirAdj() + height;
} else if(textPositions.get(0).getRotation() == 90 && textPositions.get(0).getDir() == 0.0f){
posXInit = textPositions.get(textPositions.size() - 1)
} else if (textPositions.get(0).getRotation() == 90 && textPositions.get(0).getDir() == 0.0f) {
posXInit = textPositions.get(textPositions.size() - 1)
.getXDirAdj() + textPositions.get(textPositions.size() - 1).getHeightDir();
posXEnd = textPositions.get(0).getXDirAdj();
posYInit = textPositions.get(0).getPageHeight() - textPositions.get(0).getYDirAdj() - 2;
posYEnd = textPositions.get(0).getPageHeight() - textPositions.get(textPositions.size() - 1)
.getYDirAdj() + 2;
} else if(textPositions.get(0).getRotation() == 0 && textPositions.get(0).getDir() == 90f){
posYInit = getX1();
posYEnd = getX2() + textPositions.get(0)
.getWidthDirAdj() - textPositions.get(textPositions.size() - 1).getWidthDirAdj() - 3;
posXInit = textPositions.get(0).getYDirAdj() + 2;
posXEnd = textPositions.get(textPositions.size() - 1)
.getYDirAdj() - height;
}
else {
} else if (textPositions.get(0).getRotation() == 0 && textPositions.get(0).getDir() == 90f) {
posYInit = getX1();
posYEnd = getX2() + textPositions.get(0).getWidthDirAdj() - textPositions.get(textPositions.size() - 1)
.getWidthDirAdj() - 3;
posXInit = textPositions.get(0).getYDirAdj() + 2;
posXEnd = textPositions.get(textPositions.size() - 1).getYDirAdj() - height;
} else {
posXEnd = textPositions.get(textPositions.size() - 1)
.getXDirAdj() + textPositions.get(textPositions.size() - 1).getWidthDirAdj() + 1;
posYInit = textPositions.get(0).getPageHeight() - textPositions.get(0).getYDirAdj() - 2;
@ -283,7 +299,9 @@ public class TextPositionSequence implements CharSequence {
.getYDirAdj() + 2;
}
return new Rectangle(new Point(posXInit, posYInit), posXEnd - posXInit, posYEnd - posYInit + height, page);
var rectangle = new Rectangle(new Point(posXInit, posYInit), posXEnd - posXInit, posYEnd - posYInit + height, page);
log.debug("Rectangle: {}", rectangle);
return rectangle;
}
}

View File

@ -50,7 +50,13 @@ public class RedactionChangeLogService {
.equals(ChangeType.REMOVED))
.collect(Collectors.toList());
Set<RedactionLogEntry> added = new HashSet<>(currentRedactionLog.getRedactionLogEntry());
Set<RedactionLogEntry> added = new HashSet<>(currentRedactionLog.getRedactionLogEntry().stream()
.filter(entry -> entry.getChanges().isEmpty() || !entry.getChanges()
.get(entry.getChanges().size() - 1)
.getType()
.equals(ChangeType.REMOVED))
.collect(Collectors.toList()));
added.removeAll(notRemovedPreviousEntries);
Set<RedactionLogEntry> removed = new HashSet<>(notRemovedPreviousEntries);

View File

@ -140,14 +140,14 @@ public class RedactionLogCreatorService {
} else {
float x = textPositions.get(0).getXDirAdj();
float y = textPositions.get(0).getYDirAdj();
float width = textPositions.get(0).getWidth();
float width = textPositions.get(0).getWidthDirAdj();
float height = textPositions.get(0).getHeightDir();
int startIndex = 0;
for (int i = 1; i < textPositions.size(); i++) {
float xDirAdj = textPositions.get(i).getXDirAdj();
float yDirAdj = textPositions.get(i).getYDirAdj();
float widthDir = textPositions.get(i).getWidth();
float widthDir = textPositions.get(i).getWidthDirAdj();
float heightDir = textPositions.get(i).getHeightDir();
if (!(isCharInSameLine(y, yDirAdj, height, heightDir) && isCharClose(x, xDirAdj, width))) {

View File

@ -24,7 +24,6 @@ import java.util.UUID;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.junit.After;
import org.junit.Before;
import org.junit.Ignore;
@ -50,7 +49,6 @@ import org.springframework.test.context.junit4.SpringRunner;
import com.amazonaws.services.s3.AmazonS3;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.Sets;
import com.iqser.red.service.persistence.service.v1.api.model.annotations.AnnotationStatus;
import com.iqser.red.service.persistence.service.v1.api.model.annotations.Comment;
import com.iqser.red.service.persistence.service.v1.api.model.annotations.IdRemoval;
@ -84,6 +82,7 @@ import com.iqser.red.service.redaction.v1.server.controller.RedactionController;
import com.iqser.red.service.redaction.v1.server.memory.MemoryStats;
import com.iqser.red.service.redaction.v1.server.redaction.service.AnalyzeService;
import com.iqser.red.service.redaction.v1.server.redaction.service.ManualRedactionSurroundingTextService;
import com.iqser.red.service.redaction.v1.server.redaction.utils.OsUtils;
import com.iqser.red.service.redaction.v1.server.redaction.utils.ResourceLoader;
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService;
@ -596,7 +595,7 @@ public class RedactionIntegrationTest {
.fileId(TEST_FILE_ID)
.build());
try (FileOutputStream fileOutputStream = new FileOutputStream(getTemporaryDirectory() + "/Annotated3.pdf")) {
try (FileOutputStream fileOutputStream = new FileOutputStream(OsUtils.getTemporaryDirectory() + "/Annotated3.pdf")) {
fileOutputStream.write(annotateResponse.getDocument());
}
long rstart = System.currentTimeMillis();
@ -623,13 +622,15 @@ public class RedactionIntegrationTest {
var redactionLog = redactionStorageService.getRedactionLog(TEST_DOSSIER_ID, TEST_FILE_ID);
var values = redactionLog.getRedactionLogEntry().stream().map(RedactionLogEntry::getValue).collect(Collectors.toList());
var values = redactionLog.getRedactionLogEntry()
.stream()
.map(RedactionLogEntry::getValue)
.collect(Collectors.toList());
assertThat(values).containsExactlyInAnyOrder("Lastname M.", "Doe", "Doe J.", "M. Mustermann", "Mustermann M.", "F. Lastname");
}
@Test
@SneakyThrows
public void testIgnoreHint() {
@ -664,17 +665,22 @@ public class RedactionIntegrationTest {
.fileId(TEST_FILE_ID)
.build());
var cbiAddressBeforeHintRemoval = redactionLog.getRedactionLogEntry().stream().filter(re -> re.getType().equalsIgnoreCase("CBI_Address")).findAny().get();
var cbiAddressBeforeHintRemoval = redactionLog.getRedactionLogEntry()
.stream()
.filter(re -> re.getType().equalsIgnoreCase("CBI_Address"))
.findAny()
.get();
assertThat(cbiAddressBeforeHintRemoval.isRedacted()).isFalse();
var cbiAddressAfterHintRemoval = mergedRedactionLog.getRedactionLogEntry().stream().filter(re -> re.getType().equalsIgnoreCase("CBI_Address")).findAny().get();
var cbiAddressAfterHintRemoval = mergedRedactionLog.getRedactionLogEntry()
.stream()
.filter(re -> re.getType().equalsIgnoreCase("CBI_Address"))
.findAny()
.get();
assertThat(cbiAddressAfterHintRemoval.isRedacted()).isTrue();
}
@Test
@Ignore
public void noExceptionShouldBeThrownForAnyFiles() throws IOException {
@ -732,7 +738,7 @@ public class RedactionIntegrationTest {
public void redactionTestSeparatedRedaction() throws IOException {
String fileName = "scanned/VV-380943_page38.pdf";
String outputFileName = getTemporaryDirectory() + "/AnnotatedRedactionTestSeparatedRedaction.pdf";
String outputFileName = OsUtils.getTemporaryDirectory() + "/AnnotatedRedactionTestSeparatedRedaction.pdf";
long start = System.currentTimeMillis();
ClassPathResource pdfFileResource = new ClassPathResource(fileName);
@ -756,7 +762,7 @@ public class RedactionIntegrationTest {
System.out.println("first analysis duration: " + (end - start));
try (FileOutputStream fileOutputStream = new FileOutputStream(getTemporaryDirectory() + "/Test.json")) {
try (FileOutputStream fileOutputStream = new FileOutputStream(OsUtils.getTemporaryDirectory() + "/Test.json")) {
fileOutputStream.write(objectMapper.writeValueAsBytes(redactionStorageService.getText(TEST_DOSSIER_ID, TEST_FILE_ID)));
}
@ -841,15 +847,60 @@ public class RedactionIntegrationTest {
redactionLog = redactionStorageService.getRedactionLog(TEST_DOSSIER_ID, TEST_FILE_ID);
System.out.println("hi");
System.out.println("Output file:" + outputFileName);
}
/**
 * Runs one analysis and three re-analyses of {@code files/new/test1S1T1.pdf} while the
 * AUTHOR dictionary grows ("report", then "assessment report"), and checks that the
 * redaction log entry with value "report" ends up with exactly two recorded changes.
 * The annotated document is written to the temporary directory for manual inspection.
 *
 * @throws IOException if the test resource cannot be read or the annotated PDF cannot be written
 */
@Test
public void testChangeComputation() throws IOException {

    String fileName = "files/new/test1S1T1.pdf";
    String outputFileName = OsUtils.getTemporaryDirectory() + "/Annotated.pdf";

    ClassPathResource pdfFileResource = new ClassPathResource(fileName);

    // Initial run: structure analysis followed by a full analysis.
    AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream());
    analyzeService.analyzeDocumentStructure(new StructureAnalyzeRequest(request.getDossierId(), request.getFileId()));
    analyzeService.analyze(request);

    // Dictionary version 2: "report" is added to the AUTHOR dictionary.
    dictionary.get(AUTHOR).add("report");
    reanlysisVersions.put("report", 2L);
    when(dictionaryClient.getVersion(TEST_DOSSIER_TEMPLATE_ID)).thenReturn(2L);
    when(dictionaryClient.getDictionaryForType(AUTHOR + ":" + TEST_DOSSIER_TEMPLATE_ID)).thenReturn(getDictionaryResponse(AUTHOR, false));
    analyzeService.reanalyze(request);

    // Dictionary version 3: "assessment report" is added as well.
    dictionary.get(AUTHOR).add("assessment report");
    reanlysisVersions.put("assessment report", 3L);
    when(dictionaryClient.getVersion(TEST_DOSSIER_TEMPLATE_ID)).thenReturn(3L);
    when(dictionaryClient.getDictionaryForType(AUTHOR + ":" + TEST_DOSSIER_TEMPLATE_ID)).thenReturn(getDictionaryResponse(AUTHOR, false));
    analyzeService.reanalyze(request);

    // One more re-analysis with the dictionary version still at 3 (same stubbing as above).
    when(dictionaryClient.getVersion(TEST_DOSSIER_TEMPLATE_ID)).thenReturn(3L);
    when(dictionaryClient.getDictionaryForType(AUTHOR + ":" + TEST_DOSSIER_TEMPLATE_ID)).thenReturn(getDictionaryResponse(AUTHOR, false));
    analyzeService.reanalyze(request);

    var redactionLog = redactionStorageService.getRedactionLog(TEST_DOSSIER_ID, TEST_FILE_ID);

    // Fail with a readable message instead of a bare NoSuchElementException when the entry is missing.
    var changes = redactionLog.getRedactionLogEntry()
            .stream()
            .filter(entry -> "report".equals(entry.getValue()))
            .findFirst()
            .orElseThrow(() -> new AssertionError("No redaction log entry with value 'report' found"))
            .getChanges();

    // NOTE(review): presumably the extra re-analysis at version 3 must not add a third change - confirm.
    assertThat(changes).hasSize(2);

    AnnotateResponse annotateResponse = redactionController.annotate(AnnotateRequest.builder()
            .dossierId(TEST_DOSSIER_ID)
            .fileId(TEST_FILE_ID)
            .build());

    try (FileOutputStream fileOutputStream = new FileOutputStream(outputFileName)) {
        fileOutputStream.write(annotateResponse.getDocument());
    }
}
@Test
public void redactionTest() throws IOException {
String fileName = "files/new/VV-919901.pdf";
String outputFileName = getTemporaryDirectory() + "/Annotated.pdf";
String fileName = "files/new/VV-511309.pdf";
String outputFileName = OsUtils.getTemporaryDirectory() + "/Annotated.pdf";
long start = System.currentTimeMillis();
ClassPathResource pdfFileResource = new ClassPathResource(fileName);
@ -873,7 +924,7 @@ public class RedactionIntegrationTest {
System.out.println("first analysis duration: " + (end - start));
try (FileOutputStream fileOutputStream = new FileOutputStream(getTemporaryDirectory() + "/Test.json")) {
try (FileOutputStream fileOutputStream = new FileOutputStream(OsUtils.getTemporaryDirectory() + "/Test.json")) {
fileOutputStream.write(objectMapper.writeValueAsBytes(redactionStorageService.getText(TEST_DOSSIER_ID, TEST_FILE_ID)));
}
@ -958,7 +1009,7 @@ public class RedactionIntegrationTest {
redactionLog = redactionStorageService.getRedactionLog(TEST_DOSSIER_ID, TEST_FILE_ID);
System.out.println("hi");
System.out.println("Output file:" + outputFileName);
}
@ -977,7 +1028,7 @@ public class RedactionIntegrationTest {
.fileId(TEST_FILE_ID)
.build());
try (FileOutputStream fileOutputStream = new FileOutputStream(getTemporaryDirectory() + "/Annotated.pdf")) {
try (FileOutputStream fileOutputStream = new FileOutputStream(OsUtils.getTemporaryDirectory() + "/Annotated.pdf")) {
fileOutputStream.write(annotateResponse.getDocument());
}
long end = System.currentTimeMillis();
@ -1071,7 +1122,7 @@ public class RedactionIntegrationTest {
.fileId(TEST_FILE_ID)
.build());
try (FileOutputStream fileOutputStream = new FileOutputStream(getTemporaryDirectory() + "/Annotated.pdf")) {
try (FileOutputStream fileOutputStream = new FileOutputStream(OsUtils.getTemporaryDirectory() + "/Annotated.pdf")) {
fileOutputStream.write(annotateResponse.getDocument());
}
long end = System.currentTimeMillis();
@ -1097,7 +1148,7 @@ public class RedactionIntegrationTest {
RedactionResult result = redactionController.classify(redactionRequest);
try (FileOutputStream fileOutputStream = new FileOutputStream(getTemporaryDirectory() + "/Classified.pdf")) {
try (FileOutputStream fileOutputStream = new FileOutputStream(OsUtils.getTemporaryDirectory() + "/Classified.pdf")) {
fileOutputStream.write(result.getDocument());
}
}
@ -1119,7 +1170,7 @@ public class RedactionIntegrationTest {
RedactionResult result = redactionController.sections(redactionRequest);
try (FileOutputStream fileOutputStream = new FileOutputStream(getTemporaryDirectory() + "/Sections.pdf")) {
try (FileOutputStream fileOutputStream = new FileOutputStream(OsUtils.getTemporaryDirectory() + "/Sections.pdf")) {
fileOutputStream.write(result.getDocument());
}
}
@ -1141,7 +1192,7 @@ public class RedactionIntegrationTest {
RedactionResult result = redactionController.htmlTables(redactionRequest);
try (FileOutputStream fileOutputStream = new FileOutputStream(getTemporaryDirectory() + "/Tables.html")) {
try (FileOutputStream fileOutputStream = new FileOutputStream(OsUtils.getTemporaryDirectory() + "/Tables.html")) {
fileOutputStream.write(result.getDocument());
}
}
@ -1163,7 +1214,7 @@ public class RedactionIntegrationTest {
RedactionResult result = redactionController.htmlTables(redactionRequest);
try (FileOutputStream fileOutputStream = new FileOutputStream(getTemporaryDirectory() + "/Tables.html")) {
try (FileOutputStream fileOutputStream = new FileOutputStream(OsUtils.getTemporaryDirectory() + "/Tables.html")) {
fileOutputStream.write(result.getDocument());
}
}
@ -1233,7 +1284,7 @@ public class RedactionIntegrationTest {
.fileId(TEST_FILE_ID)
.build());
try (FileOutputStream fileOutputStream = new FileOutputStream(getTemporaryDirectory() + "/Annotated.pdf")) {
try (FileOutputStream fileOutputStream = new FileOutputStream(OsUtils.getTemporaryDirectory() + "/Annotated.pdf")) {
fileOutputStream.write(annotateResponse.getDocument());
}
long end = System.currentTimeMillis();
@ -1366,12 +1417,13 @@ public class RedactionIntegrationTest {
.manualRedactions(manualRedactions)
.build());
try (FileOutputStream fileOutputStream = new FileOutputStream(getTemporaryDirectory() + "/Annotated.pdf")) {
try (FileOutputStream fileOutputStream = new FileOutputStream(OsUtils.getTemporaryDirectory() + "/Annotated.pdf")) {
fileOutputStream.write(annotateResponse.getDocument());
}
}
@Test
public void testExpandByRegEx() throws IOException {
@ -1387,7 +1439,7 @@ public class RedactionIntegrationTest {
.fileId(TEST_FILE_ID)
.build());
try (FileOutputStream fileOutputStream = new FileOutputStream(getTemporaryDirectory() + "/Annotated.pdf")) {
try (FileOutputStream fileOutputStream = new FileOutputStream(OsUtils.getTemporaryDirectory() + "/Annotated.pdf")) {
fileOutputStream.write(annotateResponse.getDocument());
}
long end = System.currentTimeMillis();
@ -1465,7 +1517,7 @@ public class RedactionIntegrationTest {
.manualRedactions(manualRedactions)
.build());
try (FileOutputStream fileOutputStream = new FileOutputStream(getTemporaryDirectory() + "/Annotated.pdf")) {
try (FileOutputStream fileOutputStream = new FileOutputStream(OsUtils.getTemporaryDirectory() + "/Annotated.pdf")) {
fileOutputStream.write(annotateResponse.getDocument());
}
@ -1513,14 +1565,4 @@ public class RedactionIntegrationTest {
}
/**
 * Returns the directory prefix the tests in this class use when building output file paths.
 *
 * @return always the literal {@code "/tmp"}
 */
private static String getTemporaryDirectory() {
// The java.io.tmpdir lookup below is commented out, so the system property is never consulted here.
// String tmpdir = System.getProperty("java.io.tmpdir");
// if (StringUtils.isNotBlank(tmpdir)) {
// return tmpdir;
// }
return "/tmp";
}
}

View File

@ -0,0 +1,22 @@
package com.iqser.red.service.redaction.v1.server.redaction.utils;
import org.apache.commons.lang3.StringUtils;
/**
 * Static helpers that depend on the operating system the JVM runs on.
 *
 * <p>The class only holds static methods, so it is final and cannot be instantiated.
 */
public final class OsUtils {

    /** Directory returned whenever the {@code java.io.tmpdir} property is not used. */
    private static final String DEFAULT_TMP_DIR = "/tmp";

    private OsUtils() {
        // static utility holder - not meant to be instantiated
    }


    /**
     * Tells whether the {@code os.name} system property contains "Windows", ignoring case.
     *
     * @return {@code true} if {@code os.name} is set and contains "windows" in any casing
     */
    public static boolean isWindows() {

        String osName = System.getProperty("os.name");
        // Fixed locale so the comparison does not depend on the JVM's default locale.
        return osName != null && osName.toLowerCase(java.util.Locale.ROOT).contains("windows");
    }


    /**
     * Returns the directory to use for temporary files.
     *
     * @return the {@code java.io.tmpdir} property when running on Windows and the property is
     *         not blank; {@code "/tmp"} in every other case
     */
    public static String getTemporaryDirectory() {

        String tmpdir = System.getProperty("java.io.tmpdir");
        if (isWindows() && !isBlank(tmpdir)) {
            return tmpdir;
        }
        return DEFAULT_TMP_DIR;
    }


    /** Returns {@code true} for {@code null}, empty, or whitespace-only strings. */
    private static boolean isBlank(String value) {

        return value == null || value.chars().allMatch(Character::isWhitespace);
    }

}

View File

@ -35,6 +35,7 @@ import java.util.Collections;
import java.util.List;
import java.util.stream.Collectors;
import static com.iqser.red.service.redaction.v1.server.redaction.utils.OsUtils.getTemporaryDirectory;
import static org.assertj.core.api.Assertions.assertThat;
@ -240,12 +241,5 @@ public class PdfSegmentationServiceTest {
.equals(firstTableHeaderCells))).isTrue();
}
/**
 * Resolves the directory used for temporary test output.
 *
 * @return the {@code java.io.tmpdir} system property when it is not blank, otherwise {@code "/tmp"}
 */
private static String getTemporaryDirectory() {
    final String configuredDir = System.getProperty("java.io.tmpdir");
    return StringUtils.isNotBlank(configuredDir) ? configuredDir : "/tmp";
}
}