Pull request #277: Bugfix/RED-2756

Merge in RED/redaction-service from bugfix/RED-2756 to master

* commit '9d0fafd63ddb23fd8ee2f154e02df88687e08f96':
  RED-2756 Bugfix with redactions are not continuous
  RED-2756 Bugfix for 'Redaction is not continuous', compare line height and y position instead of rounding y values
This commit is contained in:
Philipp Schramm 2021-11-29 10:36:05 +01:00 committed by Dominique Eiflaender
commit f5817204bf
4 changed files with 262 additions and 64 deletions

View File

@ -1,5 +1,15 @@
package com.iqser.red.service.redaction.v1.server.redaction.service;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.commons.collections4.CollectionUtils;
import org.springframework.stereotype.Service;
import com.iqser.red.service.redaction.v1.model.Point;
import com.iqser.red.service.redaction.v1.model.Rectangle;
import com.iqser.red.service.redaction.v1.model.RedactionLogEntry;
@ -10,12 +20,8 @@ import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionS
import com.iqser.red.service.redaction.v1.server.redaction.model.Image;
import com.iqser.red.service.redaction.v1.server.redaction.model.PageEntities;
import com.iqser.red.service.redaction.v1.server.redaction.utils.IdBuilder;
import lombok.RequiredArgsConstructor;
import org.apache.commons.collections4.CollectionUtils;
import org.springframework.stereotype.Service;
import java.util.*;
import java.util.stream.Collectors;
import lombok.RequiredArgsConstructor;
@Service
@RequiredArgsConstructor
@ -24,7 +30,8 @@ public class RedactionLogCreatorService {
private final DictionaryService dictionaryService;
public List<RedactionLogEntry> createRedactionLog(PageEntities pageEntities, int numberOfPages, String dossierTemplateId) {
public List<RedactionLogEntry> createRedactionLog(PageEntities pageEntities, int numberOfPages,
String dossierTemplateId) {
List<RedactionLogEntry> entries = new ArrayList<>();
@ -130,13 +137,19 @@ public class RedactionLogCreatorService {
rectangles.add(TextPositionSequence.fromData(textPositions, page).getRectangle());
} else {
float y = textPositions.get(0).getYDirAdj();
float height = textPositions.get(0).getHeightDir();
int startIndex = 0;
for (int i = 1; i < textPositions.size(); i++) {
float yDirAdj = textPositions.get(i).getYDirAdj();
if (round(yDirAdj,3) != round(y, 3)) {
float heightDir = textPositions.get(i).getHeightDir();
if (!isCharInSameLine(y, yDirAdj, height, heightDir)) {
rectangles.add(TextPositionSequence.fromData(textPositions.subList(startIndex, i), page)
.getRectangle());
y = yDirAdj;
height = heightDir;
startIndex = i;
}
}
@ -149,9 +162,21 @@ public class RedactionLogCreatorService {
return rectangles;
}
private double round(float value, int decimalPoints) {
var d = Math.pow(10, decimalPoints);
return Math.round(value * d) / d;
private boolean isCharInSameLine(float y, float yCompare, float height, float heightCompare) {
float offsetHeight = heightCompare / 5;
float minHeight = heightCompare - offsetHeight;
float maxHeight = heightCompare + offsetHeight;
float offsetY = heightCompare / 22;
float minY = y - offsetY;
float maxY = y + offsetY;
if (yCompare > minY && yCompare < maxY && height > minHeight && height < maxHeight) {
return true;
}
return false;
}
@ -204,5 +229,4 @@ public class RedactionLogCreatorService {
return dictionaryService.isRecommendation(type, dossierTemplateId);
}
}

View File

@ -1,30 +1,27 @@
package com.iqser.red.service.redaction.v1.server;
import com.amazonaws.services.s3.AmazonS3;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.iqser.red.service.persistence.service.v1.api.model.annotations.Rectangle;
import com.iqser.red.service.persistence.service.v1.api.model.annotations.*;
import com.iqser.red.service.persistence.service.v1.api.model.common.JSONPrimitive;
import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.configuration.Colors;
import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.dossier.file.FileType;
import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.type.DictionaryEntry;
import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.type.Type;
import com.iqser.red.service.redaction.v1.model.*;
import com.iqser.red.service.redaction.v1.server.classification.model.SectionText;
import com.iqser.red.service.redaction.v1.server.client.DictionaryClient;
import com.iqser.red.service.redaction.v1.server.client.ImageClassificationClient;
import com.iqser.red.service.redaction.v1.server.client.LegalBasisClient;
import com.iqser.red.service.redaction.v1.server.client.RulesClient;
import com.iqser.red.service.redaction.v1.server.controller.RedactionController;
import com.iqser.red.service.redaction.v1.server.memory.MemoryStats;
import com.iqser.red.service.redaction.v1.server.redaction.service.AnalyzeService;
import com.iqser.red.service.redaction.v1.server.redaction.utils.ResourceLoader;
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService;
import com.iqser.red.storage.commons.StorageAutoConfiguration;
import com.iqser.red.storage.commons.service.StorageService;
import static org.assertj.core.api.Assertions.assertThat;
import static org.mockito.Mockito.when;
import lombok.SneakyThrows;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.time.OffsetDateTime;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.UUID;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
@ -51,15 +48,47 @@ import org.springframework.context.annotation.Primary;
import org.springframework.core.io.ClassPathResource;
import org.springframework.test.context.junit4.SpringRunner;
import java.io.*;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.time.OffsetDateTime;
import java.util.*;
import java.util.stream.Collectors;
import com.amazonaws.services.s3.AmazonS3;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.iqser.red.service.persistence.service.v1.api.model.annotations.AnnotationStatus;
import com.iqser.red.service.persistence.service.v1.api.model.annotations.Comment;
import com.iqser.red.service.persistence.service.v1.api.model.annotations.IdRemoval;
import com.iqser.red.service.persistence.service.v1.api.model.annotations.ManualForceRedaction;
import com.iqser.red.service.persistence.service.v1.api.model.annotations.ManualImageRecategorization;
import com.iqser.red.service.persistence.service.v1.api.model.annotations.ManualLegalBasisChange;
import com.iqser.red.service.persistence.service.v1.api.model.annotations.ManualRedactionEntry;
import com.iqser.red.service.persistence.service.v1.api.model.annotations.ManualRedactions;
import com.iqser.red.service.persistence.service.v1.api.model.annotations.ManualResizeRedaction;
import com.iqser.red.service.persistence.service.v1.api.model.annotations.Rectangle;
import com.iqser.red.service.persistence.service.v1.api.model.common.JSONPrimitive;
import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.configuration.Colors;
import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.dossier.file.FileType;
import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.type.DictionaryEntry;
import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.type.Type;
import com.iqser.red.service.redaction.v1.model.AnalyzeRequest;
import com.iqser.red.service.redaction.v1.model.AnalyzeResult;
import com.iqser.red.service.redaction.v1.model.AnnotateRequest;
import com.iqser.red.service.redaction.v1.model.AnnotateResponse;
import com.iqser.red.service.redaction.v1.model.FileAttribute;
import com.iqser.red.service.redaction.v1.model.RedactionLogEntry;
import com.iqser.red.service.redaction.v1.model.RedactionRequest;
import com.iqser.red.service.redaction.v1.model.RedactionResult;
import com.iqser.red.service.redaction.v1.model.StructureAnalyzeRequest;
import com.iqser.red.service.redaction.v1.server.classification.model.SectionText;
import com.iqser.red.service.redaction.v1.server.client.DictionaryClient;
import com.iqser.red.service.redaction.v1.server.client.ImageClassificationClient;
import com.iqser.red.service.redaction.v1.server.client.LegalBasisClient;
import com.iqser.red.service.redaction.v1.server.client.RulesClient;
import com.iqser.red.service.redaction.v1.server.controller.RedactionController;
import com.iqser.red.service.redaction.v1.server.memory.MemoryStats;
import com.iqser.red.service.redaction.v1.server.redaction.service.AnalyzeService;
import com.iqser.red.service.redaction.v1.server.redaction.utils.ResourceLoader;
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService;
import com.iqser.red.storage.commons.StorageAutoConfiguration;
import com.iqser.red.storage.commons.service.StorageService;
import static org.assertj.core.api.Assertions.assertThat;
import static org.mockito.Mockito.when;
import lombok.SneakyThrows;
@RunWith(SpringRunner.class)
@SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT)
@ -631,29 +660,14 @@ public class RedactionIntegrationTest {
}
private List<File> getPathsRecursively(File path) {
List<File> result = new ArrayList<>();
if (path == null || path.listFiles() == null) {
return result;
}
for (File f : path.listFiles()) {
if (f.isFile()) {
result.add(f);
} else {
result.addAll(getPathsRecursively(f));
}
}
return result;
}
@Test
public void redactionTest() throws IOException {
public void redactionTestSeparatedRedaction() throws IOException {
String fileName = "scanned/VV-380943_page38.pdf";
String outputFileName = getTemporaryDirectory() + "/AnnotatedRedactionTestSeparatedRedaction.pdf";
long start = System.currentTimeMillis();
ClassPathResource pdfFileResource = new ClassPathResource("files/new/VV-919901.pdf");
ClassPathResource pdfFileResource = new ClassPathResource(fileName);
AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream());
request.setExcludedPages(Set.of(1));
@ -744,7 +758,124 @@ public class RedactionIntegrationTest {
.fileId(TEST_FILE_ID)
.build());
try (FileOutputStream fileOutputStream = new FileOutputStream(getTemporaryDirectory() + "/Annotated.pdf")) {
try (FileOutputStream fileOutputStream = new FileOutputStream(outputFileName)) {
fileOutputStream.write(annotateResponse.getDocument());
}
deleted.remove("mouse");
reanlysisVersions.put("mouse", 4L);
when(dictionaryClient.getVersion(TEST_DOSSIER_TEMPLATE_ID)).thenReturn(4L);
when(dictionaryClient.getDictionaryForType(VERTEBRATE)).thenReturn(getDictionaryResponse(VERTEBRATE, false));
analyzeService.reanalyze(request);
redactionLog = redactionStorageService.getRedactionLog(TEST_DOSSIER_ID, TEST_FILE_ID);
System.out.println("hi");
}
@Test
public void redactionTest() throws IOException {
String fileName = "files/new/VV-919901.pdf";
String outputFileName = getTemporaryDirectory() + "/Annotated.pdf";
long start = System.currentTimeMillis();
ClassPathResource pdfFileResource = new ClassPathResource(fileName);
AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream());
request.setExcludedPages(Set.of(1));
request.setFileAttributes(List.of(FileAttribute.builder()
.id("fileAttributeId")
.label("Vertebrate Study")
.placeholder("{fileattributes.vertebrateStudy}")
.value("true")
.build()));
analyzeService.analyzeDocumentStructure(new StructureAnalyzeRequest(request.getDossierId(), request.getFileId()));
AnalyzeResult result = analyzeService.analyze(request);
var redactionLog = redactionStorageService.getRedactionLog(TEST_DOSSIER_ID, TEST_FILE_ID);
var text = redactionStorageService.getText(TEST_DOSSIER_ID, TEST_FILE_ID);
long end = System.currentTimeMillis();
System.out.println("first analysis duration: " + (end - start));
try (FileOutputStream fileOutputStream = new FileOutputStream(getTemporaryDirectory() + "/Test.json")) {
fileOutputStream.write(objectMapper.writeValueAsBytes(redactionStorageService.getText(TEST_DOSSIER_ID, TEST_FILE_ID)));
}
int correctFound = 0;
loop:
for (RedactionLogEntry redactionLogEntry : redactionLog.getRedactionLogEntry()) {
for (SectionText sectionText : text.getSectionTexts()) {
if (redactionLogEntry.isImage()) {
correctFound++;
continue loop;
}
if (redactionLogEntry.getSectionNumber() == sectionText.getSectionNumber()) {
String value = sectionText.getText()
.substring(redactionLogEntry.getStartOffset(), redactionLogEntry.getEndOffset());
if (redactionLogEntry.getValue().equalsIgnoreCase(value)) {
correctFound++;
} else {
throw new RuntimeException("WTF");
}
}
}
}
assertThat(correctFound).isEqualTo(redactionLog.getRedactionLogEntry().size());
dictionary.get(AUTHOR).add("properties");
reanlysisVersions.put("properties", 1L);
dictionary.get(AUTHOR).add("physical");
reanlysisVersions.put("physical", 2L);
deleted.add("David Chubb");
deleted.add("mouse");
dictionary.get(FALSE_POSITIVE).add("David Chubb");
reanlysisVersions.put("David Chubb", 3L);
reanlysisVersions.put("mouse", 3L);
when(dictionaryClient.getVersion(TEST_DOSSIER_TEMPLATE_ID)).thenReturn(3L);
when(dictionaryClient.getDictionaryForType(VERTEBRATE)).thenReturn(getDictionaryResponse(VERTEBRATE, false));
when(dictionaryClient.getDictionaryForType(FALSE_POSITIVE)).thenReturn(getDictionaryResponse(FALSE_POSITIVE, false));
start = System.currentTimeMillis();
ManualRedactions manualRedactions = new ManualRedactions();
manualRedactions.setImageRecategorization(Set.of(ManualImageRecategorization.builder()
.annotationId("37eee3e9d589a5cc529bfec38c3ba479")
.fileId("fileId")
.status(AnnotationStatus.APPROVED)
.type("signature")
.build()));
request.setManualRedactions(manualRedactions);
AnalyzeResult reanalyzeResult = analyzeService.reanalyze(request);
redactionLog = redactionStorageService.getRedactionLog(TEST_DOSSIER_ID, TEST_FILE_ID);
end = System.currentTimeMillis();
System.out.println("reanalysis analysis duration: " + (end - start));
AnnotateResponse annotateResponse = redactionController.annotate(AnnotateRequest.builder()
.dossierId(TEST_DOSSIER_ID)
.fileId(TEST_FILE_ID)
.build());
try (FileOutputStream fileOutputStream = new FileOutputStream(outputFileName)) {
fileOutputStream.write(annotateResponse.getDocument());
}
@ -1193,6 +1324,24 @@ public class RedactionIntegrationTest {
}
private List<File> getPathsRecursively(File path) {
List<File> result = new ArrayList<>();
if (path == null || path.listFiles() == null) {
return result;
}
for (File f : path.listFiles()) {
if (f.isFile()) {
result.add(f);
} else {
result.addAll(getPathsRecursively(f));
}
}
return result;
}
private static String getTemporaryDirectory() {
String tmpdir = System.getProperty("java.io.tmpdir");

View File

@ -1,3 +1,28 @@
AD Hurt
N Pengelly
HA J Napper
E M Roper
Earl M
Weissler M S
Warrinton J S
Kuet SF
Hadeld ST
Butters C A
Hurt AD
Campbell AJ
Runnalls JK
Tummon O J
Chapman PF
Snell RJ
MclIndoe EC
Johnson R I
Richard Andrews
James Paul
Walter Richard Andrews
Wilbur H. Palmer
Jeff Mueller
James McDonelI
Jeffrey S. Heither
1 Braid S.and Tsui G
1 Schwader A.L.
2 Lee MR