diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/RedactionLogCreatorService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/RedactionLogCreatorService.java index 7754cf8b..3f0d9580 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/RedactionLogCreatorService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/RedactionLogCreatorService.java @@ -1,5 +1,15 @@ package com.iqser.red.service.redaction.v1.server.redaction.service; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; + +import org.apache.commons.collections4.CollectionUtils; +import org.springframework.stereotype.Service; + import com.iqser.red.service.redaction.v1.model.Point; import com.iqser.red.service.redaction.v1.model.Rectangle; import com.iqser.red.service.redaction.v1.model.RedactionLogEntry; @@ -10,12 +20,8 @@ import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionS import com.iqser.red.service.redaction.v1.server.redaction.model.Image; import com.iqser.red.service.redaction.v1.server.redaction.model.PageEntities; import com.iqser.red.service.redaction.v1.server.redaction.utils.IdBuilder; -import lombok.RequiredArgsConstructor; -import org.apache.commons.collections4.CollectionUtils; -import org.springframework.stereotype.Service; -import java.util.*; -import java.util.stream.Collectors; +import lombok.RequiredArgsConstructor; @Service @RequiredArgsConstructor @@ -24,7 +30,8 @@ public class RedactionLogCreatorService { private final DictionaryService dictionaryService; - public List createRedactionLog(PageEntities pageEntities, int numberOfPages, String dossierTemplateId) { + public List createRedactionLog(PageEntities pageEntities, int numberOfPages, + String dossierTemplateId) { List entries = new ArrayList<>(); @@ -130,13 +137,19 @@ public class RedactionLogCreatorService { rectangles.add(TextPositionSequence.fromData(textPositions, page).getRectangle()); } else { float y = textPositions.get(0).getYDirAdj(); + float height = textPositions.get(0).getHeightDir(); int startIndex = 0; + for (int i = 1; i < textPositions.size(); i++) { float yDirAdj = textPositions.get(i).getYDirAdj(); - if (round(yDirAdj,3) != round(y, 3)) { + float heightDir = textPositions.get(i).getHeightDir(); + + if (!isCharInSameLine(y, yDirAdj, height, heightDir)) { + rectangles.add(TextPositionSequence.fromData(textPositions.subList(startIndex, i), page) .getRectangle()); y = yDirAdj; + height = heightDir; startIndex = i; } } @@ -149,9 +162,21 @@ public class RedactionLogCreatorService { return rectangles; } - private double round(float value, int decimalPoints) { - var d = Math.pow(10, decimalPoints); - return Math.round(value * d) / d; + + private boolean isCharInSameLine(float y, float yCompare, float height, float heightCompare) { + + float offsetHeight = heightCompare / 5; + float minHeight = heightCompare - offsetHeight; + float maxHeight = heightCompare + offsetHeight; + + float offsetY = heightCompare / 22; + float minY = y - offsetY; + float maxY = y + offsetY; + + if (yCompare > minY && yCompare < maxY && height > minHeight && height < maxHeight) { + return true; + } + return false; } @@ -204,5 +229,4 @@ public class RedactionLogCreatorService { return dictionaryService.isRecommendation(type, dossierTemplateId); } - } diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java index 50333224..6ab19860 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java @@ -1,30 +1,27 @@ package com.iqser.red.service.redaction.v1.server; -import com.amazonaws.services.s3.AmazonS3; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.iqser.red.service.persistence.service.v1.api.model.annotations.Rectangle; -import com.iqser.red.service.persistence.service.v1.api.model.annotations.*; -import com.iqser.red.service.persistence.service.v1.api.model.common.JSONPrimitive; -import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.configuration.Colors; -import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.dossier.file.FileType; -import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.type.DictionaryEntry; -import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.type.Type; -import com.iqser.red.service.redaction.v1.model.*; -import com.iqser.red.service.redaction.v1.server.classification.model.SectionText; -import com.iqser.red.service.redaction.v1.server.client.DictionaryClient; -import com.iqser.red.service.redaction.v1.server.client.ImageClassificationClient; -import com.iqser.red.service.redaction.v1.server.client.LegalBasisClient; -import com.iqser.red.service.redaction.v1.server.client.RulesClient; -import com.iqser.red.service.redaction.v1.server.controller.RedactionController; -import com.iqser.red.service.redaction.v1.server.memory.MemoryStats; -import com.iqser.red.service.redaction.v1.server.redaction.service.AnalyzeService; -import com.iqser.red.service.redaction.v1.server.redaction.utils.ResourceLoader; -import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities; -import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService; -import com.iqser.red.storage.commons.StorageAutoConfiguration; -import com.iqser.red.storage.commons.service.StorageService; +import static org.assertj.core.api.Assertions.assertThat; +import static org.mockito.Mockito.when; -import lombok.SneakyThrows; +import java.io.BufferedReader; +import java.io.ByteArrayInputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.net.URL; +import java.nio.charset.StandardCharsets; +import java.time.OffsetDateTime; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.UUID; +import java.util.stream.Collectors; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; @@ -51,15 +48,47 @@ import org.springframework.context.annotation.Primary; import org.springframework.core.io.ClassPathResource; import org.springframework.test.context.junit4.SpringRunner; -import java.io.*; -import java.net.URL; -import java.nio.charset.StandardCharsets; -import java.time.OffsetDateTime; -import java.util.*; -import java.util.stream.Collectors; +import com.amazonaws.services.s3.AmazonS3; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.iqser.red.service.persistence.service.v1.api.model.annotations.AnnotationStatus; +import com.iqser.red.service.persistence.service.v1.api.model.annotations.Comment; +import com.iqser.red.service.persistence.service.v1.api.model.annotations.IdRemoval; +import com.iqser.red.service.persistence.service.v1.api.model.annotations.ManualForceRedaction; +import com.iqser.red.service.persistence.service.v1.api.model.annotations.ManualImageRecategorization; +import com.iqser.red.service.persistence.service.v1.api.model.annotations.ManualLegalBasisChange; +import com.iqser.red.service.persistence.service.v1.api.model.annotations.ManualRedactionEntry; +import com.iqser.red.service.persistence.service.v1.api.model.annotations.ManualRedactions; +import com.iqser.red.service.persistence.service.v1.api.model.annotations.ManualResizeRedaction; +import com.iqser.red.service.persistence.service.v1.api.model.annotations.Rectangle; +import com.iqser.red.service.persistence.service.v1.api.model.common.JSONPrimitive; +import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.configuration.Colors; +import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.dossier.file.FileType; +import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.type.DictionaryEntry; +import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.type.Type; +import com.iqser.red.service.redaction.v1.model.AnalyzeRequest; +import com.iqser.red.service.redaction.v1.model.AnalyzeResult; +import com.iqser.red.service.redaction.v1.model.AnnotateRequest; +import com.iqser.red.service.redaction.v1.model.AnnotateResponse; +import com.iqser.red.service.redaction.v1.model.FileAttribute; +import com.iqser.red.service.redaction.v1.model.RedactionLogEntry; +import com.iqser.red.service.redaction.v1.model.RedactionRequest; +import com.iqser.red.service.redaction.v1.model.RedactionResult; +import com.iqser.red.service.redaction.v1.model.StructureAnalyzeRequest; +import com.iqser.red.service.redaction.v1.server.classification.model.SectionText; +import com.iqser.red.service.redaction.v1.server.client.DictionaryClient; +import com.iqser.red.service.redaction.v1.server.client.ImageClassificationClient; +import com.iqser.red.service.redaction.v1.server.client.LegalBasisClient; +import com.iqser.red.service.redaction.v1.server.client.RulesClient; +import com.iqser.red.service.redaction.v1.server.controller.RedactionController; +import com.iqser.red.service.redaction.v1.server.memory.MemoryStats; +import com.iqser.red.service.redaction.v1.server.redaction.service.AnalyzeService; +import com.iqser.red.service.redaction.v1.server.redaction.utils.ResourceLoader; +import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities; +import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService; +import com.iqser.red.storage.commons.StorageAutoConfiguration; +import com.iqser.red.storage.commons.service.StorageService; -import static org.assertj.core.api.Assertions.assertThat; -import static org.mockito.Mockito.when; +import lombok.SneakyThrows; @RunWith(SpringRunner.class) @SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT) @@ -631,29 +660,14 @@ public class RedactionIntegrationTest { } - private List getPathsRecursively(File path) { - - List result = new ArrayList<>(); - if (path == null || path.listFiles() == null) { - return result; - } - for (File f : path.listFiles()) { - if (f.isFile()) { - result.add(f); - } else { - result.addAll(getPathsRecursively(f)); - } - } - return result; - - } - - @Test - public void redactionTest() throws IOException { + public void redactionTestSeparatedRedaction() throws IOException { + + String fileName = "scanned/VV-380943_page38.pdf"; + String outputFileName = getTemporaryDirectory() + "/AnnotatedRedactionTestSeparatedRedaction.pdf"; long start = System.currentTimeMillis(); - ClassPathResource pdfFileResource = new ClassPathResource("files/new/VV-919901.pdf"); + ClassPathResource pdfFileResource = new ClassPathResource(fileName); AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream()); request.setExcludedPages(Set.of(1)); @@ -744,7 +758,124 @@ public class RedactionIntegrationTest { .fileId(TEST_FILE_ID) .build()); - try (FileOutputStream fileOutputStream = new FileOutputStream(getTemporaryDirectory() + "/Annotated.pdf")) { + try (FileOutputStream fileOutputStream = new FileOutputStream(outputFileName)) { + fileOutputStream.write(annotateResponse.getDocument()); + } + + deleted.remove("mouse"); + reanlysisVersions.put("mouse", 4L); + + when(dictionaryClient.getVersion(TEST_DOSSIER_TEMPLATE_ID)).thenReturn(4L); + + when(dictionaryClient.getDictionaryForType(VERTEBRATE)).thenReturn(getDictionaryResponse(VERTEBRATE, false)); + + analyzeService.reanalyze(request); + + redactionLog = redactionStorageService.getRedactionLog(TEST_DOSSIER_ID, TEST_FILE_ID); + + System.out.println("hi"); + } + + + @Test + public void redactionTest() throws IOException { + + String fileName = "files/new/VV-919901.pdf"; + String outputFileName = getTemporaryDirectory() + "/Annotated.pdf"; + + long start = System.currentTimeMillis(); + ClassPathResource pdfFileResource = new ClassPathResource(fileName); + AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream()); + request.setExcludedPages(Set.of(1)); + + request.setFileAttributes(List.of(FileAttribute.builder() + .id("fileAttributeId") + .label("Vertebrate Study") + .placeholder("{fileattributes.vertebrateStudy}") + .value("true") + .build())); + + analyzeService.analyzeDocumentStructure(new StructureAnalyzeRequest(request.getDossierId(), request.getFileId())); + AnalyzeResult result = analyzeService.analyze(request); + + var redactionLog = redactionStorageService.getRedactionLog(TEST_DOSSIER_ID, TEST_FILE_ID); + var text = redactionStorageService.getText(TEST_DOSSIER_ID, TEST_FILE_ID); + + long end = System.currentTimeMillis(); + + System.out.println("first analysis duration: " + (end - start)); + + try (FileOutputStream fileOutputStream = new FileOutputStream(getTemporaryDirectory() + "/Test.json")) { + fileOutputStream.write(objectMapper.writeValueAsBytes(redactionStorageService.getText(TEST_DOSSIER_ID, TEST_FILE_ID))); + } + + int correctFound = 0; + loop: + for (RedactionLogEntry redactionLogEntry : redactionLog.getRedactionLogEntry()) { + for (SectionText sectionText : text.getSectionTexts()) { + if (redactionLogEntry.isImage()) { + correctFound++; + continue loop; + } + if (redactionLogEntry.getSectionNumber() == sectionText.getSectionNumber()) { + String value = sectionText.getText() + .substring(redactionLogEntry.getStartOffset(), redactionLogEntry.getEndOffset()); + if (redactionLogEntry.getValue().equalsIgnoreCase(value)) { + correctFound++; + } else { + throw new RuntimeException("WTF"); + } + } + } + } + assertThat(correctFound).isEqualTo(redactionLog.getRedactionLogEntry().size()); + + dictionary.get(AUTHOR).add("properties"); + reanlysisVersions.put("properties", 1L); + + dictionary.get(AUTHOR).add("physical"); + reanlysisVersions.put("physical", 2L); + + deleted.add("David Chubb"); + deleted.add("mouse"); + + dictionary.get(FALSE_POSITIVE).add("David Chubb"); + reanlysisVersions.put("David Chubb", 3L); + + reanlysisVersions.put("mouse", 3L); + + when(dictionaryClient.getVersion(TEST_DOSSIER_TEMPLATE_ID)).thenReturn(3L); + + when(dictionaryClient.getDictionaryForType(VERTEBRATE)).thenReturn(getDictionaryResponse(VERTEBRATE, false)); + + when(dictionaryClient.getDictionaryForType(FALSE_POSITIVE)).thenReturn(getDictionaryResponse(FALSE_POSITIVE, false)); + + start = System.currentTimeMillis(); + + ManualRedactions manualRedactions = new ManualRedactions(); + + manualRedactions.setImageRecategorization(Set.of(ManualImageRecategorization.builder() + .annotationId("37eee3e9d589a5cc529bfec38c3ba479") + .fileId("fileId") + .status(AnnotationStatus.APPROVED) + .type("signature") + .build())); + + request.setManualRedactions(manualRedactions); + + AnalyzeResult reanalyzeResult = analyzeService.reanalyze(request); + + redactionLog = redactionStorageService.getRedactionLog(TEST_DOSSIER_ID, TEST_FILE_ID); + + end = System.currentTimeMillis(); + System.out.println("reanalysis analysis duration: " + (end - start)); + + AnnotateResponse annotateResponse = redactionController.annotate(AnnotateRequest.builder() + .dossierId(TEST_DOSSIER_ID) + .fileId(TEST_FILE_ID) + .build()); + + try (FileOutputStream fileOutputStream = new FileOutputStream(outputFileName)) { fileOutputStream.write(annotateResponse.getDocument()); } @@ -1193,6 +1324,24 @@ public class RedactionIntegrationTest { } + private List getPathsRecursively(File path) { + + List result = new ArrayList<>(); + if (path == null || path.listFiles() == null) { + return result; + } + for (File f : path.listFiles()) { + if (f.isFile()) { + result.add(f); + } else { + result.addAll(getPathsRecursively(f)); + } + } + return result; + + } + + private static String getTemporaryDirectory() { String tmpdir = System.getProperty("java.io.tmpdir"); diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/dictionaries/CBI_author.txt b/redaction-service-v1/redaction-service-server-v1/src/test/resources/dictionaries/CBI_author.txt index 3d53e6d2..05f5dc2c 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/resources/dictionaries/CBI_author.txt +++ b/redaction-service-v1/redaction-service-server-v1/src/test/resources/dictionaries/CBI_author.txt @@ -1,3 +1,28 @@ +AD Hurt +N Pengelly +HA J Napper +E M Roper +Earl M +Weissler M S +Warrinton J S +Kuet SF +Hadeld ST +Butters C A +Hurt AD +Campbell AJ +Runnalls JK +Tummon O J +Chapman PF +Snell RJ +MclIndoe EC +Johnson R I +Richard Andrews +James Paul +Walter Richard Andrews +Wilbur H. Palmer +Jeff Mueller +James McDonelI +Jeffrey S. Heither 1 Braid S.and Tsui G 1 Schwader A.L. 2 Lee MR diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/scanned/VV-380943_page38.pdf b/redaction-service-v1/redaction-service-server-v1/src/test/resources/scanned/VV-380943_page38.pdf new file mode 100644 index 00000000..98e2f6ae Binary files /dev/null and b/redaction-service-v1/redaction-service-server-v1/src/test/resources/scanned/VV-380943_page38.pdf differ