diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/nodes/Page.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/nodes/Page.java index b68f0207..0fa334d0 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/nodes/Page.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/nodes/Page.java @@ -56,6 +56,7 @@ public class Page { public TextBlock getMainBodyTextBlock() { return textBlocksOnPage.stream() + .filter(atb -> !atb.isEmpty()) .collect(new TextBlockCollector()); } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/service/document/DocumentGraphMapper.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/service/document/DocumentGraphMapper.java index b4548a42..9319457a 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/service/document/DocumentGraphMapper.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/service/document/DocumentGraphMapper.java @@ -86,6 +86,7 @@ public class DocumentGraphMapper { switch (entryData.getType()) { case HEADER -> pages.forEach(page -> page.setHeader((Header) node)); case FOOTER -> pages.forEach(page -> page.setFooter((Footer) node)); + case IMAGE -> pages.forEach(page -> page.getImages().add((Image) node)); default -> textBlock.getAtomicTextBlocks() .forEach(atb -> atb.getPage().getTextBlocksOnPage().add(atb)); } diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/AnalysisEnd2EndTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/AnalysisEnd2EndTest.java index 6530373b..e22e57c3 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/AnalysisEnd2EndTest.java +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/AnalysisEnd2EndTest.java @@ -19,11 +19,13 @@ import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Objects; import java.util.Optional; import java.util.Set; import java.util.UUID; import java.util.concurrent.atomic.AtomicInteger; import java.util.stream.Collectors; +import java.util.stream.Stream; import java.util.zip.GZIPInputStream; import org.junit.jupiter.api.BeforeEach; @@ -60,7 +62,6 @@ import com.iqser.red.service.redaction.v1.server.service.DictionaryService; import com.iqser.red.service.redaction.v1.server.service.websocket.RedisSyncedWebSocketService; import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService; import com.iqser.red.service.redaction.v1.server.testcontainers.MongoDBTestContainer; -import com.iqser.red.service.redaction.v1.server.utils.exception.NotFoundException; import com.iqser.red.storage.commons.service.StorageService; import com.knecon.fforesight.keycloakcommons.security.TenantAuthenticationManagerResolver; import com.knecon.fforesight.mongo.database.commons.liquibase.TenantMongoLiquibaseExecutor; @@ -85,7 +86,25 @@ import lombok.extern.slf4j.Slf4j; * This way you can recreate what is happening on the stack almost exactly. */ public class AnalysisEnd2EndTest { - Path dossierTemplateToUse = Path.of("/home/kschuettler/iqser/business-logic/documine/cpglobal/Flora SCM (Do Not Edit)"); // Add your dossier-template here + // These files will be uploaded if they are present in the folder + public static final Set ENDINGS_TO_UPLOAD = Set.of(FileType.ORIGIN, + FileType.DOCUMENT_PAGES, + FileType.DOCUMENT_POSITION, + FileType.DOCUMENT_STRUCTURE, + FileType.DOCUMENT_TEXT, + FileType.IMAGE_INFO, + FileType.NER_ENTITIES, + FileType.TABLES, + FileType.IMPORTED_REDACTIONS); + + // These files must be present in the folder or the test will skip the file + public static final Set REQUIRED_FILES = Set.of(FileType.ORIGIN, + FileType.DOCUMENT_PAGES, + FileType.DOCUMENT_POSITION, + FileType.DOCUMENT_STRUCTURE, + FileType.DOCUMENT_TEXT); + + Path dossierTemplateToUse = Path.of("/home/kschuettler/Downloads/mainBodyFailed/DOSSIER_TEMPLATE"); // Add your dossier-template here ObjectMapper mapper = ObjectMapperFactory.create(); final String TENANT_ID = "tenant"; TestDossierTemplate testDossierTemplate; @@ -124,7 +143,7 @@ import lombok.extern.slf4j.Slf4j; @SneakyThrows public void runAnalysisEnd2End() { - String folder = "/home/kschuettler/Dokumente/Ticket Related/RED-9964/17a25133-e098-4610-b553-d1bf11a56d96/560e6ab1ab4754b9a62fd2e6d4d71327"; // Should contain all files from minio directly, still zipped. Can contain multiple files. + String folder = "/home/kschuettler/Downloads/mainBodyFailed/728d0af4-f4c4-4bc9-acf8-7d2632b02962/"; // Should contain all files from minio directly, still zipped. Can contain multiple files. Path absoluteFolderPath; if (folder.startsWith("files")) { // if it starts with "files" it is most likely in the resources folder, else it should be an absolute path @@ -136,11 +155,14 @@ import lombok.extern.slf4j.Slf4j; log.info("Starting end2end analyses for all distinct filenames in folder: {}", folder); List analyzeRequests = prepareStorageForFolder(absoluteFolderPath); - log.info("Found {} distinct fileIds", analyzeRequests.size()); + log.info("Found {} distinct fileIds with all required files", analyzeRequests.size()); for (int i = 0; i < analyzeRequests.size(); i++) { AnalyzeRequest analyzeRequest = analyzeRequests.get(i); + log.info("----------------------------------------------------------------------------------"); log.info("{}/{}: Starting analysis for file {}", i + 1, analyzeRequests.size(), analyzeRequest.getFileId()); analyzeService.analyze(analyzeRequest); + log.info("----------------------------------------------------------------------------------"); + log.info(""); } } @@ -191,22 +213,36 @@ import lombok.extern.slf4j.Slf4j; @SneakyThrows private List prepareStorageForFolder(Path folder) { - return Files.list(folder) - .map(this::parseFileId) - .distinct() + return findOriginFiles(folder).stream() .map(fileId -> prepareStorageForFile(fileId, folder)) + .filter(Optional::isPresent) + .map(Optional::get) .toList(); } - private String parseFileId(Path path) { + private Set findOriginFiles(Path folder) throws IOException { - return path.getFileName().toString().split("\\.")[0]; + return Files.walk(folder) + .map(this::parseFileName) + .filter(Objects::nonNull) + .collect(Collectors.toSet()); + } + + + private String parseFileName(Path path) { + + String suffix = ".ORIGIN.pdf"; + if (!path.getFileName().toString().endsWith(suffix)) { + return null; + } + + return path.getFileName().toString().replace(suffix, ""); } @SneakyThrows - private AnalyzeRequest prepareStorageForFile(String fileId, Path folder) { + private Optional prepareStorageForFile(String fileName, Path folder) { AnalyzeRequest request = new AnalyzeRequest(); request.setDossierId(UUID.randomUUID().toString()); @@ -214,45 +250,38 @@ import lombok.extern.slf4j.Slf4j; request.setDossierTemplateId(testDossierTemplate.id); request.setAnalysisNumber(-1); - Path manualRedactionFile = folder.resolve(fileId + ".MANUAL_REDACTIONS.json"); + Path manualRedactionFile = folder.resolve(fileName + ".MANUAL_REDACTIONS.json"); if (Files.exists(manualRedactionFile)) { request.setManualRedactions(parseManualRedactions(manualRedactionFile)); } else { request.setManualRedactions(new ManualRedactions()); } - Set endingsToUpload = Set.of("ORIGIN", - "DOCUMENT_PAGES", - "DOCUMENT_POSITION", - "DOCUMENT_STRUCTURE", - "DOCUMENT_TEXT", - "IMAGE_INFO", - "NER_ENTITIES", - "TABLES", - "IMPORTED_REDACTIONS") - .stream() - .map(FileType::valueOf) - .collect(Collectors.toSet()); - - Set uploadedFileTypes = Files.walk(folder) - .filter(path -> path.toFile().isFile()) - .filter(path -> parseFileTypeFromPath(path).map(endingsToUpload::contains) - .orElse(false)) - .map(filePath -> uploadFile(filePath, request)) - .filter(Optional::isPresent) - .map(Optional::get) + Set uploadedFileTypes = findFilesToUpload(fileName, folder, ENDINGS_TO_UPLOAD).map(filePath -> uploadFile(filePath, request)) + .map(FileToUpload::fileType) .collect(Collectors.toUnmodifiableSet()); - Set missingFileTypes = Sets.difference(endingsToUpload, uploadedFileTypes); + Set missingFileTypes = Sets.difference(REQUIRED_FILES, uploadedFileTypes); + if (!missingFileTypes.isEmpty()) { log.error("Folder {} is missing files of type {}", folder.toFile(), missingFileTypes.stream() .map(Enum::toString) .collect(Collectors.joining(", "))); - throw new NotFoundException("Not all required file types are present."); + return Optional.empty(); } - return request; + return Optional.of(request); + } + + + private static Stream findFilesToUpload(String fileName, Path folder, Set endingsToUpload) throws IOException { + + return Files.walk(folder) + .filter(path -> path.toFile().isFile()) + .map(path -> parseFileTypeFromPath(path, fileName, endingsToUpload)) + .filter(Optional::isPresent) + .map(Optional::get); } @@ -267,11 +296,19 @@ import lombok.extern.slf4j.Slf4j; } - private static Optional parseFileTypeFromPath(Path path) { + private static Optional parseFileTypeFromPath(Path path, String fileName, Set endingsToUpload) { + + if (!path.getFileName().toString().startsWith(fileName)) { + return Optional.empty(); + } - String fileType = path.getFileName().toString().split("\\.")[1]; try { - return Optional.of(FileType.valueOf(fileType)); + String fileTypeString = path.getFileName().toString().split("\\.")[1]; + FileType fileType = FileType.valueOf(fileTypeString); + if (!endingsToUpload.contains(fileType)) { + return Optional.empty(); + } + return Optional.of(new FileToUpload(path, fileType)); } catch (IllegalArgumentException e) { return Optional.empty(); } @@ -279,26 +316,26 @@ import lombok.extern.slf4j.Slf4j; @SneakyThrows - private Optional uploadFile(Path path, AnalyzeRequest request) { + private FileToUpload uploadFile(FileToUpload fileToUpload, AnalyzeRequest request) { - Optional fileType = parseFileTypeFromPath(path); - if (fileType.isEmpty()) { - return Optional.empty(); - } - if (path.getFileName().toString().endsWith(".gz")) { - try (var fis = new FileInputStream(path.toFile()); var in = new GZIPInputStream(fis);) { - storageService.storeObject(TENANT_ID, RedactionStorageService.StorageIdUtils.getStorageId(request.getDossierId(), request.getFileId(), fileType.get()), in); + if (fileToUpload.path().getFileName().toString().endsWith(".gz")) { + try (var fis = new FileInputStream(fileToUpload.path().toFile()); var in = new GZIPInputStream(fis);) { + storageService.storeObject(TENANT_ID, + RedactionStorageService.StorageIdUtils.getStorageId(request.getDossierId(), request.getFileId(), fileToUpload.fileType()), + in); } } else { - try (var in = new FileInputStream(path.toFile())) { - storageService.storeObject(TENANT_ID, RedactionStorageService.StorageIdUtils.getStorageId(request.getDossierId(), request.getFileId(), fileType.get()), in); + try (var in = new FileInputStream(fileToUpload.path().toFile())) { + storageService.storeObject(TENANT_ID, + RedactionStorageService.StorageIdUtils.getStorageId(request.getDossierId(), request.getFileId(), fileToUpload.fileType()), + in); } } - return fileType; + return fileToUpload; } - private class TestDossierTemplate { + public class TestDossierTemplate { String id; Dictionary testDictionary; @@ -398,4 +435,8 @@ import lombok.extern.slf4j.Slf4j; } + private record FileToUpload(Path path, FileType fileType) { + + } + }