RED-9964: fix errors with images

This commit is contained in:
Kilian Schüttler 2024-09-04 09:30:33 +02:00
parent 895bc56590
commit cd2bda15aa
3 changed files with 92 additions and 49 deletions

View File

@ -56,6 +56,7 @@ public class Page {
/**
 * Builds one text block out of the text blocks registered on this page.
 * Entries of {@code textBlocksOnPage} that report themselves as empty are left out
 * before the remaining ones are handed to a {@link TextBlockCollector}.
 *
 * @return the text block produced by the collector from all non-empty entries
 */
public TextBlock getMainBodyTextBlock() {

    var nonEmptyBlocks = textBlocksOnPage.stream()
            .filter(block -> !block.isEmpty());
    return nonEmptyBlocks.collect(new TextBlockCollector());
}

View File

@ -86,6 +86,7 @@ public class DocumentGraphMapper {
switch (entryData.getType()) {
case HEADER -> pages.forEach(page -> page.setHeader((Header) node));
case FOOTER -> pages.forEach(page -> page.setFooter((Footer) node));
case IMAGE -> pages.forEach(page -> page.getImages().add((Image) node));
default -> textBlock.getAtomicTextBlocks()
.forEach(atb -> atb.getPage().getTextBlocksOnPage().add(atb));
}

View File

@ -19,11 +19,13 @@ import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.Set;
import java.util.UUID;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import java.util.zip.GZIPInputStream;
import org.junit.jupiter.api.BeforeEach;
@ -60,7 +62,6 @@ import com.iqser.red.service.redaction.v1.server.service.DictionaryService;
import com.iqser.red.service.redaction.v1.server.service.websocket.RedisSyncedWebSocketService;
import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService;
import com.iqser.red.service.redaction.v1.server.testcontainers.MongoDBTestContainer;
import com.iqser.red.service.redaction.v1.server.utils.exception.NotFoundException;
import com.iqser.red.storage.commons.service.StorageService;
import com.knecon.fforesight.keycloakcommons.security.TenantAuthenticationManagerResolver;
import com.knecon.fforesight.mongo.database.commons.liquibase.TenantMongoLiquibaseExecutor;
@ -85,7 +86,25 @@ import lombok.extern.slf4j.Slf4j;
* This way you can recreate what is happening on the stack almost exactly.
*/ public class AnalysisEnd2EndTest {
Path dossierTemplateToUse = Path.of("/home/kschuettler/iqser/business-logic/documine/cpglobal/Flora SCM (Do Not Edit)"); // Add your dossier-template here
// These files will be uploaded if they are present in the folder
public static final Set<FileType> ENDINGS_TO_UPLOAD = Set.of(FileType.ORIGIN,
FileType.DOCUMENT_PAGES,
FileType.DOCUMENT_POSITION,
FileType.DOCUMENT_STRUCTURE,
FileType.DOCUMENT_TEXT,
FileType.IMAGE_INFO,
FileType.NER_ENTITIES,
FileType.TABLES,
FileType.IMPORTED_REDACTIONS);
// These files must be present in the folder or the test will skip the file
public static final Set<FileType> REQUIRED_FILES = Set.of(FileType.ORIGIN,
FileType.DOCUMENT_PAGES,
FileType.DOCUMENT_POSITION,
FileType.DOCUMENT_STRUCTURE,
FileType.DOCUMENT_TEXT);
Path dossierTemplateToUse = Path.of("/home/kschuettler/Downloads/mainBodyFailed/DOSSIER_TEMPLATE"); // Add your dossier-template here
ObjectMapper mapper = ObjectMapperFactory.create();
final String TENANT_ID = "tenant";
TestDossierTemplate testDossierTemplate;
@ -124,7 +143,7 @@ import lombok.extern.slf4j.Slf4j;
@SneakyThrows
public void runAnalysisEnd2End() {
String folder = "/home/kschuettler/Dokumente/Ticket Related/RED-9964/17a25133-e098-4610-b553-d1bf11a56d96/560e6ab1ab4754b9a62fd2e6d4d71327"; // Should contain all files from minio directly, still zipped. Can contain multiple files.
String folder = "/home/kschuettler/Downloads/mainBodyFailed/728d0af4-f4c4-4bc9-acf8-7d2632b02962/"; // Should contain all files from minio directly, still zipped. Can contain multiple files.
Path absoluteFolderPath;
if (folder.startsWith("files")) { // if it starts with "files" it is most likely in the resources folder, else it should be an absolute path
@ -136,11 +155,14 @@ import lombok.extern.slf4j.Slf4j;
log.info("Starting end2end analyses for all distinct filenames in folder: {}", folder);
List<AnalyzeRequest> analyzeRequests = prepareStorageForFolder(absoluteFolderPath);
log.info("Found {} distinct fileIds", analyzeRequests.size());
log.info("Found {} distinct fileIds with all required files", analyzeRequests.size());
for (int i = 0; i < analyzeRequests.size(); i++) {
AnalyzeRequest analyzeRequest = analyzeRequests.get(i);
log.info("----------------------------------------------------------------------------------");
log.info("{}/{}: Starting analysis for file {}", i + 1, analyzeRequests.size(), analyzeRequest.getFileId());
analyzeService.analyze(analyzeRequest);
log.info("----------------------------------------------------------------------------------");
log.info("");
}
}
@ -191,22 +213,36 @@ import lombok.extern.slf4j.Slf4j;
@SneakyThrows
private List<AnalyzeRequest> prepareStorageForFolder(Path folder) {
return Files.list(folder)
.map(this::parseFileId)
.distinct()
return findOriginFiles(folder).stream()
.map(fileId -> prepareStorageForFile(fileId, folder))
.filter(Optional::isPresent)
.map(Optional::get)
.toList();
}
private String parseFileId(Path path) {
private Set<String> findOriginFiles(Path folder) throws IOException {
return path.getFileName().toString().split("\\.")[0];
return Files.walk(folder)
.map(this::parseFileName)
.filter(Objects::nonNull)
.collect(Collectors.toSet());
}
/**
 * Derives the file name prefix from the path of an origin file.
 *
 * @param path a path encountered while walking the input folder
 * @return the last path element with the trailing {@code .ORIGIN.pdf} removed, or {@code null}
 *         if the path has no name element or its name does not end with that suffix
 */
private String parseFileName(Path path) {

    String suffix = ".ORIGIN.pdf";
    Path lastElement = path.getFileName();
    if (lastElement == null) {
        // a filesystem root has no name element, so it can never be an origin file
        return null;
    }
    String name = lastElement.toString();
    if (!name.endsWith(suffix)) {
        return null;
    }
    // cut off only the trailing suffix; String.replace would also strip earlier occurrences inside the name
    return name.substring(0, name.length() - suffix.length());
}
@SneakyThrows
private AnalyzeRequest prepareStorageForFile(String fileId, Path folder) {
private Optional<AnalyzeRequest> prepareStorageForFile(String fileName, Path folder) {
AnalyzeRequest request = new AnalyzeRequest();
request.setDossierId(UUID.randomUUID().toString());
@ -214,45 +250,38 @@ import lombok.extern.slf4j.Slf4j;
request.setDossierTemplateId(testDossierTemplate.id);
request.setAnalysisNumber(-1);
Path manualRedactionFile = folder.resolve(fileId + ".MANUAL_REDACTIONS.json");
Path manualRedactionFile = folder.resolve(fileName + ".MANUAL_REDACTIONS.json");
if (Files.exists(manualRedactionFile)) {
request.setManualRedactions(parseManualRedactions(manualRedactionFile));
} else {
request.setManualRedactions(new ManualRedactions());
}
Set<FileType> endingsToUpload = Set.of("ORIGIN",
"DOCUMENT_PAGES",
"DOCUMENT_POSITION",
"DOCUMENT_STRUCTURE",
"DOCUMENT_TEXT",
"IMAGE_INFO",
"NER_ENTITIES",
"TABLES",
"IMPORTED_REDACTIONS")
.stream()
.map(FileType::valueOf)
.collect(Collectors.toSet());
Set<FileType> uploadedFileTypes = Files.walk(folder)
.filter(path -> path.toFile().isFile())
.filter(path -> parseFileTypeFromPath(path).map(endingsToUpload::contains)
.orElse(false))
.map(filePath -> uploadFile(filePath, request))
.filter(Optional::isPresent)
.map(Optional::get)
Set<FileType> uploadedFileTypes = findFilesToUpload(fileName, folder, ENDINGS_TO_UPLOAD).map(filePath -> uploadFile(filePath, request))
.map(FileToUpload::fileType)
.collect(Collectors.toUnmodifiableSet());
Set<FileType> missingFileTypes = Sets.difference(endingsToUpload, uploadedFileTypes);
Set<FileType> missingFileTypes = Sets.difference(REQUIRED_FILES, uploadedFileTypes);
if (!missingFileTypes.isEmpty()) {
log.error("Folder {} is missing files of type {}",
folder.toFile(),
missingFileTypes.stream()
.map(Enum::toString)
.collect(Collectors.joining(", ")));
throw new NotFoundException("Not all required file types are present.");
return Optional.empty();
}
return request;
return Optional.of(request);
}
/**
 * Walks the folder and returns every regular file that {@code parseFileTypeFromPath} accepts
 * for the given file name and set of file types.
 *
 * @param fileName        file name prefix passed on to {@code parseFileTypeFromPath}
 * @param folder          root of the directory tree to walk
 * @param endingsToUpload file types passed on to {@code parseFileTypeFromPath}
 * @return a stream over the accepted files; the directory walk has already finished when this method returns
 * @throws IOException if the folder cannot be walked
 */
private static Stream<FileToUpload> findFilesToUpload(String fileName, Path folder, Set<FileType> endingsToUpload) throws IOException {

    // Files.walk keeps directory handles open until the stream is closed, so it is consumed
    // inside try-with-resources and the caller receives a stream over the collected matches.
    try (Stream<Path> paths = Files.walk(folder)) {
        List<FileToUpload> matches = paths.filter(Files::isRegularFile)
                .map(path -> parseFileTypeFromPath(path, fileName, endingsToUpload))
                .flatMap(Optional::stream)
                .toList();
        return matches.stream();
    }
}
@ -267,11 +296,19 @@ import lombok.extern.slf4j.Slf4j;
}
private static Optional<FileType> parseFileTypeFromPath(Path path) {
private static Optional<FileToUpload> parseFileTypeFromPath(Path path, String fileName, Set<FileType> endingsToUpload) {
if (!path.getFileName().toString().startsWith(fileName)) {
return Optional.empty();
}
String fileType = path.getFileName().toString().split("\\.")[1];
try {
return Optional.of(FileType.valueOf(fileType));
String fileTypeString = path.getFileName().toString().split("\\.")[1];
FileType fileType = FileType.valueOf(fileTypeString);
if (!endingsToUpload.contains(fileType)) {
return Optional.empty();
}
return Optional.of(new FileToUpload(path, fileType));
} catch (IllegalArgumentException e) {
return Optional.empty();
}
@ -279,26 +316,26 @@ import lombok.extern.slf4j.Slf4j;
@SneakyThrows
private Optional<FileType> uploadFile(Path path, AnalyzeRequest request) {
private FileToUpload uploadFile(FileToUpload fileToUpload, AnalyzeRequest request) {
Optional<FileType> fileType = parseFileTypeFromPath(path);
if (fileType.isEmpty()) {
return Optional.empty();
}
if (path.getFileName().toString().endsWith(".gz")) {
try (var fis = new FileInputStream(path.toFile()); var in = new GZIPInputStream(fis);) {
storageService.storeObject(TENANT_ID, RedactionStorageService.StorageIdUtils.getStorageId(request.getDossierId(), request.getFileId(), fileType.get()), in);
if (fileToUpload.path().getFileName().toString().endsWith(".gz")) {
try (var fis = new FileInputStream(fileToUpload.path().toFile()); var in = new GZIPInputStream(fis);) {
storageService.storeObject(TENANT_ID,
RedactionStorageService.StorageIdUtils.getStorageId(request.getDossierId(), request.getFileId(), fileToUpload.fileType()),
in);
}
} else {
try (var in = new FileInputStream(path.toFile())) {
storageService.storeObject(TENANT_ID, RedactionStorageService.StorageIdUtils.getStorageId(request.getDossierId(), request.getFileId(), fileType.get()), in);
try (var in = new FileInputStream(fileToUpload.path().toFile())) {
storageService.storeObject(TENANT_ID,
RedactionStorageService.StorageIdUtils.getStorageId(request.getDossierId(), request.getFileId(), fileToUpload.fileType()),
in);
}
}
return fileType;
return fileToUpload;
}
private class TestDossierTemplate {
public class TestDossierTemplate {
String id;
Dictionary testDictionary;
@ -398,4 +435,8 @@ import lombok.extern.slf4j.Slf4j;
}
/**
 * Pairs the location of a file on disk with the file type it is stored under.
 *
 * @param path     path of the file to read
 * @param fileType file type used when building the storage id for the upload
 */
private record FileToUpload(Path path, FileType fileType) {
}
}