RED-7375: integrate table extractor #309

Closed
yannik.hampe wants to merge 11 commits from RED-7375 into master
12 changed files with 168 additions and 32 deletions

View File

@ -91,7 +91,7 @@
<dependency>
<groupId>com.knecon.fforesight</groupId>
<artifactId>layoutparser-service-internal-api</artifactId>
<version>0.74.0</version>
<version>yannik-0</version>
</dependency>
<dependency>

View File

@ -59,40 +59,16 @@ public class MessagingConfiguration {
// CV analysis: response queue and dead-letter queue names.
public static final String CV_ANALYSIS_RESPONSE_QUEUE = "cv_analysis_response_queue";
public static final String CV_ANALYSIS_DLQ = "cv_analysis_dead_letter_queue";
// Table extractor: request queue, response queue and the dead-letter queue names.
public static final String TABLE_EXTRACTOR_QUEUE = "table_extractor_request_queue";
public static final String TABLE_EXTRACTOR_RESPONSE_QUEUE = "table_extractor_response_queue";
public static final String TABLE_EXTRACTOR_DLQ = "table_extractor_dead_letter_queue";
// OCR status updates: response queue and its companion queue.
// NOTE(review): "DQL" looks like a transposition of "DLQ"; the constant name and the queue name are left
// untouched because both are visible to other code / the broker — confirm before renaming.
public static final String OCR_STATUS_UPDATE_RESPONSE_QUEUE = "ocr_status_update_response_queue";
public static final String OCR_STATUS_UPDATE_RESPONSE_DQL = "ocr_status_update_response_dql";
// Header names; presumably carry the error text and its timestamp on failed messages — confirm against usages.
public static final String X_ERROR_INFO_HEADER = "x-error-message";
public static final String X_ERROR_INFO_TIMESTAMP_HEADER = "x-error-message-timestamp";
// --- Saas Migration, can be removed later ----
public static final String MIGRATION_QUEUE = "migrationQueue";
public static final String MIGRATION_DLQ = "migrationDLQ";
public static final String MIGRATION_RESPONSE_QUEUE = "migrationResponseQueue";
/**
 * Declares the durable queue named {@link #MIGRATION_QUEUE}.
 * Dead-lettered messages are routed through the default exchange ("") using {@link #MIGRATION_DLQ} as routing key;
 * the queue is declared with a maximum priority of 2.
 *
 * @return the queue definition
 */
@Bean
public Queue migrationQueue() {

    return QueueBuilder.durable(MIGRATION_QUEUE)
            .withArgument("x-dead-letter-exchange", "")
            .withArgument("x-dead-letter-routing-key", MIGRATION_DLQ)
            .maxPriority(2)
            .build();
}
/**
 * Declares the durable queue named {@link #MIGRATION_DLQ}; no further queue arguments are set.
 *
 * @return the queue definition
 */
@Bean
public Queue migrationDLQ() {

    final Queue deadLetterQueue = QueueBuilder.durable(MIGRATION_DLQ).build();
    return deadLetterQueue;
}
/**
 * Declares the durable queue named {@link #MIGRATION_RESPONSE_QUEUE}.
 * It uses the same dead-letter target ({@link #MIGRATION_DLQ} via the default exchange) and the same
 * maximum priority (2) as the migration request queue.
 *
 * @return the queue definition
 */
@Bean
public Queue migrationResponseQueue() {

    return QueueBuilder.durable(MIGRATION_RESPONSE_QUEUE)
            .withArgument("x-dead-letter-exchange", "")
            .withArgument("x-dead-letter-routing-key", MIGRATION_DLQ)
            .maxPriority(2)
            .build();
}
// --- End Saas Migration
@Bean
public Queue nerRequestQueue() {
@ -346,6 +322,7 @@ public class MessagingConfiguration {
.build();
}
@Bean
public Queue layoutparsingRequestQueue() {
@ -357,8 +334,10 @@ public class MessagingConfiguration {
/**
 * Declares the durable queue named {@link #LAYOUT_PARSING_FINISHED_EVENT_QUEUE}.
 * Dead-lettered messages are routed through the default exchange ("") using {@link #LAYOUT_PARSING_DLQ}
 * as routing key.
 *
 * @return the queue definition
 */
@Bean
public Queue layoutparsingResponseQueue() {

    // Single return statement: a second, equivalent return directly after the first one is unreachable
    // code and is rejected by the Java compiler.
    return QueueBuilder.durable(LAYOUT_PARSING_FINISHED_EVENT_QUEUE)
            .withArgument("x-dead-letter-exchange", "")
            .withArgument("x-dead-letter-routing-key", LAYOUT_PARSING_DLQ)
            .build();
}
@ -367,4 +346,29 @@ public class MessagingConfiguration {
return QueueBuilder.durable(LAYOUT_PARSING_DLQ).build();
}
/**
 * Declares the durable queue named {@link #TABLE_EXTRACTOR_QUEUE}.
 * Dead-lettered messages are routed through the default exchange ("") using {@link #TABLE_EXTRACTOR_DLQ}
 * as routing key.
 *
 * @return the queue definition
 */
@Bean
public Queue tableExtractorRequestQueue() {

    return QueueBuilder.durable(TABLE_EXTRACTOR_QUEUE)
            .withArgument("x-dead-letter-exchange", "")
            .withArgument("x-dead-letter-routing-key", TABLE_EXTRACTOR_DLQ)
            .build();
}
/**
 * Declares the durable queue named {@link #TABLE_EXTRACTOR_RESPONSE_QUEUE}.
 * It shares its dead-letter target ({@link #TABLE_EXTRACTOR_DLQ} via the default exchange) with the
 * table extractor request queue.
 *
 * @return the queue definition
 */
@Bean
public Queue tableExtractorResponseQueue() {

    final Queue responseQueue = QueueBuilder.durable(TABLE_EXTRACTOR_RESPONSE_QUEUE)
            .withArgument("x-dead-letter-exchange", "")
            .withArgument("x-dead-letter-routing-key", TABLE_EXTRACTOR_DLQ)
            .build();
    return responseQueue;
}
/**
 * Declares the durable queue named {@link #TABLE_EXTRACTOR_DLQ}; no further queue arguments are set.
 *
 * @return the queue definition
 */
@Bean
public Queue tableExtractorDLQ() {

    final Queue deadLetterQueue = QueueBuilder.durable(TABLE_EXTRACTOR_DLQ).build();
    return deadLetterQueue;
}
}

View File

@ -0,0 +1,25 @@
package com.iqser.red.service.persistence.management.v1.processor.model;
import com.iqser.red.service.persistence.service.v1.api.shared.model.dossiertemplate.dossier.file.FileType;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * Payload published to the table extractor request queue.
 * Lombok generates accessors, a builder and both the no-args and the all-args constructor; the all-args
 * constructor follows the field declaration order below.
 */
@Data
@Builder
@AllArgsConstructor
@NoArgsConstructor
public class TableExtractorRequest {
// Built as "<enum name><extension>.gz" from FileType.EXTRACTED_TABLES; with the ".json" extension declared
// for that enum constant this presumably resolves to "EXTRACTED_TABLES.json.gz" — confirm against getExtension().
public static final String TABLE_EXTRACTOR_FILE_EXTENSION = FileType.EXTRACTED_TABLES.name() + FileType.EXTRACTED_TABLES.getExtension() + ".gz";
// NOTE(review): despite the name this is a full file-name suffix ("ORIGIN.pdf.gz"), presumably identifying
// the stored origin PDF that the extractor should read — confirm with the consuming service.
public static final String TARGET_FILE_EXTENSION = "ORIGIN.pdf.gz";
// Identifier of the dossier the file belongs to.
private String dossierId;
// Identifier of the file to process.
private String fileId;
// Suffix of the input file; callers in this change set pass TARGET_FILE_EXTENSION.
private String targetFileExtension;
// Suffix of the result file; callers in this change set pass TABLE_EXTRACTOR_FILE_EXTENSION.
private String responseFileExtension;
}

View File

@ -0,0 +1,17 @@
package com.iqser.red.service.persistence.management.v1.processor.model;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * Payload consumed from the table extractor response queue; it only identifies the processed file.
 * Lombok generates accessors, a builder and both the no-args and the all-args constructor.
 */
@Data
@Builder
@AllArgsConstructor
@NoArgsConstructor
public class TableExtractorResponse {
// Identifier of the dossier the file belongs to.
private String dossierId;
// Identifier of the processed file.
private String fileId;
}

View File

@ -21,6 +21,7 @@ import com.iqser.red.service.persistence.management.v1.processor.exception.Inter
import com.iqser.red.service.persistence.management.v1.processor.model.CvAnalysisServiceRequest;
import com.iqser.red.service.persistence.management.v1.processor.model.NerServiceRequest;
import com.iqser.red.service.persistence.management.v1.processor.model.OCRStatusUpdateResponse;
import com.iqser.red.service.persistence.management.v1.processor.model.TableExtractorRequest;
import com.iqser.red.service.persistence.management.v1.processor.model.image.ImageServiceRequest;
import com.iqser.red.service.persistence.management.v1.processor.service.layoutparsing.LayoutParsingRequestFactory;
import com.iqser.red.service.persistence.management.v1.processor.service.manualredactions.ManualRedactionProviderService;
@ -175,6 +176,12 @@ public class FileStatusService {
return;
}
if (settings.isTableExtractorEnabled() && !fileManagementStorageService.objectExists(dossierId, fileId, FileType.EXTRACTED_TABLES)) {
log.info("Add file: {} from dossier {} to Table Extractor queue", fileId, dossierId);
addToTableExtractorQueue(dossierId, fileId);
return;
}
var fileModel = MagicConverter.convert(fileEntity, FileModel.class, new FileModelMapper());
fileModel = reanalysisRequiredStatusService.enhanceFileStatusWithAnalysisRequirements(fileModel, true);
@ -230,6 +237,24 @@ public class FileStatusService {
}
/**
 * Marks the file as {@link ProcessingStatus#TABLE_EXTRACTOR_ANALYZING} and publishes a
 * {@link TableExtractorRequest} for it to {@link MessagingConfiguration#TABLE_EXTRACTOR_QUEUE}.
 * The status is updated before the message is sent.
 *
 * NOTE(review): the message is sent with priority 1, but the queue declaration visible in
 * MessagingConfiguration sets no maximum priority; RabbitMQ only honours message priorities on queues
 * declared with x-max-priority — confirm whether prioritisation is intended here.
 *
 * @param dossierId identifier of the dossier the file belongs to
 * @param fileId    identifier of the file to send to the table extractor
 */
private void addToTableExtractorQueue(String dossierId, String fileId) {

    fileStatusPersistenceService.updateProcessingStatus(fileId, ProcessingStatus.TABLE_EXTRACTOR_ANALYZING);

    var extractionRequest = TableExtractorRequest.builder()
            .dossierId(dossierId)
            .fileId(fileId)
            .targetFileExtension(TableExtractorRequest.TARGET_FILE_EXTENSION)
            .responseFileExtension(TableExtractorRequest.TABLE_EXTRACTOR_FILE_EXTENSION)
            .build();

    rabbitTemplate.convertAndSend(MessagingConfiguration.TABLE_EXTRACTOR_QUEUE, extractionRequest, outgoing -> {
        outgoing.getMessageProperties().setPriority(1);
        return outgoing;
    });
}
@SneakyThrows
public void addToPreprocessingQueue(String dossierId, String fileId, String filename) {

View File

@ -39,12 +39,17 @@ public class LayoutParsingRequestFactory {
Optional<String> optionalTableFileId = fileManagementStorageService.objectExists(dossierId, fileId, FileType.TABLES) //
? Optional.of(StorageIdUtils.getStorageId(dossierId, fileId, FileType.TABLES)) : Optional.empty();
Optional<String> optionalTableExtractorFileId = fileManagementStorageService.objectExists(dossierId, fileId, FileType.EXTRACTED_TABLES) //
? Optional.of(StorageIdUtils.getStorageId(dossierId, fileId, FileType.EXTRACTED_TABLES)) : Optional.empty();
return LayoutParsingRequest.builder()
.layoutParsingType(type)
.identifier(layoutParsingRequestIdentifierService.buildIdentifier(dossierId, fileId, priority))
.originFileStorageId(StorageIdUtils.getStorageId(dossierId, fileId, FileType.ORIGIN))
.imagesFileStorageId(optionalImageFileId)
.tablesFileStorageId(optionalTableFileId)
.tableExtractorFileId(optionalTableExtractorFileId)
.pageFileStorageId(StorageIdUtils.getStorageId(dossierId, fileId, FileType.DOCUMENT_PAGES))
.structureFileStorageId(StorageIdUtils.getStorageId(dossierId, fileId, FileType.DOCUMENT_STRUCTURE))
.textBlockFileStorageId(StorageIdUtils.getStorageId(dossierId, fileId, FileType.DOCUMENT_TEXT))

View File

@ -0,0 +1,55 @@
package com.iqser.red.service.persistence.management.v1.processor.service.queue;
import java.time.OffsetDateTime;
import java.time.temporal.ChronoUnit;
import org.springframework.amqp.core.Message;
import org.springframework.amqp.rabbit.annotation.RabbitListener;
import org.springframework.stereotype.Service;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.iqser.red.service.persistence.management.v1.processor.configuration.MessagingConfiguration;
import com.iqser.red.service.persistence.management.v1.processor.model.CvAnalysisServiceResponse;
import com.iqser.red.service.persistence.management.v1.processor.model.TableExtractorResponse;
import com.iqser.red.service.persistence.management.v1.processor.service.FileStatusProcessingUpdateService;
import com.iqser.red.service.persistence.management.v1.processor.service.FileStatusService;
import com.iqser.red.service.persistence.service.v1.api.shared.model.dossiertemplate.dossier.file.FileErrorInfo;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@Service
@RequiredArgsConstructor
public class TableExtractorMessageReceiver {
private final ObjectMapper objectMapper;
private final FileStatusService fileStatusService;
private final FileStatusProcessingUpdateService fileStatusProcessingUpdateService;
@SneakyThrows
@RabbitListener(queues = MessagingConfiguration.TABLE_EXTRACTOR_RESPONSE_QUEUE)
public void receive(TableExtractorResponse response) {
fileStatusService.setStatusAnalyse(response.getDossierId(), response.getFileId(), false);
log.info("Received message in {} for dossierId {} and fileId {}", MessagingConfiguration.TABLE_EXTRACTOR_RESPONSE_QUEUE, response.getDossierId(), response.getFileId());
}
@SneakyThrows
@RabbitListener(queues = MessagingConfiguration.TABLE_EXTRACTOR_DLQ)
public void handleDLQMessage(Message failedMessage) {
var response = objectMapper.readValue(failedMessage.getBody(), TableExtractorResponse.class);
log.warn("Received message from {} for dossierId {} and fileId {}", MessagingConfiguration.TABLE_EXTRACTOR_DLQ, response.getDossierId(), response.getFileId());
fileStatusProcessingUpdateService.analysisFailed(response.getDossierId(),
response.getFileId(),
new FileErrorInfo("table extractor failed", MessagingConfiguration.TABLE_EXTRACTOR_DLQ, "table-extractor", OffsetDateTime.now().truncatedTo(ChronoUnit.MILLIS)));
}
}

View File

@ -24,6 +24,7 @@ public class FileManagementServiceSettings {
private boolean imageServiceEnabled = true;
private boolean nerServiceEnabled = true;
private boolean tableExtractorEnabled = true;
private boolean storeImageFile = true;

View File

@ -31,6 +31,7 @@ cors.enabled: true
persistence-service:
imageServiceEnabled: false
nerServiceEnabled: false
tableExtractorEnabled: false
storeImageFile: false
applicationName: RedactManager
fforesight:

View File

@ -58,6 +58,7 @@ server:
persistence-service:
imageServiceEnabled: false
tableExtractorEnabled: false
metrics:

View File

@ -16,6 +16,7 @@ public enum FileType {
TEXT_HIGHLIGHTS(".json"),
FIGURE(".json"),
TABLES(".json"),
EXTRACTED_TABLES(".json"),
COMPONENTS(".json"),
// document is split into 4 files, all should be overridden/deleted at the same time
DOCUMENT_TEXT(".json"),

View File

@ -18,5 +18,6 @@ public enum ProcessingStatus {
PRE_PROCESSING,
PRE_PROCESSED,
FIGURE_DETECTION_ANALYZING,
TABLE_PARSING_ANALYZING
TABLE_PARSING_ANALYZING,
TABLE_EXTRACTOR_ANALYZING,
}