Merge branch 'RED-7384' into 'main'
RED-7384: fixes for migration. See merge request fforesight/layout-parser!91
This commit is contained in:
commit
760a809900
@ -24,6 +24,8 @@ tasks.named<Test>("test") {
|
|||||||
reports {
|
reports {
|
||||||
junitXml.outputLocation.set(layout.buildDirectory.dir("reports/junit"))
|
junitXml.outputLocation.set(layout.buildDirectory.dir("reports/junit"))
|
||||||
}
|
}
|
||||||
|
minHeapSize = "512m"
|
||||||
|
maxHeapSize = "2048m"
|
||||||
}
|
}
|
||||||
|
|
||||||
tasks.test {
|
tasks.test {
|
||||||
|
|||||||
@ -15,8 +15,8 @@ dependencies {
|
|||||||
exclude("org.springframework.boot", "spring-boot-starter-security")
|
exclude("org.springframework.boot", "spring-boot-starter-security")
|
||||||
exclude("org.springframework.boot", "spring-boot-starter-validation")
|
exclude("org.springframework.boot", "spring-boot-starter-validation")
|
||||||
}
|
}
|
||||||
implementation("com.knecon.fforesight:tenant-commons:0.10.0")
|
implementation("com.knecon.fforesight:tenant-commons:0.19.0")
|
||||||
implementation("com.iqser.red.commons:storage-commons:2.40.0")
|
implementation("com.iqser.red.commons:storage-commons:2.45.0")
|
||||||
|
|
||||||
implementation("org.apache.pdfbox:pdfbox:${pdfBoxVersion}")
|
implementation("org.apache.pdfbox:pdfbox:${pdfBoxVersion}")
|
||||||
implementation("org.apache.pdfbox:pdfbox-tools:${pdfBoxVersion}")
|
implementation("org.apache.pdfbox:pdfbox-tools:${pdfBoxVersion}")
|
||||||
|
|||||||
@ -3,13 +3,16 @@ package com.knecon.fforesight.service.layoutparser.processor;
|
|||||||
import static java.lang.String.format;
|
import static java.lang.String.format;
|
||||||
|
|
||||||
import java.awt.geom.Rectangle2D;
|
import java.awt.geom.Rectangle2D;
|
||||||
import java.io.ByteArrayOutputStream;
|
import java.io.File;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.nio.file.Files;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
import java.util.concurrent.atomic.AtomicReference;
|
||||||
|
|
||||||
|
import org.apache.pdfbox.Loader;
|
||||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||||
import org.apache.pdfbox.pdmodel.PDPage;
|
import org.apache.pdfbox.pdmodel.PDPage;
|
||||||
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
||||||
@ -51,91 +54,121 @@ import com.knecon.fforesight.service.layoutparser.processor.services.parsing.PDF
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.ViewerDocumentService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.ViewerDocumentService;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
||||||
|
|
||||||
|
import io.micrometer.observation.Observation;
|
||||||
|
import io.micrometer.observation.ObservationRegistry;
|
||||||
|
import io.micrometer.observation.annotation.Observed;
|
||||||
|
import lombok.AccessLevel;
|
||||||
import lombok.RequiredArgsConstructor;
|
import lombok.RequiredArgsConstructor;
|
||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
|
import lombok.experimental.FieldDefaults;
|
||||||
import lombok.extern.slf4j.Slf4j;
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
|
||||||
|
@SuppressWarnings("PMD.CloseResource")
|
||||||
@Slf4j
|
@Slf4j
|
||||||
@Service
|
@Service
|
||||||
@RequiredArgsConstructor
|
@RequiredArgsConstructor
|
||||||
|
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||||
public class LayoutParsingPipeline {
|
public class LayoutParsingPipeline {
|
||||||
|
|
||||||
private final ImageServiceResponseAdapter imageServiceResponseAdapter;
|
ImageServiceResponseAdapter imageServiceResponseAdapter;
|
||||||
private final CvTableParsingAdapter cvTableParsingAdapter;
|
CvTableParsingAdapter cvTableParsingAdapter;
|
||||||
private final LayoutParsingStorageService layoutParsingStorageService;
|
LayoutParsingStorageService layoutParsingStorageService;
|
||||||
private final SectionsBuilderService sectionsBuilderService;
|
SectionsBuilderService sectionsBuilderService;
|
||||||
private final TaasClassificationService taasClassificationService;
|
TaasClassificationService taasClassificationService;
|
||||||
private final RedactManagerClassificationService redactManagerClassificationService;
|
RedactManagerClassificationService redactManagerClassificationService;
|
||||||
private final DocuMineClassificationService docuMineClassificationService;
|
DocuMineClassificationService docuMineClassificationService;
|
||||||
private final SimplifiedSectionTextService simplifiedSectionTextService;
|
SimplifiedSectionTextService simplifiedSectionTextService;
|
||||||
private final BodyTextFrameService bodyTextFrameService;
|
BodyTextFrameService bodyTextFrameService;
|
||||||
private final RulingCleaningService rulingCleaningService;
|
RulingCleaningService rulingCleaningService;
|
||||||
private final TableExtractionService tableExtractionService;
|
TableExtractionService tableExtractionService;
|
||||||
private final TaasBlockificationService taasBlockificationService;
|
TaasBlockificationService taasBlockificationService;
|
||||||
private final DocuMineBlockificationService docuMineBlockificationService;
|
DocuMineBlockificationService docuMineBlockificationService;
|
||||||
private final RedactManagerBlockificationService redactManagerBlockificationService;
|
RedactManagerBlockificationService redactManagerBlockificationService;
|
||||||
private final ViewerDocumentService viewerDocumentService;
|
ViewerDocumentService viewerDocumentService;
|
||||||
|
ObservationRegistry observationRegistry;
|
||||||
|
|
||||||
|
|
||||||
public LayoutParsingFinishedEvent parseLayoutAndSaveFilesToStorage(LayoutParsingRequest layoutParsingRequest) throws IOException {
|
public LayoutParsingFinishedEvent parseLayoutAndSaveFilesToStorage(LayoutParsingRequest layoutParsingRequest) throws IOException {
|
||||||
|
|
||||||
long start = System.currentTimeMillis();
|
long start = System.currentTimeMillis();
|
||||||
|
log.info("Starting layout parsing for {}", layoutParsingRequest.identifier());
|
||||||
|
|
||||||
try (PDDocument originDocument = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId())) {
|
File originFile = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId());
|
||||||
ImageServiceResponse imageServiceResponse = new ImageServiceResponse();
|
File viewerDocumentFile = File.createTempFile("viewer_document", ".pdf");
|
||||||
if (layoutParsingRequest.imagesFileStorageId().isPresent()) {
|
|
||||||
imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId().get());
|
|
||||||
}
|
|
||||||
|
|
||||||
TableServiceResponse tableServiceResponse = new TableServiceResponse();
|
ImageServiceResponse imageServiceResponse = new ImageServiceResponse();
|
||||||
if (layoutParsingRequest.tablesFileStorageId().isPresent()) {
|
if (layoutParsingRequest.imagesFileStorageId().isPresent()) {
|
||||||
tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId().get());
|
imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId().get());
|
||||||
}
|
|
||||||
|
|
||||||
ClassificationDocument classificationDocument = parseLayout(layoutParsingRequest.layoutParsingType(), originDocument, imageServiceResponse, tableServiceResponse);
|
|
||||||
Document documentGraph = DocumentGraphFactory.buildDocumentGraph(classificationDocument);
|
|
||||||
|
|
||||||
int numberOfPages = originDocument.getNumberOfPages();
|
|
||||||
|
|
||||||
layoutParsingStorageService.storeDocumentData(layoutParsingRequest, DocumentDataMapper.toDocumentData(documentGraph));
|
|
||||||
layoutParsingStorageService.storeSimplifiedText(layoutParsingRequest, simplifiedSectionTextService.toSimplifiedText(documentGraph));
|
|
||||||
|
|
||||||
try (var out = new ByteArrayOutputStream()) {
|
|
||||||
viewerDocumentService.createViewerDocument(originDocument, documentGraph, out, false);
|
|
||||||
layoutParsingStorageService.storeViewerDocument(layoutParsingRequest, out);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (layoutParsingRequest.layoutParsingType().equals(LayoutParsingType.TAAS)) {
|
|
||||||
var researchDocumentData = TaasDocumentDataMapper.fromDocument(documentGraph);
|
|
||||||
layoutParsingStorageService.storeResearchDocumentData(layoutParsingRequest, researchDocumentData);
|
|
||||||
}
|
|
||||||
|
|
||||||
return LayoutParsingFinishedEvent.builder()
|
|
||||||
.identifier(layoutParsingRequest.identifier())
|
|
||||||
.numberOfPages(numberOfPages)
|
|
||||||
.duration(System.currentTimeMillis() - start)
|
|
||||||
.message(format("""
|
|
||||||
Layout parsing has finished in %.02f s.
|
|
||||||
identifiers: %s
|
|
||||||
%s
|
|
||||||
Files have been saved with Ids:
|
|
||||||
Structure: %s
|
|
||||||
Text: %s
|
|
||||||
Positions: %s
|
|
||||||
PageData: %s
|
|
||||||
Simplified Text: %s
|
|
||||||
Viewer Doc: %s""",
|
|
||||||
((float) (System.currentTimeMillis() - start)) / 1000,
|
|
||||||
layoutParsingRequest.identifier(),
|
|
||||||
buildSemanticNodeCountMessage(numberOfPages, documentGraph.buildSemanticNodeCounts()),
|
|
||||||
layoutParsingRequest.structureFileStorageId(),
|
|
||||||
layoutParsingRequest.textBlockFileStorageId(),
|
|
||||||
layoutParsingRequest.positionBlockFileStorageId(),
|
|
||||||
layoutParsingRequest.pageFileStorageId(),
|
|
||||||
layoutParsingRequest.simplifiedTextStorageId(),
|
|
||||||
layoutParsingRequest.viewerDocumentStorageId()))
|
|
||||||
.build();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
TableServiceResponse tableServiceResponse = new TableServiceResponse();
|
||||||
|
if (layoutParsingRequest.tablesFileStorageId().isPresent()) {
|
||||||
|
tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId().get());
|
||||||
|
}
|
||||||
|
|
||||||
|
ClassificationDocument classificationDocument = parseLayout(layoutParsingRequest.layoutParsingType(),
|
||||||
|
originFile,
|
||||||
|
imageServiceResponse,
|
||||||
|
tableServiceResponse,
|
||||||
|
layoutParsingRequest.identifier().toString());
|
||||||
|
log.info("Building document graph for {}", layoutParsingRequest.identifier());
|
||||||
|
|
||||||
|
Document documentGraph = observeBuildDocumentGraph(classificationDocument);
|
||||||
|
|
||||||
|
log.info("Storing resulting files for {}", layoutParsingRequest.identifier());
|
||||||
|
layoutParsingStorageService.storeDocumentData(layoutParsingRequest, DocumentDataMapper.toDocumentData(documentGraph));
|
||||||
|
layoutParsingStorageService.storeSimplifiedText(layoutParsingRequest, simplifiedSectionTextService.toSimplifiedText(documentGraph));
|
||||||
|
|
||||||
|
log.info("Creating viewer document for {}", layoutParsingRequest.identifier());
|
||||||
|
viewerDocumentService.createViewerDocument(originFile, documentGraph, viewerDocumentFile, false);
|
||||||
|
layoutParsingStorageService.storeViewerDocument(layoutParsingRequest, viewerDocumentFile);
|
||||||
|
if (layoutParsingRequest.layoutParsingType().equals(LayoutParsingType.TAAS)) {
|
||||||
|
log.info("Building research document data for {}", layoutParsingRequest.identifier());
|
||||||
|
var researchDocumentData = TaasDocumentDataMapper.fromDocument(documentGraph);
|
||||||
|
layoutParsingStorageService.storeResearchDocumentData(layoutParsingRequest, researchDocumentData);
|
||||||
|
}
|
||||||
|
|
||||||
|
originFile.delete();
|
||||||
|
viewerDocumentFile.delete();
|
||||||
|
|
||||||
|
return LayoutParsingFinishedEvent.builder()
|
||||||
|
.identifier(layoutParsingRequest.identifier())
|
||||||
|
.numberOfPages(documentGraph.getNumberOfPages())
|
||||||
|
.duration(System.currentTimeMillis() - start)
|
||||||
|
.message(format("""
|
||||||
|
Layout parsing has finished in %.02f s.
|
||||||
|
identifiers: %s
|
||||||
|
%s
|
||||||
|
Files have been saved with Ids:
|
||||||
|
Structure: %s
|
||||||
|
Text: %s
|
||||||
|
Positions: %s
|
||||||
|
PageData: %s
|
||||||
|
Simplified Text: %s
|
||||||
|
Viewer Doc: %s""",
|
||||||
|
((float) (System.currentTimeMillis() - start)) / 1000,
|
||||||
|
layoutParsingRequest.identifier(),
|
||||||
|
buildSemanticNodeCountMessage(documentGraph.getNumberOfPages(), documentGraph.buildSemanticNodeCounts()),
|
||||||
|
layoutParsingRequest.structureFileStorageId(),
|
||||||
|
layoutParsingRequest.textBlockFileStorageId(),
|
||||||
|
layoutParsingRequest.positionBlockFileStorageId(),
|
||||||
|
layoutParsingRequest.pageFileStorageId(),
|
||||||
|
layoutParsingRequest.simplifiedTextStorageId(),
|
||||||
|
layoutParsingRequest.viewerDocumentStorageId()))
|
||||||
|
.build();
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Builds the document graph for the given classification document inside a
 * Micrometer observation named "LayoutParsingPipeline" with the contextual
 * name "build-document-graph".
 *
 * The lambda passed to observe(...) is a void block, so the graph is handed
 * out of it through an AtomicReference and read back after the observation
 * has run.
 *
 * NOTE(review): Observation#observe also has a Supplier overload that returns
 * the supplied value directly, which would make the AtomicReference
 * unnecessary - confirm against the Micrometer version in use.
 *
 * @param classificationDocument input passed to DocumentGraphFactory.buildDocumentGraph
 * @return the Document set by the observed lambda
 */
private Document observeBuildDocumentGraph(ClassificationDocument classificationDocument) {
|
||||||
|
|
||||||
|
AtomicReference<Document> documentReference = new AtomicReference<>();
|
||||||
|
|
||||||
|
Observation.createNotStarted("LayoutParsingPipeline", observationRegistry).contextualName("build-document-graph").observe(() -> {
|
||||||
|
documentReference.set(DocumentGraphFactory.buildDocumentGraph(classificationDocument));
|
||||||
|
});
|
||||||
|
|
||||||
|
return documentReference.get();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -154,21 +187,36 @@ public class LayoutParsingPipeline {
|
|||||||
|
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
|
@Observed(name = "LayoutParsingPipeline", contextualName = "parse-layout")
|
||||||
public ClassificationDocument parseLayout(LayoutParsingType layoutParsingType,
|
public ClassificationDocument parseLayout(LayoutParsingType layoutParsingType,
|
||||||
PDDocument originDocument,
|
File originFile,
|
||||||
ImageServiceResponse imageServiceResponse,
|
ImageServiceResponse imageServiceResponse,
|
||||||
TableServiceResponse tableServiceResponse) {
|
TableServiceResponse tableServiceResponse,
|
||||||
|
String identifier) {
|
||||||
|
|
||||||
|
PDDocument originDocument = openDocument(originFile);
|
||||||
|
addNumberOfPagesToTrace(originDocument.getNumberOfPages(), Files.size(originFile.toPath()));
|
||||||
Map<Integer, List<TableCells>> pdfTableCells = cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse);
|
Map<Integer, List<TableCells>> pdfTableCells = cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse);
|
||||||
Map<Integer, List<ClassifiedImage>> pdfImages = imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse);
|
Map<Integer, List<ClassifiedImage>> pdfImages = imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse);
|
||||||
|
|
||||||
ClassificationDocument classificationDocument = new ClassificationDocument();
|
ClassificationDocument classificationDocument = new ClassificationDocument();
|
||||||
List<ClassificationPage> classificationPages = new ArrayList<>();
|
List<ClassificationPage> classificationPages = new ArrayList<>();
|
||||||
|
|
||||||
originDocument.setAllSecurityToBeRemoved(true);
|
|
||||||
long pageCount = originDocument.getNumberOfPages();
|
long pageCount = originDocument.getNumberOfPages();
|
||||||
|
|
||||||
for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++) {
|
for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++) {
|
||||||
|
|
||||||
|
if (pageNumber % 100 == 0) {
|
||||||
|
// re-open document every once in a while to save on RAM. This has no significant performance impact.
|
||||||
|
// This is due to PDFBox caching all images and some other stuff with Soft References. This dereferences them and forces the freeing of memory.
|
||||||
|
originDocument.close();
|
||||||
|
originDocument = openDocument(originFile);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (pageNumber % 100 == 0 || pageNumber == pageCount || pageNumber == 1) {
|
||||||
|
log.info("Extracting text on Page {} for {}", pageNumber, identifier);
|
||||||
|
}
|
||||||
|
|
||||||
classificationDocument.setPages(classificationPages);
|
classificationDocument.setPages(classificationPages);
|
||||||
PDFLinesTextStripper stripper = new PDFLinesTextStripper();
|
PDFLinesTextStripper stripper = new PDFLinesTextStripper();
|
||||||
PDPage pdPage = originDocument.getPage(pageNumber - 1);
|
PDPage pdPage = originDocument.getPage(pageNumber - 1);
|
||||||
@ -218,21 +266,42 @@ public class LayoutParsingPipeline {
|
|||||||
classificationPages.add(classificationPage);
|
classificationPages.add(classificationPage);
|
||||||
}
|
}
|
||||||
|
|
||||||
bodyTextFrameService.setBodyTextFrames(classificationDocument, layoutParsingType);
|
originDocument.close();
|
||||||
|
|
||||||
|
log.info("Calculating BodyTextFrame for {}", identifier);
|
||||||
|
bodyTextFrameService.setBodyTextFrames(classificationDocument, layoutParsingType);
|
||||||
|
log.info("Classify TextBlocks for {}", identifier);
|
||||||
switch (layoutParsingType) {
|
switch (layoutParsingType) {
|
||||||
case TAAS -> taasClassificationService.classifyDocument(classificationDocument);
|
case TAAS -> taasClassificationService.classifyDocument(classificationDocument);
|
||||||
case DOCUMINE -> docuMineClassificationService.classifyDocument(classificationDocument);
|
case DOCUMINE -> docuMineClassificationService.classifyDocument(classificationDocument);
|
||||||
case REDACT_MANAGER -> redactManagerClassificationService.classifyDocument(classificationDocument);
|
case REDACT_MANAGER -> redactManagerClassificationService.classifyDocument(classificationDocument);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
log.info("Building Sections for {}", identifier);
|
||||||
sectionsBuilderService.buildSections(classificationDocument);
|
sectionsBuilderService.buildSections(classificationDocument);
|
||||||
sectionsBuilderService.addImagesToSections(classificationDocument);
|
sectionsBuilderService.addImagesToSections(classificationDocument);
|
||||||
|
|
||||||
return classificationDocument;
|
return classificationDocument;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Tags the current observation, if there is one, with the high-cardinality
 * key-values "numberOfPages" and "fileSize". Does nothing when the registry
 * reports no current observation.
 *
 * @param numberOfPages page count, recorded as its decimal string
 * @param size          file size, recorded as its decimal string; the caller
 *                      in parseLayout passes Files.size(...) of the origin file
 */
private void addNumberOfPagesToTrace(int numberOfPages, long size) {
|
||||||
|
|
||||||
|
if (observationRegistry.getCurrentObservation() != null) {
|
||||||
|
observationRegistry.getCurrentObservation().highCardinalityKeyValue("numberOfPages", String.valueOf(numberOfPages));
|
||||||
|
observationRegistry.getCurrentObservation().highCardinalityKeyValue("fileSize", String.valueOf(size));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Loads the given file as a PDDocument via Loader.loadPDF and marks all
 * security to be removed before returning it.
 *
 * Nothing here closes the returned document; parseLayout closes the
 * documents it obtains from this method itself.
 *
 * Lombok's SneakyThrows lets checked exceptions from loading propagate
 * without a throws clause.
 *
 * @param originFile PDF file to load
 * @return the loaded document with setAllSecurityToBeRemoved(true) applied
 */
@SneakyThrows
|
||||||
|
private PDDocument openDocument(File originFile) {
|
||||||
|
|
||||||
|
PDDocument document = Loader.loadPDF(originFile);
|
||||||
|
document.setAllSecurityToBeRemoved(true);
|
||||||
|
return document;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
private Map<String, List<Rectangle2D>> convertMarkedContents(List<PDMarkedContent> pdMarkedContents) {
|
private Map<String, List<Rectangle2D>> convertMarkedContents(List<PDMarkedContent> pdMarkedContents) {
|
||||||
|
|
||||||
Map<String, List<Rectangle2D>> markedContentBboxes = new HashMap<>();
|
Map<String, List<Rectangle2D>> markedContentBboxes = new HashMap<>();
|
||||||
@ -244,9 +313,9 @@ public class LayoutParsingPipeline {
|
|||||||
|
|
||||||
private void increaseDocumentStatistics(ClassificationPage classificationPage, ClassificationDocument document) {
|
private void increaseDocumentStatistics(ClassificationPage classificationPage, ClassificationDocument document) {
|
||||||
|
|
||||||
if (!classificationPage.isLandscape()) {
|
if (!classificationPage.isLandscape()) {
|
||||||
document.getFontSizeCounter().addAll(classificationPage.getFontSizeCounter().getCountPerValue());
|
document.getFontSizeCounter().addAll(classificationPage.getFontSizeCounter().getCountPerValue());
|
||||||
}
|
}
|
||||||
document.getFontCounter().addAll(classificationPage.getFontCounter().getCountPerValue());
|
document.getFontCounter().addAll(classificationPage.getFontCounter().getCountPerValue());
|
||||||
document.getTextHeightCounter().addAll(classificationPage.getTextHeightCounter().getCountPerValue());
|
document.getTextHeightCounter().addAll(classificationPage.getTextHeightCounter().getCountPerValue());
|
||||||
document.getFontStyleCounter().addAll(classificationPage.getFontStyleCounter().getCountPerValue());
|
document.getFontStyleCounter().addAll(classificationPage.getFontStyleCounter().getCountPerValue());
|
||||||
|
|||||||
@ -1,9 +1,7 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor;
|
package com.knecon.fforesight.service.layoutparser.processor;
|
||||||
|
|
||||||
import java.io.ByteArrayInputStream;
|
|
||||||
import java.io.ByteArrayOutputStream;
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.FileOutputStream;
|
import java.io.FileInputStream;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
@ -11,7 +9,6 @@ import java.nio.file.Path;
|
|||||||
import java.nio.file.Paths;
|
import java.nio.file.Paths;
|
||||||
import java.nio.file.StandardOpenOption;
|
import java.nio.file.StandardOpenOption;
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
|
||||||
import org.apache.pdfbox.Loader;
|
import org.apache.pdfbox.Loader;
|
||||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||||
import org.springframework.stereotype.Service;
|
import org.springframework.stereotype.Service;
|
||||||
@ -26,6 +23,7 @@ import com.knecon.fforesight.service.layoutparser.processor.python_api.model.ima
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||||
import com.knecon.fforesight.tenantcommons.TenantContext;
|
import com.knecon.fforesight.tenantcommons.TenantContext;
|
||||||
|
|
||||||
|
import io.micrometer.observation.annotation.Observed;
|
||||||
import lombok.RequiredArgsConstructor;
|
import lombok.RequiredArgsConstructor;
|
||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
import lombok.extern.slf4j.Slf4j;
|
import lombok.extern.slf4j.Slf4j;
|
||||||
@ -39,16 +37,18 @@ public class LayoutParsingStorageService {
|
|||||||
private final ObjectMapper objectMapper;
|
private final ObjectMapper objectMapper;
|
||||||
|
|
||||||
|
|
||||||
public PDDocument getOriginFile(String storageId) throws IOException {
|
public PDDocument getOriginDocument(String storageId) throws IOException {
|
||||||
|
|
||||||
try (var originDocumentInputStream = getObject(storageId)) {
|
return Loader.loadPDF(getOriginFile(storageId));
|
||||||
File tempFile = createTempFile("document", ".pdf");
|
}
|
||||||
try (var tempFileOutputStream = new FileOutputStream(tempFile)) {
|
|
||||||
IOUtils.copy(originDocumentInputStream, tempFileOutputStream);
|
|
||||||
originDocumentInputStream.close();
|
@Observed(name = "LayoutParsingStorageService", contextualName = "get-origin-file")
|
||||||
}
|
public File getOriginFile(String storageId) throws IOException {
|
||||||
return Loader.loadPDF(tempFile);
|
|
||||||
}
|
File tempFile = createTempFile("document", ".pdf");
|
||||||
|
storageService.downloadTo(TenantContext.getTenantId(), storageId, tempFile);
|
||||||
|
return tempFile;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -74,6 +74,7 @@ public class LayoutParsingStorageService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Observed(name = "LayoutParsingStorageService", contextualName = "store-document-data")
|
||||||
public void storeDocumentData(LayoutParsingRequest layoutParsingRequest, DocumentData documentData) {
|
public void storeDocumentData(LayoutParsingRequest layoutParsingRequest, DocumentData documentData) {
|
||||||
|
|
||||||
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.structureFileStorageId(), documentData.getDocumentStructure());
|
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.structureFileStorageId(), documentData.getDocumentStructure());
|
||||||
@ -83,7 +84,6 @@ public class LayoutParsingStorageService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
public void storeResearchDocumentData(LayoutParsingRequest layoutParsingRequest, ResearchDocumentData researchDocumentData) {
|
public void storeResearchDocumentData(LayoutParsingRequest layoutParsingRequest, ResearchDocumentData researchDocumentData) {
|
||||||
|
|
||||||
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.researchDocumentStorageId(), researchDocumentData);
|
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.researchDocumentStorageId(), researchDocumentData);
|
||||||
@ -115,6 +115,7 @@ public class LayoutParsingStorageService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Observed(name = "LayoutParsingStorageService", contextualName = "store-simplified-text")
|
||||||
public void storeSimplifiedText(LayoutParsingRequest layoutParsingRequest, SimplifiedText simplifiedText) {
|
public void storeSimplifiedText(LayoutParsingRequest layoutParsingRequest, SimplifiedText simplifiedText) {
|
||||||
|
|
||||||
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.simplifiedTextStorageId(), simplifiedText);
|
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.simplifiedTextStorageId(), simplifiedText);
|
||||||
@ -132,9 +133,10 @@ public class LayoutParsingStorageService {
|
|||||||
|
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public void storeViewerDocument(LayoutParsingRequest layoutParsingRequest, ByteArrayOutputStream out) {
|
@Observed(name = "LayoutParsingStorageService", contextualName = "store-viewer-document")
|
||||||
|
public void storeViewerDocument(LayoutParsingRequest layoutParsingRequest, File out) {
|
||||||
|
|
||||||
try (var in = new ByteArrayInputStream(out.toByteArray())) {
|
try (var in = new FileInputStream(out)) {
|
||||||
|
|
||||||
storageService.storeObject(TenantContext.getTenantId(), layoutParsingRequest.viewerDocumentStorageId(), in);
|
storageService.storeObject(TenantContext.getTenantId(), layoutParsingRequest.viewerDocumentStorageId(), in);
|
||||||
}
|
}
|
||||||
|
|||||||
@ -10,7 +10,6 @@ import java.util.Set;
|
|||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
import java.util.stream.Stream;
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
import com.amazonaws.services.kms.model.NotFoundException;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
|
||||||
@ -84,7 +83,7 @@ public class Document implements GenericSemanticNode {
|
|||||||
@Override
|
@Override
|
||||||
public Headline getHeadline() {
|
public Headline getHeadline() {
|
||||||
|
|
||||||
return streamAllSubNodesOfType(NodeType.HEADLINE).map(node -> (Headline) node).findFirst().orElseThrow(() -> new NotFoundException("No Headlines found in this document!"));
|
return streamAllSubNodesOfType(NodeType.HEADLINE).map(node -> (Headline) node).findFirst().orElse(Headline.builder().build());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -105,6 +104,7 @@ public class Document implements GenericSemanticNode {
|
|||||||
return streamAllSubNodes().collect(Collectors.groupingBy(SemanticNode::getType, Collectors.counting()));
|
return streamAllSubNodes().collect(Collectors.groupingBy(SemanticNode::getType, Collectors.counting()));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String toString() {
|
public String toString() {
|
||||||
|
|
||||||
|
|||||||
@ -1,7 +1,9 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
|
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
|
||||||
|
|
||||||
|
import java.awt.geom.Rectangle2D;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||||
@ -34,6 +36,9 @@ public class Footer implements GenericSemanticNode {
|
|||||||
@EqualsAndHashCode.Exclude
|
@EqualsAndHashCode.Exclude
|
||||||
Set<RedactionEntity> entities = new HashSet<>();
|
Set<RedactionEntity> entities = new HashSet<>();
|
||||||
|
|
||||||
|
@EqualsAndHashCode.Exclude
|
||||||
|
Map<Page, Rectangle2D> bBoxCache;
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public NodeType getType() {
|
public NodeType getType() {
|
||||||
@ -62,4 +67,14 @@ public class Footer implements GenericSemanticNode {
|
|||||||
return treeId + ": " + NodeType.FOOTER + ": " + leafTextBlock.buildSummary();
|
return treeId + ": " + NodeType.FOOTER + ": " + leafTextBlock.buildSummary();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Returns this footer's bounding boxes keyed by page.
 *
 * The map is taken from the default GenericSemanticNode.getBBox()
 * implementation on the first call and stored in bBoxCache; later calls
 * return the stored map without recomputing it.
 *
 * NOTE(review): nothing in this method refreshes the cache or guards the
 * lazy initialisation against concurrent callers - confirm the footer's
 * geometry is fixed before the first call and that access is single-threaded.
 *
 * @return the cached page-to-rectangle map
 */
@Override
|
||||||
|
public Map<Page, Rectangle2D> getBBox() {
|
||||||
|
|
||||||
|
if (bBoxCache == null) {
|
||||||
|
bBoxCache = GenericSemanticNode.super.getBBox();
|
||||||
|
}
|
||||||
|
return bBoxCache;
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,7 +1,9 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
|
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
|
||||||
|
|
||||||
|
import java.awt.geom.Rectangle2D;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||||
@ -34,6 +36,9 @@ public class Header implements GenericSemanticNode {
|
|||||||
@EqualsAndHashCode.Exclude
|
@EqualsAndHashCode.Exclude
|
||||||
Set<RedactionEntity> entities = new HashSet<>();
|
Set<RedactionEntity> entities = new HashSet<>();
|
||||||
|
|
||||||
|
@EqualsAndHashCode.Exclude
|
||||||
|
Map<Page, Rectangle2D> bBoxCache;
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public boolean isLeaf() {
|
public boolean isLeaf() {
|
||||||
@ -62,4 +67,14 @@ public class Header implements GenericSemanticNode {
|
|||||||
return treeId + ": " + NodeType.HEADER + ": " + leafTextBlock.buildSummary();
|
return treeId + ": " + NodeType.HEADER + ": " + leafTextBlock.buildSummary();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Returns this header's bounding boxes keyed by page.
 *
 * The map is taken from the default GenericSemanticNode.getBBox()
 * implementation on the first call and stored in bBoxCache; later calls
 * return the stored map without recomputing it.
 *
 * NOTE(review): nothing in this method refreshes the cache or guards the
 * lazy initialisation against concurrent callers - confirm the header's
 * geometry is fixed before the first call and that access is single-threaded.
 *
 * @return the cached page-to-rectangle map
 */
@Override
|
||||||
|
public Map<Page, Rectangle2D> getBBox() {
|
||||||
|
|
||||||
|
if (bBoxCache == null) {
|
||||||
|
bBoxCache = GenericSemanticNode.super.getBBox();
|
||||||
|
}
|
||||||
|
return bBoxCache;
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,7 +1,9 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
|
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
|
||||||
|
|
||||||
|
import java.awt.geom.Rectangle2D;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||||
@ -34,6 +36,9 @@ public class Headline implements GenericSemanticNode {
|
|||||||
@EqualsAndHashCode.Exclude
|
@EqualsAndHashCode.Exclude
|
||||||
Set<RedactionEntity> entities = new HashSet<>();
|
Set<RedactionEntity> entities = new HashSet<>();
|
||||||
|
|
||||||
|
@EqualsAndHashCode.Exclude
|
||||||
|
Map<Page, Rectangle2D> bBoxCache;
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public NodeType getType() {
|
public NodeType getType() {
|
||||||
@ -69,4 +74,14 @@ public class Headline implements GenericSemanticNode {
|
|||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Map<Page, Rectangle2D> getBBox() {
|
||||||
|
|
||||||
|
if (bBoxCache == null) {
|
||||||
|
bBoxCache = GenericSemanticNode.super.getBBox();
|
||||||
|
}
|
||||||
|
return bBoxCache;
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,7 +1,9 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
|
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
|
||||||
|
|
||||||
|
import java.awt.geom.Rectangle2D;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||||
@ -32,6 +34,9 @@ public class Paragraph implements GenericSemanticNode {
|
|||||||
@EqualsAndHashCode.Exclude
|
@EqualsAndHashCode.Exclude
|
||||||
Set<RedactionEntity> entities = new HashSet<>();
|
Set<RedactionEntity> entities = new HashSet<>();
|
||||||
|
|
||||||
|
@EqualsAndHashCode.Exclude
|
||||||
|
Map<Page, Rectangle2D> bBoxCache;
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public NodeType getType() {
|
public NodeType getType() {
|
||||||
@ -60,4 +65,14 @@ public class Paragraph implements GenericSemanticNode {
|
|||||||
return treeId + ": " + NodeType.PARAGRAPH + ": " + leafTextBlock.buildSummary();
|
return treeId + ": " + NodeType.PARAGRAPH + ": " + leafTextBlock.buildSummary();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Map<Page, Rectangle2D> getBBox() {
|
||||||
|
|
||||||
|
if (bBoxCache == null) {
|
||||||
|
bBoxCache = GenericSemanticNode.super.getBBox();
|
||||||
|
}
|
||||||
|
return bBoxCache;
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,7 +1,9 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
|
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
|
||||||
|
|
||||||
|
import java.awt.geom.Rectangle2D;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||||
@ -35,6 +37,9 @@ public class Section implements GenericSemanticNode {
|
|||||||
@EqualsAndHashCode.Exclude
|
@EqualsAndHashCode.Exclude
|
||||||
Set<RedactionEntity> entities = new HashSet<>();
|
Set<RedactionEntity> entities = new HashSet<>();
|
||||||
|
|
||||||
|
@EqualsAndHashCode.Exclude
|
||||||
|
Map<Page, Rectangle2D> bBoxCache;
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public NodeType getType() {
|
public NodeType getType() {
|
||||||
@ -74,4 +79,14 @@ public class Section implements GenericSemanticNode {
|
|||||||
.orElseGet(() -> getParent().getHeadline());
|
.orElseGet(() -> getParent().getHeadline());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Map<Page, Rectangle2D> getBBox() {
|
||||||
|
|
||||||
|
if (bBoxCache == null) {
|
||||||
|
bBoxCache = GenericSemanticNode.super.getBBox();
|
||||||
|
}
|
||||||
|
return bBoxCache;
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -398,12 +398,11 @@ public interface SemanticNode {
|
|||||||
*/
|
*/
|
||||||
default Map<Page, Rectangle2D> getBBox() {
|
default Map<Page, Rectangle2D> getBBox() {
|
||||||
|
|
||||||
Map<Page, Rectangle2D> bBoxPerPage = new HashMap<>();
|
|
||||||
if (isLeaf()) {
|
if (isLeaf()) {
|
||||||
return getBBoxFromLeafTextBlock(bBoxPerPage);
|
return getBBoxFromLeafTextBlock();
|
||||||
}
|
}
|
||||||
|
|
||||||
return getBBoxFromChildren(bBoxPerPage);
|
return getBBoxFromChildren();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -426,25 +425,31 @@ public interface SemanticNode {
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* TODO: this produces unwanted results for sections spanning multiple columns.
|
* TODO: this produces unwanted results for sections spanning multiple columns.
|
||||||
*
|
* Computes the Union of the bounding boxes of all children recursively.
|
||||||
* @param bBoxPerPage initial empty BoundingBox
|
|
||||||
* @return The union of the BoundingBoxes of all children
|
* @return The union of the BoundingBoxes of all children
|
||||||
*/
|
*/
|
||||||
private Map<Page, Rectangle2D> getBBoxFromChildren(Map<Page, Rectangle2D> bBoxPerPage) {
|
private Map<Page, Rectangle2D> getBBoxFromChildren() {
|
||||||
|
|
||||||
return streamChildren().map(SemanticNode::getBBox).reduce((map1, map2) -> {
|
Map<Page, Rectangle2D> bBoxPerPage = new HashMap<>();
|
||||||
map1.forEach((page, rectangle) -> map2.merge(page, rectangle, (rect1, rect2) -> rect1.createUnion(rect2).getBounds2D()));
|
List<Map<Page, Rectangle2D>> childrenBBoxes = streamChildren().map(SemanticNode::getBBox).toList();
|
||||||
return map2;
|
Set<Page> pages = childrenBBoxes.stream().flatMap(map -> map.keySet().stream()).collect(Collectors.toSet());
|
||||||
}).orElse(bBoxPerPage);
|
for (Page page : pages) {
|
||||||
|
Rectangle2D bBoxOnPage = childrenBBoxes.stream()
|
||||||
|
.filter(childBboxPerPage -> childBboxPerPage.containsKey(page))
|
||||||
|
.map(childBboxPerPage -> childBboxPerPage.get(page))
|
||||||
|
.collect(RectangleTransformations.collectBBox());
|
||||||
|
bBoxPerPage.put(page, bBoxOnPage);
|
||||||
|
}
|
||||||
|
return bBoxPerPage;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @param bBoxPerPage initial empty BoundingBox
|
|
||||||
* @return The union of all BoundingBoxes of the TextBlock of this node
|
* @return The union of all BoundingBoxes of the TextBlock of this node
|
||||||
*/
|
*/
|
||||||
private Map<Page, Rectangle2D> getBBoxFromLeafTextBlock(Map<Page, Rectangle2D> bBoxPerPage) {
|
private Map<Page, Rectangle2D> getBBoxFromLeafTextBlock() {
|
||||||
|
|
||||||
|
Map<Page, Rectangle2D> bBoxPerPage = new HashMap<>();
|
||||||
Map<Page, List<AtomicTextBlock>> atomicTextBlockPerPage = getTextBlock().getAtomicTextBlocks().stream().collect(Collectors.groupingBy(AtomicTextBlock::getPage));
|
Map<Page, List<AtomicTextBlock>> atomicTextBlockPerPage = getTextBlock().getAtomicTextBlocks().stream().collect(Collectors.groupingBy(AtomicTextBlock::getPage));
|
||||||
atomicTextBlockPerPage.forEach((page, atbs) -> bBoxPerPage.put(page, RectangleTransformations.bBoxUnionAtomicTextBlock(atbs)));
|
atomicTextBlockPerPage.forEach((page, atbs) -> bBoxPerPage.put(page, RectangleTransformations.bBoxUnionAtomicTextBlock(atbs)));
|
||||||
return bBoxPerPage;
|
return bBoxPerPage;
|
||||||
|
|||||||
@ -2,10 +2,12 @@ package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
|
|||||||
|
|
||||||
import static java.lang.String.format;
|
import static java.lang.String.format;
|
||||||
|
|
||||||
|
import java.awt.geom.Rectangle2D;
|
||||||
import java.util.Collection;
|
import java.util.Collection;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Locale;
|
import java.util.Locale;
|
||||||
|
import java.util.Map;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.stream.IntStream;
|
import java.util.stream.IntStream;
|
||||||
import java.util.stream.Stream;
|
import java.util.stream.Stream;
|
||||||
@ -40,6 +42,8 @@ public class Table implements SemanticNode {
|
|||||||
@EqualsAndHashCode.Exclude
|
@EqualsAndHashCode.Exclude
|
||||||
Set<RedactionEntity> entities = new HashSet<>();
|
Set<RedactionEntity> entities = new HashSet<>();
|
||||||
|
|
||||||
|
@EqualsAndHashCode.Exclude
|
||||||
|
Map<Page, Rectangle2D> bBoxCache;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Streams all entities in this table, that appear in a row, which contains any of the provided strings.
|
* Streams all entities in this table, that appear in a row, which contains any of the provided strings.
|
||||||
@ -311,5 +315,12 @@ public class Table implements SemanticNode {
|
|||||||
|
|
||||||
return treeId.toString() + ": " + NodeType.TABLE + ": #cols: " + numberOfCols + ", #rows: " + numberOfRows + ", " + this.getTextBlock().buildSummary();
|
return treeId.toString() + ": " + NodeType.TABLE + ": #cols: " + numberOfCols + ", #rows: " + numberOfRows + ", " + this.getTextBlock().buildSummary();
|
||||||
}
|
}
|
||||||
|
@Override
|
||||||
|
public Map<Page, Rectangle2D> getBBox() {
|
||||||
|
|
||||||
|
if (bBoxCache == null) {
|
||||||
|
bBoxCache = SemanticNode.super.getBBox();
|
||||||
|
}
|
||||||
|
return bBoxCache;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -9,8 +9,6 @@ import java.util.stream.Collectors;
|
|||||||
|
|
||||||
import org.apache.pdfbox.text.TextPosition;
|
import org.apache.pdfbox.text.TextPosition;
|
||||||
|
|
||||||
import com.dslplatform.json.JsonAttribute;
|
|
||||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
|
||||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point;
|
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point;
|
||||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
||||||
|
|
||||||
@ -142,8 +140,7 @@ public class TextPositionSequence implements CharSequence {
|
|||||||
*
|
*
|
||||||
* @return the text direction adjusted minX value
|
* @return the text direction adjusted minX value
|
||||||
*/
|
*/
|
||||||
@JsonIgnore
|
|
||||||
@JsonAttribute(ignore = true)
|
|
||||||
public float getMinXDirAdj() {
|
public float getMinXDirAdj() {
|
||||||
|
|
||||||
return textPositions.get(0).getXDirAdj();
|
return textPositions.get(0).getXDirAdj();
|
||||||
@ -157,8 +154,7 @@ public class TextPositionSequence implements CharSequence {
|
|||||||
*
|
*
|
||||||
* @return the text direction adjusted maxX value
|
* @return the text direction adjusted maxX value
|
||||||
*/
|
*/
|
||||||
@JsonIgnore
|
|
||||||
@JsonAttribute(ignore = true)
|
|
||||||
public float getMaxXDirAdj() {
|
public float getMaxXDirAdj() {
|
||||||
|
|
||||||
return textPositions.get(textPositions.size() - 1).getXDirAdj() + textPositions.get(textPositions.size() - 1).getWidthDirAdj() + HEIGHT_PADDING;
|
return textPositions.get(textPositions.size() - 1).getXDirAdj() + textPositions.get(textPositions.size() - 1).getWidthDirAdj() + HEIGHT_PADDING;
|
||||||
@ -172,8 +168,7 @@ public class TextPositionSequence implements CharSequence {
|
|||||||
*
|
*
|
||||||
* @return the text direction adjusted minY value. The upper border of the bounding box of the word.
|
* @return the text direction adjusted minY value. The upper border of the bounding box of the word.
|
||||||
*/
|
*/
|
||||||
@JsonIgnore
|
|
||||||
@JsonAttribute(ignore = true)
|
|
||||||
public float getMinYDirAdj() {
|
public float getMinYDirAdj() {
|
||||||
|
|
||||||
return textPositions.get(0).getYDirAdj() - getTextHeight();
|
return textPositions.get(0).getYDirAdj() - getTextHeight();
|
||||||
@ -187,8 +182,7 @@ public class TextPositionSequence implements CharSequence {
|
|||||||
*
|
*
|
||||||
* @return the text direction adjusted maxY value. The lower border of the bounding box of the word.
|
* @return the text direction adjusted maxY value. The lower border of the bounding box of the word.
|
||||||
*/
|
*/
|
||||||
@JsonIgnore
|
|
||||||
@JsonAttribute(ignore = true)
|
|
||||||
public float getMaxYDirAdj() {
|
public float getMaxYDirAdj() {
|
||||||
|
|
||||||
return textPositions.get(0).getYDirAdj();
|
return textPositions.get(0).getYDirAdj();
|
||||||
@ -196,32 +190,24 @@ public class TextPositionSequence implements CharSequence {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@JsonIgnore
|
|
||||||
@JsonAttribute(ignore = true)
|
|
||||||
public float getTextHeight() {
|
public float getTextHeight() {
|
||||||
|
|
||||||
return textPositions.get(0).getHeightDir() + HEIGHT_PADDING;
|
return textPositions.get(0).getHeightDir() + HEIGHT_PADDING;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@JsonIgnore
|
|
||||||
@JsonAttribute(ignore = true)
|
|
||||||
public float getHeight() {
|
public float getHeight() {
|
||||||
|
|
||||||
return getMaxYDirAdj() - getMinYDirAdj();
|
return getMaxYDirAdj() - getMinYDirAdj();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@JsonIgnore
|
|
||||||
@JsonAttribute(ignore = true)
|
|
||||||
public float getWidth() {
|
public float getWidth() {
|
||||||
|
|
||||||
return getMaxXDirAdj() - getMinXDirAdj();
|
return getMaxXDirAdj() - getMinXDirAdj();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@JsonIgnore
|
|
||||||
@JsonAttribute(ignore = true)
|
|
||||||
public String getFont() {
|
public String getFont() {
|
||||||
|
|
||||||
if (textPositions.get(0).getFontName() == null) {
|
if (textPositions.get(0).getFontName() == null) {
|
||||||
@ -231,9 +217,8 @@ public class TextPositionSequence implements CharSequence {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@JsonIgnore
|
|
||||||
@JsonAttribute(ignore = true)
|
|
||||||
public String getFontStyle() {
|
public String getFontStyle() {
|
||||||
|
|
||||||
if (textPositions.get(0).getFontName() == null) {
|
if (textPositions.get(0).getFontName() == null) {
|
||||||
return "standard";
|
return "standard";
|
||||||
}
|
}
|
||||||
@ -251,16 +236,12 @@ public class TextPositionSequence implements CharSequence {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@JsonIgnore
|
|
||||||
@JsonAttribute(ignore = true)
|
|
||||||
public float getFontSize() {
|
public float getFontSize() {
|
||||||
|
|
||||||
return textPositions.get(0).getFontSizeInPt();
|
return textPositions.get(0).getFontSizeInPt();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@JsonIgnore
|
|
||||||
@JsonAttribute(ignore = true)
|
|
||||||
public float getSpaceWidth() {
|
public float getSpaceWidth() {
|
||||||
|
|
||||||
return textPositions.get(0).getWidthOfSpace();
|
return textPositions.get(0).getWidthOfSpace();
|
||||||
@ -276,8 +257,7 @@ public class TextPositionSequence implements CharSequence {
|
|||||||
*
|
*
|
||||||
* @return bounding box of the word in Pdf Coordinate System
|
* @return bounding box of the word in Pdf Coordinate System
|
||||||
*/
|
*/
|
||||||
@JsonIgnore
|
|
||||||
@JsonAttribute(ignore = true)
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public Rectangle getRectangle() {
|
public Rectangle getRectangle() {
|
||||||
|
|
||||||
|
|||||||
@ -2,12 +2,13 @@ package com.knecon.fforesight.service.layoutparser.processor.services.visualizat
|
|||||||
|
|
||||||
import java.awt.geom.AffineTransform;
|
import java.awt.geom.AffineTransform;
|
||||||
import java.awt.geom.Rectangle2D;
|
import java.awt.geom.Rectangle2D;
|
||||||
|
import java.io.File;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.OutputStream;
|
import java.nio.file.Files;
|
||||||
import java.util.HashSet;
|
import java.nio.file.Path;
|
||||||
import java.util.Set;
|
import java.nio.file.StandardCopyOption;
|
||||||
|
|
||||||
import org.apache.pdfbox.cos.COSDictionary;
|
import org.apache.pdfbox.Loader;
|
||||||
import org.apache.pdfbox.cos.COSName;
|
import org.apache.pdfbox.cos.COSName;
|
||||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||||
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
|
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
|
||||||
@ -31,6 +32,9 @@ import com.knecon.fforesight.service.layoutparser.processor.model.visualization.
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.model.visualization.PlacedText;
|
import com.knecon.fforesight.service.layoutparser.processor.model.visualization.PlacedText;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.visualization.VisualizationsOnPage;
|
import com.knecon.fforesight.service.layoutparser.processor.model.visualization.VisualizationsOnPage;
|
||||||
|
|
||||||
|
import io.micrometer.observation.Observation;
|
||||||
|
import io.micrometer.observation.ObservationRegistry;
|
||||||
|
import io.micrometer.observation.annotation.Observed;
|
||||||
import lombok.RequiredArgsConstructor;
|
import lombok.RequiredArgsConstructor;
|
||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
import lombok.extern.slf4j.Slf4j;
|
import lombok.extern.slf4j.Slf4j;
|
||||||
@ -40,29 +44,31 @@ import lombok.extern.slf4j.Slf4j;
|
|||||||
@RequiredArgsConstructor
|
@RequiredArgsConstructor
|
||||||
public class ViewerDocumentService {
|
public class ViewerDocumentService {
|
||||||
|
|
||||||
|
|
||||||
private static final String LAYER_NAME = "Layout grid";
|
private static final String LAYER_NAME = "Layout grid";
|
||||||
private static final int FONT_SIZE = 10;
|
private static final int FONT_SIZE = 10;
|
||||||
public static final float LINE_WIDTH = 1f;
|
public static final float LINE_WIDTH = 1f;
|
||||||
|
|
||||||
private final LayoutGridService layoutGridService;
|
private final LayoutGridService layoutGridService;
|
||||||
|
private final ObservationRegistry observationRegistry;
|
||||||
|
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public void createViewerDocument(PDDocument pdDocument, Document document, OutputStream outputStream, boolean layerVisibilityDefaultValue) {
|
@Observed(name = "ViewerDocumentService", contextualName = "create-viewer-document")
|
||||||
|
public void createViewerDocument(File originFile, Document document, File destinationFile, boolean layerVisibilityDefaultValue) {
|
||||||
|
|
||||||
|
Path tmpFile = Files.createTempFile("tmpViewerDocument", ".pdf");
|
||||||
|
PDDocument pdDocument = openPDDocument(originFile);
|
||||||
LayoutGrid layoutGrid = layoutGridService.createLayoutGrid(document);
|
LayoutGrid layoutGrid = layoutGridService.createLayoutGrid(document);
|
||||||
// PDDocument.save() is very slow, since it actually traverses the entire pdf and writes a new one.
|
|
||||||
// If we collect all COSDictionaries we changed and tell it explicitly to only add the changed ones by using saveIncremental it's very fast.
|
PDOptionalContentGroup layer = addLayerToDocument(pdDocument, layerVisibilityDefaultValue);
|
||||||
Set<COSDictionary> dictionariesToUpdate = new HashSet<>();
|
|
||||||
PDOptionalContentGroup layer = addLayerToDocument(pdDocument, dictionariesToUpdate, layerVisibilityDefaultValue);
|
|
||||||
PDFont font = new PDType1Font(Standard14Fonts.FontName.HELVETICA);
|
PDFont font = new PDType1Font(Standard14Fonts.FontName.HELVETICA);
|
||||||
|
|
||||||
for (int pageNumber = 0; pageNumber < pdDocument.getNumberOfPages(); pageNumber++) {
|
for (int pageNumber = 0; pageNumber < pdDocument.getNumberOfPages(); pageNumber++) {
|
||||||
PDPage pdPage = pdDocument.getPage(pageNumber);
|
|
||||||
|
|
||||||
|
PDPage pdPage = pdDocument.getPage(pageNumber);
|
||||||
|
//
|
||||||
AffineTransform textDeRotationMatrix = getTextDeRotationTransform(pdPage);
|
AffineTransform textDeRotationMatrix = getTextDeRotationTransform(pdPage);
|
||||||
addLayerToPageRessources(pdPage);
|
addLayerToPageResources(pdPage);
|
||||||
|
|
||||||
// We need to save the graphics state before, such that our appended content cannot be affected by previous content streams with side effects,
|
// We need to save the graphics state before, such that our appended content cannot be affected by previous content streams with side effects,
|
||||||
// e.g. not escaped matrix transformations.
|
// e.g. not escaped matrix transformations.
|
||||||
@ -115,16 +121,48 @@ public class ViewerDocumentService {
|
|||||||
contentStream.restoreGraphicsState();
|
contentStream.restoreGraphicsState();
|
||||||
contentStream.endMarkedContent();
|
contentStream.endMarkedContent();
|
||||||
}
|
}
|
||||||
dictionariesToUpdate.add(pdPage.getCOSObject());
|
|
||||||
dictionariesToUpdate.add(pdPage.getResources().getCOSObject());
|
if (pageNumber % 500 == 0 && pageNumber != 0) { // re-open document every once in a while to save on RAM
|
||||||
|
log.info("Incremental save after {} pages", pageNumber);
|
||||||
|
observedIncrementalSave(pdDocument, destinationFile);
|
||||||
|
pdDocument.close();
|
||||||
|
Files.copy(destinationFile.toPath(), tmpFile, StandardCopyOption.REPLACE_EXISTING);
|
||||||
|
pdDocument = openPDDocument(tmpFile.toFile());
|
||||||
|
layer = addLayerToDocument(pdDocument, layerVisibilityDefaultValue);
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
dictionariesToUpdate.add(pdDocument.getDocumentInformation().getCOSObject());
|
observedIncrementalSave(pdDocument, destinationFile);
|
||||||
// dictionariesToUpdate.add(pdDocument.getDocument().getTrailer());
|
|
||||||
pdDocument.saveIncremental(outputStream, dictionariesToUpdate);
|
tmpFile.toFile().delete();
|
||||||
|
pdDocument.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private static void addLayerToPageRessources(PDPage pdPage) {
|
private static PDDocument openPDDocument(File tmpFile) throws IOException {
|
||||||
|
|
||||||
|
PDDocument pdDocument;
|
||||||
|
pdDocument = Loader.loadPDF(tmpFile);
|
||||||
|
pdDocument.setAllSecurityToBeRemoved(true);
|
||||||
|
return pdDocument;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@SneakyThrows
|
||||||
|
private void observedIncrementalSave(PDDocument pdDocument, File outputFile) {
|
||||||
|
|
||||||
|
Observation.createNotStarted("ViewerDocumentService", observationRegistry).contextualName("incremental-save").observe(() -> {
|
||||||
|
try {
|
||||||
|
pdDocument.save(outputFile);
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static void addLayerToPageResources(PDPage pdPage) {
|
||||||
|
|
||||||
PDResources resources = pdPage.getResources();
|
PDResources resources = pdPage.getResources();
|
||||||
if (resources == null) {
|
if (resources == null) {
|
||||||
@ -145,7 +183,7 @@ public class ViewerDocumentService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private static PDOptionalContentGroup addLayerToDocument(PDDocument pdDocument, Set<COSDictionary> dictionariesToUpdate, boolean layerVisibilityDefaultValue) {
|
private static PDOptionalContentGroup addLayerToDocument(PDDocument pdDocument, boolean layerVisibilityDefaultValue) {
|
||||||
|
|
||||||
PDDocumentCatalog catalog = pdDocument.getDocumentCatalog();
|
PDDocumentCatalog catalog = pdDocument.getDocumentCatalog();
|
||||||
PDOptionalContentProperties ocprops = catalog.getOCProperties();
|
PDOptionalContentProperties ocprops = catalog.getOCProperties();
|
||||||
@ -161,7 +199,7 @@ public class ViewerDocumentService {
|
|||||||
ocprops.addGroup(layer);
|
ocprops.addGroup(layer);
|
||||||
}
|
}
|
||||||
ocprops.setGroupEnabled(layer, layerVisibilityDefaultValue);
|
ocprops.setGroupEnabled(layer, layerVisibilityDefaultValue);
|
||||||
dictionariesToUpdate.add(catalog.getCOSObject());
|
// dictionariesToUpdate.add(catalog.getCOSObject());
|
||||||
return layer;
|
return layer;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -5,6 +5,7 @@ import org.springframework.boot.actuate.autoconfigure.security.servlet.Managemen
|
|||||||
import org.springframework.boot.autoconfigure.ImportAutoConfiguration;
|
import org.springframework.boot.autoconfigure.ImportAutoConfiguration;
|
||||||
import org.springframework.boot.autoconfigure.SpringBootApplication;
|
import org.springframework.boot.autoconfigure.SpringBootApplication;
|
||||||
import org.springframework.boot.autoconfigure.security.servlet.SecurityAutoConfiguration;
|
import org.springframework.boot.autoconfigure.security.servlet.SecurityAutoConfiguration;
|
||||||
|
import org.springframework.context.annotation.Bean;
|
||||||
import org.springframework.context.annotation.Import;
|
import org.springframework.context.annotation.Import;
|
||||||
|
|
||||||
import com.amazonaws.services.s3.model.metrics.MetricsConfiguration;
|
import com.amazonaws.services.s3.model.metrics.MetricsConfiguration;
|
||||||
@ -13,8 +14,11 @@ import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingService
|
|||||||
import com.knecon.fforesight.service.layoutparser.server.queue.MessagingConfiguration;
|
import com.knecon.fforesight.service.layoutparser.server.queue.MessagingConfiguration;
|
||||||
import com.knecon.fforesight.tenantcommons.MultiTenancyAutoConfiguration;
|
import com.knecon.fforesight.tenantcommons.MultiTenancyAutoConfiguration;
|
||||||
|
|
||||||
|
import io.micrometer.observation.ObservationRegistry;
|
||||||
|
import io.micrometer.observation.aop.ObservedAspect;
|
||||||
|
|
||||||
@ImportAutoConfiguration({MultiTenancyAutoConfiguration.class})
|
@ImportAutoConfiguration({MultiTenancyAutoConfiguration.class})
|
||||||
@Import({MetricsConfiguration.class, StorageAutoConfiguration.class, LayoutParsingServiceProcessorConfiguration.class, MessagingConfiguration.class})
|
@Import({MetricsConfiguration.class, StorageAutoConfiguration.class, LayoutParsingServiceProcessorConfiguration.class, MessagingConfiguration.class})
|
||||||
@SpringBootApplication(exclude = {SecurityAutoConfiguration.class, ManagementWebSecurityAutoConfiguration.class})
|
@SpringBootApplication(exclude = {SecurityAutoConfiguration.class, ManagementWebSecurityAutoConfiguration.class})
|
||||||
public class Application {
|
public class Application {
|
||||||
|
|
||||||
@ -23,4 +27,11 @@ public class Application {
|
|||||||
SpringApplication.run(Application.class, args);
|
SpringApplication.run(Application.class, args);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Bean
|
||||||
|
public ObservedAspect observedAspect(ObservationRegistry observationRegistry) {
|
||||||
|
|
||||||
|
return new ObservedAspect(observationRegistry);
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -46,12 +46,11 @@ public class BdrJsonBuildTest extends AbstractTest {
|
|||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
protected Document buildGraph(File file) {
|
protected Document buildGraph(File file) {
|
||||||
|
|
||||||
try (PDDocument pdDocument = Loader.loadPDF(file)) {
|
return DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.TAAS,
|
||||||
return DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.TAAS,
|
file,
|
||||||
pdDocument,
|
new ImageServiceResponse(),
|
||||||
new ImageServiceResponse(),
|
new TableServiceResponse(),
|
||||||
new TableServiceResponse()));
|
file.toString()));
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -6,7 +6,6 @@ import java.util.List;
|
|||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import org.apache.pdfbox.Loader;
|
|
||||||
import org.assertj.core.api.Assertions;
|
import org.assertj.core.api.Assertions;
|
||||||
import org.junit.jupiter.api.Test;
|
import org.junit.jupiter.api.Test;
|
||||||
import org.junit.jupiter.api.extension.ExtendWith;
|
import org.junit.jupiter.api.extension.ExtendWith;
|
||||||
@ -96,9 +95,10 @@ public class HeadlinesGoldStandardIntegrationTest {
|
|||||||
goldStandardLog.getRedactionLogEntry().forEach(e -> goldStandardHeadlines.add(new Headline(e.getPositions().get(0).getPage(), e.getValue())));
|
goldStandardLog.getRedactionLogEntry().forEach(e -> goldStandardHeadlines.add(new Headline(e.getPositions().get(0).getPage(), e.getValue())));
|
||||||
|
|
||||||
Document documentGraph = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
Document documentGraph = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||||
Loader.loadPDF(pdfFileResource.getFile()),
|
pdfFileResource.getFile(),
|
||||||
new ImageServiceResponse(),
|
new ImageServiceResponse(),
|
||||||
new TableServiceResponse()));
|
new TableServiceResponse(),
|
||||||
|
filePath));
|
||||||
|
|
||||||
var foundHeadlines = documentGraph.streamAllSubNodes()
|
var foundHeadlines = documentGraph.streamAllSubNodes()
|
||||||
.map(SemanticNode::getHeadline)
|
.map(SemanticNode::getHeadline)
|
||||||
|
|||||||
@ -25,8 +25,8 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
|
|||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public void testLayoutParserEndToEnd() {
|
public void testLayoutParserEndToEnd() {
|
||||||
|
|
||||||
prepareStorage("files/bdr/btd_19_053_1905391.pdf");
|
prepareStorage("files/bdr/Wie weiter bei Kristeneinrichtungen.pdf");
|
||||||
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(LayoutParsingType.TAAS);
|
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER);
|
||||||
LayoutParsingFinishedEvent finishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest);
|
LayoutParsingFinishedEvent finishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest);
|
||||||
Arrays.stream(finishedEvent.message().split("\n")).forEach(log::info);
|
Arrays.stream(finishedEvent.message().split("\n")).forEach(log::info);
|
||||||
}
|
}
|
||||||
|
|||||||
@ -6,6 +6,7 @@ import java.nio.file.Files;
|
|||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
|
import org.junit.jupiter.api.Disabled;
|
||||||
import org.junit.jupiter.api.Test;
|
import org.junit.jupiter.api.Test;
|
||||||
import org.springframework.core.io.ClassPathResource;
|
import org.springframework.core.io.ClassPathResource;
|
||||||
|
|
||||||
@ -17,7 +18,9 @@ import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest
|
|||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
|
|
||||||
public class DocumentDataTests extends BuildDocumentTest {
|
public class DocumentDataTests extends BuildDocumentTest {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
@Disabled // This test takes waaaaaay too long, it's ridiculous
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public void createDocumentDataForAllFiles() {
|
public void createDocumentDataForAllFiles() {
|
||||||
|
|
||||||
@ -36,11 +39,12 @@ public class DocumentDataTests extends BuildDocumentTest {
|
|||||||
for (String pdfFileName : pdfFileNames) {
|
for (String pdfFileName : pdfFileNames) {
|
||||||
System.out.println(pdfFileName);
|
System.out.println(pdfFileName);
|
||||||
DocumentData documentData = DocumentDataMapper.toDocumentData(buildGraph(resource.getFile().toPath().getParent().relativize(Path.of(pdfFileName)).toString()));
|
DocumentData documentData = DocumentDataMapper.toDocumentData(buildGraph(resource.getFile().toPath().getParent().relativize(Path.of(pdfFileName)).toString()));
|
||||||
File outputFile = Path.of(outPath).resolve(resource.getFile().toPath().relativize(Path.of(pdfFileName))).toFile();
|
File outputFile = Path.of(outPath).resolve(resource.getFile().toPath().relativize(Path.of(pdfFileName))).toFile();
|
||||||
outputFile.toPath().getParent().toFile().mkdirs();
|
outputFile.toPath().getParent().toFile().mkdirs();
|
||||||
try (var out = new FileOutputStream(outputFile.toString().replace(".pdf", ".json"))) {
|
try (var out = new FileOutputStream(outputFile.toString().replace(".pdf", ".json"))) {
|
||||||
ObjectMapperFactory.create().writeValue(out, documentData);
|
ObjectMapperFactory.create().writeValue(out, documentData);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -5,7 +5,6 @@ import java.io.FileOutputStream;
|
|||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
|
|
||||||
import org.apache.pdfbox.Loader;
|
|
||||||
import org.junit.jupiter.api.Disabled;
|
import org.junit.jupiter.api.Disabled;
|
||||||
import org.junit.jupiter.api.Test;
|
import org.junit.jupiter.api.Test;
|
||||||
import org.springframework.core.io.ClassPathResource;
|
import org.springframework.core.io.ClassPathResource;
|
||||||
@ -56,9 +55,10 @@ public class DocumentGraphJsonWritingTest extends BuildDocumentTest {
|
|||||||
private void writeJsons(Path filename) {
|
private void writeJsons(Path filename) {
|
||||||
|
|
||||||
Document documentGraph = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
Document documentGraph = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||||
Loader.loadPDF(filename.toFile()),
|
filename.toFile(),
|
||||||
new ImageServiceResponse(),
|
new ImageServiceResponse(),
|
||||||
new TableServiceResponse()));
|
new TableServiceResponse(),
|
||||||
|
filename.toFile().toString()));
|
||||||
|
|
||||||
DocumentData documentData = DocumentDataMapper.toDocumentData(documentGraph);
|
DocumentData documentData = DocumentDataMapper.toDocumentData(documentGraph);
|
||||||
ObjectMapper mapper = ObjectMapperFactory.create();
|
ObjectMapper mapper = ObjectMapperFactory.create();
|
||||||
|
|||||||
@ -1,32 +1,13 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.server.graph;
|
package com.knecon.fforesight.service.layoutparser.server.graph;
|
||||||
|
|
||||||
import java.io.FileOutputStream;
|
import java.io.File;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
import org.apache.pdfbox.Loader;
|
|
||||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
|
||||||
import org.junit.jupiter.api.Disabled;
|
|
||||||
import org.junit.jupiter.api.Test;
|
import org.junit.jupiter.api.Test;
|
||||||
import org.springframework.beans.factory.annotation.Autowired;
|
|
||||||
import org.springframework.core.io.ClassPathResource;
|
import org.springframework.core.io.ClassPathResource;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.PropertiesMapper;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.ViewerDocumentService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.ViewerDocumentService;
|
||||||
import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest;
|
import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest;
|
||||||
@ -35,38 +16,17 @@ import lombok.SneakyThrows;
|
|||||||
|
|
||||||
public class ViewerDocumentTest extends BuildDocumentTest {
|
public class ViewerDocumentTest extends BuildDocumentTest {
|
||||||
|
|
||||||
@Autowired
|
|
||||||
private SectionsBuilderService sectionsBuilderService;
|
|
||||||
|
|
||||||
@Autowired
|
|
||||||
private RedactManagerClassificationService redactManagerClassificationService;
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public void testViewerDocument() {
|
public void testViewerDocument() {
|
||||||
|
|
||||||
String fileName = "files/bdr/notMergedParagraphs.pdf";
|
String fileName = "files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
|
||||||
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
|
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
|
||||||
LayoutGridService layoutGridService = new LayoutGridService();
|
LayoutGridService layoutGridService = new LayoutGridService();
|
||||||
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(layoutGridService);
|
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(layoutGridService, null);
|
||||||
Document document = buildGraph(fileName, LayoutParsingType.TAAS);
|
Document document = buildGraph(fileName, LayoutParsingType.REDACT_MANAGER);
|
||||||
try (var pdDocument = Loader.loadPDF(new ClassPathResource(fileName).getFile()); var out = new FileOutputStream(tmpFileName)) {
|
var documentFile = new ClassPathResource(fileName).getFile();
|
||||||
viewerDocumentService.createViewerDocument(pdDocument, document, out, true);
|
viewerDocumentService.createViewerDocument(documentFile, document, new File(tmpFileName), true);
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public ClassificationDocument buildClassificationDocument(PDDocument originDocument) {
|
|
||||||
|
|
||||||
ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
|
||||||
originDocument,
|
|
||||||
new ImageServiceResponse(),
|
|
||||||
new TableServiceResponse());
|
|
||||||
|
|
||||||
redactManagerClassificationService.classifyDocument(classificationDocument);
|
|
||||||
|
|
||||||
sectionsBuilderService.buildSections(classificationDocument);
|
|
||||||
|
|
||||||
return classificationDocument;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -3,6 +3,7 @@ package com.knecon.fforesight.service.layoutparser.server.segmentation;
|
|||||||
import static org.assertj.core.api.Assertions.assertThat;
|
import static org.assertj.core.api.Assertions.assertThat;
|
||||||
|
|
||||||
import java.awt.geom.Rectangle2D;
|
import java.awt.geom.Rectangle2D;
|
||||||
|
import java.io.File;
|
||||||
import java.io.FileOutputStream;
|
import java.io.FileOutputStream;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
@ -15,8 +16,6 @@ import java.util.Locale;
|
|||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import org.apache.pdfbox.Loader;
|
|
||||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
|
||||||
import org.junit.jupiter.api.Disabled;
|
import org.junit.jupiter.api.Disabled;
|
||||||
import org.junit.jupiter.api.Test;
|
import org.junit.jupiter.api.Test;
|
||||||
import org.springframework.beans.factory.annotation.Autowired;
|
import org.springframework.beans.factory.annotation.Autowired;
|
||||||
@ -62,12 +61,13 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
|
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public ClassificationDocument buildClassificationDocument(PDDocument originDocument, TableServiceResponse tableServiceResponse) {
|
public ClassificationDocument buildClassificationDocument(File originDocument, TableServiceResponse tableServiceResponse) {
|
||||||
|
|
||||||
ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||||
originDocument,
|
originDocument,
|
||||||
new ImageServiceResponse(),
|
new ImageServiceResponse(),
|
||||||
tableServiceResponse);
|
tableServiceResponse,
|
||||||
|
"document");
|
||||||
|
|
||||||
redactManagerClassificationService.classifyDocument(classificationDocument);
|
redactManagerClassificationService.classifyDocument(classificationDocument);
|
||||||
|
|
||||||
@ -78,7 +78,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
|
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public ClassificationDocument buildClassificationDocument(PDDocument originDocument) {
|
public ClassificationDocument buildClassificationDocument(File originDocument) {
|
||||||
|
|
||||||
return buildClassificationDocument(originDocument, new TableServiceResponse());
|
return buildClassificationDocument(originDocument, new TableServiceResponse());
|
||||||
}
|
}
|
||||||
@ -89,7 +89,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
|
|
||||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/A20622A izRMS (CZ) fRR Part B9_Page185.pdf");
|
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/A20622A izRMS (CZ) fRR Part B9_Page185.pdf");
|
||||||
|
|
||||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||||
|
|
||||||
toHtml(document, "/tmp/A20622A izRMS (CZ) fRR Part B9_Page185.html");
|
toHtml(document, "/tmp/A20622A izRMS (CZ) fRR Part B9_Page185.html");
|
||||||
|
|
||||||
@ -103,7 +103,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
ClassPathResource cvTablesResource = new ClassPathResource("files/cv_tables/ScanRotationBorder.TABLES.json");
|
ClassPathResource cvTablesResource = new ClassPathResource("files/cv_tables/ScanRotationBorder.TABLES.json");
|
||||||
var tableServiceResponse = objectMapper.readValue(cvTablesResource.getInputStream(), TableServiceResponse.class);
|
var tableServiceResponse = objectMapper.readValue(cvTablesResource.getInputStream(), TableServiceResponse.class);
|
||||||
|
|
||||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()), tableServiceResponse);
|
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile(), tableServiceResponse);
|
||||||
|
|
||||||
toHtml(document, "/tmp/ScanRotationBorder.html");
|
toHtml(document, "/tmp/ScanRotationBorder.html");
|
||||||
}
|
}
|
||||||
@ -116,7 +116,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
ClassPathResource cvTablesResource = new ClassPathResource("files/cv_tables/ScanRotationBorder.TABLES.json");
|
ClassPathResource cvTablesResource = new ClassPathResource("files/cv_tables/ScanRotationBorder.TABLES.json");
|
||||||
var tableServiceResponse = objectMapper.readValue(cvTablesResource.getInputStream(), TableServiceResponse.class);
|
var tableServiceResponse = objectMapper.readValue(cvTablesResource.getInputStream(), TableServiceResponse.class);
|
||||||
|
|
||||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()), tableServiceResponse);
|
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile(), tableServiceResponse);
|
||||||
assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
|
assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
|
||||||
var tables = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList();
|
var tables = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList();
|
||||||
|
|
||||||
@ -156,7 +156,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
|
|
||||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Spanning Cells.pdf");
|
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Spanning Cells.pdf");
|
||||||
|
|
||||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||||
assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
|
assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
|
||||||
TablePageBlock table = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(0);
|
TablePageBlock table = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(0);
|
||||||
assertThat(table.getColCount()).isEqualTo(6);
|
assertThat(table.getColCount()).isEqualTo(6);
|
||||||
@ -170,7 +170,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
|
|
||||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Merge Table.pdf");
|
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Merge Table.pdf");
|
||||||
|
|
||||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||||
assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
|
assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
|
||||||
TablePageBlock firstTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(0);
|
TablePageBlock firstTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(0);
|
||||||
assertThat(firstTable.getColCount()).isEqualTo(8);
|
assertThat(firstTable.getColCount()).isEqualTo(8);
|
||||||
@ -188,7 +188,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
|
|
||||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Merge Multi Page Table.pdf");
|
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Merge Multi Page Table.pdf");
|
||||||
|
|
||||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||||
assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
|
assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
|
||||||
TablePageBlock firstTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(0);
|
TablePageBlock firstTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(0);
|
||||||
assertThat(firstTable.getColCount()).isEqualTo(9);
|
assertThat(firstTable.getColCount()).isEqualTo(9);
|
||||||
@ -206,7 +206,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
|
|
||||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Rotated Table Headers.pdf");
|
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Rotated Table Headers.pdf");
|
||||||
|
|
||||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||||
assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
|
assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
|
||||||
TablePageBlock firstTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(0);
|
TablePageBlock firstTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(0);
|
||||||
assertThat(firstTable.getColCount()).isEqualTo(8);
|
assertThat(firstTable.getColCount()).isEqualTo(8);
|
||||||
@ -224,7 +224,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
|
|
||||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/56 Fludioxonil_RAR_12_Volume_3CA_B-7_2018-02-21_Page170.pdf");
|
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/56 Fludioxonil_RAR_12_Volume_3CA_B-7_2018-02-21_Page170.pdf");
|
||||||
|
|
||||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||||
|
|
||||||
validateTableSize(document, 4);
|
validateTableSize(document, 4);
|
||||||
|
|
||||||
@ -241,7 +241,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
|
|
||||||
ClassPathResource pdfFileResource = new ClassPathResource("files/211.pdf");
|
ClassPathResource pdfFileResource = new ClassPathResource("files/211.pdf");
|
||||||
|
|
||||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||||
|
|
||||||
validateTableSize(document, 4);
|
validateTableSize(document, 4);
|
||||||
|
|
||||||
@ -258,7 +258,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
|
|
||||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/VV-931175_Page1.pdf");
|
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/VV-931175_Page1.pdf");
|
||||||
|
|
||||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||||
|
|
||||||
validateTableSize(document, 1);
|
validateTableSize(document, 1);
|
||||||
|
|
||||||
@ -299,7 +299,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
|
|
||||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/27 A8637C - EU AIR3 - MCP Section 1 - Identity of the plant protection product_Page6.pdf");
|
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/27 A8637C - EU AIR3 - MCP Section 1 - Identity of the plant protection product_Page6.pdf");
|
||||||
|
|
||||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||||
|
|
||||||
toHtml(document, "/tmp/html.html");
|
toHtml(document, "/tmp/html.html");
|
||||||
|
|
||||||
@ -319,7 +319,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
|
|
||||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/A20622A izRMS (CZ) fRR Part B9_Page185.pdf");
|
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/A20622A izRMS (CZ) fRR Part B9_Page185.pdf");
|
||||||
|
|
||||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||||
|
|
||||||
validateTableSize(document, 1);
|
validateTableSize(document, 1);
|
||||||
|
|
||||||
@ -332,7 +332,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
|
|
||||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/A20622A izRMS (CZ) fRR Part B9_Page185_fixed.pdf");
|
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/A20622A izRMS (CZ) fRR Part B9_Page185_fixed.pdf");
|
||||||
|
|
||||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||||
|
|
||||||
validateTableSize(document, 1);
|
validateTableSize(document, 1);
|
||||||
|
|
||||||
@ -345,7 +345,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
|
|
||||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/A20622A izZRMS (CZ) fRR Part B7_Page123.pdf");
|
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/A20622A izZRMS (CZ) fRR Part B7_Page123.pdf");
|
||||||
|
|
||||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||||
|
|
||||||
validateTableSize(document, 6);
|
validateTableSize(document, 6);
|
||||||
|
|
||||||
@ -364,7 +364,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
|
|
||||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/77 Pirimicarb_RAR_08_Volume_3CA_B-6_2017-12-04_Page11.pdf");
|
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/77 Pirimicarb_RAR_08_Volume_3CA_B-6_2017-12-04_Page11.pdf");
|
||||||
|
|
||||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||||
|
|
||||||
validateTableSize(document, 3);
|
validateTableSize(document, 3);
|
||||||
|
|
||||||
@ -380,7 +380,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
|
|
||||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/95 Trinexapac-ethyl_RAR_08_Volume_3CA_B-6_2018-01-10_Page532.pdf");
|
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/95 Trinexapac-ethyl_RAR_08_Volume_3CA_B-6_2018-01-10_Page532.pdf");
|
||||||
|
|
||||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||||
|
|
||||||
validateTableSize(document, 1);
|
validateTableSize(document, 1);
|
||||||
validateTable(document, 0, 9, 9, 0, 0);
|
validateTable(document, 0, 9, 9, 0, 0);
|
||||||
@ -393,7 +393,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
|
|
||||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/52 Fludioxonil_RAR_07_Volume_3CA_B-5_2018-02-21_Page175.pdf");
|
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/52 Fludioxonil_RAR_07_Volume_3CA_B-5_2018-02-21_Page175.pdf");
|
||||||
|
|
||||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||||
|
|
||||||
validateTableSize(document, 1);
|
validateTableSize(document, 1);
|
||||||
|
|
||||||
@ -407,7 +407,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
|
|
||||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/52 Fludioxonil_RAR_07_Volume_3CA_B-5_2018-02-21_Page174.pdf");
|
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/52 Fludioxonil_RAR_07_Volume_3CA_B-5_2018-02-21_Page174.pdf");
|
||||||
|
|
||||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||||
|
|
||||||
validateTableSize(document, 1);
|
validateTableSize(document, 1);
|
||||||
validateTable(document, 0, 9, 6, 7, 0);
|
validateTable(document, 0, 9, 6, 7, 0);
|
||||||
@ -420,7 +420,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
|
|
||||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/19 Chlorothalonil RAR 08 Volume 3CA B 6b metabolites Oct 2017_Page35.pdf");
|
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/19 Chlorothalonil RAR 08 Volume 3CA B 6b metabolites Oct 2017_Page35.pdf");
|
||||||
|
|
||||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||||
|
|
||||||
validateTableSize(document, 1);
|
validateTableSize(document, 1);
|
||||||
validateTable(document, 0, 10, 6, 0, 0);
|
validateTable(document, 0, 10, 6, 0, 0);
|
||||||
@ -433,7 +433,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
|
|
||||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/19 Chlorothalonil RAR 08 Volume 3CA B 6b metabolites Oct 2017_Page161.pdf");
|
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/19 Chlorothalonil RAR 08 Volume 3CA B 6b metabolites Oct 2017_Page161.pdf");
|
||||||
|
|
||||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||||
|
|
||||||
validateTableSize(document, 2);
|
validateTableSize(document, 2);
|
||||||
validateTable(document, 0, 2, 2, 0, 0);
|
validateTable(document, 0, 2, 2, 0, 0);
|
||||||
@ -448,7 +448,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
ClassPathResource pdfFileResource = new ClassPathResource(
|
ClassPathResource pdfFileResource = new ClassPathResource(
|
||||||
"files/SinglePages/47 Cyprodinil - EU AIR3 - MCA Section 5 Supplement - Toxicological and metabolism studies on the active substance_Page30.pdf");
|
"files/SinglePages/47 Cyprodinil - EU AIR3 - MCA Section 5 Supplement - Toxicological and metabolism studies on the active substance_Page30.pdf");
|
||||||
|
|
||||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||||
|
|
||||||
validateTableSize(document, 2);
|
validateTableSize(document, 2);
|
||||||
|
|
||||||
@ -464,7 +464,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
ClassPathResource pdfFileResource = new ClassPathResource(
|
ClassPathResource pdfFileResource = new ClassPathResource(
|
||||||
"files/SinglePages/49 Cyprodinil - EU AIR3 - MCA Section 8 Supplement - Ecotoxicological studies on the active substance_Page61.pdf");
|
"files/SinglePages/49 Cyprodinil - EU AIR3 - MCA Section 8 Supplement - Ecotoxicological studies on the active substance_Page61.pdf");
|
||||||
|
|
||||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||||
|
|
||||||
validateTableSize(document, 2);
|
validateTableSize(document, 2);
|
||||||
|
|
||||||
@ -479,7 +479,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
|
|
||||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/81 Pirimicarb_RAR_20_Volume_3CP_A10788A (_Pirimor_)_B-9_2017-12-04_Page54.pdf");
|
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/81 Pirimicarb_RAR_20_Volume_3CP_A10788A (_Pirimor_)_B-9_2017-12-04_Page54.pdf");
|
||||||
|
|
||||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||||
|
|
||||||
validateTableSize(document, 2);
|
validateTableSize(document, 2);
|
||||||
|
|
||||||
@ -494,7 +494,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
|
|
||||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/85 Pydiflumetofen_DAR_08_Volume_3CA_B-6_2017-07-26_Page134.pdf");
|
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/85 Pydiflumetofen_DAR_08_Volume_3CA_B-6_2017-07-26_Page134.pdf");
|
||||||
|
|
||||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||||
|
|
||||||
validateTableSize(document, 2);
|
validateTableSize(document, 2);
|
||||||
|
|
||||||
@ -509,7 +509,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
|
|
||||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/Thiabendazole DAR Addendum for ED_April_2020_Page18.pdf");
|
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/Thiabendazole DAR Addendum for ED_April_2020_Page18.pdf");
|
||||||
|
|
||||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||||
|
|
||||||
validateTableSize(document, 4);
|
validateTableSize(document, 4);
|
||||||
|
|
||||||
@ -526,7 +526,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
|
|
||||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/15 - Pretilachlor - Acute Oral Toxicity (Up and Down Procedure) - Rat_Page18.pdf");
|
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/15 - Pretilachlor - Acute Oral Toxicity (Up and Down Procedure) - Rat_Page18.pdf");
|
||||||
|
|
||||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||||
|
|
||||||
validateTableSize(document, 1);
|
validateTableSize(document, 1);
|
||||||
|
|
||||||
@ -541,7 +541,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
ClassPathResource pdfFileResource = new ClassPathResource(
|
ClassPathResource pdfFileResource = new ClassPathResource(
|
||||||
"files/SinglePages/28 A8637C - EU AIR3 - MCP Section 10 - Ecotoxicological studies on the plant protection product_Page23.pdf");
|
"files/SinglePages/28 A8637C - EU AIR3 - MCP Section 10 - Ecotoxicological studies on the plant protection product_Page23.pdf");
|
||||||
|
|
||||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||||
|
|
||||||
validateTableSize(document, 2);
|
validateTableSize(document, 2);
|
||||||
|
|
||||||
@ -556,7 +556,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
|
|
||||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/24 - SYN549522 - Acute Oral Toxicity - Rats_Page17.pdf");
|
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/24 - SYN549522 - Acute Oral Toxicity - Rats_Page17.pdf");
|
||||||
|
|
||||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||||
|
|
||||||
validateTableSize(document, 1);
|
validateTableSize(document, 1);
|
||||||
|
|
||||||
@ -570,7 +570,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
|
|
||||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/30 - Dicamba - Acute Oral Toxicity - Rats_Page5.pdf");
|
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/30 - Dicamba - Acute Oral Toxicity - Rats_Page5.pdf");
|
||||||
|
|
||||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||||
|
|
||||||
validateTableSize(document, 1);
|
validateTableSize(document, 1);
|
||||||
|
|
||||||
|
|||||||
@ -58,7 +58,7 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
|
|||||||
public void testTableExtraction() {
|
public void testTableExtraction() {
|
||||||
|
|
||||||
LayoutGridService layoutGridService = new LayoutGridService();
|
LayoutGridService layoutGridService = new LayoutGridService();
|
||||||
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(layoutGridService);
|
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(layoutGridService, null);
|
||||||
|
|
||||||
ClassPathResource resource = new ClassPathResource("files");
|
ClassPathResource resource = new ClassPathResource("files");
|
||||||
List<String> pdfFileNames = Files.walk(resource.getFile().toPath())
|
List<String> pdfFileNames = Files.walk(resource.getFile().toPath())
|
||||||
@ -77,13 +77,15 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
|
|||||||
private void writeJsons(Path filename) {
|
private void writeJsons(Path filename) {
|
||||||
|
|
||||||
Document documentGraphBefore = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
Document documentGraphBefore = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||||
Loader.loadPDF(filename.toFile()),
|
filename.toFile(),
|
||||||
new ImageServiceResponse(),
|
new ImageServiceResponse(),
|
||||||
new TableServiceResponse()));
|
new TableServiceResponse(),
|
||||||
|
filename.toFile().toString()));
|
||||||
Document documentGraphAfter = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
Document documentGraphAfter = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||||
Loader.loadPDF(filename.toFile()),
|
filename.toFile(),
|
||||||
new ImageServiceResponse(),
|
new ImageServiceResponse(),
|
||||||
new TableServiceResponse()));
|
new TableServiceResponse(),
|
||||||
|
filename.toFile().toString()));
|
||||||
DocumentData documentDataBefore = DocumentDataMapper.toDocumentData(documentGraphBefore);
|
DocumentData documentDataBefore = DocumentDataMapper.toDocumentData(documentGraphBefore);
|
||||||
DocumentData documentDataAfter = DocumentDataMapper.toDocumentData(documentGraphAfter);
|
DocumentData documentDataAfter = DocumentDataMapper.toDocumentData(documentGraphAfter);
|
||||||
if (!compareStructures(documentDataBefore.getDocumentStructure(), documentDataAfter.getDocumentStructure(), filename.getFileName().toString())) {
|
if (!compareStructures(documentDataBefore.getDocumentStructure(), documentDataAfter.getDocumentStructure(), filename.getFileName().toString())) {
|
||||||
|
|||||||
@ -1,5 +1,26 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.server.utils;
|
package com.knecon.fforesight.service.layoutparser.server.utils;
|
||||||
|
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.Optional;
|
||||||
|
|
||||||
|
import org.junit.jupiter.api.AfterEach;
|
||||||
|
import org.junit.jupiter.api.BeforeEach;
|
||||||
|
import org.junit.jupiter.api.extension.ExtendWith;
|
||||||
|
import org.springframework.amqp.rabbit.core.RabbitTemplate;
|
||||||
|
import org.springframework.beans.factory.annotation.Autowired;
|
||||||
|
import org.springframework.boot.autoconfigure.EnableAutoConfiguration;
|
||||||
|
import org.springframework.boot.autoconfigure.amqp.RabbitAutoConfiguration;
|
||||||
|
import org.springframework.boot.test.context.SpringBootTest;
|
||||||
|
import org.springframework.boot.test.mock.mockito.MockBean;
|
||||||
|
import org.springframework.context.annotation.Bean;
|
||||||
|
import org.springframework.context.annotation.ComponentScan;
|
||||||
|
import org.springframework.context.annotation.Configuration;
|
||||||
|
import org.springframework.context.annotation.Import;
|
||||||
|
import org.springframework.context.annotation.Primary;
|
||||||
|
import org.springframework.core.io.ClassPathResource;
|
||||||
|
import org.springframework.test.context.junit.jupiter.SpringExtension;
|
||||||
|
|
||||||
import com.iqser.red.commons.jackson.ObjectMapperFactory;
|
import com.iqser.red.commons.jackson.ObjectMapperFactory;
|
||||||
import com.iqser.red.storage.commons.service.StorageService;
|
import com.iqser.red.storage.commons.service.StorageService;
|
||||||
import com.iqser.red.storage.commons.utils.FileSystemBackedStorageService;
|
import com.iqser.red.storage.commons.utils.FileSystemBackedStorageService;
|
||||||
@ -9,22 +30,8 @@ import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingStorage
|
|||||||
import com.knecon.fforesight.service.layoutparser.server.Application;
|
import com.knecon.fforesight.service.layoutparser.server.Application;
|
||||||
import com.knecon.fforesight.tenantcommons.TenantContext;
|
import com.knecon.fforesight.tenantcommons.TenantContext;
|
||||||
import com.knecon.fforesight.tenantcommons.TenantsClient;
|
import com.knecon.fforesight.tenantcommons.TenantsClient;
|
||||||
import lombok.SneakyThrows;
|
|
||||||
import org.junit.jupiter.api.AfterEach;
|
|
||||||
import org.junit.jupiter.api.BeforeEach;
|
|
||||||
import org.junit.jupiter.api.extension.ExtendWith;
|
|
||||||
import org.springframework.amqp.rabbit.core.RabbitTemplate;
|
|
||||||
import org.springframework.beans.factory.annotation.Autowired;
|
|
||||||
import org.springframework.boot.autoconfigure.EnableAutoConfiguration;
|
|
||||||
import org.springframework.boot.autoconfigure.amqp.RabbitAutoConfiguration;
|
|
||||||
import org.springframework.boot.test.context.SpringBootTest;
|
|
||||||
import org.springframework.boot.test.mock.mockito.MockBean;
|
|
||||||
import org.springframework.context.annotation.*;
|
|
||||||
import org.springframework.core.io.ClassPathResource;
|
|
||||||
import org.springframework.test.context.junit.jupiter.SpringExtension;
|
|
||||||
|
|
||||||
import java.io.InputStream;
|
import lombok.SneakyThrows;
|
||||||
import java.util.Optional;
|
|
||||||
|
|
||||||
@ExtendWith(SpringExtension.class)
|
@ExtendWith(SpringExtension.class)
|
||||||
@SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT)
|
@SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT)
|
||||||
@ -100,9 +107,11 @@ public abstract class AbstractTest {
|
|||||||
return buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER);
|
return buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
protected LayoutParsingRequest buildDefaultLayoutParsingRequest(LayoutParsingType layoutParsingType) {
|
protected LayoutParsingRequest buildDefaultLayoutParsingRequest(LayoutParsingType layoutParsingType) {
|
||||||
|
|
||||||
return LayoutParsingRequest.builder()
|
return LayoutParsingRequest.builder()
|
||||||
|
.identifier(Map.of("fileId", "1337"))
|
||||||
.layoutParsingType(layoutParsingType)
|
.layoutParsingType(layoutParsingType)
|
||||||
.originFileStorageId(ORIGIN_FILE_ID)
|
.originFileStorageId(ORIGIN_FILE_ID)
|
||||||
.tablesFileStorageId(Optional.of(TABLE_FILE_ID))
|
.tablesFileStorageId(Optional.of(TABLE_FILE_ID))
|
||||||
@ -116,6 +125,7 @@ public abstract class AbstractTest {
|
|||||||
.build();
|
.build();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
protected LayoutParsingRequest prepareStorage(String file, String cvServiceResponseFile, String imageInfoFile) {
|
protected LayoutParsingRequest prepareStorage(String file, String cvServiceResponseFile, String imageInfoFile) {
|
||||||
|
|
||||||
@ -152,7 +162,6 @@ public abstract class AbstractTest {
|
|||||||
@ComponentScan("com.knecon.fforesight.service.layoutparser")
|
@ComponentScan("com.knecon.fforesight.service.layoutparser")
|
||||||
public static class TestConfiguration {
|
public static class TestConfiguration {
|
||||||
|
|
||||||
|
|
||||||
@Bean
|
@Bean
|
||||||
@Primary
|
@Primary
|
||||||
public StorageService inmemoryStorage() {
|
public StorageService inmemoryStorage() {
|
||||||
|
|||||||
@ -1,9 +1,7 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.server.utils;
|
package com.knecon.fforesight.service.layoutparser.server.utils;
|
||||||
|
|
||||||
import java.io.InputStream;
|
import java.io.File;
|
||||||
|
|
||||||
import org.apache.pdfbox.Loader;
|
|
||||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
|
||||||
import org.springframework.beans.factory.annotation.Autowired;
|
import org.springframework.beans.factory.annotation.Autowired;
|
||||||
import org.springframework.core.io.ClassPathResource;
|
import org.springframework.core.io.ClassPathResource;
|
||||||
|
|
||||||
@ -25,11 +23,9 @@ public abstract class BuildDocumentTest extends AbstractTest {
|
|||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
protected ClassificationDocument parseLayout(String filename, LayoutParsingType layoutParsingType) {
|
protected ClassificationDocument parseLayout(String filename, LayoutParsingType layoutParsingType) {
|
||||||
|
|
||||||
ClassPathResource fileResource = new ClassPathResource(filename);
|
File fileResource = new ClassPathResource(filename).getFile();
|
||||||
prepareStorage(filename);
|
prepareStorage(filename);
|
||||||
try (InputStream inputStream = fileResource.getInputStream(); PDDocument pdDocument = Loader.loadPDF(inputStream.readAllBytes())) {
|
return layoutParsingPipeline.parseLayout(layoutParsingType, fileResource, layoutParsingStorageService.getImagesFile(IMAGE_FILE_ID), new TableServiceResponse(), filename);
|
||||||
return layoutParsingPipeline.parseLayout(layoutParsingType, pdDocument, layoutParsingStorageService.getImagesFile(IMAGE_FILE_ID), new TableServiceResponse());
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -7,6 +7,9 @@ fforesight.tenants.remote: true
|
|||||||
server:
|
server:
|
||||||
port: 8080
|
port: 8080
|
||||||
|
|
||||||
|
logging.pattern.level: "%5p [${spring.application.name},%X{traceId:-},%X{spanId:-}]"
|
||||||
|
logging.type: ${LOGGING_TYPE:CONSOLE}
|
||||||
|
|
||||||
spring:
|
spring:
|
||||||
main:
|
main:
|
||||||
allow-circular-references: true # FIXME
|
allow-circular-references: true # FIXME
|
||||||
|
|||||||
@ -1,16 +0,0 @@
|
|||||||
<Configuration>
|
|
||||||
|
|
||||||
<Appenders>
|
|
||||||
<Console name="CONSOLE" target="SYSTEM_OUT">
|
|
||||||
<PatternLayout pattern="%d{HH:mm:ss.SSS} [%t] %-5level %logger{36} - %msg%n"/>
|
|
||||||
</Console>
|
|
||||||
</Appenders>
|
|
||||||
|
|
||||||
<Loggers>
|
|
||||||
<Root level="warn">
|
|
||||||
<AppenderRef ref="CONSOLE"/>
|
|
||||||
</Root>
|
|
||||||
<Logger name="com.iqser" level="info"/>
|
|
||||||
</Loggers>
|
|
||||||
|
|
||||||
</Configuration>
|
|
||||||
@ -0,0 +1,17 @@
|
|||||||
|
<configuration>
|
||||||
|
|
||||||
|
<springProperty scope="configuration" name="logType" source="logging.type"/>
|
||||||
|
<springProperty scope="context" name="application.name" source="spring.application.name"/>
|
||||||
|
<springProperty scope="context" name="version" source="project.version"/>
|
||||||
|
<include resource="org/springframework/boot/logging/logback/defaults.xml"/>
|
||||||
|
<include resource="org/springframework/boot/logging/logback/console-appender.xml"/>
|
||||||
|
|
||||||
|
<appender name="JSON" class="ch.qos.logback.core.ConsoleAppender">
|
||||||
|
<encoder class="net.logstash.logback.encoder.LogstashEncoder"/>
|
||||||
|
</appender>
|
||||||
|
|
||||||
|
<root level="INFO">
|
||||||
|
<appender-ref ref="${logType}"/>
|
||||||
|
</root>
|
||||||
|
|
||||||
|
</configuration>
|
||||||
@ -2,7 +2,14 @@
|
|||||||
dir=${PWD##*/}
|
dir=${PWD##*/}
|
||||||
gradle assemble
|
gradle assemble
|
||||||
|
|
||||||
buildNumber=${1:-1}
|
# Get the current Git branch
|
||||||
|
branch=$(git rev-parse --abbrev-ref HEAD)
|
||||||
|
|
||||||
gradle bootBuildImage --cleanCache --publishImage -PbuildbootDockerHostNetwork=true -Pversion=$USER-$buildNumber
|
# Get the short commit hash (first 5 characters)
|
||||||
echo "nexus.knecon.com:5001/red/${dir}-server-v1:$USER-$buildNumber"
|
commit_hash=$(git rev-parse --short=5 HEAD)
|
||||||
|
|
||||||
|
# Combine branch and commit hash
|
||||||
|
buildName="${USER}-${branch}-${commit_hash}"
|
||||||
|
|
||||||
|
gradle bootBuildImage --publishImage -PbuildbootDockerHostNetwork=true -Pversion=$buildName --no-build-cache
|
||||||
|
echo "nexus.knecon.com:5001/ff/${dir}-service-server:$buildName"
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user