Merge branch 'RED-7384' into 'main'
RED-7384: fixes for migration See merge request fforesight/layout-parser!91
This commit is contained in:
commit
760a809900
@ -24,6 +24,8 @@ tasks.named<Test>("test") {
|
||||
reports {
|
||||
junitXml.outputLocation.set(layout.buildDirectory.dir("reports/junit"))
|
||||
}
|
||||
minHeapSize = "512m"
|
||||
maxHeapSize = "2048m"
|
||||
}
|
||||
|
||||
tasks.test {
|
||||
|
||||
@ -15,8 +15,8 @@ dependencies {
|
||||
exclude("org.springframework.boot", "spring-boot-starter-security")
|
||||
exclude("org.springframework.boot", "spring-boot-starter-validation")
|
||||
}
|
||||
implementation("com.knecon.fforesight:tenant-commons:0.10.0")
|
||||
implementation("com.iqser.red.commons:storage-commons:2.40.0")
|
||||
implementation("com.knecon.fforesight:tenant-commons:0.19.0")
|
||||
implementation("com.iqser.red.commons:storage-commons:2.45.0")
|
||||
|
||||
implementation("org.apache.pdfbox:pdfbox:${pdfBoxVersion}")
|
||||
implementation("org.apache.pdfbox:pdfbox-tools:${pdfBoxVersion}")
|
||||
|
||||
@ -3,13 +3,16 @@ package com.knecon.fforesight.service.layoutparser.processor;
|
||||
import static java.lang.String.format;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.atomic.AtomicReference;
|
||||
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
||||
@ -51,91 +54,121 @@ import com.knecon.fforesight.service.layoutparser.processor.services.parsing.PDF
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.ViewerDocumentService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
||||
|
||||
import io.micrometer.observation.Observation;
|
||||
import io.micrometer.observation.ObservationRegistry;
|
||||
import io.micrometer.observation.annotation.Observed;
|
||||
import lombok.AccessLevel;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@SuppressWarnings("PMD.CloseResource")
|
||||
@Slf4j
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class LayoutParsingPipeline {
|
||||
|
||||
private final ImageServiceResponseAdapter imageServiceResponseAdapter;
|
||||
private final CvTableParsingAdapter cvTableParsingAdapter;
|
||||
private final LayoutParsingStorageService layoutParsingStorageService;
|
||||
private final SectionsBuilderService sectionsBuilderService;
|
||||
private final TaasClassificationService taasClassificationService;
|
||||
private final RedactManagerClassificationService redactManagerClassificationService;
|
||||
private final DocuMineClassificationService docuMineClassificationService;
|
||||
private final SimplifiedSectionTextService simplifiedSectionTextService;
|
||||
private final BodyTextFrameService bodyTextFrameService;
|
||||
private final RulingCleaningService rulingCleaningService;
|
||||
private final TableExtractionService tableExtractionService;
|
||||
private final TaasBlockificationService taasBlockificationService;
|
||||
private final DocuMineBlockificationService docuMineBlockificationService;
|
||||
private final RedactManagerBlockificationService redactManagerBlockificationService;
|
||||
private final ViewerDocumentService viewerDocumentService;
|
||||
ImageServiceResponseAdapter imageServiceResponseAdapter;
|
||||
CvTableParsingAdapter cvTableParsingAdapter;
|
||||
LayoutParsingStorageService layoutParsingStorageService;
|
||||
SectionsBuilderService sectionsBuilderService;
|
||||
TaasClassificationService taasClassificationService;
|
||||
RedactManagerClassificationService redactManagerClassificationService;
|
||||
DocuMineClassificationService docuMineClassificationService;
|
||||
SimplifiedSectionTextService simplifiedSectionTextService;
|
||||
BodyTextFrameService bodyTextFrameService;
|
||||
RulingCleaningService rulingCleaningService;
|
||||
TableExtractionService tableExtractionService;
|
||||
TaasBlockificationService taasBlockificationService;
|
||||
DocuMineBlockificationService docuMineBlockificationService;
|
||||
RedactManagerBlockificationService redactManagerBlockificationService;
|
||||
ViewerDocumentService viewerDocumentService;
|
||||
ObservationRegistry observationRegistry;
|
||||
|
||||
|
||||
public LayoutParsingFinishedEvent parseLayoutAndSaveFilesToStorage(LayoutParsingRequest layoutParsingRequest) throws IOException {
|
||||
|
||||
long start = System.currentTimeMillis();
|
||||
log.info("Starting layout parsing for {}", layoutParsingRequest.identifier());
|
||||
|
||||
try (PDDocument originDocument = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId())) {
|
||||
ImageServiceResponse imageServiceResponse = new ImageServiceResponse();
|
||||
if (layoutParsingRequest.imagesFileStorageId().isPresent()) {
|
||||
imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId().get());
|
||||
}
|
||||
File originFile = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId());
|
||||
File viewerDocumentFile = File.createTempFile("viewer_document", ".pdf");
|
||||
|
||||
TableServiceResponse tableServiceResponse = new TableServiceResponse();
|
||||
if (layoutParsingRequest.tablesFileStorageId().isPresent()) {
|
||||
tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId().get());
|
||||
}
|
||||
|
||||
ClassificationDocument classificationDocument = parseLayout(layoutParsingRequest.layoutParsingType(), originDocument, imageServiceResponse, tableServiceResponse);
|
||||
Document documentGraph = DocumentGraphFactory.buildDocumentGraph(classificationDocument);
|
||||
|
||||
int numberOfPages = originDocument.getNumberOfPages();
|
||||
|
||||
layoutParsingStorageService.storeDocumentData(layoutParsingRequest, DocumentDataMapper.toDocumentData(documentGraph));
|
||||
layoutParsingStorageService.storeSimplifiedText(layoutParsingRequest, simplifiedSectionTextService.toSimplifiedText(documentGraph));
|
||||
|
||||
try (var out = new ByteArrayOutputStream()) {
|
||||
viewerDocumentService.createViewerDocument(originDocument, documentGraph, out, false);
|
||||
layoutParsingStorageService.storeViewerDocument(layoutParsingRequest, out);
|
||||
}
|
||||
|
||||
if (layoutParsingRequest.layoutParsingType().equals(LayoutParsingType.TAAS)) {
|
||||
var researchDocumentData = TaasDocumentDataMapper.fromDocument(documentGraph);
|
||||
layoutParsingStorageService.storeResearchDocumentData(layoutParsingRequest, researchDocumentData);
|
||||
}
|
||||
|
||||
return LayoutParsingFinishedEvent.builder()
|
||||
.identifier(layoutParsingRequest.identifier())
|
||||
.numberOfPages(numberOfPages)
|
||||
.duration(System.currentTimeMillis() - start)
|
||||
.message(format("""
|
||||
Layout parsing has finished in %.02f s.
|
||||
identifiers: %s
|
||||
%s
|
||||
Files have been saved with Ids:
|
||||
Structure: %s
|
||||
Text: %s
|
||||
Positions: %s
|
||||
PageData: %s
|
||||
Simplified Text: %s
|
||||
Viewer Doc: %s""",
|
||||
((float) (System.currentTimeMillis() - start)) / 1000,
|
||||
layoutParsingRequest.identifier(),
|
||||
buildSemanticNodeCountMessage(numberOfPages, documentGraph.buildSemanticNodeCounts()),
|
||||
layoutParsingRequest.structureFileStorageId(),
|
||||
layoutParsingRequest.textBlockFileStorageId(),
|
||||
layoutParsingRequest.positionBlockFileStorageId(),
|
||||
layoutParsingRequest.pageFileStorageId(),
|
||||
layoutParsingRequest.simplifiedTextStorageId(),
|
||||
layoutParsingRequest.viewerDocumentStorageId()))
|
||||
.build();
|
||||
ImageServiceResponse imageServiceResponse = new ImageServiceResponse();
|
||||
if (layoutParsingRequest.imagesFileStorageId().isPresent()) {
|
||||
imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId().get());
|
||||
}
|
||||
|
||||
TableServiceResponse tableServiceResponse = new TableServiceResponse();
|
||||
if (layoutParsingRequest.tablesFileStorageId().isPresent()) {
|
||||
tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId().get());
|
||||
}
|
||||
|
||||
ClassificationDocument classificationDocument = parseLayout(layoutParsingRequest.layoutParsingType(),
|
||||
originFile,
|
||||
imageServiceResponse,
|
||||
tableServiceResponse,
|
||||
layoutParsingRequest.identifier().toString());
|
||||
log.info("Building document graph for {}", layoutParsingRequest.identifier());
|
||||
|
||||
Document documentGraph = observeBuildDocumentGraph(classificationDocument);
|
||||
|
||||
log.info("Storing resulting files for {}", layoutParsingRequest.identifier());
|
||||
layoutParsingStorageService.storeDocumentData(layoutParsingRequest, DocumentDataMapper.toDocumentData(documentGraph));
|
||||
layoutParsingStorageService.storeSimplifiedText(layoutParsingRequest, simplifiedSectionTextService.toSimplifiedText(documentGraph));
|
||||
|
||||
log.info("Creating viewer document for {}", layoutParsingRequest.identifier());
|
||||
viewerDocumentService.createViewerDocument(originFile, documentGraph, viewerDocumentFile, false);
|
||||
layoutParsingStorageService.storeViewerDocument(layoutParsingRequest, viewerDocumentFile);
|
||||
if (layoutParsingRequest.layoutParsingType().equals(LayoutParsingType.TAAS)) {
|
||||
log.info("Building research document data for {}", layoutParsingRequest.identifier());
|
||||
var researchDocumentData = TaasDocumentDataMapper.fromDocument(documentGraph);
|
||||
layoutParsingStorageService.storeResearchDocumentData(layoutParsingRequest, researchDocumentData);
|
||||
}
|
||||
|
||||
originFile.delete();
|
||||
viewerDocumentFile.delete();
|
||||
|
||||
return LayoutParsingFinishedEvent.builder()
|
||||
.identifier(layoutParsingRequest.identifier())
|
||||
.numberOfPages(documentGraph.getNumberOfPages())
|
||||
.duration(System.currentTimeMillis() - start)
|
||||
.message(format("""
|
||||
Layout parsing has finished in %.02f s.
|
||||
identifiers: %s
|
||||
%s
|
||||
Files have been saved with Ids:
|
||||
Structure: %s
|
||||
Text: %s
|
||||
Positions: %s
|
||||
PageData: %s
|
||||
Simplified Text: %s
|
||||
Viewer Doc: %s""",
|
||||
((float) (System.currentTimeMillis() - start)) / 1000,
|
||||
layoutParsingRequest.identifier(),
|
||||
buildSemanticNodeCountMessage(documentGraph.getNumberOfPages(), documentGraph.buildSemanticNodeCounts()),
|
||||
layoutParsingRequest.structureFileStorageId(),
|
||||
layoutParsingRequest.textBlockFileStorageId(),
|
||||
layoutParsingRequest.positionBlockFileStorageId(),
|
||||
layoutParsingRequest.pageFileStorageId(),
|
||||
layoutParsingRequest.simplifiedTextStorageId(),
|
||||
layoutParsingRequest.viewerDocumentStorageId()))
|
||||
.build();
|
||||
|
||||
}
|
||||
|
||||
|
||||
private Document observeBuildDocumentGraph(ClassificationDocument classificationDocument) {
|
||||
|
||||
AtomicReference<Document> documentReference = new AtomicReference<>();
|
||||
|
||||
Observation.createNotStarted("LayoutParsingPipeline", observationRegistry).contextualName("build-document-graph").observe(() -> {
|
||||
documentReference.set(DocumentGraphFactory.buildDocumentGraph(classificationDocument));
|
||||
});
|
||||
|
||||
return documentReference.get();
|
||||
}
|
||||
|
||||
|
||||
@ -154,21 +187,36 @@ public class LayoutParsingPipeline {
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
@Observed(name = "LayoutParsingPipeline", contextualName = "parse-layout")
|
||||
public ClassificationDocument parseLayout(LayoutParsingType layoutParsingType,
|
||||
PDDocument originDocument,
|
||||
File originFile,
|
||||
ImageServiceResponse imageServiceResponse,
|
||||
TableServiceResponse tableServiceResponse) {
|
||||
TableServiceResponse tableServiceResponse,
|
||||
String identifier) {
|
||||
|
||||
PDDocument originDocument = openDocument(originFile);
|
||||
addNumberOfPagesToTrace(originDocument.getNumberOfPages(), Files.size(originFile.toPath()));
|
||||
Map<Integer, List<TableCells>> pdfTableCells = cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse);
|
||||
Map<Integer, List<ClassifiedImage>> pdfImages = imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse);
|
||||
|
||||
ClassificationDocument classificationDocument = new ClassificationDocument();
|
||||
List<ClassificationPage> classificationPages = new ArrayList<>();
|
||||
|
||||
originDocument.setAllSecurityToBeRemoved(true);
|
||||
long pageCount = originDocument.getNumberOfPages();
|
||||
|
||||
for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++) {
|
||||
|
||||
if (pageNumber % 100 == 0) {
|
||||
// re-open document every once in a while to save on RAM. This has no significant performance impact.
|
||||
// This is due to PDFBox caching all images and some other stuff with Soft References. This dereferences them and forces the freeing of memory.
|
||||
originDocument.close();
|
||||
originDocument = openDocument(originFile);
|
||||
}
|
||||
|
||||
if (pageNumber % 100 == 0 || pageNumber == pageCount || pageNumber == 1) {
|
||||
log.info("Extracting text on Page {} for {}", pageNumber, identifier);
|
||||
}
|
||||
|
||||
classificationDocument.setPages(classificationPages);
|
||||
PDFLinesTextStripper stripper = new PDFLinesTextStripper();
|
||||
PDPage pdPage = originDocument.getPage(pageNumber - 1);
|
||||
@ -218,21 +266,42 @@ public class LayoutParsingPipeline {
|
||||
classificationPages.add(classificationPage);
|
||||
}
|
||||
|
||||
bodyTextFrameService.setBodyTextFrames(classificationDocument, layoutParsingType);
|
||||
originDocument.close();
|
||||
|
||||
log.info("Calculating BodyTextFrame for {}", identifier);
|
||||
bodyTextFrameService.setBodyTextFrames(classificationDocument, layoutParsingType);
|
||||
log.info("Classify TextBlocks for {}", identifier);
|
||||
switch (layoutParsingType) {
|
||||
case TAAS -> taasClassificationService.classifyDocument(classificationDocument);
|
||||
case DOCUMINE -> docuMineClassificationService.classifyDocument(classificationDocument);
|
||||
case REDACT_MANAGER -> redactManagerClassificationService.classifyDocument(classificationDocument);
|
||||
}
|
||||
|
||||
log.info("Building Sections for {}", identifier);
|
||||
sectionsBuilderService.buildSections(classificationDocument);
|
||||
sectionsBuilderService.addImagesToSections(classificationDocument);
|
||||
|
||||
return classificationDocument;
|
||||
}
|
||||
|
||||
|
||||
private void addNumberOfPagesToTrace(int numberOfPages, long size) {
|
||||
|
||||
if (observationRegistry.getCurrentObservation() != null) {
|
||||
observationRegistry.getCurrentObservation().highCardinalityKeyValue("numberOfPages", String.valueOf(numberOfPages));
|
||||
observationRegistry.getCurrentObservation().highCardinalityKeyValue("fileSize", String.valueOf(size));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private PDDocument openDocument(File originFile) {
|
||||
|
||||
PDDocument document = Loader.loadPDF(originFile);
|
||||
document.setAllSecurityToBeRemoved(true);
|
||||
return document;
|
||||
}
|
||||
|
||||
|
||||
private Map<String, List<Rectangle2D>> convertMarkedContents(List<PDMarkedContent> pdMarkedContents) {
|
||||
|
||||
Map<String, List<Rectangle2D>> markedContentBboxes = new HashMap<>();
|
||||
@ -244,9 +313,9 @@ public class LayoutParsingPipeline {
|
||||
|
||||
private void increaseDocumentStatistics(ClassificationPage classificationPage, ClassificationDocument document) {
|
||||
|
||||
if (!classificationPage.isLandscape()) {
|
||||
document.getFontSizeCounter().addAll(classificationPage.getFontSizeCounter().getCountPerValue());
|
||||
}
|
||||
if (!classificationPage.isLandscape()) {
|
||||
document.getFontSizeCounter().addAll(classificationPage.getFontSizeCounter().getCountPerValue());
|
||||
}
|
||||
document.getFontCounter().addAll(classificationPage.getFontCounter().getCountPerValue());
|
||||
document.getTextHeightCounter().addAll(classificationPage.getTextHeightCounter().getCountPerValue());
|
||||
document.getFontStyleCounter().addAll(classificationPage.getFontStyleCounter().getCountPerValue());
|
||||
|
||||
@ -1,9 +1,7 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.File;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.nio.file.Files;
|
||||
@ -11,7 +9,6 @@ import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
import java.nio.file.StandardOpenOption;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.springframework.stereotype.Service;
|
||||
@ -26,6 +23,7 @@ import com.knecon.fforesight.service.layoutparser.processor.python_api.model.ima
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||
import com.knecon.fforesight.tenantcommons.TenantContext;
|
||||
|
||||
import io.micrometer.observation.annotation.Observed;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
@ -39,16 +37,18 @@ public class LayoutParsingStorageService {
|
||||
private final ObjectMapper objectMapper;
|
||||
|
||||
|
||||
public PDDocument getOriginFile(String storageId) throws IOException {
|
||||
public PDDocument getOriginDocument(String storageId) throws IOException {
|
||||
|
||||
try (var originDocumentInputStream = getObject(storageId)) {
|
||||
File tempFile = createTempFile("document", ".pdf");
|
||||
try (var tempFileOutputStream = new FileOutputStream(tempFile)) {
|
||||
IOUtils.copy(originDocumentInputStream, tempFileOutputStream);
|
||||
originDocumentInputStream.close();
|
||||
}
|
||||
return Loader.loadPDF(tempFile);
|
||||
}
|
||||
return Loader.loadPDF(getOriginFile(storageId));
|
||||
}
|
||||
|
||||
|
||||
@Observed(name = "LayoutParsingStorageService", contextualName = "get-origin-file")
|
||||
public File getOriginFile(String storageId) throws IOException {
|
||||
|
||||
File tempFile = createTempFile("document", ".pdf");
|
||||
storageService.downloadTo(TenantContext.getTenantId(), storageId, tempFile);
|
||||
return tempFile;
|
||||
}
|
||||
|
||||
|
||||
@ -74,6 +74,7 @@ public class LayoutParsingStorageService {
|
||||
}
|
||||
|
||||
|
||||
@Observed(name = "LayoutParsingStorageService", contextualName = "store-document-data")
|
||||
public void storeDocumentData(LayoutParsingRequest layoutParsingRequest, DocumentData documentData) {
|
||||
|
||||
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.structureFileStorageId(), documentData.getDocumentStructure());
|
||||
@ -83,7 +84,6 @@ public class LayoutParsingStorageService {
|
||||
}
|
||||
|
||||
|
||||
|
||||
public void storeResearchDocumentData(LayoutParsingRequest layoutParsingRequest, ResearchDocumentData researchDocumentData) {
|
||||
|
||||
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.researchDocumentStorageId(), researchDocumentData);
|
||||
@ -115,6 +115,7 @@ public class LayoutParsingStorageService {
|
||||
}
|
||||
|
||||
|
||||
@Observed(name = "LayoutParsingStorageService", contextualName = "store-simplified-text")
|
||||
public void storeSimplifiedText(LayoutParsingRequest layoutParsingRequest, SimplifiedText simplifiedText) {
|
||||
|
||||
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.simplifiedTextStorageId(), simplifiedText);
|
||||
@ -132,9 +133,10 @@ public class LayoutParsingStorageService {
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public void storeViewerDocument(LayoutParsingRequest layoutParsingRequest, ByteArrayOutputStream out) {
|
||||
@Observed(name = "LayoutParsingStorageService", contextualName = "store-viewer-document")
|
||||
public void storeViewerDocument(LayoutParsingRequest layoutParsingRequest, File out) {
|
||||
|
||||
try (var in = new ByteArrayInputStream(out.toByteArray())) {
|
||||
try (var in = new FileInputStream(out)) {
|
||||
|
||||
storageService.storeObject(TenantContext.getTenantId(), layoutParsingRequest.viewerDocumentStorageId(), in);
|
||||
}
|
||||
|
||||
@ -10,7 +10,6 @@ import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import com.amazonaws.services.kms.model.NotFoundException;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
|
||||
@ -84,7 +83,7 @@ public class Document implements GenericSemanticNode {
|
||||
@Override
|
||||
public Headline getHeadline() {
|
||||
|
||||
return streamAllSubNodesOfType(NodeType.HEADLINE).map(node -> (Headline) node).findFirst().orElseThrow(() -> new NotFoundException("No Headlines found in this document!"));
|
||||
return streamAllSubNodesOfType(NodeType.HEADLINE).map(node -> (Headline) node).findFirst().orElse(Headline.builder().build());
|
||||
}
|
||||
|
||||
|
||||
@ -105,6 +104,7 @@ public class Document implements GenericSemanticNode {
|
||||
return streamAllSubNodes().collect(Collectors.groupingBy(SemanticNode::getType, Collectors.counting()));
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
|
||||
@ -1,7 +1,9 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
@ -34,6 +36,9 @@ public class Footer implements GenericSemanticNode {
|
||||
@EqualsAndHashCode.Exclude
|
||||
Set<RedactionEntity> entities = new HashSet<>();
|
||||
|
||||
@EqualsAndHashCode.Exclude
|
||||
Map<Page, Rectangle2D> bBoxCache;
|
||||
|
||||
|
||||
@Override
|
||||
public NodeType getType() {
|
||||
@ -62,4 +67,14 @@ public class Footer implements GenericSemanticNode {
|
||||
return treeId + ": " + NodeType.FOOTER + ": " + leafTextBlock.buildSummary();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Map<Page, Rectangle2D> getBBox() {
|
||||
|
||||
if (bBoxCache == null) {
|
||||
bBoxCache = GenericSemanticNode.super.getBBox();
|
||||
}
|
||||
return bBoxCache;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,7 +1,9 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
@ -34,6 +36,9 @@ public class Header implements GenericSemanticNode {
|
||||
@EqualsAndHashCode.Exclude
|
||||
Set<RedactionEntity> entities = new HashSet<>();
|
||||
|
||||
@EqualsAndHashCode.Exclude
|
||||
Map<Page, Rectangle2D> bBoxCache;
|
||||
|
||||
|
||||
@Override
|
||||
public boolean isLeaf() {
|
||||
@ -62,4 +67,14 @@ public class Header implements GenericSemanticNode {
|
||||
return treeId + ": " + NodeType.HEADER + ": " + leafTextBlock.buildSummary();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Map<Page, Rectangle2D> getBBox() {
|
||||
|
||||
if (bBoxCache == null) {
|
||||
bBoxCache = GenericSemanticNode.super.getBBox();
|
||||
}
|
||||
return bBoxCache;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,7 +1,9 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
@ -34,6 +36,9 @@ public class Headline implements GenericSemanticNode {
|
||||
@EqualsAndHashCode.Exclude
|
||||
Set<RedactionEntity> entities = new HashSet<>();
|
||||
|
||||
@EqualsAndHashCode.Exclude
|
||||
Map<Page, Rectangle2D> bBoxCache;
|
||||
|
||||
|
||||
@Override
|
||||
public NodeType getType() {
|
||||
@ -69,4 +74,14 @@ public class Headline implements GenericSemanticNode {
|
||||
return this;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Map<Page, Rectangle2D> getBBox() {
|
||||
|
||||
if (bBoxCache == null) {
|
||||
bBoxCache = GenericSemanticNode.super.getBBox();
|
||||
}
|
||||
return bBoxCache;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,7 +1,9 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
@ -32,6 +34,9 @@ public class Paragraph implements GenericSemanticNode {
|
||||
@EqualsAndHashCode.Exclude
|
||||
Set<RedactionEntity> entities = new HashSet<>();
|
||||
|
||||
@EqualsAndHashCode.Exclude
|
||||
Map<Page, Rectangle2D> bBoxCache;
|
||||
|
||||
|
||||
@Override
|
||||
public NodeType getType() {
|
||||
@ -60,4 +65,14 @@ public class Paragraph implements GenericSemanticNode {
|
||||
return treeId + ": " + NodeType.PARAGRAPH + ": " + leafTextBlock.buildSummary();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Map<Page, Rectangle2D> getBBox() {
|
||||
|
||||
if (bBoxCache == null) {
|
||||
bBoxCache = GenericSemanticNode.super.getBBox();
|
||||
}
|
||||
return bBoxCache;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,7 +1,9 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
@ -35,6 +37,9 @@ public class Section implements GenericSemanticNode {
|
||||
@EqualsAndHashCode.Exclude
|
||||
Set<RedactionEntity> entities = new HashSet<>();
|
||||
|
||||
@EqualsAndHashCode.Exclude
|
||||
Map<Page, Rectangle2D> bBoxCache;
|
||||
|
||||
|
||||
@Override
|
||||
public NodeType getType() {
|
||||
@ -74,4 +79,14 @@ public class Section implements GenericSemanticNode {
|
||||
.orElseGet(() -> getParent().getHeadline());
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Map<Page, Rectangle2D> getBBox() {
|
||||
|
||||
if (bBoxCache == null) {
|
||||
bBoxCache = GenericSemanticNode.super.getBBox();
|
||||
}
|
||||
return bBoxCache;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -398,12 +398,11 @@ public interface SemanticNode {
|
||||
*/
|
||||
default Map<Page, Rectangle2D> getBBox() {
|
||||
|
||||
Map<Page, Rectangle2D> bBoxPerPage = new HashMap<>();
|
||||
if (isLeaf()) {
|
||||
return getBBoxFromLeafTextBlock(bBoxPerPage);
|
||||
return getBBoxFromLeafTextBlock();
|
||||
}
|
||||
|
||||
return getBBoxFromChildren(bBoxPerPage);
|
||||
return getBBoxFromChildren();
|
||||
}
|
||||
|
||||
|
||||
@ -426,25 +425,31 @@ public interface SemanticNode {
|
||||
|
||||
/**
|
||||
* TODO: this produces unwanted results for sections spanning multiple columns.
|
||||
*
|
||||
* @param bBoxPerPage initial empty BoundingBox
|
||||
* Computes the Union of the bounding boxes of all children recursively.
|
||||
* @return The union of the BoundingBoxes of all children
|
||||
*/
|
||||
private Map<Page, Rectangle2D> getBBoxFromChildren(Map<Page, Rectangle2D> bBoxPerPage) {
|
||||
private Map<Page, Rectangle2D> getBBoxFromChildren() {
|
||||
|
||||
return streamChildren().map(SemanticNode::getBBox).reduce((map1, map2) -> {
|
||||
map1.forEach((page, rectangle) -> map2.merge(page, rectangle, (rect1, rect2) -> rect1.createUnion(rect2).getBounds2D()));
|
||||
return map2;
|
||||
}).orElse(bBoxPerPage);
|
||||
Map<Page, Rectangle2D> bBoxPerPage = new HashMap<>();
|
||||
List<Map<Page, Rectangle2D>> childrenBBoxes = streamChildren().map(SemanticNode::getBBox).toList();
|
||||
Set<Page> pages = childrenBBoxes.stream().flatMap(map -> map.keySet().stream()).collect(Collectors.toSet());
|
||||
for (Page page : pages) {
|
||||
Rectangle2D bBoxOnPage = childrenBBoxes.stream()
|
||||
.filter(childBboxPerPage -> childBboxPerPage.containsKey(page))
|
||||
.map(childBboxPerPage -> childBboxPerPage.get(page))
|
||||
.collect(RectangleTransformations.collectBBox());
|
||||
bBoxPerPage.put(page, bBoxOnPage);
|
||||
}
|
||||
return bBoxPerPage;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @param bBoxPerPage initial empty BoundingBox
|
||||
* @return The union of all BoundingBoxes of the TextBlock of this node
|
||||
*/
|
||||
private Map<Page, Rectangle2D> getBBoxFromLeafTextBlock(Map<Page, Rectangle2D> bBoxPerPage) {
|
||||
private Map<Page, Rectangle2D> getBBoxFromLeafTextBlock() {
|
||||
|
||||
Map<Page, Rectangle2D> bBoxPerPage = new HashMap<>();
|
||||
Map<Page, List<AtomicTextBlock>> atomicTextBlockPerPage = getTextBlock().getAtomicTextBlocks().stream().collect(Collectors.groupingBy(AtomicTextBlock::getPage));
|
||||
atomicTextBlockPerPage.forEach((page, atbs) -> bBoxPerPage.put(page, RectangleTransformations.bBoxUnionAtomicTextBlock(atbs)));
|
||||
return bBoxPerPage;
|
||||
|
||||
@ -2,10 +2,12 @@ package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
|
||||
|
||||
import static java.lang.String.format;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Collection;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.stream.IntStream;
|
||||
import java.util.stream.Stream;
|
||||
@ -40,6 +42,8 @@ public class Table implements SemanticNode {
|
||||
@EqualsAndHashCode.Exclude
|
||||
Set<RedactionEntity> entities = new HashSet<>();
|
||||
|
||||
@EqualsAndHashCode.Exclude
|
||||
Map<Page, Rectangle2D> bBoxCache;
|
||||
|
||||
/**
|
||||
* Streams all entities in this table, that appear in a row, which contains any of the provided strings.
|
||||
@ -311,5 +315,12 @@ public class Table implements SemanticNode {
|
||||
|
||||
return treeId.toString() + ": " + NodeType.TABLE + ": #cols: " + numberOfCols + ", #rows: " + numberOfRows + ", " + this.getTextBlock().buildSummary();
|
||||
}
|
||||
@Override
|
||||
public Map<Page, Rectangle2D> getBBox() {
|
||||
|
||||
if (bBoxCache == null) {
|
||||
bBoxCache = SemanticNode.super.getBBox();
|
||||
}
|
||||
return bBoxCache;
|
||||
}
|
||||
}
|
||||
|
||||
@ -9,8 +9,6 @@ import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.pdfbox.text.TextPosition;
|
||||
|
||||
import com.dslplatform.json.JsonAttribute;
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point;
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
||||
|
||||
@ -142,8 +140,7 @@ public class TextPositionSequence implements CharSequence {
|
||||
*
|
||||
* @return the text direction adjusted minX value
|
||||
*/
|
||||
@JsonIgnore
|
||||
@JsonAttribute(ignore = true)
|
||||
|
||||
public float getMinXDirAdj() {
|
||||
|
||||
return textPositions.get(0).getXDirAdj();
|
||||
@ -157,8 +154,7 @@ public class TextPositionSequence implements CharSequence {
|
||||
*
|
||||
* @return the text direction adjusted maxX value
|
||||
*/
|
||||
@JsonIgnore
|
||||
@JsonAttribute(ignore = true)
|
||||
|
||||
public float getMaxXDirAdj() {
|
||||
|
||||
return textPositions.get(textPositions.size() - 1).getXDirAdj() + textPositions.get(textPositions.size() - 1).getWidthDirAdj() + HEIGHT_PADDING;
|
||||
@ -172,8 +168,7 @@ public class TextPositionSequence implements CharSequence {
|
||||
*
|
||||
* @return the text direction adjusted minY value. The upper border of the bounding box of the word.
|
||||
*/
|
||||
@JsonIgnore
|
||||
@JsonAttribute(ignore = true)
|
||||
|
||||
public float getMinYDirAdj() {
|
||||
|
||||
return textPositions.get(0).getYDirAdj() - getTextHeight();
|
||||
@ -187,8 +182,7 @@ public class TextPositionSequence implements CharSequence {
|
||||
*
|
||||
* @return the text direction adjusted maxY value. The lower border of the bounding box of the word.
|
||||
*/
|
||||
@JsonIgnore
|
||||
@JsonAttribute(ignore = true)
|
||||
|
||||
public float getMaxYDirAdj() {
|
||||
|
||||
return textPositions.get(0).getYDirAdj();
|
||||
@ -196,32 +190,24 @@ public class TextPositionSequence implements CharSequence {
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
@JsonAttribute(ignore = true)
|
||||
public float getTextHeight() {
|
||||
|
||||
return textPositions.get(0).getHeightDir() + HEIGHT_PADDING;
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
@JsonAttribute(ignore = true)
|
||||
public float getHeight() {
|
||||
|
||||
return getMaxYDirAdj() - getMinYDirAdj();
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
@JsonAttribute(ignore = true)
|
||||
public float getWidth() {
|
||||
|
||||
return getMaxXDirAdj() - getMinXDirAdj();
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
@JsonAttribute(ignore = true)
|
||||
public String getFont() {
|
||||
|
||||
if (textPositions.get(0).getFontName() == null) {
|
||||
@ -231,9 +217,8 @@ public class TextPositionSequence implements CharSequence {
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
@JsonAttribute(ignore = true)
|
||||
public String getFontStyle() {
|
||||
|
||||
if (textPositions.get(0).getFontName() == null) {
|
||||
return "standard";
|
||||
}
|
||||
@ -251,16 +236,12 @@ public class TextPositionSequence implements CharSequence {
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
@JsonAttribute(ignore = true)
|
||||
public float getFontSize() {
|
||||
|
||||
return textPositions.get(0).getFontSizeInPt();
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
@JsonAttribute(ignore = true)
|
||||
public float getSpaceWidth() {
|
||||
|
||||
return textPositions.get(0).getWidthOfSpace();
|
||||
@ -276,8 +257,7 @@ public class TextPositionSequence implements CharSequence {
|
||||
*
|
||||
* @return bounding box of the word in Pdf Coordinate System
|
||||
*/
|
||||
@JsonIgnore
|
||||
@JsonAttribute(ignore = true)
|
||||
|
||||
@SneakyThrows
|
||||
public Rectangle getRectangle() {
|
||||
|
||||
|
||||
@ -2,12 +2,13 @@ package com.knecon.fforesight.service.layoutparser.processor.services.visualizat
|
||||
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.OutputStream;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.StandardCopyOption;
|
||||
|
||||
import org.apache.pdfbox.cos.COSDictionary;
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.cos.COSName;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
|
||||
@ -31,6 +32,9 @@ import com.knecon.fforesight.service.layoutparser.processor.model.visualization.
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.visualization.PlacedText;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.visualization.VisualizationsOnPage;
|
||||
|
||||
import io.micrometer.observation.Observation;
|
||||
import io.micrometer.observation.ObservationRegistry;
|
||||
import io.micrometer.observation.annotation.Observed;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
@ -40,29 +44,31 @@ import lombok.extern.slf4j.Slf4j;
|
||||
@RequiredArgsConstructor
|
||||
public class ViewerDocumentService {
|
||||
|
||||
|
||||
private static final String LAYER_NAME = "Layout grid";
|
||||
private static final int FONT_SIZE = 10;
|
||||
public static final float LINE_WIDTH = 1f;
|
||||
|
||||
private final LayoutGridService layoutGridService;
|
||||
private final ObservationRegistry observationRegistry;
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public void createViewerDocument(PDDocument pdDocument, Document document, OutputStream outputStream, boolean layerVisibilityDefaultValue) {
|
||||
@Observed(name = "ViewerDocumentService", contextualName = "create-viewer-document")
|
||||
public void createViewerDocument(File originFile, Document document, File destinationFile, boolean layerVisibilityDefaultValue) {
|
||||
|
||||
Path tmpFile = Files.createTempFile("tmpViewerDocument", ".pdf");
|
||||
PDDocument pdDocument = openPDDocument(originFile);
|
||||
LayoutGrid layoutGrid = layoutGridService.createLayoutGrid(document);
|
||||
// PDDocument.save() is very slow, since it actually traverses the entire pdf and writes a new one.
|
||||
// If we collect all COSDictionaries we changed and tell it explicitly to only add the changed ones by using saveIncremental it's very fast.
|
||||
Set<COSDictionary> dictionariesToUpdate = new HashSet<>();
|
||||
PDOptionalContentGroup layer = addLayerToDocument(pdDocument, dictionariesToUpdate, layerVisibilityDefaultValue);
|
||||
|
||||
PDOptionalContentGroup layer = addLayerToDocument(pdDocument, layerVisibilityDefaultValue);
|
||||
PDFont font = new PDType1Font(Standard14Fonts.FontName.HELVETICA);
|
||||
|
||||
for (int pageNumber = 0; pageNumber < pdDocument.getNumberOfPages(); pageNumber++) {
|
||||
PDPage pdPage = pdDocument.getPage(pageNumber);
|
||||
|
||||
PDPage pdPage = pdDocument.getPage(pageNumber);
|
||||
//
|
||||
AffineTransform textDeRotationMatrix = getTextDeRotationTransform(pdPage);
|
||||
addLayerToPageRessources(pdPage);
|
||||
addLayerToPageResources(pdPage);
|
||||
|
||||
// We need to save the graphics state before, such that our appended content cannot be affected by previous content streams with side effects,
|
||||
// e.g. not escaped matrix transformations.
|
||||
@ -115,16 +121,48 @@ public class ViewerDocumentService {
|
||||
contentStream.restoreGraphicsState();
|
||||
contentStream.endMarkedContent();
|
||||
}
|
||||
dictionariesToUpdate.add(pdPage.getCOSObject());
|
||||
dictionariesToUpdate.add(pdPage.getResources().getCOSObject());
|
||||
|
||||
if (pageNumber % 500 == 0 && pageNumber != 0) { // re-open document every once in a while to save on RAM
|
||||
log.info("Incremental save after {} pages", pageNumber);
|
||||
observedIncrementalSave(pdDocument, destinationFile);
|
||||
pdDocument.close();
|
||||
Files.copy(destinationFile.toPath(), tmpFile, StandardCopyOption.REPLACE_EXISTING);
|
||||
pdDocument = openPDDocument(tmpFile.toFile());
|
||||
layer = addLayerToDocument(pdDocument, layerVisibilityDefaultValue);
|
||||
}
|
||||
|
||||
}
|
||||
dictionariesToUpdate.add(pdDocument.getDocumentInformation().getCOSObject());
|
||||
// dictionariesToUpdate.add(pdDocument.getDocument().getTrailer());
|
||||
pdDocument.saveIncremental(outputStream, dictionariesToUpdate);
|
||||
observedIncrementalSave(pdDocument, destinationFile);
|
||||
|
||||
tmpFile.toFile().delete();
|
||||
pdDocument.close();
|
||||
}
|
||||
|
||||
|
||||
private static void addLayerToPageRessources(PDPage pdPage) {
|
||||
private static PDDocument openPDDocument(File tmpFile) throws IOException {
|
||||
|
||||
PDDocument pdDocument;
|
||||
pdDocument = Loader.loadPDF(tmpFile);
|
||||
pdDocument.setAllSecurityToBeRemoved(true);
|
||||
return pdDocument;
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void observedIncrementalSave(PDDocument pdDocument, File outputFile) {
|
||||
|
||||
Observation.createNotStarted("ViewerDocumentService", observationRegistry).contextualName("incremental-save").observe(() -> {
|
||||
try {
|
||||
pdDocument.save(outputFile);
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
|
||||
private static void addLayerToPageResources(PDPage pdPage) {
|
||||
|
||||
PDResources resources = pdPage.getResources();
|
||||
if (resources == null) {
|
||||
@ -145,7 +183,7 @@ public class ViewerDocumentService {
|
||||
}
|
||||
|
||||
|
||||
private static PDOptionalContentGroup addLayerToDocument(PDDocument pdDocument, Set<COSDictionary> dictionariesToUpdate, boolean layerVisibilityDefaultValue) {
|
||||
private static PDOptionalContentGroup addLayerToDocument(PDDocument pdDocument, boolean layerVisibilityDefaultValue) {
|
||||
|
||||
PDDocumentCatalog catalog = pdDocument.getDocumentCatalog();
|
||||
PDOptionalContentProperties ocprops = catalog.getOCProperties();
|
||||
@ -161,7 +199,7 @@ public class ViewerDocumentService {
|
||||
ocprops.addGroup(layer);
|
||||
}
|
||||
ocprops.setGroupEnabled(layer, layerVisibilityDefaultValue);
|
||||
dictionariesToUpdate.add(catalog.getCOSObject());
|
||||
// dictionariesToUpdate.add(catalog.getCOSObject());
|
||||
return layer;
|
||||
}
|
||||
|
||||
|
||||
@ -5,6 +5,7 @@ import org.springframework.boot.actuate.autoconfigure.security.servlet.Managemen
|
||||
import org.springframework.boot.autoconfigure.ImportAutoConfiguration;
|
||||
import org.springframework.boot.autoconfigure.SpringBootApplication;
|
||||
import org.springframework.boot.autoconfigure.security.servlet.SecurityAutoConfiguration;
|
||||
import org.springframework.context.annotation.Bean;
|
||||
import org.springframework.context.annotation.Import;
|
||||
|
||||
import com.amazonaws.services.s3.model.metrics.MetricsConfiguration;
|
||||
@ -13,8 +14,11 @@ import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingService
|
||||
import com.knecon.fforesight.service.layoutparser.server.queue.MessagingConfiguration;
|
||||
import com.knecon.fforesight.tenantcommons.MultiTenancyAutoConfiguration;
|
||||
|
||||
import io.micrometer.observation.ObservationRegistry;
|
||||
import io.micrometer.observation.aop.ObservedAspect;
|
||||
|
||||
@ImportAutoConfiguration({MultiTenancyAutoConfiguration.class})
|
||||
@Import({MetricsConfiguration.class, StorageAutoConfiguration.class, LayoutParsingServiceProcessorConfiguration.class, MessagingConfiguration.class})
|
||||
@Import({MetricsConfiguration.class, StorageAutoConfiguration.class, LayoutParsingServiceProcessorConfiguration.class, MessagingConfiguration.class})
|
||||
@SpringBootApplication(exclude = {SecurityAutoConfiguration.class, ManagementWebSecurityAutoConfiguration.class})
|
||||
public class Application {
|
||||
|
||||
@ -23,4 +27,11 @@ public class Application {
|
||||
SpringApplication.run(Application.class, args);
|
||||
}
|
||||
|
||||
|
||||
@Bean
|
||||
public ObservedAspect observedAspect(ObservationRegistry observationRegistry) {
|
||||
|
||||
return new ObservedAspect(observationRegistry);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -46,12 +46,11 @@ public class BdrJsonBuildTest extends AbstractTest {
|
||||
@SneakyThrows
|
||||
protected Document buildGraph(File file) {
|
||||
|
||||
try (PDDocument pdDocument = Loader.loadPDF(file)) {
|
||||
return DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.TAAS,
|
||||
pdDocument,
|
||||
new ImageServiceResponse(),
|
||||
new TableServiceResponse()));
|
||||
}
|
||||
return DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.TAAS,
|
||||
file,
|
||||
new ImageServiceResponse(),
|
||||
new TableServiceResponse(),
|
||||
file.toString()));
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -6,7 +6,6 @@ import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.assertj.core.api.Assertions;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.junit.jupiter.api.extension.ExtendWith;
|
||||
@ -96,9 +95,10 @@ public class HeadlinesGoldStandardIntegrationTest {
|
||||
goldStandardLog.getRedactionLogEntry().forEach(e -> goldStandardHeadlines.add(new Headline(e.getPositions().get(0).getPage(), e.getValue())));
|
||||
|
||||
Document documentGraph = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||
Loader.loadPDF(pdfFileResource.getFile()),
|
||||
pdfFileResource.getFile(),
|
||||
new ImageServiceResponse(),
|
||||
new TableServiceResponse()));
|
||||
new TableServiceResponse(),
|
||||
filePath));
|
||||
|
||||
var foundHeadlines = documentGraph.streamAllSubNodes()
|
||||
.map(SemanticNode::getHeadline)
|
||||
|
||||
@ -25,8 +25,8 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
|
||||
@SneakyThrows
|
||||
public void testLayoutParserEndToEnd() {
|
||||
|
||||
prepareStorage("files/bdr/btd_19_053_1905391.pdf");
|
||||
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(LayoutParsingType.TAAS);
|
||||
prepareStorage("files/bdr/Wie weiter bei Kristeneinrichtungen.pdf");
|
||||
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER);
|
||||
LayoutParsingFinishedEvent finishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest);
|
||||
Arrays.stream(finishedEvent.message().split("\n")).forEach(log::info);
|
||||
}
|
||||
|
||||
@ -6,6 +6,7 @@ import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.List;
|
||||
|
||||
import org.junit.jupiter.api.Disabled;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.springframework.core.io.ClassPathResource;
|
||||
|
||||
@ -17,7 +18,9 @@ import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
public class DocumentDataTests extends BuildDocumentTest {
|
||||
|
||||
@Test
|
||||
@Disabled // This test takes waaaaaay too long, it's ridiculous
|
||||
@SneakyThrows
|
||||
public void createDocumentDataForAllFiles() {
|
||||
|
||||
@ -36,11 +39,12 @@ public class DocumentDataTests extends BuildDocumentTest {
|
||||
for (String pdfFileName : pdfFileNames) {
|
||||
System.out.println(pdfFileName);
|
||||
DocumentData documentData = DocumentDataMapper.toDocumentData(buildGraph(resource.getFile().toPath().getParent().relativize(Path.of(pdfFileName)).toString()));
|
||||
File outputFile = Path.of(outPath).resolve(resource.getFile().toPath().relativize(Path.of(pdfFileName))).toFile();
|
||||
File outputFile = Path.of(outPath).resolve(resource.getFile().toPath().relativize(Path.of(pdfFileName))).toFile();
|
||||
outputFile.toPath().getParent().toFile().mkdirs();
|
||||
try (var out = new FileOutputStream(outputFile.toString().replace(".pdf", ".json"))) {
|
||||
ObjectMapperFactory.create().writeValue(out, documentData);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -5,7 +5,6 @@ import java.io.FileOutputStream;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.junit.jupiter.api.Disabled;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.springframework.core.io.ClassPathResource;
|
||||
@ -56,9 +55,10 @@ public class DocumentGraphJsonWritingTest extends BuildDocumentTest {
|
||||
private void writeJsons(Path filename) {
|
||||
|
||||
Document documentGraph = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||
Loader.loadPDF(filename.toFile()),
|
||||
filename.toFile(),
|
||||
new ImageServiceResponse(),
|
||||
new TableServiceResponse()));
|
||||
new TableServiceResponse(),
|
||||
filename.toFile().toString()));
|
||||
|
||||
DocumentData documentData = DocumentDataMapper.toDocumentData(documentGraph);
|
||||
ObjectMapper mapper = ObjectMapperFactory.create();
|
||||
|
||||
@ -1,32 +1,13 @@
|
||||
package com.knecon.fforesight.service.layoutparser.server.graph;
|
||||
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.File;
|
||||
import java.nio.file.Path;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.junit.jupiter.api.Disabled;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.core.io.ClassPathResource;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.PropertiesMapper;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.ViewerDocumentService;
|
||||
import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest;
|
||||
@ -35,38 +16,17 @@ import lombok.SneakyThrows;
|
||||
|
||||
public class ViewerDocumentTest extends BuildDocumentTest {
|
||||
|
||||
@Autowired
|
||||
private SectionsBuilderService sectionsBuilderService;
|
||||
|
||||
@Autowired
|
||||
private RedactManagerClassificationService redactManagerClassificationService;
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testViewerDocument() {
|
||||
|
||||
String fileName = "files/bdr/notMergedParagraphs.pdf";
|
||||
String fileName = "files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
|
||||
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
|
||||
LayoutGridService layoutGridService = new LayoutGridService();
|
||||
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(layoutGridService);
|
||||
Document document = buildGraph(fileName, LayoutParsingType.TAAS);
|
||||
try (var pdDocument = Loader.loadPDF(new ClassPathResource(fileName).getFile()); var out = new FileOutputStream(tmpFileName)) {
|
||||
viewerDocumentService.createViewerDocument(pdDocument, document, out, true);
|
||||
}
|
||||
}
|
||||
|
||||
public ClassificationDocument buildClassificationDocument(PDDocument originDocument) {
|
||||
|
||||
ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||
originDocument,
|
||||
new ImageServiceResponse(),
|
||||
new TableServiceResponse());
|
||||
|
||||
redactManagerClassificationService.classifyDocument(classificationDocument);
|
||||
|
||||
sectionsBuilderService.buildSections(classificationDocument);
|
||||
|
||||
return classificationDocument;
|
||||
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(layoutGridService, null);
|
||||
Document document = buildGraph(fileName, LayoutParsingType.REDACT_MANAGER);
|
||||
var documentFile = new ClassPathResource(fileName).getFile();
|
||||
viewerDocumentService.createViewerDocument(documentFile, document, new File(tmpFileName), true);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -3,6 +3,7 @@ package com.knecon.fforesight.service.layoutparser.server.segmentation;
|
||||
import static org.assertj.core.api.Assertions.assertThat;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.io.File;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Path;
|
||||
@ -15,8 +16,6 @@ import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.junit.jupiter.api.Disabled;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
@ -62,12 +61,13 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public ClassificationDocument buildClassificationDocument(PDDocument originDocument, TableServiceResponse tableServiceResponse) {
|
||||
public ClassificationDocument buildClassificationDocument(File originDocument, TableServiceResponse tableServiceResponse) {
|
||||
|
||||
ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||
originDocument,
|
||||
new ImageServiceResponse(),
|
||||
tableServiceResponse);
|
||||
tableServiceResponse,
|
||||
"document");
|
||||
|
||||
redactManagerClassificationService.classifyDocument(classificationDocument);
|
||||
|
||||
@ -78,7 +78,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public ClassificationDocument buildClassificationDocument(PDDocument originDocument) {
|
||||
public ClassificationDocument buildClassificationDocument(File originDocument) {
|
||||
|
||||
return buildClassificationDocument(originDocument, new TableServiceResponse());
|
||||
}
|
||||
@ -89,7 +89,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/A20622A izRMS (CZ) fRR Part B9_Page185.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
|
||||
toHtml(document, "/tmp/A20622A izRMS (CZ) fRR Part B9_Page185.html");
|
||||
|
||||
@ -103,7 +103,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
ClassPathResource cvTablesResource = new ClassPathResource("files/cv_tables/ScanRotationBorder.TABLES.json");
|
||||
var tableServiceResponse = objectMapper.readValue(cvTablesResource.getInputStream(), TableServiceResponse.class);
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()), tableServiceResponse);
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile(), tableServiceResponse);
|
||||
|
||||
toHtml(document, "/tmp/ScanRotationBorder.html");
|
||||
}
|
||||
@ -116,7 +116,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
ClassPathResource cvTablesResource = new ClassPathResource("files/cv_tables/ScanRotationBorder.TABLES.json");
|
||||
var tableServiceResponse = objectMapper.readValue(cvTablesResource.getInputStream(), TableServiceResponse.class);
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()), tableServiceResponse);
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile(), tableServiceResponse);
|
||||
assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
|
||||
var tables = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList();
|
||||
|
||||
@ -156,7 +156,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Spanning Cells.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
|
||||
TablePageBlock table = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(0);
|
||||
assertThat(table.getColCount()).isEqualTo(6);
|
||||
@ -170,7 +170,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Merge Table.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
|
||||
TablePageBlock firstTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(0);
|
||||
assertThat(firstTable.getColCount()).isEqualTo(8);
|
||||
@ -188,7 +188,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Merge Multi Page Table.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
|
||||
TablePageBlock firstTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(0);
|
||||
assertThat(firstTable.getColCount()).isEqualTo(9);
|
||||
@ -206,7 +206,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Rotated Table Headers.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
|
||||
TablePageBlock firstTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(0);
|
||||
assertThat(firstTable.getColCount()).isEqualTo(8);
|
||||
@ -224,7 +224,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/56 Fludioxonil_RAR_12_Volume_3CA_B-7_2018-02-21_Page170.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
|
||||
validateTableSize(document, 4);
|
||||
|
||||
@ -241,7 +241,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/211.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
|
||||
validateTableSize(document, 4);
|
||||
|
||||
@ -258,7 +258,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/VV-931175_Page1.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
|
||||
validateTableSize(document, 1);
|
||||
|
||||
@ -299,7 +299,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/27 A8637C - EU AIR3 - MCP Section 1 - Identity of the plant protection product_Page6.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
|
||||
toHtml(document, "/tmp/html.html");
|
||||
|
||||
@ -319,7 +319,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/A20622A izRMS (CZ) fRR Part B9_Page185.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
|
||||
validateTableSize(document, 1);
|
||||
|
||||
@ -332,7 +332,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/A20622A izRMS (CZ) fRR Part B9_Page185_fixed.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
|
||||
validateTableSize(document, 1);
|
||||
|
||||
@ -345,7 +345,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/A20622A izZRMS (CZ) fRR Part B7_Page123.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
|
||||
validateTableSize(document, 6);
|
||||
|
||||
@ -364,7 +364,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/77 Pirimicarb_RAR_08_Volume_3CA_B-6_2017-12-04_Page11.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
|
||||
validateTableSize(document, 3);
|
||||
|
||||
@ -380,7 +380,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/95 Trinexapac-ethyl_RAR_08_Volume_3CA_B-6_2018-01-10_Page532.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
|
||||
validateTableSize(document, 1);
|
||||
validateTable(document, 0, 9, 9, 0, 0);
|
||||
@ -393,7 +393,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/52 Fludioxonil_RAR_07_Volume_3CA_B-5_2018-02-21_Page175.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
|
||||
validateTableSize(document, 1);
|
||||
|
||||
@ -407,7 +407,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/52 Fludioxonil_RAR_07_Volume_3CA_B-5_2018-02-21_Page174.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
|
||||
validateTableSize(document, 1);
|
||||
validateTable(document, 0, 9, 6, 7, 0);
|
||||
@ -420,7 +420,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/19 Chlorothalonil RAR 08 Volume 3CA B 6b metabolites Oct 2017_Page35.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
|
||||
validateTableSize(document, 1);
|
||||
validateTable(document, 0, 10, 6, 0, 0);
|
||||
@ -433,7 +433,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/19 Chlorothalonil RAR 08 Volume 3CA B 6b metabolites Oct 2017_Page161.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
|
||||
validateTableSize(document, 2);
|
||||
validateTable(document, 0, 2, 2, 0, 0);
|
||||
@ -448,7 +448,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
ClassPathResource pdfFileResource = new ClassPathResource(
|
||||
"files/SinglePages/47 Cyprodinil - EU AIR3 - MCA Section 5 Supplement - Toxicological and metabolism studies on the active substance_Page30.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
|
||||
validateTableSize(document, 2);
|
||||
|
||||
@ -464,7 +464,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
ClassPathResource pdfFileResource = new ClassPathResource(
|
||||
"files/SinglePages/49 Cyprodinil - EU AIR3 - MCA Section 8 Supplement - Ecotoxicological studies on the active substance_Page61.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
|
||||
validateTableSize(document, 2);
|
||||
|
||||
@ -479,7 +479,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/81 Pirimicarb_RAR_20_Volume_3CP_A10788A (_Pirimor_)_B-9_2017-12-04_Page54.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
|
||||
validateTableSize(document, 2);
|
||||
|
||||
@ -494,7 +494,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/85 Pydiflumetofen_DAR_08_Volume_3CA_B-6_2017-07-26_Page134.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
|
||||
validateTableSize(document, 2);
|
||||
|
||||
@ -509,7 +509,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/Thiabendazole DAR Addendum for ED_April_2020_Page18.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
|
||||
validateTableSize(document, 4);
|
||||
|
||||
@ -526,7 +526,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/15 - Pretilachlor - Acute Oral Toxicity (Up and Down Procedure) - Rat_Page18.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
|
||||
validateTableSize(document, 1);
|
||||
|
||||
@ -541,7 +541,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
ClassPathResource pdfFileResource = new ClassPathResource(
|
||||
"files/SinglePages/28 A8637C - EU AIR3 - MCP Section 10 - Ecotoxicological studies on the plant protection product_Page23.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
|
||||
validateTableSize(document, 2);
|
||||
|
||||
@ -556,7 +556,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/24 - SYN549522 - Acute Oral Toxicity - Rats_Page17.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
|
||||
validateTableSize(document, 1);
|
||||
|
||||
@ -570,7 +570,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/30 - Dicamba - Acute Oral Toxicity - Rats_Page5.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
|
||||
validateTableSize(document, 1);
|
||||
|
||||
|
||||
@ -58,7 +58,7 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
|
||||
public void testTableExtraction() {
|
||||
|
||||
LayoutGridService layoutGridService = new LayoutGridService();
|
||||
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(layoutGridService);
|
||||
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(layoutGridService, null);
|
||||
|
||||
ClassPathResource resource = new ClassPathResource("files");
|
||||
List<String> pdfFileNames = Files.walk(resource.getFile().toPath())
|
||||
@ -77,13 +77,15 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
|
||||
private void writeJsons(Path filename) {
|
||||
|
||||
Document documentGraphBefore = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||
Loader.loadPDF(filename.toFile()),
|
||||
filename.toFile(),
|
||||
new ImageServiceResponse(),
|
||||
new TableServiceResponse()));
|
||||
new TableServiceResponse(),
|
||||
filename.toFile().toString()));
|
||||
Document documentGraphAfter = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||
Loader.loadPDF(filename.toFile()),
|
||||
filename.toFile(),
|
||||
new ImageServiceResponse(),
|
||||
new TableServiceResponse()));
|
||||
new TableServiceResponse(),
|
||||
filename.toFile().toString()));
|
||||
DocumentData documentDataBefore = DocumentDataMapper.toDocumentData(documentGraphBefore);
|
||||
DocumentData documentDataAfter = DocumentDataMapper.toDocumentData(documentGraphAfter);
|
||||
if (!compareStructures(documentDataBefore.getDocumentStructure(), documentDataAfter.getDocumentStructure(), filename.getFileName().toString())) {
|
||||
|
||||
@ -1,5 +1,26 @@
|
||||
package com.knecon.fforesight.service.layoutparser.server.utils;
|
||||
|
||||
import java.io.InputStream;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
|
||||
import org.junit.jupiter.api.AfterEach;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.extension.ExtendWith;
|
||||
import org.springframework.amqp.rabbit.core.RabbitTemplate;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.boot.autoconfigure.EnableAutoConfiguration;
|
||||
import org.springframework.boot.autoconfigure.amqp.RabbitAutoConfiguration;
|
||||
import org.springframework.boot.test.context.SpringBootTest;
|
||||
import org.springframework.boot.test.mock.mockito.MockBean;
|
||||
import org.springframework.context.annotation.Bean;
|
||||
import org.springframework.context.annotation.ComponentScan;
|
||||
import org.springframework.context.annotation.Configuration;
|
||||
import org.springframework.context.annotation.Import;
|
||||
import org.springframework.context.annotation.Primary;
|
||||
import org.springframework.core.io.ClassPathResource;
|
||||
import org.springframework.test.context.junit.jupiter.SpringExtension;
|
||||
|
||||
import com.iqser.red.commons.jackson.ObjectMapperFactory;
|
||||
import com.iqser.red.storage.commons.service.StorageService;
|
||||
import com.iqser.red.storage.commons.utils.FileSystemBackedStorageService;
|
||||
@ -9,22 +30,8 @@ import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingStorage
|
||||
import com.knecon.fforesight.service.layoutparser.server.Application;
|
||||
import com.knecon.fforesight.tenantcommons.TenantContext;
|
||||
import com.knecon.fforesight.tenantcommons.TenantsClient;
|
||||
import lombok.SneakyThrows;
|
||||
import org.junit.jupiter.api.AfterEach;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.extension.ExtendWith;
|
||||
import org.springframework.amqp.rabbit.core.RabbitTemplate;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.boot.autoconfigure.EnableAutoConfiguration;
|
||||
import org.springframework.boot.autoconfigure.amqp.RabbitAutoConfiguration;
|
||||
import org.springframework.boot.test.context.SpringBootTest;
|
||||
import org.springframework.boot.test.mock.mockito.MockBean;
|
||||
import org.springframework.context.annotation.*;
|
||||
import org.springframework.core.io.ClassPathResource;
|
||||
import org.springframework.test.context.junit.jupiter.SpringExtension;
|
||||
|
||||
import java.io.InputStream;
|
||||
import java.util.Optional;
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
@ExtendWith(SpringExtension.class)
|
||||
@SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT)
|
||||
@ -100,9 +107,11 @@ public abstract class AbstractTest {
|
||||
return buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER);
|
||||
}
|
||||
|
||||
|
||||
protected LayoutParsingRequest buildDefaultLayoutParsingRequest(LayoutParsingType layoutParsingType) {
|
||||
|
||||
return LayoutParsingRequest.builder()
|
||||
.identifier(Map.of("fileId", "1337"))
|
||||
.layoutParsingType(layoutParsingType)
|
||||
.originFileStorageId(ORIGIN_FILE_ID)
|
||||
.tablesFileStorageId(Optional.of(TABLE_FILE_ID))
|
||||
@ -116,6 +125,7 @@ public abstract class AbstractTest {
|
||||
.build();
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
protected LayoutParsingRequest prepareStorage(String file, String cvServiceResponseFile, String imageInfoFile) {
|
||||
|
||||
@ -152,7 +162,6 @@ public abstract class AbstractTest {
|
||||
@ComponentScan("com.knecon.fforesight.service.layoutparser")
|
||||
public static class TestConfiguration {
|
||||
|
||||
|
||||
@Bean
|
||||
@Primary
|
||||
public StorageService inmemoryStorage() {
|
||||
|
||||
@ -1,9 +1,7 @@
|
||||
package com.knecon.fforesight.service.layoutparser.server.utils;
|
||||
|
||||
import java.io.InputStream;
|
||||
import java.io.File;
|
||||
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.core.io.ClassPathResource;
|
||||
|
||||
@ -25,11 +23,9 @@ public abstract class BuildDocumentTest extends AbstractTest {
|
||||
@SneakyThrows
|
||||
protected ClassificationDocument parseLayout(String filename, LayoutParsingType layoutParsingType) {
|
||||
|
||||
ClassPathResource fileResource = new ClassPathResource(filename);
|
||||
File fileResource = new ClassPathResource(filename).getFile();
|
||||
prepareStorage(filename);
|
||||
try (InputStream inputStream = fileResource.getInputStream(); PDDocument pdDocument = Loader.loadPDF(inputStream.readAllBytes())) {
|
||||
return layoutParsingPipeline.parseLayout(layoutParsingType, pdDocument, layoutParsingStorageService.getImagesFile(IMAGE_FILE_ID), new TableServiceResponse());
|
||||
}
|
||||
return layoutParsingPipeline.parseLayout(layoutParsingType, fileResource, layoutParsingStorageService.getImagesFile(IMAGE_FILE_ID), new TableServiceResponse(), filename);
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -7,6 +7,9 @@ fforesight.tenants.remote: true
|
||||
server:
|
||||
port: 8080
|
||||
|
||||
logging.pattern.level: "%5p [${spring.application.name},%X{traceId:-},%X{spanId:-}]"
|
||||
logging.type: ${LOGGING_TYPE:CONSOLE}
|
||||
|
||||
spring:
|
||||
main:
|
||||
allow-circular-references: true # FIXME
|
||||
|
||||
@ -1,16 +0,0 @@
|
||||
<Configuration>
|
||||
|
||||
<Appenders>
|
||||
<Console name="CONSOLE" target="SYSTEM_OUT">
|
||||
<PatternLayout pattern="%d{HH:mm:ss.SSS} [%t] %-5level %logger{36} - %msg%n"/>
|
||||
</Console>
|
||||
</Appenders>
|
||||
|
||||
<Loggers>
|
||||
<Root level="warn">
|
||||
<AppenderRef ref="CONSOLE"/>
|
||||
</Root>
|
||||
<Logger name="com.iqser" level="info"/>
|
||||
</Loggers>
|
||||
|
||||
</Configuration>
|
||||
@ -0,0 +1,17 @@
|
||||
<configuration>
|
||||
|
||||
<springProperty scope="configuration" name="logType" source="logging.type"/>
|
||||
<springProperty scope="context" name="application.name" source="spring.application.name"/>
|
||||
<springProperty scope="context" name="version" source="project.version"/>
|
||||
<include resource="org/springframework/boot/logging/logback/defaults.xml"/>
|
||||
<include resource="org/springframework/boot/logging/logback/console-appender.xml"/>
|
||||
|
||||
<appender name="JSON" class="ch.qos.logback.core.ConsoleAppender">
|
||||
<encoder class="net.logstash.logback.encoder.LogstashEncoder"/>
|
||||
</appender>
|
||||
|
||||
<root level="INFO">
|
||||
<appender-ref ref="${logType}"/>
|
||||
</root>
|
||||
|
||||
</configuration>
|
||||
@ -2,7 +2,14 @@
|
||||
dir=${PWD##*/}
|
||||
gradle assemble
|
||||
|
||||
buildNumber=${1:-1}
|
||||
# Get the current Git branch
|
||||
branch=$(git rev-parse --abbrev-ref HEAD)
|
||||
|
||||
gradle bootBuildImage --cleanCache --publishImage -PbuildbootDockerHostNetwork=true -Pversion=$USER-$buildNumber
|
||||
echo "nexus.knecon.com:5001/red/${dir}-server-v1:$USER-$buildNumber"
|
||||
# Get the short commit hash (first 5 characters)
|
||||
commit_hash=$(git rev-parse --short=5 HEAD)
|
||||
|
||||
# Combine branch and commit hash
|
||||
buildName="${USER}-${branch}-${commit_hash}"
|
||||
|
||||
gradle bootBuildImage --publishImage -PbuildbootDockerHostNetwork=true -Pversion=$buildName --no-build-cache
|
||||
echo "nexus.knecon.com:5001/ff/${dir}-service-server:$buildName"
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user