RED-9353: refactor PDFTronViewerDocumentService

* rename RedactionEntity -> TextEntity
* rename Boundary -> TextRange
This commit is contained in:
Kilian Schuettler 2024-07-15 11:44:16 +02:00
parent ad38618dc7
commit 203436c62f
8 changed files with 67 additions and 83 deletions

View File

@ -8,7 +8,6 @@ import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Headline;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Image;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Key;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Paragraph;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutGrid;
@ -57,7 +56,6 @@ public class LayoutGridService {
case TABLE -> layoutGrid.addTable((Table) semanticNode);
case IMAGE -> layoutGrid.addImage((Image) semanticNode);
case HEADER, FOOTER -> layoutGrid.addHeaderOrFooter(semanticNode);
case KEY -> layoutGrid.addKey((Key) semanticNode);
}
});
return layoutGrid;

View File

@ -19,7 +19,6 @@ import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.No
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.DuplicatedParagraph;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Headline;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Image;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Key;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Paragraph;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
@ -70,28 +69,6 @@ public class LayoutGrid extends LayoutGridLayerConfig {
}
/**
 * Adds the visualization for a key and, if it has one, its value.
 * <p>
 * The key is always added as a rectangle in {@code KEY_COLOR}. When the key has a value, the value is
 * added as a rectangle in {@code VALUE_COLOR}. If key and value report the same first page, a connecting
 * line with a two-stroke arrow head is additionally added to that page in {@code KEY_VALUE_BBOX_COLOR}.
 *
 * @param key the key node to visualize; its value may be {@code null}
 */
public void addKey(Key key) {

    addAsRectangle(key, keyValue, KEY_COLOR);

    // Nothing more to draw for a key without a value.
    if (key.getValue() == null) {
        return;
    }

    addAsRectangle(key.getValue(), keyValue, VALUE_COLOR);

    // The connector is only drawn when both nodes report the very same first page instance.
    if (key.getFirstPage() == key.getValue().getFirstPage()) {
        Line2D connector = ConnectionLineUtil.findClosestMidpointLine(key.getBBox().get(key.getFirstPage()), key.getValue().getBBox().get(key.getFirstPage()));
        // Arrow head length is capped at 5 so it never exceeds the connector itself.
        Line2D[] head = ConnectionLineUtil.createArrowHead(connector, Math.min(ConnectionLineUtil.length(connector), 5));

        List<ColoredLine> pageLines = getOrCreateVisualizationsOnPage(key.getFirstPage().getNumber(), keyValue).getColoredLines();
        pageLines.add(new ColoredLine(connector, KEY_VALUE_BBOX_COLOR, LINE_WIDTH));
        pageLines.add(new ColoredLine(head[0], KEY_VALUE_BBOX_COLOR, LINE_WIDTH));
        pageLines.add(new ColoredLine(head[1], KEY_VALUE_BBOX_COLOR, LINE_WIDTH));
    }
}
public void addHeadline(Headline headline) {
addAsRectangle(headline, headlines, HEADLINE_COLOR);

View File

@ -38,7 +38,7 @@ dependencies {
implementation("com.amazonaws:aws-java-sdk-s3:1.12.536")
implementation("org.springframework.cloud:spring-cloud-starter-openfeign:4.0.4")
implementation("net.logstash.logback:logstash-logback-encoder:7.4")
implementation("com.pdftron:PDFNet:10.5.0")
implementation("com.pdftron:PDFNet:10.7.0")
// for integration testing only
testImplementation(project(":viewer-doc-processor"))

View File

@ -1,5 +1,6 @@
package com.knecon.fforesight.service.layoutparser.server;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Component;
import com.google.common.base.Strings;
@ -17,7 +18,8 @@ import lombok.extern.slf4j.Slf4j;
@RequiredArgsConstructor
public class PDFNetInitializer {
private final LayoutparserSettings settings;
@Value("${pdftron.license:}")
private String pdftronLicense;
@SneakyThrows
@ -25,13 +27,13 @@ public class PDFNetInitializer {
// Do not change this back to an application runner: as an application runner, it takes messages from the queue before PDFNet is initialized, which leads to an UnsatisfiedLinkError.
public void init() {
// Without a configured license (null or empty) the method returns before any PDFNet call is made.
if (Strings.isNullOrEmpty(settings.getPdftronLicense())) {
if (Strings.isNullOrEmpty(pdftronLicense)) {
return;
}
log.info("Initializing Native Libraries");
// NOTE(review): the license value itself is written to the log at info level — confirm this is
// acceptable, or log only the fact that a license was set.
log.info("Setting pdftron license: {}", settings.getPdftronLicense());
log.info("Setting pdftron license: {}", pdftronLicense);
// The temp path is set first; PDFNet is then initialized with the license.
PDFNet.setTempPath("/tmp/pdftron");
PDFNet.initialize(settings.getPdftronLicense());
PDFNet.initialize(pdftronLicense);
}

View File

@ -12,6 +12,7 @@ import java.util.Map;
import java.util.function.Predicate;
import org.apache.commons.lang3.StringUtils;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.core.io.ClassPathResource;
@ -34,6 +35,7 @@ import com.knecon.fforesight.service.layoutparser.processor.services.visualizati
import com.knecon.fforesight.service.layoutparser.server.utils.AbstractTest;
import com.knecon.fforesight.service.viewerdoc.service.PDFTronViewerDocumentService;
import jakarta.annotation.PostConstruct;
import lombok.SneakyThrows;
public class OutlineDetectionTest extends AbstractTest {
@ -44,6 +46,16 @@ public class OutlineDetectionTest extends AbstractTest {
@Autowired
protected LayoutParsingPipeline layoutParsingPipeline;
@Autowired
PDFNetInitializer pdfNetInitializer;
/**
 * Invokes the autowired {@code PDFNetInitializer} before every test method.
 */
@BeforeEach
public void init() {

    this.pdfNetInitializer.init();
}
@Test
@SneakyThrows
@ -60,28 +72,17 @@ public class OutlineDetectionTest extends AbstractTest {
OutlineObjectTree outlineObjectTree = classificationDocument.getOutlineObjectTree();
assertEquals(outlineObjectTree.getRootNodes().size(), 8);
assertEquals(outlineObjectTree.getOutlineObjectsPerPage()
.get(1).size(), 1);
assertEquals(outlineObjectTree.getOutlineObjectsPerPage()
.get(3).size(), 1);
assertEquals(outlineObjectTree.getOutlineObjectsPerPage()
.get(4).size(), 1);
assertEquals(outlineObjectTree.getOutlineObjectsPerPage()
.get(5).size(), 1);
assertEquals(outlineObjectTree.getOutlineObjectsPerPage()
.get(6).size(), 2);
assertEquals(outlineObjectTree.getOutlineObjectsPerPage()
.get(7).size(), 3);
assertEquals(outlineObjectTree.getOutlineObjectsPerPage()
.get(8).size(), 2);
assertEquals(outlineObjectTree.getOutlineObjectsPerPage()
.get(10).size(), 1);
assertEquals(outlineObjectTree.getOutlineObjectsPerPage()
.get(11).size(), 4);
assertEquals(outlineObjectTree.getOutlineObjectsPerPage()
.get(12).size(), 1);
assertEquals(outlineObjectTree.getOutlineObjectsPerPage()
.get(13).size(), 2);
assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(1).size(), 1);
assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(3).size(), 1);
assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(4).size(), 1);
assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(5).size(), 1);
assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(6).size(), 2);
assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(7).size(), 3);
assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(8).size(), 2);
assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(10).size(), 1);
assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(11).size(), 4);
assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(12).size(), 1);
assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(13).size(), 2);
assertTrue(outlineObjectTree.getOutlineObjectsPerPage().values()
.stream()
.flatMap(Collection::stream)
@ -98,29 +99,15 @@ public class OutlineDetectionTest extends AbstractTest {
.stream()
.map(outlineObjectTreeNode -> sanitizeString(outlineObjectTreeNode.getOutlineObject().getTitle()))
.toList());
assertEquals(tableOfContents.getMainSections()
.get(5).getChildren().size(), 6);
assertEquals(tableOfContents.getMainSections()
.get(7).getChildren().size(), 3);
assertEquals(tableOfContents.getMainSections()
.get(8).getChildren().size(), 3);
assertEquals(tableOfContents.getMainSections()
.get(8).getChildren()
.get(2).getChildren().size(), 1);
assertEquals(tableOfContents.getMainSections()
.get(8).getChildren()
.get(2).getChildren()
.get(0).getChildren().size(), 3);
assertEquals(tableOfContents.getMainSections().get(5).getChildren().size(), 6);
assertEquals(tableOfContents.getMainSections().get(7).getChildren().size(), 3);
assertEquals(tableOfContents.getMainSections().get(8).getChildren().size(), 3);
assertEquals(tableOfContents.getMainSections().get(8).getChildren().get(2).getChildren().size(), 1);
assertEquals(tableOfContents.getMainSections().get(8).getChildren().get(2).getChildren().get(0).getChildren().size(), 3);
assertEquals(tableOfContents.getMainSections()
.get(0).getImages().size(), 1);
assertEquals(tableOfContents.getMainSections()
.get(6).getImages().size(), 1);
assertEquals(tableOfContents.getMainSections()
.get(8).getChildren()
.get(2).getChildren()
.get(0).getChildren()
.get(2).getImages().size(), 1);
assertEquals(tableOfContents.getMainSections().get(0).getImages().size(), 1);
assertEquals(tableOfContents.getMainSections().get(6).getImages().size(), 1);
assertEquals(tableOfContents.getMainSections().get(8).getChildren().get(2).getChildren().get(0).getChildren().get(2).getImages().size(), 1);
Document document = buildGraph(fileName, classificationDocument);
@ -159,17 +146,14 @@ public class OutlineDetectionTest extends AbstractTest {
.count(), 3 + 1);
assertEquals(childrenOfTypeSectionOrSuperSection.get(8).streamChildren()
.filter(isSectionOrSuperSection)
.toList()
.get(3).streamChildren()
.toList().get(3).streamChildren()
.filter(isSectionOrSuperSection)
.count(), 1 + 1);
assertEquals(childrenOfTypeSectionOrSuperSection.get(8).streamChildren()
.filter(isSectionOrSuperSection)
.toList()
.get(3).streamChildren()
.toList().get(3).streamChildren()
.filter(isSectionOrSuperSection)
.toList()
.get(1).streamChildren()
.toList().get(1).streamChildren()
.filter(isSectionOrSuperSection)
.count(), 3 + 1);

View File

@ -4,11 +4,18 @@ import java.io.File;
import java.nio.file.Path;
import java.util.Map;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.mockito.MockitoAnnotations;
import org.springframework.amqp.rabbit.core.RabbitTemplate;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.mock.mockito.MockBean;
import org.springframework.core.io.ClassPathResource;
import com.iqser.red.commons.jackson.ObjectMapperFactory;
import com.iqser.red.storage.commons.service.StorageService;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
@ -16,17 +23,30 @@ import com.knecon.fforesight.service.layoutparser.processor.python_api.model.tab
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse;
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService;
import com.knecon.fforesight.service.layoutparser.server.PDFNetInitializer;
import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest;
import com.knecon.fforesight.service.viewerdoc.service.PDFTronViewerDocumentService;
import com.knecon.fforesight.tenantcommons.TenantsClient;
import com.pdftron.pdf.PDFNet;
import jakarta.annotation.PostConstruct;
import lombok.SneakyThrows;
public class ViewerDocumentTest extends BuildDocumentTest {
@Autowired
PDFNetInitializer pdfNetInitializer;
PDFTronViewerDocumentService viewerDocumentService = new PDFTronViewerDocumentService(null);
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
/**
 * Invokes the autowired {@code PDFNetInitializer} before every test method.
 */
@BeforeEach
public void init() {

    this.pdfNetInitializer.init();
}
@Test
@SneakyThrows
public void testViewerDocument() {

View File

@ -9,7 +9,5 @@ storage:
key: minioadmin
secret: minioadmin
layoutparser:
debug: true
pdftron-license: demo:1650351709282:7bd235e003000000004ec28a6743e1163a085e2115de2536ab6e2cfe5a

View File

@ -28,6 +28,11 @@ spring:
max-interval: 15000
prefetch: 1
layoutparser:
debug: true
pdftron.license: demo:1650351709282:7bd235e003000000004ec28a6743e1163a085e2115de2536ab6e2cfe5a
management:
endpoint:
metrics.enabled: ${monitoring.enabled:false}