RED-6009: Document Tree Structure

* fixed dependency issues
This commit is contained in:
Kilian Schuettler 2023-04-12 13:55:54 +02:00
parent aac0259caf
commit 2ed617bb03
25 changed files with 310 additions and 220 deletions

6
.gitignore vendored
View File

@ -4,6 +4,12 @@ target/
!**/src/main/**/target/
!**/src/test/**/target/
### maven build ###
*.class
/out/
**/out/
**/target/
### STS ###
.apt_generated
.classpath

View File

@ -2,8 +2,8 @@
<project xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xmlns="http://maven.apache.org/POM/4.0.0"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>com.knecon.fforesight</groupId>
@ -12,9 +12,6 @@
</parent>
<artifactId>layoutparser-service-internal-api</artifactId>
<version>1.0.0</version>
<packaging>pom</packaging>
<dependencies>
<dependency>
@ -29,21 +26,4 @@
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-maven-plugin</artifactId>
<configuration>
<excludes>
<exclude>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
</exclude>
</excludes>
</configuration>
</plugin>
</plugins>
</build>
</project>

View File

@ -5,8 +5,6 @@ import java.util.List;
import java.util.Map;
import java.util.stream.Stream;
import javax.management.openmbean.InvalidKeyException;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.NodeType;
import lombok.AccessLevel;
@ -23,15 +21,15 @@ import lombok.experimental.FieldDefaults;
@FieldDefaults(level = AccessLevel.PRIVATE)
public class TableOfContentsData {
List<EntryData> entries;
EntryData root;
public EntryData get(List<Integer> tocId) {
if (tocId.size() < 1) {
throw new InvalidKeyException(String.format("ClassificationSection Identifier: \"%s\" is not valid.", tocId));
if (tocId.isEmpty()) {
return root;
}
EntryData entry = entries.get(tocId.get(0));
EntryData entry = root.subEntries.get(tocId.get(0));
for (int id : tocId.subList(1, tocId.size())) {
entry = entry.subEntries().get(id);
}
@ -41,7 +39,7 @@ public class TableOfContentsData {
public Stream<EntryData> streamAllEntries() {
return entries.stream().flatMap(TableOfContentsData::flatten);
return Stream.concat(Stream.of(root), root.subEntries.stream()).flatMap(TableOfContentsData::flatten);
}

View File

@ -10,6 +10,7 @@ import java.util.stream.Collectors;
import java.util.stream.Stream;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.NodeType;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SectionNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SemanticNode;
@ -20,11 +21,13 @@ import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
@Data
@Builder
@AllArgsConstructor
@NoArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class DocumentGraph implements SemanticNode {
@ -42,7 +45,7 @@ public class DocumentGraph implements SemanticNode {
public List<SectionNode> getMainSections() {
return tableOfContents.entries.stream().filter(entry -> entry.node() instanceof SectionNode).map(entry -> (SectionNode) entry.node()).collect(Collectors.toList());
return streamChildren().filter(node -> node instanceof SectionNode).map(node -> (SectionNode) node).collect(Collectors.toList());
}
@ -74,14 +77,14 @@ public class DocumentGraph implements SemanticNode {
private Stream<SemanticNode> streamAllNodes() {
return tableOfContents.streamEntriesInOrder().map(TableOfContents.Entry::node);
return tableOfContents.streamAllEntriesInOrder().map(TableOfContents.Entry::node);
}
@Override
public String toString() {
return tableOfContents.toString();
return NodeType.DOCUMENT + ": " + buildTextBlock().buildSummary();
}

View File

@ -1,10 +1,11 @@
package com.knecon.fforesight.service.layoutparser.internal.api.graph;
import static java.lang.String.format;
import java.nio.charset.StandardCharsets;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.stream.Stream;
import com.google.common.hash.Hashing;
@ -19,22 +20,22 @@ import lombok.Data;
@Data
public class TableOfContents {
List<Entry> entries;
private final Entry root;
public TableOfContents() {
public TableOfContents(DocumentGraph documentGraph) {
entries = new LinkedList<>();
root = Entry.builder().tocId(Collections.emptyList()).type(NodeType.DOCUMENT).children(new LinkedList<>()).node(documentGraph).build();
}
public TextBlock buildTextBlock() {
return streamEntriesInOrder().map(Entry::node).filter(SemanticNode::isTerminal).map(SemanticNode::getTerminalTextBlock).collect(new TextBlockCollector());
return streamAllEntriesInOrder().map(Entry::node).filter(SemanticNode::isTerminal).map(SemanticNode::getTerminalTextBlock).collect(new TextBlockCollector());
}
public List<Integer> createNewEntryAndReturnId(NodeType nodeType, SemanticNode node) {
public List<Integer> createNewMainEntryAndReturnId(NodeType nodeType, SemanticNode node) {
return createNewChildEntryAndReturnId(Collections.emptyList(), nodeType, node);
}
@ -42,27 +43,25 @@ public class TableOfContents {
public List<Integer> createNewChildEntryAndReturnId(List<Integer> parentId, NodeType nodeType, SemanticNode node) {
List<Integer> newId;
if (entryExists(parentId)) {
Entry parent = getEntryById(parentId);
newId = new LinkedList<>(parentId);
newId.add(parent.children().size());
parent.children().add(Entry.builder().tocId(newId).node(node).type(nodeType).children(new LinkedList<>()).build());
} else {
newId = List.of(entries.size());
entries.add(Entry.builder().tocId(newId).node(node).type(nodeType).children(new LinkedList<>()).build());
if (!entryExists(parentId)) {
throw new UnsupportedOperationException(format("parentId %s does not exist!", parentId));
}
Entry parent = getEntryById(parentId);
List<Integer> newId = new LinkedList<>(parentId);
newId.add(parent.children().size());
parent.children().add(Entry.builder().tocId(newId).node(node).type(nodeType).children(new LinkedList<>()).build());
return newId;
}
private boolean entryExists(List<Integer> tocId) {
if (tocId.size() < 1) {
return false;
if (tocId.isEmpty()) {
return root != null;
}
Entry entry = entries.get(tocId.get(0));
Entry entry = root.children.get(tocId.get(0));
for (int id : tocId.subList(1, tocId.size())) {
if (id >= entry.children.size() || 0 > id) {
return false;
@ -75,22 +74,17 @@ public class TableOfContents {
public Entry getParentEntryById(List<Integer> tocId) {
List<Integer> parentIds = getParentId(tocId);
if (parentIds.size() < 1) {
throw new NoSuchElementException(String.format("Node with tocId \"%s\" has no parent!", tocId));
}
return getEntryById(parentIds);
return getEntryById(getParentId(tocId));
}
public boolean hasParentById(List<Integer> tocId) {
List<Integer> parentId = getParentId(tocId);
return entryExists(parentId);
return entryExists(getParentId(tocId));
}
public Stream<SemanticNode> streamChildren(List<Integer> tocId) {
public Stream<SemanticNode> streamChildrenNodes(List<Integer> tocId) {
return getEntryById(tocId).children().stream().map(Entry::node);
}
@ -98,13 +92,22 @@ public class TableOfContents {
private static List<Integer> getParentId(List<Integer> tocId) {
if (tocId.isEmpty()) {
throw new UnsupportedOperationException("Root has no parent!");
}
if (tocId.size() < 2) {
return Collections.emptyList();
}
return tocId.subList(0, tocId.size() - 1);
}
public Entry getEntryById(List<Integer> tocId) {
Entry entry = entries.get(tocId.get(0));
if (tocId.isEmpty()) {
return root;
}
Entry entry = root.children.get(tocId.get(0));
for (int id : tocId.subList(1, tocId.size())) {
entry = entry.children().get(id);
}
@ -112,13 +115,19 @@ public class TableOfContents {
}
public Stream<Entry> streamEntriesInOrder() {
public Stream<Entry> streamMainEntries() {
return entries.stream().flatMap(TableOfContents::flatten);
return root.children.stream();
}
public Stream<Entry> streamSubEntriesInOrder(List<Integer> parentId) {
public Stream<Entry> streamAllEntriesInOrder() {
return Stream.of(root).flatMap(TableOfContents::flatten);
}
public Stream<Entry> streamAllSubEntriesInOrder(List<Integer> parentId) {
return Stream.of(getEntryById(parentId)).flatMap(TableOfContents::flatten);
}
@ -127,13 +136,13 @@ public class TableOfContents {
@Override
public String toString() {
return String.join("\n", streamEntriesInOrder().map(Entry::toString).toList());
return String.join("\n", streamAllEntriesInOrder().map(Entry::toString).toList());
}
public String toString(List<Integer> id) {
return String.join("\n", streamSubEntriesInOrder(id).map(Entry::toString).toList());
return String.join("\n", streamAllSubEntriesInOrder(id).map(Entry::toString).toList());
}

View File

@ -1,6 +1,7 @@
package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;
public enum NodeType {
DOCUMENT,
SECTION,
HEADLINE,
PARAGRAPH,

View File

@ -205,7 +205,7 @@ public interface SemanticNode {
*/
default Stream<SemanticNode> streamChildren() {
return getTableOfContents().streamChildren(getTocId());
return getTableOfContents().streamChildrenNodes(getTocId());
}
@ -216,7 +216,7 @@ public interface SemanticNode {
*/
default Stream<SemanticNode> streamAllSubNodes() {
return getTableOfContents().streamSubEntriesInOrder(getTocId()).map(TableOfContents.Entry::node);
return getTableOfContents().streamAllSubEntriesInOrder(getTocId()).map(TableOfContents.Entry::node);
}

View File

@ -48,7 +48,7 @@ public class DocumentDataMapper {
private TableOfContentsData toTableOfContentsData(TableOfContents tableOfContents) {
return new TableOfContentsData(tableOfContents.getEntries().stream().map(this::toEntryData).toList());
return new TableOfContentsData(toEntryData(tableOfContents.getRoot()));
}

View File

@ -38,8 +38,10 @@ public class DocumentGraphMapper {
public DocumentGraph toDocumentGraph(DocumentData documentData) {
DocumentGraph documentGraph = new DocumentGraph();
Context context = new Context(documentData,
new TableOfContents(),
new TableOfContents(documentGraph),
new LinkedList<>(),
new LinkedList<>(),
Arrays.stream(documentData.getAtomicTextBlocks()).toList(),
@ -47,13 +49,12 @@ public class DocumentGraphMapper {
context.pages.addAll(Arrays.stream(documentData.getPages()).map(this::buildPage).toList());
context.tableOfContents.setEntries(buildEntries(documentData.getTableOfContents().getEntries(), context));
context.tableOfContents.getRoot().children().addAll(buildEntries(documentData.getTableOfContents().getRoot().subEntries(), context));
documentGraph.setTableOfContents(context.tableOfContents);
documentGraph.setPages(new HashSet<>(context.pages));
documentGraph.setNumberOfPages(documentData.getPages().length);
DocumentGraph documentGraph = DocumentGraph.builder()
.numberOfPages(documentData.getPages().length)
.pages(new HashSet<>(context.pages))
.tableOfContents(context.tableOfContents)
.build();
documentGraph.setTextBlock(documentGraph.buildTextBlock());
return documentGraph;
}

View File

@ -1,5 +1,6 @@
package com.knecon.fforesight.service.layoutparser.internal.api.services;
import java.util.Collections;
import java.util.NoSuchElementException;
import java.util.Set;
@ -21,9 +22,7 @@ public class EntityInsertionService {
public void addEntityToGraph(EntityNode entity, TableOfContents tableOfContents) {
try {
SemanticNode containingNode = tableOfContents.getEntries()
.stream()
.map(TableOfContents.Entry::node)
SemanticNode containingNode = tableOfContents.streamChildrenNodes(Collections.emptyList())
.filter(node -> node.buildTextBlock().containsBoundary(entity.getBoundary()))
.findFirst()
.orElseThrow(() -> new NoSuchElementException("No containing Node found!"));
@ -37,7 +36,6 @@ public class EntityInsertionService {
addToNodeEntitySets(entity);
} catch (NoSuchElementException e) {
entityEnrichmentService.enrichEntity(entity, tableOfContents.buildTextBlock());
entity.removeFromGraph();
}
}

View File

@ -10,94 +10,78 @@
</parent>
<artifactId>layoutparser-service-processor</artifactId>
<version>1.0.0</version>
<dependencies>
<dependency>
<groupId>com.iqser.red.service</groupId>
<artifactId>persistence-service-internal-api-v1</artifactId>
<version>2.36.0</version>
</dependency>
<dependency>
<groupId>com.knecon.fforesight</groupId>
<artifactId>layoutparser-service-internal-api</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>com.iqser.red.commons</groupId>
<artifactId>spring-commons</artifactId>
<version>6.2.0</version>
</dependency>
<dependency>
<groupId>com.iqser.red.commons</groupId>
<artifactId>storage-commons</artifactId>
<version>1.13.0</version>
</dependency>
<dependency>
<groupId>com.iqser.red.commons</groupId>
<artifactId>spring-commons</artifactId>
<version>6.2.0</version>
</dependency>
<dependency>
<groupId>com.dslplatform</groupId>
<artifactId>dsl-json-java8</artifactId>
<version>1.10.0</version>
</dependency>
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>${pdfbox.version}</version>
</dependency>
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox-tools</artifactId>
<version>${pdfbox.version}</version>
</dependency>
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>31.1-jre</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.module</groupId>
<artifactId>jackson-module-afterburner</artifactId>
<version>${jackson.version}</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.datatype</groupId>
<artifactId>jackson-datatype-jsr310</artifactId>
<version>${jackson.version}</version>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-security</artifactId>
<version>${spring.version}</version>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-web</artifactId>
<version>${spring.version}</version>
</dependency>
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<optional>true</optional>
</dependency>
<dependency>
<groupId>org.springframework.cloud</groupId>
<artifactId>spring-cloud-starter-openfeign</artifactId>
<version>4.0.2</version>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-amqp</artifactId>
@ -105,22 +89,6 @@
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-maven-plugin</artifactId>
<configuration>
<excludes>
<exclude>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
</exclude>
</excludes>
</configuration>
</plugin>
</plugins>
</build>
<repositories>
<repository>
<id>spring-milestones</id>

View File

@ -52,15 +52,17 @@ public class DocumentGraphFactory {
public DocumentGraph buildDocumentGraph(ClassificationDocument document) {
TextBlockFactory textBlockFactory = new TextBlockFactory();
Context context = new Context(new TableOfContents(), new HashMap<>(), new LinkedList<>(), new LinkedList<>(), textBlockFactory);
DocumentGraph documentGraph = new DocumentGraph();
Context context = new Context(new TableOfContents(documentGraph), new HashMap<>(), new LinkedList<>(), new LinkedList<>(), textBlockFactory);
document.getPages().stream().map(this::buildPage).forEach(page -> context.pages().put(page, new AtomicInteger(1)));
document.getSections().stream().flatMap(section -> section.getImages().stream()).forEach(image -> context.images().add(image));
addSections(document, context);
addHeaderAndFooterToEachPage(document, context);
DocumentGraph documentGraph = DocumentGraph.builder().numberOfPages(context.pages.size()).pages(context.pages.keySet()).tableOfContents(context.tableOfContents).build();
documentGraph.setNumberOfPages(context.pages.size());
documentGraph.setPages(context.pages.keySet());
documentGraph.setTableOfContents(context.tableOfContents);
documentGraph.setTextBlock(documentGraph.buildTextBlock());
return documentGraph;
}
@ -82,7 +84,7 @@ public class DocumentGraphFactory {
List<Integer> tocId;
if (parentNode == null) {
tocId = context.tableOfContents.createNewEntryAndReturnId(NodeType.SECTION, sectionNode);
tocId = context.tableOfContents.createNewMainEntryAndReturnId(NodeType.SECTION, sectionNode);
} else {
tocId = context.tableOfContents.createNewChildEntryAndReturnId(parentNode.getTocId(), NodeType.SECTION, sectionNode);
}
@ -309,7 +311,7 @@ public class DocumentGraphFactory {
footer,
context,
page);
List<Integer> tocId = context.tableOfContents().createNewEntryAndReturnId(NodeType.FOOTER, footer);
List<Integer> tocId = context.tableOfContents().createNewMainEntryAndReturnId(NodeType.FOOTER, footer);
footer.setTocId(tocId);
footer.setTerminalTextBlock(textBlock);
page.setFooter(footer);
@ -325,7 +327,7 @@ public class DocumentGraphFactory {
context,
0,
page);
List<Integer> tocId = context.tableOfContents().createNewEntryAndReturnId(NodeType.HEADER, header);
List<Integer> tocId = context.tableOfContents().createNewMainEntryAndReturnId(NodeType.HEADER, header);
header.setTocId(tocId);
header.setTerminalTextBlock(textBlock);
page.setHeader(header);
@ -337,7 +339,7 @@ public class DocumentGraphFactory {
PageNode page = getPage(pageIndex, context);
FooterNode footer = FooterNode.builder().tableOfContents(context.tableOfContents()).build();
AtomicTextBlock textBlock = context.textBlockFactory.emptyTextBlock(footer, context, page);
List<Integer> tocId = context.tableOfContents().createNewEntryAndReturnId(NodeType.FOOTER, footer);
List<Integer> tocId = context.tableOfContents().createNewMainEntryAndReturnId(NodeType.FOOTER, footer);
footer.setTocId(tocId);
footer.setTerminalTextBlock(textBlock);
page.setFooter(footer);
@ -349,7 +351,7 @@ public class DocumentGraphFactory {
PageNode page = getPage(pageIndex, context);
HeaderNode header = HeaderNode.builder().tableOfContents(context.tableOfContents()).build();
AtomicTextBlock textBlock = context.textBlockFactory.emptyTextBlock(header, 0, page);
List<Integer> tocId = context.tableOfContents().createNewEntryAndReturnId(NodeType.HEADER, header);
List<Integer> tocId = context.tableOfContents().createNewMainEntryAndReturnId(NodeType.HEADER, header);
header.setTocId(tocId);
header.setTerminalTextBlock(textBlock);
page.setHeader(header);

View File

@ -10,7 +10,6 @@
</parent>
<artifactId>layoutparser-service-server</artifactId>
<version>1.0.0</version>
<dependencies>
<dependency>
@ -18,7 +17,11 @@
<artifactId>layoutparser-service-processor</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>com.iqser.red.commons</groupId>
<artifactId>spring-commons</artifactId>
<version>6.2.0</version>
</dependency>
<dependency>
<groupId>javax.servlet</groupId>
<artifactId>javax.servlet-api</artifactId>
@ -29,13 +32,11 @@
<artifactId>spring-cloud-starter-openfeign</artifactId>
<version>4.0.2</version>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-amqp</artifactId>
<version>${spring.version}</version>
</dependency>
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter-api</artifactId>
@ -54,5 +55,28 @@
<version>6.0.3</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.mockito</groupId>
<artifactId>mockito-core</artifactId>
<version>5.3.0</version>
<scope>test</scope>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-maven-plugin</artifactId>
<configuration>
<excludes>
<exclude>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
</exclude>
</excludes>
</configuration>
</plugin>
</plugins>
</build>
</project>

View File

@ -1,7 +0,0 @@
package com.knecon.fforesight.service.layoutparser.server;
import static org.junit.jupiter.api.Assertions.*;
class ApplicationTest {
}

View File

@ -2,7 +2,9 @@ package com.knecon.fforesight.service.layoutparser.server;
import org.junit.jupiter.api.Test;
class LayoutParserApplicationTests extends BaseTest {
import com.knecon.fforesight.service.layoutparser.server.utils.BaseTest;
class ApplicationTests extends BaseTest {
@Test
void contextLoads() {

View File

@ -0,0 +1,51 @@
package com.knecon.fforesight.service.layoutparser.server.graph;
import static org.junit.jupiter.api.Assertions.assertEquals;
import java.io.InputStream;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.core.io.ClassPathResource;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.DocumentGraph;
import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingService;
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableServiceResponse;
import com.knecon.fforesight.service.layoutparser.server.utils.BaseTest;
import lombok.SneakyThrows;
/**
 * Test that builds a {@link DocumentGraph} from a PDF on the classpath and checks its page,
 * header and footer counts. {@link #buildGraph(String)} is {@code protected} so that
 * subclasses can build graphs the same way.
 */
public class BuildDocumentGraphTest extends BaseTest {
@Autowired
private LayoutParsingService layoutParsingService;
/**
 * Builds the graph for the S-Metolachlor document and asserts that it has 221 pages, of which
 * 220 have a header containing text and none have a footer containing text.
 */
@Test
public void buildMetolachlor() {
DocumentGraph documentGraph = buildGraph("files/S-Metolachlor_RAR_01_Volume_1_2018-09-06");
assertEquals(221, documentGraph.getPages().size());
assertEquals(220 , documentGraph.getPages().stream().filter(page -> page.getHeader().hasText()).count());
assertEquals(0 , documentGraph.getPages().stream().filter(page -> page.getFooter().hasText()).count());
}
/**
 * Prepares storage for the given file, loads {@code filename + ".pdf"} from the classpath and
 * parses its layout into a {@link DocumentGraph}.
 *
 * @param filename classpath-relative path of the PDF without the {@code .pdf} extension
 * @return the graph returned by {@code layoutParsingService.parseLayout(...)}
 */
@SneakyThrows
protected DocumentGraph buildGraph(String filename) {
// The S-Metolachlor document is the only one prepared with an explicit (empty) table-parsing
// response and an image-info response; every other file is prepared with the PDF alone.
// NOTE(review): prepareStorage, layoutParsingStorageService and IMAGE_FILE_ID are not defined
// in this class — presumably inherited from BaseTest; confirm there.
if (filename.equals("files/S-Metolachlor_RAR_01_Volume_1_2018-09-06")) {
prepareStorage(filename + ".pdf", "cv_table_parsing_response/empty.json", "image_service_response/S-Metolachlor_RAR_01_Volume_1_2018-09-06.IMAGE_INFO.json");
} else {
prepareStorage(filename + ".pdf");
}
ClassPathResource fileResource = new ClassPathResource(filename + ".pdf");
// The input stream is closed by try-with-resources.
// NOTE(review): the PDDocument itself is never closed here — confirm whether parseLayout or
// the returned graph still needs it before adding a close.
try (InputStream inputStream = fileResource.getInputStream()) {
PDDocument pdDocument = Loader.loadPDF(inputStream);
// A fresh TableServiceResponse is always passed, regardless of what was put into storage above.
return layoutParsingService.parseLayout(pdDocument, layoutParsingStorageService.getImagesFile(IMAGE_FILE_ID), new TableServiceResponse());
}
}
}

View File

@ -4,14 +4,10 @@ import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertInstanceOf;
import static org.wildfly.common.Assert.assertTrue;
import java.io.InputStream;
import java.util.List;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.core.io.ClassPathResource;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.Boundary;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.DocumentGraph;
@ -26,21 +22,13 @@ import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.Table
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.TableNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.internal.api.services.EntityInsertionService;
import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingService;
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableServiceResponse;
import com.knecon.fforesight.service.layoutparser.server.BaseTest;
import com.knecon.fforesight.service.layoutparser.server.TestEntity;
import com.knecon.fforesight.service.layoutparser.server.utils.TestEntity;
import lombok.SneakyThrows;
public class DocumentGraphTest extends BaseTest {
public class DocumentGraphEntityInsertionTest extends BuildDocumentGraphTest {
@Autowired
private EntityInsertionService entityInsertionService;
@Autowired
private LayoutParsingService layoutParsingService;
@Test
public void assertTextBeforeAndTextAfterForParagraphCrafted() {
@ -51,7 +39,7 @@ public class DocumentGraphTest extends BaseTest {
assert start != -1;
Boundary boundary = new Boundary(start, start + searchTerm.length());
TestEntity entityNode = TestEntity.initialEntityNode(boundary, "123", "123");
TestEntity entityNode = TestEntity.initialEntityNode(boundary, "123", "123");
entityInsertionService.addEntityToGraph(entityNode, documentGraph.getTableOfContents());
assertEquals("Expand to Hint ", entityNode.getTextBefore());
@ -76,7 +64,7 @@ public class DocumentGraphTest extends BaseTest {
assert start != -1;
Boundary boundary = new Boundary(start, start + searchTerm.length());
TestEntity entityNode = TestEntity.initialEntityNode(boundary, "123", "123");
TestEntity entityNode = TestEntity.initialEntityNode(boundary, "123", "123");
entityInsertionService.addEntityToGraph(entityNode, documentGraph.getTableOfContents());
assertEquals("", entityNode.getTextBefore());
@ -132,7 +120,7 @@ public class DocumentGraphTest extends BaseTest {
DocumentGraph documentGraph = buildGraph("files/crafted document");
TableNode table = (TableNode) documentGraph.getTableOfContents()//
.streamEntriesInOrder()//
.streamAllEntriesInOrder()//
.filter(entry -> entry.type().equals(NodeType.TABLE))//
.map(TableOfContents.Entry::node)//
.findFirst().orElseThrow();
@ -162,7 +150,7 @@ public class DocumentGraphTest extends BaseTest {
DocumentGraph documentGraph = buildGraph("files/S-Metolachlor_RAR_01_Volume_1_2018-09-06");
TableNode table = (TableNode) documentGraph.getTableOfContents()
.streamEntriesInOrder()
.streamAllEntriesInOrder()
.filter(entry -> entry.node().getPages().stream().anyMatch(page -> page.getNumber() == 22))
.filter(entry -> entry.type().equals(NodeType.TABLE))
.map(TableOfContents.Entry::node)
@ -187,7 +175,7 @@ public class DocumentGraphTest extends BaseTest {
assert start != -1;
Boundary boundary = new Boundary(start, start + searchTerm.length());
TestEntity entityNode = TestEntity.initialEntityNode(boundary, "123", "123");
TestEntity entityNode = TestEntity.initialEntityNode(boundary, "123", "123");
entityInsertionService.addEntityToGraph(entityNode, documentGraph.getTableOfContents());
assertEquals("except Cranberry; Vegetable, ", entityNode.getTextBefore());
@ -214,7 +202,7 @@ public class DocumentGraphTest extends BaseTest {
assert start != -1;
Boundary boundary = new Boundary(start, start + searchTerm.length());
TestEntity entityNode = TestEntity.initialEntityNode(boundary, "123", "123");
TestEntity entityNode = TestEntity.initialEntityNode(boundary, "123", "123");
entityInsertionService.addEntityToGraph(entityNode, documentGraph.getTableOfContents());
assertEquals("2.6.1 Summary of ", entityNode.getTextBefore());
@ -240,7 +228,7 @@ public class DocumentGraphTest extends BaseTest {
assert start != -1;
Boundary boundary = new Boundary(start, start + searchTerm.length());
TestEntity entityNode = TestEntity.initialEntityNode(boundary, "123", "123");
TestEntity entityNode = TestEntity.initialEntityNode(boundary, "123", "123");
entityInsertionService.addEntityToGraph(entityNode, documentGraph.getTableOfContents());
assertEquals("2-[(2-(1-hydroxy-ethyl)-6methyl-phenyl-amino]propan-1-ol (", entityNode.getTextBefore());
@ -258,23 +246,6 @@ public class DocumentGraphTest extends BaseTest {
}
@SneakyThrows
protected DocumentGraph buildGraph(String filename) {
if (filename.equals("files/S-Metolachlor_RAR_01_Volume_1_2018-09-06")) {
prepareStorage(filename + ".pdf", "cv_table_parsing_response/empty.json", "image_service_response/S-Metolachlor_RAR_01_Volume_1_2018-09-06.IMAGE_INFO.json");
} else {
prepareStorage(filename + ".pdf");
}
ClassPathResource fileResource = new ClassPathResource(filename + ".pdf");
try (InputStream inputStream = fileResource.getInputStream()) {
PDDocument pdDocument = Loader.loadPDF(inputStream);
return layoutParsingService.parseLayout(pdDocument, layoutParsingStorageService.getImagesFile(IMAGE_FILE_ID), new TableServiceResponse());
}
}
private static void assertSameOffsetInAllIntersectingNodes(String searchTerm, int start, EntityNode entityNode) {
List<Integer> paragraphStart = entityNode.getIntersectingNodes().stream()//
@ -293,7 +264,7 @@ public class DocumentGraphTest extends BaseTest {
assert start != -1;
Boundary boundary = new Boundary(start, start + searchTerm.length());
TestEntity entityNode = TestEntity.initialEntityNode(boundary, "123", "123");
TestEntity entityNode = TestEntity.initialEntityNode(boundary, "123", "123");
entityInsertionService.addEntityToGraph(entityNode, documentGraph.getTableOfContents());
PageNode pageNode = documentGraph.getPages().stream().filter(page -> page.getNumber() == pageNumber).findFirst().orElseThrow();

View File

@ -1,31 +1,18 @@
package com.knecon.fforesight.service.layoutparser.server.graph;
import java.util.Collections;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.core.io.ClassPathResource;
import com.knecon.fforesight.service.layoutparser.internal.api.data.DocumentData;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.DocumentGraph;
import com.knecon.fforesight.service.layoutparser.internal.api.mapper.DocumentDataMapper;
import com.knecon.fforesight.service.layoutparser.internal.api.mapper.DocumentGraphMapper;
import com.knecon.fforesight.service.layoutparser.processor.classification.service.PdfParsingService;
import com.knecon.fforesight.service.layoutparser.processor.factory.DocumentGraphFactory;
import com.knecon.fforesight.service.layoutparser.processor.queue.LayoutParsingRequest;
import com.knecon.fforesight.service.layoutparser.server.BaseTest;
import lombok.SneakyThrows;
public class DocumentGraphMappingTest extends BaseTest {
public class DocumentGraphMappingTest extends BuildDocumentGraphTest {
@Autowired
private DocumentGraphFactory documentGraphFactory;
@Autowired
private PdfParsingService pdfParsingService;
@Autowired
private DocumentDataMapper documentDataMapper;
@ -38,17 +25,10 @@ public class DocumentGraphMappingTest extends BaseTest {
@SneakyThrows
public void testGraphMapping() {
String filename = "files/crafted document";
DocumentGraph document = buildGraph("files/crafted document");
LayoutParsingRequest layoutParsingRequest = buildStandardLayoutParsingRequest();
prepareStorage(filename + ".pdf");
ClassPathResource fileResource = new ClassPathResource(filename + ".pdf");
LayoutParsingRequest layoutParsingRequest = prepareStorage(fileResource.getInputStream());
PDDocument pdDocument = Loader.loadPDF(fileResource.getInputStream());
var classifiedDoc = pdfParsingService.parseDocument(pdDocument, Collections.emptyMap(), Collections.emptyMap());
DocumentGraph document = documentGraphFactory.buildDocumentGraph(classifiedDoc);
DocumentData documentData = documentDataMapper.toDocumentData(document);
layoutParsingStorageService.storeDocumentData(layoutParsingRequest, documentData);
DocumentData documentData2 = layoutParsingStorageService.readDocumentData(layoutParsingRequest);
DocumentGraph newDocumentGraph = documentGraphMapper.toDocumentGraph(documentData2);

View File

@ -12,37 +12,50 @@ import org.springframework.core.io.ClassPathResource;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.DocumentGraph;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.server.visualizations.PdfDraw;
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
import lombok.SneakyThrows;
@Disabled
public class DocumentGraphVisualizationTest extends DocumentGraphTest {
public class DocumentGraphVisualizationTest extends BuildDocumentGraphTest {
/**
 * Produces the semantic-node visualization for the S-Metolachlor sample document.
 * Marked {@code @Disabled}, so it only runs when enabled manually.
 */
@Test
@SneakyThrows
@Disabled
public void visualizeMetolachlor() {

    String filename = "files/S-Metolachlor_RAR_01_Volume_1_2018-09-06";
    // visualizePdf builds the graph, derives the text block and draws the nodes;
    // repeating those steps here ran the whole pipeline twice for the same file.
    visualizePdf(filename);
}
/**
 * Produces the semantic-node visualization for the rotated test document with images.
 * Marked {@code @Disabled}, so it only runs when enabled manually.
 */
@Test
@SneakyThrows
@Disabled
public void visualizeRotatedTestDocument() {

    visualizePdf("files/RotateTestFileWithImages");
}
/**
 * Produces the semantic-node visualization for the crafted sample document.
 * Marked {@code @Disabled}, so it only runs when enabled manually.
 */
@Test
@SneakyThrows
@Disabled
public void visualizeCraftedDocument() {

    visualizePdf("files/crafted document");
}
/**
 * Builds the document graph for the given file, derives its text block and hands
 * both to {@code visualizeSemanticNodes}.
 *
 * @param filename resource name as passed to {@code buildGraph}; callers in this class
 *                 pass it without a file extension (presumably resolved by the helper — confirm)
 */
@SneakyThrows
private void visualizePdf(String filename) {

    DocumentGraph graph = buildGraph(filename);
    visualizeSemanticNodes(filename, graph, graph.buildTextBlock());
}

View File

@ -1,9 +1,10 @@
package com.knecon.fforesight.service.layoutparser.server;
package com.knecon.fforesight.service.layoutparser.server.utils;
import java.io.InputStream;
import java.util.Optional;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.extension.ExtendWith;
import org.springframework.amqp.rabbit.core.RabbitTemplate;
import org.springframework.beans.factory.annotation.Autowired;
@ -12,6 +13,7 @@ import org.springframework.boot.autoconfigure.amqp.RabbitAutoConfiguration;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.boot.test.mock.mockito.MockBean;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.ComponentScan;
import org.springframework.context.annotation.Configuration;
import org.springframework.context.annotation.Import;
import org.springframework.context.annotation.Primary;
@ -22,9 +24,12 @@ import com.iqser.red.storage.commons.service.StorageService;
import com.knecon.fforesight.service.layoutparser.internal.api.mapper.DocumentDataMapper;
import com.knecon.fforesight.service.layoutparser.internal.api.mapper.DocumentGraphMapper;
import com.knecon.fforesight.service.layoutparser.internal.api.services.EntityEnrichmentService;
import com.knecon.fforesight.service.layoutparser.internal.api.services.EntityInsertionService;
import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingStorageService;
import com.knecon.fforesight.service.layoutparser.processor.multitenancy.TenantContext;
import com.knecon.fforesight.service.layoutparser.processor.multitenancy.TenantsClient;
import com.knecon.fforesight.service.layoutparser.processor.queue.LayoutParsingRequest;
import com.knecon.fforesight.service.layoutparser.server.Application;
import lombok.SneakyThrows;
@ -39,6 +44,9 @@ public class BaseTest {
@Autowired
protected StorageService storageService;
@Autowired
protected TenantsClient tenantsClient;
@MockBean
private RabbitTemplate rabbitTemplate;
@ -49,6 +57,35 @@ public class BaseTest {
protected final static String TEXT_FILE_ID = "texts";
protected final static String POSITION_FILE_ID = "positions";
protected final static String PAGES_FILE_ID = "pages";
protected final static String TENANT_ID = "tenant";
/**
 * Creates a {@link LayoutParsingRequest} whose storage ids are all set to the
 * fixed file-id constants of this test base class.
 *
 * @return a request with the mandatory ids set directly and the image and table
 *         ids wrapped in {@link Optional}
 */
protected LayoutParsingRequest buildStandardLayoutParsingRequest() {

    return LayoutParsingRequest.builder()
            // ids passed as plain strings
            .originFileStorageId(ORIGIN_FILE_ID)
            .structureFileStorageId(STRUCTURE_FILE_ID)
            .textBlockFileStorageId(TEXT_FILE_ID)
            .positionBlockFileStorageId(POSITION_FILE_ID)
            .pageFileStorageId(PAGES_FILE_ID)
            // ids the request models as Optional
            .imagesFileStorageId(Optional.of(IMAGE_FILE_ID))
            .tablesFileStorageId(Optional.of(TABLE_FILE_ID))
            .build();
}
/**
 * Sets the tenant id in {@link TenantContext} to {@code TENANT_ID} before every test.
 */
@BeforeEach
public void setupTenantContext() {

    TenantContext.setTenantId(TENANT_ID);
}
/**
 * Clears the {@link TenantContext} after every test, undoing {@code setupTenantContext}.
 */
@AfterEach
public void clearTenantContext() {

    TenantContext.clear();
}
@SneakyThrows
@ -115,6 +152,7 @@ public class BaseTest {
@Configuration
@EnableAutoConfiguration(exclude = RabbitAutoConfiguration.class)
@ComponentScan("com.knecon.fforesight.service.layoutparser")
public static class TestConfiguration {
@Bean
@ -125,13 +163,6 @@ public class BaseTest {
}
/**
 * Registers a {@link TestEntityEnrichmentService} as the {@link EntityEnrichmentService} bean.
 *
 * NOTE(review): TestEntityEnrichmentService is also annotated with {@code @Service} and this
 * configuration component-scans the layoutparser package, which may register a second bean
 * of the same type — confirm this factory method is still needed.
 */
@Bean
public EntityEnrichmentService testEntityEnrichmentService() {

    return new TestEntityEnrichmentService();
}
@Bean
public DocumentDataMapper documentDataMapper() {
@ -145,6 +176,14 @@ public class BaseTest {
return new DocumentGraphMapper();
}
/**
 * Registers the {@link EntityInsertionService} used by the tests.
 *
 * The container resolves the parameters of a {@code @Bean} method on its own, so no
 * {@code @Autowired} is needed here; keeping it would additionally mark this factory
 * method as an injection method on the configuration instance.
 *
 * @param entityEnrichmentService the enrichment service bean supplied by the container
 * @return a new insertion service backed by the given enrichment service
 */
@Bean
public EntityInsertionService entityInsertionService(EntityEnrichmentService entityEnrichmentService) {

    return new EntityInsertionService(entityEnrichmentService);
}
}
}

View File

@ -1,4 +1,4 @@
package com.knecon.fforesight.service.layoutparser.server;
package com.knecon.fforesight.service.layoutparser.server.utils;
import static java.io.File.createTempFile;
@ -14,6 +14,7 @@ import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.springframework.core.io.InputStreamResource;
import org.springframework.stereotype.Service;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.iqser.red.commons.jackson.ObjectMapperFactory;
@ -22,6 +23,7 @@ import com.iqser.red.storage.commons.service.StorageService;
import lombok.SneakyThrows;
@Service
public class FileSystemBackedStorageService implements StorageService {
private final Map<String, File> dataMap = new HashMap<>();

View File

@ -1,4 +1,4 @@
package com.knecon.fforesight.service.layoutparser.server;
package com.knecon.fforesight.service.layoutparser.server.utils;
import java.nio.charset.StandardCharsets;
import java.util.HashSet;

View File

@ -1,15 +1,18 @@
package com.knecon.fforesight.service.layoutparser.server;
package com.knecon.fforesight.service.layoutparser.server.utils;
import java.util.Arrays;
import java.util.List;
import java.util.Objects;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.internal.api.services.EntityEnrichmentService;
import lombok.RequiredArgsConstructor;
@Service
@RequiredArgsConstructor
public class TestEntityEnrichmentService implements EntityEnrichmentService {

View File

@ -0,0 +1,42 @@
package com.knecon.fforesight.service.layoutparser.server.utils;
import java.util.Collections;
import java.util.List;
import org.springframework.stereotype.Service;
import com.iqser.red.service.persistence.service.v1.api.shared.model.common.JSONPrimitive;
import com.iqser.red.service.persistence.service.v1.api.shared.model.multitenancy.TenantRequest;
import com.iqser.red.service.persistence.service.v1.api.shared.model.multitenancy.TenantResponse;
import com.knecon.fforesight.service.layoutparser.processor.multitenancy.TenantsClient;
/**
 * Stub {@link TenantsClient} for tests: creating a tenant does nothing, the tenant
 * list is always empty, and the single-tenant lookups return {@code null}.
 */
@Service
public class TestTenantsClient implements TenantsClient {

    /**
     * Does nothing with the request.
     */
    @Override
    public void createTenant(TenantRequest tenantRequest) {
        // intentionally empty
    }


    /**
     * @return always an empty list
     */
    @Override
    public List<TenantResponse> getTenants() {

        return Collections.emptyList();
    }


    /**
     * @return always {@code null}, whatever tenant id is passed
     */
    @Override
    public TenantResponse getTenant(String tenantId) {

        return null;
    }


    /**
     * @return always {@code null}, whatever tenant id is passed
     */
    @Override
    public JSONPrimitive<String> getDeploymentKey(String tenantId) {

        return null;
    }

}

View File

@ -1,4 +1,4 @@
package com.knecon.fforesight.service.layoutparser.server.visualizations;
package com.knecon.fforesight.service.layoutparser.server.utils.visualizations;
import java.awt.Color;
import java.awt.geom.Point2D;
@ -36,7 +36,7 @@ public class PdfDraw {
/**
 * Draws every table-of-contents entry of the given graph onto the PDF document.
 *
 * Entries are taken from {@code streamAllEntriesInOrder()} only; also iterating
 * {@code streamEntriesInOrder()} drew entries a second time through the older accessor.
 *
 * @param document      the PDF to draw on
 * @param documentGraph the graph whose table-of-contents entries are drawn
 */
public static void drawDocumentGraph(PDDocument document, DocumentGraph documentGraph) {

    documentGraph.getTableOfContents().streamAllEntriesInOrder().forEach(entry -> drawNode(document, entry));
}
@ -72,8 +72,11 @@ public class PdfDraw {
contentStream.setLineWidth(options.getStrokeWidth());
contentStream.beginText();
contentStream.setTextMatrix(Matrix.getRotateInstance(Math.toRadians(30), 0, 0));
contentStream.newLineAtOffset((float) location.getX(), (float) location.getY());
if (rotate) {
contentStream.setTextMatrix(Matrix.getRotateInstance(Math.toRadians(15), (float) location.getX(), (float) location.getY()));
} else {
contentStream.newLineAtOffset((float) location.getX(), (float) location.getY());
}
contentStream.setFont(new PDType1Font(Standard14Fonts.FontName.HELVETICA), 10);
contentStream.showText(string);
contentStream.endText();
@ -136,6 +139,7 @@ public class PdfDraw {
private static Options buildStandardOptionsForNodes(TableOfContents.Entry entry) {
return Options.builder().stroke(true).strokeColor(switch (entry.type()) {
case DOCUMENT -> Color.LIGHT_GRAY;
case HEADER, FOOTER -> Color.GREEN;
case PARAGRAPH -> Color.BLUE;
case HEADLINE -> Color.RED;