diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..549e00a --- /dev/null +++ b/.gitignore @@ -0,0 +1,33 @@ +HELP.md +target/ +!.mvn/wrapper/maven-wrapper.jar +!**/src/main/**/target/ +!**/src/test/**/target/ + +### STS ### +.apt_generated +.classpath +.factorypath +.project +.settings +.springBeans +.sts4-cache + +### IntelliJ IDEA ### +.idea +*.iws +*.iml +*.ipr + +### NetBeans ### +/nbproject/private/ +/nbbuild/ +/dist/ +/nbdist/ +/.nb-gradle/ +build/ +!**/src/main/**/build/ +!**/src/test/**/build/ + +### VS Code ### +.vscode/ diff --git a/.mvn/wrapper/maven-wrapper.jar b/.mvn/wrapper/maven-wrapper.jar new file mode 100644 index 0000000..bf82ff0 Binary files /dev/null and b/.mvn/wrapper/maven-wrapper.jar differ diff --git a/.mvn/wrapper/maven-wrapper.properties b/.mvn/wrapper/maven-wrapper.properties new file mode 100644 index 0000000..ca5ab4b --- /dev/null +++ b/.mvn/wrapper/maven-wrapper.properties @@ -0,0 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+distributionUrl=https://repo.maven.apache.org/maven2/org/apache/maven/apache-maven/3.8.7/apache-maven-3.8.7-bin.zip
+wrapperUrl=https://repo.maven.apache.org/maven2/org/apache/maven/wrapper/maven-wrapper/3.1.1/maven-wrapper-3.1.1.jar
diff --git a/layoutparser-service-image/pom.xml b/layoutparser-service-image/pom.xml
new file mode 100644
index 0000000..6107cf7
--- /dev/null
+++ b/layoutparser-service-image/pom.xml
@@ -0,0 +1,17 @@
+
+
+ 4.0.0
+
+
+
+
+ com.knecon.fforesight
+ layoutparser
+ 1.0.0
+
+
+ layoutparser-service-image
+ 1.0.0
+
+
diff --git a/layoutparser-service/layoutparser-service-internal-api/pom.xml b/layoutparser-service/layoutparser-service-internal-api/pom.xml
new file mode 100755
index 0000000..9b61b69
--- /dev/null
+++ b/layoutparser-service/layoutparser-service-internal-api/pom.xml
@@ -0,0 +1,49 @@
+
+
+ 4.0.0
+
+
+
+ com.knecon.fforesight
+ layoutparser-service
+ 1.0.0
+
+
+ layoutparser-service-internal-api
+ 1.0.0
+
+ pom
+
+
+
+ org.projectlombok
+ lombok
+ 1.18.26
+
+
+ com.google.guava
+ guava
+ 31.1-jre
+
+
+
+
+
+
+ org.springframework.boot
+ spring-boot-maven-plugin
+
+
+
+ org.projectlombok
+ lombok
+
+
+
+
+
+
+
+
diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com.knecon.fforesight.service.layoutparser.internal.api/data/AtomicPositionBlockData.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com.knecon.fforesight.service.layoutparser.internal.api/data/AtomicPositionBlockData.java
new file mode 100644
index 0000000..f61d380
--- /dev/null
+++ b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com.knecon.fforesight.service.layoutparser.internal.api/data/AtomicPositionBlockData.java
@@ -0,0 +1,19 @@
+package com.knecon.fforesight.service.layoutparser.internal.api.data;
+
+import lombok.AccessLevel;
+import lombok.AllArgsConstructor;
+import lombok.Builder;
+import lombok.Data;
+import lombok.experimental.FieldDefaults;
+/** Immutable Lombok value class (private final fields, builder, all-args constructor) carrying an id and two position arrays. */
+@Data
+@Builder
+@AllArgsConstructor
+@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
+public class AtomicPositionBlockData {
+
+    Long id; // presumably the id shared with the matching AtomicTextBlockData — TODO confirm
+    int[] stringIdxToPositionIdx; // by name: maps a string index to an index into positions — TODO confirm
+    float[][] positions; // presumably one float[] per position; the coordinate layout is not visible here
+
+}
diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com.knecon.fforesight.service.layoutparser.internal.api/data/AtomicTextBlockData.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com.knecon.fforesight.service.layoutparser.internal.api/data/AtomicTextBlockData.java
new file mode 100644
index 0000000..80910e8
--- /dev/null
+++ b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com.knecon.fforesight.service.layoutparser.internal.api/data/AtomicTextBlockData.java
@@ -0,0 +1,23 @@
+package com.knecon.fforesight.service.layoutparser.internal.api.data;
+
+import lombok.AccessLevel;
+import lombok.AllArgsConstructor;
+import lombok.Builder;
+import lombok.Data;
+import lombok.experimental.FieldDefaults;
+/** Immutable Lombok value class (private final fields, builder, all-args constructor) describing one atomic text block. */
+@Data
+@Builder
+@AllArgsConstructor
+@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
+public class AtomicTextBlockData {
+
+    Long id;
+    Long page; // NOTE(review): unclear from here whether this is a page number or a page id — confirm
+    String searchText;
+    int numberOnPage;
+    int start; // NOTE(review): start/end look like an index range for searchText — confirm base and inclusivity
+    int end;
+    int[] lineBreaks; // presumably offsets of line breaks within searchText — TODO confirm
+
+}
diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com.knecon.fforesight.service.layoutparser.internal.api/data/DocumentData.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com.knecon.fforesight.service.layoutparser.internal.api/data/DocumentData.java
new file mode 100644
index 0000000..166500c
--- /dev/null
+++ b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com.knecon.fforesight.service.layoutparser.internal.api/data/DocumentData.java
@@ -0,0 +1,20 @@
+package com.knecon.fforesight.service.layoutparser.internal.api.data;
+
+import lombok.AccessLevel;
+import lombok.AllArgsConstructor;
+import lombok.Builder;
+import lombok.Data;
+import
lombok.experimental.FieldDefaults;
+/** Immutable Lombok value class bundling a document's pages, atomic text blocks, atomic position blocks and table of contents. */
+@Data
+@Builder
+@AllArgsConstructor
+@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
+public class DocumentData {
+
+    PageData[] pages;
+    AtomicTextBlockData[] atomicTextBlocks;
+    AtomicPositionBlockData[] atomicPositionBlocks; // presumably paired with atomicTextBlocks through their ids — TODO confirm
+    TableOfContentsData tableOfContents;
+
+}
diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com.knecon.fforesight.service.layoutparser.internal.api/data/PageData.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com.knecon.fforesight.service.layoutparser.internal.api/data/PageData.java
new file mode 100644
index 0000000..20c92a3
--- /dev/null
+++ b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com.knecon.fforesight.service.layoutparser.internal.api/data/PageData.java
@@ -0,0 +1,20 @@
+package com.knecon.fforesight.service.layoutparser.internal.api.data;
+
+import lombok.AccessLevel;
+import lombok.AllArgsConstructor;
+import lombok.Builder;
+import lombok.Data;
+import lombok.experimental.FieldDefaults;
+/** Immutable Lombok value class holding a page's number, height, width and rotation. */
+@Data
+@Builder
+@AllArgsConstructor
+@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
+public class PageData {
+
+    int number;
+    int height; // NOTE(review): unit of height/width is not visible here — confirm
+    int width;
+    int rotation; // presumably in degrees — TODO confirm
+
+}
diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com.knecon.fforesight.service.layoutparser.internal.api/data/TableOfContentsData.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com.knecon.fforesight.service.layoutparser.internal.api/data/TableOfContentsData.java
new file mode 100644
index 0000000..3172b70
--- /dev/null
+++ b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com.knecon.fforesight.service.layoutparser.internal.api/data/TableOfContentsData.java
@@ -0,0 +1,90 @@
+package com.knecon.fforesight.service.layoutparser.internal.api.data;
+
+import java.util.Arrays;
+import java.util.List;
+import java.util.Map;
+import
java.util.stream.Stream;
+
+import javax.management.openmbean.InvalidKeyException;
+
+import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.NodeType;
+
+import lombok.AccessLevel;
+import lombok.AllArgsConstructor;
+import lombok.Builder;
+import lombok.Data;
+import lombok.NoArgsConstructor;
+import lombok.experimental.FieldDefaults;
+/** Data form of a table of contents: a tree of {@link EntryData} whose nodes are addressed by index paths. */
+@Data
+@Builder
+@AllArgsConstructor
+@NoArgsConstructor
+@FieldDefaults(level = AccessLevel.PRIVATE)
+public class TableOfContentsData {
+
+    List<EntryData> entries;
+    /** Returns the entry addressed by {@code tocId}: the first index selects a top-level entry, every further index a
+     *  sub-entry. An empty id raises InvalidKeyException; an out-of-range index raises IndexOutOfBoundsException. */
+    public EntryData get(List<Integer> tocId) {
+
+        if (tocId.isEmpty()) {
+            throw new InvalidKeyException(String.format("ClassificationSection Identifier: \"%s\" is not valid.", tocId));
+        }
+        EntryData entry = entries.get(tocId.get(0));
+        for (int id : tocId.subList(1, tocId.size())) {
+            entry = entry.subEntries().get(id);
+        }
+        return entry;
+    }
+
+    /** Streams all entries depth-first, each entry before its sub-entries. */
+    public Stream<EntryData> streamAllEntries() {
+
+        return entries.stream().flatMap(TableOfContentsData::flatten);
+    }
+
+    /** Parses a dot-separated id string such as {@code "1.0.2"} into its integer components. */
+    private static List<Integer> getIds(String idsAsString) {
+
+        return Arrays.stream(idsAsString.split("\\.")).map(Integer::valueOf).toList();
+    }
+    /** Returns one line per entry, in the order of {@link #streamAllEntries()}. */
+    @Override
+    public String toString() {
+
+        return String.join("\n", streamAllEntries().map(EntryData::toString).toList());
+    }
+
+    /** Returns a stream of {@code entry} followed, recursively, by all of its sub-entries. */
+    private static Stream<EntryData> flatten(EntryData entry) {
+
+        return Stream.concat(Stream.of(entry), entry.subEntries().stream().flatMap(TableOfContentsData::flatten));
+    }
+    // NOTE(review): the type arguments of properties are not visible in this patch, so it stays a raw Map — confirm and parameterize.
+    /** One tree node: its type, index path ({@code tocId}), sub-entries and, by name, the ids of its atomic blocks and pages. */
+    @Builder
+    public record EntryData(NodeType type, int[] tocId, Long[] atomicBlocks, Long[] pages, Map properties, List<EntryData> subEntries) {
+        /** Renders as {@code [i,j,...]: TYPE atbs = n}; an empty tocId renders as {@code []: TYPE atbs = n}. */
+        @Override
+        public String toString() {
+
+            // Separator-before-element (instead of append-then-delete) keeps the opening bracket when tocId is empty.
+            StringBuilder sb = new StringBuilder();
+            sb.append("[");
+            for (int i = 0; i < tocId.length; i++) {
+                if (i > 0) {
+                    sb.append(",");
+                }
+                sb.append(tocId[i]);
+            }
+            sb.append("]: ");
+            sb.append(type);
+            sb.append(" atbs = ");
+            sb.append(atomicBlocks.length);
+            return sb.toString();
+        }
+
+    }
+
+}
diff --git
a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com.knecon.fforesight.service.layoutparser.internal.api/graph/Boundary.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com.knecon.fforesight.service.layoutparser.internal.api/graph/Boundary.java
new file mode 100644
index 0000000..ece48e5
--- /dev/null
+++ b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com.knecon.fforesight.service.layoutparser.internal.api/graph/Boundary.java
@@ -0,0 +1,148 @@
+package com.knecon.fforesight.service.layoutparser.internal.api.graph;
+
+import java.util.LinkedList;
+import java.util.List;
+
+import lombok.Setter;
+/** Half-open index range {@code [start, end)} with containment, intersection, split and merge helpers. */
+@Setter
+public class Boundary implements Comparable<Boundary> {
+
+    private int start;
+    private int end;
+
+    /** Creates the range {@code [start, end)}; throws IllegalArgumentException when {@code start > end}. */
+    public Boundary(int start, int end) {
+
+        if (start > end) {
+            throw new IllegalArgumentException(String.format("start: %d > end: %d", start, end));
+        }
+        this.start = start;
+        this.end = end;
+    }
+
+    /** Returns the number of indices covered, i.e. {@code end - start}. */
+    public int length() {
+
+        return end - start;
+    }
+
+    /** Returns the inclusive lower bound. */
+    public int start() {
+
+        return start;
+    }
+
+    /** Returns the exclusive upper bound. */
+    public int end() {
+
+        return end;
+    }
+
+    /** Returns true when {@code boundary} lies completely inside this range (shared end points allowed). */
+    public boolean contains(Boundary boundary) {
+
+        return start <= boundary.start() && boundary.end() <= end;
+    }
+
+    /** Returns true when this range lies completely inside {@code boundary}. */
+    public boolean containedBy(Boundary boundary) {
+
+        return boundary.contains(this);
+    }
+
+    /** Range-argument form of {@link #contains(Boundary)}; throws IllegalArgumentException when {@code start > end}. */
+    public boolean contains(int start, int end) {
+
+        if (start > end) {
+            throw new IllegalArgumentException(String.format("start: %d > end: %d", start, end));
+        }
+        return this.start <= start && end <= this.end;
+    }
+
+    /** Range-argument form of {@link #containedBy(Boundary)}; throws IllegalArgumentException when {@code start > end}. */
+    public boolean containedBy(int start, int end) {
+
+        if (start > end) {
+            throw new IllegalArgumentException(String.format("start: %d > end: %d", start, end));
+        }
+        return start <= this.start && this.end <= end;
+    }
+
+    /** Returns true when {@code index} lies within {@code [start, end)}. */
+    public boolean contains(int index) {
+
+        return start <= index && index < end;
+    }
+    /** Returns true when the two ranges overlap. Both ranges' end points are tested against the other range, so a range
+     *  that completely encloses this one is reported too (the one-sided check missed that case and was asymmetric). */
+    public boolean intersects(Boundary boundary) {
+
+        return contains(boundary.start()) || contains(boundary.end() - 1) || boundary.contains(start) || boundary.contains(end - 1);
+    }
+    /** Splits this range at the given indices, which are expected in ascending order; an index equal to the previous
+     *  cut is skipped. Throws IndexOutOfBoundsException when an index lies outside {@code [start, end)}. */
+    public List<Boundary> split(List<Integer> splitIndices) {
+
+        if (splitIndices.stream().anyMatch(idx -> !this.contains(idx))) {
+            throw new IndexOutOfBoundsException(String.format("%s splitting indices are out of range for %s", splitIndices.stream().filter(idx -> !this.contains(idx)).toList(), this));
+        }
+        List<Boundary> splitBoundaries = new LinkedList<>();
+        int previousIndex = start;
+        for (int splitIndex : splitIndices) {
+
+            // skip split if it would produce a boundary of length 0
+            if (splitIndex == previousIndex) {
+                continue;
+            }
+            splitBoundaries.add(new Boundary(previousIndex, splitIndex));
+            previousIndex = splitIndex;
+        }
+        splitBoundaries.add(new Boundary(previousIndex, end));
+        return splitBoundaries;
+    }
+
+    /** Returns the smallest range covering all given ranges; throws IllegalArgumentException for an empty list. */
+    public static Boundary merge(List<Boundary> boundaries) {
+
+        int minStart = boundaries.stream().mapToInt(Boundary::start).min().orElseThrow(IllegalArgumentException::new);
+        int maxEnd = boundaries.stream().mapToInt(Boundary::end).max().orElseThrow(IllegalArgumentException::new);
+        return new Boundary(minStart, maxEnd);
+    }
+
+    /** Renders as {@code Boundary [start|end)}. */
+    @Override
+    public String toString() {
+
+        return String.format("Boundary [%d|%d)", start, end);
+    }
+
+    /** Orders ranges by start, then by end, so that a result of 0 coincides with {@link #equals(Object)}. */
+    @Override
+    public int compareTo(Boundary boundary) {
+
+        // Total order (start, then end): keeps every strict ordering the former partial comparison produced, but no longer
+        // returns 0 for nested or overlapping ranges, which violated the Comparable contract and disagreed with equals.
+        int byStart = Integer.compare(start, boundary.start());
+        if (byStart != 0) {
+            return byStart;
+        }
+
+        return Integer.compare(end, boundary.end());
+    }
+
+    /** Returns the hash of the textual form; left as it was so existing hash values do not change. */
+    @Override
+    public int hashCode() {
+
+        return toString().hashCode();
+    }
+
+    /** Returns true only for another Boundary with the same start and end; null and foreign types are never equal. */
+    @Override
+    public boolean equals(Object object) {
+
+        return object instanceof Boundary other && start == other.start && end == other.end;
+    }
+
+}
diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com.knecon.fforesight.service.layoutparser.internal.api/graph/DocumentGraph.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com.knecon.fforesight.service.layoutparser.internal.api/graph/DocumentGraph.java
new file mode 100644
index 0000000..72ff2d6
--- /dev/null
+++
b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com.knecon.fforesight.service.layoutparser.internal.api/graph/DocumentGraph.java
@@ -0,0 +1,98 @@
+package com.knecon.fforesight.service.layoutparser.internal.api.graph;
+
+import java.awt.geom.Rectangle2D;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
+import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
+import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode;
+import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SectionNode;
+import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SemanticNode;
+import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
+import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlockCollector;
+
+import lombok.AccessLevel;
+import lombok.AllArgsConstructor;
+import lombok.Builder;
+import lombok.Data;
+import lombok.experimental.FieldDefaults;
+/** Root {@link SemanticNode} of a document; holds the page set, the table of contents, a page count and a text block. */
+@Data
+@Builder
+@AllArgsConstructor
+@FieldDefaults(level = AccessLevel.PRIVATE)
+public class DocumentGraph implements SemanticNode {
+    // NOTE(review): generic type arguments are missing throughout this patch text (e.g. "Set pages") — presumably stripped during extraction; confirm against the repository.
+    Set pages;
+    TableOfContents tableOfContents;
+    Integer numberOfPages;
+    TextBlock textBlock;
+
+    /** Collects the terminal text blocks of all terminal nodes, in table-of-contents order, into a single TextBlock. */
+    public TextBlock buildTextBlock() {
+
+        return streamTerminalTextBlocksInOrder().collect(new TextBlockCollector());
+    }
+
+    /** Returns the nodes of those top-level table-of-contents entries that are SectionNodes. */
+    public List getMainSections() {
+
+        return tableOfContents.entries.stream().filter(entry -> entry.node() instanceof SectionNode).map(entry -> (SectionNode) entry.node()).collect(Collectors.toList());
+    }
+
+    /** Streams the terminal text block of every terminal node, in table-of-contents order. */
+    public Stream streamTerminalTextBlocksInOrder() {
+
+        return streamAllNodes().filter(SemanticNode::isTerminal).map(SemanticNode::getTerminalTextBlock);
+    }
+
+    /** Returns an unmodifiable set with the entities of all nodes listed in the table of contents. */
+    public Set getEntities() {
+
+        return streamAllNodes().map(SemanticNode::getEntities).flatMap(Set::stream).collect(Collectors.toUnmodifiableSet());
+    }
+
+    /** Returns the empty id: the document graph is the root and has no index path of its own. */
+    @Override
+    public List getTocId() {
+
+        return Collections.emptyList();
+    }
+
+    /** Unsupported for the root node; always throws UnsupportedOperationException. */
+    @Override
+    public void setTocId(List tocId) {
+
+        throw new UnsupportedOperationException("DocumentGraph is always the root of the Table of Contents");
+    }
+
+    /** Streams the node of every table-of-contents entry, in the order of streamEntriesInOrder(). */
+    private Stream streamAllNodes() {
+
+        return tableOfContents.streamEntriesInOrder().map(TableOfContents.Entry::node);
+    }
+
+    /** Delegates to the string form of the table of contents. */
+    @Override
+    public String toString() {
+
+        return tableOfContents.toString();
+    }
+
+    /** Returns a new map from every page to a rectangle at (0, 0) spanning that page's width and height. */
+    @Override
+    public Map getBBox() {
+
+        Map bBox = new HashMap<>();
+        for (PageNode page : pages) {
+            bBox.put(page, new Rectangle2D.Double(0, 0, page.getWidth(), page.getHeight()));
+        }
+        return bBox;
+    }
+
+}
diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com.knecon.fforesight.service.layoutparser.internal.api/graph/TableOfContents.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com.knecon.fforesight.service.layoutparser.internal.api/graph/TableOfContents.java
new file mode 100644
index 0000000..b0fa0f0
--- /dev/null
+++ b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com.knecon.fforesight.service.layoutparser.internal.api/graph/TableOfContents.java
@@ -0,0 +1,164 @@
+package com.knecon.fforesight.service.layoutparser.internal.api.graph;
+
+import java.nio.charset.StandardCharsets;
+import java.util.Collections;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.NoSuchElementException;
+import java.util.stream.Stream;
+
+import com.google.common.hash.Hashing;
+import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.NodeType;
+import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SemanticNode;
+import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
+import
com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlockCollector;
+
+import lombok.Builder;
+import lombok.Data;
+/** Mutable tree of {@link Entry} records; every entry is addressed by its index path ("tocId") from the root list. */
+@Data
+public class TableOfContents {
+
+    List<Entry> entries;
+
+    /** Creates an empty table of contents. */
+    public TableOfContents() {
+
+        entries = new LinkedList<>();
+    }
+
+    /** Collects the terminal text blocks of all terminal nodes, each entry before its children, into one TextBlock. */
+    public TextBlock buildTextBlock() {
+
+        return streamEntriesInOrder().map(Entry::node).filter(SemanticNode::isTerminal).map(SemanticNode::getTerminalTextBlock).collect(new TextBlockCollector());
+    }
+
+    /** Appends a new top-level entry for {@code node} and returns its id. */
+    public List<Integer> createNewEntryAndReturnId(NodeType nodeType, SemanticNode node) {
+
+        return createNewChildEntryAndReturnId(Collections.emptyList(), nodeType, node);
+    }
+    /** Appends a new entry below {@code parentId} and returns its id (the parent id plus the new child index). When no
+     *  entry with that id exists, including for an empty id, the entry is appended at the top level instead. */
+    public List<Integer> createNewChildEntryAndReturnId(List<Integer> parentId, NodeType nodeType, SemanticNode node) {
+
+        List<Integer> newId;
+        if (entryExists(parentId)) {
+            Entry parent = getEntryById(parentId);
+            newId = new LinkedList<>(parentId);
+            newId.add(parent.children().size());
+            parent.children().add(Entry.builder().tocId(newId).node(node).type(nodeType).children(new LinkedList<>()).build());
+        } else {
+            newId = List.of(entries.size());
+            entries.add(Entry.builder().tocId(newId).node(node).type(nodeType).children(new LinkedList<>()).build());
+        }
+
+        return newId;
+    }
+
+    /** Returns true when every index of {@code tocId} is in range at its level; an empty id never exists. */
+    private boolean entryExists(List<Integer> tocId) {
+        if (tocId.isEmpty()) {
+            return false;
+        }
+        // Walk level by level so the first index is range-checked exactly like the deeper ones.
+        List<Entry> level = entries;
+        for (int id : tocId) {
+            if (id < 0 || id >= level.size()) {
+                return false;
+            }
+            level = level.get(id).children();
+        }
+        return true;
+    }
+
+    /** Returns the parent entry; throws NoSuchElementException when {@code tocId} is empty or addresses a top-level entry. */
+    public Entry getParentEntryById(List<Integer> tocId) {
+
+        List<Integer> parentIds = getParentId(tocId);
+        if (parentIds.isEmpty()) {
+            throw new NoSuchElementException(String.format("Node with tocId \"%s\" has no parent!", tocId));
+        }
+        return getEntryById(parentIds);
+    }
+
+    /** Returns true when the parent id of {@code tocId} addresses an existing entry; false for top-level and empty ids. */
+    public boolean hasParentById(List<Integer> tocId) {
+
+        List<Integer> parentId = getParentId(tocId);
+        return entryExists(parentId);
+    }
+
+    /** Streams the nodes of the direct children of the entry addressed by {@code tocId}. */
+    public Stream<SemanticNode> streamChildren(List<Integer> tocId) {
+
+        return getEntryById(tocId).children().stream().map(Entry::node);
+    }
+
+    /** Returns {@code tocId} without its last index; the result is empty for top-level and empty ids. */
+    private static List<Integer> getParentId(List<Integer> tocId) {
+        // An empty id (e.g. the document root) has no parent; guard so subList never receives a negative end index.
+        return tocId.isEmpty() ? Collections.emptyList() : tocId.subList(0, tocId.size() - 1);
+    }
+
+    /** Resolves {@code tocId} to its entry; an empty or out-of-range id raises IndexOutOfBoundsException. */
+    public Entry getEntryById(List<Integer> tocId) {
+
+        Entry entry = entries.get(tocId.get(0));
+        for (int id : tocId.subList(1, tocId.size())) {
+            entry = entry.children().get(id);
+        }
+        return entry;
+    }
+
+    /** Streams all entries depth-first, each entry before its children. */
+    public Stream<Entry> streamEntriesInOrder() {
+
+        return entries.stream().flatMap(TableOfContents::flatten);
+    }
+
+    /** Streams the entry addressed by {@code parentId} followed by all of its descendants, depth-first. */
+    public Stream<Entry> streamSubEntriesInOrder(List<Integer> parentId) {
+
+        return Stream.of(getEntryById(parentId)).flatMap(TableOfContents::flatten);
+    }
+
+    /** Returns one line per entry, in the order of {@link #streamEntriesInOrder()}. */
+    @Override
+    public String toString() {
+
+        return String.join("\n", streamEntriesInOrder().map(Entry::toString).toList());
+    }
+
+    /** Returns one line per entry of the subtree rooted at {@code id}. */
+    public String toString(List<Integer> id) {
+
+        return String.join("\n", streamSubEntriesInOrder(id).map(Entry::toString).toList());
+    }
+
+    /** Returns a stream of {@code entry} followed, recursively, by all of its children. */
+    private static Stream<Entry> flatten(Entry entry) {
+
+        return Stream.concat(Stream.of(entry), entry.children().stream().flatMap(TableOfContents::flatten));
+    }
+
+    /** One tree node: its id path, node type, the semantic node itself and its ordered child entries. */
+    @Builder
+    public record Entry(List<Integer> tocId, NodeType type, SemanticNode node, List<Entry> children) {
+        /** Delegates to the string form of the wrapped node. */
+        @Override
+        public String toString() {
+
+            return node().toString();
+        }
+
+        /** Returns the Murmur3 (32-bit) hash of {@link #toString()}, i.e. of the node's string form. */
+        @Override
+        public int hashCode() {
+
+            return Hashing.murmur3_32_fixed().hashString(toString(), StandardCharsets.UTF_8).hashCode();
+        }
+
+    }
+
+}
diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com.knecon.fforesight.service.layoutparser.internal.api/graph/entity/EntityNode.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com.knecon.fforesight.service.layoutparser.internal.api/graph/entity/EntityNode.java
new file mode 100644
index 0000000..226d356
--- /dev/null
+++ b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com.knecon.fforesight.service.layoutparser.internal.api/graph/entity/EntityNode.java
@@ -0,0 +1,76 @@
+package
com.knecon.fforesight.service.layoutparser.internal.api.graph.entity;
+
+import java.util.Collections;
+import java.util.List;
+import java.util.Set;
+
+import com.knecon.fforesight.service.layoutparser.internal.api.graph.Boundary;
+import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode;
+import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SemanticNode;
+/** An entity found in the document text: a text value with its Boundary plus links to the nodes and pages it touches. */
+public interface EntityNode {
+
+    /**
+     * Returns the text contained within the boundary of this entity.
+     *
+     * @return the entity's text
+     */
+    String getValue();
+
+
+    /**
+     * Returns the boundary that primarily defines this entity; all other values may be inferred from it.
+     *
+     * @return the boundary uniquely identifying this entity
+     */
+    Boundary getBoundary();
+
+
+    /**
+     * Returns the deepest node in the document tree structure
+     * whose boundary fully contains the boundary of this entity.
+     *
+     * @return the deepest fully containing node
+     */
+    SemanticNode getDeepestFullyContainingNode();
+
+
+    /**
+     * Returns all nodes whose boundary intersects the boundary of this entity.
+     *
+     * @return all intersecting nodes
+     */
+    List getIntersectingNodes();
+
+    /** Sets the deepest fully containing node; {@link #removeFromGraph()} passes null to clear it. */
+    void setDeepestFullyContainingNode(SemanticNode semanticNode);
+
+    /** Adds one node to the intersecting nodes. */
+    void addIntersectingNode(SemanticNode semanticNode);
+
+    /** Replaces the intersecting nodes; {@link #removeFromGraph()} passes an immutable empty list. */
+    void setIntersectingNodes(List semanticNodes);
+
+
+    /**
+     * @return all pages this entity intersects
+     */
+    Set getPages();
+
+    /** Replaces the pages; {@link #removeFromGraph()} passes an immutable empty set. */
+    void setPages(Set pages);
+
+
+    /** Removes this entity from the entity sets of its intersecting nodes and pages, then resets pages, deepest node and intersecting nodes.
+     * NOTE(review): the reset installs immutable empty collections — confirm that implementations never add to them in place afterwards.
+     */
+    default void removeFromGraph() {
+
+        getIntersectingNodes().forEach(node -> node.getEntities().remove(this));
+        getPages().forEach(page -> page.getEntities().remove(this));
+        setPages(Collections.emptySet());
+        setDeepestFullyContainingNode(null);
+        setIntersectingNodes(Collections.emptyList());
+    }
+
+}
diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com.knecon.fforesight.service.layoutparser.internal.api/graph/entity/EntityPosition.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com.knecon.fforesight.service.layoutparser.internal.api/graph/entity/EntityPosition.java
new file mode 100644
index 0000000..90c1405
--- /dev/null
+++ b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com.knecon.fforesight.service.layoutparser.internal.api/graph/entity/EntityPosition.java
@@ -0,0 +1,39 @@
+package com.knecon.fforesight.service.layoutparser.internal.api.graph.entity;
+
+import java.awt.geom.Rectangle2D;
+import java.nio.charset.StandardCharsets;
+import java.util.List;
+
+import com.google.common.hash.Hashing;
+import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode;
+
+import lombok.AccessLevel;
+import lombok.Builder;
+import lombok.Data;
+import lombok.experimental.FieldDefaults;
+/** Position of an entity on one page, given as a list of rectangles (by field name: one rectangle per text line). */
+@Data
+@Builder
+@FieldDefaults(level = AccessLevel.PRIVATE)
+public class EntityPosition {
+
+    PageNode pageNode;
+    List rectanglePerLine;
+
+    /** Returns the decimal string form of {@link #hashCode()}. */
+    public String getId() {
+
+        return String.valueOf(hashCode());
+    }
+    // NOTE(review): the numbers are concatenated without separators and folded to 32 bits, so ids derived from this hash may collide — confirm that is acceptable.
+    /** Hashes the page number followed by x, y, width and height of every rectangle with Murmur3-128, folded to an int. */
+    @Override
+    public int hashCode() {
+
+        StringBuilder sb = new StringBuilder();
+        sb.append(pageNode.getNumber());
+        rectanglePerLine.forEach(r -> sb.append(r.getX()).append(r.getY()).append(r.getWidth()).append(r.getHeight()));
+        return Hashing.murmur3_128().hashString(sb.toString(), StandardCharsets.UTF_8).hashCode();
+    }
+
+}
diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com.knecon.fforesight.service.layoutparser.internal.api/graph/nodes/FooterNode.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com.knecon.fforesight.service.layoutparser.internal.api/graph/nodes/FooterNode.java
new file mode 100644
index 0000000..a497a0e
--- /dev/null
+++ b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com.knecon.fforesight.service.layoutparser.internal.api/graph/nodes/FooterNode.java
@@ -0,0 +1,53 @@
+package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;
+
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
+import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
+import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
+
+import lombok.AccessLevel;
+import lombok.AllArgsConstructor;
+import lombok.Builder;
+import lombok.Data;
+import lombok.EqualsAndHashCode;
+import lombok.NoArgsConstructor;
+import lombok.experimental.FieldDefaults;
+/** Terminal {@link SemanticNode} for a footer; its text block is the single terminal text block it carries. */
+@Data
+@Builder
+@AllArgsConstructor
+@NoArgsConstructor
+@FieldDefaults(level = AccessLevel.PRIVATE)
+public class FooterNode implements SemanticNode {
+
+    List tocId;
+    TextBlock terminalTextBlock;
+
+    @Builder.Default
+    boolean terminal = true; // defaults to true through both the builder and the field initializer
+
+    @EqualsAndHashCode.Exclude
+    TableOfContents tableOfContents; // excluded from the Lombok-generated equals/hashCode
+
+    @Builder.Default
+    @EqualsAndHashCode.Exclude
+    Set entities = new HashSet<>(); // excluded from the Lombok-generated equals/hashCode
+
+    /** Returns the stored terminal text block unchanged. */
+    @Override
+    public TextBlock buildTextBlock() {
+
+        return terminalTextBlock;
+    }
+
+    /** Renders as {@code <tocId>: FOOTER: <summary>}; throws NullPointerException while terminalTextBlock is null. */
+    @Override
+    public String toString() {
+
+        return tocId + ": " + NodeType.FOOTER + ": " + terminalTextBlock.buildSummary();
+    }
+
+}
diff --git
a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com.knecon.fforesight.service.layoutparser.internal.api/graph/nodes/HeaderNode.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com.knecon.fforesight.service.layoutparser.internal.api/graph/nodes/HeaderNode.java
new file mode 100644
index 0000000..75794c5
--- /dev/null
+++ b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com.knecon.fforesight.service.layoutparser.internal.api/graph/nodes/HeaderNode.java
@@ -0,0 +1,53 @@
+package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;
+
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
+import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
+import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
+
+import lombok.AccessLevel;
+import lombok.AllArgsConstructor;
+import lombok.Builder;
+import lombok.Data;
+import lombok.EqualsAndHashCode;
+import lombok.NoArgsConstructor;
+import lombok.experimental.FieldDefaults;
+/** Terminal {@link SemanticNode} for a header; its text block is the single terminal text block it carries. */
+@Data
+@Builder
+@AllArgsConstructor
+@NoArgsConstructor
+@FieldDefaults(level = AccessLevel.PRIVATE)
+public class HeaderNode implements SemanticNode {
+
+    List tocId;
+    TextBlock terminalTextBlock;
+
+    @Builder.Default
+    boolean terminal = true; // defaults to true through both the builder and the field initializer
+
+    @EqualsAndHashCode.Exclude
+    TableOfContents tableOfContents; // excluded from the Lombok-generated equals/hashCode
+
+    @Builder.Default
+    @EqualsAndHashCode.Exclude
+    Set entities = new HashSet<>(); // excluded from the Lombok-generated equals/hashCode
+
+    /** Returns the stored terminal text block unchanged. */
+    @Override
+    public TextBlock buildTextBlock() {
+
+        return terminalTextBlock;
+    }
+
+    /** Renders as {@code <tocId>: HEADER: <summary>}; throws NullPointerException while terminalTextBlock is null. */
+    @Override
+    public String toString() {
+
+        return tocId + ": " + NodeType.HEADER + ": " + terminalTextBlock.buildSummary();
+    }
+
+}
diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com.knecon.fforesight.service.layoutparser.internal.api/graph/nodes/HeadlineNode.java
b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com.knecon.fforesight.service.layoutparser.internal.api/graph/nodes/HeadlineNode.java
new file mode 100644
index 0000000..9c88cac
--- /dev/null
+++ b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com.knecon.fforesight.service.layoutparser.internal.api/graph/nodes/HeadlineNode.java
@@ -0,0 +1,60 @@
+package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;
+
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
+import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
+import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
+
+import lombok.AccessLevel;
+import lombok.AllArgsConstructor;
+import lombok.Builder;
+import lombok.Data;
+import lombok.EqualsAndHashCode;
+import lombok.NoArgsConstructor;
+import lombok.experimental.FieldDefaults;
+/** Terminal {@link SemanticNode} for a headline; its text block is the single terminal text block it carries. */
+@Data
+@Builder
+@AllArgsConstructor
+@NoArgsConstructor
+@FieldDefaults(level = AccessLevel.PRIVATE)
+public class HeadlineNode implements SemanticNode {
+
+    List tocId;
+    TextBlock terminalTextBlock;
+
+    @Builder.Default
+    boolean terminal = true; // defaults to true through both the builder and the field initializer
+
+    @EqualsAndHashCode.Exclude
+    TableOfContents tableOfContents; // excluded from the Lombok-generated equals/hashCode
+
+    @Builder.Default
+    @EqualsAndHashCode.Exclude
+    Set entities = new HashSet<>(); // excluded from the Lombok-generated equals/hashCode
+
+    /** Returns the stored terminal text block unchanged. */
+    @Override
+    public TextBlock buildTextBlock() {
+
+        return terminalTextBlock;
+    }
+
+    /** Renders as {@code <tocId>: HEADLINE: <summary>}; throws NullPointerException while terminalTextBlock is null. */
+    @Override
+    public String toString() {
+
+        return tocId + ": " + NodeType.HEADLINE + ": " + terminalTextBlock.buildSummary();
+    }
+
+    /** A headline node is its own headline. */
+    @Override
+    public SemanticNode getHeadline() {
+
+        return this;
+    }
+
+}
diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com.knecon.fforesight.service.layoutparser.internal.api/graph/nodes/ImageNode.java
b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com.knecon.fforesight.service.layoutparser.internal.api/graph/nodes/ImageNode.java
new file mode 100644
index 0000000..be1f2fb
--- /dev/null
+++ b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com.knecon.fforesight.service.layoutparser.internal.api/graph/nodes/ImageNode.java
@@ -0,0 +1,88 @@
+package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;
+
+import java.awt.geom.Rectangle2D;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
+import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
+import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
+import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlockCollector;
+
+import lombok.AccessLevel;
+import lombok.AllArgsConstructor;
+import lombok.Builder;
+import lombok.Data;
+import lombok.EqualsAndHashCode;
+import lombok.NoArgsConstructor;
+import lombok.experimental.FieldDefaults;
+/** {@link SemanticNode} for an image placed on exactly one page, with its type, position and redaction-related flags. */
+@Data
+@Builder
+@AllArgsConstructor
+@NoArgsConstructor
+@FieldDefaults(level = AccessLevel.PRIVATE)
+public class ImageNode implements SemanticNode {
+
+    List tocId;
+
+    ImageType imageType;
+    boolean transparency;
+    Rectangle2D position; // returned by getBBox() as this image's rectangle on its page
+
+    @Builder.Default
+    boolean redaction = false;
+    @Builder.Default
+    boolean ignored = false;
+    @Builder.Default
+    String redactionReason = "";
+    @Builder.Default
+    String legalBasis = "";
+    @Builder.Default
+    int matchedRule = -1; // NOTE(review): -1 presumably means "no rule matched" — confirm
+
+    @EqualsAndHashCode.Exclude
+    PageNode page; // excluded from the Lombok-generated equals/hashCode
+
+    @EqualsAndHashCode.Exclude
+    TableOfContents tableOfContents; // excluded from the Lombok-generated equals/hashCode
+
+    @Builder.Default
+    @EqualsAndHashCode.Exclude
+    Set entities = new HashSet<>(); // excluded from the Lombok-generated equals/hashCode
+
+    /** Collects the terminal text blocks of the terminal nodes from streamAllSubNodes() (defined outside this file). */
+    @Override
+    public TextBlock buildTextBlock() {
+
+        return streamAllSubNodes().filter(SemanticNode::isTerminal).map(SemanticNode::getTerminalTextBlock).collect(new TextBlockCollector());
+    }
+
+    /** Returns an immutable single-element set holding this image's page. */
+    @Override
+    public Set getPages() {
+
+        return Collections.singleton(page);
+    }
+
+    /** Renders as {@code <tocId>: IMAGE: <imageType> <position>}; throws NullPointerException while imageType is null. */
+    @Override
+    public String toString() {
+
+        return tocId + ": " + NodeType.IMAGE + ": " + imageType.toString() + " " + position;
+    }
+
+    /** Returns a new mutable map with a single entry: this image's page mapped to its position. */
+    @Override
+    public Map getBBox() {
+
+        Map bBoxPerPage = new HashMap<>();
+        bBoxPerPage.put(page, position);
+        return bBoxPerPage;
+    }
+
+}
diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com.knecon.fforesight.service.layoutparser.internal.api/graph/nodes/ImageType.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com.knecon.fforesight.service.layoutparser.internal.api/graph/nodes/ImageType.java
new file mode 100644
index 0000000..7c8afc8
--- /dev/null
+++ b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com.knecon.fforesight.service.layoutparser.internal.api/graph/nodes/ImageType.java
@@ -0,0 +1,9 @@
+package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;
+/** Classification of an {@link ImageNode}. */
+public enum ImageType {
+    LOGO,
+    FORMULA,
+    SIGNATURE,
+    OTHER,
+    OCR
+}
diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com.knecon.fforesight.service.layoutparser.internal.api/graph/nodes/NodeType.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com.knecon.fforesight.service.layoutparser.internal.api/graph/nodes/NodeType.java
new file mode 100644
index 0000000..df2f63b
--- /dev/null
+++ b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com.knecon.fforesight.service.layoutparser.internal.api/graph/nodes/NodeType.java
@@ -0,0 +1,12 @@
+package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;
+/** Kinds of nodes that table-of-contents entries can carry. */
+public enum NodeType {
+    SECTION,
+    HEADLINE,
+    PARAGRAPH,
+    TABLE,
+    TABLE_CELL,
+    IMAGE,
+    HEADER,
+    FOOTER
+}
diff --git
// Patch target: layoutparser-service/layoutparser-service-internal-api/src/main/java/com.knecon.fforesight.service.layoutparser.internal.api/graph/nodes/PageNode.java
package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;

import java.util.HashSet;
import java.util.List;
import java.util.Objects;
import java.util.Set;

import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlockCollector;

import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.EqualsAndHashCode;
import lombok.Getter;
import lombok.Setter;
import lombok.experimental.FieldDefaults;

/**
 * A page of the document: its number and geometry plus the nodes, entities and images on it.
 * <p>
 * PageNodes are used as keys of hash maps and elements of hash sets (e.g. the per-page bounding
 * boxes of SemanticNode#getBBox). Equality is based on {@code number}, {@code height},
 * {@code width} and {@code rotation}, i.e. exactly the fields not marked with
 * {@code @EqualsAndHashCode.Exclude}; the hash code is derived from {@code number} alone.
 */
@Getter
@Setter
@Builder
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class PageNode {

    Integer number;
    Integer height;
    Integer width;
    Integer rotation;

    @EqualsAndHashCode.Exclude
    List<SemanticNode> mainBody;
    @EqualsAndHashCode.Exclude
    HeaderNode header;
    @EqualsAndHashCode.Exclude
    FooterNode footer;

    @Builder.Default
    @EqualsAndHashCode.Exclude
    Set<EntityNode> entities = new HashSet<>();

    @Builder.Default
    @EqualsAndHashCode.Exclude
    Set<ImageNode> images = new HashSet<>();


    /**
     * Concatenates the terminal text blocks of all terminal nodes in the main body, in list order.
     *
     * @return the text block of the main body of this page
     */
    public TextBlock getMainBodyTextBlock() {

        return mainBody.stream().filter(SemanticNode::isTerminal).map(SemanticNode::getTerminalTextBlock).collect(new TextBlockCollector());
    }


    /**
     * @return the page number as a String ({@code "null"} if unset)
     */
    @Override
    public String toString() {

        return String.valueOf(number);
    }


    /**
     * Two pages are equal when number, height, width and rotation are equal.
     * Written by hand because the class declares its own {@link #hashCode()}; keeps the
     * equals/hashCode pair consistent.
     */
    @Override
    public boolean equals(Object other) {

        if (this == other) {
            return true;
        }
        if (!(other instanceof PageNode that)) {
            return false;
        }
        return Objects.equals(number, that.number) //
                && Objects.equals(height, that.height) //
                && Objects.equals(width, that.width) //
                && Objects.equals(rotation, that.rotation);
    }


    /**
     * @return the page number, or 0 if the number is unset (previously this threw a NullPointerException)
     */
    @Override
    public int hashCode() {

        // Integer.hashCode() is the int value itself, so non-null numbers hash exactly as before
        return Objects.hashCode(number);
    }

}

// Patch target: layoutparser-service/layoutparser-service-internal-api/src/main/java/com.knecon.fforesight.service.layoutparser.internal.api/graph/nodes/ParagraphNode.java
package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;

import java.util.HashSet;
import java.util.List;
import java.util.Set;

import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;

import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults;

/**
 * A paragraph in the document graph.
 * <p>
 * A paragraph is terminal by default ({@code terminal = true}): its text is held directly in
 * {@link #terminalTextBlock}.
 */
@Data
@Builder
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class ParagraphNode implements SemanticNode {

    /** Id of this node in the TableOfContents. */
    List<Integer> tocId;
    /** The text of this paragraph. */
    TextBlock terminalTextBlock;

    @Builder.Default
    boolean terminal = true;

    @EqualsAndHashCode.Exclude
    TableOfContents tableOfContents;

    @Builder.Default
    @EqualsAndHashCode.Exclude
    Set<EntityNode> entities = new HashSet<>();


    /**
     * @return the terminal text block of this paragraph; nothing is collected from sub nodes
     */
    @Override
    public TextBlock buildTextBlock() {

        return terminalTextBlock;
    }


    /**
     * @return {@code "<tocId>: PARAGRAPH: <summary of the paragraph text>"}
     */
    @Override
    public String toString() {

        return tocId + ": " + NodeType.PARAGRAPH + ": " + terminalTextBlock.buildSummary();
    }

}
// Patch target: layoutparser-service/layoutparser-service-internal-api/src/main/java/com.knecon.fforesight.service.layoutparser.internal.api/graph/nodes/SectionNode.java
package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;

import java.util.HashSet;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.Set;

import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlockCollector;

import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults;

/**
 * A section of the document graph. Sections are not terminal: their text block is collected
 * lazily from all terminal sub nodes and cached in {@link #textBlock}.
 */
@Data
@Builder
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class SectionNode implements SemanticNode {

    /** Id of this node in the TableOfContents. */
    List<Integer> tocId;

    /** Cache for {@link #buildTextBlock()}; {@code null} until first built. */
    TextBlock textBlock;
    @EqualsAndHashCode.Exclude
    TableOfContents tableOfContents;

    @Builder.Default
    @EqualsAndHashCode.Exclude
    Set<EntityNode> entities = new HashSet<>();


    /**
     * Builds the text block from all terminal sub nodes on first use and returns the cached
     * instance afterwards.
     *
     * @return the concatenated text block of all terminal sub nodes
     */
    @Override
    public TextBlock buildTextBlock() {

        if (textBlock == null) {
            textBlock = streamAllSubNodes().filter(SemanticNode::isTerminal).map(SemanticNode::getTerminalTextBlock).collect(new TextBlockCollector());
        }
        return textBlock;
    }


    /**
     * @return {@code "<tocId>: SECTION: <summary of the section text>"}
     */
    @Override
    public String toString() {

        // plain concatenation (as in the other node types) prints "null" for an unset tocId instead of throwing
        return tocId + ": " + NodeType.SECTION + ": " + buildTextBlock().buildSummary();
    }


    /**
     * @return the first direct child that is a {@link HeadlineNode}
     * @throws NoSuchElementException if no direct child is a HeadlineNode
     */
    @Override
    public HeadlineNode getHeadline() {

        return streamChildren().filter(HeadlineNode.class::isInstance)
                .map(HeadlineNode.class::cast)
                .findFirst()
                .orElseThrow(() -> new NoSuchElementException("ClassificationSection has no Headline!"));
    }

}

// Patch target: layoutparser-service/layoutparser-service-internal-api/src/main/java/com.knecon.fforesight.service.layoutparser.internal.api/graph/nodes/SemanticNode.java
package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;

import java.awt.geom.Rectangle2D;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import com.knecon.fforesight.service.layoutparser.internal.api.graph.Boundary;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.AtomicTextBlock;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.internal.api.services.RectangleTransformations;

/**
 * Common contract of all nodes in the document graph (sections, headlines, paragraphs, tables, ...).
 * Navigation (parent, children, sub nodes) is delegated to the {@link TableOfContents} using the
 * node's TableOfContents id.
 */
public interface SemanticNode {

    /**
     * Searches all Nodes located underneath this Node in the TableOfContents and concatenates their AtomicTextBlocks into a single TextBlock.
     * So, for a Section all TextBlocks of Subsections, Paragraphs, and Tables are concatenated into a single TextBlock.
     * If the Node is terminal, the TerminalTextBlock will be returned instead.
     *
     * @return TextBlock containing all AtomicTextBlocks that are located under this Node.
     */
    TextBlock buildTextBlock();


    /**
     * Any Node maintains its own Set of Entities.
     * This Set contains all Entities whose boundary intersects the boundary of this node.
     *
     * @return Set of all Entities associated with this Node
     */
    Set<EntityNode> getEntities();


    /**
     * Each AtomicTextBlock is assigned a page, so to get the pages this node appears on, it collects the PageNodes from each AtomicTextBlock belonging to this node's TextBlock.
     *
     * @return Set of PageNodes this node appears on.
     */
    default Set<PageNode> getPages() {

        return buildTextBlock().getPages();
    }


    /**
     * @return the TableOfContents of the document this node belongs to
     */
    TableOfContents getTableOfContents();


    /**
     * The id is a List of Integers uniquely identifying this node in the TableOfContents.
     *
     * @return the TableOfContents ID
     */
    List<Integer> getTocId();


    /**
     * This should only be used during graph construction.
     *
     * @param tocId List of Integers
     */
    void setTocId(List<Integer> tocId);


    /**
     * Traverses the tree upwards until it hits a HeadlineNode, or a SectionNode, which then returns the first HeadlineNode among its children.
     *
     * @return first HeadlineNode found
     * @throws java.util.NoSuchElementException if a SectionNode without a HeadlineNode child is reached
     */
    default SemanticNode getHeadline() {

        return getParent().getHeadline();
    }


    /**
     * @return boolean indicating whether this Node has a parent in the TableOfContents
     */
    default boolean hasParent() {

        return getTableOfContents().hasParentById(getTocId());
    }


    /**
     * Looks up the parent entry of this node in the TableOfContents.
     * According to the original documentation a NotFoundException is thrown when no parent is
     * present; that is raised inside TableOfContents and cannot be confirmed from this file.
     *
     * @return the SemanticNode representing the parent in the TableOfContents
     */
    default SemanticNode getParent() {

        return getTableOfContents().getParentEntryById(getTocId()).node();
    }


    /**
     * Terminal means a SemanticNode has direct access to a TextBlock; by default this is false and must be overridden.
     * Currently only Sections, Images, and Tables are not terminal.
     * A TableCell might be terminal depending on its area compared to the page.
     *
     * @return boolean, indicating if a Node has direct access to a TextBlock
     */
    default boolean isTerminal() {

        return false;
    }


    /**
     * Only terminal nodes hold a terminal TextBlock; the default implementation therefore always throws.
     *
     * @return the terminal TextBlock of this node
     * @throws UnsupportedOperationException if the implementing node does not override this method
     */
    default TextBlock getTerminalTextBlock() {

        throw new UnsupportedOperationException("Only terminal Nodes have access to TerminalTextBlocks!");
    }


    /**
     * @param textBlock the terminal TextBlock to set
     * @throws UnsupportedOperationException if the implementing node does not override this method
     */
    default void setTerminalTextBlock(TextBlock textBlock) {

        throw new UnsupportedOperationException();
    }


    /**
     * Each AtomicTextBlock has an index on its page; this returns the number of the first AtomicTextBlock underneath this node.
     * If this node does not have any AtomicTextBlocks underneath it, e.g. an empty TableCell, it returns -1.
     *
     * @return Integer representing the number on the page
     */
    default Integer getNumberOnPage() {

        // build once: buildTextBlock() is not cached by every implementation
        List<AtomicTextBlock> atomicTextBlocks = buildTextBlock().getAtomicTextBlocks();
        if (atomicTextBlocks.isEmpty()) {
            return -1;
        }
        return atomicTextBlocks.get(0).getNumberOnPage();
    }


    /**
     * @return true, if this node's TextBlock is not empty
     */
    default boolean hasText() {

        return buildTextBlock().length() > 0;
    }


    /**
     * @param string A String which the TextBlock might contain
     * @return true, if this node's TextBlock contains the string
     */
    default boolean containsString(String string) {

        return buildTextBlock().getSearchText().contains(string);
    }


    /**
     * @param strings A List of Strings which the TextBlock might contain
     * @return true, if this node's TextBlock contains any of the strings
     */
    default boolean containsAnyString(List<String> strings) {

        return strings.stream().anyMatch(this::containsString);
    }


    /**
     * This function is used during insertion of EntityNodes into the graph. It checks if the boundary of this node's TextBlock intersects or even contains the boundary of the EntityNode.
     * On intersection the node registers itself at the entity (as intersecting node, and as deepest fully containing node if it contains the boundary)
     * and recursively calls this function on all its children. Nodes that do not intersect are not descended into.
     *
     * @param entityNode EntityNode, which is being inserted into the graph
     */
    default void addThisToEntityIfIntersects(EntityNode entityNode) {

        TextBlock textBlock = buildTextBlock();
        if (textBlock.getBoundary().intersects(entityNode.getBoundary())) {

            if (textBlock.containsBoundary(entityNode.getBoundary())) {
                // children are visited afterwards and overwrite this, so the deepest containing node wins
                entityNode.setDeepestFullyContainingNode(this);
            }

            entityNode.addIntersectingNode(this);
            streamChildren().forEach(node -> node.addThisToEntityIfIntersects(entityNode));
        }
    }


    /**
     * Streams all children located directly underneath this node in the TableOfContents.
     *
     * @return Stream of all children
     */
    default Stream<SemanticNode> streamChildren() {

        return getTableOfContents().streamChildren(getTocId());
    }


    /**
     * Recursively streams all SemanticNodes located underneath this node in the TableOfContents in order.
     *
     * @return Stream of all SubNodes
     */
    default Stream<SemanticNode> streamAllSubNodes() {

        return getTableOfContents().streamSubEntriesInOrder(getTocId()).map(TableOfContents.Entry::node);
    }


    /**
     * @return Boundary of this Node's TextBlock
     */
    default Boundary getBoundary() {

        return buildTextBlock().getBoundary();
    }


    /**
     * If this Node is terminal it will calculate the bounding box of its TerminalTextBlock, otherwise it will calculate the union of the bounding boxes of all its children.
     * If called on the document, it will return the cropbox of each page.
     *
     * @return Rectangle2D fully encapsulating this Node for each page.
     */
    default Map<PageNode, Rectangle2D> getBBox() {

        Map<PageNode, Rectangle2D> bBoxPerPage = new HashMap<>();
        if (isTerminal()) {
            return getBBoxFromTerminalTextBlock(bBoxPerPage);
        }

        return getBBoxFromChildren(bBoxPerPage);
    }


    /**
     * TODO this does not yet work for sections spanning multiple columns
     * <p>
     * Note: the reduction merges into the maps returned by the children's {@code getBBox()},
     * so implementations must return a mutable map.
     *
     * @param bBoxPerPage initial empty BoundingBox, returned unchanged if there are no children
     * @return The union of the BoundingBoxes of all children
     */
    private Map<PageNode, Rectangle2D> getBBoxFromChildren(Map<PageNode, Rectangle2D> bBoxPerPage) {

        return streamChildren().map(SemanticNode::getBBox).reduce((map1, map2) -> {
            map1.forEach((page, rectangle) -> map2.merge(page, rectangle, (rect1, rect2) -> rect1.createUnion(rect2).getBounds2D()));
            return map2;
        }).orElse(bBoxPerPage);
    }


    /**
     * @param bBoxPerPage initial empty BoundingBox
     * @return The union of all BoundingBoxes of the AtomicTextBlocks of this node, per page
     */
    private Map<PageNode, Rectangle2D> getBBoxFromTerminalTextBlock(Map<PageNode, Rectangle2D> bBoxPerPage) {

        Map<PageNode, List<AtomicTextBlock>> atomicTextBlockPerPage = buildTextBlock().getAtomicTextBlocks().stream().collect(Collectors.groupingBy(AtomicTextBlock::getPage));
        atomicTextBlockPerPage.forEach((page, atbs) -> bBoxPerPage.put(page, RectangleTransformations.bBoxUnionAtomicTextBlock(atbs)));
        return bBoxPerPage;
    }

}

// Patch target: layoutparser-service/layoutparser-service-internal-api/src/main/java/com.knecon.fforesight.service.layoutparser.internal.api/graph/nodes/TableCellNode.java
package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;

import java.awt.geom.Rectangle2D;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Stream;

import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlockCollector;

import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults;

/**
 * A single cell of a table. A cell is terminal by default and then exposes
 * {@link #terminalTextBlock}; a non-terminal cell collects (and caches) its text from its sub nodes.
 */
@Data
@Builder
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class TableCellNode implements SemanticNode {

    /** Id of this node in the TableOfContents. */
    List<Integer> tocId;
    int row;
    int col;
    boolean header;

    Rectangle2D bBox;

    @Builder.Default
    boolean terminal = true;
    TextBlock terminalTextBlock;

    /** Cache for {@link #buildTextBlock()} of a non-terminal cell; {@code null} until first built. */
    TextBlock textBlock;

    @EqualsAndHashCode.Exclude
    TableOfContents tableOfContents;

    @Builder.Default
    @EqualsAndHashCode.Exclude
    Set<EntityNode> entities = new HashSet<>();


    /**
     * @return a new mutable map assigning this cell's {@code bBox} to every page the cell's text appears on
     */
    @Override
    public Map<PageNode, Rectangle2D> getBBox() {

        Map<PageNode, Rectangle2D> bBoxPerPage = new HashMap<>();
        getPages().forEach(page -> bBoxPerPage.put(page, bBox));
        return bBoxPerPage;
    }


    /**
     * @return the terminal text block for a terminal cell, otherwise the (cached) concatenation of
     * the terminal text blocks of all sub nodes
     */
    @Override
    public TextBlock buildTextBlock() {

        if (terminal) {
            return terminalTextBlock;
        }

        if (textBlock == null) {
            textBlock = streamAllSubNodes().filter(SemanticNode::isTerminal).map(SemanticNode::getTerminalTextBlock).collect(new TextBlockCollector());
        }
        return textBlock;
    }


    /**
     * @return {@code "<tocId>: TABLE_CELL: <summary of the cell text>"}
     */
    @Override
    public String toString() {

        return tocId + ": " + NodeType.TABLE_CELL + ": " + buildTextBlock().buildSummary();
    }


    /**
     * @param headerString the header text to look for
     * @return true, if the stripped text of any header cell in this cell's row or column equals {@code headerString}
     */
    public boolean hasHeader(String headerString) {

        return getHeaders().anyMatch(headerCell -> headerCell.buildTextBlock().getSearchText().strip().equals(headerString));
    }


    /**
     * @return the header cells of the parent table that share this cell's row or column;
     * the parent is cast to {@link TableNode}
     */
    private Stream<TableCellNode> getHeaders() {

        TableNode tableNode = (TableNode) getParent();
        return tableNode.streamHeadersForCell(row, col);
    }

}

// Patch target: layoutparser-service/layoutparser-service-internal-api/src/main/java/com.knecon.fforesight.service.layoutparser.internal.api/graph/nodes/TableNode.java
package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;

import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.stream.Stream;

import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlockCollector;

import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults;

/**
 * A table in the document graph. Its direct children are cast to {@link TableCellNode}.
 * Tables are not terminal: the text block is collected lazily from all terminal sub nodes and
 * cached in {@link #textBlock}.
 */
@Data
@Builder
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class TableNode implements SemanticNode {

    /** Id of this node in the TableOfContents. */
    List<Integer> tocId;
    // excluded from equals/hashCode like in every other node type
    @EqualsAndHashCode.Exclude
    TableOfContents tableOfContents;

    Integer numberOfRows;
    Integer numberOfCols;

    /** Cache for {@link #buildTextBlock()}; {@code null} until first built. */
    TextBlock textBlock;

    @Builder.Default
    @EqualsAndHashCode.Exclude
    Set<EntityNode> entities = new HashSet<>();


    /**
     * @return all direct children, cast to TableCellNode
     */
    public Stream<TableCellNode> streamTableCells() {

        return streamChildren().map(TableCellNode.class::cast);
    }


    /**
     * @return all cells flagged as header
     */
    public Stream<TableCellNode> streamHeaders() {

        return streamTableCells().filter(TableCellNode::isHeader);
    }


    /**
     * @param row row of the cell
     * @param col column of the cell
     * @return all header cells located in the given row or in the given column
     */
    public Stream<TableCellNode> streamHeadersForCell(int row, int col) {

        return streamHeaders().filter(cell -> cell.getRow() == row || cell.getCol() == col);
    }


    /**
     * Builds the text block from all terminal sub nodes on first use and returns the cached
     * instance afterwards.
     *
     * @return the concatenated text block of all terminal sub nodes
     */
    @Override
    public TextBlock buildTextBlock() {

        if (textBlock == null) {
            textBlock = streamAllSubNodes().filter(SemanticNode::isTerminal).map(SemanticNode::getTerminalTextBlock).collect(new TextBlockCollector());
        }
        return textBlock;
    }


    /**
     * @return {@code "<tocId>: TABLE: <summary of the table text>"}
     */
    @Override
    public String toString() {

        // plain concatenation (as in the other node types) prints "null" for an unset tocId instead of throwing
        return tocId + ": " + NodeType.TABLE + ": " + buildTextBlock().buildSummary();
    }

}

// Patch target: layoutparser-service/layoutparser-service-internal-api/src/main/java/com.knecon.fforesight.service.layoutparser.internal.api/graph/textblock/AtomicTextBlock.java
package com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock;

import java.awt.geom.Rectangle2D;
import java.util.List;

import com.knecon.fforesight.service.layoutparser.internal.api.graph.Boundary;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityPosition;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SemanticNode;
import com.knecon.fforesight.service.layoutparser.internal.api.services.RectangleTransformations;

import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults;

/**
 * The smallest unit of text: a string located on a single page, together with its line breaks
 * and the position of its characters.
 * <p>
 * Indices passed to and returned from the methods of this class are absolute string indices,
 * i.e. they lie inside {@link #boundary}. The stored {@link #lineBreaks} and the index table
 * {@link #stringIdxToPositionIdx} are relative to {@code boundary.start()}.
 */
@Data
@Builder
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class AtomicTextBlock implements TextBlock {

    Long id;
    Integer numberOnPage;
    PageNode page;

    //string coordinates
    Boundary boundary;
    String searchText;
    /** Line break offsets, relative to {@code boundary.start()}. */
    List<Integer> lineBreaks;

    //position coordinates
    /** Maps a string offset (relative to {@code boundary.start()}) to an index into {@link #positions}. */
    List<Integer> stringIdxToPositionIdx;
    List<Rectangle2D> positions;

    @EqualsAndHashCode.Exclude
    SemanticNode parent;


    /**
     * @return number of line breaks plus one
     */
    @Override
    public int numberOfLines() {

        return lineBreaks.size() + 1;
    }


    /**
     * Returns the text of a single line. The first line starts at the start of the block, the
     * last line ends at the end of the block; a block without line breaks consists of exactly
     * one line spanning the whole block.
     *
     * @param lineNumber zero-based line number
     * @return the text between the enclosing line breaks
     * @throws IndexOutOfBoundsException if {@code lineNumber} is negative or not smaller than {@link #numberOfLines()}
     */
    public CharSequence getLine(int lineNumber) {

        int lineCount = numberOfLines();
        if (lineNumber >= lineCount || lineNumber < 0) {
            throw new IndexOutOfBoundsException(String.format("line %d out of range for AtomicTextBlock with %d lines", lineNumber, lineCount));
        }
        int offset = boundary.start();
        // no lineBreaks lookup for the outer edges, so a block without any line break works, too
        int lineStart = lineNumber == 0 ? offset : lineBreaks.get(lineNumber - 1) + offset;
        int lineEnd = lineNumber == lineCount - 1 ? boundary.end() : lineBreaks.get(lineNumber) + offset;
        return subSequence(lineStart, lineEnd);
    }


    /**
     * @return an immutable single-element list containing this block
     */
    @Override
    public List<AtomicTextBlock> getAtomicTextBlocks() {

        return List.of(this);
    }


    /**
     * @param fromIndex absolute string index
     * @return absolute index of the first stored line break located after {@code fromIndex},
     * or {@code boundary.start() + searchText.length()} if there is none
     */
    @Override
    public int getNextLinebreak(int fromIndex) {

        return lineBreaks.stream()//
                .filter(linebreak -> linebreak > fromIndex - boundary.start()) //
                .findFirst() //
                .orElse(searchText.length()) + boundary.start();
    }


    /**
     * @param fromIndex absolute string index
     * @return absolute index of the last stored line break located at or before {@code fromIndex},
     * or {@code boundary.start()} if there is none
     */
    @Override
    public int getPreviousLinebreak(int fromIndex) {

        return lineBreaks.stream()//
                .filter(linebreak -> linebreak <= fromIndex - boundary.start())//
                .reduce((a, b) -> b)//
                .orElse(0) + boundary.start();
    }


    /**
     * @param stringIdx absolute string index
     * @return the position rectangle mapped to that index
     */
    @Override
    public Rectangle2D getPosition(int stringIdx) {

        return positions.get(stringIdxToPositionIdx.get(stringIdx - boundary.start()));
    }


    /**
     * @param stringBoundary absolute boundary, must be contained in this block's boundary
     * @return view of the position rectangles mapped to the given boundary
     * @throws IndexOutOfBoundsException if the boundary is not contained in this block
     */
    @Override
    public List<Rectangle2D> getPositions(Boundary stringBoundary) {

        if (!containsBoundary(stringBoundary)) {
            throw new IndexOutOfBoundsException(String.format("%s is out of bounds for %s", stringBoundary, this.boundary));
        }

        int firstPosition = stringIdxToPositionIdx.get(stringBoundary.start() - this.boundary.start());
        if (stringBoundary.end() == this.boundary.end()) {
            // the end index itself has no entry in the index table, so take everything up to the last position
            return positions.subList(firstPosition, positions.size());
        }

        return positions.subList(firstPosition, stringIdxToPositionIdx.get(stringBoundary.end() - this.boundary.start()));
    }


    /**
     * Splits the boundary at the line breaks it contains and unions the positions of each part
     * into one rectangle per line.
     *
     * @param stringBoundary absolute boundary inside this block
     * @return a single EntityPosition for the page of this block
     */
    @Override
    public List<EntityPosition> getEntityPositionsPerPage(Boundary stringBoundary) {

        List<Rectangle2D> positionsPerLine = stringBoundary.split(getLineBreaks().stream().map(lb -> lb + boundary.start()).filter(stringBoundary::contains).toList())
                .stream()
                .map(this::getPositions)
                .map(RectangleTransformations::rectangleUnion)
                .toList();

        return List.of(EntityPosition.builder().rectanglePerLine(positionsPerLine).pageNode(page).build());
    }


    /**
     * @return the search text of this block
     */
    @Override
    public String toString() {

        return searchText;
    }

}

// Patch target: layoutparser-service/layoutparser-service-internal-api/src/main/java/com.knecon.fforesight.service.layoutparser.internal.api/graph/textblock/ConcatenatedTextBlock.java
package com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock;

import java.awt.geom.Rectangle2D;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

import com.knecon.fforesight.service.layoutparser.internal.api.graph.Boundary;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityPosition;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode;

import lombok.AccessLevel;
import lombok.Data;
import lombok.experimental.FieldDefaults;

/**
 * A TextBlock made of consecutive AtomicTextBlocks. Its boundary spans from the start of the
 * first to the end of the last block; an empty instance has the boundary {@code (-1, -1)}.
 * Not thread-safe.
 */
@Data
@FieldDefaults(level = AccessLevel.PRIVATE)
public class ConcatenatedTextBlock implements TextBlock {

    List<AtomicTextBlock> atomicTextBlocks;
    /** Cache for {@link #getSearchText()}; {@code null} until built, reset by {@link #concat(TextBlock)}. */
    String searchText;
    Boundary boundary;


    /**
     * @param atomicTextBlocks consecutive AtomicTextBlocks, may be empty
     * @throws UnsupportedOperationException if the blocks are not consecutive
     */
    public ConcatenatedTextBlock(List<AtomicTextBlock> atomicTextBlocks) {

        this.atomicTextBlocks = new LinkedList<>();
        if (atomicTextBlocks.isEmpty()) {
            boundary = new Boundary(-1, -1);
            return;
        }
        var firstTextBlock = atomicTextBlocks.get(0);
        this.atomicTextBlocks.add(firstTextBlock);
        boundary = new Boundary(firstTextBlock.getBoundary().start(), firstTextBlock.getBoundary().end());

        atomicTextBlocks.subList(1, atomicTextBlocks.size()).forEach(this::concat);
    }


    /**
     * Appends all AtomicTextBlocks of the given TextBlock to this one. Unless this block is
     * still empty, the given block must start exactly where this one ends.
     *
     * @param textBlock the TextBlock to append
     * @return this
     * @throws UnsupportedOperationException if the given block does not directly follow this one
     */
    public ConcatenatedTextBlock concat(TextBlock textBlock) {

        if (this.atomicTextBlocks.isEmpty()) {
            boundary.setStart(textBlock.getBoundary().start());
            boundary.setEnd(textBlock.getBoundary().end());
        } else if (boundary.end() != textBlock.getBoundary().start()) {
            throw new UnsupportedOperationException(String.format("Can only concat consecutive TextBlocks, trying to concat %s and %s", boundary, textBlock.getBoundary()));
        }
        this.atomicTextBlocks.addAll(textBlock.getAtomicTextBlocks());
        boundary.setEnd(textBlock.getBoundary().end());
        // the content changed, so a search text cached by an earlier getSearchText() call is stale
        searchText = null;
        return this;
    }


    /**
     * @throws IndexOutOfBoundsException if no AtomicTextBlock contains the index
     */
    private AtomicTextBlock getAtomicTextBlockByStringIndex(int stringIdx) {

        return atomicTextBlocks.stream().filter(textBlock -> (textBlock.getBoundary().contains(stringIdx))).findAny().orElseThrow(IndexOutOfBoundsException::new);
    }


    private List<AtomicTextBlock> getAllAtomicTextBlocksPartiallyInStringBoundary(Boundary boundary) {

        return atomicTextBlocks.stream().filter(tb -> tb.getBoundary().intersects(boundary)).toList();
    }


    /**
     * @return the search texts of all AtomicTextBlocks joined without separator; built lazily and cached
     */
    @Override
    public String getSearchText() {

        if (searchText == null) {
            StringBuilder sb = new StringBuilder();
            getAtomicTextBlocks().forEach(atb -> sb.append(atb.getSearchText()));
            searchText = sb.toString();
        }
        return searchText;
    }


    /**
     * NOTE(review): this sums the number of line breaks, whereas AtomicTextBlock#numberOfLines()
     * returns the number of line breaks plus one - confirm which of the two is intended.
     *
     * @return the total number of line breaks of all AtomicTextBlocks
     */
    @Override
    public int numberOfLines() {

        return atomicTextBlocks.stream().map(AtomicTextBlock::getLineBreaks).mapToInt(List::size).sum();
    }


    @Override
    public int getNextLinebreak(int fromIndex) {

        return getAtomicTextBlockByStringIndex(fromIndex).getNextLinebreak(fromIndex);
    }


    @Override
    public int getPreviousLinebreak(int fromIndex) {

        return getAtomicTextBlockByStringIndex(fromIndex).getPreviousLinebreak(fromIndex);
    }


    /**
     * NOTE(review): the values are returned as stored in the AtomicTextBlocks, i.e. each one is
     * relative to the start of its own AtomicTextBlock, not to the start of this block.
     *
     * @return the line breaks of all AtomicTextBlocks in order
     */
    @Override
    public List<Integer> getLineBreaks() {

        return getAtomicTextBlocks().stream().flatMap(atomicTextBlock -> atomicTextBlock.getLineBreaks().stream()).toList();
    }


    @Override
    public Rectangle2D getPosition(int stringIdx) {

        return getAtomicTextBlockByStringIndex(stringIdx).getPosition(stringIdx);
    }


    /**
     * Collects the positions of the given boundary across all AtomicTextBlocks it intersects:
     * the tail of the first, all positions of the blocks in between, and the head of the last.
     *
     * @throws IndexOutOfBoundsException if no AtomicTextBlock intersects the boundary
     */
    @Override
    public List<Rectangle2D> getPositions(Boundary stringBoundary) {

        List<AtomicTextBlock> textBlocks = getAllAtomicTextBlocksPartiallyInStringBoundary(stringBoundary);

        if (textBlocks.size() == 1) {
            return textBlocks.get(0).getPositions(stringBoundary);
        }

        AtomicTextBlock firstTextBlock = textBlocks.get(0);
        List<Rectangle2D> positions = new LinkedList<>(firstTextBlock.getPositions(new Boundary(stringBoundary.start(), firstTextBlock.getBoundary().end())));

        for (AtomicTextBlock textBlock : textBlocks.subList(1, textBlocks.size() - 1)) {
            positions.addAll(textBlock.getPositions());
        }

        var lastTextBlock = textBlocks.get(textBlocks.size() - 1);
        positions.addAll(lastTextBlock.getPositions(new Boundary(lastTextBlock.getBoundary().start(), stringBoundary.end())));

        return positions;
    }


    /**
     * Same traversal as {@link #getPositions(Boundary)}, but yields one EntityPosition per page.
     *
     * @throws IndexOutOfBoundsException if no AtomicTextBlock intersects the boundary
     */
    @Override
    public List<EntityPosition> getEntityPositionsPerPage(Boundary stringBoundary) {

        List<AtomicTextBlock> textBlocks = getAllAtomicTextBlocksPartiallyInStringBoundary(stringBoundary);

        if (textBlocks.size() == 1) {
            return textBlocks.get(0).getEntityPositionsPerPage(stringBoundary);
        }

        AtomicTextBlock firstTextBlock = textBlocks.get(0);
        List<EntityPosition> positions = new LinkedList<>(firstTextBlock.getEntityPositionsPerPage(new Boundary(stringBoundary.start(), firstTextBlock.getBoundary().end())));

        for (AtomicTextBlock textBlock : textBlocks.subList(1, textBlocks.size() - 1)) {
            positions.addAll(textBlock.getEntityPositionsPerPage(textBlock.getBoundary()));
        }

        AtomicTextBlock lastTextBlock = textBlocks.get(textBlocks.size() - 1);
        positions.addAll(lastTextBlock.getEntityPositionsPerPage(new Boundary(lastTextBlock.getBoundary().start(), stringBoundary.end())));

        return mergeEntityPositionsWithSamePageNode(positions);
    }


    /**
     * Groups the given positions by page and joins the line rectangles of each group into one EntityPosition.
     */
    private List<EntityPosition> mergeEntityPositionsWithSamePageNode(List<EntityPosition> positions) {

        Map<PageNode, List<Rectangle2D>> entityPositionsPerPage = positions.stream().collect(//
                Collectors.groupingBy(EntityPosition::getPageNode, //
                        Collectors.flatMapping(entityPosition -> entityPosition.getRectanglePerLine().stream(), Collectors.toList())));

        return entityPositionsPerPage.entrySet().stream()//
                .map(entry -> EntityPosition.builder().pageNode(entry.getKey()).rectanglePerLine(entry.getValue()).build())//
                .toList();

    }


    @Override
    public String toString() {

        return getSearchText();
    }

}

// Patch target: layoutparser-service/layoutparser-service-internal-api/src/main/java/com.knecon.fforesight.service.layoutparser.internal.api/graph/textblock/TextBlock.java
package com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock;

import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;

import com.knecon.fforesight.service.layoutparser.internal.api.graph.Boundary;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityPosition;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode;

/**
 * A piece of text with a {@link Boundary} describing its absolute string range.
 * <p>
 * Note that, unlike the general {@link CharSequence} contract, the indices of
 * {@link #charAt(int)} and {@link #subSequence(int, int)} are absolute: the default
 * implementations subtract {@code getBoundary().start()} before accessing the search text.
 */
public interface TextBlock extends CharSequence {

    String getSearchText();


    List<AtomicTextBlock> getAtomicTextBlocks();


    Boundary getBoundary();


    int getNextLinebreak(int fromIndex);


    int getPreviousLinebreak(int fromIndex);


    List<Integer> getLineBreaks();


    Rectangle2D getPosition(int stringIdx);


    List<Rectangle2D> getPositions(Boundary stringBoundary);


    List<EntityPosition> getEntityPositionsPerPage(Boundary stringBoundary);


    int numberOfLines();


    /**
     * @return absolute index of the first occurrence of the search term, or -1 if it does not occur
     */
    default int indexOf(String searchTerm) {

        return indexOf(searchTerm, getBoundary().start());
    }


    /**
     * @return unmodifiable set of the pages of all AtomicTextBlocks
     */
    default Set<PageNode> getPages() {

        return getAtomicTextBlocks().stream().map(AtomicTextBlock::getPage).collect(Collectors.toUnmodifiableSet());
    }


    /**
     * @param searchTerm the text to search for
     * @param startOffset absolute index to start the search at
     * @return absolute index of the first occurrence at or after {@code startOffset}, or -1 if there is none
     */
    default int indexOf(String searchTerm, int startOffset) {

        int start = getSearchText().indexOf(searchTerm, startOffset - getBoundary().start());
        if (start == -1) {
            return -1;
        }
        return start + getBoundary().start();
    }


    /**
     * @return the text from the start of this block up to the first line break after the start
     */
    default CharSequence getFirstLine() {

        return subSequence(getBoundary().start(), getNextLinebreak(getBoundary().start()));
    }


    /**
     * @return true, if the boundary of this block contains the given boundary
     * @throws IllegalArgumentException if the end of the given boundary lies before its start
     */
    default boolean containsBoundary(Boundary boundary) {

        if (boundary.end() < boundary.start()) {
            throw new IllegalArgumentException(String.format("Invalid %s, StartIndex must be smaller than EndIndex", boundary));
        }
        return getBoundary().contains(boundary);
    }


    default boolean containsIndex(int stringIndex) {

        return getBoundary().contains(stringIndex);
    }


    default CharSequence subSequence(Boundary boundary) {

        return subSequence(boundary.start(), boundary.end());
    }


    /**
     * @return the first (up to) four space-separated words of the search text, joined by single spaces
     */
    default String buildSummary() {

        String[] words = getSearchText().split(" ");
        int bound = Math.min(words.length, 4);
        List<String> list = new ArrayList<>(Arrays.asList(words).subList(0, bound));

        return String.join(" ", list);
    }


    /**
     * @param start absolute start index (inclusive)
     * @param end absolute end index (exclusive)
     */
    @Override
    default CharSequence subSequence(int start, int end) {

        return getSearchText().substring(start - getBoundary().start(), end - getBoundary().start());
    }


    @Override
    default int length() {

        return getBoundary().length();
    }


    /**
     * @param index absolute string index
     */
    @Override
    default char charAt(int index) {

        return getSearchText().charAt(index - getBoundary().start());
    }

}

// Patch target: layoutparser-service/layoutparser-service-internal-api/src/main/java/com.knecon.fforesight.service.layoutparser.internal.api/graph/textblock/TextBlockCollector.java
package com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock;

import java.util.Collections;
import java.util.Set;
import java.util.function.BiConsumer;
import java.util.function.BinaryOperator;
import java.util.function.Function;
import java.util.function.Supplier;
import java.util.stream.Collector;

import lombok.NoArgsConstructor;

/**
 * Collects a stream of consecutive TextBlocks into a single {@link ConcatenatedTextBlock}.
 * The blocks must arrive in encounter order, since {@link ConcatenatedTextBlock#concat(TextBlock)}
 * only accepts a block that directly follows the previous one.
 */
@NoArgsConstructor
public class TextBlockCollector implements Collector<TextBlock, ConcatenatedTextBlock, TextBlock> {

    /**
     * @return supplier of a new, empty ConcatenatedTextBlock
     */
    @Override
    public Supplier<ConcatenatedTextBlock> supplier() {

        return () -> new ConcatenatedTextBlock(Collections.emptyList());
    }


    @Override
    public BiConsumer<ConcatenatedTextBlock, TextBlock> accumulator() {

        return ConcatenatedTextBlock::concat;
    }


    @Override
    public BinaryOperator<ConcatenatedTextBlock> combiner() {

        return ConcatenatedTextBlock::concat;
    }


    @Override
    public Function<ConcatenatedTextBlock, TextBlock> finisher() {

        return a -> a;
    }


    /**
     * Only IDENTITY_FINISH is declared. CONCURRENT must not be declared: it would permit
     * several threads to accumulate into the same ConcatenatedTextBlock, which is neither
     * thread-safe nor independent of the order of its inputs.
     */
    @Override
    public Set<Characteristics> characteristics() {

        return Set.of(Characteristics.IDENTITY_FINISH);
    }

}
a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com.knecon.fforesight.service.layoutparser.internal.api/mapper/DocumentDataMapper.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com.knecon.fforesight.service.layoutparser.internal.api/mapper/DocumentDataMapper.java new file mode 100644 index 0000000..4fc78a9 --- /dev/null +++ b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com.knecon.fforesight.service.layoutparser.internal.api/mapper/DocumentDataMapper.java @@ -0,0 +1,143 @@ +package com.knecon.fforesight.service.layoutparser.internal.api.mapper; + +import java.awt.geom.Rectangle2D; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import com.knecon.fforesight.service.layoutparser.internal.api.data.AtomicPositionBlockData; +import com.knecon.fforesight.service.layoutparser.internal.api.data.AtomicTextBlockData; +import com.knecon.fforesight.service.layoutparser.internal.api.data.DocumentData; +import com.knecon.fforesight.service.layoutparser.internal.api.data.PageData; +import com.knecon.fforesight.service.layoutparser.internal.api.data.TableOfContentsData; +import com.knecon.fforesight.service.layoutparser.internal.api.graph.DocumentGraph; +import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents; +import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.ImageNode; +import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode; +import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.TableCellNode; +import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.TableNode; +import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.AtomicTextBlock; +import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock; + +public class DocumentDataMapper { + + public DocumentData toDocumentData(DocumentGraph documentGraph) { + + 
List atomicTextBlockData = documentGraph.streamTerminalTextBlocksInOrder() + .flatMap(textBlock -> textBlock.getAtomicTextBlocks().stream()) + .distinct() + .map(this::toAtomicTextBlockData) + .toList(); + + List atomicPositionBlockData = documentGraph.streamTerminalTextBlocksInOrder() + .flatMap(textBlock -> textBlock.getAtomicTextBlocks().stream()) + .distinct() + .map(this::toAtomicPositionBlockData) + .toList(); + + List pageData = documentGraph.getPages().stream().map(this::toPageData).toList(); + TableOfContentsData tableOfContentsData = toTableOfContentsData(documentGraph.getTableOfContents()); + return DocumentData.builder() + .atomicTextBlocks(atomicTextBlockData.toArray(new AtomicTextBlockData[0])) + .atomicPositionBlocks(atomicPositionBlockData.toArray(new AtomicPositionBlockData[0])) + .pages(pageData.toArray(new PageData[0])) + .tableOfContents(tableOfContentsData) + .build(); + } + + + private TableOfContentsData toTableOfContentsData(TableOfContents tableOfContents) { + + return new TableOfContentsData(tableOfContents.getEntries().stream().map(this::toEntryData).toList()); + } + + + private TableOfContentsData.EntryData toEntryData(TableOfContents.Entry entry) { + + Long[] atomicTextBlocks; + + if (entry.node().isTerminal()) { + atomicTextBlocks = toAtomicTextBlockIds(entry.node().getTerminalTextBlock()); + } else { + atomicTextBlocks = new Long[]{}; + } + + Map properties = switch (entry.type()) { + case TABLE -> PropertiesMapper.buildTableProperties((TableNode) entry.node()); + case TABLE_CELL -> PropertiesMapper.buildTableCellProperties((TableCellNode) entry.node()); + case IMAGE -> PropertiesMapper.buildImageProperties((ImageNode) entry.node()); + default -> new HashMap<>(); + }; + + return TableOfContentsData.EntryData.builder() + .tocId(toPrimitiveIntArray(entry.tocId())) + .subEntries(entry.children().stream().map(this::toEntryData).toList()) + .type(entry.type()) + .atomicBlocks(atomicTextBlocks) + 
.pages(entry.node().getPages().stream().map(PageNode::getNumber).map(Integer::longValue).toArray(Long[]::new)) + .properties(properties) + .build(); + } + + + private Long[] toAtomicTextBlockIds(TextBlock textBlock) { + + return textBlock.getAtomicTextBlocks().stream().map(AtomicTextBlock::getId).toArray(Long[]::new); + } + + + private PageData toPageData(PageNode p) { + + return PageData.builder().rotation(p.getRotation()).height(p.getHeight()).width(p.getWidth()).number(p.getNumber()).build(); + } + + + private AtomicTextBlockData toAtomicTextBlockData(AtomicTextBlock atomicTextBlock) { + + return AtomicTextBlockData.builder() + .id(atomicTextBlock.getId()) + .page(atomicTextBlock.getPage().getNumber().longValue()) + .searchText(atomicTextBlock.getSearchText()) + .numberOnPage(atomicTextBlock.getNumberOnPage()) + .start(atomicTextBlock.getBoundary().start()) + .end(atomicTextBlock.getBoundary().end()) + .lineBreaks(toPrimitiveIntArray(atomicTextBlock.getLineBreaks())) + .build(); + } + + + private AtomicPositionBlockData toAtomicPositionBlockData(AtomicTextBlock atomicTextBlock) { + + return AtomicPositionBlockData.builder() + .id(atomicTextBlock.getId()) + .positions(toPrimitiveFloatMatrix(atomicTextBlock.getPositions())) + .stringIdxToPositionIdx(toPrimitiveIntArray(atomicTextBlock.getStringIdxToPositionIdx())) + .build(); + } + + + private float[][] toPrimitiveFloatMatrix(List positions) { + + float[][] positionMatrix = new float[positions.size()][]; + for (int i = 0; i < positions.size(); i++) { + float[] singlePositions = new float[4]; + singlePositions[0] = (float) positions.get(i).getMinX(); + singlePositions[1] = (float) positions.get(i).getMinY(); + singlePositions[2] = (float) positions.get(i).getWidth(); + singlePositions[3] = (float) positions.get(i).getHeight(); + positionMatrix[i] = singlePositions; + } + return positionMatrix; + } + + + private int[] toPrimitiveIntArray(List list) { + + int[] array = new int[list.size()]; + for (int i = 0; i < 
list.size(); i++) { + array[i] = list.get(i); + } + return array; + } + +} diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com.knecon.fforesight.service.layoutparser.internal.api/mapper/DocumentGraphMapper.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com.knecon.fforesight.service.layoutparser.internal.api/mapper/DocumentGraphMapper.java new file mode 100644 index 0000000..e3683db --- /dev/null +++ b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com.knecon.fforesight.service.layoutparser.internal.api/mapper/DocumentGraphMapper.java @@ -0,0 +1,225 @@ +package com.knecon.fforesight.service.layoutparser.internal.api.mapper; + +import static com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.NodeType.FOOTER; +import static com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.NodeType.HEADER; + +import java.awt.geom.Rectangle2D; +import java.util.Arrays; +import java.util.HashSet; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.NoSuchElementException; + +import com.google.common.primitives.Ints; +import com.knecon.fforesight.service.layoutparser.internal.api.data.AtomicPositionBlockData; +import com.knecon.fforesight.service.layoutparser.internal.api.data.AtomicTextBlockData; +import com.knecon.fforesight.service.layoutparser.internal.api.data.DocumentData; +import com.knecon.fforesight.service.layoutparser.internal.api.data.PageData; +import com.knecon.fforesight.service.layoutparser.internal.api.data.TableOfContentsData; +import com.knecon.fforesight.service.layoutparser.internal.api.graph.Boundary; +import com.knecon.fforesight.service.layoutparser.internal.api.graph.DocumentGraph; +import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents; +import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.FooterNode; +import 
com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.HeaderNode; +import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.HeadlineNode; +import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.ImageNode; +import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode; +import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.ParagraphNode; +import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SectionNode; +import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SemanticNode; +import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.TableCellNode; +import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.TableNode; +import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.AtomicTextBlock; +import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock; +import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlockCollector; + +public class DocumentGraphMapper { + + public DocumentGraph toDocumentGraph(DocumentData documentData) { + + Context context = new Context(documentData, + new TableOfContents(), + new LinkedList<>(), + new LinkedList<>(), + Arrays.stream(documentData.getAtomicTextBlocks()).toList(), + Arrays.stream(documentData.getAtomicPositionBlocks()).toList()); + + context.pages.addAll(Arrays.stream(documentData.getPages()).map(this::buildPage).toList()); + + context.tableOfContents.setEntries(buildEntries(documentData.getTableOfContents().getEntries(), context)); + + DocumentGraph documentGraph = DocumentGraph.builder() + .numberOfPages(documentData.getPages().length) + .pages(new HashSet<>(context.pages)) + .tableOfContents(context.tableOfContents) + .build(); + documentGraph.setTextBlock(documentGraph.buildTextBlock()); + return documentGraph; + } + + + private List buildEntries(List entries, + Context context) { 
+ + List newEntries = new LinkedList<>(); + for (TableOfContentsData.EntryData entryData : entries) { + + boolean terminal = isTerminal(entryData); + List pages = Arrays.stream(entryData.pages()).map(pageNumber -> getPage(pageNumber, context)).toList(); + + SemanticNode node = switch (entryData.type()) { + case SECTION -> buildSection(context); + case PARAGRAPH -> buildParagraph(context, terminal); + case HEADLINE -> buildHeadline(context, terminal); + case HEADER -> buildHeader(context, terminal); + case FOOTER -> buildFooter(context, terminal); + case TABLE -> buildTable(context, entryData.properties()); + case TABLE_CELL -> buildTableCell(context, entryData.properties(), terminal); + case IMAGE -> buildImage(context, entryData.properties()); + default -> throw new UnsupportedOperationException("Not yet implemented for type " + entryData.type()); + }; + + if (node.isTerminal()) { + TextBlock textBlock = toTextBlock(entryData.atomicBlocks(), context, node); + node.setTerminalTextBlock(textBlock); + } + List tocId = Arrays.stream(entryData.tocId()).boxed().toList(); + node.setTocId(tocId); + + if (entryData.type() == HEADER) { + pages.forEach(page -> page.setHeader((HeaderNode) node)); + } else if (entryData.type() == FOOTER) { + pages.forEach(page -> page.setFooter((FooterNode) node)); + } else { + pages.forEach(page -> page.getMainBody().add(node)); + } + newEntries.add(TableOfContents.Entry.builder().tocId(tocId).type(entryData.type()).children(buildEntries(entryData.subEntries(), context)).node(node).build()); + } + return newEntries; + } + + + private HeadlineNode buildHeadline(Context context, boolean terminal) { + + return HeadlineNode.builder().terminal(terminal).tableOfContents(context.tableOfContents()).build(); + } + + + private static boolean isTerminal(TableOfContentsData.EntryData entryData) { + + return entryData.atomicBlocks().length > 0; + } + + + private ImageNode buildImage(Context context, Map properties) { + + var builder = ImageNode.builder(); 
+ PropertiesMapper.parseImageProperties(properties, builder); + return builder.tableOfContents(context.tableOfContents()).build(); + } + + + private TableCellNode buildTableCell(Context context, Map properties, boolean terminal) { + + TableCellNode.TableCellNodeBuilder builder = TableCellNode.builder(); + PropertiesMapper.parseTableCellProperties(properties, builder); + return builder.terminal(terminal).tableOfContents(context.tableOfContents()).build(); + } + + + private TableNode buildTable(Context context, Map properties) { /* Builds a TableNode from the serialized entry properties (numberOfRows, numberOfCols) and attaches the shared table of contents. */ + + TableNode.TableNodeBuilder builder = TableNode.builder(); + PropertiesMapper.parseTableProperties(properties, builder); + return builder.tableOfContents(context.tableOfContents()).build(); /* reuse the populated builder; a fresh TableNode.builder() here would drop the parsed row and column counts */ + } + + + private FooterNode buildFooter(Context context, boolean terminal) { + + return FooterNode.builder().terminal(terminal).tableOfContents(context.tableOfContents()).build(); + } + + + private HeaderNode buildHeader(Context context, boolean terminal) { + + return HeaderNode.builder().terminal(terminal).tableOfContents(context.tableOfContents()).build(); + } + + + private SectionNode buildSection(Context context) { + + return SectionNode.builder().tableOfContents(context.tableOfContents()).build(); + + } + + + private ParagraphNode buildParagraph(Context context, boolean terminal) { + + return ParagraphNode.builder().terminal(terminal).tableOfContents(context.tableOfContents()).build(); + } + + + private TextBlock toTextBlock(Long[] atomicTextBlockIds, Context context, SemanticNode parent) { + + return Arrays.stream(atomicTextBlockIds) + .map(atomicTextBlockId -> toAtomicTextBlock(context.atomicTextBlockData.get(Math.toIntExact(atomicTextBlockId)), + context.atomicPositionBlockData.get(Math.toIntExact(atomicTextBlockId)), + parent, + context)) + .collect(new TextBlockCollector()); + } + + + private PageNode buildPage(PageData p) { + + return
PageNode.builder().rotation(p.getRotation()).height(p.getHeight()).width(p.getWidth()).number(p.getNumber()).mainBody(new LinkedList<>()).build(); + } + + + private AtomicTextBlock toAtomicTextBlock(AtomicTextBlockData atomicTextBlockData, + AtomicPositionBlockData atomicPositionBlockData, + SemanticNode parent, + Context context) { + + return AtomicTextBlock.builder() + .id(atomicTextBlockData.getId()) + .numberOnPage(atomicTextBlockData.getNumberOnPage()) + .page(getPage(atomicTextBlockData.getPage(), context)) + .boundary(new Boundary(atomicTextBlockData.getStart(), atomicTextBlockData.getEnd())) + .searchText(atomicTextBlockData.getSearchText()) + .lineBreaks(Ints.asList(atomicTextBlockData.getLineBreaks())) + .stringIdxToPositionIdx(Ints.asList(atomicPositionBlockData.getStringIdxToPositionIdx())) + .positions(toRectangle2DList(atomicPositionBlockData.getPositions())) + .parent(parent) + .build(); + } + + + private static List toRectangle2DList(float[][] positions) { + + return Arrays.stream(positions).map(floatArr -> (Rectangle2D) new Rectangle2D.Float(floatArr[0], floatArr[1], floatArr[2], floatArr[3])).toList(); + } + + + private PageNode getPage(Long pageIndex, Context context) { + + return context.pages.stream() + .filter(page -> page.getNumber() == Math.toIntExact(pageIndex)) + .findFirst() + .orElseThrow(() -> new NoSuchElementException(String.format("ClassificationPage with number %d not found", pageIndex))); + } + + + record Context( + DocumentData layoutParsingModel, + TableOfContents tableOfContents, + List pages, + List sections, + List atomicTextBlockData, + List atomicPositionBlockData) { + + } + +} diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com.knecon.fforesight.service.layoutparser.internal.api/mapper/PropertiesMapper.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com.knecon.fforesight.service.layoutparser.internal.api/mapper/PropertiesMapper.java new file mode 100644 index 
0000000..8c9e370 --- /dev/null +++ b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com.knecon.fforesight.service.layoutparser.internal.api/mapper/PropertiesMapper.java @@ -0,0 +1,101 @@ +package com.knecon.fforesight.service.layoutparser.internal.api.mapper; + +import java.awt.geom.Rectangle2D; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.ImageNode; +import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.ImageType; +import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.TableCellNode; +import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.TableNode; +import com.knecon.fforesight.service.layoutparser.internal.api.services.RectangleTransformations; + +public class PropertiesMapper { + + public static Map buildImageProperties(ImageNode image) { + + Map properties = new HashMap<>(); + properties.put("imageType", image.getImageType().toString()); + properties.put("transparency", String.valueOf(image.isTransparency())); + properties.put("position", RectangleTransformations.toString(image.getPosition())); + return properties; + } + + + public static Map buildTableCellProperties(TableCellNode tableCell) { + + Map properties = new HashMap<>(); + properties.put("row", String.valueOf(tableCell.getRow())); + properties.put("col", String.valueOf(tableCell.getCol())); + properties.put("header", String.valueOf(tableCell.isHeader())); + + if (tableCell.getPages().size() > 1 || tableCell.getBBox().keySet().size() > 1) { + throw new IllegalArgumentException("TableCell can only occur on a single page!"); + } + String bBoxString = RectangleTransformations.toString(tableCell.getBBox().get(tableCell.getPages().stream().findFirst().get())); + properties.put("bBox", bBoxString); + + return properties; + } + + + public static Map buildTableProperties(TableNode table) { + + Map 
properties = new HashMap<>(); + properties.put("numberOfRows", String.valueOf(table.getNumberOfRows())); + properties.put("numberOfCols", String.valueOf(table.getNumberOfCols())); + return properties; + } + + + public static void parseImageProperties(Map properties, ImageNode.ImageNodeBuilder builder) { + + builder.imageType(parseImageType(properties.get("imageType"))); + builder.transparency(Boolean.parseBoolean(properties.get("transparency"))); + builder.position(parseRectangle2D(properties.get("position"))); + } + + + public static void parseTableCellProperties(Map properties, TableCellNode.TableCellNodeBuilder builder) { + + builder.row(Integer.parseInt(properties.get("row"))); + builder.col(Integer.parseInt(properties.get("col"))); + builder.header(Boolean.parseBoolean(properties.get("header"))); + builder.bBox(parseRectangle2D(properties.get("bBox"))); + } + + + public static void parseTableProperties(Map properties, TableNode.TableNodeBuilder builder) { + + builder.numberOfRows(Integer.parseInt(properties.get("numberOfRows"))); + builder.numberOfCols(Integer.parseInt(properties.get("numberOfCols"))); + } + + + private static ImageType parseImageType(String imageType) { + + return switch (imageType) { + case "LOGO" -> ImageType.LOGO; + case "FORMULA" -> ImageType.FORMULA; + case "SIGNATURE" -> ImageType.SIGNATURE; + case "OCR" -> ImageType.OCR; + default -> ImageType.OTHER; + }; + } + + + public static String toString(Rectangle2D rectangle2D) { /* Serializes a rectangle as "x,y,width,height", the format parseRectangle2D reads back. */ + + return String.format(java.util.Locale.ROOT, "%f,%f,%f,%f", rectangle2D.getX(), rectangle2D.getY(), rectangle2D.getWidth(), rectangle2D.getHeight()); /* Locale.ROOT pins '.' as the decimal separator; a default locale with a decimal comma would collide with the ',' field separator */ + } + + + public static Rectangle2D parseRectangle2D(String bBox) { + + List floats = Arrays.stream(bBox.split(",")).map(Float::parseFloat).toList(); + return new Rectangle2D.Float(floats.get(0), floats.get(1), floats.get(2), floats.get(3)); + } + +} diff --git
a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com.knecon.fforesight.service.layoutparser.internal.api/services/EntityEnrichmentService.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com.knecon.fforesight.service.layoutparser.internal.api/services/EntityEnrichmentService.java new file mode 100644 index 0000000..a0d079c --- /dev/null +++ b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com.knecon.fforesight.service.layoutparser.internal.api/services/EntityEnrichmentService.java @@ -0,0 +1,10 @@ +package com.knecon.fforesight.service.layoutparser.internal.api.services; + +import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode; +import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock; + +public interface EntityEnrichmentService { + + void enrichEntity(EntityNode entity, TextBlock textBlock); + +} diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com.knecon.fforesight.service.layoutparser.internal.api/services/EntityInsertionService.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com.knecon.fforesight.service.layoutparser.internal.api/services/EntityInsertionService.java new file mode 100644 index 0000000..1b67b32 --- /dev/null +++ b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com.knecon.fforesight.service.layoutparser.internal.api/services/EntityInsertionService.java @@ -0,0 +1,65 @@ +package com.knecon.fforesight.service.layoutparser.internal.api.services; + +import java.util.NoSuchElementException; +import java.util.Set; + +import com.knecon.fforesight.service.layoutparser.internal.api.graph.Boundary; +import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents; +import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode; +import 
com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode; +import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SemanticNode; +import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock; + +import lombok.RequiredArgsConstructor; + +@RequiredArgsConstructor +public class EntityInsertionService { + + private final EntityEnrichmentService entityEnrichmentService; + + + public void addEntityToGraph(EntityNode entity, TableOfContents tableOfContents) { + + try { + SemanticNode containingNode = tableOfContents.getEntries() + .stream() + .map(TableOfContents.Entry::node) + .filter(node -> node.buildTextBlock().containsBoundary(entity.getBoundary())) + .findFirst() + .orElseThrow(() -> new NoSuchElementException("No containing Node found!")); + + containingNode.addThisToEntityIfIntersects(entity); + + TextBlock textBlock = entity.getDeepestFullyContainingNode().buildTextBlock(); + entityEnrichmentService.enrichEntity(entity, textBlock); + + addToPages(entity); + addToNodeEntitySets(entity); + + } catch (NoSuchElementException e) { + entityEnrichmentService.enrichEntity(entity, tableOfContents.buildTextBlock()); + entity.removeFromGraph(); + } + } + + + private void addToPages(EntityNode entity) { + + Set pages = entity.getDeepestFullyContainingNode().getPages(); + entity.getPages().addAll(pages); + pages.forEach(page -> page.getEntities().add(entity)); + } + + + private void addToNodeEntitySets(EntityNode entity) { + + entity.getIntersectingNodes().forEach(node -> node.getEntities().add(entity)); + } + + + private static Boundary toLineAfterBoundary(TextBlock textBlock, Boundary boundary) { + + return new Boundary(boundary.end(), textBlock.getNextLinebreak(boundary.end())); + } + +} diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com.knecon.fforesight.service.layoutparser.internal.api/services/RectangleTransformations.java 
b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com.knecon.fforesight.service.layoutparser.internal.api/services/RectangleTransformations.java new file mode 100644 index 0000000..6e0d68e --- /dev/null +++ b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com.knecon.fforesight.service.layoutparser.internal.api/services/RectangleTransformations.java @@ -0,0 +1,95 @@ +package com.knecon.fforesight.service.layoutparser.internal.api.services; + +import static java.lang.String.format; + +import java.awt.geom.Area; +import java.awt.geom.Rectangle2D; +import java.util.Arrays; +import java.util.List; +import java.util.Set; +import java.util.function.BiConsumer; +import java.util.function.BinaryOperator; +import java.util.function.Function; +import java.util.function.Supplier; +import java.util.stream.Collector; + +import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.AtomicTextBlock; + +import lombok.experimental.UtilityClass; + +@UtilityClass +public class RectangleTransformations { + + public static Rectangle2D pad(Rectangle2D rectangle2D, int deltaX, int deltaY) { + + return new Rectangle2D.Double(rectangle2D.getMinX() - deltaX, rectangle2D.getMinY() - deltaY, rectangle2D.getWidth() + 2 * deltaX, rectangle2D.getHeight() + 2 * deltaY); + } + + + public static Rectangle2D bBoxUnionAtomicTextBlock(List atomicTextBlocks) { + + return atomicTextBlocks.stream().flatMap(atomicTextBlock -> atomicTextBlock.getPositions().stream()).collect(new Rectangle2DUnion()); + } + + + public static Rectangle2D rectangleUnion(List rectangle2DList) { + + return rectangle2DList.stream().collect(new Rectangle2DUnion()); + } + + + public static String toString(Rectangle2D rectangle2D) { /* Serializes a rectangle as "x,y,width,height", the format parseRectangle2D reads back. */ + + return format(java.util.Locale.ROOT, "%f,%f,%f,%f", rectangle2D.getX(), rectangle2D.getY(), rectangle2D.getWidth(), rectangle2D.getHeight()); /* Locale.ROOT pins '.' as the decimal separator; a default locale with a decimal comma would collide with the ',' field separator */ + } + + + public static Rectangle2D parseRectangle2D(String bBox) { + + List floats =
Arrays.stream(bBox.split(",")).map(Float::parseFloat).toList(); + return new Rectangle2D.Float(floats.get(0), floats.get(1), floats.get(2), floats.get(3)); + } + + + private static class Rectangle2DUnion implements Collector { + + @Override + public Supplier supplier() { + + return Area::new; + } + + + @Override + public BiConsumer accumulator() { + + return (area, rectangle2D) -> area.add(new Area(rectangle2D)); + } + + + @Override + public BinaryOperator combiner() { + + return (area1, area2) -> { + area1.add(area2); + return area1; + }; + } + + + @Override + public Function finisher() { + + return Area::getBounds2D; + } + + + @Override + public Set characteristics() { + + return Set.of(Characteristics.CONCURRENT, Characteristics.UNORDERED); + } + + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/pom.xml b/layoutparser-service/layoutparser-service-processor/pom.xml new file mode 100644 index 0000000..c9e1b73 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/pom.xml @@ -0,0 +1,161 @@ + + + 4.0.0 + + + com.knecon.fforesight + layoutparser-service + 1.0.0 + + + layoutparser-service-processor + 1.0.0 + + + + + com.iqser.red.service + persistence-service-internal-api-v1 + 2.36.0 + + + + com.knecon.fforesight + layoutparser-service-internal-api + ${project.version} + + + + com.iqser.red.commons + spring-commons + 6.2.0 + + + + com.iqser.red.commons + storage-commons + 1.13.0 + + + + com.dslplatform + dsl-json-java8 + 1.10.0 + + + + org.apache.pdfbox + pdfbox + ${pdfbox.version} + + + + org.apache.pdfbox + pdfbox-tools + ${pdfbox.version} + + + + com.google.guava + guava + 31.1-jre + + + + com.fasterxml.jackson.module + jackson-module-afterburner + ${jackson.version} + + + + com.fasterxml.jackson.datatype + jackson-datatype-jsr310 + ${jackson.version} + + + + org.springframework.boot + spring-boot-starter-security + ${spring.version} + + + + org.springframework.boot + spring-boot-starter-web + ${spring.version} + + + + 
org.projectlombok + lombok + true + + + + org.springframework.cloud + spring-cloud-starter-openfeign + 4.0.2 + + + + org.springframework.boot + spring-boot-starter-amqp + ${spring.version} + + + + + + + org.springframework.boot + spring-boot-maven-plugin + + + + org.projectlombok + lombok + + + + + + + + + spring-milestones + Spring Milestones + https://repo.spring.io/milestone + + false + + + + spring-snapshots + Spring Snapshots + https://repo.spring.io/snapshot + + false + + + + + + spring-milestones + Spring Milestones + https://repo.spring.io/milestone + + false + + + + spring-snapshots + Spring Snapshots + https://repo.spring.io/snapshot + + false + + + + + diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingService.java new file mode 100644 index 0000000..7236dec --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingService.java @@ -0,0 +1,114 @@ +package com.knecon.fforesight.service.layoutparser.processor; + +import static java.lang.String.format; + +import java.io.IOException; + +import org.apache.pdfbox.pdmodel.PDDocument; +import org.springframework.stereotype.Service; + +import com.knecon.fforesight.service.layoutparser.internal.api.graph.DocumentGraph; +import com.knecon.fforesight.service.layoutparser.internal.api.mapper.DocumentDataMapper; +import com.knecon.fforesight.service.layoutparser.processor.adapter.CvTableParsingAdapter; +import com.knecon.fforesight.service.layoutparser.processor.adapter.ImageServiceResponseAdapter; +import com.knecon.fforesight.service.layoutparser.processor.adapter.model.image.ImageServiceResponse; +import 
com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableServiceResponse; +import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationDocument; +import com.knecon.fforesight.service.layoutparser.processor.classification.service.ClassificationService; +import com.knecon.fforesight.service.layoutparser.processor.classification.service.PdfParsingService; +import com.knecon.fforesight.service.layoutparser.processor.classification.service.SectionsBuilderService; +import com.knecon.fforesight.service.layoutparser.processor.factory.DocumentGraphFactory; +import com.knecon.fforesight.service.layoutparser.processor.queue.LayoutParsingFinishedEvent; +import com.knecon.fforesight.service.layoutparser.processor.queue.LayoutParsingRequest; + +import lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; + +@Slf4j +@Service +@RequiredArgsConstructor +public class LayoutParsingService { + + private final ImageServiceResponseAdapter imageServiceResponseAdapter; + private final CvTableParsingAdapter cvTableParsingAdapter; + private final LayoutParsingStorageService layoutParsingStorageService; + private final PdfParsingService pdfParsingService; + private final ClassificationService classificationService; + private final SectionsBuilderService sectionsBuilderService; + private final DocumentGraphFactory documentGraphFactory; + private final DocumentDataMapper documentDataMapper; + + + public LayoutParsingFinishedEvent parseLayoutAndSaveFilesToStorage(LayoutParsingRequest layoutParsingRequest) { + + PDDocument originDocument; + try { + originDocument = layoutParsingStorageService.getOriginFile(layoutParsingRequest.getOriginFileStorageId()); + } catch (IOException e) { + log.error(e.toString()); + return LayoutParsingFinishedEvent.builder() + .status(400) + .message(format("Origin PDF File with id %s could not be loaded!", layoutParsingRequest.getPageFileStorageId())) + .build(); + } + + ImageServiceResponse 
imageServiceResponse = new ImageServiceResponse(); + if (layoutParsingRequest.getImagesFileStorageId().isPresent()) { + try { + imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.getPageFileStorageId()); + } catch (IOException e) { + log.error(e.toString()); + return LayoutParsingFinishedEvent.builder() + .status(400) + .message(format("Image Service File with id %s could not be loaded!", layoutParsingRequest.getImagesFileStorageId())) + .build(); + } + } + + TableServiceResponse tableServiceResponse = new TableServiceResponse(); + if (layoutParsingRequest.getTablesFileStorageId().isPresent()) { + try { + tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.getPageFileStorageId()); + } catch (IOException e) { + log.error(e.toString()); + return LayoutParsingFinishedEvent.builder() + .status(400) + .message(format("CV Table Parsing File with id %s could not be loaded!", layoutParsingRequest.getPageFileStorageId())) + .build(); + } + } + + DocumentGraph documentGraph = parseLayout(originDocument, imageServiceResponse, tableServiceResponse); + + try { + layoutParsingStorageService.storeDocumentData(layoutParsingRequest, documentDataMapper.toDocumentData(documentGraph)); + } catch (IOException e) { + log.error("Parsed Document files could not be saved!"); + log.error(e.getMessage()); + return LayoutParsingFinishedEvent.builder().status(500).message("Files could not be saved").build(); + } + return LayoutParsingFinishedEvent.builder() + .status(200) + .message(format("Layout parsing is finished and files have been saved with Ids:\n Structure: %s\nText: %s\nPositions: %s\nPageData: %s", + layoutParsingRequest.getStructureFileStorageId(), + layoutParsingRequest.getTextBlockFileStorageId(), + layoutParsingRequest.getPositionBlockFileStorageId(), + layoutParsingRequest.getPageFileStorageId())) + .build(); + } + + + public DocumentGraph parseLayout(PDDocument originDocument, ImageServiceResponse 
imageServiceResponse, TableServiceResponse tableServiceResponse) { + + ClassificationDocument classificationDocument = pdfParsingService.parseDocument(originDocument, + cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse), + imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse)); + + classificationService.classifyDocument(classificationDocument); + + sectionsBuilderService.buildSections(classificationDocument); + + return documentGraphFactory.buildDocumentGraph(classificationDocument); + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingStorageService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingStorageService.java new file mode 100644 index 0000000..70f6715 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingStorageService.java @@ -0,0 +1,126 @@ +package com.knecon.fforesight.service.layoutparser.processor; + +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.nio.file.Files; + +import org.apache.commons.io.IOUtils; +import org.apache.pdfbox.Loader; +import org.apache.pdfbox.io.MemoryUsageSetting; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.springframework.stereotype.Service; + +import com.fasterxml.jackson.databind.ObjectMapper; +import com.iqser.red.storage.commons.service.StorageService; +import com.knecon.fforesight.service.layoutparser.internal.api.data.AtomicPositionBlockData; +import com.knecon.fforesight.service.layoutparser.internal.api.data.AtomicTextBlockData; +import com.knecon.fforesight.service.layoutparser.internal.api.data.DocumentData; +import com.knecon.fforesight.service.layoutparser.internal.api.data.PageData; +import 
com.knecon.fforesight.service.layoutparser.internal.api.data.TableOfContentsData; +import com.knecon.fforesight.service.layoutparser.processor.adapter.model.image.ImageServiceResponse; +import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableServiceResponse; +import com.knecon.fforesight.service.layoutparser.processor.multitenancy.TenantContext; +import com.knecon.fforesight.service.layoutparser.processor.queue.LayoutParsingRequest; + +import lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; + +@Slf4j +@Service +@RequiredArgsConstructor +public class LayoutParsingStorageService { + + private final StorageService storageService; + private final ObjectMapper objectMapper; + + + public PDDocument getOriginFile(String storageId) throws IOException { + + try (var originDocumentInputStream = storageService.getObject(TenantContext.getTenantId(), storageId).getInputStream()) { + File tempFile = createTempFile("document", ".pdf"); + try (var tempFileOutputStream = new FileOutputStream(tempFile)) { + IOUtils.copy(originDocumentInputStream, tempFileOutputStream); + } + return Loader.loadPDF(tempFile, MemoryUsageSetting.setupMixed(67108864L)); + } + } + + + public ImageServiceResponse getImagesFile(String storageId) throws IOException { + + try (InputStream inputStream = storageService.getObject(TenantContext.getTenantId(), storageId).getInputStream()) { + + return objectMapper.readValue(inputStream, ImageServiceResponse.class); + } + } + + + public TableServiceResponse getTablesFile(String storageId) throws IOException { + + try (var tableClassificationStream = storageService.getObject(TenantContext.getTenantId(), storageId).getInputStream()) { + + return objectMapper.readValue(tableClassificationStream, TableServiceResponse.class); + + } + } + + + public void storeDocumentData(LayoutParsingRequest layoutParsingRequest, DocumentData documentData) throws IOException { + + storageService.storeJSONObject(TenantContext.getTenantId(), 
layoutParsingRequest.getStructureFileStorageId(), documentData.getTableOfContents()); + storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.getTextBlockFileStorageId(), documentData.getAtomicTextBlocks()); + storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.getPositionBlockFileStorageId(), documentData.getAtomicPositionBlocks()); + storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.getPageFileStorageId(), documentData.getPages()); + + } + + + public DocumentData readDocumentData(LayoutParsingRequest layoutParsingRequest) throws IOException { + + PageData[] pageData = storageService.readJSONObject(TenantContext.getTenantId(), layoutParsingRequest.getPageFileStorageId(), PageData[].class); + AtomicTextBlockData[] atomicTextBlockData = storageService.readJSONObject(TenantContext.getTenantId(), + layoutParsingRequest.getTextBlockFileStorageId(), + AtomicTextBlockData[].class); + AtomicPositionBlockData[] atomicPositionBlockData = storageService.readJSONObject(TenantContext.getTenantId(), + layoutParsingRequest.getPositionBlockFileStorageId(), + AtomicPositionBlockData[].class); + TableOfContentsData tableOfContentsData = storageService.readJSONObject(TenantContext.getTenantId(), + layoutParsingRequest.getStructureFileStorageId(), + TableOfContentsData.class); + + return DocumentData.builder() + .tableOfContents(tableOfContentsData) + .atomicPositionBlocks(atomicPositionBlockData) + .atomicTextBlocks(atomicTextBlockData) + .pages(pageData) + .build(); + } + + + private File createTempFile(String filenamePrefix, String filenameSuffix) throws IOException { + + File tempFile = Files.createTempFile(filenamePrefix, filenameSuffix).toFile(); + setRWPermissionsOnlyForOwner(tempFile); + + return tempFile; + } + + + // We don't need to check the results of the permission setters below, + // since we're manipulating a file we created ourselves. 
+ @SuppressWarnings({"ResultOfMethodCallIgnored", "squid:S899"}) + private void setRWPermissionsOnlyForOwner(File tempFile) { + + try { + tempFile.setReadable(true, true); + tempFile.setWritable(true, true); + tempFile.setExecutable(false); + } catch (SecurityException ex) { + // This should never happen since we're creating a temp file ourselves. + log.warn("Caught an exception during temp file creation. This should not happend. Check the code.", ex); + } + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutparserServiceProcessorConfiguration.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutparserServiceProcessorConfiguration.java new file mode 100644 index 0000000..4c35fd3 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutparserServiceProcessorConfiguration.java @@ -0,0 +1,10 @@ +package com.knecon.fforesight.service.layoutparser.processor; + +import org.springframework.context.annotation.ComponentScan; +import org.springframework.context.annotation.Configuration; + +@Configuration +@ComponentScan +public class LayoutparserServiceProcessorConfiguration { + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/CvTableParsingAdapter.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/CvTableParsingAdapter.java new file mode 100644 index 0000000..31bf171 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/CvTableParsingAdapter.java @@ -0,0 +1,49 @@ +package com.knecon.fforesight.service.layoutparser.processor.adapter; + +import 
java.util.ArrayList; +import java.util.Collection; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.springframework.stereotype.Service; + +import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.CvParsedTableCell; +import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableServiceResponse; + +import lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; + +@Slf4j +@Service +@RequiredArgsConstructor +public class CvTableParsingAdapter { + + public Map> buildCvParsedTablesPerPage(TableServiceResponse tableServiceResponse) { + + Map> tableCells = new HashMap<>(); + tableServiceResponse.getData() + .forEach(tableData -> tableCells.computeIfAbsent(tableData.getPageInfo().getNumber(), tableCell -> new ArrayList<>()) + .addAll(convertTableCells(tableData.getTableCells()))); + + return tableCells; + } + + + private Collection convertTableCells(List tableCells) { + + List cvParsedTableCells = new ArrayList<>(); + + tableCells.forEach(t -> cvParsedTableCells.add(com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.CvParsedTableCell.builder() + .y0(t.getY0()) + .x1(t.getX1()) + .y1(t.getY1()) + .x0(t.getX0()) + .width(t.getWidth()) + .height(t.getHeight()) + .build())); + + return cvParsedTableCells; + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/ImageServiceResponseAdapter.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/ImageServiceResponseAdapter.java new file mode 100644 index 0000000..5517f3a --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/ImageServiceResponseAdapter.java @@ -0,0 +1,67 @@ +package com.knecon.fforesight.service.layoutparser.processor.adapter; + 
+import java.awt.geom.Rectangle2D; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Locale; +import java.util.Map; + +import org.springframework.stereotype.Service; + +import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.ImageType; +import com.knecon.fforesight.service.layoutparser.processor.adapter.model.image.ImageServiceResponse; +import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationPage; +import com.knecon.fforesight.service.layoutparser.processor.classification.dto.image.ClassifiedImage; + +import lombok.RequiredArgsConstructor; + +@Service +@RequiredArgsConstructor +public class ImageServiceResponseAdapter { + + + public Map> buildClassifiedImagesPerPage(ImageServiceResponse imageServiceResponse ) { + + Map> images = new HashMap<>(); + imageServiceResponse.getData().forEach(imageMetadata -> { + var classification = imageMetadata.getFilters().isAllPassed() ? ImageType.valueOf(imageMetadata.getClassification() + .getLabel() + .toUpperCase(Locale.ROOT)) : ImageType.OTHER; + images.computeIfAbsent(imageMetadata.getPosition().getPageNumber(), x -> new ArrayList<>()) + .add(new ClassifiedImage(new Rectangle2D.Double(imageMetadata.getPosition().getX1(), + imageMetadata.getPosition().getY1(), + imageMetadata.getGeometry().getWidth(), + imageMetadata.getGeometry().getHeight()), classification, imageMetadata.isAlpha(), imageMetadata.getPosition().getPageNumber())); + }); + + // Currently This is a copy but, it will be changed later because i don' t think that we should unclassified images. + imageServiceResponse.getDataCV().forEach(imageMetadata -> { + var classification = imageMetadata.getFilters().isAllPassed() ? 
ImageType.valueOf(imageMetadata.getClassification() + .getLabel() + .toUpperCase(Locale.ROOT)) : ImageType.OTHER; + images.computeIfAbsent(imageMetadata.getPosition().getPageNumber(), x -> new ArrayList<>()) + .add(new ClassifiedImage(new Rectangle2D.Double(imageMetadata.getPosition().getX1(), + imageMetadata.getPosition().getY1(), + imageMetadata.getGeometry().getWidth(), + imageMetadata.getGeometry().getHeight()), classification, imageMetadata.isAlpha(), imageMetadata.getPosition().getPageNumber())); + }); + + return images; + } + + + public void findOcr(ClassificationPage classificationPage) { + + classificationPage.getImages().forEach(image -> { + if (image.getImageType().equals(ImageType.OTHER)) { + classificationPage.getTextBlocks().forEach(textblock -> { + if (image.getPosition().contains(textblock.getMinX(), textblock.getMinY(), textblock.getWidth(), textblock.getHeight())) { + image.setImageType(ImageType.OCR); + } + }); + } + }); + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/model/image/Classification.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/model/image/Classification.java new file mode 100644 index 0000000..a743e5b --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/model/image/Classification.java @@ -0,0 +1,17 @@ +package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image; + +import java.util.HashMap; +import java.util.Map; + +import com.dslplatform.json.CompiledJson; + +import lombok.Data; + +@Data +@CompiledJson +public class Classification { + + private Map probabilities = new HashMap<>(); + private String label; + +} diff --git 
/**
 * Geometry part of the image filter results: holds one {@link ImageSize} and one {@link Format} entry.
 * Plain JSON model; accessors are generated by Lombok {@code @Data}, the dsl-json binding by {@code @CompiledJson}.
 */
@Data
@CompiledJson
public class FilterGeometry {

    private ImageSize imageSize;
    private Format imageFormat;

}

/**
 * Filter results attached to one image metadata entry.
 * {@code allPassed} is the flag ImageServiceResponseAdapter reads to decide whether the
 * classifier label is kept or replaced by {@code ImageType.OTHER}.
 */
@Data
@CompiledJson
public class Filters {

    private FilterGeometry geometry;
    private Probability probability;
    private boolean allPassed;

}
/**
 * Image-format entry of {@link FilterGeometry}: a quotient plus two boolean flags.
 * NOTE(review): presumably the quotient is an aspect ratio checked against limits - confirm with the image service.
 */
@Data
@CompiledJson
public class Format {

    private float quotient;
    private boolean tooTall;
    private boolean tooWide;

}

/**
 * Width and height of an image as reported in its metadata.
 * ImageServiceResponseAdapter uses both values as the extent of the image rectangle.
 */
@Data
@CompiledJson
public class Geometry {

    private float width;
    private float height;

}
/**
 * Top-level JSON model of the image-service result.
 * Both lists default to empty, so a freshly constructed instance can stand in for "no image data".
 */
@Data
@CompiledJson
public class ImageServiceResponse {

    private String dossierId;
    private String fileId;

    // Bound to JSON property "imageMetadata", with "data" accepted as an alias (Jackson);
    // "imageMetadata" is also registered as an alternative name for dsl-json.
    @JsonProperty(value = "imageMetadata")
    @JsonAlias("data")
    @JsonAttribute(alternativeNames = {"imageMetadata"})
    private List data = new ArrayList<>();

    // Second metadata list; ImageServiceResponseAdapter maps it exactly like "data".
    private List dataCV = new ArrayList<>();


    // Explicit setter carrying the same JSON name annotations as the field.
    @JsonProperty(value = "imageMetadata")
    @JsonAlias("data")
    @JsonAttribute(alternativeNames = {"imageMetadata"})
    public void setData(List data) {this.data = data;}

}

/**
 * Image-size entry of {@link FilterGeometry}: a quotient plus two boolean flags.
 * NOTE(review): what the quotient is measured against is not visible here - confirm with the image service.
 */
@Data
@CompiledJson
public class ImageSize {

    private float quotient;
    private boolean tooLarge;
    private boolean tooSmall;

}
/**
 * One image entry of the image-service result: classifier output, position, size, filter
 * results and an alpha flag. ImageServiceResponseAdapter turns each entry into a ClassifiedImage.
 */
@Data
@CompiledJson
public class Metadata {

    private Classification classification;
    private Position position;
    private Geometry geometry;
    private Filters filters;
    private boolean alpha;

}

/**
 * Position of an image: two x and two y coordinates plus the page number.
 * ImageServiceResponseAdapter uses {@code x1}/{@code y1} as the rectangle origin and
 * {@code pageNumber} as the grouping key; {@code x2}/{@code y2} are not read there.
 */
@Data
@CompiledJson
public class Position {

    private float x1;
    private float x2;
    private float y1;
    private float y2;
    private int pageNumber;

}
/**
 * Probability part of the image filter results; carries a single {@code unconfident} flag.
 */
@Data
@CompiledJson
public class Probability {

    private boolean unconfident;

}

/**
 * Page information attached to one parsed table in the table-service result.
 * CvTableParsingAdapter uses {@code number} as the key when grouping table cells per page.
 */
@Data
@CompiledJson
public class CvParsedPageInfo {

    private int number;
    private int rotation;
    private float width;
    private float height;

}
/**
 * Transport model of one table cell in the table-service result.
 * CvTableParsingAdapter copies all six values into the classification DTO of the same simple name.
 */
@Data
@CompiledJson
public class CvParsedTableCell {

    private float x0;
    private float y0;
    private float x1;
    private float y1;
    private float width;
    private float height;

}

/**
 * One parsed table of the table-service result: the page it belongs to and its cells.
 * The cell list defaults to empty.
 */
@Data
@CompiledJson
public class CvParsedTableModel {

    private CvParsedPageInfo pageInfo;
    private List tableCells = new ArrayList<>();

}
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/model/table/TableServiceResponse.java @@ -0,0 +1,22 @@ +package com.knecon.fforesight.service.layoutparser.processor.adapter.model.table; + +import java.util.ArrayList; +import java.util.List; + +import com.dslplatform.json.CompiledJson; + +import lombok.Data; + +@Data +@CompiledJson +public class TableServiceResponse { + + private String dossierId; + private String fileId; + private String operation; + private String targetFileExtension; + private String responseFileExtension; + + private List data = new ArrayList<>(); + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/AbstractTextContainer.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/AbstractTextContainer.java new file mode 100644 index 0000000..84325a0 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/AbstractTextContainer.java @@ -0,0 +1,71 @@ +package com.knecon.fforesight.service.layoutparser.processor.classification.dto; + +import java.awt.geom.Rectangle2D; + +import com.dslplatform.json.JsonAttribute; +import com.fasterxml.jackson.annotation.JsonIgnore; +import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock; +import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.TextBlockOrientation; + +import lombok.AllArgsConstructor; +import lombok.Data; +import lombok.NoArgsConstructor; + +@Data +@AllArgsConstructor +@NoArgsConstructor +public abstract class AbstractTextContainer { + + protected float minX; + protected float maxX; + protected float minY; + protected float maxY; + protected String 
classification; + protected int page; + + private TextBlockOrientation orientation = TextBlockOrientation.NONE; + + + public abstract String getText(); + + + public boolean containsBlock(ClassificationTextBlock other) { + + return this.minX <= other.getPdfMinX() && this.maxX >= other.getPdfMaxX() && this.minY >= other.getPdfMinY() && this.maxY <= other.getPdfMaxY(); + } + + + public boolean contains(AbstractTextContainer other) { + + return this.minX <= other.minX && this.maxX >= other.maxX && this.minY >= other.minY && this.maxY <= other.maxY; + } + + + public boolean contains(Rectangle2D other) { + + return other.contains(minX, minY, getWidth(), getHeight()); + } + + + @JsonIgnore + @JsonAttribute(ignore = true) + public float getHeight() { + + return maxY - minY; + } + + + @JsonIgnore + @JsonAttribute(ignore = true) + public float getWidth() { + + return maxX - minX; + } + + + public boolean intersectsY(AbstractTextContainer atc) { + + return this.minY <= atc.getMaxY() && this.maxY >= atc.getMinY(); + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/ClassificationDocument.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/ClassificationDocument.java new file mode 100644 index 0000000..df2af76 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/ClassificationDocument.java @@ -0,0 +1,27 @@ +package com.knecon.fforesight.service.layoutparser.processor.classification.dto; + +import java.util.ArrayList; +import java.util.List; + +import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.StringFrequencyCounter; +import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.UnclassifiedText; + +import lombok.Data; 
/**
 * Mutable holder for the document-level results of classification: the pages, sections, headers,
 * footers and unclassified texts, plus frequency counters for text heights, font sizes, fonts and
 * font styles. Accessors and the no-args constructor are generated by Lombok.
 */
@Data
@NoArgsConstructor
public class ClassificationDocument {

    // NOTE(review): the element types of these lists are not visible in this view of the source — confirm against the original file.
    private List pages = new ArrayList<>();
    private List sections = new ArrayList<>();
    private List headers = new ArrayList<>();
    private List footers = new ArrayList<>();
    private List unclassifiedTexts = new ArrayList<>();
    // Frequency counters; every counter starts out empty.
    private FloatFrequencyCounter textHeightCounter = new FloatFrequencyCounter();
    private FloatFrequencyCounter fontSizeCounter = new FloatFrequencyCounter();
    private StringFrequencyCounter fontCounter = new StringFrequencyCounter();
    private StringFrequencyCounter fontStyleCounter = new StringFrequencyCounter();
    // Presumably records whether headlines were detected in the document — verify against the code that sets it.
    private boolean headlines;

}
/**
 * Value holder for the text blocks that make up a header. Accessors and the single all-args
 * constructor are generated by Lombok.
 */
@Data
@AllArgsConstructor
public class ClassificationHeader {

    // Presumably a list of ClassificationTextBlock (the type is imported by this file) — the type argument is not visible here; confirm.
    private List textBlocks;

}
pageNumber; + + private FloatFrequencyCounter textHeightCounter = new FloatFrequencyCounter(); + private FloatFrequencyCounter fontSizeCounter = new FloatFrequencyCounter(); + private StringFrequencyCounter fontCounter = new StringFrequencyCounter(); + private StringFrequencyCounter fontStyleCounter = new StringFrequencyCounter(); + + private float pageWidth; + private float pageHeight; + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/ClassificationSection.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/ClassificationSection.java new file mode 100644 index 0000000..4e1b6fa --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/ClassificationSection.java @@ -0,0 +1,38 @@ +package com.knecon.fforesight.service.layoutparser.processor.classification.dto; + +import java.util.ArrayList; +import java.util.List; + +import com.knecon.fforesight.service.layoutparser.processor.classification.dto.image.ClassifiedImage; +import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.Table; + +import lombok.Data; +import lombok.NoArgsConstructor; + +@Data +@NoArgsConstructor +public class ClassificationSection implements Comparable { + + private List pageBlocks = new ArrayList<>(); + private List images = new ArrayList<>(); + private String headline; + + + public List getTables() { + + List
tables = new ArrayList<>(); + pageBlocks.forEach(block -> { + if (block instanceof Table) { + tables.add((Table) block); + } + }); + return tables; + } + + + @Override + public int compareTo(Object o) { + + return 0; + } +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/FloatFrequencyCounter.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/FloatFrequencyCounter.java new file mode 100755 index 0000000..f970120 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/FloatFrequencyCounter.java @@ -0,0 +1,77 @@ +package com.knecon.fforesight.service.layoutparser.processor.classification.dto; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +import lombok.Getter; + +public class FloatFrequencyCounter { + + @Getter + Map countPerValue = new HashMap<>(); + + + public void add(float value) { + + if (!countPerValue.containsKey(value)) { + countPerValue.put(value, 1); + } else { + countPerValue.put(value, countPerValue.get(value) + 1); + } + } + + + public void addAll(Map otherCounter) { + + for (Map.Entry entry : otherCounter.entrySet()) { + if (countPerValue.containsKey(entry.getKey())) { + countPerValue.put(entry.getKey(), countPerValue.get(entry.getKey()) + entry.getValue()); + } else { + countPerValue.put(entry.getKey(), entry.getValue()); + } + } + } + + + public Float getMostPopular() { + + Map.Entry mostPopular = null; + for (Map.Entry entry : countPerValue.entrySet()) { + if (mostPopular == null || entry.getValue() >= mostPopular.getValue()) { + mostPopular = entry; + } + } + return mostPopular != null ? 
/**
 * A {@link Rectangle2D.Float} with top/left/bottom/right accessors and a few overlap helpers.
 * <p>
 * "Top" is the minimum Y and "bottom" the maximum Y of the rectangle; "left" is the minimum X and
 * "right" the maximum X. Inside this class the simple name {@code Float} refers to
 * {@code Rectangle2D.Float}, which is why {@code java.lang.Float} is always written fully qualified.
 */
@SuppressWarnings("all")
public class Rectangle extends Rectangle2D.Float {

    protected static final float VERTICAL_COMPARISON_THRESHOLD = 0.4f;
    /**
     * Ill-defined comparator, from when Rectangle was Comparable.
     * <p>
     * see https://github.com/tabulapdf/tabula-java/issues/116
     *
     * @deprecated with no replacement
     */
    @Deprecated
    public static final Comparator<Rectangle> ILL_DEFINED_ORDER = new Comparator<Rectangle>() {
        @Override
        public int compare(Rectangle o1, Rectangle o2) {

            if (o1.equals(o2)) {
                return 0;
            }
            if (o1.verticalOverlap(o2) > VERTICAL_COMPARISON_THRESHOLD) {
                // Rectangles on the same "line": order by X, reversed only if both report -1 from isLtrDominant().
                return o1.isLtrDominant() == -1 && o2.isLtrDominant() == -1 ? -java.lang.Double.compare(o1.getX(), o2.getX()) : java.lang.Double.compare(o1.getX(), o2.getX());
            } else {
                return java.lang.Float.compare(o1.getBottom(), o2.getBottom());
            }
        }
    };


    /**
     * Creates an empty rectangle at the origin.
     */
    public Rectangle() {

        super();
    }


    /**
     * Creates a rectangle. Note the argument order: {@code top} (Y) comes before {@code left} (X).
     *
     * @param top    minimum Y
     * @param left   minimum X
     * @param width  extent along X
     * @param height extent along Y
     */
    public Rectangle(float top, float left, float width, float height) {

        super();
        this.setRect(left, top, width, height);
    }


    /**
     * @param rectangles the rectangles to enclose; may be empty
     * @return minimum bounding box that contains all the rectangles, or an empty rectangle at the
     * origin if the list is empty
     */
    public static Rectangle boundingBoxOf(List<? extends Rectangle> rectangles) {

        if (rectangles.isEmpty()) {
            return new Rectangle();
        }

        float minx = java.lang.Float.MAX_VALUE;
        float miny = java.lang.Float.MAX_VALUE;
        // Float.MIN_VALUE is the smallest POSITIVE float, so it must not be used as the seed for a
        // maximum: rectangles with negative coordinates would never lower it.
        float maxx = -java.lang.Float.MAX_VALUE;
        float maxy = -java.lang.Float.MAX_VALUE;

        for (Rectangle r : rectangles) {
            minx = (float) Math.min(r.getMinX(), minx);
            miny = (float) Math.min(r.getMinY(), miny);
            maxx = (float) Math.max(r.getMaxX(), maxx);
            maxy = (float) Math.max(r.getMaxY(), maxy);
        }
        return new Rectangle(miny, minx, maxx - minx, maxy - miny);
    }


    /**
     * Compares using {@link #ILL_DEFINED_ORDER}; see the caveats documented there.
     *
     * @param other the rectangle to compare with
     * @return the comparator's result
     */
    public int compareTo(Rectangle other) {

        return ILL_DEFINED_ORDER.compare(this, other);
    }


    /**
     * Hook used by {@link #ILL_DEFINED_ORDER}: the X ordering is reversed only when both rectangles
     * return {@code -1}. This base implementation always returns {@code 0}.
     *
     * @return {@code 0}
     */
    public int isLtrDominant() {

        return 0;
    }


    /**
     * @return {@code width * height}
     */
    public float getArea() {

        return this.width * this.height;
    }


    /**
     * @param other the rectangle to compare with
     * @return the length of the shared Y range of both rectangles, or {@code 0} if there is none
     */
    public float verticalOverlap(Rectangle other) {

        return Math.max(0, Math.min(this.getBottom(), other.getBottom()) - Math.max(this.getTop(), other.getTop()));
    }


    /**
     * @param other the rectangle to compare with
     * @return {@code true} if {@link #verticalOverlap(Rectangle)} is strictly positive
     */
    public boolean verticallyOverlaps(Rectangle other) {

        return verticalOverlap(other) > 0;
    }


    /**
     * @param other the rectangle to compare with
     * @return the length of the shared X range of both rectangles, or {@code 0} if there is none
     */
    public float horizontalOverlap(Rectangle other) {

        return Math.max(0, Math.min(this.getRight(), other.getRight()) - Math.max(this.getLeft(), other.getLeft()));
    }


    /**
     * @param other the rectangle to compare with
     * @return {@code true} if {@link #horizontalOverlap(Rectangle)} is strictly positive
     */
    public boolean horizontallyOverlaps(Rectangle other) {

        return horizontalOverlap(other) > 0;
    }


    /**
     * Shared Y range of both rectangles divided by the height of the shorter one.
     *
     * @param other the rectangle to compare with
     * @return the ratio, or {@code 0} if none of the four overlap configurations applies; the
     * result is not a finite number when the shorter rectangle has height {@code 0}
     */
    public float verticalOverlapRatio(Rectangle other) {

        float rv = 0, delta = Math.min(this.getBottom() - this.getTop(), other.getBottom() - other.getTop());

        if (other.getTop() <= this.getTop() && this.getTop() <= other.getBottom() && other.getBottom() <= this.getBottom()) {
            // other starts above (or at) this and ends inside this
            rv = (other.getBottom() - this.getTop()) / delta;
        } else if (this.getTop() <= other.getTop() && other.getTop() <= this.getBottom() && this.getBottom() <= other.getBottom()) {
            // this starts above (or at) other and ends inside other
            rv = (this.getBottom() - other.getTop()) / delta;
        } else if (this.getTop() <= other.getTop() && other.getTop() <= other.getBottom() && other.getBottom() <= this.getBottom()) {
            // other lies completely inside this
            rv = (other.getBottom() - other.getTop()) / delta;
        } else if (other.getTop() <= this.getTop() && this.getTop() <= this.getBottom() && this.getBottom() <= other.getBottom()) {
            // this lies completely inside other
            rv = (this.getBottom() - this.getTop()) / delta;
        }

        return rv;

    }


    /**
     * Intersection area divided by union area.
     *
     * @param other the rectangle to compare with
     * @return the ratio; {@code 0} if the union has no area (instead of the NaN that {@code 0/0}
     * would produce)
     */
    public float overlapRatio(Rectangle other) {

        double intersectionWidth = Math.max(0, Math.min(this.getRight(), other.getRight()) - Math.max(this.getLeft(), other.getLeft()));
        double intersectionHeight = Math.max(0, Math.min(this.getBottom(), other.getBottom()) - Math.max(this.getTop(), other.getTop()));
        double intersectionArea = Math.max(0, intersectionWidth * intersectionHeight);
        double unionArea = this.getArea() + other.getArea() - intersectionArea;

        if (unionArea == 0) {
            return 0f;
        }
        return (float) (intersectionArea / unionArea);
    }


    /**
     * Grows this rectangle in place to the union of itself and {@code other}.
     *
     * @param other the rectangle to include
     * @return this rectangle
     */
    public Rectangle merge(Rectangle other) {

        this.setRect(this.createUnion(other));
        return this;
    }


    /**
     * @return the minimum Y
     */
    public float getTop() {

        return (float) this.getMinY();
    }


    /**
     * Moves the top edge and adjusts the height so that the bottom edge stays where it is.
     *
     * @param top the new minimum Y
     */
    public void setTop(float top) {

        float deltaHeight = top - this.y;
        this.setRect(this.x, top, this.width, this.height - deltaHeight);
    }


    /**
     * @return the maximum X
     */
    public float getRight() {

        return (float) this.getMaxX();
    }


    /**
     * Moves the right edge by changing the width; the left edge stays where it is.
     *
     * @param right the new maximum X
     */
    public void setRight(float right) {

        this.setRect(this.x, this.y, right - this.x, this.height);
    }


    /**
     * @return the minimum X
     */
    public float getLeft() {

        return (float) this.getMinX();
    }


    /**
     * Moves the left edge and adjusts the width so that the right edge stays where it is.
     *
     * @param left the new minimum X
     */
    public void setLeft(float left) {

        float deltaWidth = left - this.x;
        this.setRect(left, this.y, this.width - deltaWidth, this.height);
    }


    /**
     * @return the maximum Y
     */
    public float getBottom() {

        return (float) this.getMaxY();
    }


    /**
     * Moves the bottom edge by changing the height; the top edge stays where it is.
     *
     * @param bottom the new maximum Y
     */
    public void setBottom(float bottom) {

        this.setRect(this.x, this.y, this.width, bottom - this.y);
    }


    /**
     * @return the four corners in the order left/top, right/top, right/bottom, left/bottom
     */
    public Point2D[] getPoints() {

        return new Point2D[]{new Point2D.Float(this.getLeft(), this.getTop()), new Point2D.Float(this.getRight(), this.getTop()), new Point2D.Float(this.getRight(),
                this.getBottom()), new Point2D.Float(this.getLeft(), this.getBottom())};
    }


    /**
     * @return the superclass representation with {@code bottom} and {@code right} appended before
     * the closing bracket
     */
    @Override
    public String toString() {

        StringBuilder sb = new StringBuilder();
        String s = super.toString();
        // drop the trailing ']' of the superclass output so the extra values can be appended
        sb.append(s.substring(0, s.length() - 1));
        sb.append(String.format(",bottom=%f,right=%f]", this.getBottom(), this.getRight()));
        return sb.toString();
    }

}
/**
 * Value holder describing one classified image: where it is, what type it was classified as,
 * whether it has transparency and which page it is on. Accessors and the required-args
 * constructor are generated by Lombok.
 */
@Data
@RequiredArgsConstructor
public class ClassifiedImage {

    // Position of the image as a rectangle; must not be null.
    @NonNull
    private Rectangle2D position;
    // Classification result; must not be null.
    @NonNull
    private ImageType imageType;
    // Not annotated, so it is not part of the generated constructor and starts out false.
    private boolean isAppendedToSection;
    // NOTE(review): @NonNull on a primitive cannot guard against null; it presumably only serves to
    // make the field a parameter of the generated required-args constructor — confirm against the Lombok docs.
    @NonNull
    private boolean hasTransparency;
    // NOTE(review): same remark as for hasTransparency.
    @NonNull
    private int page;

}
/**
 * Value holder for one table cell described by two corner coordinates ({@code x0/y0},
 * {@code x1/y1}) plus a width and a height. Accessors, builder and constructors are generated by
 * Lombok.
 * NOTE(review): name and package suggest the cells come from an external (computer-vision) table
 * parser; coordinate system and units are not visible here — confirm.
 */
@Data
@Builder
@AllArgsConstructor
// No field is final or @NonNull, so this presumably yields a no-argument constructor — confirm against the Lombok docs.
@RequiredArgsConstructor
public class CvParsedTableCell {

    private float x0;
    private float y0;
    private float x1;
    private float y1;
    private float width;
    private float height;

}
com.knecon.fforesight.service.layoutparser.processor.classification.utils.DoubleComparisons; + +import lombok.extern.slf4j.Slf4j; + +@Slf4j +@SuppressWarnings("all") +public class Ruling extends Line2D.Float { + + private static int PERPENDICULAR_PIXEL_EXPAND_AMOUNT = 2; + + + public Ruling(Point2D p1, Point2D p2) { + + super(p1, p2); + } + + + public static List cropRulingsToArea(List rulings, Rectangle2D area) { + + ArrayList rv = new ArrayList<>(); + for (Ruling r : rulings) { + if (r.intersects(area)) { + rv.add(r.intersect(area)); + } + } + return rv; + } + + + // log(n) implementation of find_intersections + // based on http://people.csail.mit.edu/indyk/6.838-old/handouts/lec2.pdf + public static Map findIntersections(List horizontals, List verticals) { + + class SortObject { + + protected SOType type; + protected float position; + protected Ruling ruling; + + + public SortObject(SOType type, float position, Ruling ruling) { + + this.type = type; + this.position = position; + this.ruling = ruling; + } + + } + + List sos = new ArrayList<>(); + + TreeMap tree = new TreeMap<>(new Comparator() { + @Override + public int compare(Ruling o1, Ruling o2) { + + return java.lang.Double.compare(o1.getTop(), o2.getTop()); + } + }); + + TreeMap rv = new TreeMap<>(new Comparator() { + @Override + public int compare(Point2D o1, Point2D o2) { + + if (o1.getY() > o2.getY()) { + return 1; + } + if (o1.getY() < o2.getY()) { + return -1; + } + if (o1.getX() > o2.getX()) { + return 1; + } + if (o1.getX() < o2.getX()) { + return -1; + } + return 0; + } + }); + + for (Ruling h : horizontals) { + sos.add(new SortObject(SOType.HLEFT, h.getLeft() - PERPENDICULAR_PIXEL_EXPAND_AMOUNT, h)); + sos.add(new SortObject(SOType.HRIGHT, h.getRight() + PERPENDICULAR_PIXEL_EXPAND_AMOUNT, h)); + } + + for (Ruling v : verticals) { + sos.add(new SortObject(SOType.VERTICAL, v.getLeft(), v)); + } + + Collections.sort(sos, new Comparator() { + @Override + public int compare(SortObject a, SortObject b) { 
+ + int rv; + if (DoubleComparisons.feq(a.position, b.position)) { + if (a.type == SOType.VERTICAL && b.type == SOType.HLEFT) { + rv = 1; + } else if (a.type == SOType.VERTICAL && b.type == SOType.HRIGHT) { + rv = -1; + } else if (a.type == SOType.HLEFT && b.type == SOType.VERTICAL) { + rv = -1; + } else if (a.type == SOType.HRIGHT && b.type == SOType.VERTICAL) { + rv = 1; + } else { + rv = java.lang.Double.compare(a.position, b.position); + } + } else { + return java.lang.Double.compare(a.position, b.position); + } + return rv; + } + }); + + for (SortObject so : sos) { + switch (so.type) { + case VERTICAL: + for (Map.Entry h : tree.entrySet()) { + try { + Point2D i = h.getKey().intersectionPoint(so.ruling); + if (i == null) { + continue; + } + rv.put(i, new Ruling[]{h.getKey().expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT), so.ruling.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT)}); + } catch (UnsupportedOperationException e) { + log.info("Some line are oblique, ignoring..."); + continue; + } + } + break; + case HRIGHT: + tree.remove(so.ruling); + break; + case HLEFT: + tree.put(so.ruling, true); + break; + } + } + + return rv; + + } + + + public boolean vertical() { + + return this.length() > 0 && DoubleComparisons.feq(this.x1, this.x2); //diff < ORIENTATION_CHECK_THRESHOLD; + } + + + public boolean horizontal() { + + return this.length() > 0 && DoubleComparisons.feq(this.y1, this.y2); //diff < ORIENTATION_CHECK_THRESHOLD; + } + + // attributes that make sense only for non-oblique lines + // these are used to have a single collapse method (in page, currently) + + + public boolean oblique() { + + return !(this.vertical() || this.horizontal()); + } + + + public float getPosition() { + + if (this.oblique()) { + throw new UnsupportedOperationException(); + } + return this.vertical() ? this.getLeft() : this.getTop(); + } + + + public float getStart() { + + if (this.oblique()) { + throw new UnsupportedOperationException(); + } + return this.vertical() ? 
this.getTop() : this.getLeft(); + } + + + public void setStart(float v) { + + if (this.oblique()) { + throw new UnsupportedOperationException(); + } + if (this.vertical()) { + this.setTop(v); + } else { + this.setLeft(v); + } + } + + + public float getEnd() { + + if (this.oblique()) { + throw new UnsupportedOperationException(); + } + return this.vertical() ? this.getBottom() : this.getRight(); + } + + + public void setEnd(float v) { + + if (this.oblique()) { + throw new UnsupportedOperationException(); + } + if (this.vertical()) { + this.setBottom(v); + } else { + this.setRight(v); + } + } + + + public void setStartEnd(float start, float end) { + + if (this.oblique()) { + throw new UnsupportedOperationException(); + } + if (this.vertical()) { + this.setTop(start); + this.setBottom(end); + } else { + this.setLeft(start); + this.setRight(end); + } + } + + + public boolean perpendicularTo(Ruling other) { + + return this.vertical() == other.horizontal(); + } + + + public boolean nearlyIntersects(Ruling another, int colinearOrParallelExpandAmount) { + + if (this.intersectsLine(another)) { + return true; + } + + boolean rv = false; + + if (this.perpendicularTo(another)) { + rv = this.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT).intersectsLine(another); + } else { + rv = this.expand(colinearOrParallelExpandAmount).intersectsLine(another.expand(colinearOrParallelExpandAmount)); + } + + return rv; + } + + + public double length() { + + return Math.sqrt(Math.pow(this.x1 - this.x2, 2) + Math.pow(this.y1 - this.y2, 2)); + } + + + public Ruling intersect(Rectangle2D clip) { + + Float clipee = (Float) this.clone(); + boolean clipped = new CohenSutherlandClipping(clip).clip(clipee); + + if (clipped) { + return new Ruling(clipee.getP1(), clipee.getP2()); + } else { + return this; + } + } + + + public Ruling expand(float amount) { + + Ruling r = (Ruling) this.clone(); + try { + r.setStart(this.getStart() - amount); + r.setEnd(this.getEnd() + amount); + } catch 
(UnsupportedOperationException e) { + log.warn("Could not expand ruling!"); + } + return r; + } + + + public Point2D intersectionPoint(Ruling other) { + + Ruling this_l = this.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT); + Ruling other_l = other.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT); + Ruling horizontal, vertical; + + if (!this_l.intersectsLine(other_l)) { + return null; + } + + if (this_l.horizontal() && other_l.vertical()) { + horizontal = this_l; + vertical = other_l; + } else if (this_l.vertical() && other_l.horizontal()) { + vertical = this_l; + horizontal = other_l; + } else { + log.warn("lines must be orthogonal, vertical and horizontal"); + return null; + } + return new Point2D.Float(vertical.getLeft(), horizontal.getTop()); + } + + + @Override + public boolean equals(Object other) { + + if (this == other) { + return true; + } + + if (!(other instanceof Ruling)) { + return false; + } + + Ruling o = (Ruling) other; + return this.getP1().equals(o.getP1()) && this.getP2().equals(o.getP2()); + } + + + @Override + public int hashCode() { + + return super.hashCode(); + } + + + public float getTop() { + + return this.y1; + } + + + public void setTop(float v) { + + setLine(this.getLeft(), v, this.getRight(), this.getBottom()); + } + + + public float getLeft() { + + return this.x1; + } + + + public void setLeft(float v) { + + setLine(v, this.getTop(), this.getRight(), this.getBottom()); + } + + + public float getBottom() { + + return this.y2; + } + + + public void setBottom(float v) { + + setLine(this.getLeft(), this.getTop(), this.getRight(), v); + } + + + public float getRight() { + + return this.x2; + } + + + public void setRight(float v) { + + setLine(this.getLeft(), this.getTop(), v, this.getBottom()); + } + + + public float getWidth() { + + return this.getRight() - this.getLeft(); + } + + + public float getHeight() { + + return this.getBottom() - this.getTop(); + } + + + public double getAngle() { + + double angle = 
Math.toDegrees(Math.atan2(this.getP2().getY() - this.getP1().getY(), this.getP2().getX() - this.getP1().getX())); + + if (angle < 0) { + angle += 360; + } + return angle; + } + + + @Override + public String toString() { + + StringBuilder sb = new StringBuilder(); + Formatter formatter = new Formatter(sb); + String rv = formatter.format("%s[minX=%f minY=%f maxX=%f maxY=%f]", this.getClass().toString(), this.x1, this.y1, this.x2, this.y2).toString(); + formatter.close(); + return rv; + } + + + private enum SOType { + VERTICAL, + HRIGHT, + HLEFT + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/table/Table.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/table/Table.java new file mode 100644 index 0000000..cee62ef --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/table/Table.java @@ -0,0 +1,350 @@ +package com.knecon.fforesight.service.layoutparser.processor.classification.dto.table; + +import java.awt.geom.Point2D; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.TreeMap; +import java.util.stream.Collectors; + +import com.knecon.fforesight.service.layoutparser.processor.classification.dto.AbstractTextContainer; +import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock; +import com.knecon.fforesight.service.layoutparser.processor.classification.dto.Rectangle; + +import lombok.Getter; +import lombok.Setter; +import lombok.extern.slf4j.Slf4j; + +@Slf4j +public class Table extends AbstractTextContainer { + + private final TreeMap cells = new TreeMap<>(); + + private final int rotation; + @Getter + @Setter + 
private String headline; + private int unrotatedRowCount; + private int unrotatedColCount; + private int rowCount = -1; + private int colCount = -1; + private List> rows; + + + public Table(List cells, Rectangle area, int rotation) { + + addCells(cells); + minX = area.getLeft(); + minY = area.getBottom(); + maxX = area.getRight(); + maxY = area.getTop(); + classification = "Table"; + this.rotation = rotation; + + } + + + public List> getRows() { + + if (rows == null) { + rows = computeRows(); + + // Ignore rows that does not contain any cells and values. + List> rowsToRemove = new ArrayList<>(); + for (List row : rows) { + if (row.size() == 1 && row.get(0).getTextBlocks().isEmpty()) { + rowsToRemove.add(row); + } + } + rows.removeAll(rowsToRemove); + + computeHeaders(); + } + + return rows; + + } + + + public int getRowCount() { + + if (rowCount == -1) { + rowCount = getRows().size(); + } + return rowCount; + } + + + public int getColCount() { + + if (colCount == -1) { + colCount = getRows().stream().mapToInt(List::size).max().orElse(0); + } + return colCount; + + } + + + /** + * Detect header cells (either first row or first column): + * Column is marked as header if cell text is bold and row cell text is not bold. + * Defaults to row. 
+ */ + private void computeHeaders() { + + if (rows == null) { + rows = computeRows(); + } + // A bold cell is a header cell as long as every cell to the left/top is bold, too + // we move from left to right and top to bottom + for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) { + List rowCells = rows.get(rowIndex); + if (rowCells.size() == 1) { + continue; + } + + for (int colIndex = 0; colIndex < rowCells.size(); colIndex++) { + TableCell cell = rowCells.get(colIndex); + List cellsToTheLeft = rowCells.subList(0, colIndex); + TableCell lastHeaderCell = null; + for (TableCell leftCell : cellsToTheLeft) { + if (leftCell.isHeaderCell()) { + lastHeaderCell = leftCell; + } else { + break; + } + } + if (lastHeaderCell != null) { + cell.getHeaderCells().add(lastHeaderCell); + } + List cellsToTheTop = new ArrayList<>(); + for (int i = 0; i < rowIndex; i++) { + try { + cellsToTheTop.add(rows.get(i).get(colIndex)); + } catch (IndexOutOfBoundsException e) { + log.debug("No cell {} in row {}, ignoring.", colIndex, rowIndex); + } + } + for (TableCell topCell : cellsToTheTop) { + if (topCell.isHeaderCell()) { + lastHeaderCell = topCell; + } else { + break; + } + } + if (lastHeaderCell != null) { + cell.getHeaderCells().add(lastHeaderCell); + } + if (!cell.getTextBlocks().isEmpty() && cell.getTextBlocks().get(0).getMostPopularWordStyle().equals("bold")) { + cell.setHeaderCell(true); + } + } + } + + } + + + private List> computeRows() { + + List> rows = new ArrayList<>(); + if (rotation == 90) { + for (int i = 0; i < unrotatedColCount; i++) { // rows + List lastRow = new ArrayList<>(); + for (int j = unrotatedRowCount - 1; j >= 0; j--) { // cols + TableCell cell = cells.get(new TableCellPosition(j, i)); + if (cell != null) { + lastRow.add(cell); + } + } + rows.add(lastRow); + } + } else if (rotation == 270) { + for (int i = unrotatedColCount - 1; i >= 0; i--) { // rows + List lastRow = new ArrayList<>(); + for (int j = 0; j < unrotatedRowCount; j++) { // cols + TableCell 
cell = cells.get(new TableCellPosition(j, i)); + if (cell != null) { + lastRow.add(cell); + } + } + rows.add(lastRow); + } + } else { + for (int i = 0; i < unrotatedRowCount; i++) { + List lastRow = new ArrayList<>(); + for (int j = 0; j < unrotatedColCount; j++) { + TableCell cell = cells.get(new TableCellPosition(i, j)); // JAVA_8 use getOrDefault() + if (cell != null) { + lastRow.add(cell); + } + } + rows.add(lastRow); + } + } + + return rows; + + } + + + private void add(TableCell chunk, int row, int col) { + + unrotatedRowCount = Math.max(unrotatedRowCount, row + 1); + unrotatedColCount = Math.max(unrotatedColCount, col + 1); + + TableCellPosition cp = new TableCellPosition(row, col); + cells.put(cp, chunk); + + } + + + private void addCells(List cells) { + + if (cells.isEmpty()) { + return; + } + + cells.removeIf(cell -> cell.getWidth() < 1.1 || cell.getHeight() < 1.1); + + List> rowsOfCells = calculateStructure(cells); + + for (int i = 0; i < rowsOfCells.size(); i++) { + for (int j = 0; j < rowsOfCells.get(i).size(); j++) { + add(rowsOfCells.get(i).get(j), i, j); + } + } + + } + + + /** + * Calculates the structure of the table. For spanning rows and columns multiple cells with the same values will be inserted. 
+ * + * @param cells The found cells + * @return Table Structure + */ + private List> calculateStructure(List cells) { + + List> matrix = new ArrayList<>(); + + if (cells.isEmpty()) { + return matrix; + } + + Set uniqueX = new HashSet<>(); + Set uniqueY = new HashSet<>(); + cells.stream().filter(c -> !c.getTextBlocks().isEmpty() || c.getHeight() > 3 && c.getWidth() > 3).forEach(c -> { + uniqueX.add(c.getLeft()); + uniqueX.add(c.getRight()); + uniqueY.add(c.getBottom()); + uniqueY.add(c.getTop()); + }); + + var sortedUniqueX = uniqueX.stream().sorted().collect(Collectors.toList()); + var sortedUniqueY = uniqueY.stream().sorted().collect(Collectors.toList()); + + Float prevY = null; + for (Float y : sortedUniqueY) { + + List row = new ArrayList<>(); + + Float prevX = null; + for (Float x : sortedUniqueX) { + + if (prevY != null && prevX != null) { + var cell = new TableCell(new Point2D.Float(prevX, prevY), new Point2D.Float(x, y)); + + var intersectionCell = cells.stream().filter(c -> cell.intersects(c) && cell.overlapRatio(c) > 0.1f).findFirst(); + if (intersectionCell.isPresent()) { + cell.getTextBlocks().addAll(intersectionCell.get().getTextBlocks()); + } + row.add(cell); + } + prevX = x; + } + + if (prevY != null && prevX != null) { + matrix.add(row); + } + prevY = y; + } + + Collections.reverse(matrix); + + return matrix; + } + + + @Override + public String getText() { + + StringBuilder sb = new StringBuilder(); + List> rows = getRows(); + + int i = 0; + for (List row : rows) { + if (i != 0) { + sb.append("\n"); + } + if (!row.isEmpty()) { + boolean firstColumn = true; + for (TableCell column : row) { + if (!firstColumn) { + sb.append(","); + } + if (column != null && column.getTextBlocks() != null) { + boolean first = true; + for (ClassificationTextBlock textBlock : column.getTextBlocks()) { + if (!first) { + sb.append("\n"); + } + sb.append('\"').append(textBlock.getText().replaceAll("\"", "\\\"")).append('\"'); + first = false; + } + } + firstColumn = false; 
+ } + } + i++; + } + + return sb.toString(); + } + + + public String getTextAsHtml() { + + StringBuilder sb = new StringBuilder(); + List> rows = getRows(); + + sb.append("

"); + int i = 0; + for (List row : rows) { + sb.append("\n"); + if (!row.isEmpty()) { + for (TableCell column : row) { + sb.append(i == 0 ? "\n"); + } + } + sb.append(""); + i++; + } + sb.append("
" : "\n"); + if (column != null && column.getTextBlocks() != null) { + boolean first = true; + for (ClassificationTextBlock textBlock : column.getTextBlocks()) { + if (!first) { + sb.append("
"); + } + sb.append(textBlock.getText().replaceAll("\\n", "
")); + first = false; + } + } + sb.append(i == 0 ? "" : "
"); + + return sb.toString(); + } + +} \ No newline at end of file diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/table/TableCell.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/table/TableCell.java new file mode 100644 index 0000000..578371f --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/table/TableCell.java @@ -0,0 +1,38 @@ +package com.knecon.fforesight.service.layoutparser.processor.classification.dto.table; + +import java.awt.geom.Point2D; +import java.util.ArrayList; +import java.util.List; + +import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock; +import com.knecon.fforesight.service.layoutparser.processor.classification.dto.Rectangle; + +import lombok.Data; +import lombok.EqualsAndHashCode; +import lombok.NoArgsConstructor; + +@SuppressWarnings("serial") +@Data +@EqualsAndHashCode(callSuper = true) +@NoArgsConstructor +public class TableCell extends Rectangle { + + private List textBlocks = new ArrayList<>(); + + private List headerCells = new ArrayList<>(); + + private boolean isHeaderCell; + + + public TableCell(Point2D topLeft, Point2D bottomRight) { + + super((float) topLeft.getY(), (float) topLeft.getX(), (float) (bottomRight.getX() - topLeft.getX()), (float) (bottomRight.getY() - topLeft.getY())); + } + + + public void addTextBlock(ClassificationTextBlock textBlock) { + + textBlocks.add(textBlock); + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/table/TableCellPosition.java 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/table/TableCellPosition.java new file mode 100644 index 0000000..42cb649 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/table/TableCellPosition.java @@ -0,0 +1,22 @@ +package com.knecon.fforesight.service.layoutparser.processor.classification.dto.table; + +import lombok.RequiredArgsConstructor; +import lombok.Value; + +@Value +@RequiredArgsConstructor +public class TableCellPosition implements Comparable { + + int row; + + int col; + + + @Override + public int compareTo(TableCellPosition other) { + + int rowDiff = row - other.row; + return rowDiff != 0 ? rowDiff : col - other.col; + } + +} \ No newline at end of file diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/text/ClassificationTextBlock.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/text/ClassificationTextBlock.java new file mode 100644 index 0000000..1076cf8 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/text/ClassificationTextBlock.java @@ -0,0 +1,286 @@ +package com.knecon.fforesight.service.layoutparser.processor.classification.dto.text; + +import java.util.ArrayList; +import java.util.List; + +import com.knecon.fforesight.service.layoutparser.processor.classification.dto.AbstractTextContainer; +import com.knecon.fforesight.service.layoutparser.processor.classification.utils.TextNormalizationUtilities; + +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Data; +import lombok.EqualsAndHashCode; +import lombok.NoArgsConstructor; + 
+@EqualsAndHashCode(callSuper = true) +@AllArgsConstructor +@Builder +@Data +@NoArgsConstructor +public class ClassificationTextBlock extends AbstractTextContainer { + + @Builder.Default + private List sequences = new ArrayList<>(); + + private int rotation; + + private int indexOnPage; + + private String mostPopularWordFont; + + private String mostPopularWordStyle; + + private float mostPopularWordFontSize; + + private float mostPopularWordHeight; + + private float mostPopularWordSpaceWidth; + + private float highestFontSize; + + private String classification; + + + public TextDirection getDir() { + + return sequences.get(0).getDir(); + } + + private float getPageHeight() { + + return sequences.get(0).getPageHeight(); + } + + + private float getPageWidth() { + + return sequences.get(0).getPageWidth(); + } + + + /** + * Returns the minX value in pdf coordinate system. + * Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation. + * 0 -> LowerLeft + * 90 -> UpperLeft + * 180 -> UpperRight + * 270 -> LowerRight + * + * @return the minX value in pdf coordinate system + */ + public float getPdfMinX() { + + if (getDir().getDegrees() == 90) { + return minY; + } else if (getDir().getDegrees() == 180) { + return getPageWidth() - maxX; + + } else if (getDir().getDegrees() == 270) { + + return getPageWidth() - maxY; + } else { + return minX; + } + } + + /** + * Returns the maxX value in pdf coordinate system. + * Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation. 
+ * 0 -> LowerLeft + * 90 -> UpperLeft + * 180 -> UpperRight + * 270 -> LowerRight + * + * @return the maxX value in pdf coordinate system + */ + public float getPdfMaxX() { + + if (getDir().getDegrees() == 90) { + return maxY; + } else if (getDir().getDegrees() == 180) { + return getPageWidth() - minX; + } else if (getDir().getDegrees() == 270) { + return getPageWidth() - minY; + + } else { + return maxX; + } + } + + + /** + * Returns the minY value in pdf coordinate system. + * Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation. + * 0 -> LowerLeft + * 90 -> UpperLeft + * 180 -> UpperRight + * 270 -> LowerRight + * + * @return the minY value in pdf coordinate system + */ + public float getPdfMinY() { + + if (getDir().getDegrees() == 90) { + return minX; + } else if (getDir().getDegrees() == 180) { + return maxY; + + } else if (getDir().getDegrees() == 270) { + return getPageHeight() - maxX; + + } else { + return getPageHeight() - maxY; + } + } + + + /** + * Returns the maxY value in pdf coordinate system. + * Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation. 
+ * 0 -> LowerLeft + * 90 -> UpperLeft + * 180 -> UpperRight + * 270 -> LowerRight + * + * @return the maxY value in pdf coordinate system + */ + public float getPdfMaxY() { + + if (getDir().getDegrees() == 90) { + return maxX; + } else if (getDir().getDegrees() == 180) { + + return minY; + } else if (getDir().getDegrees() == 270) { + return getPageHeight() - minX; + } else { + return getPageHeight() - minY; + } + } + + + public ClassificationTextBlock(float minX, float maxX, float minY, float maxY, List sequences, int rotation, int indexOnPage) { + super(); + this.indexOnPage = indexOnPage; + super.minX = minX; + super.maxX = maxX; + super.minY = minY; + super.maxY = maxY; + this.sequences = sequences; + this.rotation = rotation; + } + + + public ClassificationTextBlock union(TextPositionSequence r) { + + ClassificationTextBlock union = this.copy(); + union.add(r); + return union; + } + + + public ClassificationTextBlock union(ClassificationTextBlock r) { + + ClassificationTextBlock union = this.copy(); + union.add(r); + return union; + } + + + public void add(ClassificationTextBlock r) { + + if (r.getMinX() < minX) { + minX = r.getMinX(); + } + if (r.getMaxX() > maxX) { + maxX = r.getMaxX(); + } + if (r.getMinY() < minY) { + minY = r.getMinY(); + } + if (r.getMaxY() > maxY) { + maxY = r.getMaxY(); + } + sequences.addAll(r.getSequences()); + } + + + public void add(TextPositionSequence r) { + + if (r.getMinXDirAdj() < minX) { + minX = r.getMinXDirAdj(); + } + if (r.getMaxXDirAdj() > maxX) { + maxX = r.getMaxXDirAdj(); + } + if (r.getMinYDirAdj() < minY) { + minY = r.getMinYDirAdj(); + } + if (r.getMaxYDirAdj() > maxY) { + maxY = r.getMaxYDirAdj(); + } + } + + + public ClassificationTextBlock copy() { + + return new ClassificationTextBlock(minX, maxX, minY, maxY, sequences, rotation, indexOnPage); + } + + + public void resize(float x1, float y1, float width, float height) { + + set(x1, y1, x1 + width, y1 + height); + } + + + public void set(float x1, float y1, 
float x2, float y2) { + + this.minX = Math.min(x1, x2); + this.maxX = Math.max(x1, x2); + this.minY = Math.min(y1, y2); + this.maxY = Math.max(y1, y2); + } + + + @Override + public String toString() { + + StringBuilder builder = new StringBuilder(); + + for (int i = 0; i < sequences.size(); i++) { + String sequenceAsString = sequences.get(i).toString(); + // Fix for missing Whitespace. This is recognized in getSequences method. See PDFTextStripper Line 1730. + if (i != 0 && sequences.get(i - 1).charAt(sequences.get(i - 1).length() - 1) != ' ' && sequenceAsString.charAt(0) != ' ') { + builder.append(' '); + } + builder.append(sequenceAsString); + } + + return builder.toString(); + + } + + + @Override + public String getText() { + + StringBuilder sb = new StringBuilder(); + + TextPositionSequence previous = null; + for (TextPositionSequence word : sequences) { + if (previous != null) { + if (Math.abs(previous.getMaxYDirAdj() - word.getMaxYDirAdj()) > word.getTextHeight()) { + sb.append('\n'); + } else { + sb.append(' '); + } + } + sb.append(word.toString()); + previous = word; + } + + return TextNormalizationUtilities.removeHyphenLineBreaks(sb.toString()); + + } +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/text/RedTextPosition.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/text/RedTextPosition.java new file mode 100644 index 0000000..1266286 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/text/RedTextPosition.java @@ -0,0 +1,106 @@ +package com.knecon.fforesight.service.layoutparser.processor.classification.dto.text; + +import org.apache.pdfbox.text.TextPosition; +import org.springframework.beans.BeanUtils; + +import com.dslplatform.json.CompiledJson; 
+import com.dslplatform.json.JsonAttribute; +import com.fasterxml.jackson.annotation.JsonIgnore; + +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Data; +import lombok.NoArgsConstructor; +import lombok.SneakyThrows; + +@Data +@Builder +@NoArgsConstructor +@AllArgsConstructor +@CompiledJson +public class RedTextPosition { + + private String textMatrix; + private float[] position; + + @JsonIgnore + private int rotation; + + @JsonIgnore + private float pageHeight; + + @JsonIgnore + private float pageWidth; + + private String unicode; + + @JsonIgnore + private float dir; + + // not used in reanalysis + @JsonIgnore + @JsonAttribute(ignore = true) + private float widthOfSpace; + + // not used in reanalysis + @JsonIgnore + @JsonAttribute(ignore = true) + private float fontSizeInPt; + + // not used in reanalysis + @JsonIgnore + @JsonAttribute(ignore = true) + private String fontName; + + + @SneakyThrows + public static RedTextPosition fromTextPosition(TextPosition textPosition) { + + var pos = new RedTextPosition(); + BeanUtils.copyProperties(textPosition, pos); + pos.setFontName(textPosition.getFont().getName()); + + pos.setFontSizeInPt(textPosition.getFontSizeInPt()); + + pos.setTextMatrix(textPosition.getTextMatrix().toString()); + + var position = new float[4]; + + position[0] = textPosition.getXDirAdj(); + position[1] = textPosition.getYDirAdj(); + position[2] = textPosition.getWidthDirAdj(); + position[3] = textPosition.getHeightDir(); + + pos.setPosition(position); + return pos; + } + + + @JsonIgnore + public float getXDirAdj() { + + return position[0]; + } + + + @JsonIgnore + public float getYDirAdj() { + + return position[1]; + } + + + @JsonIgnore + public float getWidthDirAdj() { + + return position[2]; + } + + + @JsonIgnore + public float getHeightDir() { + + return position[3]; + } + +} diff --git 
/**
 * Counts how often each string value was added and reports the most frequent one.
 */
public class StringFrequencyCounter {

    private final Map<String, Integer> countPerValue = new HashMap<>();


    /**
     * @return the live map of value to number of occurrences (not a copy)
     */
    public Map<String, Integer> getCountPerValue() {

        return countPerValue;
    }


    /**
     * Records one more occurrence of {@code value}.
     *
     * @param value the value that was seen
     */
    public void add(String value) {

        countPerValue.merge(value, 1, Integer::sum);
    }


    /**
     * Adds every count of another counter's map to this counter.
     *
     * @param otherCounter value-to-count map to merge in; counts must not be {@code null}
     */
    public void addAll(Map<String, Integer> otherCounter) {

        for (Map.Entry<String, Integer> entry : otherCounter.entrySet()) {
            countPerValue.merge(entry.getKey(), entry.getValue(), Integer::sum);
        }
    }


    /**
     * @return the value with the highest count, or {@code null} if nothing was counted;
     * when several values share the highest count, the first one met while iterating the map wins
     */
    public String getMostPopular() {

        Map.Entry<String, Integer> mostPopular = null;
        for (Map.Entry<String, Integer> entry : countPerValue.entrySet()) {
            if (mostPopular == null || entry.getValue() > mostPopular.getValue()) {
                mostPopular = entry;
            }
        }
        return mostPopular != null ? mostPopular.getKey() : null;
    }

}
public String toString() { + + return degrees + VALUE_STRING_SUFFIX; + } + + + @com.dslplatform.json.JsonValue + public float jsonValue() { + + return getDegrees(); + } + + + @JsonCreator(mode = JsonCreator.Mode.DELEGATING) + public static TextDirection fromDegrees(float degrees) { + + for (var dir : TextDirection.values()) { + if (degrees == dir.degrees) { + return dir; + } + } + + throw new IllegalArgumentException(String.format("A value of %f is not supported by TextDirection", degrees)); + } +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/text/TextPositionSequence.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/text/TextPositionSequence.java new file mode 100644 index 0000000..ac525d5 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/text/TextPositionSequence.java @@ -0,0 +1,298 @@ +package com.knecon.fforesight.service.layoutparser.processor.classification.dto.text; + +import java.awt.geom.AffineTransform; +import java.awt.geom.Point2D; +import java.util.ArrayList; +import java.util.List; +import java.util.stream.Collectors; + +import org.apache.pdfbox.text.TextPosition; + +import com.dslplatform.json.JsonAttribute; +import com.fasterxml.jackson.annotation.JsonIgnore; +import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point; +import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle; + +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Data; +import lombok.NoArgsConstructor; +import lombok.SneakyThrows; +import lombok.extern.slf4j.Slf4j; + +@Slf4j +@Data +@Builder +@NoArgsConstructor +@AllArgsConstructor +public class TextPositionSequence implements CharSequence { + + 
public static final int HEIGHT_PADDING = 2; + private int page; + private List textPositions = new ArrayList<>(); + + private TextDirection dir; + private int rotation; + private float pageHeight; + private float pageWidth; + + + public TextPositionSequence(List textPositions, int page) { + + this.textPositions = textPositions.stream().map(RedTextPosition::fromTextPosition).collect(Collectors.toList()); + this.page = page; + this.dir = TextDirection.fromDegrees(textPositions.get(0).getDir()); + this.rotation = textPositions.get(0).getRotation(); + this.pageHeight = textPositions.get(0).getPageHeight(); + this.pageWidth = textPositions.get(0).getPageWidth(); + } + + + @Override + public int length() { + + return textPositions.size(); + } + + + @Override + public char charAt(int index) { + + RedTextPosition textPosition = textPositionAt(index); + String text = textPosition.getUnicode(); + return text.charAt(0); + } + + + @Override + public TextPositionSequence subSequence(int start, int end) { + + var textPositionSequence = new TextPositionSequence(); + textPositionSequence.textPositions = textPositions.subList(start, end); + textPositionSequence.page = page; + textPositionSequence.dir = dir; + textPositionSequence.rotation = rotation; + textPositionSequence.pageHeight = pageHeight; + textPositionSequence.pageWidth = pageWidth; + + return textPositionSequence; + } + + + @Override + public String toString() { + + StringBuilder builder = new StringBuilder(length()); + for (int i = 0; i < length(); i++) { + builder.append(charAt(i)); + } + return builder.toString(); + } + + + public RedTextPosition textPositionAt(int index) { + + return textPositions.get(index); + } + + + public void add(TextPositionSequence textPositionSequence, RedTextPosition textPosition) { + + this.textPositions.add(textPosition); + this.page = textPositionSequence.getPage(); + this.dir = textPositionSequence.getDir(); + this.rotation = textPositionSequence.getRotation(); + this.pageHeight = 
textPositionSequence.getPageHeight(); + this.pageWidth = textPositionSequence.getPageWidth(); + } + + + public void add(TextPosition textPosition) { + + this.textPositions.add(RedTextPosition.fromTextPosition(textPosition)); + + this.dir = TextDirection.fromDegrees(textPositions.get(0).getDir()); + this.rotation = textPositions.get(0).getRotation(); + this.pageHeight = textPositions.get(0).getPageHeight(); + this.pageWidth = textPositions.get(0).getPageWidth(); + + } + + + /** + * This value is adjusted so that 0,0 is upper left and it is adjusted based on the text direction. + * This method ignores the page rotation but takes the text rotation and adjusts the coordinates to awt. + * + * @return the text direction adjusted minX value + */ + @JsonIgnore + @JsonAttribute(ignore = true) + public float getMinXDirAdj() { + + return textPositions.get(0).getXDirAdj(); + + } + + + /** + * This value is adjusted so that 0,0 is upper left and it is adjusted based on the text direction. + * This method ignores the page rotation but takes the text rotation and adjusts the coordinates to awt. + * + * @return the text direction adjusted maxX value + */ + @JsonIgnore + @JsonAttribute(ignore = true) + public float getMaxXDirAdj() { + + return textPositions.get(textPositions.size() - 1).getXDirAdj() + textPositions.get(textPositions.size() - 1).getWidthDirAdj() + HEIGHT_PADDING; + + } + + + /** + * This value is adjusted so that 0,0 is upper left and it is adjusted based on the text direction. + * This method ignores the page rotation but takes the text rotation and adjusts the coordinates to awt. + * + * @return the text direction adjusted minY value. The upper border of the bounding box of the word. + */ + @JsonIgnore + @JsonAttribute(ignore = true) + public float getMinYDirAdj() { + + return textPositions.get(0).getYDirAdj() - getTextHeight(); + + } + + + /** + * This value is adjusted so that 0,0 is upper left and it is adjusted based on the text direction. 
+ * This method ignores the page rotation but takes the text rotation and adjusts the coordinates to awt. + * + * @return the text direction adjusted maxY value. The lower border of the bounding box of the word. + */ + @JsonIgnore + @JsonAttribute(ignore = true) + public float getMaxYDirAdj() { + + return textPositions.get(0).getYDirAdj(); + + } + + + @JsonIgnore + @JsonAttribute(ignore = true) + public float getTextHeight() { + + return textPositions.get(0).getHeightDir() + HEIGHT_PADDING; + } + + + @JsonIgnore + @JsonAttribute(ignore = true) + public float getHeight() { + + return getMaxYDirAdj() - getMinYDirAdj(); + } + + + @JsonIgnore + @JsonAttribute(ignore = true) + public float getWidth() { + + return getMaxXDirAdj() - getMinXDirAdj(); + } + + + @JsonIgnore + @JsonAttribute(ignore = true) + public String getFont() { + + return textPositions.get(0).getFontName().toLowerCase().replaceAll(",bold", "").replaceAll(",italic", ""); + } + + + @JsonIgnore + @JsonAttribute(ignore = true) + public String getFontStyle() { + + String lowercaseFontName = textPositions.get(0).getFontName().toLowerCase(); + + if (lowercaseFontName.contains("bold") && lowercaseFontName.contains("italic")) { + return "bold, italic"; + } else if (lowercaseFontName.contains("bold")) { + return "bold"; + } else if (lowercaseFontName.contains("italic")) { + return "italic"; + } else { + return "standard"; + } + + } + + + @JsonIgnore + @JsonAttribute(ignore = true) + public float getFontSize() { + + return textPositions.get(0).getFontSizeInPt(); + } + + + @JsonIgnore + @JsonAttribute(ignore = true) + public float getSpaceWidth() { + + return textPositions.get(0).getWidthOfSpace(); + } + + + /** + * This returns the bounding box of the word in Pdf Coordinate System where {0,0} rotated with the page rotation. 
+ * 0 -> LowerLeft + * 90 -> UpperLeft + * 180 -> UpperRight + * 270 -> LowerRight + * + * @return bounding box of the word in Pdf Coordinate System + */ + @JsonIgnore + @JsonAttribute(ignore = true) + @SneakyThrows + public Rectangle getRectangle() { + + log.debug("ClassificationPage: '{}', Word: '{}', Rotation: '{}', textRotation {}", page, this, rotation, dir); + + float textHeight = getTextHeight(); + + RedTextPosition firstTextPos = textPositions.get(0); + RedTextPosition lastTextPos = textPositions.get(textPositions.size() - 1); + + Point2D bottomLeft = new Point2D.Double(firstTextPos.getXDirAdj(), firstTextPos.getYDirAdj() - HEIGHT_PADDING); + Point2D topRight = new Point2D.Double(lastTextPos.getXDirAdj() + lastTextPos.getWidthDirAdj(), lastTextPos.getYDirAdj() + textHeight + HEIGHT_PADDING); + + AffineTransform transform = new AffineTransform(); + if (dir == TextDirection.ZERO || dir == TextDirection.HALF_CIRCLE) { + transform.rotate(dir.getRadians(), pageWidth / 2f, pageHeight / 2f); + transform.translate(0f, pageHeight + textHeight); + transform.scale(1., -1.); + } else if (dir == TextDirection.QUARTER_CIRCLE) { + transform.rotate(dir.getRadians(), pageWidth / 2f, pageWidth / 2f); + transform.translate(0f, pageWidth + textHeight); + transform.scale(1., -1.); + } else { + transform.rotate(dir.getRadians(), pageHeight / 2f, pageHeight / 2f); + transform.translate(0f, pageWidth + textHeight); + transform.scale(1., -1.); + } + + bottomLeft = transform.transform(bottomLeft, null); + topRight = transform.transform(topRight, null); + + return new Rectangle( // + new Point((float) bottomLeft.getX(), (float) bottomLeft.getY()), + (float) (topRight.getX() - bottomLeft.getX()), + (float) (topRight.getY() - bottomLeft.getY()), + page); + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/text/UnclassifiedText.java 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/text/UnclassifiedText.java new file mode 100644 index 0000000..16be334 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/text/UnclassifiedText.java @@ -0,0 +1,14 @@ +package com.knecon.fforesight.service.layoutparser.processor.classification.dto.text; + +import java.util.List; + +import lombok.AllArgsConstructor; +import lombok.Data; + +/** + * Data holder wrapping a single list of text blocks; Lombok generates the all-args constructor and the accessors. + * NOTE(review): the element type of {@code textBlocks} is not visible here (raw {@code List}) - confirm against the original source. + */ +@Data +@AllArgsConstructor +public class UnclassifiedText { + + private List textBlocks; + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/parsing/LegacyPDFStreamEngine.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/parsing/LegacyPDFStreamEngine.java new file mode 100644 index 0000000..5aa1439 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/parsing/LegacyPDFStreamEngine.java @@ -0,0 +1,384 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.knecon.fforesight.service.layoutparser.processor.classification.parsing; + +import java.io.IOException; +import java.io.InputStream; +import java.util.Map; +import java.util.WeakHashMap; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.fontbox.ttf.TrueTypeFont; +import org.apache.fontbox.util.BoundingBox; +import org.apache.pdfbox.contentstream.PDFStreamEngine; +import org.apache.pdfbox.contentstream.operator.DrawObject; +import org.apache.pdfbox.contentstream.operator.state.Concatenate; +import org.apache.pdfbox.contentstream.operator.state.Restore; +import org.apache.pdfbox.contentstream.operator.state.Save; +import org.apache.pdfbox.contentstream.operator.state.SetGraphicsStateParameters; +import org.apache.pdfbox.contentstream.operator.state.SetMatrix; +import org.apache.pdfbox.contentstream.operator.text.BeginText; +import org.apache.pdfbox.contentstream.operator.text.EndText; +import org.apache.pdfbox.contentstream.operator.text.MoveText; +import org.apache.pdfbox.contentstream.operator.text.MoveTextSetLeading; +import org.apache.pdfbox.contentstream.operator.text.NextLine; +import org.apache.pdfbox.contentstream.operator.text.SetCharSpacing; +import org.apache.pdfbox.contentstream.operator.text.SetFontAndSize; +import org.apache.pdfbox.contentstream.operator.text.SetTextHorizontalScaling; +import org.apache.pdfbox.contentstream.operator.text.SetTextLeading; +import org.apache.pdfbox.contentstream.operator.text.SetTextRenderingMode; +import org.apache.pdfbox.contentstream.operator.text.SetTextRise; +import org.apache.pdfbox.contentstream.operator.text.SetWordSpacing; +import org.apache.pdfbox.contentstream.operator.text.ShowText; +import org.apache.pdfbox.contentstream.operator.text.ShowTextAdjusted; +import org.apache.pdfbox.contentstream.operator.text.ShowTextLine; +import 
org.apache.pdfbox.contentstream.operator.text.ShowTextLineAndSpace; +import org.apache.pdfbox.cos.COSDictionary; +import org.apache.pdfbox.pdmodel.PDPage; +import org.apache.pdfbox.pdmodel.common.PDRectangle; +import org.apache.pdfbox.pdmodel.font.PDCIDFont; +import org.apache.pdfbox.pdmodel.font.PDCIDFontType2; +import org.apache.pdfbox.pdmodel.font.PDFont; +import org.apache.pdfbox.pdmodel.font.PDFontDescriptor; +import org.apache.pdfbox.pdmodel.font.PDSimpleFont; +import org.apache.pdfbox.pdmodel.font.PDTrueTypeFont; +import org.apache.pdfbox.pdmodel.font.PDType0Font; +import org.apache.pdfbox.pdmodel.font.PDType3Font; +import org.apache.pdfbox.pdmodel.font.encoding.GlyphList; +import org.apache.pdfbox.pdmodel.graphics.state.PDGraphicsState; +import org.apache.pdfbox.text.TextPosition; +import org.apache.pdfbox.util.Matrix; +import org.apache.pdfbox.util.Vector; + +/** + * LEGACY text calculations which are known to be incorrect but are depended on by PDFTextStripper. + *

+ * This class exists only so that we don't break the code of users who have their own subclasses of + * PDFTextStripper. It replaces the mostly empty implementation of showGlyph() in PDFStreamEngine + * with a heuristic implementation which is backwards compatible. + *

+ * DO NOT USE THIS CODE UNLESS YOU ARE WORKING WITH PDFTextStripper. + * THIS CODE IS DELIBERATELY INCORRECT, USE PDFStreamEngine INSTEAD. + */ +@SuppressWarnings({"PMD", "checkstyle:all"}) +class LegacyPDFStreamEngine extends PDFStreamEngine { + + private static final Log LOG = LogFactory.getLog(LegacyPDFStreamEngine.class); + + private int pageRotation; + private PDRectangle pageSize; + private Matrix translateMatrix; + private final GlyphList glyphList; + private final Map fontHeightMap = new WeakHashMap(); + + + /** + * Constructor. + */ + LegacyPDFStreamEngine() throws IOException { + + addOperator(new BeginText()); + addOperator(new Concatenate()); + addOperator(new DrawObject()); // special text version + addOperator(new EndText()); + addOperator(new SetGraphicsStateParameters()); + addOperator(new Save()); + addOperator(new Restore()); + addOperator(new NextLine()); + addOperator(new SetCharSpacing()); + addOperator(new MoveText()); + addOperator(new MoveTextSetLeading()); + addOperator(new SetFontAndSize()); + addOperator(new ShowText()); + addOperator(new ShowTextAdjusted()); + addOperator(new SetTextLeading()); + addOperator(new SetMatrix()); + addOperator(new SetTextRenderingMode()); + addOperator(new SetTextRise()); + addOperator(new SetWordSpacing()); + addOperator(new SetTextHorizontalScaling()); + addOperator(new ShowTextLine()); + addOperator(new ShowTextLineAndSpace()); + + // load additional glyph list for Unicode mapping + String path = "/org/apache/pdfbox/resources/glyphlist/additional.txt"; + InputStream input = GlyphList.class.getResourceAsStream(path); + glyphList = new GlyphList(GlyphList.getAdobeGlyphList(), input); + } + + + /** + * This will initialize and process the contents of the stream. + * + * @param page the page to process + * @throws IOException if there is an error accessing the stream. 
+ */ + @Override + public void processPage(PDPage page) throws IOException { + + this.pageRotation = page.getRotation(); + this.pageSize = page.getCropBox(); + + if (pageSize.getLowerLeftX() == 0 && pageSize.getLowerLeftY() == 0) { + translateMatrix = null; + } else { + // translation matrix for cropbox + translateMatrix = Matrix.getTranslateInstance(-pageSize.getLowerLeftX(), -pageSize.getLowerLeftY()); + } + super.processPage(page); + } + + + /** + * Called when a glyph is to be processed. The heuristic calculations here were originally + * written by Ben Litchfield for PDFStreamEngine. + */ + @Override + protected void showGlyph(Matrix textRenderingMatrix, PDFont font, int code,Vector displacement) throws IOException { + // + // legacy calculations which were previously in PDFStreamEngine + // + // DO NOT USE THIS CODE UNLESS YOU ARE WORKING WITH PDFTextStripper. + // THIS CODE IS DELIBERATELY INCORRECT + // + + PDGraphicsState state = getGraphicsState(); + Matrix ctm = state.getCurrentTransformationMatrix(); + float fontSize = state.getTextState().getFontSize(); + float horizontalScaling = state.getTextState().getHorizontalScaling() / 100f; + Matrix textMatrix = getTextMatrix(); + + float displacementX = displacement.getX(); + // the sorting algorithm is based on the width of the character. 
As the displacement + // for vertical characters doesn't provide any suitable value for it, we have to + // calculate our own + if (font.isVertical()) { + displacementX = font.getWidth(code) / 1000; + // there may be an additional scaling factor for true type fonts + TrueTypeFont ttf = null; + if (font instanceof PDTrueTypeFont) { + ttf = ((PDTrueTypeFont) font).getTrueTypeFont(); + } else if (font instanceof PDType0Font) { + PDCIDFont cidFont = ((PDType0Font) font).getDescendantFont(); + if (cidFont instanceof PDCIDFontType2) { + ttf = ((PDCIDFontType2) cidFont).getTrueTypeFont(); + } + } + if (ttf != null && ttf.getUnitsPerEm() != 1000) { + displacementX *= 1000f / ttf.getUnitsPerEm(); + } + } + + // + // legacy calculations which were previously in PDFStreamEngine + // + // DO NOT USE THIS CODE UNLESS YOU ARE WORKING WITH PDFTextStripper. + // THIS CODE IS DELIBERATELY INCORRECT + // + + // (modified) combined displacement, this is calculated *without* taking the character + // spacing and word spacing into account, due to legacy code in TextStripper + float tx = displacementX * fontSize * horizontalScaling; + float ty = displacement.getY() * fontSize; + + // (modified) combined displacement matrix + Matrix td = Matrix.getTranslateInstance(tx, ty); + + // (modified) text rendering matrix + Matrix nextTextRenderingMatrix = td.multiply(textMatrix).multiply(ctm); // text space -> device space + float nextX = nextTextRenderingMatrix.getTranslateX(); + float nextY = nextTextRenderingMatrix.getTranslateY(); + + // (modified) width and height calculations + float dxDisplay = nextX - textRenderingMatrix.getTranslateX(); + Float fontHeight = fontHeightMap.get(font.getCOSObject()); + if (fontHeight == null) { + fontHeight = computeFontHeight(font); + fontHeightMap.put(font.getCOSObject(), fontHeight); + } + float dyDisplay = fontHeight * textRenderingMatrix.getScalingFactorY(); + + // + // start of the original method + // + + // Note on variable names. 
There are three different units being used in this code. + // Character sizes are given in glyph units, text locations are initially given in text + // units, and we want to save the data in display units. The variable names should end with + // Text or Disp to represent if the values are in text or disp units (no glyph units are + // saved). + + float glyphSpaceToTextSpaceFactor = 1 / 1000f; + if (font instanceof PDType3Font) { + glyphSpaceToTextSpaceFactor = font.getFontMatrix().getScaleX(); + } + + float spaceWidthText = 0; + try { + // to avoid crash as described in PDFBOX-614, see what the space displacement should be + spaceWidthText = font.getSpaceWidth() * glyphSpaceToTextSpaceFactor; + } catch (Throwable exception) { + LOG.warn(exception, exception); + } + + if (spaceWidthText == 0) { + spaceWidthText = font.getAverageFontWidth() * glyphSpaceToTextSpaceFactor; + // the average space width appears to be higher than necessary so make it smaller + spaceWidthText *= .80f; + } + if (spaceWidthText == 0) { + spaceWidthText = 1.0f; // if could not find font, use a generic value + } + + // the space width has to be transformed into display units + float spaceWidthDisplay = spaceWidthText * textRenderingMatrix.getScalingFactorX(); + + // use our additional glyph list for Unicode mapping + String unicodeMapping = font.toUnicode(code, glyphList); + + // when there is no Unicode mapping available, Acrobat simply coerces the character code + // into Unicode, so we do the same. Subclasses of PDFStreamEngine don't necessarily want + // this, which is why we leave it until this point in PDFTextStreamEngine. + if (unicodeMapping == null) { + if (font instanceof PDSimpleFont) { + char c = (char) code; + unicodeMapping = new String(new char[]{c}); + } else { + // Acrobat doesn't seem to coerce composite font's character codes, instead it + // skips them. See the "allah2.pdf" TestTextStripper file. 
+ return; + } + } + + // adjust for cropbox if needed + Matrix translatedTextRenderingMatrix; + if (translateMatrix == null) { + translatedTextRenderingMatrix = textRenderingMatrix; + } else { + translatedTextRenderingMatrix = Matrix.concatenate(translateMatrix, textRenderingMatrix); + nextX -= pageSize.getLowerLeftX(); + nextY -= pageSize.getLowerLeftY(); + } + + // This is a hack for unicode letter with 2 chars e.g. RA see unicodeProblem.pdf + if (unicodeMapping.length() == 2) { + processTextPosition(new TextPosition(pageRotation, + pageSize.getWidth(), + pageSize.getHeight(), + translatedTextRenderingMatrix, + nextX, + nextY, + Math.abs(dyDisplay), + dxDisplay, + Math.abs(spaceWidthDisplay), + Character.toString(unicodeMapping.charAt(0)), + new int[]{code}, + font, + fontSize, + (int) (fontSize * textMatrix.getScalingFactorX()))); + processTextPosition(new TextPosition(pageRotation, + pageSize.getWidth(), + pageSize.getHeight(), + translatedTextRenderingMatrix, + nextX, + nextY, + Math.abs(dyDisplay), + dxDisplay, + Math.abs(spaceWidthDisplay), + Character.toString(unicodeMapping.charAt(1)), + new int[]{code}, + font, + fontSize, + (int) (fontSize * textMatrix.getScalingFactorX()))); + } else { + + processTextPosition(new TextPosition(pageRotation, + pageSize.getWidth(), + pageSize.getHeight(), + translatedTextRenderingMatrix, + nextX, + nextY, + Math.abs(dyDisplay), + dxDisplay, + Math.abs(spaceWidthDisplay), + unicodeMapping, + new int[]{code}, + font, + fontSize, + (int) (fontSize * textMatrix.getScalingFactorX()))); + } + } + + + /** + * Compute the font height. Override this if you want to use own calculations. + * + * @param font the font. + * @return the font height. + * @throws IOException if there is an error while getting the font bounding box. 
+ */ + protected float computeFontHeight(PDFont font) throws IOException { + + BoundingBox bbox = font.getBoundingBox(); + if (bbox.getLowerLeftY() < Short.MIN_VALUE) { + // PDFBOX-2158 and PDFBOX-3130 + // files by Salmat eSolutions / ClibPDF Library + bbox.setLowerLeftY(-(bbox.getLowerLeftY() + 65536)); + } + // 1/2 the bbox is used as the height todo: why? + float glyphHeight = bbox.getHeight() / 2; + + // sometimes the bbox has very high values, but CapHeight is OK + PDFontDescriptor fontDescriptor = font.getFontDescriptor(); + if (fontDescriptor != null) { + float capHeight = fontDescriptor.getCapHeight(); + if (Float.compare(capHeight, 0) != 0 && (capHeight < glyphHeight || Float.compare(glyphHeight, 0) == 0)) { + glyphHeight = capHeight; + } + // PDFBOX-3464, PDFBOX-4480, PDFBOX-4553: + // sometimes even CapHeight has very high value, but Ascent and Descent are ok + float ascent = fontDescriptor.getAscent(); + float descent = fontDescriptor.getDescent(); + if (capHeight > ascent && ascent > 0 && descent < 0 && ((ascent - descent) / 2 < glyphHeight || Float.compare(glyphHeight, 0) == 0)) { + glyphHeight = (ascent - descent) / 2; + } + } + + // transformPoint from glyph space -> text space + float height; + if (font instanceof PDType3Font) { + height = font.getFontMatrix().transformPoint(0, glyphHeight).y; + } else { + height = glyphHeight / 1000; + } + + return height; + } + + + /** + * A method provided as an event interface to allow a subclass to perform some specific + * functionality when text needs to be processed. + * + * @param text The text to be processed. 
+ */ + protected void processTextPosition(TextPosition text) { + // subclasses can override to provide specific functionality + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/parsing/PDFAreaTextStripper.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/parsing/PDFAreaTextStripper.java new file mode 100644 index 0000000..b799434 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/parsing/PDFAreaTextStripper.java @@ -0,0 +1,82 @@ +package com.knecon.fforesight.service.layoutparser.processor.classification.parsing; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.apache.pdfbox.text.PDFTextStripperByArea; +import org.apache.pdfbox.text.TextPosition; + +import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.TextPositionSequence; + +import lombok.Getter; +import lombok.Setter; + +public class PDFAreaTextStripper extends PDFTextStripperByArea { + + @Getter + private List textPositionSequences = new ArrayList<>(); + + @Setter + private int pageNumber; + + + public PDFAreaTextStripper() throws IOException { + + } + + + @Override + public void writeString(String text, List textPositions) throws IOException { + + int startIndex = 0; + for (int i = 0; i <= textPositions.size() - 1; i++) { + + if (i == 0 && (textPositions.get(i).getUnicode().equals(" ") || textPositions.get(i).getUnicode().equals("\u00A0"))) { + startIndex++; + continue; + } + + // Strange but sometimes this is happening, for example: Metolachlor2.pdf + if (i > 0 && textPositions.get(i).getX() < textPositions.get(i - 1).getX()) { + List sublist = textPositions.subList(startIndex, i); + if (!(sublist.isEmpty() || sublist.size() == 1 && 
(sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0")))) { + textPositionSequences.add(new TextPositionSequence(sublist, pageNumber)); + } + startIndex = i; + } + + if (textPositions.get(i).getRotation() == 0 && i > 0 && textPositions.get(i).getX() > textPositions.get(i - 1).getEndX() + 1) { + List sublist = textPositions.subList(startIndex, i); + if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0")))) { + textPositionSequences.add(new TextPositionSequence(sublist, pageNumber)); + } + startIndex = i; + } + + if (i > 0 && (textPositions.get(i).getUnicode().equals(" ") || textPositions.get(i).getUnicode().equals("\u00A0")) && i <= textPositions.size() - 2) { + List sublist = textPositions.subList(startIndex, i); + if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0")))) { + textPositionSequences.add(new TextPositionSequence(sublist, pageNumber)); + } + startIndex = i + 1; + } + } + + List sublist = textPositions.subList(startIndex, textPositions.size()); + if (!sublist.isEmpty() && (sublist.get(sublist.size() - 1).getUnicode().equals(" ") || sublist.get(sublist.size() - 1).getUnicode().equals("\u00A0"))) { + sublist = sublist.subList(0, sublist.size() - 1); + } + if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0")))) { + textPositionSequences.add(new TextPositionSequence(sublist, pageNumber)); + } + super.writeString(text); + } + + + public void clearPositions() { + + textPositionSequences = new ArrayList<>(); + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/parsing/PDFLinesTextStripper.java 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/parsing/PDFLinesTextStripper.java new file mode 100644 index 0000000..ae5c958 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/parsing/PDFLinesTextStripper.java @@ -0,0 +1,335 @@ +package com.knecon.fforesight.service.layoutparser.processor.classification.parsing; + +import java.awt.geom.Point2D; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.List; + +import org.apache.pdfbox.contentstream.operator.Operator; +import org.apache.pdfbox.contentstream.operator.OperatorName; +import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingColor; +import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingColorN; +import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingColorSpace; +import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingDeviceCMYKColor; +import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingDeviceGrayColor; +import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingDeviceRGBColor; +import org.apache.pdfbox.contentstream.operator.color.SetStrokingColor; +import org.apache.pdfbox.contentstream.operator.color.SetStrokingColorN; +import org.apache.pdfbox.contentstream.operator.color.SetStrokingColorSpace; +import org.apache.pdfbox.contentstream.operator.color.SetStrokingDeviceCMYKColor; +import org.apache.pdfbox.contentstream.operator.color.SetStrokingDeviceGrayColor; +import org.apache.pdfbox.contentstream.operator.color.SetStrokingDeviceRGBColor; +import org.apache.pdfbox.contentstream.operator.state.SetFlatness; +import org.apache.pdfbox.contentstream.operator.state.SetLineCapStyle; +import org.apache.pdfbox.contentstream.operator.state.SetLineDashPattern; +import 
org.apache.pdfbox.contentstream.operator.state.SetLineJoinStyle; +import org.apache.pdfbox.contentstream.operator.state.SetLineMiterLimit; +import org.apache.pdfbox.contentstream.operator.state.SetLineWidth; +import org.apache.pdfbox.contentstream.operator.state.SetRenderingIntent; +import org.apache.pdfbox.contentstream.operator.text.SetFontAndSize; +import org.apache.pdfbox.cos.COSBase; +import org.apache.pdfbox.cos.COSNumber; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.PDPage; +import org.apache.pdfbox.text.TextPosition; + +import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.RedTextPosition; +import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.Ruling; + +import lombok.Getter; +import lombok.Setter; +import lombok.extern.slf4j.Slf4j; + +@Slf4j +public class PDFLinesTextStripper extends PDFTextStripper { + + @Getter + private final List textPositionSequences = new ArrayList<>(); + @Getter + private final List rulings = new ArrayList<>(); + private final List graphicsPath = new ArrayList<>(); + @Setter + protected PDPage pdpage; + @Getter + private int minCharWidth; + @Getter + private int maxCharWidth; + @Getter + private int minCharHeight; + @Getter + private int maxCharHeight; + + private float path_x; + private float path_y; + + @Setter + private int pageNumber; + + + public PDFLinesTextStripper() throws IOException { + + super(); + this.addOperator(new SetStrokingColorSpace()); + this.addOperator(new SetNonStrokingColorSpace()); + this.addOperator(new SetLineDashPattern()); + this.addOperator(new SetStrokingDeviceGrayColor()); + this.addOperator(new SetNonStrokingDeviceGrayColor()); + this.addOperator(new SetFlatness()); + this.addOperator(new SetLineJoinStyle()); + this.addOperator(new SetLineCapStyle()); + this.addOperator(new 
SetStrokingDeviceCMYKColor()); + this.addOperator(new SetNonStrokingDeviceCMYKColor()); + this.addOperator(new SetLineMiterLimit()); + this.addOperator(new SetStrokingDeviceRGBColor()); + this.addOperator(new SetNonStrokingDeviceRGBColor()); + this.addOperator(new SetRenderingIntent()); + this.addOperator(new SetStrokingColor()); + this.addOperator(new SetNonStrokingColor()); + this.addOperator(new SetStrokingColorN()); + this.addOperator(new SetNonStrokingColorN()); + this.addOperator(new SetFontAndSize()); + this.addOperator(new SetLineWidth()); + } + + + @Override + protected void processOperator(Operator operator, List arguments) throws IOException { + + String operation = operator.getName(); + + //move + switch (operation) { + case OperatorName.MOVE_TO: + if (arguments.size() == 2) { + Point2D.Float pos = transformPosition(floatValue(arguments.get(0)), floatValue(arguments.get(1))); + path_x = (float) pos.getX(); + path_y = (float) pos.getY(); + } + break; + + //line + case OperatorName.LINE_TO: + if (arguments.size() == 2) { + Point2D.Float pos = transformPosition(floatValue(arguments.get(0)), floatValue(arguments.get(1))); + + // The direction of vertical lines must always be from bottom to top for the table extraction algorithm. 
+ if (pos.getY() > path_y) { + graphicsPath.add(new Ruling(new Point2D.Float(path_x, path_y), new Point2D.Float((float) pos.getX(), (float) pos.getY()))); + } else { + graphicsPath.add(new Ruling(new Point2D.Float(path_x, (float) pos.getY()), new Point2D.Float((float) pos.getX(), path_y))); + } + + path_x = (float) pos.getX(); + path_y = (float) pos.getY(); + } + break; + + //rectangle + case OperatorName.APPEND_RECT: + + if (arguments.size() == 4) { + float x = floatValue(arguments.get(0)); + float y = floatValue(arguments.get(1)); + float width = floatValue(arguments.get(2)); + float height = floatValue(arguments.get(3)); + + Point2D p1 = transformPosition(x, y); + Point2D p2 = transformPosition(x + width, y + height); + + // Horizontal lines + graphicsPath.add(new Ruling(new Point2D.Float((float) p1.getX(), (float) p1.getY()), new Point2D.Float((float) p2.getX(), (float) p1.getY()))); + graphicsPath.add(new Ruling(new Point2D.Float((float) p1.getX(), (float) p2.getY()), new Point2D.Float((float) p2.getX(), (float) p2.getY()))); + + // Vertical lines, direction must always be from bottom to top for the table extraction algorithm. 
+ if (p2.getY() > p1.getY()) { + graphicsPath.add(new Ruling(new Point2D.Float((float) p2.getX(), (float) p1.getY()), new Point2D.Float((float) p2.getX(), (float) p2.getY()))); + } else { + graphicsPath.add(new Ruling(new Point2D.Float((float) p2.getX(), (float) p2.getY()), new Point2D.Float((float) p2.getX(), (float) p1.getY()))); + } + if (p2.getY() > p1.getY()) { + graphicsPath.add(new Ruling(new Point2D.Float((float) p1.getX(), (float) p1.getY()), new Point2D.Float((float) p1.getX(), (float) p2.getY()))); + } else { + graphicsPath.add(new Ruling(new Point2D.Float((float) p1.getX(), (float) p2.getY()), new Point2D.Float((float) p1.getX(), (float) p1.getY()))); + } + } + break; + + //fill + case OperatorName.FILL_NON_ZERO: + case OperatorName.LEGACY_FILL_NON_ZERO: + case OperatorName.FILL_EVEN_ODD: + addVisibleRulings(graphicsPath, false); + graphicsPath.clear(); + break; + + //stroke + case OperatorName.STROKE_PATH: + addVisibleRulings(graphicsPath, true); + graphicsPath.clear(); + break; + + //cancel path + case OperatorName.ENDPATH: + graphicsPath.clear(); + break; + + } + + super.processOperator(operator, arguments); + } + + + private float floatValue(COSBase value) { + + if (value instanceof COSNumber) { + return ((COSNumber) value).floatValue(); + } else { + return 0; + } + } + + + private Point2D.Float transformPosition(float x, float y) { + + return super.transformedPoint(x, y); + } + + + private void addVisibleRulings(List path, boolean stroke) throws IOException { + + try { + if (stroke && !getGraphicsState().getStrokingColor().isPattern() && getGraphicsState().getStrokingColor() + .toRGB() == 0 || !stroke && !getGraphicsState().getNonStrokingColor().isPattern() && getGraphicsState().getNonStrokingColor().toRGB() == 0) { + rulings.addAll(path); + } + } catch (UnsupportedOperationException e) { + log.debug("UnsupportedOperationException: " + getGraphicsState().getStrokingColor().getColorSpace().getName() + " or " + 
getGraphicsState().getNonStrokingColor() + .getColorSpace() + .getName() + " does not support toRGB"); + } + } + + + @Override + public void writeString(String text, List textPositions) throws IOException { + + int startIndex = 0; + RedTextPosition previous = null; + + textPositions.sort(Comparator.comparing(TextPosition::getXDirAdj)); + + for (int i = 0; i <= textPositions.size() - 1; i++) { + + if (!textPositionSequences.isEmpty()) { + previous = textPositionSequences.get(textPositionSequences.size() - 1) + .getTextPositions() + .get(textPositionSequences.get(textPositionSequences.size() - 1).getTextPositions().size() - 1); + } + + int charWidth = (int) textPositions.get(i).getWidthDirAdj(); + if (charWidth < minCharWidth) { + minCharWidth = charWidth; + } + if (charWidth > maxCharWidth) { + maxCharWidth = charWidth; + } + + int charHeight = (int) textPositions.get(i).getHeightDir(); + if (charHeight < minCharHeight) { + minCharHeight = charHeight; + } + if (charWidth > maxCharHeight) { + maxCharHeight = charHeight; + } + + if (i == 0 && (textPositions.get(i).getUnicode().equals(" ") || textPositions.get(i).getUnicode().equals("\u00A0") || textPositions.get(i).getUnicode().equals("\t"))) { + startIndex++; + continue; + } + + // Strange but sometimes this is happening, for example: Metolachlor2.pdf + if (i > 0 && textPositions.get(i).getXDirAdj() < textPositions.get(i - 1).getXDirAdj()) { + List sublist = textPositions.subList(startIndex, i); + if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0) + .getUnicode() + .equals("\t")))) { + textPositionSequences.add(new TextPositionSequence(sublist, pageNumber)); + } + startIndex = i; + } + + if (textPositions.get(i).getRotation() == 0 && i > 0 && textPositions.get(i).getX() > textPositions.get(i - 1).getEndX() + 1) { + List sublist = textPositions.subList(startIndex, i); + if (!(sublist.isEmpty() || sublist.size() == 
1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0) + .getUnicode() + .equals("\t")))) { + textPositionSequences.add(new TextPositionSequence(sublist, pageNumber)); + } + startIndex = i; + } + + if (i > 0 && (textPositions.get(i).getUnicode().equals(" ") || textPositions.get(i).getUnicode().equals("\u00A0") || textPositions.get(i) + .getUnicode() + .equals("\t")) && i <= textPositions.size() - 2) { + List sublist = textPositions.subList(startIndex, i); + if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0) + .getUnicode() + .equals("\t")))) { + + // Remove false sequence ends (whitespaces) + if (previous != null && sublist.get(0).getYDirAdj() == previous.getYDirAdj() && sublist.get(0) + .getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()) < 0.01) { + for (TextPosition textPosition : sublist) { + textPositionSequences.get(textPositionSequences.size() - 1).add(textPosition); + } + } else { + textPositionSequences.add(new TextPositionSequence(sublist, pageNumber)); + } + } + startIndex = i + 1; + } + } + + List sublist = textPositions.subList(startIndex, textPositions.size()); + if (!sublist.isEmpty() && (sublist.get(sublist.size() - 1).getUnicode().equals(" ") || sublist.get(sublist.size() - 1) + .getUnicode() + .equals("\u00A0") || sublist.get(sublist.size() - 1).getUnicode().equals("\t"))) { + sublist = sublist.subList(0, sublist.size() - 1); + } + + if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0) + .getUnicode() + .equals("\t")))) { + if (previous != null && sublist.get(0).getYDirAdj() == previous.getYDirAdj() && sublist.get(0) + .getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()) < 0.01) { + for (TextPosition t : sublist) { + 
textPositionSequences.get(textPositionSequences.size() - 1).add(t); + } + } else { + textPositionSequences.add(new TextPositionSequence(sublist, pageNumber)); + } + } + super.writeString(text); + } + + + @Override + public String getText(PDDocument doc) throws IOException { + + minCharWidth = Integer.MAX_VALUE; + maxCharWidth = 0; + minCharHeight = Integer.MAX_VALUE; + maxCharHeight = 0; + textPositionSequences.clear(); + rulings.clear(); + graphicsPath.clear(); + path_x = 0.0f; + path_y = 0.0f; + + return super.getText(doc); + + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/parsing/PDFTextStripper.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/parsing/PDFTextStripper.java new file mode 100644 index 0000000..de0490b --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/parsing/PDFTextStripper.java @@ -0,0 +1,2067 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.knecon.fforesight.service.layoutparser.processor.classification.parsing; + +import java.io.BufferedInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.LineNumberReader; +import java.io.StringWriter; +import java.io.Writer; +import java.text.Bidi; +import java.text.Normalizer; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.SortedMap; +import java.util.SortedSet; +import java.util.StringTokenizer; +import java.util.TreeMap; +import java.util.TreeSet; +import java.util.regex.Pattern; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.PDPage; +import org.apache.pdfbox.pdmodel.PDPageTree; +import org.apache.pdfbox.pdmodel.common.PDRectangle; +import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem; +import org.apache.pdfbox.pdmodel.interactive.pagenavigation.PDThreadBead; +import org.apache.pdfbox.text.TextPosition; +import org.apache.pdfbox.text.TextPositionComparator; +import org.apache.pdfbox.util.QuickSort; + +/** + * This is just a copy except i only adjusted lines 594-607 cause this is a bug in Pdfbox. 
+ * see S416.pdf + */ +@SuppressWarnings({"PMD", "checkstyle:all"}) +public class PDFTextStripper extends LegacyPDFStreamEngine { + + private static float defaultIndentThreshold = 2.0f; + private static float defaultDropThreshold = 2.5f; + private static final boolean useCustomQuickSort; + + private static final Log LOG = LogFactory.getLog(PDFTextStripper.class); + + // enable the ability to set the default indent/drop thresholds + // with -D system properties: + // pdftextstripper.indent + // pdftextstripper.drop + static { + String strDrop = null, strIndent = null; + try { + String className = PDFTextStripper.class.getSimpleName().toLowerCase(); + String prop = className + ".indent"; + strIndent = System.getProperty(prop); + prop = className + ".drop"; + strDrop = System.getProperty(prop); + } catch (SecurityException e) { + // PDFBOX-1946 when run in an applet + // ignore and use default + } + if (strIndent != null && strIndent.length() > 0) { + try { + defaultIndentThreshold = Float.parseFloat(strIndent); + } catch (NumberFormatException nfe) { + // ignore and use default + } + } + if (strDrop != null && strDrop.length() > 0) { + try { + defaultDropThreshold = Float.parseFloat(strDrop); + } catch (NumberFormatException nfe) { + // ignore and use default + } + } + } + + + static { + // check if we need to use the custom quicksort algorithm as a + // workaround to the PDFBOX-1512 transitivity issue of TextPositionComparator: + boolean is16orLess = false; + try { + String version = System.getProperty("java.specification.version"); + StringTokenizer st = new StringTokenizer(version, "."); + int majorVersion = Integer.parseInt(st.nextToken()); + int minorVersion = 0; + if (st.hasMoreTokens()) { + minorVersion = Integer.parseInt(st.nextToken()); + } + is16orLess = majorVersion == 1 && minorVersion <= 6; + } catch (SecurityException x) { + // when run in an applet ignore and use default + // assume 1.7 or higher so that quicksort is used + } catch 
(NumberFormatException nfe) { + // should never happen, but if it does, + // assume 1.7 or higher so that quicksort is used + } + useCustomQuickSort = !is16orLess; + } + + /** + * The platform's line separator. + */ + protected final String LINE_SEPARATOR = System.getProperty("line.separator"); + + private String lineSeparator = LINE_SEPARATOR; + private String wordSeparator = " "; + private String paragraphStart = ""; + private String paragraphEnd = ""; + private String pageStart = ""; + private String pageEnd = LINE_SEPARATOR; + private String articleStart = ""; + private String articleEnd = ""; + + private int currentPageNo = 0; + private int startPage = 1; + private int endPage = Integer.MAX_VALUE; + private PDOutlineItem startBookmark = null; + + // 1-based bookmark pages + private int startBookmarkPageNumber = -1; + private int endBookmarkPageNumber = -1; + + private PDOutlineItem endBookmark = null; + private boolean suppressDuplicateOverlappingText = true; + private boolean shouldSeparateByBeads = true; + private boolean sortByPosition = false; + private boolean addMoreFormatting = false; + + private float indentThreshold = defaultIndentThreshold; + private float dropThreshold = defaultDropThreshold; + + // we will need to estimate where to add spaces, these are used to help guess + private float spacingTolerance = .5f; + private float averageCharTolerance = .3f; + + private List beadRectangles = null; + + /** + * The charactersByArticle is used to extract text by article divisions. For example a PDF that has two columns like + * a newspaper, we want to extract the first column and then the second column. In this example the PDF would have 2 + * beads(or articles), one for each column. The size of the charactersByArticle would be 5, because not all text on + * the screen will fall into one of the articles. The five divisions are shown below + *

+ * Text before first article + * first article text + * text between first article and second article + * second article text + * text after second article + *

+ * Most PDFs won't have any beads, so charactersByArticle will contain a single entry. + */ + protected ArrayList> charactersByArticle = new ArrayList<>(); + + private Map>> characterListMapping = new HashMap<>(); + + protected PDDocument document; + protected Writer output; + + /** + * True if we started a paragraph but haven't ended it yet. + */ + private boolean inParagraph; + + + /** + * Instantiate a new PDFTextStripper object. + * + * @throws IOException If there is an error loading the properties. + */ + public PDFTextStripper() throws IOException { + + } + + + /** + * This will return the text of a document. See writeText.
+ * NOTE: The document must not be encrypted when coming into this method. + * + *

IMPORTANT: By default, text extraction is done in the same sequence as the text in the PDF page content stream. + * PDF is a graphic format, not a text format, and unlike HTML, it has no requirements that text one on page + * be rendered in a certain order. The order is the one that was determined by the software that created the + * PDF. To get text sorted from left to right and top to botton, use {@link #setSortByPosition(boolean)}. + * + * @param doc The document to get the text from. + * @return The text of the PDF document. + * @throws IOException if the doc state is invalid or it is encrypted. + */ + public String getText(PDDocument doc) throws IOException { + + StringWriter outputStream = new StringWriter(); + writeText(doc, outputStream); + return outputStream.toString(); + } + + + private void resetEngine() { + + currentPageNo = 0; + document = null; + if (charactersByArticle != null) { + charactersByArticle.clear(); + } + characterListMapping.clear(); + } + + + /** + * This will take a PDDocument and write the text of that document to the print writer. + * + * @param doc The document to get the data from. + * @param outputStream The location to put the text. + * @throws IOException If the doc is in an invalid state. + */ + public void writeText(PDDocument doc, Writer outputStream) throws IOException { + + resetEngine(); + document = doc; + output = outputStream; + if (getAddMoreFormatting()) { + paragraphEnd = lineSeparator; + pageStart = lineSeparator; + articleStart = lineSeparator; + articleEnd = lineSeparator; + } + startDocument(document); + processPages(document.getPages()); + endDocument(document); + } + + + /** + * This will process all of the pages and the text that is in them. + * + * @param pages The pages object in the document. + * @throws IOException If there is an error parsing the text. + */ + protected void processPages(PDPageTree pages) throws IOException { + + PDPage startBookmarkPage = startBookmark == null ? 
null : startBookmark.findDestinationPage(document); + if (startBookmarkPage != null) { + startBookmarkPageNumber = pages.indexOf(startBookmarkPage) + 1; + } else { + // -1 = undefined + startBookmarkPageNumber = -1; + } + + PDPage endBookmarkPage = endBookmark == null ? null : endBookmark.findDestinationPage(document); + if (endBookmarkPage != null) { + endBookmarkPageNumber = pages.indexOf(endBookmarkPage) + 1; + } else { + // -1 = undefined + endBookmarkPageNumber = -1; + } + + if (startBookmarkPageNumber == -1 && startBookmark != null && endBookmarkPageNumber == -1 && endBookmark != null && startBookmark.getCOSObject() == endBookmark.getCOSObject()) { + // this is a special case where both the start and end bookmark + // are the same but point to nothing. In this case + // we will not extract any text. + startBookmarkPageNumber = 0; + endBookmarkPageNumber = 0; + } + + for (PDPage page : pages) { + currentPageNo++; + if (page.hasContents()) { + processPage(page); + } + } + } + + + /** + * This method is available for subclasses of this class. It will be called before processing of the document start. + * + * @param document The PDF document that is being processed. + */ + protected void startDocument(PDDocument document) { + // no default implementation, but available for subclasses + } + + + /** + * This method is available for subclasses of this class. It will be called after processing of the document + * finishes. + * + * @param document The PDF document that is being processed. + */ + protected void endDocument(PDDocument document) { + // no default implementation, but available for subclasses + } + + + /** + * This will process the contents of a page. + * + * @param page The page to process. + * @throws IOException If there is an error processing the page. 
+ */ + @Override + public void processPage(PDPage page) throws IOException { + + if (currentPageNo >= startPage && currentPageNo <= endPage && (startBookmarkPageNumber == -1 || currentPageNo >= startBookmarkPageNumber) && (endBookmarkPageNumber == -1 || currentPageNo <= endBookmarkPageNumber)) { + startPage(page); + + int numberOfArticleSections = 1; + if (shouldSeparateByBeads) { + fillBeadRectangles(page); + numberOfArticleSections += beadRectangles.size() * 2; + } + int originalSize = charactersByArticle.size(); + charactersByArticle.ensureCapacity(numberOfArticleSections); + int lastIndex = Math.max(numberOfArticleSections, originalSize); + for (int i = 0; i < lastIndex; i++) { + if (i < originalSize) { + charactersByArticle.get(i).clear(); + } else { + if (numberOfArticleSections < originalSize) { + charactersByArticle.remove(i); + } else { + charactersByArticle.add(new ArrayList<>()); + } + } + } + characterListMapping.clear(); + super.processPage(page); + writePage(); + endPage(page); + } + } + + + private void fillBeadRectangles(PDPage page) { + + beadRectangles = new ArrayList<>(); + for (PDThreadBead bead : page.getThreadBeads()) { + if (bead == null || bead.getRectangle() == null) { + // can't skip, because of null entry handling in processTextPosition() + beadRectangles.add(null); + continue; + } + + PDRectangle rect = bead.getRectangle(); + + // bead rectangle is in PDF coordinates (y=0 is bottom), + // glyphs are in image coordinates (y=0 is top), + // so we must flip + PDRectangle mediaBox = page.getMediaBox(); + float upperRightY = mediaBox.getUpperRightY() - rect.getLowerLeftY(); + float lowerLeftY = mediaBox.getUpperRightY() - rect.getUpperRightY(); + rect.setLowerLeftY(lowerLeftY); + rect.setUpperRightY(upperRightY); + + // adjust for cropbox + PDRectangle cropBox = page.getCropBox(); + if (cropBox.getLowerLeftX() != 0 || cropBox.getLowerLeftY() != 0) { + rect.setLowerLeftX(rect.getLowerLeftX() - cropBox.getLowerLeftX()); + 
rect.setLowerLeftY(rect.getLowerLeftY() - cropBox.getLowerLeftY()); + rect.setUpperRightX(rect.getUpperRightX() - cropBox.getLowerLeftX()); + rect.setUpperRightY(rect.getUpperRightY() - cropBox.getLowerLeftY()); + } + + beadRectangles.add(rect); + } + } + + + /** + * Start a new article, which is typically defined as a column on a single page (also referred to as a bead). This + * assumes that the primary direction of text is left to right. Default implementation is to do nothing. Subclasses + * may provide additional information. + * + * @throws IOException If there is any error writing to the stream. + */ + protected void startArticle() throws IOException { + + startArticle(true); + } + + + /** + * Start a new article, which is typically defined as a column on a single page (also referred to as a bead). + * Default implementation is to do nothing. Subclasses may provide additional information. + * + * @param isLTR true if primary direction of text is left to right. + * @throws IOException If there is any error writing to the stream. + */ + protected void startArticle(boolean isLTR) throws IOException { + + output.write(getArticleStart()); + } + + + /** + * End an article. Default implementation is to do nothing. Subclasses may provide additional information. + * + * @throws IOException If there is any error writing to the stream. + */ + protected void endArticle() throws IOException { + + output.write(getArticleEnd()); + } + + + /** + * Start a new page. Default implementation is to do nothing. Subclasses may provide additional information. + * + * @param page The page we are about to process. + */ + protected void startPage(PDPage page) { + // default is to do nothing + } + + + /** + * End a page. Default implementation is to do nothing. Subclasses may provide additional information. + * + * @param page The page we are about to process. 
+ */ + protected void endPage(PDPage page) { + // default is to do nothing + } + + + private static final float END_OF_LAST_TEXT_X_RESET_VALUE = -1; + private static final float MAX_Y_FOR_LINE_RESET_VALUE = -Float.MAX_VALUE; + private static final float EXPECTED_START_OF_NEXT_WORD_X_RESET_VALUE = -Float.MAX_VALUE; + private static final float MAX_HEIGHT_FOR_LINE_RESET_VALUE = -1; + private static final float MIN_Y_TOP_FOR_LINE_RESET_VALUE = Float.MAX_VALUE; + private static final float LAST_WORD_SPACING_RESET_VALUE = -1; + + + /** + * This will print the text of the processed page to "output". It will estimate, based on the coordinates of the + * text, where newlines and word spacings should be placed. The text will be sorted only if that feature was + * enabled. + * + * @throws IOException If there is an error writing the text. + */ + protected void writePage() throws IOException { + + float maxYForLine = MAX_Y_FOR_LINE_RESET_VALUE; + float minYTopForLine = MIN_Y_TOP_FOR_LINE_RESET_VALUE; + float endOfLastTextX = END_OF_LAST_TEXT_X_RESET_VALUE; + float lastWordSpacing = LAST_WORD_SPACING_RESET_VALUE; + float maxHeightForLine = MAX_HEIGHT_FOR_LINE_RESET_VALUE; + PositionWrapper lastPosition = null; + PositionWrapper lastLineStartPosition = null; + + boolean startOfPage = true; // flag to indicate start of page + boolean startOfArticle; + if (!charactersByArticle.isEmpty()) { + writePageStart(); + } + + for (List textList : charactersByArticle) { + if (getSortByPosition()) { + TextPositionComparator comparator = new TextPositionComparator(); + + // because the TextPositionComparator is not transitive, but + // JDK7+ enforces transitivity on comparators, we need to use + // a custom quicksort implementation (which is slower, unfortunately). + if (useCustomQuickSort) { + QuickSort.sort(textList, comparator); + } else { + Collections.sort(textList, comparator); + } + } + + startArticle(); + startOfArticle = true; + + // Now cycle through to print the text. 
+ // We queue up a line at a time before we print so that we can convert + // the line from presentation form to logical form (if needed). + List line = new ArrayList<>(); + + Iterator textIter = textList.iterator(); + // PDF files don't always store spaces. We will need to guess where we should add + // spaces based on the distances between TextPositions. Historically, this was done + // based on the size of the space character provided by the font. In general, this + // worked but there were cases where it did not work. Calculating the average character + // width and using that as a metric works better in some cases but fails in some cases + // where the spacing worked. So we use both. NOTE: Adobe reader also fails on some of + // these examples. + + // Keeps track of the previous average character width + float previousAveCharWidth = -1; + while (textIter.hasNext()) { + TextPosition position = textIter.next(); + PositionWrapper current = new PositionWrapper(position); + String characterValue = position.getUnicode(); + + // Resets the average character width when we see a change in font + // or a change in the font size + if (lastPosition != null && (position.getFont() != lastPosition.getTextPosition().getFont() || position.getFontSize() != lastPosition.getTextPosition() + .getFontSize())) { + previousAveCharWidth = -1; + } + + float positionX; + float positionY; + float positionWidth; + float positionHeight; + + // If we are sorting, then we need to use the text direction + // adjusted coordinates, because they were used in the sorting. 
+// if (getSortByPosition()) +// { + positionX = position.getXDirAdj(); + positionY = position.getYDirAdj(); + positionWidth = position.getWidthDirAdj(); + positionHeight = position.getHeightDir(); +// } +// else +// { +// positionX = position.getX(); +// positionY = position.getY(); +// positionWidth = position.getWidth(); +// positionHeight = position.getHeight(); +// } + + // The current amount of characters in a word + int wordCharCount = position.getIndividualWidths().length; + + // Estimate the expected width of the space based on the + // space character with some margin. + float wordSpacing = position.getWidthOfSpace(); + float deltaSpace; + if (wordSpacing == 0 || Float.isNaN(wordSpacing)) { + deltaSpace = Float.MAX_VALUE; + } else { + if (lastWordSpacing < 0) { + deltaSpace = wordSpacing * getSpacingTolerance(); + } else { + deltaSpace = (wordSpacing + lastWordSpacing) / 2f * getSpacingTolerance(); + } + } + + // Estimate the expected width of the space based on the average character width + // with some margin. This calculation does not make a true average (average of + // averages) but we found that it gave the best results after numerous experiments. + // Based on experiments we also found that .3 worked well. + float averageCharWidth; + if (previousAveCharWidth < 0) { + averageCharWidth = positionWidth / wordCharCount; + } else { + averageCharWidth = (previousAveCharWidth + positionWidth / wordCharCount) / 2f; + } + float deltaCharWidth = averageCharWidth * getAverageCharTolerance(); + + // Compares the values obtained by the average method and the wordSpacing method + // and picks the smaller number. 
+ float expectedStartOfNextWordX = EXPECTED_START_OF_NEXT_WORD_X_RESET_VALUE; + if (endOfLastTextX != END_OF_LAST_TEXT_X_RESET_VALUE) { + expectedStartOfNextWordX = endOfLastTextX + Math.min(deltaSpace, deltaCharWidth); + } + + if (lastPosition != null) { + if (startOfArticle) { + lastPosition.setArticleStart(); + startOfArticle = false; + } + // RDD - Here we determine whether this text object is on the current + // line. We use the lastBaselineFontSize to handle the superscript + // case, and the size of the current font to handle the subscript case. + // Text must overlap with the last rendered baseline text by at least + // a small amount in order to be considered as being on the same line. + + // XXX BC: In theory, this check should really check if the next char is in + // full range seen in this line. This is what I tried to do with minYTopForLine, + // but this caused a lot of regression test failures. So, I'm leaving it be for + // now + if (!overlap(positionY, positionHeight, maxYForLine, maxHeightForLine)) { + writeLine(normalize(line)); + line.clear(); + lastLineStartPosition = handleLineSeparation(current, lastPosition, lastLineStartPosition, maxHeightForLine); + expectedStartOfNextWordX = EXPECTED_START_OF_NEXT_WORD_X_RESET_VALUE; + maxYForLine = MAX_Y_FOR_LINE_RESET_VALUE; + maxHeightForLine = MAX_HEIGHT_FOR_LINE_RESET_VALUE; + minYTopForLine = MIN_Y_TOP_FOR_LINE_RESET_VALUE; + } + // test if our TextPosition starts after a new word would be expected to start + if (expectedStartOfNextWordX != EXPECTED_START_OF_NEXT_WORD_X_RESET_VALUE && expectedStartOfNextWordX < positionX + // only bother adding a word separator if the last character was not a word separator + && (wordSeparator.isEmpty() || // + (lastPosition.getTextPosition().getUnicode() != null && !lastPosition.getTextPosition().getUnicode().endsWith(wordSeparator)))) { + line.add(LineItem.getWordSeparator()); + } + // if there is at least the equivalent of one space + // between the last 
character and the current one, + // reset the max line height as the font size may have completely changed + if (Math.abs(position.getX() - lastPosition.getTextPosition().getX()) > (wordSpacing + deltaSpace)) { + maxYForLine = MAX_Y_FOR_LINE_RESET_VALUE; + maxHeightForLine = MAX_HEIGHT_FOR_LINE_RESET_VALUE; + minYTopForLine = MIN_Y_TOP_FOR_LINE_RESET_VALUE; + } + } + if (positionY >= maxYForLine) { + maxYForLine = positionY; + } + // RDD - endX is what PDF considers to be the x coordinate of the + // end position of the text. We use it in computing our metrics below. + endOfLastTextX = positionX + positionWidth; + + // add it to the list + if (characterValue != null) { + if (startOfPage && lastPosition == null) { + writeParagraphStart();// not sure this is correct for RTL? + } + line.add(new LineItem(position)); + } + maxHeightForLine = Math.max(maxHeightForLine, positionHeight); + minYTopForLine = Math.min(minYTopForLine, positionY - positionHeight); + lastPosition = current; + if (startOfPage) { + lastPosition.setParagraphStart(); + lastPosition.setLineStart(); + lastLineStartPosition = lastPosition; + startOfPage = false; + } + lastWordSpacing = wordSpacing; + previousAveCharWidth = averageCharWidth; + } + // print the final line + if (line.size() > 0) { + writeLine(normalize(line)); + writeParagraphEnd(); + } + endArticle(); + } + writePageEnd(); + } + + + private boolean overlap(float y1, float height1, float y2, float height2) { + + return within(y1, y2, .1f) || y2 <= y1 && y2 >= y1 - height1 || y1 <= y2 && y1 >= y2 - height2; + } + + + /** + * Write the line separator value to the output stream. + * + * @throws IOException If there is a problem writing out the line separator to the document. + */ + protected void writeLineSeparator() throws IOException { + + output.write(getLineSeparator()); + } + + + /** + * Write the word separator value to the output stream. + * + * @throws IOException If there is a problem writing out the word separator to the document. 
+ */ + protected void writeWordSeparator() throws IOException { + + output.write(getWordSeparator()); + } + + + /** + * Write the string in TextPosition to the output stream. + * + * @param text The text to write to the stream. + * @throws IOException If there is an error when writing the text. + */ + protected void writeCharacters(TextPosition text) throws IOException { + + output.write(text.getUnicode()); + } + + + /** + * Write a Java string to the output stream. The default implementation will ignore the textPositions + * and just calls {@link #writeString(String)}. + * + * @param text The text to write to the stream. + * @param textPositions The TextPositions belonging to the text. + * @throws IOException If there is an error when writing the text. + */ + protected void writeString(String text, List textPositions) throws IOException { + + writeString(text); + } + + + /** + * Write a Java string to the output stream. + * + * @param text The text to write to the stream. + * @throws IOException If there is an error when writing the text. + */ + protected void writeString(String text) throws IOException { + + output.write(text); + } + + + /** + * This will determine of two floating point numbers are within a specified variance. + * + * @param first The first number to compare to. + * @param second The second number to compare to. + * @param variance The allowed variance. + */ + private boolean within(float first, float second, float variance) { + + return second < first + variance && second > first - variance; + } + + + /** + * This will process a TextPosition object and add the text to the list of characters on a page. It takes care of + * overlapping text. + * + * @param text The text to process. 
+ */ + @Override + protected void processTextPosition(TextPosition text) { + + boolean showCharacter = true; + if (suppressDuplicateOverlappingText) { + showCharacter = false; + String textCharacter = text.getUnicode(); + float textX = text.getX(); + float textY = text.getY(); + TreeMap> sameTextCharacters = characterListMapping.get(textCharacter); + if (sameTextCharacters == null) { + sameTextCharacters = new TreeMap>(); + characterListMapping.put(textCharacter, sameTextCharacters); + } + // RDD - Here we compute the value that represents the end of the rendered + // text. This value is used to determine whether subsequent text rendered + // on the same line overwrites the current text. + // + // We subtract any positive padding to handle cases where extreme amounts + // of padding are applied, then backed off (not sure why this is done, but there + // are cases where the padding is on the order of 10x the character width, and + // the TJ just backs up to compensate after each character). Also, we subtract + // an amount to allow for kerning (a percentage of the width of the last + // character). 
+ boolean suppressCharacter = false; + float tolerance = text.getWidth() / textCharacter.length() / 3.0f; + + SortedMap> xMatches = sameTextCharacters.subMap(textX - tolerance, textX + tolerance); + for (TreeSet xMatch : xMatches.values()) { + SortedSet yMatches = xMatch.subSet(textY - tolerance, textY + tolerance); + if (!yMatches.isEmpty()) { + suppressCharacter = true; + break; + } + } + if (!suppressCharacter) { + TreeSet ySet = sameTextCharacters.get(textX); + if (ySet == null) { + ySet = new TreeSet(); + sameTextCharacters.put(textX, ySet); + } + ySet.add(textY); + showCharacter = true; + } + } + if (showCharacter) { + // if we are showing the character then we need to determine which article it belongs to + int foundArticleDivisionIndex = -1; + int notFoundButFirstLeftAndAboveArticleDivisionIndex = -1; + int notFoundButFirstLeftArticleDivisionIndex = -1; + int notFoundButFirstAboveArticleDivisionIndex = -1; + float x = text.getX(); + float y = text.getY(); + if (shouldSeparateByBeads) { + for (int i = 0; i < beadRectangles.size() && foundArticleDivisionIndex == -1; i++) { + PDRectangle rect = beadRectangles.get(i); + if (rect != null) { + if (rect.contains(x, y)) { + foundArticleDivisionIndex = i * 2 + 1; + } else if ((x < rect.getLowerLeftX() || y < rect.getUpperRightY()) && notFoundButFirstLeftAndAboveArticleDivisionIndex == -1) { + notFoundButFirstLeftAndAboveArticleDivisionIndex = i * 2; + } else if (x < rect.getLowerLeftX() && notFoundButFirstLeftArticleDivisionIndex == -1) { + notFoundButFirstLeftArticleDivisionIndex = i * 2; + } else if (y < rect.getUpperRightY() && notFoundButFirstAboveArticleDivisionIndex == -1) { + notFoundButFirstAboveArticleDivisionIndex = i * 2; + } + } else { + foundArticleDivisionIndex = 0; + } + } + } else { + foundArticleDivisionIndex = 0; + } + int articleDivisionIndex; + if (foundArticleDivisionIndex != -1) { + articleDivisionIndex = foundArticleDivisionIndex; + } else if (notFoundButFirstLeftAndAboveArticleDivisionIndex 
!= -1) { + articleDivisionIndex = notFoundButFirstLeftAndAboveArticleDivisionIndex; + } else if (notFoundButFirstLeftArticleDivisionIndex != -1) { + articleDivisionIndex = notFoundButFirstLeftArticleDivisionIndex; + } else if (notFoundButFirstAboveArticleDivisionIndex != -1) { + articleDivisionIndex = notFoundButFirstAboveArticleDivisionIndex; + } else { + articleDivisionIndex = charactersByArticle.size() - 1; + } + + List textList = charactersByArticle.get(articleDivisionIndex); + + // In the wild, some PDF encoded documents put diacritics (accents on + // top of characters) into a separate Tj element. When displaying them + // graphically, the two chunks get overlaid. With text output though, + // we need to do the overlay. This code recombines the diacritic with + // its associated character if the two are consecutive. + if (textList.isEmpty()) { + textList.add(text); + } else { + // test if we overlap the previous entry. + // Note that we are making an assumption that we need to only look back + // one TextPosition to find what we are overlapping. + // This may not always be true. */ + TextPosition previousTextPosition = textList.get(textList.size() - 1); + if (text.isDiacritic() && previousTextPosition.contains(text)) { + previousTextPosition.mergeDiacritic(text); + } + // If the previous TextPosition was the diacritic, merge it into this + // one and remove it from the list. + else if (previousTextPosition.isDiacritic() && text.contains(previousTextPosition)) { + text.mergeDiacritic(previousTextPosition); + textList.remove(textList.size() - 1); + textList.add(text); + } else { + textList.add(text); + } + } + } + } + + + /** + * This is the page that the text extraction will start on. The pages start at page 1. For example in a 5 page PDF + * document, if the start page is 1 then all pages will be extracted. If the start page is 4 then pages 4 and 5 will + * be extracted. The default value is 1. + * + * @return Value of property startPage. 
+ */ + public int getStartPage() { + + return startPage; + } + + + /** + * This will set the first page to be extracted by this class. + * + * @param startPageValue New value of 1-based startPage property. + */ + public void setStartPage(int startPageValue) { + + startPage = startPageValue; + } + + + /** + * This will get the last page that will be extracted. This is inclusive, for example if a 5 page PDF an endPage + * value of 5 would extract the entire document, an end page of 2 would extract pages 1 and 2. This defaults to + * Integer.MAX_VALUE such that all pages of the pdf will be extracted. + * + * @return Value of property endPage. + */ + public int getEndPage() { + + return endPage; + } + + + /** + * This will set the last page to be extracted by this class. + * + * @param endPageValue New value of 1-based endPage property. + */ + public void setEndPage(int endPageValue) { + + endPage = endPageValue; + } + + + /** + * Set the desired line separator for output text. The line.separator system property is used if the line separator + * preference is not set explicitly using this method. + * + * @param separator The desired line separator string. + */ + public void setLineSeparator(String separator) { + + lineSeparator = separator; + } + + + /** + * This will get the line separator. + * + * @return The desired line separator string. + */ + public String getLineSeparator() { + + return lineSeparator; + } + + + /** + * This will get the word separator. + * + * @return The desired word separator string. + */ + public String getWordSeparator() { + + return wordSeparator; + } + + + /** + * Set the desired word separator for output text. The PDFBox text extraction algorithm will output a space + * character if there is enough space between two words. By default a space character is used. If you need and + * accurate count of characters that are found in a PDF document then you might want to set the word separator to + * the empty string. 
+ * + * @param separator The desired page separator string. + */ + public void setWordSeparator(String separator) { + + wordSeparator = separator; + } + + + /** + * @return Returns the suppressDuplicateOverlappingText. + */ + public boolean getSuppressDuplicateOverlappingText() { + + return suppressDuplicateOverlappingText; + } + + + /** + * Get the current page number that is being processed. + * + * @return A 1 based number representing the current page. + */ + protected int getCurrentPageNo() { + + return currentPageNo; + } + + + /** + * The output stream that is being written to. + * + * @return The stream that output is being written to. + */ + protected Writer getOutput() { + + return output; + } + + + /** + * Character strings are grouped by articles. It is quite common that there will only be a single article. This + * returns a List that contains List objects, the inner lists will contain TextPosition objects. + * + * @return A double List of TextPositions for all text strings on the page. + */ + protected List> getCharactersByArticle() { + + return charactersByArticle; + } + + + /** + * By default the text stripper will attempt to remove text that overlapps each other. Word paints the same + * character several times in order to make it look bold. By setting this to false all text will be extracted, which + * means that certain sections will be duplicated, but better performance will be noticed. + * + * @param suppressDuplicateOverlappingTextValue The suppressDuplicateOverlappingText to set. + */ + public void setSuppressDuplicateOverlappingText(boolean suppressDuplicateOverlappingTextValue) { + + suppressDuplicateOverlappingText = suppressDuplicateOverlappingTextValue; + } + + + /** + * This will tell if the text stripper should separate by beads. + * + * @return If the text will be grouped by beads. 
+ */ + public boolean getSeparateByBeads() { + + return shouldSeparateByBeads; + } + + + /** + * Set if the text stripper should group the text output by a list of beads. The default value is true! + * + * @param aShouldSeparateByBeads The new grouping of beads. + */ + public void setShouldSeparateByBeads(boolean aShouldSeparateByBeads) { + + shouldSeparateByBeads = aShouldSeparateByBeads; + } + + + /** + * Get the bookmark where text extraction should end, inclusive. Default is null. + * + * @return The ending bookmark. + */ + public PDOutlineItem getEndBookmark() { + + return endBookmark; + } + + + /** + * Set the bookmark where the text extraction should stop. + * + * @param aEndBookmark The ending bookmark. + */ + public void setEndBookmark(PDOutlineItem aEndBookmark) { + + endBookmark = aEndBookmark; + } + + + /** + * Get the bookmark where text extraction should start, inclusive. Default is null. + * + * @return The starting bookmark. + */ + public PDOutlineItem getStartBookmark() { + + return startBookmark; + } + + + /** + * Set the bookmark where text extraction should start, inclusive. + * + * @param aStartBookmark The starting bookmark. + */ + public void setStartBookmark(PDOutlineItem aStartBookmark) { + + startBookmark = aStartBookmark; + } + + + /** + * This will tell if the text stripper should add some more text formatting. + * + * @return true if some more text formatting will be added + */ + public boolean getAddMoreFormatting() { + + return addMoreFormatting; + } + + + /** + * There will some additional text formatting be added if addMoreFormatting is set to true. Default is false. + * + * @param newAddMoreFormatting Tell PDFBox to add some more text formatting + */ + public void setAddMoreFormatting(boolean newAddMoreFormatting) { + + addMoreFormatting = newAddMoreFormatting; + } + + + /** + * This will tell if the text stripper should sort the text tokens before writing to the stream. 
+ * + * @return true If the text tokens will be sorted before being written. + */ + public boolean getSortByPosition() { + + return sortByPosition; + } + + + /** + * The order of the text tokens in a PDF file may not be in the same as they appear visually on the screen. For + * example, a PDF writer may write out all text by font, so all bold or larger text, then make a second pass and + * write out the normal text.
+ * The default is to not sort by position.
+ *
+ * A PDF writer could choose to write each character in a different order. By default PDFBox does not sort + * the text tokens before processing them due to performance reasons. + * + * @param newSortByPosition Tell PDFBox to sort the text positions. + */ + public void setSortByPosition(boolean newSortByPosition) { + + sortByPosition = newSortByPosition; + } + + + /** + * Get the current space width-based tolerance value that is being used to estimate where spaces in text should be + * added. Note that the default value for this has been determined from trial and error. + * + * @return The current tolerance / scaling factor + */ + public float getSpacingTolerance() { + + return spacingTolerance; + } + + + /** + * Set the space width-based tolerance value that is used to estimate where spaces in text should be added. Note + * that the default value for this has been determined from trial and error. Setting this value larger will reduce + * the number of spaces added. + * + * @param spacingToleranceValue tolerance / scaling factor to use + */ + public void setSpacingTolerance(float spacingToleranceValue) { + + spacingTolerance = spacingToleranceValue; + } + + + /** + * Get the current character width-based tolerance value that is being used to estimate where spaces in text should + * be added. Note that the default value for this has been determined from trial and error. + * + * @return The current tolerance / scaling factor + */ + public float getAverageCharTolerance() { + + return averageCharTolerance; + } + + + /** + * Set the character width-based tolerance value that is used to estimate where spaces in text should be added. Note + * that the default value for this has been determined from trial and error. Setting this value larger will reduce + * the number of spaces added. 
+ * + * @param averageCharToleranceValue average tolerance / scaling factor to use + */ + public void setAverageCharTolerance(float averageCharToleranceValue) { + + averageCharTolerance = averageCharToleranceValue; + } + + + /** + * returns the multiple of whitespace character widths for the current text which the current line start can be + * indented from the previous line start beyond which the current line start is considered to be a paragraph start. + * + * @return the number of whitespace character widths to use when detecting paragraph indents. + */ + public float getIndentThreshold() { + + return indentThreshold; + } + + + /** + * sets the multiple of whitespace character widths for the current text which the current line start can be + * indented from the previous line start beyond which the current line start is considered to be a paragraph start. + * The default value is 2.0. + * + * @param indentThresholdValue the number of whitespace character widths to use when detecting paragraph indents. + */ + public void setIndentThreshold(float indentThresholdValue) { + + indentThreshold = indentThresholdValue; + } + + + /** + * the minimum whitespace, as a multiple of the max height of the current characters beyond which the current line + * start is considered to be a paragraph start. + * + * @return the character height multiple for max allowed whitespace between lines in the same paragraph. + */ + public float getDropThreshold() { + + return dropThreshold; + } + + + /** + * sets the minimum whitespace, as a multiple of the max height of the current characters beyond which the current + * line start is considered to be a paragraph start. The default value is 2.5. + * + * @param dropThresholdValue the character height multiple for max allowed whitespace between lines in the same + * paragraph. 
+ */ + public void setDropThreshold(float dropThresholdValue) { + + dropThreshold = dropThresholdValue; + } + + + /** + * Returns the string which will be used at the beginning of a paragraph. + * + * @return the paragraph start string + */ + public String getParagraphStart() { + + return paragraphStart; + } + + + /** + * Sets the string which will be used at the beginning of a paragraph. + * + * @param s the paragraph start string + */ + public void setParagraphStart(String s) { + + paragraphStart = s; + } + + + /** + * Returns the string which will be used at the end of a paragraph. + * + * @return the paragraph end string + */ + public String getParagraphEnd() { + + return paragraphEnd; + } + + + /** + * Sets the string which will be used at the end of a paragraph. + * + * @param s the paragraph end string + */ + public void setParagraphEnd(String s) { + + paragraphEnd = s; + } + + + /** + * Returns the string which will be used at the beginning of a page. + * + * @return the page start string + */ + public String getPageStart() { + + return pageStart; + } + + + /** + * Sets the string which will be used at the beginning of a page. + * + * @param pageStartValue the page start string + */ + public void setPageStart(String pageStartValue) { + + pageStart = pageStartValue; + } + + + /** + * Returns the string which will be used at the end of a page. + * + * @return the page end string + */ + public String getPageEnd() { + + return pageEnd; + } + + + /** + * Sets the string which will be used at the end of a page. + * + * @param pageEndValue the page end string + */ + public void setPageEnd(String pageEndValue) { + + pageEnd = pageEndValue; + } + + + /** + * Returns the string which will be used at the beginning of an article. + * + * @return the article start string + */ + public String getArticleStart() { + + return articleStart; + } + + + /** + * Sets the string which will be used at the beginning of an article. 
+ * + * @param articleStartValue the article start string + */ + public void setArticleStart(String articleStartValue) { + + articleStart = articleStartValue; + } + + + /** + * Returns the string which will be used at the end of an article. + * + * @return the article end string + */ + public String getArticleEnd() { + + return articleEnd; + } + + + /** + * Sets the string which will be used at the end of an article. + * + * @param articleEndValue the article end string + */ + public void setArticleEnd(String articleEndValue) { + + articleEnd = articleEndValue; + } + + + /** + * handles the line separator for a new line given the specified current and previous TextPositions. + * + * @param current the current text position + * @param lastPosition the previous text position + * @param lastLineStartPosition the last text position that followed a line separator. + * @param maxHeightForLine max height for positions since lastLineStartPosition + * @return start position of the last line + * @throws IOException if something went wrong + */ + private PositionWrapper handleLineSeparation(PositionWrapper current, + PositionWrapper lastPosition, + PositionWrapper lastLineStartPosition, + float maxHeightForLine) throws IOException { + + current.setLineStart(); + isParagraphSeparation(current, lastPosition, lastLineStartPosition, maxHeightForLine); + lastLineStartPosition = current; + if (current.isParagraphStart()) { + if (lastPosition.isArticleStart()) { + if (lastPosition.isLineStart()) { + writeLineSeparator(); + } + writeParagraphStart(); + } else { + writeLineSeparator(); + writeParagraphSeparator(); + } + } else { + writeLineSeparator(); + } + return lastLineStartPosition; + } + + + /** + * tests the relationship between the last text position, the current text position and the last text position that + * followed a line separator to decide if the gap represents a paragraph separation. 
This should only be + * called for consecutive text positions that first pass the line separation test. + *

+ * This base implementation tests to see if the lastLineStartPosition is null OR if the current vertical position + * has dropped below the last text vertical position by at least 2.5 times the current text height OR if the current + * horizontal position is indented by at least 2 times the current width of a space character. + *

+ *

+ * This also attempts to identify text that is indented under a hanging indent. + *

+ *

+ * This method sets the isParagraphStart and isHangingIndent flags on the current position object. + *

+ * + * @param position the current text position. This may have its isParagraphStart or isHangingIndent flags set upon + * return. + * @param lastPosition the previous text position (should not be null). + * @param lastLineStartPosition the last text position that followed a line separator, or null. + * @param maxHeightForLine max height for text positions since lasLineStartPosition. + */ + private void isParagraphSeparation(PositionWrapper position, PositionWrapper lastPosition, PositionWrapper lastLineStartPosition, float maxHeightForLine) { + + boolean result = false; + if (lastLineStartPosition == null) { + result = true; + } else { + float yGap = Math.abs(position.getTextPosition().getYDirAdj() - lastPosition.getTextPosition().getYDirAdj()); + float newYVal = multiplyFloat(getDropThreshold(), maxHeightForLine); + // do we need to flip this for rtl? + float xGap = position.getTextPosition().getXDirAdj() - lastLineStartPosition.getTextPosition().getXDirAdj(); + float newXVal = multiplyFloat(getIndentThreshold(), position.getTextPosition().getWidthOfSpace()); + float positionWidth = multiplyFloat(0.25f, position.getTextPosition().getWidth()); + + if (yGap > newYVal) { + result = true; + } else if (xGap > newXVal) { + // text is indented, but try to screen for hanging indent + if (!lastLineStartPosition.isParagraphStart()) { + result = true; + } else { + position.setHangingIndent(); + } + } else if (xGap < -position.getTextPosition().getWidthOfSpace()) { + // text is left of previous line. Was it a hanging indent? + if (!lastLineStartPosition.isParagraphStart()) { + result = true; + } + } else if (Math.abs(xGap) < positionWidth) { + // current horizontal position is within 1/4 a char of the last + // linestart. We'll treat them as lined up. 
+ if (lastLineStartPosition.isHangingIndent()) { + position.setHangingIndent(); + } else if (lastLineStartPosition.isParagraphStart()) { + // check to see if the previous line looks like + // any of a number of standard list item formats + Pattern liPattern = matchListItemPattern(lastLineStartPosition); + if (liPattern != null) { + Pattern currentPattern = matchListItemPattern(position); + if (liPattern == currentPattern) { + result = true; + } + } + } + } + } + if (result) { + position.setParagraphStart(); + } + } + + + private float multiplyFloat(float value1, float value2) { + // multiply 2 floats and truncate the resulting value to 3 decimal places + // to avoid wrong results when comparing with another float + return Math.round(value1 * value2 * 1000) / 1000f; + } + + + /** + * writes the paragraph separator string to the output. + * + * @throws IOException if something went wrong + */ + protected void writeParagraphSeparator() throws IOException { + + writeParagraphEnd(); + writeParagraphStart(); + } + + + /** + * Write something (if defined) at the start of a paragraph. + * + * @throws IOException if something went wrong + */ + protected void writeParagraphStart() throws IOException { + + if (inParagraph) { + writeParagraphEnd(); + inParagraph = false; + } + output.write(getParagraphStart()); + inParagraph = true; + } + + + /** + * Write something (if defined) at the end of a paragraph. + * + * @throws IOException if something went wrong + */ + protected void writeParagraphEnd() throws IOException { + + if (!inParagraph) { + writeParagraphStart(); + } + output.write(getParagraphEnd()); + inParagraph = false; + } + + + /** + * Write something (if defined) at the start of a page. + * + * @throws IOException if something went wrong + */ + protected void writePageStart() throws IOException { + + output.write(getPageStart()); + } + + + /** + * Write something (if defined) at the end of a page. 
+ * + * @throws IOException if something went wrong + */ + protected void writePageEnd() throws IOException { + + output.write(getPageEnd()); + } + + + /** + * returns the list item Pattern object that matches the text at the specified PositionWrapper or null if the text + * does not match such a pattern. The list of Patterns tested against is given by the {@link #getListItemPatterns()} + * method. To add to the list, simply override that method (if sub-classing) or explicitly supply your own list + * using {@link #setListItemPatterns(List)}. + * + * @param pw position + * @return the matching pattern + */ + private Pattern matchListItemPattern(PositionWrapper pw) { + + TextPosition tp = pw.getTextPosition(); + String txt = tp.getUnicode(); + return matchPattern(txt, getListItemPatterns()); + } + + + /** + * a list of regular expressions that match commonly used list item formats, i.e. bullets, numbers, letters, Roman + * numerals, etc. Not meant to be comprehensive. + */ + private static final String[] LIST_ITEM_EXPRESSIONS = {"\\.", "\\d+\\.", "\\[\\d+\\]", "\\d+\\)", "[A-Z]\\.", "[a-z]\\.", "[A-Z]\\)", "[a-z]\\)", "[IVXL]+\\.", "[ivxl]+\\.",}; + + private List listOfPatterns = null; + + + /** + * use to supply a different set of regular expression patterns for matching list item starts. + * + * @param patterns list of patterns + */ + protected void setListItemPatterns(List patterns) { + + listOfPatterns = patterns; + } + + + /** + * returns a list of regular expression Patterns representing different common list item formats. For example + * numbered items of form: + *
    + *
  1. some text
  2. + *
  3. more text
  4. + *
+ * or + *
    + *
  • some text
  • + *
  • more text
  • + *
+ * etc., all begin with some character pattern. The pattern "\\d+\." (matches "1.", "2.", ...) or "\[\\d+\]" + * (matches "[1]", "[2]", ...). + *

+ * This method returns a list of such regular expression Patterns. + * + * @return a list of Pattern objects. + */ + protected List getListItemPatterns() { + + if (listOfPatterns == null) { + listOfPatterns = new ArrayList(); + for (String expression : LIST_ITEM_EXPRESSIONS) { + Pattern p = Pattern.compile(expression); + listOfPatterns.add(p); + } + } + return listOfPatterns; + } + + + /** + * iterates over the specified list of Patterns until it finds one that matches the specified string. Then returns + * the Pattern. + *

+ * Order of the supplied list of patterns is important as most common patterns should come first. Patterns should be + * strict in general, and all will be used with case sensitivity on. + *

+ * + * @param string the string to be searched + * @param patterns list of patterns + * @return matching pattern + */ + protected static Pattern matchPattern(String string, List patterns) { + + for (Pattern p : patterns) { + if (p.matcher(string).matches()) { + return p; + } + } + return null; + } + + + /** + * Write a list of string containing a whole line of a document. + * + * @param line a list with the words of the given line + * @throws IOException if something went wrong + */ + private void writeLine(List line) throws IOException { + + int numberOfStrings = line.size(); + for (int i = 0; i < numberOfStrings; i++) { + WordWithTextPositions word = line.get(i); + writeString(word.getText(), word.getTextPositions()); + if (i < numberOfStrings - 1) { + writeWordSeparator(); + } + } + } + + + /** + * Normalize the given list of TextPositions. + * + * @param line list of TextPositions + * @return a list of strings, one string for every word + */ + private List normalize(List line) { + + List normalized = new LinkedList(); + StringBuilder lineBuilder = new StringBuilder(); + List wordPositions = new ArrayList(); + + for (LineItem item : line) { + lineBuilder = normalizeAdd(normalized, lineBuilder, wordPositions, item); + } + + if (lineBuilder.length() > 0) { + normalized.add(createWord(lineBuilder.toString(), wordPositions)); + } + return normalized; + } + + + /** + * Handles the LTR and RTL direction of the given words. The whole implementation stands and falls with the given + * word. If the word is a full line, the results will be the best. If the word contains of single words or + * characters, the order of the characters in a word or words in a line may wrong, due to RTL and LTR marks and + * characters! + *

+ * Based on http://www.nesterovsky-bros.com/weblog/2013/07/28/VisualToLogicalConversionInJava.aspx + * + * @param word The word that shall be processed + * @return new word with the correct direction of the containing characters + */ + private String handleDirection(String word) { + + Bidi bidi = new Bidi(word, Bidi.DIRECTION_DEFAULT_LEFT_TO_RIGHT); + + // if there is pure LTR text no need to process further + if (!bidi.isMixed() && bidi.getBaseLevel() == Bidi.DIRECTION_LEFT_TO_RIGHT) { + return word; + } + + // collect individual bidi information + int runCount = bidi.getRunCount(); + byte[] levels = new byte[runCount]; + Integer[] runs = new Integer[runCount]; + + for (int i = 0; i < runCount; i++) { + levels[i] = (byte) bidi.getRunLevel(i); + runs[i] = i; + } + + // reorder individual parts based on their levels + Bidi.reorderVisually(levels, 0, runs, 0, runCount); + + // collect the parts based on the direction within the run + StringBuilder result = new StringBuilder(); + + for (int i = 0; i < runCount; i++) { + int index = runs[i]; + int start = bidi.getRunStart(index); + int end = bidi.getRunLimit(index); + + int level = levels[index]; + + if ((level & 1) != 0) { + while (--end >= start) { + char character = word.charAt(end); + if (Character.isMirrored(word.codePointAt(end))) { + if (MIRRORING_CHAR_MAP.containsKey(character)) { + result.append(MIRRORING_CHAR_MAP.get(character)); + } else { + result.append(character); + } + } else { + result.append(character); + } + } + } else { + result.append(word, start, end); + } + } + + return result.toString(); + } + + + private static Map MIRRORING_CHAR_MAP = new HashMap<>(); + + static { + String path = "/org/apache/pdfbox/resources/text/BidiMirroring.txt"; + InputStream input = new BufferedInputStream(PDFTextStripper.class.getResourceAsStream(path)); + try { + parseBidiFile(input); + } catch (IOException e) { + LOG.warn("Could not parse BidiMirroring.txt, mirroring char map will be empty: " + e.getMessage()); + } 
finally { + try { + input.close(); + } catch (IOException e) { + LOG.debug("Could not close BidiMirroring.txt ", e); + } + } + } + + /** + * This method parses the bidi file provided as inputstream. + * + * @param inputStream - The bidi file as inputstream + * @throws IOException if any line could not be read by the LineNumberReader + */ + private static void parseBidiFile(InputStream inputStream) throws IOException { + + LineNumberReader rd = new LineNumberReader(new InputStreamReader(inputStream)); + + do { + String s = rd.readLine(); + if (s == null) { + break; + } + + int comment = s.indexOf('#'); // ignore comments + if (comment != -1) { + s = s.substring(0, comment); + } + + if (s.length() < 2) { + continue; + } + + StringTokenizer st = new StringTokenizer(s, ";"); + int nFields = st.countTokens(); + Character[] fields = new Character[nFields]; + for (int i = 0; i < nFields; i++) { + fields[i] = (char) Integer.parseInt(st.nextToken().trim(), 16); + } + + if (fields.length == 2) { + // initialize the MIRRORING_CHAR_MAP + MIRRORING_CHAR_MAP.put(fields[0], fields[1]); + } + + } while (true); + } + + + /** + * Used within {@link #normalize(List)} to create a single {@link WordWithTextPositions} entry. + */ + private WordWithTextPositions createWord(String word, List wordPositions) { + + return new WordWithTextPositions(normalizeWord(word), wordPositions); + } + + + /** + * Normalize certain Unicode characters. For example, convert the single "fi" ligature to "f" and "i". Also + * normalises Arabic and Hebrew presentation forms. + * + * @param word Word to normalize + * @return Normalized word + */ + private String normalizeWord(String word) { + + StringBuilder builder = null; + int p = 0; + int q = 0; + int strLength = word.length(); + for (; q < strLength; q++) { + // We only normalize if the codepoint is in a given range. + // Otherwise, NFKC converts too many things that would cause + // confusion. 
For example, it converts the micro symbol in + // extended Latin to the value in the Greek script. We normalize + // the Unicode Alphabetic and Arabic A&B Presentation forms. + char c = word.charAt(q); + if (0xFB00 <= c && c <= 0xFDFF || 0xFE70 <= c && c <= 0xFEFF) { + if (builder == null) { + builder = new StringBuilder(strLength * 2); + } + builder.append(word, p, q); + // Some fonts map U+FDF2 differently than the Unicode spec. + // They add an extra U+0627 character to compensate. + // This removes the extra character for those fonts. + if (c == 0xFDF2 && q > 0 && (word.charAt(q - 1) == 0x0627 || word.charAt(q - 1) == 0xFE8D)) { + builder.append("\u0644\u0644\u0647"); + } else { + // Trim because some decompositions have an extra space, such as U+FC5E + builder.append(Normalizer.normalize(word.substring(q, q + 1), Normalizer.Form.NFKC).trim()); + } + p = q + 1; + } + } + if (builder == null) { + return handleDirection(word); + } else { + builder.append(word, p, q); + return handleDirection(builder.toString()); + } + } + + + /** + * Used within {@link #normalize(List)} to handle a {@link TextPosition}. + * + * @return The StringBuilder that must be used when calling this method. + */ + private StringBuilder normalizeAdd(List normalized, StringBuilder lineBuilder, List wordPositions, LineItem item) { + + if (item.isWordSeparator()) { + normalized.add(createWord(lineBuilder.toString(), new ArrayList(wordPositions))); + lineBuilder = new StringBuilder(); + wordPositions.clear(); + } else { + TextPosition text = item.getTextPosition(); + lineBuilder.append(text.getUnicode()); + wordPositions.add(text); + } + return lineBuilder; + } + + + /** + * internal marker class. Used as a place holder in a line of TextPositions. 
+ */ + private static final class LineItem { + + public static LineItem WORD_SEPARATOR = new LineItem(); + + + public static LineItem getWordSeparator() { + + return WORD_SEPARATOR; + } + + + private final TextPosition textPosition; + + + private LineItem() { + + textPosition = null; + } + + + LineItem(TextPosition textPosition) { + + this.textPosition = textPosition; + } + + + public TextPosition getTextPosition() { + + return textPosition; + } + + + public boolean isWordSeparator() { + + return textPosition == null; + } + + } + + /** + * Internal class that maps strings to lists of {@link TextPosition} arrays. Note that the number of entries in that + * list may differ from the number of characters in the string due to normalization. + * + * @author Axel Dörfler + */ + private static final class WordWithTextPositions { + + String text; + List textPositions; + + + WordWithTextPositions(String word, List positions) { + + text = word; + textPositions = positions; + } + + + public String getText() { + + return text; + } + + + public List getTextPositions() { + + return textPositions; + } + + } + + /** + * wrapper of TextPosition that adds flags to track status as linestart and paragraph start positions. + *

+ * This is implemented as a wrapper since the TextPosition class doesn't provide complete access to its state fields + * to subclasses. Also, conceptually TextPosition is immutable while these flags need to be set post-creation so it + * makes sense to put these flags in this separate class. + *

+ * + * @author m.martinez@ll.mit.edu + */ + private static final class PositionWrapper { + + private boolean isLineStart = false; + private boolean isParagraphStart = false; + private boolean isPageBreak = false; + private boolean isHangingIndent = false; + private boolean isArticleStart = false; + + private TextPosition position = null; + + + /** + * Constructs a PositionWrapper around the specified TextPosition object. + * + * @param position the text position. + */ + PositionWrapper(TextPosition position) { + + this.position = position; + } + + + /** + * Returns the underlying TextPosition object. + * + * @return the text position + */ + public TextPosition getTextPosition() { + + return position; + } + + + public boolean isLineStart() { + + return isLineStart; + } + + + /** + * Sets the isLineStart() flag to true. + */ + public void setLineStart() { + + this.isLineStart = true; + } + + + public boolean isParagraphStart() { + + return isParagraphStart; + } + + + /** + * sets the isParagraphStart() flag to true. + */ + public void setParagraphStart() { + + this.isParagraphStart = true; + } + + + public boolean isArticleStart() { + + return isArticleStart; + } + + + /** + * Sets the isArticleStart() flag to true. + */ + public void setArticleStart() { + + this.isArticleStart = true; + } + + + public boolean isPageBreak() { + + return isPageBreak; + } + + + /** + * Sets the isPageBreak() flag to true. + */ + public void setPageBreak() { + + this.isPageBreak = true; + } + + + public boolean isHangingIndent() { + + return isHangingIndent; + } + + + /** + * Sets the isHangingIndent() flag to true. 
+ */ + public void setHangingIndent() { + + this.isHangingIndent = true; + } + + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/BlockificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/BlockificationService.java new file mode 100644 index 0000000..244f6ab --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/BlockificationService.java @@ -0,0 +1,279 @@ +package com.knecon.fforesight.service.layoutparser.processor.classification.service; + +import static java.util.stream.Collectors.toSet; + +import java.util.ArrayList; +import java.util.Comparator; +import java.util.Iterator; +import java.util.List; + +import org.springframework.stereotype.Service; + +import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationPage; +import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock; +import com.knecon.fforesight.service.layoutparser.processor.classification.dto.FloatFrequencyCounter; +import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.TextBlockOrientation; +import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.StringFrequencyCounter; +import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.classification.dto.AbstractTextContainer; +import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.Ruling; +import com.knecon.fforesight.service.layoutparser.processor.classification.utils.RulingTextDirAdjustUtil; + +@Service +@SuppressWarnings("all") +public class 
BlockificationService { + + static final float THRESHOLD = 1f; + + + /** + * This method is building blocks by expanding the minX/maxX and minY/maxY value on each word that is not split by the conditions. + * This method must use text direction adjusted postions (DirAdj). Where {0,0} is on the upper left. Never try to change this! + * Rulings (Table lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling. + * + * @param textPositions The words of a page. + * @param horizontalRulingLines Horizontal table lines. + * @param verticalRulingLines Vertical table lines. + * @return ClassificationPage object that contains the Textblock and text statistics. + */ + public ClassificationPage blockify(List textPositions, List horizontalRulingLines, List verticalRulingLines) { + + int indexOnPage = 0; + List chunkWords = new ArrayList<>(); + List chunkBlockList1 = new ArrayList<>(); + + float minX = 1000, maxX = 0, minY = 1000, maxY = 0; + TextPositionSequence prev = null; + + boolean wasSplitted = false; + Float splitX1 = null; + for (TextPositionSequence word : textPositions) { + + boolean lineSeparation = word.getMinYDirAdj() - maxY > word.getHeight() * 1.25; + boolean startFromTop = prev != null && word.getMinYDirAdj() < prev.getMinYDirAdj() - prev.getTextHeight(); + boolean splitByX = prev != null && maxX + 50 < word.getMinXDirAdj() && prev.getMinYDirAdj() == word.getMinYDirAdj(); + boolean xIsBeforeFirstX = prev != null && word.getMinXDirAdj() < minX; + boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj(); + boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, horizontalRulingLines, verticalRulingLines); + boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir()); + + if (prev != null && (lineSeparation || startFromTop || splitByX || splitByDir || isSplitByRuling)) { + + TextBlockOrientation prevOrientation = 
null; + if (!chunkBlockList1.isEmpty()) { + prevOrientation = chunkBlockList1.get(chunkBlockList1.size() - 1).getOrientation(); + } + + ClassificationTextBlock cb1 = buildTextBlock(chunkWords, indexOnPage); + indexOnPage++; + + chunkBlockList1.add(cb1); + chunkWords = new ArrayList<>(); + + if (splitByX && !isSplitByRuling) { + wasSplitted = true; + cb1.setOrientation(TextBlockOrientation.LEFT); + splitX1 = word.getMinXDirAdj(); + } else if (newLineAfterSplit && !isSplitByRuling) { + wasSplitted = false; + cb1.setOrientation(TextBlockOrientation.RIGHT); + splitX1 = null; + } else if (prevOrientation != null && prevOrientation.equals(TextBlockOrientation.RIGHT) && (lineSeparation || !startFromTop || !splitByX || !newLineAfterSplit || !isSplitByRuling)) { + cb1.setOrientation(TextBlockOrientation.LEFT); + } + + minX = 1000; + maxX = 0; + minY = 1000; + maxY = 0; + prev = null; + } + + chunkWords.add(word); + + prev = word; + if (word.getMinXDirAdj() < minX) { + minX = word.getMinXDirAdj(); + } + if (word.getMaxXDirAdj() > maxX) { + maxX = word.getMaxXDirAdj(); + } + if (word.getMinYDirAdj() < minY) { + minY = word.getMinYDirAdj(); + } + if (word.getMaxYDirAdj() > maxY) { + maxY = word.getMaxYDirAdj(); + } + } + + ClassificationTextBlock cb1 = buildTextBlock(chunkWords, indexOnPage); + if (cb1 != null) { + chunkBlockList1.add(cb1); + } + + Iterator itty = chunkBlockList1.iterator(); + + ClassificationTextBlock previousLeft = null; + ClassificationTextBlock previousRight = null; + while (itty.hasNext()) { + ClassificationTextBlock block = (ClassificationTextBlock) itty.next(); + + if (previousLeft != null && block.getOrientation().equals(TextBlockOrientation.LEFT)) { + if (previousLeft.getMinY() > block.getMinY() && block.getMaxY() + block.getMostPopularWordHeight() > previousLeft.getMinY()) { + previousLeft.add(block); + itty.remove(); + continue; + } + } + + if (previousRight != null && block.getOrientation().equals(TextBlockOrientation.RIGHT)) { + if 
(previousRight.getMinY() > block.getMinY() && block.getMaxY() + block.getMostPopularWordHeight() > previousRight.getMinY()) { + previousRight.add(block); + itty.remove(); + continue; + } + } + + if (block.getOrientation().equals(TextBlockOrientation.LEFT)) { + previousLeft = block; + } else if (block.getOrientation().equals(TextBlockOrientation.RIGHT)) { + previousRight = block; + } + } + + itty = chunkBlockList1.iterator(); + ClassificationTextBlock previous = null; + while (itty.hasNext()) { + ClassificationTextBlock block = (ClassificationTextBlock) itty.next(); + + if (previous != null && previous.getOrientation().equals(TextBlockOrientation.LEFT) && block.getOrientation().equals(TextBlockOrientation.LEFT) && equalsWithThreshold(block.getMaxY(), + previous.getMaxY()) || previous != null && previous.getOrientation().equals(TextBlockOrientation.LEFT) && block.getOrientation() + .equals(TextBlockOrientation.RIGHT) && equalsWithThreshold(block.getMaxY(), previous.getMaxY())) { + previous.add(block); + itty.remove(); + continue; + } + + previous = block; + } + + return new ClassificationPage(chunkBlockList1); + } + + + private boolean equalsWithThreshold(float f1, float f2) { + + return Math.abs(f1 - f2) < THRESHOLD; + } + + + private ClassificationTextBlock buildTextBlock(List wordBlockList, int indexOnPage) { + + ClassificationTextBlock textBlock = null; + + FloatFrequencyCounter lineHeightFrequencyCounter = new FloatFrequencyCounter(); + FloatFrequencyCounter fontSizeFrequencyCounter = new FloatFrequencyCounter(); + FloatFrequencyCounter spaceFrequencyCounter = new FloatFrequencyCounter(); + StringFrequencyCounter fontFrequencyCounter = new StringFrequencyCounter(); + StringFrequencyCounter styleFrequencyCounter = new StringFrequencyCounter(); + + for (TextPositionSequence wordBlock : wordBlockList) { + + lineHeightFrequencyCounter.add(wordBlock.getTextHeight()); + fontSizeFrequencyCounter.add(wordBlock.getFontSize()); + 
spaceFrequencyCounter.add(wordBlock.getSpaceWidth()); + fontFrequencyCounter.add(wordBlock.getFont()); + styleFrequencyCounter.add(wordBlock.getFontStyle()); + + if (textBlock == null) { + textBlock = new ClassificationTextBlock(wordBlock.getMinXDirAdj(), + wordBlock.getMaxXDirAdj(), + wordBlock.getMinYDirAdj(), + wordBlock.getMaxYDirAdj(), + wordBlockList, + wordBlock.getRotation(), + indexOnPage); + } else { + ClassificationTextBlock spatialEntity = textBlock.union(wordBlock); + textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight()); + } + } + + if (textBlock != null) { + textBlock.setMostPopularWordFont(fontFrequencyCounter.getMostPopular()); + textBlock.setMostPopularWordStyle(styleFrequencyCounter.getMostPopular()); + textBlock.setMostPopularWordFontSize(fontSizeFrequencyCounter.getMostPopular()); + textBlock.setMostPopularWordHeight(lineHeightFrequencyCounter.getMostPopular()); + textBlock.setMostPopularWordSpaceWidth(spaceFrequencyCounter.getMostPopular()); + textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest()); + } + + if (textBlock != null && textBlock.getSequences() != null && textBlock.getSequences().stream().map(t -> round(t.getMinYDirAdj(), 3)).collect(toSet()).size() == 1) { + textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj)); + } + return textBlock; + } + + + private boolean isSplitByRuling(float minX, + float minY, + float maxX, + float maxY, + TextPositionSequence word, + List horizontalRulingLines, + List verticalRulingLines) { + + return isSplitByRuling(maxX, + minY, + word.getMinXDirAdj(), + word.getMinYDirAdj(), + verticalRulingLines, + word.getDir().getDegrees(), + word.getPageWidth(), + word.getPageHeight()) // + || isSplitByRuling(minX, + minY, + word.getMinXDirAdj(), + word.getMaxYDirAdj(), + horizontalRulingLines, + word.getDir().getDegrees(), + word.getPageWidth(), + word.getPageHeight()) // + || isSplitByRuling(maxX, + 
minY, + word.getMinXDirAdj(), + word.getMinYDirAdj(), + horizontalRulingLines, + word.getDir().getDegrees(), + word.getPageWidth(), + word.getPageHeight()) // + || isSplitByRuling(minX, + minY, + word.getMinXDirAdj(), + word.getMaxYDirAdj(), + verticalRulingLines, + word.getDir().getDegrees(), + word.getPageWidth(), + word.getPageHeight()); // + } + + + private boolean isSplitByRuling(float previousX2, float previousY1, float currentX1, float currentY1, List rulingLines, float dir, float pageWidth, float pageHeight) { + + for (Ruling ruling : rulingLines) { + var line = RulingTextDirAdjustUtil.convertToDirAdj(ruling, dir, pageWidth, pageHeight); + if (line.intersectsLine(previousX2, previousY1, currentX1, currentY1)) { + return true; + } + } + return false; + } + + + private double round(float value, int decimalPoints) { + + var d = Math.pow(10, decimalPoints); + return Math.round(value * d) / d; + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/BodyTextFrameService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/BodyTextFrameService.java new file mode 100644 index 0000000..2860222 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/BodyTextFrameService.java @@ -0,0 +1,160 @@ +package com.knecon.fforesight.service.layoutparser.processor.classification.service; +import java.util.List; + +import org.springframework.stereotype.Service; + +import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle; +import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point; +import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationPage; +import 
com.knecon.fforesight.service.layoutparser.processor.classification.dto.FloatFrequencyCounter; +import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock; +import com.knecon.fforesight.service.layoutparser.processor.classification.dto.AbstractTextContainer; +import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.TableCell; +import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.Table; +import com.knecon.fforesight.service.layoutparser.processor.classification.utils.PositionUtils; + +@Service +public class BodyTextFrameService { + + /** + * Adjusts and sets the body text frame to a classificationPage. + * Note: This needs to use Pdf Coordinate System where {0,0} rotated with the classificationPage rotation. + * 0 -> LowerLeft + * 90 -> UpperLeft + * 180 -> UpperRight + * 270 -> LowerRight + * The aspect ratio of the classificationPage is also regarded. + * + * @param classificationPage The classificationPage + * @param bodyTextFrame frame that contains the main text on portrait pages + * @param landscapeBodyTextFrame frame that contains the main text on landscape pages + */ + public void setBodyTextFrameAdjustedToPage(ClassificationPage classificationPage, Rectangle bodyTextFrame, Rectangle landscapeBodyTextFrame) { + + Rectangle textFrame = classificationPage.isLandscape() ? 
landscapeBodyTextFrame : bodyTextFrame; + + if (classificationPage.getPageWidth() > classificationPage.getPageHeight() && classificationPage.getRotation() == 270) { + textFrame = new Rectangle(new Point(textFrame.getTopLeft().getY(), classificationPage.getPageHeight() - textFrame.getTopLeft().getX() - textFrame.getWidth()), + textFrame.getHeight(), + textFrame.getWidth(), + 0); + } else if (classificationPage.getPageWidth() > classificationPage.getPageHeight() && classificationPage.getRotation() != 0) { + textFrame = new Rectangle(new Point(textFrame.getTopLeft().getY(), textFrame.getTopLeft().getX()), textFrame.getHeight(), textFrame.getWidth(), classificationPage.getPageNumber()); + } else if (classificationPage.getRotation() == 180) { + textFrame = new Rectangle(new Point(textFrame.getTopLeft().getX(), classificationPage.getPageHeight() - textFrame.getTopLeft().getY() - textFrame.getHeight()), + textFrame.getWidth(), + textFrame.getHeight(), + 0); + } + classificationPage.setBodyTextFrame(textFrame); + } + + + /** + * Calculates the frame that contains the main text, text outside the frame will be e.g. headers or footers. + * Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation. + * 0 -> LowerLeft + * 90 -> UpperLeft + * 180 -> UpperRight + * 270 -> LowerRight + * The aspect ratio of the page is also regarded. 
+ * + * @param classificationPages List of all classificationPages + * @param documentFontSizeCounter Statistics of the document + * @param landscape Calculate for landscape or portrait + * @return Rectangle of the text frame + */ + public Rectangle calculateBodyTextFrame(List classificationPages, FloatFrequencyCounter documentFontSizeCounter, boolean landscape) { + + BodyTextFrameExpansionsRectangle expansionsRectangle = new BodyTextFrameExpansionsRectangle(); + + for (ClassificationPage classificationPage : classificationPages) { + + if (classificationPage.getTextBlocks().isEmpty() || landscape != classificationPage.isLandscape()) { + continue; + } + + for (AbstractTextContainer container : classificationPage.getTextBlocks()) { + + if (container instanceof ClassificationTextBlock) { + ClassificationTextBlock textBlock = (ClassificationTextBlock) container; + if (textBlock.getMostPopularWordFont() == null || textBlock.getMostPopularWordStyle() == null) { + continue; + } + + float approxLineCount = PositionUtils.getApproxLineCount(textBlock); + if (approxLineCount < 2.9f) { + continue; + } + + if (documentFontSizeCounter.getMostPopular() != null && textBlock.getMostPopularWordFontSize() >= documentFontSizeCounter.getMostPopular()) { + + expandRectangle(textBlock, classificationPage, expansionsRectangle); + } + } + + if (container instanceof Table) { + Table table = (Table) container; + for (List row : table.getRows()) { + for (TableCell cell : row) { + + if (cell == null || cell.getTextBlocks() == null) { + continue; + } + for (ClassificationTextBlock textBlock : cell.getTextBlocks()) { + expandRectangle(textBlock, classificationPage, expansionsRectangle); + } + } + } + } + } + } + return new Rectangle(new Point(expansionsRectangle.minX, expansionsRectangle.minY), + expansionsRectangle.maxX - expansionsRectangle.minX, + expansionsRectangle.maxY - expansionsRectangle.minY, + 0); + } + + + private void expandRectangle(ClassificationTextBlock textBlock, 
ClassificationPage classificationPage, BodyTextFrameExpansionsRectangle expansionsRectangle) { + + if (classificationPage.getPageWidth() > classificationPage.getPageHeight() && classificationPage.getRotation() != 0) { + if (textBlock.getPdfMinY() < expansionsRectangle.minX) { + expansionsRectangle.minX = textBlock.getPdfMinY(); + } + if (textBlock.getPdfMaxY() > expansionsRectangle.maxX) { + expansionsRectangle.maxX = textBlock.getPdfMaxY(); + } + if (textBlock.getPdfMinX() < expansionsRectangle.minY) { + expansionsRectangle.minY = textBlock.getPdfMinX(); + } + if (textBlock.getPdfMaxX() > expansionsRectangle.maxY) { + expansionsRectangle.maxY = textBlock.getPdfMaxX(); + } + } else { + if (textBlock.getPdfMinX() < expansionsRectangle.minX) { + expansionsRectangle.minX = textBlock.getPdfMinX(); + } + if (textBlock.getPdfMaxX() > expansionsRectangle.maxX) { + expansionsRectangle.maxX = textBlock.getPdfMaxX(); + } + if (textBlock.getPdfMinY() < expansionsRectangle.minY) { + expansionsRectangle.minY = textBlock.getPdfMinY(); + } + if (textBlock.getPdfMaxY() > expansionsRectangle.maxY) { + expansionsRectangle.maxY = textBlock.getPdfMaxY(); + } + } + } + + + private class BodyTextFrameExpansionsRectangle { + + float minX = 10000; + float maxX = -100; + float minY = 10000; + float maxY = -100; + + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/ClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/ClassificationService.java new file mode 100644 index 0000000..02cbb83 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/ClassificationService.java @@ -0,0 +1,116 @@ +package 
com.knecon.fforesight.service.layoutparser.processor.classification.service; + +import java.util.List; +import java.util.regex.Pattern; + +import org.springframework.stereotype.Service; + +import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle; +import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationDocument; +import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationPage; +import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock; +import com.knecon.fforesight.service.layoutparser.processor.classification.dto.AbstractTextContainer; +import com.knecon.fforesight.service.layoutparser.processor.classification.utils.PositionUtils; + +import lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; + +@Slf4j +@Service +@RequiredArgsConstructor +public class ClassificationService { + + private final BodyTextFrameService bodyTextFrameService; + + + public void classifyDocument(ClassificationDocument document) { + + Rectangle bodyTextFrame = bodyTextFrameService.calculateBodyTextFrame(document.getPages(), document.getFontSizeCounter(), false); + Rectangle landscapeBodyTextFrame = bodyTextFrameService.calculateBodyTextFrame(document.getPages(), document.getFontSizeCounter(), true); + List headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular(); + + log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue()); + + for (ClassificationPage classificationPage : document.getPages()) { + bodyTextFrameService.setBodyTextFrameAdjustedToPage(classificationPage, bodyTextFrame, landscapeBodyTextFrame); + classifyPage(classificationPage, document, headlineFontSizes); + } + } + + + public void classifyPage(ClassificationPage classificationPage, ClassificationDocument document, List headlineFontSizes) { + + for (AbstractTextContainer textBlock : 
classificationPage.getTextBlocks()) { + if (textBlock instanceof ClassificationTextBlock) { + classifyBlock((ClassificationTextBlock) textBlock, classificationPage, document, headlineFontSizes); + } + } + } + + + public void classifyBlock(ClassificationTextBlock textBlock, ClassificationPage page, ClassificationDocument document, List headlineFontSizes) { + + var bodyTextFrame = page.getBodyTextFrame(); + + if (document.getFontSizeCounter().getMostPopular() == null) { + textBlock.setClassification("Other"); + return; + } + if (PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter() + .getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) { + textBlock.setClassification("Header"); + + } else if (PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter() + .getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) { + textBlock.setClassification("Footer"); + } else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, + document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks() + .size() == 1)) { + if (!Pattern.matches("[0-9]+", textBlock.toString())) { + textBlock.setClassification("Title"); + } + } else if (textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter() + .getMostPopular() && PositionUtils.getApproxLineCount(textBlock) < 4.9 && (textBlock.getMostPopularWordStyle().equals("bold") || !document.getFontStyleCounter() + .getCountPerValue() + .containsKey("bold") && textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular() + 1) && textBlock.getSequences() + .get(0) + .getTextPositions() + .get(0) + .getFontSizeInPt() >= 
textBlock.getMostPopularWordFontSize()) { + + for (int i = 1; i <= headlineFontSizes.size(); i++) { + if (textBlock.getMostPopularWordFontSize() == headlineFontSizes.get(i - 1)) { + textBlock.setClassification("H " + i); + document.setHeadlines(true); + } + } + } else if (!textBlock.getText().startsWith("Table ") && !textBlock.getText().startsWith("Figure ") && PositionUtils.isWithinBodyTextFrame(bodyTextFrame, + textBlock) && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter() + .getMostPopular() + .equals("bold") && PositionUtils.getApproxLineCount(textBlock) < 2.9 && textBlock.getSequences() + .get(0) + .getTextPositions() + .get(0) + .getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) { + textBlock.setClassification("H " + (headlineFontSizes.size() + 1)); + document.setHeadlines(true); + } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter() + .getMostPopular() && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold")) { + textBlock.setClassification("TextBlock Bold"); + } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFont() + .equals(document.getFontCounter().getMostPopular()) && textBlock.getMostPopularWordStyle() + .equals(document.getFontStyleCounter().getMostPopular()) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()) { + textBlock.setClassification("TextBlock"); + } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter() + .getMostPopular() && textBlock.getMostPopularWordStyle().equals("italic") && !document.getFontStyleCounter() + .getMostPopular() + .equals("italic") && PositionUtils.getApproxLineCount(textBlock) < 2.9) { + textBlock.setClassification("TextBlock Italic"); 
+ } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)) { + textBlock.setClassification("TextBlock Unknown"); + } else { + textBlock.setClassification("Other"); + } + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/PdfParsingService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/PdfParsingService.java new file mode 100644 index 0000000..cde9a8b --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/PdfParsingService.java @@ -0,0 +1,134 @@ +package com.knecon.fforesight.service.layoutparser.processor.classification.service; + +import java.util.ArrayList; +import java.util.List; +import java.util.Map; + +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.PDPage; +import org.apache.pdfbox.pdmodel.common.PDRectangle; +import org.springframework.stereotype.Service; + +import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationDocument; +import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationPage; +import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock; +import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.classification.dto.image.ClassifiedImage; +import com.knecon.fforesight.service.layoutparser.processor.adapter.ImageServiceResponseAdapter; +import com.knecon.fforesight.service.layoutparser.processor.classification.parsing.PDFLinesTextStripper; +import com.knecon.fforesight.service.layoutparser.processor.classification.dto.AbstractTextContainer; +import 
com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.CleanRulings; +import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.CvParsedTableCell; + +import lombok.RequiredArgsConstructor; +import lombok.SneakyThrows; +import lombok.extern.slf4j.Slf4j; + +@Slf4j +@Service +@RequiredArgsConstructor +public class PdfParsingService { + + private final RulingCleaningService rulingCleaningService; + private final TableExtractionService tableExtractionService; + private final BlockificationService blockificationService; + private final ImageServiceResponseAdapter imageServiceResponseAdapter; + + + public ClassificationDocument parseDocument(PDDocument originDocument, Map> pdfTableCells, Map> pdfImages) { + + ClassificationDocument document = new ClassificationDocument(); + List classificationPages = new ArrayList<>(); + + originDocument.setAllSecurityToBeRemoved(true); + long pageCount = originDocument.getNumberOfPages(); + + for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++) { + parsePage(pdfImages, originDocument, pdfTableCells, document, classificationPages, pageNumber); + } + + document.setPages(classificationPages); + + return document; + } + + + @SneakyThrows + private void parsePage(Map> pdfImages, + PDDocument pdDocument, + Map> pdfTableCells, + ClassificationDocument document, + List classificationPages, + int pageNumber) { + + PDFLinesTextStripper stripper = new PDFLinesTextStripper(); + PDPage pdPage = pdDocument.getPage(pageNumber - 1); + stripper.setPageNumber(pageNumber); + stripper.setStartPage(pageNumber); + stripper.setEndPage(pageNumber); + stripper.setPdpage(pdPage); + stripper.getText(pdDocument); + + PDRectangle pdr = pdPage.getMediaBox(); + + int rotation = pdPage.getRotation(); + boolean isLandscape = pdr.getWidth() > pdr.getHeight() && (rotation == 0 || rotation == 180) || pdr.getHeight() > pdr.getWidth() && (rotation == 90 || rotation == 270); + + PDRectangle cropbox = 
pdPage.getCropBox(); + CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(pdfTableCells.get(pageNumber), + stripper.getRulings(), + stripper.getMinCharWidth(), + stripper.getMaxCharHeight()); + ClassificationPage classificationPage = blockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical()); + + classificationPage.setRotation(rotation); + classificationPage.setLandscape(isLandscape); + classificationPage.setPageNumber(pageNumber); + classificationPage.setPageWidth(cropbox.getWidth()); + classificationPage.setPageHeight(cropbox.getHeight()); + + // If images is ocr needs to be calculated before textBlocks are moved into tables, otherwise findOcr algorithm needs to be adopted. + if (pdfImages != null && pdfImages.containsKey(pageNumber)) { + classificationPage.setImages(pdfImages.get(pageNumber)); + imageServiceResponseAdapter.findOcr(classificationPage); + } + + tableExtractionService.removeRedundantTableCells(cleanRulings, classificationPage); + buildPageStatistics(classificationPage); + increaseDocumentStatistics(classificationPage, document); + + classificationPages.add(classificationPage); + } + + + private void increaseDocumentStatistics(ClassificationPage classificationPage, ClassificationDocument document) { + + if (!classificationPage.isLandscape()) { + document.getFontSizeCounter().addAll(classificationPage.getFontSizeCounter().getCountPerValue()); + } + document.getFontCounter().addAll(classificationPage.getFontCounter().getCountPerValue()); + document.getTextHeightCounter().addAll(classificationPage.getTextHeightCounter().getCountPerValue()); + document.getFontStyleCounter().addAll(classificationPage.getFontStyleCounter().getCountPerValue()); + } + + + private void buildPageStatistics(ClassificationPage classificationPage) { + + // Collect all statistics for the classificationPage, except from blocks inside tables, as tables will always be added to BodyTextFrame. 
+ for (AbstractTextContainer textBlock : classificationPage.getTextBlocks()) { + if (textBlock instanceof ClassificationTextBlock) { + if (((ClassificationTextBlock) textBlock).getSequences() == null) { + continue; + } + for (TextPositionSequence word : ((ClassificationTextBlock) textBlock).getSequences()) { + classificationPage.getTextHeightCounter().add(word.getTextHeight()); + classificationPage.getFontCounter().add(word.getFont()); + classificationPage.getFontSizeCounter().add(word.getFontSize()); + classificationPage.getFontStyleCounter().add(word.getFontStyle()); + } + } + } + + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/RulingCleaningService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/RulingCleaningService.java new file mode 100644 index 0000000..5dd0985 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/RulingCleaningService.java @@ -0,0 +1,231 @@ +package com.knecon.fforesight.service.layoutparser.processor.classification.service; + +import java.awt.geom.Line2D; +import java.awt.geom.Point2D; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.springframework.stereotype.Service; + +import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.CleanRulings; +import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.CvParsedTableCell; +import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.Ruling; +import com.knecon.fforesight.service.layoutparser.processor.classification.utils.DoubleComparisons; + 
+import lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; + +@Slf4j +@Service +@RequiredArgsConstructor +public class RulingCleaningService { + + public CleanRulings getCleanRulings(List cvParsedTableCells, List rulings, float minCharWidth, float maxCharHeight) { + + if (!rulings.isEmpty()) { + snapPoints(rulings, minCharWidth, maxCharHeight); + } + + List vrs = new ArrayList<>(); + for (Ruling vr : rulings) { + if (vr.vertical()) { + vrs.add(vr); + } + } + if (vrs.isEmpty()) { + vrs.addAll(extractVerticalRulings(cvParsedTableCells)); + } + List verticalRulingLines = collapseOrientedRulings(vrs); + + List hrs = new ArrayList<>(); + for (Ruling hr : rulings) { + if (hr.horizontal()) { + hrs.add(hr); + } + } + if (hrs.isEmpty()) { + hrs.addAll(extractHorizontalRulings(cvParsedTableCells)); + } + List horizontalRulingLines = collapseOrientedRulings(hrs); + + return CleanRulings.builder().vertical(verticalRulingLines).horizontal(horizontalRulingLines).build(); + } + + + public void snapPoints(List rulings, float xThreshold, float yThreshold) { + + // collect points and keep a Line -> p1,p2 map + Map linesToPoints = new HashMap<>(); + List points = new ArrayList<>(); + for (Line2D.Float r : rulings) { + Point2D p1 = r.getP1(); + Point2D p2 = r.getP2(); + linesToPoints.put(r, new Point2D[]{p1, p2}); + points.add(p1); + points.add(p2); + } + + // snap by X + points.sort(Comparator.comparingDouble(Point2D::getX)); + + List> groupedPoints = new ArrayList<>(); + groupedPoints.add(new ArrayList<>(Collections.singletonList(points.get(0)))); + + for (Point2D p : points.subList(1, points.size() - 1)) { + List last = groupedPoints.get(groupedPoints.size() - 1); + if (Math.abs(p.getX() - last.get(0).getX()) < xThreshold) { + groupedPoints.get(groupedPoints.size() - 1).add(p); + } else { + groupedPoints.add(new ArrayList<>(Collections.singletonList(p))); + } + } + + for (List group : groupedPoints) { + float avgLoc = 0; + for (Point2D p : group) { + avgLoc += 
p.getX(); + } + avgLoc /= group.size(); + for (Point2D p : group) { + p.setLocation(avgLoc, p.getY()); + } + } + // --- + + // snap by Y + points.sort(Comparator.comparingDouble(Point2D::getY)); + + groupedPoints = new ArrayList<>(); + groupedPoints.add(new ArrayList<>(Collections.singletonList(points.get(0)))); + + for (Point2D p : points.subList(1, points.size() - 1)) { + List last = groupedPoints.get(groupedPoints.size() - 1); + if (Math.abs(p.getY() - last.get(0).getY()) < yThreshold) { + groupedPoints.get(groupedPoints.size() - 1).add(p); + } else { + groupedPoints.add(new ArrayList<>(Collections.singletonList(p))); + } + } + + for (List group : groupedPoints) { + float avgLoc = 0; + for (Point2D p : group) { + avgLoc += p.getY(); + } + avgLoc /= group.size(); + for (Point2D p : group) { + p.setLocation(p.getX(), avgLoc); + } + } + // --- + + // finally, modify lines + for (Map.Entry ltp : linesToPoints.entrySet()) { + Point2D[] p = ltp.getValue(); + ltp.getKey().setLine(p[0], p[1]); + } + } + + + private Collection extractVerticalRulings(List cvParsedTableCells) { + + List vrs = new ArrayList<>(); + + if (cvParsedTableCells != null) { + for (CvParsedTableCell cvParsedTableCell : cvParsedTableCells) { + Ruling leftLine = createRuling(cvParsedTableCell.getX0(), cvParsedTableCell.getX0(), cvParsedTableCell.getY0(), cvParsedTableCell.getY1()); + Ruling rightLine = createRuling(cvParsedTableCell.getX1(), cvParsedTableCell.getX1(), cvParsedTableCell.getY0(), cvParsedTableCell.getY1()); + vrs.add(leftLine); + vrs.add(rightLine); + } + } + return vrs; + } + + + private Collection extractHorizontalRulings(List cvParsedTableCells) { + + List hrs = new ArrayList<>(); + + if (cvParsedTableCells != null) { + for (CvParsedTableCell cvParsedTableCell : cvParsedTableCells) { + Ruling topLine = createRuling(cvParsedTableCell.getX0(), cvParsedTableCell.getX1(), cvParsedTableCell.getY1(), cvParsedTableCell.getY1()); + Ruling baseLine = createRuling(cvParsedTableCell.getX0(), 
cvParsedTableCell.getX1(), cvParsedTableCell.getY0(), cvParsedTableCell.getY0()); + hrs.add(topLine); + hrs.add(baseLine); + } + } + + return hrs; + } + + + private Ruling createRuling(float tableCellX0, float tableCellX1, float tableCellY0, float tableCellY1) { + + float x0 = tableCellX0; + float x1 = tableCellX1; + float y0 = tableCellY0; + float y1 = tableCellY1; + + if (x1 < x0) { + x0 = tableCellX1; + x1 = tableCellX0; + } + + if (y1 < y0) { + y0 = tableCellY1; + y1 = tableCellY0; + } + + return new Ruling(new Point2D.Float(x0, y0), new Point2D.Float(x1, y1)); + } + + + private List collapseOrientedRulings(List lines) { + + int COLINEAR_OR_PARALLEL_PIXEL_EXPAND_AMOUNT = 1; + return collapseOrientedRulings(lines, COLINEAR_OR_PARALLEL_PIXEL_EXPAND_AMOUNT); + } + + + private List collapseOrientedRulings(List lines, int expandAmount) { + + ArrayList rv = new ArrayList<>(); + lines.sort((a, b) -> { + final float diff = a.getPosition() - b.getPosition(); + return Float.compare(diff == 0 ? a.getStart() - b.getStart() : diff, 0f); + }); + + for (Ruling next_line : lines) { + Ruling last = rv.isEmpty() ? null : rv.get(rv.size() - 1); + // if current line colinear with next, and are "close enough": expand current line + if (last != null && DoubleComparisons.feq(next_line.getPosition(), last.getPosition()) && last.nearlyIntersects(next_line, expandAmount)) { + final float lastStart = last.getStart(); + final float lastEnd = last.getEnd(); + + final boolean lastFlipped = lastStart > lastEnd; + final boolean nextFlipped = next_line.getStart() > next_line.getEnd(); + + boolean differentDirections = nextFlipped != lastFlipped; + float nextS = differentDirections ? next_line.getEnd() : next_line.getStart(); + float nextE = differentDirections ? next_line.getStart() : next_line.getEnd(); + + final float newStart = lastFlipped ? Math.max(nextS, lastStart) : Math.min(nextS, lastStart); + final float newEnd = lastFlipped ? 
Math.min(nextE, lastEnd) : Math.max(nextE, lastEnd); + last.setStartEnd(newStart, newEnd); + assert !last.oblique(); + } else if (next_line.length() == 0) { + continue; + } else { + rv.add(next_line); + } + } + return rv; + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/SectionsBuilderService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/SectionsBuilderService.java new file mode 100644 index 0000000..a8309ba --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/SectionsBuilderService.java @@ -0,0 +1,303 @@ +package com.knecon.fforesight.service.layoutparser.processor.classification.service; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +import org.springframework.stereotype.Service; + +import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationDocument; +import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationFooter; +import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationPage; +import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationSection; +import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationHeader; +import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock; +import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.UnclassifiedText; +import com.knecon.fforesight.service.layoutparser.processor.classification.dto.image.ClassifiedImage; +import 
com.knecon.fforesight.service.layoutparser.processor.classification.dto.AbstractTextContainer; +import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.TableCell; +import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.Table; + +import lombok.extern.slf4j.Slf4j; + +@Slf4j +@Service +public class SectionsBuilderService { + + public void buildSections(ClassificationDocument document) { + + List chunkWords = new ArrayList<>(); + List chunkBlockList = new ArrayList<>(); + List headers = new ArrayList<>(); + List footers = new ArrayList<>(); + List unclassifiedTexts = new ArrayList<>(); + + AbstractTextContainer prev = null; + + String lastHeadline = ""; + Table previousTable = null; + for (ClassificationPage classificationPage : document.getPages()) { + List header = new ArrayList<>(); + List footer = new ArrayList<>(); + List unclassifiedText = new ArrayList<>(); + for (AbstractTextContainer current : classificationPage.getTextBlocks()) { + + if (current.getClassification() == null) { + continue; + } + + current.setPage(classificationPage.getPageNumber()); + + if (current.getClassification().equals("Header")) { + header.add((ClassificationTextBlock) current); + continue; + } + + if (current.getClassification().equals("Footer")) { + footer.add((ClassificationTextBlock) current); + continue; + } + + if (current.getClassification().equals("Other")) { + unclassifiedText.add((ClassificationTextBlock) current); + continue; + } + + if (prev != null && current.getClassification().startsWith("H ") && !prev.getClassification().startsWith("H ") || !document.isHeadlines()) { + ClassificationSection chunkBlock = buildTextBlock(chunkWords, lastHeadline); + chunkBlock.setHeadline(lastHeadline); + if (document.isHeadlines()) { + lastHeadline = current.getText(); + } + chunkBlockList.add(chunkBlock); + chunkWords = new ArrayList<>(); + if (!chunkBlock.getTables().isEmpty()) { + previousTable = 
chunkBlock.getTables().get(chunkBlock.getTables().size() - 1); + } + } + if (current instanceof Table table) { + // Distribute header information for subsequent tables + mergeTableMetadata(table, previousTable); + previousTable = table; + } + chunkWords.add(current); + prev = current; + } + + if (!header.isEmpty()) { + headers.add(new ClassificationHeader(header)); + } + if (!footer.isEmpty()) { + footers.add(new ClassificationFooter(footer)); + } + if (!unclassifiedText.isEmpty()) { + unclassifiedTexts.add(new UnclassifiedText(unclassifiedText)); + } + } + + ClassificationSection chunkBlock = buildTextBlock(chunkWords, lastHeadline); + chunkBlock.setHeadline(lastHeadline); + chunkBlockList.add(chunkBlock); + + document.setSections(chunkBlockList); + document.setHeaders(headers); + document.setFooters(footers); + document.setUnclassifiedTexts(unclassifiedTexts); + addImagesToSections(document); + } + + + private void addImagesToSections(ClassificationDocument document) { + + Map> sectionMap = new HashMap<>(); + for (ClassificationSection section : document.getSections()) { + for (AbstractTextContainer container : section.getPageBlocks()) { + + List sectionsOnPage = sectionMap.computeIfAbsent(container.getPage(), c -> new ArrayList<>()); + if (sectionsOnPage.contains(section)) { + continue; + } + sectionsOnPage.add(section); + } + } + + if (sectionMap.isEmpty()) { + ClassificationSection section = new ClassificationSection(); + document.getSections().add(section); + sectionMap.computeIfAbsent(1, x -> new ArrayList<>()).add(section); + } + + // first page is always a paragraph, else we can't process pages 1..N, + // where N is the first found page with a paragraph + if (sectionMap.get(1) == null) { + ClassificationSection section = new ClassificationSection(); + document.getSections().add(section); + sectionMap.computeIfAbsent(1, x -> new ArrayList<>()).add(section); + } + + for (ClassificationPage classificationPage : document.getPages()) { + for (ClassifiedImage 
image : classificationPage.getImages()) { + List sectionsOnPage = sectionMap.get(classificationPage.getPageNumber()); + if (sectionsOnPage == null) { + int i = classificationPage.getPageNumber(); + while (sectionsOnPage == null) { + sectionsOnPage = sectionMap.get(i); + i--; + } + } + for (ClassificationSection section : sectionsOnPage) { + Float xMin = null; + Float yMin = null; + Float xMax = null; + Float yMax = null; + + for (AbstractTextContainer abs : section.getPageBlocks()) { + if (abs.getPage() != classificationPage.getPageNumber()) { + continue; + } + + if (abs.getMinX() < abs.getMaxX()) { + if (xMin == null || abs.getMinX() < xMin) { + xMin = abs.getMinX(); + } + if (xMax == null || abs.getMaxX() > xMax) { + xMax = abs.getMaxX(); + } + } else { + if (xMin == null || abs.getMaxX() < xMin) { + xMin = abs.getMaxX(); + } + if (xMax == null || abs.getMinX() > xMax) { + xMax = abs.getMinX(); + } + } + + if (abs.getMinY() < abs.getMaxY()) { + if (yMin == null || abs.getMinY() < yMin) { + yMin = abs.getMinY(); + } + if (yMax == null || abs.getMaxY() > yMax) { + yMax = abs.getMaxY(); + } + } else { + if (yMin == null || abs.getMaxY() < yMin) { + yMin = abs.getMaxY(); + } + if (yMax == null || abs.getMinY() > yMax) { + yMax = abs.getMinY(); + } + } + + } + + log.debug("Image position x: {}, y: {}", image.getPosition().getX(), image.getPosition().getY()); + log.debug("Paragraph position xMin: {}, xMax: {}, yMin: {}, yMax: {}", xMin, xMax, yMin, yMax); + + if (xMin != null && xMax != null && yMin != null && yMax != null && image.getPosition().getX() >= xMin && image.getPosition() + .getX() <= xMax && image.getPosition().getY() >= yMin && image.getPosition().getY() <= yMax) { + section.getImages().add(image); + image.setAppendedToSection(true); + } + } + if (!image.isAppendedToSection()) { + log.debug("Image uses first paragraph"); + sectionsOnPage.get(0).getImages().add(image); + image.setAppendedToSection(true); + } + } + } + } + + + private void 
mergeTableMetadata(Table currentTable, Table previousTable) { + + // Distribute header information for subsequent tables + if (previousTable != null && hasInvalidHeaderInformation(currentTable) && hasValidHeaderInformation(previousTable)) { + List previousTableNonHeaderRow = getRowWithNonHeaderCells(previousTable); + List tableNonHeaderRow = getRowWithNonHeaderCells(currentTable); + // Allow merging of tables if header row is separated from first logical non-header row + if (previousTableNonHeaderRow.isEmpty() && previousTable.getRowCount() == 1 && previousTable.getRows().get(0).size() == tableNonHeaderRow.size()) { + previousTableNonHeaderRow = previousTable.getRows().get(0).stream().map(cell -> { + TableCell fakeCell = new TableCell(cell.getPoints()[0], cell.getPoints()[2]); + fakeCell.setHeaderCells(Collections.singletonList(cell)); + return fakeCell; + }).collect(Collectors.toList()); + } + if (previousTableNonHeaderRow.size() == tableNonHeaderRow.size()) { + for (int i = currentTable.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table + List row = currentTable.getRows().get(i); + if (row.size() == tableNonHeaderRow.size() && row.stream().allMatch(cell -> cell.getHeaderCells().isEmpty())) { + for (int j = 0; j < row.size(); j++) { + row.get(j).setHeaderCells(previousTableNonHeaderRow.get(j).getHeaderCells()); + } + } + } + } + } + } + + + private ClassificationSection buildTextBlock(List wordBlockList, String lastHeadline) { + + ClassificationSection section = new ClassificationSection(); + + for (AbstractTextContainer container : wordBlockList) { + if (container instanceof Table table) { + + if (lastHeadline == null || lastHeadline.isEmpty()) { + table.setHeadline("Text in table"); + } else { + table.setHeadline("Table in: " + lastHeadline); + } + + section.getPageBlocks().add(table); + continue; + } + + ClassificationTextBlock wordBlock = (ClassificationTextBlock) container; + section.getPageBlocks().add(wordBlock); + } + 
return section; + } + + + private boolean hasValidHeaderInformation(Table table) { + + return !hasInvalidHeaderInformation(table); + } + + + private boolean hasInvalidHeaderInformation(Table table) { + + return table.getRows().stream().flatMap(row -> row.stream().filter(cell -> !cell.getHeaderCells().isEmpty())).findAny().isEmpty(); + + } + + + private List getRowWithNonHeaderCells(Table table) { + + for (int i = table.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table + List row = table.getRows().get(i); + if (row.size() == 1) { + continue; + } + boolean allNonHeader = true; + for (TableCell cell : row) { + if (cell.isHeaderCell()) { + allNonHeader = false; + break; + } + } + if (allNonHeader) { + return row; + } + } + + return Collections.emptyList(); + + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/TableExtractionService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/TableExtractionService.java new file mode 100644 index 0000000..515e4cc --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/TableExtractionService.java @@ -0,0 +1,338 @@ +package com.knecon.fforesight.service.layoutparser.processor.classification.service; + +import java.awt.geom.Point2D; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; + +import org.springframework.stereotype.Service; + +import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationPage; +import 
com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock; +import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.TableCell; +import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.CleanRulings; +import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.Ruling; +import com.knecon.fforesight.service.layoutparser.processor.classification.utils.QuickSort; +import com.knecon.fforesight.service.layoutparser.processor.classification.utils.DoubleComparisons; +import com.knecon.fforesight.service.layoutparser.processor.classification.dto.AbstractTextContainer; +import com.knecon.fforesight.service.layoutparser.processor.classification.dto.Rectangle; +import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.Table; + +@Service +public class TableExtractionService { + + private static final Comparator X_FIRST_POINT_COMPARATOR = (arg0, arg1) -> { + + int rv = 0; + float arg0X = DoubleComparisons.round(arg0.getX(), 2); + float arg0Y = DoubleComparisons.round(arg0.getY(), 2); + float arg1X = DoubleComparisons.round(arg1.getX(), 2); + float arg1Y = DoubleComparisons.round(arg1.getY(), 2); + + if (arg0X > arg1X) { + rv = 1; + } else if (arg0X < arg1X) { + rv = -1; + } else if (arg0Y > arg1Y) { + rv = 1; + } else if (arg0Y < arg1Y) { + rv = -1; + } + return rv; + }; + private static final Comparator POINT_COMPARATOR = (arg0, arg1) -> { + + int rv = 0; + float arg0X = DoubleComparisons.round(arg0.getX(), 2); + float arg0Y = DoubleComparisons.round(arg0.getY(), 2); + float arg1X = DoubleComparisons.round(arg1.getX(), 2); + float arg1Y = DoubleComparisons.round(arg1.getY(), 2); + + if (arg0Y > arg1Y) { + rv = 1; + } else if (arg0Y < arg1Y) { + rv = -1; + } else if (arg0X > arg1X) { + rv = 1; + } else if (arg0X < arg1X) { + rv = -1; + } + return rv; + }; + + + /** + * Finds tables on a classificationPage and moves textblocks into 
cells of the found tables. + * Note: This algorithm uses Pdf Coordinate System where {0,0} rotated with the classificationPage rotation. + * 0 -> LowerLeft + * 90 -> UpperLeft + * 180 -> UpperRight + * 270 -> LowerRight + * + * DirAdj (Text direction adjusted) values can not be used here. + * + * @param cleanRulings The lines used to build the table. + * @param classificationPage ClassificationPage object that contains textblocks and statistics. + */ + public void removeRedundantTableCells(CleanRulings cleanRulings, ClassificationPage classificationPage) { + + List cells = findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical()); + + List toBeRemoved = new ArrayList<>(); + + for (AbstractTextContainer abstractTextContainer : classificationPage.getTextBlocks()) { + ClassificationTextBlock textBlock = (ClassificationTextBlock) abstractTextContainer; + for (TableCell cell : cells) { + if (cell.intersects(textBlock.getPdfMinX(), + textBlock.getPdfMinY(), + textBlock.getPdfMaxX() - textBlock.getPdfMinX(), + textBlock.getPdfMaxY() - textBlock.getPdfMinY())) { + cell.addTextBlock(textBlock); + toBeRemoved.add(textBlock); + break; + } + } + } + + cells = new ArrayList<>(new HashSet<>(cells)); + QuickSort.sort(cells, Rectangle.ILL_DEFINED_ORDER); + + List spreadsheetAreas = findSpreadsheetsFromCells(cells).stream().filter(r -> r.getWidth() > 0f && r.getHeight() > 0f).collect(Collectors.toList()); + + List tables = new ArrayList<>(); + for (Rectangle area : spreadsheetAreas) { + + List overlappingCells = new ArrayList<>(); + for (TableCell c : cells) { + if (c.intersects(area)) { + overlappingCells.add(c); + } + } + tables.add(new Table(overlappingCells, area, classificationPage.getRotation())); + } + + for (Table table : tables) { + int position = -1; + + Iterator itty = classificationPage.getTextBlocks().iterator(); + while (itty.hasNext()) { + AbstractTextContainer textBlock = itty.next(); + if (textBlock instanceof ClassificationTextBlock ? 
table.containsBlock((ClassificationTextBlock) textBlock) : table.contains(textBlock) && position == -1) { + position = classificationPage.getTextBlocks().indexOf(textBlock); + } + } + if (position != -1) { + classificationPage.getTextBlocks().add(position, table); + } + } + + classificationPage.getTextBlocks().removeAll(toBeRemoved); + } + + + public List findCells(List horizontalRulingLines, List verticalRulingLines) { + + List cellsFound = new ArrayList<>(); + Map intersectionPoints = Ruling.findIntersections(horizontalRulingLines, verticalRulingLines); + List intersectionPointsList = new ArrayList<>(intersectionPoints.keySet()); + intersectionPointsList.sort(POINT_COMPARATOR); + + for (int i = 0; i < intersectionPointsList.size(); i++) { + Point2D topLeft = intersectionPointsList.get(i); + Ruling[] hv = intersectionPoints.get(topLeft); + + // CrossingPointsDirectlyBelow( topLeft ); + List xPoints = new ArrayList<>(); + // CrossingPointsDirectlyToTheRight( topLeft ); + List yPoints = new ArrayList<>(); + + for (Point2D p : intersectionPointsList.subList(i, intersectionPointsList.size())) { + if (p.getX() == topLeft.getX() && p.getY() > topLeft.getY()) { + xPoints.add(p); + } + if (p.getY() == topLeft.getY() && p.getX() > topLeft.getX()) { + yPoints.add(p); + } + } + outer: + for (Point2D xPoint : xPoints) { + // is there a vertical edge b/w topLeft and xPoint? + if (!hv[1].equals(intersectionPoints.get(xPoint)[1])) { + continue; + } + for (Point2D yPoint : yPoints) { + // is there an horizontal edge b/w topLeft and yPoint ? 
+ if (!hv[0].equals(intersectionPoints.get(yPoint)[0])) { + continue; + } + Point2D btmRight = new Point2D.Float((float) yPoint.getX(), (float) xPoint.getY()); + if (intersectionPoints.containsKey(btmRight) && intersectionPoints.get(btmRight)[0].equals(intersectionPoints.get(xPoint)[0]) && intersectionPoints.get(btmRight)[1].equals( + intersectionPoints.get(yPoint)[1])) { + cellsFound.add(new TableCell(topLeft, btmRight)); + break outer; + } + } + } + } + + // TODO create cells for vertical ruling lines with aligned endpoints at the top/bottom of a grid + // that aren't connected with an horizontal ruler? + // see: https://github.com/jazzido/tabula-extractor/issues/78#issuecomment-41481207 + + return cellsFound; + } + + + private List findSpreadsheetsFromCells(List cells) { + // via: http://stackoverflow.com/questions/13746284/merging-multiple-adjacent-rectangles-into-one-polygon + List rectangles = new ArrayList<>(); + Set pointSet = new HashSet<>(); + Map edgesH = new HashMap<>(); + Map edgesV = new HashMap<>(); + int i = 0; + + for (Rectangle cell : cells) { + for (Point2D pt : cell.getPoints()) { + if (pointSet.contains(pt)) { // shared vertex, remove it + pointSet.remove(pt); + } else { + pointSet.add(pt); + } + } + } + + // X first sort + List pointsSortX = new ArrayList<>(pointSet); + pointsSortX.sort(X_FIRST_POINT_COMPARATOR); + // Y first sort + List pointsSortY = new ArrayList<>(pointSet); + pointsSortY.sort(POINT_COMPARATOR); + + while (i < pointSet.size()) { + float currY = (float) pointsSortY.get(i).getY(); + while (i < pointSet.size() && DoubleComparisons.feq(pointsSortY.get(i).getY(), currY)) { + edgesH.put(pointsSortY.get(i), pointsSortY.get(i + 1)); + edgesH.put(pointsSortY.get(i + 1), pointsSortY.get(i)); + i += 2; + } + } + + i = 0; + while (i < pointSet.size()) { + float currX = (float) pointsSortX.get(i).getX(); + while (i < pointSet.size() && DoubleComparisons.feq(pointsSortX.get(i).getX(), currX)) { + edgesV.put(pointsSortX.get(i), 
pointsSortX.get(i + 1)); + edgesV.put(pointsSortX.get(i + 1), pointsSortX.get(i)); + i += 2; + } + } + + // Get all the polygons + List> polygons = new ArrayList<>(); + Point2D nextVertex; + while (!edgesH.isEmpty()) { + ArrayList polygon = new ArrayList<>(); + Point2D first = edgesH.keySet().iterator().next(); + polygon.add(new PolygonVertex(first, Direction.HORIZONTAL)); + edgesH.remove(first); + + while (true) { + PolygonVertex curr = polygon.get(polygon.size() - 1); + PolygonVertex lastAddedVertex; + if (curr.direction == Direction.HORIZONTAL) { + nextVertex = edgesV.get(curr.point); + edgesV.remove(curr.point); + lastAddedVertex = new PolygonVertex(nextVertex, Direction.VERTICAL); + } else { + nextVertex = edgesH.get(curr.point); + edgesH.remove(curr.point); + lastAddedVertex = new PolygonVertex(nextVertex, Direction.HORIZONTAL); + } + polygon.add(lastAddedVertex); + + if (lastAddedVertex.equals(polygon.get(0))) { + // closed polygon + polygon.remove(polygon.size() - 1); + break; + } + } + + for (PolygonVertex vertex : polygon) { + edgesH.remove(vertex.point); + edgesV.remove(vertex.point); + } + polygons.add(polygon); + } + + // calculate grid-aligned minimum area rectangles for each found polygon + for (List poly : polygons) { + float top = Float.MAX_VALUE; + float left = Float.MAX_VALUE; + float bottom = Float.MIN_VALUE; + float right = Float.MIN_VALUE; + for (PolygonVertex pt : poly) { + top = (float) Math.min(top, pt.point.getY()); + left = (float) Math.min(left, pt.point.getX()); + bottom = (float) Math.max(bottom, pt.point.getY()); + right = (float) Math.max(right, pt.point.getX()); + } + rectangles.add(new Rectangle(top, left, right - left, bottom - top)); + } + + return rectangles; + } + + + private enum Direction { + HORIZONTAL, + VERTICAL + } + + static class PolygonVertex { + + Point2D point; + Direction direction; + + + PolygonVertex(Point2D point, Direction direction) { + + this.direction = direction; + this.point = point; + } + + + @Override + 
public boolean equals(Object other) { + + if (this == other) { + return true; + } + if (!(other instanceof PolygonVertex)) { + return false; + } + return this.point.equals(((PolygonVertex) other).point); + } + + + @Override + public int hashCode() { + + return this.point.hashCode(); + } + + + @Override + public String toString() { + + return String.format("%s[point=%s,direction=%s]", this.getClass().getName(), this.point.toString(), this.direction.toString()); + } + + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/utils/CohenSutherlandClipping.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/utils/CohenSutherlandClipping.java new file mode 100644 index 0000000..6c424c9 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/utils/CohenSutherlandClipping.java @@ -0,0 +1,142 @@ +/* + * CohenSutherland.java + * -------------------- + * (c) 2007 by Intevation GmbH + * + * @author Sascha L. Teichmann (teichmann@intevation.de) + * @author Ludwig Reiter (ludwig@intevation.de) + * + * This program is free software under the LGPL (>=v2.1) + * Read the file LICENSE.txt coming with the sources for details. + */ +package com.knecon.fforesight.service.layoutparser.processor.classification.utils; + +import java.awt.geom.Line2D; +import java.awt.geom.Rectangle2D; + +/** + * Implements the well known Cohen Sutherland line + * clipping algorithm (line against clip rectangle). 
+ */ +@SuppressWarnings("all") +public final class CohenSutherlandClipping { + + private static final int INSIDE = 0; + private static final int LEFT = 1; + private static final int RIGHT = 2; + private static final int BOTTOM = 4; + private static final int TOP = 8; + private double xMin; + private double yMin; + private double xMax; + private double yMax; + + + /** + * Creates a Cohen Sutherland clipper with clip rect (0, 0, 0, 0). + */ + public CohenSutherlandClipping() { + + } + + + /** + * Creates a Cohen Sutherland clipper with the given clip rectangle. + * + * @param clip the clip rectangle to use + */ + public CohenSutherlandClipping(Rectangle2D clip) { + + setClip(clip); + } + + + /** + * Sets the clip rectangle. + * + * @param clip the clip rectangle + */ + public void setClip(Rectangle2D clip) { + + xMin = clip.getX(); + xMax = xMin + clip.getWidth(); + yMin = clip.getY(); + yMax = yMin + clip.getHeight(); + } + + + private final int regionCode(double x, double y) { + + int code = x < xMin ? LEFT : x > xMax ? RIGHT : INSIDE; + if (y < yMin) { + code |= BOTTOM; + } else if (y > yMax) { + code |= TOP; + } + return code; + } + + + /** + * Clips a given line against the clip rectangle. + * The modification (if needed) is done in place. + * + * @param line the line to clip + * @return true if line is clipped, false if line is + * totally outside the clip rect. + */ + public boolean clip(Line2D.Float line) { + + double p1x = line.getX1(); + double p1y = line.getY1(); + double p2x = line.getX2(); + double p2y = line.getY2(); + + double qx = 0d; + double qy = 0d; + + boolean vertical = p1x == p2x; + + double slope = vertical ? 0d : (p2y - p1y) / (p2x - p1x); + + int c1 = regionCode(p1x, p1y); + int c2 = regionCode(p2x, p2y); + + while (c1 != INSIDE || c2 != INSIDE) { + + if ((c1 & c2) != INSIDE) { + return false; + } + + int c = c1 == INSIDE ? c2 : c1; + + if ((c & LEFT) != INSIDE) { + qx = xMin; + qy = (DoubleComparisons.feq(qx, p1x) ? 
0 : qx - p1x) * slope + p1y; + } else if ((c & RIGHT) != INSIDE) { + qx = xMax; + qy = (DoubleComparisons.feq(qx, p1x) ? 0 : qx - p1x) * slope + p1y; + } else if ((c & BOTTOM) != INSIDE) { + qy = yMin; + qx = vertical ? p1x : (DoubleComparisons.feq(qy, p1y) ? 0 : qy - p1y) / slope + p1x; + } else if ((c & TOP) != INSIDE) { + qy = yMax; + qx = vertical ? p1x : (DoubleComparisons.feq(qy, p1y) ? 0 : qy - p1y) / slope + p1x; + } + + if (c == c1) { + p1x = qx; + p1y = qy; + c1 = regionCode(p1x, p1y); + } else { + p2x = qx; + p2y = qy; + c2 = regionCode(p2x, p2y); + } + } + line.setLine(p1x, p1y, p2x, p2y); + return true; + } + +} +// end of file diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/utils/DoubleComparisons.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/utils/DoubleComparisons.java new file mode 100644 index 0000000..36ef41b --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/utils/DoubleComparisons.java @@ -0,0 +1,30 @@ +package com.knecon.fforesight.service.layoutparser.processor.classification.utils; + +import java.math.BigDecimal; + +import lombok.experimental.UtilityClass; +import lombok.extern.slf4j.Slf4j; + +@Slf4j +@UtilityClass +@SuppressWarnings("all") +public final class DoubleComparisons { + + private final static float EPSILON = 0.1f; + + + public static boolean feq(double f1, double f2) { + + return (Math.abs(f1 - f2) < EPSILON); + } + + + public static float round(double d, int decimalPlace) { + BigDecimal bd = BigDecimal.valueOf(d); + bd = bd.setScale(decimalPlace, BigDecimal.ROUND_HALF_UP); + return bd.floatValue(); + } + + +} + diff --git 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/utils/PositionUtils.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/utils/PositionUtils.java new file mode 100644 index 0000000..765ea6c --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/utils/PositionUtils.java @@ -0,0 +1,119 @@ +package com.knecon.fforesight.service.layoutparser.processor.classification.utils; + +import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle; +import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock; + +import lombok.experimental.UtilityClass; + +@UtilityClass +@SuppressWarnings("all") +public final class PositionUtils { + + // TODO This currently uses pdf coord system. In the futher this should use java coord system. + // Note: DirAdj (TextDirection Adjusted) can not be user for this. + public boolean isWithinBodyTextFrame(Rectangle btf, ClassificationTextBlock textBlock) { + + if (btf == null || textBlock == null) { + return false; + } + + double threshold = textBlock.getMostPopularWordHeight() * 3; + + if (textBlock.getPdfMinX() + threshold > btf.getTopLeft().getX() && textBlock.getPdfMaxX() - threshold < btf.getTopLeft() + .getX() + btf.getWidth() && textBlock.getPdfMinY() + threshold > btf.getTopLeft().getY() && textBlock.getPdfMaxY() - threshold < btf.getTopLeft() + .getY() + btf.getHeight()) { + return true; + } else { + return false; + } + + } + + + // TODO This currently uses pdf coord system. In the futher this should use java coord system. + // Note: DirAdj (TextDirection Adjusted) can not be user for this. 
+ public boolean isOverBodyTextFrame(Rectangle btf, ClassificationTextBlock textBlock, int rotation) { + + if (btf == null || textBlock == null) { + return false; + } + + if (rotation == 90 && textBlock.getPdfMaxX() < btf.getTopLeft().getX()) { + return true; + } + + if (rotation == 180 && textBlock.getPdfMaxY() < btf.getTopLeft().getY()) { + return true; + } + + if (rotation == 270 && textBlock.getPdfMinX() > btf.getTopLeft().getX() + btf.getWidth()) { + return true; + } + + if (rotation == 0 && textBlock.getPdfMinY() > btf.getTopLeft().getY() + btf.getHeight()) { + return true; + } else { + return false; + } + + } + + // TODO This currently uses pdf coord system. In the futher this should use java coord system. + // Note: DirAdj (TextDirection Adjusted) can not be user for this. + public boolean isUnderBodyTextFrame(Rectangle btf, ClassificationTextBlock textBlock, int rotation) { + + if (btf == null || textBlock == null) { + return false; + } + + if (rotation == 90 && textBlock.getPdfMinX() > btf.getTopLeft().getX() + btf.getWidth()) { + return true; + } + + if (rotation == 180 && textBlock.getPdfMinY() > btf.getTopLeft().getY() + btf.getHeight()) { + return true; + } + + if (rotation == 270 && textBlock.getPdfMaxX() < btf.getTopLeft().getX()) { + return true; + } + + if (rotation == 0 && textBlock.getPdfMaxY() < btf.getTopLeft().getY()) { + return true; + } else { + return false; + } + + } + + // TODO This currently uses pdf coord system. In the futher this should use java coord system. + // Note: DirAdj (TextDirection Adjusted) can not be user for this. + public boolean isTouchingUnderBodyTextFrame(Rectangle btf, ClassificationTextBlock textBlock) { + + //TODO Currently this is not working for rotated pages. 
+ + if (btf == null || textBlock == null) { + return false; + } + + if (textBlock.getMinY() < btf.getTopLeft().getY()) { + return true; + } else { + return false; + } + + } + + + public float getHeightDifferenceBetweenChunkWordAndDocumentWord(ClassificationTextBlock textBlock, Float documentMostPopularWordHeight) { + + return textBlock.getMostPopularWordHeight() - documentMostPopularWordHeight; + } + + + public Float getApproxLineCount(ClassificationTextBlock textBlock) { + + return textBlock.getHeight() / textBlock.getMostPopularWordHeight(); + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/utils/QuickSort.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/utils/QuickSort.java new file mode 100644 index 0000000..5e65c49 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/utils/QuickSort.java @@ -0,0 +1,109 @@ +package com.knecon.fforesight.service.layoutparser.processor.classification.utils; + +import java.util.ArrayDeque; +import java.util.Comparator; +import java.util.Deque; +import java.util.List; + +import lombok.experimental.UtilityClass; + +/** + * Copied and minimal modified from PDFBox. + */ +@UtilityClass +public final class QuickSort { + + private static final Comparator OBJCOMP = new Comparator() { + @Override + public int compare(Comparable object1, Comparable object2) { + + return object1.compareTo(object2); + } + }; + + + /** + * Sorts the given list using the given comparator. + * + * @param type of the objects to be sorted. 
+ * @param list list to be sorted + * @param cmp comparator used to compare the objects within the list + */ + public static void sort(List list, Comparator cmp) { + + int size = list.size(); + if (size < 2) { + return; + } + quicksort(list, cmp); + } + + + /** + * Sorts the given list using compareTo as comparator. + * + * @param type of the objects to be sorted. + * @param list list to be sorted + */ + public static void sort(List list) { + + sort(list, (Comparator) OBJCOMP); + } + + + private static void quicksort(List list, Comparator cmp) { + + Deque stack = new ArrayDeque(); + stack.push(0); + stack.push(list.size()); + while (!stack.isEmpty()) { + int right = stack.pop(); + int left = stack.pop(); + if (right - left < 2) { + continue; + } + int p = left + ((right - left) / 2); + p = partition(list, cmp, p, left, right); + + stack.push(p + 1); + stack.push(right); + + stack.push(left); + stack.push(p); + } + } + + + private static int partition(List list, Comparator cmp, int p, int start, int end) { + + int l = start; + int h = end - 2; + T piv = list.get(p); + swap(list, p, end - 1); + + while (l < h) { + if (cmp.compare(list.get(l), piv) <= 0) { + l++; + } else if (cmp.compare(piv, list.get(h)) <= 0) { + h--; + } else { + swap(list, l, h); + } + } + int idx = h; + if (cmp.compare(list.get(h), piv) < 0) { + idx++; + } + swap(list, end - 1, idx); + return idx; + } + + + private static void swap(List list, int i, int j) { + + T tmp = list.get(i); + list.set(i, list.get(j)); + list.set(j, tmp); + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/utils/RulingTextDirAdjustUtil.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/utils/RulingTextDirAdjustUtil.java new file mode 100644 index 0000000..ce3ac2b --- /dev/null +++ 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/utils/RulingTextDirAdjustUtil.java @@ -0,0 +1,64 @@ +package com.knecon.fforesight.service.layoutparser.processor.classification.utils; + +import java.awt.geom.Line2D; +import java.awt.geom.Point2D; + +import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.Ruling; + +import lombok.experimental.UtilityClass; + +@UtilityClass +public final class RulingTextDirAdjustUtil { + + /** + * Converts a ruling (line of a table) the same way TextPositions are converted in PDFBox. + * This will get the y position of the text, adjusted so that 0,0 is upper left and it is adjusted based on the text direction. + * + * See org.apache.pdfbox.text.TextPosition + */ + public Line2D.Float convertToDirAdj(Ruling ruling, float dir, float pageWidth, float pageHeight) { + + return new Line2D.Float(convertPoint(ruling.x1, ruling.y1, dir, pageWidth, pageHeight), convertPoint(ruling.x2, ruling.y2, dir, pageWidth, pageHeight)); + } + + + private Point2D convertPoint(float x, float y, float dir, float pageWidth, float pageHeight) { + + var xAdj = getXRot(x, y, dir, pageWidth, pageHeight); + var yLowerLeftRot = getYLowerLeftRot(x, y, dir, pageWidth, pageHeight); + var yAdj = dir == 0 || dir == 180 ? 
pageHeight - yLowerLeftRot : pageWidth - yLowerLeftRot; + return new Point2D.Float(xAdj, yAdj); + } + + + @SuppressWarnings("SuspiciousNameCombination") + private float getXRot(float x, float y, float dir, float pageWidth, float pageHeight) { + + if (dir == 0) { + return x; + } else if (dir == 90) { + return y; + } else if (dir == 180) { + return pageWidth - x; + } else if (dir == 270) { + return pageHeight - y; + } + return 0; + } + + + private float getYLowerLeftRot(float x, float y, float dir, float pageWidth, float pageHeight) { + + if (dir == 0) { + return y; + } else if (dir == 90) { + return pageWidth - x; + } else if (dir == 180) { + return pageHeight - y; + } else if (dir == 270) { + return x; + } + return 0; + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/utils/TextNormalizationUtilities.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/utils/TextNormalizationUtilities.java new file mode 100644 index 0000000..fa08958 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/utils/TextNormalizationUtilities.java @@ -0,0 +1,19 @@ +package com.knecon.fforesight.service.layoutparser.processor.classification.utils; + +import lombok.experimental.UtilityClass; + +@UtilityClass +public final class TextNormalizationUtilities { + + /** + * Revert hyphenation due to line breaks. + * + * @param text Text to be processed. + * @return Text without line-break hyphenation. 
+ */ + public static String removeHyphenLineBreaks(String text) { + + return text.replaceAll("([^\\s\\d\\-]{2,500})[\\-\\u00AD]\\R", "$1"); + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/DocumentGraphFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/DocumentGraphFactory.java new file mode 100644 index 0000000..217212f --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/DocumentGraphFactory.java @@ -0,0 +1,386 @@ +package com.knecon.fforesight.service.layoutparser.processor.factory; + +import static java.lang.String.format; +import static java.util.stream.Collectors.groupingBy; +import static java.util.stream.Collectors.toList; + +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.NoSuchElementException; +import java.util.Set; +import java.util.concurrent.atomic.AtomicInteger; + +import org.springframework.stereotype.Service; + +import com.knecon.fforesight.service.layoutparser.internal.api.graph.DocumentGraph; +import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents; +import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.FooterNode; +import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.HeaderNode; +import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.HeadlineNode; +import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.ImageNode; +import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.NodeType; +import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode; +import 
com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.ParagraphNode; +import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SectionNode; +import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SemanticNode; +import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.TableCellNode; +import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.TableNode; +import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.AtomicTextBlock; +import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock; +import com.knecon.fforesight.service.layoutparser.processor.classification.dto.AbstractTextContainer; +import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationDocument; +import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationFooter; +import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationHeader; +import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationPage; +import com.knecon.fforesight.service.layoutparser.processor.classification.dto.image.ClassifiedImage; +import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.Table; +import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.TableCell; +import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock; +import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.TextPositionSequence; + + +@Service +public class DocumentGraphFactory { + + public static final double TABLE_CELL_MERGE_CONTENTS_SIZE_THRESHOLD = 0.05; + + + public DocumentGraph buildDocumentGraph(ClassificationDocument document) { + + TextBlockFactory textBlockFactory = new TextBlockFactory(); + Context context = new Context(new TableOfContents(), new HashMap<>(), 
new LinkedList<>(), new LinkedList<>(), textBlockFactory); + + document.getPages().stream().map(this::buildPage).forEach(page -> context.pages().put(page, new AtomicInteger(1))); + document.getSections().stream().flatMap(section -> section.getImages().stream()).forEach(image -> context.images().add(image)); + addSections(document, context); + addHeaderAndFooterToEachPage(document, context); + + DocumentGraph documentGraph = DocumentGraph.builder().numberOfPages(context.pages.size()).pages(context.pages.keySet()).tableOfContents(context.tableOfContents).build(); + + documentGraph.setTextBlock(documentGraph.buildTextBlock()); + return documentGraph; + } + + + private void addSections(ClassificationDocument document, Context context) { + + document.getSections().forEach(section -> addSection(null, section.getPageBlocks(), section.getImages(), context)); + } + + + private void addSection(SemanticNode parentNode, List pageBlocks, List images, Context context) { + + Map> blocksPerPage = pageBlocks.stream().collect(groupingBy(AbstractTextContainer::getPage)); + SectionNode sectionNode = SectionNode.builder().entities(new HashSet<>()).tableOfContents(context.tableOfContents()).build(); + + context.sections().add(sectionNode); + blocksPerPage.keySet().forEach(pageNumber -> addSectionNodeToPageNode(context, sectionNode, pageNumber)); + + List tocId; + if (parentNode == null) { + tocId = context.tableOfContents.createNewEntryAndReturnId(NodeType.SECTION, sectionNode); + } else { + tocId = context.tableOfContents.createNewChildEntryAndReturnId(parentNode.getTocId(), NodeType.SECTION, sectionNode); + } + sectionNode.setTocId(tocId); + Set alreadyMerged = new HashSet<>(); + for (AbstractTextContainer abstractTextContainer : pageBlocks) { + + if (alreadyMerged.contains(abstractTextContainer)) { + continue; + } + + if (abstractTextContainer instanceof ClassificationTextBlock) { + List textBlocks = findTextBlocksWithSameClassificationAndAlignsY(abstractTextContainer, pageBlocks); + 
alreadyMerged.addAll(textBlocks); + addParagraphOrHeadline(sectionNode, (ClassificationTextBlock) abstractTextContainer, context, textBlocks); + } + if (abstractTextContainer instanceof Table) { + addTable(sectionNode, (Table) abstractTextContainer, context); + } + } + for (ClassifiedImage image : images) { + + addImage(sectionNode, image, context); + } + } + + + private static List findTextBlocksWithSameClassificationAndAlignsY(AbstractTextContainer atc, List pageBlocks) { + + return pageBlocks.stream() + .filter(abstractTextContainer -> !abstractTextContainer.equals(atc)) + .filter(abstractTextContainer -> abstractTextContainer.getPage() == atc.getPage()) + .filter(abstractTextContainer -> abstractTextContainer instanceof ClassificationTextBlock) + .filter(abstractTextContainer -> abstractTextContainer.intersectsY(atc)) + .map(abstractTextContainer -> (ClassificationTextBlock) abstractTextContainer) + .toList(); + } + + + private void addSectionNodeToPageNode(Context context, SectionNode sectionNode, Integer pageNumber) { + + PageNode page = getPage(pageNumber, context); + page.getMainBody().add(sectionNode); + } + + + private void addTable(SemanticNode parentNode, Table table, Context context) { + + PageNode page = getPage(table.getPage(), context); + TableNode tableNode = TableNode.builder().tableOfContents(context.tableOfContents()).numberOfCols(table.getColCount()).numberOfRows(table.getRowCount()).build(); + + if (!page.getMainBody().contains(parentNode)) { + parentNode.getPages().add(page); + } + + page.getMainBody().add(tableNode); + + List tocId = context.tableOfContents().createNewChildEntryAndReturnId(parentNode.getTocId(), NodeType.TABLE, tableNode); + tableNode.setTocId(tocId); + + addTableCells(table.getRows(), tableNode, context, table.getPage()); + } + + + private void addTableCells(List> rows, SemanticNode parentNode, Context context, int pageNumber) { + + for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) { + for (int colIndex = 0; 
colIndex < rows.get(rowIndex).size(); colIndex++) { + addTableCell(rows.get(rowIndex).get(colIndex), rowIndex, colIndex, parentNode, pageNumber, context); + } + } + } + + + private void addTableCell(TableCell cell, int rowIndex, int colIndex, SemanticNode parentNode, int pageNumber, Context context) { + + PageNode page = getPage(pageNumber, context); + cell.getTextBlocks().stream().filter(tb -> tb.getPage() == 0).forEach(tb -> tb.setPage(pageNumber)); + + TableCellNode tableCellNode = TableCellNode.builder() + .tableOfContents(context.tableOfContents()) + .row(rowIndex) + .col(colIndex) + .header(cell.isHeaderCell()) + .bBox(cell.getBounds2D()) + .build(); + page.getMainBody().add(tableCellNode); + + TextBlock textBlock; + + List tocId = context.tableOfContents().createNewChildEntryAndReturnId(parentNode.getTocId(), NodeType.TABLE_CELL, tableCellNode); + tableCellNode.setTocId(tocId); + + if (cell.getTextBlocks().isEmpty()) { + tableCellNode.setTerminalTextBlock(context.textBlockFactory.emptyTextBlock(parentNode, context, page)); + tableCellNode.setTerminal(true); + + } else if (cell.getTextBlocks().size() == 1) { + textBlock = context.textBlockFactory().buildAtomicTextBlock(cell.getTextBlocks().get(0).getSequences(), tableCellNode, context, page); + tableCellNode.setTerminalTextBlock(textBlock); + tableCellNode.setTerminal(true); + + } else if (firstTextBlockIsHeadline(cell)) { + addSection(tableCellNode, cell.getTextBlocks().stream().map(tb -> (AbstractTextContainer) tb).toList(), Collections.emptyList(), context); + tableCellNode.setTerminal(false); + + } else if (cellAreaIsSmallerThanPageAreaTimesThreshold(cell, page)) { + List sequences = TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(cell.getTextBlocks()); + textBlock = context.textBlockFactory().buildAtomicTextBlock(sequences, tableCellNode, context, page); + tableCellNode.setTerminalTextBlock(textBlock); + tableCellNode.setTerminal(true); + + } else { + cell.getTextBlocks().forEach(tb -> 
addParagraphOrHeadline(tableCellNode, tb, context)); + tableCellNode.setTerminal(false); + } + + } + + + private static boolean cellAreaIsSmallerThanPageAreaTimesThreshold(TableCell cell, PageNode page) { + + return cell.getArea() < TABLE_CELL_MERGE_CONTENTS_SIZE_THRESHOLD * page.getHeight() * page.getWidth(); + } + + + private static boolean firstTextBlockIsHeadline(TableCell cell) { + + String classification = cell.getTextBlocks().get(0).getClassification(); + return classification != null && classification.startsWith("H"); + } + + + private void addParagraphOrHeadline(SemanticNode parentNode, ClassificationTextBlock originalTextBlock, Context context) { + + addParagraphOrHeadline(parentNode, originalTextBlock, context, Collections.emptyList()); + } + + + private void addParagraphOrHeadline(SemanticNode parentNode, ClassificationTextBlock originalTextBlock, Context context, List textBlocksToMerge) { + + PageNode page = getPage(originalTextBlock.getPage(), context); + + SemanticNode node; + if (originalTextBlock.getClassification() != null && originalTextBlock.getClassification().startsWith("H")) { + node = HeadlineNode.builder().tableOfContents(context.tableOfContents()).build(); + } else { + node = ParagraphNode.builder().tableOfContents(context.tableOfContents()).build(); + } + + page.getMainBody().add(node); + + List textBlocks = new LinkedList<>(textBlocksToMerge); + textBlocks.add(originalTextBlock); + AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(textBlocks), node, context, page); + + if (node instanceof HeadlineNode headlineNode) { + List tocId = context.tableOfContents.createNewChildEntryAndReturnId(parentNode.getTocId(), NodeType.HEADLINE, node); + headlineNode.setTerminalTextBlock(textBlock); + headlineNode.setTocId(tocId); + } + if (node instanceof ParagraphNode paragraphNode) { + List tocId = 
context.tableOfContents.createNewChildEntryAndReturnId(parentNode.getTocId(), NodeType.PARAGRAPH, node); + paragraphNode.setTerminalTextBlock(textBlock); + paragraphNode.setTocId(tocId); + } + } + + + private void addImage(SectionNode sectionNode, ClassifiedImage image, Context context) { + + PageNode page = getPage(image.getPage(), context); + ImageNode imageNode = ImageNode.builder() + .imageType(image.getImageType()) + .position(image.getPosition()) + .transparency(image.isHasTransparency()) + .page(page) + .tableOfContents(context.tableOfContents()) + .build(); + page.getMainBody().add(imageNode); + + List tocId = context.tableOfContents().createNewChildEntryAndReturnId(sectionNode.getTocId(), NodeType.IMAGE, imageNode); + imageNode.setTocId(tocId); + } + + + private void addHeaderAndFooterToEachPage(ClassificationDocument document, Context context) { + + Map> headers = document.getHeaders() + .stream() + .map(ClassificationHeader::getTextBlocks) + .flatMap(List::stream) + .collect(groupingBy(AbstractTextContainer::getPage, toList())); + + Map> footers = document.getFooters() + .stream() + .map(ClassificationFooter::getTextBlocks) + .flatMap(List::stream) + .collect(groupingBy(AbstractTextContainer::getPage, toList())); + + for (int pageIndex = 1; pageIndex <= document.getPages().size(); pageIndex++) { + if (headers.containsKey(pageIndex)) { + addHeader(headers.get(pageIndex), context); + } else { + addEmptyHeader(pageIndex, context); + } + } + + for (int pageIndex = 1; pageIndex <= document.getPages().size(); pageIndex++) { + if (footers.containsKey(pageIndex)) { + addFooter(footers.get(pageIndex), context); + } else { + addEmptyFooter(pageIndex, context); + } + } + } + + + private void addFooter(List textBlocks, Context context) { + + PageNode page = getPage(textBlocks.get(0).getPage(), context); + FooterNode footer = FooterNode.builder().tableOfContents(context.tableOfContents()).build(); + AtomicTextBlock textBlock = 
context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(textBlocks), + footer, + context, + page); + List tocId = context.tableOfContents().createNewEntryAndReturnId(NodeType.FOOTER, footer); + footer.setTocId(tocId); + footer.setTerminalTextBlock(textBlock); + page.setFooter(footer); + } + + + public void addHeader(List textBlocks, Context context) { + + PageNode page = getPage(textBlocks.get(0).getPage(), context); + HeaderNode header = HeaderNode.builder().tableOfContents(context.tableOfContents()).build(); + AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(textBlocks), + header, + context, + 0, + page); + List tocId = context.tableOfContents().createNewEntryAndReturnId(NodeType.HEADER, header); + header.setTocId(tocId); + header.setTerminalTextBlock(textBlock); + page.setHeader(header); + } + + + private void addEmptyFooter(int pageIndex, Context context) { + + PageNode page = getPage(pageIndex, context); + FooterNode footer = FooterNode.builder().tableOfContents(context.tableOfContents()).build(); + AtomicTextBlock textBlock = context.textBlockFactory.emptyTextBlock(footer, context, page); + List tocId = context.tableOfContents().createNewEntryAndReturnId(NodeType.FOOTER, footer); + footer.setTocId(tocId); + footer.setTerminalTextBlock(textBlock); + page.setFooter(footer); + } + + + private void addEmptyHeader(int pageIndex, Context context) { + + PageNode page = getPage(pageIndex, context); + HeaderNode header = HeaderNode.builder().tableOfContents(context.tableOfContents()).build(); + AtomicTextBlock textBlock = context.textBlockFactory.emptyTextBlock(header, 0, page); + List tocId = context.tableOfContents().createNewEntryAndReturnId(NodeType.HEADER, header); + header.setTocId(tocId); + header.setTerminalTextBlock(textBlock); + page.setHeader(header); + } + + + private PageNode buildPage(ClassificationPage p) { + + 
return PageNode.builder() + .height((int) p.getPageHeight()) + .width((int) p.getPageWidth()) + .number(p.getPageNumber()) + .rotation(p.getRotation()) + .mainBody(new LinkedList<>()) + .build(); + } + + + private PageNode getPage(int pageIndex, Context context) { + + return context.pages.keySet() + .stream() + .filter(page -> page.getNumber() == pageIndex) + .findFirst() + .orElseThrow(() -> new NoSuchElementException(format("ClassificationPage with number %d not found", pageIndex))); + } + + + record Context( + TableOfContents tableOfContents, Map pages, List sections, List images, TextBlockFactory textBlockFactory) { + + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/ImageSortService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/ImageSortService.java new file mode 100644 index 0000000..285bcc5 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/ImageSortService.java @@ -0,0 +1,132 @@ +package com.knecon.fforesight.service.layoutparser.processor.factory; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.stream.Collectors; + +import org.springframework.stereotype.Service; + +import com.knecon.fforesight.service.layoutparser.processor.classification.dto.AbstractTextContainer; +import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationDocument; +import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationSection; +import com.knecon.fforesight.service.layoutparser.processor.classification.dto.image.ClassifiedImage; +import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.Table; +import 
com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.TableCell; +import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock; + +@Service +public class ImageSortService { + + public SortedImages sortImagesIntoStructure(ClassificationDocument document) { + + SortedImages sortedImages = new SortedImages(new HashMap<>(), new HashMap<>(), new HashMap<>(), new HashMap<>(), new HashMap<>()); + + Map> imagesByPage = document.getSections() + .stream() + .flatMap(section -> section.getImages().stream()) + .distinct() + .collect(Collectors.groupingBy(ClassifiedImage::getPage)); + + for (int pageNumber : imagesByPage.keySet()) { + List textContainersOnPage = document.getSections() + .stream() + .flatMap(section -> section.getPageBlocks().stream()) + .filter(abstractTextContainer -> abstractTextContainer.getPage() == pageNumber) + .toList(); + + List sectionsOnPage = document.getSections() + .stream() + .filter(section -> section.getPageBlocks().stream().anyMatch(block -> block.getPage() == pageNumber)) + .toList(); + + for (ClassifiedImage image : imagesByPage.get(pageNumber)) { + sortImage(textContainersOnPage, sectionsOnPage, image, sortedImages); + } + } + return sortedImages; + } + + + private void sortImage(List textContainersOnPage, List sectionsOnPage, ClassifiedImage image, SortedImages sortedImages) { + + Optional containingTextContainer = getContainingTextContainer(image, textContainersOnPage); + Optional sectionContainingTextContainer = getContainingSection(image, sectionsOnPage); + List containedTextContainers = getContainedTextContainers(image, textContainersOnPage); + List containedSections = getContainedSections(image, sectionsOnPage); + if (containingTextContainer.isPresent()) { + if (sortImageIntoTextContainerOrCell(image, sortedImages, containingTextContainer.get())) { + return; + } + } + } + + + private static boolean sortImageIntoTextContainerOrCell(ClassifiedImage image, SortedImages 
sortedImages, AbstractTextContainer containingTextContainer) { + + if (containingTextContainer instanceof ClassificationTextBlock) { + sortedImages.containedInTextContainer().computeIfAbsent(containingTextContainer, sortedImage -> new ArrayList<>()).add(image); + return true; + } + if (containingTextContainer instanceof Table) { + Optional containingCell = getContainingCell((Table) containingTextContainer, image); + if (containingCell.isPresent()) { + sortedImages.containedInCell().computeIfAbsent(containingCell.get(), sortedImage -> new ArrayList<>()).add(image); + return true; + } + } + + return false; + } + + + private static Optional getContainingCell(Table table, ClassifiedImage image) { + + return table.getRows().stream().flatMap(List::stream).filter(cell -> cell.contains(image.getPosition())).findFirst(); + } + + + private List getContainedSections(ClassifiedImage image, List sectionsOnPage) { + + return sectionsOnPage.stream() + .filter(section -> image.getPosition().contains(RectangleTransformations.bBoxUnionAbstractTextContainer(section.getPageBlocks() + .stream() + .filter(block -> block.getPage() == image.getPage()) + .toList()))) + .toList(); + } + + + private List getContainedTextContainers(ClassifiedImage image, List textContainersOnPage) { + + return textContainersOnPage.stream().filter(textContainer -> image.getPosition().contains(RectangleTransformations.toRectangle2D(textContainer))).toList(); + } + + + private Optional getContainingSection(ClassifiedImage image, List sectionsOnPage) { + + return sectionsOnPage.stream()// + .filter(section -> // + RectangleTransformations.bBoxUnionAbstractTextContainer(section.getPageBlocks().stream().filter(block -> block.getPage() == image.getPage()).toList())// + .contains(image.getPosition())).findFirst(); + } + + + private Optional getContainingTextContainer(ClassifiedImage image, List textContainersOnPage) { + + return textContainersOnPage.stream().filter(textContainer -> 
RectangleTransformations.toRectangle2D(textContainer).contains(image.getPosition())).findFirst();
+    }
+
+
+    public record SortedImages(
+            Map> containedInCell,
+            Map> containedInTextContainer,
+            Map> containedInSection,
+            Map> containedByImage,
+            Map> sectionContainedByImage) {
+
+    }
+
+}
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/RectangleTransformations.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/RectangleTransformations.java
new file mode 100644
index 0000000..062b49d
--- /dev/null
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/RectangleTransformations.java
@@ -0,0 +1,143 @@
+package com.knecon.fforesight.service.layoutparser.processor.factory;
+
+import static java.lang.String.format;
+
+import java.awt.geom.Area;
+import java.awt.geom.Rectangle2D;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Locale;
+import java.util.Set;
+import java.util.function.BiConsumer;
+import java.util.function.BinaryOperator;
+import java.util.function.Function;
+import java.util.function.Supplier;
+import java.util.stream.Collector;
+
+import org.apache.pdfbox.pdmodel.common.PDRectangle;
+
+import com.knecon.fforesight.service.layoutparser.processor.classification.dto.AbstractTextContainer;
+
+/**
+ * Static helpers for padding, merging, converting and (de)serializing {@link Rectangle2D} bounding boxes.
+ */
+public class RectangleTransformations {
+
+    /**
+     * Grows a rectangle by {@code deltaX} on the left and right and by {@code deltaY} on the top and bottom.
+     *
+     * @return a new rectangle; the argument is not modified
+     */
+    public static Rectangle2D pad(Rectangle2D rectangle2D, int deltaX, int deltaY) {
+
+        return new Rectangle2D.Double(rectangle2D.getMinX() - deltaX, rectangle2D.getMinY() - deltaY, rectangle2D.getWidth() + 2 * deltaX, rectangle2D.getHeight() + 2 * deltaY);
+    }
+
+
+    /**
+     * Computes the bounds of the union of the rectangles of all given text containers.
+     */
+    public static Rectangle2D bBoxUnionAbstractTextContainer(List<AbstractTextContainer> abstractTextContainers) {
+
+        return abstractTextContainers.stream().map(RectangleTransformations::toRectangle2D).collect(new Rectangle2DUnion());
+    }
+
+
+    /**
+     * Computes the bounds of the union of all given rectangles.
+     */
+    public static Rectangle2D rectangleUnion(List<Rectangle2D> rectangle2DList) {
+
+        return rectangle2DList.stream().collect(new Rectangle2DUnion());
+    }
+
+
+    /**
+     * Converts a text container into a rectangle built from its min x, min y, width and height.
+     */
+    public static Rectangle2D toRectangle2D(AbstractTextContainer abstractTextContainer) {
+
+        return new Rectangle2D.Float(abstractTextContainer.getMinX(), abstractTextContainer.getMinY(), abstractTextContainer.getWidth(), abstractTextContainer.getHeight());
+    }
+
+
+    /**
+     * Converts a PDFBox rectangle into a {@link Rectangle2D} anchored at its lower-left corner.
+     */
+    public static Rectangle2D toRectangle2D(PDRectangle rectangle) {
+
+        return new Rectangle2D.Double(rectangle.getLowerLeftX(), rectangle.getLowerLeftY(), rectangle.getWidth(), rectangle.getHeight());
+    }
+
+
+    /**
+     * Serializes a rectangle as {@code "x,y,width,height"}.
+     * <p>
+     * Formatting is pinned to {@link Locale#ROOT}: with the default locale {@code %f} may emit a decimal comma,
+     * which collides with the field separator and makes the result unreadable for {@link #parseRectangle2D(String)}.
+     */
+    public static String toString(Rectangle2D rectangle2D) {
+
+        return format(Locale.ROOT, "%f,%f,%f,%f", rectangle2D.getX(), rectangle2D.getY(), rectangle2D.getWidth(), rectangle2D.getHeight());
+    }
+
+
+    /**
+     * Parses a rectangle from the {@code "x,y,width,height"} form produced by {@link #toString(Rectangle2D)}.
+     *
+     * @throws NumberFormatException if a component is not a valid float
+     */
+    public static Rectangle2D parseRectangle2D(String bBox) {
+
+        List<Float> floats = Arrays.stream(bBox.split(",")).map(Float::parseFloat).toList();
+        return new Rectangle2D.Float(floats.get(0), floats.get(1), floats.get(2), floats.get(3));
+    }
+
+
+    /**
+     * Collector that adds every rectangle to one {@link Area} and finishes with the bounds of that area.
+     */
+    private static class Rectangle2DUnion implements Collector<Rectangle2D, Area, Rectangle2D> {
+
+        @Override
+        public Supplier<Area> supplier() {
+
+            return Area::new;
+        }
+
+
+        @Override
+        public BiConsumer<Area, Rectangle2D> accumulator() {
+
+            return (area, rectangle2D) -> area.add(new Area(rectangle2D));
+        }
+
+
+        @Override
+        public BinaryOperator<Area> combiner() {
+
+            return (area1, area2) -> {
+                area1.add(area2);
+                return area1;
+            };
+        }
+
+
+        @Override
+        public Function<Area, Rectangle2D> finisher() {
+
+            return Area::getBounds2D;
+        }
+
+
+        @Override
+        public Set<Characteristics> characteristics() {
+
+            // Not CONCURRENT: that flag allows a parallel stream to call the accumulator on one shared
+            // container from several threads, and Area is a plain mutable object without synchronization.
+            return Set.of(Characteristics.UNORDERED);
+        }
+
+    }
+
+}
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/SearchTextWithTextPositionFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/SearchTextWithTextPositionFactory.java
new
file mode 100644
index 0000000..76a1583
--- /dev/null
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/SearchTextWithTextPositionFactory.java
@@ -0,0 +1,191 @@
+package com.knecon.fforesight.service.layoutparser.processor.factory;
+
+import java.awt.geom.AffineTransform;
+import java.awt.geom.Rectangle2D;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Objects;
+
+import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.RedTextPosition;
+import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.TextDirection;
+import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.TextPositionSequence;
+
+/**
+ * Builds the search text of a list of text position sequences together with the indices that map each
+ * character of that text back to the text position it came from.
+ */
+public class SearchTextWithTextPositionFactory {
+
+    public static final int HEIGHT_PADDING = 2;
+
+
+    /**
+     * Concatenates the unicode of all text positions, one sequence after the other, and appends a single space
+     * after every sequence. Consecutive spaces are collapsed, and a hyphen found within the last two characters
+     * in front of a line break is removed together with whatever follows it.
+     *
+     * @param sequences the sequences to concatenate, in the order they should appear in the text
+     * @return the search text, the string indices of the detected line breaks, one position index per character
+     * of the search text, and the bounding box of every text position
+     */
+    public static SearchTextWithTextPositionModel buildSearchTextToTextPositionModel(List<TextPositionSequence> sequences) {
+
+        if (sequences.isEmpty() || sequences.stream().allMatch(sequence -> sequence.getTextPositions().isEmpty())) {
+            return SearchTextWithTextPositionModel.builder()
+                    .searchText("")
+                    .lineBreaks(Collections.emptyList())
+                    .positions(Collections.emptyList())
+                    .stringCoordsToPositionCoords(Collections.emptyList())
+                    .build();
+        }
+
+        // ArrayList instead of LinkedList: the mapping is truncated in place below and is an index lookup table.
+        List<Integer> stringIdxToPositionIdx = new ArrayList<>();
+        List<Integer> lineBreaksStringIdx = new ArrayList<>();
+        StringBuilder sb = new StringBuilder();
+
+        int positionIdx = 0;
+        int lastHyphenIdx = -3;
+
+        // The guard above only guarantees that some sequence has text positions, not that the first one does.
+        RedTextPosition currentTextPosition = sequences.stream()
+                .filter(sequence -> !sequence.getTextPositions().isEmpty())
+                .findFirst()
+                .orElseThrow()
+                .getTextPositions()
+                .get(0);
+        RedTextPosition previousTextPosition = RedTextPosition.builder().unicode(" ").position(currentTextPosition.getPosition()).build();
+
+        for (TextPositionSequence word : sequences) {
+            for (int i = 0; i < word.getTextPositions().size(); ++i) {
+
+                currentTextPosition = word.getTextPositions().get(i);
+
+                if (isLineBreak(currentTextPosition, previousTextPosition)) {
+
+                    if (sb.length() - lastHyphenIdx < 3) {
+                        // The hyphen is among the last two characters: drop it and everything after it.
+                        sb.delete(lastHyphenIdx, sb.length());
+                        stringIdxToPositionIdx.subList(lastHyphenIdx, stringIdxToPositionIdx.size()).clear();
+                        lastHyphenIdx = -3;
+                    }
+                    lineBreaksStringIdx.add(sb.length());
+                }
+                if (!isRepeatedWhitespace(currentTextPosition.getUnicode(), previousTextPosition.getUnicode())) {
+
+                    if (isHyphen(currentTextPosition.getUnicode())) {
+                        lastHyphenIdx = sb.length();
+                    }
+                    sb.append(currentTextPosition.getUnicode());
+                    mapAppendedCharsToPosition(sb, stringIdxToPositionIdx, positionIdx);
+                }
+
+                previousTextPosition = currentTextPosition;
+
+                ++positionIdx;
+            }
+
+            previousTextPosition = RedTextPosition.builder().unicode(" ").position(previousTextPosition.getPosition()).build();
+            sb.append(previousTextPosition.getUnicode());
+            // NOTE(review): the separator is mapped to the index of the next text position, which does not exist
+            // after the last sequence - confirm that consumers never resolve the trailing space.
+            mapAppendedCharsToPosition(sb, stringIdxToPositionIdx, positionIdx);
+        }
+
+        assert sb.length() == stringIdxToPositionIdx.size();
+
+        List<Rectangle2D> positions = sequences.stream()
+                .flatMap(sequence -> sequence.getTextPositions().stream().map(textPosition -> mapRedTextPositionToInitialUserSpace(textPosition, sequence)))
+                .toList();
+
+        return SearchTextWithTextPositionModel.builder()
+                .searchText(sb.toString())
+                .lineBreaks(lineBreaksStringIdx)
+                .stringCoordsToPositionCoords(stringIdxToPositionIdx)
+                .positions(positions)
+                .build();
+    }
+
+
+    /**
+     * Pads the index mapping until it holds one entry per character of the search text, so that a text
+     * position whose unicode is longer than one char keeps string indices and mapping in sync.
+     */
+    private static void mapAppendedCharsToPosition(StringBuilder sb, List<Integer> stringIdxToPositionIdx, int positionIdx) {
+
+        while (stringIdxToPositionIdx.size() < sb.length()) {
+            stringIdxToPositionIdx.add(positionIdx);
+        }
+    }
+
+
+    private static boolean isLineBreak(RedTextPosition currentTextPosition, RedTextPosition previousTextPosition) {
+
+        return Objects.equals(currentTextPosition.getUnicode(), "\n") || isDeltaYLargerThanTextHeight(currentTextPosition, previousTextPosition);
+    }
+
+
+    private static boolean isDeltaYLargerThanTextHeight(RedTextPosition currentPosition, RedTextPosition previousPosition) {
+
+        if (previousPosition == null) {
+            return false;
+        }
+
+        float deltaY = Math.abs(currentPosition.getYDirAdj() - previousPosition.getYDirAdj());
+        return deltaY >= currentPosition.getHeightDir();
+    }
+
+
+    private static boolean isRepeatedWhitespace(String currentUnicode, String previousUnicode) {
+
+        return Objects.equals(previousUnicode, " ") && Objects.equals(currentUnicode, " ");
+    }
+
+
+    private static boolean isHyphen(String unicodeCharacter) {
+
+        return Objects.equals(unicodeCharacter, "-") || //
+                Objects.equals(unicodeCharacter, "~") || //
+                Objects.equals(unicodeCharacter, "‐") || //
+                Objects.equals(unicodeCharacter, "‒") || //
+                Objects.equals(unicodeCharacter, "⁻") || //
+                Objects.equals(unicodeCharacter, "−") || //
+                Objects.equals(unicodeCharacter, "﹣") || //
+                Objects.equals(unicodeCharacter, "゠") || //
+                Objects.equals(unicodeCharacter, "⁓") || //
+                Objects.equals(unicodeCharacter, "‑") || //
+                Objects.equals(unicodeCharacter, "\u00AD");
+    }
+
+
+    /**
+     * Builds the bounding box of a text position, padded in height by {@link #HEIGHT_PADDING}, and transforms
+     * it with a rotation, translation and y-flip chosen by the text direction of the sequence.
+     */
+    private static Rectangle2D mapRedTextPositionToInitialUserSpace(RedTextPosition textPosition, TextPositionSequence sequence) {
+
+        float textHeight = sequence.getTextHeight() + HEIGHT_PADDING;
+        Rectangle2D rectangle2D = new Rectangle2D.Double(textPosition.getXDirAdj(),
+                textPosition.getYDirAdj() - textHeight,
+                textPosition.getWidthDirAdj(),
+                textHeight + HEIGHT_PADDING);
+
+        AffineTransform transform = new AffineTransform();
+
+        if (sequence.getDir() == TextDirection.ZERO || sequence.getDir() == TextDirection.HALF_CIRCLE) {
+            transform.rotate(sequence.getDir().getRadians(), sequence.getPageWidth() / 2f, sequence.getPageHeight() / 2f);
+            transform.translate(0f, sequence.getPageHeight());
+        } else if (sequence.getDir() == TextDirection.QUARTER_CIRCLE) {
+            transform.rotate(sequence.getDir().getRadians(), sequence.getPageWidth() / 2f, sequence.getPageWidth() / 2f);
+            transform.translate(0f, sequence.getPageWidth());
+        } else {
+            transform.rotate(sequence.getDir().getRadians(), sequence.getPageHeight() / 2f, sequence.getPageHeight() / 2f);
+            transform.translate(0f, sequence.getPageWidth());
+        }
+        transform.scale(1., -1.);
+
+        return transform.createTransformedShape(rectangle2D).getBounds2D();
+    }
+
+}
diff --git
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/SearchTextWithTextPositionModel.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/SearchTextWithTextPositionModel.java
new file mode 100644
index 0000000..a9d9f3d
--- /dev/null
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/SearchTextWithTextPositionModel.java
@@ -0,0 +1,26 @@
+package com.knecon.fforesight.service.layoutparser.processor.factory;
+
+import java.awt.geom.Rectangle2D;
+import java.util.List;
+
+import lombok.AccessLevel;
+import lombok.Builder;
+import lombok.Getter;
+import lombok.experimental.FieldDefaults;
+
+/**
+ * Result of {@link SearchTextWithTextPositionFactory}: a search text plus the indices linking it to text positions.
+ */
+@Builder
+@Getter
+@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
+public class SearchTextWithTextPositionModel {
+
+    String searchText;
+    // string indices at which a line break was detected
+    List<Integer> lineBreaks;
+    // for every character of searchText, the index of the text position it originates from
+    List<Integer> stringCoordsToPositionCoords;
+    // bounding box of every text position
+    List<Rectangle2D> positions;
+}
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/TextBlockFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/TextBlockFactory.java
new file mode 100644
index 0000000..46a950a
--- /dev/null
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/TextBlockFactory.java
@@ -0,0 +1,96 @@
+package com.knecon.fforesight.service.layoutparser.processor.factory;
+
+import java.util.Collections;
+import java.util.List;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.concurrent.atomic.AtomicLong;
+
+import com.knecon.fforesight.service.layoutparser.internal.api.graph.Boundary;
+import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode;
+import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SemanticNode;
+import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.AtomicTextBlock;
+import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.TextPositionSequence;
+
+
+/**
+ * Creates {@link AtomicTextBlock}s with increasing ids and boundaries taken from one running string offset.
+ */
+public class TextBlockFactory {
+
+    AtomicInteger stringOffset;
+    AtomicLong textBlockIdx;
+
+
+    public TextBlockFactory() {
+
+        stringOffset = new AtomicInteger();
+        textBlockIdx = new AtomicLong();
+    }
+
+
+    /**
+     * Builds a text block whose number on the page is taken from, and advances, the page counter in {@code context}.
+     */
+    public AtomicTextBlock buildAtomicTextBlock(List<TextPositionSequence> sequences, SemanticNode parent, DocumentGraphFactory.Context context, PageNode page) {
+
+        Integer numberOnPage = context.pages().get(page).getAndIncrement();
+        return buildAtomicTextBlock(sequences, parent, context, numberOnPage, page);
+    }
+
+
+    /**
+     * Builds a text block from the sequences and advances the running string offset by the length of its search text.
+     */
+    public AtomicTextBlock buildAtomicTextBlock(List<TextPositionSequence> sequences,
+            SemanticNode parent,
+            DocumentGraphFactory.Context context,
+            Integer numberOnPage,
+            PageNode page) {
+
+        SearchTextWithTextPositionModel model = SearchTextWithTextPositionFactory.buildSearchTextToTextPositionModel(sequences);
+        String searchText = model.getSearchText();
+        int offset = stringOffset.getAndAdd(searchText.length());
+
+        return AtomicTextBlock.builder()
+                .id(textBlockIdx.getAndIncrement())
+                .parent(parent)
+                .searchText(searchText)
+                .numberOnPage(numberOnPage)
+                .page(page)
+                .lineBreaks(model.getLineBreaks())
+                .positions(model.getPositions())
+                .stringIdxToPositionIdx(model.getStringCoordsToPositionCoords())
+                .boundary(new Boundary(offset, offset + searchText.length()))
+                .build();
+    }
+
+
+    /**
+     * Builds an empty text block numbered by the page counter in {@code context}.
+     */
+    public AtomicTextBlock emptyTextBlock(SemanticNode parent, DocumentGraphFactory.Context context, PageNode page) {
+
+        return emptyTextBlock(parent, context.pages().get(page).getAndIncrement(), page);
+    }
+
+
+    /**
+     * Builds a text block without text; its boundary is the empty range at the current string offset.
+     */
+    public AtomicTextBlock emptyTextBlock(SemanticNode parent, Integer numberOnPage, PageNode page) {
+
+        int offset = stringOffset.get(); // read once so start and end of the boundary cannot differ
+        return AtomicTextBlock.builder()
+                .id(textBlockIdx.getAndIncrement())
+                .boundary(new Boundary(offset, offset))
+                .searchText("")
+                .lineBreaks(Collections.emptyList())
+                .page(page)
+                .numberOnPage(numberOnPage)
+                .stringIdxToPositionIdx(Collections.emptyList())
+                .positions(Collections.emptyList())
+                .parent(parent)
+                .build();
+    }
+
+}
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/TextPositionOperations.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/TextPositionOperations.java
new file mode 100644
index 0000000..006436d
--- /dev/null
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/TextPositionOperations.java
@@ -0,0 +1,26 @@
+package com.knecon.fforesight.service.layoutparser.processor.factory;
+
+import java.util.Comparator;
+import java.util.List;
+
+import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock;
+import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.TextPositionSequence;
+
+/**
+ * Helpers operating on {@link TextPositionSequence}s.
+ */
+public class TextPositionOperations {
+
+    /**
+     * Flattens the sequences of all text blocks into one list sorted by {@code maxYDirAdj}, then by {@code maxXDirAdj}.
+     */
+    public static List<TextPositionSequence> mergeAndSortTextPositionSequenceByYThenX(List<ClassificationTextBlock> textBlocks) {
+
+        return textBlocks.stream()//
+                .flatMap(tb -> tb.getSequences().stream())//
+                .sorted(Comparator.comparingDouble(TextPositionSequence::getMaxYDirAdj)//
+                        .thenComparing(TextPositionSequence::getMaxXDirAdj))//
+                .toList();
+    }
+
+}
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/multitenancy/AsyncConfig.java
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/multitenancy/AsyncConfig.java
new file mode 100644
index 0000000..a4dc7d3
--- /dev/null
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/multitenancy/AsyncConfig.java
@@ -0,0 +1,33 @@
+package com.knecon.fforesight.service.layoutparser.processor.multitenancy;
+
+import java.util.concurrent.Executor;
+
+import org.springframework.context.annotation.Configuration;
+import org.springframework.scheduling.annotation.AsyncConfigurer;
+import org.springframework.scheduling.concurrent.ThreadPoolTaskExecutor;
+
+/**
+ * Supplies the executor for asynchronous method execution; its tasks are decorated to carry the tenant id.
+ */
+@Configuration
+public class AsyncConfig implements AsyncConfigurer {
+
+    /**
+     * @return a thread pool executor whose tasks are wrapped by {@link TenantAwareTaskDecorator}
+     */
+    @Override
+    public Executor getAsyncExecutor() {
+
+        ThreadPoolTaskExecutor executor = new ThreadPoolTaskExecutor();
+
+        executor.setCorePoolSize(7);
+        executor.setMaxPoolSize(42);
+        executor.setQueueCapacity(11);
+        executor.setThreadNamePrefix("TenantAwareTaskExecutor-");
+        executor.setTaskDecorator(new TenantAwareTaskDecorator());
+        executor.initialize();
+
+        return executor;
+    }
+
+}
\ No newline at end of file
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/multitenancy/EncryptionDecryptionService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/multitenancy/EncryptionDecryptionService.java
new file mode 100644
index 0000000..dfda46c
--- /dev/null
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/multitenancy/EncryptionDecryptionService.java
@@ -0,0 +1,152 @@
+package com.knecon.fforesight.service.layoutparser.processor.multitenancy;
+
+import java.nio.ByteBuffer;
+import java.nio.charset.StandardCharsets;
+import java.security.SecureRandom;
+import java.security.spec.KeySpec;
+import java.util.Base64;
+
+import javax.crypto.Cipher;
+import javax.crypto.SecretKey;
+import javax.crypto.SecretKeyFactory;
+import javax.crypto.spec.GCMParameterSpec;
+import javax.crypto.spec.PBEKeySpec;
+import javax.crypto.spec.SecretKeySpec;
+
+import org.springframework.beans.factory.annotation.Value;
+import org.springframework.stereotype.Service;
+
+import jakarta.annotation.PostConstruct;
+import lombok.SneakyThrows;
+
+/**
+ * AES/GCM encryption with a key derived from the configured password via PBKDF2.
+ * <p>
+ * Encrypted payload layout: 4-byte nonce length, nonce, ciphertext as returned by the cipher. The nonce also
+ * serves as the PBKDF2 salt, so {@link #decrypt(byte[])} can re-derive the key from the payload alone.
+ */
+@Service
+public class EncryptionDecryptionService {
+
+    private static final int NONCE_LENGTH_BYTES = 12;
+    private static final int TAG_LENGTH_BITS = 128;
+
+    private final SecureRandom secureRandom = new SecureRandom();
+
+    @Value("${redaction-service.crypto.key:redaction}")
+    private String key;
+
+
+    /**
+     * Derives a key once at startup so that an unavailable key derivation algorithm is reported when the bean
+     * is created instead of on the first request.
+     */
+    @SneakyThrows
+    @PostConstruct
+    protected void postConstruct() {
+
+        generateSecretKey(key, newNonce());
+    }
+
+
+    /**
+     * Encrypts the UTF-8 bytes of the given string.
+     *
+     * @return the Base64 encoded payload
+     */
+    @SneakyThrows
+    public String encrypt(String strToEncrypt) {
+
+        // Explicit charset: decrypt(String) decodes as UTF-8, while the platform default may differ.
+        return Base64.getEncoder().encodeToString(encrypt(strToEncrypt.getBytes(StandardCharsets.UTF_8)));
+    }
+
+
+    /**
+     * Decrypts a Base64 encoded payload.
+     *
+     * @return the plain text decoded as UTF-8
+     */
+    @SneakyThrows
+    public String decrypt(String strToDecrypt) {
+
+        byte[] bytes = Base64.getDecoder().decode(strToDecrypt);
+        return new String(decrypt(bytes), StandardCharsets.UTF_8);
+    }
+
+
+    /**
+     * Encrypts the data with a fresh random nonce.
+     * <p>
+     * A nonce must never be used twice with the same GCM key, so it is drawn per call instead of once per
+     * service instance. Because the nonce is also the PBKDF2 salt, the key is derived per call as well.
+     *
+     * @return nonce length, nonce and ciphertext in one array
+     */
+    @SneakyThrows
+    public byte[] encrypt(byte[] data) {
+
+        byte[] nonce = newNonce();
+        SecretKey secretKey = generateSecretKey(key, nonce);
+
+        Cipher cipher = Cipher.getInstance("AES/GCM/NoPadding");
+        GCMParameterSpec parameterSpec = new GCMParameterSpec(TAG_LENGTH_BITS, nonce);
+        cipher.init(Cipher.ENCRYPT_MODE, secretKey, parameterSpec);
+        byte[] encryptedData = cipher.doFinal(data);
+        ByteBuffer byteBuffer = ByteBuffer.allocate(4 + nonce.length + encryptedData.length);
+        byteBuffer.putInt(nonce.length);
+        byteBuffer.put(nonce);
+        byteBuffer.put(encryptedData);
+        return byteBuffer.array();
+    }
+
+
+    /**
+     * Decrypts a payload laid out as written by {@link #encrypt(byte[])}.
+     *
+     * @throws IllegalArgumentException if the nonce length stored in the payload is not within 12 to 15
+     */
+    @SneakyThrows
+    public byte[] decrypt(byte[] encryptedData) {
+
+        ByteBuffer byteBuffer = ByteBuffer.wrap(encryptedData);
+        int noonceSize = byteBuffer.getInt();
+        if (noonceSize < 12 || noonceSize >= 16) {
+            throw new IllegalArgumentException("Nonce size is incorrect. Make sure that the incoming data is an AES encrypted file.");
+        }
+        byte[] iv = new byte[noonceSize];
+        byteBuffer.get(iv);
+
+        SecretKey secretKey = generateSecretKey(key, iv);
+
+        byte[] cipherBytes = new byte[byteBuffer.remaining()];
+        byteBuffer.get(cipherBytes);
+
+        Cipher cipher = Cipher.getInstance("AES/GCM/NoPadding");
+        GCMParameterSpec parameterSpec = new GCMParameterSpec(TAG_LENGTH_BITS, iv);
+        cipher.init(Cipher.DECRYPT_MODE, secretKey, parameterSpec);
+        return cipher.doFinal(cipherBytes);
+    }
+
+
+    /**
+     * Derives a 128-bit AES key from the password with PBKDF2 (HMAC-SHA1, 65536 iterations), salted with {@code iv}.
+     */
+    @SneakyThrows
+    public SecretKey generateSecretKey(String password, byte[] iv) {
+
+        KeySpec spec = new PBEKeySpec(password.toCharArray(), iv, 65536, 128); // AES-128
+        SecretKeyFactory secretKeyFactory = SecretKeyFactory.getInstance("PBKDF2WithHmacSHA1");
+        byte[] derivedKey = secretKeyFactory.generateSecret(spec).getEncoded();
+        return new SecretKeySpec(derivedKey, "AES");
+    }
+
+
+    private byte[] newNonce() {
+
+        byte[] nonce = new byte[NONCE_LENGTH_BYTES];
+        secureRandom.nextBytes(nonce);
+        return nonce;
+    }
+
+}
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/multitenancy/ForwardTenantInterceptor.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/multitenancy/ForwardTenantInterceptor.java
new file mode 100644
index 0000000..196bb42
--- /dev/null
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/multitenancy/ForwardTenantInterceptor.java
@@ -0,0 +1,20 @@
+package com.knecon.fforesight.service.layoutparser.processor.multitenancy;
+
+import org.springframework.stereotype.Component;
+
+import feign.RequestInterceptor;
+import feign.RequestTemplate;
+
+/**
+ * Feign interceptor that adds the tenant id held by {@link TenantContext} as the {@code X-TENANT-ID} header.
+ */
+@Component
+public class ForwardTenantInterceptor implements RequestInterceptor {
+
+    public static final String TENANT_HEADER_NAME = "X-TENANT-ID";
+
+    @Override
+    public void apply(RequestTemplate template) {
+        template.header(TENANT_HEADER_NAME, TenantContext.getTenantId());
+    }
+}
\ No newline at end of file
diff --git
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/multitenancy/MultiTenancyMessagingConfiguration.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/multitenancy/MultiTenancyMessagingConfiguration.java
new file mode 100644
index 0000000..f47010d
--- /dev/null
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/multitenancy/MultiTenancyMessagingConfiguration.java
@@ -0,0 +1,54 @@
+package com.knecon.fforesight.service.layoutparser.processor.multitenancy;
+
+
+
+import org.springframework.amqp.rabbit.config.AbstractRabbitListenerContainerFactory;
+import org.springframework.amqp.rabbit.core.RabbitTemplate;
+import org.springframework.beans.BeansException;
+import org.springframework.beans.factory.config.BeanPostProcessor;
+import org.springframework.context.annotation.Bean;
+import org.springframework.context.annotation.Configuration;
+
+/**
+ * Propagates the tenant id through RabbitMQ messages via the {@code X-TENANT-ID} message header.
+ */
+@Configuration
+public class MultiTenancyMessagingConfiguration {
+
+    /**
+     * Post-processes every {@link RabbitTemplate} to stamp outgoing messages with the current tenant, and every
+     * listener container factory to set the tenant from the header of incoming messages.
+     */
+    @Bean
+    public static BeanPostProcessor multitenancyBeanPostProcessor() {
+
+        return new BeanPostProcessor() {
+
+            @Override
+            public Object postProcessAfterInitialization(Object bean, String beanName) throws BeansException {
+
+                if (bean instanceof RabbitTemplate rabbitTemplate) {
+                    rabbitTemplate.setBeforePublishPostProcessors(outgoing -> {
+                        outgoing.getMessageProperties().setHeader(ForwardTenantInterceptor.TENANT_HEADER_NAME, TenantContext.getTenantId());
+                        return outgoing;
+                    });
+                    return bean;
+                }
+
+                if (bean instanceof AbstractRabbitListenerContainerFactory<?> listenerFactory) {
+                    listenerFactory.setAfterReceivePostProcessors(incoming -> {
+                        String tenant = incoming.getMessageProperties().getHeader(ForwardTenantInterceptor.TENANT_HEADER_NAME);
+                        if (tenant == null) {
+                            // a message without the tenant header is not passed on
+                            throw new RuntimeException("No Tenant is set queue message");
+                        }
+                        TenantContext.setTenantId(tenant);
+                        return incoming;
+                    });
+                }
+                return bean;
+            }
+        };
+    }
+
+}
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/multitenancy/MultiTenancyWebConfiguration.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/multitenancy/MultiTenancyWebConfiguration.java
new file mode 100644
index 0000000..cf099c2
--- /dev/null
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/multitenancy/MultiTenancyWebConfiguration.java
@@ -0,0 +1,31 @@
+package com.knecon.fforesight.service.layoutparser.processor.multitenancy;
+
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.context.annotation.Configuration;
+import org.springframework.web.servlet.config.annotation.InterceptorRegistry;
+
+import com.iqser.red.commons.spring.DefaultWebMvcConfiguration;
+
+/**
+ * Registers the {@link TenantInterceptor} as a web request interceptor.
+ */
+@Configuration
+public class MultiTenancyWebConfiguration extends DefaultWebMvcConfiguration {
+
+    private final TenantInterceptor tenantInterceptor;
+
+
+    @Autowired
+    public MultiTenancyWebConfiguration(TenantInterceptor tenantInterceptor) {
+
+        this.tenantInterceptor = tenantInterceptor;
+    }
+
+
+    @Override
+    public void addInterceptors(InterceptorRegistry registry) {
+
+        registry.addWebRequestInterceptor(tenantInterceptor);
+    }
+
+}
\ No newline at end of file
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/multitenancy/StorageConnectionProviderImpl.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/multitenancy/StorageConnectionProviderImpl.java
new file mode 100644
index 0000000..592f5d2
--- /dev/null
+++
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/multitenancy/StorageConnectionProviderImpl.java
@@ -0,0 +1,56 @@
+package com.knecon.fforesight.service.layoutparser.processor.multitenancy;
+
+import org.springframework.stereotype.Service;
+
+import com.iqser.red.storage.commons.model.AzureStorageConnection;
+import com.iqser.red.storage.commons.model.S3StorageConnection;
+import com.iqser.red.storage.commons.service.StorageConnectionProvider;
+
+import lombok.RequiredArgsConstructor;
+
+/**
+ * Looks up the storage connection of a tenant through the tenants client and decrypts its stored secret.
+ */
+@Service
+@RequiredArgsConstructor
+public class StorageConnectionProviderImpl implements StorageConnectionProvider {
+
+    private final TenantsClient tenantsClient;
+    private final EncryptionDecryptionService encryptionDecryptionService;
+
+
+    /**
+     * @return the Azure connection of the tenant with the connection string decrypted
+     */
+    @Override
+    public AzureStorageConnection getAzureStorageConnection(String tenantId) {
+
+        var stored = tenantsClient.getTenant(tenantId).getAzureStorageConnection();
+        String connectionString = encryptionDecryptionService.decrypt(stored.getConnectionString());
+
+        return AzureStorageConnection.builder()
+                .connectionString(connectionString)
+                .containerName(stored.getContainerName())
+                .build();
+    }
+
+
+    /**
+     * @return the S3 connection of the tenant with the secret decrypted
+     */
+    @Override
+    public S3StorageConnection getS3StorageConnection(String tenantId) {
+
+        var stored = tenantsClient.getTenant(tenantId).getS3StorageConnection();
+
+        return S3StorageConnection.builder()
+                .key(stored.getKey())
+                .secret(encryptionDecryptionService.decrypt(stored.getSecret()))
+                .signerType(stored.getSignerType())
+                .bucketName(stored.getBucketName())
+                .region(stored.getRegion())
+                .endpoint(stored.getEndpoint())
+                .build();
+    }
+
+}
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/multitenancy/TenantAwareTaskDecorator.java
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/multitenancy/TenantAwareTaskDecorator.java
new file mode 100644
index 0000000..a500112
--- /dev/null
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/multitenancy/TenantAwareTaskDecorator.java
@@ -0,0 +1,28 @@
+package com.knecon.fforesight.service.layoutparser.processor.multitenancy;
+
+import org.springframework.core.task.TaskDecorator;
+import org.springframework.lang.NonNull;
+
+/**
+ * Wraps a task so that it runs with the tenant id that was set when the task was decorated.
+ */
+public class TenantAwareTaskDecorator implements TaskDecorator {
+
+    @Override
+    @NonNull
+    public Runnable decorate(@NonNull Runnable runnable) {
+
+        // read at decoration time, applied when the task actually runs
+        String tenantId = TenantContext.getTenantId();
+        return () -> {
+            try {
+                TenantContext.setTenantId(tenantId);
+                runnable.run();
+            } finally {
+                // remove the entry instead of storing null so nothing lingers on the executing thread
+                TenantContext.clear();
+            }
+        };
+    }
+
+}
\ No newline at end of file
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/multitenancy/TenantContext.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/multitenancy/TenantContext.java
new file mode 100644
index 0000000..d042b05
--- /dev/null
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/multitenancy/TenantContext.java
@@ -0,0 +1,35 @@
+package com.knecon.fforesight.service.layoutparser.processor.multitenancy;
+
+import lombok.extern.slf4j.Slf4j;
+
+/**
+ * Holds the tenant id of the current thread in an inheritable thread-local.
+ */
+@Slf4j
+public final class TenantContext {
+
+    private static final InheritableThreadLocal<String> currentTenant = new InheritableThreadLocal<>();
+
+
+    public static void setTenantId(String tenantId) {
+
+        log.debug("Setting tenantId to {}", tenantId);
+        currentTenant.set(tenantId);
+    }
+
+
+    public static String getTenantId() {
+
+        return currentTenant.get();
+    }
+
+
+    /**
+     * Removes the tenant id from the current thread.
+     */
+    public static void clear() {
+
+        currentTenant.remove();
+    }
+
+}
\ No newline at end of file
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/multitenancy/TenantInterceptor.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/multitenancy/TenantInterceptor.java
new file mode 100644
index 0000000..a077df0
--- /dev/null
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/multitenancy/TenantInterceptor.java
@@ -0,0 +1,41 @@
+package com.knecon.fforesight.service.layoutparser.processor.multitenancy;
+
+import org.springframework.stereotype.Component;
+import org.springframework.ui.ModelMap;
+import org.springframework.web.context.request.WebRequest;
+import org.springframework.web.context.request.WebRequestInterceptor;
+
+/**
+ * Sets the tenant from the {@code X-TENANT-ID} request header before a request is handled and removes it afterwards.
+ */
+@Component
+public class TenantInterceptor implements WebRequestInterceptor {
+
+    public static final String TENANT_HEADER_NAME = "X-TENANT-ID";
+
+
+    @Override
+    public void preHandle(WebRequest request) {
+
+        String tenantId = request.getHeader(TENANT_HEADER_NAME);
+        if (tenantId != null) {
+            TenantContext.setTenantId(tenantId);
+        }
+    }
+
+
+    @Override
+    public void postHandle(WebRequest request, ModelMap model) {
+
+        // Intentionally empty: this callback only runs after a successful handler invocation.
+    }
+
+
+    @Override
+    public void afterCompletion(WebRequest request, Exception ex) {
+
+        // Runs on any outcome of the handler, so the tenant is also removed when the handler threw.
+        TenantContext.clear();
+    }
+
+}
\ No newline at end of file
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/multitenancy/TenantsClient.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/multitenancy/TenantsClient.java
new file mode 100644
index 0000000..8b7d135
--- /dev/null
+++
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/multitenancy/TenantsClient.java
@@ -0,0 +1,10 @@
+package com.knecon.fforesight.service.layoutparser.processor.multitenancy;
+
+import org.springframework.cloud.openfeign.FeignClient;
+
+import com.iqser.red.service.persistence.service.v1.api.internal.resources.TenantsResource;
+
+@FeignClient(name = "TenantsResource", url = "${persistence-service.url}")
+public interface TenantsClient extends TenantsResource {
+
+}
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/queue/LayoutParsingFinishedEvent.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/queue/LayoutParsingFinishedEvent.java
new file mode 100644
index 0000000..6374fdc
--- /dev/null
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/queue/LayoutParsingFinishedEvent.java
@@ -0,0 +1,17 @@
+package com.knecon.fforesight.service.layoutparser.processor.queue;
+
+import lombok.Builder;
+import lombok.Getter;
+
+/**
+ * Outcome of a layout parsing request; {@link MessageHandler} serializes it to JSON and publishes it.
+ */
+@Builder
+@Getter
+public class LayoutParsingFinishedEvent {
+
+    // Getters are needed: with package-private fields only, the JSON serializer finds no properties to write.
+    int status;
+    String message;
+
+}
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/queue/LayoutParsingRequest.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/queue/LayoutParsingRequest.java
new file mode 100644
index 0000000..661d98b
--- /dev/null
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/queue/LayoutParsingRequest.java
@@ -0,0 +1,26 @@
+package com.knecon.fforesight.service.layoutparser.processor.queue;
+
+import java.util.Optional;
+
+import lombok.AllArgsConstructor;
+import lombok.Builder;
+import lombok.Data;
+import lombok.NoArgsConstructor;
+
+@Data
+@Builder
+@AllArgsConstructor
+@NoArgsConstructor
+public class LayoutParsingRequest {
+
+    String originFileStorageId;
+
+    Optional tablesFileStorageId;
+    Optional imagesFileStorageId;
+
+    String structureFileStorageId;
+    String textBlockFileStorageId;
+    String positionBlockFileStorageId;
+    String pageFileStorageId;
+
+}
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/queue/MessageHandler.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/queue/MessageHandler.java
new file mode 100644
index 0000000..ba72513
--- /dev/null
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/queue/MessageHandler.java
@@ -0,0 +1,69 @@
+package com.knecon.fforesight.service.layoutparser.processor.queue;
+
+import static com.knecon.fforesight.service.layoutparser.processor.queue.MessagingConfiguration.LAYOUTPARSING_REQUEST_QUEUE;
+
+import java.io.IOException;
+
+import org.springframework.amqp.AmqpRejectAndDontRequeueException;
+import org.springframework.amqp.core.Message;
+import org.springframework.amqp.rabbit.annotation.RabbitHandler;
+import org.springframework.amqp.rabbit.annotation.RabbitListener;
+import org.springframework.amqp.rabbit.core.RabbitTemplate;
+import org.springframework.stereotype.Service;
+
+import com.fasterxml.jackson.core.JsonProcessingException;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingService;
+
+import lombok.RequiredArgsConstructor;
+import lombok.extern.slf4j.Slf4j;
+
+/**
+ * Consumes layout parsing requests from the request queue and publishes the outcome to the finished-event queue.
+ */
+@Slf4j
+@Service
+@RequiredArgsConstructor
+public class MessageHandler {
+
+    private final LayoutParsingService layoutParsingService;
+    private final ObjectMapper objectMapper;
+    private final RabbitTemplate rabbitTemplate;
+
+
+    /**
+     * Deserializes the request, runs the layout parsing and publishes the resulting event.
+     *
+     * @throws AmqpRejectAndDontRequeueException if the message body cannot be deserialized into a request
+     */
+    @RabbitHandler
+    @RabbitListener(queues = LAYOUTPARSING_REQUEST_QUEUE)
+    public void receiveLayoutParsingRequest(Message message) {
+
+        LayoutParsingRequest layoutParsingRequest;
+        try {
+            layoutParsingRequest = objectMapper.readValue(message.getBody(), LayoutParsingRequest.class);
+        } catch (IOException e) {
+            log.error("LayoutParsingRequest could not be deserialized", e);
+            sendLayoutParsingFinishedEvent(LayoutParsingFinishedEvent.builder().status(400).message("LayoutParsingRequest could not be deserialized!").build());
+            // A malformed body stays malformed on redelivery, so the message must not be requeued.
+            throw new AmqpRejectAndDontRequeueException("LayoutParsingRequest could not be deserialized", e);
+        }
+        LayoutParsingFinishedEvent layoutParsingFinishedEvent = layoutParsingService.parseLayoutAndSaveFilesToStorage(layoutParsingRequest);
+        sendLayoutParsingFinishedEvent(layoutParsingFinishedEvent);
+    }
+
+
+    /**
+     * Publishes the event as a JSON string to the finished-event queue.
+     */
+    public void sendLayoutParsingFinishedEvent(LayoutParsingFinishedEvent layoutParsingFinishedEvent) {
+
+        try {
+            rabbitTemplate.convertAndSend(MessagingConfiguration.LAYOUTPARSING_FINISHED_EVENT_QUEUE, objectMapper.writeValueAsString(layoutParsingFinishedEvent));
+        } catch (JsonProcessingException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+}
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/queue/MessagingConfiguration.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/queue/MessagingConfiguration.java
new file mode 100644
index 0000000..be54ef6
--- /dev/null
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/queue/MessagingConfiguration.java
@@ -0,0 +1,16 @@
+package com.knecon.fforesight.service.layoutparser.processor.queue;
+
+import org.springframework.context.annotation.Configuration;
+
+import lombok.RequiredArgsConstructor;
+
+@Configuration
+@RequiredArgsConstructor
+public class MessagingConfiguration {
+
+
+    public static final String LAYOUTPARSING_REQUEST_QUEUE = "LAYOUTPARSING_REQUEST_QUEUE";
+    public static final String
LAYOUTPARSING_FINISHED_EVENT_QUEUE = "LAYOUTPARSING_FINISHED_EVENT_QUEUE"; + + +} diff --git a/layoutparser-service/layoutparser-service-server/pom.xml b/layoutparser-service/layoutparser-service-server/pom.xml new file mode 100644 index 0000000..c95d10a --- /dev/null +++ b/layoutparser-service/layoutparser-service-server/pom.xml @@ -0,0 +1,58 @@ + + + 4.0.0 + + + com.knecon.fforesight + layoutparser-service + 1.0.0 + + + layoutparser-service-server + 1.0.0 + + + + com.knecon.fforesight + layoutparser-service-processor + ${project.version} + + + + javax.servlet + javax.servlet-api + 4.0.1 + + + org.springframework.cloud + spring-cloud-starter-openfeign + 4.0.2 + + + + org.springframework.boot + spring-boot-starter-amqp + ${spring.version} + + + + org.junit.jupiter + junit-jupiter-api + 5.9.1 + test + + + org.springframework.boot + spring-boot-test + ${spring.version} + test + + + org.springframework + spring-test + 6.0.3 + test + + + diff --git a/layoutparser-service/layoutparser-service-server/src/main/java/com.knecon.fforesight.service.layoutparser.server/Application.java b/layoutparser-service/layoutparser-service-server/src/main/java/com.knecon.fforesight.service.layoutparser.server/Application.java new file mode 100644 index 0000000..e250913 --- /dev/null +++ b/layoutparser-service/layoutparser-service-server/src/main/java/com.knecon.fforesight.service.layoutparser.server/Application.java @@ -0,0 +1,26 @@ +package com.knecon.fforesight.service.layoutparser.server; + +import org.springframework.boot.SpringApplication; +import org.springframework.boot.actuate.autoconfigure.security.servlet.ManagementWebSecurityAutoConfiguration; +import org.springframework.boot.autoconfigure.SpringBootApplication; +import org.springframework.boot.autoconfigure.security.servlet.SecurityAutoConfiguration; +import org.springframework.cloud.openfeign.EnableFeignClients; +import org.springframework.context.annotation.Import; + +import 
/**
 * Spring Boot entry point of the layout parser server.
 *
 * <p>Imports the multi-tenancy, async and processor configurations, enables Feign clients and
 * excludes the servlet security auto-configurations.
 */
// NOTE(review): the MetricsConfiguration listed in @Import is resolved by this file's imports to
// com.amazonaws.services.s3.model.metrics.MetricsConfiguration, an AWS S3 model class rather than
// a Spring configuration - this looks unintended; confirm which metrics configuration was meant.
@Import({MultiTenancyWebConfiguration.class, AsyncConfig.class, MultiTenancyMessagingConfiguration.class, MetricsConfiguration.class, LayoutparserServiceProcessorConfiguration.class})
@EnableFeignClients
@SpringBootApplication(exclude = {SecurityAutoConfiguration.class, ManagementWebSecurityAutoConfiguration.class})
public class Application {

    /**
     * Starts the Spring application context.
     *
     * @param args command line arguments, passed through to Spring Boot
     */
    public static void main(String[] args) {

        SpringApplication.run(Application.class, args);
    }

}
Service Processor + +persistence-service.url: "http://persistence-service-v1:8080" + +server: + port: 8080 + +spring: + main: + allow-circular-references: true # FIXME + profiles: + active: kubernetes + rabbitmq: + host: ${RABBITMQ_HOST:localhost} + port: ${RABBITMQ_PORT:5672} + username: ${RABBITMQ_USERNAME:user} + password: ${RABBITMQ_PASSWORD:rabbitmq} + listener: + simple: + acknowledge-mode: AUTO + concurrency: 2 + retry: + enabled: true + max-attempts: 3 + max-interval: 15000 + prefetch: 1 + +management: + endpoint: + metrics.enabled: ${monitoring.enabled:false} + prometheus.enabled: ${monitoring.enabled:false} + health.enabled: true + endpoints.web.exposure.include: prometheus, health + + +storage: + backend: 's3' diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/ApplicationTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/ApplicationTest.java new file mode 100644 index 0000000..24573c1 --- /dev/null +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/ApplicationTest.java @@ -0,0 +1,7 @@ +package com.knecon.fforesight.service.layoutparser.server; + +import static org.junit.jupiter.api.Assertions.*; + +class ApplicationTest { + +} \ No newline at end of file diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/BaseTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/BaseTest.java new file mode 100644 index 0000000..74e28c2 --- /dev/null +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/BaseTest.java @@ -0,0 +1,150 @@ +package com.knecon.fforesight.service.layoutparser.server; + +import java.io.InputStream; +import java.util.Optional; + 
+import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.extension.ExtendWith; +import org.springframework.amqp.rabbit.core.RabbitTemplate; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.boot.autoconfigure.EnableAutoConfiguration; +import org.springframework.boot.autoconfigure.amqp.RabbitAutoConfiguration; +import org.springframework.boot.test.context.SpringBootTest; +import org.springframework.boot.test.mock.mockito.MockBean; +import org.springframework.context.annotation.Bean; +import org.springframework.context.annotation.Configuration; +import org.springframework.context.annotation.Import; +import org.springframework.context.annotation.Primary; +import org.springframework.core.io.ClassPathResource; +import org.springframework.test.context.junit.jupiter.SpringExtension; + +import com.iqser.red.storage.commons.service.StorageService; +import com.knecon.fforesight.service.layoutparser.internal.api.mapper.DocumentDataMapper; +import com.knecon.fforesight.service.layoutparser.internal.api.mapper.DocumentGraphMapper; +import com.knecon.fforesight.service.layoutparser.internal.api.services.EntityEnrichmentService; +import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingStorageService; +import com.knecon.fforesight.service.layoutparser.processor.multitenancy.TenantContext; +import com.knecon.fforesight.service.layoutparser.processor.queue.LayoutParsingRequest; + +import lombok.SneakyThrows; + +@ExtendWith(SpringExtension.class) +@SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT) +@Import(BaseTest.TestConfiguration.class) +public class BaseTest { + + @Autowired + protected LayoutParsingStorageService layoutParsingStorageService; + + @Autowired + protected StorageService storageService; + + @MockBean + private RabbitTemplate rabbitTemplate; + + protected final static String ORIGIN_FILE_ID = "origin"; + protected final static String 
TABLE_FILE_ID = "table"; + protected final static String IMAGE_FILE_ID = "image"; + protected final static String STRUCTURE_FILE_ID = "structure"; + protected final static String TEXT_FILE_ID = "texts"; + protected final static String POSITION_FILE_ID = "positions"; + protected final static String PAGES_FILE_ID = "pages"; + + + @SneakyThrows + protected LayoutParsingRequest prepareStorage(String file) { + + return prepareStorage(file, "cv_table_parsing_response/empty.json", "image_service_response/empty.json"); + } + + + @SneakyThrows + protected LayoutParsingRequest prepareStorage(InputStream fileInputStream) { + + storageService.storeObject(TenantContext.getTenantId(), ORIGIN_FILE_ID, fileInputStream); + return LayoutParsingRequest.builder() + .imagesFileStorageId(Optional.empty()) + .originFileStorageId(ORIGIN_FILE_ID) + .tablesFileStorageId(Optional.empty()) + .pageFileStorageId(PAGES_FILE_ID) + .positionBlockFileStorageId(POSITION_FILE_ID) + .structureFileStorageId(STRUCTURE_FILE_ID) + .textBlockFileStorageId(TEXT_FILE_ID) + .build(); + } + + + @SneakyThrows + protected LayoutParsingRequest prepareStorage(String file, String cvServiceResponseFile, String imageInfoFile) { + + ClassPathResource pdfFileResource = new ClassPathResource(file); + ClassPathResource cvServiceResponseFileResource = new ClassPathResource(cvServiceResponseFile); + ClassPathResource imageInfoFileResource = new ClassPathResource(imageInfoFile); + + return prepareStorage(pdfFileResource.getInputStream(), cvServiceResponseFileResource.getInputStream(), imageInfoFileResource.getInputStream()); + } + + + @SneakyThrows + protected LayoutParsingRequest prepareStorage(InputStream fileStream, InputStream cvServiceResponseFileStream, InputStream imageInfoStream) { + + storageService.storeObject(TenantContext.getTenantId(), IMAGE_FILE_ID, imageInfoStream); + storageService.storeObject(TenantContext.getTenantId(), TABLE_FILE_ID, cvServiceResponseFileStream); + 
storageService.storeObject(TenantContext.getTenantId(), ORIGIN_FILE_ID, fileStream); + + return LayoutParsingRequest.builder() + .imagesFileStorageId(Optional.of(IMAGE_FILE_ID)) + .originFileStorageId(ORIGIN_FILE_ID) + .tablesFileStorageId(Optional.of(TABLE_FILE_ID)) + .pageFileStorageId(PAGES_FILE_ID) + .positionBlockFileStorageId(POSITION_FILE_ID) + .structureFileStorageId(STRUCTURE_FILE_ID) + .textBlockFileStorageId(TEXT_FILE_ID) + .build(); + } + + + @AfterEach + public void cleanupStorage() { + + if (this.storageService instanceof FileSystemBackedStorageService) { + ((FileSystemBackedStorageService) this.storageService).clearStorage(); + } + } + + + @Configuration + @EnableAutoConfiguration(exclude = RabbitAutoConfiguration.class) + public static class TestConfiguration { + + @Bean + @Primary + public StorageService inMemoryStorage() { + + return new FileSystemBackedStorageService(); + } + + + @Bean + public EntityEnrichmentService testEntityEnrichmentService() { + + return new TestEntityEnrichmentService(); + } + + + @Bean + public DocumentDataMapper documentDataMapper() { + + return new DocumentDataMapper(); + } + + + @Bean + public DocumentGraphMapper documentGraphMapper() { + + return new DocumentGraphMapper(); + } + + } + +} diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/FileSystemBackedStorageService.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/FileSystemBackedStorageService.java new file mode 100644 index 0000000..809cb9e --- /dev/null +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/FileSystemBackedStorageService.java @@ -0,0 +1,123 @@ +package com.knecon.fforesight.service.layoutparser.server; + +import static java.io.File.createTempFile; + +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; 
+import java.io.InputStream; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +import org.apache.commons.io.IOUtils; +import org.springframework.core.io.InputStreamResource; + +import com.fasterxml.jackson.databind.ObjectMapper; +import com.iqser.red.commons.jackson.ObjectMapperFactory; +import com.iqser.red.storage.commons.exception.StorageObjectDoesNotExist; +import com.iqser.red.storage.commons.service.StorageService; + +import lombok.SneakyThrows; + +public class FileSystemBackedStorageService implements StorageService { + + private final Map dataMap = new HashMap<>(); + + + public FileSystemBackedStorageService() { + + } + + + @SneakyThrows + @Override + public InputStreamResource getObject(String tenantId, String objectId) { + + var res = dataMap.get(objectId); + if (res == null) { + throw new StorageObjectDoesNotExist(new RuntimeException()); + } + return new InputStreamResource(new FileInputStream(res)); + + } + + + @Override + public void deleteObject(String tenantId, String objectId) { + + dataMap.remove(objectId); + } + + + @Override + public boolean objectExists(String tenantId, String objectId) { + + return dataMap.containsKey(objectId); + } + + + @Override + @SneakyThrows + public void storeJSONObject(String tenantId, String objectId, T any) { + + File tempFile = createTempFile("test", ".tmp"); + getMapper().writeValue(new FileOutputStream(tempFile), any); + dataMap.put(objectId, tempFile); + } + + + private ObjectMapper getMapper() { + + return ObjectMapperFactory.create(); + } + + + @Override + @SneakyThrows + public T readJSONObject(String tenantId, String objectId, Class clazz) { + + if (dataMap.get(objectId) == null || !dataMap.get(objectId).exists()) { + throw new StorageObjectDoesNotExist("Stored object not found"); + } + return getMapper().readValue(new FileInputStream(dataMap.get(objectId)), clazz); + } + + + public List listPaths() { + + return new 
ArrayList<>(dataMap.keySet()); + } + + + public List listFilePaths() { + + return dataMap.values().stream().map(File::getAbsolutePath).collect(Collectors.toList()); + } + + + @Override + @SneakyThrows + public void storeObject(String tenantId, String objectId, InputStream stream) { + + File tempFile = createTempFile("test", ".tmp"); + + try (var fileOutputStream = new FileOutputStream(tempFile)) { + IOUtils.copy(stream, fileOutputStream); + } + + dataMap.put(objectId, tempFile); + } + + + public void clearStorage() { + + this.dataMap.forEach((k, v) -> { + v.delete(); + }); + this.dataMap.clear(); + } + +} diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutParserApplicationTests.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutParserApplicationTests.java new file mode 100644 index 0000000..a2af3fe --- /dev/null +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutParserApplicationTests.java @@ -0,0 +1,11 @@ +package com.knecon.fforesight.service.layoutparser.server; + +import org.junit.jupiter.api.Test; + +class LayoutParserApplicationTests extends BaseTest { + + @Test + void contextLoads() { + } + +} diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/TestEntity.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/TestEntity.java new file mode 100644 index 0000000..22c7eec --- /dev/null +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/TestEntity.java @@ -0,0 +1,124 @@ +package com.knecon.fforesight.service.layoutparser.server; + +import java.nio.charset.StandardCharsets; +import java.util.HashSet; +import java.util.LinkedList; 
+import java.util.List; +import java.util.Set; + +import com.google.common.hash.Hashing; +import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Engine; +import com.knecon.fforesight.service.layoutparser.internal.api.graph.Boundary; +import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode; +import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityPosition; +import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode; +import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SemanticNode; + +import lombok.AccessLevel; +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Data; +import lombok.experimental.FieldDefaults; + +@Data +@Builder +@AllArgsConstructor +@FieldDefaults(level = AccessLevel.PRIVATE) +public class TestEntity implements EntityNode { + + // initial values + final Boundary boundary; + final String type; + final String entityType; + + // empty defaults + boolean redaction; + boolean removed; + boolean ignored; + boolean resized; + boolean skipRemoveEntitiesContainedInLarger; + boolean dictionaryEntry; + boolean dossierDictionaryEntry; + Set engines; + Set references; + int matchedRule; + String redactionReason; + String legalBasis; + + // inferred on graph insertion + String value; + CharSequence textBefore; + CharSequence textAfter; + @Builder.Default + Set pages = new HashSet<>(); + List entityPositionsPerPage; + @Builder.Default + List intersectingNodes = new LinkedList<>(); + SemanticNode deepestFullyContainingNode; + + + public static TestEntity initialEntityNode(Boundary boundary, String type, String entityType) { + + return TestEntity.builder() + .type(type) + .entityType(entityType) + .boundary(boundary) + .redaction(false) + .removed(false) + .ignored(false) + .resized(false) + .skipRemoveEntitiesContainedInLarger(false) + .dictionaryEntry(false) + .dossierDictionaryEntry(false) + .engines(new 
HashSet<>()) + .references(new HashSet<>()) + .matchedRule(-1) + .redactionReason("") + .legalBasis("") + .build(); + } + + + public void addIntersectingNode(SemanticNode containingNode) { + + intersectingNodes.add(containingNode); + } + + + @Override + public String toString() { + + StringBuilder sb = new StringBuilder(); + sb.append("Entity[\""); + sb.append(value); + sb.append("\", "); + sb.append(boundary); + sb.append(", pages["); + pages.forEach(page -> { + sb.append(page.getNumber()); + sb.append(", "); + }); + sb.delete(sb.length() - 2, sb.length()); + sb.append("], type = \""); + sb.append(type); + sb.append("\", EntityType."); + sb.append(entityType); + sb.append("]"); + return sb.toString(); + } + + + @Override + public int hashCode() { + + return Hashing.murmur3_128().hashString(toString(), StandardCharsets.UTF_8).hashCode(); + } + + + @Override + public boolean equals(Object o) { + + return o instanceof TestEntity && o.hashCode() == hashCode(); + } + +} diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/TestEntityEnrichmentService.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/TestEntityEnrichmentService.java new file mode 100644 index 0000000..76a54c1 --- /dev/null +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/TestEntityEnrichmentService.java @@ -0,0 +1,86 @@ +package com.knecon.fforesight.service.layoutparser.server; + +import java.util.Arrays; +import java.util.List; +import java.util.Objects; + +import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode; +import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock; +import com.knecon.fforesight.service.layoutparser.internal.api.services.EntityEnrichmentService; + +import lombok.RequiredArgsConstructor; + 
+@RequiredArgsConstructor +public class TestEntityEnrichmentService implements EntityEnrichmentService { + + public void enrichEntity(EntityNode entity, TextBlock textBlock) { + if (entity instanceof TestEntity) { + TestEntity entity2 = (TestEntity) entity; + entity2.setValue(textBlock.subSequence(entity.getBoundary()).toString()); + entity2.setTextAfter(findTextAfter(entity.getBoundary().end(), textBlock)); + entity2.setTextBefore(findTextBefore(entity.getBoundary().start(), textBlock)); + } + } + + + private CharSequence findTextAfter(int index, TextBlock textBlock) { + + int endOffset = Math.min(index + 100, textBlock.getBoundary().end()); + String textAfter = textBlock.subSequence(index, endOffset).toString(); + if (!textAfter.isBlank()) { + List wordsAfter = splitToWordsAndRemoveEmptyWords(textAfter); + int numberOfWordsAfter = Math.min(wordsAfter.size(), 3); + if (wordsAfter.size() > 0) { + return concatWordsAfter(wordsAfter.subList(0, numberOfWordsAfter), textAfter.startsWith(" ")); + } + } + return ""; + } + + + private CharSequence findTextBefore(int index, TextBlock textBlock) { + + int offsetBefore = Math.max(index - 100, textBlock.getBoundary().start()); + String textBefore = textBlock.subSequence(offsetBefore, index).toString(); + if (!textBefore.isBlank()) { + List wordsBefore = splitToWordsAndRemoveEmptyWords(textBefore); + int numberOfWordsBefore = Math.min(wordsBefore.size(), 3); + if (wordsBefore.size() > 0) { + return concatWordsBefore(wordsBefore.subList(wordsBefore.size() - numberOfWordsBefore, wordsBefore.size()), textBefore.endsWith(" ")); + } + } + return ""; + } + + + private static List splitToWordsAndRemoveEmptyWords(String textAfter) { + + return Arrays.stream(textAfter.split(" ")).filter(word -> !Objects.equals("", word)).toList(); + } + + + private static String concatWordsBefore(List words, boolean endWithSpace) { + + StringBuilder sb = new StringBuilder(); + + for (String word : words) { + sb.append(word).append(" "); + } + + String 
result = sb.toString().trim(); + return endWithSpace ? result + " " : result; + } + + + private static String concatWordsAfter(List words, boolean startWithSpace) { + + StringBuilder sb = new StringBuilder(); + + for (String word : words) { + sb.append(word).append(" "); + } + + String result = sb.toString().trim(); + return startWithSpace ? " " + result : result; + } +} diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/BoundaryTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/BoundaryTest.java new file mode 100644 index 0000000..3d97c8a --- /dev/null +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/BoundaryTest.java @@ -0,0 +1,71 @@ +package com.knecon.fforesight.service.layoutparser.server.graph; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.util.Collections; +import java.util.List; + +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import com.knecon.fforesight.service.layoutparser.internal.api.graph.Boundary; + +class BoundaryTest { + + Boundary startBoundary; + + + @BeforeEach + void setUp() { + + startBoundary = new Boundary(10, 100); + } + + + @Test + void testContains() { + + assertTrue(startBoundary.contains(11)); + assertTrue(startBoundary.contains(50)); + assertFalse(startBoundary.contains(9)); + assertFalse(startBoundary.contains(100)); + assertFalse(startBoundary.contains(150)); + assertFalse(startBoundary.contains(-123)); + assertTrue(startBoundary.contains(new Boundary(11, 99))); + assertTrue(startBoundary.contains(new Boundary(10, 100))); + 
assertTrue(startBoundary.contains(new Boundary(11, 11))); + assertFalse(startBoundary.contains(9, 100)); + assertTrue(startBoundary.contains(100, 100)); + assertFalse(startBoundary.contains(100, 101)); + assertFalse(startBoundary.contains(150, 151)); + } + + + @Test + void testIntersects() { + + assertTrue(startBoundary.intersects(new Boundary(1, 11))); + assertTrue(startBoundary.intersects(new Boundary(11, 12))); + assertTrue(startBoundary.intersects(new Boundary(11, 100))); + assertFalse(startBoundary.intersects(new Boundary(100, 101))); + assertTrue(startBoundary.intersects(new Boundary(99, 101))); + } + + + @Test + void testSplit() { + + assertEquals(4, startBoundary.split(List.of(12, 40, 90)).size()); + assertEquals(List.of(new Boundary(10, 12), new Boundary(12, 40), new Boundary(40, 90), new Boundary(90, 100)), startBoundary.split(List.of(12, 40, 90))); + assertEquals(List.of(new Boundary(10, 40), new Boundary(40, 100)), startBoundary.split(List.of(40))); + assertEquals(1, startBoundary.split(Collections.emptyList()).size()); + assertEquals(1, startBoundary.split(List.of(startBoundary.start())).size()); + assertThrows(IndexOutOfBoundsException.class, () -> startBoundary.split(Collections.singletonList(0))); + assertThrows(IndexOutOfBoundsException.class, () -> startBoundary.split(Collections.singletonList(100))); + assertThrows(IndexOutOfBoundsException.class, () -> startBoundary.split(List.of(12, 40, 100))); + } + +} \ No newline at end of file diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphMappingTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphMappingTest.java new file mode 100644 index 0000000..cd0bf02 --- /dev/null +++ 
b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphMappingTest.java @@ -0,0 +1,60 @@ +package com.knecon.fforesight.service.layoutparser.server.graph; + +import java.util.Collections; + +import org.apache.pdfbox.Loader; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.junit.jupiter.api.Test; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.core.io.ClassPathResource; + +import com.knecon.fforesight.service.layoutparser.internal.api.data.DocumentData; +import com.knecon.fforesight.service.layoutparser.internal.api.graph.DocumentGraph; +import com.knecon.fforesight.service.layoutparser.internal.api.mapper.DocumentDataMapper; +import com.knecon.fforesight.service.layoutparser.internal.api.mapper.DocumentGraphMapper; +import com.knecon.fforesight.service.layoutparser.processor.classification.service.PdfParsingService; +import com.knecon.fforesight.service.layoutparser.processor.factory.DocumentGraphFactory; +import com.knecon.fforesight.service.layoutparser.processor.queue.LayoutParsingRequest; +import com.knecon.fforesight.service.layoutparser.server.BaseTest; + +import lombok.SneakyThrows; + +public class DocumentGraphMappingTest extends BaseTest { + + @Autowired + private DocumentGraphFactory documentGraphFactory; + + @Autowired + private PdfParsingService pdfParsingService; + + @Autowired + private DocumentDataMapper documentDataMapper; + + @Autowired + private DocumentGraphMapper documentGraphMapper; + + + @Test + @SneakyThrows + public void testGraphMapping() { + + String filename = "files/crafted document"; + + prepareStorage(filename + ".pdf"); + ClassPathResource fileResource = new ClassPathResource(filename + ".pdf"); + LayoutParsingRequest layoutParsingRequest = prepareStorage(fileResource.getInputStream()); + PDDocument pdDocument = Loader.loadPDF(fileResource.getInputStream()); + + var classifiedDoc = 
pdfParsingService.parseDocument(pdDocument, Collections.emptyMap(), Collections.emptyMap()); + DocumentGraph document = documentGraphFactory.buildDocumentGraph(classifiedDoc); + DocumentData documentData = documentDataMapper.toDocumentData(document); + + layoutParsingStorageService.storeDocumentData(layoutParsingRequest, documentData); + DocumentData documentData2 = layoutParsingStorageService.readDocumentData(layoutParsingRequest); + DocumentGraph newDocumentGraph = documentGraphMapper.toDocumentGraph(documentData2); + + assert document.toString().equals(newDocumentGraph.toString()); + assert document.getTableOfContents().toString().equals(newDocumentGraph.getTableOfContents().toString()); + } + +} diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphTest.java new file mode 100644 index 0000000..bf02e3a --- /dev/null +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphTest.java @@ -0,0 +1,308 @@ +package com.knecon.fforesight.service.layoutparser.server.graph; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertInstanceOf; +import static org.wildfly.common.Assert.assertTrue; + +import java.io.InputStream; +import java.util.List; + +import org.apache.pdfbox.Loader; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.junit.jupiter.api.Test; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.core.io.ClassPathResource; + +import com.knecon.fforesight.service.layoutparser.internal.api.graph.Boundary; +import com.knecon.fforesight.service.layoutparser.internal.api.graph.DocumentGraph; +import 
com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents; +import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode; +import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.HeadlineNode; +import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.NodeType; +import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode; +import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.ParagraphNode; +import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SemanticNode; +import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.TableCellNode; +import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.TableNode; +import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock; +import com.knecon.fforesight.service.layoutparser.internal.api.services.EntityInsertionService; +import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingService; +import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableServiceResponse; +import com.knecon.fforesight.service.layoutparser.server.BaseTest; +import com.knecon.fforesight.service.layoutparser.server.TestEntity; + +import lombok.SneakyThrows; + +public class DocumentGraphTest extends BaseTest { + + @Autowired + private EntityInsertionService entityInsertionService; + + @Autowired + private LayoutParsingService layoutParsingService; + + + @Test + public void assertTextBeforeAndTextAfterForParagraphCrafted() { + + DocumentGraph documentGraph = buildGraph("files/crafted document"); + String searchTerm = "Clarissa"; + int start = documentGraph.getTextBlock().indexOf(searchTerm); + assert start != -1; + + Boundary boundary = new Boundary(start, start + searchTerm.length()); + TestEntity entityNode = TestEntity.initialEntityNode(boundary, "123", "123"); + 
entityInsertionService.addEntityToGraph(entityNode, documentGraph.getTableOfContents()); + + assertEquals("Expand to Hint ", entityNode.getTextBefore()); + assertEquals("’s Donut ←", entityNode.getTextAfter()); + assertEquals(searchTerm, entityNode.getValue()); + assertEquals("Rule 5: Do not redact genitive CBI_authors (Entries based on Dict) ", + entityNode.getDeepestFullyContainingNode().getHeadline().buildTextBlock().getSearchText()); + assertEquals(2, entityNode.getIntersectingNodes().size()); + assertEquals(5, entityNode.getDeepestFullyContainingNode().getNumberOnPage()); + assertInstanceOf(ParagraphNode.class, entityNode.getDeepestFullyContainingNode()); + + assertSameOffsetInAllIntersectingNodes(searchTerm, start, entityNode); + } + + + @Test + public void assertTextBeforeAndTextAfterForHeadlineCrafted() { + + DocumentGraph documentGraph = buildGraph("files/crafted document"); + String searchTerm = "Rule 39:"; + int start = documentGraph.getTextBlock().indexOf(searchTerm); + assert start != -1; + + Boundary boundary = new Boundary(start, start + searchTerm.length()); + TestEntity entityNode = TestEntity.initialEntityNode(boundary, "123", "123"); + entityInsertionService.addEntityToGraph(entityNode, documentGraph.getTableOfContents()); + + assertEquals("", entityNode.getTextBefore()); + assertEquals(" Purity Hint", entityNode.getTextAfter()); + assertEquals(searchTerm, entityNode.getValue()); + assertEquals("Rule 39: Purity Hint ", entityNode.getDeepestFullyContainingNode().getHeadline().buildTextBlock().getSearchText()); + assertEquals(2, entityNode.getIntersectingNodes().size()); + assertEquals(6, entityNode.getDeepestFullyContainingNode().getNumberOnPage()); + assertInstanceOf(HeadlineNode.class, entityNode.getDeepestFullyContainingNode()); + + assertSameOffsetInAllIntersectingNodes(searchTerm, start, entityNode); + } + + + @Test + public void assertTextBeforeAndTextAfterForTableCellCrafted() { + + DocumentGraph documentGraph = buildGraph("files/crafted 
document"); + String searchTerm = "1998"; + int start = documentGraph.getTextBlock().indexOf(searchTerm); + assert start != -1; + + Boundary boundary = new Boundary(start, start + searchTerm.length()); + TestEntity entityNode = TestEntity.initialEntityNode(boundary, "123", "123"); + entityInsertionService.addEntityToGraph(entityNode, documentGraph.getTableOfContents()); + + assertEquals("", entityNode.getTextBefore()); + assertEquals("", entityNode.getTextAfter()); + assertEquals(searchTerm, entityNode.getValue()); + assertEquals("Rule 6-11 (Authors Table) ", entityNode.getDeepestFullyContainingNode().getHeadline().buildTextBlock().getSearchText()); + assertEquals(3, entityNode.getIntersectingNodes().size()); + assertEquals(15, entityNode.getDeepestFullyContainingNode().getNumberOnPage()); + assertInstanceOf(TableCellNode.class, entityNode.getDeepestFullyContainingNode()); + + assertSameOffsetInAllIntersectingNodes(searchTerm, start, entityNode); + } + + + @Test + public void findAndCheckMultipleSearchTermsCrafted() { + + DocumentGraph documentGraph = buildGraph("files/crafted document"); + assertValueAndPageAndIntersectingNodes(documentGraph, "David", 1); + assertValueAndPageAndIntersectingNodes(documentGraph, "Weyland Industries", 2); + assertValueAndPageAndIntersectingNodes(documentGraph, "Desiree", 3); + assertValueAndPageAndIntersectingNodes(documentGraph, "kawasaki@me.com", 4); + assertValueAndPageAndIntersectingNodes(documentGraph, "Central Research Industry", 5); + } + + + @Test + public void assertTableStructure() { + + DocumentGraph documentGraph = buildGraph("files/crafted document"); + TableNode table = (TableNode) documentGraph.getTableOfContents()// + .streamEntriesInOrder()// + .filter(entry -> entry.type().equals(NodeType.TABLE))// + .map(TableOfContents.Entry::node)// + .findFirst().orElseThrow(); + assertEquals(5, table.getNumberOfCols()); + assertEquals(4, table.getNumberOfRows()); + assertEquals(5, table.streamHeaders().toList().size()); + 
CharSequence firstHeader = table.streamHeadersForCell(1, 1).map(TableCellNode::buildTextBlock).map(TextBlock::getSearchText).findFirst().orElseThrow(); + assertEquals("Author(s)", firstHeader.toString().stripTrailing()); + } + + + @Test + public void findAndCheckMultipleSearchTermsMetolachlor() { + + DocumentGraph documentGraph = buildGraph("files/S-Metolachlor_RAR_01_Volume_1_2018-09-06"); + assertValueAndPageAndIntersectingNodes(documentGraph, "sideeffects", 4); + assertValueAndPageAndIntersectingNodes(documentGraph, "Commission Regulation", 9); + assertValueAndPageAndIntersectingNodes(documentGraph, "Pre-emergence", 15); + assertValueAndPageAndIntersectingNodes(documentGraph, "LiChrosorb CN +", 22); + assertValueAndPageAndIntersectingNodes(documentGraph, "RCC856132", 22); + assertValueAndPageAndIntersectingNodes(documentGraph, "Number of references included", 33); + } + + + @Test + public void assertTableStructureMetolachlor() { + + DocumentGraph documentGraph = buildGraph("files/S-Metolachlor_RAR_01_Volume_1_2018-09-06"); + TableNode table = (TableNode) documentGraph.getTableOfContents() + .streamEntriesInOrder() + .filter(entry -> entry.node().getPages().stream().anyMatch(page -> page.getNumber() == 22)) + .filter(entry -> entry.type().equals(NodeType.TABLE)) + .map(TableOfContents.Entry::node) + .findFirst() + .orElseThrow(); + assertEquals(5, table.getNumberOfCols()); + assertEquals(14, table.getNumberOfRows()); + assertEquals(10, table.streamHeaders().toList().size()); + List twoHeaders = table.streamHeadersForCell(2, 1).map(TableCellNode::buildTextBlock).map(TextBlock::getSearchText).toList(); + assertEquals(2, twoHeaders.size()); + assertEquals("Component of residue definition: S-Metolachlor", twoHeaders.get(0).stripTrailing()); + assertEquals("Method type", twoHeaders.get(1).stripTrailing()); + } + + + @Test + public void assertTextBeforeAndTextAfterForParagraphMetolachlor() { + + DocumentGraph documentGraph = 
buildGraph("files/S-Metolachlor_RAR_01_Volume_1_2018-09-06"); + String searchTerm = "Cucurbit"; + int start = documentGraph.getTextBlock().indexOf(searchTerm); + assert start != -1; + + Boundary boundary = new Boundary(start, start + searchTerm.length()); + TestEntity entityNode = TestEntity.initialEntityNode(boundary, "123", "123"); + entityInsertionService.addEntityToGraph(entityNode, documentGraph.getTableOfContents()); + + assertEquals("except Cranberry; Vegetable, ", entityNode.getTextBefore()); + assertEquals(", Group 9;", entityNode.getTextAfter()); + assertEquals("1.1.4 Evaluations carried out under other regulatory contexts ", entityNode.getDeepestFullyContainingNode().getHeadline().buildTextBlock().getSearchText()); + assertEquals(searchTerm, entityNode.getValue()); + assertEquals(2, entityNode.getIntersectingNodes().size()); + assertEquals(5, entityNode.getDeepestFullyContainingNode().getNumberOnPage()); + assertTrue(entityNode.getPages().stream().allMatch(pageNode -> pageNode.getNumber() == 10)); + assertInstanceOf(ParagraphNode.class, entityNode.getDeepestFullyContainingNode()); + + assertSameOffsetInAllIntersectingNodes(searchTerm, start, entityNode); + } + + + @Test + public void assertTextBeforeAndTextAfterForHeadlineMetolachlor() { + + DocumentGraph documentGraph = buildGraph("files/S-Metolachlor_RAR_01_Volume_1_2018-09-06"); + String searchTerm = "absorption, distribution, metabolism"; + int start = documentGraph.getTextBlock().indexOf(searchTerm); + assert start != -1; + start = documentGraph.getTextBlock().indexOf(searchTerm, start + 1); + assert start != -1; + + Boundary boundary = new Boundary(start, start + searchTerm.length()); + TestEntity entityNode = TestEntity.initialEntityNode(boundary, "123", "123"); + entityInsertionService.addEntityToGraph(entityNode, documentGraph.getTableOfContents()); + + assertEquals("2.6.1 Summary of ", entityNode.getTextBefore()); + assertEquals(" and excretion in", entityNode.getTextAfter()); + 
assertEquals("2.6.1 Summary of absorption, distribution, metabolism and excretion in mammals ", + entityNode.getDeepestFullyContainingNode().getHeadline().buildTextBlock().getSearchText()); + assertEquals(searchTerm, entityNode.getValue()); + assertEquals(2, entityNode.getIntersectingNodes().size()); + assertEquals(4, entityNode.getDeepestFullyContainingNode().getNumberOnPage()); + assertTrue(entityNode.getPages().stream().allMatch(pageNode -> pageNode.getNumber() == 33)); + assertInstanceOf(HeadlineNode.class, entityNode.getDeepestFullyContainingNode()); + + assertSameOffsetInAllIntersectingNodes(searchTerm, start, entityNode); + } + + + @Test + public void assertTextBeforeAndTextAfterForTableCellMetolachlor() { + + DocumentGraph documentGraph = buildGraph("files/S-Metolachlor_RAR_01_Volume_1_2018-09-06"); + String searchTerm = "N-deacetylation product"; + int start = documentGraph.getTextBlock().indexOf(searchTerm); + assert start != -1; + + Boundary boundary = new Boundary(start, start + searchTerm.length()); + TestEntity entityNode = TestEntity.initialEntityNode(boundary, "123", "123"); + entityInsertionService.addEntityToGraph(entityNode, documentGraph.getTableOfContents()); + + assertEquals("2-[(2-(1-hydroxy-ethyl)-6methyl-phenyl-amino]propan-1-ol (", entityNode.getTextBefore()); + assertEquals(" of metabolite of", entityNode.getTextAfter()); + assertEquals(searchTerm, entityNode.getValue()); + assertEquals(3, entityNode.getIntersectingNodes().size()); + assertEquals("2.7.2 Summary of metabolism, distribution and expression of residues in plants, poultry, lactating ruminants, pigs and fish ", + entityNode.getDeepestFullyContainingNode().getHeadline().buildTextBlock().getSearchText()); + assertTrue(entityNode.getPages().stream().allMatch(pageNode -> pageNode.getNumber() == 54)); + assertEquals(26, entityNode.getDeepestFullyContainingNode().getNumberOnPage()); + + assertInstanceOf(TableCellNode.class, entityNode.getDeepestFullyContainingNode()); + + 
assertSameOffsetInAllIntersectingNodes(searchTerm, start, entityNode); + } + + + @SneakyThrows + protected DocumentGraph buildGraph(String filename) { + + if (filename.equals("files/S-Metolachlor_RAR_01_Volume_1_2018-09-06")) { + prepareStorage(filename + ".pdf", "cv_table_parsing_response/empty.json", "image_service_response/S-Metolachlor_RAR_01_Volume_1_2018-09-06.IMAGE_INFO.json"); + } else { + prepareStorage(filename + ".pdf"); + } + ClassPathResource fileResource = new ClassPathResource(filename + ".pdf"); + + try (InputStream inputStream = fileResource.getInputStream()) { + PDDocument pdDocument = Loader.loadPDF(inputStream); + return layoutParsingService.parseLayout(pdDocument, layoutParsingStorageService.getImagesFile(IMAGE_FILE_ID), new TableServiceResponse()); + } + } + + + private static void assertSameOffsetInAllIntersectingNodes(String searchTerm, int start, EntityNode entityNode) { + + List paragraphStart = entityNode.getIntersectingNodes().stream()// + .map(SemanticNode::buildTextBlock)// + .map(textBlock -> textBlock.indexOf(searchTerm))// + .toList(); + + paragraphStart.forEach(nodeStart -> assertEquals(start, nodeStart)); + } + + + private void assertValueAndPageAndIntersectingNodes(DocumentGraph documentGraph, String searchTerm, int pageNumber) { + + int start = documentGraph.getTextBlock().indexOf(searchTerm); + + assert start != -1; + + Boundary boundary = new Boundary(start, start + searchTerm.length()); + TestEntity entityNode = TestEntity.initialEntityNode(boundary, "123", "123"); + entityInsertionService.addEntityToGraph(entityNode, documentGraph.getTableOfContents()); + PageNode pageNode = documentGraph.getPages().stream().filter(page -> page.getNumber() == pageNumber).findFirst().orElseThrow(); + + assertEquals(entityNode.getValue(), searchTerm); + assertTrue(pageNode.getEntities().contains(entityNode)); + assertTrue(documentGraph.getPages().stream().filter(page -> page != pageNode).noneMatch(page -> 
page.getEntities().contains(entityNode))); + assertTrue(entityNode.getPages().contains(pageNode)); + assertSameOffsetInAllIntersectingNodes(searchTerm, start, entityNode); + assertTrue(entityNode.getIntersectingNodes().stream().allMatch(node -> node.getEntities().contains(entityNode))); + } + +} diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphVisualizationTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphVisualizationTest.java new file mode 100644 index 0000000..0e252e4 --- /dev/null +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphVisualizationTest.java @@ -0,0 +1,63 @@ +package com.knecon.fforesight.service.layoutparser.server.graph; + +import java.awt.Color; +import java.io.File; +import java.io.IOException; + +import org.apache.pdfbox.Loader; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.Test; +import org.springframework.core.io.ClassPathResource; + +import com.knecon.fforesight.service.layoutparser.internal.api.graph.DocumentGraph; +import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock; +import com.knecon.fforesight.service.layoutparser.server.visualizations.PdfDraw; + +import lombok.SneakyThrows; + + +@Disabled +public class DocumentGraphVisualizationTest extends DocumentGraphTest { + + @Test + @SneakyThrows + public void visualizeMetolachlor() { + + String filename = "files/S-Metolachlor_RAR_01_Volume_1_2018-09-06"; + + DocumentGraph documentGraph = buildGraph(filename); + TextBlock textBlock = documentGraph.buildTextBlock(); + + visualizeSemanticNodes(filename, documentGraph, textBlock); + } + + + @Test + @SneakyThrows + public void visualizeRotatedTestDocument() { + + String 
filename = "files/RotateTestFileWithImages"; + + DocumentGraph documentGraph = buildGraph(filename); + TextBlock textBlock = documentGraph.buildTextBlock(); + + visualizeSemanticNodes(filename, documentGraph, textBlock); + } + + + private static void visualizeSemanticNodes(String filename, DocumentGraph documentGraph, TextBlock textBlock) throws IOException { + + File tmpFile = File.createTempFile(filename, "SEMANTIC_NODES_BBOX.pdf"); + ClassPathResource fileResource = new ClassPathResource(filename + ".pdf"); + + try (var fileStream = fileResource.getInputStream()) { + PDDocument pdDocument = Loader.loadPDF(fileStream); + PdfDraw.drawDocumentGraph(pdDocument, documentGraph); + PdfDraw.drawTextBlock(pdDocument, textBlock, PdfDraw.Options.builder().stroke(true).strokeWidth(0.1f).strokeColor(Color.YELLOW).build()); + pdDocument.save(tmpFile); + pdDocument.close(); + } + } + +} diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/visualizations/PdfDraw.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/visualizations/PdfDraw.java new file mode 100644 index 0000000..6113d09 --- /dev/null +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/visualizations/PdfDraw.java @@ -0,0 +1,168 @@ +package com.knecon.fforesight.service.layoutparser.server.visualizations; + +import java.awt.Color; +import java.awt.geom.Point2D; +import java.awt.geom.Rectangle2D; +import java.io.IOException; +import java.util.List; +import java.util.Map; + +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.PDPage; +import org.apache.pdfbox.pdmodel.PDPageContentStream; +import org.apache.pdfbox.pdmodel.font.PDType1Font; +import org.apache.pdfbox.pdmodel.font.Standard14Fonts; +import org.apache.pdfbox.util.Matrix; + +import 
com.knecon.fforesight.service.layoutparser.internal.api.graph.DocumentGraph; +import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents; +import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.NodeType; +import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode; +import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.AtomicTextBlock; +import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock; +import com.knecon.fforesight.service.layoutparser.processor.factory.RectangleTransformations; + +import lombok.AccessLevel; +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Getter; +import lombok.NoArgsConstructor; +import lombok.SneakyThrows; +import lombok.experimental.FieldDefaults; +import lombok.experimental.UtilityClass; + +@UtilityClass +public class PdfDraw { + + public static void drawDocumentGraph(PDDocument document, DocumentGraph documentGraph) { + + documentGraph.getTableOfContents().streamEntriesInOrder().forEach(entry -> drawNode(document, entry)); + } + + + public static void drawNode(PDDocument document, TableOfContents.Entry entry) { + + Options options = buildStandardOptionsForNodes(entry); + + drawBBoxAndLabelAndNumberOnPage(document, entry, options); + + } + + + public static void drawTextBlock(PDDocument document, TextBlock textBlock, Options options) { + + textBlock.getAtomicTextBlocks().forEach(atb -> drawAtomicTextBlock(document, atb, options)); + } + + + public static void drawAtomicTextBlock(PDDocument document, AtomicTextBlock atomicTextBlock, Options options) { + + drawRectangle2DList(document, atomicTextBlock.getPage().getNumber(), atomicTextBlock.getPositions().stream().toList(), options); + + } + + + @SneakyThrows + private static void drawText(String string, PDDocument document, Point2D location, Integer pageNumber, Options options, boolean rotate) { + + var pdPage = 
document.getPage(pageNumber - 1); + var contentStream = new PDPageContentStream(document, pdPage, PDPageContentStream.AppendMode.APPEND, true); + + contentStream.setNonStrokingColor(options.getStrokeColor()); + contentStream.setLineWidth(options.getStrokeWidth()); + + contentStream.beginText(); + contentStream.setTextMatrix(Matrix.getRotateInstance(Math.toRadians(30), 0, 0)); + contentStream.newLineAtOffset((float) location.getX(), (float) location.getY()); + contentStream.setFont(new PDType1Font(Standard14Fonts.FontName.HELVETICA), 10); + contentStream.showText(string); + contentStream.endText(); + contentStream.close(); + } + + + @SneakyThrows + public static void drawRectangle2DList(PDDocument document, int pageNumber, List rectCollection, Options options) { + + var pdPage = document.getPage(pageNumber - 1); + drawRectangle2DList(document, rectCollection, options, pdPage); + } + + + private static void drawRectangle2DList(PDDocument document, List rectCollection, Options options, PDPage pdPage) throws IOException { + + var contentStream = new PDPageContentStream(document, pdPage, PDPageContentStream.AppendMode.APPEND, true); + + contentStream.setStrokingColor(options.getStrokeColor()); + contentStream.setNonStrokingColor(options.getFillColor()); + contentStream.setLineWidth(options.getStrokeWidth()); + + for (var r : rectCollection) { + contentStream.addRect((float) r.getMinX(), (float) r.getMinY(), (float) r.getWidth(), (float) r.getHeight()); + + if (options.isStroke() && options.isFill()) { + contentStream.fillAndStroke(); + } else if (options.isStroke()) { + contentStream.stroke(); + } else if (options.isFill()) { + contentStream.fill(); + } + } + contentStream.close(); + } + + + @Builder + @AllArgsConstructor + @NoArgsConstructor + @Getter + @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) + public static class Options { + + @Builder.Default + boolean stroke = false; + @Builder.Default + Color strokeColor = Color.BLACK; + @Builder.Default + 
float strokeWidth = 1f; + @Builder.Default + boolean fill = false; + @Builder.Default + Color fillColor = Color.BLACK; + + } + + + private static Options buildStandardOptionsForNodes(TableOfContents.Entry entry) { + + return Options.builder().stroke(true).strokeColor(switch (entry.type()) { + case HEADER, FOOTER -> Color.GREEN; + case PARAGRAPH -> Color.BLUE; + case HEADLINE -> Color.RED; + case SECTION -> Color.BLACK; + case TABLE -> Color.ORANGE; + case TABLE_CELL -> Color.GRAY; + case IMAGE -> Color.MAGENTA; + }).build(); + } + + + private static void drawBBoxAndLabelAndNumberOnPage(PDDocument document, TableOfContents.Entry entry, Options options) { + + Map rectanglesPerPage = entry.node().getBBox(); + rectanglesPerPage.forEach((page, rectangle2D) -> { + if (entry.type() == NodeType.SECTION) { + rectangle2D = RectangleTransformations.pad(rectangle2D, 10, 10); + } + drawRectangle2DList(document, page.getNumber(), List.of(rectangle2D), options); + drawText(buildString(entry), document, new Point2D.Double(rectangle2D.getMinX(), rectangle2D.getMaxY() + 2), page.getNumber(), options, entry.type() == NodeType.TABLE_CELL); + }); + } + + + private static String buildString(TableOfContents.Entry entry) { + + return entry.node().getNumberOnPage() + ": " + entry.tocId() + ": " + entry.type().toString(); + } + +} \ No newline at end of file diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/cv_table_parsing_response/empty.json b/layoutparser-service/layoutparser-service-server/src/test/resources/cv_table_parsing_response/empty.json new file mode 100644 index 0000000..af83f68 --- /dev/null +++ b/layoutparser-service/layoutparser-service-server/src/test/resources/cv_table_parsing_response/empty.json @@ -0,0 +1,8 @@ +{ + "dossierId": "123", + "fileId": "123", + "operation": "table", + "targetFileExtension": "ORIGIN.pdf.gz", + "responseFileExtension": "TABLES.json.gz", + "data": [] +} diff --git 
a/layoutparser-service/layoutparser-service-server/src/test/resources/files/RotateTestFileWithImages.pdf b/layoutparser-service/layoutparser-service-server/src/test/resources/files/RotateTestFileWithImages.pdf new file mode 100644 index 0000000..2b009d1 Binary files /dev/null and b/layoutparser-service/layoutparser-service-server/src/test/resources/files/RotateTestFileWithImages.pdf differ diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf b/layoutparser-service/layoutparser-service-server/src/test/resources/files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf new file mode 100644 index 0000000..a145741 Binary files /dev/null and b/layoutparser-service/layoutparser-service-server/src/test/resources/files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf differ diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/files/crafted document.pdf b/layoutparser-service/layoutparser-service-server/src/test/resources/files/crafted document.pdf new file mode 100644 index 0000000..be18a14 Binary files /dev/null and b/layoutparser-service/layoutparser-service-server/src/test/resources/files/crafted document.pdf differ diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/image_service_response/S-Metolachlor_RAR_01_Volume_1_2018-09-06.IMAGE_INFO.json b/layoutparser-service/layoutparser-service-server/src/test/resources/image_service_response/S-Metolachlor_RAR_01_Volume_1_2018-09-06.IMAGE_INFO.json new file mode 100644 index 0000000..c44818b --- /dev/null +++ b/layoutparser-service/layoutparser-service-server/src/test/resources/image_service_response/S-Metolachlor_RAR_01_Volume_1_2018-09-06.IMAGE_INFO.json @@ -0,0 +1,638 @@ +{ + "dossierId": "123", + "fileId": "123", + "targetFileExtension": "ORIGIN.pdf.gz", + "responseFileExtension": "IMAGE_INFO.json.gz", + "data": [ + { + "classification": { + "label": "formula", + "probabilities": { + "formula": 1.0, 
+ "logo": 0.0, + "other": 0.0, + "signature": 0.0 + } + }, + "representation": "DE7BE326BAC8CC872F3EF1FFC", + "position": { + "x1": 259, + "x2": 369, + "y1": 372, + "y2": 465, + "pageNumber": 54 + }, + "geometry": { + "width": 110, + "height": 93 + }, + "alpha": false, + "filters": { + "geometry": { + "imageSize": { + "quotient": 0.1429, + "tooLarge": false, + "tooSmall": false + }, + "imageFormat": { + "quotient": 1.1828, + "tooTall": false, + "tooWide": false + } + }, + "probability": { + "unconfident": false + }, + "allPassed": true + } + }, + { + "classification": { + "label": "formula", + "probabilities": { + "formula": 1.0, + "logo": 0.0, + "other": 0.0, + "signature": 0.0 + } + }, + "representation": "BDF2E766B088D8976E3EFBFFD", + "position": { + "x1": 259, + "x2": 370, + "y1": 558, + "y2": 633, + "pageNumber": 54 + }, + "geometry": { + "width": 111, + "height": 75 + }, + "alpha": false, + "filters": { + "geometry": { + "imageSize": { + "quotient": 0.1289, + "tooLarge": false, + "tooSmall": false + }, + "imageFormat": { + "quotient": 1.48, + "tooTall": false, + "tooWide": false + } + }, + "probability": { + "unconfident": false + }, + "allPassed": true + } + }, + { + "classification": { + "label": "formula", + "probabilities": { + "formula": 1.0, + "logo": 0.0, + "other": 0.0, + "signature": 0.0 + } + }, + "representation": "FCFBE026A8F86A802F3EFBFFE", + "position": { + "x1": 259, + "x2": 370, + "y1": 476, + "y2": 552, + "pageNumber": 54 + }, + "geometry": { + "width": 111, + "height": 76 + }, + "alpha": false, + "filters": { + "geometry": { + "imageSize": { + "quotient": 0.1298, + "tooLarge": false, + "tooSmall": false + }, + "imageFormat": { + "quotient": 1.4605, + "tooTall": false, + "tooWide": false + } + }, + "probability": { + "unconfident": false + }, + "allPassed": true + } + }, + { + "classification": { + "label": "formula", + "probabilities": { + "formula": 1.0, + "logo": 0.0, + "other": 0.0, + "signature": 0.0 + } + }, + "representation": 
"F3EF89CF8C7262A33E9EFAFFE", + "position": { + "x1": 259, + "x2": 372, + "y1": 274, + "y2": 366, + "pageNumber": 54 + }, + "geometry": { + "width": 113, + "height": 92 + }, + "alpha": false, + "filters": { + "geometry": { + "imageSize": { + "quotient": 0.1441, + "tooLarge": false, + "tooSmall": false + }, + "imageFormat": { + "quotient": 1.2283, + "tooTall": false, + "tooWide": false + } + }, + "probability": { + "unconfident": false + }, + "allPassed": true + } + }, + { + "classification": { + "label": "formula", + "probabilities": { + "formula": 1.0, + "logo": 0.0, + "other": 0.0, + "signature": 0.0 + } + }, + "representation": "FBFFC72232C8C8872F3EF1FFC", + "position": { + "x1": 259, + "x2": 372, + "y1": 183, + "y2": 268, + "pageNumber": 54 + }, + "geometry": { + "width": 113, + "height": 85 + }, + "alpha": false, + "filters": { + "geometry": { + "imageSize": { + "quotient": 0.1385, + "tooLarge": false, + "tooSmall": false + }, + "imageFormat": { + "quotient": 1.3294, + "tooTall": false, + "tooWide": false + } + }, + "probability": { + "unconfident": false + }, + "allPassed": true + } + }, + { + "classification": { + "label": "formula", + "probabilities": { + "formula": 1.0, + "logo": 0.0, + "other": 0.0, + "signature": 0.0 + } + }, + "representation": "FBFD873286106785EF78F0CF1", + "position": { + "x1": 259, + "x2": 449, + "y1": 111, + "y2": 177, + "pageNumber": 54 + }, + "geometry": { + "width": 190, + "height": 66 + }, + "alpha": false, + "filters": { + "geometry": { + "imageSize": { + "quotient": 0.1582, + "tooLarge": false, + "tooSmall": false + }, + "imageFormat": { + "quotient": 2.8788, + "tooTall": false, + "tooWide": false + } + }, + "probability": { + "unconfident": false + }, + "allPassed": true + } + }, + { + "classification": { + "label": "formula", + "probabilities": { + "formula": 1.0, + "logo": 0.0, + "other": 0.0, + "signature": 0.0 + } + }, + "representation": "7EF1C79E5B016ACFFFBD04442", + "position": { + "x1": 259, + "x2": 372, + "y1": 464, + 
"y2": 538, + "pageNumber": 55 + }, + "geometry": { + "width": 113, + "height": 74 + }, + "alpha": false, + "filters": { + "geometry": { + "imageSize": { + "quotient": 0.1292, + "tooLarge": false, + "tooSmall": false + }, + "imageFormat": { + "quotient": 1.527, + "tooTall": false, + "tooWide": false + } + }, + "probability": { + "unconfident": false + }, + "allPassed": true + } + }, + { + "classification": { + "label": "formula", + "probabilities": { + "formula": 1.0, + "logo": 0.0, + "other": 0.0, + "signature": 0.0 + } + }, + "representation": "F9F7ED37781D208B3FAEF0CF4", + "position": { + "x1": 259, + "x2": 440, + "y1": 628, + "y2": 721, + "pageNumber": 55 + }, + "geometry": { + "width": 181, + "height": 93 + }, + "alpha": false, + "filters": { + "geometry": { + "imageSize": { + "quotient": 0.1833, + "tooLarge": false, + "tooSmall": false + }, + "imageFormat": { + "quotient": 1.9462, + "tooTall": false, + "tooWide": false + } + }, + "probability": { + "unconfident": false + }, + "allPassed": true + } + }, + { + "classification": { + "label": "formula", + "probabilities": { + "formula": 1.0, + "logo": 0.0, + "other": 0.0, + "signature": 0.0 + } + }, + "representation": "FD778D164B00E389FF9EF0CF4", + "position": { + "x1": 259, + "x2": 440, + "y1": 544, + "y2": 622, + "pageNumber": 55 + }, + "geometry": { + "width": 181, + "height": 78 + }, + "alpha": false, + "filters": { + "geometry": { + "imageSize": { + "quotient": 0.1679, + "tooLarge": false, + "tooSmall": false + }, + "imageFormat": { + "quotient": 2.3205, + "tooTall": false, + "tooWide": false + } + }, + "probability": { + "unconfident": false + }, + "allPassed": true + } + }, + { + "classification": { + "label": "other", + "probabilities": { + "other": 0.9342, + "signature": 0.0474, + "logo": 0.0183, + "formula": 0.0002 + } + }, + "representation": "00CFFEF9F7EF9F7EF9F7FF3", + "position": { + "x1": 537, + "x2": 707, + "y1": 462, + "y2": 511, + "pageNumber": 74 + }, + "geometry": { + "width": 170, + "height": 
49 + }, + "alpha": false, + "filters": { + "geometry": { + "imageSize": { + "quotient": 0.1289, + "tooLarge": false, + "tooSmall": false + }, + "imageFormat": { + "quotient": 3.4694, + "tooTall": false, + "tooWide": false + } + }, + "probability": { + "unconfident": false + }, + "allPassed": true + } + }, + { + "classification": { + "label": "other", + "probabilities": { + "other": 0.9435, + "signature": 0.0403, + "logo": 0.016, + "formula": 0.0001 + } + }, + "representation": "00CFFFFFFFFFFFFFFFFFFF3", + "position": { + "x1": 537, + "x2": 707, + "y1": 437, + "y2": 453, + "pageNumber": 74 + }, + "geometry": { + "width": 170, + "height": 16 + }, + "alpha": false, + "filters": { + "geometry": { + "imageSize": { + "quotient": 0.0737, + "tooLarge": false, + "tooSmall": false + }, + "imageFormat": { + "quotient": 10.625, + "tooTall": false, + "tooWide": true + } + }, + "probability": { + "unconfident": false + }, + "allPassed": false + } + }, + { + "classification": { + "label": "other", + "probabilities": { + "other": 0.9998, + "logo": 0.0001, + "formula": 0.0, + "signature": 0.0 + } + }, + "representation": "FFFFFFFFFFFFFFFFFFFFFFFFF", + "position": { + "x1": 442, + "x2": 499, + "y1": 308, + "y2": 351, + "pageNumber": 154 + }, + "geometry": { + "width": 57, + "height": 43 + }, + "alpha": false, + "filters": { + "geometry": { + "imageSize": { + "quotient": 0.0699, + "tooLarge": false, + "tooSmall": false + }, + "imageFormat": { + "quotient": 1.3256, + "tooTall": false, + "tooWide": false + } + }, + "probability": { + "unconfident": false + }, + "allPassed": true + } + }, + { + "classification": { + "label": "other", + "probabilities": { + "other": 1.0, + "formula": 0.0, + "logo": 0.0, + "signature": 0.0 + } + }, + "representation": "2C3083D04BD8CF2FFCF3F7EFF", + "position": { + "x1": 71, + "x2": 524, + "y1": 318, + "y2": 771, + "pageNumber": 165 + }, + "geometry": { + "width": 453, + "height": 453 + }, + "alpha": false, + "filters": { + "geometry": { + "imageSize": { + 
"quotient": 0.64, + "tooLarge": false, + "tooSmall": false + }, + "imageFormat": { + "quotient": 1.0, + "tooTall": false, + "tooWide": false + } + }, + "probability": { + "unconfident": false + }, + "allPassed": true + } + }, + { + "classification": { + "label": "other", + "probabilities": { + "other": 1.0, + "formula": 0.0, + "logo": 0.0, + "signature": 0.0 + } + }, + "representation": "FFF0010400FFECD31C60FFCFF", + "position": { + "x1": 93, + "x2": 502, + "y1": 374, + "y2": 495, + "pageNumber": 185 + }, + "geometry": { + "width": 409, + "height": 121 + }, + "alpha": false, + "filters": { + "geometry": { + "imageSize": { + "quotient": 0.3143, + "tooLarge": false, + "tooSmall": false + }, + "imageFormat": { + "quotient": 3.3802, + "tooTall": false, + "tooWide": false + } + }, + "probability": { + "unconfident": false + }, + "allPassed": true + } + }, + { + "classification": { + "label": "formula", + "probabilities": { + "formula": 1.0, + "logo": 0.0, + "other": 0.0, + "signature": 0.0 + } + }, + "representation": "33F5D7760837FFF3660877FFF", + "position": { + "x1": 158, + "x2": 437, + "y1": 538, + "y2": 771, + "pageNumber": 185 + }, + "geometry": { + "width": 279, + "height": 233 + }, + "alpha": false, + "filters": { + "geometry": { + "imageSize": { + "quotient": 0.3602, + "tooLarge": false, + "tooSmall": false + }, + "imageFormat": { + "quotient": 1.1974, + "tooTall": false, + "tooWide": false + } + }, + "probability": { + "unconfident": false + }, + "allPassed": true + } + } + ] +} \ No newline at end of file diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/image_service_response/empty.json b/layoutparser-service/layoutparser-service-server/src/test/resources/image_service_response/empty.json new file mode 100644 index 0000000..856ee5d --- /dev/null +++ b/layoutparser-service/layoutparser-service-server/src/test/resources/image_service_response/empty.json @@ -0,0 +1,5 @@ +{ + "dossierId": "f889853e-4bf8-49a9-aae5-c38605c6ef40", + 
"fileId": "22ef63e29bb2a27db8497272336f6b32", + "data": [] +} diff --git a/layoutparser-service/pom.xml b/layoutparser-service/pom.xml new file mode 100644 index 0000000..8c8cc55 --- /dev/null +++ b/layoutparser-service/pom.xml @@ -0,0 +1,107 @@ + + + 4.0.0 + + + com.knecon.fforesight + layoutparser + 1.0.0 + + + layoutparser-service + 1.0.0 + + pom + + + layoutparser-service-processor + layoutparser-service-internal-api + layoutparser-service-server + + + + 17 + 2.13.2 + 2.0.7 + 3.0.0-alpha2 + 3.0.1 + 2022.0.1 + 2.15.0-rc2 + 1.9.9 + UTF-8 + + + + + + org.springdoc + springdoc-openapi-ui + 1.6.13 + + + + + + + + + org.sonarsource.scanner.maven + sonar-maven-plugin + 3.9.0.2155 + + + org.owasp + dependency-check-maven + 6.3.1 + + ALL + + + + org.jacoco + jacoco-maven-plugin + + + prepare-agent + + prepare-agent + + + + report + + report + + + + + + + + + org.jacoco + jacoco-maven-plugin + 0.8.8 + + + prepare-agent + + prepare-agent + + + + report + + report-aggregate + + verify + + + + + + + + + diff --git a/mvnw b/mvnw new file mode 100755 index 0000000..8a8fb22 --- /dev/null +++ b/mvnw @@ -0,0 +1,316 @@ +#!/bin/sh +# ---------------------------------------------------------------------------- +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. +# ---------------------------------------------------------------------------- + +# ---------------------------------------------------------------------------- +# Maven Start Up Batch script +# +# Required ENV vars: +# ------------------ +# JAVA_HOME - location of a JDK home dir +# +# Optional ENV vars +# ----------------- +# M2_HOME - location of maven2's installed home dir +# MAVEN_OPTS - parameters passed to the Java VM when running Maven +# e.g. to debug Maven itself, use +# set MAVEN_OPTS=-Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=y,address=8000 +# MAVEN_SKIP_RC - flag to disable loading of mavenrc files +# ---------------------------------------------------------------------------- + +if [ -z "$MAVEN_SKIP_RC" ] ; then + + if [ -f /usr/local/etc/mavenrc ] ; then + . /usr/local/etc/mavenrc + fi + + if [ -f /etc/mavenrc ] ; then + . /etc/mavenrc + fi + + if [ -f "$HOME/.mavenrc" ] ; then + . "$HOME/.mavenrc" + fi + +fi + +# OS specific support. $var _must_ be set to either true or false. 
+cygwin=false; +darwin=false; +mingw=false +case "`uname`" in + CYGWIN*) cygwin=true ;; + MINGW*) mingw=true;; + Darwin*) darwin=true + # Use /usr/libexec/java_home if available, otherwise fall back to /Library/Java/Home + # See https://developer.apple.com/library/mac/qa/qa1170/_index.html + if [ -z "$JAVA_HOME" ]; then + if [ -x "/usr/libexec/java_home" ]; then + export JAVA_HOME="`/usr/libexec/java_home`" + else + export JAVA_HOME="/Library/Java/Home" + fi + fi + ;; +esac + +if [ -z "$JAVA_HOME" ] ; then + if [ -r /etc/gentoo-release ] ; then + JAVA_HOME=`java-config --jre-home` + fi +fi + +if [ -z "$M2_HOME" ] ; then + ## resolve links - $0 may be a link to maven's home + PRG="$0" + + # need this for relative symlinks + while [ -h "$PRG" ] ; do + ls=`ls -ld "$PRG"` + link=`expr "$ls" : '.*-> \(.*\)$'` + if expr "$link" : '/.*' > /dev/null; then + PRG="$link" + else + PRG="`dirname "$PRG"`/$link" + fi + done + + saveddir=`pwd` + + M2_HOME=`dirname "$PRG"`/.. + + # make it fully qualified + M2_HOME=`cd "$M2_HOME" && pwd` + + cd "$saveddir" + # echo Using m2 at $M2_HOME +fi + +# For Cygwin, ensure paths are in UNIX format before anything is touched +if $cygwin ; then + [ -n "$M2_HOME" ] && + M2_HOME=`cygpath --unix "$M2_HOME"` + [ -n "$JAVA_HOME" ] && + JAVA_HOME=`cygpath --unix "$JAVA_HOME"` + [ -n "$CLASSPATH" ] && + CLASSPATH=`cygpath --path --unix "$CLASSPATH"` +fi + +# For Mingw, ensure paths are in UNIX format before anything is touched +if $mingw ; then + [ -n "$M2_HOME" ] && + M2_HOME="`(cd "$M2_HOME"; pwd)`" + [ -n "$JAVA_HOME" ] && + JAVA_HOME="`(cd "$JAVA_HOME"; pwd)`" +fi + +if [ -z "$JAVA_HOME" ]; then + javaExecutable="`which javac`" + if [ -n "$javaExecutable" ] && ! [ "`expr \"$javaExecutable\" : '\([^ ]*\)'`" = "no" ]; then + # readlink(1) is not available as standard on Solaris 10. + readLink=`which readlink` + if [ ! 
`expr "$readLink" : '\([^ ]*\)'` = "no" ]; then + if $darwin ; then + javaHome="`dirname \"$javaExecutable\"`" + javaExecutable="`cd \"$javaHome\" && pwd -P`/javac" + else + javaExecutable="`readlink -f \"$javaExecutable\"`" + fi + javaHome="`dirname \"$javaExecutable\"`" + javaHome=`expr "$javaHome" : '\(.*\)/bin'` + JAVA_HOME="$javaHome" + export JAVA_HOME + fi + fi +fi + +if [ -z "$JAVACMD" ] ; then + if [ -n "$JAVA_HOME" ] ; then + if [ -x "$JAVA_HOME/jre/sh/java" ] ; then + # IBM's JDK on AIX uses strange locations for the executables + JAVACMD="$JAVA_HOME/jre/sh/java" + else + JAVACMD="$JAVA_HOME/bin/java" + fi + else + JAVACMD="`\\unset -f command; \\command -v java`" + fi +fi + +if [ ! -x "$JAVACMD" ] ; then + echo "Error: JAVA_HOME is not defined correctly." >&2 + echo " We cannot execute $JAVACMD" >&2 + exit 1 +fi + +if [ -z "$JAVA_HOME" ] ; then + echo "Warning: JAVA_HOME environment variable is not set." +fi + +CLASSWORLDS_LAUNCHER=org.codehaus.plexus.classworlds.launcher.Launcher + +# traverses directory structure from process work directory to filesystem root +# first directory with .mvn subdirectory is considered project base directory +find_maven_basedir() { + + if [ -z "$1" ] + then + echo "Path not specified to find_maven_basedir" + return 1 + fi + + basedir="$1" + wdir="$1" + while [ "$wdir" != '/' ] ; do + if [ -d "$wdir"/.mvn ] ; then + basedir=$wdir + break + fi + # workaround for JBEAP-8937 (on Solaris 10/Sparc) + if [ -d "${wdir}" ]; then + wdir=`cd "$wdir/.."; pwd` + fi + # end of workaround + done + echo "${basedir}" +} + +# concatenates all lines of a file +concat_lines() { + if [ -f "$1" ]; then + echo "$(tr -s '\n' ' ' < "$1")" + fi +} + +BASE_DIR=`find_maven_basedir "$(pwd)"` +if [ -z "$BASE_DIR" ]; then + exit 1; +fi + +########################################################################################## +# Extension to allow automatically downloading the maven-wrapper.jar from Maven-central +# This allows using the maven wrapper 
in projects that prohibit checking in binary data. +########################################################################################## +if [ -r "$BASE_DIR/.mvn/wrapper/maven-wrapper.jar" ]; then + if [ "$MVNW_VERBOSE" = true ]; then + echo "Found .mvn/wrapper/maven-wrapper.jar" + fi +else + if [ "$MVNW_VERBOSE" = true ]; then + echo "Couldn't find .mvn/wrapper/maven-wrapper.jar, downloading it ..." + fi + if [ -n "$MVNW_REPOURL" ]; then + jarUrl="$MVNW_REPOURL/org/apache/maven/wrapper/maven-wrapper/3.1.0/maven-wrapper-3.1.0.jar" + else + jarUrl="https://repo.maven.apache.org/maven2/org/apache/maven/wrapper/maven-wrapper/3.1.0/maven-wrapper-3.1.0.jar" + fi + while IFS="=" read key value; do + case "$key" in (wrapperUrl) jarUrl="$value"; break ;; + esac + done < "$BASE_DIR/.mvn/wrapper/maven-wrapper.properties" + if [ "$MVNW_VERBOSE" = true ]; then + echo "Downloading from: $jarUrl" + fi + wrapperJarPath="$BASE_DIR/.mvn/wrapper/maven-wrapper.jar" + if $cygwin; then + wrapperJarPath=`cygpath --path --windows "$wrapperJarPath"` + fi + + if command -v wget > /dev/null; then + if [ "$MVNW_VERBOSE" = true ]; then + echo "Found wget ... using wget" + fi + if [ -z "$MVNW_USERNAME" ] || [ -z "$MVNW_PASSWORD" ]; then + wget "$jarUrl" -O "$wrapperJarPath" || rm -f "$wrapperJarPath" + else + wget --http-user=$MVNW_USERNAME --http-password=$MVNW_PASSWORD "$jarUrl" -O "$wrapperJarPath" || rm -f "$wrapperJarPath" + fi + elif command -v curl > /dev/null; then + if [ "$MVNW_VERBOSE" = true ]; then + echo "Found curl ... 
using curl" + fi + if [ -z "$MVNW_USERNAME" ] || [ -z "$MVNW_PASSWORD" ]; then + curl -o "$wrapperJarPath" "$jarUrl" -f + else + curl --user $MVNW_USERNAME:$MVNW_PASSWORD -o "$wrapperJarPath" "$jarUrl" -f + fi + + else + if [ "$MVNW_VERBOSE" = true ]; then + echo "Falling back to using Java to download" + fi + javaClass="$BASE_DIR/.mvn/wrapper/MavenWrapperDownloader.java" + # For Cygwin, switch paths to Windows format before running javac + if $cygwin; then + javaClass=`cygpath --path --windows "$javaClass"` + fi + if [ -e "$javaClass" ]; then + if [ ! -e "$BASE_DIR/.mvn/wrapper/MavenWrapperDownloader.class" ]; then + if [ "$MVNW_VERBOSE" = true ]; then + echo " - Compiling MavenWrapperDownloader.java ..." + fi + # Compiling the Java class + ("$JAVA_HOME/bin/javac" "$javaClass") + fi + if [ -e "$BASE_DIR/.mvn/wrapper/MavenWrapperDownloader.class" ]; then + # Running the downloader + if [ "$MVNW_VERBOSE" = true ]; then + echo " - Running MavenWrapperDownloader.java ..." + fi + ("$JAVA_HOME/bin/java" -cp .mvn/wrapper MavenWrapperDownloader "$MAVEN_PROJECTBASEDIR") + fi + fi + fi +fi +########################################################################################## +# End of extension +########################################################################################## + +export MAVEN_PROJECTBASEDIR=${MAVEN_BASEDIR:-"$BASE_DIR"} +if [ "$MVNW_VERBOSE" = true ]; then + echo $MAVEN_PROJECTBASEDIR +fi +MAVEN_OPTS="$(concat_lines "$MAVEN_PROJECTBASEDIR/.mvn/jvm.config") $MAVEN_OPTS" + +# For Cygwin, switch paths to Windows format before running java +if $cygwin; then + [ -n "$M2_HOME" ] && + M2_HOME=`cygpath --path --windows "$M2_HOME"` + [ -n "$JAVA_HOME" ] && + JAVA_HOME=`cygpath --path --windows "$JAVA_HOME"` + [ -n "$CLASSPATH" ] && + CLASSPATH=`cygpath --path --windows "$CLASSPATH"` + [ -n "$MAVEN_PROJECTBASEDIR" ] && + MAVEN_PROJECTBASEDIR=`cygpath --path --windows "$MAVEN_PROJECTBASEDIR"` +fi + +# Provide a "standardized" way to retrieve the CLI args 
that will +# work with both Windows and non-Windows executions. +MAVEN_CMD_LINE_ARGS="$MAVEN_CONFIG $@" +export MAVEN_CMD_LINE_ARGS + +WRAPPER_LAUNCHER=org.apache.maven.wrapper.MavenWrapperMain + +exec "$JAVACMD" \ + $MAVEN_OPTS \ + $MAVEN_DEBUG_OPTS \ + -classpath "$MAVEN_PROJECTBASEDIR/.mvn/wrapper/maven-wrapper.jar" \ + "-Dmaven.home=${M2_HOME}" \ + "-Dmaven.multiModuleProjectDirectory=${MAVEN_PROJECTBASEDIR}" \ + ${WRAPPER_LAUNCHER} $MAVEN_CONFIG "$@" diff --git a/mvnw.cmd b/mvnw.cmd new file mode 100644 index 0000000..1d8ab01 --- /dev/null +++ b/mvnw.cmd @@ -0,0 +1,188 @@ +@REM ---------------------------------------------------------------------------- +@REM Licensed to the Apache Software Foundation (ASF) under one +@REM or more contributor license agreements. See the NOTICE file +@REM distributed with this work for additional information +@REM regarding copyright ownership. The ASF licenses this file +@REM to you under the Apache License, Version 2.0 (the +@REM "License"); you may not use this file except in compliance +@REM with the License. You may obtain a copy of the License at +@REM +@REM https://www.apache.org/licenses/LICENSE-2.0 +@REM +@REM Unless required by applicable law or agreed to in writing, +@REM software distributed under the License is distributed on an +@REM "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +@REM KIND, either express or implied. See the License for the +@REM specific language governing permissions and limitations +@REM under the License. 
+@REM ---------------------------------------------------------------------------- + +@REM ---------------------------------------------------------------------------- +@REM Maven Start Up Batch script +@REM +@REM Required ENV vars: +@REM JAVA_HOME - location of a JDK home dir +@REM +@REM Optional ENV vars +@REM M2_HOME - location of maven2's installed home dir +@REM MAVEN_BATCH_ECHO - set to 'on' to enable the echoing of the batch commands +@REM MAVEN_BATCH_PAUSE - set to 'on' to wait for a keystroke before ending +@REM MAVEN_OPTS - parameters passed to the Java VM when running Maven +@REM e.g. to debug Maven itself, use +@REM set MAVEN_OPTS=-Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=y,address=8000 +@REM MAVEN_SKIP_RC - flag to disable loading of mavenrc files +@REM ---------------------------------------------------------------------------- + +@REM Begin all REM lines with '@' in case MAVEN_BATCH_ECHO is 'on' +@echo off +@REM set title of command window +title %0 +@REM enable echoing by setting MAVEN_BATCH_ECHO to 'on' +@if "%MAVEN_BATCH_ECHO%" == "on" echo %MAVEN_BATCH_ECHO% + +@REM set %HOME% to equivalent of $HOME +if "%HOME%" == "" (set "HOME=%HOMEDRIVE%%HOMEPATH%") + +@REM Execute a user defined script before this one +if not "%MAVEN_SKIP_RC%" == "" goto skipRcPre +@REM check for pre script, once with legacy .bat ending and once with .cmd ending +if exist "%USERPROFILE%\mavenrc_pre.bat" call "%USERPROFILE%\mavenrc_pre.bat" %* +if exist "%USERPROFILE%\mavenrc_pre.cmd" call "%USERPROFILE%\mavenrc_pre.cmd" %* +:skipRcPre + +@setlocal + +set ERROR_CODE=0 + +@REM To isolate internal variables from possible post scripts, we use another setlocal +@setlocal + +@REM ==== START VALIDATION ==== +if not "%JAVA_HOME%" == "" goto OkJHome + +echo. +echo Error: JAVA_HOME not found in your environment. >&2 +echo Please set the JAVA_HOME variable in your environment to match the >&2 +echo location of your Java installation. >&2 +echo. 
+goto error + +:OkJHome +if exist "%JAVA_HOME%\bin\java.exe" goto init + +echo. +echo Error: JAVA_HOME is set to an invalid directory. >&2 +echo JAVA_HOME = "%JAVA_HOME%" >&2 +echo Please set the JAVA_HOME variable in your environment to match the >&2 +echo location of your Java installation. >&2 +echo. +goto error + +@REM ==== END VALIDATION ==== + +:init + +@REM Find the project base dir, i.e. the directory that contains the folder ".mvn". +@REM Fallback to current working directory if not found. + +set MAVEN_PROJECTBASEDIR=%MAVEN_BASEDIR% +IF NOT "%MAVEN_PROJECTBASEDIR%"=="" goto endDetectBaseDir + +set EXEC_DIR=%CD% +set WDIR=%EXEC_DIR% +:findBaseDir +IF EXIST "%WDIR%"\.mvn goto baseDirFound +cd .. +IF "%WDIR%"=="%CD%" goto baseDirNotFound +set WDIR=%CD% +goto findBaseDir + +:baseDirFound +set MAVEN_PROJECTBASEDIR=%WDIR% +cd "%EXEC_DIR%" +goto endDetectBaseDir + +:baseDirNotFound +set MAVEN_PROJECTBASEDIR=%EXEC_DIR% +cd "%EXEC_DIR%" + +:endDetectBaseDir + +IF NOT EXIST "%MAVEN_PROJECTBASEDIR%\.mvn\jvm.config" goto endReadAdditionalConfig + +@setlocal EnableExtensions EnableDelayedExpansion +for /F "usebackq delims=" %%a in ("%MAVEN_PROJECTBASEDIR%\.mvn\jvm.config") do set JVM_CONFIG_MAVEN_PROPS=!JVM_CONFIG_MAVEN_PROPS! 
%%a +@endlocal & set JVM_CONFIG_MAVEN_PROPS=%JVM_CONFIG_MAVEN_PROPS% + +:endReadAdditionalConfig + +SET MAVEN_JAVA_EXE="%JAVA_HOME%\bin\java.exe" +set WRAPPER_JAR="%MAVEN_PROJECTBASEDIR%\.mvn\wrapper\maven-wrapper.jar" +set WRAPPER_LAUNCHER=org.apache.maven.wrapper.MavenWrapperMain + +set DOWNLOAD_URL="https://repo.maven.apache.org/maven2/org/apache/maven/wrapper/maven-wrapper/3.1.0/maven-wrapper-3.1.0.jar" + +FOR /F "usebackq tokens=1,2 delims==" %%A IN ("%MAVEN_PROJECTBASEDIR%\.mvn\wrapper\maven-wrapper.properties") DO ( + IF "%%A"=="wrapperUrl" SET DOWNLOAD_URL=%%B +) + +@REM Extension to allow automatically downloading the maven-wrapper.jar from Maven-central +@REM This allows using the maven wrapper in projects that prohibit checking in binary data. +if exist %WRAPPER_JAR% ( + if "%MVNW_VERBOSE%" == "true" ( + echo Found %WRAPPER_JAR% + ) +) else ( + if not "%MVNW_REPOURL%" == "" ( + SET DOWNLOAD_URL="%MVNW_REPOURL%/org/apache/maven/wrapper/maven-wrapper/3.1.0/maven-wrapper-3.1.0.jar" + ) + if "%MVNW_VERBOSE%" == "true" ( + echo Couldn't find %WRAPPER_JAR%, downloading it ... + echo Downloading from: %DOWNLOAD_URL% + ) + + powershell -Command "&{"^ + "$webclient = new-object System.Net.WebClient;"^ + "if (-not ([string]::IsNullOrEmpty('%MVNW_USERNAME%') -and [string]::IsNullOrEmpty('%MVNW_PASSWORD%'))) {"^ + "$webclient.Credentials = new-object System.Net.NetworkCredential('%MVNW_USERNAME%', '%MVNW_PASSWORD%');"^ + "}"^ + "[Net.ServicePointManager]::SecurityProtocol = [Net.SecurityProtocolType]::Tls12; $webclient.DownloadFile('%DOWNLOAD_URL%', '%WRAPPER_JAR%')"^ + "}" + if "%MVNW_VERBOSE%" == "true" ( + echo Finished downloading %WRAPPER_JAR% + ) +) +@REM End of extension + +@REM Provide a "standardized" way to retrieve the CLI args that will +@REM work with both Windows and non-Windows executions. 
+set MAVEN_CMD_LINE_ARGS=%* + +%MAVEN_JAVA_EXE% ^ + %JVM_CONFIG_MAVEN_PROPS% ^ + %MAVEN_OPTS% ^ + %MAVEN_DEBUG_OPTS% ^ + -classpath %WRAPPER_JAR% ^ + "-Dmaven.multiModuleProjectDirectory=%MAVEN_PROJECTBASEDIR%" ^ + %WRAPPER_LAUNCHER% %MAVEN_CONFIG% %* +if ERRORLEVEL 1 goto error +goto end + +:error +set ERROR_CODE=1 + +:end +@endlocal & set ERROR_CODE=%ERROR_CODE% + +if not "%MAVEN_SKIP_RC%"=="" goto skipRcPost +@REM check for post script, once with legacy .bat ending and once with .cmd ending +if exist "%USERPROFILE%\mavenrc_post.bat" call "%USERPROFILE%\mavenrc_post.bat" +if exist "%USERPROFILE%\mavenrc_post.cmd" call "%USERPROFILE%\mavenrc_post.cmd" +:skipRcPost + +@REM pause the script if MAVEN_BATCH_PAUSE is set to 'on' +if "%MAVEN_BATCH_PAUSE%"=="on" pause + +if "%MAVEN_TERMINATE_CMD%"=="on" exit %ERROR_CODE% + +cmd /C exit /B %ERROR_CODE% diff --git a/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/data/AtomicPositionBlockData$AtomicPositionBlockDataBuilder.class b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/data/AtomicPositionBlockData$AtomicPositionBlockDataBuilder.class new file mode 100644 index 0000000..d1e41c9 Binary files /dev/null and b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/data/AtomicPositionBlockData$AtomicPositionBlockDataBuilder.class differ diff --git a/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/data/AtomicPositionBlockData.class b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/data/AtomicPositionBlockData.class new file mode 100644 index 0000000..8a11c7c Binary files /dev/null and b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/data/AtomicPositionBlockData.class differ diff --git 
a/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/data/AtomicTextBlockData$AtomicTextBlockDataBuilder.class b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/data/AtomicTextBlockData$AtomicTextBlockDataBuilder.class new file mode 100644 index 0000000..cb17e68 Binary files /dev/null and b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/data/AtomicTextBlockData$AtomicTextBlockDataBuilder.class differ diff --git a/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/data/AtomicTextBlockData.class b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/data/AtomicTextBlockData.class new file mode 100644 index 0000000..1fe9e3f Binary files /dev/null and b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/data/AtomicTextBlockData.class differ diff --git a/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/data/DocumentData$DocumentDataBuilder.class b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/data/DocumentData$DocumentDataBuilder.class new file mode 100644 index 0000000..03bbc45 Binary files /dev/null and b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/data/DocumentData$DocumentDataBuilder.class differ diff --git a/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/data/DocumentData.class b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/data/DocumentData.class new file mode 100644 index 0000000..7d89b3b Binary files /dev/null and 
b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/data/DocumentData.class differ diff --git a/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/data/PageData$PageDataBuilder.class b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/data/PageData$PageDataBuilder.class new file mode 100644 index 0000000..52df2c4 Binary files /dev/null and b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/data/PageData$PageDataBuilder.class differ diff --git a/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/data/PageData.class b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/data/PageData.class new file mode 100644 index 0000000..2e5b26f Binary files /dev/null and b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/data/PageData.class differ diff --git a/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/data/TableOfContentsData$EntryData$EntryDataBuilder.class b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/data/TableOfContentsData$EntryData$EntryDataBuilder.class new file mode 100644 index 0000000..710e257 Binary files /dev/null and b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/data/TableOfContentsData$EntryData$EntryDataBuilder.class differ diff --git a/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/data/TableOfContentsData$EntryData.class 
b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/data/TableOfContentsData$EntryData.class new file mode 100644 index 0000000..420514f Binary files /dev/null and b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/data/TableOfContentsData$EntryData.class differ diff --git a/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/data/TableOfContentsData$TableOfContentsDataBuilder.class b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/data/TableOfContentsData$TableOfContentsDataBuilder.class new file mode 100644 index 0000000..e9b40e4 Binary files /dev/null and b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/data/TableOfContentsData$TableOfContentsDataBuilder.class differ diff --git a/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/data/TableOfContentsData.class b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/data/TableOfContentsData.class new file mode 100644 index 0000000..ec0c583 Binary files /dev/null and b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/data/TableOfContentsData.class differ diff --git a/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/Boundary.class b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/Boundary.class new file mode 100644 index 0000000..13d19c8 Binary files /dev/null and b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/Boundary.class differ diff --git 
a/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/DocumentGraph$DocumentGraphBuilder.class b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/DocumentGraph$DocumentGraphBuilder.class new file mode 100644 index 0000000..eedebfe Binary files /dev/null and b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/DocumentGraph$DocumentGraphBuilder.class differ diff --git a/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/DocumentGraph.class b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/DocumentGraph.class new file mode 100644 index 0000000..fac11e1 Binary files /dev/null and b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/DocumentGraph.class differ diff --git a/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/TableOfContents$Entry$EntryBuilder.class b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/TableOfContents$Entry$EntryBuilder.class new file mode 100644 index 0000000..cda549a Binary files /dev/null and b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/TableOfContents$Entry$EntryBuilder.class differ diff --git a/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/TableOfContents$Entry.class b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/TableOfContents$Entry.class new file mode 100644 index 0000000..9de08a8 Binary files /dev/null and 
b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/TableOfContents$Entry.class differ diff --git a/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/TableOfContents.class b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/TableOfContents.class new file mode 100644 index 0000000..22bb417 Binary files /dev/null and b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/TableOfContents.class differ diff --git a/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/entity/EntityNode.class b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/entity/EntityNode.class new file mode 100644 index 0000000..d0e22dd Binary files /dev/null and b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/entity/EntityNode.class differ diff --git a/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/entity/EntityPosition$EntityPositionBuilder.class b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/entity/EntityPosition$EntityPositionBuilder.class new file mode 100644 index 0000000..01f8823 Binary files /dev/null and b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/entity/EntityPosition$EntityPositionBuilder.class differ diff --git a/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/entity/EntityPosition.class 
b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/entity/EntityPosition.class new file mode 100644 index 0000000..f194326 Binary files /dev/null and b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/entity/EntityPosition.class differ diff --git a/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/FooterNode$FooterNodeBuilder.class b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/FooterNode$FooterNodeBuilder.class new file mode 100644 index 0000000..5a8bdea Binary files /dev/null and b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/FooterNode$FooterNodeBuilder.class differ diff --git a/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/FooterNode.class b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/FooterNode.class new file mode 100644 index 0000000..7eee70a Binary files /dev/null and b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/FooterNode.class differ diff --git a/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/HeaderNode$HeaderNodeBuilder.class b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/HeaderNode$HeaderNodeBuilder.class new file mode 100644 index 0000000..b2222eb Binary files /dev/null and b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/HeaderNode$HeaderNodeBuilder.class differ diff --git 
a/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/HeaderNode.class b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/HeaderNode.class new file mode 100644 index 0000000..8fa53a3 Binary files /dev/null and b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/HeaderNode.class differ diff --git a/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/HeadlineNode$HeadlineNodeBuilder.class b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/HeadlineNode$HeadlineNodeBuilder.class new file mode 100644 index 0000000..f3acb45 Binary files /dev/null and b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/HeadlineNode$HeadlineNodeBuilder.class differ diff --git a/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/HeadlineNode.class b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/HeadlineNode.class new file mode 100644 index 0000000..5d52127 Binary files /dev/null and b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/HeadlineNode.class differ diff --git a/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/ImageNode$ImageNodeBuilder.class b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/ImageNode$ImageNodeBuilder.class new file mode 100644 index 0000000..19b792d Binary files /dev/null and 
b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/ImageNode$ImageNodeBuilder.class differ diff --git a/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/ImageNode.class b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/ImageNode.class new file mode 100644 index 0000000..fb6a184 Binary files /dev/null and b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/ImageNode.class differ diff --git a/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/ImageType.class b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/ImageType.class new file mode 100644 index 0000000..803bfe2 Binary files /dev/null and b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/ImageType.class differ diff --git a/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/NodeType.class b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/NodeType.class new file mode 100644 index 0000000..42d9463 Binary files /dev/null and b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/NodeType.class differ diff --git a/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/PageNode$PageNodeBuilder.class b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/PageNode$PageNodeBuilder.class new file mode 100644 index 0000000..c5c76b3 Binary 
files /dev/null and b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/PageNode$PageNodeBuilder.class differ diff --git a/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/PageNode.class b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/PageNode.class new file mode 100644 index 0000000..adffea8 Binary files /dev/null and b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/PageNode.class differ diff --git a/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/ParagraphNode$ParagraphNodeBuilder.class b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/ParagraphNode$ParagraphNodeBuilder.class new file mode 100644 index 0000000..bca0237 Binary files /dev/null and b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/ParagraphNode$ParagraphNodeBuilder.class differ diff --git a/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/ParagraphNode.class b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/ParagraphNode.class new file mode 100644 index 0000000..e587020 Binary files /dev/null and b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/ParagraphNode.class differ diff --git a/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/SectionNode$SectionNodeBuilder.class 
b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/SectionNode$SectionNodeBuilder.class new file mode 100644 index 0000000..a2734fa Binary files /dev/null and b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/SectionNode$SectionNodeBuilder.class differ diff --git a/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/SectionNode.class b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/SectionNode.class new file mode 100644 index 0000000..4cd60d9 Binary files /dev/null and b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/SectionNode.class differ diff --git a/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/SemanticNode.class b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/SemanticNode.class new file mode 100644 index 0000000..ce22cec Binary files /dev/null and b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/SemanticNode.class differ diff --git a/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/TableCellNode$TableCellNodeBuilder.class b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/TableCellNode$TableCellNodeBuilder.class new file mode 100644 index 0000000..af18d8c Binary files /dev/null and b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/TableCellNode$TableCellNodeBuilder.class differ diff --git 
a/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/TableCellNode.class b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/TableCellNode.class new file mode 100644 index 0000000..ecbeb42 Binary files /dev/null and b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/TableCellNode.class differ diff --git a/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/TableNode$TableNodeBuilder.class b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/TableNode$TableNodeBuilder.class new file mode 100644 index 0000000..4b1e53e Binary files /dev/null and b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/TableNode$TableNodeBuilder.class differ diff --git a/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/TableNode.class b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/TableNode.class new file mode 100644 index 0000000..b072cad Binary files /dev/null and b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/TableNode.class differ diff --git a/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/textblock/AtomicTextBlock$AtomicTextBlockBuilder.class b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/textblock/AtomicTextBlock$AtomicTextBlockBuilder.class new file mode 100644 index 0000000..0856ef2 Binary files /dev/null and 
b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/textblock/AtomicTextBlock$AtomicTextBlockBuilder.class differ diff --git a/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/textblock/AtomicTextBlock.class b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/textblock/AtomicTextBlock.class new file mode 100644 index 0000000..252cf3f Binary files /dev/null and b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/textblock/AtomicTextBlock.class differ diff --git a/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/textblock/ConcatenatedTextBlock.class b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/textblock/ConcatenatedTextBlock.class new file mode 100644 index 0000000..a0cd157 Binary files /dev/null and b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/textblock/ConcatenatedTextBlock.class differ diff --git a/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/textblock/TextBlock.class b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/textblock/TextBlock.class new file mode 100644 index 0000000..037dc41 Binary files /dev/null and b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/textblock/TextBlock.class differ diff --git a/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/textblock/TextBlockCollector.class 
b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/textblock/TextBlockCollector.class new file mode 100644 index 0000000..733661b Binary files /dev/null and b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/graph/textblock/TextBlockCollector.class differ diff --git a/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/mapper/DocumentDataMapper$1.class b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/mapper/DocumentDataMapper$1.class new file mode 100644 index 0000000..091c0fd Binary files /dev/null and b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/mapper/DocumentDataMapper$1.class differ diff --git a/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/mapper/DocumentDataMapper.class b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/mapper/DocumentDataMapper.class new file mode 100644 index 0000000..420038f Binary files /dev/null and b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/mapper/DocumentDataMapper.class differ diff --git a/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/mapper/DocumentGraphMapper$1.class b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/mapper/DocumentGraphMapper$1.class new file mode 100644 index 0000000..32b8934 Binary files /dev/null and b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/mapper/DocumentGraphMapper$1.class differ diff --git 
a/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/mapper/DocumentGraphMapper$Context.class b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/mapper/DocumentGraphMapper$Context.class new file mode 100644 index 0000000..65c339e Binary files /dev/null and b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/mapper/DocumentGraphMapper$Context.class differ diff --git a/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/mapper/DocumentGraphMapper.class b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/mapper/DocumentGraphMapper.class new file mode 100644 index 0000000..21c6dd8 Binary files /dev/null and b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/mapper/DocumentGraphMapper.class differ diff --git a/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/mapper/PropertiesMapper.class b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/mapper/PropertiesMapper.class new file mode 100644 index 0000000..55ddca7 Binary files /dev/null and b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/mapper/PropertiesMapper.class differ diff --git a/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/services/EntityEnrichmentService.class b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/services/EntityEnrichmentService.class new file mode 100644 index 0000000..a76cef9 Binary files /dev/null and 
b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/services/EntityEnrichmentService.class differ diff --git a/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/services/EntityInsertionService.class b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/services/EntityInsertionService.class new file mode 100644 index 0000000..ee274b4 Binary files /dev/null and b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/services/EntityInsertionService.class differ diff --git a/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/services/RectangleTransformations$Rectangle2DUnion.class b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/services/RectangleTransformations$Rectangle2DUnion.class new file mode 100644 index 0000000..ef73fe7 Binary files /dev/null and b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/services/RectangleTransformations$Rectangle2DUnion.class differ diff --git a/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/services/RectangleTransformations.class b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/services/RectangleTransformations.class new file mode 100644 index 0000000..4afe9c0 Binary files /dev/null and b/out/production/layoutparser-service-internal-api/com/knecon/fforesight/service/layoutparser/internal/api/services/RectangleTransformations.class differ diff --git a/pom.xml b/pom.xml new file mode 100644 index 0000000..837b38f --- /dev/null +++ b/pom.xml @@ -0,0 +1,24 @@ + + + 4.0.0 + + + org.springframework.boot + spring-boot-starter-parent + 3.0.1 + + + 
com.knecon.fforesight + layoutparser + 1.0.0 + + + layoutparser-service + layoutparser-service-image + + + + pom + +