RED-6725: Integrate new layout parser

* ported current state from RedactManager

parent cc1fedac41
commit df9cbdc036
@@ -1,5 +1,7 @@
package com.knecon.fforesight.service.layoutparser.internal.api.data;

import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
@@ -20,4 +22,6 @@ public class AtomicTextBlockData {
    int end;
    int[] lineBreaks;

}
@@ -15,6 +15,7 @@ public class DocumentData {
    PageData[] pages;
    AtomicTextBlockData[] atomicTextBlocks;
    AtomicPositionBlockData[] atomicPositionBlocks;
    TableOfContentsData tableOfContents;
    DocumentTreeData documentTreeData;

}
@@ -4,8 +4,6 @@ import java.util.List;
import java.util.Map;
import java.util.stream.Stream;

import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.NodeType;

import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
@@ -19,7 +17,7 @@ import lombok.experimental.FieldDefaults;
@AllArgsConstructor
@NoArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class TableOfContentsData {
public class DocumentTreeData {

    EntryData root;

@@ -29,9 +27,9 @@ public class TableOfContentsData {
        if (tocId.isEmpty()) {
            return root;
        }
        EntryData entry = root.subEntries.get(tocId.get(0));
        EntryData entry = root.children.get(tocId.get(0));
        for (int id : tocId.subList(1, tocId.size())) {
            entry = entry.subEntries.get(id);
            entry = entry.children.get(id);
        }
        return entry;
    }
@@ -39,7 +37,7 @@ public class TableOfContentsData {

    public Stream<EntryData> streamAllEntries() {

        return Stream.concat(Stream.of(root), root.subEntries.stream()).flatMap(TableOfContentsData::flatten);
        return Stream.concat(Stream.of(root), root.children.stream()).flatMap(DocumentTreeData::flatten);
    }

@@ -51,7 +49,7 @@ public class TableOfContentsData {

    private static Stream<EntryData> flatten(EntryData entry) {

        return Stream.concat(Stream.of(entry), entry.subEntries.stream().flatMap(TableOfContentsData::flatten));
        return Stream.concat(Stream.of(entry), entry.children.stream().flatMap(DocumentTreeData::flatten));
    }

@@ -62,19 +60,18 @@ public class TableOfContentsData {
    public static class EntryData {

        NodeType type;
        int[] tocId;
        Long[] atomicBlocks;
        Long[] pages;
        int[] treeId;
        Long[] atomicBlockIds;
        Long[] pageNumbers;
        Map<String, String> properties;
        List<EntryData> subEntries;

        List<EntryData> children;

        @Override
        public String toString() {

            StringBuilder sb = new StringBuilder();
            sb.append("[");
            for (int i : tocId) {
            for (int i : treeId) {
                sb.append(i);
                sb.append(",");
            }
@@ -83,7 +80,7 @@ public class TableOfContentsData {

            sb.append(type);
            sb.append(" atbs = ");
            sb.append(atomicBlocks.length);
            sb.append(atomicBlockIds.length);

            return sb.toString();
        }
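Note for reviewers: the hunk above only renames the tree data structure (TableOfContentsData becomes DocumentTreeData, subEntries becomes children, tocId becomes treeId, atomicBlocks becomes atomicBlockIds, pages becomes pageNumbers); the traversal logic is unchanged. A minimal sketch of reading the renamed structure, not part of the commit; the getter names (getTreeId, getType, getChildren) are assumed from the Lombok setup rather than shown in this diff:

    // Illustrative only: dump every entry of a DocumentTreeData with its tree id.
    import java.util.Arrays;

    class DocumentTreePrinter {

        static void print(DocumentTreeData tree) {
            tree.streamAllEntries()
                .forEach(entry -> System.out.println(
                        Arrays.toString(entry.getTreeId()) + " " + entry.getType()
                                + " (" + entry.getChildren().size() + " children)"));
        }
    }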
@@ -1,4 +1,6 @@
package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;
package com.knecon.fforesight.service.layoutparser.internal.api.data;

import java.util.Locale;

public enum NodeType {
    DOCUMENT,
@@ -9,5 +11,11 @@ public enum NodeType {
    TABLE_CELL,
    IMAGE,
    HEADER,
    FOOTER
    FOOTER;

    public String toString() {

        return this.name().charAt(0) + this.name().substring(1).toLowerCase(Locale.ROOT);
    }
}
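The toString() added above title-cases the constant name: only the first character stays upper case, the rest is lower-cased. A tiny check, illustrative only and not part of the commit (assumes the enum's new package as shown above):

    import com.knecon.fforesight.service.layoutparser.internal.api.data.NodeType;

    public class NodeTypeToStringDemo {

        public static void main(String[] args) {
            System.out.println(NodeType.DOCUMENT);   // prints "Document"
            System.out.println(NodeType.TABLE_CELL); // prints "Table_cell"
        }
    }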
@@ -1,101 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.graph;

import java.awt.geom.Rectangle2D;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.NodeType;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SectionNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SemanticNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlockCollector;

import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;

@Data
@Builder
@AllArgsConstructor
@NoArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class DocumentGraph implements SemanticNode {

    Set<PageNode> pages;
    TableOfContents tableOfContents;
    Integer numberOfPages;
    TextBlock textBlock;

    public TextBlock buildTextBlock() {

        return streamTerminalTextBlocksInOrder().collect(new TextBlockCollector());
    }

    public List<SectionNode> getMainSections() {

        return streamChildren().filter(node -> node instanceof SectionNode).map(node -> (SectionNode) node).collect(Collectors.toList());
    }

    public Stream<TextBlock> streamTerminalTextBlocksInOrder() {

        return streamAllNodes().filter(SemanticNode::isTerminal).map(SemanticNode::getTerminalTextBlock);
    }

    public Set<EntityNode> getEntities() {

        return streamAllSubNodes().map(SemanticNode::getEntities).flatMap(Set::stream).collect(Collectors.toUnmodifiableSet());
    }

    @Override
    public List<Integer> getTocId() {

        return Collections.emptyList();
    }

    @Override
    public void setTocId(List<Integer> tocId) {

        throw new UnsupportedOperationException("DocumentGraph is always the root of the Table of Contents");
    }

    private Stream<SemanticNode> streamAllNodes() {

        return tableOfContents.streamAllEntriesInOrder().map(TableOfContents.Entry::getNode);
    }

    @Override
    public String toString() {

        return NodeType.DOCUMENT + ": " + buildTextBlock().buildSummary();
    }

    @Override
    public Map<PageNode, Rectangle2D> getBBox() {

        Map<PageNode, Rectangle2D> bBox = new HashMap<>();
        for (PageNode page : pages) {
            bBox.put(page, new Rectangle2D.Double(0, 0, page.getWidth(), page.getHeight()));
        }
        return bBox;
    }

}
@@ -1,193 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.graph;

import static java.lang.String.format;

import java.nio.charset.StandardCharsets;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.stream.Stream;

import com.google.common.hash.Hashing;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.NodeType;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SemanticNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlockCollector;

import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.Getter;
import lombok.experimental.FieldDefaults;

@Data
public class TableOfContents {

    private final Entry root;

    public TableOfContents(DocumentGraph documentGraph) {

        root = Entry.builder().tocId(Collections.emptyList()).type(NodeType.DOCUMENT).children(new LinkedList<>()).node(documentGraph).build();
    }

    public TextBlock buildTextBlock() {

        return streamAllEntriesInOrder().map(Entry::getNode).filter(SemanticNode::isTerminal).map(SemanticNode::getTerminalTextBlock).collect(new TextBlockCollector());
    }

    public List<Integer> createNewMainEntryAndReturnId(NodeType nodeType, SemanticNode node) {

        return createNewChildEntryAndReturnId(Collections.emptyList(), nodeType, node);
    }

    public List<Integer> createNewChildEntryAndReturnId(List<Integer> parentId, NodeType nodeType, SemanticNode node) {

        if (!entryExists(parentId)) {
            throw new UnsupportedOperationException(format("parentId %s does not exist!", parentId));
        }

        Entry parent = getEntryById(parentId);
        List<Integer> newId = new LinkedList<>(parentId);
        newId.add(parent.children.size());
        parent.children.add(Entry.builder().tocId(newId).node(node).type(nodeType).children(new LinkedList<>()).build());

        return newId;
    }

    private boolean entryExists(List<Integer> tocId) {

        if (tocId.isEmpty()) {
            return root != null;
        }
        Entry entry = root.children.get(tocId.get(0));
        for (int id : tocId.subList(1, tocId.size())) {
            if (id >= entry.children.size() || 0 > id) {
                return false;
            }
            entry = entry.children.get(id);
        }
        return true;
    }

    public Entry getParentEntryById(List<Integer> tocId) {

        return getEntryById(getParentId(tocId));
    }

    public boolean hasParentById(List<Integer> tocId) {

        return entryExists(getParentId(tocId));
    }

    public Stream<SemanticNode> streamChildrenNodes(List<Integer> tocId) {

        return getEntryById(tocId).children.stream().map(Entry::getNode);
    }

    private static List<Integer> getParentId(List<Integer> tocId) {

        if (tocId.isEmpty()) {
            throw new UnsupportedOperationException("Root has no parent!");
        }
        if (tocId.size() < 2) {
            return Collections.emptyList();
        }
        return tocId.subList(0, tocId.size() - 1);
    }

    public Entry getEntryById(List<Integer> tocId) {

        if (tocId.isEmpty()) {
            return root;
        }
        Entry entry = root.children.get(tocId.get(0));
        for (int id : tocId.subList(1, tocId.size())) {
            entry = entry.children.get(id);
        }
        return entry;
    }

    public Stream<Entry> streamMainEntries() {

        return root.children.stream();
    }

    public Stream<Entry> streamAllEntriesInOrder() {

        return Stream.of(root).flatMap(TableOfContents::flatten);
    }

    public Stream<Entry> streamAllSubEntriesInOrder(List<Integer> parentId) {

        return getEntryById(parentId).getChildren().stream().flatMap(TableOfContents::flatten);
    }

    @Override
    public String toString() {

        return String.join("\n", streamAllEntriesInOrder().map(Entry::toString).toList());
    }

    public String toString(List<Integer> id) {

        return String.join("\n", streamAllSubEntriesInOrder(id).map(Entry::toString).toList());
    }

    private static Stream<Entry> flatten(Entry entry) {

        return Stream.concat(Stream.of(entry), entry.children.stream().flatMap(TableOfContents::flatten));
    }

    @Builder
    @Getter
    @AllArgsConstructor
    @FieldDefaults(level = AccessLevel.PRIVATE, makeFinal = true)
    public static class Entry {

        List<Integer> tocId;
        NodeType type;
        SemanticNode node;
        List<Entry> children;

        @Override
        public String toString() {

            return node.toString();
        }

        @Override
        public int hashCode() {

            return Hashing.murmur3_32_fixed().hashString(toString(), StandardCharsets.UTF_8).hashCode();
        }

        @Override
        public boolean equals(Object o) {

            return o instanceof Entry && o.hashCode() == this.hashCode();
        }

    }

}
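Context for the removed TableOfContents class: entry IDs are child-index paths from the root, and createNewChildEntryAndReturnId appends the parent's current child count. A hedged sketch of the scheme the ported code relied on, not part of the commit; documentGraph, section and paragraph are assumed to be already-constructed nodes:

    // Sketch only: the empty id [] is the root, its first main entry gets [0],
    // and that entry's first child gets [0, 0].
    static void illustrateIds(DocumentGraph documentGraph, SemanticNode section, SemanticNode paragraph) {
        TableOfContents toc = new TableOfContents(documentGraph);
        List<Integer> sectionId = toc.createNewMainEntryAndReturnId(NodeType.SECTION, section);                 // [0]
        List<Integer> childId = toc.createNewChildEntryAndReturnId(sectionId, NodeType.PARAGRAPH, paragraph);   // [0, 0]
        System.out.println(sectionId + " / " + childId);
    }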
@@ -1,76 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.graph.entity;

import java.util.Collections;
import java.util.List;
import java.util.Set;

import com.knecon.fforesight.service.layoutparser.internal.api.graph.Boundary;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SemanticNode;

public interface EntityNode {

    /**
     * This represents the text, which is contained within the boundary of the Entity.
     *
     * @return String
     */
    String getValue();

    /**
     * The Boundary primarily defines the Entity, all other values may be inferred from it.
     *
     * @return Boundary, uniquely identifying this Entity
     */
    Boundary getBoundary();

    /**
     * The deepest fully containing node represents the node which is the deepest node in the document tree structure,
     * whose boundary also fully contains the boundary of this entity.
     *
     * @return the deepest fully containing node
     */
    SemanticNode getDeepestFullyContainingNode();

    /**
     * The intersecting nodes represent all nodes, whose boundary intersects the boundary of this entity.
     *
     * @return all intersecting Nodes
     */
    List<SemanticNode> getIntersectingNodes();

    void setDeepestFullyContainingNode(SemanticNode semanticNode);

    void addIntersectingNode(SemanticNode semanticNode);

    void setIntersectingNodes(List<SemanticNode> semanticNodes);

    /**
     * @return all pages this entity intersects.
     */
    Set<PageNode> getPages();

    void setPages(Set<PageNode> pages);

    /**
     * removes all occurrences of this node in the graph and resets all graph specific fields.
     */
    default void removeFromGraph() {

        getIntersectingNodes().forEach(node -> node.getEntities().remove(this));
        getPages().forEach(page -> page.getEntities().remove(this));
        setPages(Collections.emptySet());
        setDeepestFullyContainingNode(null);
        setIntersectingNodes(Collections.emptyList());
    }

}
@@ -1,45 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.graph.entity;

import java.awt.geom.Rectangle2D;
import java.nio.charset.StandardCharsets;
import java.util.List;

import com.google.common.hash.Hashing;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode;

import lombok.AccessLevel;
import lombok.Builder;
import lombok.Data;
import lombok.experimental.FieldDefaults;

@Data
@Builder
@FieldDefaults(level = AccessLevel.PRIVATE)
public class EntityPosition {

    PageNode pageNode;
    List<Rectangle2D> rectanglePerLine;

    public String getId() {

        return String.valueOf(hashCode());
    }

    @Override
    public int hashCode() {

        StringBuilder sb = new StringBuilder();
        sb.append(pageNode.getNumber());
        rectanglePerLine.forEach(r -> sb.append(r.getX()).append(r.getY()).append(r.getWidth()).append(r.getHeight()));
        return Hashing.murmur3_128().hashString(sb.toString(), StandardCharsets.UTF_8).hashCode();
    }

    @Override
    public boolean equals(Object o) {

        return o instanceof EntityPosition && o.hashCode() == this.hashCode();
    }

}
@@ -1,53 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;

import java.util.HashSet;
import java.util.List;
import java.util.Set;

import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;

import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;

@Data
@Builder
@AllArgsConstructor
@NoArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class FooterNode implements SemanticNode {

    List<Integer> tocId;
    TextBlock terminalTextBlock;

    @Builder.Default
    boolean terminal = true;

    @EqualsAndHashCode.Exclude
    TableOfContents tableOfContents;

    @Builder.Default
    @EqualsAndHashCode.Exclude
    Set<EntityNode> entities = new HashSet<>();

    @Override
    public TextBlock buildTextBlock() {

        return terminalTextBlock;
    }

    @Override
    public String toString() {

        return tocId + ": " + NodeType.FOOTER + ": " + terminalTextBlock.buildSummary();
    }

}
@@ -1,53 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;

import java.util.HashSet;
import java.util.List;
import java.util.Set;

import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;

import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;

@Data
@Builder
@AllArgsConstructor
@NoArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class HeaderNode implements SemanticNode {

    List<Integer> tocId;
    TextBlock terminalTextBlock;

    @Builder.Default
    boolean terminal = true;

    @EqualsAndHashCode.Exclude
    TableOfContents tableOfContents;

    @Builder.Default
    @EqualsAndHashCode.Exclude
    Set<EntityNode> entities = new HashSet<>();

    @Override
    public TextBlock buildTextBlock() {

        return terminalTextBlock;
    }

    @Override
    public String toString() {

        return tocId + ": " + NodeType.HEADER + ": " + terminalTextBlock.buildSummary();
    }

}
@@ -1,60 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;

import java.util.HashSet;
import java.util.List;
import java.util.Set;

import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;

import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;

@Data
@Builder
@AllArgsConstructor
@NoArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class HeadlineNode implements SemanticNode {

    List<Integer> tocId;
    TextBlock terminalTextBlock;

    @Builder.Default
    boolean terminal = true;

    @EqualsAndHashCode.Exclude
    TableOfContents tableOfContents;

    @Builder.Default
    @EqualsAndHashCode.Exclude
    Set<EntityNode> entities = new HashSet<>();

    @Override
    public TextBlock buildTextBlock() {

        return terminalTextBlock;
    }

    @Override
    public String toString() {

        return tocId + ": " + NodeType.HEADLINE + ": " + terminalTextBlock.buildSummary();
    }

    @Override
    public SemanticNode getHeadline() {

        return this;
    }

}
@@ -1,87 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;

import java.awt.geom.Rectangle2D;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlockCollector;

import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;

@Data
@Builder
@AllArgsConstructor
@NoArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class ImageNode implements SemanticNode {

    List<Integer> tocId;

    ImageType imageType;
    boolean transparency;
    Rectangle2D position;

    boolean redaction;
    boolean ignored;

    @Builder.Default
    String redactionReason = "";
    @Builder.Default
    String legalBasis = "";
    @Builder.Default
    int matchedRule = -1;

    @EqualsAndHashCode.Exclude
    PageNode page;

    @EqualsAndHashCode.Exclude
    TableOfContents tableOfContents;

    @Builder.Default
    @EqualsAndHashCode.Exclude
    Set<EntityNode> entities = new HashSet<>();

    @Override
    public TextBlock buildTextBlock() {

        return streamAllSubNodes().filter(SemanticNode::isTerminal).map(SemanticNode::getTerminalTextBlock).collect(new TextBlockCollector());
    }

    @Override
    public Set<PageNode> getPages() {

        return Collections.singleton(page);
    }

    @Override
    public String toString() {

        return tocId + ": " + NodeType.IMAGE + ": " + imageType.toString() + " " + position;
    }

    @Override
    public Map<PageNode, Rectangle2D> getBBox() {

        Map<PageNode, Rectangle2D> bBoxPerPage = new HashMap<>();
        bBoxPerPage.put(page, position);
        return bBoxPerPage;
    }

}
@@ -1,9 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;

public enum ImageType {
    LOGO,
    FORMULA,
    SIGNATURE,
    OTHER,
    OCR
}
@@ -1,71 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;

import java.util.HashSet;
import java.util.List;
import java.util.Set;

import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlockCollector;

import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.EqualsAndHashCode;
import lombok.Getter;
import lombok.Setter;
import lombok.experimental.FieldDefaults;

@Getter
@Setter
@Builder
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class PageNode {

    Integer number;
    Integer height;
    Integer width;
    Integer rotation;

    @EqualsAndHashCode.Exclude
    List<SemanticNode> mainBody;
    @EqualsAndHashCode.Exclude
    HeaderNode header;
    @EqualsAndHashCode.Exclude
    FooterNode footer;

    @Builder.Default
    @EqualsAndHashCode.Exclude
    Set<EntityNode> entities = new HashSet<>();

    @Builder.Default
    @EqualsAndHashCode.Exclude
    Set<ImageNode> images = new HashSet<>();

    public TextBlock getMainBodyTextBlock() {

        return mainBody.stream().filter(SemanticNode::isTerminal).map(SemanticNode::getTerminalTextBlock).collect(new TextBlockCollector());
    }

    @Override
    public String toString() {

        return String.valueOf(number);
    }

    @Override
    public int hashCode() {

        return number;
    }

    @Override
    public boolean equals(Object o) {

        return o instanceof PageNode && o.hashCode() == this.hashCode();
    }
}
@@ -1,51 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;

import java.util.HashSet;
import java.util.List;
import java.util.Set;

import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;

import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults;

@Data
@Builder
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class ParagraphNode implements SemanticNode {

    List<Integer> tocId;
    TextBlock terminalTextBlock;

    @Builder.Default
    boolean terminal = true;

    @EqualsAndHashCode.Exclude
    TableOfContents tableOfContents;

    @Builder.Default
    @EqualsAndHashCode.Exclude
    Set<EntityNode> entities = new HashSet<>();

    @Override
    public TextBlock buildTextBlock() {

        return terminalTextBlock;
    }

    @Override
    public String toString() {

        return tocId + ": " + NodeType.PARAGRAPH + ": " + terminalTextBlock.buildSummary();
    }

}
@@ -1,63 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;

import java.util.HashSet;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.Set;

import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlockCollector;

import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults;

@Data
@Builder
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class SectionNode implements SemanticNode {

    List<Integer> tocId;

    TextBlock textBlock;
    @EqualsAndHashCode.Exclude
    TableOfContents tableOfContents;

    @Builder.Default
    @EqualsAndHashCode.Exclude
    Set<EntityNode> entities = new HashSet<>();

    @Override
    public TextBlock buildTextBlock() {

        if (textBlock == null) {
            textBlock = streamAllSubNodes().filter(SemanticNode::isTerminal).map(SemanticNode::getTerminalTextBlock).collect(new TextBlockCollector());
        }
        return textBlock;
    }

    @Override
    public String toString() {

        return tocId.toString() + ": " + NodeType.SECTION + ": " + buildTextBlock().buildSummary();
    }

    public HeadlineNode getHeadline() {

        return streamChildren().filter(node -> node instanceof HeadlineNode)
                               .map(node -> (HeadlineNode) node)
                               .findFirst()
                               .orElseThrow(() -> new NoSuchElementException("ClassificationSection has no Headline!"));
    }

}
@@ -1,275 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;

import java.awt.geom.Rectangle2D;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import com.knecon.fforesight.service.layoutparser.internal.api.graph.Boundary;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.AtomicTextBlock;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.internal.api.services.RectangleTransformations;

public interface SemanticNode {

    /**
     * Searches all Nodes located underneath this Node in the TableOfContents and concatenates their AtomicTextBlocks into a single TextBlockEntity.
     * So, for a ClassificationSection all TextBlocks of Subsections, Paragraphs, and Tables are concatenated into a single TextBlockEntity
     * If the Node is Terminal, the TerminalTextBlock will be returned instead.
     *
     * @return ClassificationTextBlock containing all AtomicTextBlocks that are located under this Node.
     */
    TextBlock buildTextBlock();

    /**
     * Any Node maintains its own Set of Entities.
     * This Set contains all Entities whose boundary intersects the boundary of this node.
     *
     * @return Set of all Entities associated with this Node
     */
    Set<EntityNode> getEntities();

    /**
     * Each AtomicTextBlock is assigned a page, so to get the pages this node appears on, it collects the PageNodes from each AtomicTextBlock belonging to this node's ClassificationTextBlock.
     *
     * @return Set of PageNodes this node appears on.
     */
    default Set<PageNode> getPages() {

        return buildTextBlock().getPages();
    }

    /**
     * @return the TableOfContents of the ClassificationDocument this node belongs to
     */
    TableOfContents getTableOfContents();

    /**
     * The id is a List of Integers uniquely identifying this node in the TableOfContents.
     *
     * @return the TableOfContents ID
     */
    List<Integer> getTocId();

    /**
     * This should only be used during graph construction.
     *
     * @param tocId List of Integers
     */
    void setTocId(List<Integer> tocId);

    /**
     * Traverses the Tree up, until it hits a HeadlineNode or hits a SectionNode which will then return the first HeadlineNode from its children.
     * Throws NotFoundException if no Headline is found this way
     *
     * @return First HeadlineNode found
     */
    default SemanticNode getHeadline() {

        return getParent().getHeadline();
    }

    /**
     * @return boolean indicating whether this Node has a Parent in the TableOfContents
     */
    default boolean hasParent() {

        return getTableOfContents().hasParentById(getTocId());
    }

    /**
     * @return The SemanticNode representing the Parent in the TableOfContents
     * throws NotFoundException, when no parent is present
     */
    default SemanticNode getParent() {

        return getTableOfContents().getParentEntryById(getTocId()).getNode();
    }

    /**
     * Terminal means a SemanticNode has direct access to a ClassificationTextBlock, by default this is false and must be overridden.
     * Currently only Sections, Images, and Tables are not terminal.
     * A TableCell might be Terminal depending on its area compared to the page.
     *
     * @return boolean, indicating if a Node has direct access to a ClassificationTextBlock
     */
    default boolean isTerminal() {

        return false;
    }

    /**
     * Terminal means a SemanticNode has direct access to a ClassificationTextBlock, by default this is false and must be overridden.
     * Currently only Sections and Tables are not terminal.
     *
     * @return AtomicTextBlock
     */
    default TextBlock getTerminalTextBlock() {

        throw new UnsupportedOperationException("Only terminal Nodes have access to TerminalTextBlocks!");
    }

    default void setTerminalTextBlock(TextBlock textBlock) {

        throw new UnsupportedOperationException();
    }

    /**
     * Each AtomicTextBlock has an index on its page, this returns the number of the first AtomicTextBlock underneath this node.
     * If this node does not have any AtomicTexBlocks underneath it, e.g. an empty TableCell. It returns -1.
     *
     * @return Integer representing the number on the page
     */
    default Integer getNumberOnPage() {

        TextBlock textBlock = buildTextBlock();
        if (textBlock.getAtomicTextBlocks().size() > 0) {
            return buildTextBlock().getAtomicTextBlocks().get(0).getNumberOnPage();
        } else {
            return -1;
        }
    }

    /**
     * @return true, if this node's ClassificationTextBlock is not empty
     */
    default boolean hasText() {

        return buildTextBlock().length() > 0;
    }

    /**
     * @param string A String which the ClassificationTextBlock might contain
     * @return true, if this node's ClassificationTextBlock contains the string
     */
    default boolean containsString(String string) {

        return buildTextBlock().getSearchText().contains(string);
    }

    /**
     * @param strings A List of Strings which the ClassificationTextBlock might contain
     * @return true, if this node's ClassificationTextBlock contains any of the strings
     */
    default boolean containsAnyString(List<String> strings) {

        return strings.stream().anyMatch(this::containsString);
    }

    /**
     * This function is used during insertion of EntityNodes into the graph, it checks if the boundary of the EntityNode intersects or even contains the EntityNode.
     * It sets the fields accordingly and recursively calls this function on all its children.
     *
     * @param entityNode EntityNode, which is being inserted into the graph
     */
    default void addThisToEntityIfIntersects(EntityNode entityNode) {

        TextBlock textBlock = buildTextBlock();
        if (textBlock.getBoundary().intersects(entityNode.getBoundary())) {

            if (textBlock.containsBoundary(entityNode.getBoundary())) {
                entityNode.setDeepestFullyContainingNode(this);
            }

            entityNode.addIntersectingNode(this);
            streamChildren().forEach(node -> node.addThisToEntityIfIntersects(entityNode));
        }
    }

    /**
     * Streams all children located directly underneath this node in the TableOfContents.
     *
     * @return Stream of all children
     */
    default Stream<SemanticNode> streamChildren() {

        return getTableOfContents().streamChildrenNodes(getTocId());
    }

    /**
     * recursively streams all SemanticNodes located underneath this node in the TableOfContents in order.
     *
     * @return Stream of all SubNodes
     */
    default Stream<SemanticNode> streamAllSubNodes() {

        return getTableOfContents().streamAllSubEntriesInOrder(getTocId()).map(TableOfContents.Entry::getNode);
    }

    /**
     * @return Boundary of this Node's ClassificationTextBlock
     */
    default Boundary getBoundary() {

        return buildTextBlock().getBoundary();
    }

    /**
     * If this Node is Terminal it will calculate the boundingBox of its TerminalTextBlock, otherwise it will calculate the Union of the BoundingBoxes of all its Children.
     * If called on the ClassificationDocument, it will return the cropbox of each page
     *
     * @return Rectangle2D fully encapsulating this Node for each page.
     */
    default Map<PageNode, Rectangle2D> getBBox() {

        Map<PageNode, Rectangle2D> bBoxPerPage = new HashMap<>();
        if (isTerminal()) {
            return getBBoxFromTerminalTextBlock(bBoxPerPage);
        }

        return getBBoxFromChildren(bBoxPerPage);
    }

    /**
     * TODO this does not yet work for sections spanning multiple columns.
     *
     * @param bBoxPerPage initial empty BoundingBox
     * @return The union of the BoundingBoxes of all children
     */
    private Map<PageNode, Rectangle2D> getBBoxFromChildren(Map<PageNode, Rectangle2D> bBoxPerPage) {

        return streamChildren().map(SemanticNode::getBBox).reduce((map1, map2) -> {
            map1.forEach((page, rectangle) -> map2.merge(page, rectangle, (rect1, rect2) -> rect1.createUnion(rect2).getBounds2D()));
            return map2;
        }).orElse(bBoxPerPage);
    }

    /**
     * @param bBoxPerPage initial empty BoundingBox
     * @return The union of all BoundingBoxes of the ClassificationTextBlock of this node
     */
    private Map<PageNode, Rectangle2D> getBBoxFromTerminalTextBlock(Map<PageNode, Rectangle2D> bBoxPerPage) {

        Map<PageNode, List<AtomicTextBlock>> atomicTextBlockPerPage = buildTextBlock().getAtomicTextBlocks().stream().collect(Collectors.groupingBy(AtomicTextBlock::getPage));
        atomicTextBlockPerPage.forEach((page, atbs) -> bBoxPerPage.put(page, RectangleTransformations.bBoxUnionAtomicTextBlock(atbs)));
        return bBoxPerPage;
    }

}
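The Javadoc in the removed SemanticNode interface describes the terminal/non-terminal split: terminal nodes hold a text block directly, non-terminal nodes aggregate the terminal blocks underneath them. A hedged sketch of that aggregation pattern, mirroring the calls used by SectionNode and TableNode above and not part of the commit:

    // Sketch only: collect the text of a whole subtree, as the removed node classes do.
    static TextBlock collectSubtreeText(SemanticNode node) {
        return node.streamAllSubNodes()
                   .filter(SemanticNode::isTerminal)
                   .map(SemanticNode::getTerminalTextBlock)
                   .collect(new TextBlockCollector());
    }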
@@ -1,92 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;

import java.awt.geom.Rectangle2D;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Stream;

import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlockCollector;

import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults;

@Data
@Builder
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class TableCellNode implements SemanticNode {

    List<Integer> tocId;
    int row;
    int col;
    boolean header;

    Rectangle2D bBox;

    @Builder.Default
    boolean terminal = true;
    TextBlock terminalTextBlock;

    TextBlock textBlock;

    @EqualsAndHashCode.Exclude
    TableOfContents tableOfContents;

    @Builder.Default
    @EqualsAndHashCode.Exclude
    Set<EntityNode> entities = new HashSet<>();

    @Override
    public Map<PageNode, Rectangle2D> getBBox() {

        Map<PageNode, Rectangle2D> bBoxPerPage = new HashMap<>();
        getPages().forEach(page -> bBoxPerPage.put(page, bBox));
        return bBoxPerPage;
    }

    @Override
    public TextBlock buildTextBlock() {

        if (terminal) {
            return terminalTextBlock;
        }

        if (textBlock == null) {
            textBlock = streamAllSubNodes().filter(SemanticNode::isTerminal).map(SemanticNode::getTerminalTextBlock).collect(new TextBlockCollector());
        }
        return textBlock;
    }

    @Override
    public String toString() {

        return tocId + ": " + NodeType.TABLE_CELL + ": " + buildTextBlock().buildSummary();
    }

    public boolean hasHeader(String headerString) {

        return getHeaders().anyMatch(header -> header.buildTextBlock().getSearchText().strip().equals(headerString));
    }

    private Stream<TableCellNode> getHeaders() {

        TableNode tableNode = (TableNode) getParent();
        return tableNode.streamHeadersForCell(row, col);
    }

}
@@ -1,73 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;

import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.stream.Stream;

import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlockCollector;

import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults;

@Data
@Builder
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class TableNode implements SemanticNode {

    List<Integer> tocId;
    TableOfContents tableOfContents;

    Integer numberOfRows;
    Integer numberOfCols;

    TextBlock textBlock;

    @Builder.Default
    @EqualsAndHashCode.Exclude
    Set<EntityNode> entities = new HashSet<>();

    public Stream<TableCellNode> streamTableCells() {

        return streamChildren().map(node -> (TableCellNode) node);
    }

    public Stream<TableCellNode> streamHeaders() {

        return streamTableCells().filter(TableCellNode::isHeader);
    }

    public Stream<TableCellNode> streamHeadersForCell(int row, int col) {

        return streamHeaders().filter(cell -> cell.getRow() == row || cell.getCol() == col);
    }

    @Override
    public TextBlock buildTextBlock() {

        if (textBlock == null) {
            textBlock = streamAllSubNodes().filter(SemanticNode::isTerminal).map(SemanticNode::getTerminalTextBlock).collect(new TextBlockCollector());
        }
        return textBlock;
    }

    @Override
    public String toString() {

        return tocId.toString() + ": " + NodeType.TABLE + ": " + buildTextBlock().buildSummary();
    }

}
@@ -1,131 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock;

import java.awt.geom.Rectangle2D;
import java.util.List;

import com.knecon.fforesight.service.layoutparser.internal.api.graph.Boundary;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityPosition;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SemanticNode;
import com.knecon.fforesight.service.layoutparser.internal.api.services.RectangleTransformations;

import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults;

@Data
@Builder
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class AtomicTextBlock implements TextBlock {

    Long id;
    Integer numberOnPage;
    PageNode page;

    //string coordinates
    Boundary boundary;
    String searchText;
    List<Integer> lineBreaks;

    //position coordinates
    List<Integer> stringIdxToPositionIdx;
    List<Rectangle2D> positions;

    @EqualsAndHashCode.Exclude
    SemanticNode parent;

    @Override
    public int numberOfLines() {

        return lineBreaks.size() + 1;
    }

    public CharSequence getLine(int lineNumber) {

        if (lineNumber >= numberOfLines() || lineNumber < 0) {
            throw new IndexOutOfBoundsException(String.format("line %d out of range for AtomicTextBlock with %d lines", lineNumber, numberOfLines()));
        }
        if (lineNumber == 0) {
            return subSequence(boundary.start(), lineBreaks.get(0) + boundary.start());
        } else if (lineNumber == numberOfLines() - 1) {
            return subSequence(lineBreaks.get(lineBreaks.size() - 1) + boundary.start(), boundary.end());
        }
        return subSequence(lineBreaks.get(lineNumber - 1) + boundary.start(), lineBreaks.get(lineNumber) + boundary.start());
    }

    @Override
    public List<AtomicTextBlock> getAtomicTextBlocks() {

        return List.of(this);
    }

    @Override
    public int getNextLinebreak(int fromIndex) {

        return lineBreaks.stream()//
                         .filter(linebreak -> linebreak > fromIndex - boundary.start()) //
                         .findFirst() //
                         .orElse(searchText.length()) + boundary.start();
    }

    @Override
    public int getPreviousLinebreak(int fromIndex) {

        return lineBreaks.stream()//
                         .filter(linebreak -> linebreak <= fromIndex - boundary.start())//
                         .reduce((a, b) -> b)//
                         .orElse(0) + boundary.start();
    }

    @Override
    public Rectangle2D getPosition(int stringIdx) {

        return positions.get(stringIdxToPositionIdx.get(stringIdx - boundary.start()));
    }

    @Override
    public List<Rectangle2D> getPositions(Boundary stringBoundary) {

        if (!containsBoundary(stringBoundary)) {
            throw new IndexOutOfBoundsException(String.format("%s is out of bounds for %s", stringBoundary, this.boundary));
        }

        if (stringBoundary.end() == this.boundary.end()) {
            return positions.subList(stringIdxToPositionIdx.get(stringBoundary.start() - this.boundary.start()), positions.size());
        }

        return positions.subList(stringIdxToPositionIdx.get(stringBoundary.start() - this.boundary.start()),
                stringIdxToPositionIdx.get(stringBoundary.end() - this.boundary.start()));
    }

    public List<EntityPosition> getEntityPositionsPerPage(Boundary stringBoundary) {

        List<Rectangle2D> positionsPerLine = stringBoundary.split(getLineBreaks().stream().map(lb -> lb + boundary.start()).filter(stringBoundary::contains).toList())
                .stream()
                .map(this::getPositions)
                .map(RectangleTransformations::rectangleUnion)
                .toList();

        return List.of(EntityPosition.builder().rectanglePerLine(positionsPerLine).pageNode(page).build());
    }

    @Override
    public String toString() {

        return searchText;
    }

}
@ -1,229 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.mapper;
|
||||
|
||||
import static com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.NodeType.FOOTER;
|
||||
import static com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.NodeType.HEADER;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashSet;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.NoSuchElementException;
|
||||
|
||||
import com.google.common.primitives.Ints;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.AtomicPositionBlockData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.AtomicTextBlockData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.DocumentData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.PageData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.TableOfContentsData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.Boundary;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.DocumentGraph;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.FooterNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.HeaderNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.HeadlineNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.ImageNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.ParagraphNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SectionNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SemanticNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.TableCellNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.TableNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.AtomicTextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlockCollector;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class DocumentGraphMapper {
|
||||
|
||||
public DocumentGraph toDocumentGraph(DocumentData documentData) {
|
||||
|
||||
|
||||
DocumentGraph documentGraph = new DocumentGraph();
|
||||
Context context = new Context(documentData,
|
||||
new TableOfContents(documentGraph),
|
||||
new LinkedList<>(),
|
||||
new LinkedList<>(),
|
||||
Arrays.stream(documentData.getAtomicTextBlocks()).toList(),
|
||||
Arrays.stream(documentData.getAtomicPositionBlocks()).toList());
context.pages.addAll(Arrays.stream(documentData.getPages()).map(DocumentGraphMapper::buildPage).toList());
context.tableOfContents.getRoot().getChildren().addAll(buildEntries(documentData.getTableOfContents().getRoot().getSubEntries(), context));

documentGraph.setTableOfContents(context.tableOfContents);
documentGraph.setPages(new HashSet<>(context.pages));
documentGraph.setNumberOfPages(documentData.getPages().length);
documentGraph.setTextBlock(documentGraph.buildTextBlock());
return documentGraph;
}

private List<TableOfContents.Entry> buildEntries(List<TableOfContentsData.EntryData> entries,
Context context) {

List<TableOfContents.Entry> newEntries = new LinkedList<>();
for (TableOfContentsData.EntryData entryData : entries) {

boolean terminal = isTerminal(entryData);
List<PageNode> pages = Arrays.stream(entryData.getPages()).map(pageNumber -> getPage(pageNumber, context)).toList();

SemanticNode node = switch (entryData.getType()) {
case SECTION -> buildSection(context);
case PARAGRAPH -> buildParagraph(context, terminal);
case HEADLINE -> buildHeadline(context, terminal);
case HEADER -> buildHeader(context, terminal);
case FOOTER -> buildFooter(context, terminal);
case TABLE -> buildTable(context, entryData.getProperties());
case TABLE_CELL -> buildTableCell(context, entryData.getProperties(), terminal);
case IMAGE -> buildImage(context, entryData.getProperties());
default -> throw new UnsupportedOperationException("Not yet implemented for type " + entryData.getType());
};

if (node.isTerminal()) {
TextBlock textBlock = toTextBlock(entryData.getAtomicBlocks(), context, node);
node.setTerminalTextBlock(textBlock);
}
List<Integer> tocId = Arrays.stream(entryData.getTocId()).boxed().toList();
node.setTocId(tocId);

if (entryData.getType() == HEADER) {
pages.forEach(page -> page.setHeader((HeaderNode) node));
} else if (entryData.getType() == FOOTER) {
pages.forEach(page -> page.setFooter((FooterNode) node));
} else {
pages.forEach(page -> page.getMainBody().add(node));
}
newEntries.add(TableOfContents.Entry.builder().tocId(tocId).type(entryData.getType()).children(buildEntries(entryData.getSubEntries(), context)).node(node).build());
}
return newEntries;
}

private HeadlineNode buildHeadline(Context context, boolean terminal) {

return HeadlineNode.builder().terminal(terminal).tableOfContents(context.tableOfContents()).build();
}

private static boolean isTerminal(TableOfContentsData.EntryData entryData) {

return entryData.getAtomicBlocks().length > 0;
}

private ImageNode buildImage(Context context, Map<String, String> properties) {

var builder = ImageNode.builder();
PropertiesMapper.parseImageProperties(properties, builder);
return builder.tableOfContents(context.tableOfContents()).build();
}

private TableCellNode buildTableCell(Context context, Map<String, String> properties, boolean terminal) {

TableCellNode.TableCellNodeBuilder builder = TableCellNode.builder();
PropertiesMapper.parseTableCellProperties(properties, builder);
return builder.terminal(terminal).tableOfContents(context.tableOfContents()).build();
}

private TableNode buildTable(Context context, Map<String, String> properties) {

TableNode.TableNodeBuilder builder = TableNode.builder();
PropertiesMapper.parseTableProperties(properties, builder);
return builder.tableOfContents(context.tableOfContents()).build();
}

private FooterNode buildFooter(Context context, boolean terminal) {

return FooterNode.builder().terminal(terminal).tableOfContents(context.tableOfContents()).build();
}

private HeaderNode buildHeader(Context context, boolean terminal) {

return HeaderNode.builder().terminal(terminal).tableOfContents(context.tableOfContents()).build();
}

private SectionNode buildSection(Context context) {

return SectionNode.builder().tableOfContents(context.tableOfContents()).build();
}

private ParagraphNode buildParagraph(Context context, boolean terminal) {

return ParagraphNode.builder().terminal(terminal).tableOfContents(context.tableOfContents()).build();
}

private TextBlock toTextBlock(Long[] atomicTextBlockIds, Context context, SemanticNode parent) {

return Arrays.stream(atomicTextBlockIds)
.map(atomicTextBlockId -> toAtomicTextBlock(context.atomicTextBlockData.get(Math.toIntExact(atomicTextBlockId)),
context.atomicPositionBlockData.get(Math.toIntExact(atomicTextBlockId)),
parent,
context))
.collect(new TextBlockCollector());
}

private PageNode buildPage(PageData p) {

return PageNode.builder().rotation(p.getRotation()).height(p.getHeight()).width(p.getWidth()).number(p.getNumber()).mainBody(new LinkedList<>()).build();
}

private AtomicTextBlock toAtomicTextBlock(AtomicTextBlockData atomicTextBlockData,
AtomicPositionBlockData atomicPositionBlockData,
SemanticNode parent,
Context context) {

return AtomicTextBlock.builder()
.id(atomicTextBlockData.getId())
.numberOnPage(atomicTextBlockData.getNumberOnPage())
.page(getPage(atomicTextBlockData.getPage(), context))
.boundary(new Boundary(atomicTextBlockData.getStart(), atomicTextBlockData.getEnd()))
.searchText(atomicTextBlockData.getSearchText())
.lineBreaks(Ints.asList(atomicTextBlockData.getLineBreaks()))
.stringIdxToPositionIdx(Ints.asList(atomicPositionBlockData.getStringIdxToPositionIdx()))
.positions(toRectangle2DList(atomicPositionBlockData.getPositions()))
.parent(parent)
.build();
}

private static List<Rectangle2D> toRectangle2DList(float[][] positions) {

return Arrays.stream(positions).map(floatArr -> (Rectangle2D) new Rectangle2D.Float(floatArr[0], floatArr[1], floatArr[2], floatArr[3])).toList();
}

private PageNode getPage(Long pageIndex, Context context) {

return context.pages.stream()
.filter(page -> page.getNumber() == Math.toIntExact(pageIndex))
.findFirst()
.orElseThrow(() -> new NoSuchElementException(String.format("ClassificationPage with number %d not found", pageIndex)));
}

record Context(
DocumentData layoutParsingModel,
TableOfContents tableOfContents,
List<PageNode> pages,
List<SectionNode> sections,
List<AtomicTextBlockData> atomicTextBlockData,
List<AtomicPositionBlockData> atomicPositionBlockData) {
}

}
@ -1,101 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.mapper;

import java.awt.geom.Rectangle2D;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.ImageNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.ImageType;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.TableCellNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.TableNode;
import com.knecon.fforesight.service.layoutparser.internal.api.services.RectangleTransformations;

public class PropertiesMapper {

public static Map<String, String> buildImageProperties(ImageNode image) {

Map<String, String> properties = new HashMap<>();
properties.put("imageType", image.getImageType().toString());
properties.put("transparency", String.valueOf(image.isTransparency()));
properties.put("position", RectangleTransformations.toString(image.getPosition()));
return properties;
}

public static Map<String, String> buildTableCellProperties(TableCellNode tableCell) {

Map<String, String> properties = new HashMap<>();
properties.put("row", String.valueOf(tableCell.getRow()));
properties.put("col", String.valueOf(tableCell.getCol()));
properties.put("header", String.valueOf(tableCell.isHeader()));

if (tableCell.getPages().size() > 1 || tableCell.getBBox().keySet().size() > 1) {
throw new IllegalArgumentException("TableCell can only occur on a single page!");
}
String bBoxString = RectangleTransformations.toString(tableCell.getBBox().get(tableCell.getPages().stream().findFirst().get()));
properties.put("bBox", bBoxString);

return properties;
}

public static Map<String, String> buildTableProperties(TableNode table) {

Map<String, String> properties = new HashMap<>();
properties.put("numberOfRows", String.valueOf(table.getNumberOfRows()));
properties.put("numberOfCols", String.valueOf(table.getNumberOfCols()));
return properties;
}

public static void parseImageProperties(Map<String, String> properties, ImageNode.ImageNodeBuilder builder) {

builder.imageType(parseImageType(properties.get("imageType")));
builder.transparency(Boolean.parseBoolean(properties.get("transparency")));
builder.position(parseRectangle2D(properties.get("position")));
}

public static void parseTableCellProperties(Map<String, String> properties, TableCellNode.TableCellNodeBuilder builder) {

builder.row(Integer.parseInt(properties.get("row")));
builder.col(Integer.parseInt(properties.get("col")));
builder.header(Boolean.parseBoolean(properties.get("header")));
builder.bBox(parseRectangle2D(properties.get("bBox")));
}

public static void parseTableProperties(Map<String, String> properties, TableNode.TableNodeBuilder builder) {

builder.numberOfRows(Integer.parseInt(properties.get("numberOfRows")));
builder.numberOfCols(Integer.parseInt(properties.get("numberOfCols")));
}

private static ImageType parseImageType(String imageType) {

return switch (imageType) {
case "LOGO" -> ImageType.LOGO;
case "FORMULA" -> ImageType.FORMULA;
case "SIGNATURE" -> ImageType.SIGNATURE;
case "OCR" -> ImageType.OCR;
default -> ImageType.OTHER;
};
}

public static String toString(Rectangle2D rectangle2D) {

return String.format("%f,%f,%f,%f", rectangle2D.getX(), rectangle2D.getY(), rectangle2D.getWidth(), rectangle2D.getHeight());
}

public static Rectangle2D parseRectangle2D(String bBox) {

List<Float> floats = Arrays.stream(bBox.split(",")).map(Float::parseFloat).toList();
return new Rectangle2D.Float(floats.get(0), floats.get(1), floats.get(2), floats.get(3));
}

}
@ -1,10 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.services;

import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;

public interface EntityEnrichmentService {

void enrichEntity(EntityNode entity, TextBlock textBlock);

}
@ -1,56 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.services;

import java.util.Collections;
import java.util.NoSuchElementException;
import java.util.Set;

import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SemanticNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;

import lombok.RequiredArgsConstructor;

@RequiredArgsConstructor
public class EntityInsertionService {

private final EntityEnrichmentService entityEnrichmentService;

public void addEntityToGraph(EntityNode entity, TableOfContents tableOfContents) {

try {
SemanticNode containingNode = tableOfContents.streamChildrenNodes(Collections.emptyList())
.filter(node -> node.buildTextBlock().containsBoundary(entity.getBoundary()))
.findFirst()
.orElseThrow(() -> new NoSuchElementException("No containing Node found!"));

containingNode.addThisToEntityIfIntersects(entity);

TextBlock textBlock = entity.getDeepestFullyContainingNode().buildTextBlock();
entityEnrichmentService.enrichEntity(entity, textBlock);

addToPages(entity);
addToNodeEntitySets(entity);

} catch (NoSuchElementException e) {
entity.removeFromGraph();
}
}

private void addToPages(EntityNode entity) {

Set<PageNode> pages = entity.getDeepestFullyContainingNode().getPages();
entity.getPages().addAll(pages);
pages.forEach(page -> page.getEntities().add(entity));
}

private void addToNodeEntitySets(EntityNode entity) {

entity.getIntersectingNodes().forEach(node -> node.getEntities().add(entity));
}

}
@ -77,6 +77,10 @@
<artifactId>spring-boot-starter-amqp</artifactId>
<version>${spring.version}</version>
</dependency>
<dependency>
<groupId>org.junit.platform</groupId>
<artifactId>junit-platform-commons</artifactId>
</dependency>
</dependencies>

<repositories>

@ -7,19 +7,19 @@ import java.io.IOException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.springframework.stereotype.Service;

import com.knecon.fforesight.service.layoutparser.internal.api.graph.DocumentGraph;
import com.knecon.fforesight.service.layoutparser.internal.api.mapper.DocumentDataMapper;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
import com.knecon.fforesight.service.layoutparser.processor.adapter.CvTableParsingAdapter;
import com.knecon.fforesight.service.layoutparser.processor.adapter.ImageServiceResponseAdapter;
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.image.ImageServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.classification.service.ClassificationService;
import com.knecon.fforesight.service.layoutparser.processor.classification.service.PdfParsingService;
import com.knecon.fforesight.service.layoutparser.processor.classification.service.SectionsBuilderService;
import com.knecon.fforesight.service.layoutparser.processor.factory.DocumentGraphFactory;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.mapper.DocumentDataMapper;

import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
@ -35,7 +35,6 @@ public class LayoutParsingService {
private final PdfParsingService pdfParsingService;
private final ClassificationService classificationService;
private final SectionsBuilderService sectionsBuilderService;
private final DocumentGraphFactory documentGraphFactory;

public LayoutParsingFinishedEvent parseLayoutAndSaveFilesToStorage(LayoutParsingRequest layoutParsingRequest) throws IOException {
@ -53,7 +52,7 @@ public class LayoutParsingService {
tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.pageFileStorageId());
}

DocumentGraph documentGraph = parseLayout(originDocument, imageServiceResponse, tableServiceResponse);
Document documentGraph = parseLayout(originDocument, imageServiceResponse, tableServiceResponse);
int numberOfPages = originDocument.getNumberOfPages();
originDocument.close();

@ -72,7 +71,7 @@
}

public DocumentGraph parseLayout(PDDocument originDocument, ImageServiceResponse imageServiceResponse, TableServiceResponse tableServiceResponse) {
public Document parseLayout(PDDocument originDocument, ImageServiceResponse imageServiceResponse, TableServiceResponse tableServiceResponse) {

ClassificationDocument classificationDocument = pdfParsingService.parseDocument(originDocument,
cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse),
@ -82,7 +81,7 @@ public class LayoutParsingService {

sectionsBuilderService.buildSections(classificationDocument);

return documentGraphFactory.buildDocumentGraph(classificationDocument);
return DocumentGraphFactory.buildDocumentGraph(classificationDocument);
}

}

@ -17,12 +17,12 @@ import com.iqser.red.storage.commons.service.StorageService;
import com.knecon.fforesight.service.layoutparser.internal.api.data.AtomicPositionBlockData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.AtomicTextBlockData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.DocumentData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.DocumentTreeData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.PageData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.TableOfContentsData;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.image.ImageServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.multitenancy.TenantContext;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;

import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
@ -69,7 +69,7 @@ public class LayoutParsingStorageService {

public void storeDocumentData(LayoutParsingRequest layoutParsingRequest, DocumentData documentData) throws IOException {

storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.structureFileStorageId(), documentData.getTableOfContents());
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.structureFileStorageId(), documentData.getDocumentTreeData());
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.textBlockFileStorageId(), documentData.getAtomicTextBlocks());
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.positionBlockFileStorageId(), documentData.getAtomicPositionBlocks());
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.pageFileStorageId(), documentData.getPages());
@ -86,12 +86,12 @@ public class LayoutParsingStorageService {
AtomicPositionBlockData[] atomicPositionBlockData = storageService.readJSONObject(TenantContext.getTenantId(),
layoutParsingRequest.positionBlockFileStorageId(),
AtomicPositionBlockData[].class);
TableOfContentsData tableOfContentsData = storageService.readJSONObject(TenantContext.getTenantId(),
DocumentTreeData tableOfContentsData = storageService.readJSONObject(TenantContext.getTenantId(),
layoutParsingRequest.structureFileStorageId(),
TableOfContentsData.class);
DocumentTreeData.class);

return DocumentData.builder()
.tableOfContents(tableOfContentsData)
.documentTreeData(tableOfContentsData)
.atomicPositionBlocks(atomicPositionBlockData)
.atomicTextBlocks(atomicTextBlockData)
.pages(pageData)

@ -8,7 +8,7 @@ import java.util.Map;

import org.springframework.stereotype.Service;

import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.CvParsedTableCell;
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableCells;
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableServiceResponse;

import lombok.RequiredArgsConstructor;
@ -19,9 +19,9 @@ import lombok.extern.slf4j.Slf4j;
@RequiredArgsConstructor
public class CvTableParsingAdapter {

public Map<Integer, List<com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.CvParsedTableCell>> buildCvParsedTablesPerPage(TableServiceResponse tableServiceResponse) {
public Map<Integer, List<TableCells>> buildCvParsedTablesPerPage(TableServiceResponse tableServiceResponse) {

Map<Integer, List<com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.CvParsedTableCell>> tableCells = new HashMap<>();
Map<Integer, List<TableCells>> tableCells = new HashMap<>();
tableServiceResponse.getData()
.forEach(tableData -> tableCells.computeIfAbsent(tableData.getPageInfo().getNumber(), tableCell -> new ArrayList<>())
.addAll(convertTableCells(tableData.getTableCells())));
@ -30,11 +30,11 @@ public class CvTableParsingAdapter {
}

private Collection<? extends com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.CvParsedTableCell> convertTableCells(List<CvParsedTableCell> tableCells) {
private Collection<TableCells> convertTableCells(List<TableCells> tableCells) {

List<com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.CvParsedTableCell> cvParsedTableCells = new ArrayList<>();
List<TableCells> cvParsedTableCells = new ArrayList<>();

tableCells.forEach(t -> cvParsedTableCells.add(com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.CvParsedTableCell.builder()
tableCells.forEach(t -> cvParsedTableCells.add(TableCells.builder()
.y0(t.getY0())
.x1(t.getX1())
.y1(t.getY1())

@ -9,10 +9,10 @@ import java.util.Map;

import org.springframework.stereotype.Service;

import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.ImageType;
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.image.ImageServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.ImageType;

import lombok.RequiredArgsConstructor;

@ -3,12 +3,9 @@ package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image
import java.util.HashMap;
import java.util.Map;

import com.dslplatform.json.CompiledJson;

import lombok.Data;

@Data
@CompiledJson
public class Classification {

private Map<String, Float> probabilities = new HashMap<>();

@ -1,14 +1,11 @@
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image;

import com.dslplatform.json.CompiledJson;

import lombok.Data;

@Data
@CompiledJson
public class FilterGeometry {

private ImageSize imageSize;
private Format imageFormat;
private ImageFormat imageFormat;

}

@ -1,11 +1,8 @@
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image;

import com.dslplatform.json.CompiledJson;

import lombok.Data;

@Data
@CompiledJson
public class Filters {

private FilterGeometry geometry;

@ -1,11 +1,8 @@
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image;

import com.dslplatform.json.CompiledJson;

import lombok.Data;

@Data
@CompiledJson
public class Geometry {

private float width;

@ -1,12 +1,9 @@
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image;

import com.dslplatform.json.CompiledJson;

import lombok.Data;

@Data
@CompiledJson
public class Format {
public class ImageFormat {

private float quotient;
private boolean tooTall;
@ -1,12 +1,9 @@
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image;

import com.dslplatform.json.CompiledJson;

import lombok.Data;

@Data
@CompiledJson
public class Metadata {
public class ImageMetadata {

private Classification classification;
private Position position;
@ -3,15 +3,12 @@ package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image
import java.util.ArrayList;
import java.util.List;

import com.dslplatform.json.CompiledJson;
import com.dslplatform.json.JsonAttribute;
import com.fasterxml.jackson.annotation.JsonAlias;
import com.fasterxml.jackson.annotation.JsonProperty;

import lombok.Data;

@Data
@CompiledJson
public class ImageServiceResponse {

private String dossierId;
@ -19,15 +16,13 @@ public class ImageServiceResponse {

@JsonProperty(value = "imageMetadata")
@JsonAlias("data")
@JsonAttribute(alternativeNames = {"imageMetadata"})
private List<Metadata> data = new ArrayList<>();
private List<ImageMetadata> data = new ArrayList<>();

private List<Metadata> dataCV = new ArrayList<>();
private List<ImageMetadata> dataCV = new ArrayList<>();

@JsonProperty(value = "imageMetadata")
@JsonAlias("data")
@JsonAttribute(alternativeNames = {"imageMetadata"})
public void setData(List<Metadata> data) {this.data = data;}
public void setData(List<ImageMetadata> data) {this.data = data;}

}

@ -1,11 +1,8 @@
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image;

import com.dslplatform.json.CompiledJson;

import lombok.Data;

@Data
@CompiledJson
public class ImageSize {

private float quotient;

@ -1,11 +1,8 @@
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image;

import com.dslplatform.json.CompiledJson;

import lombok.Data;

@Data
@CompiledJson
public class Position {

private float x1;

@ -1,11 +1,8 @@
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image;

import com.dslplatform.json.CompiledJson;

import lombok.Data;

@Data
@CompiledJson
public class Probability {

private boolean unconfident;

@ -1,17 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.table;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import com.dslplatform.json.CompiledJson;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
@CompiledJson
|
||||
public class CvParsedTableModel {
|
||||
|
||||
private CvParsedPageInfo pageInfo;
|
||||
private List<CvParsedTableCell> tableCells = new ArrayList<>();
|
||||
|
||||
}
|
||||
@ -1,12 +1,9 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.table;
|
||||
|
||||
import com.dslplatform.json.CompiledJson;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
@CompiledJson
|
||||
public class CvParsedPageInfo {
|
||||
public class PageInfo {
|
||||
|
||||
private int number;
|
||||
private int rotation;
|
||||
@ -1,4 +1,4 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.dto.table;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.table;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
@ -9,7 +9,7 @@ import lombok.RequiredArgsConstructor;
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@RequiredArgsConstructor
|
||||
public class CvParsedTableCell {
|
||||
public class PdfTableCell {
|
||||
|
||||
private float x0;
|
||||
private float y0;
|
||||
@ -1,12 +1,11 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.table;
|
||||
|
||||
import com.dslplatform.json.CompiledJson;
|
||||
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
@CompiledJson
|
||||
public class CvParsedTableCell {
|
||||
@Builder
|
||||
public class TableCells {
|
||||
|
||||
private float x0;
|
||||
private float y0;
|
||||
@ -0,0 +1,14 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.table;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
public class TableData {
|
||||
|
||||
private PageInfo pageInfo;
|
||||
private List<TableCells> tableCells = new ArrayList<>();
|
||||
|
||||
}
|
||||
@ -3,12 +3,9 @@ package com.knecon.fforesight.service.layoutparser.processor.adapter.model.table
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import com.dslplatform.json.CompiledJson;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
@CompiledJson
|
||||
public class TableServiceResponse {
|
||||
|
||||
private String dossierId;
|
||||
@ -17,6 +14,6 @@ public class TableServiceResponse {
|
||||
private String targetFileExtension;
|
||||
private String responseFileExtension;
|
||||
|
||||
private List<CvParsedTableModel> data = new ArrayList<>();
|
||||
private List<TableData> data = new ArrayList<>();
|
||||
|
||||
}
|
||||
|
||||
@ -1,71 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.dto;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
|
||||
import com.dslplatform.json.JsonAttribute;
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.TextBlockOrientation;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
@Data
|
||||
@AllArgsConstructor
|
||||
@NoArgsConstructor
|
||||
public abstract class AbstractTextContainer {
|
||||
|
||||
protected float minX;
|
||||
protected float maxX;
|
||||
protected float minY;
|
||||
protected float maxY;
|
||||
protected String classification;
|
||||
protected int page;
|
||||
|
||||
private TextBlockOrientation orientation = TextBlockOrientation.NONE;
|
||||
|
||||
|
||||
public abstract String getText();
|
||||
|
||||
|
||||
public boolean containsBlock(ClassificationTextBlock other) {
|
||||
|
||||
return this.minX <= other.getPdfMinX() && this.maxX >= other.getPdfMaxX() && this.minY >= other.getPdfMinY() && this.maxY <= other.getPdfMaxY();
|
||||
}
|
||||
|
||||
|
||||
public boolean contains(AbstractTextContainer other) {
|
||||
|
||||
return this.minX <= other.minX && this.maxX >= other.maxX && this.minY >= other.minY && this.maxY <= other.maxY;
|
||||
}
|
||||
|
||||
|
||||
public boolean contains(Rectangle2D other) {
|
||||
|
||||
return other.contains(minX, minY, getWidth(), getHeight());
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
@JsonAttribute(ignore = true)
|
||||
public float getHeight() {
|
||||
|
||||
return maxY - minY;
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
@JsonAttribute(ignore = true)
|
||||
public float getWidth() {
|
||||
|
||||
return maxX - minX;
|
||||
}
|
||||
|
||||
|
||||
public boolean intersectsY(AbstractTextContainer atc) {
|
||||
|
||||
return this.minY <= atc.getMaxY() && this.maxY >= atc.getMinY();
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,38 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.dto.table;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.Rectangle;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
@SuppressWarnings("serial")
|
||||
@Data
|
||||
@EqualsAndHashCode(callSuper = true)
|
||||
@NoArgsConstructor
|
||||
public class TableCell extends Rectangle {
|
||||
|
||||
private List<ClassificationTextBlock> textBlocks = new ArrayList<>();
|
||||
|
||||
private List<TableCell> headerCells = new ArrayList<>();
|
||||
|
||||
private boolean isHeaderCell;
|
||||
|
||||
|
||||
public TableCell(Point2D topLeft, Point2D bottomRight) {
|
||||
|
||||
super((float) topLeft.getY(), (float) topLeft.getX(), (float) (bottomRight.getX() - topLeft.getX()), (float) (bottomRight.getY() - topLeft.getY()));
|
||||
}
|
||||
|
||||
|
||||
public void addTextBlock(ClassificationTextBlock textBlock) {
|
||||
|
||||
textBlocks.add(textBlock);
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,80 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.model;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
@Data
|
||||
@AllArgsConstructor
|
||||
@NoArgsConstructor
|
||||
public abstract class AbstractPageBlock {
|
||||
|
||||
@JsonIgnore
|
||||
protected float minX;
|
||||
@JsonIgnore
|
||||
protected float maxX;
|
||||
@JsonIgnore
|
||||
protected float minY;
|
||||
@JsonIgnore
|
||||
protected float maxY;
|
||||
@JsonIgnore
|
||||
protected PageBlockType classification;
|
||||
@JsonIgnore
|
||||
protected int page;
|
||||
|
||||
@JsonIgnore
|
||||
private Orientation orientation = Orientation.NONE;
|
||||
|
||||
|
||||
public abstract String getText();
|
||||
|
||||
|
||||
public boolean isHeadline() {
|
||||
|
||||
return this instanceof TextPageBlock && this.getClassification() != null && this.getClassification().isHeadline();
|
||||
}
|
||||
|
||||
|
||||
public boolean containsBlock(TextPageBlock other) {
|
||||
|
||||
return this.minX <= other.getPdfMinX() && this.maxX >= other.getPdfMaxX() && this.minY >= other.getPdfMinY() && this.maxY <= other.getPdfMaxY();
|
||||
}
|
||||
|
||||
|
||||
public boolean contains(AbstractPageBlock other) {
|
||||
|
||||
return this.minX <= other.minX && this.maxX >= other.maxX && this.minY >= other.minY && this.maxY <= other.maxY;
|
||||
}
|
||||
|
||||
|
||||
public boolean contains(Rectangle other) {
|
||||
|
||||
return page == other.getPage() && this.minX <= other.getTopLeft().getX() && this.maxX >= other.getTopLeft().getX() + other.getWidth() && this.minY <= other.getTopLeft()
|
||||
.getY() && this.maxY >= other.getTopLeft().getY() + other.getHeight();
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public float getHeight() {
|
||||
|
||||
return maxY - minY;
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public float getWidth() {
|
||||
|
||||
return maxX - minX;
|
||||
}
|
||||
|
||||
|
||||
public boolean intersectsY(AbstractPageBlock atc) {
|
||||
|
||||
return this.minY <= atc.getMaxY() && this.maxY >= atc.getMinY();
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,10 +1,11 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.dto;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.model;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.StringFrequencyCounter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.UnclassifiedText;
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.section.SectionGrid;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.StringFrequencyCounter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.UnclassifiedText;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
@ -24,4 +25,7 @@ public class ClassificationDocument {
|
||||
private StringFrequencyCounter fontStyleCounter = new StringFrequencyCounter();
|
||||
private boolean headlines;
|
||||
|
||||
private SectionGrid sectionGrid = new SectionGrid();
|
||||
private long rulesVersion;
|
||||
|
||||
}
|
||||
@ -1,8 +1,8 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.dto;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.model;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
@ -11,6 +11,6 @@ import lombok.Data;
|
||||
@AllArgsConstructor
|
||||
public class ClassificationFooter {
|
||||
|
||||
private List<ClassificationTextBlock> textBlocks;
|
||||
private List<TextPageBlock> textBlocks;
|
||||
|
||||
}
|
||||
@ -1,8 +1,8 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.dto;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.model;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
@ -11,6 +11,6 @@ import lombok.Data;
|
||||
@AllArgsConstructor
|
||||
public class ClassificationHeader {
|
||||
|
||||
private List<ClassificationTextBlock> textBlocks;
|
||||
private List<TextPageBlock> textBlocks;
|
||||
|
||||
}
|
||||
@ -1,11 +1,11 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.dto;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.model;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.StringFrequencyCounter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.StringFrequencyCounter;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.NonNull;
|
||||
@ -16,7 +16,7 @@ import lombok.RequiredArgsConstructor;
|
||||
public class ClassificationPage {
|
||||
|
||||
@NonNull
|
||||
private List<AbstractTextContainer> textBlocks;
|
||||
private List<AbstractPageBlock> textBlocks;
|
||||
|
||||
private List<ClassifiedImage> images = new ArrayList<>();
|
||||
|
||||
@ -1,38 +1,32 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.dto;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.model;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.Table;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.TablePageBlock;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
@Data
|
||||
@NoArgsConstructor
|
||||
public class ClassificationSection implements Comparable {
|
||||
public class ClassificationSection {
|
||||
|
||||
private List<AbstractTextContainer> pageBlocks = new ArrayList<>();
|
||||
private List<AbstractPageBlock> pageBlocks = new ArrayList<>();
|
||||
private List<ClassifiedImage> images = new ArrayList<>();
|
||||
private String headline;
|
||||
|
||||
|
||||
public List<Table> getTables() {
|
||||
public List<TablePageBlock> getTables() {
|
||||
|
||||
List<Table> tables = new ArrayList<>();
|
||||
List<TablePageBlock> tables = new ArrayList<>();
|
||||
pageBlocks.forEach(block -> {
|
||||
if (block instanceof Table) {
|
||||
tables.add((Table) block);
|
||||
if (block instanceof TablePageBlock) {
|
||||
tables.add((TablePageBlock) block);
|
||||
}
|
||||
});
|
||||
return tables;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public int compareTo(Object o) {
|
||||
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
@ -1,4 +1,4 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.dto;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.model;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
@ -9,9 +9,9 @@ import java.util.stream.Collectors;
|
||||
|
||||
import lombok.Getter;
|
||||
|
||||
@Getter
|
||||
public class FloatFrequencyCounter {
|
||||
|
||||
@Getter
|
||||
Map<Float, Integer> countPerValue = new HashMap<>();
|
||||
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.dto.text;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.model;
|
||||
|
||||
public enum TextBlockOrientation {
|
||||
public enum Orientation {
|
||||
|
||||
NONE,
|
||||
LEFT,
|
||||
@ -0,0 +1,38 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.model;
|
||||
|
||||
public enum PageBlockType {
|
||||
H1,
|
||||
H2,
|
||||
H3,
|
||||
H4,
|
||||
H5,
|
||||
H6,
|
||||
HEADER,
|
||||
FOOTER,
|
||||
TITLE,
|
||||
PARAGRAPH,
|
||||
PARAGRAPH_BOLD,
|
||||
PARAGRAPH_ITALIC,
|
||||
PARAGRAPH_UNKNOWN,
|
||||
OTHER,
|
||||
TABLE;
|
||||
|
||||
|
||||
public static PageBlockType getHeadlineType(int i) {
|
||||
|
||||
return switch (i) {
|
||||
case 1 -> PageBlockType.H1;
|
||||
case 2 -> PageBlockType.H2;
|
||||
case 3 -> PageBlockType.H3;
|
||||
case 4 -> PageBlockType.H4;
|
||||
case 5 -> PageBlockType.H5;
|
||||
default -> PageBlockType.H6;
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
public boolean isHeadline() {
|
||||
|
||||
return this.equals(H1) || this.equals(H2) || this.equals(H3) || this.equals(H4) || this.equals(H5) || this.equals(H6);
|
||||
}
|
||||
}
|
||||
@ -1,8 +1,8 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.dto.image;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.model.image;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.ImageType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.ImageType;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.NonNull;
|
||||
@ -0,0 +1,79 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.model.table;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.TextNormalizationUtilities;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
@SuppressWarnings("serial")
|
||||
@Data
|
||||
@EqualsAndHashCode(callSuper = true)
|
||||
@NoArgsConstructor
|
||||
public class Cell extends Rectangle {
|
||||
|
||||
private List<TextPageBlock> textBlocks = new ArrayList<>();
|
||||
|
||||
private List<Cell> headerCells = new ArrayList<>();
|
||||
|
||||
private boolean isHeaderCell;
|
||||
|
||||
private static final int MIN_SIZE = 1;
|
||||
|
||||
private int pageNumber;
|
||||
|
||||
|
||||
public Cell(Point2D topLeft, Point2D bottomRight) {
|
||||
|
||||
super((float) topLeft.getY(), (float) topLeft.getX(), (float) (bottomRight.getX() - topLeft.getX()), (float) (bottomRight.getY() - topLeft.getY()));
|
||||
}
|
||||
|
||||
|
||||
public void addTextBlock(TextPageBlock textBlock) {
|
||||
|
||||
textBlocks.add(textBlock);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
|
||||
Iterator<TextPageBlock> itty = textBlocks.iterator();
|
||||
TextPositionSequence previous = null;
|
||||
while (itty.hasNext()) {
|
||||
|
||||
TextPageBlock textBlock = itty.next();
|
||||
|
||||
for (TextPositionSequence word : textBlock.getSequences()) {
|
||||
if (previous != null) {
|
||||
if (Math.abs(previous.getMaxYDirAdj() - word.getMaxYDirAdj()) > word.getTextHeight()) {
|
||||
sb.append('\n');
|
||||
} else {
|
||||
sb.append(' ');
|
||||
}
|
||||
}
|
||||
sb.append(word.toString());
|
||||
previous = word;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
return TextNormalizationUtilities.removeHyphenLineBreaks(sb.toString()).replaceAll("\n", " ").replaceAll(" {2}", " ");
|
||||
}
|
||||
|
||||
|
||||
public boolean hasMinimumSize() {
|
||||
|
||||
return this.getHeight() >= MIN_SIZE && this.getWidth() >= MIN_SIZE;
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,11 +1,11 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.dto.table;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.model.table;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.Value;
|
||||
|
||||
@Value
|
||||
@RequiredArgsConstructor
|
||||
public class TableCellPosition implements Comparable<TableCellPosition> {
|
||||
public class CellPosition implements Comparable<CellPosition> {
|
||||
|
||||
int row;
|
||||
|
||||
@ -13,7 +13,7 @@ public class TableCellPosition implements Comparable<TableCellPosition> {
|
||||
|
||||
|
||||
@Override
|
||||
public int compareTo(TableCellPosition other) {
|
||||
public int compareTo(CellPosition other) {
|
||||
|
||||
int rowDiff = row - other.row;
|
||||
return rowDiff != 0 ? rowDiff : col - other.col;
|
||||
@ -1,4 +1,4 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.dto.table;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.model.table;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.dto;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.model.table;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
@ -1,4 +1,4 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.dto.table;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.model.table;
|
||||
|
||||
import java.awt.geom.Line2D;
|
||||
import java.awt.geom.Point2D;
|
||||
@ -1,4 +1,4 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.dto.table;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.model.table;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
import java.util.ArrayList;
|
||||
@ -7,20 +7,19 @@ import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.TreeMap;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.AbstractTextContainer;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.Rectangle;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.PageBlockType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
|
||||
|
||||
import lombok.Getter;
|
||||
import lombok.Setter;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
public class Table extends AbstractTextContainer {
|
||||
public class TablePageBlock extends AbstractPageBlock {
|
||||
|
||||
private final TreeMap<TableCellPosition, TableCell> cells = new TreeMap<>();
|
||||
private final TreeMap<CellPosition, Cell> cells = new TreeMap<>();
|
||||
|
||||
private final int rotation;
|
||||
@Getter
|
||||
@ -28,32 +27,29 @@ public class Table extends AbstractTextContainer {
|
||||
private String headline;
|
||||
private int unrotatedRowCount;
|
||||
private int unrotatedColCount;
|
||||
private int rowCount = -1;
|
||||
private int colCount = -1;
|
||||
private List<List<TableCell>> rows;
|
||||
private List<List<Cell>> rows;
|
||||
|
||||
|
||||
public Table(List<TableCell> cells, Rectangle area, int rotation) {
|
||||
public TablePageBlock(List<Cell> cells, Rectangle area, int rotation) {
|
||||
|
||||
addCells(cells);
|
||||
minX = area.getLeft();
|
||||
minY = area.getBottom();
|
||||
maxX = area.getRight();
|
||||
maxY = area.getTop();
|
||||
classification = "Table";
|
||||
classification = PageBlockType.TABLE;
|
||||
this.rotation = rotation;
|
||||
|
||||
}
|
||||
|
||||
|
||||
public List<List<TableCell>> getRows() {
|
||||
public List<List<Cell>> getRows() {
|
||||
|
||||
if (rows == null) {
|
||||
rows = computeRows();
|
||||
|
||||
// Ignore rows that does not contain any cells and values.
|
||||
List<List<TableCell>> rowsToRemove = new ArrayList<>();
|
||||
for (List<TableCell> row : rows) {
|
||||
List<List<Cell>> rowsToRemove = new ArrayList<>();
|
||||
for (List<Cell> row : rows) {
|
||||
if (row.size() == 1 && row.get(0).getTextBlocks().isEmpty()) {
|
||||
rowsToRemove.add(row);
|
||||
}
|
||||
@ -70,19 +66,13 @@ public class Table extends AbstractTextContainer {
|
||||
|
||||
public int getRowCount() {
|
||||
|
||||
if (rowCount == -1) {
|
||||
rowCount = getRows().size();
|
||||
}
|
||||
return rowCount;
|
||||
return getRows().size();
|
||||
}
|
||||
|
||||
|
||||
public int getColCount() {
|
||||
|
||||
if (colCount == -1) {
|
||||
colCount = getRows().stream().mapToInt(List::size).max().orElse(0);
|
||||
}
|
||||
return colCount;
|
||||
return getRows().stream().mapToInt(List::size).max().orElse(0);
|
||||
|
||||
}
|
||||
|
||||
@ -100,16 +90,16 @@ public class Table extends AbstractTextContainer {
|
||||
// A bold cell is a header cell as long as every cell to the left/top is bold, too
|
||||
// we move from left to right and top to bottom
|
||||
for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) {
|
||||
List<TableCell> rowCells = rows.get(rowIndex);
|
||||
List<Cell> rowCells = rows.get(rowIndex);
|
||||
if (rowCells.size() == 1) {
|
||||
continue;
|
||||
}
|
||||
|
||||
for (int colIndex = 0; colIndex < rowCells.size(); colIndex++) {
|
||||
TableCell cell = rowCells.get(colIndex);
|
||||
List<TableCell> cellsToTheLeft = rowCells.subList(0, colIndex);
|
||||
TableCell lastHeaderCell = null;
|
||||
for (TableCell leftCell : cellsToTheLeft) {
|
||||
Cell cell = rowCells.get(colIndex);
|
||||
List<Cell> cellsToTheLeft = rowCells.subList(0, colIndex);
|
||||
Cell lastHeaderCell = null;
|
||||
for (Cell leftCell : cellsToTheLeft) {
|
||||
if (leftCell.isHeaderCell()) {
|
||||
lastHeaderCell = leftCell;
|
||||
} else {
|
||||
@ -119,7 +109,7 @@ public class Table extends AbstractTextContainer {
|
||||
if (lastHeaderCell != null) {
|
||||
cell.getHeaderCells().add(lastHeaderCell);
|
||||
}
|
||||
List<TableCell> cellsToTheTop = new ArrayList<>();
|
||||
List<Cell> cellsToTheTop = new ArrayList<>();
|
||||
for (int i = 0; i < rowIndex; i++) {
|
||||
try {
|
||||
cellsToTheTop.add(rows.get(i).get(colIndex));
|
||||
@ -127,7 +117,7 @@ public class Table extends AbstractTextContainer {
|
||||
log.debug("No cell {} in row {}, ignoring.", colIndex, rowIndex);
|
||||
}
|
||||
}
|
||||
for (TableCell topCell : cellsToTheTop) {
|
||||
for (Cell topCell : cellsToTheTop) {
|
||||
if (topCell.isHeaderCell()) {
|
||||
lastHeaderCell = topCell;
|
||||
} else {
|
||||
@ -146,14 +136,14 @@ public class Table extends AbstractTextContainer {
|
||||
}
|
||||
|
||||
|
||||
private List<List<TableCell>> computeRows() {
|
||||
private List<List<Cell>> computeRows() {
|
||||
|
||||
List<List<TableCell>> rows = new ArrayList<>();
|
||||
List<List<Cell>> rows = new ArrayList<>();
|
||||
if (rotation == 90) {
|
||||
for (int i = 0; i < unrotatedColCount; i++) { // rows
|
||||
List<TableCell> lastRow = new ArrayList<>();
|
||||
List<Cell> lastRow = new ArrayList<>();
|
||||
for (int j = unrotatedRowCount - 1; j >= 0; j--) { // cols
|
||||
TableCell cell = cells.get(new TableCellPosition(j, i));
|
||||
Cell cell = cells.get(new CellPosition(j, i));
|
||||
if (cell != null) {
|
||||
lastRow.add(cell);
|
||||
}
|
||||
@ -162,9 +152,9 @@ public class Table extends AbstractTextContainer {
|
||||
}
|
||||
} else if (rotation == 270) {
|
||||
for (int i = unrotatedColCount - 1; i >= 0; i--) { // rows
|
||||
List<TableCell> lastRow = new ArrayList<>();
|
||||
List<Cell> lastRow = new ArrayList<>();
|
||||
for (int j = 0; j < unrotatedRowCount; j++) { // cols
TableCell cell = cells.get(new TableCellPosition(j, i));
Cell cell = cells.get(new CellPosition(j, i));
if (cell != null) {
lastRow.add(cell);
}
@ -173,9 +163,9 @@ public class Table extends AbstractTextContainer {
}
} else {
for (int i = 0; i < unrotatedRowCount; i++) {
List<TableCell> lastRow = new ArrayList<>();
List<Cell> lastRow = new ArrayList<>();
for (int j = 0; j < unrotatedColCount; j++) {
TableCell cell = cells.get(new TableCellPosition(i, j)); // JAVA_8 use getOrDefault()
Cell cell = cells.get(new CellPosition(i, j)); // JAVA_8 use getOrDefault()
if (cell != null) {
lastRow.add(cell);
}
@ -189,18 +179,18 @@ public class Table extends AbstractTextContainer {
}


private void add(TableCell chunk, int row, int col) {
private void add(Cell chunk, int row, int col) {

unrotatedRowCount = Math.max(unrotatedRowCount, row + 1);
unrotatedColCount = Math.max(unrotatedColCount, col + 1);

TableCellPosition cp = new TableCellPosition(row, col);
CellPosition cp = new CellPosition(row, col);
cells.put(cp, chunk);

}


private void addCells(List<TableCell> cells) {
private void addCells(List<Cell> cells) {

if (cells.isEmpty()) {
return;
@ -208,7 +198,7 @@ public class Table extends AbstractTextContainer {

cells.removeIf(cell -> cell.getWidth() < 1.1 || cell.getHeight() < 1.1);

List<List<TableCell>> rowsOfCells = calculateStructure(cells);
List<List<Cell>> rowsOfCells = calculateStructure(cells);

for (int i = 0; i < rowsOfCells.size(); i++) {
for (int j = 0; j < rowsOfCells.get(i).size(); j++) {
@ -223,11 +213,11 @@ public class Table extends AbstractTextContainer {
* Calculates the structure of the table. For spanning rows and columns multiple cells with the same values will be inserted.
*
* @param cells The found cells
* @return Table Structure
* @return TablePageBlock Structure
*/
private List<List<TableCell>> calculateStructure(List<TableCell> cells) {
private List<List<Cell>> calculateStructure(List<Cell> cells) {

List<List<TableCell>> matrix = new ArrayList<>();
List<List<Cell>> matrix = new ArrayList<>();

if (cells.isEmpty()) {
return matrix;
@ -242,30 +232,30 @@ public class Table extends AbstractTextContainer {
uniqueY.add(c.getTop());
});

var sortedUniqueX = uniqueX.stream().sorted().collect(Collectors.toList());
var sortedUniqueY = uniqueY.stream().sorted().collect(Collectors.toList());
var sortedUniqueX = uniqueX.stream().sorted().toList();
var sortedUniqueY = uniqueY.stream().sorted().toList();

Float prevY = null;
for (Float y : sortedUniqueY) {

List<TableCell> row = new ArrayList<>();
List<Cell> row = new ArrayList<>();

Float prevX = null;
for (Float x : sortedUniqueX) {

if (prevY != null && prevX != null) {
var cell = new TableCell(new Point2D.Float(prevX, prevY), new Point2D.Float(x, y));
var cell = new Cell(new Point2D.Float(prevX, prevY), new Point2D.Float(x, y));

var intersectionCell = cells.stream().filter(c -> cell.intersects(c) && cell.overlapRatio(c) > 0.1f).findFirst();
if (intersectionCell.isPresent()) {
cell.getTextBlocks().addAll(intersectionCell.get().getTextBlocks());
intersectionCell.ifPresent(value -> cell.getTextBlocks().addAll(value.getTextBlocks()));
if (cell.hasMinimumSize()) {
row.add(cell);
}
row.add(cell);
}
prevX = x;
}

if (prevY != null && prevX != null) {
if (prevY != null && prevX != null && !row.isEmpty()) {
matrix.add(row);
}
prevY = y;
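For context, the structure calculation above can be pictured as a lattice construction: the unique, sorted coordinates of the detected cells define the grid lines, each pair of consecutive X and Y values yields one candidate slot, and every slot is matched back against the detected cells by overlap, so a spanning cell reappears once per slot it covers. The following standalone sketch only illustrates that idea; the Box record, its overlap test and the use of both edges of each cell are assumptions made for the example, not the project's Cell/TablePageBlock types.

import java.util.ArrayList;
import java.util.List;
import java.util.TreeSet;

public class GridSketch {

    // Hypothetical stand-in for a detected table cell: left/top corner plus width/height.
    record Box(float left, float top, float width, float height) {

        boolean overlaps(float x0, float y0, float x1, float y1) {
            return left < x1 && left + width > x0 && top < y1 && top + height > y0;
        }
    }

    public static void main(String[] args) {
        // Two cells in the first row, one cell spanning the full second row.
        List<Box> detected = List.of(new Box(0, 0, 50, 20), new Box(50, 0, 50, 20), new Box(0, 20, 100, 20));

        // Unique, sorted cell edges define the lattice of the table.
        TreeSet<Float> xs = new TreeSet<>();
        TreeSet<Float> ys = new TreeSet<>();
        for (Box b : detected) {
            xs.add(b.left());
            xs.add(b.left() + b.width());
            ys.add(b.top());
            ys.add(b.top() + b.height());
        }

        // Every pair of consecutive x/y coordinates is one candidate slot;
        // a spanning cell is picked up once per slot it covers.
        List<List<Box>> matrix = new ArrayList<>();
        Float prevY = null;
        for (Float y : ys) {
            List<Box> row = new ArrayList<>();
            Float prevX = null;
            for (Float x : xs) {
                if (prevX != null && prevY != null) {
                    final float x0 = prevX, y0 = prevY, x1 = x, y1 = y;
                    detected.stream().filter(b -> b.overlaps(x0, y0, x1, y1)).findFirst().ifPresent(row::add);
                }
                prevX = x;
            }
            if (!row.isEmpty()) {
                matrix.add(row);
            }
            prevY = y;
        }
        System.out.println(matrix.size() + " rows, " + matrix.get(0).size() + " columns");
    }
}

Running the sketch prints 2 rows, 2 columns, with the spanning third box occupying both slots of its row.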
@ -281,22 +271,22 @@ public class Table extends AbstractTextContainer {
public String getText() {

StringBuilder sb = new StringBuilder();
List<List<TableCell>> rows = getRows();
List<List<Cell>> rows = getRows();

int i = 0;
for (List<TableCell> row : rows) {
for (List<Cell> row : rows) {
if (i != 0) {
sb.append("\n");
}
if (!row.isEmpty()) {
boolean firstColumn = true;
for (TableCell column : row) {
for (Cell column : row) {
if (!firstColumn) {
sb.append(",");
}
if (column != null && column.getTextBlocks() != null) {
boolean first = true;
for (ClassificationTextBlock textBlock : column.getTextBlocks()) {
for (TextPageBlock textBlock : column.getTextBlocks()) {
if (!first) {
sb.append("\n");
}
@ -317,18 +307,18 @@ public class Table extends AbstractTextContainer {
public String getTextAsHtml() {

StringBuilder sb = new StringBuilder();
List<List<TableCell>> rows = getRows();
List<List<Cell>> rows = getRows();

sb.append("<table border=\"1\">");
int i = 0;
for (List<TableCell> row : rows) {
for (List<Cell> row : rows) {
sb.append("\n<tr>");
if (!row.isEmpty()) {
for (TableCell column : row) {
for (Cell column : row) {
sb.append(i == 0 ? "\n<th>" : "\n<td>");
if (column != null && column.getTextBlocks() != null) {
boolean first = true;
for (ClassificationTextBlock textBlock : column.getTextBlocks()) {
for (TextPageBlock textBlock : column.getTextBlocks()) {
if (!first) {
sb.append("<br />");
}
@ -1,10 +1,8 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.dto.text;
package com.knecon.fforesight.service.layoutparser.processor.classification.model.text;

import org.apache.pdfbox.text.TextPosition;
import org.springframework.beans.BeanUtils;

import com.dslplatform.json.CompiledJson;
import com.dslplatform.json.JsonAttribute;
import com.fasterxml.jackson.annotation.JsonIgnore;

import lombok.AllArgsConstructor;
@ -17,7 +15,6 @@ import lombok.SneakyThrows;
@Builder
@NoArgsConstructor
@AllArgsConstructor
@CompiledJson
public class RedTextPosition {

private String textMatrix;
@ -39,17 +36,14 @@ public class RedTextPosition {

// not used in reanalysis
@JsonIgnore
@JsonAttribute(ignore = true)
private float widthOfSpace;

// not used in reanalysis
@JsonIgnore
@JsonAttribute(ignore = true)
private float fontSizeInPt;

// not used in reanalysis
@JsonIgnore
@JsonAttribute(ignore = true)
private String fontName;

@ -0,0 +1,48 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.model.text;

import java.util.ArrayList;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.TextNormalizationUtilities;

import lombok.Getter;

@Getter
public class SearchableText {

private final List<TextPositionSequence> sequences = new ArrayList<>();


public void add(TextPositionSequence textPositionSequence) {

sequences.add(textPositionSequence);
}


public void addAll(List<TextPositionSequence> textPositionSequences) {

sequences.addAll(textPositionSequences);
}


@Override
public String toString() {

return buildString(sequences);
}


public static String buildString(List<TextPositionSequence> sequences) {

StringBuilder sb = new StringBuilder();
for (TextPositionSequence word : sequences) {
sb.append(word);
sb.append(' ');
}
String text = sb.toString();
text = TextNormalizationUtilities.removeHyphenLineBreaks(text);
text = TextNormalizationUtilities.removeLineBreaks(text);
text = TextNormalizationUtilities.removeRepeatingWhitespaces(text);
return text;
}

}
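The buildString helper in the new SearchableText class joins the words of all sequences with single spaces and then runs the result through the normalization utilities. A minimal stand-alone sketch of that pipeline follows; the plain regex replacements are stand-ins for TextNormalizationUtilities, whose exact behaviour is assumed here rather than taken from the source.

import java.util.List;

public class BuildStringSketch {

    public static void main(String[] args) {
        // Words as they might come out of individual TextPositionSequence objects.
        List<String> words = List.of("redac-\n", "tion", "of", "layout\n", "parsed   text");

        StringBuilder sb = new StringBuilder();
        for (String word : words) {
            sb.append(word);
            sb.append(' ');
        }

        String text = sb.toString();
        // Assumed equivalents of the TextNormalizationUtilities calls:
        text = text.replaceAll("-\\s*\\n\\s*", "");  // removeHyphenLineBreaks
        text = text.replaceAll("\\n", " ");          // removeLineBreaks
        text = text.replaceAll("\\s{2,}", " ");      // removeRepeatingWhitespaces
        System.out.println(text.trim());             // prints: redaction of layout parsed text
    }
}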
@ -0,0 +1,17 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.model.text;

import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;

@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class SimplifiedSectionText {

private int sectionNumber;
private String text;

}
@ -0,0 +1,20 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.model.text;

import java.util.ArrayList;
import java.util.List;

import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;

@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class SimplifiedText {

private int numberOfPages;
private List<SimplifiedSectionText> sectionTexts = new ArrayList<>();

}
@ -1,4 +1,4 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.dto.text;
package com.knecon.fforesight.service.layoutparser.processor.classification.model.text;

import java.util.HashMap;
import java.util.Map;
@ -1,4 +1,4 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.dto.text;
package com.knecon.fforesight.service.layoutparser.processor.classification.model.text;

import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonValue;
@ -33,13 +33,6 @@ public enum TextDirection {
}


@com.dslplatform.json.JsonValue
public float jsonValue() {

return getDegrees();
}


@JsonCreator(mode = JsonCreator.Mode.DELEGATING)
public static TextDirection fromDegrees(float degrees) {

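The TextDirection hunk above touches the enum's JSON mapping: the enum is written out as its degrees value and reconstructed from it through a delegating creator. A reduced, self-contained Jackson sketch of that pattern is shown below; the Direction enum and its constants are hypothetical stand-ins, not the project's TextDirection, which also carries a dsl-json mapping.

import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonValue;
import com.fasterxml.jackson.databind.ObjectMapper;

public class TextDirectionSketch {

    // Hypothetical reduced enum serialized as a plain float value.
    enum Direction {
        UP(0), RIGHT(90), DOWN(180), LEFT(270);

        private final float degrees;

        Direction(float degrees) { this.degrees = degrees; }

        @JsonValue
        public float getDegrees() { return degrees; }

        @JsonCreator(mode = JsonCreator.Mode.DELEGATING)
        public static Direction fromDegrees(float degrees) {
            for (Direction d : values()) {
                if (d.degrees == degrees) {
                    return d;
                }
            }
            throw new IllegalArgumentException("Unknown direction: " + degrees);
        }
    }

    public static void main(String[] args) throws Exception {
        ObjectMapper mapper = new ObjectMapper();
        String json = mapper.writeValueAsString(Direction.RIGHT); // "90.0"
        System.out.println(json + " -> " + mapper.readValue(json, Direction.class));
    }
}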
@ -1,57 +1,67 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.dto.text;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.model.text;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.AbstractTextContainer;
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.PageBlockType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.TextNormalizationUtilities;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
@EqualsAndHashCode(callSuper = true)
|
||||
@AllArgsConstructor
|
||||
@Builder
|
||||
@Data
|
||||
@NoArgsConstructor
|
||||
public class ClassificationTextBlock extends AbstractTextContainer {
|
||||
public class TextPageBlock extends AbstractPageBlock {
|
||||
|
||||
@Builder.Default
|
||||
private List<TextPositionSequence> sequences = new ArrayList<>();
|
||||
|
||||
@JsonIgnore
|
||||
private int rotation;
|
||||
|
||||
private int indexOnPage;
|
||||
|
||||
@JsonIgnore
|
||||
private String mostPopularWordFont;
|
||||
|
||||
@JsonIgnore
|
||||
private String mostPopularWordStyle;
|
||||
|
||||
@JsonIgnore
|
||||
private float mostPopularWordFontSize;
|
||||
|
||||
@JsonIgnore
|
||||
private float mostPopularWordHeight;
|
||||
|
||||
@JsonIgnore
|
||||
private float mostPopularWordSpaceWidth;
|
||||
|
||||
@JsonIgnore
|
||||
private float highestFontSize;
|
||||
|
||||
private String classification;
|
||||
@JsonIgnore
|
||||
private PageBlockType classification;
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public TextDirection getDir() {
|
||||
|
||||
return sequences.get(0).getDir();
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
private float getPageHeight() {
|
||||
|
||||
return sequences.get(0).getPageHeight();
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
private float getPageWidth() {
|
||||
|
||||
return sequences.get(0).getPageWidth();
|
||||
@ -68,6 +78,7 @@ public class ClassificationTextBlock extends AbstractTextContainer {
|
||||
*
|
||||
* @return the minX value in pdf coordinate system
|
||||
*/
|
||||
@JsonIgnore
|
||||
public float getPdfMinX() {
|
||||
|
||||
if (getDir().getDegrees() == 90) {
|
||||
@ -83,6 +94,7 @@ public class ClassificationTextBlock extends AbstractTextContainer {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Returns the maxX value in pdf coordinate system.
|
||||
* Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation.
|
||||
@ -93,6 +105,7 @@ public class ClassificationTextBlock extends AbstractTextContainer {
|
||||
*
|
||||
* @return the maxX value in pdf coordinate system
|
||||
*/
|
||||
@JsonIgnore
|
||||
public float getPdfMaxX() {
|
||||
|
||||
if (getDir().getDegrees() == 90) {
|
||||
@ -118,6 +131,7 @@ public class ClassificationTextBlock extends AbstractTextContainer {
|
||||
*
|
||||
* @return the minY value in pdf coordinate system
|
||||
*/
|
||||
@JsonIgnore
|
||||
public float getPdfMinY() {
|
||||
|
||||
if (getDir().getDegrees() == 90) {
|
||||
@ -144,6 +158,7 @@ public class ClassificationTextBlock extends AbstractTextContainer {
|
||||
*
|
||||
* @return the maxY value in pdf coordinate system
|
||||
*/
|
||||
@JsonIgnore
|
||||
public float getPdfMaxY() {
|
||||
|
||||
if (getDir().getDegrees() == 90) {
|
||||
@ -159,35 +174,34 @@ public class ClassificationTextBlock extends AbstractTextContainer {
|
||||
}
|
||||
|
||||
|
||||
public ClassificationTextBlock(float minX, float maxX, float minY, float maxY, List<TextPositionSequence> sequences, int rotation, int indexOnPage) {
|
||||
super();
|
||||
this.indexOnPage = indexOnPage;
|
||||
super.minX = minX;
|
||||
super.maxX = maxX;
|
||||
super.minY = minY;
|
||||
super.maxY = maxY;
|
||||
public TextPageBlock(float minX, float maxX, float minY, float maxY, List<TextPositionSequence> sequences, int rotation) {
|
||||
|
||||
this.minX = minX;
|
||||
this.maxX = maxX;
|
||||
this.minY = minY;
|
||||
this.maxY = maxY;
|
||||
this.sequences = sequences;
|
||||
this.rotation = rotation;
|
||||
}
|
||||
|
||||
|
||||
public ClassificationTextBlock union(TextPositionSequence r) {
|
||||
public TextPageBlock union(TextPositionSequence r) {
|
||||
|
||||
ClassificationTextBlock union = this.copy();
|
||||
TextPageBlock union = this.copy();
|
||||
union.add(r);
|
||||
return union;
|
||||
}
|
||||
|
||||
|
||||
public ClassificationTextBlock union(ClassificationTextBlock r) {
|
||||
public TextPageBlock union(TextPageBlock r) {
|
||||
|
||||
ClassificationTextBlock union = this.copy();
|
||||
TextPageBlock union = this.copy();
|
||||
union.add(r);
|
||||
return union;
|
||||
}
|
||||
|
||||
|
||||
public void add(ClassificationTextBlock r) {
|
||||
public void add(TextPageBlock r) {
|
||||
|
||||
if (r.getMinX() < minX) {
|
||||
minX = r.getMinX();
|
||||
@ -222,9 +236,9 @@ public class ClassificationTextBlock extends AbstractTextContainer {
|
||||
}
|
||||
|
||||
|
||||
public ClassificationTextBlock copy() {
|
||||
public TextPageBlock copy() {
|
||||
|
||||
return new ClassificationTextBlock(minX, maxX, minY, maxY, sequences, rotation, indexOnPage);
|
||||
return new TextPageBlock(minX, maxX, minY, maxY, sequences, rotation);
|
||||
}
|
||||
|
||||
|
||||
@ -263,6 +277,7 @@ public class ClassificationTextBlock extends AbstractTextContainer {
|
||||
|
||||
|
||||
@Override
|
||||
@JsonIgnore
|
||||
public String getText() {
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
@ -283,4 +298,5 @@ public class ClassificationTextBlock extends AbstractTextContainer {
|
||||
return TextNormalizationUtilities.removeHyphenLineBreaks(sb.toString());
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,4 +1,4 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.dto.text;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.model.text;
|
||||
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.Point2D;
|
||||
@ -8,8 +8,8 @@ import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.pdfbox.text.TextPosition;
|
||||
|
||||
import com.dslplatform.json.JsonAttribute;
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point;
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
||||
|
||||
@ -25,6 +25,7 @@ import lombok.extern.slf4j.Slf4j;
|
||||
@Builder
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
@JsonIgnoreProperties({"empty"})
|
||||
public class TextPositionSequence implements CharSequence {
|
||||
|
||||
public static final int HEIGHT_PADDING = 2;
|
||||
@ -37,6 +38,12 @@ public class TextPositionSequence implements CharSequence {
|
||||
private float pageWidth;
|
||||
|
||||
|
||||
public TextPositionSequence(int page) {
|
||||
|
||||
this.page = page;
|
||||
}
|
||||
|
||||
|
||||
public TextPositionSequence(List<TextPosition> textPositions, int page) {
|
||||
|
||||
this.textPositions = textPositions.stream().map(RedTextPosition::fromTextPosition).collect(Collectors.toList());
|
||||
@ -64,6 +71,14 @@ public class TextPositionSequence implements CharSequence {
|
||||
}
|
||||
|
||||
|
||||
public char charAt(int index, boolean caseInSensitive) {
|
||||
|
||||
RedTextPosition textPosition = textPositionAt(index);
|
||||
String text = textPosition.getUnicode();
|
||||
return caseInSensitive ? text.toLowerCase().charAt(0) : text.charAt(0);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public TextPositionSequence subSequence(int start, int end) {
|
||||
|
||||
@ -126,7 +141,6 @@ public class TextPositionSequence implements CharSequence {
|
||||
* @return the text direction adjusted minX value
|
||||
*/
|
||||
@JsonIgnore
|
||||
@JsonAttribute(ignore = true)
|
||||
public float getMinXDirAdj() {
|
||||
|
||||
return textPositions.get(0).getXDirAdj();
|
||||
@ -141,7 +155,6 @@ public class TextPositionSequence implements CharSequence {
|
||||
* @return the text direction adjusted maxX value
|
||||
*/
|
||||
@JsonIgnore
|
||||
@JsonAttribute(ignore = true)
|
||||
public float getMaxXDirAdj() {
|
||||
|
||||
return textPositions.get(textPositions.size() - 1).getXDirAdj() + textPositions.get(textPositions.size() - 1).getWidthDirAdj() + HEIGHT_PADDING;
|
||||
@ -156,7 +169,6 @@ public class TextPositionSequence implements CharSequence {
|
||||
* @return the text direction adjusted minY value. The upper border of the bounding box of the word.
|
||||
*/
|
||||
@JsonIgnore
|
||||
@JsonAttribute(ignore = true)
|
||||
public float getMinYDirAdj() {
|
||||
|
||||
return textPositions.get(0).getYDirAdj() - getTextHeight();
|
||||
@ -171,7 +183,6 @@ public class TextPositionSequence implements CharSequence {
|
||||
* @return the text direction adjusted maxY value. The lower border of the bounding box of the word.
|
||||
*/
|
||||
@JsonIgnore
|
||||
@JsonAttribute(ignore = true)
|
||||
public float getMaxYDirAdj() {
|
||||
|
||||
return textPositions.get(0).getYDirAdj();
|
||||
@ -180,7 +191,6 @@ public class TextPositionSequence implements CharSequence {
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
@JsonAttribute(ignore = true)
|
||||
public float getTextHeight() {
|
||||
|
||||
return textPositions.get(0).getHeightDir() + HEIGHT_PADDING;
|
||||
@ -188,7 +198,6 @@ public class TextPositionSequence implements CharSequence {
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
@JsonAttribute(ignore = true)
|
||||
public float getHeight() {
|
||||
|
||||
return getMaxYDirAdj() - getMinYDirAdj();
|
||||
@ -196,7 +205,6 @@ public class TextPositionSequence implements CharSequence {
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
@JsonAttribute(ignore = true)
|
||||
public float getWidth() {
|
||||
|
||||
return getMaxXDirAdj() - getMinXDirAdj();
|
||||
@ -204,7 +212,6 @@ public class TextPositionSequence implements CharSequence {
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
@JsonAttribute(ignore = true)
|
||||
public String getFont() {
|
||||
|
||||
return textPositions.get(0).getFontName().toLowerCase().replaceAll(",bold", "").replaceAll(",italic", "");
|
||||
@ -212,7 +219,6 @@ public class TextPositionSequence implements CharSequence {
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
@JsonAttribute(ignore = true)
|
||||
public String getFontStyle() {
|
||||
|
||||
String lowercaseFontName = textPositions.get(0).getFontName().toLowerCase();
|
||||
@ -231,7 +237,6 @@ public class TextPositionSequence implements CharSequence {
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
@JsonAttribute(ignore = true)
|
||||
public float getFontSize() {
|
||||
|
||||
return textPositions.get(0).getFontSizeInPt();
|
||||
@ -239,7 +244,6 @@ public class TextPositionSequence implements CharSequence {
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
@JsonAttribute(ignore = true)
|
||||
public float getSpaceWidth() {
|
||||
|
||||
return textPositions.get(0).getWidthOfSpace();
|
||||
@ -256,11 +260,10 @@ public class TextPositionSequence implements CharSequence {
|
||||
* @return bounding box of the word in Pdf Coordinate System
|
||||
*/
|
||||
@JsonIgnore
|
||||
@JsonAttribute(ignore = true)
|
||||
@SneakyThrows
|
||||
public Rectangle getRectangle() {
|
||||
|
||||
log.debug("ClassificationPage: '{}', Word: '{}', Rotation: '{}', textRotation {}", page, this, rotation, dir);
|
||||
log.debug("Page: '{}', Word: '{}', Rotation: '{}', textRotation {}", page, this, rotation, dir);
|
||||
|
||||
float textHeight = getTextHeight();
|
||||
|
||||
@ -1,4 +1,4 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.dto.text;
package com.knecon.fforesight.service.layoutparser.processor.classification.model.text;

import java.util.List;

@ -9,6 +9,6 @@ import lombok.Data;
@AllArgsConstructor
public class UnclassifiedText {

private List<ClassificationTextBlock> textBlocks;
private List<TextPageBlock> textBlocks;

}
@ -1,82 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.parsing;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.pdfbox.text.PDFTextStripperByArea;
|
||||
import org.apache.pdfbox.text.TextPosition;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.TextPositionSequence;
|
||||
|
||||
import lombok.Getter;
|
||||
import lombok.Setter;
|
||||
|
||||
public class PDFAreaTextStripper extends PDFTextStripperByArea {
|
||||
|
||||
@Getter
|
||||
private List<TextPositionSequence> textPositionSequences = new ArrayList<>();
|
||||
|
||||
@Setter
|
||||
private int pageNumber;
|
||||
|
||||
|
||||
public PDFAreaTextStripper() throws IOException {
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void writeString(String text, List<TextPosition> textPositions) throws IOException {
|
||||
|
||||
int startIndex = 0;
|
||||
for (int i = 0; i <= textPositions.size() - 1; i++) {
|
||||
|
||||
if (i == 0 && (textPositions.get(i).getUnicode().equals(" ") || textPositions.get(i).getUnicode().equals("\u00A0"))) {
|
||||
startIndex++;
|
||||
continue;
|
||||
}
|
||||
|
||||
// Strange but sometimes this is happening, for example: Metolachlor2.pdf
|
||||
if (i > 0 && textPositions.get(i).getX() < textPositions.get(i - 1).getX()) {
|
||||
List<TextPosition> sublist = textPositions.subList(startIndex, i);
|
||||
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0")))) {
|
||||
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber));
|
||||
}
|
||||
startIndex = i;
|
||||
}
|
||||
|
||||
if (textPositions.get(i).getRotation() == 0 && i > 0 && textPositions.get(i).getX() > textPositions.get(i - 1).getEndX() + 1) {
|
||||
List<TextPosition> sublist = textPositions.subList(startIndex, i);
|
||||
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0")))) {
|
||||
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber));
|
||||
}
|
||||
startIndex = i;
|
||||
}
|
||||
|
||||
if (i > 0 && (textPositions.get(i).getUnicode().equals(" ") || textPositions.get(i).getUnicode().equals("\u00A0")) && i <= textPositions.size() - 2) {
|
||||
List<TextPosition> sublist = textPositions.subList(startIndex, i);
|
||||
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0")))) {
|
||||
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber));
|
||||
}
|
||||
startIndex = i + 1;
|
||||
}
|
||||
}
|
||||
|
||||
List<TextPosition> sublist = textPositions.subList(startIndex, textPositions.size());
|
||||
if (!sublist.isEmpty() && (sublist.get(sublist.size() - 1).getUnicode().equals(" ") || sublist.get(sublist.size() - 1).getUnicode().equals("\u00A0"))) {
|
||||
sublist = sublist.subList(0, sublist.size() - 1);
|
||||
}
|
||||
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0")))) {
|
||||
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber));
|
||||
}
|
||||
super.writeString(text);
|
||||
}
|
||||
|
||||
|
||||
public void clearPositions() {
|
||||
|
||||
textPositionSequences = new ArrayList<>();
|
||||
}
|
||||
|
||||
}
|
||||
@ -34,31 +34,26 @@ import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.text.TextPosition;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.RedTextPosition;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.RedTextPosition;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPositionSequence;
|
||||
|
||||
import lombok.Getter;
|
||||
import lombok.Setter;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Getter
|
||||
@Slf4j
|
||||
public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
|
||||
@Getter
|
||||
private final List<TextPositionSequence> textPositionSequences = new ArrayList<>();
|
||||
@Getter
|
||||
private final List<Ruling> rulings = new ArrayList<>();
|
||||
private final List<Ruling> graphicsPath = new ArrayList<>();
|
||||
@Setter
|
||||
protected PDPage pdpage;
|
||||
@Getter
|
||||
private int minCharWidth;
|
||||
@Getter
|
||||
private int maxCharWidth;
|
||||
@Getter
|
||||
private int minCharHeight;
|
||||
@Getter
|
||||
private int maxCharHeight;
|
||||
|
||||
private float path_x;
|
||||
|
||||
@ -9,14 +9,14 @@ import java.util.List;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.FloatFrequencyCounter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.TextBlockOrientation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.StringFrequencyCounter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.AbstractTextContainer;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.FloatFrequencyCounter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.Orientation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.StringFrequencyCounter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.RulingTextDirAdjustUtil;
|
||||
|
||||
@Service
|
||||
@ -29,18 +29,18 @@ public class BlockificationService {
|
||||
/**
|
||||
* This method is building blocks by expanding the minX/maxX and minY/maxY value on each word that is not split by the conditions.
|
||||
* This method must use text direction adjusted postions (DirAdj). Where {0,0} is on the upper left. Never try to change this!
|
||||
* Rulings (Table lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling.
|
||||
* Rulings (TablePageBlock lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling.
|
||||
*
|
||||
* @param textPositions The words of a page.
|
||||
* @param horizontalRulingLines Horizontal table lines.
|
||||
* @param verticalRulingLines Vertical table lines.
|
||||
* @return ClassificationPage object that contains the Textblock and text statistics.
|
||||
* @return Page object that contains the Textblock and text statistics.
|
||||
*/
|
||||
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
|
||||
|
||||
int indexOnPage = 0;
|
||||
List<TextPositionSequence> chunkWords = new ArrayList<>();
|
||||
List<AbstractTextContainer> chunkBlockList1 = new ArrayList<>();
|
||||
List<AbstractPageBlock> chunkBlockList = new ArrayList<>();
|
||||
|
||||
float minX = 1000, maxX = 0, minY = 1000, maxY = 0;
|
||||
TextPositionSequence prev = null;
|
||||
@ -59,27 +59,27 @@ public class BlockificationService {
|
||||
|
||||
if (prev != null && (lineSeparation || startFromTop || splitByX || splitByDir || isSplitByRuling)) {
|
||||
|
||||
TextBlockOrientation prevOrientation = null;
|
||||
if (!chunkBlockList1.isEmpty()) {
|
||||
prevOrientation = chunkBlockList1.get(chunkBlockList1.size() - 1).getOrientation();
|
||||
Orientation prevOrientation = null;
|
||||
if (!chunkBlockList.isEmpty()) {
|
||||
prevOrientation = chunkBlockList.get(chunkBlockList.size() - 1).getOrientation();
|
||||
}
|
||||
|
||||
ClassificationTextBlock cb1 = buildTextBlock(chunkWords, indexOnPage);
|
||||
TextPageBlock cb1 = buildTextBlock(chunkWords, indexOnPage);
|
||||
indexOnPage++;
|
||||
|
||||
chunkBlockList1.add(cb1);
|
||||
chunkBlockList.add(cb1);
|
||||
chunkWords = new ArrayList<>();
|
||||
|
||||
if (splitByX && !isSplitByRuling) {
|
||||
wasSplitted = true;
|
||||
cb1.setOrientation(TextBlockOrientation.LEFT);
|
||||
cb1.setOrientation(Orientation.LEFT);
|
||||
splitX1 = word.getMinXDirAdj();
|
||||
} else if (newLineAfterSplit && !isSplitByRuling) {
|
||||
wasSplitted = false;
|
||||
cb1.setOrientation(TextBlockOrientation.RIGHT);
|
||||
cb1.setOrientation(Orientation.RIGHT);
|
||||
splitX1 = null;
|
||||
} else if (prevOrientation != null && prevOrientation.equals(TextBlockOrientation.RIGHT) && (lineSeparation || !startFromTop || !splitByX || !newLineAfterSplit || !isSplitByRuling)) {
|
||||
cb1.setOrientation(TextBlockOrientation.LEFT);
|
||||
} else if (prevOrientation != null && prevOrientation.equals(Orientation.RIGHT) && (lineSeparation || !startFromTop || !splitByX || !newLineAfterSplit || !isSplitByRuling)) {
|
||||
cb1.setOrientation(Orientation.LEFT);
|
||||
}
|
||||
|
||||
minX = 1000;
|
||||
@ -106,19 +106,19 @@ public class BlockificationService {
|
||||
}
|
||||
}
|
||||
|
||||
ClassificationTextBlock cb1 = buildTextBlock(chunkWords, indexOnPage);
|
||||
TextPageBlock cb1 = buildTextBlock(chunkWords, indexOnPage);
|
||||
if (cb1 != null) {
|
||||
chunkBlockList1.add(cb1);
|
||||
chunkBlockList.add(cb1);
|
||||
}
|
||||
|
||||
Iterator<AbstractTextContainer> itty = chunkBlockList1.iterator();
|
||||
Iterator<AbstractPageBlock> itty = chunkBlockList.iterator();
|
||||
|
||||
ClassificationTextBlock previousLeft = null;
|
||||
ClassificationTextBlock previousRight = null;
|
||||
TextPageBlock previousLeft = null;
|
||||
TextPageBlock previousRight = null;
|
||||
while (itty.hasNext()) {
|
||||
ClassificationTextBlock block = (ClassificationTextBlock) itty.next();
|
||||
TextPageBlock block = (TextPageBlock) itty.next();
|
||||
|
||||
if (previousLeft != null && block.getOrientation().equals(TextBlockOrientation.LEFT)) {
|
||||
if (previousLeft != null && block.getOrientation().equals(Orientation.LEFT)) {
|
||||
if (previousLeft.getMinY() > block.getMinY() && block.getMaxY() + block.getMostPopularWordHeight() > previousLeft.getMinY()) {
|
||||
previousLeft.add(block);
|
||||
itty.remove();
|
||||
@ -126,7 +126,7 @@ public class BlockificationService {
|
||||
}
|
||||
}
|
||||
|
||||
if (previousRight != null && block.getOrientation().equals(TextBlockOrientation.RIGHT)) {
|
||||
if (previousRight != null && block.getOrientation().equals(Orientation.RIGHT)) {
|
||||
if (previousRight.getMinY() > block.getMinY() && block.getMaxY() + block.getMostPopularWordHeight() > previousRight.getMinY()) {
|
||||
previousRight.add(block);
|
||||
itty.remove();
|
||||
@ -134,21 +134,21 @@ public class BlockificationService {
|
||||
}
|
||||
}
|
||||
|
||||
if (block.getOrientation().equals(TextBlockOrientation.LEFT)) {
|
||||
if (block.getOrientation().equals(Orientation.LEFT)) {
|
||||
previousLeft = block;
|
||||
} else if (block.getOrientation().equals(TextBlockOrientation.RIGHT)) {
|
||||
} else if (block.getOrientation().equals(Orientation.RIGHT)) {
|
||||
previousRight = block;
|
||||
}
|
||||
}
|
||||
|
||||
itty = chunkBlockList1.iterator();
|
||||
ClassificationTextBlock previous = null;
|
||||
itty = chunkBlockList.iterator();
|
||||
TextPageBlock previous = null;
|
||||
while (itty.hasNext()) {
|
||||
ClassificationTextBlock block = (ClassificationTextBlock) itty.next();
|
||||
TextPageBlock block = (TextPageBlock) itty.next();
|
||||
|
||||
if (previous != null && previous.getOrientation().equals(TextBlockOrientation.LEFT) && block.getOrientation().equals(TextBlockOrientation.LEFT) && equalsWithThreshold(block.getMaxY(),
|
||||
previous.getMaxY()) || previous != null && previous.getOrientation().equals(TextBlockOrientation.LEFT) && block.getOrientation()
|
||||
.equals(TextBlockOrientation.RIGHT) && equalsWithThreshold(block.getMaxY(), previous.getMaxY())) {
|
||||
if (previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation().equals(Orientation.LEFT) && equalsWithThreshold(block.getMaxY(),
|
||||
previous.getMaxY()) || previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation()
|
||||
.equals(Orientation.RIGHT) && equalsWithThreshold(block.getMaxY(), previous.getMaxY())) {
|
||||
previous.add(block);
|
||||
itty.remove();
|
||||
continue;
|
||||
@ -157,7 +157,7 @@ public class BlockificationService {
|
||||
previous = block;
|
||||
}
|
||||
|
||||
return new ClassificationPage(chunkBlockList1);
|
||||
return new ClassificationPage(chunkBlockList);
|
||||
}
|
||||
|
||||
|
||||
@ -167,9 +167,9 @@ public class BlockificationService {
|
||||
}
|
||||
|
||||
|
||||
private ClassificationTextBlock buildTextBlock(List<TextPositionSequence> wordBlockList, int indexOnPage) {
|
||||
private TextPageBlock buildTextBlock(List<TextPositionSequence> wordBlockList, int indexOnPage) {
|
||||
|
||||
ClassificationTextBlock textBlock = null;
|
||||
TextPageBlock textBlock = null;
|
||||
|
||||
FloatFrequencyCounter lineHeightFrequencyCounter = new FloatFrequencyCounter();
|
||||
FloatFrequencyCounter fontSizeFrequencyCounter = new FloatFrequencyCounter();
|
||||
@ -186,15 +186,14 @@ public class BlockificationService {
|
||||
styleFrequencyCounter.add(wordBlock.getFontStyle());
|
||||
|
||||
if (textBlock == null) {
|
||||
textBlock = new ClassificationTextBlock(wordBlock.getMinXDirAdj(),
|
||||
textBlock = new TextPageBlock(wordBlock.getMinXDirAdj(),
|
||||
wordBlock.getMaxXDirAdj(),
|
||||
wordBlock.getMinYDirAdj(),
|
||||
wordBlock.getMaxYDirAdj(),
|
||||
wordBlockList,
|
||||
wordBlock.getRotation(),
|
||||
indexOnPage);
|
||||
wordBlock.getRotation());
|
||||
} else {
|
||||
ClassificationTextBlock spatialEntity = textBlock.union(wordBlock);
|
||||
TextPageBlock spatialEntity = textBlock.union(wordBlock);
|
||||
textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight());
|
||||
}
|
||||
}
|
||||
@ -254,7 +253,7 @@ public class BlockificationService {
|
||||
verticalRulingLines,
|
||||
word.getDir().getDegrees(),
|
||||
word.getPageWidth(),
|
||||
word.getPageHeight()); //
|
||||
word.getPageHeight());
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -1,52 +1,56 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.service;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.FloatFrequencyCounter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.AbstractTextContainer;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.TableCell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.Table;
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.FloatFrequencyCounter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.PositionUtils;
|
||||
|
||||
@Service
|
||||
public class BodyTextFrameService {
|
||||
|
||||
private static final float APPROXIMATE_HEADER_LINE_COUNT = 2.9f;
|
||||
|
||||
|
||||
/**
|
||||
* Adjusts and sets the body text frame to a classificationPage.
|
||||
* Note: This needs to use Pdf Coordinate System where {0,0} rotated with the classificationPage rotation.
|
||||
* Adjusts and sets the body text frame to a page.
|
||||
* Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation.
|
||||
* 0 -> LowerLeft
|
||||
* 90 -> UpperLeft
|
||||
* 180 -> UpperRight
|
||||
* 270 -> LowerRight
|
||||
* The aspect ratio of the classificationPage is also regarded.
|
||||
* The aspect ratio of the page is also regarded.
|
||||
*
|
||||
* @param classificationPage The classificationPage
|
||||
* @param page The page
|
||||
* @param bodyTextFrame frame that contains the main text on portrait pages
|
||||
* @param landscapeBodyTextFrame frame that contains the main text on landscape pages
|
||||
*/
|
||||
public void setBodyTextFrameAdjustedToPage(ClassificationPage classificationPage, Rectangle bodyTextFrame, Rectangle landscapeBodyTextFrame) {
|
||||
public void setBodyTextFrameAdjustedToPage(ClassificationPage page, Rectangle bodyTextFrame, Rectangle landscapeBodyTextFrame) {
|
||||
|
||||
Rectangle textFrame = classificationPage.isLandscape() ? landscapeBodyTextFrame : bodyTextFrame;
|
||||
Rectangle textFrame = page.isLandscape() ? landscapeBodyTextFrame : bodyTextFrame;
|
||||
|
||||
if (classificationPage.getPageWidth() > classificationPage.getPageHeight() && classificationPage.getRotation() == 270) {
|
||||
textFrame = new Rectangle(new Point(textFrame.getTopLeft().getY(), classificationPage.getPageHeight() - textFrame.getTopLeft().getX() - textFrame.getWidth()),
|
||||
if (page.getPageWidth() > page.getPageHeight() && page.getRotation() == 270) {
|
||||
textFrame = new Rectangle(new Point(textFrame.getTopLeft().getY(), page.getPageHeight() - textFrame.getTopLeft().getX() - textFrame.getWidth()),
|
||||
textFrame.getHeight(),
|
||||
textFrame.getWidth(),
|
||||
0);
|
||||
} else if (classificationPage.getPageWidth() > classificationPage.getPageHeight() && classificationPage.getRotation() != 0) {
|
||||
textFrame = new Rectangle(new Point(textFrame.getTopLeft().getY(), textFrame.getTopLeft().getX()), textFrame.getHeight(), textFrame.getWidth(), classificationPage.getPageNumber());
|
||||
} else if (classificationPage.getRotation() == 180) {
|
||||
textFrame = new Rectangle(new Point(textFrame.getTopLeft().getX(), classificationPage.getPageHeight() - textFrame.getTopLeft().getY() - textFrame.getHeight()),
|
||||
} else if (page.getPageWidth() > page.getPageHeight() && page.getRotation() != 0) {
|
||||
textFrame = new Rectangle(new Point(textFrame.getTopLeft().getY(), textFrame.getTopLeft().getX()), textFrame.getHeight(), textFrame.getWidth(), page.getPageNumber());
|
||||
} else if (page.getRotation() == 180) {
|
||||
textFrame = new Rectangle(new Point(textFrame.getTopLeft().getX(), page.getPageHeight() - textFrame.getTopLeft().getY() - textFrame.getHeight()),
|
||||
textFrame.getWidth(),
|
||||
textFrame.getHeight(),
|
||||
0);
|
||||
}
|
||||
classificationPage.setBodyTextFrame(textFrame);
|
||||
page.setBodyTextFrame(textFrame);
|
||||
}
|
||||
|
||||
|
||||
@ -59,50 +63,50 @@ public class BodyTextFrameService {
|
||||
* 270 -> LowerRight
|
||||
* The aspect ratio of the page is also regarded.
|
||||
*
|
||||
* @param classificationPages List of all classificationPages
|
||||
* @param pages List of all pages
|
||||
* @param documentFontSizeCounter Statistics of the document
|
||||
* @param landscape Calculate for landscape or portrait
|
||||
* @return Rectangle of the text frame
|
||||
*/
|
||||
public Rectangle calculateBodyTextFrame(List<ClassificationPage> classificationPages, FloatFrequencyCounter documentFontSizeCounter, boolean landscape) {
|
||||
public Rectangle calculateBodyTextFrame(List<ClassificationPage> pages, FloatFrequencyCounter documentFontSizeCounter, boolean landscape) {
|
||||
|
||||
BodyTextFrameExpansionsRectangle expansionsRectangle = new BodyTextFrameExpansionsRectangle();
|
||||
|
||||
for (ClassificationPage classificationPage : classificationPages) {
|
||||
for (ClassificationPage page : pages) {
|
||||
|
||||
if (classificationPage.getTextBlocks().isEmpty() || landscape != classificationPage.isLandscape()) {
|
||||
if (page.getTextBlocks().isEmpty() || landscape != page.isLandscape()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
for (AbstractTextContainer container : classificationPage.getTextBlocks()) {
|
||||
for (AbstractPageBlock container : page.getTextBlocks()) {
|
||||
|
||||
if (container instanceof ClassificationTextBlock) {
|
||||
ClassificationTextBlock textBlock = (ClassificationTextBlock) container;
|
||||
if (container instanceof TextPageBlock) {
|
||||
TextPageBlock textBlock = (TextPageBlock) container;
|
||||
if (textBlock.getMostPopularWordFont() == null || textBlock.getMostPopularWordStyle() == null) {
|
||||
continue;
|
||||
}
|
||||
|
||||
float approxLineCount = PositionUtils.getApproxLineCount(textBlock);
|
||||
if (approxLineCount < 2.9f) {
|
||||
if (approxLineCount < APPROXIMATE_HEADER_LINE_COUNT) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (documentFontSizeCounter.getMostPopular() != null && textBlock.getMostPopularWordFontSize() >= documentFontSizeCounter.getMostPopular()) {
|
||||
|
||||
expandRectangle(textBlock, classificationPage, expansionsRectangle);
|
||||
expandRectangle(textBlock, page, expansionsRectangle);
|
||||
}
|
||||
}
|
||||
|
||||
if (container instanceof Table) {
|
||||
Table table = (Table) container;
|
||||
for (List<TableCell> row : table.getRows()) {
|
||||
for (TableCell cell : row) {
|
||||
if (container instanceof TablePageBlock) {
|
||||
TablePageBlock table = (TablePageBlock) container;
|
||||
for (List<Cell> row : table.getRows()) {
|
||||
for (Cell cell : row) {
|
||||
|
||||
if (cell == null || cell.getTextBlocks() == null) {
|
||||
continue;
|
||||
}
|
||||
for (ClassificationTextBlock textBlock : cell.getTextBlocks()) {
|
||||
expandRectangle(textBlock, classificationPage, expansionsRectangle);
|
||||
for (TextPageBlock textBlock : cell.getTextBlocks()) {
|
||||
expandRectangle(textBlock, page, expansionsRectangle);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -116,9 +120,9 @@ public class BodyTextFrameService {
|
||||
}
|
||||
|
||||
|
||||
private void expandRectangle(ClassificationTextBlock textBlock, ClassificationPage classificationPage, BodyTextFrameExpansionsRectangle expansionsRectangle) {
|
||||
private void expandRectangle(TextPageBlock textBlock, ClassificationPage page, BodyTextFrameExpansionsRectangle expansionsRectangle) {
|
||||
|
||||
if (classificationPage.getPageWidth() > classificationPage.getPageHeight() && classificationPage.getRotation() != 0) {
|
||||
if (page.getPageWidth() > page.getPageHeight() && page.getRotation() != 0) {
|
||||
if (textBlock.getPdfMinY() < expansionsRectangle.minX) {
|
||||
expansionsRectangle.minX = textBlock.getPdfMinY();
|
||||
}
|
||||
|
||||
@ -6,10 +6,11 @@ import java.util.regex.Pattern;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.AbstractTextContainer;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.PageBlockType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.PositionUtils;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
@ -31,43 +32,43 @@ public class ClassificationService {
|
||||
|
||||
log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());
|
||||
|
||||
for (ClassificationPage classificationPage : document.getPages()) {
|
||||
bodyTextFrameService.setBodyTextFrameAdjustedToPage(classificationPage, bodyTextFrame, landscapeBodyTextFrame);
|
||||
classifyPage(classificationPage, document, headlineFontSizes);
|
||||
for (ClassificationPage page : document.getPages()) {
|
||||
bodyTextFrameService.setBodyTextFrameAdjustedToPage(page, bodyTextFrame, landscapeBodyTextFrame);
|
||||
classifyPage(page, document, headlineFontSizes);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public void classifyPage(ClassificationPage classificationPage, ClassificationDocument document, List<Float> headlineFontSizes) {
|
||||
public void classifyPage(ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes) {
|
||||
|
||||
for (AbstractTextContainer textBlock : classificationPage.getTextBlocks()) {
|
||||
if (textBlock instanceof ClassificationTextBlock) {
|
||||
classifyBlock((ClassificationTextBlock) textBlock, classificationPage, document, headlineFontSizes);
|
||||
for (AbstractPageBlock textBlock : page.getTextBlocks()) {
|
||||
if (textBlock instanceof TextPageBlock) {
|
||||
classifyBlock((TextPageBlock) textBlock, page, document, headlineFontSizes);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public void classifyBlock(ClassificationTextBlock textBlock, ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes) {
|
||||
public void classifyBlock(TextPageBlock textBlock, ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes) {
|
||||
|
||||
var bodyTextFrame = page.getBodyTextFrame();
|
||||
|
||||
if (document.getFontSizeCounter().getMostPopular() == null) {
|
||||
textBlock.setClassification("Other");
|
||||
textBlock.setClassification(PageBlockType.OTHER);
|
||||
return;
|
||||
}
|
||||
if (PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
|
||||
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) {
|
||||
textBlock.setClassification("Header");
|
||||
textBlock.setClassification(PageBlockType.HEADER);
|
||||
|
||||
} else if (PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
|
||||
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) {
|
||||
textBlock.setClassification("Footer");
|
||||
textBlock.setClassification(PageBlockType.FOOTER);
|
||||
} else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock,
|
||||
document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks()
|
||||
.size() == 1)) {
|
||||
if (!Pattern.matches("[0-9]+", textBlock.toString())) {
|
||||
textBlock.setClassification("Title");
|
||||
textBlock.setClassification(PageBlockType.TITLE);
|
||||
}
|
||||
} else if (textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter()
|
||||
.getMostPopular() && PositionUtils.getApproxLineCount(textBlock) < 4.9 && (textBlock.getMostPopularWordStyle().equals("bold") || !document.getFontStyleCounter()
|
||||
@ -80,36 +81,34 @@ public class ClassificationService {
|
||||
|
||||
for (int i = 1; i <= headlineFontSizes.size(); i++) {
|
||||
if (textBlock.getMostPopularWordFontSize() == headlineFontSizes.get(i - 1)) {
|
||||
textBlock.setClassification("H " + i);
|
||||
textBlock.setClassification(PageBlockType.getHeadlineType(i));
|
||||
document.setHeadlines(true);
|
||||
}
|
||||
}
|
||||
} else if (!textBlock.getText().startsWith("Table ") && !textBlock.getText().startsWith("Figure ") && PositionUtils.isWithinBodyTextFrame(bodyTextFrame,
|
||||
textBlock) && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter()
|
||||
.getMostPopular()
|
||||
.equals("bold") && PositionUtils.getApproxLineCount(textBlock) < 2.9 && textBlock.getSequences()
|
||||
} else if (!textBlock.getText().startsWith("Figure ") && PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordStyle()
|
||||
.equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold") && PositionUtils.getApproxLineCount(textBlock) < 2.9 && textBlock.getSequences()
|
||||
.get(0)
|
||||
.getTextPositions()
|
||||
.get(0)
|
||||
.getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) {
|
||||
textBlock.setClassification("H " + (headlineFontSizes.size() + 1));
|
||||
textBlock.setClassification(PageBlockType.getHeadlineType(headlineFontSizes.size() + 1));
|
||||
document.setHeadlines(true);
|
||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
|
||||
.getMostPopular() && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold")) {
|
||||
textBlock.setClassification("TextBlock Bold");
|
||||
textBlock.setClassification(PageBlockType.PARAGRAPH_BOLD);
|
||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFont()
|
||||
.equals(document.getFontCounter().getMostPopular()) && textBlock.getMostPopularWordStyle()
|
||||
.equals(document.getFontStyleCounter().getMostPopular()) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()) {
|
||||
textBlock.setClassification("TextBlock");
|
||||
textBlock.setClassification(PageBlockType.PARAGRAPH);
|
||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
|
||||
.getMostPopular() && textBlock.getMostPopularWordStyle().equals("italic") && !document.getFontStyleCounter()
|
||||
.getMostPopular()
|
||||
.equals("italic") && PositionUtils.getApproxLineCount(textBlock) < 2.9) {
|
||||
textBlock.setClassification("TextBlock Italic");
|
||||
textBlock.setClassification(PageBlockType.PARAGRAPH_ITALIC);
|
||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)) {
|
||||
textBlock.setClassification("TextBlock Unknown");
|
||||
textBlock.setClassification(PageBlockType.PARAGRAPH_UNKNOWN);
|
||||
} else {
|
||||
textBlock.setClassification("Other");
|
||||
textBlock.setClassification(PageBlockType.OTHER);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -9,16 +9,16 @@ import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.adapter.ImageServiceResponseAdapter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableCells;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.parsing.PDFLinesTextStripper;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.AbstractTextContainer;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.CvParsedTableCell;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
@ -35,7 +35,7 @@ public class PdfParsingService {
|
||||
private final ImageServiceResponseAdapter imageServiceResponseAdapter;
|
||||
|
||||
|
||||
public ClassificationDocument parseDocument(PDDocument originDocument, Map<Integer, List<CvParsedTableCell>> pdfTableCells, Map<Integer, List<ClassifiedImage>> pdfImages) {
|
||||
public ClassificationDocument parseDocument(PDDocument originDocument, Map<Integer, List<TableCells>> pdfTableCells, Map<Integer, List<ClassifiedImage>> pdfImages) {
|
||||
|
||||
ClassificationDocument document = new ClassificationDocument();
|
||||
List<ClassificationPage> classificationPages = new ArrayList<>();
|
||||
@ -56,7 +56,7 @@ public class PdfParsingService {
|
||||
@SneakyThrows
|
||||
private void parsePage(Map<Integer, List<ClassifiedImage>> pdfImages,
|
||||
PDDocument pdDocument,
|
||||
Map<Integer, List<CvParsedTableCell>> pdfTableCells,
|
||||
Map<Integer, List<TableCells>> pdfTableCells,
|
||||
ClassificationDocument document,
|
||||
List<ClassificationPage> classificationPages,
|
||||
int pageNumber) {
|
||||
@ -93,7 +93,7 @@ public class PdfParsingService {
|
||||
imageServiceResponseAdapter.findOcr(classificationPage);
|
||||
}
|
||||
|
||||
tableExtractionService.removeRedundantTableCells(cleanRulings, classificationPage);
|
||||
tableExtractionService.extractTables(cleanRulings, classificationPage);
|
||||
buildPageStatistics(classificationPage);
|
||||
increaseDocumentStatistics(classificationPage, document);
|
||||
|
||||
@ -115,12 +115,12 @@ public class PdfParsingService {
|
||||
private void buildPageStatistics(ClassificationPage classificationPage) {
|
||||
|
||||
// Collect all statistics for the classificationPage, except from blocks inside tables, as tables will always be added to BodyTextFrame.
|
||||
for (AbstractTextContainer textBlock : classificationPage.getTextBlocks()) {
|
||||
if (textBlock instanceof ClassificationTextBlock) {
|
||||
if (((ClassificationTextBlock) textBlock).getSequences() == null) {
|
||||
for (AbstractPageBlock textBlock : classificationPage.getTextBlocks()) {
|
||||
if (textBlock instanceof TextPageBlock) {
|
||||
if (((TextPageBlock) textBlock).getSequences() == null) {
|
||||
continue;
|
||||
}
|
||||
for (TextPositionSequence word : ((ClassificationTextBlock) textBlock).getSequences()) {
|
||||
for (TextPositionSequence word : ((TextPageBlock) textBlock).getSequences()) {
|
||||
classificationPage.getTextHeightCounter().add(word.getTextHeight());
|
||||
classificationPage.getFontCounter().add(word.getFont());
|
||||
classificationPage.getFontSizeCounter().add(word.getFontSize());
|
||||
@ -132,3 +132,5 @@ public class PdfParsingService {
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -12,9 +12,9 @@ import java.util.Map;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.CvParsedTableCell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableCells;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.DoubleComparisons;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
@ -25,7 +25,7 @@ import lombok.extern.slf4j.Slf4j;
|
||||
@RequiredArgsConstructor
|
||||
public class RulingCleaningService {
|
||||
|
||||
public CleanRulings getCleanRulings(List<CvParsedTableCell> cvParsedTableCells, List<Ruling> rulings, float minCharWidth, float maxCharHeight) {
|
||||
public CleanRulings getCleanRulings(List<TableCells> tableCells, List<Ruling> rulings, float minCharWidth, float maxCharHeight) {
|
||||
|
||||
if (!rulings.isEmpty()) {
|
||||
snapPoints(rulings, minCharWidth, maxCharHeight);
|
||||
@ -38,7 +38,7 @@ public class RulingCleaningService {
|
||||
}
|
||||
}
|
||||
if (vrs.isEmpty()) {
|
||||
vrs.addAll(extractVerticalRulings(cvParsedTableCells));
|
||||
vrs.addAll(extractVerticalRulings(tableCells));
|
||||
}
|
||||
List<Ruling> verticalRulingLines = collapseOrientedRulings(vrs);
|
||||
|
||||
@ -49,7 +49,7 @@ public class RulingCleaningService {
|
||||
}
|
||||
}
|
||||
if (hrs.isEmpty()) {
|
||||
hrs.addAll(extractHorizontalRulings(cvParsedTableCells));
|
||||
hrs.addAll(extractHorizontalRulings(tableCells));
|
||||
}
|
||||
List<Ruling> horizontalRulingLines = collapseOrientedRulings(hrs);
|
||||
|
||||
@ -132,12 +132,12 @@ public class RulingCleaningService {
|
||||
}
|
||||
|
||||
|
||||
private Collection<? extends Ruling> extractVerticalRulings(List<CvParsedTableCell> cvParsedTableCells) {
|
||||
private Collection<? extends Ruling> extractVerticalRulings(List<TableCells> cvParsedTableCells) {
|
||||
|
||||
List<Ruling> vrs = new ArrayList<>();
|
||||
|
||||
if (cvParsedTableCells != null) {
|
||||
for (CvParsedTableCell cvParsedTableCell : cvParsedTableCells) {
|
||||
for (TableCells cvParsedTableCell : cvParsedTableCells) {
|
||||
Ruling leftLine = createRuling(cvParsedTableCell.getX0(), cvParsedTableCell.getX0(), cvParsedTableCell.getY0(), cvParsedTableCell.getY1());
|
||||
Ruling rightLine = createRuling(cvParsedTableCell.getX1(), cvParsedTableCell.getX1(), cvParsedTableCell.getY0(), cvParsedTableCell.getY1());
|
||||
vrs.add(leftLine);
|
||||
@ -148,12 +148,12 @@ public class RulingCleaningService {
|
||||
}
|
||||
|
||||
|
||||
private Collection<? extends Ruling> extractHorizontalRulings(List<CvParsedTableCell> cvParsedTableCells) {
|
||||
private Collection<? extends Ruling> extractHorizontalRulings(List<TableCells> cvParsedTableCells) {
|
||||
|
||||
List<Ruling> hrs = new ArrayList<>();
|
||||
|
||||
if (cvParsedTableCells != null) {
|
||||
for (CvParsedTableCell cvParsedTableCell : cvParsedTableCells) {
|
||||
for (TableCells cvParsedTableCell : cvParsedTableCells) {
|
||||
Ruling topLine = createRuling(cvParsedTableCell.getX0(), cvParsedTableCell.getX1(), cvParsedTableCell.getY1(), cvParsedTableCell.getY1());
|
||||
Ruling baseLine = createRuling(cvParsedTableCell.getX0(), cvParsedTableCell.getX1(), cvParsedTableCell.getY0(), cvParsedTableCell.getY0());
|
||||
hrs.add(topLine);
|
||||
|
||||
@ -9,17 +9,18 @@ import java.util.stream.Collectors;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationFooter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationSection;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationHeader;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.UnclassifiedText;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.AbstractTextContainer;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.TableCell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.Table;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationFooter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationHeader;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationSection;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.PageBlockType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.UnclassifiedText;
|
||||
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@ -29,44 +30,44 @@ public class SectionsBuilderService {
|
||||
|
||||
public void buildSections(ClassificationDocument document) {
|
||||
|
||||
List<AbstractTextContainer> chunkWords = new ArrayList<>();
|
||||
List<AbstractPageBlock> chunkWords = new ArrayList<>();
|
||||
List<ClassificationSection> chunkBlockList = new ArrayList<>();
|
||||
List<ClassificationHeader> headers = new ArrayList<>();
|
||||
List<ClassificationFooter> footers = new ArrayList<>();
|
||||
List<UnclassifiedText> unclassifiedTexts = new ArrayList<>();
|
||||
|
||||
AbstractTextContainer prev = null;
|
||||
AbstractPageBlock prev = null;
|
||||
|
||||
String lastHeadline = "";
|
||||
Table previousTable = null;
|
||||
for (ClassificationPage classificationPage : document.getPages()) {
|
||||
List<ClassificationTextBlock> header = new ArrayList<>();
|
||||
List<ClassificationTextBlock> footer = new ArrayList<>();
|
||||
List<ClassificationTextBlock> unclassifiedText = new ArrayList<>();
|
||||
for (AbstractTextContainer current : classificationPage.getTextBlocks()) {
|
||||
TablePageBlock previousTable = null;
|
||||
for (ClassificationPage page : document.getPages()) {
|
||||
List<TextPageBlock> header = new ArrayList<>();
|
||||
List<TextPageBlock> footer = new ArrayList<>();
|
||||
List<TextPageBlock> unclassifiedText = new ArrayList<>();
|
||||
for (AbstractPageBlock current : page.getTextBlocks()) {
|
||||
|
||||
if (current.getClassification() == null) {
|
||||
continue;
|
||||
}
|
||||
|
||||
current.setPage(classificationPage.getPageNumber());
|
||||
current.setPage(page.getPageNumber());
|
||||
|
||||
if (current.getClassification().equals("Header")) {
|
||||
header.add((ClassificationTextBlock) current);
|
||||
if (current.getClassification().equals(PageBlockType.HEADER)) {
|
||||
header.add((TextPageBlock) current);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (current.getClassification().equals("Footer")) {
|
||||
footer.add((ClassificationTextBlock) current);
|
||||
if (current.getClassification().equals(PageBlockType.FOOTER)) {
|
||||
footer.add((TextPageBlock) current);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (current.getClassification().equals("Other")) {
|
||||
unclassifiedText.add((ClassificationTextBlock) current);
|
||||
if (current.getClassification().equals(PageBlockType.OTHER)) {
|
||||
unclassifiedText.add((TextPageBlock) current);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (prev != null && current.getClassification().startsWith("H ") && !prev.getClassification().startsWith("H ") || !document.isHeadlines()) {
|
||||
if (prev != null && current.getClassification().isHeadline() && !prev.getClassification().isHeadline() || !document.isHeadlines()) {
|
||||
ClassificationSection chunkBlock = buildTextBlock(chunkWords, lastHeadline);
|
||||
chunkBlock.setHeadline(lastHeadline);
|
||||
if (document.isHeadlines()) {
|
||||
@ -78,7 +79,7 @@ public class SectionsBuilderService {
|
||||
previousTable = chunkBlock.getTables().get(chunkBlock.getTables().size() - 1);
|
||||
}
|
||||
}
|
||||
if (current instanceof Table table) {
|
||||
if (current instanceof TablePageBlock table) {
|
||||
// Distribute header information for subsequent tables
|
||||
mergeTableMetadata(table, previousTable);
|
||||
previousTable = table;
|
||||
@ -106,15 +107,14 @@ public class SectionsBuilderService {
|
||||
document.setHeaders(headers);
|
||||
document.setFooters(footers);
|
||||
document.setUnclassifiedTexts(unclassifiedTexts);
|
||||
addImagesToSections(document);
|
||||
}
|
||||
|
||||
|
||||
private void addImagesToSections(ClassificationDocument document) {
|
||||
public void addImagesToSections(ClassificationDocument document) {
|
||||
|
||||
Map<Integer, List<ClassificationSection>> sectionMap = new HashMap<>();
|
||||
for (ClassificationSection section : document.getSections()) {
|
||||
for (AbstractTextContainer container : section.getPageBlocks()) {
|
||||
for (AbstractPageBlock container : section.getPageBlocks()) {
|
||||
|
||||
List<ClassificationSection> sectionsOnPage = sectionMap.computeIfAbsent(container.getPage(), c -> new ArrayList<>());
|
||||
if (sectionsOnPage.contains(section)) {
|
||||
@ -138,11 +138,11 @@ public class SectionsBuilderService {
|
||||
sectionMap.computeIfAbsent(1, x -> new ArrayList<>()).add(section);
|
||||
}
|
||||
|
||||
for (ClassificationPage classificationPage : document.getPages()) {
|
||||
for (ClassifiedImage image : classificationPage.getImages()) {
|
||||
List<ClassificationSection> sectionsOnPage = sectionMap.get(classificationPage.getPageNumber());
|
||||
for (ClassificationPage page : document.getPages()) {
|
||||
for (ClassifiedImage image : page.getImages()) {
|
||||
List<ClassificationSection> sectionsOnPage = sectionMap.get(page.getPageNumber());
|
||||
if (sectionsOnPage == null) {
|
||||
int i = classificationPage.getPageNumber();
|
||||
int i = page.getPageNumber();
|
||||
while (sectionsOnPage == null) {
|
||||
sectionsOnPage = sectionMap.get(i);
|
||||
i--;
|
||||
@ -154,8 +154,8 @@ public class SectionsBuilderService {
|
||||
Float xMax = null;
|
||||
Float yMax = null;
|
||||
|
||||
for (AbstractTextContainer abs : section.getPageBlocks()) {
|
||||
if (abs.getPage() != classificationPage.getPageNumber()) {
|
||||
for (AbstractPageBlock abs : section.getPageBlocks()) {
|
||||
if (abs.getPage() != page.getPageNumber()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -212,23 +212,23 @@ public class SectionsBuilderService {
|
||||
}
|
||||
|
||||
|
||||
private void mergeTableMetadata(Table currentTable, Table previousTable) {
|
||||
private void mergeTableMetadata(TablePageBlock currentTable, TablePageBlock previousTable) {
|
||||
|
||||
// Distribute header information for subsequent tables
|
||||
if (previousTable != null && hasInvalidHeaderInformation(currentTable) && hasValidHeaderInformation(previousTable)) {
|
||||
List<TableCell> previousTableNonHeaderRow = getRowWithNonHeaderCells(previousTable);
|
||||
List<TableCell> tableNonHeaderRow = getRowWithNonHeaderCells(currentTable);
|
||||
List<Cell> previousTableNonHeaderRow = getRowWithNonHeaderCells(previousTable);
|
||||
List<Cell> tableNonHeaderRow = getRowWithNonHeaderCells(currentTable);
|
||||
// Allow merging of tables if header row is separated from first logical non-header row
|
||||
if (previousTableNonHeaderRow.isEmpty() && previousTable.getRowCount() == 1 && previousTable.getRows().get(0).size() == tableNonHeaderRow.size()) {
|
||||
previousTableNonHeaderRow = previousTable.getRows().get(0).stream().map(cell -> {
|
||||
TableCell fakeCell = new TableCell(cell.getPoints()[0], cell.getPoints()[2]);
|
||||
Cell fakeCell = new Cell(cell.getPoints()[0], cell.getPoints()[2]);
|
||||
fakeCell.setHeaderCells(Collections.singletonList(cell));
|
||||
return fakeCell;
|
||||
}).collect(Collectors.toList());
|
||||
}
|
||||
if (previousTableNonHeaderRow.size() == tableNonHeaderRow.size()) {
|
||||
for (int i = currentTable.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
|
||||
List<TableCell> row = currentTable.getRows().get(i);
|
||||
List<Cell> row = currentTable.getRows().get(i);
|
||||
if (row.size() == tableNonHeaderRow.size() && row.stream().allMatch(cell -> cell.getHeaderCells().isEmpty())) {
|
||||
for (int j = 0; j < row.size(); j++) {
|
||||
row.get(j).setHeaderCells(previousTableNonHeaderRow.get(j).getHeaderCells());
|
||||
@ -240,52 +240,52 @@ public class SectionsBuilderService {
|
||||
}
|
||||
|
||||
|
||||
private ClassificationSection buildTextBlock(List<AbstractTextContainer> wordBlockList, String lastHeadline) {
|
||||
private ClassificationSection buildTextBlock(List<AbstractPageBlock> wordBlockList, String lastHeadline) {
|
||||
|
||||
ClassificationSection section = new ClassificationSection();
|
||||
|
||||
for (AbstractTextContainer container : wordBlockList) {
|
||||
if (container instanceof Table table) {
|
||||
for (AbstractPageBlock container : wordBlockList) {
|
||||
if (container instanceof TablePageBlock table) {
|
||||
|
||||
if (lastHeadline == null || lastHeadline.isEmpty()) {
|
||||
table.setHeadline("Text in table");
|
||||
} else {
|
||||
table.setHeadline("Table in: " + lastHeadline);
|
||||
table.setHeadline("TablePageBlock in: " + lastHeadline);
|
||||
}
|
||||
|
||||
section.getPageBlocks().add(table);
|
||||
continue;
|
||||
}
|
||||
|
||||
ClassificationTextBlock wordBlock = (ClassificationTextBlock) container;
|
||||
TextPageBlock wordBlock = (TextPageBlock) container;
|
||||
section.getPageBlocks().add(wordBlock);
|
||||
}
|
||||
return section;
|
||||
}
|
||||
|
||||
|
||||
private boolean hasValidHeaderInformation(Table table) {
|
||||
private boolean hasValidHeaderInformation(TablePageBlock table) {
|
||||
|
||||
return !hasInvalidHeaderInformation(table);
|
||||
}
|
||||
|
||||
|
||||
private boolean hasInvalidHeaderInformation(Table table) {
|
||||
private boolean hasInvalidHeaderInformation(TablePageBlock table) {
|
||||
|
||||
return table.getRows().stream().flatMap(row -> row.stream().filter(cell -> !cell.getHeaderCells().isEmpty())).findAny().isEmpty();
|
||||
|
||||
}
|
||||
|
||||
|
||||
private List<TableCell> getRowWithNonHeaderCells(Table table) {
|
||||
private List<Cell> getRowWithNonHeaderCells(TablePageBlock table) {
|
||||
|
||||
for (int i = table.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
|
||||
List<TableCell> row = table.getRows().get(i);
|
||||
List<Cell> row = table.getRows().get(i);
|
||||
if (row.size() == 1) {
|
||||
continue;
|
||||
}
|
||||
boolean allNonHeader = true;
|
||||
for (TableCell cell : row) {
|
||||
for (Cell cell : row) {
|
||||
if (cell.isHeaderCell()) {
|
||||
allNonHeader = false;
|
||||
break;
|
||||
|
||||
@ -9,20 +9,18 @@ import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.TableCell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.QuickSort;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Rectangle;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.DoubleComparisons;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.AbstractTextContainer;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.Rectangle;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.Table;
|
||||
|
||||
@Service
|
||||
public class TableExtractionService {
|
||||
@ -68,28 +66,28 @@ public class TableExtractionService {
|
||||
|
||||
|
||||
/**
|
||||
* Finds tables on a classificationPage and moves textblocks into cells of the found tables.
|
||||
* Note: This algorithm uses Pdf Coordinate System where {0,0} rotated with the classificationPage rotation.
|
||||
* Finds tables on a page and moves textblocks into cells of the found tables.
|
||||
* Note: This algorithm uses the Pdf Coordinate System, where {0,0} is rotated with the page rotation.
|
||||
* 0 -> LowerLeft
|
||||
* 90 -> UpperLeft
|
||||
* 180 -> UpperRight
|
||||
* 270 -> LowerRight
|
||||
*
|
||||
* <p>
|
||||
* DirAdj (Text direction adjusted) values cannot be used here.
|
||||
*
|
||||
* @param cleanRulings The lines used to build the table.
|
||||
* @param classificationPage ClassificationPage object that contains textblocks and statistics.
|
||||
* @param page Page object that contains textblocks and statistics.
|
||||
*/
|
||||
public void removeRedundantTableCells(CleanRulings cleanRulings, ClassificationPage classificationPage) {
|
||||
public void extractTables(CleanRulings cleanRulings, ClassificationPage page) {
|
||||
|
||||
List<TableCell> cells = findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
List<Cell> cells = findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
|
||||
List<ClassificationTextBlock> toBeRemoved = new ArrayList<>();
|
||||
List<TextPageBlock> toBeRemoved = new ArrayList<>();
|
||||
|
||||
for (AbstractTextContainer abstractTextContainer : classificationPage.getTextBlocks()) {
|
||||
ClassificationTextBlock textBlock = (ClassificationTextBlock) abstractTextContainer;
|
||||
for (TableCell cell : cells) {
|
||||
if (cell.intersects(textBlock.getPdfMinX(),
|
||||
for (AbstractPageBlock abstractPageBlock : page.getTextBlocks()) {
|
||||
TextPageBlock textBlock = (TextPageBlock) abstractPageBlock;
|
||||
for (Cell cell : cells) {
|
||||
if (cell.hasMinimumSize() && cell.intersects(textBlock.getPdfMinX(),
|
||||
textBlock.getPdfMinY(),
|
||||
textBlock.getPdfMaxX() - textBlock.getPdfMinX(),
|
||||
textBlock.getPdfMaxY() - textBlock.getPdfMinY())) {
|
||||
@ -101,44 +99,44 @@ public class TableExtractionService {
|
||||
}
|
||||
|
||||
cells = new ArrayList<>(new HashSet<>(cells));
|
||||
QuickSort.sort(cells, Rectangle.ILL_DEFINED_ORDER);
|
||||
DoubleComparisons.sort(cells, Rectangle.ILL_DEFINED_ORDER);
|
||||
|
||||
List<Rectangle> spreadsheetAreas = findSpreadsheetsFromCells(cells).stream().filter(r -> r.getWidth() > 0f && r.getHeight() > 0f).collect(Collectors.toList());
|
||||
List<Rectangle> spreadsheetAreas = findSpreadsheetsFromCells(cells).stream().filter(r -> r.getWidth() > 0f && r.getHeight() > 0f).toList();
|
||||
|
||||
List<Table> tables = new ArrayList<>();
|
||||
List<TablePageBlock> tables = new ArrayList<>();
|
||||
for (Rectangle area : spreadsheetAreas) {
|
||||
|
||||
List<TableCell> overlappingCells = new ArrayList<>();
|
||||
for (TableCell c : cells) {
|
||||
if (c.intersects(area)) {
|
||||
List<Cell> overlappingCells = new ArrayList<>();
|
||||
for (Cell c : cells) {
|
||||
if (c.hasMinimumSize() && c.intersects(area)) {
|
||||
overlappingCells.add(c);
|
||||
}
|
||||
}
|
||||
tables.add(new Table(overlappingCells, area, classificationPage.getRotation()));
|
||||
tables.add(new TablePageBlock(overlappingCells, area, page.getRotation()));
|
||||
}
|
||||
|
||||
for (Table table : tables) {
|
||||
for (TablePageBlock table : tables) {
|
||||
int position = -1;
|
||||
|
||||
Iterator<AbstractTextContainer> itty = classificationPage.getTextBlocks().iterator();
|
||||
Iterator<AbstractPageBlock> itty = page.getTextBlocks().iterator();
|
||||
while (itty.hasNext()) {
|
||||
AbstractTextContainer textBlock = itty.next();
|
||||
if (textBlock instanceof ClassificationTextBlock ? table.containsBlock((ClassificationTextBlock) textBlock) : table.contains(textBlock) && position == -1) {
|
||||
position = classificationPage.getTextBlocks().indexOf(textBlock);
|
||||
AbstractPageBlock textBlock = itty.next();
|
||||
if (textBlock instanceof TextPageBlock ? table.containsBlock((TextPageBlock) textBlock) : table.contains(textBlock) && position == -1) {
|
||||
position = page.getTextBlocks().indexOf(textBlock);
|
||||
}
|
||||
}
|
||||
if (position != -1) {
|
||||
classificationPage.getTextBlocks().add(position, table);
|
||||
page.getTextBlocks().add(position, table);
|
||||
}
|
||||
}
|
||||
|
||||
classificationPage.getTextBlocks().removeAll(toBeRemoved);
|
||||
page.getTextBlocks().removeAll(toBeRemoved);
|
||||
}
|
||||
|
||||
|
||||
public List<TableCell> findCells(List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
|
||||
public List<Cell> findCells(List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
|
||||
|
||||
List<TableCell> cellsFound = new ArrayList<>();
|
||||
List<Cell> cellsFound = new ArrayList<>();
|
||||
Map<Point2D, Ruling[]> intersectionPoints = Ruling.findIntersections(horizontalRulingLines, verticalRulingLines);
|
||||
List<Point2D> intersectionPointsList = new ArrayList<>(intersectionPoints.keySet());
|
||||
intersectionPointsList.sort(POINT_COMPARATOR);
|
||||
@ -174,7 +172,7 @@ public class TableExtractionService {
|
||||
Point2D btmRight = new Point2D.Float((float) yPoint.getX(), (float) xPoint.getY());
|
||||
if (intersectionPoints.containsKey(btmRight) && intersectionPoints.get(btmRight)[0].equals(intersectionPoints.get(xPoint)[0]) && intersectionPoints.get(btmRight)[1].equals(
|
||||
intersectionPoints.get(yPoint)[1])) {
|
||||
cellsFound.add(new TableCell(topLeft, btmRight));
|
||||
cellsFound.add(new Cell(topLeft, btmRight));
|
||||
break outer;
|
||||
}
|
||||
}
|
||||
|
||||
@ -1,6 +1,8 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.utils;

import java.math.BigDecimal;
import java.util.Comparator;
import java.util.List;

import lombok.experimental.UtilityClass;
import lombok.extern.slf4j.Slf4j;
@ -20,11 +22,22 @@ public final class DoubleComparisons {

public static float round(double d, int decimalPlace) {

BigDecimal bd = BigDecimal.valueOf(d);
bd = bd.setScale(decimalPlace, BigDecimal.ROUND_HALF_UP);
return bd.floatValue();
}


public static <T> void sort(List<T> list, Comparator<? super T> comparator) {

try {
QuickSort.sort(list, comparator);
} catch (IllegalArgumentException e) {
// This should not happen since we use QuickSort from PDFBox
log.warn(e.getMessage());
}
}

}
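A minimal usage sketch of the two helpers above (not part of this commit; the values are purely illustrative):

float rounded = DoubleComparisons.round(3.14159, 2); // 3.14, via BigDecimal HALF_UP rounding
List<Float> cellWidths = new ArrayList<>(List.of(12.5f, 3.0f, 7.25f));
DoubleComparisons.sort(cellWidths, Float::compare); // [3.0, 7.25, 12.5]; an IllegalArgumentException from QuickSort is only logged, not rethrown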
|
||||
|
||||
|
||||
@ -0,0 +1,56 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.utils;

import java.io.File;
import java.io.IOException;
import java.nio.file.Files;

import lombok.experimental.UtilityClass;
import lombok.extern.slf4j.Slf4j;

@Slf4j
@UtilityClass
public class FileUtils {

public File createTempFile(String filenamePrefix, String filenameSuffix) throws IOException {

File tempFile = Files.createTempFile(filenamePrefix, filenameSuffix).toFile();
setRWPermissionsOnlyForOwner(tempFile);

return tempFile;
}


/**
* Deletes a file; logs a message with the reason if the deletion fails.
* This method is null-safe.
*
* @param file The file to delete. Can be null.
*/
public void deleteFile(File file) {

if (file != null) {
try {
Files.deleteIfExists(file.toPath());
} catch (IOException ex) {
log.warn("Could not delete file!", ex);
}
}
}


// We don't need to check the results of the permission setters below,
// since we're manipulating a file we created ourselves.
@SuppressWarnings({"ResultOfMethodCallIgnored", "squid:S899"})
private void setRWPermissionsOnlyForOwner(File tempFile) {

try {
tempFile.setReadable(true, true);
tempFile.setWritable(true, true);
tempFile.setExecutable(false);
} catch (SecurityException ex) {
// This should never happen since we're creating a temp file ourselves.
log.warn("Caught an exception during temp file creation. This should not happen. Check the code.", ex);
}
}

}
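A hedged usage sketch for the new FileUtils helper (not part of the commit; the prefix/suffix values are made up, and the snippet assumes a surrounding method that declares or handles IOException):

File scratch = FileUtils.createTempFile("layoutparser-", ".pdf"); // temp file with owner-only read/write permissions
try {
// ... write intermediate data to the temp file ...
} finally {
FileUtils.deleteFile(scratch); // null-safe; only logs a warning if deletion fails
}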
|
||||
@ -1,7 +1,7 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.utils;
|
||||
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@ -11,7 +11,7 @@ public final class PositionUtils {
|
||||
|
||||
// TODO This currently uses pdf coord system. In the future this should use java coord system.
// Note: DirAdj (TextDirection Adjusted) cannot be used for this.
|
||||
public boolean isWithinBodyTextFrame(Rectangle btf, ClassificationTextBlock textBlock) {
|
||||
public boolean isWithinBodyTextFrame(Rectangle btf, TextPageBlock textBlock) {
|
||||
|
||||
if (btf == null || textBlock == null) {
|
||||
return false;
|
||||
@ -32,7 +32,7 @@ public final class PositionUtils {
|
||||
|
||||
// TODO This currently uses pdf coord system. In the future this should use java coord system.
// Note: DirAdj (TextDirection Adjusted) cannot be used for this.
|
||||
public boolean isOverBodyTextFrame(Rectangle btf, ClassificationTextBlock textBlock, int rotation) {
|
||||
public boolean isOverBodyTextFrame(Rectangle btf, TextPageBlock textBlock, int rotation) {
|
||||
|
||||
if (btf == null || textBlock == null) {
|
||||
return false;
|
||||
@ -58,9 +58,10 @@ public final class PositionUtils {
|
||||
|
||||
}
|
||||
|
||||
|
||||
// TODO This currently uses pdf coord system. In the future this should use java coord system.
// Note: DirAdj (TextDirection Adjusted) cannot be used for this.
|
||||
public boolean isUnderBodyTextFrame(Rectangle btf, ClassificationTextBlock textBlock, int rotation) {
|
||||
public boolean isUnderBodyTextFrame(Rectangle btf, TextPageBlock textBlock, int rotation) {
|
||||
|
||||
if (btf == null || textBlock == null) {
|
||||
return false;
|
||||
@ -86,9 +87,10 @@ public final class PositionUtils {
|
||||
|
||||
}
|
||||
|
||||
|
||||
// TODO This currently uses pdf coord system. In the future this should use java coord system.
// Note: DirAdj (TextDirection Adjusted) cannot be used for this.
|
||||
public boolean isTouchingUnderBodyTextFrame(Rectangle btf, ClassificationTextBlock textBlock) {
|
||||
public boolean isTouchingUnderBodyTextFrame(Rectangle btf, TextPageBlock textBlock) {
|
||||
|
||||
//TODO Currently this is not working for rotated pages.
|
||||
|
||||
@ -105,13 +107,13 @@ public final class PositionUtils {
|
||||
}
|
||||
|
||||
|
||||
public float getHeightDifferenceBetweenChunkWordAndDocumentWord(ClassificationTextBlock textBlock, Float documentMostPopularWordHeight) {
|
||||
public float getHeightDifferenceBetweenChunkWordAndDocumentWord(TextPageBlock textBlock, Float documentMostPopularWordHeight) {
|
||||
|
||||
return textBlock.getMostPopularWordHeight() - documentMostPopularWordHeight;
|
||||
}
|
||||
|
||||
|
||||
public Float getApproxLineCount(ClassificationTextBlock textBlock) {
|
||||
public Float getApproxLineCount(TextPageBlock textBlock) {
|
||||
|
||||
return textBlock.getHeight() / textBlock.getMostPopularWordHeight();
|
||||
}
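A quick worked example for the two helpers above (the numbers are invented for illustration): for a hypothetical TextPageBlock that is 120 pt tall, whose most popular word height is 12 pt, on a document whose most popular word height is 10 pt:

PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, 10f); // 12 - 10 = 2.0
PositionUtils.getApproxLineCount(textBlock); // 120 / 12 = 10.0 approximate lines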
|
||||
|
||||
@ -3,7 +3,7 @@ package com.knecon.fforesight.service.layoutparser.processor.classification.util
|
||||
import java.awt.geom.Line2D;
|
||||
import java.awt.geom.Point2D;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Ruling;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@ -13,7 +13,7 @@ public final class RulingTextDirAdjustUtil {
|
||||
/**
|
||||
* Converts a ruling (line of a table) the same way TextPositions are converted in PDFBox.
|
||||
* This returns the y position of the text, adjusted so that 0,0 is the upper left corner and adjusted for the text direction.
|
||||
*
|
||||
* <p>
|
||||
* See org.apache.pdfbox.text.TextPosition
|
||||
*/
|
||||
public Line2D.Float convertToDirAdj(Ruling ruling, float dir, float pageWidth, float pageHeight) {
|
||||
|
||||
@ -16,4 +16,16 @@ public final class TextNormalizationUtilities {
return text.replaceAll("([^\\s\\d\\-]{2,500})[\\-\\u00AD]\\R", "$1");
|
||||
}
|
||||
|
||||
|
||||
public static String removeLineBreaks(String text) {
|
||||
|
||||
return text.replaceAll("\n", " ");
|
||||
}
|
||||
|
||||
|
||||
public static String removeRepeatingWhitespaces(String text) {
|
||||
|
||||
return text.replaceAll(" {2}", " ");
|
||||
}
|
||||
|
||||
}
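A small hedged sketch of the two new normalization helpers (illustrative input, not part of the commit). Note that removeRepeatingWhitespaces replaces each pair of spaces with a single space in one pass, so longer runs are only partially collapsed by a single call:

String flat = TextNormalizationUtilities.removeLineBreaks("first line\nsecond line"); // "first line second line"
String squeezed = TextNormalizationUtilities.removeRepeatingWhitespaces("a  b    c"); // "a b  c" (the four-space run becomes two spaces)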
|
||||
|
||||
@ -4,286 +4,124 @@ import static java.lang.String.format;
|
||||
import static java.util.stream.Collectors.groupingBy;
|
||||
import static java.util.stream.Collectors.toList;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.NoSuchElementException;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationFooter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationHeader;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Footer;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.GenericSemanticNode;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Header;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Headline;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Image;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Page;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Paragraph;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Section;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.AtomicTextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.IdBuilder;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionOperations;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.DocumentGraph;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.FooterNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.HeaderNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.HeadlineNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.ImageNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.ParagraphNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SectionNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SemanticNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.TableCellNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.TableNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.AtomicTextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.AbstractTextContainer;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationFooter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationHeader;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.Table;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.TableCell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.TextPositionSequence;
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Getter;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
|
||||
@Service
|
||||
@UtilityClass
|
||||
public class DocumentGraphFactory {
|
||||
|
||||
public static final double TABLE_CELL_MERGE_CONTENTS_SIZE_THRESHOLD = 0.05;
|
||||
public Document buildDocumentGraph(ClassificationDocument document) {
|
||||
|
||||
Document documentGraph = new Document();
|
||||
Context context = new Context(documentGraph);
|
||||
|
||||
public DocumentGraph buildDocumentGraph(ClassificationDocument document) {
|
||||
|
||||
TextBlockFactory textBlockFactory = new TextBlockFactory();
|
||||
DocumentGraph documentGraph = new DocumentGraph();
|
||||
Context context = new Context(new TableOfContents(documentGraph), new HashMap<>(), new LinkedList<>(), new LinkedList<>(), textBlockFactory);
|
||||
|
||||
document.getPages().stream().map(this::buildPage).forEach(page -> context.pages().put(page, new AtomicInteger(1)));
|
||||
document.getSections().stream().flatMap(section -> section.getImages().stream()).forEach(image -> context.images().add(image));
|
||||
document.getPages().forEach(context::buildAndAddPageWithCounter);
|
||||
document.getSections().stream().flatMap(section -> section.getImages().stream()).forEach(image -> context.getImages().add(image));
|
||||
addSections(document, context);
|
||||
addHeaderAndFooterToEachPage(document, context);
|
||||
|
||||
documentGraph.setNumberOfPages(context.pages.size());
|
||||
documentGraph.setPages(context.pages.keySet());
|
||||
documentGraph.setTableOfContents(context.tableOfContents);
|
||||
documentGraph.setTextBlock(documentGraph.buildTextBlock());
|
||||
documentGraph.setDocumentTree(context.documentTree);
|
||||
documentGraph.setTextBlock(documentGraph.getTextBlock());
|
||||
return documentGraph;
|
||||
}
|
||||
|
||||
|
||||
private void addSections(ClassificationDocument document, Context context) {
|
||||
|
||||
document.getSections().forEach(section -> addSection(null, section.getPageBlocks(), section.getImages(), context));
|
||||
document.getSections().forEach(section -> SectionNodeFactory.addSection(null, section.getPageBlocks(), section.getImages(), context));
|
||||
}
|
||||
|
||||
|
||||
private void addSection(SemanticNode parentNode, List<AbstractTextContainer> pageBlocks, List<ClassifiedImage> images, Context context) {
|
||||
public void addParagraphOrHeadline(GenericSemanticNode parentNode, TextPageBlock originalTextBlock, Context context, List<TextPageBlock> textBlocksToMerge) {
|
||||
|
||||
Map<Integer, List<AbstractTextContainer>> blocksPerPage = pageBlocks.stream().collect(groupingBy(AbstractTextContainer::getPage));
|
||||
SectionNode sectionNode = SectionNode.builder().entities(new HashSet<>()).tableOfContents(context.tableOfContents()).build();
|
||||
Page page = context.getPage(originalTextBlock.getPage());
|
||||
|
||||
context.sections().add(sectionNode);
|
||||
blocksPerPage.keySet().forEach(pageNumber -> addSectionNodeToPageNode(context, sectionNode, pageNumber));
|
||||
|
||||
List<Integer> tocId;
|
||||
if (parentNode == null) {
|
||||
tocId = context.tableOfContents.createNewMainEntryAndReturnId(NodeType.SECTION, sectionNode);
|
||||
GenericSemanticNode node;
|
||||
if (originalTextBlock.isHeadline()) {
|
||||
node = Headline.builder().documentTree(context.getDocumentTree()).build();
|
||||
} else {
|
||||
tocId = context.tableOfContents.createNewChildEntryAndReturnId(parentNode.getTocId(), NodeType.SECTION, sectionNode);
|
||||
}
|
||||
sectionNode.setTocId(tocId);
|
||||
Set<AbstractTextContainer> alreadyMerged = new HashSet<>();
|
||||
for (AbstractTextContainer abstractTextContainer : pageBlocks) {
|
||||
|
||||
if (alreadyMerged.contains(abstractTextContainer)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (abstractTextContainer instanceof ClassificationTextBlock) {
|
||||
List<ClassificationTextBlock> textBlocks = findTextBlocksWithSameClassificationAndAlignsY(abstractTextContainer, pageBlocks);
|
||||
alreadyMerged.addAll(textBlocks);
|
||||
addParagraphOrHeadline(sectionNode, (ClassificationTextBlock) abstractTextContainer, context, textBlocks);
|
||||
}
|
||||
if (abstractTextContainer instanceof Table) {
|
||||
addTable(sectionNode, (Table) abstractTextContainer, context);
|
||||
}
|
||||
}
|
||||
for (ClassifiedImage image : images) {
|
||||
|
||||
addImage(sectionNode, image, context);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private static List<ClassificationTextBlock> findTextBlocksWithSameClassificationAndAlignsY(AbstractTextContainer atc, List<AbstractTextContainer> pageBlocks) {
|
||||
|
||||
return pageBlocks.stream()
|
||||
.filter(abstractTextContainer -> !abstractTextContainer.equals(atc))
|
||||
.filter(abstractTextContainer -> abstractTextContainer.getPage() == atc.getPage())
|
||||
.filter(abstractTextContainer -> abstractTextContainer instanceof ClassificationTextBlock)
|
||||
.filter(abstractTextContainer -> abstractTextContainer.intersectsY(atc))
|
||||
.map(abstractTextContainer -> (ClassificationTextBlock) abstractTextContainer)
|
||||
.toList();
|
||||
}
|
||||
|
||||
|
||||
private void addSectionNodeToPageNode(Context context, SectionNode sectionNode, Integer pageNumber) {
|
||||
|
||||
PageNode page = getPage(pageNumber, context);
|
||||
page.getMainBody().add(sectionNode);
|
||||
}
|
||||
|
||||
|
||||
private void addTable(SemanticNode parentNode, Table table, Context context) {
|
||||
|
||||
PageNode page = getPage(table.getPage(), context);
|
||||
TableNode tableNode = TableNode.builder().tableOfContents(context.tableOfContents()).numberOfCols(table.getColCount()).numberOfRows(table.getRowCount()).build();
|
||||
|
||||
if (!page.getMainBody().contains(parentNode)) {
|
||||
parentNode.getPages().add(page);
|
||||
}
|
||||
|
||||
page.getMainBody().add(tableNode);
|
||||
|
||||
List<Integer> tocId = context.tableOfContents().createNewChildEntryAndReturnId(parentNode.getTocId(), NodeType.TABLE, tableNode);
|
||||
tableNode.setTocId(tocId);
|
||||
|
||||
addTableCells(table.getRows(), tableNode, context, table.getPage());
|
||||
}
|
||||
|
||||
|
||||
private void addTableCells(List<List<TableCell>> rows, SemanticNode parentNode, Context context, int pageNumber) {
|
||||
|
||||
for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) {
|
||||
for (int colIndex = 0; colIndex < rows.get(rowIndex).size(); colIndex++) {
|
||||
addTableCell(rows.get(rowIndex).get(colIndex), rowIndex, colIndex, parentNode, pageNumber, context);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void addTableCell(TableCell cell, int rowIndex, int colIndex, SemanticNode parentNode, int pageNumber, Context context) {
|
||||
|
||||
PageNode page = getPage(pageNumber, context);
|
||||
cell.getTextBlocks().stream().filter(tb -> tb.getPage() == 0).forEach(tb -> tb.setPage(pageNumber));
|
||||
|
||||
TableCellNode tableCellNode = TableCellNode.builder()
|
||||
.tableOfContents(context.tableOfContents())
|
||||
.row(rowIndex)
|
||||
.col(colIndex)
|
||||
.header(cell.isHeaderCell())
|
||||
.bBox(cell.getBounds2D())
|
||||
.build();
|
||||
page.getMainBody().add(tableCellNode);
|
||||
|
||||
TextBlock textBlock;
|
||||
|
||||
List<Integer> tocId = context.tableOfContents().createNewChildEntryAndReturnId(parentNode.getTocId(), NodeType.TABLE_CELL, tableCellNode);
|
||||
tableCellNode.setTocId(tocId);
|
||||
|
||||
if (cell.getTextBlocks().isEmpty()) {
|
||||
tableCellNode.setTerminalTextBlock(context.textBlockFactory.emptyTextBlock(parentNode, context, page));
|
||||
tableCellNode.setTerminal(true);
|
||||
|
||||
} else if (cell.getTextBlocks().size() == 1) {
|
||||
textBlock = context.textBlockFactory().buildAtomicTextBlock(cell.getTextBlocks().get(0).getSequences(), tableCellNode, context, page);
|
||||
tableCellNode.setTerminalTextBlock(textBlock);
|
||||
tableCellNode.setTerminal(true);
|
||||
|
||||
} else if (firstTextBlockIsHeadline(cell)) {
|
||||
addSection(tableCellNode, cell.getTextBlocks().stream().map(tb -> (AbstractTextContainer) tb).toList(), Collections.emptyList(), context);
|
||||
tableCellNode.setTerminal(false);
|
||||
|
||||
} else if (cellAreaIsSmallerThanPageAreaTimesThreshold(cell, page)) {
|
||||
List<TextPositionSequence> sequences = TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(cell.getTextBlocks());
|
||||
textBlock = context.textBlockFactory().buildAtomicTextBlock(sequences, tableCellNode, context, page);
|
||||
tableCellNode.setTerminalTextBlock(textBlock);
|
||||
tableCellNode.setTerminal(true);
|
||||
|
||||
} else {
|
||||
cell.getTextBlocks().forEach(tb -> addParagraphOrHeadline(tableCellNode, tb, context));
|
||||
tableCellNode.setTerminal(false);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
private static boolean cellAreaIsSmallerThanPageAreaTimesThreshold(TableCell cell, PageNode page) {
|
||||
|
||||
return cell.getArea() < TABLE_CELL_MERGE_CONTENTS_SIZE_THRESHOLD * page.getHeight() * page.getWidth();
|
||||
}
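To make the threshold above concrete (illustrative numbers, assuming an A4-sized PageNode of roughly 595 x 842 pt): the page area is about 501,000 pt², so with TABLE_CELL_MERGE_CONTENTS_SIZE_THRESHOLD = 0.05 the cut-off is about 25,000 pt². A 200 x 100 pt cell (20,000 pt²) would therefore have its text blocks merged into a single atomic text block, while a 300 x 150 pt cell (45,000 pt²) would not.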
|
||||
|
||||
|
||||
private static boolean firstTextBlockIsHeadline(TableCell cell) {
|
||||
|
||||
String classification = cell.getTextBlocks().get(0).getClassification();
|
||||
return classification != null && classification.startsWith("H");
|
||||
}
|
||||
|
||||
|
||||
private void addParagraphOrHeadline(SemanticNode parentNode, ClassificationTextBlock originalTextBlock, Context context) {
|
||||
|
||||
addParagraphOrHeadline(parentNode, originalTextBlock, context, Collections.emptyList());
|
||||
}
|
||||
|
||||
|
||||
private void addParagraphOrHeadline(SemanticNode parentNode, ClassificationTextBlock originalTextBlock, Context context, List<ClassificationTextBlock> textBlocksToMerge) {
|
||||
|
||||
PageNode page = getPage(originalTextBlock.getPage(), context);
|
||||
|
||||
SemanticNode node;
|
||||
if (originalTextBlock.getClassification() != null && originalTextBlock.getClassification().startsWith("H")) {
|
||||
node = HeadlineNode.builder().tableOfContents(context.tableOfContents()).build();
|
||||
} else {
|
||||
node = ParagraphNode.builder().tableOfContents(context.tableOfContents()).build();
|
||||
node = Paragraph.builder().documentTree(context.getDocumentTree()).build();
|
||||
}
|
||||
|
||||
page.getMainBody().add(node);
|
||||
|
||||
List<ClassificationTextBlock> textBlocks = new LinkedList<>(textBlocksToMerge);
|
||||
List<TextPageBlock> textBlocks = new ArrayList<>(textBlocksToMerge);
|
||||
textBlocks.add(originalTextBlock);
|
||||
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(textBlocks), node, context, page);
|
||||
|
||||
if (node instanceof HeadlineNode headlineNode) {
|
||||
List<Integer> tocId = context.tableOfContents.createNewChildEntryAndReturnId(parentNode.getTocId(), NodeType.HEADLINE, node);
|
||||
headlineNode.setTerminalTextBlock(textBlock);
|
||||
headlineNode.setTocId(tocId);
|
||||
}
|
||||
if (node instanceof ParagraphNode paragraphNode) {
|
||||
List<Integer> tocId = context.tableOfContents.createNewChildEntryAndReturnId(parentNode.getTocId(), NodeType.PARAGRAPH, node);
|
||||
paragraphNode.setTerminalTextBlock(textBlock);
|
||||
paragraphNode.setTocId(tocId);
|
||||
}
|
||||
List<Integer> treeId = context.documentTree.createNewChildEntryAndReturnId(parentNode, node);
|
||||
node.setLeafTextBlock(textBlock);
|
||||
node.setTreeId(treeId);
|
||||
}
|
||||
|
||||
|
||||
private void addImage(SectionNode sectionNode, ClassifiedImage image, Context context) {
|
||||
public void addImage(Section section, ClassifiedImage image, Context context) {
|
||||
|
||||
PageNode page = getPage(image.getPage(), context);
|
||||
ImageNode imageNode = ImageNode.builder()
|
||||
Rectangle2D position = image.getPosition();
|
||||
Page page = context.getPage(image.getPage());
|
||||
Image imageNode = Image.builder()
|
||||
.id(IdBuilder.buildId(Set.of(page), List.of(position)))
|
||||
.imageType(image.getImageType())
|
||||
.position(image.getPosition())
|
||||
.transparency(image.isHasTransparency())
|
||||
.position(position)
|
||||
.transparent(image.isHasTransparency())
|
||||
.page(page)
|
||||
.tableOfContents(context.tableOfContents())
|
||||
.documentTree(context.getDocumentTree())
|
||||
.build();
|
||||
page.getMainBody().add(imageNode);
|
||||
|
||||
List<Integer> tocId = context.tableOfContents().createNewChildEntryAndReturnId(sectionNode.getTocId(), NodeType.IMAGE, imageNode);
|
||||
imageNode.setTocId(tocId);
|
||||
List<Integer> tocId = context.getDocumentTree().createNewChildEntryAndReturnId(section, imageNode);
|
||||
imageNode.setTreeId(tocId);
|
||||
}
|
||||
|
||||
|
||||
private void addHeaderAndFooterToEachPage(ClassificationDocument document, Context context) {
|
||||
|
||||
Map<Integer, List<ClassificationTextBlock>> headers = document.getHeaders()
|
||||
Map<Integer, List<TextPageBlock>> headers = document.getHeaders()
|
||||
.stream()
|
||||
.map(ClassificationHeader::getTextBlocks)
|
||||
.flatMap(List::stream)
|
||||
.collect(groupingBy(AbstractTextContainer::getPage, toList()));
|
||||
.collect(groupingBy(AbstractPageBlock::getPage, toList()));
|
||||
|
||||
Map<Integer, List<ClassificationTextBlock>> footers = document.getFooters()
|
||||
Map<Integer, List<TextPageBlock>> footers = document.getFooters()
|
||||
.stream()
|
||||
.map(ClassificationFooter::getTextBlocks)
|
||||
.flatMap(List::stream)
|
||||
.collect(groupingBy(AbstractTextContainer::getPage, toList()));
|
||||
.collect(groupingBy(AbstractPageBlock::getPage, toList()));
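The groupingBy collector used for the header and footer maps above can be illustrated with a self-contained sketch (hypothetical Block record, not part of the commit; the static imports of groupingBy and toList at the top of this file are assumed):

record Block(int page, String text) {}
List<Block> blocks = List.of(new Block(1, "a"), new Block(1, "b"), new Block(2, "c"));
Map<Integer, List<Block>> byPage = blocks.stream().collect(groupingBy(Block::page, toList()));
// page 1 maps to [a, b], page 2 maps to [c]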
|
||||
|
||||
for (int pageIndex = 1; pageIndex <= document.getPages().size(); pageIndex++) {
|
||||
if (headers.containsKey(pageIndex)) {
|
||||
@ -303,85 +141,105 @@ public class DocumentGraphFactory {
|
||||
}
|
||||
|
||||
|
||||
private void addFooter(List<ClassificationTextBlock> textBlocks, Context context) {
|
||||
private void addFooter(List<TextPageBlock> textBlocks, Context context) {
|
||||
|
||||
PageNode page = getPage(textBlocks.get(0).getPage(), context);
|
||||
FooterNode footer = FooterNode.builder().tableOfContents(context.tableOfContents()).build();
|
||||
Page page = context.getPage(textBlocks.get(0).getPage());
|
||||
Footer footer = Footer.builder().documentTree(context.getDocumentTree()).build();
|
||||
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(textBlocks),
|
||||
footer,
|
||||
context,
|
||||
page);
|
||||
List<Integer> tocId = context.tableOfContents().createNewMainEntryAndReturnId(NodeType.FOOTER, footer);
|
||||
footer.setTocId(tocId);
|
||||
footer.setTerminalTextBlock(textBlock);
|
||||
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(footer);
|
||||
footer.setTreeId(tocId);
|
||||
footer.setLeafTextBlock(textBlock);
|
||||
page.setFooter(footer);
|
||||
}
|
||||
|
||||
|
||||
public void addHeader(List<ClassificationTextBlock> textBlocks, Context context) {
|
||||
public void addHeader(List<TextPageBlock> textBlocks, Context context) {
|
||||
|
||||
PageNode page = getPage(textBlocks.get(0).getPage(), context);
|
||||
HeaderNode header = HeaderNode.builder().tableOfContents(context.tableOfContents()).build();
|
||||
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(textBlocks),
|
||||
header,
|
||||
context,
|
||||
0,
|
||||
page);
|
||||
List<Integer> tocId = context.tableOfContents().createNewMainEntryAndReturnId(NodeType.HEADER, header);
|
||||
header.setTocId(tocId);
|
||||
header.setTerminalTextBlock(textBlock);
|
||||
Page page = context.getPage(textBlocks.get(0).getPage());
|
||||
Header header = Header.builder().documentTree(context.getDocumentTree()).build();
|
||||
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(textBlocks), header, 0, page);
|
||||
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(header);
|
||||
header.setTreeId(tocId);
|
||||
header.setLeafTextBlock(textBlock);
|
||||
page.setHeader(header);
|
||||
}
|
||||
|
||||
|
||||
private void addEmptyFooter(int pageIndex, Context context) {
|
||||
|
||||
PageNode page = getPage(pageIndex, context);
|
||||
FooterNode footer = FooterNode.builder().tableOfContents(context.tableOfContents()).build();
|
||||
Page page = context.getPage(pageIndex);
|
||||
Footer footer = Footer.builder().documentTree(context.getDocumentTree()).build();
|
||||
AtomicTextBlock textBlock = context.textBlockFactory.emptyTextBlock(footer, context, page);
|
||||
List<Integer> tocId = context.tableOfContents().createNewMainEntryAndReturnId(NodeType.FOOTER, footer);
|
||||
footer.setTocId(tocId);
|
||||
footer.setTerminalTextBlock(textBlock);
|
||||
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(footer);
|
||||
footer.setTreeId(tocId);
|
||||
footer.setLeafTextBlock(textBlock);
|
||||
page.setFooter(footer);
|
||||
}
|
||||
|
||||
|
||||
private void addEmptyHeader(int pageIndex, Context context) {
|
||||
|
||||
PageNode page = getPage(pageIndex, context);
|
||||
HeaderNode header = HeaderNode.builder().tableOfContents(context.tableOfContents()).build();
|
||||
Page page = context.getPage(pageIndex);
|
||||
Header header = Header.builder().documentTree(context.getDocumentTree()).build();
|
||||
AtomicTextBlock textBlock = context.textBlockFactory.emptyTextBlock(header, 0, page);
|
||||
List<Integer> tocId = context.tableOfContents().createNewMainEntryAndReturnId(NodeType.HEADER, header);
|
||||
header.setTocId(tocId);
|
||||
header.setTerminalTextBlock(textBlock);
|
||||
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(header);
|
||||
header.setTreeId(tocId);
|
||||
header.setLeafTextBlock(textBlock);
|
||||
page.setHeader(header);
|
||||
}
|
||||
|
||||
|
||||
private PageNode buildPage(ClassificationPage p) {
|
||||
@Getter
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public final class Context {
|
||||
|
||||
return PageNode.builder()
|
||||
.height((int) p.getPageHeight())
|
||||
.width((int) p.getPageWidth())
|
||||
.number(p.getPageNumber())
|
||||
.rotation(p.getRotation())
|
||||
.mainBody(new LinkedList<>())
|
||||
.build();
|
||||
}
|
||||
DocumentTree documentTree;
|
||||
Map<Page, Integer> pages;
|
||||
List<Section> sections;
|
||||
List<ClassifiedImage> images;
|
||||
TextBlockFactory textBlockFactory;
|
||||
|
||||
|
||||
private PageNode getPage(int pageIndex, Context context) {
|
||||
public Context(Document document) {
|
||||
|
||||
return context.pages.keySet()
|
||||
.stream()
|
||||
.filter(page -> page.getNumber() == pageIndex)
|
||||
.findFirst()
|
||||
.orElseThrow(() -> new NoSuchElementException(format("ClassificationPage with number %d not found", pageIndex)));
|
||||
}
|
||||
documentTree = new DocumentTree(document);
|
||||
pages = new HashMap<>();
|
||||
sections = new LinkedList<>();
|
||||
images = new LinkedList<>();
|
||||
textBlockFactory = new TextBlockFactory();
|
||||
}
|
||||
|
||||
|
||||
record Context(
|
||||
TableOfContents tableOfContents, Map<PageNode, AtomicInteger> pages, List<SectionNode> sections, List<ClassifiedImage> images, TextBlockFactory textBlockFactory) {
|
||||
public void buildAndAddPageWithCounter(ClassificationPage classificationPage) {
|
||||
|
||||
Page page = Page.fromClassificationPage(classificationPage);
|
||||
// This counter counts the TextBlocks per page.
// The initial value is set to 1, because 0 is reserved for the Header.
|
||||
pages.put(page, 1);
|
||||
}
|
||||
|
||||
|
||||
public int getAndIncrementTextBlockNumberOnPage(Page page) {
|
||||
|
||||
Integer textBlockNumberOnPage = pages.get(page);
|
||||
pages.merge(page, 1, Integer::sum);
|
||||
return textBlockNumberOnPage;
|
||||
}
|
||||
|
||||
|
||||
public Page getPage(int pageIndex) {
|
||||
|
||||
return pages.keySet()
|
||||
.stream()
|
||||
.filter(page -> page.getNumber() == pageIndex)
|
||||
.findFirst()
|
||||
.orElseThrow(() -> new NoSuchElementException(format("ClassificationPage with number %d not found", pageIndex)));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
@ -1,105 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.factory;
|
||||
|
||||
import static java.lang.String.format;
|
||||
|
||||
import java.awt.geom.Area;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.function.BiConsumer;
|
||||
import java.util.function.BinaryOperator;
|
||||
import java.util.function.Function;
|
||||
import java.util.function.Supplier;
|
||||
import java.util.stream.Collector;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.AbstractTextContainer;
|
||||
|
||||
public class RectangleTransformations {
|
||||
|
||||
public static Rectangle2D pad(Rectangle2D rectangle2D, int deltaX, int deltaY) {
|
||||
|
||||
return new Rectangle2D.Double(rectangle2D.getMinX() - deltaX, rectangle2D.getMinY() - deltaY, rectangle2D.getWidth() + 2 * deltaX, rectangle2D.getHeight() + 2 * deltaY);
|
||||
}
|
||||
|
||||
|
||||
public static Rectangle2D bBoxUnionAbstractTextContainer(List<AbstractTextContainer> abstractTextContainers) {
|
||||
|
||||
return abstractTextContainers.stream().map(RectangleTransformations::toRectangle2D).collect(new Rectangle2DUnion());
|
||||
}
|
||||
|
||||
public static Rectangle2D rectangleUnion(List<Rectangle2D> rectangle2DList) {
|
||||
|
||||
return rectangle2DList.stream().collect(new Rectangle2DUnion());
|
||||
}
|
||||
|
||||
|
||||
public static Rectangle2D toRectangle2D(AbstractTextContainer abstractTextContainer) {
|
||||
|
||||
return new Rectangle2D.Float(abstractTextContainer.getMinX(), abstractTextContainer.getMinY(), abstractTextContainer.getWidth(), abstractTextContainer.getHeight());
|
||||
}
|
||||
|
||||
|
||||
public static Rectangle2D toRectangle2D(PDRectangle rectangle) {
|
||||
|
||||
return new Rectangle2D.Double(rectangle.getLowerLeftX(), rectangle.getLowerLeftY(), rectangle.getWidth(), rectangle.getHeight());
|
||||
}
|
||||
|
||||
|
||||
public static String toString(Rectangle2D rectangle2D) {
|
||||
|
||||
return format("%f,%f,%f,%f", rectangle2D.getX(), rectangle2D.getY(), rectangle2D.getWidth(), rectangle2D.getHeight());
|
||||
}
|
||||
|
||||
|
||||
public static Rectangle2D parseRectangle2D(String bBox) {
|
||||
|
||||
List<Float> floats = Arrays.stream(bBox.split(",")).map(Float::parseFloat).toList();
|
||||
return new Rectangle2D.Float(floats.get(0), floats.get(1), floats.get(2), floats.get(3));
|
||||
}
|
||||
|
||||
|
||||
private static class Rectangle2DUnion implements Collector<Rectangle2D, Area, Rectangle2D> {
|
||||
|
||||
@Override
|
||||
public Supplier<Area> supplier() {
|
||||
|
||||
return Area::new;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public BiConsumer<Area, Rectangle2D> accumulator() {
|
||||
|
||||
return (area, rectangle2D) -> area.add(new Area(rectangle2D));
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public BinaryOperator<Area> combiner() {
|
||||
|
||||
return (area1, area2) -> {
|
||||
area1.add(area2);
|
||||
return area1;
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Function<Area, Rectangle2D> finisher() {
|
||||
|
||||
return Area::getBounds2D;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Set<Characteristics> characteristics() {
|
||||
|
||||
return Set.of(Characteristics.CONCURRENT, Characteristics.UNORDERED);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,6 +1,7 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.factory;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
@ -11,10 +12,22 @@ import lombok.experimental.FieldDefaults;
|
||||
@Builder
|
||||
@Getter
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class SearchTextWithTextPositionModel {
|
||||
public class SearchTextWithTextPositionDto {
|
||||
|
||||
String searchText;
|
||||
List<Integer> lineBreaks;
|
||||
List<Integer> stringCoordsToPositionCoords;
|
||||
List<Rectangle2D> positions;
|
||||
|
||||
|
||||
public static SearchTextWithTextPositionDto empty() {
|
||||
|
||||
return SearchTextWithTextPositionDto.builder()
|
||||
.searchText("")
|
||||
.lineBreaks(Collections.emptyList())
|
||||
.positions(Collections.emptyList())
|
||||
.stringCoordsToPositionCoords(Collections.emptyList())
|
||||
.build();
|
||||
}
|
||||
|
||||
}
|
||||
@ -2,38 +2,35 @@ package com.knecon.fforesight.service.layoutparser.processor.factory;
|
||||
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Collections;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.RedTextPosition;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.TextDirection;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.RedTextPosition;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextDirection;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPositionSequence;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class SearchTextWithTextPositionFactory {
|
||||
|
||||
public static final int HEIGHT_PADDING = 2;
|
||||
public final int HEIGHT_PADDING = 2;
|
||||
// When checking for a hyphen linebreak, we need to check, after a linebreak, whether the last hyphen was fewer than three symbols away.
// We detect a linebreak as either a "\n" character or two adjacent symbols whose positions differ in y-coordinates by at least one character height.
// If there is a hyphen linebreak, the hyphen is 1 position in front of a "\n", or 2 positions in front of the character with the lower y-coordinate.
// This is why the last-hyphen index must be initialized to less than -2; otherwise, if the very first symbol is a "\n", we would detect a hyphen linebreak that isn't there.
// Integer.MIN_VALUE is a bad choice due to potential overflow during arithmetic operations, so the default is -3.
// (A simplified, self-contained sketch of this hyphen handling follows after this class.)
|
||||
public final int MAX_HYPHEN_LINEBREAK_DISTANCE = 3;
|
||||
|
||||
|
||||
public static SearchTextWithTextPositionModel buildSearchTextToTextPositionModel(List<TextPositionSequence> sequences) {
|
||||
public SearchTextWithTextPositionDto buildSearchTextToTextPositionModel(List<TextPositionSequence> sequences) {
|
||||
|
||||
if (sequences.isEmpty() || sequences.stream().allMatch(sequence -> sequence.getTextPositions().isEmpty())) {
|
||||
return SearchTextWithTextPositionModel.builder()
|
||||
.searchText("")
|
||||
.lineBreaks(Collections.emptyList())
|
||||
.positions(Collections.emptyList())
|
||||
.stringCoordsToPositionCoords(Collections.emptyList())
|
||||
.build();
|
||||
return SearchTextWithTextPositionDto.empty();
|
||||
}
|
||||
|
||||
List<Integer> stringIdxToPositionIdx = new LinkedList<>();
|
||||
List<Integer> lineBreaksStringIdx = new LinkedList<>();
|
||||
StringBuilder sb = new StringBuilder();
|
||||
|
||||
int stringIdx = 0;
|
||||
int positionIdx = 0;
|
||||
int lastHyphenIdx = -3;
|
||||
Context context = new Context();
|
||||
|
||||
RedTextPosition currentTextPosition = sequences.get(0).getTextPositions().get(0);
|
||||
RedTextPosition previousTextPosition = RedTextPosition.builder().unicode(" ").position(currentTextPosition.getPosition()).build();
|
||||
@ -42,60 +39,78 @@ public class SearchTextWithTextPositionFactory {
|
||||
for (int i = 0; i < word.getTextPositions().size(); ++i) {
|
||||
|
||||
currentTextPosition = word.getTextPositions().get(i);
|
||||
|
||||
if (isLineBreak(currentTextPosition, previousTextPosition)) {
|
||||
|
||||
if (stringIdx - lastHyphenIdx < 3) {
|
||||
sb.delete(lastHyphenIdx, sb.length());
|
||||
stringIdxToPositionIdx = stringIdxToPositionIdx.subList(0, lastHyphenIdx);
|
||||
stringIdx = lastHyphenIdx;
|
||||
lastHyphenIdx = -3;
|
||||
}
|
||||
lineBreaksStringIdx.add(stringIdx);
|
||||
removeHyphenLinebreaks(context);
|
||||
context.lineBreaksStringIdx.add(context.stringIdx);
|
||||
}
|
||||
if (!isRepeatedWhitespace(currentTextPosition.getUnicode(), previousTextPosition.getUnicode())) {
|
||||
|
||||
if (isHyphen(currentTextPosition.getUnicode())) {
|
||||
lastHyphenIdx = stringIdx;
|
||||
context.lastHyphenIdx = context.stringIdx;
|
||||
}
|
||||
sb.append(currentTextPosition.getUnicode());
|
||||
stringIdxToPositionIdx.add(positionIdx);
|
||||
++stringIdx;
|
||||
appendCurrentTextPosition(context, currentTextPosition);
|
||||
}
|
||||
|
||||
previousTextPosition = currentTextPosition;
|
||||
|
||||
++positionIdx;
|
||||
++context.positionIdx;
|
||||
}
|
||||
|
||||
previousTextPosition = RedTextPosition.builder().unicode(" ").position(previousTextPosition.getPosition()).build();
|
||||
sb.append(previousTextPosition.getUnicode());
|
||||
stringIdxToPositionIdx.add(positionIdx);
|
||||
++stringIdx;
|
||||
context.stringBuilder.append(" ");
|
||||
context.stringIdxToPositionIdx.add(context.positionIdx);
|
||||
++context.stringIdx;
|
||||
}
|
||||
|
||||
assert sb.length() == stringIdxToPositionIdx.size();
|
||||
assert context.stringBuilder.length() == context.stringIdxToPositionIdx.size();
|
||||
|
||||
List<Rectangle2D> positions = sequences.stream()
|
||||
.flatMap(sequence -> sequence.getTextPositions().stream().map(textPosition -> mapRedTextPositionToInitialUserSpace(textPosition, sequence)))
|
||||
.toList();
|
||||
|
||||
return SearchTextWithTextPositionModel.builder()
|
||||
.searchText(sb.toString())
|
||||
.lineBreaks(lineBreaksStringIdx)
|
||||
.stringCoordsToPositionCoords(stringIdxToPositionIdx)
|
||||
return SearchTextWithTextPositionDto.builder()
|
||||
.searchText(context.stringBuilder.toString())
|
||||
.lineBreaks(context.lineBreaksStringIdx)
|
||||
.stringCoordsToPositionCoords(context.stringIdxToPositionIdx)
|
||||
.positions(positions)
|
||||
.build();
|
||||
}
|
||||
|
||||
|
||||
private static boolean isLineBreak(RedTextPosition currentTextPosition, RedTextPosition previousTextPosition) {
|
||||
private void appendCurrentTextPosition(Context context, RedTextPosition currentTextPosition) {
|
||||
|
||||
context.stringBuilder.append(currentTextPosition.getUnicode());
|
||||
|
||||
// unicode characters with more than 16-bit encoding have a length > 1 in java strings
|
||||
for (int j = 0; j < currentTextPosition.getUnicode().length(); j++) {
|
||||
context.stringIdxToPositionIdx.add(context.positionIdx);
|
||||
}
|
||||
context.stringIdx += currentTextPosition.getUnicode().length();
|
||||
}
|
||||
|
||||
|
||||
private void removeHyphenLinebreaks(Context context) {
|
||||
|
||||
if (lastHyphenDirectlyBeforeLineBreak(context)) {
|
||||
context.stringBuilder.delete(context.lastHyphenIdx, context.stringBuilder.length());
|
||||
context.stringIdxToPositionIdx = context.stringIdxToPositionIdx.subList(0, context.lastHyphenIdx);
|
||||
context.stringIdx = context.lastHyphenIdx;
|
||||
context.lastHyphenIdx = -MAX_HYPHEN_LINEBREAK_DISTANCE;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private boolean lastHyphenDirectlyBeforeLineBreak(Context context) {
|
||||
|
||||
return context.stringIdx - context.lastHyphenIdx < MAX_HYPHEN_LINEBREAK_DISTANCE;
|
||||
}
|
||||
|
||||
|
||||
private boolean isLineBreak(RedTextPosition currentTextPosition, RedTextPosition previousTextPosition) {
|
||||
|
||||
return Objects.equals(currentTextPosition.getUnicode(), "\n") || isDeltaYLargerThanTextHeight(currentTextPosition, previousTextPosition);
|
||||
}
|
||||
|
||||
|
||||
private static boolean isDeltaYLargerThanTextHeight(RedTextPosition currentPosition, RedTextPosition previousPosition) {
|
||||
private boolean isDeltaYLargerThanTextHeight(RedTextPosition currentPosition, RedTextPosition previousPosition) {
|
||||
|
||||
if (previousPosition == null) {
|
||||
return false;
|
||||
@ -106,13 +121,13 @@ public class SearchTextWithTextPositionFactory {
|
||||
}
|
||||
|
||||
|
||||
private static boolean isRepeatedWhitespace(String currentUnicode, String previousUnicode) {
|
||||
private boolean isRepeatedWhitespace(String currentUnicode, String previousUnicode) {
|
||||
|
||||
return Objects.equals(previousUnicode, " ") && Objects.equals(currentUnicode, " ");
|
||||
}
|
||||
|
||||
|
||||
private static boolean isHyphen(String unicodeCharacter) {
|
||||
private boolean isHyphen(String unicodeCharacter) {
|
||||
|
||||
return Objects.equals(unicodeCharacter, "-") || //
|
||||
Objects.equals(unicodeCharacter, "~") || //
|
||||
@ -128,7 +143,7 @@ public class SearchTextWithTextPositionFactory {
|
||||
}
|
||||
|
||||
|
||||
private static Rectangle2D mapRedTextPositionToInitialUserSpace(RedTextPosition textPosition, TextPositionSequence sequence) {
|
||||
private Rectangle2D mapRedTextPositionToInitialUserSpace(RedTextPosition textPosition, TextPositionSequence sequence) {
|
||||
|
||||
float textHeight = sequence.getTextHeight() + HEIGHT_PADDING;
|
||||
Rectangle2D rectangle2D = new Rectangle2D.Double(textPosition.getXDirAdj(),
|
||||
@ -153,4 +168,18 @@ public class SearchTextWithTextPositionFactory {
|
||||
return transform.createTransformedShape(rectangle2D).getBounds2D();
|
||||
}
|
||||
|
||||
|
||||
private class Context {
|
||||
|
||||
List<Integer> stringIdxToPositionIdx = new LinkedList<>();
|
||||
List<Integer> lineBreaksStringIdx = new LinkedList<>();
|
||||
StringBuilder stringBuilder = new StringBuilder();
|
||||
|
||||
int stringIdx;
|
||||
int positionIdx;
|
||||
|
||||
int lastHyphenIdx = -MAX_HYPHEN_LINEBREAK_DISTANCE;
|
||||
|
||||
}
|
||||
|
||||
}
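A minimal, self-contained sketch of the hyphen-linebreak collapsing described in the comments above (illustrative only; the production factory additionally maintains the string-index-to-position mapping and works on RedTextPositions rather than a plain string):

static String collapseHyphenLinebreaks(String text) {
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < text.length(); i++) {
        char c = text.charAt(i);
        // a hyphen directly before a line break is a hyphenation artifact: drop both and re-join the word
        if (c == '\n' && sb.length() > 0 && sb.charAt(sb.length() - 1) == '-') {
            sb.deleteCharAt(sb.length() - 1);
            continue;
        }
        sb.append(c);
    }
    return sb.toString(); // e.g. "exam-\nple text" becomes "example text"
}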
|
||||
|
||||
@ -0,0 +1,183 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.factory;
|
||||
|
||||
import static java.lang.String.format;
|
||||
import static java.util.Collections.emptyList;
|
||||
import static java.util.stream.Collectors.groupingBy;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.GenericSemanticNode;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Page;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Section;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.TableMergingUtility;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class SectionNodeFactory {
|
||||
|
||||
public void addSection(GenericSemanticNode parentNode, List<AbstractPageBlock> pageBlocks, List<ClassifiedImage> images, DocumentGraphFactory.Context context) {
|
||||
|
||||
if (pageBlocks.isEmpty()) {
|
||||
return;
|
||||
}
|
||||
Map<Integer, List<AbstractPageBlock>> blocksPerPage = pageBlocks.stream().collect(groupingBy(AbstractPageBlock::getPage));
|
||||
Section section = Section.builder().documentTree(context.getDocumentTree()).build();
|
||||
|
||||
context.getSections().add(section);
|
||||
blocksPerPage.keySet().forEach(pageNumber -> addSectionNodeToPageNode(context, section, pageNumber));
|
||||
|
||||
section.setTreeId(getTreeId(parentNode, context, section));
|
||||
|
||||
addFirstHeadlineDirectlyToSection(pageBlocks, context, section);
|
||||
if (containsTablesAndTextBlocks(pageBlocks)) {
|
||||
splitPageBlocksIntoSubSections(pageBlocks).forEach(subSectionPageBlocks -> addSection(section, subSectionPageBlocks, emptyList(), context));
|
||||
} else {
|
||||
addTablesAndParagraphsAndHeadlinesToSection(pageBlocks, context, section);
|
||||
}
|
||||
|
||||
images.stream().distinct().forEach(image -> DocumentGraphFactory.addImage(section, image, context));
|
||||
}
|
||||
|
||||
|
||||
private List<Integer> getTreeId(GenericSemanticNode parentNode, DocumentGraphFactory.Context context, Section section) {
|
||||
|
||||
if (parentNode == null) {
|
||||
return context.getDocumentTree().createNewMainEntryAndReturnId(section);
|
||||
} else {
|
||||
return context.getDocumentTree().createNewChildEntryAndReturnId(parentNode, section);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void addFirstHeadlineDirectlyToSection(List<AbstractPageBlock> pageBlocks, DocumentGraphFactory.Context context, Section section) {
|
||||
|
||||
if (pageBlocks.get(0).isHeadline()) {
|
||||
addTablesAndParagraphsAndHeadlinesToSection(List.of(pageBlocks.get(0)), context, section);
|
||||
pageBlocks.remove(0);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void addTablesAndParagraphsAndHeadlinesToSection(List<AbstractPageBlock> pageBlocks, DocumentGraphFactory.Context context, Section section) {
|
||||
|
||||
Set<AbstractPageBlock> alreadyMerged = new HashSet<>();
|
||||
List<AbstractPageBlock> remainingBlocks = new LinkedList<>(pageBlocks);
|
||||
for (AbstractPageBlock abstractPageBlock : pageBlocks) {
|
||||
|
||||
if (alreadyMerged.contains(abstractPageBlock)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
remainingBlocks.removeAll(alreadyMerged);
|
||||
|
||||
if (abstractPageBlock instanceof TextPageBlock) {
|
||||
List<TextPageBlock> textBlocks = findTextBlocksWithSameClassificationAndAlignsY(abstractPageBlock, remainingBlocks);
|
||||
alreadyMerged.addAll(textBlocks);
|
||||
DocumentGraphFactory.addParagraphOrHeadline(section, (TextPageBlock) abstractPageBlock, context, textBlocks);
|
||||
} else if (abstractPageBlock instanceof TablePageBlock tablePageBlock) {
|
||||
List<TablePageBlock> tablesToMerge = TableMergingUtility.findConsecutiveTablesWithSameColCountAndSameHeaders(tablePageBlock, remainingBlocks);
|
||||
alreadyMerged.addAll(tablesToMerge);
|
||||
TableNodeFactory.addTable(section, tablesToMerge, context);
|
||||
} else {
|
||||
throw new RuntimeException(format("Unhandled AbstractPageBlockType %s!", abstractPageBlock.getClass()));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private boolean containsTablesAndTextBlocks(List<AbstractPageBlock> pageBlocks) {
|
||||
|
||||
return pageBlocks.stream().anyMatch(pageBlock -> pageBlock instanceof TablePageBlock) && pageBlocks.stream().anyMatch(pageBlock -> pageBlock instanceof TextPageBlock);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* This function splits the list of PageBlocks around TablePageBlocks, so that SubSections that don't include tables can be created.
* This is needed so we can execute rules on sections that do not contain tables.
* See: <a href="https://knecon.atlassian.net/wiki/spaces/RED/pages/14765218/Document+Structure">document structure wiki</a>
*
* @param pageBlocks a List of AbstractPageBlocks that contains at least one TablePageBlock and one ClassificationTextBlock
* @return a List of Lists of AbstractPageBlocks, each of which contains either a single Headline ClassificationTextBlock plus a TablePageBlock, or only ClassificationTextBlocks
*/
|
||||
private List<List<AbstractPageBlock>> splitPageBlocksIntoSubSections(List<AbstractPageBlock> pageBlocks) {
|
||||
|
||||
List<List<AbstractPageBlock>> splitList = splitIntoCoherentList(pageBlocks);
|
||||
movePrecedingHeadlineToTableList(splitList);
|
||||
return splitList.stream().filter(list -> !list.isEmpty()).toList();
|
||||
}
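// Worked example (illustrative): for page blocks [Paragraph, Headline, Table, Paragraph],
// splitIntoCoherentList first yields [[Paragraph, Headline], [Table], [Paragraph]];
// movePrecedingHeadlineToTableList then moves the trailing Headline in front of the Table,
// so the resulting sub-sections are [[Paragraph], [Headline, Table], [Paragraph]].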
|
||||
|
||||
|
||||
private void movePrecedingHeadlineToTableList(List<List<AbstractPageBlock>> splitList) {
|
||||
|
||||
for (int i = 0; i < splitList.size(); i++) {
|
||||
if (listIsTablesOnly(splitList.get(i)) && i > 0) {
|
||||
List<AbstractPageBlock> previousList = splitList.get(i - 1);
|
||||
AbstractPageBlock lastPageBlockInPreviousList = previousList.get(previousList.size() - 1);
|
||||
if (lastPageBlockInPreviousList.isHeadline()) {
|
||||
previousList.remove(previousList.size() - 1);
|
||||
splitList.get(i).add(0, lastPageBlockInPreviousList);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private boolean listIsTablesOnly(List<AbstractPageBlock> abstractPageBlocks) {
|
||||
|
||||
return abstractPageBlocks.stream().allMatch(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @param pageBlocks a List of AbstractPageBlocks that contains at least one TablePageBlock and one ClassificationTextBlock
* @return a List of Lists of AbstractPageBlocks, where each inner list contains exclusively ClassificationTextBlocks or exclusively TablePageBlocks
|
||||
*/
|
||||
private List<List<AbstractPageBlock>> splitIntoCoherentList(List<AbstractPageBlock> pageBlocks) {
|
||||
|
||||
List<List<AbstractPageBlock>> splitList = new LinkedList<>();
|
||||
List<AbstractPageBlock> currentList = new LinkedList<>();
|
||||
splitList.add(currentList);
|
||||
|
||||
Class<? extends AbstractPageBlock> lastPageBlockClass = pageBlocks.get(0).getClass();
|
||||
for (AbstractPageBlock pageBlock : pageBlocks) {
|
||||
if (lastPageBlockClass.isInstance(pageBlock)) {
|
||||
currentList.add(pageBlock);
|
||||
} else {
|
||||
currentList = new LinkedList<>();
|
||||
currentList.add(pageBlock);
|
||||
splitList.add(currentList);
|
||||
lastPageBlockClass = pageBlock.getClass();
|
||||
}
|
||||
}
|
||||
return splitList;
|
||||
}
|
||||
|
||||
|
||||
private List<TextPageBlock> findTextBlocksWithSameClassificationAndAlignsY(AbstractPageBlock atc, List<AbstractPageBlock> pageBlocks) {
|
||||
|
||||
return pageBlocks.stream()
|
||||
.filter(abstractTextContainer -> !abstractTextContainer.equals(atc))
|
||||
.filter(abstractTextContainer -> abstractTextContainer.getPage() == atc.getPage())
|
||||
.filter(abstractTextContainer -> abstractTextContainer instanceof TextPageBlock)
|
||||
.filter(abstractTextContainer -> abstractTextContainer.intersectsY(atc))
|
||||
.map(abstractTextContainer -> (TextPageBlock) abstractTextContainer)
|
||||
.toList();
|
||||
}
|
||||
|
||||
|
||||
private void addSectionNodeToPageNode(DocumentGraphFactory.Context context, Section section, Integer pageNumber) {
|
||||
|
||||
Page page = context.getPage(pageNumber);
|
||||
page.getMainBody().add(section);
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,136 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.factory;
|
||||
|
||||
import static java.util.Collections.emptyList;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.GenericSemanticNode;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Page;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.SemanticNode;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Table;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.TableCell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionOperations;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class TableNodeFactory {
|
||||
|
||||
public final double TABLE_CELL_MERGE_CONTENTS_SIZE_THRESHOLD = 0.05;
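// Illustrative: if pages are measured in PDF points, an A4 page (about 595 x 842 pt) yields a merge
// threshold of roughly 0.05 * 595 * 842 ≈ 25,000 pt²; cells smaller than that get their contents merged
// into a single atomic text block.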
|
||||
|
||||
|
||||
public void addTable(GenericSemanticNode parentNode, List<TablePageBlock> tablesToMerge, DocumentGraphFactory.Context context) {
|
||||
|
||||
setPageNumberInCells(tablesToMerge);
|
||||
Set<Page> pages = tablesToMerge.stream().map(AbstractPageBlock::getPage).map(context::getPage).collect(Collectors.toSet());
|
||||
List<List<Cell>> mergedRows = tablesToMerge.stream().map(TablePageBlock::getRows).flatMap(Collection::stream).toList();
|
||||
Table table = Table.builder().documentTree(context.getDocumentTree()).numberOfCols(mergedRows.get(0).size()).numberOfRows(mergedRows.size()).build();
|
||||
|
||||
pages.forEach(page -> addTableToPage(page, parentNode, table));
|
||||
|
||||
List<Integer> treeId = context.getDocumentTree().createNewChildEntryAndReturnId(parentNode, table);
|
||||
table.setTreeId(treeId);
|
||||
addTableCells(mergedRows, table, context);
|
||||
|
||||
ifTableHasNoHeadersSetFirstRowAsHeaders(table);
|
||||
}
|
||||
|
||||
|
||||
private void setPageNumberInCells(List<TablePageBlock> tablesToMerge) {
|
||||
|
||||
// For reasons not yet understood, the ClassificationTextBlocks in some table cells have 0 as their page number.
// We fix this here, but it should really be fixed upstream.
|
||||
tablesToMerge.forEach(table -> table.getRows()
|
||||
.stream()
|
||||
.flatMap(Collection::stream)
|
||||
.peek(cell -> cell.setPageNumber(table.getPage()))
|
||||
.forEach(cell -> setPageNumberInTextBlocksWithPageNumberSetTo0(table, cell)));
|
||||
}
|
||||
|
||||
|
||||
private void setPageNumberInTextBlocksWithPageNumberSetTo0(TablePageBlock table, Cell cell) {
|
||||
|
||||
cell.getTextBlocks().stream()//
|
||||
.filter(tb -> tb.getPage() == 0)//
|
||||
.forEach(tb -> tb.setPage(table.getPage()));
|
||||
}
|
||||
|
||||
|
||||
@SuppressWarnings("PMD.UnusedPrivateMethod") // PMD actually flags this wrong
|
||||
private void addTableToPage(Page page, SemanticNode parentNode, Table table) {
|
||||
|
||||
if (!page.getMainBody().contains(parentNode)) {
|
||||
parentNode.getPages().add(page);
|
||||
}
|
||||
|
||||
page.getMainBody().add(table);
|
||||
}
|
||||
|
||||
|
||||
private void ifTableHasNoHeadersSetFirstRowAsHeaders(Table table) {
|
||||
|
||||
if (table.streamHeaders().findAny().isEmpty()) {
|
||||
table.streamRow(0).forEach(tableCellNode -> tableCellNode.setHeader(true));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void addTableCells(List<List<Cell>> rows, Table table, DocumentGraphFactory.Context context) {
|
||||
|
||||
for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) {
|
||||
for (int colIndex = 0; colIndex < rows.get(rowIndex).size(); colIndex++) {
|
||||
addTableCell(rows.get(rowIndex).get(colIndex), rowIndex, colIndex, table, context);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SuppressWarnings("PMD.UnusedPrivateMethod") // PMD actually flags this wrong
|
||||
private void addTableCell(Cell cell, int rowIndex, int colIndex, Table tableNode, DocumentGraphFactory.Context context) {
|
||||
|
||||
Page page = context.getPage(cell.getPageNumber());
|
||||
|
||||
TableCell tableCell = TableCell.builder().documentTree(context.getDocumentTree()).row(rowIndex).col(colIndex).header(cell.isHeaderCell()).bBox(cell.getBounds2D()).build();
|
||||
page.getMainBody().add(tableCell);
|
||||
|
||||
List<Integer> treeId = context.getDocumentTree().createNewTableChildEntryAndReturnId(tableNode, tableCell);
|
||||
tableCell.setTreeId(treeId);
|
||||
|
||||
TextBlock textBlock;
|
||||
if (cell.getTextBlocks().isEmpty()) {
|
||||
tableCell.setLeafTextBlock(context.getTextBlockFactory().emptyTextBlock(tableNode, context, page));
|
||||
} else if (cell.getTextBlocks().size() == 1) {
|
||||
textBlock = context.getTextBlockFactory().buildAtomicTextBlock(cell.getTextBlocks().get(0).getSequences(), tableCell, context, page);
|
||||
tableCell.setLeafTextBlock(textBlock);
|
||||
} else if (firstTextBlockIsHeadline(cell)) {
|
||||
SectionNodeFactory.addSection(tableCell, cell.getTextBlocks().stream().map(tb -> (AbstractPageBlock) tb).toList(), emptyList(), context);
|
||||
} else if (cellAreaIsSmallerThanPageAreaTimesThreshold(cell, page)) {
|
||||
List<TextPositionSequence> sequences = TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(cell.getTextBlocks());
|
||||
textBlock = context.getTextBlockFactory().buildAtomicTextBlock(sequences, tableCell, context, page);
|
||||
tableCell.setLeafTextBlock(textBlock);
|
||||
} else {
|
||||
cell.getTextBlocks().forEach(tb -> DocumentGraphFactory.addParagraphOrHeadline(tableCell, tb, context, emptyList()));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private boolean cellAreaIsSmallerThanPageAreaTimesThreshold(Cell cell, Page page) {
|
||||
|
||||
return cell.getArea() < TABLE_CELL_MERGE_CONTENTS_SIZE_THRESHOLD * page.getHeight() * page.getWidth();
|
||||
}
|
||||
|
||||
|
||||
private boolean firstTextBlockIsHeadline(Cell cell) {
|
||||
|
||||
return cell.getTextBlocks().get(0).isHeadline();
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,79 +1,53 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.factory;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
import java.util.concurrent.atomic.AtomicLong;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.Boundary;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SemanticNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.AtomicTextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Page;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.SemanticNode;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.AtomicTextBlock;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class TextBlockFactory {
|
||||
|
||||
AtomicInteger stringOffset;
|
||||
AtomicLong textBlockIdx;
|
||||
int stringOffset;
|
||||
long textBlockIdx;
|
||||
|
||||
|
||||
public TextBlockFactory() {
|
||||
public AtomicTextBlock buildAtomicTextBlock(List<TextPositionSequence> sequences, SemanticNode parent, DocumentGraphFactory.Context context, Page page) {
|
||||
|
||||
stringOffset = new AtomicInteger();
|
||||
textBlockIdx = new AtomicLong();
|
||||
Integer numberOnPage = context.getAndIncrementTextBlockNumberOnPage(page);
|
||||
return buildAtomicTextBlock(sequences, parent, numberOnPage, page);
|
||||
}
|
||||
|
||||
|
||||
public AtomicTextBlock buildAtomicTextBlock(List<TextPositionSequence> sequences, SemanticNode parent, DocumentGraphFactory.Context context, PageNode page) {
|
||||
public AtomicTextBlock buildAtomicTextBlock(List<TextPositionSequence> sequences, SemanticNode parent, Integer numberOnPage, Page page) {
|
||||
|
||||
Integer numberOnPage = context.pages().get(page).getAndIncrement();
|
||||
return buildAtomicTextBlock(sequences, parent, context, numberOnPage, page);
|
||||
SearchTextWithTextPositionDto searchTextWithTextPositionDto = SearchTextWithTextPositionFactory.buildSearchTextToTextPositionModel(sequences);
|
||||
int offset = stringOffset;
|
||||
stringOffset += searchTextWithTextPositionDto.getSearchText().length();
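// Running document-wide offset (illustrative): a first block whose search text has length 12 gets offset 0
// and, as in the previous builder-based implementation, a boundary of [0|12); the next block then starts at offset 12.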
|
||||
long idx = textBlockIdx;
|
||||
textBlockIdx++;
|
||||
return AtomicTextBlock.fromSearchTextWithTextPositionDto(searchTextWithTextPositionDto, parent, offset, idx, numberOnPage, page);
|
||||
}
|
||||
|
||||
|
||||
public AtomicTextBlock buildAtomicTextBlock(List<TextPositionSequence> sequences,
|
||||
SemanticNode parent,
|
||||
DocumentGraphFactory.Context context,
|
||||
Integer numberOnPage,
|
||||
PageNode page) {
|
||||
public AtomicTextBlock emptyTextBlock(SemanticNode parent, DocumentGraphFactory.Context context, Page page) {
|
||||
|
||||
SearchTextWithTextPositionModel searchTextWithTextPositionModel = SearchTextWithTextPositionFactory.buildSearchTextToTextPositionModel(sequences);
|
||||
int offset = stringOffset.getAndAdd(searchTextWithTextPositionModel.getSearchText().length());
|
||||
|
||||
return AtomicTextBlock.builder()
|
||||
.id(textBlockIdx.getAndIncrement())
|
||||
.parent(parent)
|
||||
.searchText(searchTextWithTextPositionModel.getSearchText())
|
||||
.numberOnPage(numberOnPage)
|
||||
.page(page)
|
||||
.lineBreaks(searchTextWithTextPositionModel.getLineBreaks())
|
||||
.positions(searchTextWithTextPositionModel.getPositions())
|
||||
.stringIdxToPositionIdx(searchTextWithTextPositionModel.getStringCoordsToPositionCoords())
|
||||
.boundary(new Boundary(offset, offset + searchTextWithTextPositionModel.getSearchText().length()))
|
||||
.build();
|
||||
long idx = textBlockIdx;
|
||||
textBlockIdx++;
|
||||
return AtomicTextBlock.empty(idx, stringOffset, page, context.getAndIncrementTextBlockNumberOnPage(page), parent);
|
||||
}
|
||||
|
||||
|
||||
public AtomicTextBlock emptyTextBlock(SemanticNode parent, DocumentGraphFactory.Context context, PageNode page) {
|
||||
public AtomicTextBlock emptyTextBlock(SemanticNode parent, Integer numberOnPage, Page page) {
|
||||
|
||||
return emptyTextBlock(parent, context.pages().get(page).getAndIncrement(), page);
|
||||
}
|
||||
|
||||
|
||||
public AtomicTextBlock emptyTextBlock(SemanticNode parent, Integer numberOnPage, PageNode page) {
|
||||
|
||||
return AtomicTextBlock.builder()
|
||||
.id(textBlockIdx.getAndIncrement())
|
||||
.boundary(new Boundary(stringOffset.get(), stringOffset.get()))
|
||||
.searchText("")
|
||||
.lineBreaks(Collections.emptyList())
|
||||
.page(page)
|
||||
.numberOnPage(numberOnPage)
|
||||
.stringIdxToPositionIdx(Collections.emptyList())
|
||||
.positions(Collections.emptyList())
|
||||
.parent(parent)
|
||||
.build();
|
||||
long idx = textBlockIdx;
|
||||
textBlockIdx++;
|
||||
return AtomicTextBlock.empty(idx, stringOffset, page, numberOnPage, parent);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,11 +1,18 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.graph;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.graph;
|
||||
|
||||
import static java.lang.String.format;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock;
|
||||
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.Setter;
|
||||
|
||||
@Setter
|
||||
@EqualsAndHashCode
|
||||
public class Boundary implements Comparable<Boundary> {
|
||||
|
||||
private int start;
|
||||
@ -15,7 +22,7 @@ public class Boundary implements Comparable<Boundary> {
|
||||
public Boundary(int start, int end) {
|
||||
|
||||
if (start > end) {
|
||||
throw new IllegalArgumentException(String.format("start: %d > end: %d", start, end));
|
||||
throw new IllegalArgumentException(format("start: %d > end: %d", start, end));
|
||||
}
|
||||
this.start = start;
|
||||
this.end = end;
|
||||
@ -55,7 +62,7 @@ public class Boundary implements Comparable<Boundary> {
|
||||
public boolean contains(int start, int end) {
|
||||
|
||||
if (start > end) {
|
||||
throw new IllegalArgumentException(String.format("start: %d > end: %d", start, end));
|
||||
throw new IllegalArgumentException(format("start: %d > end: %d", start, end));
|
||||
}
|
||||
return this.start <= start && end <= this.end;
|
||||
}
|
||||
@ -64,7 +71,7 @@ public class Boundary implements Comparable<Boundary> {
|
||||
public boolean containedBy(int start, int end) {
|
||||
|
||||
if (start > end) {
|
||||
throw new IllegalArgumentException(String.format("start: %d > end: %d", start, end));
|
||||
throw new IllegalArgumentException(format("start: %d > end: %d", start, end));
|
||||
}
|
||||
return start <= this.start && this.end <= end;
|
||||
}
|
||||
@ -78,14 +85,14 @@ public class Boundary implements Comparable<Boundary> {
|
||||
|
||||
public boolean intersects(Boundary boundary) {
|
||||
|
||||
return contains(boundary.start()) || contains(boundary.end() - 1);
|
||||
return boundary.start() < this.end && this.start < boundary.end();
|
||||
}
|
||||
|
||||
|
||||
public List<Boundary> split(List<Integer> splitIndices) {
|
||||
|
||||
if (splitIndices.stream().anyMatch(idx -> !this.contains(idx))) {
|
||||
throw new IndexOutOfBoundsException(String.format("%s splitting indices are out of range for %s", splitIndices.stream().filter(idx -> !this.contains(idx)).toList(), this));
|
||||
throw new IndexOutOfBoundsException(format("%s splitting indices are out of range for %s", splitIndices.stream().filter(idx -> !this.contains(idx)).toList(), this));
|
||||
}
|
||||
List<Boundary> splitBoundaries = new LinkedList<>();
|
||||
int previousIndex = start;
|
||||
@ -103,7 +110,7 @@ public class Boundary implements Comparable<Boundary> {
|
||||
}
|
||||
|
||||
|
||||
public static Boundary merge(List<Boundary> boundaries) {
|
||||
public static Boundary merge(Collection<Boundary> boundaries) {
|
||||
|
||||
int minStart = boundaries.stream().mapToInt(Boundary::start).min().orElseThrow(IllegalArgumentException::new);
|
||||
int maxEnd = boundaries.stream().mapToInt(Boundary::end).max().orElseThrow(IllegalArgumentException::new);
|
||||
@ -114,7 +121,7 @@ public class Boundary implements Comparable<Boundary> {
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return String.format("Boundary [%d|%d)", start, end);
|
||||
return format("Boundary [%d|%d)", start, end);
|
||||
}
|
||||
|
||||
|
||||
@ -132,17 +139,25 @@ public class Boundary implements Comparable<Boundary> {
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
/**
|
||||
* Shrinks the boundary so that textBlock.subSequence(boundary) returns a string without leading or trailing whitespace.
*
* @param textBlock TextBlock to check whitespace against
* @return the trimmed boundary
|
||||
*/
|
||||
public Boundary trim(TextBlock textBlock) {
|
||||
|
||||
return toString().hashCode();
|
||||
}
|
||||
int trimmedStart = this.start;
|
||||
while (Character.isWhitespace(textBlock.charAt(trimmedStart))) {
|
||||
trimmedStart++;
|
||||
}
|
||||
|
||||
int trimmedEnd = this.end;
|
||||
while (Character.isWhitespace(textBlock.charAt(trimmedEnd - 1))) {
|
||||
trimmedEnd--;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object object) {
|
||||
|
||||
return hashCode() == object.hashCode();
|
||||
return new Boundary(trimmedStart, Math.max(trimmedEnd, trimmedStart));
|
||||
}
|
||||
|
||||
}
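A worked example of trim (hypothetical offsets): if the TextBlock's characters at offsets 10 to 16 are "  foo  ", then new Boundary(10, 17).trim(textBlock) returns Boundary [12|15), which covers exactly "foo".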
|
||||
@ -0,0 +1,217 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.graph;
|
||||
|
||||
import static java.lang.String.format;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.GenericSemanticNode;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.SemanticNode;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Table;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.TableCell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlockCollector;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.Getter;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Data
|
||||
@EqualsAndHashCode
|
||||
public class DocumentTree {
|
||||
|
||||
private final Entry root;
|
||||
|
||||
|
||||
public DocumentTree(Document document) {
|
||||
|
||||
root = Entry.builder().treeId(Collections.emptyList()).children(new LinkedList<>()).node(document).build();
|
||||
}
|
||||
|
||||
|
||||
public TextBlock buildTextBlock() {
|
||||
|
||||
return allEntriesInOrder().map(Entry::getNode).filter(SemanticNode::isLeaf).map(SemanticNode::getLeafTextBlock).collect(new TextBlockCollector());
|
||||
}
|
||||
|
||||
|
||||
public List<Integer> createNewMainEntryAndReturnId(GenericSemanticNode node) {
|
||||
|
||||
return createNewChildEntryAndReturnIdImpl(Collections.emptyList(), node);
|
||||
}
|
||||
|
||||
|
||||
public List<Integer> createNewChildEntryAndReturnId(GenericSemanticNode parentNode, GenericSemanticNode node) {
|
||||
|
||||
return createNewChildEntryAndReturnIdImpl(parentNode.getTreeId(), node);
|
||||
}
|
||||
|
||||
|
||||
public List<Integer> createNewChildEntryAndReturnId(GenericSemanticNode parentNode, Table node) {
|
||||
|
||||
return createNewChildEntryAndReturnIdImpl(parentNode.getTreeId(), node);
|
||||
}
|
||||
|
||||
|
||||
public List<Integer> createNewTableChildEntryAndReturnId(Table parentTable, TableCell tableCell) {
|
||||
|
||||
return createNewChildEntryAndReturnIdImpl(parentTable.getTreeId(), tableCell);
|
||||
}
|
||||
|
||||
|
||||
@SuppressWarnings("PMD.UnusedPrivateMethod") // PMD actually flags this wrong
|
||||
private List<Integer> createNewChildEntryAndReturnIdImpl(List<Integer> parentId, SemanticNode node) {
|
||||
|
||||
if (!entryExists(parentId)) {
|
||||
throw new IllegalArgumentException(format("parentId %s does not exist!", parentId));
|
||||
}
|
||||
|
||||
Entry parent = getEntryById(parentId);
|
||||
List<Integer> newId = new LinkedList<>(parentId);
|
||||
newId.add(parent.children.size());
|
||||
parent.children.add(Entry.builder().treeId(newId).node(node).build());
|
||||
|
||||
return newId;
|
||||
}
|
||||
|
||||
|
||||
private boolean entryExists(List<Integer> treeId) {
|
||||
|
||||
if (treeId.isEmpty()) {
|
||||
return root != null;
|
||||
}
|
||||
if (treeId.get(0) < 0 || treeId.get(0) >= root.children.size()) {
return false;
}
Entry entry = root.children.get(treeId.get(0));
|
||||
for (int id : treeId.subList(1, treeId.size())) {
|
||||
if (id >= entry.children.size() || 0 > id) {
|
||||
return false;
|
||||
}
|
||||
entry = entry.children.get(id);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
public Entry getParentEntryById(List<Integer> treeId) {
|
||||
|
||||
return getEntryById(getParentId(treeId));
|
||||
}
|
||||
|
||||
|
||||
public boolean hasParentById(List<Integer> treeId) {
|
||||
|
||||
return !treeId.isEmpty();
|
||||
}
|
||||
|
||||
|
||||
public Stream<SemanticNode> childNodes(List<Integer> treeId) {
|
||||
|
||||
return getEntryById(treeId).children.stream().map(Entry::getNode);
|
||||
}
|
||||
|
||||
|
||||
public Stream<SemanticNode> childNodesOfType(List<Integer> treeId, NodeType nodeType) {
|
||||
|
||||
return getEntryById(treeId).children.stream().filter(entry -> entry.node.getType().equals(nodeType)).map(Entry::getNode);
|
||||
}
|
||||
|
||||
|
||||
private static List<Integer> getParentId(List<Integer> treeId) {
|
||||
|
||||
if (treeId.isEmpty()) {
|
||||
throw new UnsupportedOperationException("Root has no parent!");
|
||||
}
|
||||
if (treeId.size() < 2) {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
return treeId.subList(0, treeId.size() - 1);
|
||||
}
|
||||
|
||||
|
||||
public Entry getEntryById(List<Integer> treeId) {
|
||||
|
||||
if (treeId.isEmpty()) {
|
||||
return root;
|
||||
}
|
||||
Entry entry = root.children.get(treeId.get(0));
|
||||
for (int id : treeId.subList(1, treeId.size())) {
|
||||
entry = entry.children.get(id);
|
||||
}
|
||||
return entry;
|
||||
}
|
||||
|
||||
|
||||
public Stream<Entry> mainEntries() {
|
||||
|
||||
return root.children.stream();
|
||||
}
|
||||
|
||||
|
||||
public Stream<Entry> allEntriesInOrder() {
|
||||
|
||||
return Stream.of(root).flatMap(DocumentTree::flatten);
|
||||
}
|
||||
|
||||
|
||||
public Stream<Entry> allSubEntriesInOrder(List<Integer> parentId) {
|
||||
|
||||
return getEntryById(parentId).children.stream().flatMap(DocumentTree::flatten);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return String.join("\n", allEntriesInOrder().map(Entry::toString).toList());
|
||||
}
|
||||
|
||||
|
||||
private static Stream<Entry> flatten(Entry entry) {
|
||||
|
||||
return Stream.concat(Stream.of(entry), entry.children.stream().flatMap(DocumentTree::flatten));
|
||||
}
|
||||
|
||||
|
||||
public SemanticNode getHighestParentById(List<Integer> treeId) {
|
||||
|
||||
if (treeId.isEmpty()) {
|
||||
return root.node;
|
||||
}
|
||||
return root.children.get(treeId.get(0)).node;
|
||||
}
|
||||
|
||||
|
||||
@Builder
|
||||
@Getter
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE, makeFinal = true)
|
||||
public static class Entry {
|
||||
|
||||
List<Integer> treeId;
|
||||
SemanticNode node;
|
||||
@Builder.Default
|
||||
List<Entry> children = new LinkedList<>();
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return node.toString();
|
||||
}
|
||||
|
||||
|
||||
public NodeType getType() {
|
||||
|
||||
return node.getType();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
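A minimal usage sketch of the tree-id addressing above (illustrative only; it assumes the Lombok builders shown in this diff can be invoked with just these fields, and omits imports):

DocumentTree tree = new DocumentTree(Document.builder().build());
Section section = Section.builder().documentTree(tree).build();
section.setTreeId(tree.createNewMainEntryAndReturnId(section));        // first main entry -> [0]
Table table = Table.builder().documentTree(tree).build();
table.setTreeId(tree.createNewChildEntryAndReturnId(section, table));  // first child of [0] -> [0, 0]
DocumentTree.Entry tableEntry = tree.getEntryById(List.of(0, 0));      // resolves to the table's entry
// allEntriesInOrder() now streams Document, Section, Table in depth-first order.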
|
||||
@ -0,0 +1,8 @@
package com.knecon.fforesight.service.layoutparser.processor.graph.entity;

public enum EntityType {
ENTITY,
RECOMMENDATION,
FALSE_POSITIVE,
FALSE_RECOMMENDATION
}
@ -0,0 +1,228 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.graph.entity;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Collection;
|
||||
import java.util.Comparator;
|
||||
import java.util.Deque;
|
||||
import java.util.HashSet;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Engine;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.Boundary;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Page;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.SemanticNode;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.IdBuilder;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
@EqualsAndHashCode(onlyExplicitlyIncluded = true)
|
||||
public class RedactionEntity {
|
||||
|
||||
// initial values
|
||||
@EqualsAndHashCode.Include
|
||||
final Boundary boundary;
|
||||
@EqualsAndHashCode.Include
|
||||
final String type;
|
||||
@EqualsAndHashCode.Include
|
||||
final EntityType entityType;
|
||||
|
||||
// empty defaults
|
||||
boolean redaction;
|
||||
boolean removed;
|
||||
boolean ignored;
|
||||
boolean resized;
|
||||
boolean skipRemoveEntitiesContainedInLarger;
|
||||
boolean dictionaryEntry;
|
||||
boolean dossierDictionaryEntry;
|
||||
Set<Engine> engines;
|
||||
Set<RedactionEntity> references;
|
||||
@Builder.Default
|
||||
Deque<Integer> matchedRules = new LinkedList<>();
|
||||
String redactionReason;
|
||||
String legalBasis;
|
||||
|
||||
// inferred on graph insertion
|
||||
@EqualsAndHashCode.Include
|
||||
String value;
|
||||
String textBefore;
|
||||
String textAfter;
|
||||
@Builder.Default
|
||||
Set<Page> pages = new HashSet<>();
|
||||
List<RedactionPosition> redactionPositionsPerPage;
|
||||
@Builder.Default
|
||||
List<SemanticNode> intersectingNodes = new LinkedList<>();
|
||||
SemanticNode deepestFullyContainingNode;
|
||||
|
||||
|
||||
public static RedactionEntity initialEntityNode(Boundary boundary, String type, EntityType entityType) {
|
||||
|
||||
return RedactionEntity.builder().type(type).entityType(entityType).boundary(boundary).engines(new HashSet<>()).references(new HashSet<>()).build();
|
||||
}
|
||||
|
||||
|
||||
public boolean occursInNodeOfType(Class<? extends SemanticNode> clazz) {
|
||||
|
||||
return intersectingNodes.stream().anyMatch(clazz::isInstance);
|
||||
}
|
||||
|
||||
|
||||
public boolean occursInNode(SemanticNode semanticNode) {
|
||||
|
||||
return intersectingNodes.stream().anyMatch(node -> node.equals(semanticNode));
|
||||
}
|
||||
|
||||
|
||||
public boolean isType(String type) {
|
||||
|
||||
return this.type.equals(type);
|
||||
}
|
||||
|
||||
|
||||
public boolean isAnyType(List<String> types) {
|
||||
|
||||
return types.contains(type);
|
||||
}
|
||||
|
||||
|
||||
public void addIntersectingNode(SemanticNode containingNode) {
|
||||
|
||||
intersectingNodes.add(containingNode);
|
||||
}
|
||||
|
||||
|
||||
public void removeFromGraph() {
|
||||
|
||||
intersectingNodes.forEach(node -> node.getEntities().remove(this));
|
||||
pages.forEach(page -> page.getEntities().remove(this));
|
||||
intersectingNodes = new LinkedList<>();
|
||||
deepestFullyContainingNode = null;
|
||||
pages = new HashSet<>();
|
||||
removed = true;
|
||||
ignored = true;
|
||||
}
|
||||
|
||||
|
||||
public void addMatchedRule(int ruleNumber) {
|
||||
|
||||
matchedRules.add(ruleNumber);
|
||||
}
|
||||
|
||||
|
||||
public int getMatchedRule() {
|
||||
|
||||
if (matchedRules.isEmpty()) {
|
||||
return 0;
|
||||
}
|
||||
return matchedRules.getLast();
|
||||
}
|
||||
|
||||
|
||||
public List<RedactionPosition> getRedactionPositionsPerPage() {
|
||||
|
||||
if (redactionPositionsPerPage == null || redactionPositionsPerPage.isEmpty()) {
|
||||
Map<Page, List<Rectangle2D>> rectanglesPerLinePerPage = deepestFullyContainingNode.getTextBlock().getPositionsPerPage(boundary);
|
||||
|
||||
Page firstPage = rectanglesPerLinePerPage.keySet()
|
||||
.stream()
|
||||
.min(Comparator.comparingInt(Page::getNumber))
|
||||
.orElseThrow(() -> new RuntimeException("No Positions found on any page!"));
|
||||
String id = IdBuilder.buildId(pages, rectanglesPerLinePerPage.values().stream().flatMap(Collection::stream).toList());
|
||||
redactionPositionsPerPage = rectanglesPerLinePerPage.entrySet().stream().map(entry -> buildRedactionPosition(firstPage, id, entry)).toList();
|
||||
}
|
||||
return redactionPositionsPerPage;
|
||||
}
|
||||
|
||||
|
||||
private static RedactionPosition buildRedactionPosition(Page firstPage, String id, Map.Entry<Page, List<Rectangle2D>> entry) {
|
||||
|
||||
if (entry.getKey().equals(firstPage)) {
|
||||
return new RedactionPosition(id, entry.getKey(), entry.getValue());
|
||||
} else {
|
||||
return new RedactionPosition(id + "-" + entry.getKey().getNumber(), entry.getKey(), entry.getValue());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public boolean containedBy(RedactionEntity redactionEntity) {
|
||||
|
||||
return this.boundary.containedBy(redactionEntity.getBoundary());
|
||||
}
|
||||
|
||||
|
||||
public boolean contains(RedactionEntity redactionEntity) {
|
||||
|
||||
return this.boundary.contains(redactionEntity.getBoundary());
|
||||
}
|
||||
|
||||
|
||||
public boolean intersects(RedactionEntity redactionEntity) {
|
||||
|
||||
return this.boundary.intersects(redactionEntity.getBoundary());
|
||||
}
|
||||
|
||||
|
||||
public void addEngine(Engine engine) {
|
||||
|
||||
engines.add(engine);
|
||||
}
|
||||
|
||||
|
||||
public void addEngines(Set<Engine> engines) {
|
||||
|
||||
this.engines.addAll(engines);
|
||||
}
|
||||
|
||||
|
||||
public void addReference(RedactionEntity reference) {
|
||||
|
||||
references.add(reference);
|
||||
}
|
||||
|
||||
|
||||
public void addReferences(List<RedactionEntity> references) {
|
||||
|
||||
this.references.addAll(references);
|
||||
}
|
||||
|
||||
|
||||
public boolean matchesAnnotationId(String manualRedactionId) {
|
||||
|
||||
return getRedactionPositionsPerPage().stream().anyMatch(entityPosition -> entityPosition.getId().equals(manualRedactionId));
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
sb.append("Entity[\"");
|
||||
sb.append(value);
|
||||
sb.append("\", ");
|
||||
sb.append(boundary);
|
||||
sb.append(", pages[");
|
||||
pages.forEach(page -> {
|
||||
sb.append(page.getNumber());
|
||||
sb.append(", ");
|
||||
});
|
||||
sb.delete(sb.length() - 2, sb.length());
|
||||
sb.append("], type = \"");
|
||||
sb.append(type);
|
||||
sb.append("\", EntityType.");
|
||||
sb.append(entityType);
|
||||
sb.append("]");
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,24 @@
package com.knecon.fforesight.service.layoutparser.processor.graph.entity;

import java.awt.geom.Rectangle2D;
import java.util.List;

import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Page;

import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.experimental.FieldDefaults;

@Data
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class RedactionPosition {

final String id;
Page page;
// Each entry in this list corresponds to an entry in the redaction log, this means:
// An entity might be represented by multiple redaction log entries
List<Rectangle2D> rectanglePerLine;

}
@ -0,0 +1,120 @@
package com.knecon.fforesight.service.layoutparser.processor.graph.nodes;

import java.awt.geom.Rectangle2D;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import com.amazonaws.services.kms.model.NotFoundException;
import com.knecon.fforesight.service.layoutparser.internal.api.data.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree;
import com.knecon.fforesight.service.layoutparser.processor.graph.entity.RedactionEntity;
import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlockCollector;

import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;

@Data
@Builder
@AllArgsConstructor
@NoArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class Document implements GenericSemanticNode {

    Set<Page> pages;
    DocumentTree documentTree;
    Integer numberOfPages;
    TextBlock textBlock;
    @Builder.Default
    Set<RedactionEntity> entities = new HashSet<>();


    @Override
    public NodeType getType() {

        return NodeType.DOCUMENT;
    }


    public TextBlock getTextBlock() {

        if (textBlock == null) {
            textBlock = streamTerminalTextBlocksInOrder().collect(new TextBlockCollector());
        }
        return textBlock;
    }


    public List<Section> getMainSections() {

        return streamChildrenOfType(NodeType.SECTION).map(node -> (Section) node).collect(Collectors.toList());
    }


    public Stream<TextBlock> streamTerminalTextBlocksInOrder() {

        return streamAllNodes().filter(SemanticNode::isLeaf).map(SemanticNode::getLeafTextBlock);
    }


    @Override
    public List<Integer> getTreeId() {

        return Collections.emptyList();
    }


    @Override
    public void setTreeId(List<Integer> tocId) {

        throw new UnsupportedOperationException("Document is always the root of the document tree");
    }


    @Override
    public Headline getHeadline() {

        return streamAllSubNodesOfType(NodeType.HEADLINE).map(node -> (Headline) node).findFirst().orElseThrow(() -> new NotFoundException("No Headlines found in this document!"));
    }


    private Stream<SemanticNode> streamAllNodes() {

        return documentTree.allEntriesInOrder().map(DocumentTree.Entry::getNode);
    }


    public Stream<Image> streamAllImages() {

        return streamAllSubNodesOfType(NodeType.IMAGE).map(node -> (Image) node);
    }


    @Override
    public String toString() {

        return NodeType.DOCUMENT + ": " + this.getTextBlock().buildSummary();
    }


    @Override
    public Map<Page, Rectangle2D> getBBox() {

        Map<Page, Rectangle2D> bBox = new HashMap<>();
        for (Page page : pages) {
            bBox.put(page, new Rectangle2D.Double(0, 0, page.getWidth(), page.getHeight()));
        }
        return bBox;
    }

}
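To make the Document node's accessors concrete, a small hedged sketch of how a caller might use them once the tree has been built; the printed format is illustrative, and it assumes only methods defined in the hunk above (getBBox, getHeadline, getMainSections) plus Page.getNumber, which appears in the entity toString earlier in this diff.

    // Sketch (editor's illustration): consuming the Document convenience accessors.
    static void printSummary(Document document) {
        // getBBox() maps every page to its full-page rectangle
        document.getBBox().forEach((page, box) ->
                System.out.printf("page %d: %.0f x %.0f%n", page.getNumber(), box.getWidth(), box.getHeight()));

        // First headline in tree order; throws NotFoundException if the document has none
        System.out.println("first headline: " + document.getHeadline());

        // Top-level sections directly below the document root
        System.out.println("main sections: " + document.getMainSections().size());
    }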
@ -0,0 +1,65 @@
package com.knecon.fforesight.service.layoutparser.processor.graph.nodes;

import java.util.HashSet;
import java.util.List;
import java.util.Set;

import com.knecon.fforesight.service.layoutparser.internal.api.data.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree;
import com.knecon.fforesight.service.layoutparser.processor.graph.entity.RedactionEntity;
import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock;

import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;

@Data
@Builder
@AllArgsConstructor
@NoArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class Footer implements GenericSemanticNode {

    List<Integer> treeId;
    TextBlock leafTextBlock;

    @EqualsAndHashCode.Exclude
    DocumentTree documentTree;

    @Builder.Default
    @EqualsAndHashCode.Exclude
    Set<RedactionEntity> entities = new HashSet<>();


    @Override
    public NodeType getType() {

        return NodeType.FOOTER;
    }


    @Override
    public boolean isLeaf() {

        return true;
    }


    @Override
    public TextBlock getTextBlock() {

        return leafTextBlock;
    }


    @Override
    public String toString() {

        return treeId + ": " + NodeType.FOOTER + ": " + leafTextBlock.buildSummary();
    }

}
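As a usage note, a short sketch of constructing a Footer leaf through the generated Lombok builder; the tree position and the surrounding DocumentTree and TextBlock instances are hypothetical, and entities falls back to the @Builder.Default empty set when omitted.

    // Sketch: building a Footer leaf node; values are hypothetical.
    static Footer footerFor(DocumentTree tree, TextBlock block) {
        return Footer.builder()
                .treeId(List.of(4, 0))   // hypothetical position in the document tree
                .leafTextBlock(block)
                .documentTree(tree)
                .build();                // entities defaults to an empty HashSet
    }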
Some files were not shown because too many files have changed in this diff.