RED-6725: Integrate new layout parser

* ported current state from RedactManager
Kilian Schuettler 2023-06-15 12:51:35 +02:00
parent cc1fedac41
commit df9cbdc036
134 changed files with 4481 additions and 3639 deletions

View File

@@ -1,5 +1,7 @@
package com.knecon.fforesight.service.layoutparser.internal.api.data;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
@@ -20,4 +22,6 @@ public class AtomicTextBlockData {
int end;
int[] lineBreaks;
}

View File

@@ -15,6 +15,7 @@ public class DocumentData {
PageData[] pages;
AtomicTextBlockData[] atomicTextBlocks;
AtomicPositionBlockData[] atomicPositionBlocks;
TableOfContentsData tableOfContents;
DocumentTreeData documentTreeData;
}

View File

@@ -4,8 +4,6 @@ import java.util.List;
import java.util.Map;
import java.util.stream.Stream;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.NodeType;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
@@ -19,7 +17,7 @@ import lombok.experimental.FieldDefaults;
@AllArgsConstructor
@NoArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class TableOfContentsData {
public class DocumentTreeData {
EntryData root;
@@ -29,9 +27,9 @@ public class TableOfContentsData {
if (tocId.isEmpty()) {
return root;
}
EntryData entry = root.subEntries.get(tocId.get(0));
EntryData entry = root.children.get(tocId.get(0));
for (int id : tocId.subList(1, tocId.size())) {
entry = entry.subEntries.get(id);
entry = entry.children.get(id);
}
return entry;
}
@@ -39,7 +37,7 @@ public class TableOfContentsData {
public Stream<EntryData> streamAllEntries() {
return Stream.concat(Stream.of(root), root.subEntries.stream()).flatMap(TableOfContentsData::flatten);
return Stream.concat(Stream.of(root), root.children.stream()).flatMap(DocumentTreeData::flatten);
}
@@ -51,7 +49,7 @@ public class TableOfContentsData {
private static Stream<EntryData> flatten(EntryData entry) {
return Stream.concat(Stream.of(entry), entry.subEntries.stream().flatMap(TableOfContentsData::flatten));
return Stream.concat(Stream.of(entry), entry.children.stream().flatMap(DocumentTreeData::flatten));
}
@@ -62,19 +60,18 @@ public class TableOfContentsData {
public static class EntryData {
NodeType type;
int[] tocId;
Long[] atomicBlocks;
Long[] pages;
int[] treeId;
Long[] atomicBlockIds;
Long[] pageNumbers;
Map<String, String> properties;
List<EntryData> subEntries;
List<EntryData> children;
@Override
public String toString() {
StringBuilder sb = new StringBuilder();
sb.append("[");
for (int i : tocId) {
for (int i : treeId) {
sb.append(i);
sb.append(",");
}
@@ -83,7 +80,7 @@ public class TableOfContentsData {
sb.append(type);
sb.append(" atbs = ");
sb.append(atomicBlocks.length);
sb.append(atomicBlockIds.length);
return sb.toString();
}
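
For readers following the rename in this hunk (subEntries becomes children, tocId becomes treeId), here is a minimal usage sketch. DocumentTreeWalk is hypothetical and not part of the commit; streamAllEntries() is taken from the hunk above, while getRoot() and getChildren() follow the Lombok-generated accessors used elsewhere in this diff.

import java.util.List;
import com.knecon.fforesight.service.layoutparser.internal.api.data.DocumentTreeData;

public class DocumentTreeWalk {

    // Resolves an entry the same way the hunk above does: each index in the
    // tree id selects one element from the current entry's children list.
    public static DocumentTreeData.EntryData resolve(DocumentTreeData tree, List<Integer> treeId) {
        DocumentTreeData.EntryData entry = tree.getRoot(); // accessor assumed to be Lombok-generated
        for (int id : treeId) {
            entry = entry.getChildren().get(id); // accessor assumed to be Lombok-generated
        }
        return entry;
    }

    // streamAllEntries() is shown in the hunk: the root first, then every descendant entry.
    public static void printAll(DocumentTreeData tree) {
        tree.streamAllEntries().forEach(entry -> System.out.println(entry));
    }
}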

View File

@@ -1,4 +1,6 @@
package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;
package com.knecon.fforesight.service.layoutparser.internal.api.data;
import java.util.Locale;
public enum NodeType {
DOCUMENT,
@@ -9,5 +11,11 @@ public enum NodeType {
TABLE_CELL,
IMAGE,
HEADER,
FOOTER
FOOTER;
public String toString() {
return this.name().charAt(0) + this.name().substring(1).toLowerCase(Locale.ROOT);
}
}
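
A quick, hypothetical check of what the new toString() override produces; NodeTypeToStringDemo is not part of the commit, and the expected strings in the comments follow directly from charAt(0) plus the lower-cased remainder rather than from project output.

import com.knecon.fforesight.service.layoutparser.internal.api.data.NodeType;

public class NodeTypeToStringDemo {
    public static void main(String[] args) {
        System.out.println(NodeType.DOCUMENT);   // prints "Document" (first letter kept, rest lower-cased)
        System.out.println(NodeType.TABLE_CELL); // prints "Table_cell" (underscores are kept, only the case changes)
    }
}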

View File

@@ -1,101 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.graph;
import java.awt.geom.Rectangle2D;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.NodeType;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SectionNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SemanticNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlockCollector;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
@Data
@Builder
@AllArgsConstructor
@NoArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class DocumentGraph implements SemanticNode {
Set<PageNode> pages;
TableOfContents tableOfContents;
Integer numberOfPages;
TextBlock textBlock;
public TextBlock buildTextBlock() {
return streamTerminalTextBlocksInOrder().collect(new TextBlockCollector());
}
public List<SectionNode> getMainSections() {
return streamChildren().filter(node -> node instanceof SectionNode).map(node -> (SectionNode) node).collect(Collectors.toList());
}
public Stream<TextBlock> streamTerminalTextBlocksInOrder() {
return streamAllNodes().filter(SemanticNode::isTerminal).map(SemanticNode::getTerminalTextBlock);
}
public Set<EntityNode> getEntities() {
return streamAllSubNodes().map(SemanticNode::getEntities).flatMap(Set::stream).collect(Collectors.toUnmodifiableSet());
}
@Override
public List<Integer> getTocId() {
return Collections.emptyList();
}
@Override
public void setTocId(List<Integer> tocId) {
throw new UnsupportedOperationException("DocumentGraph is always the root of the Table of Contents");
}
private Stream<SemanticNode> streamAllNodes() {
return tableOfContents.streamAllEntriesInOrder().map(TableOfContents.Entry::getNode);
}
@Override
public String toString() {
return NodeType.DOCUMENT + ": " + buildTextBlock().buildSummary();
}
@Override
public Map<PageNode, Rectangle2D> getBBox() {
Map<PageNode, Rectangle2D> bBox = new HashMap<>();
for (PageNode page : pages) {
bBox.put(page, new Rectangle2D.Double(0, 0, page.getWidth(), page.getHeight()));
}
return bBox;
}
}
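
The following is a hypothetical caller of the DocumentGraph API removed in this hunk; DocumentGraphUsage is not part of the commit, and getNumberOfPages() is assumed to be a Lombok-generated getter.

import java.util.List;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.DocumentGraph;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SectionNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;

public class DocumentGraphUsage {

    // Concatenates every terminal text block of the graph and lists the top-level sections.
    public static void summarize(DocumentGraph graph) {
        TextBlock fullText = graph.buildTextBlock();
        List<SectionNode> mainSections = graph.getMainSections();
        System.out.println(graph.getNumberOfPages() + " pages, " + mainSections.size() + " main sections");
        System.out.println(fullText.buildSummary()); // buildSummary() is the same call used by toString() above
    }
}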

View File

@@ -1,193 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.graph;
import static java.lang.String.format;
import java.nio.charset.StandardCharsets;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.stream.Stream;
import com.google.common.hash.Hashing;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.NodeType;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SemanticNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlockCollector;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.Getter;
import lombok.experimental.FieldDefaults;
@Data
public class TableOfContents {
private final Entry root;
public TableOfContents(DocumentGraph documentGraph) {
root = Entry.builder().tocId(Collections.emptyList()).type(NodeType.DOCUMENT).children(new LinkedList<>()).node(documentGraph).build();
}
public TextBlock buildTextBlock() {
return streamAllEntriesInOrder().map(Entry::getNode).filter(SemanticNode::isTerminal).map(SemanticNode::getTerminalTextBlock).collect(new TextBlockCollector());
}
public List<Integer> createNewMainEntryAndReturnId(NodeType nodeType, SemanticNode node) {
return createNewChildEntryAndReturnId(Collections.emptyList(), nodeType, node);
}
public List<Integer> createNewChildEntryAndReturnId(List<Integer> parentId, NodeType nodeType, SemanticNode node) {
if (!entryExists(parentId)) {
throw new UnsupportedOperationException(format("parentId %s does not exist!", parentId));
}
Entry parent = getEntryById(parentId);
List<Integer> newId = new LinkedList<>(parentId);
newId.add(parent.children.size());
parent.children.add(Entry.builder().tocId(newId).node(node).type(nodeType).children(new LinkedList<>()).build());
return newId;
}
private boolean entryExists(List<Integer> tocId) {
if (tocId.isEmpty()) {
return root != null;
}
Entry entry = root.children.get(tocId.get(0));
for (int id : tocId.subList(1, tocId.size())) {
if (id >= entry.children.size() || 0 > id) {
return false;
}
entry = entry.children.get(id);
}
return true;
}
public Entry getParentEntryById(List<Integer> tocId) {
return getEntryById(getParentId(tocId));
}
public boolean hasParentById(List<Integer> tocId) {
return entryExists(getParentId(tocId));
}
public Stream<SemanticNode> streamChildrenNodes(List<Integer> tocId) {
return getEntryById(tocId).children.stream().map(Entry::getNode);
}
private static List<Integer> getParentId(List<Integer> tocId) {
if (tocId.isEmpty()) {
throw new UnsupportedOperationException("Root has no parent!");
}
if (tocId.size() < 2) {
return Collections.emptyList();
}
return tocId.subList(0, tocId.size() - 1);
}
public Entry getEntryById(List<Integer> tocId) {
if (tocId.isEmpty()) {
return root;
}
Entry entry = root.children.get(tocId.get(0));
for (int id : tocId.subList(1, tocId.size())) {
entry = entry.children.get(id);
}
return entry;
}
public Stream<Entry> streamMainEntries() {
return root.children.stream();
}
public Stream<Entry> streamAllEntriesInOrder() {
return Stream.of(root).flatMap(TableOfContents::flatten);
}
public Stream<Entry> streamAllSubEntriesInOrder(List<Integer> parentId) {
return getEntryById(parentId).getChildren().stream().flatMap(TableOfContents::flatten);
}
@Override
public String toString() {
return String.join("\n", streamAllEntriesInOrder().map(Entry::toString).toList());
}
public String toString(List<Integer> id) {
return String.join("\n", streamAllSubEntriesInOrder(id).map(Entry::toString).toList());
}
private static Stream<Entry> flatten(Entry entry) {
return Stream.concat(Stream.of(entry), entry.children.stream().flatMap(TableOfContents::flatten));
}
@Builder
@Getter
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE, makeFinal = true)
public static class Entry {
List<Integer> tocId;
NodeType type;
SemanticNode node;
List<Entry> children;
@Override
public String toString() {
return node.toString();
}
@Override
public int hashCode() {
return Hashing.murmur3_32_fixed().hashString(toString(), StandardCharsets.UTF_8).hashCode();
}
@Override
public boolean equals(Object o) {
return o instanceof Entry && o.hashCode() == this.hashCode();
}
}
}
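
A minimal construction sketch for the TableOfContents API shown above, assuming the caller already has a DocumentGraph plus section and paragraph nodes; TocConstructionSketch is hypothetical, and the positional ids ([0] and [0, 0] for the first entries) follow from the insertion logic in createNewChildEntryAndReturnId.

import java.util.List;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.DocumentGraph;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.NodeType;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.ParagraphNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SectionNode;

public class TocConstructionSketch {

    // Registers a section under the root and a paragraph under that section;
    // the returned ids are positional indices into the children lists.
    public static TableOfContents build(DocumentGraph graph, SectionNode section, ParagraphNode paragraph) {
        TableOfContents toc = new TableOfContents(graph);
        List<Integer> sectionId = toc.createNewMainEntryAndReturnId(NodeType.SECTION, section);
        List<Integer> paragraphId = toc.createNewChildEntryAndReturnId(sectionId, NodeType.PARAGRAPH, paragraph);
        section.setTocId(sectionId);       // setTocId is part of the SemanticNode contract shown later in this diff
        paragraph.setTocId(paragraphId);
        return toc;
    }
}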

View File

@@ -1,76 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.graph.entity;
import java.util.Collections;
import java.util.List;
import java.util.Set;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.Boundary;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SemanticNode;
public interface EntityNode {
/**
* This represents the text, which is contained within the boundary of the Entity.
*
* @return String
*/
String getValue();
/**
* The Boundary primarily defines the Entity; all other values may be inferred from it.
*
* @return Boundary, uniquely identifying this Entity
*/
Boundary getBoundary();
/**
* The deepest fully containing node is the deepest node in the document tree structure
* whose boundary fully contains the boundary of this entity.
*
* @return the deepest fully containing node
*/
SemanticNode getDeepestFullyContainingNode();
/**
* The intersecting nodes represent all nodes whose boundary intersects the boundary of this entity.
*
* @return all intersecting Nodes
*/
List<SemanticNode> getIntersectingNodes();
void setDeepestFullyContainingNode(SemanticNode semanticNode);
void addIntersectingNode(SemanticNode semanticNode);
void setIntersectingNodes(List<SemanticNode> semanticNodes);
/**
* @return all pages this entity intersects.
*/
Set<PageNode> getPages();
void setPages(Set<PageNode> pages);
/**
* Removes all occurrences of this entity in the graph and resets all graph-specific fields.
*/
default void removeFromGraph() {
getIntersectingNodes().forEach(node -> node.getEntities().remove(this));
getPages().forEach(page -> page.getEntities().remove(this));
setPages(Collections.emptySet());
setDeepestFullyContainingNode(null);
setIntersectingNodes(Collections.emptyList());
}
}
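
A small, hypothetical helper built on the interface above, illustrating the removeFromGraph() contract documented in the Javadoc; EntityCleanup and the emptiness filter are illustrations only.

import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;

public class EntityCleanup {

    // Detaches entities with no text from the graph; removeFromGraph() clears the
    // back-references held by intersecting nodes and pages, as documented above.
    public static void dropEmptyEntities(Iterable<EntityNode> entities) {
        for (EntityNode entity : entities) {
            if (entity.getValue() == null || entity.getValue().isBlank()) {
                entity.removeFromGraph();
            }
        }
    }
}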

View File

@@ -1,45 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.graph.entity;
import java.awt.geom.Rectangle2D;
import java.nio.charset.StandardCharsets;
import java.util.List;
import com.google.common.hash.Hashing;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode;
import lombok.AccessLevel;
import lombok.Builder;
import lombok.Data;
import lombok.experimental.FieldDefaults;
@Data
@Builder
@FieldDefaults(level = AccessLevel.PRIVATE)
public class EntityPosition {
PageNode pageNode;
List<Rectangle2D> rectanglePerLine;
public String getId() {
return String.valueOf(hashCode());
}
@Override
public int hashCode() {
StringBuilder sb = new StringBuilder();
sb.append(pageNode.getNumber());
rectanglePerLine.forEach(r -> sb.append(r.getX()).append(r.getY()).append(r.getWidth()).append(r.getHeight()));
return Hashing.murmur3_128().hashString(sb.toString(), StandardCharsets.UTF_8).hashCode();
}
@Override
public boolean equals(Object o) {
return o instanceof EntityPosition && o.hashCode() == this.hashCode();
}
}

View File

@@ -1,53 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
@Data
@Builder
@AllArgsConstructor
@NoArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class FooterNode implements SemanticNode {
List<Integer> tocId;
TextBlock terminalTextBlock;
@Builder.Default
boolean terminal = true;
@EqualsAndHashCode.Exclude
TableOfContents tableOfContents;
@Builder.Default
@EqualsAndHashCode.Exclude
Set<EntityNode> entities = new HashSet<>();
@Override
public TextBlock buildTextBlock() {
return terminalTextBlock;
}
@Override
public String toString() {
return tocId + ": " + NodeType.FOOTER + ": " + terminalTextBlock.buildSummary();
}
}

View File

@@ -1,53 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
@Data
@Builder
@AllArgsConstructor
@NoArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class HeaderNode implements SemanticNode {
List<Integer> tocId;
TextBlock terminalTextBlock;
@Builder.Default
boolean terminal = true;
@EqualsAndHashCode.Exclude
TableOfContents tableOfContents;
@Builder.Default
@EqualsAndHashCode.Exclude
Set<EntityNode> entities = new HashSet<>();
@Override
public TextBlock buildTextBlock() {
return terminalTextBlock;
}
@Override
public String toString() {
return tocId + ": " + NodeType.HEADER + ": " + terminalTextBlock.buildSummary();
}
}

View File

@@ -1,60 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
@Data
@Builder
@AllArgsConstructor
@NoArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class HeadlineNode implements SemanticNode {
List<Integer> tocId;
TextBlock terminalTextBlock;
@Builder.Default
boolean terminal = true;
@EqualsAndHashCode.Exclude
TableOfContents tableOfContents;
@Builder.Default
@EqualsAndHashCode.Exclude
Set<EntityNode> entities = new HashSet<>();
@Override
public TextBlock buildTextBlock() {
return terminalTextBlock;
}
@Override
public String toString() {
return tocId + ": " + NodeType.HEADLINE + ": " + terminalTextBlock.buildSummary();
}
@Override
public SemanticNode getHeadline() {
return this;
}
}

View File

@@ -1,87 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;
import java.awt.geom.Rectangle2D;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlockCollector;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
@Data
@Builder
@AllArgsConstructor
@NoArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class ImageNode implements SemanticNode {
List<Integer> tocId;
ImageType imageType;
boolean transparency;
Rectangle2D position;
boolean redaction;
boolean ignored;
@Builder.Default
String redactionReason = "";
@Builder.Default
String legalBasis = "";
@Builder.Default
int matchedRule = -1;
@EqualsAndHashCode.Exclude
PageNode page;
@EqualsAndHashCode.Exclude
TableOfContents tableOfContents;
@Builder.Default
@EqualsAndHashCode.Exclude
Set<EntityNode> entities = new HashSet<>();
@Override
public TextBlock buildTextBlock() {
return streamAllSubNodes().filter(SemanticNode::isTerminal).map(SemanticNode::getTerminalTextBlock).collect(new TextBlockCollector());
}
@Override
public Set<PageNode> getPages() {
return Collections.singleton(page);
}
@Override
public String toString() {
return tocId + ": " + NodeType.IMAGE + ": " + imageType.toString() + " " + position;
}
@Override
public Map<PageNode, Rectangle2D> getBBox() {
Map<PageNode, Rectangle2D> bBoxPerPage = new HashMap<>();
bBoxPerPage.put(page, position);
return bBoxPerPage;
}
}

View File

@@ -1,9 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;
public enum ImageType {
LOGO,
FORMULA,
SIGNATURE,
OTHER,
OCR
}

View File

@@ -1,71 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlockCollector;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.EqualsAndHashCode;
import lombok.Getter;
import lombok.Setter;
import lombok.experimental.FieldDefaults;
@Getter
@Setter
@Builder
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class PageNode {
Integer number;
Integer height;
Integer width;
Integer rotation;
@EqualsAndHashCode.Exclude
List<SemanticNode> mainBody;
@EqualsAndHashCode.Exclude
HeaderNode header;
@EqualsAndHashCode.Exclude
FooterNode footer;
@Builder.Default
@EqualsAndHashCode.Exclude
Set<EntityNode> entities = new HashSet<>();
@Builder.Default
@EqualsAndHashCode.Exclude
Set<ImageNode> images = new HashSet<>();
public TextBlock getMainBodyTextBlock() {
return mainBody.stream().filter(SemanticNode::isTerminal).map(SemanticNode::getTerminalTextBlock).collect(new TextBlockCollector());
}
@Override
public String toString() {
return String.valueOf(number);
}
@Override
public int hashCode() {
return number;
}
@Override
public boolean equals(Object o) {
return o instanceof PageNode && o.hashCode() == this.hashCode();
}
}

View File

@@ -1,51 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults;
@Data
@Builder
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class ParagraphNode implements SemanticNode {
List<Integer> tocId;
TextBlock terminalTextBlock;
@Builder.Default
boolean terminal = true;
@EqualsAndHashCode.Exclude
TableOfContents tableOfContents;
@Builder.Default
@EqualsAndHashCode.Exclude
Set<EntityNode> entities = new HashSet<>();
@Override
public TextBlock buildTextBlock() {
return terminalTextBlock;
}
@Override
public String toString() {
return tocId + ": " + NodeType.PARAGRAPH + ": " + terminalTextBlock.buildSummary();
}
}

View File

@@ -1,63 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;
import java.util.HashSet;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.Set;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlockCollector;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults;
@Data
@Builder
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class SectionNode implements SemanticNode {
List<Integer> tocId;
TextBlock textBlock;
@EqualsAndHashCode.Exclude
TableOfContents tableOfContents;
@Builder.Default
@EqualsAndHashCode.Exclude
Set<EntityNode> entities = new HashSet<>();
@Override
public TextBlock buildTextBlock() {
if (textBlock == null) {
textBlock = streamAllSubNodes().filter(SemanticNode::isTerminal).map(SemanticNode::getTerminalTextBlock).collect(new TextBlockCollector());
}
return textBlock;
}
@Override
public String toString() {
return tocId.toString() + ": " + NodeType.SECTION + ": " + buildTextBlock().buildSummary();
}
public HeadlineNode getHeadline() {
return streamChildren().filter(node -> node instanceof HeadlineNode)
.map(node -> (HeadlineNode) node)
.findFirst()
.orElseThrow(() -> new NoSuchElementException("ClassificationSection has no Headline!"));
}
}

View File

@@ -1,275 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;
import java.awt.geom.Rectangle2D;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.Boundary;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.AtomicTextBlock;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.internal.api.services.RectangleTransformations;
public interface SemanticNode {
/**
* Searches all Nodes located underneath this Node in the TableOfContents and concatenates their AtomicTextBlocks into a single TextBlockEntity.
* So, for a ClassificationSection, all TextBlocks of Subsections, Paragraphs, and Tables are concatenated into a single TextBlockEntity.
* If the Node is Terminal, the TerminalTextBlock will be returned instead.
*
* @return ClassificationTextBlock containing all AtomicTextBlocks that are located under this Node.
*/
TextBlock buildTextBlock();
/**
* Any Node maintains its own Set of Entities.
* This Set contains all Entities whose boundary intersects the boundary of this node.
*
* @return Set of all Entities associated with this Node
*/
Set<EntityNode> getEntities();
/**
* Each AtomicTextBlock is assigned a page, so to get the pages this node appears on, it collects the PageNodes from each AtomicTextBlock belonging to this node's ClassificationTextBlock.
*
* @return Set of PageNodes this node appears on.
*/
default Set<PageNode> getPages() {
return buildTextBlock().getPages();
}
/**
* @return the TableOfContents of the ClassificationDocument this node belongs to
*/
TableOfContents getTableOfContents();
/**
* The id is a List of Integers uniquely identifying this node in the TableOfContents.
*
* @return the TableOfContents ID
*/
List<Integer> getTocId();
/**
* This should only be used during graph construction.
*
* @param tocId List of Integers
*/
void setTocId(List<Integer> tocId);
/**
* Traverses the Tree upwards until it hits a HeadlineNode, or a SectionNode, which will then return the first HeadlineNode from its children.
* Throws NoSuchElementException if no Headline is found this way.
*
* @return First HeadlineNode found
*/
default SemanticNode getHeadline() {
return getParent().getHeadline();
}
/**
* @return boolean indicating whether this Node has a Parent in the TableOfContents
*/
default boolean hasParent() {
return getTableOfContents().hasParentById(getTocId());
}
/**
* @return The SemanticNode representing the Parent in the TableOfContents
* throws NotFoundException, when no parent is present
*/
default SemanticNode getParent() {
return getTableOfContents().getParentEntryById(getTocId()).getNode();
}
/**
* Terminal means a SemanticNode has direct access to a ClassificationTextBlock; by default this is false and must be overridden.
* Currently only Sections, Images, and Tables are not terminal.
* A TableCell might be Terminal depending on its area compared to the page.
*
* @return boolean, indicating if a Node has direct access to a ClassificationTextBlock
*/
default boolean isTerminal() {
return false;
}
/**
* Terminal means a SemanticNode has direct access to a ClassificationTextBlock; by default this is false and must be overridden.
* Currently only Sections, Images, and Tables are not terminal.
*
* @return AtomicTextBlock
*/
default TextBlock getTerminalTextBlock() {
throw new UnsupportedOperationException("Only terminal Nodes have access to TerminalTextBlocks!");
}
default void setTerminalTextBlock(TextBlock textBlock) {
throw new UnsupportedOperationException();
}
/**
* Each AtomicTextBlock has an index on its page; this returns the number of the first AtomicTextBlock underneath this node.
* If this node does not have any AtomicTextBlocks underneath it (e.g. an empty TableCell), it returns -1.
*
* @return Integer representing the number on the page
*/
default Integer getNumberOnPage() {
TextBlock textBlock = buildTextBlock();
if (textBlock.getAtomicTextBlocks().size() > 0) {
return buildTextBlock().getAtomicTextBlocks().get(0).getNumberOnPage();
} else {
return -1;
}
}
/**
* @return true, if this node's ClassificationTextBlock is not empty
*/
default boolean hasText() {
return buildTextBlock().length() > 0;
}
/**
* @param string A String which the ClassificationTextBlock might contain
* @return true, if this node's ClassificationTextBlock contains the string
*/
default boolean containsString(String string) {
return buildTextBlock().getSearchText().contains(string);
}
/**
* @param strings A List of Strings which the ClassificationTextBlock might contain
* @return true, if this node's ClassificationTextBlock contains any of the strings
*/
default boolean containsAnyString(List<String> strings) {
return strings.stream().anyMatch(this::containsString);
}
/**
* This function is used during insertion of EntityNodes into the graph; it checks whether this node's boundary intersects, or even fully contains, the boundary of the EntityNode.
* It sets the fields accordingly and recursively calls this function on all its children.
*
* @param entityNode EntityNode, which is being inserted into the graph
*/
default void addThisToEntityIfIntersects(EntityNode entityNode) {
TextBlock textBlock = buildTextBlock();
if (textBlock.getBoundary().intersects(entityNode.getBoundary())) {
if (textBlock.containsBoundary(entityNode.getBoundary())) {
entityNode.setDeepestFullyContainingNode(this);
}
entityNode.addIntersectingNode(this);
streamChildren().forEach(node -> node.addThisToEntityIfIntersects(entityNode));
}
}
/**
* Streams all children located directly underneath this node in the TableOfContents.
*
* @return Stream of all children
*/
default Stream<SemanticNode> streamChildren() {
return getTableOfContents().streamChildrenNodes(getTocId());
}
/**
* recursively streams all SemanticNodes located underneath this node in the TableOfContents in order.
*
* @return Stream of all SubNodes
*/
default Stream<SemanticNode> streamAllSubNodes() {
return getTableOfContents().streamAllSubEntriesInOrder(getTocId()).map(TableOfContents.Entry::getNode);
}
/**
* @return Boundary of this Node's ClassificationTextBlock
*/
default Boundary getBoundary() {
return buildTextBlock().getBoundary();
}
/**
* If this Node is Terminal, it will calculate the boundingBox of its TerminalTextBlock; otherwise it will calculate the union of the BoundingBoxes of all its Children.
* If called on the ClassificationDocument, it will return the cropbox of each page.
*
* @return Rectangle2D fully encapsulating this Node for each page.
*/
default Map<PageNode, Rectangle2D> getBBox() {
Map<PageNode, Rectangle2D> bBoxPerPage = new HashMap<>();
if (isTerminal()) {
return getBBoxFromTerminalTextBlock(bBoxPerPage);
}
return getBBoxFromChildren(bBoxPerPage);
}
/**
* TODO this does not yet work for sections spanning multiple columns.
*
* @param bBoxPerPage initial empty BoundingBox
* @return The union of the BoundingBoxes of all children
*/
private Map<PageNode, Rectangle2D> getBBoxFromChildren(Map<PageNode, Rectangle2D> bBoxPerPage) {
return streamChildren().map(SemanticNode::getBBox).reduce((map1, map2) -> {
map1.forEach((page, rectangle) -> map2.merge(page, rectangle, (rect1, rect2) -> rect1.createUnion(rect2).getBounds2D()));
return map2;
}).orElse(bBoxPerPage);
}
/**
* @param bBoxPerPage initial empty BoundingBox
* @return The union of all BoundingBoxes of the ClassificationTextBlock of this node
*/
private Map<PageNode, Rectangle2D> getBBoxFromTerminalTextBlock(Map<PageNode, Rectangle2D> bBoxPerPage) {
Map<PageNode, List<AtomicTextBlock>> atomicTextBlockPerPage = buildTextBlock().getAtomicTextBlocks().stream().collect(Collectors.groupingBy(AtomicTextBlock::getPage));
atomicTextBlockPerPage.forEach((page, atbs) -> bBoxPerPage.put(page, RectangleTransformations.bBoxUnionAtomicTextBlock(atbs)));
return bBoxPerPage;
}
}
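
To make the default-method contract above concrete, here is a hypothetical query helper; SemanticNodeQueries and the sample search strings are illustrations only, and every call used is declared in the interface.

import java.awt.geom.Rectangle2D;
import java.util.List;
import java.util.Map;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SemanticNode;

public class SemanticNodeQueries {

    // Combines two default methods: a text search over the node's ClassificationTextBlock
    // and the per-page bounding box union described in getBBox().
    public static void inspect(SemanticNode node) {
        boolean matches = node.containsAnyString(List.of("Invoice", "Rechnung")); // sample strings
        Map<PageNode, Rectangle2D> bBoxPerPage = node.getBBox();
        bBoxPerPage.forEach((page, box) ->
                System.out.println("page " + page + ": " + box + (matches ? " (text match)" : "")));
    }
}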

View File

@@ -1,92 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;
import java.awt.geom.Rectangle2D;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Stream;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlockCollector;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults;
@Data
@Builder
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class TableCellNode implements SemanticNode {
List<Integer> tocId;
int row;
int col;
boolean header;
Rectangle2D bBox;
@Builder.Default
boolean terminal = true;
TextBlock terminalTextBlock;
TextBlock textBlock;
@EqualsAndHashCode.Exclude
TableOfContents tableOfContents;
@Builder.Default
@EqualsAndHashCode.Exclude
Set<EntityNode> entities = new HashSet<>();
@Override
public Map<PageNode, Rectangle2D> getBBox() {
Map<PageNode, Rectangle2D> bBoxPerPage = new HashMap<>();
getPages().forEach(page -> bBoxPerPage.put(page, bBox));
return bBoxPerPage;
}
@Override
public TextBlock buildTextBlock() {
if (terminal) {
return terminalTextBlock;
}
if (textBlock == null) {
textBlock = streamAllSubNodes().filter(SemanticNode::isTerminal).map(SemanticNode::getTerminalTextBlock).collect(new TextBlockCollector());
}
return textBlock;
}
@Override
public String toString() {
return tocId + ": " + NodeType.TABLE_CELL + ": " + buildTextBlock().buildSummary();
}
public boolean hasHeader(String headerString) {
return getHeaders().anyMatch(header -> header.buildTextBlock().getSearchText().strip().equals(headerString));
}
private Stream<TableCellNode> getHeaders() {
TableNode tableNode = (TableNode) getParent();
return tableNode.streamHeadersForCell(row, col);
}
}

View File

@@ -1,73 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.stream.Stream;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlockCollector;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults;
@Data
@Builder
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class TableNode implements SemanticNode {
List<Integer> tocId;
TableOfContents tableOfContents;
Integer numberOfRows;
Integer numberOfCols;
TextBlock textBlock;
@Builder.Default
@EqualsAndHashCode.Exclude
Set<EntityNode> entities = new HashSet<>();
public Stream<TableCellNode> streamTableCells() {
return streamChildren().map(node -> (TableCellNode) node);
}
public Stream<TableCellNode> streamHeaders() {
return streamTableCells().filter(TableCellNode::isHeader);
}
public Stream<TableCellNode> streamHeadersForCell(int row, int col) {
return streamHeaders().filter(cell -> cell.getRow() == row || cell.getCol() == col);
}
@Override
public TextBlock buildTextBlock() {
if (textBlock == null) {
textBlock = streamAllSubNodes().filter(SemanticNode::isTerminal).map(SemanticNode::getTerminalTextBlock).collect(new TextBlockCollector());
}
return textBlock;
}
@Override
public String toString() {
return tocId.toString() + ": " + NodeType.TABLE + ": " + buildTextBlock().buildSummary();
}
}

View File

@@ -1,131 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock;
import java.awt.geom.Rectangle2D;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.Boundary;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityPosition;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SemanticNode;
import com.knecon.fforesight.service.layoutparser.internal.api.services.RectangleTransformations;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults;
@Data
@Builder
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class AtomicTextBlock implements TextBlock {
Long id;
Integer numberOnPage;
PageNode page;
//string coordinates
Boundary boundary;
String searchText;
List<Integer> lineBreaks;
//position coordinates
List<Integer> stringIdxToPositionIdx;
List<Rectangle2D> positions;
@EqualsAndHashCode.Exclude
SemanticNode parent;
@Override
public int numberOfLines() {
return lineBreaks.size() + 1;
}
public CharSequence getLine(int lineNumber) {
if (lineNumber >= numberOfLines() || lineNumber < 0) {
throw new IndexOutOfBoundsException(String.format("line %d out of range for AtomicTextBlock with %d lines", lineNumber, numberOfLines()));
}
if (lineNumber == 0) {
return subSequence(boundary.start(), lineBreaks.get(0) + boundary.start());
} else if (lineNumber == numberOfLines() - 1) {
return subSequence(lineBreaks.get(lineBreaks.size() - 1) + boundary.start(), boundary.end());
}
return subSequence(lineBreaks.get(lineNumber - 1) + boundary.start(), lineBreaks.get(lineNumber) + boundary.start());
}
@Override
public List<AtomicTextBlock> getAtomicTextBlocks() {
return List.of(this);
}
@Override
public int getNextLinebreak(int fromIndex) {
return lineBreaks.stream()//
.filter(linebreak -> linebreak > fromIndex - boundary.start()) //
.findFirst() //
.orElse(searchText.length()) + boundary.start();
}
@Override
public int getPreviousLinebreak(int fromIndex) {
return lineBreaks.stream()//
.filter(linebreak -> linebreak <= fromIndex - boundary.start())//
.reduce((a, b) -> b)//
.orElse(0) + boundary.start();
}
@Override
public Rectangle2D getPosition(int stringIdx) {
return positions.get(stringIdxToPositionIdx.get(stringIdx - boundary.start()));
}
@Override
public List<Rectangle2D> getPositions(Boundary stringBoundary) {
if (!containsBoundary(stringBoundary)) {
throw new IndexOutOfBoundsException(String.format("%s is out of bounds for %s", stringBoundary, this.boundary));
}
if (stringBoundary.end() == this.boundary.end()) {
return positions.subList(stringIdxToPositionIdx.get(stringBoundary.start() - this.boundary.start()), positions.size());
}
return positions.subList(stringIdxToPositionIdx.get(stringBoundary.start() - this.boundary.start()),
stringIdxToPositionIdx.get(stringBoundary.end() - this.boundary.start()));
}
public List<EntityPosition> getEntityPositionsPerPage(Boundary stringBoundary) {
List<Rectangle2D> positionsPerLine = stringBoundary.split(getLineBreaks().stream().map(lb -> lb + boundary.start()).filter(stringBoundary::contains).toList())
.stream()
.map(this::getPositions)
.map(RectangleTransformations::rectangleUnion)
.toList();
return List.of(EntityPosition.builder().rectanglePerLine(positionsPerLine).pageNode(page).build());
}
@Override
public String toString() {
return searchText;
}
}
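
Because the line-break offsets above are stored relative to the block's Boundary start, here is a hypothetical navigation helper showing the absolute-index arithmetic; LinebreakNavigation is not part of the commit, and getSearchText()/getBoundary() are assumed to be Lombok-generated getters.

import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.AtomicTextBlock;

public class LinebreakNavigation {

    // Returns (approximately) the line that contains the given absolute string index:
    // getPreviousLinebreak/getNextLinebreak already add boundary.start(), so the result
    // is converted back to a searchText-relative range before slicing.
    public static String lineAround(AtomicTextBlock block, int stringIdx) {
        int from = block.getPreviousLinebreak(stringIdx);
        int to = block.getNextLinebreak(stringIdx);
        int start = block.getBoundary().start();
        return block.getSearchText().substring(from - start, to - start);
    }
}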

View File

@@ -1,229 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.mapper;
import static com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.NodeType.FOOTER;
import static com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.NodeType.HEADER;
import java.awt.geom.Rectangle2D;
import java.util.Arrays;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import com.google.common.primitives.Ints;
import com.knecon.fforesight.service.layoutparser.internal.api.data.AtomicPositionBlockData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.AtomicTextBlockData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.DocumentData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.PageData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.TableOfContentsData;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.Boundary;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.DocumentGraph;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.FooterNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.HeaderNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.HeadlineNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.ImageNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.ParagraphNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SectionNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SemanticNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.TableCellNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.TableNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.AtomicTextBlock;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlockCollector;
import lombok.experimental.UtilityClass;
@UtilityClass
public class DocumentGraphMapper {
public DocumentGraph toDocumentGraph(DocumentData documentData) {
DocumentGraph documentGraph = new DocumentGraph();
Context context = new Context(documentData,
new TableOfContents(documentGraph),
new LinkedList<>(),
new LinkedList<>(),
Arrays.stream(documentData.getAtomicTextBlocks()).toList(),
Arrays.stream(documentData.getAtomicPositionBlocks()).toList());
context.pages.addAll(Arrays.stream(documentData.getPages()).map(DocumentGraphMapper::buildPage).toList());
context.tableOfContents.getRoot().getChildren().addAll(buildEntries(documentData.getTableOfContents().getRoot().getSubEntries(), context));
documentGraph.setTableOfContents(context.tableOfContents);
documentGraph.setPages(new HashSet<>(context.pages));
documentGraph.setNumberOfPages(documentData.getPages().length);
documentGraph.setTextBlock(documentGraph.buildTextBlock());
return documentGraph;
}
private List<TableOfContents.Entry> buildEntries(List<TableOfContentsData.EntryData> entries,
Context context) {
List<TableOfContents.Entry> newEntries = new LinkedList<>();
for (TableOfContentsData.EntryData entryData : entries) {
boolean terminal = isTerminal(entryData);
List<PageNode> pages = Arrays.stream(entryData.getPages()).map(pageNumber -> getPage(pageNumber, context)).toList();
SemanticNode node = switch (entryData.getType()) {
case SECTION -> buildSection(context);
case PARAGRAPH -> buildParagraph(context, terminal);
case HEADLINE -> buildHeadline(context, terminal);
case HEADER -> buildHeader(context, terminal);
case FOOTER -> buildFooter(context, terminal);
case TABLE -> buildTable(context, entryData.getProperties());
case TABLE_CELL -> buildTableCell(context, entryData.getProperties(), terminal);
case IMAGE -> buildImage(context, entryData.getProperties());
default -> throw new UnsupportedOperationException("Not yet implemented for type " + entryData.getType());
};
if (node.isTerminal()) {
TextBlock textBlock = toTextBlock(entryData.getAtomicBlocks(), context, node);
node.setTerminalTextBlock(textBlock);
}
List<Integer> tocId = Arrays.stream(entryData.getTocId()).boxed().toList();
node.setTocId(tocId);
if (entryData.getType() == HEADER) {
pages.forEach(page -> page.setHeader((HeaderNode) node));
} else if (entryData.getType() == FOOTER) {
pages.forEach(page -> page.setFooter((FooterNode) node));
} else {
pages.forEach(page -> page.getMainBody().add(node));
}
newEntries.add(TableOfContents.Entry.builder().tocId(tocId).type(entryData.getType()).children(buildEntries(entryData.getSubEntries(), context)).node(node).build());
}
return newEntries;
}
private HeadlineNode buildHeadline(Context context, boolean terminal) {
return HeadlineNode.builder().terminal(terminal).tableOfContents(context.tableOfContents()).build();
}
private static boolean isTerminal(TableOfContentsData.EntryData entryData) {
return entryData.getAtomicBlocks().length > 0;
}
private ImageNode buildImage(Context context, Map<String, String> properties) {
var builder = ImageNode.builder();
PropertiesMapper.parseImageProperties(properties, builder);
return builder.tableOfContents(context.tableOfContents()).build();
}
private TableCellNode buildTableCell(Context context, Map<String, String> properties, boolean terminal) {
TableCellNode.TableCellNodeBuilder builder = TableCellNode.builder();
PropertiesMapper.parseTableCellProperties(properties, builder);
return builder.terminal(terminal).tableOfContents(context.tableOfContents()).build();
}
private TableNode buildTable(Context context, Map<String, String> properties) {
TableNode.TableNodeBuilder builder = TableNode.builder();
PropertiesMapper.parseTableProperties(properties, builder);
return TableNode.builder().tableOfContents(context.tableOfContents()).build();
}
private FooterNode buildFooter(Context context, boolean terminal) {
return FooterNode.builder().terminal(terminal).tableOfContents(context.tableOfContents()).build();
}
private HeaderNode buildHeader(Context context, boolean terminal) {
return HeaderNode.builder().terminal(terminal).tableOfContents(context.tableOfContents()).build();
}
private SectionNode buildSection(Context context) {
return SectionNode.builder().tableOfContents(context.tableOfContents()).build();
}
private ParagraphNode buildParagraph(Context context, boolean terminal) {
return ParagraphNode.builder().terminal(terminal).tableOfContents(context.tableOfContents()).build();
}
private TextBlock toTextBlock(Long[] atomicTextBlockIds, Context context, SemanticNode parent) {
return Arrays.stream(atomicTextBlockIds)
.map(atomicTextBlockId -> toAtomicTextBlock(context.atomicTextBlockData.get(Math.toIntExact(atomicTextBlockId)),
context.atomicPositionBlockData.get(Math.toIntExact(atomicTextBlockId)),
parent,
context))
.collect(new TextBlockCollector());
}
private PageNode buildPage(PageData p) {
return PageNode.builder().rotation(p.getRotation()).height(p.getHeight()).width(p.getWidth()).number(p.getNumber()).mainBody(new LinkedList<>()).build();
}
private AtomicTextBlock toAtomicTextBlock(AtomicTextBlockData atomicTextBlockData,
AtomicPositionBlockData atomicPositionBlockData,
SemanticNode parent,
Context context) {
return AtomicTextBlock.builder()
.id(atomicTextBlockData.getId())
.numberOnPage(atomicTextBlockData.getNumberOnPage())
.page(getPage(atomicTextBlockData.getPage(), context))
.boundary(new Boundary(atomicTextBlockData.getStart(), atomicTextBlockData.getEnd()))
.searchText(atomicTextBlockData.getSearchText())
.lineBreaks(Ints.asList(atomicTextBlockData.getLineBreaks()))
.stringIdxToPositionIdx(Ints.asList(atomicPositionBlockData.getStringIdxToPositionIdx()))
.positions(toRectangle2DList(atomicPositionBlockData.getPositions()))
.parent(parent)
.build();
}
private static List<Rectangle2D> toRectangle2DList(float[][] positions) {
return Arrays.stream(positions).map(floatArr -> (Rectangle2D) new Rectangle2D.Float(floatArr[0], floatArr[1], floatArr[2], floatArr[3])).toList();
}
private PageNode getPage(Long pageIndex, Context context) {
return context.pages.stream()
.filter(page -> page.getNumber() == Math.toIntExact(pageIndex))
.findFirst()
.orElseThrow(() -> new NoSuchElementException(String.format("ClassificationPage with number %d not found", pageIndex)));
}
record Context(
DocumentData layoutParsingModel,
TableOfContents tableOfContents,
List<PageNode> pages,
List<SectionNode> sections,
List<AtomicTextBlockData> atomicTextBlockData,
List<AtomicPositionBlockData> atomicPositionBlockData) {
}
}
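
A minimal sketch of how the mapper above is meant to be driven; GraphLoader is hypothetical, while toDocumentGraph is the static entry point of the @UtilityClass shown in this hunk.

import com.knecon.fforesight.service.layoutparser.internal.api.data.DocumentData;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.DocumentGraph;
import com.knecon.fforesight.service.layoutparser.internal.api.mapper.DocumentGraphMapper;

public class GraphLoader {

    // Rebuilds the in-memory graph from the serialized DocumentData
    // (pages, atomic text/position blocks and the table-of-contents entries).
    public static DocumentGraph load(DocumentData documentData) {
        DocumentGraph graph = DocumentGraphMapper.toDocumentGraph(documentData);
        System.out.println(graph); // toString() prints the document summary, see DocumentGraph above
        return graph;
    }
}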

View File

@@ -1,101 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.mapper;
import java.awt.geom.Rectangle2D;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.ImageNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.ImageType;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.TableCellNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.TableNode;
import com.knecon.fforesight.service.layoutparser.internal.api.services.RectangleTransformations;
public class PropertiesMapper {
public static Map<String, String> buildImageProperties(ImageNode image) {
Map<String, String> properties = new HashMap<>();
properties.put("imageType", image.getImageType().toString());
properties.put("transparency", String.valueOf(image.isTransparency()));
properties.put("position", RectangleTransformations.toString(image.getPosition()));
return properties;
}
public static Map<String, String> buildTableCellProperties(TableCellNode tableCell) {
Map<String, String> properties = new HashMap<>();
properties.put("row", String.valueOf(tableCell.getRow()));
properties.put("col", String.valueOf(tableCell.getCol()));
properties.put("header", String.valueOf(tableCell.isHeader()));
if (tableCell.getPages().size() > 1 || tableCell.getBBox().keySet().size() > 1) {
throw new IllegalArgumentException("TableCell can only occur on a single page!");
}
String bBoxString = RectangleTransformations.toString(tableCell.getBBox().get(tableCell.getPages().stream().findFirst().get()));
properties.put("bBox", bBoxString);
return properties;
}
public static Map<String, String> buildTableProperties(TableNode table) {
Map<String, String> properties = new HashMap<>();
properties.put("numberOfRows", String.valueOf(table.getNumberOfRows()));
properties.put("numberOfCols", String.valueOf(table.getNumberOfCols()));
return properties;
}
public static void parseImageProperties(Map<String, String> properties, ImageNode.ImageNodeBuilder builder) {
builder.imageType(parseImageType(properties.get("imageType")));
builder.transparency(Boolean.parseBoolean(properties.get("transparency")));
builder.position(parseRectangle2D(properties.get("position")));
}
public static void parseTableCellProperties(Map<String, String> properties, TableCellNode.TableCellNodeBuilder builder) {
builder.row(Integer.parseInt(properties.get("row")));
builder.col(Integer.parseInt(properties.get("col")));
builder.header(Boolean.parseBoolean(properties.get("header")));
builder.bBox(parseRectangle2D(properties.get("bBox")));
}
public static void parseTableProperties(Map<String, String> properties, TableNode.TableNodeBuilder builder) {
builder.numberOfRows(Integer.parseInt(properties.get("numberOfRows")));
builder.numberOfCols(Integer.parseInt(properties.get("numberOfCols")));
}
private static ImageType parseImageType(String imageType) {
return switch (imageType) {
case "LOGO" -> ImageType.LOGO;
case "FORMULA" -> ImageType.FORMULA;
case "SIGNATURE" -> ImageType.SIGNATURE;
case "OCR" -> ImageType.OCR;
default -> ImageType.OTHER;
};
}
public static String toString(Rectangle2D rectangle2D) {
return String.format("%f,%f,%f,%f", rectangle2D.getX(), rectangle2D.getY(), rectangle2D.getWidth(), rectangle2D.getHeight());
}
public static Rectangle2D parseRectangle2D(String bBox) {
List<Float> floats = Arrays.stream(bBox.split(",")).map(Float::parseFloat).toList();
return new Rectangle2D.Float(floats.get(0), floats.get(1), floats.get(2), floats.get(3));
}
}
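
The build*/parse* pairs above are symmetric, so a round trip through the string map should reproduce the serialized attributes; ImagePropertiesRoundTrip is a hypothetical sketch and assumes the ImageNode already has imageType, transparency and position set.

import java.util.Map;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.ImageNode;
import com.knecon.fforesight.service.layoutparser.internal.api.mapper.PropertiesMapper;

public class ImagePropertiesRoundTrip {

    // Serializes an ImageNode's layout attributes into the string map used by the
    // table-of-contents data and reads them back into a fresh builder. Only the
    // serialized attributes (imageType, transparency, position) are carried across.
    public static ImageNode roundTrip(ImageNode image) {
        Map<String, String> properties = PropertiesMapper.buildImageProperties(image);
        ImageNode.ImageNodeBuilder builder = ImageNode.builder();
        PropertiesMapper.parseImageProperties(properties, builder);
        return builder.build();
    }
}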

View File

@@ -1,10 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.services;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
public interface EntityEnrichmentService {
void enrichEntity(EntityNode entity, TextBlock textBlock);
}

View File

@@ -1,56 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.services;
import java.util.Collections;
import java.util.NoSuchElementException;
import java.util.Set;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SemanticNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
import lombok.RequiredArgsConstructor;
@RequiredArgsConstructor
public class EntityInsertionService {
private final EntityEnrichmentService entityEnrichmentService;
public void addEntityToGraph(EntityNode entity, TableOfContents tableOfContents) {
try {
SemanticNode containingNode = tableOfContents.streamChildrenNodes(Collections.emptyList())
.filter(node -> node.buildTextBlock().containsBoundary(entity.getBoundary()))
.findFirst()
.orElseThrow(() -> new NoSuchElementException("No containing Node found!"));
containingNode.addThisToEntityIfIntersects(entity);
TextBlock textBlock = entity.getDeepestFullyContainingNode().buildTextBlock();
entityEnrichmentService.enrichEntity(entity, textBlock);
addToPages(entity);
addToNodeEntitySets(entity);
} catch (NoSuchElementException e) {
entity.removeFromGraph();
}
}
private void addToPages(EntityNode entity) {
Set<PageNode> pages = entity.getDeepestFullyContainingNode().getPages();
entity.getPages().addAll(pages);
pages.forEach(page -> page.getEntities().add(entity));
}
private void addToNodeEntitySets(EntityNode entity) {
entity.getIntersectingNodes().forEach(node -> node.getEntities().add(entity));
}
}
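
A hypothetical wiring example for the service above; since EntityEnrichmentService declares a single method, a lambda (here a no-op) is enough to construct the service for illustration, and EntityWiring itself is not part of the commit.

import com.knecon.fforesight.service.layoutparser.internal.api.graph.DocumentGraph;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
import com.knecon.fforesight.service.layoutparser.internal.api.services.EntityInsertionService;

public class EntityWiring {

    // Inserts a detected entity into the graph; if no top-level node contains its
    // boundary, the service removes the entity from the graph again (see catch above).
    public static void insert(DocumentGraph graph, EntityNode entity) {
        EntityInsertionService insertionService = new EntityInsertionService((e, textBlock) -> { /* no-op enrichment */ });
        insertionService.addEntityToGraph(entity, graph.getTableOfContents());
    }
}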

View File

@@ -77,6 +77,10 @@
<artifactId>spring-boot-starter-amqp</artifactId>
<version>${spring.version}</version>
</dependency>
<dependency>
<groupId>org.junit.platform</groupId>
<artifactId>junit-platform-commons</artifactId>
</dependency>
</dependencies>
<repositories>

View File

@@ -7,19 +7,19 @@ import java.io.IOException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.DocumentGraph;
import com.knecon.fforesight.service.layoutparser.internal.api.mapper.DocumentDataMapper;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
import com.knecon.fforesight.service.layoutparser.processor.adapter.CvTableParsingAdapter;
import com.knecon.fforesight.service.layoutparser.processor.adapter.ImageServiceResponseAdapter;
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.image.ImageServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.classification.service.ClassificationService;
import com.knecon.fforesight.service.layoutparser.processor.classification.service.PdfParsingService;
import com.knecon.fforesight.service.layoutparser.processor.classification.service.SectionsBuilderService;
import com.knecon.fforesight.service.layoutparser.processor.factory.DocumentGraphFactory;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.mapper.DocumentDataMapper;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
@@ -35,7 +35,6 @@ public class LayoutParsingService {
private final PdfParsingService pdfParsingService;
private final ClassificationService classificationService;
private final SectionsBuilderService sectionsBuilderService;
private final DocumentGraphFactory documentGraphFactory;
public LayoutParsingFinishedEvent parseLayoutAndSaveFilesToStorage(LayoutParsingRequest layoutParsingRequest) throws IOException {
@@ -53,7 +52,7 @@ public class LayoutParsingService {
tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.pageFileStorageId());
}
DocumentGraph documentGraph = parseLayout(originDocument, imageServiceResponse, tableServiceResponse);
Document documentGraph = parseLayout(originDocument, imageServiceResponse, tableServiceResponse);
int numberOfPages = originDocument.getNumberOfPages();
originDocument.close();
@ -72,7 +71,7 @@ public class LayoutParsingService {
}
public DocumentGraph parseLayout(PDDocument originDocument, ImageServiceResponse imageServiceResponse, TableServiceResponse tableServiceResponse) {
public Document parseLayout(PDDocument originDocument, ImageServiceResponse imageServiceResponse, TableServiceResponse tableServiceResponse) {
ClassificationDocument classificationDocument = pdfParsingService.parseDocument(originDocument,
cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse),
@ -82,7 +81,7 @@ public class LayoutParsingService {
sectionsBuilderService.buildSections(classificationDocument);
return documentGraphFactory.buildDocumentGraph(classificationDocument);
return DocumentGraphFactory.buildDocumentGraph(classificationDocument);
}
}

View File

@ -17,12 +17,12 @@ import com.iqser.red.storage.commons.service.StorageService;
import com.knecon.fforesight.service.layoutparser.internal.api.data.AtomicPositionBlockData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.AtomicTextBlockData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.DocumentData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.DocumentTreeData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.PageData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.TableOfContentsData;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.image.ImageServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.multitenancy.TenantContext;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
@ -69,7 +69,7 @@ public class LayoutParsingStorageService {
public void storeDocumentData(LayoutParsingRequest layoutParsingRequest, DocumentData documentData) throws IOException {
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.structureFileStorageId(), documentData.getTableOfContents());
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.structureFileStorageId(), documentData.getDocumentTreeData());
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.textBlockFileStorageId(), documentData.getAtomicTextBlocks());
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.positionBlockFileStorageId(), documentData.getAtomicPositionBlocks());
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.pageFileStorageId(), documentData.getPages());
@ -86,12 +86,12 @@ public class LayoutParsingStorageService {
AtomicPositionBlockData[] atomicPositionBlockData = storageService.readJSONObject(TenantContext.getTenantId(),
layoutParsingRequest.positionBlockFileStorageId(),
AtomicPositionBlockData[].class);
TableOfContentsData tableOfContentsData = storageService.readJSONObject(TenantContext.getTenantId(),
DocumentTreeData tableOfContentsData = storageService.readJSONObject(TenantContext.getTenantId(),
layoutParsingRequest.structureFileStorageId(),
TableOfContentsData.class);
DocumentTreeData.class);
return DocumentData.builder()
.tableOfContents(tableOfContentsData)
.documentTreeData(tableOfContentsData)
.atomicPositionBlocks(atomicPositionBlockData)
.atomicTextBlocks(atomicTextBlockData)
.pages(pageData)

View File

@ -8,7 +8,7 @@ import java.util.Map;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.CvParsedTableCell;
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableCells;
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableServiceResponse;
import lombok.RequiredArgsConstructor;
@ -19,9 +19,9 @@ import lombok.extern.slf4j.Slf4j;
@RequiredArgsConstructor
public class CvTableParsingAdapter {
public Map<Integer, List<com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.CvParsedTableCell>> buildCvParsedTablesPerPage(TableServiceResponse tableServiceResponse) {
public Map<Integer, List<TableCells>> buildCvParsedTablesPerPage(TableServiceResponse tableServiceResponse) {
Map<Integer, List<com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.CvParsedTableCell>> tableCells = new HashMap<>();
Map<Integer, List<TableCells>> tableCells = new HashMap<>();
tableServiceResponse.getData()
.forEach(tableData -> tableCells.computeIfAbsent(tableData.getPageInfo().getNumber(), tableCell -> new ArrayList<>())
.addAll(convertTableCells(tableData.getTableCells())));
@ -30,11 +30,11 @@ public class CvTableParsingAdapter {
}
private Collection<? extends com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.CvParsedTableCell> convertTableCells(List<CvParsedTableCell> tableCells) {
private Collection<TableCells> convertTableCells(List<TableCells> tableCells) {
List<com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.CvParsedTableCell> cvParsedTableCells = new ArrayList<>();
List<TableCells> cvParsedTableCells = new ArrayList<>();
tableCells.forEach(t -> cvParsedTableCells.add(com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.CvParsedTableCell.builder()
tableCells.forEach(t -> cvParsedTableCells.add(TableCells.builder()
.y0(t.getY0())
.x1(t.getX1())
.y1(t.getY1())

View File

@ -9,10 +9,10 @@ import java.util.Map;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.ImageType;
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.image.ImageServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.ImageType;
import lombok.RequiredArgsConstructor;

View File

@ -3,12 +3,9 @@ package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image
import java.util.HashMap;
import java.util.Map;
import com.dslplatform.json.CompiledJson;
import lombok.Data;
@Data
@CompiledJson
public class Classification {
private Map<String, Float> probabilities = new HashMap<>();

View File

@ -1,14 +1,11 @@
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image;
import com.dslplatform.json.CompiledJson;
import lombok.Data;
@Data
@CompiledJson
public class FilterGeometry {
private ImageSize imageSize;
private Format imageFormat;
private ImageFormat imageFormat;
}

View File

@ -1,11 +1,8 @@
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image;
import com.dslplatform.json.CompiledJson;
import lombok.Data;
@Data
@CompiledJson
public class Filters {
private FilterGeometry geometry;

View File

@ -1,11 +1,8 @@
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image;
import com.dslplatform.json.CompiledJson;
import lombok.Data;
@Data
@CompiledJson
public class Geometry {
private float width;

View File

@ -1,12 +1,9 @@
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image;
import com.dslplatform.json.CompiledJson;
import lombok.Data;
@Data
@CompiledJson
public class Format {
public class ImageFormat {
private float quotient;
private boolean tooTall;

View File

@ -1,12 +1,9 @@
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image;
import com.dslplatform.json.CompiledJson;
import lombok.Data;
@Data
@CompiledJson
public class Metadata {
public class ImageMetadata {
private Classification classification;
private Position position;

View File

@ -3,15 +3,12 @@ package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image
import java.util.ArrayList;
import java.util.List;
import com.dslplatform.json.CompiledJson;
import com.dslplatform.json.JsonAttribute;
import com.fasterxml.jackson.annotation.JsonAlias;
import com.fasterxml.jackson.annotation.JsonProperty;
import lombok.Data;
@Data
@CompiledJson
public class ImageServiceResponse {
private String dossierId;
@ -19,15 +16,13 @@ public class ImageServiceResponse {
@JsonProperty(value = "imageMetadata")
@JsonAlias("data")
@JsonAttribute(alternativeNames = {"imageMetadata"})
private List<Metadata> data = new ArrayList<>();
private List<ImageMetadata> data = new ArrayList<>();
private List<Metadata> dataCV = new ArrayList<>();
private List<ImageMetadata> dataCV = new ArrayList<>();
@JsonProperty(value = "imageMetadata")
@JsonAlias("data")
@JsonAttribute(alternativeNames = {"imageMetadata"})
public void setData(List<Metadata> data) {this.data = data;}
public void setData(List<ImageMetadata> data) {this.data = data;}
}

View File

@ -1,11 +1,8 @@
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image;
import com.dslplatform.json.CompiledJson;
import lombok.Data;
@Data
@CompiledJson
public class ImageSize {
private float quotient;

View File

@ -1,11 +1,8 @@
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image;
import com.dslplatform.json.CompiledJson;
import lombok.Data;
@Data
@CompiledJson
public class Position {
private float x1;

View File

@ -1,11 +1,8 @@
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image;
import com.dslplatform.json.CompiledJson;
import lombok.Data;
@Data
@CompiledJson
public class Probability {
private boolean unconfident;

View File

@ -1,17 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.table;
import java.util.ArrayList;
import java.util.List;
import com.dslplatform.json.CompiledJson;
import lombok.Data;
@Data
@CompiledJson
public class CvParsedTableModel {
private CvParsedPageInfo pageInfo;
private List<CvParsedTableCell> tableCells = new ArrayList<>();
}

View File

@ -1,12 +1,9 @@
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.table;
import com.dslplatform.json.CompiledJson;
import lombok.Data;
@Data
@CompiledJson
public class CvParsedPageInfo {
public class PageInfo {
private int number;
private int rotation;

View File

@ -1,4 +1,4 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.dto.table;
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.table;
import lombok.AllArgsConstructor;
import lombok.Builder;
@ -9,7 +9,7 @@ import lombok.RequiredArgsConstructor;
@Builder
@AllArgsConstructor
@RequiredArgsConstructor
public class CvParsedTableCell {
public class PdfTableCell {
private float x0;
private float y0;

View File

@ -1,12 +1,11 @@
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.table;
import com.dslplatform.json.CompiledJson;
import lombok.Builder;
import lombok.Data;
@Data
@CompiledJson
public class CvParsedTableCell {
@Builder
public class TableCells {
private float x0;
private float y0;

View File

@ -0,0 +1,14 @@
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.table;
import java.util.ArrayList;
import java.util.List;
import lombok.Data;
@Data
public class TableData {
private PageInfo pageInfo;
private List<TableCells> tableCells = new ArrayList<>();
}

View File

@ -3,12 +3,9 @@ package com.knecon.fforesight.service.layoutparser.processor.adapter.model.table
import java.util.ArrayList;
import java.util.List;
import com.dslplatform.json.CompiledJson;
import lombok.Data;
@Data
@CompiledJson
public class TableServiceResponse {
private String dossierId;
@ -17,6 +14,6 @@ public class TableServiceResponse {
private String targetFileExtension;
private String responseFileExtension;
private List<CvParsedTableModel> data = new ArrayList<>();
private List<TableData> data = new ArrayList<>();
}

View File

@ -1,71 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.dto;
import java.awt.geom.Rectangle2D;
import com.dslplatform.json.JsonAttribute;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.TextBlockOrientation;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
@Data
@AllArgsConstructor
@NoArgsConstructor
public abstract class AbstractTextContainer {
protected float minX;
protected float maxX;
protected float minY;
protected float maxY;
protected String classification;
protected int page;
private TextBlockOrientation orientation = TextBlockOrientation.NONE;
public abstract String getText();
public boolean containsBlock(ClassificationTextBlock other) {
return this.minX <= other.getPdfMinX() && this.maxX >= other.getPdfMaxX() && this.minY >= other.getPdfMinY() && this.maxY <= other.getPdfMaxY();
}
public boolean contains(AbstractTextContainer other) {
return this.minX <= other.minX && this.maxX >= other.maxX && this.minY >= other.minY && this.maxY <= other.maxY;
}
public boolean contains(Rectangle2D other) {
return other.contains(minX, minY, getWidth(), getHeight());
}
@JsonIgnore
@JsonAttribute(ignore = true)
public float getHeight() {
return maxY - minY;
}
@JsonIgnore
@JsonAttribute(ignore = true)
public float getWidth() {
return maxX - minX;
}
public boolean intersectsY(AbstractTextContainer atc) {
return this.minY <= atc.getMaxY() && this.maxY >= atc.getMinY();
}
}

View File

@ -1,38 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.dto.table;
import java.awt.geom.Point2D;
import java.util.ArrayList;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.Rectangle;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
@SuppressWarnings("serial")
@Data
@EqualsAndHashCode(callSuper = true)
@NoArgsConstructor
public class TableCell extends Rectangle {
private List<ClassificationTextBlock> textBlocks = new ArrayList<>();
private List<TableCell> headerCells = new ArrayList<>();
private boolean isHeaderCell;
public TableCell(Point2D topLeft, Point2D bottomRight) {
super((float) topLeft.getY(), (float) topLeft.getX(), (float) (bottomRight.getX() - topLeft.getX()), (float) (bottomRight.getY() - topLeft.getY()));
}
public void addTextBlock(ClassificationTextBlock textBlock) {
textBlocks.add(textBlock);
}
}

View File

@ -0,0 +1,80 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.model;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
@Data
@AllArgsConstructor
@NoArgsConstructor
public abstract class AbstractPageBlock {
@JsonIgnore
protected float minX;
@JsonIgnore
protected float maxX;
@JsonIgnore
protected float minY;
@JsonIgnore
protected float maxY;
@JsonIgnore
protected PageBlockType classification;
@JsonIgnore
protected int page;
@JsonIgnore
private Orientation orientation = Orientation.NONE;
public abstract String getText();
public boolean isHeadline() {
return this instanceof TextPageBlock && this.getClassification() != null && this.getClassification().isHeadline();
}
public boolean containsBlock(TextPageBlock other) {
return this.minX <= other.getPdfMinX() && this.maxX >= other.getPdfMaxX() && this.minY >= other.getPdfMinY() && this.maxY <= other.getPdfMaxY();
}
public boolean contains(AbstractPageBlock other) {
return this.minX <= other.minX && this.maxX >= other.maxX && this.minY >= other.minY && this.maxY <= other.maxY;
}
public boolean contains(Rectangle other) {
return page == other.getPage() && this.minX <= other.getTopLeft().getX() && this.maxX >= other.getTopLeft().getX() + other.getWidth() && this.minY <= other.getTopLeft()
.getY() && this.maxY >= other.getTopLeft().getY() + other.getHeight();
}
@JsonIgnore
public float getHeight() {
return maxY - minY;
}
@JsonIgnore
public float getWidth() {
return maxX - minX;
}
public boolean intersectsY(AbstractPageBlock atc) {
return this.minY <= atc.getMaxY() && this.maxY >= atc.getMinY();
}
}

View File

@ -1,10 +1,11 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.dto;
package com.knecon.fforesight.service.layoutparser.processor.classification.model;
import java.util.ArrayList;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.StringFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.UnclassifiedText;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.section.SectionGrid;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.StringFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.UnclassifiedText;
import lombok.Data;
import lombok.NoArgsConstructor;
@ -24,4 +25,7 @@ public class ClassificationDocument {
private StringFrequencyCounter fontStyleCounter = new StringFrequencyCounter();
private boolean headlines;
private SectionGrid sectionGrid = new SectionGrid();
private long rulesVersion;
}

View File

@ -1,8 +1,8 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.dto;
package com.knecon.fforesight.service.layoutparser.processor.classification.model;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
import lombok.AllArgsConstructor;
import lombok.Data;
@ -11,6 +11,6 @@ import lombok.Data;
@AllArgsConstructor
public class ClassificationFooter {
private List<ClassificationTextBlock> textBlocks;
private List<TextPageBlock> textBlocks;
}

View File

@ -1,8 +1,8 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.dto;
package com.knecon.fforesight.service.layoutparser.processor.classification.model;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
import lombok.AllArgsConstructor;
import lombok.Data;
@ -11,6 +11,6 @@ import lombok.Data;
@AllArgsConstructor
public class ClassificationHeader {
private List<ClassificationTextBlock> textBlocks;
private List<TextPageBlock> textBlocks;
}

View File

@ -1,11 +1,11 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.dto;
package com.knecon.fforesight.service.layoutparser.processor.classification.model;
import java.util.ArrayList;
import java.util.List;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.StringFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.StringFrequencyCounter;
import lombok.Data;
import lombok.NonNull;
@ -16,7 +16,7 @@ import lombok.RequiredArgsConstructor;
public class ClassificationPage {
@NonNull
private List<AbstractTextContainer> textBlocks;
private List<AbstractPageBlock> textBlocks;
private List<ClassifiedImage> images = new ArrayList<>();

View File

@ -1,38 +1,32 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.dto;
package com.knecon.fforesight.service.layoutparser.processor.classification.model;
import java.util.ArrayList;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.Table;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.TablePageBlock;
import lombok.Data;
import lombok.NoArgsConstructor;
@Data
@NoArgsConstructor
public class ClassificationSection implements Comparable {
public class ClassificationSection {
private List<AbstractTextContainer> pageBlocks = new ArrayList<>();
private List<AbstractPageBlock> pageBlocks = new ArrayList<>();
private List<ClassifiedImage> images = new ArrayList<>();
private String headline;
public List<Table> getTables() {
public List<TablePageBlock> getTables() {
List<Table> tables = new ArrayList<>();
List<TablePageBlock> tables = new ArrayList<>();
pageBlocks.forEach(block -> {
if (block instanceof Table) {
tables.add((Table) block);
if (block instanceof TablePageBlock) {
tables.add((TablePageBlock) block);
}
});
return tables;
}
@Override
public int compareTo(Object o) {
return 0;
}
}

View File

@ -1,4 +1,4 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.dto;
package com.knecon.fforesight.service.layoutparser.processor.classification.model;
import java.util.ArrayList;
import java.util.Collections;
@ -9,9 +9,9 @@ import java.util.stream.Collectors;
import lombok.Getter;
@Getter
public class FloatFrequencyCounter {
@Getter
Map<Float, Integer> countPerValue = new HashMap<>();

View File

@ -1,6 +1,6 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.dto.text;
package com.knecon.fforesight.service.layoutparser.processor.classification.model;
public enum TextBlockOrientation {
public enum Orientation {
NONE,
LEFT,

View File

@ -0,0 +1,38 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.model;
public enum PageBlockType {
H1,
H2,
H3,
H4,
H5,
H6,
HEADER,
FOOTER,
TITLE,
PARAGRAPH,
PARAGRAPH_BOLD,
PARAGRAPH_ITALIC,
PARAGRAPH_UNKNOWN,
OTHER,
TABLE;
public static PageBlockType getHeadlineType(int i) {
return switch (i) {
case 1 -> PageBlockType.H1;
case 2 -> PageBlockType.H2;
case 3 -> PageBlockType.H3;
case 4 -> PageBlockType.H4;
case 5 -> PageBlockType.H5;
default -> PageBlockType.H6;
};
}
public boolean isHeadline() {
return this.equals(H1) || this.equals(H2) || this.equals(H3) || this.equals(H4) || this.equals(H5) || this.equals(H6);
}
}

View File

@ -1,8 +1,8 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.dto.image;
package com.knecon.fforesight.service.layoutparser.processor.classification.model.image;
import java.awt.geom.Rectangle2D;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.ImageType;
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.ImageType;
import lombok.Data;
import lombok.NonNull;

View File

@ -0,0 +1,79 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.model.table;
import java.awt.geom.Point2D;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.TextNormalizationUtilities;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
@SuppressWarnings("serial")
@Data
@EqualsAndHashCode(callSuper = true)
@NoArgsConstructor
public class Cell extends Rectangle {
private List<TextPageBlock> textBlocks = new ArrayList<>();
private List<Cell> headerCells = new ArrayList<>();
private boolean isHeaderCell;
private static final int MIN_SIZE = 1;
private int pageNumber;
public Cell(Point2D topLeft, Point2D bottomRight) {
super((float) topLeft.getY(), (float) topLeft.getX(), (float) (bottomRight.getX() - topLeft.getX()), (float) (bottomRight.getY() - topLeft.getY()));
}
public void addTextBlock(TextPageBlock textBlock) {
textBlocks.add(textBlock);
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder();
Iterator<TextPageBlock> itty = textBlocks.iterator();
TextPositionSequence previous = null;
while (itty.hasNext()) {
TextPageBlock textBlock = itty.next();
for (TextPositionSequence word : textBlock.getSequences()) {
if (previous != null) {
if (Math.abs(previous.getMaxYDirAdj() - word.getMaxYDirAdj()) > word.getTextHeight()) {
sb.append('\n');
} else {
sb.append(' ');
}
}
sb.append(word.toString());
previous = word;
}
}
return TextNormalizationUtilities.removeHyphenLineBreaks(sb.toString()).replaceAll("\n", " ").replaceAll(" {2}", " ");
}
public boolean hasMinimumSize() {
return this.getHeight() >= MIN_SIZE && this.getWidth() >= MIN_SIZE;
}
}
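
Cell.toString() above chooses between a space and a line break by comparing the vertical distance of consecutive words against the current word's text height. A minimal sketch of that heuristic in isolation, with a hypothetical Word record standing in for TextPositionSequence:

import java.util.List;

public class CellTextJoinSketch {

    // Hypothetical word type; in the parser this corresponds to TextPositionSequence,
    // whose getMaxYDirAdj()/getTextHeight() drive the same decision in Cell.toString().
    record Word(String text, float baselineY, float height) { }

    // Join words with a space, but start a new line when the vertical gap between
    // consecutive baselines exceeds the current word's text height.
    static String join(List<Word> words) {
        StringBuilder sb = new StringBuilder();
        Word previous = null;
        for (Word word : words) {
            if (previous != null) {
                sb.append(Math.abs(previous.baselineY() - word.baselineY()) > word.height() ? '\n' : ' ');
            }
            sb.append(word.text());
            previous = word;
        }
        return sb.toString();
    }

    public static void main(String[] args) {
        System.out.println(join(List.of(
                new Word("Total", 100f, 10f),
                new Word("amount", 100f, 10f),   // same baseline -> joined with a space
                new Word("42", 130f, 10f))));    // gap of 30 > height 10 -> line break
    }
}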

View File

@ -1,11 +1,11 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.dto.table;
package com.knecon.fforesight.service.layoutparser.processor.classification.model.table;
import lombok.RequiredArgsConstructor;
import lombok.Value;
@Value
@RequiredArgsConstructor
public class TableCellPosition implements Comparable<TableCellPosition> {
public class CellPosition implements Comparable<CellPosition> {
int row;
@ -13,7 +13,7 @@ public class TableCellPosition implements Comparable<TableCellPosition> {
@Override
public int compareTo(TableCellPosition other) {
public int compareTo(CellPosition other) {
int rowDiff = row - other.row;
return rowDiff != 0 ? rowDiff : col - other.col;

View File

@ -1,4 +1,4 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.dto.table;
package com.knecon.fforesight.service.layoutparser.processor.classification.model.table;
import java.util.List;

View File

@ -1,4 +1,4 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.dto;
package com.knecon.fforesight.service.layoutparser.processor.classification.model.table;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;

View File

@ -1,4 +1,4 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.dto.table;
package com.knecon.fforesight.service.layoutparser.processor.classification.model.table;
import java.awt.geom.Line2D;
import java.awt.geom.Point2D;

View File

@ -1,4 +1,4 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.dto.table;
package com.knecon.fforesight.service.layoutparser.processor.classification.model.table;
import java.awt.geom.Point2D;
import java.util.ArrayList;
@ -7,20 +7,19 @@ import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.TreeMap;
import java.util.stream.Collectors;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.AbstractTextContainer;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.Rectangle;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
import lombok.Getter;
import lombok.Setter;
import lombok.extern.slf4j.Slf4j;
@Slf4j
public class Table extends AbstractTextContainer {
public class TablePageBlock extends AbstractPageBlock {
private final TreeMap<TableCellPosition, TableCell> cells = new TreeMap<>();
private final TreeMap<CellPosition, Cell> cells = new TreeMap<>();
private final int rotation;
@Getter
@ -28,32 +27,29 @@ public class Table extends AbstractTextContainer {
private String headline;
private int unrotatedRowCount;
private int unrotatedColCount;
private int rowCount = -1;
private int colCount = -1;
private List<List<TableCell>> rows;
private List<List<Cell>> rows;
public Table(List<TableCell> cells, Rectangle area, int rotation) {
public TablePageBlock(List<Cell> cells, Rectangle area, int rotation) {
addCells(cells);
minX = area.getLeft();
minY = area.getBottom();
maxX = area.getRight();
maxY = area.getTop();
classification = "Table";
classification = PageBlockType.TABLE;
this.rotation = rotation;
}
public List<List<TableCell>> getRows() {
public List<List<Cell>> getRows() {
if (rows == null) {
rows = computeRows();
// Ignore rows that do not contain any cells or values.
List<List<TableCell>> rowsToRemove = new ArrayList<>();
for (List<TableCell> row : rows) {
List<List<Cell>> rowsToRemove = new ArrayList<>();
for (List<Cell> row : rows) {
if (row.size() == 1 && row.get(0).getTextBlocks().isEmpty()) {
rowsToRemove.add(row);
}
@ -70,19 +66,13 @@ public class Table extends AbstractTextContainer {
public int getRowCount() {
if (rowCount == -1) {
rowCount = getRows().size();
}
return rowCount;
return getRows().size();
}
public int getColCount() {
if (colCount == -1) {
colCount = getRows().stream().mapToInt(List::size).max().orElse(0);
}
return colCount;
return getRows().stream().mapToInt(List::size).max().orElse(0);
}
@ -100,16 +90,16 @@ public class Table extends AbstractTextContainer {
// A bold cell is a header cell as long as every cell to the left/top is bold, too
// we move from left to right and top to bottom
for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) {
List<TableCell> rowCells = rows.get(rowIndex);
List<Cell> rowCells = rows.get(rowIndex);
if (rowCells.size() == 1) {
continue;
}
for (int colIndex = 0; colIndex < rowCells.size(); colIndex++) {
TableCell cell = rowCells.get(colIndex);
List<TableCell> cellsToTheLeft = rowCells.subList(0, colIndex);
TableCell lastHeaderCell = null;
for (TableCell leftCell : cellsToTheLeft) {
Cell cell = rowCells.get(colIndex);
List<Cell> cellsToTheLeft = rowCells.subList(0, colIndex);
Cell lastHeaderCell = null;
for (Cell leftCell : cellsToTheLeft) {
if (leftCell.isHeaderCell()) {
lastHeaderCell = leftCell;
} else {
@ -119,7 +109,7 @@ public class Table extends AbstractTextContainer {
if (lastHeaderCell != null) {
cell.getHeaderCells().add(lastHeaderCell);
}
List<TableCell> cellsToTheTop = new ArrayList<>();
List<Cell> cellsToTheTop = new ArrayList<>();
for (int i = 0; i < rowIndex; i++) {
try {
cellsToTheTop.add(rows.get(i).get(colIndex));
@ -127,7 +117,7 @@ public class Table extends AbstractTextContainer {
log.debug("No cell {} in row {}, ignoring.", colIndex, rowIndex);
}
}
for (TableCell topCell : cellsToTheTop) {
for (Cell topCell : cellsToTheTop) {
if (topCell.isHeaderCell()) {
lastHeaderCell = topCell;
} else {
@ -146,14 +136,14 @@ public class Table extends AbstractTextContainer {
}
private List<List<TableCell>> computeRows() {
private List<List<Cell>> computeRows() {
List<List<TableCell>> rows = new ArrayList<>();
List<List<Cell>> rows = new ArrayList<>();
if (rotation == 90) {
for (int i = 0; i < unrotatedColCount; i++) { // rows
List<TableCell> lastRow = new ArrayList<>();
List<Cell> lastRow = new ArrayList<>();
for (int j = unrotatedRowCount - 1; j >= 0; j--) { // cols
TableCell cell = cells.get(new TableCellPosition(j, i));
Cell cell = cells.get(new CellPosition(j, i));
if (cell != null) {
lastRow.add(cell);
}
@ -162,9 +152,9 @@ public class Table extends AbstractTextContainer {
}
} else if (rotation == 270) {
for (int i = unrotatedColCount - 1; i >= 0; i--) { // rows
List<TableCell> lastRow = new ArrayList<>();
List<Cell> lastRow = new ArrayList<>();
for (int j = 0; j < unrotatedRowCount; j++) { // cols
TableCell cell = cells.get(new TableCellPosition(j, i));
Cell cell = cells.get(new CellPosition(j, i));
if (cell != null) {
lastRow.add(cell);
}
@ -173,9 +163,9 @@ public class Table extends AbstractTextContainer {
}
} else {
for (int i = 0; i < unrotatedRowCount; i++) {
List<TableCell> lastRow = new ArrayList<>();
List<Cell> lastRow = new ArrayList<>();
for (int j = 0; j < unrotatedColCount; j++) {
TableCell cell = cells.get(new TableCellPosition(i, j)); // JAVA_8 use getOrDefault()
Cell cell = cells.get(new CellPosition(i, j)); // JAVA_8 use getOrDefault()
if (cell != null) {
lastRow.add(cell);
}
@ -189,18 +179,18 @@ public class Table extends AbstractTextContainer {
}
private void add(TableCell chunk, int row, int col) {
private void add(Cell chunk, int row, int col) {
unrotatedRowCount = Math.max(unrotatedRowCount, row + 1);
unrotatedColCount = Math.max(unrotatedColCount, col + 1);
TableCellPosition cp = new TableCellPosition(row, col);
CellPosition cp = new CellPosition(row, col);
cells.put(cp, chunk);
}
private void addCells(List<TableCell> cells) {
private void addCells(List<Cell> cells) {
if (cells.isEmpty()) {
return;
@ -208,7 +198,7 @@ public class Table extends AbstractTextContainer {
cells.removeIf(cell -> cell.getWidth() < 1.1 || cell.getHeight() < 1.1);
List<List<TableCell>> rowsOfCells = calculateStructure(cells);
List<List<Cell>> rowsOfCells = calculateStructure(cells);
for (int i = 0; i < rowsOfCells.size(); i++) {
for (int j = 0; j < rowsOfCells.get(i).size(); j++) {
@ -223,11 +213,11 @@ public class Table extends AbstractTextContainer {
* Calculates the structure of the table. For spanning rows and columns, multiple cells with the same values will be inserted.
*
* @param cells The found cells
* @return Table Structure
* @return the table structure
*/
private List<List<TableCell>> calculateStructure(List<TableCell> cells) {
private List<List<Cell>> calculateStructure(List<Cell> cells) {
List<List<TableCell>> matrix = new ArrayList<>();
List<List<Cell>> matrix = new ArrayList<>();
if (cells.isEmpty()) {
return matrix;
@ -242,30 +232,30 @@ public class Table extends AbstractTextContainer {
uniqueY.add(c.getTop());
});
var sortedUniqueX = uniqueX.stream().sorted().collect(Collectors.toList());
var sortedUniqueY = uniqueY.stream().sorted().collect(Collectors.toList());
var sortedUniqueX = uniqueX.stream().sorted().toList();
var sortedUniqueY = uniqueY.stream().sorted().toList();
Float prevY = null;
for (Float y : sortedUniqueY) {
List<TableCell> row = new ArrayList<>();
List<Cell> row = new ArrayList<>();
Float prevX = null;
for (Float x : sortedUniqueX) {
if (prevY != null && prevX != null) {
var cell = new TableCell(new Point2D.Float(prevX, prevY), new Point2D.Float(x, y));
var cell = new Cell(new Point2D.Float(prevX, prevY), new Point2D.Float(x, y));
var intersectionCell = cells.stream().filter(c -> cell.intersects(c) && cell.overlapRatio(c) > 0.1f).findFirst();
if (intersectionCell.isPresent()) {
cell.getTextBlocks().addAll(intersectionCell.get().getTextBlocks());
intersectionCell.ifPresent(value -> cell.getTextBlocks().addAll(value.getTextBlocks()));
if (cell.hasMinimumSize()) {
row.add(cell);
}
row.add(cell);
}
prevX = x;
}
if (prevY != null && prevX != null) {
if (prevY != null && prevX != null && !row.isEmpty()) {
matrix.add(row);
}
prevY = y;
@ -281,22 +271,22 @@ public class Table extends AbstractTextContainer {
public String getText() {
StringBuilder sb = new StringBuilder();
List<List<TableCell>> rows = getRows();
List<List<Cell>> rows = getRows();
int i = 0;
for (List<TableCell> row : rows) {
for (List<Cell> row : rows) {
if (i != 0) {
sb.append("\n");
}
if (!row.isEmpty()) {
boolean firstColumn = true;
for (TableCell column : row) {
for (Cell column : row) {
if (!firstColumn) {
sb.append(",");
}
if (column != null && column.getTextBlocks() != null) {
boolean first = true;
for (ClassificationTextBlock textBlock : column.getTextBlocks()) {
for (TextPageBlock textBlock : column.getTextBlocks()) {
if (!first) {
sb.append("\n");
}
@ -317,18 +307,18 @@ public class Table extends AbstractTextContainer {
public String getTextAsHtml() {
StringBuilder sb = new StringBuilder();
List<List<TableCell>> rows = getRows();
List<List<Cell>> rows = getRows();
sb.append("<table border=\"1\">");
int i = 0;
for (List<TableCell> row : rows) {
for (List<Cell> row : rows) {
sb.append("\n<tr>");
if (!row.isEmpty()) {
for (TableCell column : row) {
for (Cell column : row) {
sb.append(i == 0 ? "\n<th>" : "\n<td>");
if (column != null && column.getTextBlocks() != null) {
boolean first = true;
for (ClassificationTextBlock textBlock : column.getTextBlocks()) {
for (TextPageBlock textBlock : column.getTextBlocks()) {
if (!first) {
sb.append("<br />");
}
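
The calculateStructure() javadoc above describes how the grid is rebuilt: the distinct cell coordinates define row and column boundaries, and spanning cells are duplicated into every slot they cover. A simplified, self-contained sketch of that idea follows; it is not the project's implementation, which additionally matches detected cells into slots by overlap ratio and drops cells below a minimum size.

import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.List;
import java.util.TreeSet;

public class TableGridSketch {

    // Collect the distinct cell edges, then cut the table area into a grid of slots.
    // A spanning cell ends up overlapping several of the resulting slots.
    static List<List<Rectangle2D>> toGrid(List<Rectangle2D> detectedCells) {
        TreeSet<Double> xs = new TreeSet<>();
        TreeSet<Double> ys = new TreeSet<>();
        for (Rectangle2D c : detectedCells) {
            xs.add(c.getMinX());
            xs.add(c.getMaxX());
            ys.add(c.getMinY());
            ys.add(c.getMaxY());
        }
        List<Double> sortedX = new ArrayList<>(xs);
        List<Double> sortedY = new ArrayList<>(ys);
        List<List<Rectangle2D>> rows = new ArrayList<>();
        for (int r = 0; r + 1 < sortedY.size(); r++) {
            List<Rectangle2D> row = new ArrayList<>();
            for (int c = 0; c + 1 < sortedX.size(); c++) {
                row.add(new Rectangle2D.Double(sortedX.get(c), sortedY.get(r),
                        sortedX.get(c + 1) - sortedX.get(c), sortedY.get(r + 1) - sortedY.get(r)));
            }
            rows.add(row);
        }
        return rows;
    }

    public static void main(String[] args) {
        List<Rectangle2D> cells = List.of(
                new Rectangle2D.Double(0, 0, 100, 20),   // header spanning two columns
                new Rectangle2D.Double(0, 20, 50, 20),
                new Rectangle2D.Double(50, 20, 50, 20));
        toGrid(cells).forEach(System.out::println); // -> 2 rows x 2 columns of grid slots
    }
}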

View File

@ -1,10 +1,8 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.dto.text;
package com.knecon.fforesight.service.layoutparser.processor.classification.model.text;
import org.apache.pdfbox.text.TextPosition;
import org.springframework.beans.BeanUtils;
import com.dslplatform.json.CompiledJson;
import com.dslplatform.json.JsonAttribute;
import com.fasterxml.jackson.annotation.JsonIgnore;
import lombok.AllArgsConstructor;
@ -17,7 +15,6 @@ import lombok.SneakyThrows;
@Builder
@NoArgsConstructor
@AllArgsConstructor
@CompiledJson
public class RedTextPosition {
private String textMatrix;
@ -39,17 +36,14 @@ public class RedTextPosition {
// not used in reanalysis
@JsonIgnore
@JsonAttribute(ignore = true)
private float widthOfSpace;
// not used in reanalysis
@JsonIgnore
@JsonAttribute(ignore = true)
private float fontSizeInPt;
// not used in reanalysis
@JsonIgnore
@JsonAttribute(ignore = true)
private String fontName;

View File

@ -0,0 +1,48 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.model.text;
import java.util.ArrayList;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.TextNormalizationUtilities;
import lombok.Getter;
@Getter
public class SearchableText {
private final List<TextPositionSequence> sequences = new ArrayList<>();
public void add(TextPositionSequence textPositionSequence) {
sequences.add(textPositionSequence);
}
public void addAll(List<TextPositionSequence> textPositionSequences) {
sequences.addAll(textPositionSequences);
}
@Override
public String toString() {
return buildString(sequences);
}
public static String buildString(List<TextPositionSequence> sequences) {
StringBuilder sb = new StringBuilder();
for (TextPositionSequence word : sequences) {
sb.append(word);
sb.append(' ');
}
String text = sb.toString();
text = TextNormalizationUtilities.removeHyphenLineBreaks(text);
text = TextNormalizationUtilities.removeLineBreaks(text);
text = TextNormalizationUtilities.removeRepeatingWhitespaces(text);
return text;
}
}
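
SearchableText.buildString() above joins the words with spaces and then normalizes the result (hyphen line breaks, remaining line breaks, repeated whitespace, in that order). A simplified stand-in for that pipeline using plain JDK regexes; the patterns are assumptions, not the project's TextNormalizationUtilities:

import java.util.List;

public class BuildStringSketch {

    static String buildString(List<String> words) {
        StringBuilder sb = new StringBuilder();
        for (String word : words) {
            sb.append(word).append(' ');
        }
        String text = sb.toString();
        text = text.replaceAll("-\n\\s*", "");   // undo hyphenated line breaks
        text = text.replace('\n', ' ');          // flatten remaining line breaks
        text = text.replaceAll("\\s{2,}", " ");  // collapse repeated whitespace
        return text.strip();
    }

    public static void main(String[] args) {
        System.out.println(buildString(List.of("layout-\n", "parser", "output")));
        // -> "layoutparser output"
    }
}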

View File

@ -0,0 +1,17 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.model.text;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class SimplifiedSectionText {
private int sectionNumber;
private String text;
}

View File

@ -0,0 +1,20 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.model.text;
import java.util.ArrayList;
import java.util.List;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class SimplifiedText {
private int numberOfPages;
private List<SimplifiedSectionText> sectionTexts = new ArrayList<>();
}

View File

@ -1,4 +1,4 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.dto.text;
package com.knecon.fforesight.service.layoutparser.processor.classification.model.text;
import java.util.HashMap;
import java.util.Map;

View File

@ -1,4 +1,4 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.dto.text;
package com.knecon.fforesight.service.layoutparser.processor.classification.model.text;
import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonValue;
@ -33,13 +33,6 @@ public enum TextDirection {
}
@com.dslplatform.json.JsonValue
public float jsonValue() {
return getDegrees();
}
@JsonCreator(mode = JsonCreator.Mode.DELEGATING)
public static TextDirection fromDegrees(float degrees) {

View File

@ -1,57 +1,67 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.dto.text;
package com.knecon.fforesight.service.layoutparser.processor.classification.model.text;
import java.util.ArrayList;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.AbstractTextContainer;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.TextNormalizationUtilities;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
@EqualsAndHashCode(callSuper = true)
@AllArgsConstructor
@Builder
@Data
@NoArgsConstructor
public class ClassificationTextBlock extends AbstractTextContainer {
public class TextPageBlock extends AbstractPageBlock {
@Builder.Default
private List<TextPositionSequence> sequences = new ArrayList<>();
@JsonIgnore
private int rotation;
private int indexOnPage;
@JsonIgnore
private String mostPopularWordFont;
@JsonIgnore
private String mostPopularWordStyle;
@JsonIgnore
private float mostPopularWordFontSize;
@JsonIgnore
private float mostPopularWordHeight;
@JsonIgnore
private float mostPopularWordSpaceWidth;
@JsonIgnore
private float highestFontSize;
private String classification;
@JsonIgnore
private PageBlockType classification;
@JsonIgnore
public TextDirection getDir() {
return sequences.get(0).getDir();
}
@JsonIgnore
private float getPageHeight() {
return sequences.get(0).getPageHeight();
}
@JsonIgnore
private float getPageWidth() {
return sequences.get(0).getPageWidth();
@ -68,6 +78,7 @@ public class ClassificationTextBlock extends AbstractTextContainer {
*
* @return the minX value in pdf coordinate system
*/
@JsonIgnore
public float getPdfMinX() {
if (getDir().getDegrees() == 90) {
@ -83,6 +94,7 @@ public class ClassificationTextBlock extends AbstractTextContainer {
}
}
/**
* Returns the maxX value in pdf coordinate system.
* Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation.
@ -93,6 +105,7 @@ public class ClassificationTextBlock extends AbstractTextContainer {
*
* @return the maxX value in pdf coordinate system
*/
@JsonIgnore
public float getPdfMaxX() {
if (getDir().getDegrees() == 90) {
@ -118,6 +131,7 @@ public class ClassificationTextBlock extends AbstractTextContainer {
*
* @return the minY value in pdf coordinate system
*/
@JsonIgnore
public float getPdfMinY() {
if (getDir().getDegrees() == 90) {
@ -144,6 +158,7 @@ public class ClassificationTextBlock extends AbstractTextContainer {
*
* @return the maxY value in pdf coordinate system
*/
@JsonIgnore
public float getPdfMaxY() {
if (getDir().getDegrees() == 90) {
@ -159,35 +174,34 @@ public class ClassificationTextBlock extends AbstractTextContainer {
}
public ClassificationTextBlock(float minX, float maxX, float minY, float maxY, List<TextPositionSequence> sequences, int rotation, int indexOnPage) {
super();
this.indexOnPage = indexOnPage;
super.minX = minX;
super.maxX = maxX;
super.minY = minY;
super.maxY = maxY;
public TextPageBlock(float minX, float maxX, float minY, float maxY, List<TextPositionSequence> sequences, int rotation) {
this.minX = minX;
this.maxX = maxX;
this.minY = minY;
this.maxY = maxY;
this.sequences = sequences;
this.rotation = rotation;
}
public ClassificationTextBlock union(TextPositionSequence r) {
public TextPageBlock union(TextPositionSequence r) {
ClassificationTextBlock union = this.copy();
TextPageBlock union = this.copy();
union.add(r);
return union;
}
public ClassificationTextBlock union(ClassificationTextBlock r) {
public TextPageBlock union(TextPageBlock r) {
ClassificationTextBlock union = this.copy();
TextPageBlock union = this.copy();
union.add(r);
return union;
}
public void add(ClassificationTextBlock r) {
public void add(TextPageBlock r) {
if (r.getMinX() < minX) {
minX = r.getMinX();
@ -222,9 +236,9 @@ public class ClassificationTextBlock extends AbstractTextContainer {
}
public ClassificationTextBlock copy() {
public TextPageBlock copy() {
return new ClassificationTextBlock(minX, maxX, minY, maxY, sequences, rotation, indexOnPage);
return new TextPageBlock(minX, maxX, minY, maxY, sequences, rotation);
}
@ -263,6 +277,7 @@ public class ClassificationTextBlock extends AbstractTextContainer {
@Override
@JsonIgnore
public String getText() {
StringBuilder sb = new StringBuilder();
@ -283,4 +298,5 @@ public class ClassificationTextBlock extends AbstractTextContainer {
return TextNormalizationUtilities.removeHyphenLineBreaks(sb.toString());
}
}

View File

@ -1,4 +1,4 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.dto.text;
package com.knecon.fforesight.service.layoutparser.processor.classification.model.text;
import java.awt.geom.AffineTransform;
import java.awt.geom.Point2D;
@ -8,8 +8,8 @@ import java.util.stream.Collectors;
import org.apache.pdfbox.text.TextPosition;
import com.dslplatform.json.JsonAttribute;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
@ -25,6 +25,7 @@ import lombok.extern.slf4j.Slf4j;
@Builder
@NoArgsConstructor
@AllArgsConstructor
@JsonIgnoreProperties({"empty"})
public class TextPositionSequence implements CharSequence {
public static final int HEIGHT_PADDING = 2;
@ -37,6 +38,12 @@ public class TextPositionSequence implements CharSequence {
private float pageWidth;
public TextPositionSequence(int page) {
this.page = page;
}
public TextPositionSequence(List<TextPosition> textPositions, int page) {
this.textPositions = textPositions.stream().map(RedTextPosition::fromTextPosition).collect(Collectors.toList());
@ -64,6 +71,14 @@ public class TextPositionSequence implements CharSequence {
}
public char charAt(int index, boolean caseInSensitive) {
RedTextPosition textPosition = textPositionAt(index);
String text = textPosition.getUnicode();
return caseInSensitive ? text.toLowerCase().charAt(0) : text.charAt(0);
}
@Override
public TextPositionSequence subSequence(int start, int end) {
@ -126,7 +141,6 @@ public class TextPositionSequence implements CharSequence {
* @return the text direction adjusted minX value
*/
@JsonIgnore
@JsonAttribute(ignore = true)
public float getMinXDirAdj() {
return textPositions.get(0).getXDirAdj();
@ -141,7 +155,6 @@ public class TextPositionSequence implements CharSequence {
* @return the text direction adjusted maxX value
*/
@JsonIgnore
@JsonAttribute(ignore = true)
public float getMaxXDirAdj() {
return textPositions.get(textPositions.size() - 1).getXDirAdj() + textPositions.get(textPositions.size() - 1).getWidthDirAdj() + HEIGHT_PADDING;
@ -156,7 +169,6 @@ public class TextPositionSequence implements CharSequence {
* @return the text direction adjusted minY value. The upper border of the bounding box of the word.
*/
@JsonIgnore
@JsonAttribute(ignore = true)
public float getMinYDirAdj() {
return textPositions.get(0).getYDirAdj() - getTextHeight();
@ -171,7 +183,6 @@ public class TextPositionSequence implements CharSequence {
* @return the text direction adjusted maxY value. The lower border of the bounding box of the word.
*/
@JsonIgnore
@JsonAttribute(ignore = true)
public float getMaxYDirAdj() {
return textPositions.get(0).getYDirAdj();
@ -180,7 +191,6 @@ public class TextPositionSequence implements CharSequence {
@JsonIgnore
@JsonAttribute(ignore = true)
public float getTextHeight() {
return textPositions.get(0).getHeightDir() + HEIGHT_PADDING;
@ -188,7 +198,6 @@ public class TextPositionSequence implements CharSequence {
@JsonIgnore
@JsonAttribute(ignore = true)
public float getHeight() {
return getMaxYDirAdj() - getMinYDirAdj();
@ -196,7 +205,6 @@ public class TextPositionSequence implements CharSequence {
@JsonIgnore
@JsonAttribute(ignore = true)
public float getWidth() {
return getMaxXDirAdj() - getMinXDirAdj();
@ -204,7 +212,6 @@ public class TextPositionSequence implements CharSequence {
@JsonIgnore
@JsonAttribute(ignore = true)
public String getFont() {
return textPositions.get(0).getFontName().toLowerCase().replaceAll(",bold", "").replaceAll(",italic", "");
@ -212,7 +219,6 @@ public class TextPositionSequence implements CharSequence {
@JsonIgnore
@JsonAttribute(ignore = true)
public String getFontStyle() {
String lowercaseFontName = textPositions.get(0).getFontName().toLowerCase();
@ -231,7 +237,6 @@ public class TextPositionSequence implements CharSequence {
@JsonIgnore
@JsonAttribute(ignore = true)
public float getFontSize() {
return textPositions.get(0).getFontSizeInPt();
@ -239,7 +244,6 @@ public class TextPositionSequence implements CharSequence {
@JsonIgnore
@JsonAttribute(ignore = true)
public float getSpaceWidth() {
return textPositions.get(0).getWidthOfSpace();
@ -256,11 +260,10 @@ public class TextPositionSequence implements CharSequence {
* @return bounding box of the word in Pdf Coordinate System
*/
@JsonIgnore
@JsonAttribute(ignore = true)
@SneakyThrows
public Rectangle getRectangle() {
log.debug("ClassificationPage: '{}', Word: '{}', Rotation: '{}', textRotation {}", page, this, rotation, dir);
log.debug("Page: '{}', Word: '{}', Rotation: '{}', textRotation {}", page, this, rotation, dir);
float textHeight = getTextHeight();

View File

@ -1,4 +1,4 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.dto.text;
package com.knecon.fforesight.service.layoutparser.processor.classification.model.text;
import java.util.List;
@ -9,6 +9,6 @@ import lombok.Data;
@AllArgsConstructor
public class UnclassifiedText {
private List<ClassificationTextBlock> textBlocks;
private List<TextPageBlock> textBlocks;
}

View File

@ -1,82 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.parsing;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.pdfbox.text.PDFTextStripperByArea;
import org.apache.pdfbox.text.TextPosition;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.TextPositionSequence;
import lombok.Getter;
import lombok.Setter;
public class PDFAreaTextStripper extends PDFTextStripperByArea {
@Getter
private List<TextPositionSequence> textPositionSequences = new ArrayList<>();
@Setter
private int pageNumber;
public PDFAreaTextStripper() throws IOException {
}
@Override
public void writeString(String text, List<TextPosition> textPositions) throws IOException {
int startIndex = 0;
for (int i = 0; i <= textPositions.size() - 1; i++) {
if (i == 0 && (textPositions.get(i).getUnicode().equals(" ") || textPositions.get(i).getUnicode().equals("\u00A0"))) {
startIndex++;
continue;
}
// Strange but sometimes this is happening, for example: Metolachlor2.pdf
if (i > 0 && textPositions.get(i).getX() < textPositions.get(i - 1).getX()) {
List<TextPosition> sublist = textPositions.subList(startIndex, i);
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0")))) {
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber));
}
startIndex = i;
}
if (textPositions.get(i).getRotation() == 0 && i > 0 && textPositions.get(i).getX() > textPositions.get(i - 1).getEndX() + 1) {
List<TextPosition> sublist = textPositions.subList(startIndex, i);
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0")))) {
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber));
}
startIndex = i;
}
if (i > 0 && (textPositions.get(i).getUnicode().equals(" ") || textPositions.get(i).getUnicode().equals("\u00A0")) && i <= textPositions.size() - 2) {
List<TextPosition> sublist = textPositions.subList(startIndex, i);
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0")))) {
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber));
}
startIndex = i + 1;
}
}
List<TextPosition> sublist = textPositions.subList(startIndex, textPositions.size());
if (!sublist.isEmpty() && (sublist.get(sublist.size() - 1).getUnicode().equals(" ") || sublist.get(sublist.size() - 1).getUnicode().equals("\u00A0"))) {
sublist = sublist.subList(0, sublist.size() - 1);
}
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0")))) {
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber));
}
super.writeString(text);
}
public void clearPositions() {
textPositionSequences = new ArrayList<>();
}
}

View File

@ -34,31 +34,26 @@ import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.text.TextPosition;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.RedTextPosition;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.RedTextPosition;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPositionSequence;
import lombok.Getter;
import lombok.Setter;
import lombok.extern.slf4j.Slf4j;
@Getter
@Slf4j
public class PDFLinesTextStripper extends PDFTextStripper {
@Getter
private final List<TextPositionSequence> textPositionSequences = new ArrayList<>();
@Getter
private final List<Ruling> rulings = new ArrayList<>();
private final List<Ruling> graphicsPath = new ArrayList<>();
@Setter
protected PDPage pdpage;
@Getter
private int minCharWidth;
@Getter
private int maxCharWidth;
@Getter
private int minCharHeight;
@Getter
private int maxCharHeight;
private float path_x;

View File

@ -9,14 +9,14 @@ import java.util.List;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.FloatFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.TextBlockOrientation;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.StringFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.AbstractTextContainer;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.FloatFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.Orientation;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.StringFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.RulingTextDirAdjustUtil;
@Service
@ -29,18 +29,18 @@ public class BlockificationService {
/**
* This method builds blocks by expanding the minX/maxX and minY/maxY values for each word that is not split by the conditions.
* This method must use text direction adjusted positions (DirAdj), where {0,0} is on the upper left. Never try to change this!
* Rulings (Table lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling.
* Rulings (Table lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling.
*
* @param textPositions The words of a page.
* @param horizontalRulingLines Horizontal table lines.
* @param verticalRulingLines Vertical table lines.
* @return ClassificationPage object that contains the Textblock and text statistics.
* @return Page object that contains the Textblock and text statistics.
*/
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
int indexOnPage = 0;
List<TextPositionSequence> chunkWords = new ArrayList<>();
List<AbstractTextContainer> chunkBlockList1 = new ArrayList<>();
List<AbstractPageBlock> chunkBlockList = new ArrayList<>();
float minX = 1000, maxX = 0, minY = 1000, maxY = 0;
TextPositionSequence prev = null;
@ -59,27 +59,27 @@ public class BlockificationService {
if (prev != null && (lineSeparation || startFromTop || splitByX || splitByDir || isSplitByRuling)) {
TextBlockOrientation prevOrientation = null;
if (!chunkBlockList1.isEmpty()) {
prevOrientation = chunkBlockList1.get(chunkBlockList1.size() - 1).getOrientation();
Orientation prevOrientation = null;
if (!chunkBlockList.isEmpty()) {
prevOrientation = chunkBlockList.get(chunkBlockList.size() - 1).getOrientation();
}
ClassificationTextBlock cb1 = buildTextBlock(chunkWords, indexOnPage);
TextPageBlock cb1 = buildTextBlock(chunkWords, indexOnPage);
indexOnPage++;
chunkBlockList1.add(cb1);
chunkBlockList.add(cb1);
chunkWords = new ArrayList<>();
if (splitByX && !isSplitByRuling) {
wasSplitted = true;
cb1.setOrientation(TextBlockOrientation.LEFT);
cb1.setOrientation(Orientation.LEFT);
splitX1 = word.getMinXDirAdj();
} else if (newLineAfterSplit && !isSplitByRuling) {
wasSplitted = false;
cb1.setOrientation(TextBlockOrientation.RIGHT);
cb1.setOrientation(Orientation.RIGHT);
splitX1 = null;
} else if (prevOrientation != null && prevOrientation.equals(TextBlockOrientation.RIGHT) && (lineSeparation || !startFromTop || !splitByX || !newLineAfterSplit || !isSplitByRuling)) {
cb1.setOrientation(TextBlockOrientation.LEFT);
} else if (prevOrientation != null && prevOrientation.equals(Orientation.RIGHT) && (lineSeparation || !startFromTop || !splitByX || !newLineAfterSplit || !isSplitByRuling)) {
cb1.setOrientation(Orientation.LEFT);
}
minX = 1000;
@ -106,19 +106,19 @@ public class BlockificationService {
}
}
ClassificationTextBlock cb1 = buildTextBlock(chunkWords, indexOnPage);
TextPageBlock cb1 = buildTextBlock(chunkWords, indexOnPage);
if (cb1 != null) {
chunkBlockList1.add(cb1);
chunkBlockList.add(cb1);
}
Iterator<AbstractTextContainer> itty = chunkBlockList1.iterator();
Iterator<AbstractPageBlock> itty = chunkBlockList.iterator();
ClassificationTextBlock previousLeft = null;
ClassificationTextBlock previousRight = null;
TextPageBlock previousLeft = null;
TextPageBlock previousRight = null;
while (itty.hasNext()) {
ClassificationTextBlock block = (ClassificationTextBlock) itty.next();
TextPageBlock block = (TextPageBlock) itty.next();
if (previousLeft != null && block.getOrientation().equals(TextBlockOrientation.LEFT)) {
if (previousLeft != null && block.getOrientation().equals(Orientation.LEFT)) {
if (previousLeft.getMinY() > block.getMinY() && block.getMaxY() + block.getMostPopularWordHeight() > previousLeft.getMinY()) {
previousLeft.add(block);
itty.remove();
@ -126,7 +126,7 @@ public class BlockificationService {
}
}
if (previousRight != null && block.getOrientation().equals(TextBlockOrientation.RIGHT)) {
if (previousRight != null && block.getOrientation().equals(Orientation.RIGHT)) {
if (previousRight.getMinY() > block.getMinY() && block.getMaxY() + block.getMostPopularWordHeight() > previousRight.getMinY()) {
previousRight.add(block);
itty.remove();
@ -134,21 +134,21 @@ public class BlockificationService {
}
}
if (block.getOrientation().equals(TextBlockOrientation.LEFT)) {
if (block.getOrientation().equals(Orientation.LEFT)) {
previousLeft = block;
} else if (block.getOrientation().equals(TextBlockOrientation.RIGHT)) {
} else if (block.getOrientation().equals(Orientation.RIGHT)) {
previousRight = block;
}
}
itty = chunkBlockList1.iterator();
ClassificationTextBlock previous = null;
itty = chunkBlockList.iterator();
TextPageBlock previous = null;
while (itty.hasNext()) {
ClassificationTextBlock block = (ClassificationTextBlock) itty.next();
TextPageBlock block = (TextPageBlock) itty.next();
if (previous != null && previous.getOrientation().equals(TextBlockOrientation.LEFT) && block.getOrientation().equals(TextBlockOrientation.LEFT) && equalsWithThreshold(block.getMaxY(),
previous.getMaxY()) || previous != null && previous.getOrientation().equals(TextBlockOrientation.LEFT) && block.getOrientation()
.equals(TextBlockOrientation.RIGHT) && equalsWithThreshold(block.getMaxY(), previous.getMaxY())) {
if (previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation().equals(Orientation.LEFT) && equalsWithThreshold(block.getMaxY(),
previous.getMaxY()) || previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation()
.equals(Orientation.RIGHT) && equalsWithThreshold(block.getMaxY(), previous.getMaxY())) {
previous.add(block);
itty.remove();
continue;
@ -157,7 +157,7 @@ public class BlockificationService {
previous = block;
}
return new ClassificationPage(chunkBlockList1);
return new ClassificationPage(chunkBlockList);
}
@ -167,9 +167,9 @@ public class BlockificationService {
}
private ClassificationTextBlock buildTextBlock(List<TextPositionSequence> wordBlockList, int indexOnPage) {
private TextPageBlock buildTextBlock(List<TextPositionSequence> wordBlockList, int indexOnPage) {
ClassificationTextBlock textBlock = null;
TextPageBlock textBlock = null;
FloatFrequencyCounter lineHeightFrequencyCounter = new FloatFrequencyCounter();
FloatFrequencyCounter fontSizeFrequencyCounter = new FloatFrequencyCounter();
@ -186,15 +186,14 @@ public class BlockificationService {
styleFrequencyCounter.add(wordBlock.getFontStyle());
if (textBlock == null) {
textBlock = new ClassificationTextBlock(wordBlock.getMinXDirAdj(),
textBlock = new TextPageBlock(wordBlock.getMinXDirAdj(),
wordBlock.getMaxXDirAdj(),
wordBlock.getMinYDirAdj(),
wordBlock.getMaxYDirAdj(),
wordBlockList,
wordBlock.getRotation(),
indexOnPage);
wordBlock.getRotation());
} else {
ClassificationTextBlock spatialEntity = textBlock.union(wordBlock);
TextPageBlock spatialEntity = textBlock.union(wordBlock);
textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight());
}
}
@ -254,7 +253,7 @@ public class BlockificationService {
verticalRulingLines,
word.getDir().getDegrees(),
word.getPageWidth(),
word.getPageHeight()); //
word.getPageHeight());
}
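A minimal, self-contained sketch of the expansion idea described in the blockify Javadoc above, assuming a made-up Word record with DirAdj coordinates and only a vertical-gap split condition (none of these names are the project's real types):

import java.util.ArrayList;
import java.util.List;

// Illustrative sketch only: merge words into blocks by expanding a bounding box
// until a split condition fires, then flush the block and start a new one.
public class BlockExpansionSketch {

    // Simplified stand-in for a word with text-direction-adjusted coordinates ({0,0} upper left).
    record Word(float minX, float maxX, float minY, float maxY, String text) {}

    record Block(float minX, float maxX, float minY, float maxY, List<Word> words) {}

    static List<Block> blockify(List<Word> words, float maxLineGap) {
        List<Block> blocks = new ArrayList<>();
        List<Word> current = new ArrayList<>();
        float minX = Float.MAX_VALUE, maxX = -Float.MAX_VALUE, minY = Float.MAX_VALUE, maxY = -Float.MAX_VALUE;
        Word prev = null;
        for (Word word : words) {
            // Only one split condition here (vertical gap); the real service also checks
            // rulings, column splits and text direction.
            boolean split = prev != null && word.minY() - prev.maxY() > maxLineGap;
            if (split && !current.isEmpty()) {
                blocks.add(new Block(minX, maxX, minY, maxY, current));
                current = new ArrayList<>();
                minX = Float.MAX_VALUE; maxX = -Float.MAX_VALUE; minY = Float.MAX_VALUE; maxY = -Float.MAX_VALUE;
            }
            // Expand the current block's bounding box to include this word.
            minX = Math.min(minX, word.minX());
            maxX = Math.max(maxX, word.maxX());
            minY = Math.min(minY, word.minY());
            maxY = Math.max(maxY, word.maxY());
            current.add(word);
            prev = word;
        }
        if (!current.isEmpty()) {
            blocks.add(new Block(minX, maxX, minY, maxY, current));
        }
        return blocks;
    }
}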

View File

@ -1,52 +1,56 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.service;
import java.util.List;
import org.springframework.stereotype.Service;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.FloatFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.AbstractTextContainer;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.TableCell;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.Table;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.FloatFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.PositionUtils;
@Service
public class BodyTextFrameService {
private static final float APPROXIMATE_HEADER_LINE_COUNT = 2.9f;
/**
* Adjusts and sets the body text frame to a classificationPage.
* Note: This needs to use Pdf Coordinate System where {0,0} rotated with the classificationPage rotation.
* Adjusts and sets the body text frame to a page.
* Note: This needs to use Pdf Coordinate System where {0,0} is rotated with the page rotation.
* 0 -> LowerLeft
* 90 -> UpperLeft
* 180 -> UpperRight
* 270 -> LowerRight
* The aspect ratio of the classificationPage is also regarded.
* The aspect ratio of the page is also regarded.
*
* @param classificationPage The classificationPage
* @param page The page
* @param bodyTextFrame frame that contains the main text on portrait pages
* @param landscapeBodyTextFrame frame that contains the main text on landscape pages
*/
public void setBodyTextFrameAdjustedToPage(ClassificationPage classificationPage, Rectangle bodyTextFrame, Rectangle landscapeBodyTextFrame) {
public void setBodyTextFrameAdjustedToPage(ClassificationPage page, Rectangle bodyTextFrame, Rectangle landscapeBodyTextFrame) {
Rectangle textFrame = classificationPage.isLandscape() ? landscapeBodyTextFrame : bodyTextFrame;
Rectangle textFrame = page.isLandscape() ? landscapeBodyTextFrame : bodyTextFrame;
if (classificationPage.getPageWidth() > classificationPage.getPageHeight() && classificationPage.getRotation() == 270) {
textFrame = new Rectangle(new Point(textFrame.getTopLeft().getY(), classificationPage.getPageHeight() - textFrame.getTopLeft().getX() - textFrame.getWidth()),
if (page.getPageWidth() > page.getPageHeight() && page.getRotation() == 270) {
textFrame = new Rectangle(new Point(textFrame.getTopLeft().getY(), page.getPageHeight() - textFrame.getTopLeft().getX() - textFrame.getWidth()),
textFrame.getHeight(),
textFrame.getWidth(),
0);
} else if (classificationPage.getPageWidth() > classificationPage.getPageHeight() && classificationPage.getRotation() != 0) {
textFrame = new Rectangle(new Point(textFrame.getTopLeft().getY(), textFrame.getTopLeft().getX()), textFrame.getHeight(), textFrame.getWidth(), classificationPage.getPageNumber());
} else if (classificationPage.getRotation() == 180) {
textFrame = new Rectangle(new Point(textFrame.getTopLeft().getX(), classificationPage.getPageHeight() - textFrame.getTopLeft().getY() - textFrame.getHeight()),
} else if (page.getPageWidth() > page.getPageHeight() && page.getRotation() != 0) {
textFrame = new Rectangle(new Point(textFrame.getTopLeft().getY(), textFrame.getTopLeft().getX()), textFrame.getHeight(), textFrame.getWidth(), page.getPageNumber());
} else if (page.getRotation() == 180) {
textFrame = new Rectangle(new Point(textFrame.getTopLeft().getX(), page.getPageHeight() - textFrame.getTopLeft().getY() - textFrame.getHeight()),
textFrame.getWidth(),
textFrame.getHeight(),
0);
}
classificationPage.setBodyTextFrame(textFrame);
page.setBodyTextFrame(textFrame);
}
@ -59,50 +63,50 @@ public class BodyTextFrameService {
* 270 -> LowerRight
* The aspect ratio of the page is also regarded.
*
* @param classificationPages List of all classificationPages
* @param pages List of all pages
* @param documentFontSizeCounter Statistics of the document
* @param landscape Calculate for landscape or portrait
* @return Rectangle of the text frame
*/
public Rectangle calculateBodyTextFrame(List<ClassificationPage> classificationPages, FloatFrequencyCounter documentFontSizeCounter, boolean landscape) {
public Rectangle calculateBodyTextFrame(List<ClassificationPage> pages, FloatFrequencyCounter documentFontSizeCounter, boolean landscape) {
BodyTextFrameExpansionsRectangle expansionsRectangle = new BodyTextFrameExpansionsRectangle();
for (ClassificationPage classificationPage : classificationPages) {
for (ClassificationPage page : pages) {
if (classificationPage.getTextBlocks().isEmpty() || landscape != classificationPage.isLandscape()) {
if (page.getTextBlocks().isEmpty() || landscape != page.isLandscape()) {
continue;
}
for (AbstractTextContainer container : classificationPage.getTextBlocks()) {
for (AbstractPageBlock container : page.getTextBlocks()) {
if (container instanceof ClassificationTextBlock) {
ClassificationTextBlock textBlock = (ClassificationTextBlock) container;
if (container instanceof TextPageBlock) {
TextPageBlock textBlock = (TextPageBlock) container;
if (textBlock.getMostPopularWordFont() == null || textBlock.getMostPopularWordStyle() == null) {
continue;
}
float approxLineCount = PositionUtils.getApproxLineCount(textBlock);
if (approxLineCount < 2.9f) {
if (approxLineCount < APPROXIMATE_HEADER_LINE_COUNT) {
continue;
}
if (documentFontSizeCounter.getMostPopular() != null && textBlock.getMostPopularWordFontSize() >= documentFontSizeCounter.getMostPopular()) {
expandRectangle(textBlock, classificationPage, expansionsRectangle);
expandRectangle(textBlock, page, expansionsRectangle);
}
}
if (container instanceof Table) {
Table table = (Table) container;
for (List<TableCell> row : table.getRows()) {
for (TableCell cell : row) {
if (container instanceof TablePageBlock) {
TablePageBlock table = (TablePageBlock) container;
for (List<Cell> row : table.getRows()) {
for (Cell cell : row) {
if (cell == null || cell.getTextBlocks() == null) {
continue;
}
for (ClassificationTextBlock textBlock : cell.getTextBlocks()) {
expandRectangle(textBlock, classificationPage, expansionsRectangle);
for (TextPageBlock textBlock : cell.getTextBlocks()) {
expandRectangle(textBlock, page, expansionsRectangle);
}
}
}
@ -116,9 +120,9 @@ public class BodyTextFrameService {
}
private void expandRectangle(ClassificationTextBlock textBlock, ClassificationPage classificationPage, BodyTextFrameExpansionsRectangle expansionsRectangle) {
private void expandRectangle(TextPageBlock textBlock, ClassificationPage page, BodyTextFrameExpansionsRectangle expansionsRectangle) {
if (classificationPage.getPageWidth() > classificationPage.getPageHeight() && classificationPage.getRotation() != 0) {
if (page.getPageWidth() > page.getPageHeight() && page.getRotation() != 0) {
if (textBlock.getPdfMinY() < expansionsRectangle.minX) {
expansionsRectangle.minX = textBlock.getPdfMinY();
}
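The Javadoc above maps the configured body text frame into the page's rotated PDF coordinate system (0 -> LowerLeft, 90 -> UpperLeft, 180 -> UpperRight, 270 -> LowerRight). A small sketch of just the 180-degree branch, using a placeholder Frame record instead of the project's Rectangle/Point model (all names are illustrative assumptions):

// Hypothetical sketch: flipping a body-text frame for a page rotated by 180 degrees,
// mirroring the corresponding branch of setBodyTextFrameAdjustedToPage.
public class BodyTextFrameSketch {

    record Frame(float x, float y, float width, float height) {}

    static Frame adjustFor180(Frame frame, float pageHeight) {
        // The frame keeps its size, but its top edge is measured from the opposite side of the page.
        float newY = pageHeight - frame.y() - frame.height();
        return new Frame(frame.x(), newY, frame.width(), frame.height());
    }

    public static void main(String[] args) {
        Frame portraitFrame = new Frame(50f, 70f, 500f, 700f);
        System.out.println(adjustFor180(portraitFrame, 842f)); // 842 pt = A4 page height
    }
}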

View File

@ -6,10 +6,11 @@ import java.util.regex.Pattern;
import org.springframework.stereotype.Service;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.AbstractTextContainer;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.PositionUtils;
import lombok.RequiredArgsConstructor;
@ -31,43 +32,43 @@ public class ClassificationService {
log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());
for (ClassificationPage classificationPage : document.getPages()) {
bodyTextFrameService.setBodyTextFrameAdjustedToPage(classificationPage, bodyTextFrame, landscapeBodyTextFrame);
classifyPage(classificationPage, document, headlineFontSizes);
for (ClassificationPage page : document.getPages()) {
bodyTextFrameService.setBodyTextFrameAdjustedToPage(page, bodyTextFrame, landscapeBodyTextFrame);
classifyPage(page, document, headlineFontSizes);
}
}
public void classifyPage(ClassificationPage classificationPage, ClassificationDocument document, List<Float> headlineFontSizes) {
public void classifyPage(ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes) {
for (AbstractTextContainer textBlock : classificationPage.getTextBlocks()) {
if (textBlock instanceof ClassificationTextBlock) {
classifyBlock((ClassificationTextBlock) textBlock, classificationPage, document, headlineFontSizes);
for (AbstractPageBlock textBlock : page.getTextBlocks()) {
if (textBlock instanceof TextPageBlock) {
classifyBlock((TextPageBlock) textBlock, page, document, headlineFontSizes);
}
}
}
public void classifyBlock(ClassificationTextBlock textBlock, ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes) {
public void classifyBlock(TextPageBlock textBlock, ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes) {
var bodyTextFrame = page.getBodyTextFrame();
if (document.getFontSizeCounter().getMostPopular() == null) {
textBlock.setClassification("Other");
textBlock.setClassification(PageBlockType.OTHER);
return;
}
if (PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) {
textBlock.setClassification("Header");
textBlock.setClassification(PageBlockType.HEADER);
} else if (PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) {
textBlock.setClassification("Footer");
textBlock.setClassification(PageBlockType.FOOTER);
} else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock,
document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks()
.size() == 1)) {
if (!Pattern.matches("[0-9]+", textBlock.toString())) {
textBlock.setClassification("Title");
textBlock.setClassification(PageBlockType.TITLE);
}
} else if (textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter()
.getMostPopular() && PositionUtils.getApproxLineCount(textBlock) < 4.9 && (textBlock.getMostPopularWordStyle().equals("bold") || !document.getFontStyleCounter()
@ -80,36 +81,34 @@ public class ClassificationService {
for (int i = 1; i <= headlineFontSizes.size(); i++) {
if (textBlock.getMostPopularWordFontSize() == headlineFontSizes.get(i - 1)) {
textBlock.setClassification("H " + i);
textBlock.setClassification(PageBlockType.getHeadlineType(i));
document.setHeadlines(true);
}
}
} else if (!textBlock.getText().startsWith("Table ") && !textBlock.getText().startsWith("Figure ") && PositionUtils.isWithinBodyTextFrame(bodyTextFrame,
textBlock) && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter()
.getMostPopular()
.equals("bold") && PositionUtils.getApproxLineCount(textBlock) < 2.9 && textBlock.getSequences()
} else if (!textBlock.getText().startsWith("Figure ") && PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordStyle()
.equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold") && PositionUtils.getApproxLineCount(textBlock) < 2.9 && textBlock.getSequences()
.get(0)
.getTextPositions()
.get(0)
.getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) {
textBlock.setClassification("H " + (headlineFontSizes.size() + 1));
textBlock.setClassification(PageBlockType.getHeadlineType(headlineFontSizes.size() + 1));
document.setHeadlines(true);
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
.getMostPopular() && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold")) {
textBlock.setClassification("TextBlock Bold");
textBlock.setClassification(PageBlockType.PARAGRAPH_BOLD);
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFont()
.equals(document.getFontCounter().getMostPopular()) && textBlock.getMostPopularWordStyle()
.equals(document.getFontStyleCounter().getMostPopular()) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()) {
textBlock.setClassification("TextBlock");
textBlock.setClassification(PageBlockType.PARAGRAPH);
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
.getMostPopular() && textBlock.getMostPopularWordStyle().equals("italic") && !document.getFontStyleCounter()
.getMostPopular()
.equals("italic") && PositionUtils.getApproxLineCount(textBlock) < 2.9) {
textBlock.setClassification("TextBlock Italic");
textBlock.setClassification(PageBlockType.PARAGRAPH_ITALIC);
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)) {
textBlock.setClassification("TextBlock Unknown");
textBlock.setClassification(PageBlockType.PARAGRAPH_UNKNOWN);
} else {
textBlock.setClassification("Other");
textBlock.setClassification(PageBlockType.OTHER);
}
}
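classifyBlock now assigns PageBlockType constants instead of the previous magic strings, and the surrounding code relies on PageBlockType.isHeadline() and PageBlockType.getHeadlineType(int). The enum itself is not shown in this hunk; the following is only a plausible sketch of those two helpers, not the committed implementation:

// Hypothetical sketch of the PageBlockType enum used by ClassificationService.
// Only the shape of isHeadline() and getHeadlineType(int) is illustrated; the
// real enum in the commit may contain different constants.
public enum PageBlockTypeSketch {
    TITLE,
    HEADER,
    FOOTER,
    H1, H2, H3, H4, H5, H6,
    PARAGRAPH,
    PARAGRAPH_BOLD,
    PARAGRAPH_ITALIC,
    PARAGRAPH_UNKNOWN,
    OTHER;

    // True only for the headline levels H1..H6.
    public boolean isHeadline() {
        return name().length() == 2 && name().startsWith("H");
    }

    // Maps a 1-based headline level to its constant; deeper levels fall back to H6.
    public static PageBlockTypeSketch getHeadlineType(int level) {
        int clamped = Math.max(1, Math.min(level, 6));
        return valueOf("H" + clamped);
    }
}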

View File

@ -9,16 +9,16 @@ import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.adapter.ImageServiceResponseAdapter;
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableCells;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.classification.parsing.PDFLinesTextStripper;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.AbstractTextContainer;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.CvParsedTableCell;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
@ -35,7 +35,7 @@ public class PdfParsingService {
private final ImageServiceResponseAdapter imageServiceResponseAdapter;
public ClassificationDocument parseDocument(PDDocument originDocument, Map<Integer, List<CvParsedTableCell>> pdfTableCells, Map<Integer, List<ClassifiedImage>> pdfImages) {
public ClassificationDocument parseDocument(PDDocument originDocument, Map<Integer, List<TableCells>> pdfTableCells, Map<Integer, List<ClassifiedImage>> pdfImages) {
ClassificationDocument document = new ClassificationDocument();
List<ClassificationPage> classificationPages = new ArrayList<>();
@ -56,7 +56,7 @@ public class PdfParsingService {
@SneakyThrows
private void parsePage(Map<Integer, List<ClassifiedImage>> pdfImages,
PDDocument pdDocument,
Map<Integer, List<CvParsedTableCell>> pdfTableCells,
Map<Integer, List<TableCells>> pdfTableCells,
ClassificationDocument document,
List<ClassificationPage> classificationPages,
int pageNumber) {
@ -93,7 +93,7 @@ public class PdfParsingService {
imageServiceResponseAdapter.findOcr(classificationPage);
}
tableExtractionService.removeRedundantTableCells(cleanRulings, classificationPage);
tableExtractionService.extractTables(cleanRulings, classificationPage);
buildPageStatistics(classificationPage);
increaseDocumentStatistics(classificationPage, document);
@ -115,12 +115,12 @@ public class PdfParsingService {
private void buildPageStatistics(ClassificationPage classificationPage) {
// Collect all statistics for the classificationPage, except from blocks inside tables, as tables will always be added to BodyTextFrame.
for (AbstractTextContainer textBlock : classificationPage.getTextBlocks()) {
if (textBlock instanceof ClassificationTextBlock) {
if (((ClassificationTextBlock) textBlock).getSequences() == null) {
for (AbstractPageBlock textBlock : classificationPage.getTextBlocks()) {
if (textBlock instanceof TextPageBlock) {
if (((TextPageBlock) textBlock).getSequences() == null) {
continue;
}
for (TextPositionSequence word : ((ClassificationTextBlock) textBlock).getSequences()) {
for (TextPositionSequence word : ((TextPageBlock) textBlock).getSequences()) {
classificationPage.getTextHeightCounter().add(word.getTextHeight());
classificationPage.getFontCounter().add(word.getFont());
classificationPage.getFontSizeCounter().add(word.getFontSize());
@ -132,3 +132,5 @@ public class PdfParsingService {
}
}

View File

@ -12,9 +12,9 @@ import java.util.Map;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.CvParsedTableCell;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableCells;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.DoubleComparisons;
import lombok.RequiredArgsConstructor;
@ -25,7 +25,7 @@ import lombok.extern.slf4j.Slf4j;
@RequiredArgsConstructor
public class RulingCleaningService {
public CleanRulings getCleanRulings(List<CvParsedTableCell> cvParsedTableCells, List<Ruling> rulings, float minCharWidth, float maxCharHeight) {
public CleanRulings getCleanRulings(List<TableCells> tableCells, List<Ruling> rulings, float minCharWidth, float maxCharHeight) {
if (!rulings.isEmpty()) {
snapPoints(rulings, minCharWidth, maxCharHeight);
@ -38,7 +38,7 @@ public class RulingCleaningService {
}
}
if (vrs.isEmpty()) {
vrs.addAll(extractVerticalRulings(cvParsedTableCells));
vrs.addAll(extractVerticalRulings(tableCells));
}
List<Ruling> verticalRulingLines = collapseOrientedRulings(vrs);
@ -49,7 +49,7 @@ public class RulingCleaningService {
}
}
if (hrs.isEmpty()) {
hrs.addAll(extractHorizontalRulings(cvParsedTableCells));
hrs.addAll(extractHorizontalRulings(tableCells));
}
List<Ruling> horizontalRulingLines = collapseOrientedRulings(hrs);
@ -132,12 +132,12 @@ public class RulingCleaningService {
}
private Collection<? extends Ruling> extractVerticalRulings(List<CvParsedTableCell> cvParsedTableCells) {
private Collection<? extends Ruling> extractVerticalRulings(List<TableCells> cvParsedTableCells) {
List<Ruling> vrs = new ArrayList<>();
if (cvParsedTableCells != null) {
for (CvParsedTableCell cvParsedTableCell : cvParsedTableCells) {
for (TableCells cvParsedTableCell : cvParsedTableCells) {
Ruling leftLine = createRuling(cvParsedTableCell.getX0(), cvParsedTableCell.getX0(), cvParsedTableCell.getY0(), cvParsedTableCell.getY1());
Ruling rightLine = createRuling(cvParsedTableCell.getX1(), cvParsedTableCell.getX1(), cvParsedTableCell.getY0(), cvParsedTableCell.getY1());
vrs.add(leftLine);
@ -148,12 +148,12 @@ public class RulingCleaningService {
}
private Collection<? extends Ruling> extractHorizontalRulings(List<CvParsedTableCell> cvParsedTableCells) {
private Collection<? extends Ruling> extractHorizontalRulings(List<TableCells> cvParsedTableCells) {
List<Ruling> hrs = new ArrayList<>();
if (cvParsedTableCells != null) {
for (CvParsedTableCell cvParsedTableCell : cvParsedTableCells) {
for (TableCells cvParsedTableCell : cvParsedTableCells) {
Ruling topLine = createRuling(cvParsedTableCell.getX0(), cvParsedTableCell.getX1(), cvParsedTableCell.getY1(), cvParsedTableCell.getY1());
Ruling baseLine = createRuling(cvParsedTableCell.getX0(), cvParsedTableCell.getX1(), cvParsedTableCell.getY0(), cvParsedTableCell.getY0());
hrs.add(topLine);
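When a page has no parsed vertical or horizontal rulings, the service derives them from the CV-detected table cell boxes (extractVerticalRulings/extractHorizontalRulings above). A compact sketch of that derivation with a stand-in cell box type (the record and its accessors are assumptions, not the adapter's TableCells API):

import java.awt.geom.Line2D;
import java.util.ArrayList;
import java.util.List;

// Illustrative sketch: turn detected table-cell bounding boxes into ruling lines,
// analogous to extractVerticalRulings and extractHorizontalRulings.
public class RulingFromCellsSketch {

    // Stand-in for a detected cell box: (x0, y0) is one corner, (x1, y1) the opposite one.
    record CellBox(float x0, float y0, float x1, float y1) {}

    static List<Line2D.Float> verticalRulings(List<CellBox> cells) {
        List<Line2D.Float> rulings = new ArrayList<>();
        for (CellBox cell : cells) {
            // Left and right edges of the cell become vertical rulings.
            rulings.add(new Line2D.Float(cell.x0(), cell.y0(), cell.x0(), cell.y1()));
            rulings.add(new Line2D.Float(cell.x1(), cell.y0(), cell.x1(), cell.y1()));
        }
        return rulings;
    }

    static List<Line2D.Float> horizontalRulings(List<CellBox> cells) {
        List<Line2D.Float> rulings = new ArrayList<>();
        for (CellBox cell : cells) {
            // Top and bottom edges of the cell become horizontal rulings.
            rulings.add(new Line2D.Float(cell.x0(), cell.y1(), cell.x1(), cell.y1()));
            rulings.add(new Line2D.Float(cell.x0(), cell.y0(), cell.x1(), cell.y0()));
        }
        return rulings;
    }
}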

View File

@ -9,17 +9,18 @@ import java.util.stream.Collectors;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationFooter;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationSection;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationHeader;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.UnclassifiedText;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.AbstractTextContainer;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.TableCell;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.Table;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationFooter;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationHeader;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationSection;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.UnclassifiedText;
import lombok.extern.slf4j.Slf4j;
@ -29,44 +30,44 @@ public class SectionsBuilderService {
public void buildSections(ClassificationDocument document) {
List<AbstractTextContainer> chunkWords = new ArrayList<>();
List<AbstractPageBlock> chunkWords = new ArrayList<>();
List<ClassificationSection> chunkBlockList = new ArrayList<>();
List<ClassificationHeader> headers = new ArrayList<>();
List<ClassificationFooter> footers = new ArrayList<>();
List<UnclassifiedText> unclassifiedTexts = new ArrayList<>();
AbstractTextContainer prev = null;
AbstractPageBlock prev = null;
String lastHeadline = "";
Table previousTable = null;
for (ClassificationPage classificationPage : document.getPages()) {
List<ClassificationTextBlock> header = new ArrayList<>();
List<ClassificationTextBlock> footer = new ArrayList<>();
List<ClassificationTextBlock> unclassifiedText = new ArrayList<>();
for (AbstractTextContainer current : classificationPage.getTextBlocks()) {
TablePageBlock previousTable = null;
for (ClassificationPage page : document.getPages()) {
List<TextPageBlock> header = new ArrayList<>();
List<TextPageBlock> footer = new ArrayList<>();
List<TextPageBlock> unclassifiedText = new ArrayList<>();
for (AbstractPageBlock current : page.getTextBlocks()) {
if (current.getClassification() == null) {
continue;
}
current.setPage(classificationPage.getPageNumber());
current.setPage(page.getPageNumber());
if (current.getClassification().equals("Header")) {
header.add((ClassificationTextBlock) current);
if (current.getClassification().equals(PageBlockType.HEADER)) {
header.add((TextPageBlock) current);
continue;
}
if (current.getClassification().equals("Footer")) {
footer.add((ClassificationTextBlock) current);
if (current.getClassification().equals(PageBlockType.FOOTER)) {
footer.add((TextPageBlock) current);
continue;
}
if (current.getClassification().equals("Other")) {
unclassifiedText.add((ClassificationTextBlock) current);
if (current.getClassification().equals(PageBlockType.OTHER)) {
unclassifiedText.add((TextPageBlock) current);
continue;
}
if (prev != null && current.getClassification().startsWith("H ") && !prev.getClassification().startsWith("H ") || !document.isHeadlines()) {
if (prev != null && current.getClassification().isHeadline() && !prev.getClassification().isHeadline() || !document.isHeadlines()) {
ClassificationSection chunkBlock = buildTextBlock(chunkWords, lastHeadline);
chunkBlock.setHeadline(lastHeadline);
if (document.isHeadlines()) {
@ -78,7 +79,7 @@ public class SectionsBuilderService {
previousTable = chunkBlock.getTables().get(chunkBlock.getTables().size() - 1);
}
}
if (current instanceof Table table) {
if (current instanceof TablePageBlock table) {
// Distribute header information for subsequent tables
mergeTableMetadata(table, previousTable);
previousTable = table;
@ -106,15 +107,14 @@ public class SectionsBuilderService {
document.setHeaders(headers);
document.setFooters(footers);
document.setUnclassifiedTexts(unclassifiedTexts);
addImagesToSections(document);
}
private void addImagesToSections(ClassificationDocument document) {
public void addImagesToSections(ClassificationDocument document) {
Map<Integer, List<ClassificationSection>> sectionMap = new HashMap<>();
for (ClassificationSection section : document.getSections()) {
for (AbstractTextContainer container : section.getPageBlocks()) {
for (AbstractPageBlock container : section.getPageBlocks()) {
List<ClassificationSection> sectionsOnPage = sectionMap.computeIfAbsent(container.getPage(), c -> new ArrayList<>());
if (sectionsOnPage.contains(section)) {
@ -138,11 +138,11 @@ public class SectionsBuilderService {
sectionMap.computeIfAbsent(1, x -> new ArrayList<>()).add(section);
}
for (ClassificationPage classificationPage : document.getPages()) {
for (ClassifiedImage image : classificationPage.getImages()) {
List<ClassificationSection> sectionsOnPage = sectionMap.get(classificationPage.getPageNumber());
for (ClassificationPage page : document.getPages()) {
for (ClassifiedImage image : page.getImages()) {
List<ClassificationSection> sectionsOnPage = sectionMap.get(page.getPageNumber());
if (sectionsOnPage == null) {
int i = classificationPage.getPageNumber();
int i = page.getPageNumber();
while (sectionsOnPage == null) {
sectionsOnPage = sectionMap.get(i);
i--;
@ -154,8 +154,8 @@ public class SectionsBuilderService {
Float xMax = null;
Float yMax = null;
for (AbstractTextContainer abs : section.getPageBlocks()) {
if (abs.getPage() != classificationPage.getPageNumber()) {
for (AbstractPageBlock abs : section.getPageBlocks()) {
if (abs.getPage() != page.getPageNumber()) {
continue;
}
@ -212,23 +212,23 @@ public class SectionsBuilderService {
}
private void mergeTableMetadata(Table currentTable, Table previousTable) {
private void mergeTableMetadata(TablePageBlock currentTable, TablePageBlock previousTable) {
// Distribute header information for subsequent tables
if (previousTable != null && hasInvalidHeaderInformation(currentTable) && hasValidHeaderInformation(previousTable)) {
List<TableCell> previousTableNonHeaderRow = getRowWithNonHeaderCells(previousTable);
List<TableCell> tableNonHeaderRow = getRowWithNonHeaderCells(currentTable);
List<Cell> previousTableNonHeaderRow = getRowWithNonHeaderCells(previousTable);
List<Cell> tableNonHeaderRow = getRowWithNonHeaderCells(currentTable);
// Allow merging of tables if header row is separated from first logical non-header row
if (previousTableNonHeaderRow.isEmpty() && previousTable.getRowCount() == 1 && previousTable.getRows().get(0).size() == tableNonHeaderRow.size()) {
previousTableNonHeaderRow = previousTable.getRows().get(0).stream().map(cell -> {
TableCell fakeCell = new TableCell(cell.getPoints()[0], cell.getPoints()[2]);
Cell fakeCell = new Cell(cell.getPoints()[0], cell.getPoints()[2]);
fakeCell.setHeaderCells(Collections.singletonList(cell));
return fakeCell;
}).collect(Collectors.toList());
}
if (previousTableNonHeaderRow.size() == tableNonHeaderRow.size()) {
for (int i = currentTable.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
List<TableCell> row = currentTable.getRows().get(i);
List<Cell> row = currentTable.getRows().get(i);
if (row.size() == tableNonHeaderRow.size() && row.stream().allMatch(cell -> cell.getHeaderCells().isEmpty())) {
for (int j = 0; j < row.size(); j++) {
row.get(j).setHeaderCells(previousTableNonHeaderRow.get(j).getHeaderCells());
@ -240,52 +240,52 @@ public class SectionsBuilderService {
}
private ClassificationSection buildTextBlock(List<AbstractTextContainer> wordBlockList, String lastHeadline) {
private ClassificationSection buildTextBlock(List<AbstractPageBlock> wordBlockList, String lastHeadline) {
ClassificationSection section = new ClassificationSection();
for (AbstractTextContainer container : wordBlockList) {
if (container instanceof Table table) {
for (AbstractPageBlock container : wordBlockList) {
if (container instanceof TablePageBlock table) {
if (lastHeadline == null || lastHeadline.isEmpty()) {
table.setHeadline("Text in table");
} else {
table.setHeadline("Table in: " + lastHeadline);
table.setHeadline("TablePageBlock in: " + lastHeadline);
}
section.getPageBlocks().add(table);
continue;
}
ClassificationTextBlock wordBlock = (ClassificationTextBlock) container;
TextPageBlock wordBlock = (TextPageBlock) container;
section.getPageBlocks().add(wordBlock);
}
return section;
}
private boolean hasValidHeaderInformation(Table table) {
private boolean hasValidHeaderInformation(TablePageBlock table) {
return !hasInvalidHeaderInformation(table);
}
private boolean hasInvalidHeaderInformation(Table table) {
private boolean hasInvalidHeaderInformation(TablePageBlock table) {
return table.getRows().stream().flatMap(row -> row.stream().filter(cell -> !cell.getHeaderCells().isEmpty())).findAny().isEmpty();
}
private List<TableCell> getRowWithNonHeaderCells(Table table) {
private List<Cell> getRowWithNonHeaderCells(TablePageBlock table) {
for (int i = table.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
List<TableCell> row = table.getRows().get(i);
List<Cell> row = table.getRows().get(i);
if (row.size() == 1) {
continue;
}
boolean allNonHeader = true;
for (TableCell cell : row) {
for (Cell cell : row) {
if (cell.isHeaderCell()) {
allNonHeader = false;
break;

View File

@ -9,20 +9,18 @@ import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.TableCell;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.QuickSort;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Rectangle;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.DoubleComparisons;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.AbstractTextContainer;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.Rectangle;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.Table;
@Service
public class TableExtractionService {
@ -68,28 +66,28 @@ public class TableExtractionService {
/**
* Finds tables on a classificationPage and moves textblocks into cells of the found tables.
* Note: This algorithm uses Pdf Coordinate System where {0,0} rotated with the classificationPage rotation.
* Finds tables on a page and moves textblocks into cells of the found tables.
* Note: This algorithm uses Pdf Coordinate System where {0,0} is rotated with the page rotation.
* 0 -> LowerLeft
* 90 -> UpperLeft
* 180 -> UpperRight
* 270 -> LowerRight
*
* <p>
* DirAdj (Text direction adjusted) values can not be used here.
*
* @param cleanRulings The lines used to build the table.
* @param classificationPage ClassificationPage object that contains textblocks and statistics.
* @param page Page object that contains textblocks and statistics.
*/
public void removeRedundantTableCells(CleanRulings cleanRulings, ClassificationPage classificationPage) {
public void extractTables(CleanRulings cleanRulings, ClassificationPage page) {
List<TableCell> cells = findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical());
List<Cell> cells = findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical());
List<ClassificationTextBlock> toBeRemoved = new ArrayList<>();
List<TextPageBlock> toBeRemoved = new ArrayList<>();
for (AbstractTextContainer abstractTextContainer : classificationPage.getTextBlocks()) {
ClassificationTextBlock textBlock = (ClassificationTextBlock) abstractTextContainer;
for (TableCell cell : cells) {
if (cell.intersects(textBlock.getPdfMinX(),
for (AbstractPageBlock abstractPageBlock : page.getTextBlocks()) {
TextPageBlock textBlock = (TextPageBlock) abstractPageBlock;
for (Cell cell : cells) {
if (cell.hasMinimumSize() && cell.intersects(textBlock.getPdfMinX(),
textBlock.getPdfMinY(),
textBlock.getPdfMaxX() - textBlock.getPdfMinX(),
textBlock.getPdfMaxY() - textBlock.getPdfMinY())) {
@ -101,44 +99,44 @@ public class TableExtractionService {
}
cells = new ArrayList<>(new HashSet<>(cells));
QuickSort.sort(cells, Rectangle.ILL_DEFINED_ORDER);
DoubleComparisons.sort(cells, Rectangle.ILL_DEFINED_ORDER);
List<Rectangle> spreadsheetAreas = findSpreadsheetsFromCells(cells).stream().filter(r -> r.getWidth() > 0f && r.getHeight() > 0f).collect(Collectors.toList());
List<Rectangle> spreadsheetAreas = findSpreadsheetsFromCells(cells).stream().filter(r -> r.getWidth() > 0f && r.getHeight() > 0f).toList();
List<Table> tables = new ArrayList<>();
List<TablePageBlock> tables = new ArrayList<>();
for (Rectangle area : spreadsheetAreas) {
List<TableCell> overlappingCells = new ArrayList<>();
for (TableCell c : cells) {
if (c.intersects(area)) {
List<Cell> overlappingCells = new ArrayList<>();
for (Cell c : cells) {
if (c.hasMinimumSize() && c.intersects(area)) {
overlappingCells.add(c);
}
}
tables.add(new Table(overlappingCells, area, classificationPage.getRotation()));
tables.add(new TablePageBlock(overlappingCells, area, page.getRotation()));
}
for (Table table : tables) {
for (TablePageBlock table : tables) {
int position = -1;
Iterator<AbstractTextContainer> itty = classificationPage.getTextBlocks().iterator();
Iterator<AbstractPageBlock> itty = page.getTextBlocks().iterator();
while (itty.hasNext()) {
AbstractTextContainer textBlock = itty.next();
if (textBlock instanceof ClassificationTextBlock ? table.containsBlock((ClassificationTextBlock) textBlock) : table.contains(textBlock) && position == -1) {
position = classificationPage.getTextBlocks().indexOf(textBlock);
AbstractPageBlock textBlock = itty.next();
if (textBlock instanceof TextPageBlock ? table.containsBlock((TextPageBlock) textBlock) : table.contains(textBlock) && position == -1) {
position = page.getTextBlocks().indexOf(textBlock);
}
}
if (position != -1) {
classificationPage.getTextBlocks().add(position, table);
page.getTextBlocks().add(position, table);
}
}
classificationPage.getTextBlocks().removeAll(toBeRemoved);
page.getTextBlocks().removeAll(toBeRemoved);
}
public List<TableCell> findCells(List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
public List<Cell> findCells(List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
List<TableCell> cellsFound = new ArrayList<>();
List<Cell> cellsFound = new ArrayList<>();
Map<Point2D, Ruling[]> intersectionPoints = Ruling.findIntersections(horizontalRulingLines, verticalRulingLines);
List<Point2D> intersectionPointsList = new ArrayList<>(intersectionPoints.keySet());
intersectionPointsList.sort(POINT_COMPARATOR);
@ -174,7 +172,7 @@ public class TableExtractionService {
Point2D btmRight = new Point2D.Float((float) yPoint.getX(), (float) xPoint.getY());
if (intersectionPoints.containsKey(btmRight) && intersectionPoints.get(btmRight)[0].equals(intersectionPoints.get(xPoint)[0]) && intersectionPoints.get(btmRight)[1].equals(
intersectionPoints.get(yPoint)[1])) {
cellsFound.add(new TableCell(topLeft, btmRight));
cellsFound.add(new Cell(topLeft, btmRight));
break outer;
}
}
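findCells pairs up intersection points of horizontal and vertical rulings to recover table cells. The following is a heavily simplified, hypothetical sketch of that corner search on plain points; the committed version additionally tracks which rulings each intersection belongs to:

import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;

// Simplified sketch: treat ruling intersections as grid corners and emit a cell
// whenever a top-left corner has matching corners to the right, below, and
// diagonally opposite.
public class CellFinderSketch {

    static List<Rectangle2D.Float> findCells(Set<Point2D.Float> corners) {
        List<Point2D.Float> sorted = new ArrayList<>(corners);
        // Scan corners top-to-bottom, left-to-right.
        sorted.sort((a, b) -> a.y != b.y ? Float.compare(a.y, b.y) : Float.compare(a.x, b.x));

        List<Rectangle2D.Float> cells = new ArrayList<>();
        for (Point2D.Float topLeft : sorted) {
            for (Point2D.Float bottomRight : sorted) {
                if (bottomRight.x <= topLeft.x || bottomRight.y <= topLeft.y) {
                    continue; // not strictly right of and below the top-left corner
                }
                // The cell closes only if the two remaining corners also exist.
                Point2D.Float topRight = new Point2D.Float(bottomRight.x, topLeft.y);
                Point2D.Float bottomLeft = new Point2D.Float(topLeft.x, bottomRight.y);
                if (corners.contains(topRight) && corners.contains(bottomLeft)) {
                    cells.add(new Rectangle2D.Float(topLeft.x, topLeft.y,
                            bottomRight.x - topLeft.x, bottomRight.y - topLeft.y));
                    break; // keep the first closing corner in scan order
                }
            }
        }
        return cells;
    }
}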

View File

@ -1,6 +1,8 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.utils;
import java.math.BigDecimal;
import java.util.Comparator;
import java.util.List;
import lombok.experimental.UtilityClass;
import lombok.extern.slf4j.Slf4j;
@ -20,11 +22,22 @@ public final class DoubleComparisons {
public static float round(double d, int decimalPlace) {
BigDecimal bd = BigDecimal.valueOf(d);
bd = bd.setScale(decimalPlace, BigDecimal.ROUND_HALF_UP);
return bd.floatValue();
}
public static <T> void sort(List<T> list, Comparator<? super T> comparator) {
try {
QuickSort.sort(list, comparator);
} catch (IllegalArgumentException e) {
// This should not happen since we use QuickSort from PDFBox
log.warn(e.getMessage());
}
}
}
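The new DoubleComparisons.sort wrapper lets callers drop their own try/catch around QuickSort. A brief usage sketch with made-up values:

import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;

import com.knecon.fforesight.service.layoutparser.processor.classification.utils.DoubleComparisons;

// Usage sketch for the sort(...) wrapper added above: comparator contract
// violations are only logged instead of propagating to the call site.
public class SortUsageSketch {

    public static void main(String[] args) {
        List<Float> cellWidths = new ArrayList<>(List.of(12.5f, 3.25f, 7.0f));
        DoubleComparisons.sort(cellWidths, Comparator.naturalOrder());
        System.out.println(cellWidths); // [3.25, 7.0, 12.5]
    }
}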

View File

@ -0,0 +1,56 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.utils;
import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import lombok.experimental.UtilityClass;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@UtilityClass
public class FileUtils {
public File createTempFile(String filenamePrefix, String filenameSuffix) throws IOException {
File tempFile = Files.createTempFile(filenamePrefix, filenameSuffix).toFile();
setRWPermissionsOnlyForOwner(tempFile);
return tempFile;
}
/**
* Deletes a file; logs a message with the reason if the deletion fails.
* This method is null-safe.
*
* @param file The file to delete. Can be null.
*/
public void deleteFile(File file) {
if (file != null) {
try {
Files.deleteIfExists(file.toPath());
} catch (IOException ex) {
log.warn("Could not delete file!", ex);
}
}
}
// We don't need to check the results of the permission setters below,
// since we're manipulating a file we created ourselves.
@SuppressWarnings({"ResultOfMethodCallIgnored", "squid:S899"})
private void setRWPermissionsOnlyForOwner(File tempFile) {
try {
tempFile.setReadable(true, true);
tempFile.setWritable(true, true);
tempFile.setExecutable(false);
} catch (SecurityException ex) {
// This should never happen since we're creating a temp file ourselves.
log.warn("Caught an exception during temp file creation. This should not happend. Check the code.", ex);
}
}
}
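FileUtils centralizes temp-file handling: creation with owner-only read/write permissions and null-safe deletion. A short usage sketch; the prefix, suffix and processing step are placeholders:

import java.io.File;
import java.io.IOException;

import com.knecon.fforesight.service.layoutparser.processor.classification.utils.FileUtils;

// Hypothetical caller of the new FileUtils helpers: the temp file is created with
// owner-only permissions and always cleaned up, even if processing fails.
public class TempFileUsageSketch {

    public static void main(String[] args) throws IOException {
        File tempPdf = FileUtils.createTempFile("layoutparser-", ".pdf");
        try {
            // ... write and process the temporary PDF here ...
        } finally {
            FileUtils.deleteFile(tempPdf); // null-safe, logs instead of throwing on failure
        }
    }
}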

View File

@ -1,7 +1,7 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.utils;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
import lombok.experimental.UtilityClass;
@ -11,7 +11,7 @@ public final class PositionUtils {
// TODO This currently uses pdf coord system. In the future this should use java coord system.
// Note: DirAdj (TextDirection Adjusted) cannot be used for this.
public boolean isWithinBodyTextFrame(Rectangle btf, ClassificationTextBlock textBlock) {
public boolean isWithinBodyTextFrame(Rectangle btf, TextPageBlock textBlock) {
if (btf == null || textBlock == null) {
return false;
@ -32,7 +32,7 @@ public final class PositionUtils {
// TODO This currently uses pdf coord system. In the future this should use java coord system.
// Note: DirAdj (TextDirection Adjusted) cannot be used for this.
public boolean isOverBodyTextFrame(Rectangle btf, ClassificationTextBlock textBlock, int rotation) {
public boolean isOverBodyTextFrame(Rectangle btf, TextPageBlock textBlock, int rotation) {
if (btf == null || textBlock == null) {
return false;
@ -58,9 +58,10 @@ public final class PositionUtils {
}
// TODO This currently uses pdf coord system. In the future this should use java coord system.
// Note: DirAdj (TextDirection Adjusted) cannot be used for this.
public boolean isUnderBodyTextFrame(Rectangle btf, ClassificationTextBlock textBlock, int rotation) {
public boolean isUnderBodyTextFrame(Rectangle btf, TextPageBlock textBlock, int rotation) {
if (btf == null || textBlock == null) {
return false;
@ -86,9 +87,10 @@ public final class PositionUtils {
}
// TODO This currently uses pdf coord system. In the future this should use java coord system.
// Note: DirAdj (TextDirection Adjusted) cannot be used for this.
public boolean isTouchingUnderBodyTextFrame(Rectangle btf, ClassificationTextBlock textBlock) {
public boolean isTouchingUnderBodyTextFrame(Rectangle btf, TextPageBlock textBlock) {
//TODO Currently this is not working for rotated pages.
@ -105,13 +107,13 @@ public final class PositionUtils {
}
public float getHeightDifferenceBetweenChunkWordAndDocumentWord(ClassificationTextBlock textBlock, Float documentMostPopularWordHeight) {
public float getHeightDifferenceBetweenChunkWordAndDocumentWord(TextPageBlock textBlock, Float documentMostPopularWordHeight) {
return textBlock.getMostPopularWordHeight() - documentMostPopularWordHeight;
}
public Float getApproxLineCount(ClassificationTextBlock textBlock) {
public Float getApproxLineCount(TextPageBlock textBlock) {
return textBlock.getHeight() / textBlock.getMostPopularWordHeight();
}

View File

@ -3,7 +3,7 @@ package com.knecon.fforesight.service.layoutparser.processor.classification.util
import java.awt.geom.Line2D;
import java.awt.geom.Point2D;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Ruling;
import lombok.experimental.UtilityClass;
@ -13,7 +13,7 @@ public final class RulingTextDirAdjustUtil {
/**
* Converts a ruling (line of a table) the same way TextPositions are converted in PDFBox.
* This yields the y position of the text adjusted so that 0,0 is the upper left, taking the text direction into account.
*
* <p>
* See org.apache.pdfbox.text.TextPosition
*/
public Line2D.Float convertToDirAdj(Ruling ruling, float dir, float pageWidth, float pageHeight) {

View File

@ -16,4 +16,16 @@ public final class TextNormalizationUtilities {
return text.replaceAll("([^\\s\\d\\-]{2,500})[\\-\\u00AD]\\R", "$1");
}
public static String removeLineBreaks(String text) {
return text.replaceAll("\n", " ");
}
public static String removeRepeatingWhitespaces(String text) {
return text.replaceAll(" {2}", " ");
}
}
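The two new helpers flatten line breaks to spaces and collapse double spaces. A standalone sketch that copies their regexes to show the combined effect on a made-up input:

// Standalone illustration of the helpers added above; the regexes are copied
// from removeLineBreaks and removeRepeatingWhitespaces.
public class NormalizationSketch {

    static String removeLineBreaks(String text) {
        return text.replaceAll("\n", " ");
    }

    static String removeRepeatingWhitespaces(String text) {
        return text.replaceAll(" {2}", " ");
    }

    public static void main(String[] args) {
        String raw = "Layout  parsing\nresults";
        System.out.println(removeRepeatingWhitespaces(removeLineBreaks(raw))); // "Layout parsing results"
    }
}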

View File

@ -4,286 +4,124 @@ import static java.lang.String.format;
import static java.util.stream.Collectors.groupingBy;
import static java.util.stream.Collectors.toList;
import java.util.Collections;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationFooter;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationHeader;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree;
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Footer;
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.GenericSemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Header;
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Headline;
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Image;
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Page;
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Paragraph;
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Section;
import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.AtomicTextBlock;
import com.knecon.fforesight.service.layoutparser.processor.utils.IdBuilder;
import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionOperations;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.DocumentGraph;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.FooterNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.HeaderNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.HeadlineNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.ImageNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.NodeType;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.ParagraphNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SectionNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SemanticNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.TableCellNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.TableNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.AtomicTextBlock;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.AbstractTextContainer;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationFooter;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationHeader;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.Table;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.TableCell;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.TextPositionSequence;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Getter;
import lombok.experimental.FieldDefaults;
import lombok.experimental.UtilityClass;
@Service
@UtilityClass
public class DocumentGraphFactory {
public static final double TABLE_CELL_MERGE_CONTENTS_SIZE_THRESHOLD = 0.05;
public Document buildDocumentGraph(ClassificationDocument document) {
Document documentGraph = new Document();
Context context = new Context(documentGraph);
public DocumentGraph buildDocumentGraph(ClassificationDocument document) {
TextBlockFactory textBlockFactory = new TextBlockFactory();
DocumentGraph documentGraph = new DocumentGraph();
Context context = new Context(new TableOfContents(documentGraph), new HashMap<>(), new LinkedList<>(), new LinkedList<>(), textBlockFactory);
document.getPages().stream().map(this::buildPage).forEach(page -> context.pages().put(page, new AtomicInteger(1)));
document.getSections().stream().flatMap(section -> section.getImages().stream()).forEach(image -> context.images().add(image));
document.getPages().forEach(context::buildAndAddPageWithCounter);
document.getSections().stream().flatMap(section -> section.getImages().stream()).forEach(image -> context.getImages().add(image));
addSections(document, context);
addHeaderAndFooterToEachPage(document, context);
documentGraph.setNumberOfPages(context.pages.size());
documentGraph.setPages(context.pages.keySet());
documentGraph.setTableOfContents(context.tableOfContents);
documentGraph.setTextBlock(documentGraph.buildTextBlock());
documentGraph.setDocumentTree(context.documentTree);
documentGraph.setTextBlock(documentGraph.getTextBlock());
return documentGraph;
}
private void addSections(ClassificationDocument document, Context context) {
document.getSections().forEach(section -> addSection(null, section.getPageBlocks(), section.getImages(), context));
document.getSections().forEach(section -> SectionNodeFactory.addSection(null, section.getPageBlocks(), section.getImages(), context));
}
private void addSection(SemanticNode parentNode, List<AbstractTextContainer> pageBlocks, List<ClassifiedImage> images, Context context) {
public void addParagraphOrHeadline(GenericSemanticNode parentNode, TextPageBlock originalTextBlock, Context context, List<TextPageBlock> textBlocksToMerge) {
Map<Integer, List<AbstractTextContainer>> blocksPerPage = pageBlocks.stream().collect(groupingBy(AbstractTextContainer::getPage));
SectionNode sectionNode = SectionNode.builder().entities(new HashSet<>()).tableOfContents(context.tableOfContents()).build();
Page page = context.getPage(originalTextBlock.getPage());
context.sections().add(sectionNode);
blocksPerPage.keySet().forEach(pageNumber -> addSectionNodeToPageNode(context, sectionNode, pageNumber));
List<Integer> tocId;
if (parentNode == null) {
tocId = context.tableOfContents.createNewMainEntryAndReturnId(NodeType.SECTION, sectionNode);
GenericSemanticNode node;
if (originalTextBlock.isHeadline()) {
node = Headline.builder().documentTree(context.getDocumentTree()).build();
} else {
tocId = context.tableOfContents.createNewChildEntryAndReturnId(parentNode.getTocId(), NodeType.SECTION, sectionNode);
}
sectionNode.setTocId(tocId);
Set<AbstractTextContainer> alreadyMerged = new HashSet<>();
for (AbstractTextContainer abstractTextContainer : pageBlocks) {
if (alreadyMerged.contains(abstractTextContainer)) {
continue;
}
if (abstractTextContainer instanceof ClassificationTextBlock) {
List<ClassificationTextBlock> textBlocks = findTextBlocksWithSameClassificationAndAlignsY(abstractTextContainer, pageBlocks);
alreadyMerged.addAll(textBlocks);
addParagraphOrHeadline(sectionNode, (ClassificationTextBlock) abstractTextContainer, context, textBlocks);
}
if (abstractTextContainer instanceof Table) {
addTable(sectionNode, (Table) abstractTextContainer, context);
}
}
for (ClassifiedImage image : images) {
addImage(sectionNode, image, context);
}
}
private static List<ClassificationTextBlock> findTextBlocksWithSameClassificationAndAlignsY(AbstractTextContainer atc, List<AbstractTextContainer> pageBlocks) {
return pageBlocks.stream()
.filter(abstractTextContainer -> !abstractTextContainer.equals(atc))
.filter(abstractTextContainer -> abstractTextContainer.getPage() == atc.getPage())
.filter(abstractTextContainer -> abstractTextContainer instanceof ClassificationTextBlock)
.filter(abstractTextContainer -> abstractTextContainer.intersectsY(atc))
.map(abstractTextContainer -> (ClassificationTextBlock) abstractTextContainer)
.toList();
}
private void addSectionNodeToPageNode(Context context, SectionNode sectionNode, Integer pageNumber) {
PageNode page = getPage(pageNumber, context);
page.getMainBody().add(sectionNode);
}
private void addTable(SemanticNode parentNode, Table table, Context context) {
PageNode page = getPage(table.getPage(), context);
TableNode tableNode = TableNode.builder().tableOfContents(context.tableOfContents()).numberOfCols(table.getColCount()).numberOfRows(table.getRowCount()).build();
if (!page.getMainBody().contains(parentNode)) {
parentNode.getPages().add(page);
}
page.getMainBody().add(tableNode);
List<Integer> tocId = context.tableOfContents().createNewChildEntryAndReturnId(parentNode.getTocId(), NodeType.TABLE, tableNode);
tableNode.setTocId(tocId);
addTableCells(table.getRows(), tableNode, context, table.getPage());
}
private void addTableCells(List<List<TableCell>> rows, SemanticNode parentNode, Context context, int pageNumber) {
for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) {
for (int colIndex = 0; colIndex < rows.get(rowIndex).size(); colIndex++) {
addTableCell(rows.get(rowIndex).get(colIndex), rowIndex, colIndex, parentNode, pageNumber, context);
}
}
}
private void addTableCell(TableCell cell, int rowIndex, int colIndex, SemanticNode parentNode, int pageNumber, Context context) {
PageNode page = getPage(pageNumber, context);
cell.getTextBlocks().stream().filter(tb -> tb.getPage() == 0).forEach(tb -> tb.setPage(pageNumber));
TableCellNode tableCellNode = TableCellNode.builder()
.tableOfContents(context.tableOfContents())
.row(rowIndex)
.col(colIndex)
.header(cell.isHeaderCell())
.bBox(cell.getBounds2D())
.build();
page.getMainBody().add(tableCellNode);
TextBlock textBlock;
List<Integer> tocId = context.tableOfContents().createNewChildEntryAndReturnId(parentNode.getTocId(), NodeType.TABLE_CELL, tableCellNode);
tableCellNode.setTocId(tocId);
if (cell.getTextBlocks().isEmpty()) {
tableCellNode.setTerminalTextBlock(context.textBlockFactory.emptyTextBlock(parentNode, context, page));
tableCellNode.setTerminal(true);
} else if (cell.getTextBlocks().size() == 1) {
textBlock = context.textBlockFactory().buildAtomicTextBlock(cell.getTextBlocks().get(0).getSequences(), tableCellNode, context, page);
tableCellNode.setTerminalTextBlock(textBlock);
tableCellNode.setTerminal(true);
} else if (firstTextBlockIsHeadline(cell)) {
addSection(tableCellNode, cell.getTextBlocks().stream().map(tb -> (AbstractTextContainer) tb).toList(), Collections.emptyList(), context);
tableCellNode.setTerminal(false);
} else if (cellAreaIsSmallerThanPageAreaTimesThreshold(cell, page)) {
List<TextPositionSequence> sequences = TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(cell.getTextBlocks());
textBlock = context.textBlockFactory().buildAtomicTextBlock(sequences, tableCellNode, context, page);
tableCellNode.setTerminalTextBlock(textBlock);
tableCellNode.setTerminal(true);
} else {
cell.getTextBlocks().forEach(tb -> addParagraphOrHeadline(tableCellNode, tb, context));
tableCellNode.setTerminal(false);
}
}
private static boolean cellAreaIsSmallerThanPageAreaTimesThreshold(TableCell cell, PageNode page) {
return cell.getArea() < TABLE_CELL_MERGE_CONTENTS_SIZE_THRESHOLD * page.getHeight() * page.getWidth();
}
private static boolean firstTextBlockIsHeadline(TableCell cell) {
String classification = cell.getTextBlocks().get(0).getClassification();
return classification != null && classification.startsWith("H");
}
private void addParagraphOrHeadline(SemanticNode parentNode, ClassificationTextBlock originalTextBlock, Context context) {
addParagraphOrHeadline(parentNode, originalTextBlock, context, Collections.emptyList());
}
private void addParagraphOrHeadline(SemanticNode parentNode, ClassificationTextBlock originalTextBlock, Context context, List<ClassificationTextBlock> textBlocksToMerge) {
PageNode page = getPage(originalTextBlock.getPage(), context);
SemanticNode node;
if (originalTextBlock.getClassification() != null && originalTextBlock.getClassification().startsWith("H")) {
node = HeadlineNode.builder().tableOfContents(context.tableOfContents()).build();
} else {
node = ParagraphNode.builder().tableOfContents(context.tableOfContents()).build();
node = Paragraph.builder().documentTree(context.getDocumentTree()).build();
}
page.getMainBody().add(node);
List<ClassificationTextBlock> textBlocks = new LinkedList<>(textBlocksToMerge);
List<TextPageBlock> textBlocks = new ArrayList<>(textBlocksToMerge);
textBlocks.add(originalTextBlock);
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(textBlocks), node, context, page);
if (node instanceof HeadlineNode headlineNode) {
List<Integer> tocId = context.tableOfContents.createNewChildEntryAndReturnId(parentNode.getTocId(), NodeType.HEADLINE, node);
headlineNode.setTerminalTextBlock(textBlock);
headlineNode.setTocId(tocId);
}
if (node instanceof ParagraphNode paragraphNode) {
List<Integer> tocId = context.tableOfContents.createNewChildEntryAndReturnId(parentNode.getTocId(), NodeType.PARAGRAPH, node);
paragraphNode.setTerminalTextBlock(textBlock);
paragraphNode.setTocId(tocId);
}
List<Integer> treeId = context.documentTree.createNewChildEntryAndReturnId(parentNode, node);
node.setLeafTextBlock(textBlock);
node.setTreeId(treeId);
}
private void addImage(SectionNode sectionNode, ClassifiedImage image, Context context) {
public void addImage(Section section, ClassifiedImage image, Context context) {
PageNode page = getPage(image.getPage(), context);
ImageNode imageNode = ImageNode.builder()
Rectangle2D position = image.getPosition();
Page page = context.getPage(image.getPage());
Image imageNode = Image.builder()
.id(IdBuilder.buildId(Set.of(page), List.of(position)))
.imageType(image.getImageType())
.position(image.getPosition())
.transparency(image.isHasTransparency())
.position(position)
.transparent(image.isHasTransparency())
.page(page)
.tableOfContents(context.tableOfContents())
.documentTree(context.getDocumentTree())
.build();
page.getMainBody().add(imageNode);
List<Integer> tocId = context.tableOfContents().createNewChildEntryAndReturnId(sectionNode.getTocId(), NodeType.IMAGE, imageNode);
imageNode.setTocId(tocId);
List<Integer> tocId = context.getDocumentTree().createNewChildEntryAndReturnId(section, imageNode);
imageNode.setTreeId(tocId);
}
private void addHeaderAndFooterToEachPage(ClassificationDocument document, Context context) {
Map<Integer, List<ClassificationTextBlock>> headers = document.getHeaders()
Map<Integer, List<TextPageBlock>> headers = document.getHeaders()
.stream()
.map(ClassificationHeader::getTextBlocks)
.flatMap(List::stream)
.collect(groupingBy(AbstractTextContainer::getPage, toList()));
.collect(groupingBy(AbstractPageBlock::getPage, toList()));
Map<Integer, List<ClassificationTextBlock>> footers = document.getFooters()
Map<Integer, List<TextPageBlock>> footers = document.getFooters()
.stream()
.map(ClassificationFooter::getTextBlocks)
.flatMap(List::stream)
.collect(groupingBy(AbstractTextContainer::getPage, toList()));
.collect(groupingBy(AbstractPageBlock::getPage, toList()));
for (int pageIndex = 1; pageIndex <= document.getPages().size(); pageIndex++) {
if (headers.containsKey(pageIndex)) {
@ -303,85 +141,105 @@ public class DocumentGraphFactory {
}
private void addFooter(List<ClassificationTextBlock> textBlocks, Context context) {
private void addFooter(List<TextPageBlock> textBlocks, Context context) {
PageNode page = getPage(textBlocks.get(0).getPage(), context);
FooterNode footer = FooterNode.builder().tableOfContents(context.tableOfContents()).build();
Page page = context.getPage(textBlocks.get(0).getPage());
Footer footer = Footer.builder().documentTree(context.getDocumentTree()).build();
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(textBlocks),
footer,
context,
page);
List<Integer> tocId = context.tableOfContents().createNewMainEntryAndReturnId(NodeType.FOOTER, footer);
footer.setTocId(tocId);
footer.setTerminalTextBlock(textBlock);
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(footer);
footer.setTreeId(tocId);
footer.setLeafTextBlock(textBlock);
page.setFooter(footer);
}
public void addHeader(List<ClassificationTextBlock> textBlocks, Context context) {
public void addHeader(List<TextPageBlock> textBlocks, Context context) {
PageNode page = getPage(textBlocks.get(0).getPage(), context);
HeaderNode header = HeaderNode.builder().tableOfContents(context.tableOfContents()).build();
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(textBlocks),
header,
context,
0,
page);
List<Integer> tocId = context.tableOfContents().createNewMainEntryAndReturnId(NodeType.HEADER, header);
header.setTocId(tocId);
header.setTerminalTextBlock(textBlock);
Page page = context.getPage(textBlocks.get(0).getPage());
Header header = Header.builder().documentTree(context.getDocumentTree()).build();
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(textBlocks), header, 0, page);
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(header);
header.setTreeId(tocId);
header.setLeafTextBlock(textBlock);
page.setHeader(header);
}
private void addEmptyFooter(int pageIndex, Context context) {
PageNode page = getPage(pageIndex, context);
FooterNode footer = FooterNode.builder().tableOfContents(context.tableOfContents()).build();
Page page = context.getPage(pageIndex);
Footer footer = Footer.builder().documentTree(context.getDocumentTree()).build();
AtomicTextBlock textBlock = context.textBlockFactory.emptyTextBlock(footer, context, page);
List<Integer> tocId = context.tableOfContents().createNewMainEntryAndReturnId(NodeType.FOOTER, footer);
footer.setTocId(tocId);
footer.setTerminalTextBlock(textBlock);
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(footer);
footer.setTreeId(tocId);
footer.setLeafTextBlock(textBlock);
page.setFooter(footer);
}
private void addEmptyHeader(int pageIndex, Context context) {
PageNode page = getPage(pageIndex, context);
HeaderNode header = HeaderNode.builder().tableOfContents(context.tableOfContents()).build();
Page page = context.getPage(pageIndex);
Header header = Header.builder().documentTree(context.getDocumentTree()).build();
AtomicTextBlock textBlock = context.textBlockFactory.emptyTextBlock(header, 0, page);
List<Integer> tocId = context.tableOfContents().createNewMainEntryAndReturnId(NodeType.HEADER, header);
header.setTocId(tocId);
header.setTerminalTextBlock(textBlock);
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(header);
header.setTreeId(tocId);
header.setLeafTextBlock(textBlock);
page.setHeader(header);
}
private PageNode buildPage(ClassificationPage p) {
@Getter
@Builder
@AllArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public final class Context {
return PageNode.builder()
.height((int) p.getPageHeight())
.width((int) p.getPageWidth())
.number(p.getPageNumber())
.rotation(p.getRotation())
.mainBody(new LinkedList<>())
.build();
}
DocumentTree documentTree;
Map<Page, Integer> pages;
List<Section> sections;
List<ClassifiedImage> images;
TextBlockFactory textBlockFactory;
private PageNode getPage(int pageIndex, Context context) {
public Context(Document document) {
return context.pages.keySet()
.stream()
.filter(page -> page.getNumber() == pageIndex)
.findFirst()
.orElseThrow(() -> new NoSuchElementException(format("ClassificationPage with number %d not found", pageIndex)));
}
documentTree = new DocumentTree(document);
pages = new HashMap<>();
sections = new LinkedList<>();
images = new LinkedList<>();
textBlockFactory = new TextBlockFactory();
}
record Context(
TableOfContents tableOfContents, Map<PageNode, AtomicInteger> pages, List<SectionNode> sections, List<ClassifiedImage> images, TextBlockFactory textBlockFactory) {
public void buildAndAddPageWithCounter(ClassificationPage classificationPage) {
Page page = Page.fromClassificationPage(classificationPage);
// This counter numbers the TextBlocks per page.
// The initial value is 1 because 0 is reserved for the header.
pages.put(page, 1);
}
public int getAndIncrementTextBlockNumberOnPage(Page page) {
Integer textBlockNumberOnPage = pages.get(page);
pages.merge(page, 1, Integer::sum);
return textBlockNumberOnPage;
}
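// Example (illustrative): right after buildAndAddPageWithCounter the counter for a page is 1,
// so the first body text block gets numberOnPage == 1; addHeader/addEmptyHeader pass 0
// explicitly, keeping 0 reserved for the header.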
public Page getPage(int pageIndex) {
return pages.keySet()
.stream()
.filter(page -> page.getNumber() == pageIndex)
.findFirst()
.orElseThrow(() -> new NoSuchElementException(format("ClassificationPage with number %d not found", pageIndex)));
}
}

View File

@ -1,105 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.factory;
import static java.lang.String.format;
import java.awt.geom.Area;
import java.awt.geom.Rectangle2D;
import java.util.Arrays;
import java.util.List;
import java.util.Set;
import java.util.function.BiConsumer;
import java.util.function.BinaryOperator;
import java.util.function.Function;
import java.util.function.Supplier;
import java.util.stream.Collector;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.AbstractTextContainer;
public class RectangleTransformations {
public static Rectangle2D pad(Rectangle2D rectangle2D, int deltaX, int deltaY) {
return new Rectangle2D.Double(rectangle2D.getMinX() - deltaX, rectangle2D.getMinY() - deltaY, rectangle2D.getWidth() + 2 * deltaX, rectangle2D.getHeight() + 2 * deltaY);
}
public static Rectangle2D bBoxUnionAbstractTextContainer(List<AbstractTextContainer> abstractTextContainers) {
return abstractTextContainers.stream().map(RectangleTransformations::toRectangle2D).collect(new Rectangle2DUnion());
}
public static Rectangle2D rectangleUnion(List<Rectangle2D> rectangle2DList) {
return rectangle2DList.stream().collect(new Rectangle2DUnion());
}
public static Rectangle2D toRectangle2D(AbstractTextContainer abstractTextContainer) {
return new Rectangle2D.Float(abstractTextContainer.getMinX(), abstractTextContainer.getMinY(), abstractTextContainer.getWidth(), abstractTextContainer.getHeight());
}
public static Rectangle2D toRectangle2D(PDRectangle rectangle) {
return new Rectangle2D.Double(rectangle.getLowerLeftX(), rectangle.getLowerLeftY(), rectangle.getWidth(), rectangle.getHeight());
}
public static String toString(Rectangle2D rectangle2D) {
return format("%f,%f,%f,%f", rectangle2D.getX(), rectangle2D.getY(), rectangle2D.getWidth(), rectangle2D.getHeight());
}
public static Rectangle2D parseRectangle2D(String bBox) {
List<Float> floats = Arrays.stream(bBox.split(",")).map(Float::parseFloat).toList();
return new Rectangle2D.Float(floats.get(0), floats.get(1), floats.get(2), floats.get(3));
}
private static class Rectangle2DUnion implements Collector<Rectangle2D, Area, Rectangle2D> {
@Override
public Supplier<Area> supplier() {
return Area::new;
}
@Override
public BiConsumer<Area, Rectangle2D> accumulator() {
return (area, rectangle2D) -> area.add(new Area(rectangle2D));
}
@Override
public BinaryOperator<Area> combiner() {
return (area1, area2) -> {
area1.add(area2);
return area1;
};
}
@Override
public Function<Area, Rectangle2D> finisher() {
return Area::getBounds2D;
}
@Override
public Set<Characteristics> characteristics() {
return Set.of(Characteristics.CONCURRENT, Characteristics.UNORDERED);
}
}
}

View File

@ -1,6 +1,7 @@
package com.knecon.fforesight.service.layoutparser.processor.factory;
import java.awt.geom.Rectangle2D;
import java.util.Collections;
import java.util.List;
import lombok.AccessLevel;
@ -11,10 +12,22 @@ import lombok.experimental.FieldDefaults;
@Builder
@Getter
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class SearchTextWithTextPositionModel {
public class SearchTextWithTextPositionDto {
String searchText;
List<Integer> lineBreaks;
List<Integer> stringCoordsToPositionCoords;
List<Rectangle2D> positions;
public static SearchTextWithTextPositionDto empty() {
return SearchTextWithTextPositionDto.builder()
.searchText("")
.lineBreaks(Collections.emptyList())
.positions(Collections.emptyList())
.stringCoordsToPositionCoords(Collections.emptyList())
.build();
}
}

View File

@ -2,38 +2,35 @@ package com.knecon.fforesight.service.layoutparser.processor.factory;
import java.awt.geom.AffineTransform;
import java.awt.geom.Rectangle2D;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.Objects;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.RedTextPosition;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.TextDirection;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.RedTextPosition;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextDirection;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPositionSequence;
import lombok.experimental.UtilityClass;
@UtilityClass
public class SearchTextWithTextPositionFactory {
public static final int HEIGHT_PADDING = 2;
public final int HEIGHT_PADDING = 2;
// When checking for a hyphen line break, we need to check after a line break whether the last hyphen was fewer than three symbols away.
// We detect a line break either as a "\n" character or when two adjacent symbols' positions differ in y-coordinates by at least one character height.
// If there is a hyphen line break, the hyphen is 1 position before a "\n" or 2 positions before the character with the lower y-coordinate.
// This is why lastHyphenIdx must be initialized to a value < -2; otherwise, if the very first symbol is a "\n", we would detect a hyphen line break that isn't there.
// Integer.MIN_VALUE is a bad choice because of potential overflow during arithmetic operations, so the default is -3 (i.e. -MAX_HYPHEN_LINEBREAK_DISTANCE).
public final int MAX_HYPHEN_LINEBREAK_DISTANCE = 3;
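// Worked example (illustrative): for the glyphs "c","o","-" on one line and "v","e","r" on the
// next line, the line break is detected at "v" with stringIdx == 3 and lastHyphenIdx == 2, so
// stringIdx - lastHyphenIdx == 1 < MAX_HYPHEN_LINEBREAK_DISTANCE; the hyphen is deleted,
// stringIdx falls back to 2, and the merged word in the search text reads "cover".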
public static SearchTextWithTextPositionModel buildSearchTextToTextPositionModel(List<TextPositionSequence> sequences) {
public SearchTextWithTextPositionDto buildSearchTextToTextPositionModel(List<TextPositionSequence> sequences) {
if (sequences.isEmpty() || sequences.stream().allMatch(sequence -> sequence.getTextPositions().isEmpty())) {
return SearchTextWithTextPositionModel.builder()
.searchText("")
.lineBreaks(Collections.emptyList())
.positions(Collections.emptyList())
.stringCoordsToPositionCoords(Collections.emptyList())
.build();
return SearchTextWithTextPositionDto.empty();
}
List<Integer> stringIdxToPositionIdx = new LinkedList<>();
List<Integer> lineBreaksStringIdx = new LinkedList<>();
StringBuilder sb = new StringBuilder();
int stringIdx = 0;
int positionIdx = 0;
int lastHyphenIdx = -3;
Context context = new Context();
RedTextPosition currentTextPosition = sequences.get(0).getTextPositions().get(0);
RedTextPosition previousTextPosition = RedTextPosition.builder().unicode(" ").position(currentTextPosition.getPosition()).build();
@ -42,60 +39,78 @@ public class SearchTextWithTextPositionFactory {
for (int i = 0; i < word.getTextPositions().size(); ++i) {
currentTextPosition = word.getTextPositions().get(i);
if (isLineBreak(currentTextPosition, previousTextPosition)) {
if (stringIdx - lastHyphenIdx < 3) {
sb.delete(lastHyphenIdx, sb.length());
stringIdxToPositionIdx = stringIdxToPositionIdx.subList(0, lastHyphenIdx);
stringIdx = lastHyphenIdx;
lastHyphenIdx = -3;
}
lineBreaksStringIdx.add(stringIdx);
removeHyphenLinebreaks(context);
context.lineBreaksStringIdx.add(context.stringIdx);
}
if (!isRepeatedWhitespace(currentTextPosition.getUnicode(), previousTextPosition.getUnicode())) {
if (isHyphen(currentTextPosition.getUnicode())) {
lastHyphenIdx = stringIdx;
context.lastHyphenIdx = context.stringIdx;
}
sb.append(currentTextPosition.getUnicode());
stringIdxToPositionIdx.add(positionIdx);
++stringIdx;
appendCurrentTextPosition(context, currentTextPosition);
}
previousTextPosition = currentTextPosition;
++positionIdx;
++context.positionIdx;
}
previousTextPosition = RedTextPosition.builder().unicode(" ").position(previousTextPosition.getPosition()).build();
sb.append(previousTextPosition.getUnicode());
stringIdxToPositionIdx.add(positionIdx);
++stringIdx;
context.stringBuilder.append(" ");
context.stringIdxToPositionIdx.add(context.positionIdx);
++context.stringIdx;
}
assert sb.length() == stringIdxToPositionIdx.size();
assert context.stringBuilder.length() == context.stringIdxToPositionIdx.size();
List<Rectangle2D> positions = sequences.stream()
.flatMap(sequence -> sequence.getTextPositions().stream().map(textPosition -> mapRedTextPositionToInitialUserSpace(textPosition, sequence)))
.toList();
return SearchTextWithTextPositionModel.builder()
.searchText(sb.toString())
.lineBreaks(lineBreaksStringIdx)
.stringCoordsToPositionCoords(stringIdxToPositionIdx)
return SearchTextWithTextPositionDto.builder()
.searchText(context.stringBuilder.toString())
.lineBreaks(context.lineBreaksStringIdx)
.stringCoordsToPositionCoords(context.stringIdxToPositionIdx)
.positions(positions)
.build();
}
private static boolean isLineBreak(RedTextPosition currentTextPosition, RedTextPosition previousTextPosition) {
private void appendCurrentTextPosition(Context context, RedTextPosition currentTextPosition) {
context.stringBuilder.append(currentTextPosition.getUnicode());
// Unicode characters outside the Basic Multilingual Plane are stored as surrogate pairs and therefore have length > 1 in Java strings.
for (int j = 0; j < currentTextPosition.getUnicode().length(); j++) {
context.stringIdxToPositionIdx.add(context.positionIdx);
}
context.stringIdx += currentTextPosition.getUnicode().length();
}
private void removeHyphenLinebreaks(Context context) {
if (lastHyphenDirectlyBeforeLineBreak(context)) {
context.stringBuilder.delete(context.lastHyphenIdx, context.stringBuilder.length());
context.stringIdxToPositionIdx = context.stringIdxToPositionIdx.subList(0, context.lastHyphenIdx);
context.stringIdx = context.lastHyphenIdx;
context.lastHyphenIdx = -MAX_HYPHEN_LINEBREAK_DISTANCE;
}
}
private boolean lastHyphenDirectlyBeforeLineBreak(Context context) {
return context.stringIdx - context.lastHyphenIdx < MAX_HYPHEN_LINEBREAK_DISTANCE;
}
private boolean isLineBreak(RedTextPosition currentTextPosition, RedTextPosition previousTextPosition) {
return Objects.equals(currentTextPosition.getUnicode(), "\n") || isDeltaYLargerThanTextHeight(currentTextPosition, previousTextPosition);
}
private static boolean isDeltaYLargerThanTextHeight(RedTextPosition currentPosition, RedTextPosition previousPosition) {
private boolean isDeltaYLargerThanTextHeight(RedTextPosition currentPosition, RedTextPosition previousPosition) {
if (previousPosition == null) {
return false;
@ -106,13 +121,13 @@ public class SearchTextWithTextPositionFactory {
}
private static boolean isRepeatedWhitespace(String currentUnicode, String previousUnicode) {
private boolean isRepeatedWhitespace(String currentUnicode, String previousUnicode) {
return Objects.equals(previousUnicode, " ") && Objects.equals(currentUnicode, " ");
}
private static boolean isHyphen(String unicodeCharacter) {
private boolean isHyphen(String unicodeCharacter) {
return Objects.equals(unicodeCharacter, "-") || //
Objects.equals(unicodeCharacter, "~") || //
@ -128,7 +143,7 @@ public class SearchTextWithTextPositionFactory {
}
private static Rectangle2D mapRedTextPositionToInitialUserSpace(RedTextPosition textPosition, TextPositionSequence sequence) {
private Rectangle2D mapRedTextPositionToInitialUserSpace(RedTextPosition textPosition, TextPositionSequence sequence) {
float textHeight = sequence.getTextHeight() + HEIGHT_PADDING;
Rectangle2D rectangle2D = new Rectangle2D.Double(textPosition.getXDirAdj(),
@ -153,4 +168,18 @@ public class SearchTextWithTextPositionFactory {
return transform.createTransformedShape(rectangle2D).getBounds2D();
}
private class Context {
List<Integer> stringIdxToPositionIdx = new LinkedList<>();
List<Integer> lineBreaksStringIdx = new LinkedList<>();
StringBuilder stringBuilder = new StringBuilder();
int stringIdx;
int positionIdx;
int lastHyphenIdx = -MAX_HYPHEN_LINEBREAK_DISTANCE;
}
}

View File

@ -0,0 +1,183 @@
package com.knecon.fforesight.service.layoutparser.processor.factory;
import static java.lang.String.format;
import static java.util.Collections.emptyList;
import static java.util.stream.Collectors.groupingBy;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.GenericSemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Page;
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Section;
import com.knecon.fforesight.service.layoutparser.processor.utils.TableMergingUtility;
import lombok.experimental.UtilityClass;
@UtilityClass
public class SectionNodeFactory {
public void addSection(GenericSemanticNode parentNode, List<AbstractPageBlock> pageBlocks, List<ClassifiedImage> images, DocumentGraphFactory.Context context) {
if (pageBlocks.isEmpty()) {
return;
}
Map<Integer, List<AbstractPageBlock>> blocksPerPage = pageBlocks.stream().collect(groupingBy(AbstractPageBlock::getPage));
Section section = Section.builder().documentTree(context.getDocumentTree()).build();
context.getSections().add(section);
blocksPerPage.keySet().forEach(pageNumber -> addSectionNodeToPageNode(context, section, pageNumber));
section.setTreeId(getTreeId(parentNode, context, section));
addFirstHeadlineDirectlyToSection(pageBlocks, context, section);
if (containsTablesAndTextBlocks(pageBlocks)) {
splitPageBlocksIntoSubSections(pageBlocks).forEach(subSectionPageBlocks -> addSection(section, subSectionPageBlocks, emptyList(), context));
} else {
addTablesAndParagraphsAndHeadlinesToSection(pageBlocks, context, section);
}
images.stream().distinct().forEach(image -> DocumentGraphFactory.addImage(section, image, context));
}
private List<Integer> getTreeId(GenericSemanticNode parentNode, DocumentGraphFactory.Context context, Section section) {
if (parentNode == null) {
return context.getDocumentTree().createNewMainEntryAndReturnId(section);
} else {
return context.getDocumentTree().createNewChildEntryAndReturnId(parentNode, section);
}
}
private void addFirstHeadlineDirectlyToSection(List<AbstractPageBlock> pageBlocks, DocumentGraphFactory.Context context, Section section) {
if (pageBlocks.get(0).isHeadline()) {
addTablesAndParagraphsAndHeadlinesToSection(List.of(pageBlocks.get(0)), context, section);
pageBlocks.remove(0);
}
}
private void addTablesAndParagraphsAndHeadlinesToSection(List<AbstractPageBlock> pageBlocks, DocumentGraphFactory.Context context, Section section) {
Set<AbstractPageBlock> alreadyMerged = new HashSet<>();
List<AbstractPageBlock> remainingBlocks = new LinkedList<>(pageBlocks);
for (AbstractPageBlock abstractPageBlock : pageBlocks) {
if (alreadyMerged.contains(abstractPageBlock)) {
continue;
}
remainingBlocks.removeAll(alreadyMerged);
if (abstractPageBlock instanceof TextPageBlock) {
List<TextPageBlock> textBlocks = findTextBlocksWithSameClassificationAndAlignsY(abstractPageBlock, remainingBlocks);
alreadyMerged.addAll(textBlocks);
DocumentGraphFactory.addParagraphOrHeadline(section, (TextPageBlock) abstractPageBlock, context, textBlocks);
} else if (abstractPageBlock instanceof TablePageBlock tablePageBlock) {
List<TablePageBlock> tablesToMerge = TableMergingUtility.findConsecutiveTablesWithSameColCountAndSameHeaders(tablePageBlock, remainingBlocks);
alreadyMerged.addAll(tablesToMerge);
TableNodeFactory.addTable(section, tablesToMerge, context);
} else {
throw new RuntimeException(format("Unhandled AbstractPageBlockType %s!", abstractPageBlock.getClass()));
}
}
}
private boolean containsTablesAndTextBlocks(List<AbstractPageBlock> pageBlocks) {
return pageBlocks.stream().anyMatch(pageBlock -> pageBlock instanceof TablePageBlock) && pageBlocks.stream().anyMatch(pageBlock -> pageBlock instanceof TextPageBlock);
}
/**
* Splits the list of PageBlocks around TablePageBlocks, so that sub-sections can be created that don't include tables.
* This is needed so we can execute rules on sections that do not contain tables.
* See: <a href="https://knecon.atlassian.net/wiki/spaces/RED/pages/14765218/Document+Structure">document structure wiki</a>
*
* @param pageBlocks a List of AbstractPageBlocks that contains at least one TablePageBlock and one TextPageBlock
* @return a List of Lists of AbstractPageBlocks, each containing either TablePageBlocks (optionally preceded by a single headline TextPageBlock) or only TextPageBlocks
*/
private List<List<AbstractPageBlock>> splitPageBlocksIntoSubSections(List<AbstractPageBlock> pageBlocks) {
List<List<AbstractPageBlock>> splitList = splitIntoCoherentList(pageBlocks);
movePrecedingHeadlineToTableList(splitList);
return splitList.stream().filter(list -> !list.isEmpty()).toList();
}
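// Worked example (illustrative): for pageBlocks [Paragraph, Headline, Table, Paragraph] the
// coherent split yields [[Paragraph, Headline], [Table], [Paragraph]]; the headline preceding
// the table is then moved, giving the sub-sections [[Paragraph], [Headline, Table], [Paragraph]].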
private void movePrecedingHeadlineToTableList(List<List<AbstractPageBlock>> splitList) {
for (int i = 0; i < splitList.size(); i++) {
if (listIsTablesOnly(splitList.get(i)) && i > 0) {
List<AbstractPageBlock> previousList = splitList.get(i - 1);
AbstractPageBlock lastPageBlockInPreviousList = previousList.get(previousList.size() - 1);
if (lastPageBlockInPreviousList.isHeadline()) {
previousList.remove(previousList.size() - 1);
splitList.get(i).add(0, lastPageBlockInPreviousList);
}
}
}
}
private boolean listIsTablesOnly(List<AbstractPageBlock> abstractPageBlocks) {
return abstractPageBlocks.stream().allMatch(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock);
}
/**
* @param pageBlocks a List of AbstractPageBlocks that contains at least one TablePageBlock and one TextPageBlock
* @return a List of Lists of AbstractPageBlocks in which each sub-list is exclusively of type TextPageBlock or TablePageBlock
*/
private List<List<AbstractPageBlock>> splitIntoCoherentList(List<AbstractPageBlock> pageBlocks) {
List<List<AbstractPageBlock>> splitList = new LinkedList<>();
List<AbstractPageBlock> currentList = new LinkedList<>();
splitList.add(currentList);
Class<? extends AbstractPageBlock> lastPageBlockClass = pageBlocks.get(0).getClass();
for (AbstractPageBlock pageBlock : pageBlocks) {
if (lastPageBlockClass.isInstance(pageBlock)) {
currentList.add(pageBlock);
} else {
currentList = new LinkedList<>();
currentList.add(pageBlock);
splitList.add(currentList);
lastPageBlockClass = pageBlock.getClass();
}
}
return splitList;
}
private List<TextPageBlock> findTextBlocksWithSameClassificationAndAlignsY(AbstractPageBlock atc, List<AbstractPageBlock> pageBlocks) {
return pageBlocks.stream()
.filter(abstractTextContainer -> !abstractTextContainer.equals(atc))
.filter(abstractTextContainer -> abstractTextContainer.getPage() == atc.getPage())
.filter(abstractTextContainer -> abstractTextContainer instanceof TextPageBlock)
.filter(abstractTextContainer -> abstractTextContainer.intersectsY(atc))
.map(abstractTextContainer -> (TextPageBlock) abstractTextContainer)
.toList();
}
private void addSectionNodeToPageNode(DocumentGraphFactory.Context context, Section section, Integer pageNumber) {
Page page = context.getPage(pageNumber);
page.getMainBody().add(section);
}
}

View File

@ -0,0 +1,136 @@
package com.knecon.fforesight.service.layoutparser.processor.factory;
import static java.util.Collections.emptyList;
import java.util.Collection;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.GenericSemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Page;
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.SemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Table;
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.TableCell;
import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionOperations;
import lombok.experimental.UtilityClass;
@UtilityClass
public class TableNodeFactory {
public final double TABLE_CELL_MERGE_CONTENTS_SIZE_THRESHOLD = 0.05;
public void addTable(GenericSemanticNode parentNode, List<TablePageBlock> tablesToMerge, DocumentGraphFactory.Context context) {
setPageNumberInCells(tablesToMerge);
Set<Page> pages = tablesToMerge.stream().map(AbstractPageBlock::getPage).map(context::getPage).collect(Collectors.toSet());
List<List<Cell>> mergedRows = tablesToMerge.stream().map(TablePageBlock::getRows).flatMap(Collection::stream).toList();
Table table = Table.builder().documentTree(context.getDocumentTree()).numberOfCols(mergedRows.get(0).size()).numberOfRows(mergedRows.size()).build();
pages.forEach(page -> addTableToPage(page, parentNode, table));
List<Integer> treeId = context.getDocumentTree().createNewChildEntryAndReturnId(parentNode, table);
table.setTreeId(treeId);
addTableCells(mergedRows, table, context);
ifTableHasNoHeadersSetFirstRowAsHeaders(table);
}
private void setPageNumberInCells(List<TablePageBlock> tablesToMerge) {
// In some table cells the contained text blocks still have 0 as their page number (root cause not yet identified).
// We fix it here, but it should really be fixed upstream.
tablesToMerge.forEach(table -> table.getRows()
.stream()
.flatMap(Collection::stream)
.peek(cell -> cell.setPageNumber(table.getPage()))
.forEach(cell -> setPageNumberInTextBlocksWithPageNumberSetTo0(table, cell)));
}
private void setPageNumberInTextBlocksWithPageNumberSetTo0(TablePageBlock table, Cell cell) {
cell.getTextBlocks().stream()//
.filter(tb -> tb.getPage() == 0)//
.forEach(tb -> tb.setPage(table.getPage()));
}
@SuppressWarnings("PMD.UnusedPrivateMethod") // PMD reports a false positive here
private void addTableToPage(Page page, SemanticNode parentNode, Table table) {
if (!page.getMainBody().contains(parentNode)) {
parentNode.getPages().add(page);
}
page.getMainBody().add(table);
}
private void ifTableHasNoHeadersSetFirstRowAsHeaders(Table table) {
if (table.streamHeaders().findAny().isEmpty()) {
table.streamRow(0).forEach(tableCellNode -> tableCellNode.setHeader(true));
}
}
private void addTableCells(List<List<Cell>> rows, Table table, DocumentGraphFactory.Context context) {
for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) {
for (int colIndex = 0; colIndex < rows.get(rowIndex).size(); colIndex++) {
addTableCell(rows.get(rowIndex).get(colIndex), rowIndex, colIndex, table, context);
}
}
}
@SuppressWarnings("PMD.UnusedPrivateMethod") // PMD reports a false positive here
private void addTableCell(Cell cell, int rowIndex, int colIndex, Table tableNode, DocumentGraphFactory.Context context) {
Page page = context.getPage(cell.getPageNumber());
TableCell tableCell = TableCell.builder().documentTree(context.getDocumentTree()).row(rowIndex).col(colIndex).header(cell.isHeaderCell()).bBox(cell.getBounds2D()).build();
page.getMainBody().add(tableCell);
List<Integer> treeId = context.getDocumentTree().createNewTableChildEntryAndReturnId(tableNode, tableCell);
tableCell.setTreeId(treeId);
TextBlock textBlock;
if (cell.getTextBlocks().isEmpty()) {
tableCell.setLeafTextBlock(context.getTextBlockFactory().emptyTextBlock(tableNode, context, page));
} else if (cell.getTextBlocks().size() == 1) {
textBlock = context.getTextBlockFactory().buildAtomicTextBlock(cell.getTextBlocks().get(0).getSequences(), tableCell, context, page);
tableCell.setLeafTextBlock(textBlock);
} else if (firstTextBlockIsHeadline(cell)) {
SectionNodeFactory.addSection(tableCell, cell.getTextBlocks().stream().map(tb -> (AbstractPageBlock) tb).toList(), emptyList(), context);
} else if (cellAreaIsSmallerThanPageAreaTimesThreshold(cell, page)) {
List<TextPositionSequence> sequences = TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(cell.getTextBlocks());
textBlock = context.getTextBlockFactory().buildAtomicTextBlock(sequences, tableCell, context, page);
tableCell.setLeafTextBlock(textBlock);
} else {
cell.getTextBlocks().forEach(tb -> DocumentGraphFactory.addParagraphOrHeadline(tableCell, tb, context, emptyList()));
}
}
private boolean cellAreaIsSmallerThanPageAreaTimesThreshold(Cell cell, Page page) {
return cell.getArea() < TABLE_CELL_MERGE_CONTENTS_SIZE_THRESHOLD * page.getHeight() * page.getWidth();
}
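// Example (illustrative, assuming page dimensions in PDF points): on a 595 x 842 pt page the
// threshold is 0.05 * 595 * 842 ≈ 25,050 pt², so the contents of cells smaller than that are
// merged into a single AtomicTextBlock.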
private boolean firstTextBlockIsHeadline(Cell cell) {
return cell.getTextBlocks().get(0).isHeadline();
}
}

View File

@ -1,79 +1,53 @@
package com.knecon.fforesight.service.layoutparser.processor.factory;
import java.util.Collections;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.Boundary;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SemanticNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.AtomicTextBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Page;
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.SemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.AtomicTextBlock;
import lombok.AccessLevel;
import lombok.experimental.FieldDefaults;
@FieldDefaults(level = AccessLevel.PRIVATE)
public class TextBlockFactory {
AtomicInteger stringOffset;
AtomicLong textBlockIdx;
int stringOffset;
long textBlockIdx;
public TextBlockFactory() {
public AtomicTextBlock buildAtomicTextBlock(List<TextPositionSequence> sequences, SemanticNode parent, DocumentGraphFactory.Context context, Page page) {
stringOffset = new AtomicInteger();
textBlockIdx = new AtomicLong();
Integer numberOnPage = context.getAndIncrementTextBlockNumberOnPage(page);
return buildAtomicTextBlock(sequences, parent, numberOnPage, page);
}
public AtomicTextBlock buildAtomicTextBlock(List<TextPositionSequence> sequences, SemanticNode parent, DocumentGraphFactory.Context context, PageNode page) {
public AtomicTextBlock buildAtomicTextBlock(List<TextPositionSequence> sequences, SemanticNode parent, Integer numberOnPage, Page page) {
Integer numberOnPage = context.pages().get(page).getAndIncrement();
return buildAtomicTextBlock(sequences, parent, context, numberOnPage, page);
SearchTextWithTextPositionDto searchTextWithTextPositionDto = SearchTextWithTextPositionFactory.buildSearchTextToTextPositionModel(sequences);
int offset = stringOffset;
stringOffset += searchTextWithTextPositionDto.getSearchText().length();
long idx = textBlockIdx;
textBlockIdx++;
return AtomicTextBlock.fromSearchTextWithTextPositionDto(searchTextWithTextPositionDto, parent, offset, idx, numberOnPage, page);
}
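// Example (illustrative): if the first block's search text is "Hello " (length 6), it is built
// with offset 0 and id 0; the next block then starts at string offset 6 and gets id 1, so all
// blocks line up back-to-back in the document-wide search text.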
public AtomicTextBlock buildAtomicTextBlock(List<TextPositionSequence> sequences,
SemanticNode parent,
DocumentGraphFactory.Context context,
Integer numberOnPage,
PageNode page) {
public AtomicTextBlock emptyTextBlock(SemanticNode parent, DocumentGraphFactory.Context context, Page page) {
SearchTextWithTextPositionModel searchTextWithTextPositionModel = SearchTextWithTextPositionFactory.buildSearchTextToTextPositionModel(sequences);
int offset = stringOffset.getAndAdd(searchTextWithTextPositionModel.getSearchText().length());
return AtomicTextBlock.builder()
.id(textBlockIdx.getAndIncrement())
.parent(parent)
.searchText(searchTextWithTextPositionModel.getSearchText())
.numberOnPage(numberOnPage)
.page(page)
.lineBreaks(searchTextWithTextPositionModel.getLineBreaks())
.positions(searchTextWithTextPositionModel.getPositions())
.stringIdxToPositionIdx(searchTextWithTextPositionModel.getStringCoordsToPositionCoords())
.boundary(new Boundary(offset, offset + searchTextWithTextPositionModel.getSearchText().length()))
.build();
long idx = textBlockIdx;
textBlockIdx++;
return AtomicTextBlock.empty(idx, stringOffset, page, context.getAndIncrementTextBlockNumberOnPage(page), parent);
}
public AtomicTextBlock emptyTextBlock(SemanticNode parent, DocumentGraphFactory.Context context, PageNode page) {
public AtomicTextBlock emptyTextBlock(SemanticNode parent, Integer numberOnPage, Page page) {
return emptyTextBlock(parent, context.pages().get(page).getAndIncrement(), page);
}
public AtomicTextBlock emptyTextBlock(SemanticNode parent, Integer numberOnPage, PageNode page) {
return AtomicTextBlock.builder()
.id(textBlockIdx.getAndIncrement())
.boundary(new Boundary(stringOffset.get(), stringOffset.get()))
.searchText("")
.lineBreaks(Collections.emptyList())
.page(page)
.numberOnPage(numberOnPage)
.stringIdxToPositionIdx(Collections.emptyList())
.positions(Collections.emptyList())
.parent(parent)
.build();
long idx = textBlockIdx;
textBlockIdx++;
return AtomicTextBlock.empty(idx, stringOffset, page, numberOnPage, parent);
}
}

View File

@ -1,11 +1,18 @@
package com.knecon.fforesight.service.layoutparser.internal.api.graph;
package com.knecon.fforesight.service.layoutparser.processor.graph;
import static java.lang.String.format;
import java.util.Collection;
import java.util.LinkedList;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock;
import lombok.EqualsAndHashCode;
import lombok.Setter;
@Setter
@EqualsAndHashCode
public class Boundary implements Comparable<Boundary> {
private int start;
@ -15,7 +22,7 @@ public class Boundary implements Comparable<Boundary> {
public Boundary(int start, int end) {
if (start > end) {
throw new IllegalArgumentException(String.format("start: %d > end: %d", start, end));
throw new IllegalArgumentException(format("start: %d > end: %d", start, end));
}
this.start = start;
this.end = end;
@ -55,7 +62,7 @@ public class Boundary implements Comparable<Boundary> {
public boolean contains(int start, int end) {
if (start > end) {
throw new IllegalArgumentException(String.format("start: %d > end: %d", start, end));
throw new IllegalArgumentException(format("start: %d > end: %d", start, end));
}
return this.start <= start && end <= this.end;
}
@ -64,7 +71,7 @@ public class Boundary implements Comparable<Boundary> {
public boolean containedBy(int start, int end) {
if (start > end) {
throw new IllegalArgumentException(String.format("start: %d > end: %d", start, end));
throw new IllegalArgumentException(format("start: %d > end: %d", start, end));
}
return start <= this.start && this.end <= end;
}
@ -78,14 +85,14 @@ public class Boundary implements Comparable<Boundary> {
public boolean intersects(Boundary boundary) {
return contains(boundary.start()) || contains(boundary.end() - 1);
return boundary.start() < this.end && this.start < boundary.end();
}
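// Example (illustrative): for this == [2|4) and boundary == [0|10), the previous check
// (contains(start) || contains(end - 1)) reported no intersection, whereas the half-open
// overlap test above correctly returns true.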
public List<Boundary> split(List<Integer> splitIndices) {
if (splitIndices.stream().anyMatch(idx -> !this.contains(idx))) {
throw new IndexOutOfBoundsException(String.format("%s splitting indices are out of range for %s", splitIndices.stream().filter(idx -> !this.contains(idx)).toList(), this));
throw new IndexOutOfBoundsException(format("%s splitting indices are out of range for %s", splitIndices.stream().filter(idx -> !this.contains(idx)).toList(), this));
}
List<Boundary> splitBoundaries = new LinkedList<>();
int previousIndex = start;
@ -103,7 +110,7 @@ public class Boundary implements Comparable<Boundary> {
}
public static Boundary merge(List<Boundary> boundaries) {
public static Boundary merge(Collection<Boundary> boundaries) {
int minStart = boundaries.stream().mapToInt(Boundary::start).min().orElseThrow(IllegalArgumentException::new);
int maxEnd = boundaries.stream().mapToInt(Boundary::end).max().orElseThrow(IllegalArgumentException::new);
@ -114,7 +121,7 @@ public class Boundary implements Comparable<Boundary> {
@Override
public String toString() {
return String.format("Boundary [%d|%d)", start, end);
return format("Boundary [%d|%d)", start, end);
}
@ -132,17 +139,25 @@ public class Boundary implements Comparable<Boundary> {
}
@Override
public int hashCode() {
/**
* Shrinks the boundary so that textBlock.subSequence(boundary) returns a string without leading or trailing whitespace.
*
* @param textBlock TextBlock to check whitespace against
* @return the trimmed boundary
*/
public Boundary trim(TextBlock textBlock) {
return toString().hashCode();
}
int trimmedStart = this.start;
while (Character.isWhitespace(textBlock.charAt(trimmedStart))) {
trimmedStart++;
}
int trimmedEnd = this.end;
while (Character.isWhitespace(textBlock.charAt(trimmedEnd - 1))) {
trimmedEnd--;
}
@Override
public boolean equals(Object object) {
return hashCode() == object.hashCode();
return new Boundary(trimmedStart, Math.max(trimmedEnd, trimmedStart));
}
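// Example (illustrative, assuming subSequence treats the boundary as half-open): for a text block
// whose characters in [0|6) are "  foo ", trim shrinks the boundary to [2|5), so subSequence
// returns "foo" without the surrounding whitespace.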
}

View File

@ -0,0 +1,217 @@
package com.knecon.fforesight.service.layoutparser.processor.graph;
import static java.lang.String.format;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.stream.Stream;
import com.knecon.fforesight.service.layoutparser.internal.api.data.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.GenericSemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.SemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Table;
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.TableCell;
import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlockCollector;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.Getter;
import lombok.experimental.FieldDefaults;
@Data
@EqualsAndHashCode
public class DocumentTree {
private final Entry root;
public DocumentTree(Document document) {
root = Entry.builder().treeId(Collections.emptyList()).children(new LinkedList<>()).node(document).build();
}
public TextBlock buildTextBlock() {
return allEntriesInOrder().map(Entry::getNode).filter(SemanticNode::isLeaf).map(SemanticNode::getLeafTextBlock).collect(new TextBlockCollector());
}
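// Example (illustrative): for a tree Document -> Section -> [Headline, Paragraph] plus a Footer
// main entry, the leaves are visited in pre-order (headline, paragraph, footer) and their
// AtomicTextBlocks are presumably joined into one document-wide TextBlock by the TextBlockCollector.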
public List<Integer> createNewMainEntryAndReturnId(GenericSemanticNode node) {
return createNewChildEntryAndReturnIdImpl(Collections.emptyList(), node);
}
public List<Integer> createNewChildEntryAndReturnId(GenericSemanticNode parentNode, GenericSemanticNode node) {
return createNewChildEntryAndReturnIdImpl(parentNode.getTreeId(), node);
}
public List<Integer> createNewChildEntryAndReturnId(GenericSemanticNode parentNode, Table node) {
return createNewChildEntryAndReturnIdImpl(parentNode.getTreeId(), node);
}
public List<Integer> createNewTableChildEntryAndReturnId(Table parentTable, TableCell tableCell) {
return createNewChildEntryAndReturnIdImpl(parentTable.getTreeId(), tableCell);
}
@SuppressWarnings("PMD.UnusedPrivateMethod") // PMD reports a false positive here
private List<Integer> createNewChildEntryAndReturnIdImpl(List<Integer> parentId, SemanticNode node) {
if (!entryExists(parentId)) {
throw new IllegalArgumentException(format("parentId %s does not exist!", parentId));
}
Entry parent = getEntryById(parentId);
List<Integer> newId = new LinkedList<>(parentId);
newId.add(parent.children.size());
parent.children.add(Entry.builder().treeId(newId).node(node).build());
return newId;
}
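// Example (illustrative): the root's treeId is the empty list; its first main entry gets [0],
// and a child added under [0] while that entry already has two children gets the id [0, 2].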
private boolean entryExists(List<Integer> treeId) {
if (treeId.isEmpty()) {
return root != null;
}
if (treeId.get(0) < 0 || treeId.get(0) >= root.children.size()) {
return false;
}
Entry entry = root.children.get(treeId.get(0));
for (int id : treeId.subList(1, treeId.size())) {
if (id >= entry.children.size() || 0 > id) {
return false;
}
entry = entry.children.get(id);
}
return true;
}
public Entry getParentEntryById(List<Integer> treeId) {
return getEntryById(getParentId(treeId));
}
public boolean hasParentById(List<Integer> treeId) {
return !treeId.isEmpty();
}
public Stream<SemanticNode> childNodes(List<Integer> treeId) {
return getEntryById(treeId).children.stream().map(Entry::getNode);
}
public Stream<SemanticNode> childNodesOfType(List<Integer> treeId, NodeType nodeType) {
return getEntryById(treeId).children.stream().filter(entry -> entry.node.getType().equals(nodeType)).map(Entry::getNode);
}
private static List<Integer> getParentId(List<Integer> treeId) {
if (treeId.isEmpty()) {
throw new UnsupportedOperationException("Root has no parent!");
}
if (treeId.size() < 2) {
return Collections.emptyList();
}
return treeId.subList(0, treeId.size() - 1);
}
public Entry getEntryById(List<Integer> treeId) {
if (treeId.isEmpty()) {
return root;
}
Entry entry = root.children.get(treeId.get(0));
for (int id : treeId.subList(1, treeId.size())) {
entry = entry.children.get(id);
}
return entry;
}
public Stream<Entry> mainEntries() {
return root.children.stream();
}
public Stream<Entry> allEntriesInOrder() {
return Stream.of(root).flatMap(DocumentTree::flatten);
}
public Stream<Entry> allSubEntriesInOrder(List<Integer> parentId) {
return getEntryById(parentId).children.stream().flatMap(DocumentTree::flatten);
}
@Override
public String toString() {
return String.join("\n", allEntriesInOrder().map(Entry::toString).toList());
}
private static Stream<Entry> flatten(Entry entry) {
return Stream.concat(Stream.of(entry), entry.children.stream().flatMap(DocumentTree::flatten));
}
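    /** Returns the node of the top-level entry on the path to treeId, or the root's node for an empty id. */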
public SemanticNode getHighestParentById(List<Integer> treeId) {
if (treeId.isEmpty()) {
return root.node;
}
return root.children.get(treeId.get(0)).node;
}
@Builder
@Getter
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE, makeFinal = true)
public static class Entry {
List<Integer> treeId;
SemanticNode node;
@Builder.Default
List<Entry> children = new LinkedList<>();
@Override
public String toString() {
return node.toString();
}
public NodeType getType() {
return node.getType();
}
}
}

View File

@ -0,0 +1,8 @@
package com.knecon.fforesight.service.layoutparser.processor.graph.entity;
public enum EntityType {
ENTITY,
RECOMMENDATION,
FALSE_POSITIVE,
FALSE_RECOMMENDATION
}

View File

@ -0,0 +1,228 @@
package com.knecon.fforesight.service.layoutparser.processor.graph.entity;
import java.awt.geom.Rectangle2D;
import java.util.Collection;
import java.util.Comparator;
import java.util.Deque;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Engine;
import com.knecon.fforesight.service.layoutparser.processor.graph.Boundary;
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Page;
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.SemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.utils.IdBuilder;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults;
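/**
 * A single detected entity anchored to a {@link Boundary} in the document text. Tracks the
 * engines that produced it, the semantic nodes and pages it intersects, and the per-page
 * rectangles that back the redaction log entries.
 */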
@Data
@Builder
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
@EqualsAndHashCode(onlyExplicitlyIncluded = true)
public class RedactionEntity {
// initial values
@EqualsAndHashCode.Include
final Boundary boundary;
@EqualsAndHashCode.Include
final String type;
@EqualsAndHashCode.Include
final EntityType entityType;
// empty defaults
boolean redaction;
boolean removed;
boolean ignored;
boolean resized;
boolean skipRemoveEntitiesContainedInLarger;
boolean dictionaryEntry;
boolean dossierDictionaryEntry;
Set<Engine> engines;
Set<RedactionEntity> references;
@Builder.Default
Deque<Integer> matchedRules = new LinkedList<>();
String redactionReason;
String legalBasis;
// inferred on graph insertion
@EqualsAndHashCode.Include
String value;
String textBefore;
String textAfter;
@Builder.Default
Set<Page> pages = new HashSet<>();
List<RedactionPosition> redactionPositionsPerPage;
@Builder.Default
List<SemanticNode> intersectingNodes = new LinkedList<>();
SemanticNode deepestFullyContainingNode;
public static RedactionEntity initialEntityNode(Boundary boundary, String type, EntityType entityType) {
return RedactionEntity.builder().type(type).entityType(entityType).boundary(boundary).engines(new HashSet<>()).references(new HashSet<>()).build();
}
public boolean occursInNodeOfType(Class<? extends SemanticNode> clazz) {
return intersectingNodes.stream().anyMatch(clazz::isInstance);
}
public boolean occursInNode(SemanticNode semanticNode) {
return intersectingNodes.stream().anyMatch(node -> node.equals(semanticNode));
}
public boolean isType(String type) {
return this.type.equals(type);
}
public boolean isAnyType(List<String> types) {
return types.contains(type);
}
public void addIntersectingNode(SemanticNode containingNode) {
intersectingNodes.add(containingNode);
}
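    /** Detaches this entity from every intersecting node and page and marks it as removed and ignored. */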
public void removeFromGraph() {
intersectingNodes.forEach(node -> node.getEntities().remove(this));
pages.forEach(page -> page.getEntities().remove(this));
intersectingNodes = new LinkedList<>();
deepestFullyContainingNode = null;
pages = new HashSet<>();
removed = true;
ignored = true;
}
public void addMatchedRule(int ruleNumber) {
matchedRules.add(ruleNumber);
}
public int getMatchedRule() {
if (matchedRules.isEmpty()) {
return 0;
}
return matchedRules.getLast();
}
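    /**
     * Lazily resolves the boundary to rectangles per line and page via the deepest fully containing
     * node's text block. All positions share one id; positions on pages after the first get the page
     * number appended as a suffix.
     */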
public List<RedactionPosition> getRedactionPositionsPerPage() {
if (redactionPositionsPerPage == null || redactionPositionsPerPage.isEmpty()) {
Map<Page, List<Rectangle2D>> rectanglesPerLinePerPage = deepestFullyContainingNode.getTextBlock().getPositionsPerPage(boundary);
Page firstPage = rectanglesPerLinePerPage.keySet()
.stream()
.min(Comparator.comparingInt(Page::getNumber))
.orElseThrow(() -> new IllegalStateException("No positions found on any page!"));
String id = IdBuilder.buildId(pages, rectanglesPerLinePerPage.values().stream().flatMap(Collection::stream).toList());
redactionPositionsPerPage = rectanglesPerLinePerPage.entrySet().stream().map(entry -> buildRedactionPosition(firstPage, id, entry)).toList();
}
return redactionPositionsPerPage;
}
private static RedactionPosition buildRedactionPosition(Page firstPage, String id, Map.Entry<Page, List<Rectangle2D>> entry) {
if (entry.getKey().equals(firstPage)) {
return new RedactionPosition(id, entry.getKey(), entry.getValue());
} else {
return new RedactionPosition(id + "-" + entry.getKey().getNumber(), entry.getKey(), entry.getValue());
}
}
public boolean containedBy(RedactionEntity redactionEntity) {
return this.boundary.containedBy(redactionEntity.getBoundary());
}
public boolean contains(RedactionEntity redactionEntity) {
return this.boundary.contains(redactionEntity.getBoundary());
}
public boolean intersects(RedactionEntity redactionEntity) {
return this.boundary.intersects(redactionEntity.getBoundary());
}
public void addEngine(Engine engine) {
engines.add(engine);
}
public void addEngines(Set<Engine> engines) {
this.engines.addAll(engines);
}
public void addReference(RedactionEntity reference) {
references.add(reference);
}
public void addReferences(List<RedactionEntity> references) {
this.references.addAll(references);
}
public boolean matchesAnnotationId(String manualRedactionId) {
return getRedactionPositionsPerPage().stream().anyMatch(entityPosition -> entityPosition.getId().equals(manualRedactionId));
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder();
sb.append("Entity[\"");
sb.append(value);
sb.append("\", ");
sb.append(boundary);
sb.append(", pages[");
pages.forEach(page -> {
sb.append(page.getNumber());
sb.append(", ");
});
sb.delete(sb.length() - 2, sb.length());
sb.append("], type = \"");
sb.append(type);
sb.append("\", EntityType.");
sb.append(entityType);
sb.append("]");
return sb.toString();
}
}

View File

@ -0,0 +1,24 @@
package com.knecon.fforesight.service.layoutparser.processor.graph.entity;
import java.awt.geom.Rectangle2D;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Page;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.experimental.FieldDefaults;
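/** The rectangles (one per line) that a {@link RedactionEntity} occupies on a single page. */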
@Data
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class RedactionPosition {
final String id;
Page page;
// Each entry in this list corresponds to one entry in the redaction log;
// a single entity may therefore be represented by multiple redaction log entries.
List<Rectangle2D> rectanglePerLine;
}

View File

@ -0,0 +1,120 @@
package com.knecon.fforesight.service.layoutparser.processor.graph.nodes;
import java.awt.geom.Rectangle2D;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import java.util.NoSuchElementException;
import com.knecon.fforesight.service.layoutparser.internal.api.data.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree;
import com.knecon.fforesight.service.layoutparser.processor.graph.entity.RedactionEntity;
import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlockCollector;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
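/**
 * Root node of the semantic graph: owns the pages and the {@link DocumentTree} and lazily
 * aggregates the text of all leaf nodes into one document-wide {@link TextBlock}.
 */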
@Data
@Builder
@AllArgsConstructor
@NoArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class Document implements GenericSemanticNode {
Set<Page> pages;
DocumentTree documentTree;
Integer numberOfPages;
TextBlock textBlock;
@Builder.Default
Set<RedactionEntity> entities = new HashSet<>();
@Override
public NodeType getType() {
return NodeType.DOCUMENT;
}
public TextBlock getTextBlock() {
if (textBlock == null) {
textBlock = streamTerminalTextBlocksInOrder().collect(new TextBlockCollector());
}
return textBlock;
}
public List<Section> getMainSections() {
return streamChildrenOfType(NodeType.SECTION).map(node -> (Section) node).collect(Collectors.toList());
}
public Stream<TextBlock> streamTerminalTextBlocksInOrder() {
return streamAllNodes().filter(SemanticNode::isLeaf).map(SemanticNode::getLeafTextBlock);
}
@Override
public List<Integer> getTreeId() {
return Collections.emptyList();
}
@Override
public void setTreeId(List<Integer> tocId) {
throw new UnsupportedOperationException("Document is always the root of the TablePageBlock of Contents");
}
@Override
public Headline getHeadline() {
return streamAllSubNodesOfType(NodeType.HEADLINE).map(node -> (Headline) node).findFirst().orElseThrow(() -> new NoSuchElementException("No headlines found in this document!"));
}
private Stream<SemanticNode> streamAllNodes() {
return documentTree.allEntriesInOrder().map(DocumentTree.Entry::getNode);
}
public Stream<Image> streamAllImages() {
return streamAllSubNodesOfType(NodeType.IMAGE).map(node -> (Image) node);
}
@Override
public String toString() {
return NodeType.DOCUMENT + ": " + this.getTextBlock().buildSummary();
}
@Override
public Map<Page, Rectangle2D> getBBox() {
Map<Page, Rectangle2D> bBox = new HashMap<>();
for (Page page : pages) {
bBox.put(page, new Rectangle2D.Double(0, 0, page.getWidth(), page.getHeight()));
}
return bBox;
}
}

View File

@ -0,0 +1,65 @@
package com.knecon.fforesight.service.layoutparser.processor.graph.nodes;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import com.knecon.fforesight.service.layoutparser.internal.api.data.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree;
import com.knecon.fforesight.service.layoutparser.processor.graph.entity.RedactionEntity;
import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
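/** Leaf node representing a page footer; its text is held directly in {@code leafTextBlock}. */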
@Data
@Builder
@AllArgsConstructor
@NoArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class Footer implements GenericSemanticNode {
List<Integer> treeId;
TextBlock leafTextBlock;
@EqualsAndHashCode.Exclude
DocumentTree documentTree;
@Builder.Default
@EqualsAndHashCode.Exclude
Set<RedactionEntity> entities = new HashSet<>();
@Override
public NodeType getType() {
return NodeType.FOOTER;
}
@Override
public boolean isLeaf() {
return true;
}
@Override
public TextBlock getTextBlock() {
return leafTextBlock;
}
@Override
public String toString() {
return treeId + ": " + NodeType.FOOTER + ": " + leafTextBlock.buildSummary();
}
}

Some files were not shown because too many files have changed in this diff.