RED-6009: Document Tree Structure

* Moved all layout-parsing code to a separate project.
* WIP (some dependency issues remain).
This commit is contained in:
Kilian Schuettler 2023-04-12 11:06:28 +02:00
parent ae941e0982
commit aac0259caf
199 changed files with 14707 additions and 0 deletions

33
.gitignore vendored Normal file
View File

@ -0,0 +1,33 @@
HELP.md
target/
!.mvn/wrapper/maven-wrapper.jar
!**/src/main/**/target/
!**/src/test/**/target/
### STS ###
.apt_generated
.classpath
.factorypath
.project
.settings
.springBeans
.sts4-cache
### IntelliJ IDEA ###
.idea
*.iws
*.iml
*.ipr
### NetBeans ###
/nbproject/private/
/nbbuild/
/dist/
/nbdist/
/.nb-gradle/
build/
!**/src/main/**/build/
!**/src/test/**/build/
### VS Code ###
.vscode/

BIN
.mvn/wrapper/maven-wrapper.jar vendored Normal file

Binary file not shown.

18
.mvn/wrapper/maven-wrapper.properties vendored Normal file
View File

@ -0,0 +1,18 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
distributionUrl=https://repo.maven.apache.org/maven2/org/apache/maven/apache-maven/3.8.7/apache-maven-3.8.7-bin.zip
wrapperUrl=https://repo.maven.apache.org/maven2/org/apache/maven/wrapper/maven-wrapper/3.1.1/maven-wrapper-3.1.1.jar

View File

@ -0,0 +1,17 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>com.knecon.fforesight</groupId>
<artifactId>layoutparser</artifactId>
<version>1.0.0</version>
</parent>
<artifactId>layoutparser-service-image</artifactId>
<version>1.0.0</version>
</project>

View File

@ -0,0 +1,49 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xmlns="http://maven.apache.org/POM/4.0.0"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>com.knecon.fforesight</groupId>
<artifactId>layoutparser-service</artifactId>
<version>1.0.0</version>
</parent>
<artifactId>layoutparser-service-internal-api</artifactId>
<version>1.0.0</version>
<packaging>pom</packaging>
<dependencies>
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<version>1.18.26</version>
</dependency>
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>31.1-jre</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-maven-plugin</artifactId>
<configuration>
<excludes>
<exclude>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
</exclude>
</excludes>
</configuration>
</plugin>
</plugins>
</build>
</project>

View File

@ -0,0 +1,19 @@
package com.knecon.fforesight.service.layoutparser.internal.api.data;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.experimental.FieldDefaults;
/**
 * Immutable transfer object holding the per-character geometry of one atomic block.
 */
@Data
@Builder
@AllArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class AtomicPositionBlockData {

    // Block identifier; presumably matches AtomicTextBlockData.id — TODO confirm against the producer side.
    Long id;
    // Maps a character index within the block's string to an index into positions.
    // NOTE(review): exact index semantics are not visible in this file — confirm with the writer.
    int[] stringIdxToPositionIdx;
    // One float[] per position record; the layout of each inner array
    // (x/y/width/height?) is not visible here — confirm before relying on it.
    float[][] positions;
}

View File

@ -0,0 +1,23 @@
package com.knecon.fforesight.service.layoutparser.internal.api.data;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.experimental.FieldDefaults;
/**
 * Immutable transfer object for one atomic text block: its span in the
 * document-wide text plus page and ordering metadata.
 */
@Data
@Builder
@AllArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class AtomicTextBlockData {

    Long id;
    // Page the block appears on.
    Long page;
    // Text used for searching; normalization rules are not visible here — confirm with producer.
    String searchText;
    // Ordering of the block on its page (0- or 1-based? — TODO confirm).
    int numberOnPage;
    // Character offsets of this block within the full document text
    // (half-open vs inclusive end is not visible here — confirm).
    int start;
    int end;
    // Character indices of line breaks inside the block — TODO confirm whether
    // these are relative to `start` or absolute document offsets.
    int[] lineBreaks;
}

View File

@ -0,0 +1,20 @@
package com.knecon.fforesight.service.layoutparser.internal.api.data;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.experimental.FieldDefaults;
/**
 * Top-level serializable snapshot of a parsed document: page metadata, the
 * atomic text/position blocks, and the table-of-contents tree.
 */
@Data
@Builder
@AllArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class DocumentData {

    PageData[] pages;
    AtomicTextBlockData[] atomicTextBlocks;
    AtomicPositionBlockData[] atomicPositionBlocks;
    TableOfContentsData tableOfContents;
}

View File

@ -0,0 +1,20 @@
package com.knecon.fforesight.service.layoutparser.internal.api.data;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.experimental.FieldDefaults;
/**
 * Serializable per-page metadata.
 */
@Data
@Builder
@AllArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class PageData {

    int number;
    // Page dimensions; units (points vs pixels) are not specified here — confirm with producer.
    int height;
    int width;
    // Page rotation, presumably in degrees — TODO confirm.
    int rotation;
}

View File

@ -0,0 +1,90 @@
package com.knecon.fforesight.service.layoutparser.internal.api.data;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.stream.Stream;
import javax.management.openmbean.InvalidKeyException;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.NodeType;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
/**
 * Serializable table-of-contents snapshot used by the internal API. Entries form
 * a tree: {@link #entries} holds the roots and each {@link EntryData} carries its
 * own sub-entries.
 */
@Data
@Builder
@AllArgsConstructor
@NoArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class TableOfContentsData {

    List<EntryData> entries;

    /**
     * Resolves the entry addressed by a hierarchical id: the first element
     * indexes the root entries, each further element indexes the previous
     * entry's sub-entries.
     *
     * @param tocId hierarchical identifier, e.g. [1, 0, 2]
     * @return the addressed entry
     * @throws InvalidKeyException if the id is empty or addresses a missing entry
     */
    public EntryData get(List<Integer> tocId) {
        if (tocId.isEmpty()) {
            throw invalidId(tocId);
        }
        EntryData entry = childAt(entries, tocId.get(0), tocId);
        for (int id : tocId.subList(1, tocId.size())) {
            entry = childAt(entry.subEntries(), id, tocId);
        }
        return entry;
    }

    /**
     * @return depth-first stream over all entries (each parent before its children)
     */
    public Stream<EntryData> streamAllEntries() {
        return entries.stream().flatMap(TableOfContentsData::flatten);
    }

    /**
     * One line per entry, in depth-first order.
     */
    @Override
    public String toString() {
        return String.join("\n", streamAllEntries().map(EntryData::toString).toList());
    }

    // Bounds-checked child lookup: reports the complete invalid id instead of
    // leaking a bare IndexOutOfBoundsException to the caller.
    private static EntryData childAt(List<EntryData> candidates, int index, List<Integer> tocId) {
        if (index < 0 || index >= candidates.size()) {
            throw invalidId(tocId);
        }
        return candidates.get(index);
    }

    private static InvalidKeyException invalidId(List<Integer> tocId) {
        return new InvalidKeyException(String.format("ClassificationSection Identifier: \"%s\" is not valid.", tocId));
    }

    // Depth-first flattening of one subtree.
    private static Stream<EntryData> flatten(EntryData entry) {
        return Stream.concat(Stream.of(entry), entry.subEntries().stream().flatMap(TableOfContentsData::flatten));
    }

    /**
     * One table-of-contents node in serialized form; atomicBlocks/pages reference
     * the ids of the corresponding block/page records.
     */
    @Builder
    public record EntryData(NodeType type, int[] tocId, Long[] atomicBlocks, Long[] pages, Map<String, String> properties, List<EntryData> subEntries) {

        /**
         * Short human-readable form: "[1,0,2]: TYPE atbs = n".
         */
        @Override
        public String toString() {
            StringBuilder sb = new StringBuilder("[");
            for (int i = 0; i < tocId.length; i++) {
                if (i > 0) {
                    sb.append(",");
                }
                sb.append(tocId[i]);
            }
            // Fixed: the previous trailing-comma trim deleted the opening bracket
            // when tocId was empty.
            sb.append("]: ").append(type).append(" atbs = ").append(atomicBlocks.length);
            return sb.toString();
        }
    }
}

View File

@ -0,0 +1,148 @@
package com.knecon.fforesight.service.layoutparser.internal.api.graph;
import java.util.LinkedList;
import java.util.List;
import lombok.Setter;
/**
 * Mutable half-open integer interval [start|end) over document text offsets.
 *
 * <p>NOTE(review): instances are mutable via the setters, so they are unsafe as
 * hash-map/set keys while being mutated. {@code compareTo} is intentionally a
 * partial order (returns 0 for overlapping/equal-edge boundaries) and is NOT
 * consistent with {@code equals}.
 */
public class Boundary implements Comparable<Boundary> {

    private int start;
    private int end;

    /**
     * Creates a half-open boundary [start|end).
     *
     * @throws IllegalArgumentException if start > end
     */
    public Boundary(int start, int end) {
        if (start > end) {
            throw new IllegalArgumentException(String.format("start: %d > end: %d", start, end));
        }
        this.start = start;
        this.end = end;
    }

    // Explicit setters replace Lombok's @Setter so the class carries no Lombok
    // dependency; the generated API (no validation, void return) is unchanged.
    public void setStart(int start) {
        this.start = start;
    }

    public void setEnd(int end) {
        this.end = end;
    }

    /** @return number of indices covered; 0 for an empty boundary */
    public int length() {
        return end - start;
    }

    public int start() {
        return start;
    }

    public int end() {
        return end;
    }

    /** @return true when {@code boundary} lies fully inside this boundary */
    public boolean contains(Boundary boundary) {
        return start <= boundary.start() && boundary.end() <= end;
    }

    public boolean containedBy(Boundary boundary) {
        return boundary.contains(this);
    }

    /**
     * @throws IllegalArgumentException if start > end
     */
    public boolean contains(int start, int end) {
        if (start > end) {
            throw new IllegalArgumentException(String.format("start: %d > end: %d", start, end));
        }
        return this.start <= start && end <= this.end;
    }

    /**
     * @throws IllegalArgumentException if start > end
     */
    public boolean containedBy(int start, int end) {
        if (start > end) {
            throw new IllegalArgumentException(String.format("start: %d > end: %d", start, end));
        }
        return start <= this.start && this.end <= end;
    }

    /** Half-open membership: {@code end} itself is NOT contained. */
    public boolean contains(int index) {
        return start <= index && index < end;
    }

    /**
     * True when the two half-open ranges share at least one index.
     *
     * <p>Fixed: the previous check {@code contains(b.start()) || contains(b.end()-1)}
     * returned false when {@code boundary} strictly contained this boundary
     * (e.g. [5|6) vs [0|10)). Empty boundaries never intersect anything.
     */
    public boolean intersects(Boundary boundary) {
        return start < boundary.end() && boundary.start() < end;
    }

    /**
     * Splits this boundary at the given (ascending) indices; splits that would
     * produce an empty piece are skipped.
     *
     * @throws IndexOutOfBoundsException if any index falls outside this boundary
     */
    public List<Boundary> split(List<Integer> splitIndices) {
        if (splitIndices.stream().anyMatch(idx -> !this.contains(idx))) {
            throw new IndexOutOfBoundsException(String.format("%s splitting indices are out of range for %s", splitIndices.stream().filter(idx -> !this.contains(idx)).toList(), this));
        }
        List<Boundary> splitBoundaries = new LinkedList<>();
        int previousIndex = start;
        for (int splitIndex : splitIndices) {
            // skip split if it would produce a boundary of length 0
            if (splitIndex == previousIndex) {
                continue;
            }
            splitBoundaries.add(new Boundary(previousIndex, splitIndex));
            previousIndex = splitIndex;
        }
        splitBoundaries.add(new Boundary(previousIndex, end));
        return splitBoundaries;
    }

    /**
     * Smallest boundary covering every given boundary (gaps included).
     *
     * @throws IllegalArgumentException when the list is empty
     */
    public static Boundary merge(List<Boundary> boundaries) {
        int minStart = boundaries.stream().mapToInt(Boundary::start).min().orElseThrow(IllegalArgumentException::new);
        int maxEnd = boundaries.stream().mapToInt(Boundary::end).max().orElseThrow(IllegalArgumentException::new);
        return new Boundary(minStart, maxEnd);
    }

    @Override
    public String toString() {
        return String.format("Boundary [%d|%d)", start, end);
    }

    /**
     * Partial order: -1/1 only when both edges are strictly before/after the
     * other's; 0 for any overlap-at-an-edge situation. Not consistent with equals.
     */
    @Override
    public int compareTo(Boundary boundary) {
        if (end < boundary.end() && start < boundary.start()) {
            return -1;
        }
        if (start > boundary.start() && end > boundary.end()) {
            return 1;
        }
        return 0;
    }

    // Derived from toString(), i.e. from (start, end) — consistent with equals below.
    @Override
    public int hashCode() {
        return toString().hashCode();
    }

    /**
     * Value equality on (start, end).
     *
     * <p>Fixed: the previous implementation compared raw hash codes, which threw
     * NullPointerException for null arguments and could equate objects of
     * unrelated types that happened to share a hash.
     */
    @Override
    public boolean equals(Object object) {
        if (this == object) {
            return true;
        }
        if (!(object instanceof Boundary)) {
            return false;
        }
        Boundary other = (Boundary) object;
        return start == other.start && end == other.end;
    }
}

View File

@ -0,0 +1,98 @@
package com.knecon.fforesight.service.layoutparser.internal.api.graph;
import java.awt.geom.Rectangle2D;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SectionNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SemanticNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlockCollector;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.experimental.FieldDefaults;
/**
 * Root node of the semantic document graph: owns the pages and the
 * {@link TableOfContents} and acts as the root {@link SemanticNode}
 * (its tocId is the empty list).
 */
@Data
@Builder
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class DocumentGraph implements SemanticNode {

    Set<PageNode> pages;
    TableOfContents tableOfContents;
    Integer numberOfPages;
    // Full-document text block; assigned externally, not computed lazily here.
    TextBlock textBlock;

    /**
     * Concatenates the terminal text blocks of all nodes, in TOC order.
     */
    public TextBlock buildTextBlock() {
        return streamTerminalTextBlocksInOrder().collect(new TextBlockCollector());
    }

    /**
     * Root-level sections only — reads the TOC's direct entries
     * (package-level field access), not the whole tree.
     */
    public List<SectionNode> getMainSections() {
        return tableOfContents.entries.stream().filter(entry -> entry.node() instanceof SectionNode).map(entry -> (SectionNode) entry.node()).collect(Collectors.toList());
    }

    /**
     * Terminal text blocks of every terminal node, in depth-first TOC order.
     */
    public Stream<TextBlock> streamTerminalTextBlocksInOrder() {
        return streamAllNodes().filter(SemanticNode::isTerminal).map(SemanticNode::getTerminalTextBlock);
    }

    /**
     * Union of the entity sets of every node in the graph (unmodifiable).
     */
    public Set<EntityNode> getEntities() {
        return streamAllNodes().map(SemanticNode::getEntities).flatMap(Set::stream).collect(Collectors.toUnmodifiableSet());
    }

    /**
     * The graph is the TOC root, so its id is empty.
     */
    @Override
    public List<Integer> getTocId() {
        return Collections.emptyList();
    }

    @Override
    public void setTocId(List<Integer> tocId) {
        throw new UnsupportedOperationException("DocumentGraph is always the root of the Table of Contents");
    }

    // Every node in the document, in depth-first TOC order.
    private Stream<SemanticNode> streamAllNodes() {
        return tableOfContents.streamEntriesInOrder().map(TableOfContents.Entry::node);
    }

    @Override
    public String toString() {
        return tableOfContents.toString();
    }

    /**
     * Bounding box per page: the graph covers each page entirely.
     */
    @Override
    public Map<PageNode, Rectangle2D> getBBox() {
        Map<PageNode, Rectangle2D> bBox = new HashMap<>();
        for (PageNode page : pages) {
            bBox.put(page, new Rectangle2D.Double(0, 0, page.getWidth(), page.getHeight()));
        }
        return bBox;
    }
}

View File

@ -0,0 +1,164 @@
package com.knecon.fforesight.service.layoutparser.internal.api.graph;
import java.nio.charset.StandardCharsets;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.stream.Stream;
import com.google.common.hash.Hashing;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.NodeType;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SemanticNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlockCollector;
import lombok.Builder;
import lombok.Data;
/**
 * Mutable table of contents of a document graph. Entries form a tree addressed
 * by hierarchical ids (List&lt;Integer&gt;): the first element indexes the root
 * entries, each further element indexes the previous entry's children.
 */
@Data
public class TableOfContents {

    List<Entry> entries;

    public TableOfContents() {
        entries = new LinkedList<>();
    }

    /**
     * Concatenates the terminal text blocks of all terminal nodes, in
     * depth-first order.
     */
    public TextBlock buildTextBlock() {
        return streamEntriesInOrder().map(Entry::node).filter(SemanticNode::isTerminal).map(SemanticNode::getTerminalTextBlock).collect(new TextBlockCollector());
    }

    /**
     * Appends a new root-level entry.
     *
     * @return the id assigned to the new entry
     */
    public List<Integer> createNewEntryAndReturnId(NodeType nodeType, SemanticNode node) {
        return createNewChildEntryAndReturnId(Collections.emptyList(), nodeType, node);
    }

    /**
     * Appends a new entry under {@code parentId}; if no such parent exists the
     * entry is appended at root level instead.
     *
     * @return the id assigned to the new entry
     */
    public List<Integer> createNewChildEntryAndReturnId(List<Integer> parentId, NodeType nodeType, SemanticNode node) {
        List<Integer> newId;
        if (entryExists(parentId)) {
            Entry parent = getEntryById(parentId);
            newId = new LinkedList<>(parentId);
            newId.add(parent.children().size());
            parent.children().add(Entry.builder().tocId(newId).node(node).type(nodeType).children(new LinkedList<>()).build());
        } else {
            newId = List.of(entries.size());
            entries.add(Entry.builder().tocId(newId).node(node).type(nodeType).children(new LinkedList<>()).build());
        }
        return newId;
    }

    // True when tocId addresses an existing entry; never throws for bad ids.
    private boolean entryExists(List<Integer> tocId) {
        if (tocId.isEmpty()) {
            return false;
        }
        // Fixed: the root index was previously dereferenced unchecked, so an
        // out-of-range first element threw IndexOutOfBoundsException instead of
        // returning false (breaking the root-fallback in createNewChildEntryAndReturnId).
        int rootIndex = tocId.get(0);
        if (rootIndex < 0 || rootIndex >= entries.size()) {
            return false;
        }
        Entry entry = entries.get(rootIndex);
        for (int id : tocId.subList(1, tocId.size())) {
            if (id < 0 || id >= entry.children().size()) {
                return false;
            }
            entry = entry.children().get(id);
        }
        return true;
    }

    /**
     * @throws NoSuchElementException when the entry is root-level (no parent)
     */
    public Entry getParentEntryById(List<Integer> tocId) {
        List<Integer> parentIds = getParentId(tocId);
        if (parentIds.isEmpty()) {
            throw new NoSuchElementException(String.format("Node with tocId \"%s\" has no parent!", tocId));
        }
        return getEntryById(parentIds);
    }

    public boolean hasParentById(List<Integer> tocId) {
        List<Integer> parentId = getParentId(tocId);
        return entryExists(parentId);
    }

    public Stream<SemanticNode> streamChildren(List<Integer> tocId) {
        return getEntryById(tocId).children().stream().map(Entry::node);
    }

    // The parent id is the id minus its last element (a subList view).
    private static List<Integer> getParentId(List<Integer> tocId) {
        return tocId.subList(0, tocId.size() - 1);
    }

    /**
     * Resolves an entry by id. Unlike {@link #entryExists}, this throws
     * IndexOutOfBoundsException for ids that address missing entries.
     */
    public Entry getEntryById(List<Integer> tocId) {
        Entry entry = entries.get(tocId.get(0));
        for (int id : tocId.subList(1, tocId.size())) {
            entry = entry.children().get(id);
        }
        return entry;
    }

    /**
     * @return depth-first stream over all entries (each parent before its children)
     */
    public Stream<Entry> streamEntriesInOrder() {
        return entries.stream().flatMap(TableOfContents::flatten);
    }

    /**
     * @return depth-first stream over the subtree rooted at {@code parentId}
     */
    public Stream<Entry> streamSubEntriesInOrder(List<Integer> parentId) {
        return Stream.of(getEntryById(parentId)).flatMap(TableOfContents::flatten);
    }

    @Override
    public String toString() {
        return String.join("\n", streamEntriesInOrder().map(Entry::toString).toList());
    }

    public String toString(List<Integer> id) {
        return String.join("\n", streamSubEntriesInOrder(id).map(Entry::toString).toList());
    }

    // Depth-first flattening of one subtree.
    private static Stream<Entry> flatten(Entry entry) {
        return Stream.concat(Stream.of(entry), entry.children().stream().flatMap(TableOfContents::flatten));
    }

    /**
     * One TOC node: its id, type, the semantic node it wraps, and its children.
     */
    @Builder
    public record Entry(List<Integer> tocId, NodeType type, SemanticNode node, List<Entry> children) {

        @Override
        public String toString() {
            return node().toString();
        }

        /**
         * Murmur3 hash of the wrapped node's toString(). The record-generated
         * equals is kept: component-equal entries share a node and therefore a
         * toString, so equal entries hash equally — presumably the intent; the
         * asymmetry (custom hashCode, default equals) is worth confirming.
         */
        @Override
        public int hashCode() {
            return Hashing.murmur3_32_fixed().hashString(toString(), StandardCharsets.UTF_8).hashCode();
        }
    }
}

View File

@ -0,0 +1,76 @@
package com.knecon.fforesight.service.layoutparser.internal.api.graph.entity;
import java.util.Collections;
import java.util.List;
import java.util.Set;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.Boundary;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SemanticNode;
/**
 * An entity detected in the document, defined by a character {@link Boundary}
 * and linked into the semantic graph via the nodes it intersects.
 */
public interface EntityNode {
    /**
     * This represents the text, which is contained within the boundary of the Entity.
     *
     * @return String
     */
    String getValue();
    /**
     * The Boundary primarily defines the Entity, all other values may be inferred from it.
     *
     * @return Boundary, uniquely identifying this Entity
     */
    Boundary getBoundary();
    /**
     * The deepest fully containing node represents the node which is the deepest node in the document tree structure,
     * whose boundary also fully contains the boundary of this entity
     *
     * @return the deepest fully containing node
     */
    SemanticNode getDeepestFullyContainingNode();
    /**
     * The intersecting nodes represent all nodes, whose boundary intersects the boundary of this entity.
     *
     * @return all intersecting Nodes
     */
    List<SemanticNode> getIntersectingNodes();
    // Mutators below are used while wiring the entity into the graph
    // (and by removeFromGraph() to reset the links).
    void setDeepestFullyContainingNode(SemanticNode semanticNode);
    void addIntersectingNode(SemanticNode semanticNode);
    void setIntersectingNodes(List<SemanticNode> semanticNodes);
    /**
     * @return all pages this entity intersects.
     */
    Set<PageNode> getPages();
    void setPages(Set<PageNode> pages);
    /**
     * removes all occurrences of this node in the graph and resets all graph specific fields
     */
    default void removeFromGraph() {
        // Drop the back-references held by intersecting nodes and pages first,
        // then clear this entity's own graph links.
        getIntersectingNodes().forEach(node -> node.getEntities().remove(this));
        getPages().forEach(page -> page.getEntities().remove(this));
        setPages(Collections.emptySet());
        setDeepestFullyContainingNode(null);
        setIntersectingNodes(Collections.emptyList());
    }
}

View File

@ -0,0 +1,39 @@
package com.knecon.fforesight.service.layoutparser.internal.api.graph.entity;
import java.awt.geom.Rectangle2D;
import java.nio.charset.StandardCharsets;
import java.util.List;
import com.google.common.hash.Hashing;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode;
import lombok.AccessLevel;
import lombok.Builder;
import lombok.Data;
import lombok.experimental.FieldDefaults;
/**
 * Location of one entity on a single page, as one rectangle per text line.
 */
@Data
@Builder
@FieldDefaults(level = AccessLevel.PRIVATE)
public class EntityPosition {

    PageNode pageNode;
    List<Rectangle2D> rectanglePerLine;

    /**
     * Stable string id derived from {@link #hashCode()}.
     */
    public String getId() {
        return String.valueOf(hashCode());
    }

    /**
     * Murmur3 hash over the page number and each rectangle's geometry.
     * NOTE(review): hashCode is hand-written while equals comes from Lombok's
     * {@code @Data}; equal instances build the same string and thus the same
     * hash, so the equals/hashCode contract still holds.
     */
    @Override
    public int hashCode() {
        StringBuilder sb = new StringBuilder();
        sb.append(pageNode.getNumber());
        rectanglePerLine.forEach(r -> sb.append(r.getX()).append(r.getY()).append(r.getWidth()).append(r.getHeight()));
        return Hashing.murmur3_128().hashString(sb.toString(), StandardCharsets.UTF_8).hashCode();
    }
}

View File

@ -0,0 +1,53 @@
package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
/**
 * Terminal node for a page footer: holds its text directly in
 * {@link #terminalTextBlock} instead of aggregating children.
 */
@Data
@Builder
@AllArgsConstructor
@NoArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class FooterNode implements SemanticNode {

    // Position of this node in the TableOfContents.
    List<Integer> tocId;
    TextBlock terminalTextBlock;
    // Footers are terminal by default (direct access to their text block).
    @Builder.Default
    boolean terminal = true;
    // Back-reference to the owning TOC; excluded from equality to avoid cycles.
    @EqualsAndHashCode.Exclude
    TableOfContents tableOfContents;
    // Entities whose boundary intersects this node; excluded from equality.
    @Builder.Default
    @EqualsAndHashCode.Exclude
    Set<EntityNode> entities = new HashSet<>();

    /**
     * Terminal node: returns the stored block, nothing is aggregated.
     */
    @Override
    public TextBlock buildTextBlock() {
        return terminalTextBlock;
    }

    @Override
    public String toString() {
        return tocId + ": " + NodeType.FOOTER + ": " + terminalTextBlock.buildSummary();
    }
}

View File

@ -0,0 +1,53 @@
package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
/**
 * Terminal node for a page header: holds its text directly in
 * {@link #terminalTextBlock} instead of aggregating children.
 */
@Data
@Builder
@AllArgsConstructor
@NoArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class HeaderNode implements SemanticNode {

    // Position of this node in the TableOfContents.
    List<Integer> tocId;
    TextBlock terminalTextBlock;
    // Headers are terminal by default (direct access to their text block).
    @Builder.Default
    boolean terminal = true;
    // Back-reference to the owning TOC; excluded from equality to avoid cycles.
    @EqualsAndHashCode.Exclude
    TableOfContents tableOfContents;
    // Entities whose boundary intersects this node; excluded from equality.
    @Builder.Default
    @EqualsAndHashCode.Exclude
    Set<EntityNode> entities = new HashSet<>();

    /**
     * Terminal node: returns the stored block, nothing is aggregated.
     */
    @Override
    public TextBlock buildTextBlock() {
        return terminalTextBlock;
    }

    @Override
    public String toString() {
        return tocId + ": " + NodeType.HEADER + ": " + terminalTextBlock.buildSummary();
    }
}

View File

@ -0,0 +1,60 @@
package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
/**
 * Terminal node for a headline. A headline resolves {@link #getHeadline()} to
 * itself, terminating the upward search defined in SemanticNode.
 */
@Data
@Builder
@AllArgsConstructor
@NoArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class HeadlineNode implements SemanticNode {

    // Position of this node in the TableOfContents.
    List<Integer> tocId;
    TextBlock terminalTextBlock;
    // Headlines are terminal by default (direct access to their text block).
    @Builder.Default
    boolean terminal = true;
    // Back-reference to the owning TOC; excluded from equality to avoid cycles.
    @EqualsAndHashCode.Exclude
    TableOfContents tableOfContents;
    // Entities whose boundary intersects this node; excluded from equality.
    @Builder.Default
    @EqualsAndHashCode.Exclude
    Set<EntityNode> entities = new HashSet<>();

    /**
     * Terminal node: returns the stored block, nothing is aggregated.
     */
    @Override
    public TextBlock buildTextBlock() {
        return terminalTextBlock;
    }

    @Override
    public String toString() {
        return tocId + ": " + NodeType.HEADLINE + ": " + terminalTextBlock.buildSummary();
    }

    /**
     * A headline is its own headline — ends the parent traversal.
     */
    @Override
    public SemanticNode getHeadline() {
        return this;
    }
}

View File

@ -0,0 +1,88 @@
package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;
import java.awt.geom.Rectangle2D;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlockCollector;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
/**
 * Node for an image placed on a single page, including redaction bookkeeping.
 */
@Data
@Builder
@AllArgsConstructor
@NoArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class ImageNode implements SemanticNode {

    // Position of this node in the TableOfContents.
    List<Integer> tocId;
    ImageType imageType;
    // Whether the image carries transparency (alpha) — TODO confirm against producer.
    boolean transparency;
    // Bounding box of the image on its page.
    Rectangle2D position;
    // Redaction state; defaults mean "not redacted, not ignored, no reason".
    @Builder.Default
    boolean redaction = false;
    @Builder.Default
    boolean ignored = false;
    @Builder.Default
    String redactionReason = "";
    @Builder.Default
    String legalBasis = "";
    // Index of the matched redaction rule; -1 means no rule matched.
    @Builder.Default
    int matchedRule = -1;
    // Owning page; excluded from equality to avoid cycles.
    @EqualsAndHashCode.Exclude
    PageNode page;
    @EqualsAndHashCode.Exclude
    TableOfContents tableOfContents;
    // Entities whose boundary intersects this node; excluded from equality.
    @Builder.Default
    @EqualsAndHashCode.Exclude
    Set<EntityNode> entities = new HashSet<>();

    /**
     * Aggregates the terminal text blocks of all sub-nodes
     * (streamAllSubNodes comes from SemanticNode; presumably OCR/table content
     * nested under the image — confirm against the interface).
     */
    @Override
    public TextBlock buildTextBlock() {
        return streamAllSubNodes().filter(SemanticNode::isTerminal).map(SemanticNode::getTerminalTextBlock).collect(new TextBlockCollector());
    }

    /**
     * An image lives on exactly one page.
     */
    @Override
    public Set<PageNode> getPages() {
        return Collections.singleton(page);
    }

    @Override
    public String toString() {
        return tocId + ": " + NodeType.IMAGE + ": " + imageType.toString() + " " + position;
    }

    /**
     * Bounding box map with a single entry: this image's position on its page.
     */
    @Override
    public Map<PageNode, Rectangle2D> getBBox() {
        Map<PageNode, Rectangle2D> bBoxPerPage = new HashMap<>();
        bBoxPerPage.put(page, position);
        return bBoxPerPage;
    }
}

View File

@ -0,0 +1,9 @@
package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;
/**
 * Coarse classification of an image found in a document.
 */
public enum ImageType {
    LOGO,
    FORMULA,
    SIGNATURE,
    OTHER,
    // Presumably marks an image handled by the OCR pipeline — TODO confirm.
    OCR
}

View File

@ -0,0 +1,12 @@
package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;
/**
 * Discriminator for the node kinds that can appear in a TableOfContents entry.
 */
public enum NodeType {
    SECTION,
    HEADLINE,
    PARAGRAPH,
    TABLE,
    TABLE_CELL,
    IMAGE,
    HEADER,
    FOOTER
}

View File

@ -0,0 +1,66 @@
package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlockCollector;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.EqualsAndHashCode;
import lombok.Getter;
import lombok.Setter;
import lombok.experimental.FieldDefaults;
/**
 * One physical page: dimensions, rotation, and the semantic nodes laid out on it.
 */
@Getter
@Setter
@Builder
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class PageNode {

    Integer number;
    Integer height;
    Integer width;
    Integer rotation;
    // NOTE(review): the @EqualsAndHashCode.Exclude annotations below have no
    // effect — the class uses @Getter/@Setter, not @Data or @EqualsAndHashCode,
    // so Lombok generates no equals/hashCode for it.
    @EqualsAndHashCode.Exclude
    List<SemanticNode> mainBody;
    @EqualsAndHashCode.Exclude
    HeaderNode header;
    @EqualsAndHashCode.Exclude
    FooterNode footer;
    @Builder.Default
    @EqualsAndHashCode.Exclude
    Set<EntityNode> entities = new HashSet<>();
    @Builder.Default
    @EqualsAndHashCode.Exclude
    Set<ImageNode> images = new HashSet<>();

    /**
     * Concatenates the terminal text blocks of the page's main-body nodes
     * (header and footer excluded).
     */
    public TextBlock getMainBodyTextBlock() {
        return mainBody.stream().filter(SemanticNode::isTerminal).map(SemanticNode::getTerminalTextBlock).collect(new TextBlockCollector());
    }

    @Override
    public String toString() {
        return String.valueOf(number);
    }

    /**
     * Hash is the page number. NOTE(review): this unboxes {@link #number} and
     * throws NullPointerException when it is null, and equals() is NOT
     * overridden — equality stays identity-based while hashing is value-based.
     * Confirm this is intended before using PageNode as a hash key.
     */
    @Override
    public int hashCode() {
        return number;
    }
}

View File

@ -0,0 +1,51 @@
package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults;
/**
 * Terminal node for a paragraph: holds its text directly in
 * {@link #terminalTextBlock} instead of aggregating children.
 */
@Data
@Builder
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class ParagraphNode implements SemanticNode {

    // Position of this node in the TableOfContents.
    List<Integer> tocId;
    TextBlock terminalTextBlock;
    // Paragraphs are terminal by default (direct access to their text block).
    @Builder.Default
    boolean terminal = true;
    // Back-reference to the owning TOC; excluded from equality to avoid cycles.
    @EqualsAndHashCode.Exclude
    TableOfContents tableOfContents;
    // Entities whose boundary intersects this node; excluded from equality.
    @Builder.Default
    @EqualsAndHashCode.Exclude
    Set<EntityNode> entities = new HashSet<>();

    /**
     * Terminal node: returns the stored block, nothing is aggregated.
     */
    @Override
    public TextBlock buildTextBlock() {
        return terminalTextBlock;
    }

    @Override
    public String toString() {
        return tocId + ": " + NodeType.PARAGRAPH + ": " + terminalTextBlock.buildSummary();
    }
}

View File

@ -0,0 +1,63 @@
package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;
import java.util.HashSet;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.Set;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlockCollector;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults;
/**
 * Non-terminal section node: its text is aggregated lazily from all terminal
 * descendants and cached in {@link #textBlock}.
 */
@Data
@Builder
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class SectionNode implements SemanticNode {

    // Position of this node in the TableOfContents.
    List<Integer> tocId;
    // Lazily-built cache; null until buildTextBlock() is first called.
    TextBlock textBlock;
    // Back-reference to the owning TOC; excluded from equality to avoid cycles.
    @EqualsAndHashCode.Exclude
    TableOfContents tableOfContents;
    // Entities whose boundary intersects this node; excluded from equality.
    @Builder.Default
    @EqualsAndHashCode.Exclude
    Set<EntityNode> entities = new HashSet<>();

    /**
     * Aggregates the terminal text blocks of all sub-nodes on first call and
     * caches the result. Not thread-safe: concurrent first calls may build twice.
     */
    @Override
    public TextBlock buildTextBlock() {
        if (textBlock == null) {
            textBlock = streamAllSubNodes().filter(SemanticNode::isTerminal).map(SemanticNode::getTerminalTextBlock).collect(new TextBlockCollector());
        }
        return textBlock;
    }

    // Note: triggers the lazy build above if the cache is still empty.
    @Override
    public String toString() {
        return tocId.toString() + ": " + NodeType.SECTION + ": " + buildTextBlock().buildSummary();
    }

    /**
     * First HeadlineNode among the direct children.
     *
     * @throws NoSuchElementException when the section has no headline child
     */
    public HeadlineNode getHeadline() {
        return streamChildren().filter(node -> node instanceof HeadlineNode)
                .map(node -> (HeadlineNode) node)
                .findFirst()
                .orElseThrow(() -> new NoSuchElementException("ClassificationSection has no Headline!"));
    }
}

View File

@ -0,0 +1,275 @@
package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;
import java.awt.geom.Rectangle2D;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.Boundary;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.AtomicTextBlock;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.internal.api.services.RectangleTransformations;
public interface SemanticNode {

    /**
     * Searches all Nodes located underneath this Node in the TableOfContents and concatenates their AtomicTextBlocks into a single TextBlockEntity.
     * So, for a ClassificationSection all TextBlocks of Subsections, Paragraphs, and Tables are concatenated into a single TextBlockEntity.
     * If the Node is Terminal, the TerminalTextBlock will be returned instead.
     *
     * @return ClassificationTextBlock containing all AtomicTextBlocks that are located under this Node.
     */
    TextBlock buildTextBlock();

    /**
     * Any Node maintains its own Set of Entities.
     * This Set contains all Entities whose boundary intersects the boundary of this node.
     *
     * @return Set of all Entities associated with this Node
     */
    Set<EntityNode> getEntities();

    /**
     * Each AtomicTextBlock is assigned a page, so to get the pages this node appears on, it collects the PageNodes from each AtomicTextBlock belonging to this node's ClassificationTextBlock.
     *
     * @return Set of PageNodes this node appears on.
     */
    default Set<PageNode> getPages() {
        return buildTextBlock().getPages();
    }

    /**
     * @return the TableOfContents of the ClassificationDocument this node belongs to
     */
    TableOfContents getTableOfContents();

    /**
     * The id is a List of Integers uniquely identifying this node in the TableOfContents.
     *
     * @return the TableOfContents ID
     */
    List<Integer> getTocId();

    /**
     * This should only be used during graph construction.
     *
     * @param tocId List of Integers
     */
    void setTocId(List<Integer> tocId);

    /**
     * Traverses the Tree up, until it hits a HeadlineNode or hits a SectionNode which will then return the first HeadlineNode from its children.
     * Throws NotFoundException if no Headline is found this way.
     *
     * @return First HeadlineNode found
     */
    default SemanticNode getHeadline() {
        return getParent().getHeadline();
    }

    /**
     * @return boolean indicating whether this Node has a Parent in the TableOfContents
     */
    default boolean hasParent() {
        return getTableOfContents().hasParentById(getTocId());
    }

    /**
     * @return The SemanticNode representing the Parent in the TableOfContents
     * throws NotFoundException, when no parent is present
     */
    default SemanticNode getParent() {
        return getTableOfContents().getParentEntryById(getTocId()).node();
    }

    /**
     * Terminal means a SemanticNode has direct access to a ClassificationTextBlock, by default this is false and must be overridden.
     * Currently only Sections, Images, and Tables are not terminal.
     * A TableCell might be Terminal depending on its area compared to the page.
     *
     * @return boolean, indicating if a Node has direct access to a ClassificationTextBlock
     */
    default boolean isTerminal() {
        return false;
    }

    /**
     * Returns the TextBlock a terminal Node has direct access to.
     * Only valid when {@link #isTerminal()} is true; non-terminal Nodes throw.
     *
     * @return the terminal TextBlock
     * @throws UnsupportedOperationException if this Node is not terminal
     */
    default TextBlock getTerminalTextBlock() {
        throw new UnsupportedOperationException("Only terminal Nodes have access to TerminalTextBlocks!");
    }

    /**
     * Only valid for terminal Nodes; non-terminal Nodes throw.
     *
     * @param textBlock the TextBlock to attach to this terminal Node
     * @throws UnsupportedOperationException if this Node is not terminal
     */
    default void setTerminalTextBlock(TextBlock textBlock) {
        throw new UnsupportedOperationException("Only terminal Nodes can hold a TerminalTextBlock!");
    }

    /**
     * Each AtomicTextBlock has an index on its page, this returns the number of the first AtomicTextBlock underneath this node.
     * If this node does not have any AtomicTextBlocks underneath it, e.g. an empty TableCell, it returns -1.
     *
     * @return Integer representing the number on the page
     */
    default Integer getNumberOnPage() {
        TextBlock textBlock = buildTextBlock();
        if (textBlock.getAtomicTextBlocks().isEmpty()) {
            return -1;
        }
        // Reuse the TextBlock built above instead of calling buildTextBlock() a second time.
        return textBlock.getAtomicTextBlocks().get(0).getNumberOnPage();
    }

    /**
     * @return true, if this node's ClassificationTextBlock is not empty
     */
    default boolean hasText() {
        return buildTextBlock().length() > 0;
    }

    /**
     * @param string A String which the ClassificationTextBlock might contain
     * @return true, if this node's ClassificationTextBlock contains the string
     */
    default boolean containsString(String string) {
        return buildTextBlock().getSearchText().contains(string);
    }

    /**
     * @param strings A List of Strings which the ClassificationTextBlock might contain
     * @return true, if this node's ClassificationTextBlock contains any of the strings
     */
    default boolean containsAnyString(List<String> strings) {
        return strings.stream().anyMatch(this::containsString);
    }

    /**
     * This function is used during insertion of EntityNodes into the graph, it checks if the boundary of this node intersects or even contains the EntityNode's boundary.
     * It sets the fields accordingly and recursively calls this function on all its children.
     *
     * @param entityNode EntityNode, which is being inserted into the graph
     */
    default void addThisToEntityIfIntersects(EntityNode entityNode) {
        TextBlock textBlock = buildTextBlock();
        if (textBlock.getBoundary().intersects(entityNode.getBoundary())) {
            if (textBlock.containsBoundary(entityNode.getBoundary())) {
                entityNode.setDeepestFullyContainingNode(this);
            }
            entityNode.addIntersectingNode(this);
            streamChildren().forEach(node -> node.addThisToEntityIfIntersects(entityNode));
        }
    }

    /**
     * Streams all children located directly underneath this node in the TableOfContents.
     *
     * @return Stream of all children
     */
    default Stream<SemanticNode> streamChildren() {
        return getTableOfContents().streamChildren(getTocId());
    }

    /**
     * Recursively streams all SemanticNodes located underneath this node in the TableOfContents in order.
     *
     * @return Stream of all SubNodes
     */
    default Stream<SemanticNode> streamAllSubNodes() {
        return getTableOfContents().streamSubEntriesInOrder(getTocId()).map(TableOfContents.Entry::node);
    }

    /**
     * @return Boundary of this Node's ClassificationTextBlock
     */
    default Boundary getBoundary() {
        return buildTextBlock().getBoundary();
    }

    /**
     * If this Node is Terminal it will calculate the boundingBox of its TerminalTextBlock, otherwise it will calculate the Union of the BoundingBoxes of all its Children.
     * If called on the ClassificationDocument, it will return the cropbox of each page.
     *
     * @return Rectangle2D fully encapsulating this Node for each page.
     */
    default Map<PageNode, Rectangle2D> getBBox() {
        Map<PageNode, Rectangle2D> bBoxPerPage = new HashMap<>();
        if (isTerminal()) {
            return getBBoxFromTerminalTextBlock(bBoxPerPage);
        }
        return getBBoxFromChildren(bBoxPerPage);
    }

    /**
     * TODO this does not yet work for sections spanning multiple columns
     *
     * @param bBoxPerPage initial empty BoundingBox, returned unchanged when there are no children
     * @return The union of the BoundingBoxes of all children
     */
    private Map<PageNode, Rectangle2D> getBBoxFromChildren(Map<PageNode, Rectangle2D> bBoxPerPage) {
        // NOTE(review): the reduction mutates map2 in place; safe here because each child
        // builds a fresh map in getBBox(), but would break on a parallel stream — confirm.
        return streamChildren().map(SemanticNode::getBBox).reduce((map1, map2) -> {
            map1.forEach((page, rectangle) -> map2.merge(page, rectangle, (rect1, rect2) -> rect1.createUnion(rect2).getBounds2D()));
            return map2;
        }).orElse(bBoxPerPage);
    }

    /**
     * @param bBoxPerPage initial empty BoundingBox, filled with one union rectangle per page
     * @return The union of all BoundingBoxes of the ClassificationTextBlock of this node
     */
    private Map<PageNode, Rectangle2D> getBBoxFromTerminalTextBlock(Map<PageNode, Rectangle2D> bBoxPerPage) {
        Map<PageNode, List<AtomicTextBlock>> atomicTextBlockPerPage = buildTextBlock().getAtomicTextBlocks().stream().collect(Collectors.groupingBy(AtomicTextBlock::getPage));
        atomicTextBlockPerPage.forEach((page, atbs) -> bBoxPerPage.put(page, RectangleTransformations.bBoxUnionAtomicTextBlock(atbs)));
        return bBoxPerPage;
    }
}

View File

@ -0,0 +1,92 @@
package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;
import java.awt.geom.Rectangle2D;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Stream;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlockCollector;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults;
@Data
@Builder
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class TableCellNode implements SemanticNode {

    List<Integer> tocId;
    // Grid coordinates of this cell within its parent TableNode.
    int row;
    int col;
    // true if this cell is a header cell of the table
    boolean header;
    Rectangle2D bBox;
    // TableCells are terminal by default; see SemanticNode#isTerminal for the exception.
    @Builder.Default
    boolean terminal = true;
    TextBlock terminalTextBlock;
    // Lazily built cache for the non-terminal case, see buildTextBlock().
    TextBlock textBlock;
    @EqualsAndHashCode.Exclude
    TableOfContents tableOfContents;
    @EqualsAndHashCode.Exclude
    @Builder.Default
    Set<EntityNode> entities = new HashSet<>();

    /**
     * A TableCell carries its own pre-computed bounding box, replicated for every page it appears on.
     *
     * @return Map of this cell's bBox per page
     */
    @Override
    public Map<PageNode, Rectangle2D> getBBox() {
        Map<PageNode, Rectangle2D> bBoxPerPage = new HashMap<>();
        getPages().forEach(page -> bBoxPerPage.put(page, bBox));
        return bBoxPerPage;
    }

    /**
     * Terminal cells return their TerminalTextBlock directly; otherwise the TextBlock is built
     * once from all terminal sub-nodes and cached.
     *
     * @return TextBlock covering this cell's content
     */
    @Override
    public TextBlock buildTextBlock() {
        if (terminal) {
            return terminalTextBlock;
        }
        if (textBlock == null) {
            textBlock = streamAllSubNodes().filter(SemanticNode::isTerminal).map(SemanticNode::getTerminalTextBlock).collect(new TextBlockCollector());
        }
        return textBlock;
    }

    @Override
    public String toString() {
        return tocId + ": " + NodeType.TABLE_CELL + ": " + buildTextBlock().buildSummary();
    }

    /**
     * Checks whether any header cell of this cell's row or column has exactly the given text
     * (after stripping surrounding whitespace).
     *
     * @param headerString the exact header text to look for
     * @return true if a matching header cell exists
     */
    public boolean hasHeader(String headerString) {
        // Lambda parameter renamed from 'header': it shadowed the boolean 'header' field above.
        return getHeaders().anyMatch(headerCell -> headerCell.buildTextBlock().getSearchText().strip().equals(headerString));
    }

    /**
     * @return Stream of all header cells of the parent table that share this cell's row or column
     */
    private Stream<TableCellNode> getHeaders() {
        TableNode tableNode = (TableNode) getParent();
        return tableNode.streamHeadersForCell(row, col);
    }
}

View File

@ -0,0 +1,73 @@
package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.stream.Stream;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlockCollector;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults;
@Data
@Builder
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class TableNode implements SemanticNode {

    List<Integer> tocId;
    TableOfContents tableOfContents;
    Integer numberOfRows;
    Integer numberOfCols;
    // Lazily built cache, see buildTextBlock().
    TextBlock textBlock;
    @Builder.Default
    @EqualsAndHashCode.Exclude
    Set<EntityNode> entities = new HashSet<>();

    /**
     * @return Stream of this table's direct children, viewed as TableCellNodes
     */
    public Stream<TableCellNode> streamTableCells() {
        return streamChildren().map(TableCellNode.class::cast);
    }

    /**
     * @return Stream of all cells of this table flagged as header cells
     */
    public Stream<TableCellNode> streamHeaders() {
        return streamTableCells().filter(TableCellNode::isHeader);
    }

    /**
     * @param row row index of the cell in question
     * @param col column index of the cell in question
     * @return Stream of header cells lying in the same row or the same column
     */
    public Stream<TableCellNode> streamHeadersForCell(int row, int col) {
        return streamHeaders().filter(headerCell -> headerCell.getRow() == row || headerCell.getCol() == col);
    }

    /**
     * Builds (once) and caches the TextBlock spanning all terminal sub-nodes of this table.
     *
     * @return TextBlock covering the whole table
     */
    @Override
    public TextBlock buildTextBlock() {
        if (textBlock != null) {
            return textBlock;
        }
        textBlock = streamAllSubNodes().filter(SemanticNode::isTerminal).map(SemanticNode::getTerminalTextBlock).collect(new TextBlockCollector());
        return textBlock;
    }

    @Override
    public String toString() {
        String summary = buildTextBlock().buildSummary();
        return tocId.toString() + ": " + NodeType.TABLE + ": " + summary;
    }
}

View File

@ -0,0 +1,131 @@
package com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock;
import java.awt.geom.Rectangle2D;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.Boundary;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityPosition;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SemanticNode;
import com.knecon.fforesight.service.layoutparser.internal.api.services.RectangleTransformations;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults;
@Data
@Builder
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class AtomicTextBlock implements TextBlock {

    Long id;
    // Index of this block among the blocks of its page.
    Integer numberOnPage;
    PageNode page;
    //string coordinates
    Boundary boundary;
    String searchText;
    // Indices of line breaks, relative to the start of this block's searchText.
    List<Integer> lineBreaks;
    //position coordinates
    List<Integer> stringIdxToPositionIdx;
    List<Rectangle2D> positions;
    @EqualsAndHashCode.Exclude
    SemanticNode parent;

    @Override
    public int numberOfLines() {
        return lineBreaks.size() + 1;
    }

    /**
     * Returns the text of one line of this block.
     *
     * @param lineNumber zero-based line index
     * @return CharSequence of that line
     * @throws IndexOutOfBoundsException if lineNumber is outside [0, numberOfLines())
     */
    public CharSequence getLine(int lineNumber) {
        if (lineNumber >= numberOfLines() || lineNumber < 0) {
            throw new IndexOutOfBoundsException(String.format("line %d out of range for AtomicTextBlock with %d lines", lineNumber, numberOfLines()));
        }
        if (lineNumber == 0) {
            // Fix: a single-line block has no line breaks, so lineBreaks.get(0) would throw.
            if (lineBreaks.isEmpty()) {
                return subSequence(boundary.start(), boundary.end());
            }
            return subSequence(boundary.start(), lineBreaks.get(0) + boundary.start());
        } else if (lineNumber == numberOfLines() - 1) {
            return subSequence(lineBreaks.get(lineBreaks.size() - 1) + boundary.start(), boundary.end());
        }
        return subSequence(lineBreaks.get(lineNumber - 1) + boundary.start(), lineBreaks.get(lineNumber) + boundary.start());
    }

    @Override
    public List<AtomicTextBlock> getAtomicTextBlocks() {
        return List.of(this);
    }

    /**
     * @param fromIndex absolute string index (in document coordinates)
     * @return absolute index of the next line break at or after fromIndex, or the block end if none
     */
    @Override
    public int getNextLinebreak(int fromIndex) {
        return lineBreaks.stream()//
                .filter(linebreak -> linebreak > fromIndex - boundary.start()) //
                .findFirst() //
                .orElse(searchText.length()) + boundary.start();
    }

    /**
     * @param fromIndex absolute string index (in document coordinates)
     * @return absolute index of the last line break at or before fromIndex, or the block start if none
     */
    @Override
    public int getPreviousLinebreak(int fromIndex) {
        return lineBreaks.stream()//
                .filter(linebreak -> linebreak <= fromIndex - boundary.start())//
                .reduce((a, b) -> b)//
                .orElse(0) + boundary.start();
    }

    @Override
    public Rectangle2D getPosition(int stringIdx) {
        return positions.get(stringIdxToPositionIdx.get(stringIdx - boundary.start()));
    }

    /**
     * @param stringBoundary absolute boundary that must lie fully inside this block
     * @return the glyph positions covered by the boundary
     * @throws IndexOutOfBoundsException if the boundary is not contained in this block
     */
    @Override
    public List<Rectangle2D> getPositions(Boundary stringBoundary) {
        if (!containsBoundary(stringBoundary)) {
            throw new IndexOutOfBoundsException(String.format("%s is out of bounds for %s", stringBoundary, this.boundary));
        }
        if (stringBoundary.end() == this.boundary.end()) {
            // The exclusive end index has no mapping entry; take everything up to the last position.
            return positions.subList(stringIdxToPositionIdx.get(stringBoundary.start() - this.boundary.start()), positions.size());
        }
        return positions.subList(stringIdxToPositionIdx.get(stringBoundary.start() - this.boundary.start()),
                stringIdxToPositionIdx.get(stringBoundary.end() - this.boundary.start()));
    }

    /**
     * Builds one EntityPosition for this block's page, with one union rectangle per line
     * that the given boundary spans.
     *
     * @param stringBoundary absolute boundary of the entity text
     * @return singleton list with the EntityPosition for this page
     */
    public List<EntityPosition> getEntityPositionsPerPage(Boundary stringBoundary) {
        List<Rectangle2D> positionsPerLine = stringBoundary.split(getLineBreaks().stream().map(lb -> lb + boundary.start()).filter(stringBoundary::contains).toList())
                .stream()
                .map(this::getPositions)
                .map(RectangleTransformations::rectangleUnion)
                .toList();
        return List.of(EntityPosition.builder().rectanglePerLine(positionsPerLine).pageNode(page).build());
    }

    @Override
    public String toString() {
        return searchText;
    }
}

View File

@ -0,0 +1,179 @@
package com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock;
import java.awt.geom.Rectangle2D;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.Boundary;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityPosition;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode;
import lombok.AccessLevel;
import lombok.Data;
import lombok.experimental.FieldDefaults;
@Data
@FieldDefaults(level = AccessLevel.PRIVATE)
public class ConcatenatedTextBlock implements TextBlock {

    // Blocks in document order; concat() enforces that they are consecutive.
    List<AtomicTextBlock> atomicTextBlocks;
    // Lazily concatenated search text, built on first getSearchText() call.
    String searchText;
    // Overall string boundary; mutated in place as blocks are appended.
    Boundary boundary;

    /**
     * Builds a concatenated block from consecutive AtomicTextBlocks.
     * An empty input yields the sentinel boundary (-1, -1) until the first concat().
     *
     * @param atomicTextBlocks consecutive blocks in document order (may be empty)
     */
    public ConcatenatedTextBlock(List<AtomicTextBlock> atomicTextBlocks) {
        this.atomicTextBlocks = new LinkedList<>();
        if (atomicTextBlocks.isEmpty()) {
            boundary = new Boundary(-1, -1);
            return;
        }
        var firstTextBlock = atomicTextBlocks.get(0);
        this.atomicTextBlocks.add(firstTextBlock);
        boundary = new Boundary(firstTextBlock.getBoundary().start(), firstTextBlock.getBoundary().end());
        // Remaining blocks are appended through concat() so the consecutiveness check applies.
        atomicTextBlocks.subList(1, atomicTextBlocks.size()).forEach(this::concat);
    }

    /**
     * Appends another TextBlock; its boundary must start exactly where this one ends.
     *
     * @param textBlock the consecutive block to append
     * @return this, for chaining / use as a Collector accumulator
     * @throws UnsupportedOperationException if the blocks are not consecutive
     */
    public ConcatenatedTextBlock concat(TextBlock textBlock) {
        if (this.atomicTextBlocks.isEmpty()) {
            // First real content: adopt the incoming boundary, replacing the (-1, -1) sentinel.
            boundary.setStart(textBlock.getBoundary().start());
            boundary.setEnd(textBlock.getBoundary().end());
        } else if (boundary.end() != textBlock.getBoundary().start()) {
            throw new UnsupportedOperationException(String.format("Can only concat consecutive TextBlocks, trying to concat %s and %s", boundary, textBlock.getBoundary()));
        }
        this.atomicTextBlocks.addAll(textBlock.getAtomicTextBlocks());
        boundary.setEnd(textBlock.getBoundary().end());
        return this;
    }

    // Finds the single atomic block whose boundary contains the absolute string index.
    private AtomicTextBlock getAtomicTextBlockByStringIndex(int stringIdx) {
        return atomicTextBlocks.stream().filter(textBlock -> (textBlock.getBoundary().contains(stringIdx))).findAny().orElseThrow(IndexOutOfBoundsException::new);
    }

    // All atomic blocks whose boundary at least intersects the given boundary, in document order.
    private List<AtomicTextBlock> getAllAtomicTextBlocksPartiallyInStringBoundary(Boundary boundary) {
        return atomicTextBlocks.stream().filter(tb -> tb.getBoundary().intersects(boundary)).toList();
    }

    @Override
    public String getSearchText() {
        if (searchText == null) {
            StringBuilder sb = new StringBuilder();
            getAtomicTextBlocks().forEach(atb -> sb.append(atb.getSearchText()));
            searchText = sb.toString();
        }
        return searchText;
    }

    @Override
    public int numberOfLines() {
        // NOTE(review): sums the raw break counts; AtomicTextBlock reports breaks + 1 — confirm
        // the off-by-one between the two implementations is intended.
        return atomicTextBlocks.stream().map(AtomicTextBlock::getLineBreaks).mapToInt(List::size).sum();
    }

    @Override
    public int getNextLinebreak(int fromIndex) {
        return getAtomicTextBlockByStringIndex(fromIndex).getNextLinebreak(fromIndex);
    }

    @Override
    public int getPreviousLinebreak(int fromIndex) {
        return getAtomicTextBlockByStringIndex(fromIndex).getPreviousLinebreak(fromIndex);
    }

    @Override
    public List<Integer> getLineBreaks() {
        return getAtomicTextBlocks().stream().flatMap(atomicTextBlock -> atomicTextBlock.getLineBreaks().stream()).toList();
    }

    @Override
    public Rectangle2D getPosition(int stringIdx) {
        return getAtomicTextBlockByStringIndex(stringIdx).getPosition(stringIdx);
    }

    /**
     * Collects the glyph positions covered by the boundary, stitching together the partial
     * range of the first block, the full positions of middle blocks, and the partial range
     * of the last block. Assumes the boundary intersects at least one block — TODO confirm.
     */
    @Override
    public List<Rectangle2D> getPositions(Boundary stringBoundary) {
        List<AtomicTextBlock> textBlocks = getAllAtomicTextBlocksPartiallyInStringBoundary(stringBoundary);
        if (textBlocks.size() == 1) {
            return textBlocks.get(0).getPositions(stringBoundary);
        }
        AtomicTextBlock firstTextBlock = textBlocks.get(0);
        List<Rectangle2D> positions = new LinkedList<>(firstTextBlock.getPositions(new Boundary(stringBoundary.start(), firstTextBlock.getBoundary().end())));
        // Middle blocks (excluding first and last) are covered completely.
        for (AtomicTextBlock textBlock : textBlocks.subList(1, textBlocks.size() - 1)) {
            positions.addAll(textBlock.getPositions());
        }
        var lastTextBlock = textBlocks.get(textBlocks.size() - 1);
        positions.addAll(lastTextBlock.getPositions(new Boundary(lastTextBlock.getBoundary().start(), stringBoundary.end())));
        return positions;
    }

    /**
     * Same stitching scheme as getPositions, but per-page EntityPositions; positions landing
     * on the same page are merged into one EntityPosition at the end.
     */
    @Override
    public List<EntityPosition> getEntityPositionsPerPage(Boundary stringBoundary) {
        List<AtomicTextBlock> textBlocks = getAllAtomicTextBlocksPartiallyInStringBoundary(stringBoundary);
        if (textBlocks.size() == 1) {
            return textBlocks.get(0).getEntityPositionsPerPage(stringBoundary);
        }
        AtomicTextBlock firstTextBlock = textBlocks.get(0);
        List<EntityPosition> positions = new LinkedList<>(firstTextBlock.getEntityPositionsPerPage(new Boundary(stringBoundary.start(), firstTextBlock.getBoundary().end())));
        for (AtomicTextBlock textBlock : textBlocks.subList(1, textBlocks.size() - 1)) {
            positions.addAll(textBlock.getEntityPositionsPerPage(textBlock.getBoundary()));
        }
        AtomicTextBlock lastTextBlock = textBlocks.get(textBlocks.size() - 1);
        positions.addAll(lastTextBlock.getEntityPositionsPerPage(new Boundary(lastTextBlock.getBoundary().start(), stringBoundary.end())));
        return mergeEntityPositionsWithSamePageNode(positions);
    }

    // Groups the per-block EntityPositions by page and flattens their line rectangles into one
    // EntityPosition per page.
    private List<EntityPosition> mergeEntityPositionsWithSamePageNode(List<EntityPosition> positions) {
        Map<PageNode, List<Rectangle2D>> entityPositionsPerPage = positions.stream().collect(//
                Collectors.groupingBy(EntityPosition::getPageNode, //
                        Collectors.flatMapping(entityPosition -> entityPosition.getRectanglePerLine().stream(), Collectors.toList())));
        return entityPositionsPerPage.entrySet().stream()//
                .map(entry -> EntityPosition.builder().pageNode(entry.getKey()).rectanglePerLine(entry.getValue()).build())//
                .toList();
    }

    @Override
    public String toString() {
        return getSearchText();
    }
}

View File

@ -0,0 +1,125 @@
package com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.Boundary;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityPosition;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode;
public interface TextBlock extends CharSequence {

    String getSearchText();

    List<AtomicTextBlock> getAtomicTextBlocks();

    Boundary getBoundary();

    int getNextLinebreak(int fromIndex);

    int getPreviousLinebreak(int fromIndex);

    List<Integer> getLineBreaks();

    Rectangle2D getPosition(int stringIdx);

    List<Rectangle2D> getPositions(Boundary stringBoundary);

    List<EntityPosition> getEntityPositionsPerPage(Boundary stringBoundary);

    int numberOfLines();

    /**
     * Searches for a term starting at the beginning of this block.
     *
     * @param searchTerm the term to look for
     * @return absolute index of the first occurrence, or -1 if absent
     */
    default int indexOf(String searchTerm) {
        return indexOf(searchTerm, getBoundary().start());
    }

    /**
     * @return unmodifiable Set of all pages this block's AtomicTextBlocks lie on
     */
    default Set<PageNode> getPages() {
        return getAtomicTextBlocks().stream().map(AtomicTextBlock::getPage).collect(Collectors.toUnmodifiableSet());
    }

    /**
     * Searches for a term starting at an absolute offset.
     *
     * @param searchTerm  the term to look for
     * @param startOffset absolute index to start the search from
     * @return absolute index of the first occurrence, or -1 if absent
     */
    default int indexOf(String searchTerm, int startOffset) {
        int relativeStart = getSearchText().indexOf(searchTerm, startOffset - getBoundary().start());
        return relativeStart == -1 ? -1 : relativeStart + getBoundary().start();
    }

    /**
     * @return the text up to (but not including) the first line break
     */
    default CharSequence getFirstLine() {
        int start = getBoundary().start();
        return subSequence(start, getNextLinebreak(start));
    }

    /**
     * @param boundary the boundary to test (start must not exceed end)
     * @return true if this block's boundary fully contains the given one
     * @throws IllegalArgumentException for an inverted boundary
     */
    default boolean containsBoundary(Boundary boundary) {
        if (boundary.end() < boundary.start()) {
            throw new IllegalArgumentException(String.format("Invalid %s, StartIndex must be smaller than EndIndex", boundary));
        }
        return getBoundary().contains(boundary);
    }

    /**
     * @param stringIndex absolute string index
     * @return true if the index falls inside this block's boundary
     */
    default boolean containsIndex(int stringIndex) {
        return getBoundary().contains(stringIndex);
    }

    /**
     * @param boundary absolute boundary to slice
     * @return the subsequence covered by the boundary
     */
    default CharSequence subSequence(Boundary boundary) {
        return subSequence(boundary.start(), boundary.end());
    }

    /**
     * @return a short preview consisting of at most the first four space-separated words
     */
    default String buildSummary() {
        String[] words = getSearchText().split(" ");
        return Arrays.stream(words).limit(4).collect(Collectors.joining(" "));
    }

    @Override
    default CharSequence subSequence(int start, int end) {
        int offset = getBoundary().start();
        return getSearchText().substring(start - offset, end - offset);
    }

    @Override
    default int length() {
        return getBoundary().length();
    }

    @Override
    default char charAt(int index) {
        // Absolute index is translated into a position within the local search text.
        return getSearchText().charAt(index - getBoundary().start());
    }
}

View File

@ -0,0 +1,50 @@
package com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock;
import java.util.Collections;
import java.util.Set;
import java.util.function.BiConsumer;
import java.util.function.BinaryOperator;
import java.util.function.Function;
import java.util.function.Supplier;
import java.util.stream.Collector;
import lombok.NoArgsConstructor;
@NoArgsConstructor
public class TextBlockCollector implements Collector<TextBlock, ConcatenatedTextBlock, TextBlock> {

    /**
     * @return Supplier creating an empty ConcatenatedTextBlock container
     */
    @Override
    public Supplier<ConcatenatedTextBlock> supplier() {
        return () -> new ConcatenatedTextBlock(Collections.emptyList());
    }

    /**
     * @return accumulator appending each TextBlock; blocks must arrive in document order
     */
    @Override
    public BiConsumer<ConcatenatedTextBlock, TextBlock> accumulator() {
        return ConcatenatedTextBlock::concat;
    }

    /**
     * @return combiner concatenating two partial results (only used on parallel streams)
     */
    @Override
    public BinaryOperator<ConcatenatedTextBlock> combiner() {
        return ConcatenatedTextBlock::concat;
    }

    /**
     * @return identity finisher, the accumulated ConcatenatedTextBlock IS the result
     */
    @Override
    public Function<ConcatenatedTextBlock, TextBlock> finisher() {
        return a -> a;
    }

    /**
     * Fix: CONCURRENT removed. A CONCURRENT collector allows multiple threads to call the
     * accumulator on one shared container, but ConcatenatedTextBlock accumulates into a
     * non-thread-safe LinkedList and concat() requires strictly consecutive boundaries,
     * so concurrent accumulation would corrupt the result.
     *
     * @return collector characteristics (identity finish only)
     */
    @Override
    public Set<Characteristics> characteristics() {
        return Set.of(Characteristics.IDENTITY_FINISH);
    }
}

View File

@ -0,0 +1,143 @@
package com.knecon.fforesight.service.layoutparser.internal.api.mapper;
import java.awt.geom.Rectangle2D;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import com.knecon.fforesight.service.layoutparser.internal.api.data.AtomicPositionBlockData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.AtomicTextBlockData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.DocumentData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.PageData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.TableOfContentsData;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.DocumentGraph;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.ImageNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.TableCellNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.TableNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.AtomicTextBlock;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
public class DocumentDataMapper {

    /**
     * Serializes a DocumentGraph into its flat DocumentData transfer form.
     *
     * @param documentGraph the in-memory graph to serialize
     * @return DocumentData containing text blocks, position blocks, pages, and the TableOfContents
     */
    public DocumentData toDocumentData(DocumentGraph documentGraph) {
        // Collect the distinct atomic blocks once; previously the stream/flatMap/distinct
        // pipeline ran twice, once per target list.
        List<AtomicTextBlock> atomicTextBlocks = documentGraph.streamTerminalTextBlocksInOrder()
                .flatMap(textBlock -> textBlock.getAtomicTextBlocks().stream())
                .distinct()
                .toList();
        List<AtomicTextBlockData> atomicTextBlockData = atomicTextBlocks.stream().map(this::toAtomicTextBlockData).toList();
        List<AtomicPositionBlockData> atomicPositionBlockData = atomicTextBlocks.stream().map(this::toAtomicPositionBlockData).toList();
        List<PageData> pageData = documentGraph.getPages().stream().map(this::toPageData).toList();
        TableOfContentsData tableOfContentsData = toTableOfContentsData(documentGraph.getTableOfContents());
        return DocumentData.builder()
                .atomicTextBlocks(atomicTextBlockData.toArray(new AtomicTextBlockData[0]))
                .atomicPositionBlocks(atomicPositionBlockData.toArray(new AtomicPositionBlockData[0]))
                .pages(pageData.toArray(new PageData[0]))
                .tableOfContents(tableOfContentsData)
                .build();
    }

    // Maps the TableOfContents tree root into its data form.
    private TableOfContentsData toTableOfContentsData(TableOfContents tableOfContents) {
        return new TableOfContentsData(tableOfContents.getEntries().stream().map(this::toEntryData).toList());
    }

    /**
     * Recursively maps one TableOfContents entry (and its children) into EntryData.
     * Terminal nodes serialize the ids of their atomic blocks; type-specific extras
     * (table dimensions, cell coordinates, image data) go into the properties map.
     */
    private TableOfContentsData.EntryData toEntryData(TableOfContents.Entry entry) {
        Long[] atomicTextBlocks;
        if (entry.node().isTerminal()) {
            atomicTextBlocks = toAtomicTextBlockIds(entry.node().getTerminalTextBlock());
        } else {
            atomicTextBlocks = new Long[]{};
        }
        Map<String, String> properties = switch (entry.type()) {
            case TABLE -> PropertiesMapper.buildTableProperties((TableNode) entry.node());
            case TABLE_CELL -> PropertiesMapper.buildTableCellProperties((TableCellNode) entry.node());
            case IMAGE -> PropertiesMapper.buildImageProperties((ImageNode) entry.node());
            default -> new HashMap<>();
        };
        return TableOfContentsData.EntryData.builder()
                .tocId(toPrimitiveIntArray(entry.tocId()))
                .subEntries(entry.children().stream().map(this::toEntryData).toList())
                .type(entry.type())
                .atomicBlocks(atomicTextBlocks)
                .pages(entry.node().getPages().stream().map(PageNode::getNumber).map(Integer::longValue).toArray(Long[]::new))
                .properties(properties)
                .build();
    }

    // Ids of all atomic blocks making up the given TextBlock, in order.
    private Long[] toAtomicTextBlockIds(TextBlock textBlock) {
        return textBlock.getAtomicTextBlocks().stream().map(AtomicTextBlock::getId).toArray(Long[]::new);
    }

    private PageData toPageData(PageNode p) {
        return PageData.builder().rotation(p.getRotation()).height(p.getHeight()).width(p.getWidth()).number(p.getNumber()).build();
    }

    // String-coordinate part of an atomic block (text, boundary, line breaks).
    private AtomicTextBlockData toAtomicTextBlockData(AtomicTextBlock atomicTextBlock) {
        return AtomicTextBlockData.builder()
                .id(atomicTextBlock.getId())
                .page(atomicTextBlock.getPage().getNumber().longValue())
                .searchText(atomicTextBlock.getSearchText())
                .numberOnPage(atomicTextBlock.getNumberOnPage())
                .start(atomicTextBlock.getBoundary().start())
                .end(atomicTextBlock.getBoundary().end())
                .lineBreaks(toPrimitiveIntArray(atomicTextBlock.getLineBreaks()))
                .build();
    }

    // Geometry part of an atomic block (glyph rectangles and index mapping).
    private AtomicPositionBlockData toAtomicPositionBlockData(AtomicTextBlock atomicTextBlock) {
        return AtomicPositionBlockData.builder()
                .id(atomicTextBlock.getId())
                .positions(toPrimitiveFloatMatrix(atomicTextBlock.getPositions()))
                .stringIdxToPositionIdx(toPrimitiveIntArray(atomicTextBlock.getStringIdxToPositionIdx()))
                .build();
    }

    /**
     * Flattens each rectangle into a [minX, minY, width, height] float row.
     *
     * @param positions glyph rectangles
     * @return n x 4 float matrix
     */
    private float[][] toPrimitiveFloatMatrix(List<Rectangle2D> positions) {
        float[][] positionMatrix = new float[positions.size()][];
        for (int i = 0; i < positions.size(); i++) {
            Rectangle2D rectangle = positions.get(i);
            float[] singlePositions = new float[4];
            singlePositions[0] = (float) rectangle.getMinX();
            singlePositions[1] = (float) rectangle.getMinY();
            singlePositions[2] = (float) rectangle.getWidth();
            singlePositions[3] = (float) rectangle.getHeight();
            positionMatrix[i] = singlePositions;
        }
        return positionMatrix;
    }

    // Unboxes a List<Integer> into an int[] (throws NPE on null elements, as before).
    private int[] toPrimitiveIntArray(List<Integer> list) {
        return list.stream().mapToInt(Integer::intValue).toArray();
    }
}

View File

@ -0,0 +1,225 @@
package com.knecon.fforesight.service.layoutparser.internal.api.mapper;
import static com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.NodeType.FOOTER;
import static com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.NodeType.HEADER;
import java.awt.geom.Rectangle2D;
import java.util.Arrays;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import com.google.common.primitives.Ints;
import com.knecon.fforesight.service.layoutparser.internal.api.data.AtomicPositionBlockData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.AtomicTextBlockData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.DocumentData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.PageData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.TableOfContentsData;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.Boundary;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.DocumentGraph;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.FooterNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.HeaderNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.HeadlineNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.ImageNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.ParagraphNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SectionNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SemanticNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.TableCellNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.TableNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.AtomicTextBlock;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlockCollector;
public class DocumentGraphMapper {
    /**
     * Rebuilds the in-memory DocumentGraph from its serialized DocumentData form.
     * Pages are materialized first, then the TableOfContents entries (which register
     * themselves on the pages), and finally the graph's own TextBlock is pre-built.
     *
     * @param documentData serialized document
     * @return fully wired DocumentGraph
     */
    public DocumentGraph toDocumentGraph(DocumentData documentData) {
        Context context = new Context(documentData,
                new TableOfContents(),
                new LinkedList<>(),
                new LinkedList<>(),
                Arrays.stream(documentData.getAtomicTextBlocks()).toList(),
                Arrays.stream(documentData.getAtomicPositionBlocks()).toList());
        // Pages must exist before buildEntries runs, since entries attach themselves to pages.
        context.pages.addAll(Arrays.stream(documentData.getPages()).map(this::buildPage).toList());
        context.tableOfContents.setEntries(buildEntries(documentData.getTableOfContents().getEntries(), context));
        DocumentGraph documentGraph = DocumentGraph.builder()
                .numberOfPages(documentData.getPages().length)
                .pages(new HashSet<>(context.pages))
                .tableOfContents(context.tableOfContents)
                .build();
        // Eagerly build and cache the document-wide TextBlock.
        documentGraph.setTextBlock(documentGraph.buildTextBlock());
        return documentGraph;
    }
private List<TableOfContents.Entry> buildEntries(List<TableOfContentsData.EntryData> entries,
Context context) {
List<TableOfContents.Entry> newEntries = new LinkedList<>();
for (TableOfContentsData.EntryData entryData : entries) {
boolean terminal = isTerminal(entryData);
List<PageNode> pages = Arrays.stream(entryData.pages()).map(pageNumber -> getPage(pageNumber, context)).toList();
SemanticNode node = switch (entryData.type()) {
case SECTION -> buildSection(context);
case PARAGRAPH -> buildParagraph(context, terminal);
case HEADLINE -> buildHeadline(context, terminal);
case HEADER -> buildHeader(context, terminal);
case FOOTER -> buildFooter(context, terminal);
case TABLE -> buildTable(context, entryData.properties());
case TABLE_CELL -> buildTableCell(context, entryData.properties(), terminal);
case IMAGE -> buildImage(context, entryData.properties());
default -> throw new UnsupportedOperationException("Not yet implemented for type " + entryData.type());
};
if (node.isTerminal()) {
TextBlock textBlock = toTextBlock(entryData.atomicBlocks(), context, node);
node.setTerminalTextBlock(textBlock);
}
List<Integer> tocId = Arrays.stream(entryData.tocId()).boxed().toList();
node.setTocId(tocId);
if (entryData.type() == HEADER) {
pages.forEach(page -> page.setHeader((HeaderNode) node));
} else if (entryData.type() == FOOTER) {
pages.forEach(page -> page.setFooter((FooterNode) node));
} else {
pages.forEach(page -> page.getMainBody().add(node));
}
newEntries.add(TableOfContents.Entry.builder().tocId(tocId).type(entryData.type()).children(buildEntries(entryData.subEntries(), context)).node(node).build());
}
return newEntries;
}
private HeadlineNode buildHeadline(Context context, boolean terminal) {
return HeadlineNode.builder().terminal(terminal).tableOfContents(context.tableOfContents()).build();
}
private static boolean isTerminal(TableOfContentsData.EntryData entryData) {
return entryData.atomicBlocks().length > 0;
}
private ImageNode buildImage(Context context, Map<String, String> properties) {
var builder = ImageNode.builder();
PropertiesMapper.parseImageProperties(properties, builder);
return builder.tableOfContents(context.tableOfContents()).build();
}
private TableCellNode buildTableCell(Context context, Map<String, String> properties, boolean terminal) {
TableCellNode.TableCellNodeBuilder builder = TableCellNode.builder();
PropertiesMapper.parseTableCellProperties(properties, builder);
return builder.terminal(terminal).tableOfContents(context.tableOfContents()).build();
}
private TableNode buildTable(Context context, Map<String, String> properties) {
TableNode.TableNodeBuilder builder = TableNode.builder();
PropertiesMapper.parseTableProperties(properties, builder);
return TableNode.builder().tableOfContents(context.tableOfContents()).build();
}
private FooterNode buildFooter(Context context, boolean terminal) {
return FooterNode.builder().terminal(terminal).tableOfContents(context.tableOfContents()).build();
}
private HeaderNode buildHeader(Context context, boolean terminal) {
return HeaderNode.builder().terminal(terminal).tableOfContents(context.tableOfContents()).build();
}
private SectionNode buildSection(Context context) {
return SectionNode.builder().tableOfContents(context.tableOfContents()).build();
}
private ParagraphNode buildParagraph(Context context, boolean terminal) {
return ParagraphNode.builder().terminal(terminal).tableOfContents(context.tableOfContents()).build();
}
private TextBlock toTextBlock(Long[] atomicTextBlockIds, Context context, SemanticNode parent) {
return Arrays.stream(atomicTextBlockIds)
.map(atomicTextBlockId -> toAtomicTextBlock(context.atomicTextBlockData.get(Math.toIntExact(atomicTextBlockId)),
context.atomicPositionBlockData.get(Math.toIntExact(atomicTextBlockId)),
parent,
context))
.collect(new TextBlockCollector());
}
private PageNode buildPage(PageData p) {
return PageNode.builder().rotation(p.getRotation()).height(p.getHeight()).width(p.getWidth()).number(p.getNumber()).mainBody(new LinkedList<>()).build();
}
private AtomicTextBlock toAtomicTextBlock(AtomicTextBlockData atomicTextBlockData,
AtomicPositionBlockData atomicPositionBlockData,
SemanticNode parent,
Context context) {
return AtomicTextBlock.builder()
.id(atomicTextBlockData.getId())
.numberOnPage(atomicTextBlockData.getNumberOnPage())
.page(getPage(atomicTextBlockData.getPage(), context))
.boundary(new Boundary(atomicTextBlockData.getStart(), atomicTextBlockData.getEnd()))
.searchText(atomicTextBlockData.getSearchText())
.lineBreaks(Ints.asList(atomicTextBlockData.getLineBreaks()))
.stringIdxToPositionIdx(Ints.asList(atomicPositionBlockData.getStringIdxToPositionIdx()))
.positions(toRectangle2DList(atomicPositionBlockData.getPositions()))
.parent(parent)
.build();
}
private static List<Rectangle2D> toRectangle2DList(float[][] positions) {
return Arrays.stream(positions).map(floatArr -> (Rectangle2D) new Rectangle2D.Float(floatArr[0], floatArr[1], floatArr[2], floatArr[3])).toList();
}
private PageNode getPage(Long pageIndex, Context context) {
return context.pages.stream()
.filter(page -> page.getNumber() == Math.toIntExact(pageIndex))
.findFirst()
.orElseThrow(() -> new NoSuchElementException(String.format("ClassificationPage with number %d not found", pageIndex)));
}
record Context(
DocumentData layoutParsingModel,
TableOfContents tableOfContents,
List<PageNode> pages,
List<SectionNode> sections,
List<AtomicTextBlockData> atomicTextBlockData,
List<AtomicPositionBlockData> atomicPositionBlockData) {
}
}

View File

@ -0,0 +1,101 @@
package com.knecon.fforesight.service.layoutparser.internal.api.mapper;
import java.awt.geom.Rectangle2D;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.ImageNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.ImageType;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.TableCellNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.TableNode;
import com.knecon.fforesight.service.layoutparser.internal.api.services.RectangleTransformations;
/**
 * Serializes node-specific attributes (images, tables, table cells) to and from
 * the flat String property maps stored in {@code TableOfContentsData}.
 */
public class PropertiesMapper {

    private PropertiesMapper() {
        // Utility class — no instances.
    }

    public static Map<String, String> buildImageProperties(ImageNode image) {
        Map<String, String> properties = new HashMap<>();
        properties.put("imageType", image.getImageType().toString());
        properties.put("transparency", String.valueOf(image.isTransparency()));
        properties.put("position", RectangleTransformations.toString(image.getPosition()));
        return properties;
    }

    /**
     * @throws IllegalArgumentException if the cell spans more than one page — the flat
     *         property format can only hold a single bounding box
     */
    public static Map<String, String> buildTableCellProperties(TableCellNode tableCell) {
        Map<String, String> properties = new HashMap<>();
        properties.put("row", String.valueOf(tableCell.getRow()));
        properties.put("col", String.valueOf(tableCell.getCol()));
        properties.put("header", String.valueOf(tableCell.isHeader()));
        if (tableCell.getPages().size() > 1 || tableCell.getBBox().keySet().size() > 1) {
            throw new IllegalArgumentException("TableCell can only occur on a single page!");
        }
        String bBoxString = RectangleTransformations.toString(tableCell.getBBox().get(tableCell.getPages().stream().findFirst().get()));
        properties.put("bBox", bBoxString);
        return properties;
    }

    public static Map<String, String> buildTableProperties(TableNode table) {
        Map<String, String> properties = new HashMap<>();
        properties.put("numberOfRows", String.valueOf(table.getNumberOfRows()));
        properties.put("numberOfCols", String.valueOf(table.getNumberOfCols()));
        return properties;
    }

    public static void parseImageProperties(Map<String, String> properties, ImageNode.ImageNodeBuilder builder) {
        builder.imageType(parseImageType(properties.get("imageType")));
        builder.transparency(Boolean.parseBoolean(properties.get("transparency")));
        builder.position(parseRectangle2D(properties.get("position")));
    }

    public static void parseTableCellProperties(Map<String, String> properties, TableCellNode.TableCellNodeBuilder builder) {
        builder.row(Integer.parseInt(properties.get("row")));
        builder.col(Integer.parseInt(properties.get("col")));
        builder.header(Boolean.parseBoolean(properties.get("header")));
        builder.bBox(parseRectangle2D(properties.get("bBox")));
    }

    public static void parseTableProperties(Map<String, String> properties, TableNode.TableNodeBuilder builder) {
        builder.numberOfRows(Integer.parseInt(properties.get("numberOfRows")));
        builder.numberOfCols(Integer.parseInt(properties.get("numberOfCols")));
    }

    /** Unknown labels deliberately fall back to {@link ImageType#OTHER}. */
    private static ImageType parseImageType(String imageType) {
        return switch (imageType) {
            case "LOGO" -> ImageType.LOGO;
            case "FORMULA" -> ImageType.FORMULA;
            case "SIGNATURE" -> ImageType.SIGNATURE;
            case "OCR" -> ImageType.OCR;
            default -> ImageType.OTHER;
        };
    }

    /**
     * Formats as "x,y,width,height". Locale.ROOT is required: with a default locale that
     * uses ',' as decimal separator (e.g. German), "%f" would emit commas and break the
     * comma-split in {@link #parseRectangle2D(String)}.
     */
    public static String toString(Rectangle2D rectangle2D) {
        return String.format(Locale.ROOT, "%f,%f,%f,%f", rectangle2D.getX(), rectangle2D.getY(), rectangle2D.getWidth(), rectangle2D.getHeight());
    }

    /** Inverse of {@link #toString(Rectangle2D)}; expects "x,y,width,height" with '.' decimals. */
    public static Rectangle2D parseRectangle2D(String bBox) {
        List<Float> floats = Arrays.stream(bBox.split(",")).map(Float::parseFloat).toList();
        return new Rectangle2D.Float(floats.get(0), floats.get(1), floats.get(2), floats.get(3));
    }
}

View File

@ -0,0 +1,10 @@
package com.knecon.fforesight.service.layoutparser.internal.api.services;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
/**
 * Callback used by the insertion pipeline to enrich an entity with information
 * derived from the text block it was located in.
 */
public interface EntityEnrichmentService {
/**
 * Enriches {@code entity} using {@code textBlock}, the text context the entity was found in.
 * Implementations are expected to mutate the passed entity; nothing is returned.
 */
void enrichEntity(EntityNode entity, TextBlock textBlock);
}

View File

@ -0,0 +1,65 @@
package com.knecon.fforesight.service.layoutparser.internal.api.services;
import java.util.NoSuchElementException;
import java.util.Set;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.Boundary;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SemanticNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
import lombok.RequiredArgsConstructor;
@RequiredArgsConstructor
public class EntityInsertionService {

    private final EntityEnrichmentService entityEnrichmentService;

    /**
     * Inserts an entity into the document graph: finds the first top-level TOC node whose
     * text block contains the entity's boundary, lets the node hierarchy register itself on
     * the entity, enriches the entity from its deepest containing node's text, and links the
     * entity to its pages and intersecting nodes.
     *
     * <p>If no containing node exists (NoSuchElementException — note this also catches the
     * same exception thrown by deeper lookups), the entity is enriched from the whole
     * document text as a best effort and then detached from the graph.
     */
    public void addEntityToGraph(EntityNode entity, TableOfContents tableOfContents) {
        try {
            SemanticNode containingNode = tableOfContents.getEntries()
                    .stream()
                    .map(TableOfContents.Entry::node)
                    .filter(node -> node.buildTextBlock().containsBoundary(entity.getBoundary()))
                    .findFirst()
                    .orElseThrow(() -> new NoSuchElementException("No containing Node found!"));
            containingNode.addThisToEntityIfIntersects(entity);
            TextBlock textBlock = entity.getDeepestFullyContainingNode().buildTextBlock();
            entityEnrichmentService.enrichEntity(entity, textBlock);
            addToPages(entity);
            addToNodeEntitySets(entity);
        } catch (NoSuchElementException e) {
            // Best-effort fallback: enrich from the full document text, then remove the
            // dangling entity so the graph stays consistent.
            entityEnrichmentService.enrichEntity(entity, tableOfContents.buildTextBlock());
            entity.removeFromGraph();
        }
    }

    /** Links the entity to the pages of its deepest fully containing node (both directions). */
    private void addToPages(EntityNode entity) {
        Set<PageNode> pages = entity.getDeepestFullyContainingNode().getPages();
        entity.getPages().addAll(pages);
        pages.forEach(page -> page.getEntities().add(entity));
    }

    /** Registers the entity on every node it intersects. */
    private void addToNodeEntitySets(EntityNode entity) {
        entity.getIntersectingNodes().forEach(node -> node.getEntities().add(entity));
    }

    // Removed dead code: a private static toLineAfterBoundary(TextBlock, Boundary) helper
    // was never referenced anywhere in this class.
}

View File

@ -0,0 +1,95 @@
package com.knecon.fforesight.service.layoutparser.internal.api.services;
import static java.lang.String.format;
import java.awt.geom.Area;
import java.awt.geom.Rectangle2D;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.function.BiConsumer;
import java.util.function.BinaryOperator;
import java.util.function.Function;
import java.util.function.Supplier;
import java.util.stream.Collector;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.AtomicTextBlock;
import lombok.experimental.UtilityClass;
@UtilityClass
public class RectangleTransformations {

    /** Grows the rectangle by {@code deltaX}/{@code deltaY} on each side. */
    public static Rectangle2D pad(Rectangle2D rectangle2D, int deltaX, int deltaY) {
        return new Rectangle2D.Double(rectangle2D.getMinX() - deltaX, rectangle2D.getMinY() - deltaY, rectangle2D.getWidth() + 2 * deltaX, rectangle2D.getHeight() + 2 * deltaY);
    }

    /** Bounding box of all position rectangles of the given atomic text blocks. */
    public static Rectangle2D bBoxUnionAtomicTextBlock(List<AtomicTextBlock> atomicTextBlocks) {
        return atomicTextBlocks.stream().flatMap(atomicTextBlock -> atomicTextBlock.getPositions().stream()).collect(new Rectangle2DUnion());
    }

    /** Bounding box of the union of the given rectangles. */
    public static Rectangle2D rectangleUnion(List<Rectangle2D> rectangle2DList) {
        return rectangle2DList.stream().collect(new Rectangle2DUnion());
    }

    /**
     * Formats as "x,y,width,height". Locale.ROOT is required: with a default locale that
     * uses ',' as decimal separator, "%f" would emit commas and break the comma-split in
     * {@link #parseRectangle2D(String)}.
     */
    public static String toString(Rectangle2D rectangle2D) {
        return format(Locale.ROOT, "%f,%f,%f,%f", rectangle2D.getX(), rectangle2D.getY(), rectangle2D.getWidth(), rectangle2D.getHeight());
    }

    /** Inverse of {@link #toString(Rectangle2D)}; expects "x,y,width,height" with '.' decimals. */
    public static Rectangle2D parseRectangle2D(String bBox) {
        List<Float> floats = Arrays.stream(bBox.split(",")).map(Float::parseFloat).toList();
        return new Rectangle2D.Float(floats.get(0), floats.get(1), floats.get(2), floats.get(3));
    }

    /** Collects rectangles into a single {@link Area} and finishes with its bounding box. */
    private static class Rectangle2DUnion implements Collector<Rectangle2D, Area, Rectangle2D> {

        @Override
        public Supplier<Area> supplier() {
            return Area::new;
        }

        @Override
        public BiConsumer<Area, Rectangle2D> accumulator() {
            return (area, rectangle2D) -> area.add(new Area(rectangle2D));
        }

        @Override
        public BinaryOperator<Area> combiner() {
            return (area1, area2) -> {
                area1.add(area2);
                return area1;
            };
        }

        @Override
        public Function<Area, Rectangle2D> finisher() {
            return Area::getBounds2D;
        }

        @Override
        public Set<Characteristics> characteristics() {
            // Bug fix: CONCURRENT was removed. java.awt.geom.Area is not thread-safe, and
            // CONCURRENT permits the framework to call the accumulator concurrently on a
            // single shared Area in parallel streams, corrupting the result.
            return Set.of(Characteristics.UNORDERED);
        }
    }
}

View File

@ -0,0 +1,161 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>com.knecon.fforesight</groupId>
<artifactId>layoutparser-service</artifactId>
<version>1.0.0</version>
</parent>
<artifactId>layoutparser-service-processor</artifactId>
<version>1.0.0</version>
<!-- Internal platform modules, PDF/JSON processing libraries and Spring starters.
     NOTE(review): property placeholders (pdfbox.version, jackson.version, spring.version)
     and lombok's version are presumably managed by the parent POM - verify there. -->
<dependencies>
<dependency>
<groupId>com.iqser.red.service</groupId>
<artifactId>persistence-service-internal-api-v1</artifactId>
<version>2.36.0</version>
</dependency>
<dependency>
<groupId>com.knecon.fforesight</groupId>
<artifactId>layoutparser-service-internal-api</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>com.iqser.red.commons</groupId>
<artifactId>spring-commons</artifactId>
<version>6.2.0</version>
</dependency>
<dependency>
<groupId>com.iqser.red.commons</groupId>
<artifactId>storage-commons</artifactId>
<version>1.13.0</version>
</dependency>
<dependency>
<groupId>com.dslplatform</groupId>
<artifactId>dsl-json-java8</artifactId>
<version>1.10.0</version>
</dependency>
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>${pdfbox.version}</version>
</dependency>
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox-tools</artifactId>
<version>${pdfbox.version}</version>
</dependency>
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>31.1-jre</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.module</groupId>
<artifactId>jackson-module-afterburner</artifactId>
<version>${jackson.version}</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.datatype</groupId>
<artifactId>jackson-datatype-jsr310</artifactId>
<version>${jackson.version}</version>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-security</artifactId>
<version>${spring.version}</version>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-web</artifactId>
<version>${spring.version}</version>
</dependency>
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<optional>true</optional>
</dependency>
<dependency>
<groupId>org.springframework.cloud</groupId>
<artifactId>spring-cloud-starter-openfeign</artifactId>
<version>4.0.2</version>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-amqp</artifactId>
<version>${spring.version}</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-maven-plugin</artifactId>
<configuration>
<!-- Lombok is compile-time only; keep it out of the repackaged boot jar. -->
<excludes>
<exclude>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
</exclude>
</excludes>
</configuration>
</plugin>
</plugins>
</build>
<!-- Spring milestone/snapshot repositories, required by the pre-release cloud starters. -->
<repositories>
<repository>
<id>spring-milestones</id>
<name>Spring Milestones</name>
<url>https://repo.spring.io/milestone</url>
<snapshots>
<enabled>false</enabled>
</snapshots>
</repository>
<repository>
<id>spring-snapshots</id>
<name>Spring Snapshots</name>
<url>https://repo.spring.io/snapshot</url>
<releases>
<enabled>false</enabled>
</releases>
</repository>
</repositories>
<pluginRepositories>
<pluginRepository>
<id>spring-milestones</id>
<name>Spring Milestones</name>
<url>https://repo.spring.io/milestone</url>
<snapshots>
<enabled>false</enabled>
</snapshots>
</pluginRepository>
<pluginRepository>
<id>spring-snapshots</id>
<name>Spring Snapshots</name>
<url>https://repo.spring.io/snapshot</url>
<releases>
<enabled>false</enabled>
</releases>
</pluginRepository>
</pluginRepositories>
</project>

View File

@ -0,0 +1,114 @@
package com.knecon.fforesight.service.layoutparser.processor;
import static java.lang.String.format;
import java.io.IOException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.DocumentGraph;
import com.knecon.fforesight.service.layoutparser.internal.api.mapper.DocumentDataMapper;
import com.knecon.fforesight.service.layoutparser.processor.adapter.CvTableParsingAdapter;
import com.knecon.fforesight.service.layoutparser.processor.adapter.ImageServiceResponseAdapter;
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.image.ImageServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.classification.service.ClassificationService;
import com.knecon.fforesight.service.layoutparser.processor.classification.service.PdfParsingService;
import com.knecon.fforesight.service.layoutparser.processor.classification.service.SectionsBuilderService;
import com.knecon.fforesight.service.layoutparser.processor.factory.DocumentGraphFactory;
import com.knecon.fforesight.service.layoutparser.processor.queue.LayoutParsingFinishedEvent;
import com.knecon.fforesight.service.layoutparser.processor.queue.LayoutParsingRequest;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@Service
@RequiredArgsConstructor
public class LayoutParsingService {

    private final ImageServiceResponseAdapter imageServiceResponseAdapter;
    private final CvTableParsingAdapter cvTableParsingAdapter;
    private final LayoutParsingStorageService layoutParsingStorageService;
    private final PdfParsingService pdfParsingService;
    private final ClassificationService classificationService;
    private final SectionsBuilderService sectionsBuilderService;
    private final DocumentGraphFactory documentGraphFactory;
    private final DocumentDataMapper documentDataMapper;

    /**
     * Loads the origin PDF plus optional image/table classification inputs from storage,
     * parses the document layout and writes the resulting document data back to storage.
     * Failures are reported via the returned event's status/message instead of exceptions.
     */
    public LayoutParsingFinishedEvent parseLayoutAndSaveFilesToStorage(LayoutParsingRequest layoutParsingRequest) {
        PDDocument originDocument;
        try {
            originDocument = layoutParsingStorageService.getOriginFile(layoutParsingRequest.getOriginFileStorageId());
        } catch (IOException e) {
            log.error(e.toString());
            return LayoutParsingFinishedEvent.builder()
                    .status(400)
                    // Bug fix: the message previously reported the page-file id instead of the origin-file id.
                    .message(format("Origin PDF File with id %s could not be loaded!", layoutParsingRequest.getOriginFileStorageId()))
                    .build();
        }
        ImageServiceResponse imageServiceResponse = new ImageServiceResponse();
        if (layoutParsingRequest.getImagesFileStorageId().isPresent()) {
            try {
                // Bug fix: previously loaded with getPageFileStorageId() — the wrong storage id.
                imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.getImagesFileStorageId().get());
            } catch (IOException e) {
                log.error(e.toString());
                return LayoutParsingFinishedEvent.builder()
                        .status(400)
                        .message(format("Image Service File with id %s could not be loaded!", layoutParsingRequest.getImagesFileStorageId()))
                        .build();
            }
        }
        TableServiceResponse tableServiceResponse = new TableServiceResponse();
        if (layoutParsingRequest.getTablesFileStorageId().isPresent()) {
            try {
                // Bug fix: previously loaded with getPageFileStorageId() — the wrong storage id.
                tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.getTablesFileStorageId().get());
            } catch (IOException e) {
                log.error(e.toString());
                return LayoutParsingFinishedEvent.builder()
                        .status(400)
                        // Bug fix: the message previously reported the page-file id.
                        .message(format("CV Table Parsing File with id %s could not be loaded!", layoutParsingRequest.getTablesFileStorageId()))
                        .build();
            }
        }
        DocumentGraph documentGraph = parseLayout(originDocument, imageServiceResponse, tableServiceResponse);
        try {
            layoutParsingStorageService.storeDocumentData(layoutParsingRequest, documentDataMapper.toDocumentData(documentGraph));
        } catch (IOException e) {
            log.error("Parsed Document files could not be saved!");
            log.error(e.getMessage());
            return LayoutParsingFinishedEvent.builder().status(500).message("Files could not be saved").build();
        }
        return LayoutParsingFinishedEvent.builder()
                .status(200)
                .message(format("Layout parsing is finished and files have been saved with Ids:\n Structure: %s\nText: %s\nPositions: %s\nPageData: %s",
                        layoutParsingRequest.getStructureFileStorageId(),
                        layoutParsingRequest.getTextBlockFileStorageId(),
                        layoutParsingRequest.getPositionBlockFileStorageId(),
                        layoutParsingRequest.getPageFileStorageId()))
                .build();
    }

    /**
     * Pure parsing pipeline: PDF parsing (with CV table/image hints), classification,
     * section building, then graph construction. Does not touch storage.
     */
    public DocumentGraph parseLayout(PDDocument originDocument, ImageServiceResponse imageServiceResponse, TableServiceResponse tableServiceResponse) {
        ClassificationDocument classificationDocument = pdfParsingService.parseDocument(originDocument,
                cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse),
                imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse));
        classificationService.classifyDocument(classificationDocument);
        sectionsBuilderService.buildSections(classificationDocument);
        return documentGraphFactory.buildDocumentGraph(classificationDocument);
    }
}

View File

@ -0,0 +1,126 @@
package com.knecon.fforesight.service.layoutparser.processor;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import org.apache.commons.io.IOUtils;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.io.MemoryUsageSetting;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.springframework.stereotype.Service;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.iqser.red.storage.commons.service.StorageService;
import com.knecon.fforesight.service.layoutparser.internal.api.data.AtomicPositionBlockData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.AtomicTextBlockData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.DocumentData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.PageData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.TableOfContentsData;
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.image.ImageServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.multitenancy.TenantContext;
import com.knecon.fforesight.service.layoutparser.processor.queue.LayoutParsingRequest;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@Service
@RequiredArgsConstructor
public class LayoutParsingStorageService {

    private final StorageService storageService;
    private final ObjectMapper objectMapper;

    /**
     * Downloads the origin PDF into a temp file and loads it with mixed memory usage
     * (up to 64 MiB in RAM, remainder on disk).
     *
     * @throws IOException if the object cannot be fetched or the PDF cannot be loaded
     */
    public PDDocument getOriginFile(String storageId) throws IOException {
        try (var originDocumentInputStream = storageService.getObject(TenantContext.getTenantId(), storageId).getInputStream()) {
            File tempFile = createTempFile("document", ".pdf");
            try (var tempFileOutputStream = new FileOutputStream(tempFile)) {
                IOUtils.copy(originDocumentInputStream, tempFileOutputStream);
            }
            // PDDocument may keep reading lazily from the file, so it cannot be deleted here;
            // createTempFile registers it for deletion on JVM exit instead.
            return Loader.loadPDF(tempFile, MemoryUsageSetting.setupMixed(67108864L));
        }
    }

    public ImageServiceResponse getImagesFile(String storageId) throws IOException {
        try (InputStream inputStream = storageService.getObject(TenantContext.getTenantId(), storageId).getInputStream()) {
            return objectMapper.readValue(inputStream, ImageServiceResponse.class);
        }
    }

    public TableServiceResponse getTablesFile(String storageId) throws IOException {
        try (var tableClassificationStream = storageService.getObject(TenantContext.getTenantId(), storageId).getInputStream()) {
            return objectMapper.readValue(tableClassificationStream, TableServiceResponse.class);
        }
    }

    /** Persists the four parts of the document data under the ids carried by the request. */
    public void storeDocumentData(LayoutParsingRequest layoutParsingRequest, DocumentData documentData) throws IOException {
        storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.getStructureFileStorageId(), documentData.getTableOfContents());
        storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.getTextBlockFileStorageId(), documentData.getAtomicTextBlocks());
        storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.getPositionBlockFileStorageId(), documentData.getAtomicPositionBlocks());
        storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.getPageFileStorageId(), documentData.getPages());
    }

    /** Inverse of {@link #storeDocumentData}: reloads the four parts and reassembles them. */
    public DocumentData readDocumentData(LayoutParsingRequest layoutParsingRequest) throws IOException {
        PageData[] pageData = storageService.readJSONObject(TenantContext.getTenantId(), layoutParsingRequest.getPageFileStorageId(), PageData[].class);
        AtomicTextBlockData[] atomicTextBlockData = storageService.readJSONObject(TenantContext.getTenantId(),
                layoutParsingRequest.getTextBlockFileStorageId(),
                AtomicTextBlockData[].class);
        AtomicPositionBlockData[] atomicPositionBlockData = storageService.readJSONObject(TenantContext.getTenantId(),
                layoutParsingRequest.getPositionBlockFileStorageId(),
                AtomicPositionBlockData[].class);
        TableOfContentsData tableOfContentsData = storageService.readJSONObject(TenantContext.getTenantId(),
                layoutParsingRequest.getStructureFileStorageId(),
                TableOfContentsData.class);
        return DocumentData.builder()
                .tableOfContents(tableOfContentsData)
                .atomicPositionBlocks(atomicPositionBlockData)
                .atomicTextBlocks(atomicTextBlockData)
                .pages(pageData)
                .build();
    }

    /** Creates an owner-only temp file that is cleaned up at JVM shutdown. */
    private File createTempFile(String filenamePrefix, String filenameSuffix) throws IOException {
        File tempFile = Files.createTempFile(filenamePrefix, filenameSuffix).toFile();
        // Fix: the temp PDF was never deleted, leaking one file per processed document.
        tempFile.deleteOnExit();
        setRWPermissionsOnlyForOwner(tempFile);
        return tempFile;
    }

    // We don't need to check the results of the permission setters below,
    // since we're manipulating a file we created ourselves.
    @SuppressWarnings({"ResultOfMethodCallIgnored", "squid:S899"})
    private void setRWPermissionsOnlyForOwner(File tempFile) {
        try {
            tempFile.setReadable(true, true);
            tempFile.setWritable(true, true);
            tempFile.setExecutable(false);
        } catch (SecurityException ex) {
            // This should never happen since we're creating a temp file ourselves.
            log.warn("Caught an exception during temp file creation. This should not happen. Check the code.", ex);
        }
    }
}

View File

@ -0,0 +1,10 @@
package com.knecon.fforesight.service.layoutparser.processor;
import org.springframework.context.annotation.ComponentScan;
import org.springframework.context.annotation.Configuration;
/**
 * Spring configuration entry point for the layoutparser processor module.
 * Component-scans this package (and subpackages) so the module's services are
 * registered when the configuration is imported by a host application.
 */
@Configuration
@ComponentScan
public class LayoutparserServiceProcessorConfiguration {
}

View File

@ -0,0 +1,49 @@
package com.knecon.fforesight.service.layoutparser.processor.adapter;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.CvParsedTableCell;
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableServiceResponse;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@Service
@RequiredArgsConstructor
public class CvTableParsingAdapter {

    /**
     * Converts the CV table service response into classification-DTO table cells,
     * grouped by page number.
     */
    public Map<Integer, List<com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.CvParsedTableCell>> buildCvParsedTablesPerPage(TableServiceResponse tableServiceResponse) {
        Map<Integer, List<com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.CvParsedTableCell>> tableCells = new HashMap<>();
        tableServiceResponse.getData()
                // Lambda parameter is the page number used as map key (was misleadingly named 'tableCell').
                .forEach(tableData -> tableCells.computeIfAbsent(tableData.getPageInfo().getNumber(), pageNumber -> new ArrayList<>())
                        .addAll(convertTableCells(tableData.getTableCells())));
        return tableCells;
    }

    /** Maps adapter-model cells 1:1 onto the classification DTO of the same name. */
    private Collection<? extends com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.CvParsedTableCell> convertTableCells(List<CvParsedTableCell> tableCells) {
        return tableCells.stream()
                .map(t -> com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.CvParsedTableCell.builder()
                        .y0(t.getY0())
                        .x1(t.getX1())
                        .y1(t.getY1())
                        .x0(t.getX0())
                        .width(t.getWidth())
                        .height(t.getHeight())
                        .build())
                .toList();
    }
}

View File

@ -0,0 +1,67 @@
package com.knecon.fforesight.service.layoutparser.processor.adapter;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.ImageType;
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.image.ImageServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.image.ClassifiedImage;
import lombok.RequiredArgsConstructor;
@Service
@RequiredArgsConstructor
public class ImageServiceResponseAdapter {
/**
 * Converts the image service response into {@link ClassifiedImage}s grouped by page number.
 * An image keeps its classified label only if all filters passed; otherwise it is
 * downgraded to {@link ImageType#OTHER}.
 */
public Map<Integer, List<ClassifiedImage>> buildClassifiedImagesPerPage(ImageServiceResponse imageServiceResponse ) {
Map<Integer, List<ClassifiedImage>> images = new HashMap<>();
imageServiceResponse.getData().forEach(imageMetadata -> {
// Locale.ROOT keeps the upper-casing independent of the default locale.
var classification = imageMetadata.getFilters().isAllPassed() ? ImageType.valueOf(imageMetadata.getClassification()
.getLabel()
.toUpperCase(Locale.ROOT)) : ImageType.OTHER;
// NOTE(review): position (x1, y1) is used as the rectangle origin together with the
// geometry's width/height — assumes x1/y1 is the top-left corner; confirm with the image service.
images.computeIfAbsent(imageMetadata.getPosition().getPageNumber(), x -> new ArrayList<>())
.add(new ClassifiedImage(new Rectangle2D.Double(imageMetadata.getPosition().getX1(),
imageMetadata.getPosition().getY1(),
imageMetadata.getGeometry().getWidth(),
imageMetadata.getGeometry().getHeight()), classification, imageMetadata.isAlpha(), imageMetadata.getPosition().getPageNumber()));
});
// TODO(review): this duplicates the mapping above for CV-detected images. It is expected to
// change later, since unclassified CV images probably should not be included at all.
imageServiceResponse.getDataCV().forEach(imageMetadata -> {
var classification = imageMetadata.getFilters().isAllPassed() ? ImageType.valueOf(imageMetadata.getClassification()
.getLabel()
.toUpperCase(Locale.ROOT)) : ImageType.OTHER;
images.computeIfAbsent(imageMetadata.getPosition().getPageNumber(), x -> new ArrayList<>())
.add(new ClassifiedImage(new Rectangle2D.Double(imageMetadata.getPosition().getX1(),
imageMetadata.getPosition().getY1(),
imageMetadata.getGeometry().getWidth(),
imageMetadata.getGeometry().getHeight()), classification, imageMetadata.isAlpha(), imageMetadata.getPosition().getPageNumber()));
});
return images;
}
/**
 * Re-labels OTHER images as OCR when an image's position fully contains at least one
 * text block of the page (i.e. the "image" is actually scanned text).
 */
public void findOcr(ClassificationPage classificationPage) {
classificationPage.getImages().forEach(image -> {
if (image.getImageType().equals(ImageType.OTHER)) {
classificationPage.getTextBlocks().forEach(textblock -> {
if (image.getPosition().contains(textblock.getMinX(), textblock.getMinY(), textblock.getWidth(), textblock.getHeight())) {
image.setImageType(ImageType.OCR);
}
});
}
});
}
}

View File

@ -0,0 +1,17 @@
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image;
import java.util.HashMap;
import java.util.Map;
import com.dslplatform.json.CompiledJson;
import lombok.Data;
/**
 * Classification result the image service produced for a single image.
 */
@Data
@CompiledJson
public class Classification {

    // Confidence per candidate label — presumably keyed by the same strings as {@code label}; TODO confirm against the service contract.
    private Map<String, Float> probabilities = new HashMap<>();
    // Winning label; upper-cased by ImageServiceResponseAdapter to resolve an ImageType.
    private String label;
}

View File

@ -0,0 +1,14 @@
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image;
import com.dslplatform.json.CompiledJson;
import lombok.Data;
/**
 * Geometry-related filter results for one image: size and aspect-ratio checks.
 */
@Data
@CompiledJson
public class FilterGeometry {

    private ImageSize imageSize;
    private Format imageFormat;
}

View File

@ -0,0 +1,15 @@
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image;
import com.dslplatform.json.CompiledJson;
import lombok.Data;
/**
 * Aggregated filter verdicts for one image. {@code allPassed} gates whether the predicted
 * classification label is trusted (see ImageServiceResponseAdapter).
 */
@Data
@CompiledJson
public class Filters {

    private FilterGeometry geometry;
    private Probability probability;
    // true only if every individual filter passed
    private boolean allPassed;
}

View File

@ -0,0 +1,15 @@
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image;
import com.dslplatform.json.CompiledJson;
import lombok.Data;
/**
 * Aspect-ratio filter result of an image.
 */
@Data
@CompiledJson
public class Format {

    // aspect-ratio quotient — orientation (w/h vs. h/w) not visible here; TODO confirm
    private float quotient;
    private boolean tooTall;
    private boolean tooWide;
}

View File

@ -0,0 +1,14 @@
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image;
import com.dslplatform.json.CompiledJson;
import lombok.Data;
/**
 * Dimensions of an image as reported by the image service (units not specified here).
 */
@Data
@CompiledJson
public class Geometry {

    private float width;
    private float height;
}

View File

@ -0,0 +1,33 @@
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image;
import java.util.ArrayList;
import java.util.List;
import com.dslplatform.json.CompiledJson;
import com.dslplatform.json.JsonAttribute;
import com.fasterxml.jackson.annotation.JsonAlias;
import com.fasterxml.jackson.annotation.JsonProperty;
import lombok.Data;
/**
 * Response payload of the image classification service.
 * <p>
 * Annotated for both Jackson ({@code @JsonProperty}/{@code @JsonAlias}) and dsl-json
 * ({@code @JsonAttribute}) so the payload can be parsed by either library; both the wire name
 * "imageMetadata" and the field name "data" are accepted.
 */
@Data
@CompiledJson
public class ImageServiceResponse {

    private String dossierId;
    private String fileId;
    @JsonProperty(value = "imageMetadata")
    @JsonAlias("data")
    @JsonAttribute(alternativeNames = {"imageMetadata"})
    private List<Metadata> data = new ArrayList<>();
    // CV-detected image metadata; currently mapped identically to {@code data} by the adapter.
    private List<Metadata> dataCV = new ArrayList<>();

    // Annotations repeated on the setter so mapping also works when the parser binds via setters.
    @JsonProperty(value = "imageMetadata")
    @JsonAlias("data")
    @JsonAttribute(alternativeNames = {"imageMetadata"})
    public void setData(List<Metadata> data) {this.data = data;}
}

View File

@ -0,0 +1,15 @@
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image;
import com.dslplatform.json.CompiledJson;
import lombok.Data;
/**
 * Size filter result of an image.
 */
@Data
@CompiledJson
public class ImageSize {

    // size quotient used by the filter — exact definition not visible here; TODO confirm
    private float quotient;
    private boolean tooLarge;
    private boolean tooSmall;
}

View File

@ -0,0 +1,17 @@
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image;
import com.dslplatform.json.CompiledJson;
import lombok.Data;
/**
 * All information the image service reports for a single image.
 */
@Data
@CompiledJson
public class Metadata {

    private Classification classification;
    private Position position;
    private Geometry geometry;
    private Filters filters;
    // presumably whether the image has an alpha channel — TODO confirm
    private boolean alpha;
}

View File

@ -0,0 +1,17 @@
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image;
import com.dslplatform.json.CompiledJson;
import lombok.Data;
/**
 * Location of an image on a page. (x1,y1)/(x2,y2) are presumably opposite corners of the
 * bounding box — TODO confirm the coordinate origin; the adapter uses (x1,y1) as the anchor
 * of the Rectangle2D it builds.
 */
@Data
@CompiledJson
public class Position {

    private float x1;
    private float x2;
    private float y1;
    private float y2;
    private int pageNumber;
}

View File

@ -0,0 +1,13 @@
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image;
import com.dslplatform.json.CompiledJson;
import lombok.Data;
/**
 * Confidence filter result: flags a classification whose probability was too low.
 */
@Data
@CompiledJson
public class Probability {

    private boolean unconfident;
}

View File

@ -0,0 +1,16 @@
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.table;
import com.dslplatform.json.CompiledJson;
import lombok.Data;
/**
 * Page description attached to a CV-parsed table result: page number, rotation and size.
 */
@Data
@CompiledJson
public class CvParsedPageInfo {

    private int number;
    // rotation in degrees — presumably one of 0/90/180/270; TODO confirm
    private int rotation;
    private float width;
    private float height;
}

View File

@ -0,0 +1,18 @@
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.table;
import com.dslplatform.json.CompiledJson;
import lombok.Data;
/**
 * One table cell detected by the CV table service, as raw coordinates.
 * <p>
 * NOTE(review): an almost identical DTO exists in
 * classification.dto.table.CvParsedTableCell and the adapter copies field-by-field between
 * the two — consider consolidating.
 */
@Data
@CompiledJson
public class CvParsedTableCell {

    private float x0;
    private float y0;
    private float x1;
    private float y1;
    private float width;
    private float height;
}

View File

@ -0,0 +1,17 @@
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.table;
import java.util.ArrayList;
import java.util.List;
import com.dslplatform.json.CompiledJson;
import lombok.Data;
/**
 * Table cells the CV table service found on one page, together with that page's info.
 */
@Data
@CompiledJson
public class CvParsedTableModel {

    private CvParsedPageInfo pageInfo;
    private List<CvParsedTableCell> tableCells = new ArrayList<>();
}

View File

@ -0,0 +1,22 @@
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.table;
import java.util.ArrayList;
import java.util.List;
import com.dslplatform.json.CompiledJson;
import lombok.Data;
/**
 * Response payload of the CV table detection service.
 */
@Data
@CompiledJson
public class TableServiceResponse {

    private String dossierId;
    private String fileId;
    private String operation;
    private String targetFileExtension;
    private String responseFileExtension;
    // presumably one entry per processed page (each carries its own CvParsedPageInfo)
    private List<CvParsedTableModel> data = new ArrayList<>();
}

View File

@ -0,0 +1,71 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.dto;
import java.awt.geom.Rectangle2D;
import com.dslplatform.json.JsonAttribute;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.TextBlockOrientation;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * Base class for anything on a page that carries text and occupies a bounding box
 * (text blocks, tables, ...).
 */
@Data
@AllArgsConstructor
@NoArgsConstructor
public abstract class AbstractTextContainer {

    protected float minX;
    protected float maxX;
    protected float minY;
    protected float maxY;
    protected String classification;
    protected int page;
    private TextBlockOrientation orientation = TextBlockOrientation.NONE;

    /** Concatenated text content of this container. */
    public abstract String getText();

    /**
     * Whether this container's box encloses the given text block (in PDF coordinates).
     * NOTE(review): the Y comparisons are inverted relative to the X ones (this.minY >= other's
     * minY, this.maxY <= other's maxY) — presumably because PDF y grows upwards; confirm before
     * relying on this elsewhere.
     */
    public boolean containsBlock(ClassificationTextBlock other) {
        return this.minX <= other.getPdfMinX() && this.maxX >= other.getPdfMaxX() && this.minY >= other.getPdfMinY() && this.maxY <= other.getPdfMaxY();
    }

    /**
     * Whether this container's box encloses {@code other}.
     * NOTE(review): same inverted Y comparison as {@link #containsBlock} — confirm intent.
     */
    public boolean contains(AbstractTextContainer other) {
        return this.minX <= other.minX && this.maxX >= other.maxX && this.minY >= other.minY && this.maxY <= other.maxY;
    }

    /**
     * NOTE(review): despite the name, this tests whether {@code other} contains THIS container's
     * box (the arguments are reversed) — verify that callers expect that direction.
     */
    public boolean contains(Rectangle2D other) {
        return other.contains(minX, minY, getWidth(), getHeight());
    }

    // Derived value; excluded from JSON output by both Jackson and dsl-json.
    @JsonIgnore
    @JsonAttribute(ignore = true)
    public float getHeight() {
        return maxY - minY;
    }

    // Derived value; excluded from JSON output by both Jackson and dsl-json.
    @JsonIgnore
    @JsonAttribute(ignore = true)
    public float getWidth() {
        return maxX - minX;
    }

    /** Whether the vertical extents of the two containers overlap (touching edges count). */
    public boolean intersectsY(AbstractTextContainer atc) {
        return this.minY <= atc.getMaxY() && this.maxY >= atc.getMinY();
    }
}

View File

@ -0,0 +1,27 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.dto;
import java.util.ArrayList;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.StringFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.UnclassifiedText;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * Root of the classification result for one document: pages, logical sections, repeated
 * headers/footers, leftover unclassified text, plus document-wide font statistics.
 */
@Data
@NoArgsConstructor
public class ClassificationDocument {

    private List<ClassificationPage> pages = new ArrayList<>();
    private List<ClassificationSection> sections = new ArrayList<>();
    private List<ClassificationHeader> headers = new ArrayList<>();
    private List<ClassificationFooter> footers = new ArrayList<>();
    private List<UnclassifiedText> unclassifiedTexts = new ArrayList<>();
    // Frequency statistics aggregated over the whole document.
    private FloatFrequencyCounter textHeightCounter = new FloatFrequencyCounter();
    private FloatFrequencyCounter fontSizeCounter = new FloatFrequencyCounter();
    private StringFrequencyCounter fontCounter = new StringFrequencyCounter();
    private StringFrequencyCounter fontStyleCounter = new StringFrequencyCounter();
    // presumably whether headline detection found any headlines — TODO confirm
    private boolean headlines;
}

View File

@ -0,0 +1,16 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.dto;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock;
import lombok.AllArgsConstructor;
import lombok.Data;
/**
 * The text blocks that form a (repeated) page footer.
 */
@Data
@AllArgsConstructor
public class ClassificationFooter {

    private List<ClassificationTextBlock> textBlocks;
}

View File

@ -0,0 +1,16 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.dto;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock;
import lombok.AllArgsConstructor;
import lombok.Data;
/**
 * The text blocks that form a (repeated) page header.
 */
@Data
@AllArgsConstructor
public class ClassificationHeader {

    private List<ClassificationTextBlock> textBlocks;
}

View File

@ -0,0 +1,38 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.dto;
import java.util.ArrayList;
import java.util.List;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.StringFrequencyCounter;
import lombok.Data;
import lombok.NonNull;
import lombok.RequiredArgsConstructor;
/**
 * Classification result for a single page: its text containers, images, layout frame and
 * per-page font statistics.
 */
@Data
@RequiredArgsConstructor
public class ClassificationPage {

    @NonNull
    private List<AbstractTextContainer> textBlocks;
    private List<ClassifiedImage> images = new ArrayList<>();
    // bounding frame of the body text — presumably excluding header/footer; TODO confirm
    private Rectangle bodyTextFrame;
    private boolean landscape;
    // page rotation in degrees
    private int rotation;
    private int pageNumber;
    // Frequency statistics for this page only (document-level aggregates live on ClassificationDocument).
    private FloatFrequencyCounter textHeightCounter = new FloatFrequencyCounter();
    private FloatFrequencyCounter fontSizeCounter = new FloatFrequencyCounter();
    private StringFrequencyCounter fontCounter = new StringFrequencyCounter();
    private StringFrequencyCounter fontStyleCounter = new StringFrequencyCounter();
    private float pageWidth;
    private float pageHeight;
}

View File

@ -0,0 +1,38 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.dto;
import java.util.ArrayList;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.Table;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * A logical section of the document: a headline plus the page blocks and images that
 * belong to it.
 */
@Data
@NoArgsConstructor
public class ClassificationSection implements Comparable {

    private List<AbstractTextContainer> pageBlocks = new ArrayList<>();
    private List<ClassifiedImage> images = new ArrayList<>();
    private String headline;

    /**
     * Collects all page blocks of this section that are tables, in document order.
     *
     * @return a new list containing the table blocks (possibly empty)
     */
    public List<Table> getTables() {
        List<Table> tables = new ArrayList<>();
        for (AbstractTextContainer candidate : pageBlocks) {
            if (candidate instanceof Table) {
                tables.add((Table) candidate);
            }
        }
        return tables;
    }

    @Override
    public int compareTo(Object o) {
        // NOTE(review): every section compares as equal; the intended ordering is unclear —
        // confirm before sorting collections of sections.
        return 0;
    }
}

View File

@ -0,0 +1,77 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.dto;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import lombok.Getter;
/**
 * Counts how often individual float values occur (used for text-height and font-size
 * statistics). Not thread-safe.
 */
public class FloatFrequencyCounter {

    Map<Float, Integer> countPerValue = new HashMap<>();

    /**
     * Returns the raw value-to-count map (a live view, not a copy).
     * Explicit getter with the same signature Lombok's {@code @Getter} generated before.
     */
    public Map<Float, Integer> getCountPerValue() {
        return countPerValue;
    }

    /** Records one occurrence of {@code value}. */
    public void add(float value) {
        countPerValue.merge(value, 1, Integer::sum);
    }

    /**
     * Merges another counter's counts into this one.
     *
     * @param otherCounter value-to-count map of the other counter (unchanged)
     */
    public void addAll(Map<Float, Integer> otherCounter) {
        otherCounter.forEach((value, count) -> countPerValue.merge(value, count, Integer::sum));
    }

    /**
     * Returns the value with the highest count, or {@code null} if nothing was counted.
     * On ties the entry iterated last wins (preserves the original {@code >=} comparison).
     */
    public Float getMostPopular() {
        Map.Entry<Float, Integer> mostPopular = null;
        for (Map.Entry<Float, Integer> entry : countPerValue.entrySet()) {
            if (mostPopular == null || entry.getValue() >= mostPopular.getValue()) {
                mostPopular = entry;
            }
        }
        return mostPopular != null ? mostPopular.getKey() : null;
    }

    /**
     * Returns all counted values strictly greater than the most popular value, sorted in
     * descending order; empty when nothing was counted.
     * (The typo in the method name is kept for interface compatibility.)
     */
    public List<Float> getHighterThanMostPopular() {
        Float mostPopular = getMostPopular();
        if (mostPopular == null) {
            return new ArrayList<>();
        }
        return countPerValue.keySet().stream()
                .filter(value -> value > mostPopular)
                .sorted(Collections.reverseOrder())
                .collect(Collectors.toList());
    }

    /** Returns the largest counted value, or {@code null} if nothing was counted. */
    public Float getHighest() {
        Float highest = null;
        for (Float value : countPerValue.keySet()) {
            if (highest == null || value > highest) {
                highest = value;
            }
        }
        return highest;
    }
}

View File

@ -0,0 +1,218 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.dto;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.util.Comparator;
import java.util.List;
/**
 * Float-based rectangle with top/left/bottom/right accessors, adapted from tabula-java.
 * Coordinate convention: y grows downward, so top == minY and bottom == maxY.
 * NOTE the primary constructor takes (top, left, width, height) — not (x, y, ...).
 */
@SuppressWarnings("all")
public class Rectangle extends Rectangle2D.Float {

    protected static final float VERTICAL_COMPARISON_THRESHOLD = 0.4f;

    /**
     * Ill-defined comparator, from when Rectangle was Comparable.
     * <p>
     * see https://github.com/tabulapdf/tabula-java/issues/116
     *
     * @deprecated with no replacement
     */
    @Deprecated
    public static final Comparator<Rectangle> ILL_DEFINED_ORDER = new Comparator<Rectangle>() {

        @Override
        public int compare(Rectangle o1, Rectangle o2) {
            if (o1.equals(o2)) {
                return 0;
            }
            // Rectangles on (roughly) the same line are ordered horizontally — reversed when
            // both report right-to-left dominance — otherwise by vertical position.
            if (o1.verticalOverlap(o2) > VERTICAL_COMPARISON_THRESHOLD) {
                return o1.isLtrDominant() == -1 && o2.isLtrDominant() == -1 ? -java.lang.Double.compare(o1.getX(), o2.getX()) : java.lang.Double.compare(o1.getX(), o2.getX());
            } else {
                return java.lang.Float.compare(o1.getBottom(), o2.getBottom());
            }
        }
    };

    public Rectangle() {
        super();
    }

    // NOTE(review): argument order is (top, left, ...) — callers must not pass (x, y, ...).
    public Rectangle(float top, float left, float width, float height) {
        super();
        this.setRect(left, top, width, height);
    }

    /**
     * @param rectangles
     * @return minimum bounding box that contains all the rectangles
     */
    public static Rectangle boundingBoxOf(List<? extends Rectangle> rectangles) {
        float minx = java.lang.Float.MAX_VALUE;
        float miny = java.lang.Float.MAX_VALUE;
        float maxx = java.lang.Float.MIN_VALUE;
        float maxy = java.lang.Float.MIN_VALUE;
        for (Rectangle r : rectangles) {
            minx = (float) Math.min(r.getMinX(), minx);
            miny = (float) Math.min(r.getMinY(), miny);
            maxx = (float) Math.max(r.getMaxX(), maxx);
            maxy = (float) Math.max(r.getMaxY(), maxy);
        }
        return new Rectangle(miny, minx, maxx - minx, maxy - miny);
    }

    // Delegates to the deprecated ILL_DEFINED_ORDER; kept for legacy callers.
    public int compareTo(Rectangle other) {
        return ILL_DEFINED_ORDER.compare(this, other);
    }

    // I'm bad at Java and need this for fancy sorting in
    // technology.tabula.TextChunk.
    // Returns -1 for right-to-left dominant text; this base class is always LTR-neutral (0).
    public int isLtrDominant() {
        return 0;
    }

    public float getArea() {
        return this.width * this.height;
    }

    /** Length of the vertical overlap between the two rectangles (0 when disjoint). */
    public float verticalOverlap(Rectangle other) {
        return Math.max(0, Math.min(this.getBottom(), other.getBottom()) - Math.max(this.getTop(), other.getTop()));
    }

    public boolean verticallyOverlaps(Rectangle other) {
        return verticalOverlap(other) > 0;
    }

    /** Length of the horizontal overlap between the two rectangles (0 when disjoint). */
    public float horizontalOverlap(Rectangle other) {
        return Math.max(0, Math.min(this.getRight(), other.getRight()) - Math.max(this.getLeft(), other.getLeft()));
    }

    public boolean horizontallyOverlaps(Rectangle other) {
        return horizontalOverlap(other) > 0;
    }

    /**
     * Vertical overlap divided by the smaller of the two heights. The four branches cover:
     * partial overlap from above, partial overlap from below, {@code other} contained in
     * {@code this}, and {@code this} contained in {@code other}. Returns 0 when disjoint.
     */
    public float verticalOverlapRatio(Rectangle other) {
        float rv = 0, delta = Math.min(this.getBottom() - this.getTop(), other.getBottom() - other.getTop());
        if (other.getTop() <= this.getTop() && this.getTop() <= other.getBottom() && other.getBottom() <= this.getBottom()) {
            rv = (other.getBottom() - this.getTop()) / delta;
        } else if (this.getTop() <= other.getTop() && other.getTop() <= this.getBottom() && this.getBottom() <= other.getBottom()) {
            rv = (this.getBottom() - other.getTop()) / delta;
        } else if (this.getTop() <= other.getTop() && other.getTop() <= other.getBottom() && other.getBottom() <= this.getBottom()) {
            rv = (other.getBottom() - other.getTop()) / delta;
        } else if (other.getTop() <= this.getTop() && this.getTop() <= this.getBottom() && this.getBottom() <= other.getBottom()) {
            rv = (this.getBottom() - this.getTop()) / delta;
        }
        return rv;
    }

    /** Intersection-over-union of the two rectangles, in [0, 1]. */
    public float overlapRatio(Rectangle other) {
        double intersectionWidth = Math.max(0, Math.min(this.getRight(), other.getRight()) - Math.max(this.getLeft(), other.getLeft()));
        double intersectionHeight = Math.max(0, Math.min(this.getBottom(), other.getBottom()) - Math.max(this.getTop(), other.getTop()));
        double intersectionArea = Math.max(0, intersectionWidth * intersectionHeight);
        double unionArea = this.getArea() + other.getArea() - intersectionArea;
        return (float) (intersectionArea / unionArea);
    }

    /** Grows this rectangle in place to the union with {@code other}; returns this. */
    public Rectangle merge(Rectangle other) {
        this.setRect(this.createUnion(other));
        return this;
    }

    public float getTop() {
        return (float) this.getMinY();
    }

    // Moves the top edge while keeping the bottom edge in place (height is adjusted).
    public void setTop(float top) {
        float deltaHeight = top - this.y;
        this.setRect(this.x, top, this.width, this.height - deltaHeight);
    }

    public float getRight() {
        return (float) this.getMaxX();
    }

    // Moves the right edge; the left edge stays fixed.
    public void setRight(float right) {
        this.setRect(this.x, this.y, right - this.x, this.height);
    }

    public float getLeft() {
        return (float) this.getMinX();
    }

    // Moves the left edge while keeping the right edge in place (width is adjusted).
    public void setLeft(float left) {
        float deltaWidth = left - this.x;
        this.setRect(left, this.y, this.width - deltaWidth, this.height);
    }

    public float getBottom() {
        return (float) this.getMaxY();
    }

    // Moves the bottom edge; the top edge stays fixed.
    public void setBottom(float bottom) {
        this.setRect(this.x, this.y, this.width, bottom - this.y);
    }

    /** Corner points in order: top-left, top-right, bottom-right, bottom-left. */
    public Point2D[] getPoints() {
        return new Point2D[]{new Point2D.Float(this.getLeft(), this.getTop()), new Point2D.Float(this.getRight(), this.getTop()), new Point2D.Float(this.getRight(),
                this.getBottom()), new Point2D.Float(this.getLeft(), this.getBottom())};
    }

    // Extends the superclass representation with the derived bottom/right values.
    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();
        String s = super.toString();
        sb.append(s.substring(0, s.length() - 1));
        sb.append(String.format(",bottom=%f,right=%f]", this.getBottom(), this.getRight()));
        return sb.toString();
    }
}

View File

@ -0,0 +1,25 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.dto.image;
import java.awt.geom.Rectangle2D;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.ImageType;
import lombok.Data;
import lombok.NonNull;
import lombok.RequiredArgsConstructor;
/**
 * An image found on a page together with its position and classified type.
 */
@Data
@RequiredArgsConstructor
public class ClassifiedImage {

    @NonNull
    private Rectangle2D position;
    @NonNull
    private ImageType imageType;
    private boolean isAppendedToSection;
    // NOTE(review): @NonNull on primitives performs no null-check, but it is load-bearing here —
    // it includes these fields in the @RequiredArgsConstructor parameter list. Do not remove casually.
    @NonNull
    private boolean hasTransparency;
    @NonNull
    private int page;
}

View File

@ -0,0 +1,17 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.dto.table;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.Ruling;
import lombok.Builder;
import lombok.Data;
/**
 * The cleaned rulings of a page, split by orientation.
 */
@Data
@Builder
public class CleanRulings {

    List<Ruling> horizontal;
    List<Ruling> vertical;
}

View File

@ -0,0 +1,21 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.dto.table;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.RequiredArgsConstructor;
/**
 * A CV-detected table cell used by the classification pipeline.
 * <p>
 * NOTE(review): near-duplicate of adapter.model.table.CvParsedTableCell — the adapter copies
 * field-by-field between the two; consider consolidating.
 */
@Data
@Builder
@AllArgsConstructor
@RequiredArgsConstructor
public class CvParsedTableCell {

    private float x0;
    private float y0;
    private float x1;
    private float y1;
    private float width;
    private float height;
}

View File

@ -0,0 +1,437 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.dto.table;
import java.awt.geom.Line2D;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.Formatter;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.CohenSutherlandClipping;
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.DoubleComparisons;
import lombok.extern.slf4j.Slf4j;
/**
 * A line segment specialised for table rulings, adapted from tabula-java.
 * Provides orientation helpers, clipping, expansion and a sweep-line intersection search.
 * Coordinate convention: y grows downward; for the top/bottom accessors to be meaningful,
 * p1 is presumably the upper/left endpoint — TODO confirm for all construction sites.
 */
@Slf4j
@SuppressWarnings("all")
public class Ruling extends Line2D.Float {

    // Amount (in points) by which rulings are expanded when testing perpendicular
    // intersections, so lines that almost touch still count as intersecting.
    private static int PERPENDICULAR_PIXEL_EXPAND_AMOUNT = 2;

    public Ruling(Point2D p1, Point2D p2) {
        super(p1, p2);
    }

    /** Clips every ruling that intersects {@code area} to that area; non-intersecting rulings are dropped. */
    public static List<Ruling> cropRulingsToArea(List<Ruling> rulings, Rectangle2D area) {
        ArrayList<Ruling> rv = new ArrayList<>();
        for (Ruling r : rulings) {
            if (r.intersects(area)) {
                rv.add(r.intersect(area));
            }
        }
        return rv;
    }

    // log(n) implementation of find_intersections
    // based on http://people.csail.mit.edu/indyk/6.838-old/handouts/lec2.pdf
    /**
     * Sweep-line computation of all intersection points between horizontal and vertical rulings.
     *
     * @param horizontals horizontal rulings
     * @param verticals   vertical rulings
     * @return each intersection point mapped to the (expanded) [horizontal, vertical] ruling pair,
     *         ordered top-to-bottom then left-to-right
     */
    public static Map<Point2D, Ruling[]> findIntersections(List<Ruling> horizontals, List<Ruling> verticals) {

        // Sweep event: a horizontal ruling starts (HLEFT) or ends (HRIGHT) at an x position,
        // or a vertical ruling is encountered (VERTICAL).
        class SortObject {

            protected SOType type;
            protected float position;
            protected Ruling ruling;

            public SortObject(SOType type, float position, Ruling ruling) {
                this.type = type;
                this.position = position;
                this.ruling = ruling;
            }
        }

        List<SortObject> sos = new ArrayList<>();

        // The set of horizontals currently crossed by the sweep line, ordered by y.
        TreeMap<Ruling, Boolean> tree = new TreeMap<>(new Comparator<Ruling>() {

            @Override
            public int compare(Ruling o1, Ruling o2) {
                return java.lang.Double.compare(o1.getTop(), o2.getTop());
            }
        });

        // Result map ordered top-to-bottom, then left-to-right.
        TreeMap<Point2D, Ruling[]> rv = new TreeMap<>(new Comparator<Point2D>() {

            @Override
            public int compare(Point2D o1, Point2D o2) {
                if (o1.getY() > o2.getY()) {
                    return 1;
                }
                if (o1.getY() < o2.getY()) {
                    return -1;
                }
                if (o1.getX() > o2.getX()) {
                    return 1;
                }
                if (o1.getX() < o2.getX()) {
                    return -1;
                }
                return 0;
            }
        });

        // Horizontal start/end events are widened by the expand amount so near-misses still intersect.
        for (Ruling h : horizontals) {
            sos.add(new SortObject(SOType.HLEFT, h.getLeft() - PERPENDICULAR_PIXEL_EXPAND_AMOUNT, h));
            sos.add(new SortObject(SOType.HRIGHT, h.getRight() + PERPENDICULAR_PIXEL_EXPAND_AMOUNT, h));
        }
        for (Ruling v : verticals) {
            sos.add(new SortObject(SOType.VERTICAL, v.getLeft(), v));
        }

        // At equal x: HLEFT events come before VERTICAL, and VERTICAL before HRIGHT, so a
        // vertical ruling touching a horizontal's endpoint still sees it as active.
        Collections.sort(sos, new Comparator<SortObject>() {

            @Override
            public int compare(SortObject a, SortObject b) {
                int rv;
                if (DoubleComparisons.feq(a.position, b.position)) {
                    if (a.type == SOType.VERTICAL && b.type == SOType.HLEFT) {
                        rv = 1;
                    } else if (a.type == SOType.VERTICAL && b.type == SOType.HRIGHT) {
                        rv = -1;
                    } else if (a.type == SOType.HLEFT && b.type == SOType.VERTICAL) {
                        rv = -1;
                    } else if (a.type == SOType.HRIGHT && b.type == SOType.VERTICAL) {
                        rv = 1;
                    } else {
                        rv = java.lang.Double.compare(a.position, b.position);
                    }
                } else {
                    return java.lang.Double.compare(a.position, b.position);
                }
                return rv;
            }
        });

        // Sweep left-to-right: keep horizontals active between their HLEFT and HRIGHT events,
        // and intersect each vertical with every active horizontal.
        for (SortObject so : sos) {
            switch (so.type) {
                case VERTICAL:
                    for (Map.Entry<Ruling, Boolean> h : tree.entrySet()) {
                        try {
                            Point2D i = h.getKey().intersectionPoint(so.ruling);
                            if (i == null) {
                                continue;
                            }
                            rv.put(i, new Ruling[]{h.getKey().expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT), so.ruling.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT)});
                        } catch (UnsupportedOperationException e) {
                            log.info("Some line are oblique, ignoring...");
                            continue;
                        }
                    }
                    break;
                case HRIGHT:
                    tree.remove(so.ruling);
                    break;
                case HLEFT:
                    tree.put(so.ruling, true);
                    break;
            }
        }
        return rv;
    }

    // Zero-length rulings count as neither vertical nor horizontal.
    public boolean vertical() {
        return this.length() > 0 && DoubleComparisons.feq(this.x1, this.x2); //diff < ORIENTATION_CHECK_THRESHOLD;
    }

    public boolean horizontal() {
        return this.length() > 0 && DoubleComparisons.feq(this.y1, this.y2); //diff < ORIENTATION_CHECK_THRESHOLD;
    }

    // attributes that make sense only for non-oblique lines
    // these are used to have a single collapse method (in page, currently)
    public boolean oblique() {
        return !(this.vertical() || this.horizontal());
    }

    /** Cross-axis coordinate: x for vertical rulings, y for horizontal ones. Throws for oblique lines. */
    public float getPosition() {
        if (this.oblique()) {
            throw new UnsupportedOperationException();
        }
        return this.vertical() ? this.getLeft() : this.getTop();
    }

    /** Along-axis start coordinate: top for vertical rulings, left for horizontal ones. */
    public float getStart() {
        if (this.oblique()) {
            throw new UnsupportedOperationException();
        }
        return this.vertical() ? this.getTop() : this.getLeft();
    }

    public void setStart(float v) {
        if (this.oblique()) {
            throw new UnsupportedOperationException();
        }
        if (this.vertical()) {
            this.setTop(v);
        } else {
            this.setLeft(v);
        }
    }

    /** Along-axis end coordinate: bottom for vertical rulings, right for horizontal ones. */
    public float getEnd() {
        if (this.oblique()) {
            throw new UnsupportedOperationException();
        }
        return this.vertical() ? this.getBottom() : this.getRight();
    }

    public void setEnd(float v) {
        if (this.oblique()) {
            throw new UnsupportedOperationException();
        }
        if (this.vertical()) {
            this.setBottom(v);
        } else {
            this.setRight(v);
        }
    }

    public void setStartEnd(float start, float end) {
        if (this.oblique()) {
            throw new UnsupportedOperationException();
        }
        if (this.vertical()) {
            this.setTop(start);
            this.setBottom(end);
        } else {
            this.setLeft(start);
            this.setRight(end);
        }
    }

    public boolean perpendicularTo(Ruling other) {
        return this.vertical() == other.horizontal();
    }

    /**
     * Whether the two rulings intersect after expansion: perpendicular rulings are widened by
     * the fixed perpendicular amount, parallel/colinear ones by the given amount.
     */
    public boolean nearlyIntersects(Ruling another, int colinearOrParallelExpandAmount) {
        if (this.intersectsLine(another)) {
            return true;
        }
        boolean rv = false;
        if (this.perpendicularTo(another)) {
            rv = this.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT).intersectsLine(another);
        } else {
            rv = this.expand(colinearOrParallelExpandAmount).intersectsLine(another.expand(colinearOrParallelExpandAmount));
        }
        return rv;
    }

    public double length() {
        return Math.sqrt(Math.pow(this.x1 - this.x2, 2) + Math.pow(this.y1 - this.y2, 2));
    }

    /** Returns this ruling clipped to {@code clip}, or this unchanged when clipping rejected the line. */
    public Ruling intersect(Rectangle2D clip) {
        Float clipee = (Float) this.clone();
        boolean clipped = new CohenSutherlandClipping(clip).clip(clipee);

        if (clipped) {
            return new Ruling(clipee.getP1(), clipee.getP2());
        } else {
            return this;
        }
    }

    /** Returns a copy lengthened by {@code amount} at both ends (no-op for oblique lines). */
    public Ruling expand(float amount) {
        Ruling r = (Ruling) this.clone();
        try {
            r.setStart(this.getStart() - amount);
            r.setEnd(this.getEnd() + amount);
        } catch (UnsupportedOperationException e) {
            log.warn("Could not expand ruling!");
        }
        return r;
    }

    /**
     * Intersection point of this ruling with an orthogonal one, computed on copies expanded by
     * the perpendicular amount; {@code null} when the lines do not cross or are not orthogonal.
     */
    public Point2D intersectionPoint(Ruling other) {
        Ruling this_l = this.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT);
        Ruling other_l = other.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT);
        Ruling horizontal, vertical;

        if (!this_l.intersectsLine(other_l)) {
            return null;
        }

        if (this_l.horizontal() && other_l.vertical()) {
            horizontal = this_l;
            vertical = other_l;
        } else if (this_l.vertical() && other_l.horizontal()) {
            vertical = this_l;
            horizontal = other_l;
        } else {
            log.warn("lines must be orthogonal, vertical and horizontal");
            return null;
        }
        return new Point2D.Float(vertical.getLeft(), horizontal.getTop());
    }

    // Equality by endpoints only.
    @Override
    public boolean equals(Object other) {
        if (this == other) {
            return true;
        }
        if (!(other instanceof Ruling)) {
            return false;
        }
        Ruling o = (Ruling) other;
        return this.getP1().equals(o.getP1()) && this.getP2().equals(o.getP2());
    }

    // NOTE(review): equals is overridden but hashCode falls back to the identity-based
    // superclass implementation — equal rulings may hash differently; confirm before using
    // rulings as hash keys.
    @Override
    public int hashCode() {
        return super.hashCode();
    }

    public float getTop() {
        return this.y1;
    }

    public void setTop(float v) {
        setLine(this.getLeft(), v, this.getRight(), this.getBottom());
    }

    public float getLeft() {
        return this.x1;
    }

    public void setLeft(float v) {
        setLine(v, this.getTop(), this.getRight(), this.getBottom());
    }

    public float getBottom() {
        return this.y2;
    }

    public void setBottom(float v) {
        setLine(this.getLeft(), this.getTop(), this.getRight(), v);
    }

    public float getRight() {
        return this.x2;
    }

    public void setRight(float v) {
        setLine(this.getLeft(), this.getTop(), v, this.getBottom());
    }

    public float getWidth() {
        return this.getRight() - this.getLeft();
    }

    public float getHeight() {
        return this.getBottom() - this.getTop();
    }

    /** Angle of the segment in degrees, normalised to [0, 360). */
    public double getAngle() {
        double angle = Math.toDegrees(Math.atan2(this.getP2().getY() - this.getP1().getY(), this.getP2().getX() - this.getP1().getX()));

        if (angle < 0) {
            angle += 360;
        }
        return angle;
    }

    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();
        Formatter formatter = new Formatter(sb);
        String rv = formatter.format("%s[minX=%f minY=%f maxX=%f maxY=%f]", this.getClass().toString(), this.x1, this.y1, this.x2, this.y2).toString();
        formatter.close();
        return rv;
    }

    // Sweep event kinds for findIntersections.
    private enum SOType {
        VERTICAL,
        HRIGHT,
        HLEFT
    }
}

View File

@ -0,0 +1,350 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.dto.table;
import java.awt.geom.Point2D;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.TreeMap;
import java.util.stream.Collectors;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.AbstractTextContainer;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.Rectangle;
import lombok.Getter;
import lombok.Setter;
import lombok.extern.slf4j.Slf4j;
@Slf4j
public class Table extends AbstractTextContainer {
private final TreeMap<TableCellPosition, TableCell> cells = new TreeMap<>();
private final int rotation;
@Getter
@Setter
private String headline;
private int unrotatedRowCount;
private int unrotatedColCount;
private int rowCount = -1;
private int colCount = -1;
private List<List<TableCell>> rows;
public Table(List<TableCell> cells, Rectangle area, int rotation) {
addCells(cells);
minX = area.getLeft();
minY = area.getBottom();
maxX = area.getRight();
maxY = area.getTop();
classification = "Table";
this.rotation = rotation;
}
public List<List<TableCell>> getRows() {
if (rows == null) {
rows = computeRows();
// Ignore rows that does not contain any cells and values.
List<List<TableCell>> rowsToRemove = new ArrayList<>();
for (List<TableCell> row : rows) {
if (row.size() == 1 && row.get(0).getTextBlocks().isEmpty()) {
rowsToRemove.add(row);
}
}
rows.removeAll(rowsToRemove);
computeHeaders();
}
return rows;
}
public int getRowCount() {
if (rowCount == -1) {
rowCount = getRows().size();
}
return rowCount;
}
public int getColCount() {
if (colCount == -1) {
colCount = getRows().stream().mapToInt(List::size).max().orElse(0);
}
return colCount;
}
/**
* Detect header cells (either first row or first column):
* Column is marked as header if cell text is bold and row cell text is not bold.
* Defaults to row.
*/
private void computeHeaders() {
if (rows == null) {
rows = computeRows();
}
// A bold cell is a header cell as long as every cell to the left/top is bold, too
// we move from left to right and top to bottom
for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) {
List<TableCell> rowCells = rows.get(rowIndex);
if (rowCells.size() == 1) {
continue;
}
for (int colIndex = 0; colIndex < rowCells.size(); colIndex++) {
TableCell cell = rowCells.get(colIndex);
List<TableCell> cellsToTheLeft = rowCells.subList(0, colIndex);
TableCell lastHeaderCell = null;
for (TableCell leftCell : cellsToTheLeft) {
if (leftCell.isHeaderCell()) {
lastHeaderCell = leftCell;
} else {
break;
}
}
if (lastHeaderCell != null) {
cell.getHeaderCells().add(lastHeaderCell);
}
List<TableCell> cellsToTheTop = new ArrayList<>();
for (int i = 0; i < rowIndex; i++) {
try {
cellsToTheTop.add(rows.get(i).get(colIndex));
} catch (IndexOutOfBoundsException e) {
log.debug("No cell {} in row {}, ignoring.", colIndex, rowIndex);
}
}
for (TableCell topCell : cellsToTheTop) {
if (topCell.isHeaderCell()) {
lastHeaderCell = topCell;
} else {
break;
}
}
if (lastHeaderCell != null) {
cell.getHeaderCells().add(lastHeaderCell);
}
if (!cell.getTextBlocks().isEmpty() && cell.getTextBlocks().get(0).getMostPopularWordStyle().equals("bold")) {
cell.setHeaderCell(true);
}
}
}
}
/**
 * Assembles the cell map into row lists, honouring the table rotation so the
 * returned rows read in natural (unrotated) order. Missing grid positions are
 * simply skipped.
 */
private List<List<TableCell>> computeRows() {
    List<List<TableCell>> result = new ArrayList<>();
    if (rotation == 90) {
        // 90 degrees: unrotated columns become rows; traverse source rows bottom-up.
        for (int col = 0; col < unrotatedColCount; col++) {
            List<TableCell> row = new ArrayList<>();
            for (int r = unrotatedRowCount - 1; r >= 0; r--) {
                TableCell found = cells.get(new TableCellPosition(r, col));
                if (found != null) {
                    row.add(found);
                }
            }
            result.add(row);
        }
    } else if (rotation == 270) {
        // 270 degrees: unrotated columns become rows in reverse order.
        for (int col = unrotatedColCount - 1; col >= 0; col--) {
            List<TableCell> row = new ArrayList<>();
            for (int r = 0; r < unrotatedRowCount; r++) {
                TableCell found = cells.get(new TableCellPosition(r, col));
                if (found != null) {
                    row.add(found);
                }
            }
            result.add(row);
        }
    } else {
        // No (handled) rotation: straight row-major traversal.
        for (int r = 0; r < unrotatedRowCount; r++) {
            List<TableCell> row = new ArrayList<>();
            for (int col = 0; col < unrotatedColCount; col++) {
                TableCell found = cells.get(new TableCellPosition(r, col));
                if (found != null) {
                    row.add(found);
                }
            }
            result.add(row);
        }
    }
    return result;
}
/** Stores {@code chunk} at (row, col) and grows the tracked grid dimensions if needed. */
private void add(TableCell chunk, int row, int col) {
    if (row >= unrotatedRowCount) {
        unrotatedRowCount = row + 1;
    }
    if (col >= unrotatedColCount) {
        unrotatedColCount = col + 1;
    }
    cells.put(new TableCellPosition(row, col), chunk);
}
/**
 * Drops degenerate cells (narrower/shorter than 1.1pt), derives the grid
 * structure and registers every structured cell by its grid position.
 * Note: removes the degenerate cells from the passed list (caller-visible).
 */
private void addCells(List<TableCell> cells) {
    if (cells.isEmpty()) {
        return;
    }
    cells.removeIf(cell -> cell.getWidth() < 1.1 || cell.getHeight() < 1.1);
    List<List<TableCell>> structuredRows = calculateStructure(cells);
    int rowIndex = 0;
    for (List<TableCell> row : structuredRows) {
        int colIndex = 0;
        for (TableCell cell : row) {
            add(cell, rowIndex, colIndex);
            colIndex++;
        }
        rowIndex++;
    }
}
/**
 * Calculates the structure of the table. For spanning rows and columns multiple cells with the same values will be inserted.
 *
 * @param cells The found cells
 * @return Table Structure
 */
private List<List<TableCell>> calculateStructure(List<TableCell> cells) {
    List<List<TableCell>> matrix = new ArrayList<>();
    if (cells.isEmpty()) {
        return matrix;
    }
    // Collect the distinct vertical/horizontal border coordinates of the cells.
    // NOTE(review): '&&' binds tighter than '||', so the filter keeps cells that
    // have text OR are larger than 3x3 — presumably intended to drop tiny empty
    // artifacts; confirm.
    Set<Float> uniqueX = new HashSet<>();
    Set<Float> uniqueY = new HashSet<>();
    cells.stream().filter(c -> !c.getTextBlocks().isEmpty() || c.getHeight() > 3 && c.getWidth() > 3).forEach(c -> {
        uniqueX.add(c.getLeft());
        uniqueX.add(c.getRight());
        uniqueY.add(c.getBottom());
        uniqueY.add(c.getTop());
    });
    var sortedUniqueX = uniqueX.stream().sorted().collect(Collectors.toList());
    var sortedUniqueY = uniqueY.stream().sorted().collect(Collectors.toList());
    // Walk the grid spanned by consecutive border coordinates; every interval
    // pair (prevX..x, prevY..y) becomes one synthetic cell of the matrix.
    Float prevY = null;
    for (Float y : sortedUniqueY) {
        List<TableCell> row = new ArrayList<>();
        Float prevX = null;
        for (Float x : sortedUniqueX) {
            if (prevY != null && prevX != null) {
                var cell = new TableCell(new Point2D.Float(prevX, prevY), new Point2D.Float(x, y));
                // Copy the text of the first original cell covering more than 10%
                // of this grid cell; spanning cells thus populate several grid cells.
                var intersectionCell = cells.stream().filter(c -> cell.intersects(c) && cell.overlapRatio(c) > 0.1f).findFirst();
                if (intersectionCell.isPresent()) {
                    cell.getTextBlocks().addAll(intersectionCell.get().getTextBlocks());
                }
                row.add(cell);
            }
            prevX = x;
        }
        // Only interior iterations (both previous coordinates set) produced a row.
        if (prevY != null && prevX != null) {
            matrix.add(row);
        }
        prevY = y;
    }
    // Rows were built in ascending-y order; reverse into reading order.
    // NOTE(review): assumes ascending y means bottom-up here — confirm against
    // the coordinate system used by TableCell.
    Collections.reverse(matrix);
    return matrix;
}
/**
 * Renders the table as CSV-like text: one line per row, cells separated by
 * commas, each cell value wrapped in double quotes. Multiple text blocks
 * inside one cell are separated by line breaks.
 */
@Override
public String getText() {
    StringBuilder sb = new StringBuilder();
    List<List<TableCell>> rows = getRows();
    int i = 0;
    for (List<TableCell> row : rows) {
        if (i != 0) {
            sb.append("\n");
        }
        if (!row.isEmpty()) {
            boolean firstColumn = true;
            for (TableCell column : row) {
                if (!firstColumn) {
                    sb.append(",");
                }
                if (column != null && column.getTextBlocks() != null) {
                    boolean first = true;
                    for (ClassificationTextBlock textBlock : column.getTextBlocks()) {
                        if (!first) {
                            sb.append("\n");
                        }
                        // Escape embedded quotes with a backslash. String.replace works on
                        // literals; the previous replaceAll("\"", "\\\"") was a no-op because
                        // "\\\"" in a regex *replacement* string collapses to a plain quote.
                        sb.append('\"').append(textBlock.getText().replace("\"", "\\\"")).append('\"');
                        first = false;
                    }
                }
                firstColumn = false;
            }
        }
        i++;
    }
    return sb.toString();
}
/**
 * Renders the table as a simple HTML table. The first row is emitted as header
 * cells ({@code <th>}), all other rows as data cells ({@code <td>}).
 */
public String getTextAsHtml() {
    StringBuilder sb = new StringBuilder();
    List<List<TableCell>> rows = getRows();
    sb.append("<table border=\"1\">");
    int i = 0;
    for (List<TableCell> row : rows) {
        sb.append("\n<tr>");
        if (!row.isEmpty()) {
            for (TableCell column : row) {
                sb.append(i == 0 ? "\n<th>" : "\n<td>");
                if (column != null && column.getTextBlocks() != null) {
                    boolean first = true;
                    for (ClassificationTextBlock textBlock : column.getTextBlocks()) {
                        if (!first) {
                            sb.append("<br />");
                        }
                        // Escape HTML metacharacters so cell text cannot break (or inject
                        // into) the markup, then turn line breaks into <br /> tags.
                        sb.append(escapeHtml(textBlock.getText()).replace("\n", "<br />"));
                        first = false;
                    }
                }
                sb.append(i == 0 ? "</th>" : "</td>");
            }
        }
        sb.append("</tr>");
        i++;
    }
    sb.append("</table>");
    return sb.toString();
}

/** Minimal HTML text escaping: ampersand first, then the angle brackets. */
private static String escapeHtml(String text) {
    return text.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;");
}
}

View File

@ -0,0 +1,38 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.dto.table;
import java.awt.geom.Point2D;
import java.util.ArrayList;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.Rectangle;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
/**
 * One cell of a detected table: a rectangular region plus the text blocks it
 * contains and the header cells it is associated with.
 */
@SuppressWarnings("serial")
@Data
@EqualsAndHashCode(callSuper = true)
@NoArgsConstructor
public class TableCell extends Rectangle {

    // Text content assigned to this cell, in reading order.
    private List<ClassificationTextBlock> textBlocks = new ArrayList<>();
    // Row/column header cells this cell falls under; populated during header detection.
    private List<TableCell> headerCells = new ArrayList<>();
    private boolean isHeaderCell;

    /**
     * Creates a cell spanning the two corner points.
     * NOTE(review): passes topLeft.getY() as the first Rectangle argument —
     * assumes Rectangle's constructor is (top, left, width, height); confirm.
     */
    public TableCell(Point2D topLeft, Point2D bottomRight) {
        super((float) topLeft.getY(), (float) topLeft.getX(), (float) (bottomRight.getX() - topLeft.getX()), (float) (bottomRight.getY() - topLeft.getY()));
    }

    public void addTextBlock(ClassificationTextBlock textBlock) {
        textBlocks.add(textBlock);
    }
}

View File

@ -0,0 +1,22 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.dto.table;
import lombok.RequiredArgsConstructor;
import lombok.Value;
/**
 * Immutable (row, col) grid coordinate of a table cell, ordered row-major.
 */
@Value
@RequiredArgsConstructor
public class TableCellPosition implements Comparable<TableCellPosition> {

    int row;
    int col;

    /**
     * Orders positions by row first, then by column.
     * Uses Integer.compare instead of subtraction: {@code row - other.row}
     * can overflow for extreme values and invert the comparison result.
     */
    @Override
    public int compareTo(TableCellPosition other) {
        int byRow = Integer.compare(row, other.row);
        return byRow != 0 ? byRow : Integer.compare(col, other.col);
    }
}

View File

@ -0,0 +1,286 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.dto.text;
import java.util.ArrayList;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.AbstractTextContainer;
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.TextNormalizationUtilities;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
/**
 * A block of text made up of {@link TextPositionSequence} words, together with
 * aggregated font statistics and an optional layout classification. The
 * inherited min/max bounds are kept in text-direction-adjusted coordinates;
 * the getPdf* accessors convert them into the PDF coordinate system.
 */
@EqualsAndHashCode(callSuper = true)
@AllArgsConstructor
@Builder
@Data
@NoArgsConstructor
public class ClassificationTextBlock extends AbstractTextContainer {

    @Builder.Default
    private List<TextPositionSequence> sequences = new ArrayList<>();
    private int rotation;
    private int indexOnPage;
    private String mostPopularWordFont;
    private String mostPopularWordStyle;
    private float mostPopularWordFontSize;
    private float mostPopularWordHeight;
    private float mostPopularWordSpaceWidth;
    private float highestFontSize;
    private String classification;

    // Direction of the first word; assumes all words in the block share one
    // direction — TODO confirm.
    public TextDirection getDir() {
        return sequences.get(0).getDir();
    }

    private float getPageHeight() {
        return sequences.get(0).getPageHeight();
    }

    private float getPageWidth() {
        return sequences.get(0).getPageWidth();
    }

    /**
     * Returns the minX value in pdf coordinate system.
     * Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation.
     * 0 -> LowerLeft
     * 90 -> UpperLeft
     * 180 -> UpperRight
     * 270 -> LowerRight
     *
     * @return the minX value in pdf coordinate system
     */
    public float getPdfMinX() {
        if (getDir().getDegrees() == 90) {
            return minY;
        } else if (getDir().getDegrees() == 180) {
            return getPageWidth() - maxX;
        } else if (getDir().getDegrees() == 270) {
            return getPageWidth() - maxY;
        } else {
            return minX;
        }
    }

    /**
     * Returns the maxX value in pdf coordinate system.
     * Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation.
     * 0 -> LowerLeft
     * 90 -> UpperLeft
     * 180 -> UpperRight
     * 270 -> LowerRight
     *
     * @return the maxX value in pdf coordinate system
     */
    public float getPdfMaxX() {
        if (getDir().getDegrees() == 90) {
            return maxY;
        } else if (getDir().getDegrees() == 180) {
            return getPageWidth() - minX;
        } else if (getDir().getDegrees() == 270) {
            return getPageWidth() - minY;
        } else {
            return maxX;
        }
    }

    /**
     * Returns the minY value in pdf coordinate system.
     * Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation.
     * 0 -> LowerLeft
     * 90 -> UpperLeft
     * 180 -> UpperRight
     * 270 -> LowerRight
     *
     * @return the minY value in pdf coordinate system
     */
    public float getPdfMinY() {
        if (getDir().getDegrees() == 90) {
            return minX;
        } else if (getDir().getDegrees() == 180) {
            return maxY;
        } else if (getDir().getDegrees() == 270) {
            return getPageHeight() - maxX;
        } else {
            return getPageHeight() - maxY;
        }
    }

    /**
     * Returns the maxY value in pdf coordinate system.
     * Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation.
     * 0 -> LowerLeft
     * 90 -> UpperLeft
     * 180 -> UpperRight
     * 270 -> LowerRight
     *
     * @return the maxY value in pdf coordinate system
     */
    public float getPdfMaxY() {
        if (getDir().getDegrees() == 90) {
            return maxX;
        } else if (getDir().getDegrees() == 180) {
            return minY;
        } else if (getDir().getDegrees() == 270) {
            return getPageHeight() - minX;
        } else {
            return getPageHeight() - minY;
        }
    }

    /** Full constructor; bounds are taken as text-direction-adjusted coordinates. */
    public ClassificationTextBlock(float minX, float maxX, float minY, float maxY, List<TextPositionSequence> sequences, int rotation, int indexOnPage) {
        super();
        this.indexOnPage = indexOnPage;
        super.minX = minX;
        super.maxX = maxX;
        super.minY = minY;
        super.maxY = maxY;
        this.sequences = sequences;
        this.rotation = rotation;
    }

    /** @return a new block whose bounds also cover the given word; this block is unchanged */
    public ClassificationTextBlock union(TextPositionSequence r) {
        ClassificationTextBlock union = this.copy();
        union.add(r);
        return union;
    }

    /** @return a new block covering both blocks; this block is unchanged */
    public ClassificationTextBlock union(ClassificationTextBlock r) {
        ClassificationTextBlock union = this.copy();
        union.add(r);
        return union;
    }

    /** Grows this block's bounds to cover {@code r} and takes over its sequences. */
    public void add(ClassificationTextBlock r) {
        if (r.getMinX() < minX) {
            minX = r.getMinX();
        }
        if (r.getMaxX() > maxX) {
            maxX = r.getMaxX();
        }
        if (r.getMinY() < minY) {
            minY = r.getMinY();
        }
        if (r.getMaxY() > maxY) {
            maxY = r.getMaxY();
        }
        sequences.addAll(r.getSequences());
    }

    /**
     * Grows this block's bounds to cover the given word.
     * NOTE(review): only the bounding box grows — the word itself is NOT
     * appended to {@code sequences}; confirm this is intended.
     */
    public void add(TextPositionSequence r) {
        if (r.getMinXDirAdj() < minX) {
            minX = r.getMinXDirAdj();
        }
        if (r.getMaxXDirAdj() > maxX) {
            maxX = r.getMaxXDirAdj();
        }
        if (r.getMinYDirAdj() < minY) {
            minY = r.getMinYDirAdj();
        }
        if (r.getMaxYDirAdj() > maxY) {
            maxY = r.getMaxYDirAdj();
        }
    }

    /**
     * Creates a copy of this block. The sequence list is copied defensively:
     * the previous implementation handed out the live list, so mutating the
     * copy (e.g. {@code union(...).add(...)} calling {@code sequences.addAll})
     * silently grew this block's sequence list as well.
     */
    public ClassificationTextBlock copy() {
        return new ClassificationTextBlock(minX, maxX, minY, maxY, new ArrayList<>(sequences), rotation, indexOnPage);
    }

    /** Repositions the bounds to the rectangle (x1, y1, x1 + width, y1 + height). */
    public void resize(float x1, float y1, float width, float height) {
        set(x1, y1, x1 + width, y1 + height);
    }

    /** Sets the bounds from two corner points given in any order. */
    public void set(float x1, float y1, float x2, float y2) {
        this.minX = Math.min(x1, x2);
        this.maxX = Math.max(x1, x2);
        this.minY = Math.min(y1, y2);
        this.maxY = Math.max(y1, y2);
    }

    @Override
    public String toString() {
        StringBuilder builder = new StringBuilder();
        for (int i = 0; i < sequences.size(); i++) {
            String sequenceAsString = sequences.get(i).toString();
            // Fix for missing Whitespace. This is recognized in getSequences method. See PDFTextStripper Line 1730.
            if (i != 0 && sequences.get(i - 1).charAt(sequences.get(i - 1).length() - 1) != ' ' && sequenceAsString.charAt(0) != ' ') {
                builder.append(' ');
            }
            builder.append(sequenceAsString);
        }
        return builder.toString();
    }

    /**
     * Joins the words into display text: a newline is inserted where the
     * vertical gap to the previous word exceeds one text height, a space
     * otherwise; hyphenated line breaks are removed afterwards.
     */
    @Override
    public String getText() {
        StringBuilder sb = new StringBuilder();
        TextPositionSequence previous = null;
        for (TextPositionSequence word : sequences) {
            if (previous != null) {
                if (Math.abs(previous.getMaxYDirAdj() - word.getMaxYDirAdj()) > word.getTextHeight()) {
                    sb.append('\n');
                } else {
                    sb.append(' ');
                }
            }
            sb.append(word.toString());
            previous = word;
        }
        return TextNormalizationUtilities.removeHyphenLineBreaks(sb.toString());
    }
}

View File

@ -0,0 +1,106 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.dto.text;
import org.apache.pdfbox.text.TextPosition;
import org.springframework.beans.BeanUtils;
import com.dslplatform.json.CompiledJson;
import com.dslplatform.json.JsonAttribute;
import com.fasterxml.jackson.annotation.JsonIgnore;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.SneakyThrows;
/**
 * Serializable snapshot of a PDFBox {@link TextPosition} (one glyph), keeping
 * only the data needed later; glyph geometry is packed into a float array.
 */
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
@CompiledJson
public class RedTextPosition {

    private String textMatrix;
    // Packed geometry: [0]=xDirAdj, [1]=yDirAdj, [2]=widthDirAdj, [3]=heightDir
    // (see fromTextPosition() and the accessors below).
    private float[] position;
    @JsonIgnore
    private int rotation;
    @JsonIgnore
    private float pageHeight;
    @JsonIgnore
    private float pageWidth;
    private String unicode;
    @JsonIgnore
    private float dir;
    // not used in reanalysis
    @JsonIgnore
    @JsonAttribute(ignore = true)
    private float widthOfSpace;
    // not used in reanalysis
    @JsonIgnore
    @JsonAttribute(ignore = true)
    private float fontSizeInPt;
    // not used in reanalysis
    @JsonIgnore
    @JsonAttribute(ignore = true)
    private String fontName;

    /**
     * Copies all matching bean properties from the PDFBox text position, then
     * fills the font data and the packed position array explicitly.
     */
    @SneakyThrows
    public static RedTextPosition fromTextPosition(TextPosition textPosition) {
        var pos = new RedTextPosition();
        BeanUtils.copyProperties(textPosition, pos);
        pos.setFontName(textPosition.getFont().getName());
        pos.setFontSizeInPt(textPosition.getFontSizeInPt());
        pos.setTextMatrix(textPosition.getTextMatrix().toString());
        var position = new float[4];
        position[0] = textPosition.getXDirAdj();
        position[1] = textPosition.getYDirAdj();
        position[2] = textPosition.getWidthDirAdj();
        position[3] = textPosition.getHeightDir();
        pos.setPosition(position);
        return pos;
    }

    /** X in text-direction-adjusted coordinates (position[0]). */
    @JsonIgnore
    public float getXDirAdj() {
        return position[0];
    }

    /** Y in text-direction-adjusted coordinates (position[1]). */
    @JsonIgnore
    public float getYDirAdj() {
        return position[1];
    }

    /** Glyph width in text-direction-adjusted coordinates (position[2]). */
    @JsonIgnore
    public float getWidthDirAdj() {
        return position[2];
    }

    /** Glyph height along the text direction (position[3]). */
    @JsonIgnore
    public float getHeightDir() {
        return position[3];
    }
}

View File

@ -0,0 +1,47 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.dto.text;
import java.util.HashMap;
import java.util.Map;
import lombok.Getter;
/**
 * Counts occurrences of string values and reports the most frequent one.
 * Not thread-safe.
 */
public class StringFrequencyCounter {

    private final Map<String, Integer> countPerValue = new HashMap<>();

    /**
     * @return the live count map (same accessor lombok's {@code @Getter}
     *         generated previously; mutations by callers affect this counter)
     */
    public Map<String, Integer> getCountPerValue() {
        return countPerValue;
    }

    /** Increments the count for {@code value} (Map.merge replaces the containsKey/put dance). */
    public void add(String value) {
        countPerValue.merge(value, 1, Integer::sum);
    }

    /** Adds all counts of another counter's map into this one. */
    public void addAll(Map<String, Integer> otherCounter) {
        for (Map.Entry<String, Integer> entry : otherCounter.entrySet()) {
            countPerValue.merge(entry.getKey(), entry.getValue(), Integer::sum);
        }
    }

    /**
     * @return the value with the highest count, or {@code null} if nothing was
     *         counted; ties are broken by map iteration order (first wins)
     */
    public String getMostPopular() {
        Map.Entry<String, Integer> mostPopular = null;
        for (Map.Entry<String, Integer> entry : countPerValue.entrySet()) {
            if (mostPopular == null || entry.getValue() > mostPopular.getValue()) {
                mostPopular = entry;
            }
        }
        return mostPopular != null ? mostPopular.getKey() : null;
    }
}

View File

@ -0,0 +1,8 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.dto.text;
/** Horizontal orientation flag of a text block: none, left or right. */
public enum TextBlockOrientation {
    NONE,
    LEFT,
    RIGHT
}

View File

@ -0,0 +1,54 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.dto.text;
import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonValue;
import lombok.Getter;
/**
 * Axis-aligned rotation of a run of text (0, 90, 180 or 270 degrees).
 * Serialized as its degree value by both Jackson and DSL-JSON.
 */
@Getter
public enum TextDirection {

    ZERO(0f),
    QUARTER_CIRCLE(90f),
    HALF_CIRCLE(180f),
    THREE_QUARTER_CIRCLE(270f);

    // Degree-sign suffix used by toString().
    public static final String VALUE_STRING_SUFFIX = "°";

    @JsonValue
    private final float degrees;
    // Same angle in radians, precomputed once per constant.
    private final float radians;

    TextDirection(float degreeValue) {
        degrees = degreeValue;
        radians = (float) Math.toRadians(degreeValue);
    }

    @Override
    public String toString() {
        return degrees + VALUE_STRING_SUFFIX;
    }

    // DSL-JSON serialization hook, mirroring the Jackson @JsonValue on 'degrees'.
    @com.dslplatform.json.JsonValue
    public float jsonValue() {
        return getDegrees();
    }

    /**
     * Resolves a direction from its exact degree value.
     *
     * @throws IllegalArgumentException if the value is not 0, 90, 180 or 270
     */
    @JsonCreator(mode = JsonCreator.Mode.DELEGATING)
    public static TextDirection fromDegrees(float degrees) {
        for (var dir : TextDirection.values()) {
            if (degrees == dir.degrees) {
                return dir;
            }
        }
        throw new IllegalArgumentException(String.format("A value of %f is not supported by TextDirection", degrees));
    }
}

View File

@ -0,0 +1,298 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.dto.text;
import java.awt.geom.AffineTransform;
import java.awt.geom.Point2D;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.pdfbox.text.TextPosition;
import com.dslplatform.json.JsonAttribute;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
/**
 * A sequence of glyph positions forming one word, exposed as a CharSequence.
 * Coordinates from the *DirAdj accessors are text-direction-adjusted (0,0
 * upper left); {@link #getRectangle()} converts into the PDF coordinate system.
 */
@Slf4j
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class TextPositionSequence implements CharSequence {

    // Padding (in points) applied around glyph extents when deriving bounds.
    public static final int HEIGHT_PADDING = 2;

    private int page;
    private List<RedTextPosition> textPositions = new ArrayList<>();
    private TextDirection dir;
    private int rotation;
    private float pageHeight;
    private float pageWidth;

    /**
     * Builds a sequence from raw PDFBox positions; direction, rotation and
     * page size are taken from the first position.
     */
    public TextPositionSequence(List<TextPosition> textPositions, int page) {
        this.textPositions = textPositions.stream().map(RedTextPosition::fromTextPosition).collect(Collectors.toList());
        this.page = page;
        this.dir = TextDirection.fromDegrees(textPositions.get(0).getDir());
        this.rotation = textPositions.get(0).getRotation();
        this.pageHeight = textPositions.get(0).getPageHeight();
        this.pageWidth = textPositions.get(0).getPageWidth();
    }

    // Length in glyph positions (assumes one char per position).
    @Override
    public int length() {
        return textPositions.size();
    }

    // NOTE(review): only the first UTF-16 unit of the glyph's unicode string
    // is returned; multi-char mappings (ligatures, surrogate pairs) would be
    // truncated — confirm this cannot occur upstream.
    @Override
    public char charAt(int index) {
        RedTextPosition textPosition = textPositionAt(index);
        String text = textPosition.getUnicode();
        return text.charAt(0);
    }

    // NOTE(review): the sub-sequence shares the underlying subList view, so
    // structural changes write through to this sequence.
    @Override
    public TextPositionSequence subSequence(int start, int end) {
        var textPositionSequence = new TextPositionSequence();
        textPositionSequence.textPositions = textPositions.subList(start, end);
        textPositionSequence.page = page;
        textPositionSequence.dir = dir;
        textPositionSequence.rotation = rotation;
        textPositionSequence.pageHeight = pageHeight;
        textPositionSequence.pageWidth = pageWidth;
        return textPositionSequence;
    }

    @Override
    public String toString() {
        StringBuilder builder = new StringBuilder(length());
        for (int i = 0; i < length(); i++) {
            builder.append(charAt(i));
        }
        return builder.toString();
    }

    /** @return the glyph position at the given index */
    public RedTextPosition textPositionAt(int index) {
        return textPositions.get(index);
    }

    /** Appends a glyph and copies page/direction metadata from the given sequence. */
    public void add(TextPositionSequence textPositionSequence, RedTextPosition textPosition) {
        this.textPositions.add(textPosition);
        this.page = textPositionSequence.getPage();
        this.dir = textPositionSequence.getDir();
        this.rotation = textPositionSequence.getRotation();
        this.pageHeight = textPositionSequence.getPageHeight();
        this.pageWidth = textPositionSequence.getPageWidth();
    }

    /** Appends a PDFBox glyph and (re)derives metadata from the first position. */
    public void add(TextPosition textPosition) {
        this.textPositions.add(RedTextPosition.fromTextPosition(textPosition));
        this.dir = TextDirection.fromDegrees(textPositions.get(0).getDir());
        this.rotation = textPositions.get(0).getRotation();
        this.pageHeight = textPositions.get(0).getPageHeight();
        this.pageWidth = textPositions.get(0).getPageWidth();
    }

    /**
     * This value is adjusted so that 0,0 is upper left and it is adjusted based on the text direction.
     * This method ignores the page rotation but takes the text rotation and adjusts the coordinates to awt.
     *
     * @return the text direction adjusted minX value
     */
    @JsonIgnore
    @JsonAttribute(ignore = true)
    public float getMinXDirAdj() {
        return textPositions.get(0).getXDirAdj();
    }

    /**
     * This value is adjusted so that 0,0 is upper left and it is adjusted based on the text direction.
     * This method ignores the page rotation but takes the text rotation and adjusts the coordinates to awt.
     *
     * @return the text direction adjusted maxX value
     */
    @JsonIgnore
    @JsonAttribute(ignore = true)
    public float getMaxXDirAdj() {
        // Right edge of the last glyph, widened by the shared padding constant.
        return textPositions.get(textPositions.size() - 1).getXDirAdj() + textPositions.get(textPositions.size() - 1).getWidthDirAdj() + HEIGHT_PADDING;
    }

    /**
     * This value is adjusted so that 0,0 is upper left and it is adjusted based on the text direction.
     * This method ignores the page rotation but takes the text rotation and adjusts the coordinates to awt.
     *
     * @return the text direction adjusted minY value. The upper border of the bounding box of the word.
     */
    @JsonIgnore
    @JsonAttribute(ignore = true)
    public float getMinYDirAdj() {
        return textPositions.get(0).getYDirAdj() - getTextHeight();
    }

    /**
     * This value is adjusted so that 0,0 is upper left and it is adjusted based on the text direction.
     * This method ignores the page rotation but takes the text rotation and adjusts the coordinates to awt.
     *
     * @return the text direction adjusted maxY value. The lower border of the bounding box of the word.
     */
    @JsonIgnore
    @JsonAttribute(ignore = true)
    public float getMaxYDirAdj() {
        return textPositions.get(0).getYDirAdj();
    }

    /** Height of the first glyph plus padding; used as the word's text height. */
    @JsonIgnore
    @JsonAttribute(ignore = true)
    public float getTextHeight() {
        return textPositions.get(0).getHeightDir() + HEIGHT_PADDING;
    }

    @JsonIgnore
    @JsonAttribute(ignore = true)
    public float getHeight() {
        return getMaxYDirAdj() - getMinYDirAdj();
    }

    @JsonIgnore
    @JsonAttribute(ignore = true)
    public float getWidth() {
        return getMaxXDirAdj() - getMinXDirAdj();
    }

    /** Font name of the first glyph, lower-cased with ",bold"/",italic" suffixes stripped. */
    @JsonIgnore
    @JsonAttribute(ignore = true)
    public String getFont() {
        return textPositions.get(0).getFontName().toLowerCase().replaceAll(",bold", "").replaceAll(",italic", "");
    }

    /** Style derived from the first glyph's font name: "bold", "italic", both, or "standard". */
    @JsonIgnore
    @JsonAttribute(ignore = true)
    public String getFontStyle() {
        String lowercaseFontName = textPositions.get(0).getFontName().toLowerCase();
        if (lowercaseFontName.contains("bold") && lowercaseFontName.contains("italic")) {
            return "bold, italic";
        } else if (lowercaseFontName.contains("bold")) {
            return "bold";
        } else if (lowercaseFontName.contains("italic")) {
            return "italic";
        } else {
            return "standard";
        }
    }

    @JsonIgnore
    @JsonAttribute(ignore = true)
    public float getFontSize() {
        return textPositions.get(0).getFontSizeInPt();
    }

    @JsonIgnore
    @JsonAttribute(ignore = true)
    public float getSpaceWidth() {
        return textPositions.get(0).getWidthOfSpace();
    }

    /**
     * This returns the bounding box of the word in Pdf Coordinate System where {0,0} rotated with the page rotation.
     * 0 -> LowerLeft
     * 90 -> UpperLeft
     * 180 -> UpperRight
     * 270 -> LowerRight
     *
     * @return bounding box of the word in Pdf Coordinate System
     */
    @JsonIgnore
    @JsonAttribute(ignore = true)
    @SneakyThrows
    public Rectangle getRectangle() {
        log.debug("ClassificationPage: '{}', Word: '{}', Rotation: '{}', textRotation {}", page, this, rotation, dir);
        float textHeight = getTextHeight();
        RedTextPosition firstTextPos = textPositions.get(0);
        RedTextPosition lastTextPos = textPositions.get(textPositions.size() - 1);
        // Corners in text-direction-adjusted space, padded vertically.
        Point2D bottomLeft = new Point2D.Double(firstTextPos.getXDirAdj(), firstTextPos.getYDirAdj() - HEIGHT_PADDING);
        Point2D topRight = new Point2D.Double(lastTextPos.getXDirAdj() + lastTextPos.getWidthDirAdj(), lastTextPos.getYDirAdj() + textHeight + HEIGHT_PADDING);
        // Rotate back by the text direction, then flip the y axis into PDF
        // orientation. The rotation pivot/translation differ per direction
        // because rotating swaps the page's width and height.
        // NOTE(review): the QUARTER/THREE_QUARTER pivot choices look asymmetric
        // (pageWidth/2 vs pageHeight/2) — confirm against rotated-page samples.
        AffineTransform transform = new AffineTransform();
        if (dir == TextDirection.ZERO || dir == TextDirection.HALF_CIRCLE) {
            transform.rotate(dir.getRadians(), pageWidth / 2f, pageHeight / 2f);
            transform.translate(0f, pageHeight + textHeight);
            transform.scale(1., -1.);
        } else if (dir == TextDirection.QUARTER_CIRCLE) {
            transform.rotate(dir.getRadians(), pageWidth / 2f, pageWidth / 2f);
            transform.translate(0f, pageWidth + textHeight);
            transform.scale(1., -1.);
        } else {
            transform.rotate(dir.getRadians(), pageHeight / 2f, pageHeight / 2f);
            transform.translate(0f, pageWidth + textHeight);
            transform.scale(1., -1.);
        }
        bottomLeft = transform.transform(bottomLeft, null);
        topRight = transform.transform(topRight, null);
        return new Rectangle( //
                new Point((float) bottomLeft.getX(), (float) bottomLeft.getY()),
                (float) (topRight.getX() - bottomLeft.getX()),
                (float) (topRight.getY() - bottomLeft.getY()),
                page);
    }
}

View File

@ -0,0 +1,14 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.dto.text;
import java.util.List;
import lombok.AllArgsConstructor;
import lombok.Data;
/**
 * Wrapper around a list of text blocks; per its name these are presumably
 * blocks without a layout classification yet — confirm with callers.
 */
@Data
@AllArgsConstructor
public class UnclassifiedText {
    private List<ClassificationTextBlock> textBlocks;
}

View File

@ -0,0 +1,384 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.knecon.fforesight.service.layoutparser.processor.classification.parsing;
import java.io.IOException;
import java.io.InputStream;
import java.util.Map;
import java.util.WeakHashMap;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.fontbox.ttf.TrueTypeFont;
import org.apache.fontbox.util.BoundingBox;
import org.apache.pdfbox.contentstream.PDFStreamEngine;
import org.apache.pdfbox.contentstream.operator.DrawObject;
import org.apache.pdfbox.contentstream.operator.state.Concatenate;
import org.apache.pdfbox.contentstream.operator.state.Restore;
import org.apache.pdfbox.contentstream.operator.state.Save;
import org.apache.pdfbox.contentstream.operator.state.SetGraphicsStateParameters;
import org.apache.pdfbox.contentstream.operator.state.SetMatrix;
import org.apache.pdfbox.contentstream.operator.text.BeginText;
import org.apache.pdfbox.contentstream.operator.text.EndText;
import org.apache.pdfbox.contentstream.operator.text.MoveText;
import org.apache.pdfbox.contentstream.operator.text.MoveTextSetLeading;
import org.apache.pdfbox.contentstream.operator.text.NextLine;
import org.apache.pdfbox.contentstream.operator.text.SetCharSpacing;
import org.apache.pdfbox.contentstream.operator.text.SetFontAndSize;
import org.apache.pdfbox.contentstream.operator.text.SetTextHorizontalScaling;
import org.apache.pdfbox.contentstream.operator.text.SetTextLeading;
import org.apache.pdfbox.contentstream.operator.text.SetTextRenderingMode;
import org.apache.pdfbox.contentstream.operator.text.SetTextRise;
import org.apache.pdfbox.contentstream.operator.text.SetWordSpacing;
import org.apache.pdfbox.contentstream.operator.text.ShowText;
import org.apache.pdfbox.contentstream.operator.text.ShowTextAdjusted;
import org.apache.pdfbox.contentstream.operator.text.ShowTextLine;
import org.apache.pdfbox.contentstream.operator.text.ShowTextLineAndSpace;
import org.apache.pdfbox.cos.COSDictionary;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.font.PDCIDFont;
import org.apache.pdfbox.pdmodel.font.PDCIDFontType2;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.font.PDFontDescriptor;
import org.apache.pdfbox.pdmodel.font.PDSimpleFont;
import org.apache.pdfbox.pdmodel.font.PDTrueTypeFont;
import org.apache.pdfbox.pdmodel.font.PDType0Font;
import org.apache.pdfbox.pdmodel.font.PDType3Font;
import org.apache.pdfbox.pdmodel.font.encoding.GlyphList;
import org.apache.pdfbox.pdmodel.graphics.state.PDGraphicsState;
import org.apache.pdfbox.text.TextPosition;
import org.apache.pdfbox.util.Matrix;
import org.apache.pdfbox.util.Vector;
/**
* LEGACY text calculations which are known to be incorrect but are depended on by PDFTextStripper.
* <p>
* This class exists only so that we don't break the code of users who have their own subclasses of
* PDFTextStripper. It replaces the mostly empty implementation of showGlyph() in PDFStreamEngine
* with a heuristic implementation which is backwards compatible.
* <p>
* DO NOT USE THIS CODE UNLESS YOU ARE WORKING WITH PDFTextStripper.
* THIS CODE IS DELIBERATELY INCORRECT, USE PDFStreamEngine INSTEAD.
*/
@SuppressWarnings({"PMD", "checkstyle:all"})
class LegacyPDFStreamEngine extends PDFStreamEngine {
private static final Log LOG = LogFactory.getLog(LegacyPDFStreamEngine.class);
private int pageRotation;
private PDRectangle pageSize;
private Matrix translateMatrix;
private final GlyphList glyphList;
private final Map<COSDictionary, Float> fontHeightMap = new WeakHashMap<COSDictionary, Float>();
/**
 * Constructor.
 * Registers the text-related content-stream operators this engine handles and
 * loads the additional Adobe glyph list used for Unicode mapping.
 */
LegacyPDFStreamEngine() throws IOException {
    addOperator(new BeginText());
    addOperator(new Concatenate());
    addOperator(new DrawObject()); // special text version
    addOperator(new EndText());
    addOperator(new SetGraphicsStateParameters());
    addOperator(new Save());
    addOperator(new Restore());
    addOperator(new NextLine());
    addOperator(new SetCharSpacing());
    addOperator(new MoveText());
    addOperator(new MoveTextSetLeading());
    addOperator(new SetFontAndSize());
    addOperator(new ShowText());
    addOperator(new ShowTextAdjusted());
    addOperator(new SetTextLeading());
    addOperator(new SetMatrix());
    addOperator(new SetTextRenderingMode());
    addOperator(new SetTextRise());
    addOperator(new SetWordSpacing());
    addOperator(new SetTextHorizontalScaling());
    addOperator(new ShowTextLine());
    addOperator(new ShowTextLineAndSpace());
    // load additional glyph list for Unicode mapping
    String path = "/org/apache/pdfbox/resources/glyphlist/additional.txt";
    // NOTE(review): the stream is handed to GlyphList and not closed here —
    // confirm GlyphList takes ownership.
    InputStream input = GlyphList.class.getResourceAsStream(path);
    glyphList = new GlyphList(GlyphList.getAdobeGlyphList(), input);
}
/**
 * This will initialize and process the contents of the stream.
 * Caches the page rotation and crop box for the glyph calculations.
 *
 * @param page the page to process
 * @throws IOException if there is an error accessing the stream.
 */
@Override
public void processPage(PDPage page) throws IOException {
    this.pageRotation = page.getRotation();
    this.pageSize = page.getCropBox();
    // Only shift coordinates when the crop box origin is not already (0,0).
    if (pageSize.getLowerLeftX() == 0 && pageSize.getLowerLeftY() == 0) {
        translateMatrix = null;
    } else {
        // translation matrix for cropbox
        translateMatrix = Matrix.getTranslateInstance(-pageSize.getLowerLeftX(), -pageSize.getLowerLeftY());
    }
    super.processPage(page);
}
/**
 * Called when a glyph is to be processed. The heuristic calculations here were originally
 * written by Ben Litchfield for PDFStreamEngine.
 *
 * @param textRenderingMatrix the current text rendering matrix (text space -&gt; device space).
 * @param font the font used to render this glyph.
 * @param code the internal character code of the glyph.
 * @param displacement the glyph displacement in text space.
 * @throws IOException if the glyph width or Unicode mapping cannot be read.
 */
@Override
protected void showGlyph(Matrix textRenderingMatrix, PDFont font, int code,Vector displacement) throws IOException {
    //
    // legacy calculations which were previously in PDFStreamEngine
    //
    // DO NOT USE THIS CODE UNLESS YOU ARE WORKING WITH PDFTextStripper.
    // THIS CODE IS DELIBERATELY INCORRECT
    //
    PDGraphicsState state = getGraphicsState();
    Matrix ctm = state.getCurrentTransformationMatrix();
    float fontSize = state.getTextState().getFontSize();
    float horizontalScaling = state.getTextState().getHorizontalScaling() / 100f;
    Matrix textMatrix = getTextMatrix();
    float displacementX = displacement.getX();
    // the sorting algorithm is based on the width of the character. As the displacement
    // for vertical characters doesn't provide any suitable value for it, we have to
    // calculate our own
    if (font.isVertical()) {
        displacementX = font.getWidth(code) / 1000;
        // there may be an additional scaling factor for true type fonts
        TrueTypeFont ttf = null;
        if (font instanceof PDTrueTypeFont) {
            ttf = ((PDTrueTypeFont) font).getTrueTypeFont();
        } else if (font instanceof PDType0Font) {
            PDCIDFont cidFont = ((PDType0Font) font).getDescendantFont();
            if (cidFont instanceof PDCIDFontType2) {
                ttf = ((PDCIDFontType2) cidFont).getTrueTypeFont();
            }
        }
        // TrueType widths are expressed in the font's own units-per-em; normalise them
        // to the 1000-units convention used by the division above.
        if (ttf != null && ttf.getUnitsPerEm() != 1000) {
            displacementX *= 1000f / ttf.getUnitsPerEm();
        }
    }
    //
    // legacy calculations which were previously in PDFStreamEngine
    //
    // DO NOT USE THIS CODE UNLESS YOU ARE WORKING WITH PDFTextStripper.
    // THIS CODE IS DELIBERATELY INCORRECT
    //
    // (modified) combined displacement, this is calculated *without* taking the character
    // spacing and word spacing into account, due to legacy code in TextStripper
    float tx = displacementX * fontSize * horizontalScaling;
    float ty = displacement.getY() * fontSize;
    // (modified) combined displacement matrix
    Matrix td = Matrix.getTranslateInstance(tx, ty);
    // (modified) text rendering matrix
    Matrix nextTextRenderingMatrix = td.multiply(textMatrix).multiply(ctm); // text space -> device space
    float nextX = nextTextRenderingMatrix.getTranslateX();
    float nextY = nextTextRenderingMatrix.getTranslateY();
    // (modified) width and height calculations
    float dxDisplay = nextX - textRenderingMatrix.getTranslateX();
    // font heights are expensive to compute, so they are cached per COS font object
    Float fontHeight = fontHeightMap.get(font.getCOSObject());
    if (fontHeight == null) {
        fontHeight = computeFontHeight(font);
        fontHeightMap.put(font.getCOSObject(), fontHeight);
    }
    float dyDisplay = fontHeight * textRenderingMatrix.getScalingFactorY();
    //
    // start of the original method
    //
    // Note on variable names. There are three different units being used in this code.
    // Character sizes are given in glyph units, text locations are initially given in text
    // units, and we want to save the data in display units. The variable names should end with
    // Text or Disp to represent if the values are in text or disp units (no glyph units are
    // saved).
    float glyphSpaceToTextSpaceFactor = 1 / 1000f;
    if (font instanceof PDType3Font) {
        // Type 3 fonts carry their own glyph-space -> text-space transform in the font matrix.
        glyphSpaceToTextSpaceFactor = font.getFontMatrix().getScaleX();
    }
    float spaceWidthText = 0;
    try {
        // to avoid crash as described in PDFBOX-614, see what the space displacement should be
        spaceWidthText = font.getSpaceWidth() * glyphSpaceToTextSpaceFactor;
    } catch (Throwable exception) {
        LOG.warn(exception, exception);
    }
    if (spaceWidthText == 0) {
        spaceWidthText = font.getAverageFontWidth() * glyphSpaceToTextSpaceFactor;
        // the average space width appears to be higher than necessary so make it smaller
        spaceWidthText *= .80f;
    }
    if (spaceWidthText == 0) {
        spaceWidthText = 1.0f; // if could not find font, use a generic value
    }
    // the space width has to be transformed into display units
    float spaceWidthDisplay = spaceWidthText * textRenderingMatrix.getScalingFactorX();
    // use our additional glyph list for Unicode mapping
    String unicodeMapping = font.toUnicode(code, glyphList);
    // when there is no Unicode mapping available, Acrobat simply coerces the character code
    // into Unicode, so we do the same. Subclasses of PDFStreamEngine don't necessarily want
    // this, which is why we leave it until this point in PDFTextStreamEngine.
    if (unicodeMapping == null) {
        if (font instanceof PDSimpleFont) {
            char c = (char) code;
            unicodeMapping = new String(new char[]{c});
        } else {
            // Acrobat doesn't seem to coerce composite font's character codes, instead it
            // skips them. See the "allah2.pdf" TestTextStripper file.
            return;
        }
    }
    // adjust for cropbox if needed
    Matrix translatedTextRenderingMatrix;
    if (translateMatrix == null) {
        translatedTextRenderingMatrix = textRenderingMatrix;
    } else {
        translatedTextRenderingMatrix = Matrix.concatenate(translateMatrix, textRenderingMatrix);
        nextX -= pageSize.getLowerLeftX();
        nextY -= pageSize.getLowerLeftY();
    }
    // This is a hack for unicode letter with 2 chars e.g. RA see unicodeProblem.pdf
    if (unicodeMapping.length() == 2) {
        // Emit one TextPosition per character so downstream sorting sees two glyphs;
        // both share the same code, font, size and geometry.
        processTextPosition(new TextPosition(pageRotation,
                pageSize.getWidth(),
                pageSize.getHeight(),
                translatedTextRenderingMatrix,
                nextX,
                nextY,
                Math.abs(dyDisplay),
                dxDisplay,
                Math.abs(spaceWidthDisplay),
                Character.toString(unicodeMapping.charAt(0)),
                new int[]{code},
                font,
                fontSize,
                (int) (fontSize * textMatrix.getScalingFactorX())));
        processTextPosition(new TextPosition(pageRotation,
                pageSize.getWidth(),
                pageSize.getHeight(),
                translatedTextRenderingMatrix,
                nextX,
                nextY,
                Math.abs(dyDisplay),
                dxDisplay,
                Math.abs(spaceWidthDisplay),
                Character.toString(unicodeMapping.charAt(1)),
                new int[]{code},
                font,
                fontSize,
                (int) (fontSize * textMatrix.getScalingFactorX())));
    } else {
        processTextPosition(new TextPosition(pageRotation,
                pageSize.getWidth(),
                pageSize.getHeight(),
                translatedTextRenderingMatrix,
                nextX,
                nextY,
                Math.abs(dyDisplay),
                dxDisplay,
                Math.abs(spaceWidthDisplay),
                unicodeMapping,
                new int[]{code},
                font,
                fontSize,
                (int) (fontSize * textMatrix.getScalingFactorX())));
    }
}
/**
 * Compute the font height. Override this if you want to use own calculations.
 *
 * @param font the font.
 * @return the font height in text space units.
 * @throws IOException if there is an error while getting the font bounding box.
 */
protected float computeFontHeight(PDFont font) throws IOException {
    BoundingBox boundingBox = font.getBoundingBox();
    // PDFBOX-2158 and PDFBOX-3130: files by Salmat eSolutions / ClibPDF Library
    // write a hugely negative lower-left y; mirror it back into a sane range.
    if (boundingBox.getLowerLeftY() < Short.MIN_VALUE) {
        boundingBox.setLowerLeftY(-(boundingBox.getLowerLeftY() + 65536));
    }
    // 1/2 the bbox is used as the height todo: why?
    float heightInGlyphSpace = boundingBox.getHeight() / 2;
    PDFontDescriptor descriptor = font.getFontDescriptor();
    if (descriptor != null) {
        // sometimes the bbox has very high values, but CapHeight is OK
        float capHeight = descriptor.getCapHeight();
        boolean capHeightUsable = Float.compare(capHeight, 0) != 0
                && (capHeight < heightInGlyphSpace || Float.compare(heightInGlyphSpace, 0) == 0);
        if (capHeightUsable) {
            heightInGlyphSpace = capHeight;
        }
        // PDFBOX-3464, PDFBOX-4480, PDFBOX-4553:
        // sometimes even CapHeight has very high value, but Ascent and Descent are ok
        float ascent = descriptor.getAscent();
        float descent = descriptor.getDescent();
        boolean ascentDescentUsable = capHeight > ascent && ascent > 0 && descent < 0
                && ((ascent - descent) / 2 < heightInGlyphSpace || Float.compare(heightInGlyphSpace, 0) == 0);
        if (ascentDescentUsable) {
            heightInGlyphSpace = (ascent - descent) / 2;
        }
    }
    // transformPoint from glyph space -> text space; Type 3 fonts define their own font matrix,
    // every other font type uses the 1000-units-per-em convention.
    return font instanceof PDType3Font
            ? font.getFontMatrix().transformPoint(0, heightInGlyphSpace).y
            : heightInGlyphSpace / 1000;
}
/**
 * A method provided as an event interface to allow a subclass to perform some specific
 * functionality when text needs to be processed. The default implementation is a no-op.
 *
 * @param text The text to be processed.
 */
protected void processTextPosition(TextPosition text) {
    // subclasses can override to provide specific functionality
}
}

View File

@ -0,0 +1,82 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.parsing;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.pdfbox.text.PDFTextStripperByArea;
import org.apache.pdfbox.text.TextPosition;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.TextPositionSequence;
import lombok.Getter;
import lombok.Setter;
public class PDFAreaTextStripper extends PDFTextStripperByArea {

    /** Word-level sequences collected while stripping; one entry per detected word/chunk. */
    @Getter
    private List<TextPositionSequence> textPositionSequences = new ArrayList<>();

    /** Page number stamped onto every emitted {@link TextPositionSequence}. */
    @Setter
    private int pageNumber;

    public PDFAreaTextStripper() throws IOException {
    }

    /**
     * Splits the glyphs of one written string into word-like {@link TextPositionSequence}s.
     * A new sequence starts when the x position jumps backwards (out-of-order glyphs, e.g.
     * Metolachlor2.pdf), when there is a horizontal gap wider than one unit between glyphs
     * (unrotated text only), or at a (non-breaking) space. Sequences that are empty or
     * consist of a single space are discarded.
     *
     * @param text the rendered text.
     * @param textPositions the glyph positions making up {@code text}.
     * @throws IOException if the parent stripper fails to write.
     */
    @Override
    public void writeString(String text, List<TextPosition> textPositions) throws IOException {
        int startIndex = 0;
        for (int i = 0; i <= textPositions.size() - 1; i++) {
            TextPosition current = textPositions.get(i);
            // Drop a leading (non-breaking) space entirely.
            if (i == 0 && isBlank(current)) {
                startIndex++;
                continue;
            }
            // Strange but sometimes this is happening, for example: Metolachlor2.pdf
            if (i > 0 && current.getX() < textPositions.get(i - 1).getX()) {
                addSequence(textPositions.subList(startIndex, i));
                startIndex = i;
            }
            // A horizontal gap of more than one unit ends the current word (unrotated text only).
            if (current.getRotation() == 0 && i > 0 && current.getX() > textPositions.get(i - 1).getEndX() + 1) {
                addSequence(textPositions.subList(startIndex, i));
                startIndex = i;
            }
            // A space between two glyphs ends the current word; the space itself is skipped.
            if (i > 0 && isBlank(current) && i <= textPositions.size() - 2) {
                addSequence(textPositions.subList(startIndex, i));
                startIndex = i + 1;
            }
        }
        // Flush the remainder, dropping a single trailing (non-breaking) space.
        List<TextPosition> sublist = textPositions.subList(startIndex, textPositions.size());
        if (!sublist.isEmpty() && isBlank(sublist.get(sublist.size() - 1))) {
            sublist = sublist.subList(0, sublist.size() - 1);
        }
        addSequence(sublist);
        super.writeString(text);
    }

    /** Resets the collected sequences, e.g. between pages. */
    public void clearPositions() {
        textPositionSequences = new ArrayList<>();
    }

    /** @return true if the glyph is a plain or non-breaking space. */
    private static boolean isBlank(TextPosition position) {
        return position.getUnicode().equals(" ") || position.getUnicode().equals("\u00A0");
    }

    /** Emits a sequence unless it is empty or consists of a single (non-breaking) space. */
    private void addSequence(List<TextPosition> sublist) {
        if (!(sublist.isEmpty() || sublist.size() == 1 && isBlank(sublist.get(0)))) {
            textPositionSequences.add(new TextPositionSequence(sublist, pageNumber));
        }
    }
}

View File

@ -0,0 +1,335 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.parsing;
import java.awt.geom.Point2D;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import org.apache.pdfbox.contentstream.operator.Operator;
import org.apache.pdfbox.contentstream.operator.OperatorName;
import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingColor;
import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingColorN;
import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingColorSpace;
import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingDeviceCMYKColor;
import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingDeviceGrayColor;
import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingDeviceRGBColor;
import org.apache.pdfbox.contentstream.operator.color.SetStrokingColor;
import org.apache.pdfbox.contentstream.operator.color.SetStrokingColorN;
import org.apache.pdfbox.contentstream.operator.color.SetStrokingColorSpace;
import org.apache.pdfbox.contentstream.operator.color.SetStrokingDeviceCMYKColor;
import org.apache.pdfbox.contentstream.operator.color.SetStrokingDeviceGrayColor;
import org.apache.pdfbox.contentstream.operator.color.SetStrokingDeviceRGBColor;
import org.apache.pdfbox.contentstream.operator.state.SetFlatness;
import org.apache.pdfbox.contentstream.operator.state.SetLineCapStyle;
import org.apache.pdfbox.contentstream.operator.state.SetLineDashPattern;
import org.apache.pdfbox.contentstream.operator.state.SetLineJoinStyle;
import org.apache.pdfbox.contentstream.operator.state.SetLineMiterLimit;
import org.apache.pdfbox.contentstream.operator.state.SetLineWidth;
import org.apache.pdfbox.contentstream.operator.state.SetRenderingIntent;
import org.apache.pdfbox.contentstream.operator.text.SetFontAndSize;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSNumber;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.text.TextPosition;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.RedTextPosition;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.Ruling;
import lombok.Getter;
import lombok.Setter;
import lombok.extern.slf4j.Slf4j;
@Slf4j
public class PDFLinesTextStripper extends PDFTextStripper {

    /** Word-level sequences collected while stripping; one entry per detected word/chunk. */
    @Getter
    private final List<TextPositionSequence> textPositionSequences = new ArrayList<>();

    /** Black (toRGB() == 0) line segments collected from filled/stroked paths, for table detection. */
    @Getter
    private final List<Ruling> rulings = new ArrayList<>();

    /** Segments of the path currently being built; flushed to {@link #rulings} on fill/stroke. */
    private final List<Ruling> graphicsPath = new ArrayList<>();

    @Setter
    protected PDPage pdpage;

    /** Character size statistics in truncated integer units, updated in {@link #writeString}. */
    @Getter
    private int minCharWidth;
    @Getter
    private int maxCharWidth;
    @Getter
    private int minCharHeight;
    @Getter
    private int maxCharHeight;

    /** Current point of the graphics path (set by move-to, advanced by line-to). */
    private float path_x;
    private float path_y;

    /** Page number stamped onto every emitted {@link TextPositionSequence}. */
    @Setter
    private int pageNumber;

    /**
     * Registers the color/state/text operators so that stroking and non-stroking colors
     * are tracked while the content stream is processed (needed by {@link #addVisibleRulings}).
     *
     * @throws IOException if the parent stripper fails to initialise.
     */
    public PDFLinesTextStripper() throws IOException {
        super();
        this.addOperator(new SetStrokingColorSpace());
        this.addOperator(new SetNonStrokingColorSpace());
        this.addOperator(new SetLineDashPattern());
        this.addOperator(new SetStrokingDeviceGrayColor());
        this.addOperator(new SetNonStrokingDeviceGrayColor());
        this.addOperator(new SetFlatness());
        this.addOperator(new SetLineJoinStyle());
        this.addOperator(new SetLineCapStyle());
        this.addOperator(new SetStrokingDeviceCMYKColor());
        this.addOperator(new SetNonStrokingDeviceCMYKColor());
        this.addOperator(new SetLineMiterLimit());
        this.addOperator(new SetStrokingDeviceRGBColor());
        this.addOperator(new SetNonStrokingDeviceRGBColor());
        this.addOperator(new SetRenderingIntent());
        this.addOperator(new SetStrokingColor());
        this.addOperator(new SetNonStrokingColor());
        this.addOperator(new SetStrokingColorN());
        this.addOperator(new SetNonStrokingColorN());
        this.addOperator(new SetFontAndSize());
        this.addOperator(new SetLineWidth());
    }

    /**
     * Intercepts path construction and painting operators to collect straight line
     * segments (rulings) before delegating to the parent engine.
     *
     * @param operator the operator to process.
     * @param arguments the operator's operands.
     * @throws IOException if the parent engine fails to process the operator.
     */
    @Override
    protected void processOperator(Operator operator, List<COSBase> arguments) throws IOException {
        String operation = operator.getName();
        switch (operation) {
            //move
            case OperatorName.MOVE_TO:
                if (arguments.size() == 2) {
                    Point2D.Float pos = transformPosition(floatValue(arguments.get(0)), floatValue(arguments.get(1)));
                    path_x = (float) pos.getX();
                    path_y = (float) pos.getY();
                }
                break;
            //line
            case OperatorName.LINE_TO:
                if (arguments.size() == 2) {
                    Point2D.Float pos = transformPosition(floatValue(arguments.get(0)), floatValue(arguments.get(1)));
                    // The direction of vertical lines must always be from bottom to top for the table extraction algorithm.
                    if (pos.getY() > path_y) {
                        graphicsPath.add(new Ruling(new Point2D.Float(path_x, path_y), new Point2D.Float((float) pos.getX(), (float) pos.getY())));
                    } else {
                        graphicsPath.add(new Ruling(new Point2D.Float(path_x, (float) pos.getY()), new Point2D.Float((float) pos.getX(), path_y)));
                    }
                    path_x = (float) pos.getX();
                    path_y = (float) pos.getY();
                }
                break;
            //rectangle
            case OperatorName.APPEND_RECT:
                if (arguments.size() == 4) {
                    float x = floatValue(arguments.get(0));
                    float y = floatValue(arguments.get(1));
                    float width = floatValue(arguments.get(2));
                    float height = floatValue(arguments.get(3));
                    Point2D p1 = transformPosition(x, y);
                    Point2D p2 = transformPosition(x + width, y + height);
                    // Horizontal lines
                    graphicsPath.add(new Ruling(new Point2D.Float((float) p1.getX(), (float) p1.getY()), new Point2D.Float((float) p2.getX(), (float) p1.getY())));
                    graphicsPath.add(new Ruling(new Point2D.Float((float) p1.getX(), (float) p2.getY()), new Point2D.Float((float) p2.getX(), (float) p2.getY())));
                    // Vertical lines, direction must always be from bottom to top for the table extraction algorithm.
                    addVerticalEdge((float) p2.getX(), (float) p1.getY(), (float) p2.getY());
                    addVerticalEdge((float) p1.getX(), (float) p1.getY(), (float) p2.getY());
                }
                break;
            //fill
            case OperatorName.FILL_NON_ZERO:
            case OperatorName.LEGACY_FILL_NON_ZERO:
            case OperatorName.FILL_EVEN_ODD:
                addVisibleRulings(graphicsPath, false);
                graphicsPath.clear();
                break;
            //stroke
            case OperatorName.STROKE_PATH:
                addVisibleRulings(graphicsPath, true);
                graphicsPath.clear();
                break;
            //cancel path
            case OperatorName.ENDPATH:
                graphicsPath.clear();
                break;
        }
        super.processOperator(operator, arguments);
    }

    /** Adds a vertical ruling at {@code x}, ordered bottom-to-top as the table algorithm requires. */
    private void addVerticalEdge(float x, float yA, float yB) {
        if (yB > yA) {
            graphicsPath.add(new Ruling(new Point2D.Float(x, yA), new Point2D.Float(x, yB)));
        } else {
            graphicsPath.add(new Ruling(new Point2D.Float(x, yB), new Point2D.Float(x, yA)));
        }
    }

    /** @return the numeric value of the operand, or 0 when it is not a number. */
    private float floatValue(COSBase value) {
        if (value instanceof COSNumber) {
            return ((COSNumber) value).floatValue();
        } else {
            return 0;
        }
    }

    /** Transforms a content-stream coordinate into device space via the current CTM. */
    private Point2D.Float transformPosition(float x, float y) {
        return super.transformedPoint(x, y);
    }

    /**
     * Moves the given path segments into {@link #rulings} when they are painted in
     * solid black (toRGB() == 0); colored or patterned paths are discarded.
     *
     * @param path the collected path segments.
     * @param stroke true when the path is stroked, false when it is filled.
     * @throws IOException if color conversion fails in an unexpected way.
     */
    private void addVisibleRulings(List<Ruling> path, boolean stroke) throws IOException {
        try {
            var color = stroke ? getGraphicsState().getStrokingColor() : getGraphicsState().getNonStrokingColor();
            if (!color.isPattern() && color.toRGB() == 0) {
                rulings.addAll(path);
            }
        } catch (UnsupportedOperationException e) {
            log.debug("UnsupportedOperationException: " + getGraphicsState().getStrokingColor().getColorSpace().getName() + " or " + getGraphicsState().getNonStrokingColor()
                    .getColorSpace()
                    .getName() + " does not support toRGB");
        }
    }

    /**
     * Splits the glyphs of one written string into word-like {@link TextPositionSequence}s,
     * updating the min/max character size statistics on the way. A new sequence starts when
     * the x position jumps backwards, at a horizontal gap wider than one unit (unrotated text
     * only), or at whitespace; sequences split by a "false" whitespace (no actual gap to the
     * previously emitted sequence) are merged back together.
     *
     * @param text the rendered text.
     * @param textPositions the glyph positions making up {@code text}.
     * @throws IOException if the parent stripper fails to write.
     */
    @Override
    public void writeString(String text, List<TextPosition> textPositions) throws IOException {
        int startIndex = 0;
        RedTextPosition previous = null;
        textPositions.sort(Comparator.comparing(TextPosition::getXDirAdj));
        for (int i = 0; i <= textPositions.size() - 1; i++) {
            if (!textPositionSequences.isEmpty()) {
                // Last glyph of the last emitted sequence, used to detect false sequence ends.
                TextPositionSequence lastSequence = textPositionSequences.get(textPositionSequences.size() - 1);
                previous = lastSequence.getTextPositions().get(lastSequence.getTextPositions().size() - 1);
            }
            TextPosition current = textPositions.get(i);
            int charWidth = (int) current.getWidthDirAdj();
            minCharWidth = Math.min(minCharWidth, charWidth);
            maxCharWidth = Math.max(maxCharWidth, charWidth);
            int charHeight = (int) current.getHeightDir();
            minCharHeight = Math.min(minCharHeight, charHeight);
            // Bug fix: this previously compared charWidth against maxCharHeight, so the
            // maximum character height was updated on the wrong condition.
            maxCharHeight = Math.max(maxCharHeight, charHeight);
            // Drop leading whitespace entirely.
            if (i == 0 && isBlank(current)) {
                startIndex++;
                continue;
            }
            // Strange but sometimes this is happening, for example: Metolachlor2.pdf
            if (i > 0 && current.getXDirAdj() < textPositions.get(i - 1).getXDirAdj()) {
                addIfMeaningful(textPositions.subList(startIndex, i));
                startIndex = i;
            }
            // A horizontal gap of more than one unit ends the current word (unrotated text only).
            if (current.getRotation() == 0 && i > 0 && current.getX() > textPositions.get(i - 1).getEndX() + 1) {
                addIfMeaningful(textPositions.subList(startIndex, i));
                startIndex = i;
            }
            // Whitespace between two glyphs ends the current word; the whitespace itself is skipped.
            if (i > 0 && isBlank(current) && i <= textPositions.size() - 2) {
                List<TextPosition> sublist = textPositions.subList(startIndex, i);
                if (!isDiscardable(sublist)) {
                    addOrMergeWithPrevious(sublist, previous);
                }
                startIndex = i + 1;
            }
        }
        // Flush the remainder, dropping a single trailing whitespace glyph.
        List<TextPosition> sublist = textPositions.subList(startIndex, textPositions.size());
        if (!sublist.isEmpty() && isBlank(sublist.get(sublist.size() - 1))) {
            sublist = sublist.subList(0, sublist.size() - 1);
        }
        if (!isDiscardable(sublist)) {
            addOrMergeWithPrevious(sublist, previous);
        }
        super.writeString(text);
    }

    /** @return true if the glyph is a space, non-breaking space, or tab. */
    private static boolean isBlank(TextPosition position) {
        String unicode = position.getUnicode();
        return unicode.equals(" ") || unicode.equals("\u00A0") || unicode.equals("\t");
    }

    /** @return true if the sublist is empty or consists of a single whitespace glyph. */
    private static boolean isDiscardable(List<TextPosition> sublist) {
        return sublist.isEmpty() || sublist.size() == 1 && isBlank(sublist.get(0));
    }

    /** Emits a sequence unless it is empty or a single whitespace glyph. */
    private void addIfMeaningful(List<TextPosition> sublist) {
        if (!isDiscardable(sublist)) {
            textPositionSequences.add(new TextPositionSequence(sublist, pageNumber));
        }
    }

    /**
     * Emits a sequence, or — when it starts exactly where the previously emitted sequence
     * ended (same baseline, no horizontal gap, i.e. a "false" whitespace split) — appends
     * its glyphs to that previous sequence instead.
     */
    private void addOrMergeWithPrevious(List<TextPosition> sublist, RedTextPosition previous) {
        if (previous != null && sublist.get(0).getYDirAdj() == previous.getYDirAdj() && sublist.get(0)
                .getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()) < 0.01) {
            for (TextPosition textPosition : sublist) {
                textPositionSequences.get(textPositionSequences.size() - 1).add(textPosition);
            }
        } else {
            textPositionSequences.add(new TextPositionSequence(sublist, pageNumber));
        }
    }

    /**
     * Resets all per-document state so the stripper instance can be reused, then
     * delegates to the parent.
     *
     * @param doc the document to strip.
     * @return the extracted text.
     * @throws IOException if stripping fails.
     */
    @Override
    public String getText(PDDocument doc) throws IOException {
        minCharWidth = Integer.MAX_VALUE;
        maxCharWidth = 0;
        minCharHeight = Integer.MAX_VALUE;
        maxCharHeight = 0;
        textPositionSequences.clear();
        rulings.clear();
        graphicsPath.clear();
        path_x = 0.0f;
        path_y = 0.0f;
        return super.getText(doc);
    }
}

View File

@ -0,0 +1,279 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.service;
import static java.util.stream.Collectors.toSet;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.FloatFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.TextBlockOrientation;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.StringFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.AbstractTextContainer;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.RulingTextDirAdjustUtil;
@Service
@SuppressWarnings("all")
public class BlockificationService {
    // Maximum distance at which two y coordinates still count as equal when merging blocks.
    static final float THRESHOLD = 1f;
    /**
     * This method is building blocks by expanding the minX/maxX and minY/maxY value on each word that is not split by the conditions.
     * This method must use text direction adjusted postions (DirAdj). Where {0,0} is on the upper left. Never try to change this!
     * Rulings (Table lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling.
     *
     * @param textPositions The words of a page.
     * @param horizontalRulingLines Horizontal table lines.
     * @param verticalRulingLines Vertical table lines.
     * @return ClassificationPage object that contains the Textblock and text statistics.
     */
    public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
        int indexOnPage = 0;
        List<TextPositionSequence> chunkWords = new ArrayList<>();
        List<AbstractTextContainer> chunkBlockList1 = new ArrayList<>();
        // Running bounding box of the block currently being collected.
        // NOTE(review): 1000 is assumed to exceed any DirAdj coordinate — confirm for oversized pages.
        float minX = 1000, maxX = 0, minY = 1000, maxY = 0;
        TextPositionSequence prev = null;
        boolean wasSplitted = false;
        Float splitX1 = null;
        for (TextPositionSequence word : textPositions) {
            // Vertical gap of more than 1.25 line heights to the block collected so far.
            boolean lineSeparation = word.getMinYDirAdj() - maxY > word.getHeight() * 1.25;
            // The word starts above the previous word, i.e. reading restarted further up.
            boolean startFromTop = prev != null && word.getMinYDirAdj() < prev.getMinYDirAdj() - prev.getTextHeight();
            // Wide horizontal gap (> 50 units) on the same line, e.g. a column boundary.
            boolean splitByX = prev != null && maxX + 50 < word.getMinXDirAdj() && prev.getMinYDirAdj() == word.getMinYDirAdj();
            // NOTE(review): xIsBeforeFirstX is computed but never used below — verify if it can be removed.
            boolean xIsBeforeFirstX = prev != null && word.getMinXDirAdj() < minX;
            // splitX1 is only unboxed when wasSplitted is true (short-circuit), which guarantees non-null.
            boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj();
            boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, horizontalRulingLines, verticalRulingLines);
            // The text direction changed, e.g. from horizontal to rotated text.
            boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir());
            if (prev != null && (lineSeparation || startFromTop || splitByX || splitByDir || isSplitByRuling)) {
                TextBlockOrientation prevOrientation = null;
                if (!chunkBlockList1.isEmpty()) {
                    prevOrientation = chunkBlockList1.get(chunkBlockList1.size() - 1).getOrientation();
                }
                // Close the current block and start collecting a new one with this word.
                ClassificationTextBlock cb1 = buildTextBlock(chunkWords, indexOnPage);
                indexOnPage++;
                chunkBlockList1.add(cb1);
                chunkWords = new ArrayList<>();
                if (splitByX && !isSplitByRuling) {
                    // Horizontal split: the closed block becomes the LEFT column, remember the split x.
                    wasSplitted = true;
                    cb1.setOrientation(TextBlockOrientation.LEFT);
                    splitX1 = word.getMinXDirAdj();
                } else if (newLineAfterSplit && !isSplitByRuling) {
                    // Line after a split that does not continue at the split x: close the RIGHT column.
                    wasSplitted = false;
                    cb1.setOrientation(TextBlockOrientation.RIGHT);
                    splitX1 = null;
                } else if (prevOrientation != null && prevOrientation.equals(TextBlockOrientation.RIGHT) && (lineSeparation || !startFromTop || !splitByX || !newLineAfterSplit || !isSplitByRuling)) {
                    // NOTE(review): this OR of negated flags is almost always true — verify the intended condition.
                    cb1.setOrientation(TextBlockOrientation.LEFT);
                }
                // Reset the running bounding box for the next block.
                minX = 1000;
                maxX = 0;
                minY = 1000;
                maxY = 0;
                prev = null;
            }
            chunkWords.add(word);
            prev = word;
            // Expand the running bounding box by this word.
            if (word.getMinXDirAdj() < minX) {
                minX = word.getMinXDirAdj();
            }
            if (word.getMaxXDirAdj() > maxX) {
                maxX = word.getMaxXDirAdj();
            }
            if (word.getMinYDirAdj() < minY) {
                minY = word.getMinYDirAdj();
            }
            if (word.getMaxYDirAdj() > maxY) {
                maxY = word.getMaxYDirAdj();
            }
        }
        // Flush the last block, if any words remain (buildTextBlock returns null for an empty list).
        ClassificationTextBlock cb1 = buildTextBlock(chunkWords, indexOnPage);
        if (cb1 != null) {
            chunkBlockList1.add(cb1);
        }
        // First merge pass: join vertically overlapping blocks of the same column orientation.
        Iterator<AbstractTextContainer> itty = chunkBlockList1.iterator();
        ClassificationTextBlock previousLeft = null;
        ClassificationTextBlock previousRight = null;
        while (itty.hasNext()) {
            ClassificationTextBlock block = (ClassificationTextBlock) itty.next();
            if (previousLeft != null && block.getOrientation().equals(TextBlockOrientation.LEFT)) {
                if (previousLeft.getMinY() > block.getMinY() && block.getMaxY() + block.getMostPopularWordHeight() > previousLeft.getMinY()) {
                    previousLeft.add(block);
                    itty.remove();
                    continue;
                }
            }
            if (previousRight != null && block.getOrientation().equals(TextBlockOrientation.RIGHT)) {
                if (previousRight.getMinY() > block.getMinY() && block.getMaxY() + block.getMostPopularWordHeight() > previousRight.getMinY()) {
                    previousRight.add(block);
                    itty.remove();
                    continue;
                }
            }
            if (block.getOrientation().equals(TextBlockOrientation.LEFT)) {
                previousLeft = block;
            } else if (block.getOrientation().equals(TextBlockOrientation.RIGHT)) {
                previousRight = block;
            }
        }
        // Second merge pass: join consecutive blocks whose bottom edges align within THRESHOLD
        // when the previous block is a LEFT column.
        itty = chunkBlockList1.iterator();
        ClassificationTextBlock previous = null;
        while (itty.hasNext()) {
            ClassificationTextBlock block = (ClassificationTextBlock) itty.next();
            if (previous != null && previous.getOrientation().equals(TextBlockOrientation.LEFT) && block.getOrientation().equals(TextBlockOrientation.LEFT) && equalsWithThreshold(block.getMaxY(),
                    previous.getMaxY()) || previous != null && previous.getOrientation().equals(TextBlockOrientation.LEFT) && block.getOrientation()
                    .equals(TextBlockOrientation.RIGHT) && equalsWithThreshold(block.getMaxY(), previous.getMaxY())) {
                previous.add(block);
                itty.remove();
                continue;
            }
            previous = block;
        }
        return new ClassificationPage(chunkBlockList1);
    }
    /** @return true if the two values differ by less than {@link #THRESHOLD}. */
    private boolean equalsWithThreshold(float f1, float f2) {
        return Math.abs(f1 - f2) < THRESHOLD;
    }
    /**
     * Builds a {@link ClassificationTextBlock} from the given words, growing the block's
     * bounding box over every word and collecting font/size/height frequency statistics.
     *
     * @param wordBlockList the words belonging to one block.
     * @param indexOnPage running index of the block on the page.
     * @return the built block, or null when {@code wordBlockList} is empty.
     */
    private ClassificationTextBlock buildTextBlock(List<TextPositionSequence> wordBlockList, int indexOnPage) {
        ClassificationTextBlock textBlock = null;
        FloatFrequencyCounter lineHeightFrequencyCounter = new FloatFrequencyCounter();
        FloatFrequencyCounter fontSizeFrequencyCounter = new FloatFrequencyCounter();
        FloatFrequencyCounter spaceFrequencyCounter = new FloatFrequencyCounter();
        StringFrequencyCounter fontFrequencyCounter = new StringFrequencyCounter();
        StringFrequencyCounter styleFrequencyCounter = new StringFrequencyCounter();
        for (TextPositionSequence wordBlock : wordBlockList) {
            lineHeightFrequencyCounter.add(wordBlock.getTextHeight());
            fontSizeFrequencyCounter.add(wordBlock.getFontSize());
            spaceFrequencyCounter.add(wordBlock.getSpaceWidth());
            fontFrequencyCounter.add(wordBlock.getFont());
            styleFrequencyCounter.add(wordBlock.getFontStyle());
            if (textBlock == null) {
                // The first word initialises the block's bounding box.
                textBlock = new ClassificationTextBlock(wordBlock.getMinXDirAdj(),
                        wordBlock.getMaxXDirAdj(),
                        wordBlock.getMinYDirAdj(),
                        wordBlock.getMaxYDirAdj(),
                        wordBlockList,
                        wordBlock.getRotation(),
                        indexOnPage);
            } else {
                // Subsequent words grow the bounding box via union.
                ClassificationTextBlock spatialEntity = textBlock.union(wordBlock);
                textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight());
            }
        }
        if (textBlock != null) {
            textBlock.setMostPopularWordFont(fontFrequencyCounter.getMostPopular());
            textBlock.setMostPopularWordStyle(styleFrequencyCounter.getMostPopular());
            textBlock.setMostPopularWordFontSize(fontSizeFrequencyCounter.getMostPopular());
            textBlock.setMostPopularWordHeight(lineHeightFrequencyCounter.getMostPopular());
            textBlock.setMostPopularWordSpaceWidth(spaceFrequencyCounter.getMostPopular());
            textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest());
        }
        // If every sequence shares the same (rounded) top y, the block is a single line:
        // sort its sequences left-to-right.
        if (textBlock != null && textBlock.getSequences() != null && textBlock.getSequences().stream().map(t -> round(t.getMinYDirAdj(), 3)).collect(toSet()).size() == 1) {
            textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj));
        }
        return textBlock;
    }
    /**
     * Checks whether any horizontal or vertical ruling separates the current block's
     * bounding box from the given word, testing both the horizontal and the vertical
     * connection segments against both ruling sets.
     */
    private boolean isSplitByRuling(float minX,
                                    float minY,
                                    float maxX,
                                    float maxY,
                                    TextPositionSequence word,
                                    List<Ruling> horizontalRulingLines,
                                    List<Ruling> verticalRulingLines) {
        return isSplitByRuling(maxX,
                minY,
                word.getMinXDirAdj(),
                word.getMinYDirAdj(),
                verticalRulingLines,
                word.getDir().getDegrees(),
                word.getPageWidth(),
                word.getPageHeight()) //
                || isSplitByRuling(minX,
                minY,
                word.getMinXDirAdj(),
                word.getMaxYDirAdj(),
                horizontalRulingLines,
                word.getDir().getDegrees(),
                word.getPageWidth(),
                word.getPageHeight()) //
                || isSplitByRuling(maxX,
                minY,
                word.getMinXDirAdj(),
                word.getMinYDirAdj(),
                horizontalRulingLines,
                word.getDir().getDegrees(),
                word.getPageWidth(),
                word.getPageHeight()) //
                || isSplitByRuling(minX,
                minY,
                word.getMinXDirAdj(),
                word.getMaxYDirAdj(),
                verticalRulingLines,
                word.getDir().getDegrees(),
                word.getPageWidth(),
                word.getPageHeight()); //
    }
    /**
     * @return true if any ruling (converted into the word's text-direction-adjusted
     * coordinate system) intersects the segment between the two given points.
     */
    private boolean isSplitByRuling(float previousX2, float previousY1, float currentX1, float currentY1, List<Ruling> rulingLines, float dir, float pageWidth, float pageHeight) {
        for (Ruling ruling : rulingLines) {
            var line = RulingTextDirAdjustUtil.convertToDirAdj(ruling, dir, pageWidth, pageHeight);
            if (line.intersectsLine(previousX2, previousY1, currentX1, currentY1)) {
                return true;
            }
        }
        return false;
    }
    /** Rounds {@code value} to the given number of decimal places. */
    private double round(float value, int decimalPoints) {
        var d = Math.pow(10, decimalPoints);
        return Math.round(value * d) / d;
    }
}

View File

@ -0,0 +1,160 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.service;
import java.util.List;
import org.springframework.stereotype.Service;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.FloatFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.AbstractTextContainer;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.TableCell;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.Table;
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.PositionUtils;
@Service
public class BodyTextFrameService {
/**
 * Adjusts and sets the body text frame to a classificationPage.
 * Note: This needs to use Pdf Coordinate System where {0,0} rotated with the classificationPage rotation.
 * 0 -> LowerLeft
 * 90 -> UpperLeft
 * 180 -> UpperRight
 * 270 -> LowerRight
 * The aspect ratio of the classificationPage is also regarded.
 *
 * @param classificationPage The classificationPage
 * @param bodyTextFrame frame that contains the main text on portrait pages
 * @param landscapeBodyTextFrame frame that contains the main text on landscape pages
 */
public void setBodyTextFrameAdjustedToPage(ClassificationPage classificationPage, Rectangle bodyTextFrame, Rectangle landscapeBodyTextFrame) {
// Pick the frame that matches the page orientation.
Rectangle textFrame = classificationPage.isLandscape() ? landscapeBodyTextFrame : bodyTextFrame;
if (classificationPage.getPageWidth() > classificationPage.getPageHeight() && classificationPage.getRotation() == 270) {
// Physically landscape page rotated 270 degrees: swap the axes and mirror along the page height.
textFrame = new Rectangle(new Point(textFrame.getTopLeft().getY(), classificationPage.getPageHeight() - textFrame.getTopLeft().getX() - textFrame.getWidth()),
textFrame.getHeight(),
textFrame.getWidth(),
0);
} else if (classificationPage.getPageWidth() > classificationPage.getPageHeight() && classificationPage.getRotation() != 0) {
// Physically landscape page with any other non-zero rotation: swap the axes without mirroring.
// NOTE(review): the last Rectangle argument is getPageNumber() here while the other branches pass 0 —
// looks like a copy-paste slip; confirm the meaning of the 4th constructor parameter before changing.
textFrame = new Rectangle(new Point(textFrame.getTopLeft().getY(), textFrame.getTopLeft().getX()), textFrame.getHeight(), textFrame.getWidth(), classificationPage.getPageNumber());
} else if (classificationPage.getRotation() == 180) {
// Portrait page rotated 180 degrees: mirror the frame vertically, width/height keep their roles.
textFrame = new Rectangle(new Point(textFrame.getTopLeft().getX(), classificationPage.getPageHeight() - textFrame.getTopLeft().getY() - textFrame.getHeight()),
textFrame.getWidth(),
textFrame.getHeight(),
0);
}
classificationPage.setBodyTextFrame(textFrame);
}
/**
 * Calculates the frame that contains the main text, text outside the frame will be e.g. headers or footers.
 * Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation.
 * 0 -> LowerLeft
 * 90 -> UpperLeft
 * 180 -> UpperRight
 * 270 -> LowerRight
 * The aspect ratio of the page is also regarded.
 *
 * @param classificationPages List of all classificationPages
 * @param documentFontSizeCounter Statistics of the document
 * @param landscape Calculate for landscape or portrait
 * @return Rectangle of the text frame
 */
public Rectangle calculateBodyTextFrame(List<ClassificationPage> classificationPages, FloatFrequencyCounter documentFontSizeCounter, boolean landscape) {
BodyTextFrameExpansionsRectangle expansionsRectangle = new BodyTextFrameExpansionsRectangle();
for (ClassificationPage classificationPage : classificationPages) {
// Only pages of the requested orientation that actually contain text contribute to the frame.
if (classificationPage.getTextBlocks().isEmpty() || landscape != classificationPage.isLandscape()) {
continue;
}
for (AbstractTextContainer container : classificationPage.getTextBlocks()) {
if (container instanceof ClassificationTextBlock) {
ClassificationTextBlock textBlock = (ClassificationTextBlock) container;
// Skip blocks without font statistics; they cannot be compared to the document statistics.
if (textBlock.getMostPopularWordFont() == null || textBlock.getMostPopularWordStyle() == null) {
continue;
}
float approxLineCount = PositionUtils.getApproxLineCount(textBlock);
// Only multi-line blocks (roughly 3+ lines) count as body text; short blocks could be headers/footers.
if (approxLineCount < 2.9f) {
continue;
}
// Only blocks at least as large as the document's dominant font size expand the frame.
if (documentFontSizeCounter.getMostPopular() != null && textBlock.getMostPopularWordFontSize() >= documentFontSizeCounter.getMostPopular()) {
expandRectangle(textBlock, classificationPage, expansionsRectangle);
}
}
if (container instanceof Table) {
// Table content always counts as body text, regardless of font size or line count.
Table table = (Table) container;
for (List<TableCell> row : table.getRows()) {
for (TableCell cell : row) {
if (cell == null || cell.getTextBlocks() == null) {
continue;
}
for (ClassificationTextBlock textBlock : cell.getTextBlocks()) {
expandRectangle(textBlock, classificationPage, expansionsRectangle);
}
}
}
}
}
}
// Collapse the accumulated extremes into a top-left + width/height rectangle.
return new Rectangle(new Point(expansionsRectangle.minX, expansionsRectangle.minY),
expansionsRectangle.maxX - expansionsRectangle.minX,
expansionsRectangle.maxY - expansionsRectangle.minY,
0);
}
// Grows the running min/max extremes so they cover the given text block. On physically landscape
// pages with a non-zero rotation the pdf y axis maps onto the frame's x axis (and vice versa),
// hence the swapped getters in the first branch.
private void expandRectangle(ClassificationTextBlock textBlock, ClassificationPage classificationPage, BodyTextFrameExpansionsRectangle expansionsRectangle) {
if (classificationPage.getPageWidth() > classificationPage.getPageHeight() && classificationPage.getRotation() != 0) {
if (textBlock.getPdfMinY() < expansionsRectangle.minX) {
expansionsRectangle.minX = textBlock.getPdfMinY();
}
if (textBlock.getPdfMaxY() > expansionsRectangle.maxX) {
expansionsRectangle.maxX = textBlock.getPdfMaxY();
}
if (textBlock.getPdfMinX() < expansionsRectangle.minY) {
expansionsRectangle.minY = textBlock.getPdfMinX();
}
if (textBlock.getPdfMaxX() > expansionsRectangle.maxY) {
expansionsRectangle.maxY = textBlock.getPdfMaxX();
}
} else {
if (textBlock.getPdfMinX() < expansionsRectangle.minX) {
expansionsRectangle.minX = textBlock.getPdfMinX();
}
if (textBlock.getPdfMaxX() > expansionsRectangle.maxX) {
expansionsRectangle.maxX = textBlock.getPdfMaxX();
}
if (textBlock.getPdfMinY() < expansionsRectangle.minY) {
expansionsRectangle.minY = textBlock.getPdfMinY();
}
if (textBlock.getPdfMaxY() > expansionsRectangle.maxY) {
expansionsRectangle.maxY = textBlock.getPdfMaxY();
}
}
}
// Mutable accumulator for the frame extremes. The initial values are sentinels chosen so that
// any real coordinate replaces them on first expansion (pages are assumed smaller than 10000pt).
private class BodyTextFrameExpansionsRectangle {
float minX = 10000;
float maxX = -100;
float minY = 10000;
float maxY = -100;
}
}

View File

@ -0,0 +1,116 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.service;
import java.util.List;
import java.util.regex.Pattern;
import org.springframework.stereotype.Service;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.AbstractTextContainer;
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.PositionUtils;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@Service
@RequiredArgsConstructor
// Classifies every text block of a document (Header, Footer, Title, "H n" headlines, TextBlock
// variants or Other) based on the document-wide font statistics and the calculated body text frame.
public class ClassificationService {
private final BodyTextFrameService bodyTextFrameService;
// Calculates the portrait and landscape body text frames once for the whole document, then
// classifies each page against them.
public void classifyDocument(ClassificationDocument document) {
Rectangle bodyTextFrame = bodyTextFrameService.calculateBodyTextFrame(document.getPages(), document.getFontSizeCounter(), false);
Rectangle landscapeBodyTextFrame = bodyTextFrameService.calculateBodyTextFrame(document.getPages(), document.getFontSizeCounter(), true);
// Font sizes larger than the dominant one, used to rank headline levels ("H 1", "H 2", ...).
List<Float> headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular();
log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());
for (ClassificationPage classificationPage : document.getPages()) {
bodyTextFrameService.setBodyTextFrameAdjustedToPage(classificationPage, bodyTextFrame, landscapeBodyTextFrame);
classifyPage(classificationPage, document, headlineFontSizes);
}
}
// Classifies all plain text blocks of one page; table containers are left untouched here.
public void classifyPage(ClassificationPage classificationPage, ClassificationDocument document, List<Float> headlineFontSizes) {
for (AbstractTextContainer textBlock : classificationPage.getTextBlocks()) {
if (textBlock instanceof ClassificationTextBlock) {
classifyBlock((ClassificationTextBlock) textBlock, classificationPage, document, headlineFontSizes);
}
}
}
// Assigns exactly one classification to the block. The branch ORDER is significant: position
// relative to the body text frame wins over font-based rules, and the more specific font rules
// come before the generic TextBlock fallbacks.
public void classifyBlock(ClassificationTextBlock textBlock, ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes) {
var bodyTextFrame = page.getBodyTextFrame();
// Without a dominant font size no statistical comparison is possible -> "Other".
// (After this early return getMostPopular() is non-null, so the repeated null checks in the
// conditions below can never fire; they are kept for safety.)
if (document.getFontSizeCounter().getMostPopular() == null) {
textBlock.setClassification("Other");
return;
}
// Above the body frame and not larger than the dominant font -> running header.
if (PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) {
textBlock.setClassification("Header");
// Below the body frame and not larger than the dominant font -> running footer.
} else if (PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) {
textBlock.setClassification("Footer");
// On the first page a block that is much taller than the dominant word height and larger than
// the dominant font (or the only block on the page) is the document title — unless it is a
// bare number, in which case it deliberately stays unclassified.
} else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock,
document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks()
.size() == 1)) {
if (!Pattern.matches("[0-9]+", textBlock.toString())) {
textBlock.setClassification("Title");
}
// Short block in a larger font (bold, or clearly larger when the document has no bold at all),
// whose first text position is at least as big as the block's dominant font -> headline level
// "H i" ranked by position of its font size in headlineFontSizes.
} else if (textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter()
.getMostPopular() && PositionUtils.getApproxLineCount(textBlock) < 4.9 && (textBlock.getMostPopularWordStyle().equals("bold") || !document.getFontStyleCounter()
.getCountPerValue()
.containsKey("bold") && textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular() + 1) && textBlock.getSequences()
.get(0)
.getTextPositions()
.get(0)
.getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) {
for (int i = 1; i <= headlineFontSizes.size(); i++) {
if (textBlock.getMostPopularWordFontSize() == headlineFontSizes.get(i - 1)) {
textBlock.setClassification("H " + i);
document.setHeadlines(true);
}
}
// Short bold block inside the body frame (in a document that is not predominantly bold, and
// which is not a "Table "/"Figure " caption) -> lowest headline level.
} else if (!textBlock.getText().startsWith("Table ") && !textBlock.getText().startsWith("Figure ") && PositionUtils.isWithinBodyTextFrame(bodyTextFrame,
textBlock) && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter()
.getMostPopular()
.equals("bold") && PositionUtils.getApproxLineCount(textBlock) < 2.9 && textBlock.getSequences()
.get(0)
.getTextPositions()
.get(0)
.getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) {
textBlock.setClassification("H " + (headlineFontSizes.size() + 1));
document.setHeadlines(true);
// Bold block in the dominant font size -> emphasized body text.
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
.getMostPopular() && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold")) {
textBlock.setClassification("TextBlock Bold");
// Block matching the document's dominant font, style and size -> regular body text.
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFont()
.equals(document.getFontCounter().getMostPopular()) && textBlock.getMostPopularWordStyle()
.equals(document.getFontStyleCounter().getMostPopular()) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()) {
textBlock.setClassification("TextBlock");
// Short italic block in the dominant size -> emphasized body text.
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
.getMostPopular() && textBlock.getMostPopularWordStyle().equals("italic") && !document.getFontStyleCounter()
.getMostPopular()
.equals("italic") && PositionUtils.getApproxLineCount(textBlock) < 2.9) {
textBlock.setClassification("TextBlock Italic");
// Inside the body frame but not matching any rule above.
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)) {
textBlock.setClassification("TextBlock Unknown");
} else {
textBlock.setClassification("Other");
}
}
}

View File

@ -0,0 +1,134 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.service;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.adapter.ImageServiceResponseAdapter;
import com.knecon.fforesight.service.layoutparser.processor.classification.parsing.PDFLinesTextStripper;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.AbstractTextContainer;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.CvParsedTableCell;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@Service
@RequiredArgsConstructor
public class PdfParsingService {

    private final RulingCleaningService rulingCleaningService;
    private final TableExtractionService tableExtractionService;
    private final BlockificationService blockificationService;
    private final ImageServiceResponseAdapter imageServiceResponseAdapter;

    /**
     * Parses every page of the given PDF into a {@link ClassificationDocument}, wiring in the
     * CV-parsed table cells and classified images supplied per page.
     *
     * @param originDocument the PDF to parse; any document security is removed up front
     * @param pdfTableCells  CV-parsed table cells, keyed by 1-based page number
     * @param pdfImages      classified images, keyed by 1-based page number (may be null)
     * @return the populated classification document
     */
    public ClassificationDocument parseDocument(PDDocument originDocument, Map<Integer, List<CvParsedTableCell>> pdfTableCells, Map<Integer, List<ClassifiedImage>> pdfImages) {
        ClassificationDocument document = new ClassificationDocument();
        List<ClassificationPage> parsedPages = new ArrayList<>();
        originDocument.setAllSecurityToBeRemoved(true);
        int totalPages = originDocument.getNumberOfPages();
        for (int pageNumber = 1; pageNumber <= totalPages; pageNumber++) {
            parsePage(pdfImages, originDocument, pdfTableCells, document, parsedPages, pageNumber);
        }
        document.setPages(parsedPages);
        return document;
    }

    /**
     * Parses one page: strips text and rulings, cleans the rulings, groups the text into blocks
     * and tables, attaches images/OCR and collects the page statistics.
     */
    @SneakyThrows
    private void parsePage(Map<Integer, List<ClassifiedImage>> pdfImages,
                           PDDocument pdDocument,
                           Map<Integer, List<CvParsedTableCell>> pdfTableCells,
                           ClassificationDocument document,
                           List<ClassificationPage> classificationPages,
                           int pageNumber) {
        PDFLinesTextStripper stripper = new PDFLinesTextStripper();
        PDPage pdPage = pdDocument.getPage(pageNumber - 1);
        // The stripper must know the target page before getText() drives the extraction.
        stripper.setPageNumber(pageNumber);
        stripper.setStartPage(pageNumber);
        stripper.setEndPage(pageNumber);
        stripper.setPdpage(pdPage);
        stripper.getText(pdDocument);
        PDRectangle mediaBox = pdPage.getMediaBox();
        int rotation = pdPage.getRotation();
        // A page counts as landscape when its displayed orientation (media box + rotation) is wider than tall.
        boolean uprightLandscape = mediaBox.getWidth() > mediaBox.getHeight() && (rotation == 0 || rotation == 180);
        boolean rotatedLandscape = mediaBox.getHeight() > mediaBox.getWidth() && (rotation == 90 || rotation == 270);
        PDRectangle cropBox = pdPage.getCropBox();
        CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(pdfTableCells.get(pageNumber),
                stripper.getRulings(),
                stripper.getMinCharWidth(),
                stripper.getMaxCharHeight());
        ClassificationPage classificationPage = blockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
        classificationPage.setRotation(rotation);
        classificationPage.setLandscape(uprightLandscape || rotatedLandscape);
        classificationPage.setPageNumber(pageNumber);
        classificationPage.setPageWidth(cropBox.getWidth());
        classificationPage.setPageHeight(cropBox.getHeight());
        // OCR lookup must run before text blocks are moved into tables; otherwise the findOcr
        // algorithm would have to be adapted to search inside the tables as well.
        if (pdfImages != null && pdfImages.containsKey(pageNumber)) {
            classificationPage.setImages(pdfImages.get(pageNumber));
            imageServiceResponseAdapter.findOcr(classificationPage);
        }
        tableExtractionService.removeRedundantTableCells(cleanRulings, classificationPage);
        buildPageStatistics(classificationPage);
        increaseDocumentStatistics(classificationPage, document);
        classificationPages.add(classificationPage);
    }

    /** Folds the per-page frequency counters into the document-wide counters. */
    private void increaseDocumentStatistics(ClassificationPage classificationPage, ClassificationDocument document) {
        // Landscape pages are excluded from the document font-size statistics.
        if (!classificationPage.isLandscape()) {
            document.getFontSizeCounter().addAll(classificationPage.getFontSizeCounter().getCountPerValue());
        }
        document.getFontCounter().addAll(classificationPage.getFontCounter().getCountPerValue());
        document.getTextHeightCounter().addAll(classificationPage.getTextHeightCounter().getCountPerValue());
        document.getFontStyleCounter().addAll(classificationPage.getFontStyleCounter().getCountPerValue());
    }

    /** Collects the per-page font statistics from all top-level text blocks. */
    private void buildPageStatistics(ClassificationPage classificationPage) {
        // Blocks inside tables are skipped on purpose: tables are always added to the BodyTextFrame.
        for (AbstractTextContainer container : classificationPage.getTextBlocks()) {
            if (!(container instanceof ClassificationTextBlock textBlock)) {
                continue;
            }
            if (textBlock.getSequences() == null) {
                continue;
            }
            for (TextPositionSequence word : textBlock.getSequences()) {
                classificationPage.getTextHeightCounter().add(word.getTextHeight());
                classificationPage.getFontCounter().add(word.getFont());
                classificationPage.getFontSizeCounter().add(word.getFontSize());
                classificationPage.getFontStyleCounter().add(word.getFontStyle());
            }
        }
    }
}

View File

@ -0,0 +1,231 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.service;
import java.awt.geom.Line2D;
import java.awt.geom.Point2D;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.CvParsedTableCell;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.DoubleComparisons;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@Service
@RequiredArgsConstructor
public class RulingCleaningService {

    /**
     * Produces de-noised ruling lines for one page. Detected PDF rulings are first snapped onto a
     * common grid; for any orientation that has no PDF rulings at all, rulings are synthesized
     * from the CV-parsed table cells instead. Nearly-collinear segments are then collapsed.
     *
     * @param cvParsedTableCells table cells detected by computer vision, may be null
     * @param rulings            raw ruling lines extracted from the PDF content stream
     * @param minCharWidth       snap tolerance along x
     * @param maxCharHeight      snap tolerance along y
     * @return cleaned vertical and horizontal ruling lines
     */
    public CleanRulings getCleanRulings(List<CvParsedTableCell> cvParsedTableCells, List<Ruling> rulings, float minCharWidth, float maxCharHeight) {
        if (!rulings.isEmpty()) {
            snapPoints(rulings, minCharWidth, maxCharHeight);
        }
        List<Ruling> vrs = new ArrayList<>();
        for (Ruling vr : rulings) {
            if (vr.vertical()) {
                vrs.add(vr);
            }
        }
        // Fall back to the CV table-cell edges when the PDF itself contains no vertical rulings.
        if (vrs.isEmpty()) {
            vrs.addAll(extractVerticalRulings(cvParsedTableCells));
        }
        List<Ruling> verticalRulingLines = collapseOrientedRulings(vrs);
        List<Ruling> hrs = new ArrayList<>();
        for (Ruling hr : rulings) {
            if (hr.horizontal()) {
                hrs.add(hr);
            }
        }
        if (hrs.isEmpty()) {
            hrs.addAll(extractHorizontalRulings(cvParsedTableCells));
        }
        List<Ruling> horizontalRulingLines = collapseOrientedRulings(hrs);
        return CleanRulings.builder().vertical(verticalRulingLines).horizontal(horizontalRulingLines).build();
    }

    /**
     * Snaps the endpoints of the given lines onto a common grid: endpoints whose x (resp. y)
     * coordinates lie within the given threshold of each other are moved to their group average.
     * The lines are modified in place.
     *
     * @param rulings    lines whose endpoints are snapped in place; no-op when empty
     * @param xThreshold maximum x distance for two endpoints to be merged into one group
     * @param yThreshold maximum y distance for two endpoints to be merged into one group
     */
    public void snapPoints(List<? extends Line2D.Float> rulings, float xThreshold, float yThreshold) {
        if (rulings.isEmpty()) {
            return; // nothing to snap; also guards the points.get(0) seed below
        }
        // collect points and keep a Line -> p1,p2 map
        Map<Line2D.Float, Point2D[]> linesToPoints = new HashMap<>();
        List<Point2D> points = new ArrayList<>();
        for (Line2D.Float r : rulings) {
            Point2D p1 = r.getP1();
            Point2D p2 = r.getP2();
            linesToPoints.put(r, new Point2D[]{p1, p2});
            points.add(p1);
            points.add(p2);
        }
        snapAxis(points, xThreshold, true);
        snapAxis(points, yThreshold, false);
        // finally, write the (possibly moved) endpoints back into the lines
        for (Map.Entry<Line2D.Float, Point2D[]> ltp : linesToPoints.entrySet()) {
            Point2D[] p = ltp.getValue();
            ltp.getKey().setLine(p[0], p[1]);
        }
    }

    /**
     * Sorts the points along one axis (x when snapX is true, y otherwise), groups consecutive
     * points closer than the threshold to the group's first point, and moves each group member to
     * the group average on that axis. The other coordinate of each point is left untouched.
     */
    private void snapAxis(List<Point2D> points, float threshold, boolean snapX) {
        points.sort(Comparator.comparingDouble(p -> snapX ? p.getX() : p.getY()));
        List<List<Point2D>> groupedPoints = new ArrayList<>();
        groupedPoints.add(new ArrayList<>(Collections.singletonList(points.get(0))));
        // BUG FIX: the previous implementation iterated points.subList(1, points.size() - 1);
        // subList's toIndex is exclusive, so the LAST point in sort order was silently excluded
        // from grouping and never snapped. Iterate over ALL remaining points instead.
        for (Point2D p : points.subList(1, points.size())) {
            List<Point2D> lastGroup = groupedPoints.get(groupedPoints.size() - 1);
            double coordinate = snapX ? p.getX() : p.getY();
            double groupCoordinate = snapX ? lastGroup.get(0).getX() : lastGroup.get(0).getY();
            if (Math.abs(coordinate - groupCoordinate) < threshold) {
                lastGroup.add(p);
            } else {
                groupedPoints.add(new ArrayList<>(Collections.singletonList(p)));
            }
        }
        for (List<Point2D> group : groupedPoints) {
            float avgLoc = 0;
            for (Point2D p : group) {
                avgLoc += snapX ? p.getX() : p.getY();
            }
            avgLoc /= group.size();
            for (Point2D p : group) {
                if (snapX) {
                    p.setLocation(avgLoc, p.getY());
                } else {
                    p.setLocation(p.getX(), avgLoc);
                }
            }
        }
    }

    /** Synthesizes vertical rulings from the left and right edges of every CV-parsed table cell. */
    private Collection<? extends Ruling> extractVerticalRulings(List<CvParsedTableCell> cvParsedTableCells) {
        List<Ruling> vrs = new ArrayList<>();
        if (cvParsedTableCells != null) {
            for (CvParsedTableCell cvParsedTableCell : cvParsedTableCells) {
                Ruling leftLine = createRuling(cvParsedTableCell.getX0(), cvParsedTableCell.getX0(), cvParsedTableCell.getY0(), cvParsedTableCell.getY1());
                Ruling rightLine = createRuling(cvParsedTableCell.getX1(), cvParsedTableCell.getX1(), cvParsedTableCell.getY0(), cvParsedTableCell.getY1());
                vrs.add(leftLine);
                vrs.add(rightLine);
            }
        }
        return vrs;
    }

    /** Synthesizes horizontal rulings from the top and bottom edges of every CV-parsed table cell. */
    private Collection<? extends Ruling> extractHorizontalRulings(List<CvParsedTableCell> cvParsedTableCells) {
        List<Ruling> hrs = new ArrayList<>();
        if (cvParsedTableCells != null) {
            for (CvParsedTableCell cvParsedTableCell : cvParsedTableCells) {
                Ruling topLine = createRuling(cvParsedTableCell.getX0(), cvParsedTableCell.getX1(), cvParsedTableCell.getY1(), cvParsedTableCell.getY1());
                Ruling baseLine = createRuling(cvParsedTableCell.getX0(), cvParsedTableCell.getX1(), cvParsedTableCell.getY0(), cvParsedTableCell.getY0());
                hrs.add(topLine);
                hrs.add(baseLine);
            }
        }
        return hrs;
    }

    /** Creates a ruling with normalized coordinate order (x0 <= x1, y0 <= y1). */
    private Ruling createRuling(float tableCellX0, float tableCellX1, float tableCellY0, float tableCellY1) {
        float x0 = Math.min(tableCellX0, tableCellX1);
        float x1 = Math.max(tableCellX0, tableCellX1);
        float y0 = Math.min(tableCellY0, tableCellY1);
        float y1 = Math.max(tableCellY0, tableCellY1);
        return new Ruling(new Point2D.Float(x0, y0), new Point2D.Float(x1, y1));
    }

    private List<Ruling> collapseOrientedRulings(List<Ruling> lines) {
        int COLINEAR_OR_PARALLEL_PIXEL_EXPAND_AMOUNT = 1;
        return collapseOrientedRulings(lines, COLINEAR_OR_PARALLEL_PIXEL_EXPAND_AMOUNT);
    }

    /**
     * Merges collinear rulings that touch or come within expandAmount of each other into single
     * segments. Expects all lines to share one orientation (all horizontal or all vertical).
     */
    private List<Ruling> collapseOrientedRulings(List<Ruling> lines, int expandAmount) {
        ArrayList<Ruling> rv = new ArrayList<>();
        // Sort by perpendicular position first, then by start, so mergeable lines are adjacent.
        lines.sort((a, b) -> {
            final float diff = a.getPosition() - b.getPosition();
            return Float.compare(diff == 0 ? a.getStart() - b.getStart() : diff, 0f);
        });
        for (Ruling next_line : lines) {
            Ruling last = rv.isEmpty() ? null : rv.get(rv.size() - 1);
            // if current line colinear with next, and are "close enough": expand current line
            if (last != null && DoubleComparisons.feq(next_line.getPosition(), last.getPosition()) && last.nearlyIntersects(next_line, expandAmount)) {
                final float lastStart = last.getStart();
                final float lastEnd = last.getEnd();
                final boolean lastFlipped = lastStart > lastEnd;
                final boolean nextFlipped = next_line.getStart() > next_line.getEnd();
                boolean differentDirections = nextFlipped != lastFlipped;
                // Normalize next's endpoints to last's direction before taking the union.
                float nextS = differentDirections ? next_line.getEnd() : next_line.getStart();
                float nextE = differentDirections ? next_line.getStart() : next_line.getEnd();
                final float newStart = lastFlipped ? Math.max(nextS, lastStart) : Math.min(nextS, lastStart);
                final float newEnd = lastFlipped ? Math.min(nextE, lastEnd) : Math.max(nextE, lastEnd);
                last.setStartEnd(newStart, newEnd);
                assert !last.oblique();
            } else if (next_line.length() == 0) {
                continue; // zero-length rulings carry no information
            } else {
                rv.add(next_line);
            }
        }
        return rv;
    }
}

View File

@ -0,0 +1,303 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.service;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationFooter;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationSection;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationHeader;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.UnclassifiedText;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.AbstractTextContainer;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.TableCell;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.Table;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@Service
public class SectionsBuilderService {
public void buildSections(ClassificationDocument document) {
List<AbstractTextContainer> chunkWords = new ArrayList<>();
List<ClassificationSection> chunkBlockList = new ArrayList<>();
List<ClassificationHeader> headers = new ArrayList<>();
List<ClassificationFooter> footers = new ArrayList<>();
List<UnclassifiedText> unclassifiedTexts = new ArrayList<>();
AbstractTextContainer prev = null;
String lastHeadline = "";
Table previousTable = null;
for (ClassificationPage classificationPage : document.getPages()) {
List<ClassificationTextBlock> header = new ArrayList<>();
List<ClassificationTextBlock> footer = new ArrayList<>();
List<ClassificationTextBlock> unclassifiedText = new ArrayList<>();
for (AbstractTextContainer current : classificationPage.getTextBlocks()) {
if (current.getClassification() == null) {
continue;
}
current.setPage(classificationPage.getPageNumber());
if (current.getClassification().equals("Header")) {
header.add((ClassificationTextBlock) current);
continue;
}
if (current.getClassification().equals("Footer")) {
footer.add((ClassificationTextBlock) current);
continue;
}
if (current.getClassification().equals("Other")) {
unclassifiedText.add((ClassificationTextBlock) current);
continue;
}
if (prev != null && current.getClassification().startsWith("H ") && !prev.getClassification().startsWith("H ") || !document.isHeadlines()) {
ClassificationSection chunkBlock = buildTextBlock(chunkWords, lastHeadline);
chunkBlock.setHeadline(lastHeadline);
if (document.isHeadlines()) {
lastHeadline = current.getText();
}
chunkBlockList.add(chunkBlock);
chunkWords = new ArrayList<>();
if (!chunkBlock.getTables().isEmpty()) {
previousTable = chunkBlock.getTables().get(chunkBlock.getTables().size() - 1);
}
}
if (current instanceof Table table) {
// Distribute header information for subsequent tables
mergeTableMetadata(table, previousTable);
previousTable = table;
}
chunkWords.add(current);
prev = current;
}
if (!header.isEmpty()) {
headers.add(new ClassificationHeader(header));
}
if (!footer.isEmpty()) {
footers.add(new ClassificationFooter(footer));
}
if (!unclassifiedText.isEmpty()) {
unclassifiedTexts.add(new UnclassifiedText(unclassifiedText));
}
}
ClassificationSection chunkBlock = buildTextBlock(chunkWords, lastHeadline);
chunkBlock.setHeadline(lastHeadline);
chunkBlockList.add(chunkBlock);
document.setSections(chunkBlockList);
document.setHeaders(headers);
document.setFooters(footers);
document.setUnclassifiedTexts(unclassifiedTexts);
addImagesToSections(document);
}
private void addImagesToSections(ClassificationDocument document) {
Map<Integer, List<ClassificationSection>> sectionMap = new HashMap<>();
for (ClassificationSection section : document.getSections()) {
for (AbstractTextContainer container : section.getPageBlocks()) {
List<ClassificationSection> sectionsOnPage = sectionMap.computeIfAbsent(container.getPage(), c -> new ArrayList<>());
if (sectionsOnPage.contains(section)) {
continue;
}
sectionsOnPage.add(section);
}
}
if (sectionMap.isEmpty()) {
ClassificationSection section = new ClassificationSection();
document.getSections().add(section);
sectionMap.computeIfAbsent(1, x -> new ArrayList<>()).add(section);
}
// first page is always a paragraph, else we can't process pages 1..N,
// where N is the first found page with a paragraph
if (sectionMap.get(1) == null) {
ClassificationSection section = new ClassificationSection();
document.getSections().add(section);
sectionMap.computeIfAbsent(1, x -> new ArrayList<>()).add(section);
}
for (ClassificationPage classificationPage : document.getPages()) {
for (ClassifiedImage image : classificationPage.getImages()) {
List<ClassificationSection> sectionsOnPage = sectionMap.get(classificationPage.getPageNumber());
if (sectionsOnPage == null) {
int i = classificationPage.getPageNumber();
while (sectionsOnPage == null) {
sectionsOnPage = sectionMap.get(i);
i--;
}
}
for (ClassificationSection section : sectionsOnPage) {
Float xMin = null;
Float yMin = null;
Float xMax = null;
Float yMax = null;
for (AbstractTextContainer abs : section.getPageBlocks()) {
if (abs.getPage() != classificationPage.getPageNumber()) {
continue;
}
if (abs.getMinX() < abs.getMaxX()) {
if (xMin == null || abs.getMinX() < xMin) {
xMin = abs.getMinX();
}
if (xMax == null || abs.getMaxX() > xMax) {
xMax = abs.getMaxX();
}
} else {
if (xMin == null || abs.getMaxX() < xMin) {
xMin = abs.getMaxX();
}
if (xMax == null || abs.getMinX() > xMax) {
xMax = abs.getMinX();
}
}
if (abs.getMinY() < abs.getMaxY()) {
if (yMin == null || abs.getMinY() < yMin) {
yMin = abs.getMinY();
}
if (yMax == null || abs.getMaxY() > yMax) {
yMax = abs.getMaxY();
}
} else {
if (yMin == null || abs.getMaxY() < yMin) {
yMin = abs.getMaxY();
}
if (yMax == null || abs.getMinY() > yMax) {
yMax = abs.getMinY();
}
}
}
log.debug("Image position x: {}, y: {}", image.getPosition().getX(), image.getPosition().getY());
log.debug("Paragraph position xMin: {}, xMax: {}, yMin: {}, yMax: {}", xMin, xMax, yMin, yMax);
if (xMin != null && xMax != null && yMin != null && yMax != null && image.getPosition().getX() >= xMin && image.getPosition()
.getX() <= xMax && image.getPosition().getY() >= yMin && image.getPosition().getY() <= yMax) {
section.getImages().add(image);
image.setAppendedToSection(true);
}
}
if (!image.isAppendedToSection()) {
log.debug("Image uses first paragraph");
sectionsOnPage.get(0).getImages().add(image);
image.setAppendedToSection(true);
}
}
}
}
private void mergeTableMetadata(Table currentTable, Table previousTable) {
// Distribute header information for subsequent tables
if (previousTable != null && hasInvalidHeaderInformation(currentTable) && hasValidHeaderInformation(previousTable)) {
List<TableCell> previousTableNonHeaderRow = getRowWithNonHeaderCells(previousTable);
List<TableCell> tableNonHeaderRow = getRowWithNonHeaderCells(currentTable);
// Allow merging of tables if header row is separated from first logical non-header row
if (previousTableNonHeaderRow.isEmpty() && previousTable.getRowCount() == 1 && previousTable.getRows().get(0).size() == tableNonHeaderRow.size()) {
previousTableNonHeaderRow = previousTable.getRows().get(0).stream().map(cell -> {
TableCell fakeCell = new TableCell(cell.getPoints()[0], cell.getPoints()[2]);
fakeCell.setHeaderCells(Collections.singletonList(cell));
return fakeCell;
}).collect(Collectors.toList());
}
if (previousTableNonHeaderRow.size() == tableNonHeaderRow.size()) {
for (int i = currentTable.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
List<TableCell> row = currentTable.getRows().get(i);
if (row.size() == tableNonHeaderRow.size() && row.stream().allMatch(cell -> cell.getHeaderCells().isEmpty())) {
for (int j = 0; j < row.size(); j++) {
row.get(j).setHeaderCells(previousTableNonHeaderRow.get(j).getHeaderCells());
}
}
}
}
}
}
/**
 * Wraps the given word blocks into a fresh {@link ClassificationSection}. Tables get a headline
 * derived from the last seen document headline; plain text blocks are added unchanged.
 */
private ClassificationSection buildTextBlock(List<AbstractTextContainer> wordBlockList, String lastHeadline) {
    ClassificationSection result = new ClassificationSection();
    boolean headlineAvailable = lastHeadline != null && !lastHeadline.isEmpty();
    for (AbstractTextContainer block : wordBlockList) {
        if (block instanceof Table table) {
            table.setHeadline(headlineAvailable ? "Table in: " + lastHeadline : "Text in table");
            result.getPageBlocks().add(table);
        } else {
            result.getPageBlocks().add((ClassificationTextBlock) block);
        }
    }
    return result;
}
/** True if at least one cell anywhere in the table carries header-cell information. */
private boolean hasValidHeaderInformation(Table table) {
    return !hasInvalidHeaderInformation(table);
}
/** True if no cell in the whole table references any header cells. */
private boolean hasInvalidHeaderInformation(Table table) {
    return table.getRows().stream()
            .flatMap(List::stream)
            .noneMatch(cell -> !cell.getHeaderCells().isEmpty());
}
/**
 * Returns the bottom-most multi-cell row that consists purely of non-header cells,
 * or an empty list if the table has no such row.
 */
private List<TableCell> getRowWithNonHeaderCells(Table table) {
    // Scan bottom-up: non-header rows are most likely at the bottom of a table.
    for (int rowIndex = table.getRowCount() - 1; rowIndex >= 0; rowIndex--) {
        List<TableCell> candidate = table.getRows().get(rowIndex);
        // Single-cell rows are skipped (usually captions or spanning cells).
        if (candidate.size() != 1 && candidate.stream().noneMatch(TableCell::isHeaderCell)) {
            return candidate;
        }
    }
    return Collections.emptyList();
}
}

View File

@ -0,0 +1,338 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.service;
import java.awt.geom.Point2D;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.TableCell;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.QuickSort;
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.DoubleComparisons;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.AbstractTextContainer;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.Rectangle;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.Table;
@Service
public class TableExtractionService {

    /**
     * Orders intersection points by x first, then y. Coordinates are rounded to two decimal
     * places before comparing so that float noise does not affect the ordering.
     */
    private static final Comparator<Point2D> X_FIRST_POINT_COMPARATOR = (arg0, arg1) -> {
        int rv = 0;
        float arg0X = DoubleComparisons.round(arg0.getX(), 2);
        float arg0Y = DoubleComparisons.round(arg0.getY(), 2);
        float arg1X = DoubleComparisons.round(arg1.getX(), 2);
        float arg1Y = DoubleComparisons.round(arg1.getY(), 2);
        if (arg0X > arg1X) {
            rv = 1;
        } else if (arg0X < arg1X) {
            rv = -1;
        } else if (arg0Y > arg1Y) {
            rv = 1;
        } else if (arg0Y < arg1Y) {
            rv = -1;
        }
        return rv;
    };

    /** Orders intersection points by y first, then x (same two-decimal rounding as above). */
    private static final Comparator<Point2D> POINT_COMPARATOR = (arg0, arg1) -> {
        int rv = 0;
        float arg0X = DoubleComparisons.round(arg0.getX(), 2);
        float arg0Y = DoubleComparisons.round(arg0.getY(), 2);
        float arg1X = DoubleComparisons.round(arg1.getX(), 2);
        float arg1Y = DoubleComparisons.round(arg1.getY(), 2);
        if (arg0Y > arg1Y) {
            rv = 1;
        } else if (arg0Y < arg1Y) {
            rv = -1;
        } else if (arg0X > arg1X) {
            rv = 1;
        } else if (arg0X < arg1X) {
            rv = -1;
        }
        return rv;
    };

    /**
     * Finds tables on a classificationPage and moves textblocks into cells of the found tables.
     * Note: This algorithm uses the PDF coordinate system where {0,0} is rotated with the
     * classificationPage rotation:
     * 0 -> LowerLeft
     * 90 -> UpperLeft
     * 180 -> UpperRight
     * 270 -> LowerRight
     *
     * DirAdj (Text direction adjusted) values can not be used here.
     *
     * @param cleanRulings The lines used to build the table.
     * @param classificationPage ClassificationPage object that contains textblocks and statistics.
     */
    public void removeRedundantTableCells(CleanRulings cleanRulings, ClassificationPage classificationPage) {
        List<TableCell> cells = findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical());
        // Move every text block that intersects a cell into that cell; remember it so it can be
        // removed from the page's top-level block list afterwards.
        List<ClassificationTextBlock> toBeRemoved = new ArrayList<>();
        for (AbstractTextContainer abstractTextContainer : classificationPage.getTextBlocks()) {
            ClassificationTextBlock textBlock = (ClassificationTextBlock) abstractTextContainer;
            for (TableCell cell : cells) {
                if (cell.intersects(textBlock.getPdfMinX(),
                        textBlock.getPdfMinY(),
                        textBlock.getPdfMaxX() - textBlock.getPdfMinX(),
                        textBlock.getPdfMaxY() - textBlock.getPdfMinY())) {
                    cell.addTextBlock(textBlock);
                    toBeRemoved.add(textBlock);
                    break;
                }
            }
        }
        // Deduplicate cells, then bring them into a stable order before grouping.
        cells = new ArrayList<>(new HashSet<>(cells));
        QuickSort.sort(cells, Rectangle.ILL_DEFINED_ORDER);
        // Degenerate (zero-width/height) spreadsheet areas are dropped.
        List<Rectangle> spreadsheetAreas = findSpreadsheetsFromCells(cells).stream().filter(r -> r.getWidth() > 0f && r.getHeight() > 0f).collect(Collectors.toList());
        List<Table> tables = new ArrayList<>();
        for (Rectangle area : spreadsheetAreas) {
            List<TableCell> overlappingCells = new ArrayList<>();
            for (TableCell c : cells) {
                if (c.intersects(area)) {
                    overlappingCells.add(c);
                }
            }
            tables.add(new Table(overlappingCells, area, classificationPage.getRotation()));
        }
        // Insert each table at the position of the FIRST block it contains so reading order is kept.
        for (Table table : tables) {
            List<AbstractTextContainer> pageBlocks = classificationPage.getTextBlocks();
            int position = -1;
            for (int i = 0; i < pageBlocks.size() && position == -1; i++) {
                AbstractTextContainer textBlock = pageBlocks.get(i);
                // BUGFIX (operator precedence): the previous ternary parsed as
                // "instanceof ? containsBlock(...) : (contains(...) && position == -1)", so the
                // "first match only" guard was skipped for ClassificationTextBlocks and the table
                // ended up inserted at the LAST matching block instead of the first.
                boolean containedInTable = textBlock instanceof ClassificationTextBlock classificationTextBlock
                        ? table.containsBlock(classificationTextBlock)
                        : table.contains(textBlock);
                if (containedInTable) {
                    position = i;
                }
            }
            if (position != -1) {
                pageBlocks.add(position, table);
            }
        }
        classificationPage.getTextBlocks().removeAll(toBeRemoved);
    }

    /**
     * Builds rectangular cells from the intersections of horizontal and vertical ruling lines.
     * Each intersection is treated as a potential top-left corner; the nearest intersections
     * directly below and to the right that lie on the same rulings form a candidate rectangle,
     * which is accepted when its bottom-right corner is also a known intersection of those rulings.
     *
     * @param horizontalRulingLines horizontal table lines
     * @param verticalRulingLines vertical table lines
     * @return all cells that could be formed from the ruling intersections
     */
    public List<TableCell> findCells(List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
        List<TableCell> cellsFound = new ArrayList<>();
        // Maps an intersection point to the [horizontal, vertical] rulings crossing there.
        Map<Point2D, Ruling[]> intersectionPoints = Ruling.findIntersections(horizontalRulingLines, verticalRulingLines);
        List<Point2D> intersectionPointsList = new ArrayList<>(intersectionPoints.keySet());
        intersectionPointsList.sort(POINT_COMPARATOR);
        for (int i = 0; i < intersectionPointsList.size(); i++) {
            Point2D topLeft = intersectionPointsList.get(i);
            Ruling[] hv = intersectionPoints.get(topLeft);
            // Crossing points directly below topLeft (same x, larger y). Exact float equality is
            // acceptable here because the intersection points come from the same ruling endpoints.
            List<Point2D> xPoints = new ArrayList<>();
            // Crossing points directly to the right of topLeft (same y, larger x).
            List<Point2D> yPoints = new ArrayList<>();
            for (Point2D p : intersectionPointsList.subList(i, intersectionPointsList.size())) {
                if (p.getX() == topLeft.getX() && p.getY() > topLeft.getY()) {
                    xPoints.add(p);
                }
                if (p.getY() == topLeft.getY() && p.getX() > topLeft.getX()) {
                    yPoints.add(p);
                }
            }
            outer:
            for (Point2D xPoint : xPoints) {
                // is there a vertical edge b/w topLeft and xPoint?
                if (!hv[1].equals(intersectionPoints.get(xPoint)[1])) {
                    continue;
                }
                for (Point2D yPoint : yPoints) {
                    // is there an horizontal edge b/w topLeft and yPoint ?
                    if (!hv[0].equals(intersectionPoints.get(yPoint)[0])) {
                        continue;
                    }
                    Point2D btmRight = new Point2D.Float((float) yPoint.getX(), (float) xPoint.getY());
                    // Accept the smallest rectangle whose bottom-right corner closes the cell.
                    if (intersectionPoints.containsKey(btmRight) && intersectionPoints.get(btmRight)[0].equals(intersectionPoints.get(xPoint)[0]) && intersectionPoints.get(btmRight)[1].equals(
                            intersectionPoints.get(yPoint)[1])) {
                        cellsFound.add(new TableCell(topLeft, btmRight));
                        break outer;
                    }
                }
            }
        }
        // TODO create cells for vertical ruling lines with aligned endpoints at the top/bottom of a grid
        // that aren't connected with an horizontal ruler?
        // see: https://github.com/jazzido/tabula-extractor/issues/78#issuecomment-41481207
        return cellsFound;
    }

    /**
     * Merges adjacent cells into spreadsheet areas and returns the grid-aligned bounding
     * rectangle of each resulting polygon.
     * via: http://stackoverflow.com/questions/13746284/merging-multiple-adjacent-rectangles-into-one-polygon
     */
    private List<Rectangle> findSpreadsheetsFromCells(List<? extends Rectangle> cells) {
        List<Rectangle> rectangles = new ArrayList<>();
        Set<Point2D> pointSet = new HashSet<>();
        Map<Point2D, Point2D> edgesH = new HashMap<>();
        Map<Point2D, Point2D> edgesV = new HashMap<>();
        int i = 0;
        // Keep only the polygon's outline vertices: corners shared by two cells cancel out.
        for (Rectangle cell : cells) {
            for (Point2D pt : cell.getPoints()) {
                if (pointSet.contains(pt)) { // shared vertex, remove it
                    pointSet.remove(pt);
                } else {
                    pointSet.add(pt);
                }
            }
        }
        // X first sort
        List<Point2D> pointsSortX = new ArrayList<>(pointSet);
        pointsSortX.sort(X_FIRST_POINT_COMPARATOR);
        // Y first sort
        List<Point2D> pointsSortY = new ArrayList<>(pointSet);
        pointsSortY.sort(POINT_COMPARATOR);
        // Pair consecutive points on each horizontal scan line into horizontal edges.
        // NOTE(review): assumes an even number of outline points per scan line (true for
        // rectilinear cell outlines); an odd count would read past the end of the list.
        while (i < pointSet.size()) {
            float currY = (float) pointsSortY.get(i).getY();
            while (i < pointSet.size() && DoubleComparisons.feq(pointsSortY.get(i).getY(), currY)) {
                edgesH.put(pointsSortY.get(i), pointsSortY.get(i + 1));
                edgesH.put(pointsSortY.get(i + 1), pointsSortY.get(i));
                i += 2;
            }
        }
        i = 0;
        // Same pairing for vertical edges, column by column.
        while (i < pointSet.size()) {
            float currX = (float) pointsSortX.get(i).getX();
            while (i < pointSet.size() && DoubleComparisons.feq(pointsSortX.get(i).getX(), currX)) {
                edgesV.put(pointsSortX.get(i), pointsSortX.get(i + 1));
                edgesV.put(pointsSortX.get(i + 1), pointsSortX.get(i));
                i += 2;
            }
        }
        // Get all the polygons by alternating horizontal and vertical edges until the loop closes.
        List<List<PolygonVertex>> polygons = new ArrayList<>();
        Point2D nextVertex;
        while (!edgesH.isEmpty()) {
            ArrayList<PolygonVertex> polygon = new ArrayList<>();
            Point2D first = edgesH.keySet().iterator().next();
            polygon.add(new PolygonVertex(first, Direction.HORIZONTAL));
            edgesH.remove(first);
            while (true) {
                PolygonVertex curr = polygon.get(polygon.size() - 1);
                PolygonVertex lastAddedVertex;
                if (curr.direction == Direction.HORIZONTAL) {
                    nextVertex = edgesV.get(curr.point);
                    edgesV.remove(curr.point);
                    lastAddedVertex = new PolygonVertex(nextVertex, Direction.VERTICAL);
                } else {
                    nextVertex = edgesH.get(curr.point);
                    edgesH.remove(curr.point);
                    lastAddedVertex = new PolygonVertex(nextVertex, Direction.HORIZONTAL);
                }
                polygon.add(lastAddedVertex);
                if (lastAddedVertex.equals(polygon.get(0))) {
                    // closed polygon
                    polygon.remove(polygon.size() - 1);
                    break;
                }
            }
            for (PolygonVertex vertex : polygon) {
                edgesH.remove(vertex.point);
                edgesV.remove(vertex.point);
            }
            polygons.add(polygon);
        }
        // calculate grid-aligned minimum area rectangles for each found polygon
        for (List<PolygonVertex> poly : polygons) {
            float top = Float.MAX_VALUE;
            float left = Float.MAX_VALUE;
            float bottom = Float.MIN_VALUE;
            float right = Float.MIN_VALUE;
            for (PolygonVertex pt : poly) {
                top = (float) Math.min(top, pt.point.getY());
                left = (float) Math.min(left, pt.point.getX());
                bottom = (float) Math.max(bottom, pt.point.getY());
                right = (float) Math.max(right, pt.point.getX());
            }
            rectangles.add(new Rectangle(top, left, right - left, bottom - top));
        }
        return rectangles;
    }

    /** Direction of the polygon edge leaving a vertex during the outline walk. */
    private enum Direction {
        HORIZONTAL,
        VERTICAL
    }

    /**
     * A polygon corner plus the direction of the edge to follow next. Equality and hash code are
     * based on the point only, so a closing vertex matches the polygon's starting vertex.
     */
    static class PolygonVertex {

        Point2D point;
        Direction direction;

        PolygonVertex(Point2D point, Direction direction) {
            this.direction = direction;
            this.point = point;
        }

        @Override
        public boolean equals(Object other) {
            if (this == other) {
                return true;
            }
            if (!(other instanceof PolygonVertex otherVertex)) {
                return false;
            }
            return this.point.equals(otherVertex.point);
        }

        @Override
        public int hashCode() {
            return this.point.hashCode();
        }

        @Override
        public String toString() {
            return String.format("%s[point=%s,direction=%s]", this.getClass().getName(), this.point.toString(), this.direction.toString());
        }
    }
}

View File

@ -0,0 +1,142 @@
/*
* CohenSutherland.java
* --------------------
* (c) 2007 by Intevation GmbH
*
* @author Sascha L. Teichmann (teichmann@intevation.de)
* @author Ludwig Reiter (ludwig@intevation.de)
*
* This program is free software under the LGPL (>=v2.1)
* Read the file LICENSE.txt coming with the sources for details.
*/
package com.knecon.fforesight.service.layoutparser.processor.classification.utils;
import java.awt.geom.Line2D;
import java.awt.geom.Rectangle2D;
/**
 * Implements the well known Cohen Sutherland line
 * clipping algorithm (line against clip rectangle).
 */
@SuppressWarnings("all")
public final class CohenSutherlandClipping {

    // Region codes: a point is classified by OR-ing the flags of the clip edges it lies outside of.
    private static final int INSIDE = 0;
    private static final int LEFT = 1;
    private static final int RIGHT = 2;
    private static final int BOTTOM = 4;
    private static final int TOP = 8;

    // Bounds of the current clip rectangle.
    private double xMin;
    private double yMin;
    private double xMax;
    private double yMax;

    /**
     * Creates a Cohen Sutherland clipper with clip rect (0, 0, 0, 0).
     */
    public CohenSutherlandClipping() {
    }

    /**
     * Creates a Cohen Sutherland clipper with the given clip rectangle.
     *
     * @param clip the clip rectangle to use
     */
    public CohenSutherlandClipping(Rectangle2D clip) {
        setClip(clip);
    }

    /**
     * Sets the clip rectangle.
     *
     * @param clip the clip rectangle
     */
    public void setClip(Rectangle2D clip) {
        xMin = clip.getX();
        xMax = xMin + clip.getWidth();
        yMin = clip.getY();
        yMax = yMin + clip.getHeight();
    }

    // Computes the outcode of (x, y): INSIDE, or a combination of LEFT/RIGHT and BOTTOM/TOP flags.
    private final int regionCode(double x, double y) {
        int code = x < xMin ? LEFT : x > xMax ? RIGHT : INSIDE;
        if (y < yMin) {
            code |= BOTTOM;
        } else if (y > yMax) {
            code |= TOP;
        }
        return code;
    }

    /**
     * Clips a given line against the clip rectangle.
     * The modification (if needed) is done in place.
     *
     * @param line the line to clip
     * @return true if line is clipped, false if line is
     * totally outside the clip rect.
     */
    public boolean clip(Line2D.Float line) {
        double p1x = line.getX1();
        double p1y = line.getY1();
        double p2x = line.getX2();
        double p2y = line.getY2();
        double qx = 0d;
        double qy = 0d;
        boolean vertical = p1x == p2x;
        // The slope is only meaningful for non-vertical lines; divisions below are guarded by
        // the 'vertical' flag, and feq() avoids dividing tiny deltas.
        double slope = vertical ? 0d : (p2y - p1y) / (p2x - p1x);
        int c1 = regionCode(p1x, p1y);
        int c2 = regionCode(p2x, p2y);
        while (c1 != INSIDE || c2 != INSIDE) {
            // Both endpoints share an outside region -> the line cannot cross the clip rect.
            if ((c1 & c2) != INSIDE) {
                return false;
            }
            // Pick an endpoint that is outside and move it onto the violated clip edge.
            int c = c1 == INSIDE ? c2 : c1;
            if ((c & LEFT) != INSIDE) {
                qx = xMin;
                qy = (DoubleComparisons.feq(qx, p1x) ? 0 : qx - p1x) * slope + p1y;
            } else if ((c & RIGHT) != INSIDE) {
                qx = xMax;
                qy = (DoubleComparisons.feq(qx, p1x) ? 0 : qx - p1x) * slope + p1y;
            } else if ((c & BOTTOM) != INSIDE) {
                qy = yMin;
                qx = vertical ? p1x : (DoubleComparisons.feq(qy, p1y) ? 0 : qy - p1y) / slope + p1x;
            } else if ((c & TOP) != INSIDE) {
                qy = yMax;
                qx = vertical ? p1x : (DoubleComparisons.feq(qy, p1y) ? 0 : qy - p1y) / slope + p1x;
            }
            // Replace the endpoint that was outside and re-classify it for the next iteration.
            if (c == c1) {
                p1x = qx;
                p1y = qy;
                c1 = regionCode(p1x, p1y);
            } else {
                p2x = qx;
                p2y = qy;
                c2 = regionCode(p2x, p2y);
            }
        }
        // Both endpoints are inside now; write the (possibly shortened) line back in place.
        line.setLine(p1x, p1y, p2x, p2y);
        return true;
    }
}
// end of file

View File

@ -0,0 +1,30 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.utils;
import java.math.BigDecimal;
import java.math.RoundingMode;
import lombok.experimental.UtilityClass;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@UtilityClass
@SuppressWarnings("all")
public final class DoubleComparisons {

    // Absolute tolerance used by feq(): values closer than this are considered equal.
    private final static float EPSILON = 0.1f;

    /**
     * Fuzzy equality: true when the two values differ by less than {@code EPSILON}.
     *
     * @param f1 first value
     * @param f2 second value
     * @return true if the absolute difference is below the tolerance
     */
    public static boolean feq(double f1, double f2) {
        return (Math.abs(f1 - f2) < EPSILON);
    }

    /**
     * Rounds {@code d} half-up to {@code decimalPlace} decimal places.
     * Uses {@link RoundingMode#HALF_UP} instead of the deprecated
     * {@code BigDecimal.ROUND_HALF_UP} int constant and {@code setScale(int, int)} overload.
     *
     * @param d value to round
     * @param decimalPlace number of decimal places to keep
     * @return the rounded value as a float
     */
    public static float round(double d, int decimalPlace) {
        BigDecimal bd = BigDecimal.valueOf(d);
        bd = bd.setScale(decimalPlace, RoundingMode.HALF_UP);
        return bd.floatValue();
    }
}

View File

@ -0,0 +1,119 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.utils;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock;
import lombok.experimental.UtilityClass;
@UtilityClass
@SuppressWarnings("all")
public final class PositionUtils {

    // TODO This currently uses the pdf coord system. In the future this should use the java coord system.
    // Note: DirAdj (TextDirection Adjusted) values can not be used for this.

    /**
     * True when the text block lies inside the body-text frame, allowing a tolerance of three
     * times the block's most popular word height on every edge.
     */
    public boolean isWithinBodyTextFrame(Rectangle btf, ClassificationTextBlock textBlock) {
        if (btf == null || textBlock == null) {
            return false;
        }
        double threshold = textBlock.getMostPopularWordHeight() * 3;
        return textBlock.getPdfMinX() + threshold > btf.getTopLeft().getX()
                && textBlock.getPdfMaxX() - threshold < btf.getTopLeft().getX() + btf.getWidth()
                && textBlock.getPdfMinY() + threshold > btf.getTopLeft().getY()
                && textBlock.getPdfMaxY() - threshold < btf.getTopLeft().getY() + btf.getHeight();
    }

    /**
     * True when the text block lies entirely on the "above" side of the body-text frame for the
     * given page rotation (0/90/180/270 degrees). Any other rotation yields false.
     */
    public boolean isOverBodyTextFrame(Rectangle btf, ClassificationTextBlock textBlock, int rotation) {
        if (btf == null || textBlock == null) {
            return false;
        }
        return switch (rotation) {
            case 90 -> textBlock.getPdfMaxX() < btf.getTopLeft().getX();
            case 180 -> textBlock.getPdfMaxY() < btf.getTopLeft().getY();
            case 270 -> textBlock.getPdfMinX() > btf.getTopLeft().getX() + btf.getWidth();
            case 0 -> textBlock.getPdfMinY() > btf.getTopLeft().getY() + btf.getHeight();
            default -> false;
        };
    }

    /**
     * True when the text block lies entirely on the "below" side of the body-text frame for the
     * given page rotation (0/90/180/270 degrees). Any other rotation yields false.
     */
    public boolean isUnderBodyTextFrame(Rectangle btf, ClassificationTextBlock textBlock, int rotation) {
        if (btf == null || textBlock == null) {
            return false;
        }
        return switch (rotation) {
            case 90 -> textBlock.getPdfMinX() > btf.getTopLeft().getX() + btf.getWidth();
            case 180 -> textBlock.getPdfMinY() > btf.getTopLeft().getY() + btf.getHeight();
            case 270 -> textBlock.getPdfMaxX() < btf.getTopLeft().getX();
            case 0 -> textBlock.getPdfMaxY() < btf.getTopLeft().getY();
            default -> false;
        };
    }

    /**
     * True when the text block's minimum y lies below the top-left y of the body-text frame.
     * TODO Currently this is not working for rotated pages.
     */
    public boolean isTouchingUnderBodyTextFrame(Rectangle btf, ClassificationTextBlock textBlock) {
        if (btf == null || textBlock == null) {
            return false;
        }
        return textBlock.getMinY() < btf.getTopLeft().getY();
    }

    /**
     * Difference between this block's most popular word height and the document-wide one.
     * NOTE(review): {@code documentMostPopularWordHeight} is auto-unboxed here; a null argument
     * throws a NullPointerException — confirm callers never pass null.
     */
    public float getHeightDifferenceBetweenChunkWordAndDocumentWord(ClassificationTextBlock textBlock, Float documentMostPopularWordHeight) {
        return textBlock.getMostPopularWordHeight() - documentMostPopularWordHeight;
    }

    /** Rough line count: block height divided by its most popular word height. */
    public Float getApproxLineCount(ClassificationTextBlock textBlock) {
        return textBlock.getHeight() / textBlock.getMostPopularWordHeight();
    }
}

View File

@ -0,0 +1,109 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.utils;
import java.util.ArrayDeque;
import java.util.Comparator;
import java.util.Deque;
import java.util.List;
import lombok.experimental.UtilityClass;
/**
* Copied and minimal modified from PDFBox.
*/
@UtilityClass
public final class QuickSort {

    /** Fallback comparator that simply delegates to the elements' natural ordering. */
    private static final Comparator<? extends Comparable> OBJCOMP = new Comparator<Comparable>() {
        @Override
        public int compare(Comparable first, Comparable second) {
            return first.compareTo(second);
        }
    };

    /**
     * Sorts the given list in place using the given comparator (iterative quicksort).
     *
     * @param <T> type of the objects to be sorted.
     * @param list list to be sorted
     * @param cmp comparator used to compare the objects within the list
     */
    public static <T> void sort(List<T> list, Comparator<? super T> cmp) {
        if (list.size() < 2) {
            return; // empty and single-element lists are already sorted
        }
        quicksort(list, cmp);
    }

    /**
     * Sorts the given list in place using compareTo as comparator.
     *
     * @param <T> type of the objects to be sorted.
     * @param list list to be sorted
     */
    public static <T extends Comparable> void sort(List<T> list) {
        sort(list, (Comparator<T>) OBJCOMP);
    }

    /** Iterative quicksort driver: the explicit stack holds [from, to) range pairs. */
    private static <T> void quicksort(List<T> list, Comparator<? super T> cmp) {
        Deque<Integer> ranges = new ArrayDeque<>();
        ranges.push(0);
        ranges.push(list.size());
        while (!ranges.isEmpty()) {
            int to = ranges.pop();
            int from = ranges.pop();
            if (to - from < 2) {
                continue; // a range of fewer than two elements needs no work
            }
            int mid = from + ((to - from) / 2);
            mid = partition(list, cmp, mid, from, to);
            ranges.push(mid + 1);
            ranges.push(to);
            ranges.push(from);
            ranges.push(mid);
        }
    }

    /** Partitions [start, end) around the element at {@code pivotIndex}; returns its final index. */
    private static <T> int partition(List<T> list, Comparator<? super T> cmp, int pivotIndex, int start, int end) {
        int low = start;
        int high = end - 2;
        T pivot = list.get(pivotIndex);
        swap(list, pivotIndex, end - 1); // park the pivot at the end of the range
        while (low < high) {
            if (cmp.compare(list.get(low), pivot) <= 0) {
                low++;
            } else if (cmp.compare(pivot, list.get(high)) <= 0) {
                high--;
            } else {
                swap(list, low, high);
            }
        }
        int idx = high;
        if (cmp.compare(list.get(high), pivot) < 0) {
            idx++;
        }
        swap(list, end - 1, idx); // move the pivot into its final position
        return idx;
    }

    /** Exchanges the elements at positions {@code i} and {@code j}. */
    private static <T> void swap(List<T> list, int i, int j) {
        T tmp = list.get(i);
        list.set(i, list.get(j));
        list.set(j, tmp);
    }
}

View File

@ -0,0 +1,64 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.utils;
import java.awt.geom.Line2D;
import java.awt.geom.Point2D;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.Ruling;
import lombok.experimental.UtilityClass;
@UtilityClass
public final class RulingTextDirAdjustUtil {

    /**
     * Converts a ruling (line of a table) the same way TextPositions are converted in PDFBox:
     * the returned line's coordinates have their origin at the upper left and are adjusted for
     * the given text direction.
     *
     * See org.apache.pdfbox.text.TextPosition
     */
    public Line2D.Float convertToDirAdj(Ruling ruling, float dir, float pageWidth, float pageHeight) {
        Point2D start = convertPoint(ruling.x1, ruling.y1, dir, pageWidth, pageHeight);
        Point2D end = convertPoint(ruling.x2, ruling.y2, dir, pageWidth, pageHeight);
        return new Line2D.Float(start, end);
    }

    /** Rotates a single point for the text direction and flips y so that 0 is at the top. */
    private Point2D convertPoint(float x, float y, float dir, float pageWidth, float pageHeight) {
        float rotatedX = getXRot(x, y, dir, pageWidth, pageHeight);
        float lowerLeftY = getYLowerLeftRot(x, y, dir, pageWidth, pageHeight);
        // The vertical page extent depends on whether the text runs along width or height.
        float flippedY = (dir == 0 || dir == 180) ? pageHeight - lowerLeftY : pageWidth - lowerLeftY;
        return new Point2D.Float(rotatedX, flippedY);
    }

    /** X coordinate after rotating (x, y) by the text direction; 0 for unsupported directions. */
    @SuppressWarnings("SuspiciousNameCombination")
    private float getXRot(float x, float y, float dir, float pageWidth, float pageHeight) {
        if (dir == 90) {
            return y;
        }
        if (dir == 180) {
            return pageWidth - x;
        }
        if (dir == 270) {
            return pageHeight - y;
        }
        if (dir == 0) {
            return x;
        }
        return 0;
    }

    /** Lower-left-origin y after rotating (x, y) by the text direction; 0 for unsupported directions. */
    private float getYLowerLeftRot(float x, float y, float dir, float pageWidth, float pageHeight) {
        if (dir == 90) {
            return pageWidth - x;
        }
        if (dir == 180) {
            return pageHeight - y;
        }
        if (dir == 270) {
            return x;
        }
        if (dir == 0) {
            return y;
        }
        return 0;
    }
}

View File

@ -0,0 +1,19 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.utils;
import java.util.regex.Pattern;
import lombok.experimental.UtilityClass;
@UtilityClass
public final class TextNormalizationUtilities {

    // Matches a word fragment (2-500 chars that are not whitespace, digits or hyphens) followed
    // by a hyphen or soft hyphen (U+00AD) and a line break. Compiled once as a constant because
    // String.replaceAll would recompile the regex on every call.
    private static final Pattern LINE_BREAK_HYPHENATION = Pattern.compile("([^\\s\\d\\-]{2,500})[\\-\\u00AD]\\R");

    /**
     * Revert hyphenation due to line breaks.
     *
     * @param text Text to be processed.
     * @return Text without line-break hyphenation.
     */
    public static String removeHyphenLineBreaks(String text) {
        return LINE_BREAK_HYPHENATION.matcher(text).replaceAll("$1");
    }
}

View File

@ -0,0 +1,386 @@
package com.knecon.fforesight.service.layoutparser.processor.factory;
import static java.lang.String.format;
import static java.util.stream.Collectors.groupingBy;
import static java.util.stream.Collectors.toList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.DocumentGraph;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.FooterNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.HeaderNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.HeadlineNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.ImageNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.NodeType;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.ParagraphNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SectionNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SemanticNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.TableCellNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.TableNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.AtomicTextBlock;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.AbstractTextContainer;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationFooter;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationHeader;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.Table;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.TableCell;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.TextPositionSequence;
@Service
public class DocumentGraphFactory {
public static final double TABLE_CELL_MERGE_CONTENTS_SIZE_THRESHOLD = 0.05;
/**
 * Builds the {@link DocumentGraph} for a classified document: one page node per page, the
 * sections (with their paragraphs, headlines, tables and images), then headers/footers, while
 * assembling the table of contents along the way.
 */
public DocumentGraph buildDocumentGraph(ClassificationDocument document) {
    TextBlockFactory textBlockFactory = new TextBlockFactory();
    Context context = new Context(new TableOfContents(), new HashMap<>(), new LinkedList<>(), new LinkedList<>(), textBlockFactory);
    // Each page is registered with a counter starting at 1 — presumably a per-page sequence
    // number used elsewhere; TODO confirm against Context's users.
    document.getPages().stream().map(this::buildPage).forEach(page -> context.pages().put(page, new AtomicInteger(1)));
    // Collect all images from all sections up front so they can be looked up during the walk.
    document.getSections().stream().flatMap(section -> section.getImages().stream()).forEach(image -> context.images().add(image));
    addSections(document, context);
    addHeaderAndFooterToEachPage(document, context);
    DocumentGraph documentGraph = DocumentGraph.builder().numberOfPages(context.pages.size()).pages(context.pages.keySet()).tableOfContents(context.tableOfContents).build();
    // The graph's aggregated text block is derived from the assembled node structure.
    documentGraph.setTextBlock(documentGraph.buildTextBlock());
    return documentGraph;
}
/** Adds every top-level section of the document to the graph (no parent node). */
private void addSections(ClassificationDocument document, Context context) {
    for (var section : document.getSections()) {
        addSection(null, section.getPageBlocks(), section.getImages(), context);
    }
}
/**
 * Creates a SectionNode for the given page blocks, registers it on every page it spans, gives it
 * a table-of-contents entry (top-level when parentNode is null, child entry otherwise) and then
 * adds the contained text blocks, tables and images below it.
 */
private void addSection(SemanticNode parentNode, List<AbstractTextContainer> pageBlocks, List<ClassifiedImage> images, Context context) {
    Map<Integer, List<AbstractTextContainer>> blocksPerPage = pageBlocks.stream().collect(groupingBy(AbstractTextContainer::getPage));
    SectionNode sectionNode = SectionNode.builder().entities(new HashSet<>()).tableOfContents(context.tableOfContents()).build();
    context.sections().add(sectionNode);
    // The section appears on every page that holds at least one of its blocks.
    blocksPerPage.keySet().forEach(pageNumber -> addSectionNodeToPageNode(context, sectionNode, pageNumber));
    List<Integer> tocId;
    if (parentNode == null) {
        tocId = context.tableOfContents.createNewEntryAndReturnId(NodeType.SECTION, sectionNode);
    } else {
        tocId = context.tableOfContents.createNewChildEntryAndReturnId(parentNode.getTocId(), NodeType.SECTION, sectionNode);
    }
    sectionNode.setTocId(tocId);
    // Text blocks that y-overlap on the same page are merged into one paragraph/headline;
    // track the merged ones so each block is emitted only once.
    Set<AbstractTextContainer> alreadyMerged = new HashSet<>();
    for (AbstractTextContainer abstractTextContainer : pageBlocks) {
        if (alreadyMerged.contains(abstractTextContainer)) {
            continue;
        }
        if (abstractTextContainer instanceof ClassificationTextBlock) {
            List<ClassificationTextBlock> textBlocks = findTextBlocksWithSameClassificationAndAlignsY(abstractTextContainer, pageBlocks);
            alreadyMerged.addAll(textBlocks);
            addParagraphOrHeadline(sectionNode, (ClassificationTextBlock) abstractTextContainer, context, textBlocks);
        }
        if (abstractTextContainer instanceof Table) {
            addTable(sectionNode, (Table) abstractTextContainer, context);
        }
    }
    for (ClassifiedImage image : images) {
        addImage(sectionNode, image, context);
    }
}
/**
 * Returns all OTHER text blocks on the same page that overlap the given container vertically.
 * NOTE(review): despite the name, the blocks' classification is NOT compared here — only page
 * equality, type and y-overlap. Confirm whether a classification check is missing or the name
 * is stale.
 */
private static List<ClassificationTextBlock> findTextBlocksWithSameClassificationAndAlignsY(AbstractTextContainer atc, List<AbstractTextContainer> pageBlocks) {
    return pageBlocks.stream()
            .filter(abstractTextContainer -> !abstractTextContainer.equals(atc))
            .filter(abstractTextContainer -> abstractTextContainer.getPage() == atc.getPage())
            .filter(abstractTextContainer -> abstractTextContainer instanceof ClassificationTextBlock)
            .filter(abstractTextContainer -> abstractTextContainer.intersectsY(atc))
            .map(abstractTextContainer -> (ClassificationTextBlock) abstractTextContainer)
            .toList();
}
/** Registers the section node on the main body of the page with the given number. */
private void addSectionNodeToPageNode(Context context, SectionNode sectionNode, Integer pageNumber) {
    getPage(pageNumber, context).getMainBody().add(sectionNode);
}
/**
 * Adds a TableNode for the given table to its page and to the table of contents (as a child of
 * {@code parentNode}), then adds all of the table's cells.
 */
private void addTable(SemanticNode parentNode, Table table, Context context) {
    PageNode page = getPage(table.getPage(), context);
    TableNode tableNode = TableNode.builder().tableOfContents(context.tableOfContents()).numberOfCols(table.getColCount()).numberOfRows(table.getRowCount()).build();
    // If the parent node is not part of this page's main body yet, link the page to the parent
    // so the parent spans this page too.
    if (!page.getMainBody().contains(parentNode)) {
        parentNode.getPages().add(page);
    }
    page.getMainBody().add(tableNode);
    List<Integer> tocId = context.tableOfContents().createNewChildEntryAndReturnId(parentNode.getTocId(), NodeType.TABLE, tableNode);
    tableNode.setTocId(tocId);
    addTableCells(table.getRows(), tableNode, context, table.getPage());
}
/** Adds every cell of the given row/column grid below {@code parentNode}. */
private void addTableCells(List<List<TableCell>> rows, SemanticNode parentNode, Context context, int pageNumber) {
    for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) {
        List<TableCell> row = rows.get(rowIndex);
        for (int colIndex = 0; colIndex < row.size(); colIndex++) {
            addTableCell(row.get(colIndex), rowIndex, colIndex, parentNode, pageNumber, context);
        }
    }
}
/**
 * Adds a TableCellNode for a single cell. Depending on the cell's contents the node is either
 * terminal (one merged text block) or gets child nodes:
 * - no text blocks: terminal with an empty text block
 * - exactly one text block: terminal with that block's sequences
 * - first block is classified as a headline: a nested section is built from the cell's blocks
 * - cell smaller than the page-area threshold: all blocks merged into one terminal text block
 * - otherwise: one paragraph/headline child per text block
 */
private void addTableCell(TableCell cell, int rowIndex, int colIndex, SemanticNode parentNode, int pageNumber, Context context) {
    PageNode page = getPage(pageNumber, context);
    // Text blocks inside cells that never got a page assigned (page 0) inherit the table's page.
    cell.getTextBlocks().stream().filter(tb -> tb.getPage() == 0).forEach(tb -> tb.setPage(pageNumber));
    TableCellNode tableCellNode = TableCellNode.builder()
            .tableOfContents(context.tableOfContents())
            .row(rowIndex)
            .col(colIndex)
            .header(cell.isHeaderCell())
            .bBox(cell.getBounds2D())
            .build();
    page.getMainBody().add(tableCellNode);
    TextBlock textBlock;
    List<Integer> tocId = context.tableOfContents().createNewChildEntryAndReturnId(parentNode.getTocId(), NodeType.TABLE_CELL, tableCellNode);
    tableCellNode.setTocId(tocId);
    if (cell.getTextBlocks().isEmpty()) {
        // Empty cell: still terminal, backed by an empty text block.
        tableCellNode.setTerminalTextBlock(context.textBlockFactory.emptyTextBlock(parentNode, context, page));
        tableCellNode.setTerminal(true);
    } else if (cell.getTextBlocks().size() == 1) {
        textBlock = context.textBlockFactory().buildAtomicTextBlock(cell.getTextBlocks().get(0).getSequences(), tableCellNode, context, page);
        tableCellNode.setTerminalTextBlock(textBlock);
        tableCellNode.setTerminal(true);
    } else if (firstTextBlockIsHeadline(cell)) {
        // Headline-led cells become nested sections so their structure is preserved.
        addSection(tableCellNode, cell.getTextBlocks().stream().map(tb -> (AbstractTextContainer) tb).toList(), Collections.emptyList(), context);
        tableCellNode.setTerminal(false);
    } else if (cellAreaIsSmallerThanPageAreaTimesThreshold(cell, page)) {
        // Small cells: merge all text blocks into one, sorted top-to-bottom then left-to-right.
        List<TextPositionSequence> sequences = TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(cell.getTextBlocks());
        textBlock = context.textBlockFactory().buildAtomicTextBlock(sequences, tableCellNode, context, page);
        tableCellNode.setTerminalTextBlock(textBlock);
        tableCellNode.setTerminal(true);
    } else {
        // Large cells keep one paragraph/headline child per contained text block.
        cell.getTextBlocks().forEach(tb -> addParagraphOrHeadline(tableCellNode, tb, context));
        tableCellNode.setTerminal(false);
    }
}
/**
 * Decides whether a cell is small enough (relative to the page area) for its text blocks
 * to be merged into a single terminal text block.
 */
private static boolean cellAreaIsSmallerThanPageAreaTimesThreshold(TableCell cell, PageNode page) {
    // Threshold fraction of the page area; evaluation order kept left-to-right.
    double scaledPageArea = TABLE_CELL_MERGE_CONTENTS_SIZE_THRESHOLD * page.getHeight() * page.getWidth();
    return cell.getArea() < scaledPageArea;
}
/**
 * Returns true when the cell's first text block is classified as a headline.
 * Headline classifications are labelled "H1", "H2", ... — anything starting with "H".
 */
private static boolean firstTextBlockIsHeadline(TableCell cell) {
    String firstClassification = cell.getTextBlocks().get(0).getClassification();
    if (firstClassification == null) {
        return false;
    }
    return firstClassification.startsWith("H");
}
/**
 * Convenience overload: adds a paragraph/headline for a single text block with nothing
 * merged into it.
 */
private void addParagraphOrHeadline(SemanticNode parentNode, ClassificationTextBlock originalTextBlock, Context context) {
    addParagraphOrHeadline(parentNode, originalTextBlock, context, List.of());
}
/**
 * Adds a headline or paragraph node for {@code originalTextBlock}, merging any extra blocks
 * into a single terminal text block, and registers the node in the table of contents under
 * {@code parentNode}.
 *
 * @param parentNode        TOC parent for the new node
 * @param originalTextBlock block whose classification decides headline vs. paragraph
 * @param context           shared build state
 * @param textBlocksToMerge additional blocks merged into the same text block (may be empty)
 */
private void addParagraphOrHeadline(SemanticNode parentNode, ClassificationTextBlock originalTextBlock, Context context, List<ClassificationTextBlock> textBlocksToMerge) {
    PageNode page = getPage(originalTextBlock.getPage(), context);
    SemanticNode node;
    // Classifications starting with "H" (H1, H2, ...) mark headlines; everything else is a paragraph.
    if (originalTextBlock.getClassification() != null && originalTextBlock.getClassification().startsWith("H")) {
        node = HeadlineNode.builder().tableOfContents(context.tableOfContents()).build();
    } else {
        node = ParagraphNode.builder().tableOfContents(context.tableOfContents()).build();
    }
    page.getMainBody().add(node);
    // Merge the extra blocks plus the original into one atomic text block (sorted by y, then x).
    List<ClassificationTextBlock> textBlocks = new LinkedList<>(textBlocksToMerge);
    textBlocks.add(originalTextBlock);
    // Consistency: use the record accessors textBlockFactory()/tableOfContents() like the
    // rest of the class instead of direct field access.
    AtomicTextBlock textBlock = context.textBlockFactory().buildAtomicTextBlock(TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(textBlocks), node, context, page);
    if (node instanceof HeadlineNode headlineNode) {
        List<Integer> tocId = context.tableOfContents().createNewChildEntryAndReturnId(parentNode.getTocId(), NodeType.HEADLINE, node);
        headlineNode.setTerminalTextBlock(textBlock);
        headlineNode.setTocId(tocId);
    }
    if (node instanceof ParagraphNode paragraphNode) {
        List<Integer> tocId = context.tableOfContents().createNewChildEntryAndReturnId(parentNode.getTocId(), NodeType.PARAGRAPH, node);
        paragraphNode.setTerminalTextBlock(textBlock);
        paragraphNode.setTocId(tocId);
    }
}
/**
 * Creates an {@link ImageNode} for the classified image, places it in the page's main body
 * and registers it in the table of contents under the given section.
 */
private void addImage(SectionNode sectionNode, ClassifiedImage image, Context context) {
    PageNode page = getPage(image.getPage(), context);
    ImageNode imageNode = ImageNode.builder()
            .tableOfContents(context.tableOfContents())
            .page(page)
            .imageType(image.getImageType())
            .position(image.getPosition())
            .transparency(image.isHasTransparency())
            .build();
    page.getMainBody().add(imageNode);
    List<Integer> tocId = context.tableOfContents().createNewChildEntryAndReturnId(sectionNode.getTocId(), NodeType.IMAGE, imageNode);
    imageNode.setTocId(tocId);
}
/**
 * Ensures every page gets exactly one header and one footer node: pages with classified
 * header/footer text receive it, all others receive an empty placeholder.
 */
private void addHeaderAndFooterToEachPage(ClassificationDocument document, Context context) {
    // Header and footer text blocks, grouped by the page they appear on.
    Map<Integer, List<ClassificationTextBlock>> headersByPage = document.getHeaders()
            .stream()
            .flatMap(header -> header.getTextBlocks().stream())
            .collect(groupingBy(AbstractTextContainer::getPage, toList()));
    Map<Integer, List<ClassificationTextBlock>> footersByPage = document.getFooters()
            .stream()
            .flatMap(footer -> footer.getTextBlocks().stream())
            .collect(groupingBy(AbstractTextContainer::getPage, toList()));
    int pageCount = document.getPages().size();
    // Two separate passes on purpose: all headers are registered (and get their TOC entries)
    // before any footer, preserving the original entry order.
    for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++) {
        List<ClassificationTextBlock> headerBlocks = headersByPage.get(pageNumber);
        if (headerBlocks != null) {
            addHeader(headerBlocks, context);
        } else {
            addEmptyHeader(pageNumber, context);
        }
    }
    for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++) {
        List<ClassificationTextBlock> footerBlocks = footersByPage.get(pageNumber);
        if (footerBlocks != null) {
            addFooter(footerBlocks, context);
        } else {
            addEmptyFooter(pageNumber, context);
        }
    }
}
/**
 * Builds a footer node from the given text blocks and attaches it to their page.
 * All blocks of one footer live on the same page, so the first block determines it.
 */
private void addFooter(List<ClassificationTextBlock> textBlocks, Context context) {
    PageNode page = getPage(textBlocks.get(0).getPage(), context);
    FooterNode footer = FooterNode.builder().tableOfContents(context.tableOfContents()).build();
    List<TextPositionSequence> mergedSequences = TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(textBlocks);
    AtomicTextBlock footerText = context.textBlockFactory().buildAtomicTextBlock(mergedSequences, footer, context, page);
    List<Integer> tocId = context.tableOfContents().createNewEntryAndReturnId(NodeType.FOOTER, footer);
    footer.setTocId(tocId);
    footer.setTerminalTextBlock(footerText);
    page.setFooter(footer);
}
/**
 * Builds a header node from the given text blocks and attaches it to their page.
 * All blocks of one header live on the same page, so the first block determines it.
 */
public void addHeader(List<ClassificationTextBlock> textBlocks, Context context) {
    PageNode page = getPage(textBlocks.get(0).getPage(), context);
    HeaderNode header = HeaderNode.builder().tableOfContents(context.tableOfContents()).build();
    List<TextPositionSequence> mergedSequences = TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(textBlocks);
    // NOTE(review): unlike addFooter, this calls the factory overload with an extra int
    // argument (0) — presumably an offset/index; confirm against TextBlockFactory.
    AtomicTextBlock headerText = context.textBlockFactory().buildAtomicTextBlock(mergedSequences, header, context, 0, page);
    List<Integer> tocId = context.tableOfContents().createNewEntryAndReturnId(NodeType.HEADER, header);
    header.setTocId(tocId);
    header.setTerminalTextBlock(headerText);
    page.setHeader(header);
}
/**
 * Attaches an empty footer placeholder to the page so that every page carries a footer node.
 */
private void addEmptyFooter(int pageIndex, Context context) {
    PageNode page = getPage(pageIndex, context);
    FooterNode footer = FooterNode.builder().tableOfContents(context.tableOfContents()).build();
    AtomicTextBlock emptyBlock = context.textBlockFactory().emptyTextBlock(footer, context, page);
    List<Integer> tocId = context.tableOfContents().createNewEntryAndReturnId(NodeType.FOOTER, footer);
    footer.setTocId(tocId);
    footer.setTerminalTextBlock(emptyBlock);
    page.setFooter(footer);
}
/**
 * Attaches an empty header placeholder to the page so that every page carries a header node.
 */
private void addEmptyHeader(int pageIndex, Context context) {
    PageNode page = getPage(pageIndex, context);
    HeaderNode header = HeaderNode.builder().tableOfContents(context.tableOfContents()).build();
    // NOTE(review): passes 0 where addEmptyFooter passes the context — presumably a different
    // factory overload; verify this asymmetry is intentional.
    AtomicTextBlock emptyBlock = context.textBlockFactory().emptyTextBlock(header, 0, page);
    List<Integer> tocId = context.tableOfContents().createNewEntryAndReturnId(NodeType.HEADER, header);
    header.setTocId(tocId);
    header.setTerminalTextBlock(emptyBlock);
    page.setHeader(header);
}
/**
 * Maps a classified page to a {@link PageNode}. Dimensions are truncated to whole units;
 * the main body starts out empty and is filled later.
 */
private PageNode buildPage(ClassificationPage classificationPage) {
    return PageNode.builder()
            .number(classificationPage.getPageNumber())
            .rotation(classificationPage.getRotation())
            .height((int) classificationPage.getPageHeight())
            .width((int) classificationPage.getPageWidth())
            .mainBody(new LinkedList<>())
            .build();
}
/**
 * Looks up the {@link PageNode} with the given page number in the context.
 *
 * @throws NoSuchElementException if no page with that number exists
 */
private PageNode getPage(int pageIndex, Context context) {
    for (PageNode candidate : context.pages().keySet()) {
        if (candidate.getNumber() == pageIndex) {
            return candidate;
        }
    }
    throw new NoSuchElementException(format("ClassificationPage with number %d not found", pageIndex));
}
/**
 * Shared state threaded through all tree-building steps: the table of contents being built,
 * the page nodes (with per-page counters), the collected sections and images, and the
 * factory used to create text blocks.
 */
record Context(
    TableOfContents tableOfContents, Map<PageNode, AtomicInteger> pages, List<SectionNode> sections, List<ClassifiedImage> images, TextBlockFactory textBlockFactory) {
}
}

View File

@ -0,0 +1,132 @@
package com.knecon.fforesight.service.layoutparser.processor.factory;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.stream.Collectors;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.AbstractTextContainer;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationSection;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.Table;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.TableCell;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock;
@Service
public class ImageSortService {
public SortedImages sortImagesIntoStructure(ClassificationDocument document) {
SortedImages sortedImages = new SortedImages(new HashMap<>(), new HashMap<>(), new HashMap<>(), new HashMap<>(), new HashMap<>());
Map<Integer, List<ClassifiedImage>> imagesByPage = document.getSections()
.stream()
.flatMap(section -> section.getImages().stream())
.distinct()
.collect(Collectors.groupingBy(ClassifiedImage::getPage));
for (int pageNumber : imagesByPage.keySet()) {
List<AbstractTextContainer> textContainersOnPage = document.getSections()
.stream()
.flatMap(section -> section.getPageBlocks().stream())
.filter(abstractTextContainer -> abstractTextContainer.getPage() == pageNumber)
.toList();
List<ClassificationSection> sectionsOnPage = document.getSections()
.stream()
.filter(section -> section.getPageBlocks().stream().anyMatch(block -> block.getPage() == pageNumber))
.toList();
for (ClassifiedImage image : imagesByPage.get(pageNumber)) {
sortImage(textContainersOnPage, sectionsOnPage, image, sortedImages);
}
}
return sortedImages;
}
private void sortImage(List<AbstractTextContainer> textContainersOnPage, List<ClassificationSection> sectionsOnPage, ClassifiedImage image, SortedImages sortedImages) {
Optional<AbstractTextContainer> containingTextContainer = getContainingTextContainer(image, textContainersOnPage);
Optional<ClassificationSection> sectionContainingTextContainer = getContainingSection(image, sectionsOnPage);
List<AbstractTextContainer> containedTextContainers = getContainedTextContainers(image, textContainersOnPage);
List<ClassificationSection> containedSections = getContainedSections(image, sectionsOnPage);
if (containingTextContainer.isPresent()) {
if (sortImageIntoTextContainerOrCell(image, sortedImages, containingTextContainer.get())) {
return;
}
}
}
private static boolean sortImageIntoTextContainerOrCell(ClassifiedImage image, SortedImages sortedImages, AbstractTextContainer containingTextContainer) {
if (containingTextContainer instanceof ClassificationTextBlock) {
sortedImages.containedInTextContainer().computeIfAbsent(containingTextContainer, sortedImage -> new ArrayList<>()).add(image);
return true;
}
if (containingTextContainer instanceof Table) {
Optional<TableCell> containingCell = getContainingCell((Table) containingTextContainer, image);
if (containingCell.isPresent()) {
sortedImages.containedInCell().computeIfAbsent(containingCell.get(), sortedImage -> new ArrayList<>()).add(image);
return true;
}
}
return false;
}
private static Optional<TableCell> getContainingCell(Table table, ClassifiedImage image) {
return table.getRows().stream().flatMap(List::stream).filter(cell -> cell.contains(image.getPosition())).findFirst();
}
private List<ClassificationSection> getContainedSections(ClassifiedImage image, List<ClassificationSection> sectionsOnPage) {
return sectionsOnPage.stream()
.filter(section -> image.getPosition().contains(RectangleTransformations.bBoxUnionAbstractTextContainer(section.getPageBlocks()
.stream()
.filter(block -> block.getPage() == image.getPage())
.toList())))
.toList();
}
private List<AbstractTextContainer> getContainedTextContainers(ClassifiedImage image, List<AbstractTextContainer> textContainersOnPage) {
return textContainersOnPage.stream().filter(textContainer -> image.getPosition().contains(RectangleTransformations.toRectangle2D(textContainer))).toList();
}
private Optional<ClassificationSection> getContainingSection(ClassifiedImage image, List<ClassificationSection> sectionsOnPage) {
return sectionsOnPage.stream()//
.filter(section -> //
RectangleTransformations.bBoxUnionAbstractTextContainer(section.getPageBlocks().stream().filter(block -> block.getPage() == image.getPage()).toList())//
.contains(image.getPosition())).findFirst();
}
private Optional<AbstractTextContainer> getContainingTextContainer(ClassifiedImage image, List<AbstractTextContainer> textContainersOnPage) {
return textContainersOnPage.stream().filter(textContainer -> RectangleTransformations.toRectangle2D(textContainer).contains(image.getPosition())).findFirst();
}
public record SortedImages(
Map<TableCell, List<ClassifiedImage>> containedInCell,
Map<AbstractTextContainer, List<ClassifiedImage>> containedInTextContainer,
Map<ClassificationSection, List<ClassifiedImage>> containedInSection,
Map<ClassifiedImage, List<AbstractTextContainer>> containedByImage,
Map<ClassifiedImage, List<ClassificationSection>> sectionContainedByImage) {
}
}

View File

@ -0,0 +1,105 @@
package com.knecon.fforesight.service.layoutparser.processor.factory;
import static java.lang.String.format;
import java.awt.geom.Area;
import java.awt.geom.Rectangle2D;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.function.BiConsumer;
import java.util.function.BinaryOperator;
import java.util.function.Function;
import java.util.function.Supplier;
import java.util.stream.Collector;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.AbstractTextContainer;
/** Static helpers for converting, combining and (de)serialising rectangles. */
public class RectangleTransformations {

    private RectangleTransformations() {
        // Utility class — static methods only.
    }

    /** Grows the rectangle by deltaX on each horizontal side and deltaY on each vertical side. */
    public static Rectangle2D pad(Rectangle2D rectangle2D, int deltaX, int deltaY) {
        return new Rectangle2D.Double(rectangle2D.getMinX() - deltaX, rectangle2D.getMinY() - deltaY, rectangle2D.getWidth() + 2 * deltaX, rectangle2D.getHeight() + 2 * deltaY);
    }

    /** Bounding box of the union of all containers' bounding boxes. */
    public static Rectangle2D bBoxUnionAbstractTextContainer(List<AbstractTextContainer> abstractTextContainers) {
        return abstractTextContainers.stream().map(RectangleTransformations::toRectangle2D).collect(new Rectangle2DUnion());
    }

    /** Bounding box of the union of the given rectangles. */
    public static Rectangle2D rectangleUnion(List<Rectangle2D> rectangle2DList) {
        return rectangle2DList.stream().collect(new Rectangle2DUnion());
    }

    public static Rectangle2D toRectangle2D(AbstractTextContainer abstractTextContainer) {
        return new Rectangle2D.Float(abstractTextContainer.getMinX(), abstractTextContainer.getMinY(), abstractTextContainer.getWidth(), abstractTextContainer.getHeight());
    }

    public static Rectangle2D toRectangle2D(PDRectangle rectangle) {
        return new Rectangle2D.Double(rectangle.getLowerLeftX(), rectangle.getLowerLeftY(), rectangle.getWidth(), rectangle.getHeight());
    }

    /**
     * Serialises a rectangle as "x,y,w,h".
     * Fix: formats with {@link Locale#ROOT} so the decimal separator is always '.', keeping
     * the output parseable by {@link #parseRectangle2D(String)} regardless of the JVM's
     * default locale (e.g. German locales format %f with a comma).
     */
    public static String toString(Rectangle2D rectangle2D) {
        return format(Locale.ROOT, "%f,%f,%f,%f", rectangle2D.getX(), rectangle2D.getY(), rectangle2D.getWidth(), rectangle2D.getHeight());
    }

    /** Parses a "x,y,w,h" string as produced by {@link #toString(Rectangle2D)}. */
    public static Rectangle2D parseRectangle2D(String bBox) {
        List<Float> floats = Arrays.stream(bBox.split(",")).map(Float::parseFloat).toList();
        return new Rectangle2D.Float(floats.get(0), floats.get(1), floats.get(2), floats.get(3));
    }

    /** Collects rectangles into the bounding box of their union (via {@link Area}). */
    private static class Rectangle2DUnion implements Collector<Rectangle2D, Area, Rectangle2D> {

        @Override
        public Supplier<Area> supplier() {
            return Area::new;
        }

        @Override
        public BiConsumer<Area, Rectangle2D> accumulator() {
            return (area, rectangle2D) -> area.add(new Area(rectangle2D));
        }

        @Override
        public BinaryOperator<Area> combiner() {
            return (area1, area2) -> {
                area1.add(area2);
                return area1;
            };
        }

        @Override
        public Function<Area, Rectangle2D> finisher() {
            return Area::getBounds2D;
        }

        @Override
        public Set<Characteristics> characteristics() {
            // Fix: Area is not thread-safe, so this collector must not advertise CONCURRENT —
            // a parallel stream would then call the accumulator on one shared Area from many
            // threads. UNORDERED is still correct because the union is order-independent.
            return Set.of(Characteristics.UNORDERED);
        }
    }
}

View File

@ -0,0 +1,156 @@
package com.knecon.fforesight.service.layoutparser.processor.factory;
import java.awt.geom.AffineTransform;
import java.awt.geom.Rectangle2D;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.Objects;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.RedTextPosition;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.TextDirection;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.TextPositionSequence;
/**
 * Builds a flat, searchable string from glyph sequences together with the bookkeeping
 * needed to map string indices back to glyph positions: line-break indices, a
 * string-index-to-position-index mapping, and per-glyph bounding boxes in initial user space.
 */
public class SearchTextWithTextPositionFactory {

    // Extra vertical padding (user-space units) added around each glyph's bounding box.
    public static final int HEIGHT_PADDING = 2;

    /**
     * Flattens {@code sequences} into a search text. While appending glyphs it:
     * collapses runs of spaces to a single space, records line breaks, removes short
     * end-of-line hyphenation fragments (so hyphenated words become searchable as one word),
     * and appends a separating space after every sequence.
     *
     * @param sequences glyph sequences in reading order
     * @return model holding the search text, line-break indices, the string-to-position
     *         index mapping and the glyph boxes in initial user space
     */
    public static SearchTextWithTextPositionModel buildSearchTextToTextPositionModel(List<TextPositionSequence> sequences) {
        // No glyphs at all -> empty model.
        if (sequences.isEmpty() || sequences.stream().allMatch(sequence -> sequence.getTextPositions().isEmpty())) {
            return SearchTextWithTextPositionModel.builder()
                    .searchText("")
                    .lineBreaks(Collections.emptyList())
                    .positions(Collections.emptyList())
                    .stringCoordsToPositionCoords(Collections.emptyList())
                    .build();
        }
        List<Integer> stringIdxToPositionIdx = new LinkedList<>();
        List<Integer> lineBreaksStringIdx = new LinkedList<>();
        StringBuilder sb = new StringBuilder();
        int stringIdx = 0;
        int positionIdx = 0;
        // Sentinel meaning "no recent hyphen": any real hyphen index is >= 0, and the
        // undo check below only fires for distances < 3.
        int lastHyphenIdx = -3;
        RedTextPosition currentTextPosition = sequences.get(0).getTextPositions().get(0);
        // Synthetic leading space so the first glyph has a well-defined predecessor.
        RedTextPosition previousTextPosition = RedTextPosition.builder().unicode(" ").position(currentTextPosition.getPosition()).build();
        for (TextPositionSequence word : sequences) {
            for (int i = 0; i < word.getTextPositions().size(); ++i) {
                currentTextPosition = word.getTextPositions().get(i);
                if (isLineBreak(currentTextPosition, previousTextPosition)) {
                    // A hyphen within the last two characters before the break is treated as a
                    // soft hyphenation: drop it (and anything after it) from the search text so
                    // the word continues seamlessly on the next line.
                    if (stringIdx - lastHyphenIdx < 3) {
                        sb.delete(lastHyphenIdx, sb.length());
                        stringIdxToPositionIdx = stringIdxToPositionIdx.subList(0, lastHyphenIdx);
                        stringIdx = lastHyphenIdx;
                        lastHyphenIdx = -3;
                    }
                    lineBreaksStringIdx.add(stringIdx);
                }
                // Skip the glyph only when it would produce a second consecutive space.
                if (!isRepeatedWhitespace(currentTextPosition.getUnicode(), previousTextPosition.getUnicode())) {
                    if (isHyphen(currentTextPosition.getUnicode())) {
                        lastHyphenIdx = stringIdx;
                    }
                    sb.append(currentTextPosition.getUnicode());
                    stringIdxToPositionIdx.add(positionIdx);
                    ++stringIdx;
                }
                previousTextPosition = currentTextPosition;
                ++positionIdx;
            }
            // Terminate each sequence with a space mapped to the position index after its
            // last glyph, so sequences never fuse into one token.
            previousTextPosition = RedTextPosition.builder().unicode(" ").position(previousTextPosition.getPosition()).build();
            sb.append(previousTextPosition.getUnicode());
            stringIdxToPositionIdx.add(positionIdx);
            ++stringIdx;
        }
        // Invariant: every character of the search text maps to exactly one position index.
        assert sb.length() == stringIdxToPositionIdx.size();
        List<Rectangle2D> positions = sequences.stream()
                .flatMap(sequence -> sequence.getTextPositions().stream().map(textPosition -> mapRedTextPositionToInitialUserSpace(textPosition, sequence)))
                .toList();
        return SearchTextWithTextPositionModel.builder()
                .searchText(sb.toString())
                .lineBreaks(lineBreaksStringIdx)
                .stringCoordsToPositionCoords(stringIdxToPositionIdx)
                .positions(positions)
                .build();
    }

    // A break is an explicit newline glyph or a vertical jump larger than the glyph height.
    private static boolean isLineBreak(RedTextPosition currentTextPosition, RedTextPosition previousTextPosition) {
        return Objects.equals(currentTextPosition.getUnicode(), "\n") || isDeltaYLargerThanTextHeight(currentTextPosition, previousTextPosition);
    }

    private static boolean isDeltaYLargerThanTextHeight(RedTextPosition currentPosition, RedTextPosition previousPosition) {
        if (previousPosition == null) {
            return false;
        }
        float deltaY = Math.abs(currentPosition.getYDirAdj() - previousPosition.getYDirAdj());
        return deltaY >= currentPosition.getHeightDir();
    }

    // Only plain spaces count; other whitespace glyphs pass through unchanged.
    private static boolean isRepeatedWhitespace(String currentUnicode, String previousUnicode) {
        return Objects.equals(previousUnicode, " ") && Objects.equals(currentUnicode, " ");
    }

    // NOTE(review): several comparisons below test against what renders as an empty string —
    // these look like Unicode dash variants (en/em dash etc.) lost in a character-encoding
    // step; verify against the original source before relying on them.
    private static boolean isHyphen(String unicodeCharacter) {
        return Objects.equals(unicodeCharacter, "-") || //
                Objects.equals(unicodeCharacter, "~") || //
                Objects.equals(unicodeCharacter, "") || //
                Objects.equals(unicodeCharacter, "") || //
                Objects.equals(unicodeCharacter, "") || //
                Objects.equals(unicodeCharacter, "") || //
                Objects.equals(unicodeCharacter, "") || //
                Objects.equals(unicodeCharacter, "") || //
                Objects.equals(unicodeCharacter, "") || //
                Objects.equals(unicodeCharacter, "") || //
                Objects.equals(unicodeCharacter, "\u00AD");
    }

    /**
     * Maps a glyph's text-space box to initial user space, compensating the page rotation
     * encoded in the sequence's text direction before flipping the y-axis.
     */
    private static Rectangle2D mapRedTextPositionToInitialUserSpace(RedTextPosition textPosition, TextPositionSequence sequence) {
        // NOTE(review): HEIGHT_PADDING is applied twice — once in textHeight and once on the
        // box height below — confirm this asymmetric padding is intentional.
        float textHeight = sequence.getTextHeight() + HEIGHT_PADDING;
        Rectangle2D rectangle2D = new Rectangle2D.Double(textPosition.getXDirAdj(),
                textPosition.getYDirAdj() - textHeight,
                textPosition.getWidthDirAdj(),
                textHeight + HEIGHT_PADDING);
        AffineTransform transform = new AffineTransform();
        // Rotation centre and post-rotation translation depend on the text direction;
        // QUARTER_CIRCLE rotates about (w/2, w/2), the remaining case about (h/2, h/2).
        if (sequence.getDir() == TextDirection.ZERO || sequence.getDir() == TextDirection.HALF_CIRCLE) {
            transform.rotate(sequence.getDir().getRadians(), sequence.getPageWidth() / 2f, sequence.getPageHeight() / 2f);
            transform.translate(0f, sequence.getPageHeight());
        } else if (sequence.getDir() == TextDirection.QUARTER_CIRCLE) {
            transform.rotate(sequence.getDir().getRadians(), sequence.getPageWidth() / 2f, sequence.getPageWidth() / 2f);
            transform.translate(0f, sequence.getPageWidth());
        } else {
            transform.rotate(sequence.getDir().getRadians(), sequence.getPageHeight() / 2f, sequence.getPageHeight() / 2f);
            transform.translate(0f, sequence.getPageWidth());
        }
        // Flip the y-axis: text space grows downwards, user space upwards.
        transform.scale(1., -1.);
        return transform.createTransformedShape(rectangle2D).getBounds2D();
    }
}

Some files were not shown because too many files have changed in this diff Show More