RED-6009: Document Tree Structure
*moved all layoutparsing code to separate project *wip (some dependency issues)
This commit is contained in:
parent
ae941e0982
commit
aac0259caf
33
.gitignore
vendored
Normal file
33
.gitignore
vendored
Normal file
@ -0,0 +1,33 @@
|
||||
HELP.md
|
||||
target/
|
||||
!.mvn/wrapper/maven-wrapper.jar
|
||||
!**/src/main/**/target/
|
||||
!**/src/test/**/target/
|
||||
|
||||
### STS ###
|
||||
.apt_generated
|
||||
.classpath
|
||||
.factorypath
|
||||
.project
|
||||
.settings
|
||||
.springBeans
|
||||
.sts4-cache
|
||||
|
||||
### IntelliJ IDEA ###
|
||||
.idea
|
||||
*.iws
|
||||
*.iml
|
||||
*.ipr
|
||||
|
||||
### NetBeans ###
|
||||
/nbproject/private/
|
||||
/nbbuild/
|
||||
/dist/
|
||||
/nbdist/
|
||||
/.nb-gradle/
|
||||
build/
|
||||
!**/src/main/**/build/
|
||||
!**/src/test/**/build/
|
||||
|
||||
### VS Code ###
|
||||
.vscode/
|
||||
BIN
.mvn/wrapper/maven-wrapper.jar
vendored
Normal file
BIN
.mvn/wrapper/maven-wrapper.jar
vendored
Normal file
Binary file not shown.
18
.mvn/wrapper/maven-wrapper.properties
vendored
Normal file
18
.mvn/wrapper/maven-wrapper.properties
vendored
Normal file
@ -0,0 +1,18 @@
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# https://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing,
|
||||
# software distributed under the License is distributed on an
|
||||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, either express or implied. See the License for the
|
||||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
distributionUrl=https://repo.maven.apache.org/maven2/org/apache/maven/apache-maven/3.8.7/apache-maven-3.8.7-bin.zip
|
||||
wrapperUrl=https://repo.maven.apache.org/maven2/org/apache/maven/wrapper/maven-wrapper/3.1.1/maven-wrapper-3.1.1.jar
|
||||
17
layoutparser-service-image/pom.xml
Normal file
17
layoutparser-service-image/pom.xml
Normal file
@ -0,0 +1,17 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
||||
|
||||
<parent>
|
||||
<groupId>com.knecon.fforesight</groupId>
|
||||
<artifactId>layoutparser</artifactId>
|
||||
<version>1.0.0</version>
|
||||
</parent>
|
||||
|
||||
<artifactId>layoutparser-service-image</artifactId>
|
||||
<version>1.0.0</version>
|
||||
|
||||
</project>
|
||||
49
layoutparser-service/layoutparser-service-internal-api/pom.xml
Executable file
49
layoutparser-service/layoutparser-service-internal-api/pom.xml
Executable file
@ -0,0 +1,49 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xmlns="http://maven.apache.org/POM/4.0.0"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
||||
<parent>
|
||||
<groupId>com.knecon.fforesight</groupId>
|
||||
<artifactId>layoutparser-service</artifactId>
|
||||
<version>1.0.0</version>
|
||||
</parent>
|
||||
|
||||
<artifactId>layoutparser-service-internal-api</artifactId>
|
||||
<version>1.0.0</version>
|
||||
|
||||
<packaging>pom</packaging>
|
||||
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>org.projectlombok</groupId>
|
||||
<artifactId>lombok</artifactId>
|
||||
<version>1.18.26</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.google.guava</groupId>
|
||||
<artifactId>guava</artifactId>
|
||||
<version>31.1-jre</version>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
|
||||
<build>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>org.springframework.boot</groupId>
|
||||
<artifactId>spring-boot-maven-plugin</artifactId>
|
||||
<configuration>
|
||||
<excludes>
|
||||
<exclude>
|
||||
<groupId>org.projectlombok</groupId>
|
||||
<artifactId>lombok</artifactId>
|
||||
</exclude>
|
||||
</excludes>
|
||||
</configuration>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
|
||||
</project>
|
||||
@ -0,0 +1,19 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.data;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class AtomicPositionBlockData {
|
||||
|
||||
Long id;
|
||||
int[] stringIdxToPositionIdx;
|
||||
float[][] positions;
|
||||
|
||||
}
|
||||
@ -0,0 +1,23 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.data;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class AtomicTextBlockData {
|
||||
|
||||
Long id;
|
||||
Long page;
|
||||
String searchText;
|
||||
int numberOnPage;
|
||||
int start;
|
||||
int end;
|
||||
int[] lineBreaks;
|
||||
|
||||
}
|
||||
@ -0,0 +1,20 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.data;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class DocumentData {
|
||||
|
||||
PageData[] pages;
|
||||
AtomicTextBlockData[] atomicTextBlocks;
|
||||
AtomicPositionBlockData[] atomicPositionBlocks;
|
||||
TableOfContentsData tableOfContents;
|
||||
|
||||
}
|
||||
@ -0,0 +1,20 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.data;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class PageData {
|
||||
|
||||
int number;
|
||||
int height;
|
||||
int width;
|
||||
int rotation;
|
||||
|
||||
}
|
||||
@ -0,0 +1,90 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.data;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import javax.management.openmbean.InvalidKeyException;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.NodeType;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@NoArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class TableOfContentsData {
|
||||
|
||||
List<EntryData> entries;
|
||||
|
||||
|
||||
public EntryData get(List<Integer> tocId) {
|
||||
|
||||
if (tocId.size() < 1) {
|
||||
throw new InvalidKeyException(String.format("ClassificationSection Identifier: \"%s\" is not valid.", tocId));
|
||||
}
|
||||
EntryData entry = entries.get(tocId.get(0));
|
||||
for (int id : tocId.subList(1, tocId.size())) {
|
||||
entry = entry.subEntries().get(id);
|
||||
}
|
||||
return entry;
|
||||
}
|
||||
|
||||
|
||||
public Stream<EntryData> streamAllEntries() {
|
||||
|
||||
return entries.stream().flatMap(TableOfContentsData::flatten);
|
||||
}
|
||||
|
||||
|
||||
private static List<Integer> getIds(String idsAsString) {
|
||||
|
||||
return Arrays.stream(idsAsString.split("\\.")).map(Integer::valueOf).toList();
|
||||
}
|
||||
|
||||
|
||||
public String toString() {
|
||||
|
||||
return String.join("\n", streamAllEntries().map(EntryData::toString).toList());
|
||||
}
|
||||
|
||||
|
||||
private static Stream<EntryData> flatten(EntryData entry) {
|
||||
|
||||
return Stream.concat(Stream.of(entry), entry.subEntries().stream().flatMap(TableOfContentsData::flatten));
|
||||
}
|
||||
|
||||
|
||||
@Builder
|
||||
public record EntryData(NodeType type, int[] tocId, Long[] atomicBlocks, Long[] pages, Map<String, String> properties, List<EntryData> subEntries) {
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
sb.append("[");
|
||||
for (int i : tocId) {
|
||||
sb.append(i);
|
||||
sb.append(",");
|
||||
}
|
||||
sb.delete(sb.length() - 1, sb.length());
|
||||
sb.append("]: ");
|
||||
|
||||
sb.append(type);
|
||||
sb.append(" atbs = ");
|
||||
sb.append(atomicBlocks.length);
|
||||
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,148 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.graph;
|
||||
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
import lombok.Setter;
|
||||
|
||||
@Setter
|
||||
public class Boundary implements Comparable<Boundary> {
|
||||
|
||||
private int start;
|
||||
private int end;
|
||||
|
||||
|
||||
public Boundary(int start, int end) {
|
||||
|
||||
if (start > end) {
|
||||
throw new IllegalArgumentException(String.format("start: %d > end: %d", start, end));
|
||||
}
|
||||
this.start = start;
|
||||
this.end = end;
|
||||
}
|
||||
|
||||
|
||||
public int length() {
|
||||
|
||||
return end - start;
|
||||
}
|
||||
|
||||
|
||||
public int start() {
|
||||
|
||||
return start;
|
||||
}
|
||||
|
||||
|
||||
public int end() {
|
||||
|
||||
return end;
|
||||
}
|
||||
|
||||
|
||||
public boolean contains(Boundary boundary) {
|
||||
|
||||
return start <= boundary.start() && boundary.end() <= end;
|
||||
}
|
||||
|
||||
|
||||
public boolean containedBy(Boundary boundary) {
|
||||
|
||||
return boundary.contains(this);
|
||||
}
|
||||
|
||||
|
||||
public boolean contains(int start, int end) {
|
||||
|
||||
if (start > end) {
|
||||
throw new IllegalArgumentException(String.format("start: %d > end: %d", start, end));
|
||||
}
|
||||
return this.start <= start && end <= this.end;
|
||||
}
|
||||
|
||||
|
||||
public boolean containedBy(int start, int end) {
|
||||
|
||||
if (start > end) {
|
||||
throw new IllegalArgumentException(String.format("start: %d > end: %d", start, end));
|
||||
}
|
||||
return start <= this.start && this.end <= end;
|
||||
}
|
||||
|
||||
|
||||
public boolean contains(int index) {
|
||||
|
||||
return start <= index && index < end;
|
||||
}
|
||||
|
||||
|
||||
public boolean intersects(Boundary boundary) {
|
||||
|
||||
return contains(boundary.start()) || contains(boundary.end() - 1);
|
||||
}
|
||||
|
||||
|
||||
public List<Boundary> split(List<Integer> splitIndices) {
|
||||
|
||||
if (splitIndices.stream().anyMatch(idx -> !this.contains(idx))) {
|
||||
throw new IndexOutOfBoundsException(String.format("%s splitting indices are out of range for %s", splitIndices.stream().filter(idx -> !this.contains(idx)).toList(), this));
|
||||
}
|
||||
List<Boundary> splitBoundaries = new LinkedList<>();
|
||||
int previousIndex = start;
|
||||
for (int splitIndex : splitIndices) {
|
||||
|
||||
// skip split if it would produce a boundary of length 0
|
||||
if (splitIndex == previousIndex) {
|
||||
continue;
|
||||
}
|
||||
splitBoundaries.add(new Boundary(previousIndex, splitIndex));
|
||||
previousIndex = splitIndex;
|
||||
}
|
||||
splitBoundaries.add(new Boundary(previousIndex, end));
|
||||
return splitBoundaries;
|
||||
}
|
||||
|
||||
|
||||
public static Boundary merge(List<Boundary> boundaries) {
|
||||
|
||||
int minStart = boundaries.stream().mapToInt(Boundary::start).min().orElseThrow(IllegalArgumentException::new);
|
||||
int maxEnd = boundaries.stream().mapToInt(Boundary::end).max().orElseThrow(IllegalArgumentException::new);
|
||||
return new Boundary(minStart, maxEnd);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return String.format("Boundary [%d|%d)", start, end);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public int compareTo(Boundary boundary) {
|
||||
|
||||
if (end < boundary.end() && start < boundary.start()) {
|
||||
return -1;
|
||||
}
|
||||
if (start > boundary.start() && end > boundary.end()) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
|
||||
return toString().hashCode();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public boolean equals(Object object) {
|
||||
|
||||
return hashCode() == object.hashCode();
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,98 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.graph;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SectionNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SemanticNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlockCollector;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class DocumentGraph implements SemanticNode {
|
||||
|
||||
Set<PageNode> pages;
|
||||
TableOfContents tableOfContents;
|
||||
Integer numberOfPages;
|
||||
TextBlock textBlock;
|
||||
|
||||
|
||||
public TextBlock buildTextBlock() {
|
||||
|
||||
return streamTerminalTextBlocksInOrder().collect(new TextBlockCollector());
|
||||
}
|
||||
|
||||
|
||||
public List<SectionNode> getMainSections() {
|
||||
|
||||
return tableOfContents.entries.stream().filter(entry -> entry.node() instanceof SectionNode).map(entry -> (SectionNode) entry.node()).collect(Collectors.toList());
|
||||
}
|
||||
|
||||
|
||||
public Stream<TextBlock> streamTerminalTextBlocksInOrder() {
|
||||
|
||||
return streamAllNodes().filter(SemanticNode::isTerminal).map(SemanticNode::getTerminalTextBlock);
|
||||
}
|
||||
|
||||
|
||||
public Set<EntityNode> getEntities() {
|
||||
|
||||
return streamAllNodes().map(SemanticNode::getEntities).flatMap(Set::stream).collect(Collectors.toUnmodifiableSet());
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public List<Integer> getTocId() {
|
||||
|
||||
return Collections.emptyList();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void setTocId(List<Integer> tocId) {
|
||||
|
||||
throw new UnsupportedOperationException("DocumentGraph is always the root of the Table of Contents");
|
||||
}
|
||||
|
||||
|
||||
private Stream<SemanticNode> streamAllNodes() {
|
||||
|
||||
return tableOfContents.streamEntriesInOrder().map(TableOfContents.Entry::node);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return tableOfContents.toString();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Map<PageNode, Rectangle2D> getBBox() {
|
||||
|
||||
Map<PageNode, Rectangle2D> bBox = new HashMap<>();
|
||||
for (PageNode page : pages) {
|
||||
bBox.put(page, new Rectangle2D.Double(0, 0, page.getWidth(), page.getHeight()));
|
||||
}
|
||||
return bBox;
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,164 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.graph;
|
||||
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.Collections;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.NoSuchElementException;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import com.google.common.hash.Hashing;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SemanticNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlockCollector;
|
||||
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
public class TableOfContents {
|
||||
|
||||
List<Entry> entries;
|
||||
|
||||
|
||||
public TableOfContents() {
|
||||
|
||||
entries = new LinkedList<>();
|
||||
}
|
||||
|
||||
|
||||
public TextBlock buildTextBlock() {
|
||||
|
||||
return streamEntriesInOrder().map(Entry::node).filter(SemanticNode::isTerminal).map(SemanticNode::getTerminalTextBlock).collect(new TextBlockCollector());
|
||||
}
|
||||
|
||||
|
||||
public List<Integer> createNewEntryAndReturnId(NodeType nodeType, SemanticNode node) {
|
||||
|
||||
return createNewChildEntryAndReturnId(Collections.emptyList(), nodeType, node);
|
||||
}
|
||||
|
||||
|
||||
public List<Integer> createNewChildEntryAndReturnId(List<Integer> parentId, NodeType nodeType, SemanticNode node) {
|
||||
|
||||
List<Integer> newId;
|
||||
if (entryExists(parentId)) {
|
||||
Entry parent = getEntryById(parentId);
|
||||
newId = new LinkedList<>(parentId);
|
||||
newId.add(parent.children().size());
|
||||
parent.children().add(Entry.builder().tocId(newId).node(node).type(nodeType).children(new LinkedList<>()).build());
|
||||
} else {
|
||||
newId = List.of(entries.size());
|
||||
entries.add(Entry.builder().tocId(newId).node(node).type(nodeType).children(new LinkedList<>()).build());
|
||||
}
|
||||
|
||||
return newId;
|
||||
}
|
||||
|
||||
|
||||
private boolean entryExists(List<Integer> tocId) {
|
||||
|
||||
if (tocId.size() < 1) {
|
||||
return false;
|
||||
}
|
||||
Entry entry = entries.get(tocId.get(0));
|
||||
for (int id : tocId.subList(1, tocId.size())) {
|
||||
if (id >= entry.children.size() || 0 > id) {
|
||||
return false;
|
||||
}
|
||||
entry = entry.children().get(id);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
public Entry getParentEntryById(List<Integer> tocId) {
|
||||
|
||||
List<Integer> parentIds = getParentId(tocId);
|
||||
if (parentIds.size() < 1) {
|
||||
throw new NoSuchElementException(String.format("Node with tocId \"%s\" has no parent!", tocId));
|
||||
}
|
||||
return getEntryById(parentIds);
|
||||
}
|
||||
|
||||
|
||||
public boolean hasParentById(List<Integer> tocId) {
|
||||
|
||||
List<Integer> parentId = getParentId(tocId);
|
||||
return entryExists(parentId);
|
||||
}
|
||||
|
||||
|
||||
public Stream<SemanticNode> streamChildren(List<Integer> tocId) {
|
||||
|
||||
return getEntryById(tocId).children().stream().map(Entry::node);
|
||||
}
|
||||
|
||||
|
||||
private static List<Integer> getParentId(List<Integer> tocId) {
|
||||
|
||||
return tocId.subList(0, tocId.size() - 1);
|
||||
}
|
||||
|
||||
|
||||
public Entry getEntryById(List<Integer> tocId) {
|
||||
|
||||
Entry entry = entries.get(tocId.get(0));
|
||||
for (int id : tocId.subList(1, tocId.size())) {
|
||||
entry = entry.children().get(id);
|
||||
}
|
||||
return entry;
|
||||
}
|
||||
|
||||
|
||||
public Stream<Entry> streamEntriesInOrder() {
|
||||
|
||||
return entries.stream().flatMap(TableOfContents::flatten);
|
||||
}
|
||||
|
||||
|
||||
public Stream<Entry> streamSubEntriesInOrder(List<Integer> parentId) {
|
||||
|
||||
return Stream.of(getEntryById(parentId)).flatMap(TableOfContents::flatten);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return String.join("\n", streamEntriesInOrder().map(Entry::toString).toList());
|
||||
}
|
||||
|
||||
|
||||
public String toString(List<Integer> id) {
|
||||
|
||||
return String.join("\n", streamSubEntriesInOrder(id).map(Entry::toString).toList());
|
||||
}
|
||||
|
||||
|
||||
private static Stream<Entry> flatten(Entry entry) {
|
||||
|
||||
return Stream.concat(Stream.of(entry), entry.children().stream().flatMap(TableOfContents::flatten));
|
||||
}
|
||||
|
||||
|
||||
@Builder
|
||||
public record Entry(List<Integer> tocId, NodeType type, SemanticNode node, List<Entry> children) {
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return node().toString();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
|
||||
return Hashing.murmur3_32_fixed().hashString(toString(), StandardCharsets.UTF_8).hashCode();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,76 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.graph.entity;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.Boundary;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SemanticNode;
|
||||
|
||||
public interface EntityNode {
|
||||
|
||||
/**
|
||||
* This represents the text, which is contained within the boundary of the Entity.
|
||||
*
|
||||
* @return String
|
||||
*/
|
||||
String getValue();
|
||||
|
||||
|
||||
/**
|
||||
* The Boundary primarily defines the Entity, all other values may be inferred from it.
|
||||
*
|
||||
* @return Boundary, uniquely identifying this Entity
|
||||
*/
|
||||
Boundary getBoundary();
|
||||
|
||||
|
||||
/**
|
||||
* The deepest fully containing node represents the node which is the deepest node in the document tree structure,
|
||||
* whose boundary also fully contains the boundary of this entity
|
||||
*
|
||||
* @return the deepest fully containing node
|
||||
*/
|
||||
SemanticNode getDeepestFullyContainingNode();
|
||||
|
||||
|
||||
/**
|
||||
* The intersecting nodes represent all nodes, whose boundary intersects the boundary of this entity.
|
||||
*
|
||||
* @return all intersecting Nodes
|
||||
*/
|
||||
List<SemanticNode> getIntersectingNodes();
|
||||
|
||||
|
||||
void setDeepestFullyContainingNode(SemanticNode semanticNode);
|
||||
|
||||
|
||||
void addIntersectingNode(SemanticNode semanticNode);
|
||||
|
||||
|
||||
void setIntersectingNodes(List<SemanticNode> semanticNodes);
|
||||
|
||||
|
||||
/**
|
||||
* @return all pages this entity intersects.
|
||||
*/
|
||||
Set<PageNode> getPages();
|
||||
|
||||
|
||||
void setPages(Set<PageNode> pages);
|
||||
|
||||
|
||||
/**
|
||||
* removes all occurrences of this node in the graph and resets all graph specific fields
|
||||
*/
|
||||
default void removeFromGraph() {
|
||||
|
||||
getIntersectingNodes().forEach(node -> node.getEntities().remove(this));
|
||||
getPages().forEach(page -> page.getEntities().remove(this));
|
||||
setPages(Collections.emptySet());
|
||||
setDeepestFullyContainingNode(null);
|
||||
setIntersectingNodes(Collections.emptyList());
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,39 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.graph.entity;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.List;
|
||||
|
||||
import com.google.common.hash.Hashing;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class EntityPosition {
|
||||
|
||||
PageNode pageNode;
|
||||
List<Rectangle2D> rectanglePerLine;
|
||||
|
||||
|
||||
public String getId() {
|
||||
|
||||
return String.valueOf(hashCode());
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
sb.append(pageNode.getNumber());
|
||||
rectanglePerLine.forEach(r -> sb.append(r.getX()).append(r.getY()).append(r.getWidth()).append(r.getHeight()));
|
||||
return Hashing.murmur3_128().hashString(sb.toString(), StandardCharsets.UTF_8).hashCode();
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,53 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@NoArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class FooterNode implements SemanticNode {
|
||||
|
||||
List<Integer> tocId;
|
||||
TextBlock terminalTextBlock;
|
||||
|
||||
@Builder.Default
|
||||
boolean terminal = true;
|
||||
|
||||
@EqualsAndHashCode.Exclude
|
||||
TableOfContents tableOfContents;
|
||||
|
||||
@Builder.Default
|
||||
@EqualsAndHashCode.Exclude
|
||||
Set<EntityNode> entities = new HashSet<>();
|
||||
|
||||
|
||||
@Override
|
||||
public TextBlock buildTextBlock() {
|
||||
|
||||
return terminalTextBlock;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return tocId + ": " + NodeType.FOOTER + ": " + terminalTextBlock.buildSummary();
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,53 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@NoArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class HeaderNode implements SemanticNode {
|
||||
|
||||
List<Integer> tocId;
|
||||
TextBlock terminalTextBlock;
|
||||
|
||||
@Builder.Default
|
||||
boolean terminal = true;
|
||||
|
||||
@EqualsAndHashCode.Exclude
|
||||
TableOfContents tableOfContents;
|
||||
|
||||
@Builder.Default
|
||||
@EqualsAndHashCode.Exclude
|
||||
Set<EntityNode> entities = new HashSet<>();
|
||||
|
||||
|
||||
@Override
|
||||
public TextBlock buildTextBlock() {
|
||||
|
||||
return terminalTextBlock;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return tocId + ": " + NodeType.HEADER + ": " + terminalTextBlock.buildSummary();
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,60 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@NoArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class HeadlineNode implements SemanticNode {
|
||||
|
||||
List<Integer> tocId;
|
||||
TextBlock terminalTextBlock;
|
||||
|
||||
@Builder.Default
|
||||
boolean terminal = true;
|
||||
|
||||
@EqualsAndHashCode.Exclude
|
||||
TableOfContents tableOfContents;
|
||||
|
||||
@Builder.Default
|
||||
@EqualsAndHashCode.Exclude
|
||||
Set<EntityNode> entities = new HashSet<>();
|
||||
|
||||
|
||||
@Override
|
||||
public TextBlock buildTextBlock() {
|
||||
|
||||
return terminalTextBlock;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return tocId + ": " + NodeType.HEADLINE + ": " + terminalTextBlock.buildSummary();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public SemanticNode getHeadline() {
|
||||
|
||||
return this;
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,88 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlockCollector;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@NoArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class ImageNode implements SemanticNode {
|
||||
|
||||
List<Integer> tocId;
|
||||
|
||||
ImageType imageType;
|
||||
boolean transparency;
|
||||
Rectangle2D position;
|
||||
|
||||
@Builder.Default
|
||||
boolean redaction = false;
|
||||
@Builder.Default
|
||||
boolean ignored = false;
|
||||
@Builder.Default
|
||||
String redactionReason = "";
|
||||
@Builder.Default
|
||||
String legalBasis = "";
|
||||
@Builder.Default
|
||||
int matchedRule = -1;
|
||||
|
||||
@EqualsAndHashCode.Exclude
|
||||
PageNode page;
|
||||
|
||||
@EqualsAndHashCode.Exclude
|
||||
TableOfContents tableOfContents;
|
||||
|
||||
@Builder.Default
|
||||
@EqualsAndHashCode.Exclude
|
||||
Set<EntityNode> entities = new HashSet<>();
|
||||
|
||||
|
||||
@Override
|
||||
public TextBlock buildTextBlock() {
|
||||
|
||||
return streamAllSubNodes().filter(SemanticNode::isTerminal).map(SemanticNode::getTerminalTextBlock).collect(new TextBlockCollector());
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Set<PageNode> getPages() {
|
||||
|
||||
return Collections.singleton(page);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return tocId + ": " + NodeType.IMAGE + ": " + imageType.toString() + " " + position;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Map<PageNode, Rectangle2D> getBBox() {
|
||||
|
||||
Map<PageNode, Rectangle2D> bBoxPerPage = new HashMap<>();
|
||||
bBoxPerPage.put(page, position);
|
||||
return bBoxPerPage;
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,9 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;
|
||||
|
||||
/**
 * Classification of an {@link ImageNode}.
 * OCR marks images whose content was recovered via text recognition;
 * OTHER is the catch-all for unclassified images.
 */
public enum ImageType {
    LOGO,
    FORMULA,
    SIGNATURE,
    OTHER,
    OCR
}
|
||||
@ -0,0 +1,12 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;
|
||||
|
||||
/**
 * Kinds of semantic nodes appearing in the document tree.
 * Used by the node classes' toString() implementations to label entries.
 */
public enum NodeType {
    SECTION,
    HEADLINE,
    PARAGRAPH,
    TABLE,
    TABLE_CELL,
    IMAGE,
    HEADER,
    FOOTER
}
|
||||
@ -0,0 +1,66 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlockCollector;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.Getter;
|
||||
import lombok.Setter;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
/**
 * A single page of the document: its geometry plus the semantic nodes,
 * entities, and images that appear on it.
 *
 * NOTE(review): this class uses @Getter/@Setter, not @Data, so Lombok
 * generates no equals(); the @EqualsAndHashCode.Exclude annotations below
 * have no effect unless @EqualsAndHashCode is added — confirm intent.
 */
@Getter
@Setter
@Builder
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class PageNode {

    // 1-based page number — TODO confirm indexing convention with callers.
    Integer number;
    Integer height;
    Integer width;
    Integer rotation;

    // Back-references to content on this page.
    @EqualsAndHashCode.Exclude
    List<SemanticNode> mainBody;
    @EqualsAndHashCode.Exclude
    HeaderNode header;
    @EqualsAndHashCode.Exclude
    FooterNode footer;

    @Builder.Default
    @EqualsAndHashCode.Exclude
    Set<EntityNode> entities = new HashSet<>();

    @Builder.Default
    @EqualsAndHashCode.Exclude
    Set<ImageNode> images = new HashSet<>();


    /**
     * Concatenates the terminal text blocks of all main-body nodes on this
     * page (header and footer are deliberately not included).
     */
    public TextBlock getMainBodyTextBlock() {

        return mainBody.stream().filter(SemanticNode::isTerminal).map(SemanticNode::getTerminalTextBlock).collect(new TextBlockCollector());
    }


    @Override
    public String toString() {

        return String.valueOf(number);
    }


    // Hash on the page number only, so pages with equal numbers collide in
    // hash structures. NOTE(review): unboxes `number` — NPE if it is null;
    // equals() is NOT overridden (identity semantics) — verify this is the
    // intended contract.
    @Override
    public int hashCode() {

        return number;
    }

}
|
||||
@ -0,0 +1,51 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
/**
 * A paragraph of running text. Terminal by default: its text is held
 * directly in {@code terminalTextBlock}.
 */
@Data
@Builder
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class ParagraphNode implements SemanticNode {

    // Position of this node in the TableOfContents.
    List<Integer> tocId;
    TextBlock terminalTextBlock;

    // Paragraphs are terminal unless explicitly overridden by the builder.
    @Builder.Default
    boolean terminal = true;

    // Back-reference excluded from equals/hashCode to avoid cycles.
    @EqualsAndHashCode.Exclude
    TableOfContents tableOfContents;

    @Builder.Default
    @EqualsAndHashCode.Exclude
    Set<EntityNode> entities = new HashSet<>();


    /** Terminal node: returns the stored text block without traversal. */
    @Override
    public TextBlock buildTextBlock() {

        return terminalTextBlock;
    }


    @Override
    public String toString() {

        return tocId + ": " + NodeType.PARAGRAPH + ": " + terminalTextBlock.buildSummary();
    }

}
|
||||
@ -0,0 +1,63 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.NoSuchElementException;
|
||||
import java.util.Set;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlockCollector;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class SectionNode implements SemanticNode {
|
||||
|
||||
List<Integer> tocId;
|
||||
|
||||
TextBlock textBlock;
|
||||
@EqualsAndHashCode.Exclude
|
||||
TableOfContents tableOfContents;
|
||||
|
||||
@Builder.Default
|
||||
@EqualsAndHashCode.Exclude
|
||||
Set<EntityNode> entities = new HashSet<>();
|
||||
|
||||
|
||||
@Override
|
||||
public TextBlock buildTextBlock() {
|
||||
|
||||
if (textBlock == null) {
|
||||
textBlock = streamAllSubNodes().filter(SemanticNode::isTerminal).map(SemanticNode::getTerminalTextBlock).collect(new TextBlockCollector());
|
||||
}
|
||||
return textBlock;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return tocId.toString() + ": " + NodeType.SECTION + ": " + buildTextBlock().buildSummary();
|
||||
}
|
||||
|
||||
|
||||
public HeadlineNode getHeadline() {
|
||||
|
||||
return streamChildren().filter(node -> node instanceof HeadlineNode)
|
||||
.map(node -> (HeadlineNode) node)
|
||||
.findFirst()
|
||||
.orElseThrow(() -> new NoSuchElementException("ClassificationSection has no Headline!"));
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,275 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.Boundary;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.AtomicTextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.services.RectangleTransformations;
|
||||
|
||||
public interface SemanticNode {
|
||||
|
||||
/**
|
||||
* Searches all Nodes located underneath this Node in the TableOfContents and concatenates their AtomicTextBlocks into a single TextBlockEntity.
|
||||
* So, for a ClassificationSection all TextBlocks of Subsections, Paragraphs, and Tables are concatenated into a single TextBlockEntity
|
||||
* If the Node is Terminal, the TerminalTextBlock will be returned instead.
|
||||
*
|
||||
* @return ClassificationTextBlock containing all AtomicTextBlocks that are located under this Node.
|
||||
*/
|
||||
TextBlock buildTextBlock();
|
||||
|
||||
|
||||
/**
|
||||
* Any Node maintains its own Set of Entities.
|
||||
* This Set contains all Entities whose boundary intersects the boundary of this node.
|
||||
*
|
||||
* @return Set of all Entities associated with this Node
|
||||
*/
|
||||
Set<EntityNode> getEntities();
|
||||
|
||||
|
||||
/**
|
||||
* Each AtomicTextBlock is assigned a page, so to get the pages this node appears on, it collects the PageNodes from each AtomicTextBlock belonging to this node's ClassificationTextBlock
|
||||
*
|
||||
* @return Set of PageNodes this node appears on.
|
||||
*/
|
||||
default Set<PageNode> getPages() {
|
||||
|
||||
return buildTextBlock().getPages();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @return the TableOfContents of the ClassificationDocument this node belongs to
|
||||
*/
|
||||
TableOfContents getTableOfContents();
|
||||
|
||||
|
||||
/**
|
||||
* The id is a List of Integers uniquely identifying this node in the TableOfContents
|
||||
*
|
||||
* @return the TableOfContents ID
|
||||
*/
|
||||
List<Integer> getTocId();
|
||||
|
||||
|
||||
/**
|
||||
* This should only be used during graph construction
|
||||
*
|
||||
* @param tocId List of Integers
|
||||
*/
|
||||
void setTocId(List<Integer> tocId);
|
||||
|
||||
|
||||
/**
|
||||
* Traverses the Tree up, until it hits a HeadlineNode or hits a SectionNode which will then return the first HeadlineNode from its children.
|
||||
* Throws NotFoundException if no Headline is found this way
|
||||
*
|
||||
* @return First HeadlineNode found
|
||||
*/
|
||||
default SemanticNode getHeadline() {
|
||||
|
||||
return getParent().getHeadline();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @return boolean indicating wether this Node has a Parent in the TableOfContents
|
||||
*/
|
||||
default boolean hasParent() {
|
||||
|
||||
return getTableOfContents().hasParentById(getTocId());
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @return The SemanticNode representing the Parent in the TableOfContents
|
||||
* throws NotFoundException, when no parent is present
|
||||
*/
|
||||
default SemanticNode getParent() {
|
||||
|
||||
return getTableOfContents().getParentEntryById(getTocId()).node();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Terminal means a SemanticNode has direct access to a ClassificationTextBlock, by default this is false and must be overridden.
|
||||
* Currently only Sections, Images, and Tables are not terminal.
|
||||
* A TableCell might be Terminal depending on its area compared to the page.
|
||||
*
|
||||
* @return boolean, indicating if a Node has direct access to a ClassificationTextBlock
|
||||
*/
|
||||
default boolean isTerminal() {
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Terminal means a SemanticNode has direct access to a ClassificationTextBlock, by default this is false and must be overridden.
|
||||
* Currently only Sections and Tables are not terminal.
|
||||
*
|
||||
* @return AtomicTextBlock
|
||||
*/
|
||||
default TextBlock getTerminalTextBlock() {
|
||||
|
||||
throw new UnsupportedOperationException("Only terminal Nodes have access to TerminalTextBlocks!");
|
||||
}
|
||||
|
||||
|
||||
default void setTerminalTextBlock(TextBlock textBlock) {
|
||||
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Each AtomicTextBlock has an index on its page, this returns the number of the first AtomicTextBlock underneath this node.
|
||||
* If this node does not have any AtomicTexBlocks underneath it, e.g. an empty TableCell. It returns -1.
|
||||
*
|
||||
* @return Integer representing the number on the page
|
||||
*/
|
||||
default Integer getNumberOnPage() {
|
||||
|
||||
TextBlock textBlock = buildTextBlock();
|
||||
if (textBlock.getAtomicTextBlocks().size() > 0) {
|
||||
return buildTextBlock().getAtomicTextBlocks().get(0).getNumberOnPage();
|
||||
} else {
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @return true, if this node's ClassificationTextBlock is not empty
|
||||
*/
|
||||
default boolean hasText() {
|
||||
|
||||
return buildTextBlock().length() > 0;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @param string A String which the ClassificationTextBlock might contain
|
||||
* @return true, if this node's ClassificationTextBlock contains the string
|
||||
*/
|
||||
default boolean containsString(String string) {
|
||||
|
||||
return buildTextBlock().getSearchText().contains(string);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @param strings A List of Strings which the ClassificationTextBlock might contain
|
||||
* @return true, if this node's ClassificationTextBlock contains any of the strings
|
||||
*/
|
||||
default boolean containsAnyString(List<String> strings) {
|
||||
|
||||
return strings.stream().anyMatch(this::containsString);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* This function is used during insertion of EntityNodes into the graph, it checks if the boundary of the EntityNode intersects or even contains the EntityNode.
|
||||
* It sets the fields accordingly and recursively calls this function on all its children.
|
||||
*
|
||||
* @param entityNode EntityNode, which is being inserted into the graph
|
||||
*/
|
||||
default void addThisToEntityIfIntersects(EntityNode entityNode) {
|
||||
|
||||
TextBlock textBlock = buildTextBlock();
|
||||
if (textBlock.getBoundary().intersects(entityNode.getBoundary())) {
|
||||
|
||||
if (textBlock.containsBoundary(entityNode.getBoundary())) {
|
||||
entityNode.setDeepestFullyContainingNode(this);
|
||||
}
|
||||
|
||||
entityNode.addIntersectingNode(this);
|
||||
streamChildren().forEach(node -> node.addThisToEntityIfIntersects(entityNode));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Streams all children located directly underneath this node in the TableOfContents
|
||||
*
|
||||
* @return Stream of all children
|
||||
*/
|
||||
default Stream<SemanticNode> streamChildren() {
|
||||
|
||||
return getTableOfContents().streamChildren(getTocId());
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* recursively streams all SemanticNodes located underneath this node in the TableOfContents in order.
|
||||
*
|
||||
* @return Stream of all SubNodes
|
||||
*/
|
||||
default Stream<SemanticNode> streamAllSubNodes() {
|
||||
|
||||
return getTableOfContents().streamSubEntriesInOrder(getTocId()).map(TableOfContents.Entry::node);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @return Boundary of this Node's ClassificationTextBlock
|
||||
*/
|
||||
default Boundary getBoundary() {
|
||||
|
||||
return buildTextBlock().getBoundary();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* If this Node is Terminal it will calculate the boundingBox of its TerminalTextBlock, otherwise it will calculate the Union of the BoundingBoxes of all its Children.
|
||||
* If called on the ClassificationDocument, it will return the cropbox of each page
|
||||
*
|
||||
* @return Rectangle2D fully encapsulating this Node for each page.
|
||||
*/
|
||||
default Map<PageNode, Rectangle2D> getBBox() {
|
||||
|
||||
Map<PageNode, Rectangle2D> bBoxPerPage = new HashMap<>();
|
||||
if (isTerminal()) {
|
||||
return getBBoxFromTerminalTextBlock(bBoxPerPage);
|
||||
}
|
||||
|
||||
return getBBoxFromChildren(bBoxPerPage);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* TODO this does not yet work for sections spanning multiple columns
|
||||
*
|
||||
* @param bBoxPerPage initial empty BoundingBox
|
||||
* @return The union of the BoundingBoxes of all children
|
||||
*/
|
||||
private Map<PageNode, Rectangle2D> getBBoxFromChildren(Map<PageNode, Rectangle2D> bBoxPerPage) {
|
||||
|
||||
return streamChildren().map(SemanticNode::getBBox).reduce((map1, map2) -> {
|
||||
map1.forEach((page, rectangle) -> map2.merge(page, rectangle, (rect1, rect2) -> rect1.createUnion(rect2).getBounds2D()));
|
||||
return map2;
|
||||
}).orElse(bBoxPerPage);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @param bBoxPerPage initial empty BoundingBox
|
||||
* @return The union of all BoundingBoxes of the ClassificationTextBlock of this node
|
||||
*/
|
||||
private Map<PageNode, Rectangle2D> getBBoxFromTerminalTextBlock(Map<PageNode, Rectangle2D> bBoxPerPage) {
|
||||
|
||||
Map<PageNode, List<AtomicTextBlock>> atomicTextBlockPerPage = buildTextBlock().getAtomicTextBlocks().stream().collect(Collectors.groupingBy(AtomicTextBlock::getPage));
|
||||
atomicTextBlockPerPage.forEach((page, atbs) -> bBoxPerPage.put(page, RectangleTransformations.bBoxUnionAtomicTextBlock(atbs)));
|
||||
return bBoxPerPage;
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,92 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlockCollector;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class TableCellNode implements SemanticNode {
|
||||
|
||||
List<Integer> tocId;
|
||||
int row;
|
||||
int col;
|
||||
boolean header;
|
||||
|
||||
Rectangle2D bBox;
|
||||
|
||||
@Builder.Default
|
||||
boolean terminal = true;
|
||||
TextBlock terminalTextBlock;
|
||||
|
||||
TextBlock textBlock;
|
||||
|
||||
@EqualsAndHashCode.Exclude
|
||||
TableOfContents tableOfContents;
|
||||
|
||||
@Builder.Default
|
||||
@EqualsAndHashCode.Exclude
|
||||
Set<EntityNode> entities = new HashSet<>();
|
||||
|
||||
|
||||
@Override
|
||||
public Map<PageNode, Rectangle2D> getBBox() {
|
||||
|
||||
Map<PageNode, Rectangle2D> bBoxPerPage = new HashMap<>();
|
||||
getPages().forEach(page -> bBoxPerPage.put(page, bBox));
|
||||
return bBoxPerPage;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public TextBlock buildTextBlock() {
|
||||
|
||||
if (terminal) {
|
||||
return terminalTextBlock;
|
||||
}
|
||||
|
||||
if (textBlock == null) {
|
||||
textBlock = streamAllSubNodes().filter(SemanticNode::isTerminal).map(SemanticNode::getTerminalTextBlock).collect(new TextBlockCollector());
|
||||
}
|
||||
return textBlock;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return tocId + ": " + NodeType.TABLE_CELL + ": " + buildTextBlock().buildSummary();
|
||||
}
|
||||
|
||||
|
||||
public boolean hasHeader(String headerString) {
|
||||
|
||||
return getHeaders().anyMatch(header -> header.buildTextBlock().getSearchText().strip().equals(headerString));
|
||||
}
|
||||
|
||||
|
||||
private Stream<TableCellNode> getHeaders() {
|
||||
|
||||
TableNode tableNode = (TableNode) getParent();
|
||||
return tableNode.streamHeadersForCell(row, col);
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,73 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlockCollector;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class TableNode implements SemanticNode {
|
||||
|
||||
List<Integer> tocId;
|
||||
TableOfContents tableOfContents;
|
||||
|
||||
Integer numberOfRows;
|
||||
Integer numberOfCols;
|
||||
|
||||
TextBlock textBlock;
|
||||
|
||||
@Builder.Default
|
||||
@EqualsAndHashCode.Exclude
|
||||
Set<EntityNode> entities = new HashSet<>();
|
||||
|
||||
|
||||
public Stream<TableCellNode> streamTableCells() {
|
||||
|
||||
return streamChildren().map(node -> (TableCellNode) node);
|
||||
}
|
||||
|
||||
|
||||
public Stream<TableCellNode> streamHeaders() {
|
||||
|
||||
return streamTableCells().filter(TableCellNode::isHeader);
|
||||
}
|
||||
|
||||
|
||||
public Stream<TableCellNode> streamHeadersForCell(int row, int col) {
|
||||
|
||||
return streamHeaders().filter(cell -> cell.getRow() == row || cell.getCol() == col);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public TextBlock buildTextBlock() {
|
||||
|
||||
if (textBlock == null) {
|
||||
textBlock = streamAllSubNodes().filter(SemanticNode::isTerminal).map(SemanticNode::getTerminalTextBlock).collect(new TextBlockCollector());
|
||||
}
|
||||
return textBlock;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return tocId.toString() + ": " + NodeType.TABLE + ": " + buildTextBlock().buildSummary();
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,131 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.Boundary;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityPosition;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SemanticNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.services.RectangleTransformations;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class AtomicTextBlock implements TextBlock {
|
||||
|
||||
Long id;
|
||||
Integer numberOnPage;
|
||||
PageNode page;
|
||||
|
||||
//string coordinates
|
||||
Boundary boundary;
|
||||
String searchText;
|
||||
List<Integer> lineBreaks;
|
||||
|
||||
//position coordinates
|
||||
List<Integer> stringIdxToPositionIdx;
|
||||
List<Rectangle2D> positions;
|
||||
|
||||
@EqualsAndHashCode.Exclude
|
||||
SemanticNode parent;
|
||||
|
||||
|
||||
@Override
|
||||
public int numberOfLines() {
|
||||
|
||||
return lineBreaks.size() + 1;
|
||||
}
|
||||
|
||||
|
||||
public CharSequence getLine(int lineNumber) {
|
||||
|
||||
if (lineNumber >= numberOfLines() || lineNumber < 0) {
|
||||
throw new IndexOutOfBoundsException(String.format("line %d out of range for AtomicTextBlock with %d lines", lineNumber, numberOfLines()));
|
||||
}
|
||||
if (lineNumber == 0) {
|
||||
return subSequence(boundary.start(), lineBreaks.get(0) + boundary.start());
|
||||
} else if (lineNumber == numberOfLines() - 1) {
|
||||
return subSequence(lineBreaks.get(lineBreaks.size() - 1) + boundary.start(), boundary.end());
|
||||
}
|
||||
return subSequence(lineBreaks.get(lineNumber - 1) + boundary.start(), lineBreaks.get(lineNumber) + boundary.start());
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public List<AtomicTextBlock> getAtomicTextBlocks() {
|
||||
|
||||
return List.of(this);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public int getNextLinebreak(int fromIndex) {
|
||||
|
||||
return lineBreaks.stream()//
|
||||
.filter(linebreak -> linebreak > fromIndex - boundary.start()) //
|
||||
.findFirst() //
|
||||
.orElse(searchText.length()) + boundary.start();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public int getPreviousLinebreak(int fromIndex) {
|
||||
|
||||
return lineBreaks.stream()//
|
||||
.filter(linebreak -> linebreak <= fromIndex - boundary.start())//
|
||||
.reduce((a, b) -> b)//
|
||||
.orElse(0) + boundary.start();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Rectangle2D getPosition(int stringIdx) {
|
||||
|
||||
return positions.get(stringIdxToPositionIdx.get(stringIdx - boundary.start()));
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public List<Rectangle2D> getPositions(Boundary stringBoundary) {
|
||||
|
||||
if (!containsBoundary(stringBoundary)) {
|
||||
throw new IndexOutOfBoundsException(String.format("%s is out of bounds for %s", stringBoundary, this.boundary));
|
||||
}
|
||||
|
||||
if (stringBoundary.end() == this.boundary.end()) {
|
||||
return positions.subList(stringIdxToPositionIdx.get(stringBoundary.start() - this.boundary.start()), positions.size());
|
||||
}
|
||||
|
||||
return positions.subList(stringIdxToPositionIdx.get(stringBoundary.start() - this.boundary.start()),
|
||||
stringIdxToPositionIdx.get(stringBoundary.end() - this.boundary.start()));
|
||||
}
|
||||
|
||||
|
||||
public List<EntityPosition> getEntityPositionsPerPage(Boundary stringBoundary) {
|
||||
|
||||
List<Rectangle2D> positionsPerLine = stringBoundary.split(getLineBreaks().stream().map(lb -> lb + boundary.start()).filter(stringBoundary::contains).toList())
|
||||
.stream()
|
||||
.map(this::getPositions)
|
||||
.map(RectangleTransformations::rectangleUnion)
|
||||
.toList();
|
||||
|
||||
return List.of(EntityPosition.builder().rectanglePerLine(positionsPerLine).pageNode(page).build());
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return searchText;
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,179 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.Boundary;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityPosition;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.Data;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
/**
 * A TextBlock composed of consecutive AtomicTextBlocks. The combined
 * boundary spans from the first block's start to the last block's end;
 * blocks may only be concatenated when their boundaries are contiguous.
 */
@Data
@FieldDefaults(level = AccessLevel.PRIVATE)
public class ConcatenatedTextBlock implements TextBlock {

    List<AtomicTextBlock> atomicTextBlocks;
    // Lazily built in getSearchText().
    String searchText;
    Boundary boundary;


    /**
     * Builds a concatenation from a list of consecutive atomic blocks.
     * An empty list yields the sentinel boundary (-1, -1).
     */
    public ConcatenatedTextBlock(List<AtomicTextBlock> atomicTextBlocks) {

        this.atomicTextBlocks = new LinkedList<>();
        if (atomicTextBlocks.isEmpty()) {
            boundary = new Boundary(-1, -1);
            return;
        }
        var firstTextBlock = atomicTextBlocks.get(0);
        this.atomicTextBlocks.add(firstTextBlock);
        boundary = new Boundary(firstTextBlock.getBoundary().start(), firstTextBlock.getBoundary().end());

        // Remaining blocks go through concat() so contiguity is enforced.
        atomicTextBlocks.subList(1, atomicTextBlocks.size()).forEach(this::concat);
    }


    /**
     * Appends another TextBlock; its boundary must start exactly where this
     * block currently ends (unless this block is still empty).
     *
     * @throws UnsupportedOperationException on non-consecutive boundaries
     */
    public ConcatenatedTextBlock concat(TextBlock textBlock) {

        if (this.atomicTextBlocks.isEmpty()) {
            boundary.setStart(textBlock.getBoundary().start());
            boundary.setEnd(textBlock.getBoundary().end());
        } else if (boundary.end() != textBlock.getBoundary().start()) {
            throw new UnsupportedOperationException(String.format("Can only concat consecutive TextBlocks, trying to concat %s and %s", boundary, textBlock.getBoundary()));
        }
        this.atomicTextBlocks.addAll(textBlock.getAtomicTextBlocks());
        boundary.setEnd(textBlock.getBoundary().end());
        return this;
    }


    // Block whose boundary contains the given global string index.
    private AtomicTextBlock getAtomicTextBlockByStringIndex(int stringIdx) {

        return atomicTextBlocks.stream().filter(textBlock -> (textBlock.getBoundary().contains(stringIdx))).findAny().orElseThrow(IndexOutOfBoundsException::new);
    }


    // All blocks overlapping the given boundary, in document order.
    private List<AtomicTextBlock> getAllAtomicTextBlocksPartiallyInStringBoundary(Boundary boundary) {

        return atomicTextBlocks.stream().filter(tb -> tb.getBoundary().intersects(boundary)).toList();
    }


    /** Lazily concatenates and caches the search text of all blocks. */
    @Override
    public String getSearchText() {

        if (searchText == null) {
            StringBuilder sb = new StringBuilder();
            getAtomicTextBlocks().forEach(atb -> sb.append(atb.getSearchText()));
            searchText = sb.toString();
        }
        return searchText;
    }


    // NOTE(review): sums only the break counts, while AtomicTextBlock counts
    // breaks + 1 lines — confirm whether block joins are meant to continue
    // the same line, otherwise this undercounts.
    @Override
    public int numberOfLines() {

        return atomicTextBlocks.stream().map(AtomicTextBlock::getLineBreaks).mapToInt(List::size).sum();
    }


    /** Delegates to the atomic block containing fromIndex. */
    @Override
    public int getNextLinebreak(int fromIndex) {

        return getAtomicTextBlockByStringIndex(fromIndex).getNextLinebreak(fromIndex);
    }


    /** Delegates to the atomic block containing fromIndex. */
    @Override
    public int getPreviousLinebreak(int fromIndex) {

        return getAtomicTextBlockByStringIndex(fromIndex).getPreviousLinebreak(fromIndex);
    }


    /** Line breaks of all blocks, in order (block-relative offsets). */
    @Override
    public List<Integer> getLineBreaks() {

        return getAtomicTextBlocks().stream().flatMap(atomicTextBlock -> atomicTextBlock.getLineBreaks().stream()).toList();
    }


    /** Delegates to the atomic block containing stringIdx. */
    @Override
    public Rectangle2D getPosition(int stringIdx) {

        return getAtomicTextBlockByStringIndex(stringIdx).getPosition(stringIdx);
    }


    /**
     * Glyph rectangles for a boundary that may span several atomic blocks:
     * partial range from the first block, full ranges from the middle blocks,
     * partial range from the last block.
     */
    @Override
    public List<Rectangle2D> getPositions(Boundary stringBoundary) {

        List<AtomicTextBlock> textBlocks = getAllAtomicTextBlocksPartiallyInStringBoundary(stringBoundary);

        if (textBlocks.size() == 1) {
            return textBlocks.get(0).getPositions(stringBoundary);
        }

        AtomicTextBlock firstTextBlock = textBlocks.get(0);
        List<Rectangle2D> positions = new LinkedList<>(firstTextBlock.getPositions(new Boundary(stringBoundary.start(), firstTextBlock.getBoundary().end())));

        // Middle blocks (sublist excludes first and last) are fully covered.
        for (AtomicTextBlock textBlock : textBlocks.subList(1, textBlocks.size() - 1)) {
            positions.addAll(textBlock.getPositions());
        }

        var lastTextBlock = textBlocks.get(textBlocks.size() - 1);
        positions.addAll(lastTextBlock.getPositions(new Boundary(lastTextBlock.getBoundary().start(), stringBoundary.end())));

        return positions;
    }


    /**
     * Entity positions for a possibly multi-block boundary, merged so each
     * page appears at most once in the result.
     */
    @Override
    public List<EntityPosition> getEntityPositionsPerPage(Boundary stringBoundary) {

        List<AtomicTextBlock> textBlocks = getAllAtomicTextBlocksPartiallyInStringBoundary(stringBoundary);

        if (textBlocks.size() == 1) {
            return textBlocks.get(0).getEntityPositionsPerPage(stringBoundary);
        }

        AtomicTextBlock firstTextBlock = textBlocks.get(0);
        List<EntityPosition> positions = new LinkedList<>(firstTextBlock.getEntityPositionsPerPage(new Boundary(stringBoundary.start(), firstTextBlock.getBoundary().end())));

        // Middle blocks contribute their entire extent.
        for (AtomicTextBlock textBlock : textBlocks.subList(1, textBlocks.size() - 1)) {
            positions.addAll(textBlock.getEntityPositionsPerPage(textBlock.getBoundary()));
        }

        AtomicTextBlock lastTextBlock = textBlocks.get(textBlocks.size() - 1);
        positions.addAll(lastTextBlock.getEntityPositionsPerPage(new Boundary(lastTextBlock.getBoundary().start(), stringBoundary.end())));

        return mergeEntityPositionsWithSamePageNode(positions);
    }


    // Groups rectangles by page and rebuilds one EntityPosition per page.
    private List<EntityPosition> mergeEntityPositionsWithSamePageNode(List<EntityPosition> positions) {

        Map<PageNode, List<Rectangle2D>> entityPositionsPerPage = positions.stream().collect(//
                Collectors.groupingBy(EntityPosition::getPageNode, //
                        Collectors.flatMapping(entityPosition -> entityPosition.getRectanglePerLine().stream(), Collectors.toList())));

        return entityPositionsPerPage.entrySet().stream()//
                .map(entry -> EntityPosition.builder().pageNode(entry.getKey()).rectanglePerLine(entry.getValue()).build())//
                .toList();

    }


    @Override
    public String toString() {

        return getSearchText();
    }

}
|
||||
@ -0,0 +1,125 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.Boundary;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityPosition;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode;
|
||||
|
||||
public interface TextBlock extends CharSequence {
|
||||
|
||||
String getSearchText();
|
||||
|
||||
|
||||
List<AtomicTextBlock> getAtomicTextBlocks();
|
||||
|
||||
|
||||
Boundary getBoundary();
|
||||
|
||||
|
||||
int getNextLinebreak(int fromIndex);
|
||||
|
||||
|
||||
int getPreviousLinebreak(int fromIndex);
|
||||
|
||||
|
||||
List<Integer> getLineBreaks();
|
||||
|
||||
|
||||
Rectangle2D getPosition(int stringIdx);
|
||||
|
||||
|
||||
List<Rectangle2D> getPositions(Boundary stringBoundary);
|
||||
|
||||
|
||||
List<EntityPosition> getEntityPositionsPerPage(Boundary stringBoundary);
|
||||
|
||||
|
||||
int numberOfLines();
|
||||
|
||||
|
||||
default int indexOf(String searchTerm) {
|
||||
|
||||
return indexOf(searchTerm, getBoundary().start());
|
||||
}
|
||||
|
||||
|
||||
default Set<PageNode> getPages() {
|
||||
|
||||
return getAtomicTextBlocks().stream().map(AtomicTextBlock::getPage).collect(Collectors.toUnmodifiableSet());
|
||||
}
|
||||
|
||||
|
||||
default int indexOf(String searchTerm, int startOffset) {
|
||||
|
||||
int start = getSearchText().indexOf(searchTerm, startOffset - getBoundary().start());
|
||||
if (start == -1) {
|
||||
return -1;
|
||||
}
|
||||
return start + getBoundary().start();
|
||||
}
|
||||
|
||||
|
||||
default CharSequence getFirstLine() {
|
||||
|
||||
return subSequence(getBoundary().start(), getNextLinebreak(getBoundary().start()));
|
||||
}
|
||||
|
||||
|
||||
default boolean containsBoundary(Boundary boundary) {
|
||||
|
||||
if (boundary.end() < boundary.start()) {
|
||||
throw new IllegalArgumentException(String.format("Invalid %s, StartIndex must be smaller than EndIndex", boundary));
|
||||
}
|
||||
return getBoundary().contains(boundary);
|
||||
}
|
||||
|
||||
|
||||
default boolean containsIndex(int stringIndex) {
|
||||
|
||||
return getBoundary().contains(stringIndex);
|
||||
}
|
||||
|
||||
|
||||
default CharSequence subSequence(Boundary boundary) {
|
||||
|
||||
return subSequence(boundary.start(), boundary.end());
|
||||
}
|
||||
|
||||
|
||||
default String buildSummary() {
|
||||
|
||||
String[] words = getSearchText().split(" ");
|
||||
int bound = Math.min(words.length, 4);
|
||||
List<String> list = new ArrayList<>(Arrays.asList(words).subList(0, bound));
|
||||
|
||||
return String.join(" ", list);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
default CharSequence subSequence(int start, int end) {
|
||||
|
||||
return getSearchText().substring(start - getBoundary().start(), end - getBoundary().start());
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
default int length() {
|
||||
|
||||
return getBoundary().length();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
default char charAt(int index) {
|
||||
|
||||
return getSearchText().charAt(index - getBoundary().start());
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,50 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.Set;
|
||||
import java.util.function.BiConsumer;
|
||||
import java.util.function.BinaryOperator;
|
||||
import java.util.function.Function;
|
||||
import java.util.function.Supplier;
|
||||
import java.util.stream.Collector;
|
||||
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
@NoArgsConstructor
|
||||
public class TextBlockCollector implements Collector<TextBlock, ConcatenatedTextBlock, TextBlock> {
|
||||
|
||||
@Override
|
||||
public Supplier<ConcatenatedTextBlock> supplier() {
|
||||
|
||||
return () -> new ConcatenatedTextBlock(Collections.emptyList());
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public BiConsumer<ConcatenatedTextBlock, TextBlock> accumulator() {
|
||||
|
||||
return ConcatenatedTextBlock::concat;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public BinaryOperator<ConcatenatedTextBlock> combiner() {
|
||||
|
||||
return ConcatenatedTextBlock::concat;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Function<ConcatenatedTextBlock, TextBlock> finisher() {
|
||||
|
||||
return a -> a;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Set<Characteristics> characteristics() {
|
||||
|
||||
return Set.of(Characteristics.IDENTITY_FINISH, Characteristics.CONCURRENT);
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,143 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.mapper;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.AtomicPositionBlockData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.AtomicTextBlockData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.DocumentData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.PageData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.TableOfContentsData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.DocumentGraph;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.ImageNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.TableCellNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.TableNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.AtomicTextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
|
||||
|
||||
public class DocumentDataMapper {
|
||||
|
||||
public DocumentData toDocumentData(DocumentGraph documentGraph) {
|
||||
|
||||
List<AtomicTextBlockData> atomicTextBlockData = documentGraph.streamTerminalTextBlocksInOrder()
|
||||
.flatMap(textBlock -> textBlock.getAtomicTextBlocks().stream())
|
||||
.distinct()
|
||||
.map(this::toAtomicTextBlockData)
|
||||
.toList();
|
||||
|
||||
List<AtomicPositionBlockData> atomicPositionBlockData = documentGraph.streamTerminalTextBlocksInOrder()
|
||||
.flatMap(textBlock -> textBlock.getAtomicTextBlocks().stream())
|
||||
.distinct()
|
||||
.map(this::toAtomicPositionBlockData)
|
||||
.toList();
|
||||
|
||||
List<PageData> pageData = documentGraph.getPages().stream().map(this::toPageData).toList();
|
||||
TableOfContentsData tableOfContentsData = toTableOfContentsData(documentGraph.getTableOfContents());
|
||||
return DocumentData.builder()
|
||||
.atomicTextBlocks(atomicTextBlockData.toArray(new AtomicTextBlockData[0]))
|
||||
.atomicPositionBlocks(atomicPositionBlockData.toArray(new AtomicPositionBlockData[0]))
|
||||
.pages(pageData.toArray(new PageData[0]))
|
||||
.tableOfContents(tableOfContentsData)
|
||||
.build();
|
||||
}
|
||||
|
||||
|
||||
private TableOfContentsData toTableOfContentsData(TableOfContents tableOfContents) {
|
||||
|
||||
return new TableOfContentsData(tableOfContents.getEntries().stream().map(this::toEntryData).toList());
|
||||
}
|
||||
|
||||
|
||||
private TableOfContentsData.EntryData toEntryData(TableOfContents.Entry entry) {
|
||||
|
||||
Long[] atomicTextBlocks;
|
||||
|
||||
if (entry.node().isTerminal()) {
|
||||
atomicTextBlocks = toAtomicTextBlockIds(entry.node().getTerminalTextBlock());
|
||||
} else {
|
||||
atomicTextBlocks = new Long[]{};
|
||||
}
|
||||
|
||||
Map<String, String> properties = switch (entry.type()) {
|
||||
case TABLE -> PropertiesMapper.buildTableProperties((TableNode) entry.node());
|
||||
case TABLE_CELL -> PropertiesMapper.buildTableCellProperties((TableCellNode) entry.node());
|
||||
case IMAGE -> PropertiesMapper.buildImageProperties((ImageNode) entry.node());
|
||||
default -> new HashMap<>();
|
||||
};
|
||||
|
||||
return TableOfContentsData.EntryData.builder()
|
||||
.tocId(toPrimitiveIntArray(entry.tocId()))
|
||||
.subEntries(entry.children().stream().map(this::toEntryData).toList())
|
||||
.type(entry.type())
|
||||
.atomicBlocks(atomicTextBlocks)
|
||||
.pages(entry.node().getPages().stream().map(PageNode::getNumber).map(Integer::longValue).toArray(Long[]::new))
|
||||
.properties(properties)
|
||||
.build();
|
||||
}
|
||||
|
||||
|
||||
private Long[] toAtomicTextBlockIds(TextBlock textBlock) {
|
||||
|
||||
return textBlock.getAtomicTextBlocks().stream().map(AtomicTextBlock::getId).toArray(Long[]::new);
|
||||
}
|
||||
|
||||
|
||||
private PageData toPageData(PageNode p) {
|
||||
|
||||
return PageData.builder().rotation(p.getRotation()).height(p.getHeight()).width(p.getWidth()).number(p.getNumber()).build();
|
||||
}
|
||||
|
||||
|
||||
private AtomicTextBlockData toAtomicTextBlockData(AtomicTextBlock atomicTextBlock) {
|
||||
|
||||
return AtomicTextBlockData.builder()
|
||||
.id(atomicTextBlock.getId())
|
||||
.page(atomicTextBlock.getPage().getNumber().longValue())
|
||||
.searchText(atomicTextBlock.getSearchText())
|
||||
.numberOnPage(atomicTextBlock.getNumberOnPage())
|
||||
.start(atomicTextBlock.getBoundary().start())
|
||||
.end(atomicTextBlock.getBoundary().end())
|
||||
.lineBreaks(toPrimitiveIntArray(atomicTextBlock.getLineBreaks()))
|
||||
.build();
|
||||
}
|
||||
|
||||
|
||||
private AtomicPositionBlockData toAtomicPositionBlockData(AtomicTextBlock atomicTextBlock) {
|
||||
|
||||
return AtomicPositionBlockData.builder()
|
||||
.id(atomicTextBlock.getId())
|
||||
.positions(toPrimitiveFloatMatrix(atomicTextBlock.getPositions()))
|
||||
.stringIdxToPositionIdx(toPrimitiveIntArray(atomicTextBlock.getStringIdxToPositionIdx()))
|
||||
.build();
|
||||
}
|
||||
|
||||
|
||||
private float[][] toPrimitiveFloatMatrix(List<Rectangle2D> positions) {
|
||||
|
||||
float[][] positionMatrix = new float[positions.size()][];
|
||||
for (int i = 0; i < positions.size(); i++) {
|
||||
float[] singlePositions = new float[4];
|
||||
singlePositions[0] = (float) positions.get(i).getMinX();
|
||||
singlePositions[1] = (float) positions.get(i).getMinY();
|
||||
singlePositions[2] = (float) positions.get(i).getWidth();
|
||||
singlePositions[3] = (float) positions.get(i).getHeight();
|
||||
positionMatrix[i] = singlePositions;
|
||||
}
|
||||
return positionMatrix;
|
||||
}
|
||||
|
||||
|
||||
private int[] toPrimitiveIntArray(List<Integer> list) {
|
||||
|
||||
int[] array = new int[list.size()];
|
||||
for (int i = 0; i < list.size(); i++) {
|
||||
array[i] = list.get(i);
|
||||
}
|
||||
return array;
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,225 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.mapper;
|
||||
|
||||
import static com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.NodeType.FOOTER;
|
||||
import static com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.NodeType.HEADER;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashSet;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.NoSuchElementException;
|
||||
|
||||
import com.google.common.primitives.Ints;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.AtomicPositionBlockData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.AtomicTextBlockData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.DocumentData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.PageData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.TableOfContentsData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.Boundary;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.DocumentGraph;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.FooterNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.HeaderNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.HeadlineNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.ImageNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.ParagraphNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SectionNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SemanticNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.TableCellNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.TableNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.AtomicTextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlockCollector;
|
||||
|
||||
public class DocumentGraphMapper {
|
||||
|
||||
public DocumentGraph toDocumentGraph(DocumentData documentData) {
|
||||
|
||||
Context context = new Context(documentData,
|
||||
new TableOfContents(),
|
||||
new LinkedList<>(),
|
||||
new LinkedList<>(),
|
||||
Arrays.stream(documentData.getAtomicTextBlocks()).toList(),
|
||||
Arrays.stream(documentData.getAtomicPositionBlocks()).toList());
|
||||
|
||||
context.pages.addAll(Arrays.stream(documentData.getPages()).map(this::buildPage).toList());
|
||||
|
||||
context.tableOfContents.setEntries(buildEntries(documentData.getTableOfContents().getEntries(), context));
|
||||
|
||||
DocumentGraph documentGraph = DocumentGraph.builder()
|
||||
.numberOfPages(documentData.getPages().length)
|
||||
.pages(new HashSet<>(context.pages))
|
||||
.tableOfContents(context.tableOfContents)
|
||||
.build();
|
||||
documentGraph.setTextBlock(documentGraph.buildTextBlock());
|
||||
return documentGraph;
|
||||
}
|
||||
|
||||
|
||||
private List<TableOfContents.Entry> buildEntries(List<TableOfContentsData.EntryData> entries,
|
||||
Context context) {
|
||||
|
||||
List<TableOfContents.Entry> newEntries = new LinkedList<>();
|
||||
for (TableOfContentsData.EntryData entryData : entries) {
|
||||
|
||||
boolean terminal = isTerminal(entryData);
|
||||
List<PageNode> pages = Arrays.stream(entryData.pages()).map(pageNumber -> getPage(pageNumber, context)).toList();
|
||||
|
||||
SemanticNode node = switch (entryData.type()) {
|
||||
case SECTION -> buildSection(context);
|
||||
case PARAGRAPH -> buildParagraph(context, terminal);
|
||||
case HEADLINE -> buildHeadline(context, terminal);
|
||||
case HEADER -> buildHeader(context, terminal);
|
||||
case FOOTER -> buildFooter(context, terminal);
|
||||
case TABLE -> buildTable(context, entryData.properties());
|
||||
case TABLE_CELL -> buildTableCell(context, entryData.properties(), terminal);
|
||||
case IMAGE -> buildImage(context, entryData.properties());
|
||||
default -> throw new UnsupportedOperationException("Not yet implemented for type " + entryData.type());
|
||||
};
|
||||
|
||||
if (node.isTerminal()) {
|
||||
TextBlock textBlock = toTextBlock(entryData.atomicBlocks(), context, node);
|
||||
node.setTerminalTextBlock(textBlock);
|
||||
}
|
||||
List<Integer> tocId = Arrays.stream(entryData.tocId()).boxed().toList();
|
||||
node.setTocId(tocId);
|
||||
|
||||
if (entryData.type() == HEADER) {
|
||||
pages.forEach(page -> page.setHeader((HeaderNode) node));
|
||||
} else if (entryData.type() == FOOTER) {
|
||||
pages.forEach(page -> page.setFooter((FooterNode) node));
|
||||
} else {
|
||||
pages.forEach(page -> page.getMainBody().add(node));
|
||||
}
|
||||
newEntries.add(TableOfContents.Entry.builder().tocId(tocId).type(entryData.type()).children(buildEntries(entryData.subEntries(), context)).node(node).build());
|
||||
}
|
||||
return newEntries;
|
||||
}
|
||||
|
||||
|
||||
private HeadlineNode buildHeadline(Context context, boolean terminal) {
|
||||
|
||||
return HeadlineNode.builder().terminal(terminal).tableOfContents(context.tableOfContents()).build();
|
||||
}
|
||||
|
||||
|
||||
private static boolean isTerminal(TableOfContentsData.EntryData entryData) {
|
||||
|
||||
return entryData.atomicBlocks().length > 0;
|
||||
}
|
||||
|
||||
|
||||
private ImageNode buildImage(Context context, Map<String, String> properties) {
|
||||
|
||||
var builder = ImageNode.builder();
|
||||
PropertiesMapper.parseImageProperties(properties, builder);
|
||||
return builder.tableOfContents(context.tableOfContents()).build();
|
||||
}
|
||||
|
||||
|
||||
private TableCellNode buildTableCell(Context context, Map<String, String> properties, boolean terminal) {
|
||||
|
||||
TableCellNode.TableCellNodeBuilder builder = TableCellNode.builder();
|
||||
PropertiesMapper.parseTableCellProperties(properties, builder);
|
||||
return builder.terminal(terminal).tableOfContents(context.tableOfContents()).build();
|
||||
}
|
||||
|
||||
|
||||
private TableNode buildTable(Context context, Map<String, String> properties) {
|
||||
|
||||
TableNode.TableNodeBuilder builder = TableNode.builder();
|
||||
PropertiesMapper.parseTableProperties(properties, builder);
|
||||
return TableNode.builder().tableOfContents(context.tableOfContents()).build();
|
||||
}
|
||||
|
||||
|
||||
private FooterNode buildFooter(Context context, boolean terminal) {
|
||||
|
||||
return FooterNode.builder().terminal(terminal).tableOfContents(context.tableOfContents()).build();
|
||||
}
|
||||
|
||||
|
||||
private HeaderNode buildHeader(Context context, boolean terminal) {
|
||||
|
||||
return HeaderNode.builder().terminal(terminal).tableOfContents(context.tableOfContents()).build();
|
||||
}
|
||||
|
||||
|
||||
private SectionNode buildSection(Context context) {
|
||||
|
||||
return SectionNode.builder().tableOfContents(context.tableOfContents()).build();
|
||||
|
||||
}
|
||||
|
||||
|
||||
private ParagraphNode buildParagraph(Context context, boolean terminal) {
|
||||
|
||||
return ParagraphNode.builder().terminal(terminal).tableOfContents(context.tableOfContents()).build();
|
||||
}
|
||||
|
||||
|
||||
private TextBlock toTextBlock(Long[] atomicTextBlockIds, Context context, SemanticNode parent) {
|
||||
|
||||
return Arrays.stream(atomicTextBlockIds)
|
||||
.map(atomicTextBlockId -> toAtomicTextBlock(context.atomicTextBlockData.get(Math.toIntExact(atomicTextBlockId)),
|
||||
context.atomicPositionBlockData.get(Math.toIntExact(atomicTextBlockId)),
|
||||
parent,
|
||||
context))
|
||||
.collect(new TextBlockCollector());
|
||||
}
|
||||
|
||||
|
||||
private PageNode buildPage(PageData p) {
|
||||
|
||||
return PageNode.builder().rotation(p.getRotation()).height(p.getHeight()).width(p.getWidth()).number(p.getNumber()).mainBody(new LinkedList<>()).build();
|
||||
}
|
||||
|
||||
|
||||
private AtomicTextBlock toAtomicTextBlock(AtomicTextBlockData atomicTextBlockData,
|
||||
AtomicPositionBlockData atomicPositionBlockData,
|
||||
SemanticNode parent,
|
||||
Context context) {
|
||||
|
||||
return AtomicTextBlock.builder()
|
||||
.id(atomicTextBlockData.getId())
|
||||
.numberOnPage(atomicTextBlockData.getNumberOnPage())
|
||||
.page(getPage(atomicTextBlockData.getPage(), context))
|
||||
.boundary(new Boundary(atomicTextBlockData.getStart(), atomicTextBlockData.getEnd()))
|
||||
.searchText(atomicTextBlockData.getSearchText())
|
||||
.lineBreaks(Ints.asList(atomicTextBlockData.getLineBreaks()))
|
||||
.stringIdxToPositionIdx(Ints.asList(atomicPositionBlockData.getStringIdxToPositionIdx()))
|
||||
.positions(toRectangle2DList(atomicPositionBlockData.getPositions()))
|
||||
.parent(parent)
|
||||
.build();
|
||||
}
|
||||
|
||||
|
||||
private static List<Rectangle2D> toRectangle2DList(float[][] positions) {
|
||||
|
||||
return Arrays.stream(positions).map(floatArr -> (Rectangle2D) new Rectangle2D.Float(floatArr[0], floatArr[1], floatArr[2], floatArr[3])).toList();
|
||||
}
|
||||
|
||||
|
||||
private PageNode getPage(Long pageIndex, Context context) {
|
||||
|
||||
return context.pages.stream()
|
||||
.filter(page -> page.getNumber() == Math.toIntExact(pageIndex))
|
||||
.findFirst()
|
||||
.orElseThrow(() -> new NoSuchElementException(String.format("ClassificationPage with number %d not found", pageIndex)));
|
||||
}
|
||||
|
||||
|
||||
record Context(
|
||||
DocumentData layoutParsingModel,
|
||||
TableOfContents tableOfContents,
|
||||
List<PageNode> pages,
|
||||
List<SectionNode> sections,
|
||||
List<AtomicTextBlockData> atomicTextBlockData,
|
||||
List<AtomicPositionBlockData> atomicPositionBlockData) {
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,101 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.mapper;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.ImageNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.ImageType;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.TableCellNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.TableNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.services.RectangleTransformations;
|
||||
|
||||
public class PropertiesMapper {
|
||||
|
||||
public static Map<String, String> buildImageProperties(ImageNode image) {
|
||||
|
||||
Map<String, String> properties = new HashMap<>();
|
||||
properties.put("imageType", image.getImageType().toString());
|
||||
properties.put("transparency", String.valueOf(image.isTransparency()));
|
||||
properties.put("position", RectangleTransformations.toString(image.getPosition()));
|
||||
return properties;
|
||||
}
|
||||
|
||||
|
||||
public static Map<String, String> buildTableCellProperties(TableCellNode tableCell) {
|
||||
|
||||
Map<String, String> properties = new HashMap<>();
|
||||
properties.put("row", String.valueOf(tableCell.getRow()));
|
||||
properties.put("col", String.valueOf(tableCell.getCol()));
|
||||
properties.put("header", String.valueOf(tableCell.isHeader()));
|
||||
|
||||
if (tableCell.getPages().size() > 1 || tableCell.getBBox().keySet().size() > 1) {
|
||||
throw new IllegalArgumentException("TableCell can only occur on a single page!");
|
||||
}
|
||||
String bBoxString = RectangleTransformations.toString(tableCell.getBBox().get(tableCell.getPages().stream().findFirst().get()));
|
||||
properties.put("bBox", bBoxString);
|
||||
|
||||
return properties;
|
||||
}
|
||||
|
||||
|
||||
public static Map<String, String> buildTableProperties(TableNode table) {
|
||||
|
||||
Map<String, String> properties = new HashMap<>();
|
||||
properties.put("numberOfRows", String.valueOf(table.getNumberOfRows()));
|
||||
properties.put("numberOfCols", String.valueOf(table.getNumberOfCols()));
|
||||
return properties;
|
||||
}
|
||||
|
||||
|
||||
public static void parseImageProperties(Map<String, String> properties, ImageNode.ImageNodeBuilder builder) {
|
||||
|
||||
builder.imageType(parseImageType(properties.get("imageType")));
|
||||
builder.transparency(Boolean.parseBoolean(properties.get("transparency")));
|
||||
builder.position(parseRectangle2D(properties.get("position")));
|
||||
}
|
||||
|
||||
|
||||
public static void parseTableCellProperties(Map<String, String> properties, TableCellNode.TableCellNodeBuilder builder) {
|
||||
|
||||
builder.row(Integer.parseInt(properties.get("row")));
|
||||
builder.col(Integer.parseInt(properties.get("col")));
|
||||
builder.header(Boolean.parseBoolean(properties.get("header")));
|
||||
builder.bBox(parseRectangle2D(properties.get("bBox")));
|
||||
}
|
||||
|
||||
|
||||
public static void parseTableProperties(Map<String, String> properties, TableNode.TableNodeBuilder builder) {
|
||||
|
||||
builder.numberOfRows(Integer.parseInt(properties.get("numberOfRows")));
|
||||
builder.numberOfCols(Integer.parseInt(properties.get("numberOfCols")));
|
||||
}
|
||||
|
||||
|
||||
private static ImageType parseImageType(String imageType) {
|
||||
|
||||
return switch (imageType) {
|
||||
case "LOGO" -> ImageType.LOGO;
|
||||
case "FORMULA" -> ImageType.FORMULA;
|
||||
case "SIGNATURE" -> ImageType.SIGNATURE;
|
||||
case "OCR" -> ImageType.OCR;
|
||||
default -> ImageType.OTHER;
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
public static String toString(Rectangle2D rectangle2D) {
|
||||
|
||||
return String.format("%f,%f,%f,%f", rectangle2D.getX(), rectangle2D.getY(), rectangle2D.getWidth(), rectangle2D.getHeight());
|
||||
}
|
||||
|
||||
|
||||
public static Rectangle2D parseRectangle2D(String bBox) {
|
||||
|
||||
List<Float> floats = Arrays.stream(bBox.split(",")).map(Float::parseFloat).toList();
|
||||
return new Rectangle2D.Float(floats.get(0), floats.get(1), floats.get(2), floats.get(3));
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,10 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.services;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
|
||||
|
||||
/**
 * Strategy interface for attaching additional, derived information to an
 * {@link EntityNode} based on the text block that contains it.
 */
public interface EntityEnrichmentService {


    /**
     * Enriches the given entity using the surrounding text block
     * (e.g. the deepest node's text, or the whole document's text as a fallback —
     * see the callers for the exact block passed in).
     */
    void enrichEntity(EntityNode entity, TextBlock textBlock);

}
|
||||
@ -0,0 +1,65 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.services;
|
||||
|
||||
import java.util.NoSuchElementException;
|
||||
import java.util.Set;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.Boundary;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SemanticNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
|
||||
@RequiredArgsConstructor
|
||||
public class EntityInsertionService {
|
||||
|
||||
private final EntityEnrichmentService entityEnrichmentService;
|
||||
|
||||
|
||||
public void addEntityToGraph(EntityNode entity, TableOfContents tableOfContents) {
|
||||
|
||||
try {
|
||||
SemanticNode containingNode = tableOfContents.getEntries()
|
||||
.stream()
|
||||
.map(TableOfContents.Entry::node)
|
||||
.filter(node -> node.buildTextBlock().containsBoundary(entity.getBoundary()))
|
||||
.findFirst()
|
||||
.orElseThrow(() -> new NoSuchElementException("No containing Node found!"));
|
||||
|
||||
containingNode.addThisToEntityIfIntersects(entity);
|
||||
|
||||
TextBlock textBlock = entity.getDeepestFullyContainingNode().buildTextBlock();
|
||||
entityEnrichmentService.enrichEntity(entity, textBlock);
|
||||
|
||||
addToPages(entity);
|
||||
addToNodeEntitySets(entity);
|
||||
|
||||
} catch (NoSuchElementException e) {
|
||||
entityEnrichmentService.enrichEntity(entity, tableOfContents.buildTextBlock());
|
||||
entity.removeFromGraph();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void addToPages(EntityNode entity) {
|
||||
|
||||
Set<PageNode> pages = entity.getDeepestFullyContainingNode().getPages();
|
||||
entity.getPages().addAll(pages);
|
||||
pages.forEach(page -> page.getEntities().add(entity));
|
||||
}
|
||||
|
||||
|
||||
private void addToNodeEntitySets(EntityNode entity) {
|
||||
|
||||
entity.getIntersectingNodes().forEach(node -> node.getEntities().add(entity));
|
||||
}
|
||||
|
||||
|
||||
private static Boundary toLineAfterBoundary(TextBlock textBlock, Boundary boundary) {
|
||||
|
||||
return new Boundary(boundary.end(), textBlock.getNextLinebreak(boundary.end()));
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,95 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.services;
|
||||
|
||||
import static java.lang.String.format;
|
||||
|
||||
import java.awt.geom.Area;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.function.BiConsumer;
|
||||
import java.util.function.BinaryOperator;
|
||||
import java.util.function.Function;
|
||||
import java.util.function.Supplier;
|
||||
import java.util.stream.Collector;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.AtomicTextBlock;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class RectangleTransformations {
|
||||
|
||||
public static Rectangle2D pad(Rectangle2D rectangle2D, int deltaX, int deltaY) {
|
||||
|
||||
return new Rectangle2D.Double(rectangle2D.getMinX() - deltaX, rectangle2D.getMinY() - deltaY, rectangle2D.getWidth() + 2 * deltaX, rectangle2D.getHeight() + 2 * deltaY);
|
||||
}
|
||||
|
||||
|
||||
public static Rectangle2D bBoxUnionAtomicTextBlock(List<AtomicTextBlock> atomicTextBlocks) {
|
||||
|
||||
return atomicTextBlocks.stream().flatMap(atomicTextBlock -> atomicTextBlock.getPositions().stream()).collect(new Rectangle2DUnion());
|
||||
}
|
||||
|
||||
|
||||
public static Rectangle2D rectangleUnion(List<Rectangle2D> rectangle2DList) {
|
||||
|
||||
return rectangle2DList.stream().collect(new Rectangle2DUnion());
|
||||
}
|
||||
|
||||
|
||||
public static String toString(Rectangle2D rectangle2D) {
|
||||
|
||||
return format("%f,%f,%f,%f", rectangle2D.getX(), rectangle2D.getY(), rectangle2D.getWidth(), rectangle2D.getHeight());
|
||||
}
|
||||
|
||||
|
||||
public static Rectangle2D parseRectangle2D(String bBox) {
|
||||
|
||||
List<Float> floats = Arrays.stream(bBox.split(",")).map(Float::parseFloat).toList();
|
||||
return new Rectangle2D.Float(floats.get(0), floats.get(1), floats.get(2), floats.get(3));
|
||||
}
|
||||
|
||||
|
||||
private static class Rectangle2DUnion implements Collector<Rectangle2D, Area, Rectangle2D> {
|
||||
|
||||
@Override
|
||||
public Supplier<Area> supplier() {
|
||||
|
||||
return Area::new;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public BiConsumer<Area, Rectangle2D> accumulator() {
|
||||
|
||||
return (area, rectangle2D) -> area.add(new Area(rectangle2D));
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public BinaryOperator<Area> combiner() {
|
||||
|
||||
return (area1, area2) -> {
|
||||
area1.add(area2);
|
||||
return area1;
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Function<Area, Rectangle2D> finisher() {
|
||||
|
||||
return Area::getBounds2D;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Set<Characteristics> characteristics() {
|
||||
|
||||
return Set.of(Characteristics.CONCURRENT, Characteristics.UNORDERED);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
161
layoutparser-service/layoutparser-service-processor/pom.xml
Normal file
161
layoutparser-service/layoutparser-service-processor/pom.xml
Normal file
@ -0,0 +1,161 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
<parent>
|
||||
<groupId>com.knecon.fforesight</groupId>
|
||||
<artifactId>layoutparser-service</artifactId>
|
||||
<version>1.0.0</version>
|
||||
</parent>
|
||||
|
||||
<artifactId>layoutparser-service-processor</artifactId>
|
||||
<version>1.0.0</version>
|
||||
|
||||
<dependencies>
|
||||
|
||||
<dependency>
|
||||
<groupId>com.iqser.red.service</groupId>
|
||||
<artifactId>persistence-service-internal-api-v1</artifactId>
|
||||
<version>2.36.0</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>com.knecon.fforesight</groupId>
|
||||
<artifactId>layoutparser-service-internal-api</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>com.iqser.red.commons</groupId>
|
||||
<artifactId>spring-commons</artifactId>
|
||||
<version>6.2.0</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>com.iqser.red.commons</groupId>
|
||||
<artifactId>storage-commons</artifactId>
|
||||
<version>1.13.0</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>com.dslplatform</groupId>
|
||||
<artifactId>dsl-json-java8</artifactId>
|
||||
<version>1.10.0</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.pdfbox</groupId>
|
||||
<artifactId>pdfbox</artifactId>
|
||||
<version>${pdfbox.version}</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.pdfbox</groupId>
|
||||
<artifactId>pdfbox-tools</artifactId>
|
||||
<version>${pdfbox.version}</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>com.google.guava</groupId>
|
||||
<artifactId>guava</artifactId>
|
||||
<version>31.1-jre</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>com.fasterxml.jackson.module</groupId>
|
||||
<artifactId>jackson-module-afterburner</artifactId>
|
||||
<version>${jackson.version}</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>com.fasterxml.jackson.datatype</groupId>
|
||||
<artifactId>jackson-datatype-jsr310</artifactId>
|
||||
<version>${jackson.version}</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.springframework.boot</groupId>
|
||||
<artifactId>spring-boot-starter-security</artifactId>
|
||||
<version>${spring.version}</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.springframework.boot</groupId>
|
||||
<artifactId>spring-boot-starter-web</artifactId>
|
||||
<version>${spring.version}</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.projectlombok</groupId>
|
||||
<artifactId>lombok</artifactId>
|
||||
<optional>true</optional>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.springframework.cloud</groupId>
|
||||
<artifactId>spring-cloud-starter-openfeign</artifactId>
|
||||
<version>4.0.2</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.springframework.boot</groupId>
|
||||
<artifactId>spring-boot-starter-amqp</artifactId>
|
||||
<version>${spring.version}</version>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
|
||||
<build>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>org.springframework.boot</groupId>
|
||||
<artifactId>spring-boot-maven-plugin</artifactId>
|
||||
<configuration>
|
||||
<excludes>
|
||||
<exclude>
|
||||
<groupId>org.projectlombok</groupId>
|
||||
<artifactId>lombok</artifactId>
|
||||
</exclude>
|
||||
</excludes>
|
||||
</configuration>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
<repositories>
|
||||
<repository>
|
||||
<id>spring-milestones</id>
|
||||
<name>Spring Milestones</name>
|
||||
<url>https://repo.spring.io/milestone</url>
|
||||
<snapshots>
|
||||
<enabled>false</enabled>
|
||||
</snapshots>
|
||||
</repository>
|
||||
<repository>
|
||||
<id>spring-snapshots</id>
|
||||
<name>Spring Snapshots</name>
|
||||
<url>https://repo.spring.io/snapshot</url>
|
||||
<releases>
|
||||
<enabled>false</enabled>
|
||||
</releases>
|
||||
</repository>
|
||||
</repositories>
|
||||
<pluginRepositories>
|
||||
<pluginRepository>
|
||||
<id>spring-milestones</id>
|
||||
<name>Spring Milestones</name>
|
||||
<url>https://repo.spring.io/milestone</url>
|
||||
<snapshots>
|
||||
<enabled>false</enabled>
|
||||
</snapshots>
|
||||
</pluginRepository>
|
||||
<pluginRepository>
|
||||
<id>spring-snapshots</id>
|
||||
<name>Spring Snapshots</name>
|
||||
<url>https://repo.spring.io/snapshot</url>
|
||||
<releases>
|
||||
<enabled>false</enabled>
|
||||
</releases>
|
||||
</pluginRepository>
|
||||
</pluginRepositories>
|
||||
|
||||
</project>
|
||||
@ -0,0 +1,114 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor;
|
||||
|
||||
import static java.lang.String.format;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.DocumentGraph;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.mapper.DocumentDataMapper;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.adapter.CvTableParsingAdapter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.adapter.ImageServiceResponseAdapter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.image.ImageServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.service.ClassificationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.service.PdfParsingService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.service.SectionsBuilderService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.factory.DocumentGraphFactory;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.queue.LayoutParsingFinishedEvent;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.queue.LayoutParsingRequest;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
public class LayoutParsingService {
|
||||
|
||||
private final ImageServiceResponseAdapter imageServiceResponseAdapter;
|
||||
private final CvTableParsingAdapter cvTableParsingAdapter;
|
||||
private final LayoutParsingStorageService layoutParsingStorageService;
|
||||
private final PdfParsingService pdfParsingService;
|
||||
private final ClassificationService classificationService;
|
||||
private final SectionsBuilderService sectionsBuilderService;
|
||||
private final DocumentGraphFactory documentGraphFactory;
|
||||
private final DocumentDataMapper documentDataMapper;
|
||||
|
||||
|
||||
public LayoutParsingFinishedEvent parseLayoutAndSaveFilesToStorage(LayoutParsingRequest layoutParsingRequest) {
|
||||
|
||||
PDDocument originDocument;
|
||||
try {
|
||||
originDocument = layoutParsingStorageService.getOriginFile(layoutParsingRequest.getOriginFileStorageId());
|
||||
} catch (IOException e) {
|
||||
log.error(e.toString());
|
||||
return LayoutParsingFinishedEvent.builder()
|
||||
.status(400)
|
||||
.message(format("Origin PDF File with id %s could not be loaded!", layoutParsingRequest.getPageFileStorageId()))
|
||||
.build();
|
||||
}
|
||||
|
||||
ImageServiceResponse imageServiceResponse = new ImageServiceResponse();
|
||||
if (layoutParsingRequest.getImagesFileStorageId().isPresent()) {
|
||||
try {
|
||||
imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.getPageFileStorageId());
|
||||
} catch (IOException e) {
|
||||
log.error(e.toString());
|
||||
return LayoutParsingFinishedEvent.builder()
|
||||
.status(400)
|
||||
.message(format("Image Service File with id %s could not be loaded!", layoutParsingRequest.getImagesFileStorageId()))
|
||||
.build();
|
||||
}
|
||||
}
|
||||
|
||||
TableServiceResponse tableServiceResponse = new TableServiceResponse();
|
||||
if (layoutParsingRequest.getTablesFileStorageId().isPresent()) {
|
||||
try {
|
||||
tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.getPageFileStorageId());
|
||||
} catch (IOException e) {
|
||||
log.error(e.toString());
|
||||
return LayoutParsingFinishedEvent.builder()
|
||||
.status(400)
|
||||
.message(format("CV Table Parsing File with id %s could not be loaded!", layoutParsingRequest.getPageFileStorageId()))
|
||||
.build();
|
||||
}
|
||||
}
|
||||
|
||||
DocumentGraph documentGraph = parseLayout(originDocument, imageServiceResponse, tableServiceResponse);
|
||||
|
||||
try {
|
||||
layoutParsingStorageService.storeDocumentData(layoutParsingRequest, documentDataMapper.toDocumentData(documentGraph));
|
||||
} catch (IOException e) {
|
||||
log.error("Parsed Document files could not be saved!");
|
||||
log.error(e.getMessage());
|
||||
return LayoutParsingFinishedEvent.builder().status(500).message("Files could not be saved").build();
|
||||
}
|
||||
return LayoutParsingFinishedEvent.builder()
|
||||
.status(200)
|
||||
.message(format("Layout parsing is finished and files have been saved with Ids:\n Structure: %s\nText: %s\nPositions: %s\nPageData: %s",
|
||||
layoutParsingRequest.getStructureFileStorageId(),
|
||||
layoutParsingRequest.getTextBlockFileStorageId(),
|
||||
layoutParsingRequest.getPositionBlockFileStorageId(),
|
||||
layoutParsingRequest.getPageFileStorageId()))
|
||||
.build();
|
||||
}
|
||||
|
||||
|
||||
public DocumentGraph parseLayout(PDDocument originDocument, ImageServiceResponse imageServiceResponse, TableServiceResponse tableServiceResponse) {
|
||||
|
||||
ClassificationDocument classificationDocument = pdfParsingService.parseDocument(originDocument,
|
||||
cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse),
|
||||
imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse));
|
||||
|
||||
classificationService.classifyDocument(classificationDocument);
|
||||
|
||||
sectionsBuilderService.buildSections(classificationDocument);
|
||||
|
||||
return documentGraphFactory.buildDocumentGraph(classificationDocument);
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,126 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.nio.file.Files;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.io.MemoryUsageSetting;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.iqser.red.storage.commons.service.StorageService;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.AtomicPositionBlockData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.AtomicTextBlockData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.DocumentData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.PageData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.TableOfContentsData;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.image.ImageServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.multitenancy.TenantContext;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.queue.LayoutParsingRequest;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
/**
 * Storage gateway for layout parsing: loads the origin PDF and the image/table
 * service result files from tenant storage, and stores/reads the parsed document
 * data (structure, text blocks, position blocks, pages) as JSON objects.
 */
@Slf4j
@Service
@RequiredArgsConstructor
public class LayoutParsingStorageService {

    private final StorageService storageService;
    private final ObjectMapper objectMapper;


    /**
     * Downloads the origin PDF of the current tenant into a local temp file and opens
     * it with PDFBox.
     *
     * @throws IOException if the download or the PDF load fails
     */
    public PDDocument getOriginFile(String storageId) throws IOException {

        try (var originDocumentInputStream = storageService.getObject(TenantContext.getTenantId(), storageId).getInputStream()) {
            File tempFile = createTempFile("document", ".pdf");
            try (var tempFileOutputStream = new FileOutputStream(tempFile)) {
                IOUtils.copy(originDocumentInputStream, tempFileOutputStream);
            }
            // 67108864 bytes = 64 MiB main-memory budget; setupMixed spills beyond that
            // to disk, per PDFBox MemoryUsageSetting.
            // NOTE(review): tempFile is never deleted — consider cleanup once the
            // returned document is closed.
            return Loader.loadPDF(tempFile, MemoryUsageSetting.setupMixed(67108864L));
        }
    }


    /** Loads and deserialises the image service response for the current tenant. */
    public ImageServiceResponse getImagesFile(String storageId) throws IOException {

        try (InputStream inputStream = storageService.getObject(TenantContext.getTenantId(), storageId).getInputStream()) {

            return objectMapper.readValue(inputStream, ImageServiceResponse.class);
        }
    }


    /** Loads and deserialises the CV table parsing response for the current tenant. */
    public TableServiceResponse getTablesFile(String storageId) throws IOException {

        try (var tableClassificationStream = storageService.getObject(TenantContext.getTenantId(), storageId).getInputStream()) {

            return objectMapper.readValue(tableClassificationStream, TableServiceResponse.class);

        }
    }


    /**
     * Stores the four parts of the parsed document under the ids given in the request:
     * table of contents, text blocks, position blocks and pages.
     */
    public void storeDocumentData(LayoutParsingRequest layoutParsingRequest, DocumentData documentData) throws IOException {

        storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.getStructureFileStorageId(), documentData.getTableOfContents());
        storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.getTextBlockFileStorageId(), documentData.getAtomicTextBlocks());
        storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.getPositionBlockFileStorageId(), documentData.getAtomicPositionBlocks());
        storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.getPageFileStorageId(), documentData.getPages());

    }


    /**
     * Inverse of {@link #storeDocumentData}: reads the four parts back from storage and
     * reassembles them into a DocumentData.
     */
    public DocumentData readDocumentData(LayoutParsingRequest layoutParsingRequest) throws IOException {

        PageData[] pageData = storageService.readJSONObject(TenantContext.getTenantId(), layoutParsingRequest.getPageFileStorageId(), PageData[].class);
        AtomicTextBlockData[] atomicTextBlockData = storageService.readJSONObject(TenantContext.getTenantId(),
                layoutParsingRequest.getTextBlockFileStorageId(),
                AtomicTextBlockData[].class);
        AtomicPositionBlockData[] atomicPositionBlockData = storageService.readJSONObject(TenantContext.getTenantId(),
                layoutParsingRequest.getPositionBlockFileStorageId(),
                AtomicPositionBlockData[].class);
        TableOfContentsData tableOfContentsData = storageService.readJSONObject(TenantContext.getTenantId(),
                layoutParsingRequest.getStructureFileStorageId(),
                TableOfContentsData.class);

        return DocumentData.builder()
                .tableOfContents(tableOfContentsData)
                .atomicPositionBlocks(atomicPositionBlockData)
                .atomicTextBlocks(atomicTextBlockData)
                .pages(pageData)
                .build();
    }


    // Creates a temp file with owner-only read/write permissions (hardening against
    // other local users reading the downloaded document).
    private File createTempFile(String filenamePrefix, String filenameSuffix) throws IOException {

        File tempFile = Files.createTempFile(filenamePrefix, filenameSuffix).toFile();
        setRWPermissionsOnlyForOwner(tempFile);

        return tempFile;
    }


    // We don't need to check the results of the permission setters below,
    // since we're manipulating a file we created ourselves.
    @SuppressWarnings({"ResultOfMethodCallIgnored", "squid:S899"})
    private void setRWPermissionsOnlyForOwner(File tempFile) {

        try {
            tempFile.setReadable(true, true);
            tempFile.setWritable(true, true);
            tempFile.setExecutable(false);
        } catch (SecurityException ex) {
            // This should never happen since we're creating a temp file ourselves.
            log.warn("Caught an exception during temp file creation. This should not happend. Check the code.", ex);
        }
    }

}
|
||||
@ -0,0 +1,10 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor;
|
||||
|
||||
import org.springframework.context.annotation.ComponentScan;
|
||||
import org.springframework.context.annotation.Configuration;
|
||||
|
||||
// Spring configuration entry point for the processor module: component-scans this
// package (and sub-packages) so the module's services can be imported as a unit.
@Configuration
@ComponentScan
public class LayoutparserServiceProcessorConfiguration {

}
|
||||
@ -0,0 +1,49 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.adapter;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.CvParsedTableCell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableServiceResponse;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
public class CvTableParsingAdapter {
|
||||
|
||||
public Map<Integer, List<com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.CvParsedTableCell>> buildCvParsedTablesPerPage(TableServiceResponse tableServiceResponse) {
|
||||
|
||||
Map<Integer, List<com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.CvParsedTableCell>> tableCells = new HashMap<>();
|
||||
tableServiceResponse.getData()
|
||||
.forEach(tableData -> tableCells.computeIfAbsent(tableData.getPageInfo().getNumber(), tableCell -> new ArrayList<>())
|
||||
.addAll(convertTableCells(tableData.getTableCells())));
|
||||
|
||||
return tableCells;
|
||||
}
|
||||
|
||||
|
||||
private Collection<? extends com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.CvParsedTableCell> convertTableCells(List<CvParsedTableCell> tableCells) {
|
||||
|
||||
List<com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.CvParsedTableCell> cvParsedTableCells = new ArrayList<>();
|
||||
|
||||
tableCells.forEach(t -> cvParsedTableCells.add(com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.CvParsedTableCell.builder()
|
||||
.y0(t.getY0())
|
||||
.x1(t.getX1())
|
||||
.y1(t.getY1())
|
||||
.x0(t.getX0())
|
||||
.width(t.getWidth())
|
||||
.height(t.getHeight())
|
||||
.build()));
|
||||
|
||||
return cvParsedTableCells;
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,67 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.adapter;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.ImageType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.image.ImageServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.image.ClassifiedImage;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
public class ImageServiceResponseAdapter {
|
||||
|
||||
|
||||
public Map<Integer, List<ClassifiedImage>> buildClassifiedImagesPerPage(ImageServiceResponse imageServiceResponse ) {
|
||||
|
||||
Map<Integer, List<ClassifiedImage>> images = new HashMap<>();
|
||||
imageServiceResponse.getData().forEach(imageMetadata -> {
|
||||
var classification = imageMetadata.getFilters().isAllPassed() ? ImageType.valueOf(imageMetadata.getClassification()
|
||||
.getLabel()
|
||||
.toUpperCase(Locale.ROOT)) : ImageType.OTHER;
|
||||
images.computeIfAbsent(imageMetadata.getPosition().getPageNumber(), x -> new ArrayList<>())
|
||||
.add(new ClassifiedImage(new Rectangle2D.Double(imageMetadata.getPosition().getX1(),
|
||||
imageMetadata.getPosition().getY1(),
|
||||
imageMetadata.getGeometry().getWidth(),
|
||||
imageMetadata.getGeometry().getHeight()), classification, imageMetadata.isAlpha(), imageMetadata.getPosition().getPageNumber()));
|
||||
});
|
||||
|
||||
// Currently This is a copy but, it will be changed later because i don' t think that we should unclassified images.
|
||||
imageServiceResponse.getDataCV().forEach(imageMetadata -> {
|
||||
var classification = imageMetadata.getFilters().isAllPassed() ? ImageType.valueOf(imageMetadata.getClassification()
|
||||
.getLabel()
|
||||
.toUpperCase(Locale.ROOT)) : ImageType.OTHER;
|
||||
images.computeIfAbsent(imageMetadata.getPosition().getPageNumber(), x -> new ArrayList<>())
|
||||
.add(new ClassifiedImage(new Rectangle2D.Double(imageMetadata.getPosition().getX1(),
|
||||
imageMetadata.getPosition().getY1(),
|
||||
imageMetadata.getGeometry().getWidth(),
|
||||
imageMetadata.getGeometry().getHeight()), classification, imageMetadata.isAlpha(), imageMetadata.getPosition().getPageNumber()));
|
||||
});
|
||||
|
||||
return images;
|
||||
}
|
||||
|
||||
|
||||
public void findOcr(ClassificationPage classificationPage) {
|
||||
|
||||
classificationPage.getImages().forEach(image -> {
|
||||
if (image.getImageType().equals(ImageType.OTHER)) {
|
||||
classificationPage.getTextBlocks().forEach(textblock -> {
|
||||
if (image.getPosition().contains(textblock.getMinX(), textblock.getMinY(), textblock.getWidth(), textblock.getHeight())) {
|
||||
image.setImageType(ImageType.OCR);
|
||||
}
|
||||
});
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,17 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import com.dslplatform.json.CompiledJson;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
// Classification result of the image service: the winning label plus the per-label
// probability distribution. Bound via Jackson and dsl-json (@CompiledJson).
@Data
@CompiledJson
public class Classification {

    // Probability per candidate label, keyed by label name.
    private Map<String, Float> probabilities = new HashMap<>();
    // Winning label; upper-cased downstream to resolve the internal ImageType.
    private String label;

}
|
||||
@ -0,0 +1,14 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image;
|
||||
|
||||
import com.dslplatform.json.CompiledJson;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
// Geometry-based filter results of the image service: size and aspect-ratio checks.
@Data
@CompiledJson
public class FilterGeometry {

    private ImageSize imageSize;
    private Format imageFormat;

}
|
||||
@ -0,0 +1,15 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image;
|
||||
|
||||
import com.dslplatform.json.CompiledJson;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
// Aggregated filter verdict for one image; allPassed gates whether the service's
// classification label is trusted (otherwise the image is treated as OTHER downstream).
@Data
@CompiledJson
public class Filters {

    private FilterGeometry geometry;
    private Probability probability;
    // True only if every individual filter passed.
    private boolean allPassed;

}
|
||||
@ -0,0 +1,15 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image;
|
||||
|
||||
import com.dslplatform.json.CompiledJson;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
// Aspect-ratio filter result. quotient is presumably width/height — TODO confirm
// against the image service contract.
@Data
@CompiledJson
public class Format {

    private float quotient;
    private boolean tooTall;
    private boolean tooWide;

}
|
||||
@ -0,0 +1,14 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image;
|
||||
|
||||
import com.dslplatform.json.CompiledJson;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
// Image dimensions as reported by the image service (units not specified here;
// used directly as rectangle width/height downstream).
@Data
@CompiledJson
public class Geometry {

    private float width;
    private float height;

}
|
||||
@ -0,0 +1,33 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import com.dslplatform.json.CompiledJson;
|
||||
import com.dslplatform.json.JsonAttribute;
|
||||
import com.fasterxml.jackson.annotation.JsonAlias;
|
||||
import com.fasterxml.jackson.annotation.JsonProperty;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
// Top-level response of the image service: image metadata per document, with a
// separate list for CV-detected images. Accepts both "imageMetadata" and "data" as
// JSON names (Jackson via @JsonProperty/@JsonAlias, dsl-json via @JsonAttribute).
@Data
@CompiledJson
public class ImageServiceResponse {

    private String dossierId;
    private String fileId;

    @JsonProperty(value = "imageMetadata")
    @JsonAlias("data")
    @JsonAttribute(alternativeNames = {"imageMetadata"})
    private List<Metadata> data = new ArrayList<>();

    // Images detected by the CV pipeline; currently processed identically to 'data'.
    private List<Metadata> dataCV = new ArrayList<>();


    // NOTE(review): the alias annotations are duplicated on the field and on this
    // setter; one of the two should suffice — verify with both mappers before removing.
    @JsonProperty(value = "imageMetadata")
    @JsonAlias("data")
    @JsonAttribute(alternativeNames = {"imageMetadata"})
    public void setData(List<Metadata> data) {this.data = data;}

}
|
||||
@ -0,0 +1,15 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image;
|
||||
|
||||
import com.dslplatform.json.CompiledJson;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
// Size filter result. quotient is presumably the image-to-page size ratio — TODO
// confirm against the image service contract.
@Data
@CompiledJson
public class ImageSize {

    private float quotient;
    private boolean tooLarge;
    private boolean tooSmall;

}
|
||||
@ -0,0 +1,17 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image;
|
||||
|
||||
import com.dslplatform.json.CompiledJson;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
// Metadata of a single image in the image service response: classification,
// placement on the page, dimensions, filter verdicts and alpha-channel flag.
@Data
@CompiledJson
public class Metadata {

    private Classification classification;
    private Position position;
    private Geometry geometry;
    private Filters filters;
    // True if the image has an alpha channel.
    private boolean alpha;

}
|
||||
@ -0,0 +1,17 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image;
|
||||
|
||||
import com.dslplatform.json.CompiledJson;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
// Placement of an image on a page. Downstream only (x1, y1) is used as the
// rectangle origin; presumably (x1, y1) is one corner and (x2, y2) the opposite
// one — TODO confirm coordinate convention with the image service.
@Data
@CompiledJson
public class Position {

    private float x1;
    private float x2;
    private float y1;
    private float y2;
    private int pageNumber;

}
|
||||
@ -0,0 +1,13 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image;
|
||||
|
||||
import com.dslplatform.json.CompiledJson;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
// Probability filter result: true when the classifier's confidence was too low.
@Data
@CompiledJson
public class Probability {

    private boolean unconfident;

}
|
||||
@ -0,0 +1,16 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.table;
|
||||
|
||||
import com.dslplatform.json.CompiledJson;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
// Page metadata accompanying CV-parsed tables: page number (used as the grouping key
// downstream), rotation and page dimensions.
@Data
@CompiledJson
public class CvParsedPageInfo {

    private int number;
    private int rotation;
    private float width;
    private float height;

}
|
||||
@ -0,0 +1,18 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.table;
|
||||
|
||||
import com.dslplatform.json.CompiledJson;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
// A single table cell as detected by the CV table parsing service; (x0, y0)/(x1, y1)
// are opposite corners, width/height the cell dimensions. Copied field-by-field into
// the internal classification DTO of the same name.
@Data
@CompiledJson
public class CvParsedTableCell {

    private float x0;
    private float y0;
    private float x1;
    private float y1;
    private float width;
    private float height;

}
|
||||
@ -0,0 +1,17 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.table;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import com.dslplatform.json.CompiledJson;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
// All table cells detected on one page, together with that page's metadata.
@Data
@CompiledJson
public class CvParsedTableModel {

    private CvParsedPageInfo pageInfo;
    private List<CvParsedTableCell> tableCells = new ArrayList<>();

}
|
||||
@ -0,0 +1,22 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.table;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import com.dslplatform.json.CompiledJson;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
// Top-level response of the CV table parsing service: request identifiers plus the
// per-page table models.
@Data
@CompiledJson
public class TableServiceResponse {

    private String dossierId;
    private String fileId;
    private String operation;
    private String targetFileExtension;
    private String responseFileExtension;

    // One entry per page that contains detected tables.
    private List<CvParsedTableModel> data = new ArrayList<>();

}
|
||||
@ -0,0 +1,71 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.dto;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
|
||||
import com.dslplatform.json.JsonAttribute;
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.TextBlockOrientation;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
/**
 * Base class for classified text containers: a bounding box in PDF coordinates, a
 * classification label, the page it lies on and its text orientation. Subclasses
 * supply the actual text.
 */
@Data
@AllArgsConstructor
@NoArgsConstructor
public abstract class AbstractTextContainer {

    // Bounding box of the container.
    protected float minX;
    protected float maxX;
    protected float minY;
    protected float maxY;
    // Label assigned by the layout classifier.
    protected String classification;
    // Page number this container lies on.
    protected int page;

    private TextBlockOrientation orientation = TextBlockOrientation.NONE;


    public abstract String getText();


    // NOTE(review): X uses the usual "this spans other" comparison, but Y is inverted
    // (this.minY >= other's minY and this.maxY <= other's maxY) — presumably because of
    // PDF's bottom-left coordinate origin vs. the extraction space; confirm against
    // callers before changing.
    public boolean containsBlock(ClassificationTextBlock other) {

        return this.minX <= other.getPdfMinX() && this.maxX >= other.getPdfMaxX() && this.minY >= other.getPdfMinY() && this.maxY <= other.getPdfMaxY();
    }


    // Same inverted-Y convention as containsBlock, applied to another container.
    public boolean contains(AbstractTextContainer other) {

        return this.minX <= other.minX && this.maxX >= other.maxX && this.minY >= other.minY && this.maxY <= other.maxY;
    }


    // NOTE(review): despite the name, this checks whether OTHER contains THIS container
    // (Rectangle2D.contains(x, y, w, h) tests containment of the given rect in 'other');
    // verify that call sites expect this direction.
    public boolean contains(Rectangle2D other) {

        return other.contains(minX, minY, getWidth(), getHeight());
    }


    // Derived height; excluded from JSON output for both mappers.
    @JsonIgnore
    @JsonAttribute(ignore = true)
    public float getHeight() {

        return maxY - minY;
    }


    // Derived width; excluded from JSON output for both mappers.
    @JsonIgnore
    @JsonAttribute(ignore = true)
    public float getWidth() {

        return maxX - minX;
    }


    // True when the vertical extents of the two containers overlap (inclusive).
    public boolean intersectsY(AbstractTextContainer atc) {

        return this.minY <= atc.getMaxY() && this.maxY >= atc.getMinY();
    }

}
|
||||
@ -0,0 +1,27 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.dto;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.StringFrequencyCounter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.UnclassifiedText;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
/**
 * Aggregated classification result for a whole document: its pages, logical
 * sections, headers/footers, leftover unclassified text, and document-wide
 * frequency statistics of text metrics used by the classifiers.
 */
@Data
@NoArgsConstructor
public class ClassificationDocument {

    private List<ClassificationPage> pages = new ArrayList<>();
    private List<ClassificationSection> sections = new ArrayList<>();
    private List<ClassificationHeader> headers = new ArrayList<>();
    private List<ClassificationFooter> footers = new ArrayList<>();
    // Text that could not be assigned to any section/header/footer.
    private List<UnclassifiedText> unclassifiedTexts = new ArrayList<>();
    // Document-wide statistics: text heights, font sizes, font names and font styles.
    private FloatFrequencyCounter textHeightCounter = new FloatFrequencyCounter();
    private FloatFrequencyCounter fontSizeCounter = new FloatFrequencyCounter();
    private StringFrequencyCounter fontCounter = new StringFrequencyCounter();
    private StringFrequencyCounter fontStyleCounter = new StringFrequencyCounter();
    // Whether the document was detected to use headlines — TODO confirm exact semantics with the producer.
    private boolean headlines;

}
|
||||
@ -0,0 +1,16 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.dto;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
|
||||
/**
 * Group of text blocks classified as a page footer.
 */
@Data
@AllArgsConstructor
public class ClassificationFooter {

    private List<ClassificationTextBlock> textBlocks;

}
|
||||
@ -0,0 +1,16 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.dto;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
|
||||
/**
 * Group of text blocks classified as a page header.
 */
@Data
@AllArgsConstructor
public class ClassificationHeader {

    private List<ClassificationTextBlock> textBlocks;

}
|
||||
@ -0,0 +1,38 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.dto;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.StringFrequencyCounter;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.NonNull;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
|
||||
/**
 * All classified content of a single page: its text containers and images,
 * the frame around the body text, page geometry/rotation, and per-page
 * frequency statistics of text metrics.
 */
@Data
@RequiredArgsConstructor
public class ClassificationPage {

    // Required constructor argument: the page's classified text containers.
    @NonNull
    private List<AbstractTextContainer> textBlocks;

    private List<ClassifiedImage> images = new ArrayList<>();

    // Frame around the page's body text (type comes from the persistence-service API).
    private Rectangle bodyTextFrame;

    private boolean landscape;
    // Page rotation — presumably in degrees (0/90/180/270); confirm against producer.
    private int rotation;

    private int pageNumber;

    // Per-page statistics: text heights, font sizes, font names and font styles.
    private FloatFrequencyCounter textHeightCounter = new FloatFrequencyCounter();
    private FloatFrequencyCounter fontSizeCounter = new FloatFrequencyCounter();
    private StringFrequencyCounter fontCounter = new StringFrequencyCounter();
    private StringFrequencyCounter fontStyleCounter = new StringFrequencyCounter();

    private float pageWidth;
    private float pageHeight;

}
|
||||
@ -0,0 +1,38 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.dto;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.Table;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
@Data
|
||||
@NoArgsConstructor
|
||||
public class ClassificationSection implements Comparable {
|
||||
|
||||
private List<AbstractTextContainer> pageBlocks = new ArrayList<>();
|
||||
private List<ClassifiedImage> images = new ArrayList<>();
|
||||
private String headline;
|
||||
|
||||
|
||||
public List<Table> getTables() {
|
||||
|
||||
List<Table> tables = new ArrayList<>();
|
||||
pageBlocks.forEach(block -> {
|
||||
if (block instanceof Table) {
|
||||
tables.add((Table) block);
|
||||
}
|
||||
});
|
||||
return tables;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public int compareTo(Object o) {
|
||||
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
@ -0,0 +1,77 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.dto;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import lombok.Getter;
|
||||
|
||||
/**
 * Counts how often individual float values occur (e.g. text heights or font
 * sizes) and answers simple popularity queries over the observed values.
 */
public class FloatFrequencyCounter {

    Map<Float, Integer> countPerValue = new HashMap<>();


    /**
     * @return the live value -> occurrence-count map (not a defensive copy).
     * Explicit getter instead of Lombok's {@code @Getter} so this plain utility
     * has no code-generation dependency; the generated API is identical.
     */
    public Map<Float, Integer> getCountPerValue() {

        return countPerValue;
    }


    /** Records one occurrence of the given value. */
    public void add(float value) {

        countPerValue.merge(value, 1, Integer::sum);
    }


    /**
     * Merges another counter's raw counts into this one.
     *
     * @param otherCounter value -> count map, e.g. another counter's {@link #getCountPerValue()}
     */
    public void addAll(Map<Float, Integer> otherCounter) {

        otherCounter.forEach((value, count) -> countPerValue.merge(value, count, Integer::sum));
    }


    /**
     * @return the value with the highest occurrence count (ties broken in favor of
     * the entry seen last in iteration order), or null if nothing was recorded yet
     */
    public Float getMostPopular() {

        Map.Entry<Float, Integer> mostPopular = null;
        for (Map.Entry<Float, Integer> entry : countPerValue.entrySet()) {
            if (mostPopular == null || entry.getValue() >= mostPopular.getValue()) {
                mostPopular = entry;
            }
        }
        return mostPopular != null ? mostPopular.getKey() : null;
    }


    /**
     * @return all recorded values strictly greater than the most popular one,
     * sorted descending. Returns an empty list when the counter is empty
     * (the previous implementation threw a NullPointerException via unboxing).
     */
    public List<Float> getHighterThanMostPopular() {

        Float mostPopular = getMostPopular();
        if (mostPopular == null) {
            return new ArrayList<>();
        }
        return countPerValue.keySet().stream()
                .filter(value -> value > mostPopular)
                .sorted(Collections.reverseOrder())
                .collect(Collectors.toList());
    }


    /** @return the largest recorded value, or null if nothing was recorded yet. */
    public Float getHighest() {

        Float highest = null;
        for (Float value : countPerValue.keySet()) {
            if (highest == null || value > highest) {
                highest = value;
            }
        }
        return highest;
    }

}
|
||||
@ -0,0 +1,218 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.dto;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
|
||||
@SuppressWarnings("all")
|
||||
public class Rectangle extends Rectangle2D.Float {
|
||||
|
||||
protected static final float VERTICAL_COMPARISON_THRESHOLD = 0.4f;
|
||||
/**
|
||||
* Ill-defined comparator, from when Rectangle was Comparable.
|
||||
* <p>
|
||||
* see https://github.com/tabulapdf/tabula-java/issues/116
|
||||
*
|
||||
* @deprecated with no replacement
|
||||
*/
|
||||
@Deprecated
|
||||
public static final Comparator<Rectangle> ILL_DEFINED_ORDER = new Comparator<Rectangle>() {
|
||||
@Override
|
||||
public int compare(Rectangle o1, Rectangle o2) {
|
||||
|
||||
if (o1.equals(o2)) {
|
||||
return 0;
|
||||
}
|
||||
if (o1.verticalOverlap(o2) > VERTICAL_COMPARISON_THRESHOLD) {
|
||||
return o1.isLtrDominant() == -1 && o2.isLtrDominant() == -1 ? -java.lang.Double.compare(o1.getX(), o2.getX()) : java.lang.Double.compare(o1.getX(), o2.getX());
|
||||
} else {
|
||||
return java.lang.Float.compare(o1.getBottom(), o2.getBottom());
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
public Rectangle() {
|
||||
|
||||
super();
|
||||
}
|
||||
|
||||
|
||||
public Rectangle(float top, float left, float width, float height) {
|
||||
|
||||
super();
|
||||
this.setRect(left, top, width, height);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @param rectangles
|
||||
* @return minimum bounding box that contains all the rectangles
|
||||
*/
|
||||
public static Rectangle boundingBoxOf(List<? extends Rectangle> rectangles) {
|
||||
|
||||
float minx = java.lang.Float.MAX_VALUE;
|
||||
float miny = java.lang.Float.MAX_VALUE;
|
||||
float maxx = java.lang.Float.MIN_VALUE;
|
||||
float maxy = java.lang.Float.MIN_VALUE;
|
||||
|
||||
for (Rectangle r : rectangles) {
|
||||
minx = (float) Math.min(r.getMinX(), minx);
|
||||
miny = (float) Math.min(r.getMinY(), miny);
|
||||
maxx = (float) Math.max(r.getMaxX(), maxx);
|
||||
maxy = (float) Math.max(r.getMaxY(), maxy);
|
||||
}
|
||||
return new Rectangle(miny, minx, maxx - minx, maxy - miny);
|
||||
}
|
||||
|
||||
|
||||
public int compareTo(Rectangle other) {
|
||||
|
||||
return ILL_DEFINED_ORDER.compare(this, other);
|
||||
}
|
||||
|
||||
|
||||
// I'm bad at Java and need this for fancy sorting in
|
||||
// technology.tabula.TextChunk.
|
||||
public int isLtrDominant() {
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
public float getArea() {
|
||||
|
||||
return this.width * this.height;
|
||||
}
|
||||
|
||||
|
||||
public float verticalOverlap(Rectangle other) {
|
||||
|
||||
return Math.max(0, Math.min(this.getBottom(), other.getBottom()) - Math.max(this.getTop(), other.getTop()));
|
||||
}
|
||||
|
||||
|
||||
public boolean verticallyOverlaps(Rectangle other) {
|
||||
|
||||
return verticalOverlap(other) > 0;
|
||||
}
|
||||
|
||||
|
||||
public float horizontalOverlap(Rectangle other) {
|
||||
|
||||
return Math.max(0, Math.min(this.getRight(), other.getRight()) - Math.max(this.getLeft(), other.getLeft()));
|
||||
}
|
||||
|
||||
|
||||
public boolean horizontallyOverlaps(Rectangle other) {
|
||||
|
||||
return horizontalOverlap(other) > 0;
|
||||
}
|
||||
|
||||
|
||||
public float verticalOverlapRatio(Rectangle other) {
|
||||
|
||||
float rv = 0, delta = Math.min(this.getBottom() - this.getTop(), other.getBottom() - other.getTop());
|
||||
|
||||
if (other.getTop() <= this.getTop() && this.getTop() <= other.getBottom() && other.getBottom() <= this.getBottom()) {
|
||||
rv = (other.getBottom() - this.getTop()) / delta;
|
||||
} else if (this.getTop() <= other.getTop() && other.getTop() <= this.getBottom() && this.getBottom() <= other.getBottom()) {
|
||||
rv = (this.getBottom() - other.getTop()) / delta;
|
||||
} else if (this.getTop() <= other.getTop() && other.getTop() <= other.getBottom() && other.getBottom() <= this.getBottom()) {
|
||||
rv = (other.getBottom() - other.getTop()) / delta;
|
||||
} else if (other.getTop() <= this.getTop() && this.getTop() <= this.getBottom() && this.getBottom() <= other.getBottom()) {
|
||||
rv = (this.getBottom() - this.getTop()) / delta;
|
||||
}
|
||||
|
||||
return rv;
|
||||
|
||||
}
|
||||
|
||||
|
||||
public float overlapRatio(Rectangle other) {
|
||||
|
||||
double intersectionWidth = Math.max(0, Math.min(this.getRight(), other.getRight()) - Math.max(this.getLeft(), other.getLeft()));
|
||||
double intersectionHeight = Math.max(0, Math.min(this.getBottom(), other.getBottom()) - Math.max(this.getTop(), other.getTop()));
|
||||
double intersectionArea = Math.max(0, intersectionWidth * intersectionHeight);
|
||||
double unionArea = this.getArea() + other.getArea() - intersectionArea;
|
||||
|
||||
return (float) (intersectionArea / unionArea);
|
||||
}
|
||||
|
||||
|
||||
public Rectangle merge(Rectangle other) {
|
||||
|
||||
this.setRect(this.createUnion(other));
|
||||
return this;
|
||||
}
|
||||
|
||||
|
||||
public float getTop() {
|
||||
|
||||
return (float) this.getMinY();
|
||||
}
|
||||
|
||||
|
||||
public void setTop(float top) {
|
||||
|
||||
float deltaHeight = top - this.y;
|
||||
this.setRect(this.x, top, this.width, this.height - deltaHeight);
|
||||
}
|
||||
|
||||
|
||||
public float getRight() {
|
||||
|
||||
return (float) this.getMaxX();
|
||||
}
|
||||
|
||||
|
||||
public void setRight(float right) {
|
||||
|
||||
this.setRect(this.x, this.y, right - this.x, this.height);
|
||||
}
|
||||
|
||||
|
||||
public float getLeft() {
|
||||
|
||||
return (float) this.getMinX();
|
||||
}
|
||||
|
||||
|
||||
public void setLeft(float left) {
|
||||
|
||||
float deltaWidth = left - this.x;
|
||||
this.setRect(left, this.y, this.width - deltaWidth, this.height);
|
||||
}
|
||||
|
||||
|
||||
public float getBottom() {
|
||||
|
||||
return (float) this.getMaxY();
|
||||
}
|
||||
|
||||
|
||||
public void setBottom(float bottom) {
|
||||
|
||||
this.setRect(this.x, this.y, this.width, bottom - this.y);
|
||||
}
|
||||
|
||||
|
||||
public Point2D[] getPoints() {
|
||||
|
||||
return new Point2D[]{new Point2D.Float(this.getLeft(), this.getTop()), new Point2D.Float(this.getRight(), this.getTop()), new Point2D.Float(this.getRight(),
|
||||
this.getBottom()), new Point2D.Float(this.getLeft(), this.getBottom())};
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
String s = super.toString();
|
||||
sb.append(s.substring(0, s.length() - 1));
|
||||
sb.append(String.format(",bottom=%f,right=%f]", this.getBottom(), this.getRight()));
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,25 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.dto.image;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.ImageType;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.NonNull;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
|
||||
/**
 * An image found on a page together with its position and classified type.
 */
@Data
@RequiredArgsConstructor
public class ClassifiedImage {

    @NonNull
    private Rectangle2D position;
    @NonNull
    private ImageType imageType;
    // Not a constructor argument (neither final nor @NonNull); set later during processing.
    private boolean isAppendedToSection;
    // NOTE(review): @NonNull on primitives produces no null-check; its only effect is to
    // force these fields into Lombok's generated @RequiredArgsConstructor. Marking them
    // final would achieve the same more idiomatically — confirm before changing.
    @NonNull
    private boolean hasTransparency;
    @NonNull
    private int page;

}
|
||||
@ -0,0 +1,17 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.dto.table;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.Ruling;
|
||||
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
|
||||
/**
 * Ruling lines of a table area after cleaning, split by orientation.
 */
@Data
@Builder
public class CleanRulings {

    // Horizontal ruling lines.
    List<Ruling> horizontal;
    // Vertical ruling lines.
    List<Ruling> vertical;

}
|
||||
@ -0,0 +1,21 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.dto.table;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
|
||||
/**
 * A single table-cell rectangle as reported by the CV table-parsing service,
 * in the service's coordinate space.
 */
@Data
@Builder
@AllArgsConstructor
// With no final/@NonNull fields, @RequiredArgsConstructor degenerates to a
// no-args constructor (needed for deserialization next to @Builder's all-args use).
@RequiredArgsConstructor
public class CvParsedTableCell {

    // Corner coordinates (x0, y0) and (x1, y1) — orientation assumed from naming; confirm with the service contract.
    private float x0;
    private float y0;
    private float x1;
    private float y1;
    // Size as reported by the service — presumably redundant with the corners; verify.
    private float width;
    private float height;

}
|
||||
@ -0,0 +1,437 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.dto.table;
|
||||
|
||||
import java.awt.geom.Line2D;
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.Formatter;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.TreeMap;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.CohenSutherlandClipping;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.DoubleComparisons;
|
||||
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@SuppressWarnings("all")
|
||||
public class Ruling extends Line2D.Float {
|
||||
|
||||
private static int PERPENDICULAR_PIXEL_EXPAND_AMOUNT = 2;
|
||||
|
||||
|
||||
public Ruling(Point2D p1, Point2D p2) {
|
||||
|
||||
super(p1, p2);
|
||||
}
|
||||
|
||||
|
||||
public static List<Ruling> cropRulingsToArea(List<Ruling> rulings, Rectangle2D area) {
|
||||
|
||||
ArrayList<Ruling> rv = new ArrayList<>();
|
||||
for (Ruling r : rulings) {
|
||||
if (r.intersects(area)) {
|
||||
rv.add(r.intersect(area));
|
||||
}
|
||||
}
|
||||
return rv;
|
||||
}
|
||||
|
||||
|
||||
// log(n) implementation of find_intersections
|
||||
// based on http://people.csail.mit.edu/indyk/6.838-old/handouts/lec2.pdf
|
||||
public static Map<Point2D, Ruling[]> findIntersections(List<Ruling> horizontals, List<Ruling> verticals) {
|
||||
|
||||
class SortObject {
|
||||
|
||||
protected SOType type;
|
||||
protected float position;
|
||||
protected Ruling ruling;
|
||||
|
||||
|
||||
public SortObject(SOType type, float position, Ruling ruling) {
|
||||
|
||||
this.type = type;
|
||||
this.position = position;
|
||||
this.ruling = ruling;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
List<SortObject> sos = new ArrayList<>();
|
||||
|
||||
TreeMap<Ruling, Boolean> tree = new TreeMap<>(new Comparator<Ruling>() {
|
||||
@Override
|
||||
public int compare(Ruling o1, Ruling o2) {
|
||||
|
||||
return java.lang.Double.compare(o1.getTop(), o2.getTop());
|
||||
}
|
||||
});
|
||||
|
||||
TreeMap<Point2D, Ruling[]> rv = new TreeMap<>(new Comparator<Point2D>() {
|
||||
@Override
|
||||
public int compare(Point2D o1, Point2D o2) {
|
||||
|
||||
if (o1.getY() > o2.getY()) {
|
||||
return 1;
|
||||
}
|
||||
if (o1.getY() < o2.getY()) {
|
||||
return -1;
|
||||
}
|
||||
if (o1.getX() > o2.getX()) {
|
||||
return 1;
|
||||
}
|
||||
if (o1.getX() < o2.getX()) {
|
||||
return -1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
});
|
||||
|
||||
for (Ruling h : horizontals) {
|
||||
sos.add(new SortObject(SOType.HLEFT, h.getLeft() - PERPENDICULAR_PIXEL_EXPAND_AMOUNT, h));
|
||||
sos.add(new SortObject(SOType.HRIGHT, h.getRight() + PERPENDICULAR_PIXEL_EXPAND_AMOUNT, h));
|
||||
}
|
||||
|
||||
for (Ruling v : verticals) {
|
||||
sos.add(new SortObject(SOType.VERTICAL, v.getLeft(), v));
|
||||
}
|
||||
|
||||
Collections.sort(sos, new Comparator<SortObject>() {
|
||||
@Override
|
||||
public int compare(SortObject a, SortObject b) {
|
||||
|
||||
int rv;
|
||||
if (DoubleComparisons.feq(a.position, b.position)) {
|
||||
if (a.type == SOType.VERTICAL && b.type == SOType.HLEFT) {
|
||||
rv = 1;
|
||||
} else if (a.type == SOType.VERTICAL && b.type == SOType.HRIGHT) {
|
||||
rv = -1;
|
||||
} else if (a.type == SOType.HLEFT && b.type == SOType.VERTICAL) {
|
||||
rv = -1;
|
||||
} else if (a.type == SOType.HRIGHT && b.type == SOType.VERTICAL) {
|
||||
rv = 1;
|
||||
} else {
|
||||
rv = java.lang.Double.compare(a.position, b.position);
|
||||
}
|
||||
} else {
|
||||
return java.lang.Double.compare(a.position, b.position);
|
||||
}
|
||||
return rv;
|
||||
}
|
||||
});
|
||||
|
||||
for (SortObject so : sos) {
|
||||
switch (so.type) {
|
||||
case VERTICAL:
|
||||
for (Map.Entry<Ruling, Boolean> h : tree.entrySet()) {
|
||||
try {
|
||||
Point2D i = h.getKey().intersectionPoint(so.ruling);
|
||||
if (i == null) {
|
||||
continue;
|
||||
}
|
||||
rv.put(i, new Ruling[]{h.getKey().expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT), so.ruling.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT)});
|
||||
} catch (UnsupportedOperationException e) {
|
||||
log.info("Some line are oblique, ignoring...");
|
||||
continue;
|
||||
}
|
||||
}
|
||||
break;
|
||||
case HRIGHT:
|
||||
tree.remove(so.ruling);
|
||||
break;
|
||||
case HLEFT:
|
||||
tree.put(so.ruling, true);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return rv;
|
||||
|
||||
}
|
||||
|
||||
|
||||
public boolean vertical() {
|
||||
|
||||
return this.length() > 0 && DoubleComparisons.feq(this.x1, this.x2); //diff < ORIENTATION_CHECK_THRESHOLD;
|
||||
}
|
||||
|
||||
|
||||
public boolean horizontal() {
|
||||
|
||||
return this.length() > 0 && DoubleComparisons.feq(this.y1, this.y2); //diff < ORIENTATION_CHECK_THRESHOLD;
|
||||
}
|
||||
|
||||
// attributes that make sense only for non-oblique lines
|
||||
// these are used to have a single collapse method (in page, currently)
|
||||
|
||||
|
||||
public boolean oblique() {
|
||||
|
||||
return !(this.vertical() || this.horizontal());
|
||||
}
|
||||
|
||||
|
||||
public float getPosition() {
|
||||
|
||||
if (this.oblique()) {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
return this.vertical() ? this.getLeft() : this.getTop();
|
||||
}
|
||||
|
||||
|
||||
public float getStart() {
|
||||
|
||||
if (this.oblique()) {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
return this.vertical() ? this.getTop() : this.getLeft();
|
||||
}
|
||||
|
||||
|
||||
public void setStart(float v) {
|
||||
|
||||
if (this.oblique()) {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
if (this.vertical()) {
|
||||
this.setTop(v);
|
||||
} else {
|
||||
this.setLeft(v);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public float getEnd() {
|
||||
|
||||
if (this.oblique()) {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
return this.vertical() ? this.getBottom() : this.getRight();
|
||||
}
|
||||
|
||||
|
||||
public void setEnd(float v) {
|
||||
|
||||
if (this.oblique()) {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
if (this.vertical()) {
|
||||
this.setBottom(v);
|
||||
} else {
|
||||
this.setRight(v);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public void setStartEnd(float start, float end) {
|
||||
|
||||
if (this.oblique()) {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
if (this.vertical()) {
|
||||
this.setTop(start);
|
||||
this.setBottom(end);
|
||||
} else {
|
||||
this.setLeft(start);
|
||||
this.setRight(end);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public boolean perpendicularTo(Ruling other) {
|
||||
|
||||
return this.vertical() == other.horizontal();
|
||||
}
|
||||
|
||||
|
||||
public boolean nearlyIntersects(Ruling another, int colinearOrParallelExpandAmount) {
|
||||
|
||||
if (this.intersectsLine(another)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
boolean rv = false;
|
||||
|
||||
if (this.perpendicularTo(another)) {
|
||||
rv = this.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT).intersectsLine(another);
|
||||
} else {
|
||||
rv = this.expand(colinearOrParallelExpandAmount).intersectsLine(another.expand(colinearOrParallelExpandAmount));
|
||||
}
|
||||
|
||||
return rv;
|
||||
}
|
||||
|
||||
|
||||
public double length() {
|
||||
|
||||
return Math.sqrt(Math.pow(this.x1 - this.x2, 2) + Math.pow(this.y1 - this.y2, 2));
|
||||
}
|
||||
|
||||
|
||||
public Ruling intersect(Rectangle2D clip) {
|
||||
|
||||
Float clipee = (Float) this.clone();
|
||||
boolean clipped = new CohenSutherlandClipping(clip).clip(clipee);
|
||||
|
||||
if (clipped) {
|
||||
return new Ruling(clipee.getP1(), clipee.getP2());
|
||||
} else {
|
||||
return this;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public Ruling expand(float amount) {
|
||||
|
||||
Ruling r = (Ruling) this.clone();
|
||||
try {
|
||||
r.setStart(this.getStart() - amount);
|
||||
r.setEnd(this.getEnd() + amount);
|
||||
} catch (UnsupportedOperationException e) {
|
||||
log.warn("Could not expand ruling!");
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
|
||||
public Point2D intersectionPoint(Ruling other) {
|
||||
|
||||
Ruling this_l = this.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT);
|
||||
Ruling other_l = other.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT);
|
||||
Ruling horizontal, vertical;
|
||||
|
||||
if (!this_l.intersectsLine(other_l)) {
|
||||
return null;
|
||||
}
|
||||
|
||||
if (this_l.horizontal() && other_l.vertical()) {
|
||||
horizontal = this_l;
|
||||
vertical = other_l;
|
||||
} else if (this_l.vertical() && other_l.horizontal()) {
|
||||
vertical = this_l;
|
||||
horizontal = other_l;
|
||||
} else {
|
||||
log.warn("lines must be orthogonal, vertical and horizontal");
|
||||
return null;
|
||||
}
|
||||
return new Point2D.Float(vertical.getLeft(), horizontal.getTop());
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public boolean equals(Object other) {
|
||||
|
||||
if (this == other) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (!(other instanceof Ruling)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
Ruling o = (Ruling) other;
|
||||
return this.getP1().equals(o.getP1()) && this.getP2().equals(o.getP2());
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
|
||||
return super.hashCode();
|
||||
}
|
||||
|
||||
|
||||
public float getTop() {
|
||||
|
||||
return this.y1;
|
||||
}
|
||||
|
||||
|
||||
public void setTop(float v) {
|
||||
|
||||
setLine(this.getLeft(), v, this.getRight(), this.getBottom());
|
||||
}
|
||||
|
||||
|
||||
public float getLeft() {
|
||||
|
||||
return this.x1;
|
||||
}
|
||||
|
||||
|
||||
public void setLeft(float v) {
|
||||
|
||||
setLine(v, this.getTop(), this.getRight(), this.getBottom());
|
||||
}
|
||||
|
||||
|
||||
public float getBottom() {
|
||||
|
||||
return this.y2;
|
||||
}
|
||||
|
||||
|
||||
public void setBottom(float v) {
|
||||
|
||||
setLine(this.getLeft(), this.getTop(), this.getRight(), v);
|
||||
}
|
||||
|
||||
|
||||
public float getRight() {
|
||||
|
||||
return this.x2;
|
||||
}
|
||||
|
||||
|
||||
public void setRight(float v) {
|
||||
|
||||
setLine(this.getLeft(), this.getTop(), v, this.getBottom());
|
||||
}
|
||||
|
||||
|
||||
public float getWidth() {
|
||||
|
||||
return this.getRight() - this.getLeft();
|
||||
}
|
||||
|
||||
|
||||
public float getHeight() {
|
||||
|
||||
return this.getBottom() - this.getTop();
|
||||
}
|
||||
|
||||
|
||||
public double getAngle() {
|
||||
|
||||
double angle = Math.toDegrees(Math.atan2(this.getP2().getY() - this.getP1().getY(), this.getP2().getX() - this.getP1().getX()));
|
||||
|
||||
if (angle < 0) {
|
||||
angle += 360;
|
||||
}
|
||||
return angle;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
Formatter formatter = new Formatter(sb);
|
||||
String rv = formatter.format("%s[minX=%f minY=%f maxX=%f maxY=%f]", this.getClass().toString(), this.x1, this.y1, this.x2, this.y2).toString();
|
||||
formatter.close();
|
||||
return rv;
|
||||
}
|
||||
|
||||
|
||||
private enum SOType {
|
||||
VERTICAL,
|
||||
HRIGHT,
|
||||
HLEFT
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,350 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.dto.table;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.TreeMap;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.AbstractTextContainer;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.Rectangle;
|
||||
|
||||
import lombok.Getter;
|
||||
import lombok.Setter;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
public class Table extends AbstractTextContainer {
|
||||
|
||||
private final TreeMap<TableCellPosition, TableCell> cells = new TreeMap<>();
|
||||
|
||||
private final int rotation;
|
||||
@Getter
|
||||
@Setter
|
||||
private String headline;
|
||||
private int unrotatedRowCount;
|
||||
private int unrotatedColCount;
|
||||
private int rowCount = -1;
|
||||
private int colCount = -1;
|
||||
private List<List<TableCell>> rows;
|
||||
|
||||
|
||||
/**
 * Builds a table from its cells and the detected table area on the page.
 * <p>
 * NOTE(review): minY is taken from area.getBottom() and maxY from area.getTop(),
 * i.e. swapped relative to the field names unless the coordinate system is
 * inverted (PDF bottom-up) at this point — confirm against the producing code.
 *
 * @param cells    the table's cells (indexed via addCells)
 * @param area     bounding box of the table on the page
 * @param rotation table rotation; computeRows handles 90 specially
 */
public Table(List<TableCell> cells, Rectangle area, int rotation) {

    addCells(cells);
    minX = area.getLeft();
    minY = area.getBottom();
    maxX = area.getRight();
    maxY = area.getTop();
    classification = "Table";
    this.rotation = rotation;

}
|
||||
|
||||
|
||||
public List<List<TableCell>> getRows() {
|
||||
|
||||
if (rows == null) {
|
||||
rows = computeRows();
|
||||
|
||||
// Ignore rows that does not contain any cells and values.
|
||||
List<List<TableCell>> rowsToRemove = new ArrayList<>();
|
||||
for (List<TableCell> row : rows) {
|
||||
if (row.size() == 1 && row.get(0).getTextBlocks().isEmpty()) {
|
||||
rowsToRemove.add(row);
|
||||
}
|
||||
}
|
||||
rows.removeAll(rowsToRemove);
|
||||
|
||||
computeHeaders();
|
||||
}
|
||||
|
||||
return rows;
|
||||
|
||||
}
|
||||
|
||||
|
||||
public int getRowCount() {
|
||||
|
||||
if (rowCount == -1) {
|
||||
rowCount = getRows().size();
|
||||
}
|
||||
return rowCount;
|
||||
}
|
||||
|
||||
|
||||
public int getColCount() {
|
||||
|
||||
if (colCount == -1) {
|
||||
colCount = getRows().stream().mapToInt(List::size).max().orElse(0);
|
||||
}
|
||||
return colCount;
|
||||
|
||||
}
|
||||
|
||||
|
||||
/**
 * Detect header cells (either first row or first column):
 * Column is marked as header if cell text is bold and row cell text is not bold.
 * Defaults to row.
 * <p>
 * For every cell, the nearest header cell in an unbroken run to its left and
 * above is registered via getHeaderCells(); the cell itself becomes a header
 * when its first text block's dominant word style is bold.
 */
private void computeHeaders() {

    if (rows == null) {
        rows = computeRows();
    }
    // A bold cell is a header cell as long as every cell to the left/top is bold, too
    // we move from left to right and top to bottom
    for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) {
        List<TableCell> rowCells = rows.get(rowIndex);
        // Single-cell rows carry no header relationship; skip them.
        if (rowCells.size() == 1) {
            continue;
        }

        for (int colIndex = 0; colIndex < rowCells.size(); colIndex++) {
            TableCell cell = rowCells.get(colIndex);
            // Scan left of this cell: the last cell of an unbroken header run
            // becomes this cell's row header.
            List<TableCell> cellsToTheLeft = rowCells.subList(0, colIndex);
            TableCell lastHeaderCell = null;
            for (TableCell leftCell : cellsToTheLeft) {
                if (leftCell.isHeaderCell()) {
                    lastHeaderCell = leftCell;
                } else {
                    break;
                }
            }
            if (lastHeaderCell != null) {
                cell.getHeaderCells().add(lastHeaderCell);
            }
            // Collect the cells directly above (same column index); rows that are
            // too short simply contribute nothing.
            List<TableCell> cellsToTheTop = new ArrayList<>();
            for (int i = 0; i < rowIndex; i++) {
                try {
                    cellsToTheTop.add(rows.get(i).get(colIndex));
                } catch (IndexOutOfBoundsException e) {
                    log.debug("No cell {} in row {}, ignoring.", colIndex, rowIndex);
                }
            }
            // NOTE(review): lastHeaderCell is NOT reset before this scan. If the first
            // cell above is not a header, the row header found above gets added a
            // second time below — confirm whether this duplication is intended.
            for (TableCell topCell : cellsToTheTop) {
                if (topCell.isHeaderCell()) {
                    lastHeaderCell = topCell;
                } else {
                    break;
                }
            }
            if (lastHeaderCell != null) {
                cell.getHeaderCells().add(lastHeaderCell);
            }
            // A cell is itself a header when its first text block is predominantly bold.
            if (!cell.getTextBlocks().isEmpty() && cell.getTextBlocks().get(0).getMostPopularWordStyle().equals("bold")) {
                cell.setHeaderCell(true);
            }
        }
    }

}
|
||||
|
||||
|
||||
/**
 * Builds the rows of the table in reading order, compensating for the page
 * rotation. Cells are stored unrotated in the (row, col)-keyed map; for
 * 90/270 degree pages the roles of rows and columns are swapped and one axis
 * is traversed in reverse. Missing grid positions are simply skipped.
 */
private List<List<TableCell>> computeRows() {

    List<List<TableCell>> rows = new ArrayList<>();
    if (rotation == 90) {
        for (int i = 0; i < unrotatedColCount; i++) { // rows
            List<TableCell> lastRow = new ArrayList<>();
            for (int j = unrotatedRowCount - 1; j >= 0; j--) { // cols
                TableCell cell = cells.get(new TableCellPosition(j, i));
                if (cell != null) {
                    lastRow.add(cell);
                }
            }
            rows.add(lastRow);
        }
    } else if (rotation == 270) {
        for (int i = unrotatedColCount - 1; i >= 0; i--) { // rows
            List<TableCell> lastRow = new ArrayList<>();
            for (int j = 0; j < unrotatedRowCount; j++) { // cols
                TableCell cell = cells.get(new TableCellPosition(j, i));
                if (cell != null) {
                    lastRow.add(cell);
                }
            }
            rows.add(lastRow);
        }
    } else {
        // Unrotated case (also used for any rotation other than 90/270):
        // straightforward row-major walk.
        for (int i = 0; i < unrotatedRowCount; i++) {
            List<TableCell> lastRow = new ArrayList<>();
            for (int j = 0; j < unrotatedColCount; j++) {
                TableCell cell = cells.get(new TableCellPosition(i, j)); // JAVA_8 use getOrDefault()
                if (cell != null) {
                    lastRow.add(cell);
                }
            }
            rows.add(lastRow);
        }
    }

    return rows;
}
private void add(TableCell chunk, int row, int col) {
|
||||
|
||||
unrotatedRowCount = Math.max(unrotatedRowCount, row + 1);
|
||||
unrotatedColCount = Math.max(unrotatedColCount, col + 1);
|
||||
|
||||
TableCellPosition cp = new TableCellPosition(row, col);
|
||||
cells.put(cp, chunk);
|
||||
|
||||
}
|
||||
|
||||
|
||||
/**
 * Filters degenerate cells, derives the table grid structure and indexes every
 * resulting cell by its (row, col) position.
 *
 * NOTE(review): this mutates the caller's list via removeIf — confirm callers
 * do not rely on the original list contents afterwards.
 *
 * @param cells the detected cells; near-zero-sized artefacts are removed
 */
private void addCells(List<TableCell> cells) {

    if (cells.isEmpty()) {
        return;
    }

    // Drop detection artefacts that are too small to be real cells.
    cells.removeIf(cell -> cell.getWidth() < 1.1 || cell.getHeight() < 1.1);

    List<List<TableCell>> rowsOfCells = calculateStructure(cells);

    for (int i = 0; i < rowsOfCells.size(); i++) {
        for (int j = 0; j < rowsOfCells.get(i).size(); j++) {
            add(rowsOfCells.get(i).get(j), i, j);
        }
    }
}
/**
 * Calculates the structure of the table. For spanning rows and columns multiple cells with the same values will be inserted.
 *
 * @param cells The found cells
 * @return Table Structure
 */
private List<List<TableCell>> calculateStructure(List<TableCell> cells) {

    List<List<TableCell>> matrix = new ArrayList<>();

    if (cells.isEmpty()) {
        return matrix;
    }

    // Collect the distinct cell border coordinates; tiny textless cells are
    // ignored so that detection noise does not introduce extra grid lines.
    Set<Float> uniqueX = new HashSet<>();
    Set<Float> uniqueY = new HashSet<>();
    cells.stream().filter(c -> !c.getTextBlocks().isEmpty() || c.getHeight() > 3 && c.getWidth() > 3).forEach(c -> {
        uniqueX.add(c.getLeft());
        uniqueX.add(c.getRight());
        uniqueY.add(c.getBottom());
        uniqueY.add(c.getTop());
    });

    var sortedUniqueX = uniqueX.stream().sorted().collect(Collectors.toList());
    var sortedUniqueY = uniqueY.stream().sorted().collect(Collectors.toList());

    // Sweep the full grid spanned by all border coordinates; each
    // (prevX..x, prevY..y) rectangle becomes one synthetic cell. A detected cell
    // overlapping a grid cell by more than 10% donates its text blocks, so a
    // spanning cell fills several grid cells with the same content.
    Float prevY = null;
    for (Float y : sortedUniqueY) {

        List<TableCell> row = new ArrayList<>();

        Float prevX = null;
        for (Float x : sortedUniqueX) {

            if (prevY != null && prevX != null) {
                var cell = new TableCell(new Point2D.Float(prevX, prevY), new Point2D.Float(x, y));

                var intersectionCell = cells.stream().filter(c -> cell.intersects(c) && cell.overlapRatio(c) > 0.1f).findFirst();
                if (intersectionCell.isPresent()) {
                    cell.getTextBlocks().addAll(intersectionCell.get().getTextBlocks());
                }
                row.add(cell);
            }
            prevX = x;
        }

        if (prevY != null && prevX != null) {
            matrix.add(row);
        }
        prevY = y;
    }

    // Rows were produced in ascending-Y order; reverse into reading order.
    Collections.reverse(matrix);

    return matrix;
}
@Override
|
||||
public String getText() {
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
List<List<TableCell>> rows = getRows();
|
||||
|
||||
int i = 0;
|
||||
for (List<TableCell> row : rows) {
|
||||
if (i != 0) {
|
||||
sb.append("\n");
|
||||
}
|
||||
if (!row.isEmpty()) {
|
||||
boolean firstColumn = true;
|
||||
for (TableCell column : row) {
|
||||
if (!firstColumn) {
|
||||
sb.append(",");
|
||||
}
|
||||
if (column != null && column.getTextBlocks() != null) {
|
||||
boolean first = true;
|
||||
for (ClassificationTextBlock textBlock : column.getTextBlocks()) {
|
||||
if (!first) {
|
||||
sb.append("\n");
|
||||
}
|
||||
sb.append('\"').append(textBlock.getText().replaceAll("\"", "\\\"")).append('\"');
|
||||
first = false;
|
||||
}
|
||||
}
|
||||
firstColumn = false;
|
||||
}
|
||||
}
|
||||
i++;
|
||||
}
|
||||
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
|
||||
/**
 * Renders the table as a simple HTML table: the first row is emitted with
 * th cells, all following rows with td cells; newlines inside a cell become
 * br tags.
 *
 * NOTE(review): cell text is inserted verbatim — it is not HTML-escaped, so
 * text containing '<', '>' or '&' will corrupt the markup. Confirm whether
 * the consumer sanitizes this output.
 */
public String getTextAsHtml() {

    StringBuilder sb = new StringBuilder();
    List<List<TableCell>> rows = getRows();

    sb.append("<table border=\"1\">");
    int i = 0;
    for (List<TableCell> row : rows) {
        sb.append("\n<tr>");
        if (!row.isEmpty()) {
            for (TableCell column : row) {
                sb.append(i == 0 ? "\n<th>" : "\n<td>");
                if (column != null && column.getTextBlocks() != null) {
                    boolean first = true;
                    for (ClassificationTextBlock textBlock : column.getTextBlocks()) {
                        if (!first) {
                            sb.append("<br />");
                        }
                        sb.append(textBlock.getText().replaceAll("\\n", "<br />"));
                        first = false;
                    }
                }
                sb.append(i == 0 ? "</th>" : "</td>");
            }
        }
        sb.append("</tr>");
        i++;
    }
    sb.append("</table>");

    return sb.toString();
}
||||
}
|
||||
@ -0,0 +1,38 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.dto.table;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.Rectangle;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
/**
 * One cell of a detected table: a rectangle on the page carrying the text
 * blocks found inside it, plus the header relationship computed by
 * Table.computeHeaders().
 */
@SuppressWarnings("serial")
@Data
@EqualsAndHashCode(callSuper = true)
@NoArgsConstructor
public class TableCell extends Rectangle {

    // Text content detected inside this cell.
    private List<ClassificationTextBlock> textBlocks = new ArrayList<>();

    // Header cells (row and/or column) this cell is associated with;
    // filled by Table.computeHeaders().
    private List<TableCell> headerCells = new ArrayList<>();

    // True if this cell itself acts as a header (bold-text heuristic).
    private boolean isHeaderCell;


    /**
     * Creates a cell from two corner points.
     * NOTE(review): the super call passes topLeft.getY() first and topLeft.getX()
     * second — verify against the Rectangle(float, float, float, float)
     * parameter order declared elsewhere in the project.
     */
    public TableCell(Point2D topLeft, Point2D bottomRight) {

        super((float) topLeft.getY(), (float) topLeft.getX(), (float) (bottomRight.getX() - topLeft.getX()), (float) (bottomRight.getY() - topLeft.getY()));
    }


    /** Appends a single detected text block to this cell. */
    public void addTextBlock(ClassificationTextBlock textBlock) {

        textBlocks.add(textBlock);
    }

}
@ -0,0 +1,22 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.dto.table;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.Value;
|
||||
|
||||
@Value
|
||||
@RequiredArgsConstructor
|
||||
public class TableCellPosition implements Comparable<TableCellPosition> {
|
||||
|
||||
int row;
|
||||
|
||||
int col;
|
||||
|
||||
|
||||
@Override
|
||||
public int compareTo(TableCellPosition other) {
|
||||
|
||||
int rowDiff = row - other.row;
|
||||
return rowDiff != 0 ? rowDiff : col - other.col;
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,286 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.dto.text;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.AbstractTextContainer;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.TextNormalizationUtilities;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
@EqualsAndHashCode(callSuper = true)
|
||||
@AllArgsConstructor
|
||||
@Builder
|
||||
@Data
|
||||
@NoArgsConstructor
|
||||
public class ClassificationTextBlock extends AbstractTextContainer {
|
||||
|
||||
@Builder.Default
|
||||
private List<TextPositionSequence> sequences = new ArrayList<>();
|
||||
|
||||
private int rotation;
|
||||
|
||||
private int indexOnPage;
|
||||
|
||||
private String mostPopularWordFont;
|
||||
|
||||
private String mostPopularWordStyle;
|
||||
|
||||
private float mostPopularWordFontSize;
|
||||
|
||||
private float mostPopularWordHeight;
|
||||
|
||||
private float mostPopularWordSpaceWidth;
|
||||
|
||||
private float highestFontSize;
|
||||
|
||||
private String classification;
|
||||
|
||||
|
||||
public TextDirection getDir() {
|
||||
|
||||
return sequences.get(0).getDir();
|
||||
}
|
||||
|
||||
private float getPageHeight() {
|
||||
|
||||
return sequences.get(0).getPageHeight();
|
||||
}
|
||||
|
||||
|
||||
private float getPageWidth() {
|
||||
|
||||
return sequences.get(0).getPageWidth();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Returns the minX value in pdf coordinate system.
|
||||
* Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation.
|
||||
* 0 -> LowerLeft
|
||||
* 90 -> UpperLeft
|
||||
* 180 -> UpperRight
|
||||
* 270 -> LowerRight
|
||||
*
|
||||
* @return the minX value in pdf coordinate system
|
||||
*/
|
||||
public float getPdfMinX() {
|
||||
|
||||
if (getDir().getDegrees() == 90) {
|
||||
return minY;
|
||||
} else if (getDir().getDegrees() == 180) {
|
||||
return getPageWidth() - maxX;
|
||||
|
||||
} else if (getDir().getDegrees() == 270) {
|
||||
|
||||
return getPageWidth() - maxY;
|
||||
} else {
|
||||
return minX;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the maxX value in pdf coordinate system.
|
||||
* Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation.
|
||||
* 0 -> LowerLeft
|
||||
* 90 -> UpperLeft
|
||||
* 180 -> UpperRight
|
||||
* 270 -> LowerRight
|
||||
*
|
||||
* @return the maxX value in pdf coordinate system
|
||||
*/
|
||||
public float getPdfMaxX() {
|
||||
|
||||
if (getDir().getDegrees() == 90) {
|
||||
return maxY;
|
||||
} else if (getDir().getDegrees() == 180) {
|
||||
return getPageWidth() - minX;
|
||||
} else if (getDir().getDegrees() == 270) {
|
||||
return getPageWidth() - minY;
|
||||
|
||||
} else {
|
||||
return maxX;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Returns the minY value in pdf coordinate system.
|
||||
* Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation.
|
||||
* 0 -> LowerLeft
|
||||
* 90 -> UpperLeft
|
||||
* 180 -> UpperRight
|
||||
* 270 -> LowerRight
|
||||
*
|
||||
* @return the minY value in pdf coordinate system
|
||||
*/
|
||||
public float getPdfMinY() {
|
||||
|
||||
if (getDir().getDegrees() == 90) {
|
||||
return minX;
|
||||
} else if (getDir().getDegrees() == 180) {
|
||||
return maxY;
|
||||
|
||||
} else if (getDir().getDegrees() == 270) {
|
||||
return getPageHeight() - maxX;
|
||||
|
||||
} else {
|
||||
return getPageHeight() - maxY;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Returns the maxY value in pdf coordinate system.
|
||||
* Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation.
|
||||
* 0 -> LowerLeft
|
||||
* 90 -> UpperLeft
|
||||
* 180 -> UpperRight
|
||||
* 270 -> LowerRight
|
||||
*
|
||||
* @return the maxY value in pdf coordinate system
|
||||
*/
|
||||
public float getPdfMaxY() {
|
||||
|
||||
if (getDir().getDegrees() == 90) {
|
||||
return maxX;
|
||||
} else if (getDir().getDegrees() == 180) {
|
||||
|
||||
return minY;
|
||||
} else if (getDir().getDegrees() == 270) {
|
||||
return getPageHeight() - minX;
|
||||
} else {
|
||||
return getPageHeight() - minY;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public ClassificationTextBlock(float minX, float maxX, float minY, float maxY, List<TextPositionSequence> sequences, int rotation, int indexOnPage) {
|
||||
super();
|
||||
this.indexOnPage = indexOnPage;
|
||||
super.minX = minX;
|
||||
super.maxX = maxX;
|
||||
super.minY = minY;
|
||||
super.maxY = maxY;
|
||||
this.sequences = sequences;
|
||||
this.rotation = rotation;
|
||||
}
|
||||
|
||||
|
||||
public ClassificationTextBlock union(TextPositionSequence r) {
|
||||
|
||||
ClassificationTextBlock union = this.copy();
|
||||
union.add(r);
|
||||
return union;
|
||||
}
|
||||
|
||||
|
||||
public ClassificationTextBlock union(ClassificationTextBlock r) {
|
||||
|
||||
ClassificationTextBlock union = this.copy();
|
||||
union.add(r);
|
||||
return union;
|
||||
}
|
||||
|
||||
|
||||
public void add(ClassificationTextBlock r) {
|
||||
|
||||
if (r.getMinX() < minX) {
|
||||
minX = r.getMinX();
|
||||
}
|
||||
if (r.getMaxX() > maxX) {
|
||||
maxX = r.getMaxX();
|
||||
}
|
||||
if (r.getMinY() < minY) {
|
||||
minY = r.getMinY();
|
||||
}
|
||||
if (r.getMaxY() > maxY) {
|
||||
maxY = r.getMaxY();
|
||||
}
|
||||
sequences.addAll(r.getSequences());
|
||||
}
|
||||
|
||||
|
||||
public void add(TextPositionSequence r) {
|
||||
|
||||
if (r.getMinXDirAdj() < minX) {
|
||||
minX = r.getMinXDirAdj();
|
||||
}
|
||||
if (r.getMaxXDirAdj() > maxX) {
|
||||
maxX = r.getMaxXDirAdj();
|
||||
}
|
||||
if (r.getMinYDirAdj() < minY) {
|
||||
minY = r.getMinYDirAdj();
|
||||
}
|
||||
if (r.getMaxYDirAdj() > maxY) {
|
||||
maxY = r.getMaxYDirAdj();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public ClassificationTextBlock copy() {
|
||||
|
||||
return new ClassificationTextBlock(minX, maxX, minY, maxY, sequences, rotation, indexOnPage);
|
||||
}
|
||||
|
||||
|
||||
public void resize(float x1, float y1, float width, float height) {
|
||||
|
||||
set(x1, y1, x1 + width, y1 + height);
|
||||
}
|
||||
|
||||
|
||||
public void set(float x1, float y1, float x2, float y2) {
|
||||
|
||||
this.minX = Math.min(x1, x2);
|
||||
this.maxX = Math.max(x1, x2);
|
||||
this.minY = Math.min(y1, y2);
|
||||
this.maxY = Math.max(y1, y2);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
StringBuilder builder = new StringBuilder();
|
||||
|
||||
for (int i = 0; i < sequences.size(); i++) {
|
||||
String sequenceAsString = sequences.get(i).toString();
|
||||
// Fix for missing Whitespace. This is recognized in getSequences method. See PDFTextStripper Line 1730.
|
||||
if (i != 0 && sequences.get(i - 1).charAt(sequences.get(i - 1).length() - 1) != ' ' && sequenceAsString.charAt(0) != ' ') {
|
||||
builder.append(' ');
|
||||
}
|
||||
builder.append(sequenceAsString);
|
||||
}
|
||||
|
||||
return builder.toString();
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String getText() {
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
|
||||
TextPositionSequence previous = null;
|
||||
for (TextPositionSequence word : sequences) {
|
||||
if (previous != null) {
|
||||
if (Math.abs(previous.getMaxYDirAdj() - word.getMaxYDirAdj()) > word.getTextHeight()) {
|
||||
sb.append('\n');
|
||||
} else {
|
||||
sb.append(' ');
|
||||
}
|
||||
}
|
||||
sb.append(word.toString());
|
||||
previous = word;
|
||||
}
|
||||
|
||||
return TextNormalizationUtilities.removeHyphenLineBreaks(sb.toString());
|
||||
|
||||
}
|
||||
}
|
||||
@ -0,0 +1,106 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.dto.text;
|
||||
|
||||
import org.apache.pdfbox.text.TextPosition;
|
||||
import org.springframework.beans.BeanUtils;
|
||||
|
||||
import com.dslplatform.json.CompiledJson;
|
||||
import com.dslplatform.json.JsonAttribute;
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
/**
 * Serializable snapshot of a PDFBox {@link TextPosition}: one rendered glyph
 * with its direction-adjusted geometry. Fields marked @JsonIgnore /
 * @JsonAttribute(ignore = true) are excluded from (re)serialization.
 */
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
@CompiledJson
public class RedTextPosition {

    // String form of the PDF text matrix (kept for serialization/debugging).
    private String textMatrix;
    // Packed geometry: [xDirAdj, yDirAdj, widthDirAdj, heightDir] — see accessors below.
    private float[] position;

    @JsonIgnore
    private int rotation;

    @JsonIgnore
    private float pageHeight;

    @JsonIgnore
    private float pageWidth;

    // The glyph's unicode text (can be more than one char — TODO confirm
    // how multi-char mappings are handled downstream).
    private String unicode;

    @JsonIgnore
    private float dir;

    // not used in reanalysis
    @JsonIgnore
    @JsonAttribute(ignore = true)
    private float widthOfSpace;

    // not used in reanalysis
    @JsonIgnore
    @JsonAttribute(ignore = true)
    private float fontSizeInPt;

    // not used in reanalysis
    @JsonIgnore
    @JsonAttribute(ignore = true)
    private String fontName;


    /**
     * Copies the relevant properties of a PDFBox TextPosition into a new
     * RedTextPosition. Same-named bean properties are copied reflectively;
     * font name, font size, text matrix and the packed position array are
     * filled in explicitly afterwards.
     */
    @SneakyThrows
    public static RedTextPosition fromTextPosition(TextPosition textPosition) {

        var pos = new RedTextPosition();
        BeanUtils.copyProperties(textPosition, pos);
        pos.setFontName(textPosition.getFont().getName());

        pos.setFontSizeInPt(textPosition.getFontSizeInPt());

        pos.setTextMatrix(textPosition.getTextMatrix().toString());

        var position = new float[4];

        position[0] = textPosition.getXDirAdj();
        position[1] = textPosition.getYDirAdj();
        position[2] = textPosition.getWidthDirAdj();
        position[3] = textPosition.getHeightDir();

        pos.setPosition(position);
        return pos;
    }


    /** Direction-adjusted X coordinate of the glyph. */
    @JsonIgnore
    public float getXDirAdj() {

        return position[0];
    }


    /** Direction-adjusted Y coordinate of the glyph. */
    @JsonIgnore
    public float getYDirAdj() {

        return position[1];
    }


    /** Direction-adjusted width of the glyph. */
    @JsonIgnore
    public float getWidthDirAdj() {

        return position[2];
    }


    /** Direction-adjusted height of the glyph. */
    @JsonIgnore
    public float getHeightDir() {

        return position[3];
    }

}
@ -0,0 +1,47 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.dto.text;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import lombok.Getter;
|
||||
|
||||
/**
 * Counts occurrences of string values and reports the most frequent one.
 * Not thread-safe.
 */
public class StringFrequencyCounter {

    private final Map<String, Integer> countPerValue = new HashMap<>();


    /** Live view of the per-value counts (same accessor the Lombok @Getter generated). */
    public Map<String, Integer> getCountPerValue() {

        return countPerValue;
    }


    /** Increments the count for the given value, starting at 1. */
    public void add(String value) {

        // merge replaces the hand-rolled containsKey/put dance.
        countPerValue.merge(value, 1, Integer::sum);
    }


    /** Adds all counts of another counter's map into this one. */
    public void addAll(Map<String, Integer> otherCounter) {

        otherCounter.forEach((value, count) -> countPerValue.merge(value, count, Integer::sum));
    }


    /**
     * Returns the value with the highest count, or null if nothing was counted.
     * Ties are broken arbitrarily (map iteration order).
     */
    public String getMostPopular() {

        Map.Entry<String, Integer> mostPopular = null;
        for (Map.Entry<String, Integer> entry : countPerValue.entrySet()) {
            if (mostPopular == null || entry.getValue() > mostPopular.getValue()) {
                mostPopular = entry;
            }
        }
        return mostPopular != null ? mostPopular.getKey() : null;
    }

}
||||
@ -0,0 +1,8 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.dto.text;
|
||||
|
||||
/**
 * Horizontal orientation of a text block relative to the page layout;
 * NONE when no orientation applies.
 */
public enum TextBlockOrientation {

    NONE,
    LEFT,
    RIGHT
}
||||
@ -0,0 +1,54 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.dto.text;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonCreator;
|
||||
import com.fasterxml.jackson.annotation.JsonValue;
|
||||
|
||||
import lombok.Getter;
|
||||
|
||||
/**
 * The four supported text directions (multiples of 90 degrees), exposing both
 * degree and radian values. Serialized as the plain degree value for Jackson
 * (@JsonValue) and dsl-json (jsonValue()) alike.
 */
@Getter
public enum TextDirection {
    ZERO(0f),
    QUARTER_CIRCLE(90f),
    HALF_CIRCLE(180f),
    THREE_QUARTER_CIRCLE(270f);

    // Suffix used by toString(), e.g. "90.0°".
    public static final String VALUE_STRING_SUFFIX = "°";

    @JsonValue
    private final float degrees;
    private final float radians;


    TextDirection(float degreeValue) {

        degrees = degreeValue;
        radians = (float) Math.toRadians(degreeValue);
    }


    @Override
    public String toString() {

        return degrees + VALUE_STRING_SUFFIX;
    }


    // dsl-json serialization hook, mirroring the Jackson @JsonValue above.
    @com.dslplatform.json.JsonValue
    public float jsonValue() {

        return getDegrees();
    }


    /**
     * Maps an exact degree value (0, 90, 180 or 270) back to its constant.
     * The float == comparison is safe here because all supported values are
     * exactly representable.
     *
     * @throws IllegalArgumentException for any unsupported degree value
     */
    @JsonCreator(mode = JsonCreator.Mode.DELEGATING)
    public static TextDirection fromDegrees(float degrees) {

        for (var dir : TextDirection.values()) {
            if (degrees == dir.degrees) {
                return dir;
            }
        }

        throw new IllegalArgumentException(String.format("A value of %f is not supported by TextDirection", degrees));
    }
}
||||
@ -0,0 +1,298 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.dto.text;
|
||||
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.Point2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.pdfbox.text.TextPosition;
|
||||
|
||||
import com.dslplatform.json.JsonAttribute;
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point;
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
/**
 * A sequence of glyph positions forming one word (or word fragment) on a page.
 * Implements CharSequence over the glyphs' unicode values and exposes
 * direction-adjusted bounding-box accessors (0,0 = upper left).
 */
@Slf4j
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class TextPositionSequence implements CharSequence {

    // Slack added around the text height (and the right edge) when computing boxes.
    public static final int HEIGHT_PADDING = 2;
    private int page;
    private List<RedTextPosition> textPositions = new ArrayList<>();

    private TextDirection dir;
    private int rotation;
    private float pageHeight;
    private float pageWidth;


    /**
     * Builds a sequence from PDFBox text positions; direction, rotation and
     * page size are taken from the first position (the list must not be empty).
     */
    public TextPositionSequence(List<TextPosition> textPositions, int page) {

        this.textPositions = textPositions.stream().map(RedTextPosition::fromTextPosition).collect(Collectors.toList());
        this.page = page;
        this.dir = TextDirection.fromDegrees(textPositions.get(0).getDir());
        this.rotation = textPositions.get(0).getRotation();
        this.pageHeight = textPositions.get(0).getPageHeight();
        this.pageWidth = textPositions.get(0).getPageWidth();
    }


    /** Length in glyphs (not necessarily in chars — see charAt). */
    @Override
    public int length() {

        return textPositions.size();
    }


    // NOTE(review): only the first char of the glyph's unicode value is
    // returned; multi-char mappings would be truncated here — confirm.
    @Override
    public char charAt(int index) {

        RedTextPosition textPosition = textPositionAt(index);
        String text = textPosition.getUnicode();
        return text.charAt(0);
    }


    // NOTE(review): the sub-sequence shares its textPositions with this
    // instance via subList — structural changes propagate between the two.
    @Override
    public TextPositionSequence subSequence(int start, int end) {

        var textPositionSequence = new TextPositionSequence();
        textPositionSequence.textPositions = textPositions.subList(start, end);
        textPositionSequence.page = page;
        textPositionSequence.dir = dir;
        textPositionSequence.rotation = rotation;
        textPositionSequence.pageHeight = pageHeight;
        textPositionSequence.pageWidth = pageWidth;

        return textPositionSequence;
    }


    /** Concatenation of the glyphs' first unicode chars. */
    @Override
    public String toString() {

        StringBuilder builder = new StringBuilder(length());
        for (int i = 0; i < length(); i++) {
            builder.append(charAt(i));
        }
        return builder.toString();
    }


    /** Glyph position at the given index. */
    public RedTextPosition textPositionAt(int index) {

        return textPositions.get(index);
    }


    /** Appends a position and adopts page/direction metadata from the given sequence. */
    public void add(TextPositionSequence textPositionSequence, RedTextPosition textPosition) {

        this.textPositions.add(textPosition);
        this.page = textPositionSequence.getPage();
        this.dir = textPositionSequence.getDir();
        this.rotation = textPositionSequence.getRotation();
        this.pageHeight = textPositionSequence.getPageHeight();
        this.pageWidth = textPositionSequence.getPageWidth();
    }


    /** Appends a PDFBox position; metadata is (re)derived from the first stored position. */
    public void add(TextPosition textPosition) {

        this.textPositions.add(RedTextPosition.fromTextPosition(textPosition));

        this.dir = TextDirection.fromDegrees(textPositions.get(0).getDir());
        this.rotation = textPositions.get(0).getRotation();
        this.pageHeight = textPositions.get(0).getPageHeight();
        this.pageWidth = textPositions.get(0).getPageWidth();

    }


    /**
     * This value is adjusted so that 0,0 is upper left and it is adjusted based on the text direction.
     * This method ignores the page rotation but takes the text rotation and adjusts the coordinates to awt.
     *
     * @return the text direction adjusted minX value
     */
    @JsonIgnore
    @JsonAttribute(ignore = true)
    public float getMinXDirAdj() {

        return textPositions.get(0).getXDirAdj();

    }


    /**
     * This value is adjusted so that 0,0 is upper left and it is adjusted based on the text direction.
     * This method ignores the page rotation but takes the text rotation and adjusts the coordinates to awt.
     *
     * @return the text direction adjusted maxX value
     */
    @JsonIgnore
    @JsonAttribute(ignore = true)
    public float getMaxXDirAdj() {

        // NOTE(review): HEIGHT_PADDING is added to an X extent here — confirm intended.
        return textPositions.get(textPositions.size() - 1).getXDirAdj() + textPositions.get(textPositions.size() - 1).getWidthDirAdj() + HEIGHT_PADDING;

    }


    /**
     * This value is adjusted so that 0,0 is upper left and it is adjusted based on the text direction.
     * This method ignores the page rotation but takes the text rotation and adjusts the coordinates to awt.
     *
     * @return the text direction adjusted minY value. The upper border of the bounding box of the word.
     */
    @JsonIgnore
    @JsonAttribute(ignore = true)
    public float getMinYDirAdj() {

        return textPositions.get(0).getYDirAdj() - getTextHeight();

    }


    /**
     * This value is adjusted so that 0,0 is upper left and it is adjusted based on the text direction.
     * This method ignores the page rotation but takes the text rotation and adjusts the coordinates to awt.
     *
     * @return the text direction adjusted maxY value. The lower border of the bounding box of the word.
     */
    @JsonIgnore
    @JsonAttribute(ignore = true)
    public float getMaxYDirAdj() {

        return textPositions.get(0).getYDirAdj();

    }


    /** Glyph height of the first position plus padding. */
    @JsonIgnore
    @JsonAttribute(ignore = true)
    public float getTextHeight() {

        return textPositions.get(0).getHeightDir() + HEIGHT_PADDING;
    }


    /** Bounding-box height (direction-adjusted). */
    @JsonIgnore
    @JsonAttribute(ignore = true)
    public float getHeight() {

        return getMaxYDirAdj() - getMinYDirAdj();
    }


    /** Bounding-box width (direction-adjusted). */
    @JsonIgnore
    @JsonAttribute(ignore = true)
    public float getWidth() {

        return getMaxXDirAdj() - getMinXDirAdj();
    }


    /** Lower-cased font name of the first glyph with ",bold"/",italic" suffixes stripped. */
    @JsonIgnore
    @JsonAttribute(ignore = true)
    public String getFont() {

        return textPositions.get(0).getFontName().toLowerCase().replaceAll(",bold", "").replaceAll(",italic", "");
    }


    /** Style derived from the font name: "bold", "italic", "bold, italic" or "standard". */
    @JsonIgnore
    @JsonAttribute(ignore = true)
    public String getFontStyle() {

        String lowercaseFontName = textPositions.get(0).getFontName().toLowerCase();

        if (lowercaseFontName.contains("bold") && lowercaseFontName.contains("italic")) {
            return "bold, italic";
        } else if (lowercaseFontName.contains("bold")) {
            return "bold";
        } else if (lowercaseFontName.contains("italic")) {
            return "italic";
        } else {
            return "standard";
        }

    }


    /** Font size (pt) of the first glyph. */
    @JsonIgnore
    @JsonAttribute(ignore = true)
    public float getFontSize() {

        return textPositions.get(0).getFontSizeInPt();
    }


    /** Space width of the first glyph's font. */
    @JsonIgnore
    @JsonAttribute(ignore = true)
    public float getSpaceWidth() {

        return textPositions.get(0).getWidthOfSpace();
    }


    /**
     * This returns the bounding box of the word in Pdf Coordinate System where {0,0} rotated with the page rotation.
     * 0 -> LowerLeft
     * 90 -> UpperLeft
     * 180 -> UpperRight
     * 270 -> LowerRight
     *
     * @return bounding box of the word in Pdf Coordinate System
     */
    @JsonIgnore
    @JsonAttribute(ignore = true)
    @SneakyThrows
    public Rectangle getRectangle() {

        log.debug("ClassificationPage: '{}', Word: '{}', Rotation: '{}', textRotation {}", page, this, rotation, dir);

        float textHeight = getTextHeight();

        RedTextPosition firstTextPos = textPositions.get(0);
        RedTextPosition lastTextPos = textPositions.get(textPositions.size() - 1);

        Point2D bottomLeft = new Point2D.Double(firstTextPos.getXDirAdj(), firstTextPos.getYDirAdj() - HEIGHT_PADDING);
        Point2D topRight = new Point2D.Double(lastTextPos.getXDirAdj() + lastTextPos.getWidthDirAdj(), lastTextPos.getYDirAdj() + textHeight + HEIGHT_PADDING);

        // Rotate back into the page coordinate system, then flip the Y axis
        // (scale(1, -1)) to go from awt-style top-left to PDF bottom-left origin.
        // The rotation pivot differs per direction because the page's effective
        // width/height swap at 90/270 degrees — verify the translate terms when
        // touching this.
        AffineTransform transform = new AffineTransform();
        if (dir == TextDirection.ZERO || dir == TextDirection.HALF_CIRCLE) {
            transform.rotate(dir.getRadians(), pageWidth / 2f, pageHeight / 2f);
            transform.translate(0f, pageHeight + textHeight);
            transform.scale(1., -1.);
        } else if (dir == TextDirection.QUARTER_CIRCLE) {
            transform.rotate(dir.getRadians(), pageWidth / 2f, pageWidth / 2f);
            transform.translate(0f, pageWidth + textHeight);
            transform.scale(1., -1.);
        } else {
            transform.rotate(dir.getRadians(), pageHeight / 2f, pageHeight / 2f);
            transform.translate(0f, pageWidth + textHeight);
            transform.scale(1., -1.);
        }

        bottomLeft = transform.transform(bottomLeft, null);
        topRight = transform.transform(topRight, null);

        return new Rectangle( //
            new Point((float) bottomLeft.getX(), (float) bottomLeft.getY()),
            (float) (topRight.getX() - bottomLeft.getX()),
            (float) (topRight.getY() - bottomLeft.getY()),
            page);
    }

}
||||
@ -0,0 +1,14 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.dto.text;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
|
||||
/**
 * Container for text blocks that have not yet been assigned a layout
 * classification. Accessors and the all-args constructor are generated
 * by Lombok.
 */
@Data
@AllArgsConstructor
public class UnclassifiedText {

    // Text blocks awaiting classification.
    private List<ClassificationTextBlock> textBlocks;

}
|
||||
@ -0,0 +1,384 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.parsing;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.Map;
|
||||
import java.util.WeakHashMap;
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.fontbox.ttf.TrueTypeFont;
|
||||
import org.apache.fontbox.util.BoundingBox;
|
||||
import org.apache.pdfbox.contentstream.PDFStreamEngine;
|
||||
import org.apache.pdfbox.contentstream.operator.DrawObject;
|
||||
import org.apache.pdfbox.contentstream.operator.state.Concatenate;
|
||||
import org.apache.pdfbox.contentstream.operator.state.Restore;
|
||||
import org.apache.pdfbox.contentstream.operator.state.Save;
|
||||
import org.apache.pdfbox.contentstream.operator.state.SetGraphicsStateParameters;
|
||||
import org.apache.pdfbox.contentstream.operator.state.SetMatrix;
|
||||
import org.apache.pdfbox.contentstream.operator.text.BeginText;
|
||||
import org.apache.pdfbox.contentstream.operator.text.EndText;
|
||||
import org.apache.pdfbox.contentstream.operator.text.MoveText;
|
||||
import org.apache.pdfbox.contentstream.operator.text.MoveTextSetLeading;
|
||||
import org.apache.pdfbox.contentstream.operator.text.NextLine;
|
||||
import org.apache.pdfbox.contentstream.operator.text.SetCharSpacing;
|
||||
import org.apache.pdfbox.contentstream.operator.text.SetFontAndSize;
|
||||
import org.apache.pdfbox.contentstream.operator.text.SetTextHorizontalScaling;
|
||||
import org.apache.pdfbox.contentstream.operator.text.SetTextLeading;
|
||||
import org.apache.pdfbox.contentstream.operator.text.SetTextRenderingMode;
|
||||
import org.apache.pdfbox.contentstream.operator.text.SetTextRise;
|
||||
import org.apache.pdfbox.contentstream.operator.text.SetWordSpacing;
|
||||
import org.apache.pdfbox.contentstream.operator.text.ShowText;
|
||||
import org.apache.pdfbox.contentstream.operator.text.ShowTextAdjusted;
|
||||
import org.apache.pdfbox.contentstream.operator.text.ShowTextLine;
|
||||
import org.apache.pdfbox.contentstream.operator.text.ShowTextLineAndSpace;
|
||||
import org.apache.pdfbox.cos.COSDictionary;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
||||
import org.apache.pdfbox.pdmodel.font.PDCIDFont;
|
||||
import org.apache.pdfbox.pdmodel.font.PDCIDFontType2;
|
||||
import org.apache.pdfbox.pdmodel.font.PDFont;
|
||||
import org.apache.pdfbox.pdmodel.font.PDFontDescriptor;
|
||||
import org.apache.pdfbox.pdmodel.font.PDSimpleFont;
|
||||
import org.apache.pdfbox.pdmodel.font.PDTrueTypeFont;
|
||||
import org.apache.pdfbox.pdmodel.font.PDType0Font;
|
||||
import org.apache.pdfbox.pdmodel.font.PDType3Font;
|
||||
import org.apache.pdfbox.pdmodel.font.encoding.GlyphList;
|
||||
import org.apache.pdfbox.pdmodel.graphics.state.PDGraphicsState;
|
||||
import org.apache.pdfbox.text.TextPosition;
|
||||
import org.apache.pdfbox.util.Matrix;
|
||||
import org.apache.pdfbox.util.Vector;
|
||||
|
||||
/**
|
||||
* LEGACY text calculations which are known to be incorrect but are depended on by PDFTextStripper.
|
||||
* <p>
|
||||
* This class exists only so that we don't break the code of users who have their own subclasses of
|
||||
* PDFTextStripper. It replaces the mostly empty implementation of showGlyph() in PDFStreamEngine
|
||||
* with a heuristic implementation which is backwards compatible.
|
||||
* <p>
|
||||
* DO NOT USE THIS CODE UNLESS YOU ARE WORKING WITH PDFTextStripper.
|
||||
* THIS CODE IS DELIBERATELY INCORRECT, USE PDFStreamEngine INSTEAD.
|
||||
*/
|
||||
// Vendored copy of PDFBox's LegacyPDFStreamEngine; kept byte-compatible with
// upstream so PDFTextStripper subclasses behave identically. Do not "fix" the
// heuristics here — they are deliberately incorrect (see class Javadoc).
@SuppressWarnings({"PMD", "checkstyle:all"})
class LegacyPDFStreamEngine extends PDFStreamEngine {

    private static final Log LOG = LogFactory.getLog(LegacyPDFStreamEngine.class);

    // Rotation of the page currently being processed, in degrees.
    private int pageRotation;
    // Crop box of the current page; used to normalize coordinates to (0,0).
    private PDRectangle pageSize;
    // Translation that maps crop-box coordinates to origin, or null if not needed.
    private Matrix translateMatrix;
    // Glyph list used for Unicode mapping (Adobe list plus additions).
    private final GlyphList glyphList;
    // Cache of computed font heights, keyed by the font's COS dictionary.
    // WeakHashMap so entries vanish when the document releases its fonts.
    private final Map<COSDictionary, Float> fontHeightMap = new WeakHashMap<COSDictionary, Float>();


    /**
     * Constructor. Registers only the text-related content-stream operators
     * (graphics operators are ignored) and loads the additional glyph list
     * used for Unicode mapping.
     */
    LegacyPDFStreamEngine() throws IOException {

        addOperator(new BeginText());
        addOperator(new Concatenate());
        addOperator(new DrawObject()); // special text version
        addOperator(new EndText());
        addOperator(new SetGraphicsStateParameters());
        addOperator(new Save());
        addOperator(new Restore());
        addOperator(new NextLine());
        addOperator(new SetCharSpacing());
        addOperator(new MoveText());
        addOperator(new MoveTextSetLeading());
        addOperator(new SetFontAndSize());
        addOperator(new ShowText());
        addOperator(new ShowTextAdjusted());
        addOperator(new SetTextLeading());
        addOperator(new SetMatrix());
        addOperator(new SetTextRenderingMode());
        addOperator(new SetTextRise());
        addOperator(new SetWordSpacing());
        addOperator(new SetTextHorizontalScaling());
        addOperator(new ShowTextLine());
        addOperator(new ShowTextLineAndSpace());

        // load additional glyph list for Unicode mapping
        // NOTE(review): the stream is handed to GlyphList without an explicit
        // close here — presumably consumed/closed by the GlyphList constructor
        // (this matches upstream PDFBox); confirm when upgrading PDFBox.
        String path = "/org/apache/pdfbox/resources/glyphlist/additional.txt";
        InputStream input = GlyphList.class.getResourceAsStream(path);
        glyphList = new GlyphList(GlyphList.getAdobeGlyphList(), input);
    }


    /**
     * This will initialize and process the contents of the stream.
     *
     * @param page the page to process
     * @throws IOException if there is an error accessing the stream.
     */
    @Override
    public void processPage(PDPage page) throws IOException {

        this.pageRotation = page.getRotation();
        this.pageSize = page.getCropBox();

        if (pageSize.getLowerLeftX() == 0 && pageSize.getLowerLeftY() == 0) {
            translateMatrix = null;
        } else {
            // translation matrix for cropbox
            translateMatrix = Matrix.getTranslateInstance(-pageSize.getLowerLeftX(), -pageSize.getLowerLeftY());
        }
        super.processPage(page);
    }


    /**
     * Called when a glyph is to be processed. The heuristic calculations here were originally
     * written by Ben Litchfield for PDFStreamEngine.
     */
    @Override
    protected void showGlyph(Matrix textRenderingMatrix, PDFont font, int code,Vector displacement) throws IOException {
        //
        // legacy calculations which were previously in PDFStreamEngine
        //
        // DO NOT USE THIS CODE UNLESS YOU ARE WORKING WITH PDFTextStripper.
        // THIS CODE IS DELIBERATELY INCORRECT
        //

        PDGraphicsState state = getGraphicsState();
        Matrix ctm = state.getCurrentTransformationMatrix();
        float fontSize = state.getTextState().getFontSize();
        float horizontalScaling = state.getTextState().getHorizontalScaling() / 100f;
        Matrix textMatrix = getTextMatrix();

        float displacementX = displacement.getX();
        // the sorting algorithm is based on the width of the character. As the displacement
        // for vertical characters doesn't provide any suitable value for it, we have to
        // calculate our own
        if (font.isVertical()) {
            displacementX = font.getWidth(code) / 1000;
            // there may be an additional scaling factor for true type fonts
            TrueTypeFont ttf = null;
            if (font instanceof PDTrueTypeFont) {
                ttf = ((PDTrueTypeFont) font).getTrueTypeFont();
            } else if (font instanceof PDType0Font) {
                PDCIDFont cidFont = ((PDType0Font) font).getDescendantFont();
                if (cidFont instanceof PDCIDFontType2) {
                    ttf = ((PDCIDFontType2) cidFont).getTrueTypeFont();
                }
            }
            if (ttf != null && ttf.getUnitsPerEm() != 1000) {
                displacementX *= 1000f / ttf.getUnitsPerEm();
            }
        }

        //
        // legacy calculations which were previously in PDFStreamEngine
        //
        // DO NOT USE THIS CODE UNLESS YOU ARE WORKING WITH PDFTextStripper.
        // THIS CODE IS DELIBERATELY INCORRECT
        //

        // (modified) combined displacement, this is calculated *without* taking the character
        // spacing and word spacing into account, due to legacy code in TextStripper
        float tx = displacementX * fontSize * horizontalScaling;
        float ty = displacement.getY() * fontSize;

        // (modified) combined displacement matrix
        Matrix td = Matrix.getTranslateInstance(tx, ty);

        // (modified) text rendering matrix
        Matrix nextTextRenderingMatrix = td.multiply(textMatrix).multiply(ctm); // text space -> device space
        float nextX = nextTextRenderingMatrix.getTranslateX();
        float nextY = nextTextRenderingMatrix.getTranslateY();

        // (modified) width and height calculations
        float dxDisplay = nextX - textRenderingMatrix.getTranslateX();
        // Font heights are cached per font dictionary; computing them walks the
        // font's bounding box and descriptor (see computeFontHeight).
        Float fontHeight = fontHeightMap.get(font.getCOSObject());
        if (fontHeight == null) {
            fontHeight = computeFontHeight(font);
            fontHeightMap.put(font.getCOSObject(), fontHeight);
        }
        float dyDisplay = fontHeight * textRenderingMatrix.getScalingFactorY();

        //
        // start of the original method
        //

        // Note on variable names. There are three different units being used in this code.
        // Character sizes are given in glyph units, text locations are initially given in text
        // units, and we want to save the data in display units. The variable names should end with
        // Text or Disp to represent if the values are in text or disp units (no glyph units are
        // saved).

        float glyphSpaceToTextSpaceFactor = 1 / 1000f;
        if (font instanceof PDType3Font) {
            glyphSpaceToTextSpaceFactor = font.getFontMatrix().getScaleX();
        }

        float spaceWidthText = 0;
        try {
            // to avoid crash as described in PDFBOX-614, see what the space displacement should be
            spaceWidthText = font.getSpaceWidth() * glyphSpaceToTextSpaceFactor;
        } catch (Throwable exception) {
            LOG.warn(exception, exception);
        }

        if (spaceWidthText == 0) {
            spaceWidthText = font.getAverageFontWidth() * glyphSpaceToTextSpaceFactor;
            // the average space width appears to be higher than necessary so make it smaller
            spaceWidthText *= .80f;
        }
        if (spaceWidthText == 0) {
            spaceWidthText = 1.0f; // if could not find font, use a generic value
        }

        // the space width has to be transformed into display units
        float spaceWidthDisplay = spaceWidthText * textRenderingMatrix.getScalingFactorX();

        // use our additional glyph list for Unicode mapping
        String unicodeMapping = font.toUnicode(code, glyphList);

        // when there is no Unicode mapping available, Acrobat simply coerces the character code
        // into Unicode, so we do the same. Subclasses of PDFStreamEngine don't necessarily want
        // this, which is why we leave it until this point in PDFTextStreamEngine.
        if (unicodeMapping == null) {
            if (font instanceof PDSimpleFont) {
                char c = (char) code;
                unicodeMapping = new String(new char[]{c});
            } else {
                // Acrobat doesn't seem to coerce composite font's character codes, instead it
                // skips them. See the "allah2.pdf" TestTextStripper file.
                return;
            }
        }

        // adjust for cropbox if needed
        Matrix translatedTextRenderingMatrix;
        if (translateMatrix == null) {
            translatedTextRenderingMatrix = textRenderingMatrix;
        } else {
            translatedTextRenderingMatrix = Matrix.concatenate(translateMatrix, textRenderingMatrix);
            nextX -= pageSize.getLowerLeftX();
            nextY -= pageSize.getLowerLeftY();
        }

        // This is a hack for unicode letter with 2 chars e.g. RA see unicodeProblem.pdf
        // Each of the two chars is emitted as its own TextPosition sharing the
        // same code, position and metrics.
        if (unicodeMapping.length() == 2) {
            processTextPosition(new TextPosition(pageRotation,
                    pageSize.getWidth(),
                    pageSize.getHeight(),
                    translatedTextRenderingMatrix,
                    nextX,
                    nextY,
                    Math.abs(dyDisplay),
                    dxDisplay,
                    Math.abs(spaceWidthDisplay),
                    Character.toString(unicodeMapping.charAt(0)),
                    new int[]{code},
                    font,
                    fontSize,
                    (int) (fontSize * textMatrix.getScalingFactorX())));
            processTextPosition(new TextPosition(pageRotation,
                    pageSize.getWidth(),
                    pageSize.getHeight(),
                    translatedTextRenderingMatrix,
                    nextX,
                    nextY,
                    Math.abs(dyDisplay),
                    dxDisplay,
                    Math.abs(spaceWidthDisplay),
                    Character.toString(unicodeMapping.charAt(1)),
                    new int[]{code},
                    font,
                    fontSize,
                    (int) (fontSize * textMatrix.getScalingFactorX())));
        } else {

            processTextPosition(new TextPosition(pageRotation,
                    pageSize.getWidth(),
                    pageSize.getHeight(),
                    translatedTextRenderingMatrix,
                    nextX,
                    nextY,
                    Math.abs(dyDisplay),
                    dxDisplay,
                    Math.abs(spaceWidthDisplay),
                    unicodeMapping,
                    new int[]{code},
                    font,
                    fontSize,
                    (int) (fontSize * textMatrix.getScalingFactorX())));
        }
    }


    /**
     * Compute the font height. Override this if you want to use own calculations.
     *
     * @param font the font.
     * @return the font height.
     * @throws IOException if there is an error while getting the font bounding box.
     */
    protected float computeFontHeight(PDFont font) throws IOException {

        BoundingBox bbox = font.getBoundingBox();
        if (bbox.getLowerLeftY() < Short.MIN_VALUE) {
            // PDFBOX-2158 and PDFBOX-3130
            // files by Salmat eSolutions / ClibPDF Library
            bbox.setLowerLeftY(-(bbox.getLowerLeftY() + 65536));
        }
        // 1/2 the bbox is used as the height todo: why?
        float glyphHeight = bbox.getHeight() / 2;

        // sometimes the bbox has very high values, but CapHeight is OK
        PDFontDescriptor fontDescriptor = font.getFontDescriptor();
        if (fontDescriptor != null) {
            float capHeight = fontDescriptor.getCapHeight();
            if (Float.compare(capHeight, 0) != 0 && (capHeight < glyphHeight || Float.compare(glyphHeight, 0) == 0)) {
                glyphHeight = capHeight;
            }
            // PDFBOX-3464, PDFBOX-4480, PDFBOX-4553:
            // sometimes even CapHeight has very high value, but Ascent and Descent are ok
            float ascent = fontDescriptor.getAscent();
            float descent = fontDescriptor.getDescent();
            if (capHeight > ascent && ascent > 0 && descent < 0 && ((ascent - descent) / 2 < glyphHeight || Float.compare(glyphHeight, 0) == 0)) {
                glyphHeight = (ascent - descent) / 2;
            }
        }

        // transformPoint from glyph space -> text space
        float height;
        if (font instanceof PDType3Font) {
            height = font.getFontMatrix().transformPoint(0, glyphHeight).y;
        } else {
            height = glyphHeight / 1000;
        }

        return height;
    }


    /**
     * A method provided as an event interface to allow a subclass to perform some specific
     * functionality when text needs to be processed.
     *
     * @param text The text to be processed.
     */
    protected void processTextPosition(TextPosition text) {
        // subclasses can override to provide specific functionality
    }

}
|
||||
@ -0,0 +1,82 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.parsing;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.pdfbox.text.PDFTextStripperByArea;
|
||||
import org.apache.pdfbox.text.TextPosition;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.TextPositionSequence;
|
||||
|
||||
import lombok.Getter;
|
||||
import lombok.Setter;
|
||||
|
||||
public class PDFAreaTextStripper extends PDFTextStripperByArea {
|
||||
|
||||
@Getter
|
||||
private List<TextPositionSequence> textPositionSequences = new ArrayList<>();
|
||||
|
||||
@Setter
|
||||
private int pageNumber;
|
||||
|
||||
|
||||
public PDFAreaTextStripper() throws IOException {
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void writeString(String text, List<TextPosition> textPositions) throws IOException {
|
||||
|
||||
int startIndex = 0;
|
||||
for (int i = 0; i <= textPositions.size() - 1; i++) {
|
||||
|
||||
if (i == 0 && (textPositions.get(i).getUnicode().equals(" ") || textPositions.get(i).getUnicode().equals("\u00A0"))) {
|
||||
startIndex++;
|
||||
continue;
|
||||
}
|
||||
|
||||
// Strange but sometimes this is happening, for example: Metolachlor2.pdf
|
||||
if (i > 0 && textPositions.get(i).getX() < textPositions.get(i - 1).getX()) {
|
||||
List<TextPosition> sublist = textPositions.subList(startIndex, i);
|
||||
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0")))) {
|
||||
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber));
|
||||
}
|
||||
startIndex = i;
|
||||
}
|
||||
|
||||
if (textPositions.get(i).getRotation() == 0 && i > 0 && textPositions.get(i).getX() > textPositions.get(i - 1).getEndX() + 1) {
|
||||
List<TextPosition> sublist = textPositions.subList(startIndex, i);
|
||||
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0")))) {
|
||||
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber));
|
||||
}
|
||||
startIndex = i;
|
||||
}
|
||||
|
||||
if (i > 0 && (textPositions.get(i).getUnicode().equals(" ") || textPositions.get(i).getUnicode().equals("\u00A0")) && i <= textPositions.size() - 2) {
|
||||
List<TextPosition> sublist = textPositions.subList(startIndex, i);
|
||||
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0")))) {
|
||||
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber));
|
||||
}
|
||||
startIndex = i + 1;
|
||||
}
|
||||
}
|
||||
|
||||
List<TextPosition> sublist = textPositions.subList(startIndex, textPositions.size());
|
||||
if (!sublist.isEmpty() && (sublist.get(sublist.size() - 1).getUnicode().equals(" ") || sublist.get(sublist.size() - 1).getUnicode().equals("\u00A0"))) {
|
||||
sublist = sublist.subList(0, sublist.size() - 1);
|
||||
}
|
||||
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0")))) {
|
||||
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber));
|
||||
}
|
||||
super.writeString(text);
|
||||
}
|
||||
|
||||
|
||||
public void clearPositions() {
|
||||
|
||||
textPositionSequences = new ArrayList<>();
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,335 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.parsing;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.pdfbox.contentstream.operator.Operator;
|
||||
import org.apache.pdfbox.contentstream.operator.OperatorName;
|
||||
import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingColor;
|
||||
import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingColorN;
|
||||
import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingColorSpace;
|
||||
import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingDeviceCMYKColor;
|
||||
import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingDeviceGrayColor;
|
||||
import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingDeviceRGBColor;
|
||||
import org.apache.pdfbox.contentstream.operator.color.SetStrokingColor;
|
||||
import org.apache.pdfbox.contentstream.operator.color.SetStrokingColorN;
|
||||
import org.apache.pdfbox.contentstream.operator.color.SetStrokingColorSpace;
|
||||
import org.apache.pdfbox.contentstream.operator.color.SetStrokingDeviceCMYKColor;
|
||||
import org.apache.pdfbox.contentstream.operator.color.SetStrokingDeviceGrayColor;
|
||||
import org.apache.pdfbox.contentstream.operator.color.SetStrokingDeviceRGBColor;
|
||||
import org.apache.pdfbox.contentstream.operator.state.SetFlatness;
|
||||
import org.apache.pdfbox.contentstream.operator.state.SetLineCapStyle;
|
||||
import org.apache.pdfbox.contentstream.operator.state.SetLineDashPattern;
|
||||
import org.apache.pdfbox.contentstream.operator.state.SetLineJoinStyle;
|
||||
import org.apache.pdfbox.contentstream.operator.state.SetLineMiterLimit;
|
||||
import org.apache.pdfbox.contentstream.operator.state.SetLineWidth;
|
||||
import org.apache.pdfbox.contentstream.operator.state.SetRenderingIntent;
|
||||
import org.apache.pdfbox.contentstream.operator.text.SetFontAndSize;
|
||||
import org.apache.pdfbox.cos.COSBase;
|
||||
import org.apache.pdfbox.cos.COSNumber;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.text.TextPosition;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.RedTextPosition;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.Ruling;
|
||||
|
||||
import lombok.Getter;
|
||||
import lombok.Setter;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
|
||||
@Getter
|
||||
private final List<TextPositionSequence> textPositionSequences = new ArrayList<>();
|
||||
@Getter
|
||||
private final List<Ruling> rulings = new ArrayList<>();
|
||||
private final List<Ruling> graphicsPath = new ArrayList<>();
|
||||
@Setter
|
||||
protected PDPage pdpage;
|
||||
@Getter
|
||||
private int minCharWidth;
|
||||
@Getter
|
||||
private int maxCharWidth;
|
||||
@Getter
|
||||
private int minCharHeight;
|
||||
@Getter
|
||||
private int maxCharHeight;
|
||||
|
||||
private float path_x;
|
||||
private float path_y;
|
||||
|
||||
@Setter
|
||||
private int pageNumber;
|
||||
|
||||
|
||||
/**
 * Constructor. Registers the graphics-state and color operators on top of the
 * text operators installed by the superclass, so that stroking/non-stroking
 * colors are tracked — addVisibleRulings() later inspects them to decide
 * whether a drawn line is actually visible (black).
 */
public PDFLinesTextStripper() throws IOException {

    super();
    this.addOperator(new SetStrokingColorSpace());
    this.addOperator(new SetNonStrokingColorSpace());
    this.addOperator(new SetLineDashPattern());
    this.addOperator(new SetStrokingDeviceGrayColor());
    this.addOperator(new SetNonStrokingDeviceGrayColor());
    this.addOperator(new SetFlatness());
    this.addOperator(new SetLineJoinStyle());
    this.addOperator(new SetLineCapStyle());
    this.addOperator(new SetStrokingDeviceCMYKColor());
    this.addOperator(new SetNonStrokingDeviceCMYKColor());
    this.addOperator(new SetLineMiterLimit());
    this.addOperator(new SetStrokingDeviceRGBColor());
    this.addOperator(new SetNonStrokingDeviceRGBColor());
    this.addOperator(new SetRenderingIntent());
    this.addOperator(new SetStrokingColor());
    this.addOperator(new SetNonStrokingColor());
    this.addOperator(new SetStrokingColorN());
    this.addOperator(new SetNonStrokingColorN());
    this.addOperator(new SetFontAndSize());
    this.addOperator(new SetLineWidth());
}
|
||||
|
||||
|
||||
@Override
|
||||
protected void processOperator(Operator operator, List<COSBase> arguments) throws IOException {
|
||||
|
||||
String operation = operator.getName();
|
||||
|
||||
//move
|
||||
switch (operation) {
|
||||
case OperatorName.MOVE_TO:
|
||||
if (arguments.size() == 2) {
|
||||
Point2D.Float pos = transformPosition(floatValue(arguments.get(0)), floatValue(arguments.get(1)));
|
||||
path_x = (float) pos.getX();
|
||||
path_y = (float) pos.getY();
|
||||
}
|
||||
break;
|
||||
|
||||
//line
|
||||
case OperatorName.LINE_TO:
|
||||
if (arguments.size() == 2) {
|
||||
Point2D.Float pos = transformPosition(floatValue(arguments.get(0)), floatValue(arguments.get(1)));
|
||||
|
||||
// The direction of vertical lines must always be from bottom to top for the table extraction algorithm.
|
||||
if (pos.getY() > path_y) {
|
||||
graphicsPath.add(new Ruling(new Point2D.Float(path_x, path_y), new Point2D.Float((float) pos.getX(), (float) pos.getY())));
|
||||
} else {
|
||||
graphicsPath.add(new Ruling(new Point2D.Float(path_x, (float) pos.getY()), new Point2D.Float((float) pos.getX(), path_y)));
|
||||
}
|
||||
|
||||
path_x = (float) pos.getX();
|
||||
path_y = (float) pos.getY();
|
||||
}
|
||||
break;
|
||||
|
||||
//rectangle
|
||||
case OperatorName.APPEND_RECT:
|
||||
|
||||
if (arguments.size() == 4) {
|
||||
float x = floatValue(arguments.get(0));
|
||||
float y = floatValue(arguments.get(1));
|
||||
float width = floatValue(arguments.get(2));
|
||||
float height = floatValue(arguments.get(3));
|
||||
|
||||
Point2D p1 = transformPosition(x, y);
|
||||
Point2D p2 = transformPosition(x + width, y + height);
|
||||
|
||||
// Horizontal lines
|
||||
graphicsPath.add(new Ruling(new Point2D.Float((float) p1.getX(), (float) p1.getY()), new Point2D.Float((float) p2.getX(), (float) p1.getY())));
|
||||
graphicsPath.add(new Ruling(new Point2D.Float((float) p1.getX(), (float) p2.getY()), new Point2D.Float((float) p2.getX(), (float) p2.getY())));
|
||||
|
||||
// Vertical lines, direction must always be from bottom to top for the table extraction algorithm.
|
||||
if (p2.getY() > p1.getY()) {
|
||||
graphicsPath.add(new Ruling(new Point2D.Float((float) p2.getX(), (float) p1.getY()), new Point2D.Float((float) p2.getX(), (float) p2.getY())));
|
||||
} else {
|
||||
graphicsPath.add(new Ruling(new Point2D.Float((float) p2.getX(), (float) p2.getY()), new Point2D.Float((float) p2.getX(), (float) p1.getY())));
|
||||
}
|
||||
if (p2.getY() > p1.getY()) {
|
||||
graphicsPath.add(new Ruling(new Point2D.Float((float) p1.getX(), (float) p1.getY()), new Point2D.Float((float) p1.getX(), (float) p2.getY())));
|
||||
} else {
|
||||
graphicsPath.add(new Ruling(new Point2D.Float((float) p1.getX(), (float) p2.getY()), new Point2D.Float((float) p1.getX(), (float) p1.getY())));
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
//fill
|
||||
case OperatorName.FILL_NON_ZERO:
|
||||
case OperatorName.LEGACY_FILL_NON_ZERO:
|
||||
case OperatorName.FILL_EVEN_ODD:
|
||||
addVisibleRulings(graphicsPath, false);
|
||||
graphicsPath.clear();
|
||||
break;
|
||||
|
||||
//stroke
|
||||
case OperatorName.STROKE_PATH:
|
||||
addVisibleRulings(graphicsPath, true);
|
||||
graphicsPath.clear();
|
||||
break;
|
||||
|
||||
//cancel path
|
||||
case OperatorName.ENDPATH:
|
||||
graphicsPath.clear();
|
||||
break;
|
||||
|
||||
}
|
||||
|
||||
super.processOperator(operator, arguments);
|
||||
}
|
||||
|
||||
|
||||
private float floatValue(COSBase value) {
|
||||
|
||||
if (value instanceof COSNumber) {
|
||||
return ((COSNumber) value).floatValue();
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
 * Maps user-space coordinates through the current transformation matrix
 * by delegating to the superclass's transformedPoint().
 */
private Point2D.Float transformPosition(float x, float y) {

    return super.transformedPoint(x, y);
}
|
||||
|
||||
|
||||
private void addVisibleRulings(List<Ruling> path, boolean stroke) throws IOException {
|
||||
|
||||
try {
|
||||
if (stroke && !getGraphicsState().getStrokingColor().isPattern() && getGraphicsState().getStrokingColor()
|
||||
.toRGB() == 0 || !stroke && !getGraphicsState().getNonStrokingColor().isPattern() && getGraphicsState().getNonStrokingColor().toRGB() == 0) {
|
||||
rulings.addAll(path);
|
||||
}
|
||||
} catch (UnsupportedOperationException e) {
|
||||
log.debug("UnsupportedOperationException: " + getGraphicsState().getStrokingColor().getColorSpace().getName() + " or " + getGraphicsState().getNonStrokingColor()
|
||||
.getColorSpace()
|
||||
.getName() + " does not support toRGB");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void writeString(String text, List<TextPosition> textPositions) throws IOException {
|
||||
|
||||
int startIndex = 0;
|
||||
RedTextPosition previous = null;
|
||||
|
||||
textPositions.sort(Comparator.comparing(TextPosition::getXDirAdj));
|
||||
|
||||
for (int i = 0; i <= textPositions.size() - 1; i++) {
|
||||
|
||||
if (!textPositionSequences.isEmpty()) {
|
||||
previous = textPositionSequences.get(textPositionSequences.size() - 1)
|
||||
.getTextPositions()
|
||||
.get(textPositionSequences.get(textPositionSequences.size() - 1).getTextPositions().size() - 1);
|
||||
}
|
||||
|
||||
int charWidth = (int) textPositions.get(i).getWidthDirAdj();
|
||||
if (charWidth < minCharWidth) {
|
||||
minCharWidth = charWidth;
|
||||
}
|
||||
if (charWidth > maxCharWidth) {
|
||||
maxCharWidth = charWidth;
|
||||
}
|
||||
|
||||
int charHeight = (int) textPositions.get(i).getHeightDir();
|
||||
if (charHeight < minCharHeight) {
|
||||
minCharHeight = charHeight;
|
||||
}
|
||||
if (charWidth > maxCharHeight) {
|
||||
maxCharHeight = charHeight;
|
||||
}
|
||||
|
||||
if (i == 0 && (textPositions.get(i).getUnicode().equals(" ") || textPositions.get(i).getUnicode().equals("\u00A0") || textPositions.get(i).getUnicode().equals("\t"))) {
|
||||
startIndex++;
|
||||
continue;
|
||||
}
|
||||
|
||||
// Strange but sometimes this is happening, for example: Metolachlor2.pdf
|
||||
if (i > 0 && textPositions.get(i).getXDirAdj() < textPositions.get(i - 1).getXDirAdj()) {
|
||||
List<TextPosition> sublist = textPositions.subList(startIndex, i);
|
||||
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0)
|
||||
.getUnicode()
|
||||
.equals("\t")))) {
|
||||
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber));
|
||||
}
|
||||
startIndex = i;
|
||||
}
|
||||
|
||||
if (textPositions.get(i).getRotation() == 0 && i > 0 && textPositions.get(i).getX() > textPositions.get(i - 1).getEndX() + 1) {
|
||||
List<TextPosition> sublist = textPositions.subList(startIndex, i);
|
||||
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0)
|
||||
.getUnicode()
|
||||
.equals("\t")))) {
|
||||
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber));
|
||||
}
|
||||
startIndex = i;
|
||||
}
|
||||
|
||||
if (i > 0 && (textPositions.get(i).getUnicode().equals(" ") || textPositions.get(i).getUnicode().equals("\u00A0") || textPositions.get(i)
|
||||
.getUnicode()
|
||||
.equals("\t")) && i <= textPositions.size() - 2) {
|
||||
List<TextPosition> sublist = textPositions.subList(startIndex, i);
|
||||
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0)
|
||||
.getUnicode()
|
||||
.equals("\t")))) {
|
||||
|
||||
// Remove false sequence ends (whitespaces)
|
||||
if (previous != null && sublist.get(0).getYDirAdj() == previous.getYDirAdj() && sublist.get(0)
|
||||
.getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()) < 0.01) {
|
||||
for (TextPosition textPosition : sublist) {
|
||||
textPositionSequences.get(textPositionSequences.size() - 1).add(textPosition);
|
||||
}
|
||||
} else {
|
||||
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber));
|
||||
}
|
||||
}
|
||||
startIndex = i + 1;
|
||||
}
|
||||
}
|
||||
|
||||
List<TextPosition> sublist = textPositions.subList(startIndex, textPositions.size());
|
||||
if (!sublist.isEmpty() && (sublist.get(sublist.size() - 1).getUnicode().equals(" ") || sublist.get(sublist.size() - 1)
|
||||
.getUnicode()
|
||||
.equals("\u00A0") || sublist.get(sublist.size() - 1).getUnicode().equals("\t"))) {
|
||||
sublist = sublist.subList(0, sublist.size() - 1);
|
||||
}
|
||||
|
||||
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0)
|
||||
.getUnicode()
|
||||
.equals("\t")))) {
|
||||
if (previous != null && sublist.get(0).getYDirAdj() == previous.getYDirAdj() && sublist.get(0)
|
||||
.getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()) < 0.01) {
|
||||
for (TextPosition t : sublist) {
|
||||
textPositionSequences.get(textPositionSequences.size() - 1).add(t);
|
||||
}
|
||||
} else {
|
||||
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber));
|
||||
}
|
||||
}
|
||||
super.writeString(text);
|
||||
}
|
||||
|
||||
|
||||
/**
 * Resets all per-document state collected by this stripper (character size
 * statistics, word sequences, rulings, the current graphics path and path
 * cursor) before delegating to the superclass, which drives page processing
 * and the {@code writeString}/operator callbacks above.
 */
@Override
public String getText(PDDocument doc) throws IOException {

    // Min counters start at the maximum possible value so the first glyph
    // always replaces them; max counters start at 0.
    minCharWidth = Integer.MAX_VALUE;
    maxCharWidth = 0;
    minCharHeight = Integer.MAX_VALUE;
    maxCharHeight = 0;
    textPositionSequences.clear();
    rulings.clear();
    graphicsPath.clear();
    path_x = 0.0f;
    path_y = 0.0f;

    return super.getText(doc);

}
|
||||
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,279 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.service;
|
||||
|
||||
import static java.util.stream.Collectors.toSet;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.FloatFrequencyCounter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.TextBlockOrientation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.StringFrequencyCounter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.AbstractTextContainer;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.RulingTextDirAdjustUtil;
|
||||
|
||||
@Service
|
||||
@SuppressWarnings("all")
|
||||
public class BlockificationService {
|
||||
|
||||
static final float THRESHOLD = 1f;
|
||||
|
||||
|
||||
/**
 * This method is building blocks by expanding the minX/maxX and minY/maxY value on each word that is not split by the conditions.
 * This method must use text direction adjusted positions (DirAdj). Where {0,0} is on the upper left. Never try to change this!
 * Rulings (Table lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling.
 *
 * @param textPositions The words of a page.
 * @param horizontalRulingLines Horizontal table lines.
 * @param verticalRulingLines Vertical table lines.
 * @return ClassificationPage object that contains the Textblock and text statistics.
 */
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {

    int indexOnPage = 0;
    // Words accumulated for the block currently being grown.
    List<TextPositionSequence> chunkWords = new ArrayList<>();
    List<AbstractTextContainer> chunkBlockList1 = new ArrayList<>();

    // Bounding box of the current block; sentinel values are replaced by the first word.
    float minX = 1000, maxX = 0, minY = 1000, maxY = 0;
    TextPositionSequence prev = null;

    // Two-column tracking: wasSplitted/splitX1 remember that a left/right
    // column split happened and at which x the right column starts.
    boolean wasSplitted = false;
    Float splitX1 = null;
    for (TextPositionSequence word : textPositions) {

        // Vertical gap larger than 1.25 line heights.
        boolean lineSeparation = word.getMinYDirAdj() - maxY > word.getHeight() * 1.25;
        // Word appears above the previous word (new column / new region).
        boolean startFromTop = prev != null && word.getMinYDirAdj() < prev.getMinYDirAdj() - prev.getTextHeight();
        // Large horizontal gap (> 50) on the same line -> column split.
        boolean splitByX = prev != null && maxX + 50 < word.getMinXDirAdj() && prev.getMinYDirAdj() == word.getMinYDirAdj();
        boolean xIsBeforeFirstX = prev != null && word.getMinXDirAdj() < minX;
        // New line after a column split that does not start at the split x.
        boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj();
        boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, horizontalRulingLines, verticalRulingLines);
        // Change of text direction always separates blocks.
        boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir());

        // Close the current block and start a new one with this word.
        if (prev != null && (lineSeparation || startFromTop || splitByX || splitByDir || isSplitByRuling)) {

            TextBlockOrientation prevOrientation = null;
            if (!chunkBlockList1.isEmpty()) {
                prevOrientation = chunkBlockList1.get(chunkBlockList1.size() - 1).getOrientation();
            }

            ClassificationTextBlock cb1 = buildTextBlock(chunkWords, indexOnPage);
            indexOnPage++;

            chunkBlockList1.add(cb1);
            chunkWords = new ArrayList<>();

            if (splitByX && !isSplitByRuling) {
                // The closed block is the left column; the new word opens the right one.
                wasSplitted = true;
                cb1.setOrientation(TextBlockOrientation.LEFT);
                splitX1 = word.getMinXDirAdj();
            } else if (newLineAfterSplit && !isSplitByRuling) {
                // The closed block was the right column; the split ends here.
                wasSplitted = false;
                cb1.setOrientation(TextBlockOrientation.RIGHT);
                splitX1 = null;
            } else if (prevOrientation != null && prevOrientation.equals(TextBlockOrientation.RIGHT) && (lineSeparation || !startFromTop || !splitByX || !newLineAfterSplit || !isSplitByRuling)) {
                // NOTE(review): this OR-of-negations is true for almost any split;
                // verify whether an AND was intended.
                cb1.setOrientation(TextBlockOrientation.LEFT);
            }

            // Reset the bounding box for the next block.
            minX = 1000;
            maxX = 0;
            minY = 1000;
            maxY = 0;
            prev = null;
        }

        chunkWords.add(word);

        prev = word;
        if (word.getMinXDirAdj() < minX) {
            minX = word.getMinXDirAdj();
        }
        if (word.getMaxXDirAdj() > maxX) {
            maxX = word.getMaxXDirAdj();
        }
        if (word.getMinYDirAdj() < minY) {
            minY = word.getMinYDirAdj();
        }
        if (word.getMaxYDirAdj() > maxY) {
            maxY = word.getMaxYDirAdj();
        }
    }

    // Flush the last block (buildTextBlock returns null for an empty word list).
    ClassificationTextBlock cb1 = buildTextBlock(chunkWords, indexOnPage);
    if (cb1 != null) {
        chunkBlockList1.add(cb1);
    }

    Iterator<AbstractTextContainer> itty = chunkBlockList1.iterator();

    // First merge pass: re-attach vertically overlapping blocks to the previous
    // block of the same column (LEFT with LEFT, RIGHT with RIGHT).
    ClassificationTextBlock previousLeft = null;
    ClassificationTextBlock previousRight = null;
    while (itty.hasNext()) {
        ClassificationTextBlock block = (ClassificationTextBlock) itty.next();

        if (previousLeft != null && block.getOrientation().equals(TextBlockOrientation.LEFT)) {
            if (previousLeft.getMinY() > block.getMinY() && block.getMaxY() + block.getMostPopularWordHeight() > previousLeft.getMinY()) {
                previousLeft.add(block);
                itty.remove();
                continue;
            }
        }

        if (previousRight != null && block.getOrientation().equals(TextBlockOrientation.RIGHT)) {
            if (previousRight.getMinY() > block.getMinY() && block.getMaxY() + block.getMostPopularWordHeight() > previousRight.getMinY()) {
                previousRight.add(block);
                itty.remove();
                continue;
            }
        }

        if (block.getOrientation().equals(TextBlockOrientation.LEFT)) {
            previousLeft = block;
        } else if (block.getOrientation().equals(TextBlockOrientation.RIGHT)) {
            previousRight = block;
        }
    }

    // Second merge pass: merge consecutive blocks ending on (almost) the same
    // baseline, LEFT+LEFT or LEFT+RIGHT.
    itty = chunkBlockList1.iterator();
    ClassificationTextBlock previous = null;
    while (itty.hasNext()) {
        ClassificationTextBlock block = (ClassificationTextBlock) itty.next();

        if (previous != null && previous.getOrientation().equals(TextBlockOrientation.LEFT) && block.getOrientation().equals(TextBlockOrientation.LEFT) && equalsWithThreshold(block.getMaxY(),
                previous.getMaxY()) || previous != null && previous.getOrientation().equals(TextBlockOrientation.LEFT) && block.getOrientation()
                .equals(TextBlockOrientation.RIGHT) && equalsWithThreshold(block.getMaxY(), previous.getMaxY())) {
            previous.add(block);
            itty.remove();
            continue;
        }

        previous = block;
    }

    return new ClassificationPage(chunkBlockList1);
}
|
||||
|
||||
|
||||
private boolean equalsWithThreshold(float f1, float f2) {
|
||||
|
||||
return Math.abs(f1 - f2) < THRESHOLD;
|
||||
}
|
||||
|
||||
|
||||
/**
 * Aggregates a list of words into one {@code ClassificationTextBlock}:
 * the block's bounding box is the union of all word boxes, and "most popular"
 * font/style/size/line-height/space-width statistics are attached.
 *
 * @param wordBlockList words belonging to one block
 * @param indexOnPage   running block index on the page
 * @return the built block, or {@code null} when {@code wordBlockList} is empty
 */
private ClassificationTextBlock buildTextBlock(List<TextPositionSequence> wordBlockList, int indexOnPage) {

    ClassificationTextBlock textBlock = null;

    FloatFrequencyCounter lineHeightFrequencyCounter = new FloatFrequencyCounter();
    FloatFrequencyCounter fontSizeFrequencyCounter = new FloatFrequencyCounter();
    FloatFrequencyCounter spaceFrequencyCounter = new FloatFrequencyCounter();
    StringFrequencyCounter fontFrequencyCounter = new StringFrequencyCounter();
    StringFrequencyCounter styleFrequencyCounter = new StringFrequencyCounter();

    for (TextPositionSequence wordBlock : wordBlockList) {

        lineHeightFrequencyCounter.add(wordBlock.getTextHeight());
        fontSizeFrequencyCounter.add(wordBlock.getFontSize());
        spaceFrequencyCounter.add(wordBlock.getSpaceWidth());
        fontFrequencyCounter.add(wordBlock.getFont());
        styleFrequencyCounter.add(wordBlock.getFontStyle());

        if (textBlock == null) {
            // First word seeds the block's (text-direction adjusted) bounding box.
            textBlock = new ClassificationTextBlock(wordBlock.getMinXDirAdj(),
                    wordBlock.getMaxXDirAdj(),
                    wordBlock.getMinYDirAdj(),
                    wordBlock.getMaxYDirAdj(),
                    wordBlockList,
                    wordBlock.getRotation(),
                    indexOnPage);
        } else {
            // Every further word grows the bounding box to the union of both.
            ClassificationTextBlock spatialEntity = textBlock.union(wordBlock);
            textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight());
        }
    }

    if (textBlock != null) {
        textBlock.setMostPopularWordFont(fontFrequencyCounter.getMostPopular());
        textBlock.setMostPopularWordStyle(styleFrequencyCounter.getMostPopular());
        textBlock.setMostPopularWordFontSize(fontSizeFrequencyCounter.getMostPopular());
        textBlock.setMostPopularWordHeight(lineHeightFrequencyCounter.getMostPopular());
        textBlock.setMostPopularWordSpaceWidth(spaceFrequencyCounter.getMostPopular());
        textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest());
    }

    // Single-line blocks (all words share the same minY rounded to 3 decimals)
    // are re-sorted left-to-right so the text reads in order.
    if (textBlock != null && textBlock.getSequences() != null && textBlock.getSequences().stream().map(t -> round(t.getMinYDirAdj(), 3)).collect(toSet()).size() == 1) {
        textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj));
    }
    return textBlock;
}
|
||||
|
||||
|
||||
/**
 * True when any ruling line separates the current block (bounding box
 * minX/minY/maxX/maxY) from the next word: four segments from the block's
 * edges to the word's edges are tested against the vertical and horizontal
 * ruling sets.
 * NOTE(review): the combinations of maxX/minX with minY against
 * vertical/horizontal lines look asymmetric (maxY is never used) — verify
 * the intended corner pairs.
 */
private boolean isSplitByRuling(float minX,
        float minY,
        float maxX,
        float maxY,
        TextPositionSequence word,
        List<Ruling> horizontalRulingLines,
        List<Ruling> verticalRulingLines) {

    return isSplitByRuling(maxX,
            minY,
            word.getMinXDirAdj(),
            word.getMinYDirAdj(),
            verticalRulingLines,
            word.getDir().getDegrees(),
            word.getPageWidth(),
            word.getPageHeight()) //
            || isSplitByRuling(minX,
            minY,
            word.getMinXDirAdj(),
            word.getMaxYDirAdj(),
            horizontalRulingLines,
            word.getDir().getDegrees(),
            word.getPageWidth(),
            word.getPageHeight()) //
            || isSplitByRuling(maxX,
            minY,
            word.getMinXDirAdj(),
            word.getMinYDirAdj(),
            horizontalRulingLines,
            word.getDir().getDegrees(),
            word.getPageWidth(),
            word.getPageHeight()) //
            || isSplitByRuling(minX,
            minY,
            word.getMinXDirAdj(),
            word.getMaxYDirAdj(),
            verticalRulingLines,
            word.getDir().getDegrees(),
            word.getPageWidth(),
            word.getPageHeight()); //
}
|
||||
|
||||
|
||||
private boolean isSplitByRuling(float previousX2, float previousY1, float currentX1, float currentY1, List<Ruling> rulingLines, float dir, float pageWidth, float pageHeight) {
|
||||
|
||||
for (Ruling ruling : rulingLines) {
|
||||
var line = RulingTextDirAdjustUtil.convertToDirAdj(ruling, dir, pageWidth, pageHeight);
|
||||
if (line.intersectsLine(previousX2, previousY1, currentX1, currentY1)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
private double round(float value, int decimalPoints) {
|
||||
|
||||
var d = Math.pow(10, decimalPoints);
|
||||
return Math.round(value * d) / d;
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,160 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.service;
|
||||
import java.util.List;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.FloatFrequencyCounter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.AbstractTextContainer;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.TableCell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.Table;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.PositionUtils;
|
||||
|
||||
@Service
|
||||
public class BodyTextFrameService {
|
||||
|
||||
/**
 * Adjusts and sets the body text frame to a classificationPage.
 * Note: This needs to use Pdf Coordinate System where {0,0} rotated with the classificationPage rotation.
 * 0 -> LowerLeft
 * 90 -> UpperLeft
 * 180 -> UpperRight
 * 270 -> LowerRight
 * The aspect ratio of the classificationPage is also regarded.
 *
 * @param classificationPage The classificationPage
 * @param bodyTextFrame frame that contains the main text on portrait pages
 * @param landscapeBodyTextFrame frame that contains the main text on landscape pages
 */
public void setBodyTextFrameAdjustedToPage(ClassificationPage classificationPage, Rectangle bodyTextFrame, Rectangle landscapeBodyTextFrame) {

    Rectangle textFrame = classificationPage.isLandscape() ? landscapeBodyTextFrame : bodyTextFrame;

    if (classificationPage.getPageWidth() > classificationPage.getPageHeight() && classificationPage.getRotation() == 270) {
        // Landscape page rotated 270°: swap axes and mirror x into the page height.
        textFrame = new Rectangle(new Point(textFrame.getTopLeft().getY(), classificationPage.getPageHeight() - textFrame.getTopLeft().getX() - textFrame.getWidth()),
                textFrame.getHeight(),
                textFrame.getWidth(),
                0);
    } else if (classificationPage.getPageWidth() > classificationPage.getPageHeight() && classificationPage.getRotation() != 0) {
        // Other rotated landscape pages: swap the frame's axes.
        // NOTE(review): the last Rectangle argument is getPageNumber() here while
        // the sibling branches pass 0 — looks unintentional, verify the
        // Rectangle constructor's 4th parameter semantics.
        textFrame = new Rectangle(new Point(textFrame.getTopLeft().getY(), textFrame.getTopLeft().getX()), textFrame.getHeight(), textFrame.getWidth(), classificationPage.getPageNumber());
    } else if (classificationPage.getRotation() == 180) {
        // Upside-down portrait page: mirror y into the page height.
        textFrame = new Rectangle(new Point(textFrame.getTopLeft().getX(), classificationPage.getPageHeight() - textFrame.getTopLeft().getY() - textFrame.getHeight()),
                textFrame.getWidth(),
                textFrame.getHeight(),
                0);
    }
    classificationPage.setBodyTextFrame(textFrame);
}
|
||||
|
||||
|
||||
/**
 * Calculates the frame that contains the main text, text outside the frame will be e.g. headers or footers.
 * Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation.
 * 0 -> LowerLeft
 * 90 -> UpperLeft
 * 180 -> UpperRight
 * 270 -> LowerRight
 * The aspect ratio of the page is also regarded.
 *
 * @param classificationPages List of all classificationPages
 * @param documentFontSizeCounter Statistics of the document
 * @param landscape Calculate for landscape or portrait
 * @return Rectangle of the text frame
 */
public Rectangle calculateBodyTextFrame(List<ClassificationPage> classificationPages, FloatFrequencyCounter documentFontSizeCounter, boolean landscape) {

    BodyTextFrameExpansionsRectangle expansionsRectangle = new BodyTextFrameExpansionsRectangle();

    for (ClassificationPage classificationPage : classificationPages) {

        // Only pages of the requested orientation that actually contain text contribute.
        if (classificationPage.getTextBlocks().isEmpty() || landscape != classificationPage.isLandscape()) {
            continue;
        }

        for (AbstractTextContainer container : classificationPage.getTextBlocks()) {

            if (container instanceof ClassificationTextBlock) {
                ClassificationTextBlock textBlock = (ClassificationTextBlock) container;
                // Blocks without font statistics cannot be judged — skip.
                if (textBlock.getMostPopularWordFont() == null || textBlock.getMostPopularWordStyle() == null) {
                    continue;
                }

                // Blocks shorter than ~3 lines (headers, captions) are ignored.
                float approxLineCount = PositionUtils.getApproxLineCount(textBlock);
                if (approxLineCount < 2.9f) {
                    continue;
                }

                // Only blocks set at least in the document's dominant font size count as body text.
                if (documentFontSizeCounter.getMostPopular() != null && textBlock.getMostPopularWordFontSize() >= documentFontSizeCounter.getMostPopular()) {

                    expandRectangle(textBlock, classificationPage, expansionsRectangle);
                }
            }

            // Table cell contents always count as body text.
            if (container instanceof Table) {
                Table table = (Table) container;
                for (List<TableCell> row : table.getRows()) {
                    for (TableCell cell : row) {

                        if (cell == null || cell.getTextBlocks() == null) {
                            continue;
                        }
                        for (ClassificationTextBlock textBlock : cell.getTextBlocks()) {
                            expandRectangle(textBlock, classificationPage, expansionsRectangle);
                        }
                    }
                }
            }
        }
    }
    return new Rectangle(new Point(expansionsRectangle.minX, expansionsRectangle.minY),
            expansionsRectangle.maxX - expansionsRectangle.minX,
            expansionsRectangle.maxY - expansionsRectangle.minY,
            0);
}
|
||||
|
||||
|
||||
private void expandRectangle(ClassificationTextBlock textBlock, ClassificationPage classificationPage, BodyTextFrameExpansionsRectangle expansionsRectangle) {
|
||||
|
||||
if (classificationPage.getPageWidth() > classificationPage.getPageHeight() && classificationPage.getRotation() != 0) {
|
||||
if (textBlock.getPdfMinY() < expansionsRectangle.minX) {
|
||||
expansionsRectangle.minX = textBlock.getPdfMinY();
|
||||
}
|
||||
if (textBlock.getPdfMaxY() > expansionsRectangle.maxX) {
|
||||
expansionsRectangle.maxX = textBlock.getPdfMaxY();
|
||||
}
|
||||
if (textBlock.getPdfMinX() < expansionsRectangle.minY) {
|
||||
expansionsRectangle.minY = textBlock.getPdfMinX();
|
||||
}
|
||||
if (textBlock.getPdfMaxX() > expansionsRectangle.maxY) {
|
||||
expansionsRectangle.maxY = textBlock.getPdfMaxX();
|
||||
}
|
||||
} else {
|
||||
if (textBlock.getPdfMinX() < expansionsRectangle.minX) {
|
||||
expansionsRectangle.minX = textBlock.getPdfMinX();
|
||||
}
|
||||
if (textBlock.getPdfMaxX() > expansionsRectangle.maxX) {
|
||||
expansionsRectangle.maxX = textBlock.getPdfMaxX();
|
||||
}
|
||||
if (textBlock.getPdfMinY() < expansionsRectangle.minY) {
|
||||
expansionsRectangle.minY = textBlock.getPdfMinY();
|
||||
}
|
||||
if (textBlock.getPdfMaxY() > expansionsRectangle.maxY) {
|
||||
expansionsRectangle.maxY = textBlock.getPdfMaxY();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
 * Mutable accumulator for the growing body-text bounding box.
 * Initialised to extreme sentinel values so that the first expansion
 * always replaces them.
 */
private class BodyTextFrameExpansionsRectangle {

    float minX = 10000;
    float maxX = -100;
    float minY = 10000;
    float maxY = -100;

}
|
||||
|
||||
}
|
||||
@ -0,0 +1,116 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.service;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.AbstractTextContainer;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.PositionUtils;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
public class ClassificationService {
|
||||
|
||||
private final BodyTextFrameService bodyTextFrameService;
|
||||
|
||||
|
||||
public void classifyDocument(ClassificationDocument document) {
|
||||
|
||||
Rectangle bodyTextFrame = bodyTextFrameService.calculateBodyTextFrame(document.getPages(), document.getFontSizeCounter(), false);
|
||||
Rectangle landscapeBodyTextFrame = bodyTextFrameService.calculateBodyTextFrame(document.getPages(), document.getFontSizeCounter(), true);
|
||||
List<Float> headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular();
|
||||
|
||||
log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());
|
||||
|
||||
for (ClassificationPage classificationPage : document.getPages()) {
|
||||
bodyTextFrameService.setBodyTextFrameAdjustedToPage(classificationPage, bodyTextFrame, landscapeBodyTextFrame);
|
||||
classifyPage(classificationPage, document, headlineFontSizes);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public void classifyPage(ClassificationPage classificationPage, ClassificationDocument document, List<Float> headlineFontSizes) {
|
||||
|
||||
for (AbstractTextContainer textBlock : classificationPage.getTextBlocks()) {
|
||||
if (textBlock instanceof ClassificationTextBlock) {
|
||||
classifyBlock((ClassificationTextBlock) textBlock, classificationPage, document, headlineFontSizes);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
 * Assigns a classification label ("Header", "Footer", "Title", "H n",
 * "TextBlock", "TextBlock Bold/Italic/Unknown" or "Other") to a single block
 * by comparing its position against the page's body text frame and its font
 * statistics against the document-wide counters. The first matching branch wins.
 */
public void classifyBlock(ClassificationTextBlock textBlock, ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes) {

    var bodyTextFrame = page.getBodyTextFrame();

    // Without a dominant document font size no comparison is possible.
    if (document.getFontSizeCounter().getMostPopular() == null) {
        textBlock.setClassification("Other");
        return;
    }
    // Above the body text frame and no larger than the dominant font size -> page header.
    if (PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
            .getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) {
        textBlock.setClassification("Header");

    } else if (PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
            .getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) {
        // Below the body text frame -> page footer.
        textBlock.setClassification("Footer");
    } else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock,
            document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks()
            .size() == 1)) {
        // First page, clearly oversized text (or the only block) -> title,
        // unless the block is just a number (e.g. a page number).
        if (!Pattern.matches("[0-9]+", textBlock.toString())) {
            textBlock.setClassification("Title");
        }
    } else if (textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter()
            .getMostPopular() && PositionUtils.getApproxLineCount(textBlock) < 4.9 && (textBlock.getMostPopularWordStyle().equals("bold") || !document.getFontStyleCounter()
            .getCountPerValue()
            .containsKey("bold") && textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular() + 1) && textBlock.getSequences()
            .get(0)
            .getTextPositions()
            .get(0)
            .getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) {

        // Short, bold (or clearly oversized) block -> headline; the headline
        // level "H i" is the 1-based rank of its font size among all sizes
        // larger than the document's dominant one.
        for (int i = 1; i <= headlineFontSizes.size(); i++) {
            if (textBlock.getMostPopularWordFontSize() == headlineFontSizes.get(i - 1)) {
                textBlock.setClassification("H " + i);
                document.setHeadlines(true);
            }
        }
    } else if (!textBlock.getText().startsWith("Table ") && !textBlock.getText().startsWith("Figure ") && PositionUtils.isWithinBodyTextFrame(bodyTextFrame,
            textBlock) && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter()
            .getMostPopular()
            .equals("bold") && PositionUtils.getApproxLineCount(textBlock) < 2.9 && textBlock.getSequences()
            .get(0)
            .getTextPositions()
            .get(0)
            .getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) {
        // Short bold block in a non-bold document -> lowest headline level
        // (table/figure captions are explicitly excluded).
        textBlock.setClassification("H " + (headlineFontSizes.size() + 1));
        document.setHeadlines(true);
    } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
            .getMostPopular() && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold")) {
        textBlock.setClassification("TextBlock Bold");
    } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFont()
            .equals(document.getFontCounter().getMostPopular()) && textBlock.getMostPopularWordStyle()
            .equals(document.getFontStyleCounter().getMostPopular()) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()) {
        // Matches the document's dominant font, style and size -> plain body text.
        textBlock.setClassification("TextBlock");
    } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
            .getMostPopular() && textBlock.getMostPopularWordStyle().equals("italic") && !document.getFontStyleCounter()
            .getMostPopular()
            .equals("italic") && PositionUtils.getApproxLineCount(textBlock) < 2.9) {
        textBlock.setClassification("TextBlock Italic");
    } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)) {
        textBlock.setClassification("TextBlock Unknown");
    } else {
        textBlock.setClassification("Other");
    }
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,134 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.service;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.adapter.ImageServiceResponseAdapter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.parsing.PDFLinesTextStripper;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.AbstractTextContainer;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.CvParsedTableCell;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
public class PdfParsingService {
|
||||
|
||||
private final RulingCleaningService rulingCleaningService;
|
||||
private final TableExtractionService tableExtractionService;
|
||||
private final BlockificationService blockificationService;
|
||||
private final ImageServiceResponseAdapter imageServiceResponseAdapter;
|
||||
|
||||
|
||||
public ClassificationDocument parseDocument(PDDocument originDocument, Map<Integer, List<CvParsedTableCell>> pdfTableCells, Map<Integer, List<ClassifiedImage>> pdfImages) {
|
||||
|
||||
ClassificationDocument document = new ClassificationDocument();
|
||||
List<ClassificationPage> classificationPages = new ArrayList<>();
|
||||
|
||||
originDocument.setAllSecurityToBeRemoved(true);
|
||||
long pageCount = originDocument.getNumberOfPages();
|
||||
|
||||
for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++) {
|
||||
parsePage(pdfImages, originDocument, pdfTableCells, document, classificationPages, pageNumber);
|
||||
}
|
||||
|
||||
document.setPages(classificationPages);
|
||||
|
||||
return document;
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void parsePage(Map<Integer, List<ClassifiedImage>> pdfImages,
|
||||
PDDocument pdDocument,
|
||||
Map<Integer, List<CvParsedTableCell>> pdfTableCells,
|
||||
ClassificationDocument document,
|
||||
List<ClassificationPage> classificationPages,
|
||||
int pageNumber) {
|
||||
|
||||
PDFLinesTextStripper stripper = new PDFLinesTextStripper();
|
||||
PDPage pdPage = pdDocument.getPage(pageNumber - 1);
|
||||
stripper.setPageNumber(pageNumber);
|
||||
stripper.setStartPage(pageNumber);
|
||||
stripper.setEndPage(pageNumber);
|
||||
stripper.setPdpage(pdPage);
|
||||
stripper.getText(pdDocument);
|
||||
|
||||
PDRectangle pdr = pdPage.getMediaBox();
|
||||
|
||||
int rotation = pdPage.getRotation();
|
||||
boolean isLandscape = pdr.getWidth() > pdr.getHeight() && (rotation == 0 || rotation == 180) || pdr.getHeight() > pdr.getWidth() && (rotation == 90 || rotation == 270);
|
||||
|
||||
PDRectangle cropbox = pdPage.getCropBox();
|
||||
CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(pdfTableCells.get(pageNumber),
|
||||
stripper.getRulings(),
|
||||
stripper.getMinCharWidth(),
|
||||
stripper.getMaxCharHeight());
|
||||
ClassificationPage classificationPage = blockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
|
||||
classificationPage.setRotation(rotation);
|
||||
classificationPage.setLandscape(isLandscape);
|
||||
classificationPage.setPageNumber(pageNumber);
|
||||
classificationPage.setPageWidth(cropbox.getWidth());
|
||||
classificationPage.setPageHeight(cropbox.getHeight());
|
||||
|
||||
// If images is ocr needs to be calculated before textBlocks are moved into tables, otherwise findOcr algorithm needs to be adopted.
|
||||
if (pdfImages != null && pdfImages.containsKey(pageNumber)) {
|
||||
classificationPage.setImages(pdfImages.get(pageNumber));
|
||||
imageServiceResponseAdapter.findOcr(classificationPage);
|
||||
}
|
||||
|
||||
tableExtractionService.removeRedundantTableCells(cleanRulings, classificationPage);
|
||||
buildPageStatistics(classificationPage);
|
||||
increaseDocumentStatistics(classificationPage, document);
|
||||
|
||||
classificationPages.add(classificationPage);
|
||||
}
|
||||
|
||||
|
||||
private void increaseDocumentStatistics(ClassificationPage classificationPage, ClassificationDocument document) {
|
||||
|
||||
if (!classificationPage.isLandscape()) {
|
||||
document.getFontSizeCounter().addAll(classificationPage.getFontSizeCounter().getCountPerValue());
|
||||
}
|
||||
document.getFontCounter().addAll(classificationPage.getFontCounter().getCountPerValue());
|
||||
document.getTextHeightCounter().addAll(classificationPage.getTextHeightCounter().getCountPerValue());
|
||||
document.getFontStyleCounter().addAll(classificationPage.getFontStyleCounter().getCountPerValue());
|
||||
}
|
||||
|
||||
|
||||
private void buildPageStatistics(ClassificationPage classificationPage) {
|
||||
|
||||
// Collect all statistics for the classificationPage, except from blocks inside tables, as tables will always be added to BodyTextFrame.
|
||||
for (AbstractTextContainer textBlock : classificationPage.getTextBlocks()) {
|
||||
if (textBlock instanceof ClassificationTextBlock) {
|
||||
if (((ClassificationTextBlock) textBlock).getSequences() == null) {
|
||||
continue;
|
||||
}
|
||||
for (TextPositionSequence word : ((ClassificationTextBlock) textBlock).getSequences()) {
|
||||
classificationPage.getTextHeightCounter().add(word.getTextHeight());
|
||||
classificationPage.getFontCounter().add(word.getFont());
|
||||
classificationPage.getFontSizeCounter().add(word.getFontSize());
|
||||
classificationPage.getFontStyleCounter().add(word.getFontStyle());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,231 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.service;
|
||||
|
||||
import java.awt.geom.Line2D;
|
||||
import java.awt.geom.Point2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.CvParsedTableCell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.DoubleComparisons;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
public class RulingCleaningService {
|
||||
|
||||
public CleanRulings getCleanRulings(List<CvParsedTableCell> cvParsedTableCells, List<Ruling> rulings, float minCharWidth, float maxCharHeight) {
|
||||
|
||||
if (!rulings.isEmpty()) {
|
||||
snapPoints(rulings, minCharWidth, maxCharHeight);
|
||||
}
|
||||
|
||||
List<Ruling> vrs = new ArrayList<>();
|
||||
for (Ruling vr : rulings) {
|
||||
if (vr.vertical()) {
|
||||
vrs.add(vr);
|
||||
}
|
||||
}
|
||||
if (vrs.isEmpty()) {
|
||||
vrs.addAll(extractVerticalRulings(cvParsedTableCells));
|
||||
}
|
||||
List<Ruling> verticalRulingLines = collapseOrientedRulings(vrs);
|
||||
|
||||
List<Ruling> hrs = new ArrayList<>();
|
||||
for (Ruling hr : rulings) {
|
||||
if (hr.horizontal()) {
|
||||
hrs.add(hr);
|
||||
}
|
||||
}
|
||||
if (hrs.isEmpty()) {
|
||||
hrs.addAll(extractHorizontalRulings(cvParsedTableCells));
|
||||
}
|
||||
List<Ruling> horizontalRulingLines = collapseOrientedRulings(hrs);
|
||||
|
||||
return CleanRulings.builder().vertical(verticalRulingLines).horizontal(horizontalRulingLines).build();
|
||||
}
|
||||
|
||||
|
||||
public void snapPoints(List<? extends Line2D.Float> rulings, float xThreshold, float yThreshold) {
|
||||
|
||||
// collect points and keep a Line -> p1,p2 map
|
||||
Map<Line2D.Float, Point2D[]> linesToPoints = new HashMap<>();
|
||||
List<Point2D> points = new ArrayList<>();
|
||||
for (Line2D.Float r : rulings) {
|
||||
Point2D p1 = r.getP1();
|
||||
Point2D p2 = r.getP2();
|
||||
linesToPoints.put(r, new Point2D[]{p1, p2});
|
||||
points.add(p1);
|
||||
points.add(p2);
|
||||
}
|
||||
|
||||
// snap by X
|
||||
points.sort(Comparator.comparingDouble(Point2D::getX));
|
||||
|
||||
List<List<Point2D>> groupedPoints = new ArrayList<>();
|
||||
groupedPoints.add(new ArrayList<>(Collections.singletonList(points.get(0))));
|
||||
|
||||
for (Point2D p : points.subList(1, points.size() - 1)) {
|
||||
List<Point2D> last = groupedPoints.get(groupedPoints.size() - 1);
|
||||
if (Math.abs(p.getX() - last.get(0).getX()) < xThreshold) {
|
||||
groupedPoints.get(groupedPoints.size() - 1).add(p);
|
||||
} else {
|
||||
groupedPoints.add(new ArrayList<>(Collections.singletonList(p)));
|
||||
}
|
||||
}
|
||||
|
||||
for (List<Point2D> group : groupedPoints) {
|
||||
float avgLoc = 0;
|
||||
for (Point2D p : group) {
|
||||
avgLoc += p.getX();
|
||||
}
|
||||
avgLoc /= group.size();
|
||||
for (Point2D p : group) {
|
||||
p.setLocation(avgLoc, p.getY());
|
||||
}
|
||||
}
|
||||
// ---
|
||||
|
||||
// snap by Y
|
||||
points.sort(Comparator.comparingDouble(Point2D::getY));
|
||||
|
||||
groupedPoints = new ArrayList<>();
|
||||
groupedPoints.add(new ArrayList<>(Collections.singletonList(points.get(0))));
|
||||
|
||||
for (Point2D p : points.subList(1, points.size() - 1)) {
|
||||
List<Point2D> last = groupedPoints.get(groupedPoints.size() - 1);
|
||||
if (Math.abs(p.getY() - last.get(0).getY()) < yThreshold) {
|
||||
groupedPoints.get(groupedPoints.size() - 1).add(p);
|
||||
} else {
|
||||
groupedPoints.add(new ArrayList<>(Collections.singletonList(p)));
|
||||
}
|
||||
}
|
||||
|
||||
for (List<Point2D> group : groupedPoints) {
|
||||
float avgLoc = 0;
|
||||
for (Point2D p : group) {
|
||||
avgLoc += p.getY();
|
||||
}
|
||||
avgLoc /= group.size();
|
||||
for (Point2D p : group) {
|
||||
p.setLocation(p.getX(), avgLoc);
|
||||
}
|
||||
}
|
||||
// ---
|
||||
|
||||
// finally, modify lines
|
||||
for (Map.Entry<Line2D.Float, Point2D[]> ltp : linesToPoints.entrySet()) {
|
||||
Point2D[] p = ltp.getValue();
|
||||
ltp.getKey().setLine(p[0], p[1]);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private Collection<? extends Ruling> extractVerticalRulings(List<CvParsedTableCell> cvParsedTableCells) {
|
||||
|
||||
List<Ruling> vrs = new ArrayList<>();
|
||||
|
||||
if (cvParsedTableCells != null) {
|
||||
for (CvParsedTableCell cvParsedTableCell : cvParsedTableCells) {
|
||||
Ruling leftLine = createRuling(cvParsedTableCell.getX0(), cvParsedTableCell.getX0(), cvParsedTableCell.getY0(), cvParsedTableCell.getY1());
|
||||
Ruling rightLine = createRuling(cvParsedTableCell.getX1(), cvParsedTableCell.getX1(), cvParsedTableCell.getY0(), cvParsedTableCell.getY1());
|
||||
vrs.add(leftLine);
|
||||
vrs.add(rightLine);
|
||||
}
|
||||
}
|
||||
return vrs;
|
||||
}
|
||||
|
||||
|
||||
private Collection<? extends Ruling> extractHorizontalRulings(List<CvParsedTableCell> cvParsedTableCells) {
|
||||
|
||||
List<Ruling> hrs = new ArrayList<>();
|
||||
|
||||
if (cvParsedTableCells != null) {
|
||||
for (CvParsedTableCell cvParsedTableCell : cvParsedTableCells) {
|
||||
Ruling topLine = createRuling(cvParsedTableCell.getX0(), cvParsedTableCell.getX1(), cvParsedTableCell.getY1(), cvParsedTableCell.getY1());
|
||||
Ruling baseLine = createRuling(cvParsedTableCell.getX0(), cvParsedTableCell.getX1(), cvParsedTableCell.getY0(), cvParsedTableCell.getY0());
|
||||
hrs.add(topLine);
|
||||
hrs.add(baseLine);
|
||||
}
|
||||
}
|
||||
|
||||
return hrs;
|
||||
}
|
||||
|
||||
|
||||
private Ruling createRuling(float tableCellX0, float tableCellX1, float tableCellY0, float tableCellY1) {
|
||||
|
||||
float x0 = tableCellX0;
|
||||
float x1 = tableCellX1;
|
||||
float y0 = tableCellY0;
|
||||
float y1 = tableCellY1;
|
||||
|
||||
if (x1 < x0) {
|
||||
x0 = tableCellX1;
|
||||
x1 = tableCellX0;
|
||||
}
|
||||
|
||||
if (y1 < y0) {
|
||||
y0 = tableCellY1;
|
||||
y1 = tableCellY0;
|
||||
}
|
||||
|
||||
return new Ruling(new Point2D.Float(x0, y0), new Point2D.Float(x1, y1));
|
||||
}
|
||||
|
||||
|
||||
private List<Ruling> collapseOrientedRulings(List<Ruling> lines) {
|
||||
|
||||
int COLINEAR_OR_PARALLEL_PIXEL_EXPAND_AMOUNT = 1;
|
||||
return collapseOrientedRulings(lines, COLINEAR_OR_PARALLEL_PIXEL_EXPAND_AMOUNT);
|
||||
}
|
||||
|
||||
|
||||
private List<Ruling> collapseOrientedRulings(List<Ruling> lines, int expandAmount) {
|
||||
|
||||
ArrayList<Ruling> rv = new ArrayList<>();
|
||||
lines.sort((a, b) -> {
|
||||
final float diff = a.getPosition() - b.getPosition();
|
||||
return Float.compare(diff == 0 ? a.getStart() - b.getStart() : diff, 0f);
|
||||
});
|
||||
|
||||
for (Ruling next_line : lines) {
|
||||
Ruling last = rv.isEmpty() ? null : rv.get(rv.size() - 1);
|
||||
// if current line colinear with next, and are "close enough": expand current line
|
||||
if (last != null && DoubleComparisons.feq(next_line.getPosition(), last.getPosition()) && last.nearlyIntersects(next_line, expandAmount)) {
|
||||
final float lastStart = last.getStart();
|
||||
final float lastEnd = last.getEnd();
|
||||
|
||||
final boolean lastFlipped = lastStart > lastEnd;
|
||||
final boolean nextFlipped = next_line.getStart() > next_line.getEnd();
|
||||
|
||||
boolean differentDirections = nextFlipped != lastFlipped;
|
||||
float nextS = differentDirections ? next_line.getEnd() : next_line.getStart();
|
||||
float nextE = differentDirections ? next_line.getStart() : next_line.getEnd();
|
||||
|
||||
final float newStart = lastFlipped ? Math.max(nextS, lastStart) : Math.min(nextS, lastStart);
|
||||
final float newEnd = lastFlipped ? Math.min(nextE, lastEnd) : Math.max(nextE, lastEnd);
|
||||
last.setStartEnd(newStart, newEnd);
|
||||
assert !last.oblique();
|
||||
} else if (next_line.length() == 0) {
|
||||
continue;
|
||||
} else {
|
||||
rv.add(next_line);
|
||||
}
|
||||
}
|
||||
return rv;
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,303 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.service;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationFooter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationSection;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationHeader;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.UnclassifiedText;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.AbstractTextContainer;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.TableCell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.Table;
|
||||
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
@Service
public class SectionsBuilderService {

    /**
     * Groups the document's classified text blocks into sections (split at headlines),
     * headers, footers and unclassified text, then assigns page images to the sections
     * they geometrically belong to.
     *
     * @param document document whose pages are already classified; its sections/headers/
     *                 footers/unclassifiedTexts are set by this method
     */
    public void buildSections(ClassificationDocument document) {

        List<AbstractTextContainer> chunkWords = new ArrayList<>();
        List<ClassificationSection> chunkBlockList = new ArrayList<>();
        List<ClassificationHeader> headers = new ArrayList<>();
        List<ClassificationFooter> footers = new ArrayList<>();
        List<UnclassifiedText> unclassifiedTexts = new ArrayList<>();

        // Previous non-header/footer/other block, carried across pages.
        AbstractTextContainer prev = null;

        String lastHeadline = "";
        Table previousTable = null;
        for (ClassificationPage classificationPage : document.getPages()) {
            // Per-page collectors; flushed into the document-level lists after each page.
            List<ClassificationTextBlock> header = new ArrayList<>();
            List<ClassificationTextBlock> footer = new ArrayList<>();
            List<ClassificationTextBlock> unclassifiedText = new ArrayList<>();
            for (AbstractTextContainer current : classificationPage.getTextBlocks()) {

                if (current.getClassification() == null) {
                    continue;
                }

                current.setPage(classificationPage.getPageNumber());

                if (current.getClassification().equals("Header")) {
                    header.add((ClassificationTextBlock) current);
                    continue;
                }

                if (current.getClassification().equals("Footer")) {
                    footer.add((ClassificationTextBlock) current);
                    continue;
                }

                if (current.getClassification().equals("Other")) {
                    unclassifiedText.add((ClassificationTextBlock) current);
                    continue;
                }

                // Flush the accumulated chunk into a section when a new headline run starts.
                // NOTE(review): by Java precedence this reads
                // (prev != null && isHeadline(current) && !isHeadline(prev)) || !document.isHeadlines(),
                // i.e. when the document has no headlines a section is flushed for EVERY block
                // (including an empty leading flush on the first block) — confirm this is intended.
                if (prev != null && current.getClassification().startsWith("H ") && !prev.getClassification().startsWith("H ") || !document.isHeadlines()) {
                    ClassificationSection chunkBlock = buildTextBlock(chunkWords, lastHeadline);
                    chunkBlock.setHeadline(lastHeadline);
                    if (document.isHeadlines()) {
                        lastHeadline = current.getText();
                    }
                    chunkBlockList.add(chunkBlock);
                    chunkWords = new ArrayList<>();
                    // Remember the last table of the finished section so header metadata can
                    // be propagated to continuation tables in later sections.
                    if (!chunkBlock.getTables().isEmpty()) {
                        previousTable = chunkBlock.getTables().get(chunkBlock.getTables().size() - 1);
                    }
                }
                if (current instanceof Table table) {
                    // Distribute header information for subsequent tables
                    mergeTableMetadata(table, previousTable);
                    previousTable = table;
                }
                chunkWords.add(current);
                prev = current;
            }

            if (!header.isEmpty()) {
                headers.add(new ClassificationHeader(header));
            }
            if (!footer.isEmpty()) {
                footers.add(new ClassificationFooter(footer));
            }
            if (!unclassifiedText.isEmpty()) {
                unclassifiedTexts.add(new UnclassifiedText(unclassifiedText));
            }
        }

        // Flush the trailing chunk that was not terminated by a new headline.
        ClassificationSection chunkBlock = buildTextBlock(chunkWords, lastHeadline);
        chunkBlock.setHeadline(lastHeadline);
        chunkBlockList.add(chunkBlock);

        document.setSections(chunkBlockList);
        document.setHeaders(headers);
        document.setFooters(footers);
        document.setUnclassifiedTexts(unclassifiedTexts);
        addImagesToSections(document);
    }


    /**
     * Assigns every page image to a section: preferably a section whose blocks on that page
     * geometrically enclose the image's position, otherwise the first section found on the
     * image's page (searching backwards through earlier pages if needed).
     */
    private void addImagesToSections(ClassificationDocument document) {

        // Map page number -> sections that have at least one block on that page.
        Map<Integer, List<ClassificationSection>> sectionMap = new HashMap<>();
        for (ClassificationSection section : document.getSections()) {
            for (AbstractTextContainer container : section.getPageBlocks()) {

                List<ClassificationSection> sectionsOnPage = sectionMap.computeIfAbsent(container.getPage(), c -> new ArrayList<>());
                if (sectionsOnPage.contains(section)) {
                    continue;
                }
                sectionsOnPage.add(section);
            }
        }

        // No section has any blocks at all: create an empty fallback section on page 1.
        if (sectionMap.isEmpty()) {
            ClassificationSection section = new ClassificationSection();
            document.getSections().add(section);
            sectionMap.computeIfAbsent(1, x -> new ArrayList<>()).add(section);
        }

        // first page is always a paragraph, else we can't process pages 1..N,
        // where N is the first found page with a paragraph
        if (sectionMap.get(1) == null) {
            ClassificationSection section = new ClassificationSection();
            document.getSections().add(section);
            sectionMap.computeIfAbsent(1, x -> new ArrayList<>()).add(section);
        }

        for (ClassificationPage classificationPage : document.getPages()) {
            // NOTE(review): assumes getImages() is never null for pages without images —
            // presumably the DTO initialises an empty list; verify.
            for (ClassifiedImage image : classificationPage.getImages()) {
                List<ClassificationSection> sectionsOnPage = sectionMap.get(classificationPage.getPageNumber());
                if (sectionsOnPage == null) {
                    // Walk backwards to the nearest earlier page that has sections; page 1 is
                    // guaranteed to have one (see above), so this terminates.
                    int i = classificationPage.getPageNumber();
                    while (sectionsOnPage == null) {
                        sectionsOnPage = sectionMap.get(i);
                        i--;
                    }
                }
                for (ClassificationSection section : sectionsOnPage) {
                    // Bounding box of the section's blocks on this page; null until the first
                    // block on the page is seen.
                    Float xMin = null;
                    Float yMin = null;
                    Float xMax = null;
                    Float yMax = null;

                    for (AbstractTextContainer abs : section.getPageBlocks()) {
                        if (abs.getPage() != classificationPage.getPageNumber()) {
                            continue;
                        }

                        // min/max may be swapped depending on page rotation, so normalise the
                        // extremes in both orders.
                        if (abs.getMinX() < abs.getMaxX()) {
                            if (xMin == null || abs.getMinX() < xMin) {
                                xMin = abs.getMinX();
                            }
                            if (xMax == null || abs.getMaxX() > xMax) {
                                xMax = abs.getMaxX();
                            }
                        } else {
                            if (xMin == null || abs.getMaxX() < xMin) {
                                xMin = abs.getMaxX();
                            }
                            if (xMax == null || abs.getMinX() > xMax) {
                                xMax = abs.getMinX();
                            }
                        }

                        if (abs.getMinY() < abs.getMaxY()) {
                            if (yMin == null || abs.getMinY() < yMin) {
                                yMin = abs.getMinY();
                            }
                            if (yMax == null || abs.getMaxY() > yMax) {
                                yMax = abs.getMaxY();
                            }
                        } else {
                            if (yMin == null || abs.getMaxY() < yMin) {
                                yMin = abs.getMaxY();
                            }
                            if (yMax == null || abs.getMinY() > yMax) {
                                yMax = abs.getMinY();
                            }
                        }

                    }

                    log.debug("Image position x: {}, y: {}", image.getPosition().getX(), image.getPosition().getY());
                    log.debug("Paragraph position xMin: {}, xMax: {}, yMin: {}, yMax: {}", xMin, xMax, yMin, yMax);

                    // Attach the image to every section whose bounding box contains its anchor point.
                    if (xMin != null && xMax != null && yMin != null && yMax != null && image.getPosition().getX() >= xMin && image.getPosition()
                            .getX() <= xMax && image.getPosition().getY() >= yMin && image.getPosition().getY() <= yMax) {
                        section.getImages().add(image);
                        image.setAppendedToSection(true);
                    }
                }
                // Fallback: no geometric match — append to the first section on the page.
                if (!image.isAppendedToSection()) {
                    log.debug("Image uses first paragraph");
                    sectionsOnPage.get(0).getImages().add(image);
                    image.setAppendedToSection(true);
                }
            }
        }
    }


    /**
     * Copies header-cell metadata from a previous table onto a continuation table that has
     * none of its own (e.g. a table split across pages whose header row only appears once).
     */
    private void mergeTableMetadata(Table currentTable, Table previousTable) {

        // Distribute header information for subsequent tables
        if (previousTable != null && hasInvalidHeaderInformation(currentTable) && hasValidHeaderInformation(previousTable)) {
            List<TableCell> previousTableNonHeaderRow = getRowWithNonHeaderCells(previousTable);
            List<TableCell> tableNonHeaderRow = getRowWithNonHeaderCells(currentTable);
            // Allow merging of tables if header row is separated from first logical non-header row
            if (previousTableNonHeaderRow.isEmpty() && previousTable.getRowCount() == 1 && previousTable.getRows().get(0).size() == tableNonHeaderRow.size()) {
                // The previous table is a lone header row: fabricate a non-header row whose
                // cells each reference the corresponding header cell.
                previousTableNonHeaderRow = previousTable.getRows().get(0).stream().map(cell -> {
                    TableCell fakeCell = new TableCell(cell.getPoints()[0], cell.getPoints()[2]);
                    fakeCell.setHeaderCells(Collections.singletonList(cell));
                    return fakeCell;
                }).collect(Collectors.toList());
            }
            if (previousTableNonHeaderRow.size() == tableNonHeaderRow.size()) {
                for (int i = currentTable.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
                    List<TableCell> row = currentTable.getRows().get(i);
                    if (row.size() == tableNonHeaderRow.size() && row.stream().allMatch(cell -> cell.getHeaderCells().isEmpty())) {
                        // Column-wise copy of the header references onto this row's cells.
                        for (int j = 0; j < row.size(); j++) {
                            row.get(j).setHeaderCells(previousTableNonHeaderRow.get(j).getHeaderCells());
                        }
                    }
                }
            }
        }
    }


    /**
     * Builds one section from the accumulated blocks; tables get a synthetic headline derived
     * from the last seen document headline.
     *
     * @param wordBlockList blocks collected since the previous section flush
     * @param lastHeadline  headline text of the section; may be empty
     */
    private ClassificationSection buildTextBlock(List<AbstractTextContainer> wordBlockList, String lastHeadline) {

        ClassificationSection section = new ClassificationSection();

        for (AbstractTextContainer container : wordBlockList) {
            if (container instanceof Table table) {

                if (lastHeadline == null || lastHeadline.isEmpty()) {
                    table.setHeadline("Text in table");
                } else {
                    table.setHeadline("Table in: " + lastHeadline);
                }

                section.getPageBlocks().add(table);
                continue;
            }

            ClassificationTextBlock wordBlock = (ClassificationTextBlock) container;
            section.getPageBlocks().add(wordBlock);
        }
        return section;
    }


    /** True when the table carries at least one header-cell reference. */
    private boolean hasValidHeaderInformation(Table table) {

        return !hasInvalidHeaderInformation(table);
    }


    /** True when no cell in the whole table references any header cell. */
    private boolean hasInvalidHeaderInformation(Table table) {

        return table.getRows().stream().flatMap(row -> row.stream().filter(cell -> !cell.getHeaderCells().isEmpty())).findAny().isEmpty();

    }


    /**
     * Returns the bottom-most multi-column row consisting only of non-header cells, or an
     * empty list if none exists. Single-cell rows are skipped (likely spanning captions).
     */
    private List<TableCell> getRowWithNonHeaderCells(Table table) {

        for (int i = table.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
            List<TableCell> row = table.getRows().get(i);
            if (row.size() == 1) {
                continue;
            }
            boolean allNonHeader = true;
            for (TableCell cell : row) {
                if (cell.isHeaderCell()) {
                    allNonHeader = false;
                    break;
                }
            }
            if (allNonHeader) {
                return row;
            }
        }

        return Collections.emptyList();

    }

}
|
||||
@ -0,0 +1,338 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.service;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.TableCell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.QuickSort;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.DoubleComparisons;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.AbstractTextContainer;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.Rectangle;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.Table;
|
||||
|
||||
@Service
|
||||
public class TableExtractionService {
|
||||
|
||||
    /**
     * Orders points column-major: primarily by X, then by Y, after rounding both coordinates
     * to 2 decimal places so float noise from PDF coordinates does not affect the ordering.
     * NOTE(review): not referenced in the visible part of this class — presumably used further
     * down; confirm before removing.
     */
    private static final Comparator<Point2D> X_FIRST_POINT_COMPARATOR = (arg0, arg1) -> {

        int rv = 0;
        float arg0X = DoubleComparisons.round(arg0.getX(), 2);
        float arg0Y = DoubleComparisons.round(arg0.getY(), 2);
        float arg1X = DoubleComparisons.round(arg1.getX(), 2);
        float arg1Y = DoubleComparisons.round(arg1.getY(), 2);

        // X decides first; Y only breaks exact (rounded) X ties.
        if (arg0X > arg1X) {
            rv = 1;
        } else if (arg0X < arg1X) {
            rv = -1;
        } else if (arg0Y > arg1Y) {
            rv = 1;
        } else if (arg0Y < arg1Y) {
            rv = -1;
        }
        return rv;
    };
|
||||
    /**
     * Orders points row-major (reading order): primarily by Y, then by X, after rounding both
     * coordinates to 2 decimal places to ignore float noise. Used to sort ruling intersection
     * points before cell detection in {@code findCells}.
     */
    private static final Comparator<Point2D> POINT_COMPARATOR = (arg0, arg1) -> {

        int rv = 0;
        float arg0X = DoubleComparisons.round(arg0.getX(), 2);
        float arg0Y = DoubleComparisons.round(arg0.getY(), 2);
        float arg1X = DoubleComparisons.round(arg1.getX(), 2);
        float arg1Y = DoubleComparisons.round(arg1.getY(), 2);

        // Y decides first; X only breaks exact (rounded) Y ties.
        if (arg0Y > arg1Y) {
            rv = 1;
        } else if (arg0Y < arg1Y) {
            rv = -1;
        } else if (arg0X > arg1X) {
            rv = 1;
        } else if (arg0X < arg1X) {
            rv = -1;
        }
        return rv;
    };
|
||||
|
||||
|
||||
/**
|
||||
* Finds tables on a classificationPage and moves textblocks into cells of the found tables.
|
||||
* Note: This algorithm uses Pdf Coordinate System where {0,0} rotated with the classificationPage rotation.
|
||||
* 0 -> LowerLeft
|
||||
* 90 -> UpperLeft
|
||||
* 180 -> UpperRight
|
||||
* 270 -> LowerRight
|
||||
*
|
||||
* DirAdj (Text direction adjusted) values can not be used here.
|
||||
*
|
||||
* @param cleanRulings The lines used to build the table.
|
||||
* @param classificationPage ClassificationPage object that contains textblocks and statistics.
|
||||
*/
|
||||
public void removeRedundantTableCells(CleanRulings cleanRulings, ClassificationPage classificationPage) {
|
||||
|
||||
List<TableCell> cells = findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
|
||||
List<ClassificationTextBlock> toBeRemoved = new ArrayList<>();
|
||||
|
||||
for (AbstractTextContainer abstractTextContainer : classificationPage.getTextBlocks()) {
|
||||
ClassificationTextBlock textBlock = (ClassificationTextBlock) abstractTextContainer;
|
||||
for (TableCell cell : cells) {
|
||||
if (cell.intersects(textBlock.getPdfMinX(),
|
||||
textBlock.getPdfMinY(),
|
||||
textBlock.getPdfMaxX() - textBlock.getPdfMinX(),
|
||||
textBlock.getPdfMaxY() - textBlock.getPdfMinY())) {
|
||||
cell.addTextBlock(textBlock);
|
||||
toBeRemoved.add(textBlock);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
cells = new ArrayList<>(new HashSet<>(cells));
|
||||
QuickSort.sort(cells, Rectangle.ILL_DEFINED_ORDER);
|
||||
|
||||
List<Rectangle> spreadsheetAreas = findSpreadsheetsFromCells(cells).stream().filter(r -> r.getWidth() > 0f && r.getHeight() > 0f).collect(Collectors.toList());
|
||||
|
||||
List<Table> tables = new ArrayList<>();
|
||||
for (Rectangle area : spreadsheetAreas) {
|
||||
|
||||
List<TableCell> overlappingCells = new ArrayList<>();
|
||||
for (TableCell c : cells) {
|
||||
if (c.intersects(area)) {
|
||||
overlappingCells.add(c);
|
||||
}
|
||||
}
|
||||
tables.add(new Table(overlappingCells, area, classificationPage.getRotation()));
|
||||
}
|
||||
|
||||
for (Table table : tables) {
|
||||
int position = -1;
|
||||
|
||||
Iterator<AbstractTextContainer> itty = classificationPage.getTextBlocks().iterator();
|
||||
while (itty.hasNext()) {
|
||||
AbstractTextContainer textBlock = itty.next();
|
||||
if (textBlock instanceof ClassificationTextBlock ? table.containsBlock((ClassificationTextBlock) textBlock) : table.contains(textBlock) && position == -1) {
|
||||
position = classificationPage.getTextBlocks().indexOf(textBlock);
|
||||
}
|
||||
}
|
||||
if (position != -1) {
|
||||
classificationPage.getTextBlocks().add(position, table);
|
||||
}
|
||||
}
|
||||
|
||||
classificationPage.getTextBlocks().removeAll(toBeRemoved);
|
||||
}
|
||||
|
||||
|
||||
    /**
     * Detects rectangular table cells from the grid of horizontal/vertical ruling
     * intersections (tabula-style). For each intersection (taken as a candidate top-left
     * corner), the nearest intersection directly below and directly to the right that lie on
     * the same rulings define a cell's bottom-right corner.
     *
     * NOTE(review): relies on exact coordinate equality (==) between intersection points —
     * this works because endpoints were snapped onto shared coordinates beforehand.
     *
     * @param horizontalRulingLines collapsed horizontal rulings of the page
     * @param verticalRulingLines   collapsed vertical rulings of the page
     * @return all detected cells, in top-left scan order
     */
    public List<TableCell> findCells(List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {

        List<TableCell> cellsFound = new ArrayList<>();
        // Each intersection point maps to the [horizontal, vertical] ruling pair crossing there.
        Map<Point2D, Ruling[]> intersectionPoints = Ruling.findIntersections(horizontalRulingLines, verticalRulingLines);
        List<Point2D> intersectionPointsList = new ArrayList<>(intersectionPoints.keySet());
        intersectionPointsList.sort(POINT_COMPARATOR);

        for (int i = 0; i < intersectionPointsList.size(); i++) {
            Point2D topLeft = intersectionPointsList.get(i);
            Ruling[] hv = intersectionPoints.get(topLeft);

            // CrossingPointsDirectlyBelow( topLeft );
            List<Point2D> xPoints = new ArrayList<>();
            // CrossingPointsDirectlyToTheRight( topLeft );
            List<Point2D> yPoints = new ArrayList<>();

            // Only points after i can be below/right of topLeft thanks to the row-major sort.
            for (Point2D p : intersectionPointsList.subList(i, intersectionPointsList.size())) {
                if (p.getX() == topLeft.getX() && p.getY() > topLeft.getY()) {
                    xPoints.add(p);
                }
                if (p.getY() == topLeft.getY() && p.getX() > topLeft.getX()) {
                    yPoints.add(p);
                }
            }
            outer:
            for (Point2D xPoint : xPoints) {
                // is there a vertical edge b/w topLeft and xPoint?
                if (!hv[1].equals(intersectionPoints.get(xPoint)[1])) {
                    continue;
                }
                for (Point2D yPoint : yPoints) {
                    // is there an horizontal edge b/w topLeft and yPoint ?
                    if (!hv[0].equals(intersectionPoints.get(yPoint)[0])) {
                        continue;
                    }
                    // Candidate bottom-right corner; it must exist and share the bottom
                    // horizontal ruling with xPoint and the right vertical ruling with yPoint.
                    Point2D btmRight = new Point2D.Float((float) yPoint.getX(), (float) xPoint.getY());
                    if (intersectionPoints.containsKey(btmRight) && intersectionPoints.get(btmRight)[0].equals(intersectionPoints.get(xPoint)[0]) && intersectionPoints.get(btmRight)[1].equals(
                            intersectionPoints.get(yPoint)[1])) {
                        cellsFound.add(new TableCell(topLeft, btmRight));
                        // Smallest enclosing rectangle found; move on to the next top-left.
                        break outer;
                    }
                }
            }
        }

        // TODO create cells for vertical ruling lines with aligned endpoints at the top/bottom of a grid
        // that aren't connected with an horizontal ruler?
        // see: https://github.com/jazzido/tabula-extractor/issues/78#issuecomment-41481207

        return cellsFound;
    }
|
||||
|
||||
|
||||
private List<Rectangle> findSpreadsheetsFromCells(List<? extends Rectangle> cells) {
|
||||
// via: http://stackoverflow.com/questions/13746284/merging-multiple-adjacent-rectangles-into-one-polygon
|
||||
List<Rectangle> rectangles = new ArrayList<>();
|
||||
Set<Point2D> pointSet = new HashSet<>();
|
||||
Map<Point2D, Point2D> edgesH = new HashMap<>();
|
||||
Map<Point2D, Point2D> edgesV = new HashMap<>();
|
||||
int i = 0;
|
||||
|
||||
for (Rectangle cell : cells) {
|
||||
for (Point2D pt : cell.getPoints()) {
|
||||
if (pointSet.contains(pt)) { // shared vertex, remove it
|
||||
pointSet.remove(pt);
|
||||
} else {
|
||||
pointSet.add(pt);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// X first sort
|
||||
List<Point2D> pointsSortX = new ArrayList<>(pointSet);
|
||||
pointsSortX.sort(X_FIRST_POINT_COMPARATOR);
|
||||
// Y first sort
|
||||
List<Point2D> pointsSortY = new ArrayList<>(pointSet);
|
||||
pointsSortY.sort(POINT_COMPARATOR);
|
||||
|
||||
while (i < pointSet.size()) {
|
||||
float currY = (float) pointsSortY.get(i).getY();
|
||||
while (i < pointSet.size() && DoubleComparisons.feq(pointsSortY.get(i).getY(), currY)) {
|
||||
edgesH.put(pointsSortY.get(i), pointsSortY.get(i + 1));
|
||||
edgesH.put(pointsSortY.get(i + 1), pointsSortY.get(i));
|
||||
i += 2;
|
||||
}
|
||||
}
|
||||
|
||||
i = 0;
|
||||
while (i < pointSet.size()) {
|
||||
float currX = (float) pointsSortX.get(i).getX();
|
||||
while (i < pointSet.size() && DoubleComparisons.feq(pointsSortX.get(i).getX(), currX)) {
|
||||
edgesV.put(pointsSortX.get(i), pointsSortX.get(i + 1));
|
||||
edgesV.put(pointsSortX.get(i + 1), pointsSortX.get(i));
|
||||
i += 2;
|
||||
}
|
||||
}
|
||||
|
||||
// Get all the polygons
|
||||
List<List<PolygonVertex>> polygons = new ArrayList<>();
|
||||
Point2D nextVertex;
|
||||
while (!edgesH.isEmpty()) {
|
||||
ArrayList<PolygonVertex> polygon = new ArrayList<>();
|
||||
Point2D first = edgesH.keySet().iterator().next();
|
||||
polygon.add(new PolygonVertex(first, Direction.HORIZONTAL));
|
||||
edgesH.remove(first);
|
||||
|
||||
while (true) {
|
||||
PolygonVertex curr = polygon.get(polygon.size() - 1);
|
||||
PolygonVertex lastAddedVertex;
|
||||
if (curr.direction == Direction.HORIZONTAL) {
|
||||
nextVertex = edgesV.get(curr.point);
|
||||
edgesV.remove(curr.point);
|
||||
lastAddedVertex = new PolygonVertex(nextVertex, Direction.VERTICAL);
|
||||
} else {
|
||||
nextVertex = edgesH.get(curr.point);
|
||||
edgesH.remove(curr.point);
|
||||
lastAddedVertex = new PolygonVertex(nextVertex, Direction.HORIZONTAL);
|
||||
}
|
||||
polygon.add(lastAddedVertex);
|
||||
|
||||
if (lastAddedVertex.equals(polygon.get(0))) {
|
||||
// closed polygon
|
||||
polygon.remove(polygon.size() - 1);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
for (PolygonVertex vertex : polygon) {
|
||||
edgesH.remove(vertex.point);
|
||||
edgesV.remove(vertex.point);
|
||||
}
|
||||
polygons.add(polygon);
|
||||
}
|
||||
|
||||
// calculate grid-aligned minimum area rectangles for each found polygon
|
||||
for (List<PolygonVertex> poly : polygons) {
|
||||
float top = Float.MAX_VALUE;
|
||||
float left = Float.MAX_VALUE;
|
||||
float bottom = Float.MIN_VALUE;
|
||||
float right = Float.MIN_VALUE;
|
||||
for (PolygonVertex pt : poly) {
|
||||
top = (float) Math.min(top, pt.point.getY());
|
||||
left = (float) Math.min(left, pt.point.getX());
|
||||
bottom = (float) Math.max(bottom, pt.point.getY());
|
||||
right = (float) Math.max(right, pt.point.getX());
|
||||
}
|
||||
rectangles.add(new Rectangle(top, left, right - left, bottom - top));
|
||||
}
|
||||
|
||||
return rectangles;
|
||||
}
|
||||
|
||||
|
||||
// Orientation of the boundary edge leaving a vertex during the polygon outline walk
// in findSpreadsheetsFromCells (the walk alternates HORIZONTAL and VERTICAL steps).
private enum Direction {
    HORIZONTAL,
    VERTICAL
}
|
||||
|
||||
static class PolygonVertex {
|
||||
|
||||
Point2D point;
|
||||
Direction direction;
|
||||
|
||||
|
||||
PolygonVertex(Point2D point, Direction direction) {
|
||||
|
||||
this.direction = direction;
|
||||
this.point = point;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public boolean equals(Object other) {
|
||||
|
||||
if (this == other) {
|
||||
return true;
|
||||
}
|
||||
if (!(other instanceof PolygonVertex)) {
|
||||
return false;
|
||||
}
|
||||
return this.point.equals(((PolygonVertex) other).point);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
|
||||
return this.point.hashCode();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return String.format("%s[point=%s,direction=%s]", this.getClass().getName(), this.point.toString(), this.direction.toString());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,142 @@
|
||||
/*
|
||||
* CohenSutherland.java
|
||||
* --------------------
|
||||
* (c) 2007 by Intevation GmbH
|
||||
*
|
||||
* @author Sascha L. Teichmann (teichmann@intevation.de)
|
||||
* @author Ludwig Reiter (ludwig@intevation.de)
|
||||
*
|
||||
* This program is free software under the LGPL (>=v2.1)
|
||||
* Read the file LICENSE.txt coming with the sources for details.
|
||||
*/
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.utils;
|
||||
|
||||
import java.awt.geom.Line2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
|
||||
/**
|
||||
* Implements the well known Cohen Sutherland line
|
||||
* clipping algorithm (line against clip rectangle).
|
||||
*/
|
||||
@SuppressWarnings("all")
public final class CohenSutherlandClipping {

    // Region codes ("outcodes"): bit flags describing where a point lies relative to the
    // clip rectangle. INSIDE is 0; LEFT/RIGHT and BOTTOM/TOP bits can be OR-ed together.
    private static final int INSIDE = 0;
    private static final int LEFT = 1;
    private static final int RIGHT = 2;
    private static final int BOTTOM = 4;
    private static final int TOP = 8;
    // Bounds of the current clip rectangle (all 0 until setClip is called).
    private double xMin;
    private double yMin;
    private double xMax;
    private double yMax;


    /**
     * Creates a Cohen Sutherland clipper with clip rect (0, 0, 0, 0).
     */
    public CohenSutherlandClipping() {

    }


    /**
     * Creates a Cohen Sutherland clipper with the given clip rectangle.
     *
     * @param clip the clip rectangle to use
     */
    public CohenSutherlandClipping(Rectangle2D clip) {

        setClip(clip);
    }


    /**
     * Sets the clip rectangle.
     *
     * @param clip the clip rectangle
     */
    public void setClip(Rectangle2D clip) {

        xMin = clip.getX();
        xMax = xMin + clip.getWidth();
        yMin = clip.getY();
        yMax = yMin + clip.getHeight();
    }


    // Computes the outcode of a point: INSIDE when within the clip rectangle, otherwise a
    // combination of one horizontal (LEFT/RIGHT) and/or one vertical (BOTTOM/TOP) bit.
    private final int regionCode(double x, double y) {

        int code = x < xMin ? LEFT : x > xMax ? RIGHT : INSIDE;
        if (y < yMin) {
            code |= BOTTOM;
        } else if (y > yMax) {
            code |= TOP;
        }
        return code;
    }


    /**
     * Clips a given line against the clip rectangle.
     * The modification (if needed) is done in place.
     *
     * @param line the line to clip
     * @return true if line is clipped, false if line is
     * totally outside the clip rect.
     */
    public boolean clip(Line2D.Float line) {

        double p1x = line.getX1();
        double p1y = line.getY1();
        double p2x = line.getX2();
        double p2y = line.getY2();

        // (qx, qy): intersection of the line with the clip boundary currently being tested.
        double qx = 0d;
        double qy = 0d;

        boolean vertical = p1x == p2x;

        // Slope is only used in the LEFT/RIGHT branches, which a vertical line can never
        // reach (both endpoints share the same x, so it would be trivially rejected first).
        double slope = vertical ? 0d : (p2y - p1y) / (p2x - p1x);

        int c1 = regionCode(p1x, p1y);
        int c2 = regionCode(p2x, p2y);

        // Repeatedly move an outside endpoint onto the clip boundary until both endpoints
        // are inside, or the line is proven to lie fully outside.
        while (c1 != INSIDE || c2 != INSIDE) {

            // Both endpoints share an outside half-plane: trivial reject.
            if ((c1 & c2) != INSIDE) {
                return false;
            }

            // Work on an endpoint that is outside.
            int c = c1 == INSIDE ? c2 : c1;

            if ((c & LEFT) != INSIDE) {
                qx = xMin;
                // feq avoids multiplying a near-zero delta by the slope (numeric noise).
                qy = (DoubleComparisons.feq(qx, p1x) ? 0 : qx - p1x) * slope + p1y;
            } else if ((c & RIGHT) != INSIDE) {
                qx = xMax;
                qy = (DoubleComparisons.feq(qx, p1x) ? 0 : qx - p1x) * slope + p1y;
            } else if ((c & BOTTOM) != INSIDE) {
                qy = yMin;
                qx = vertical ? p1x : (DoubleComparisons.feq(qy, p1y) ? 0 : qy - p1y) / slope + p1x;
            } else if ((c & TOP) != INSIDE) {
                qy = yMax;
                qx = vertical ? p1x : (DoubleComparisons.feq(qy, p1y) ? 0 : qy - p1y) / slope + p1x;
            }

            // Replace the clipped endpoint and recompute its outcode.
            if (c == c1) {
                p1x = qx;
                p1y = qy;
                c1 = regionCode(p1x, p1y);
            } else {
                p2x = qx;
                p2y = qy;
                c2 = regionCode(p2x, p2y);
            }
        }
        line.setLine(p1x, p1y, p2x, p2y);
        return true;
    }

}
|
||||
// end of file
|
||||
@ -0,0 +1,30 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.utils;
|
||||
|
||||
import java.math.BigDecimal;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@UtilityClass
|
||||
@SuppressWarnings("all")
|
||||
public final class DoubleComparisons {
|
||||
|
||||
private final static float EPSILON = 0.1f;
|
||||
|
||||
|
||||
public static boolean feq(double f1, double f2) {
|
||||
|
||||
return (Math.abs(f1 - f2) < EPSILON);
|
||||
}
|
||||
|
||||
|
||||
public static float round(double d, int decimalPlace) {
|
||||
BigDecimal bd = BigDecimal.valueOf(d);
|
||||
bd = bd.setScale(decimalPlace, BigDecimal.ROUND_HALF_UP);
|
||||
return bd.floatValue();
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
@ -0,0 +1,119 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.utils;
|
||||
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
@SuppressWarnings("all")
|
||||
public final class PositionUtils {
|
||||
|
||||
// TODO This currently uses pdf coord system. In the futher this should use java coord system.
|
||||
// Note: DirAdj (TextDirection Adjusted) can not be user for this.
|
||||
public boolean isWithinBodyTextFrame(Rectangle btf, ClassificationTextBlock textBlock) {
|
||||
|
||||
if (btf == null || textBlock == null) {
|
||||
return false;
|
||||
}
|
||||
|
||||
double threshold = textBlock.getMostPopularWordHeight() * 3;
|
||||
|
||||
if (textBlock.getPdfMinX() + threshold > btf.getTopLeft().getX() && textBlock.getPdfMaxX() - threshold < btf.getTopLeft()
|
||||
.getX() + btf.getWidth() && textBlock.getPdfMinY() + threshold > btf.getTopLeft().getY() && textBlock.getPdfMaxY() - threshold < btf.getTopLeft()
|
||||
.getY() + btf.getHeight()) {
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
// TODO This currently uses pdf coord system. In the futher this should use java coord system.
|
||||
// Note: DirAdj (TextDirection Adjusted) can not be user for this.
|
||||
public boolean isOverBodyTextFrame(Rectangle btf, ClassificationTextBlock textBlock, int rotation) {
|
||||
|
||||
if (btf == null || textBlock == null) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (rotation == 90 && textBlock.getPdfMaxX() < btf.getTopLeft().getX()) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (rotation == 180 && textBlock.getPdfMaxY() < btf.getTopLeft().getY()) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (rotation == 270 && textBlock.getPdfMinX() > btf.getTopLeft().getX() + btf.getWidth()) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (rotation == 0 && textBlock.getPdfMinY() > btf.getTopLeft().getY() + btf.getHeight()) {
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// TODO This currently uses pdf coord system. In the futher this should use java coord system.
|
||||
// Note: DirAdj (TextDirection Adjusted) can not be user for this.
|
||||
public boolean isUnderBodyTextFrame(Rectangle btf, ClassificationTextBlock textBlock, int rotation) {
|
||||
|
||||
if (btf == null || textBlock == null) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (rotation == 90 && textBlock.getPdfMinX() > btf.getTopLeft().getX() + btf.getWidth()) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (rotation == 180 && textBlock.getPdfMinY() > btf.getTopLeft().getY() + btf.getHeight()) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (rotation == 270 && textBlock.getPdfMaxX() < btf.getTopLeft().getX()) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (rotation == 0 && textBlock.getPdfMaxY() < btf.getTopLeft().getY()) {
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// TODO This currently uses pdf coord system. In the futher this should use java coord system.
|
||||
// Note: DirAdj (TextDirection Adjusted) can not be user for this.
|
||||
public boolean isTouchingUnderBodyTextFrame(Rectangle btf, ClassificationTextBlock textBlock) {
|
||||
|
||||
//TODO Currently this is not working for rotated pages.
|
||||
|
||||
if (btf == null || textBlock == null) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (textBlock.getMinY() < btf.getTopLeft().getY()) {
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
public float getHeightDifferenceBetweenChunkWordAndDocumentWord(ClassificationTextBlock textBlock, Float documentMostPopularWordHeight) {
|
||||
|
||||
return textBlock.getMostPopularWordHeight() - documentMostPopularWordHeight;
|
||||
}
|
||||
|
||||
|
||||
public Float getApproxLineCount(ClassificationTextBlock textBlock) {
|
||||
|
||||
return textBlock.getHeight() / textBlock.getMostPopularWordHeight();
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,109 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.utils;
|
||||
|
||||
import java.util.ArrayDeque;
|
||||
import java.util.Comparator;
|
||||
import java.util.Deque;
|
||||
import java.util.List;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
/**
|
||||
* Copied and minimal modified from PDFBox.
|
||||
*/
|
||||
@UtilityClass
public final class QuickSort {

    // Natural-ordering comparator backing the single-argument sort overload.
    private static final Comparator<? extends Comparable> OBJCOMP = new Comparator<Comparable>() {
        @Override
        public int compare(Comparable object1, Comparable object2) {

            return object1.compareTo(object2);
        }
    };


    /**
     * Sorts the given list using the given comparator.
     *
     * @param <T> type of the objects to be sorted.
     * @param list list to be sorted
     * @param cmp comparator used to compare the objects within the list
     */
    public static <T> void sort(List<T> list, Comparator<? super T> cmp) {

        int size = list.size();
        // Lists of 0 or 1 elements are already sorted.
        if (size < 2) {
            return;
        }
        quicksort(list, cmp);
    }


    /**
     * Sorts the given list using compareTo as comparator.
     *
     * @param <T> type of the objects to be sorted.
     * @param list list to be sorted
     */
    public static <T extends Comparable> void sort(List<T> list) {

        sort(list, (Comparator<T>) OBJCOMP);
    }


    // Iterative quicksort: the explicit stack holds [left, right) range pairs instead of
    // using recursion, so deep partitions cannot overflow the call stack.
    // Note: like any plain quicksort, this sort is NOT stable.
    private static <T> void quicksort(List<T> list, Comparator<? super T> cmp) {

        Deque<Integer> stack = new ArrayDeque<Integer>();
        stack.push(0);
        stack.push(list.size());
        while (!stack.isEmpty()) {
            // Ranges are pushed as (left, right), so they pop in reverse order.
            int right = stack.pop();
            int left = stack.pop();
            if (right - left < 2) {
                continue;
            }
            // Middle element as pivot candidate; partition returns its final position.
            int p = left + ((right - left) / 2);
            p = partition(list, cmp, p, left, right);

            stack.push(p + 1);
            stack.push(right);

            stack.push(left);
            stack.push(p);
        }
    }


    // Partitions list[start, end) around the pivot at index p; the pivot is parked at
    // end - 1 during the scan and swapped back to its final index, which is returned.
    private static <T> int partition(List<T> list, Comparator<? super T> cmp, int p, int start, int end) {

        int l = start;
        int h = end - 2;
        T piv = list.get(p);
        swap(list, p, end - 1);

        // Two-pointer scan: advance l past elements <= pivot, retreat h past elements
        // >= pivot, swap when both pointers are stuck.
        while (l < h) {
            if (cmp.compare(list.get(l), piv) <= 0) {
                l++;
            } else if (cmp.compare(piv, list.get(h)) <= 0) {
                h--;
            } else {
                swap(list, l, h);
            }
        }
        int idx = h;
        if (cmp.compare(list.get(h), piv) < 0) {
            idx++;
        }
        // Restore the pivot to its final sorted position.
        swap(list, end - 1, idx);
        return idx;
    }


    // Exchanges the elements at indices i and j.
    private static <T> void swap(List<T> list, int i, int j) {

        T tmp = list.get(i);
        list.set(i, list.get(j));
        list.set(j, tmp);
    }

}
|
||||
@ -0,0 +1,64 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.utils;
|
||||
|
||||
import java.awt.geom.Line2D;
|
||||
import java.awt.geom.Point2D;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.Ruling;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public final class RulingTextDirAdjustUtil {
|
||||
|
||||
/**
|
||||
* Converts a ruling (line of a table) the same way TextPositions are converted in PDFBox.
|
||||
* This will get the y position of the text, adjusted so that 0,0 is upper left and it is adjusted based on the text direction.
|
||||
*
|
||||
* See org.apache.pdfbox.text.TextPosition
|
||||
*/
|
||||
public Line2D.Float convertToDirAdj(Ruling ruling, float dir, float pageWidth, float pageHeight) {
|
||||
|
||||
return new Line2D.Float(convertPoint(ruling.x1, ruling.y1, dir, pageWidth, pageHeight), convertPoint(ruling.x2, ruling.y2, dir, pageWidth, pageHeight));
|
||||
}
|
||||
|
||||
|
||||
private Point2D convertPoint(float x, float y, float dir, float pageWidth, float pageHeight) {
|
||||
|
||||
var xAdj = getXRot(x, y, dir, pageWidth, pageHeight);
|
||||
var yLowerLeftRot = getYLowerLeftRot(x, y, dir, pageWidth, pageHeight);
|
||||
var yAdj = dir == 0 || dir == 180 ? pageHeight - yLowerLeftRot : pageWidth - yLowerLeftRot;
|
||||
return new Point2D.Float(xAdj, yAdj);
|
||||
}
|
||||
|
||||
|
||||
@SuppressWarnings("SuspiciousNameCombination")
|
||||
private float getXRot(float x, float y, float dir, float pageWidth, float pageHeight) {
|
||||
|
||||
if (dir == 0) {
|
||||
return x;
|
||||
} else if (dir == 90) {
|
||||
return y;
|
||||
} else if (dir == 180) {
|
||||
return pageWidth - x;
|
||||
} else if (dir == 270) {
|
||||
return pageHeight - y;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
private float getYLowerLeftRot(float x, float y, float dir, float pageWidth, float pageHeight) {
|
||||
|
||||
if (dir == 0) {
|
||||
return y;
|
||||
} else if (dir == 90) {
|
||||
return pageWidth - x;
|
||||
} else if (dir == 180) {
|
||||
return pageHeight - y;
|
||||
} else if (dir == 270) {
|
||||
return x;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,19 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.utils;
|
||||
|
||||
import java.util.regex.Pattern;

import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public final class TextNormalizationUtilities {
|
||||
|
||||
/**
|
||||
* Revert hyphenation due to line breaks.
|
||||
*
|
||||
* @param text Text to be processed.
|
||||
* @return Text without line-break hyphenation.
|
||||
*/
|
||||
public static String removeHyphenLineBreaks(String text) {
|
||||
|
||||
return text.replaceAll("([^\\s\\d\\-]{2,500})[\\-\\u00AD]\\R", "$1");
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,386 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.factory;
|
||||
|
||||
import static java.lang.String.format;
|
||||
import static java.util.stream.Collectors.groupingBy;
|
||||
import static java.util.stream.Collectors.toList;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.NoSuchElementException;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.DocumentGraph;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.FooterNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.HeaderNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.HeadlineNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.ImageNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.ParagraphNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SectionNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SemanticNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.TableCellNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.TableNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.AtomicTextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.AbstractTextContainer;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationFooter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationHeader;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.Table;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.TableCell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.TextPositionSequence;
|
||||
|
||||
|
||||
@Service
|
||||
public class DocumentGraphFactory {
|
||||
|
||||
public static final double TABLE_CELL_MERGE_CONTENTS_SIZE_THRESHOLD = 0.05;
|
||||
|
||||
|
||||
/**
 * Builds the document graph (pages, sections, tables, headlines, paragraphs and a table
 * of contents) from a fully classified document.
 *
 * @param document classification result to convert
 * @return the assembled {@link DocumentGraph} with its aggregate text block attached
 */
public DocumentGraph buildDocumentGraph(ClassificationDocument document) {

    TextBlockFactory textBlockFactory = new TextBlockFactory();
    // Context bundles the mutable state shared by all add* helpers during the build.
    Context context = new Context(new TableOfContents(), new HashMap<>(), new LinkedList<>(), new LinkedList<>(), textBlockFactory);

    // One PageNode per classification page. The AtomicInteger starts at 1 — presumably a
    // per-page running counter used by the helpers; TODO confirm against Context's usage.
    document.getPages().stream().map(this::buildPage).forEach(page -> context.pages().put(page, new AtomicInteger(1)));
    // Collect all section images up front so the image helpers can see them.
    document.getSections().stream().flatMap(section -> section.getImages().stream()).forEach(image -> context.images().add(image));
    addSections(document, context);
    addHeaderAndFooterToEachPage(document, context);

    DocumentGraph documentGraph = DocumentGraph.builder().numberOfPages(context.pages.size()).pages(context.pages.keySet()).tableOfContents(context.tableOfContents).build();

    // Derive the document-level text block from the finished page/section structure.
    documentGraph.setTextBlock(documentGraph.buildTextBlock());
    return documentGraph;
}
|
||||
|
||||
|
||||
private void addSections(ClassificationDocument document, Context context) {
|
||||
|
||||
document.getSections().forEach(section -> addSection(null, section.getPageBlocks(), section.getImages(), context));
|
||||
}
|
||||
|
||||
|
||||
/**
 * Converts one classified section into a {@link SectionNode}: registers it on every page it
 * spans, creates its table-of-contents entry (root entry for top-level sections, child entry
 * otherwise) and adds its contents — paragraphs/headlines, tables and images.
 *
 * @param parentNode parent semantic node, or null for a top-level section
 * @param pageBlocks text containers (text blocks and tables) belonging to the section
 * @param images     classified images belonging to the section
 * @param context    shared build state
 */
private void addSection(SemanticNode parentNode, List<AbstractTextContainer> pageBlocks, List<ClassifiedImage> images, Context context) {

    Map<Integer, List<AbstractTextContainer>> blocksPerPage = pageBlocks.stream().collect(groupingBy(AbstractTextContainer::getPage));
    SectionNode sectionNode = SectionNode.builder().entities(new HashSet<>()).tableOfContents(context.tableOfContents()).build();

    context.sections().add(sectionNode);
    // The section appears in the main body of every page that contributes blocks to it.
    blocksPerPage.keySet().forEach(pageNumber -> addSectionNodeToPageNode(context, sectionNode, pageNumber));

    List<Integer> tocId;
    if (parentNode == null) {
        tocId = context.tableOfContents.createNewEntryAndReturnId(NodeType.SECTION, sectionNode);
    } else {
        tocId = context.tableOfContents.createNewChildEntryAndReturnId(parentNode.getTocId(), NodeType.SECTION, sectionNode);
    }
    sectionNode.setTocId(tocId);
    // Blocks that vertically overlap an earlier block get merged into that block's node,
    // so they must not produce a paragraph/headline of their own.
    Set<AbstractTextContainer> alreadyMerged = new HashSet<>();
    for (AbstractTextContainer abstractTextContainer : pageBlocks) {

        if (alreadyMerged.contains(abstractTextContainer)) {
            continue;
        }

        if (abstractTextContainer instanceof ClassificationTextBlock) {
            List<ClassificationTextBlock> textBlocks = findTextBlocksWithSameClassificationAndAlignsY(abstractTextContainer, pageBlocks);
            alreadyMerged.addAll(textBlocks);
            addParagraphOrHeadline(sectionNode, (ClassificationTextBlock) abstractTextContainer, context, textBlocks);
        }
        if (abstractTextContainer instanceof Table) {
            addTable(sectionNode, (Table) abstractTextContainer, context);
        }
    }
    for (ClassifiedImage image : images) {

        addImage(sectionNode, image, context);
    }
}
|
||||
|
||||
|
||||
private static List<ClassificationTextBlock> findTextBlocksWithSameClassificationAndAlignsY(AbstractTextContainer atc, List<AbstractTextContainer> pageBlocks) {
|
||||
|
||||
return pageBlocks.stream()
|
||||
.filter(abstractTextContainer -> !abstractTextContainer.equals(atc))
|
||||
.filter(abstractTextContainer -> abstractTextContainer.getPage() == atc.getPage())
|
||||
.filter(abstractTextContainer -> abstractTextContainer instanceof ClassificationTextBlock)
|
||||
.filter(abstractTextContainer -> abstractTextContainer.intersectsY(atc))
|
||||
.map(abstractTextContainer -> (ClassificationTextBlock) abstractTextContainer)
|
||||
.toList();
|
||||
}
|
||||
|
||||
|
||||
private void addSectionNodeToPageNode(Context context, SectionNode sectionNode, Integer pageNumber) {
|
||||
|
||||
PageNode page = getPage(pageNumber, context);
|
||||
page.getMainBody().add(sectionNode);
|
||||
}
|
||||
|
||||
|
||||
/**
 * Adds a table node (and, recursively, its cells) below the given parent node and gives it
 * a table-of-contents child entry.
 *
 * @param parentNode semantic node owning the table
 * @param table      classified table to convert
 * @param context    shared build state
 */
private void addTable(SemanticNode parentNode, Table table, Context context) {

    PageNode page = getPage(table.getPage(), context);
    TableNode tableNode = TableNode.builder().tableOfContents(context.tableOfContents()).numberOfCols(table.getColCount()).numberOfRows(table.getRowCount()).build();

    // NOTE(review): when the parent is not part of this page's main body, the page is
    // attached to the parent instead — presumably for sections spanning onto a new page
    // via their tables; confirm against SemanticNode.getPages() usage.
    if (!page.getMainBody().contains(parentNode)) {
        parentNode.getPages().add(page);
    }

    page.getMainBody().add(tableNode);

    List<Integer> tocId = context.tableOfContents().createNewChildEntryAndReturnId(parentNode.getTocId(), NodeType.TABLE, tableNode);
    tableNode.setTocId(tocId);

    addTableCells(table.getRows(), tableNode, context, table.getPage());
}
|
||||
|
||||
|
||||
private void addTableCells(List<List<TableCell>> rows, SemanticNode parentNode, Context context, int pageNumber) {
|
||||
|
||||
for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) {
|
||||
for (int colIndex = 0; colIndex < rows.get(rowIndex).size(); colIndex++) {
|
||||
addTableCell(rows.get(rowIndex).get(colIndex), rowIndex, colIndex, parentNode, pageNumber, context);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
 * Adds one table cell node and decides how its textual content is represented:
 * empty placeholder, single terminal block, nested section (first block is a headline),
 * one merged terminal block (small cells), or one paragraph/headline per text block.
 *
 * @param cell       classified cell to convert
 * @param rowIndex   zero-based row of the cell inside its table
 * @param colIndex   zero-based column of the cell inside its table
 * @param parentNode the owning table node
 * @param pageNumber page the table lives on
 * @param context    shared build state
 */
private void addTableCell(TableCell cell, int rowIndex, int colIndex, SemanticNode parentNode, int pageNumber, Context context) {

    PageNode page = getPage(pageNumber, context);
    // Text blocks created during cell detection may still carry page 0; default them to
    // the table's page.
    cell.getTextBlocks().stream().filter(tb -> tb.getPage() == 0).forEach(tb -> tb.setPage(pageNumber));

    TableCellNode tableCellNode = TableCellNode.builder()
            .tableOfContents(context.tableOfContents())
            .row(rowIndex)
            .col(colIndex)
            .header(cell.isHeaderCell())
            .bBox(cell.getBounds2D())
            .build();
    page.getMainBody().add(tableCellNode);

    TextBlock textBlock;

    List<Integer> tocId = context.tableOfContents().createNewChildEntryAndReturnId(parentNode.getTocId(), NodeType.TABLE_CELL, tableCellNode);
    tableCellNode.setTocId(tocId);

    if (cell.getTextBlocks().isEmpty()) {
        // Empty cell: terminal node with an empty placeholder text block.
        tableCellNode.setTerminalTextBlock(context.textBlockFactory.emptyTextBlock(parentNode, context, page));
        tableCellNode.setTerminal(true);

    } else if (cell.getTextBlocks().size() == 1) {
        // Single text block: use it directly as the terminal content.
        textBlock = context.textBlockFactory().buildAtomicTextBlock(cell.getTextBlocks().get(0).getSequences(), tableCellNode, context, page);
        tableCellNode.setTerminalTextBlock(textBlock);
        tableCellNode.setTerminal(true);

    } else if (firstTextBlockIsHeadline(cell)) {
        // Headline first: treat the cell's content as a nested section of its own.
        addSection(tableCellNode, cell.getTextBlocks().stream().map(tb -> (AbstractTextContainer) tb).toList(), Collections.emptyList(), context);
        tableCellNode.setTerminal(false);

    } else if (cellAreaIsSmallerThanPageAreaTimesThreshold(cell, page)) {
        // Small multi-block cell: merge everything into one terminal block, y-then-x order.
        List<TextPositionSequence> sequences = TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(cell.getTextBlocks());
        textBlock = context.textBlockFactory().buildAtomicTextBlock(sequences, tableCellNode, context, page);
        tableCellNode.setTerminalTextBlock(textBlock);
        tableCellNode.setTerminal(true);

    } else {
        // Large multi-block cell: one paragraph/headline node per contained text block.
        cell.getTextBlocks().forEach(tb -> addParagraphOrHeadline(tableCellNode, tb, context));
        tableCellNode.setTerminal(false);
    }

}
|
||||
|
||||
|
||||
private static boolean cellAreaIsSmallerThanPageAreaTimesThreshold(TableCell cell, PageNode page) {
|
||||
|
||||
return cell.getArea() < TABLE_CELL_MERGE_CONTENTS_SIZE_THRESHOLD * page.getHeight() * page.getWidth();
|
||||
}
|
||||
|
||||
|
||||
private static boolean firstTextBlockIsHeadline(TableCell cell) {
|
||||
|
||||
String classification = cell.getTextBlocks().get(0).getClassification();
|
||||
return classification != null && classification.startsWith("H");
|
||||
}
|
||||
|
||||
|
||||
private void addParagraphOrHeadline(SemanticNode parentNode, ClassificationTextBlock originalTextBlock, Context context) {
|
||||
|
||||
addParagraphOrHeadline(parentNode, originalTextBlock, context, Collections.emptyList());
|
||||
}
|
||||
|
||||
|
||||
private void addParagraphOrHeadline(SemanticNode parentNode, ClassificationTextBlock originalTextBlock, Context context, List<ClassificationTextBlock> textBlocksToMerge) {
|
||||
|
||||
PageNode page = getPage(originalTextBlock.getPage(), context);
|
||||
|
||||
SemanticNode node;
|
||||
if (originalTextBlock.getClassification() != null && originalTextBlock.getClassification().startsWith("H")) {
|
||||
node = HeadlineNode.builder().tableOfContents(context.tableOfContents()).build();
|
||||
} else {
|
||||
node = ParagraphNode.builder().tableOfContents(context.tableOfContents()).build();
|
||||
}
|
||||
|
||||
page.getMainBody().add(node);
|
||||
|
||||
List<ClassificationTextBlock> textBlocks = new LinkedList<>(textBlocksToMerge);
|
||||
textBlocks.add(originalTextBlock);
|
||||
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(textBlocks), node, context, page);
|
||||
|
||||
if (node instanceof HeadlineNode headlineNode) {
|
||||
List<Integer> tocId = context.tableOfContents.createNewChildEntryAndReturnId(parentNode.getTocId(), NodeType.HEADLINE, node);
|
||||
headlineNode.setTerminalTextBlock(textBlock);
|
||||
headlineNode.setTocId(tocId);
|
||||
}
|
||||
if (node instanceof ParagraphNode paragraphNode) {
|
||||
List<Integer> tocId = context.tableOfContents.createNewChildEntryAndReturnId(parentNode.getTocId(), NodeType.PARAGRAPH, node);
|
||||
paragraphNode.setTerminalTextBlock(textBlock);
|
||||
paragraphNode.setTocId(tocId);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void addImage(SectionNode sectionNode, ClassifiedImage image, Context context) {
|
||||
|
||||
PageNode page = getPage(image.getPage(), context);
|
||||
ImageNode imageNode = ImageNode.builder()
|
||||
.imageType(image.getImageType())
|
||||
.position(image.getPosition())
|
||||
.transparency(image.isHasTransparency())
|
||||
.page(page)
|
||||
.tableOfContents(context.tableOfContents())
|
||||
.build();
|
||||
page.getMainBody().add(imageNode);
|
||||
|
||||
List<Integer> tocId = context.tableOfContents().createNewChildEntryAndReturnId(sectionNode.getTocId(), NodeType.IMAGE, imageNode);
|
||||
imageNode.setTocId(tocId);
|
||||
}
|
||||
|
||||
|
||||
private void addHeaderAndFooterToEachPage(ClassificationDocument document, Context context) {
|
||||
|
||||
Map<Integer, List<ClassificationTextBlock>> headers = document.getHeaders()
|
||||
.stream()
|
||||
.map(ClassificationHeader::getTextBlocks)
|
||||
.flatMap(List::stream)
|
||||
.collect(groupingBy(AbstractTextContainer::getPage, toList()));
|
||||
|
||||
Map<Integer, List<ClassificationTextBlock>> footers = document.getFooters()
|
||||
.stream()
|
||||
.map(ClassificationFooter::getTextBlocks)
|
||||
.flatMap(List::stream)
|
||||
.collect(groupingBy(AbstractTextContainer::getPage, toList()));
|
||||
|
||||
for (int pageIndex = 1; pageIndex <= document.getPages().size(); pageIndex++) {
|
||||
if (headers.containsKey(pageIndex)) {
|
||||
addHeader(headers.get(pageIndex), context);
|
||||
} else {
|
||||
addEmptyHeader(pageIndex, context);
|
||||
}
|
||||
}
|
||||
|
||||
for (int pageIndex = 1; pageIndex <= document.getPages().size(); pageIndex++) {
|
||||
if (footers.containsKey(pageIndex)) {
|
||||
addFooter(footers.get(pageIndex), context);
|
||||
} else {
|
||||
addEmptyFooter(pageIndex, context);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void addFooter(List<ClassificationTextBlock> textBlocks, Context context) {
|
||||
|
||||
PageNode page = getPage(textBlocks.get(0).getPage(), context);
|
||||
FooterNode footer = FooterNode.builder().tableOfContents(context.tableOfContents()).build();
|
||||
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(textBlocks),
|
||||
footer,
|
||||
context,
|
||||
page);
|
||||
List<Integer> tocId = context.tableOfContents().createNewEntryAndReturnId(NodeType.FOOTER, footer);
|
||||
footer.setTocId(tocId);
|
||||
footer.setTerminalTextBlock(textBlock);
|
||||
page.setFooter(footer);
|
||||
}
|
||||
|
||||
|
||||
public void addHeader(List<ClassificationTextBlock> textBlocks, Context context) {
|
||||
|
||||
PageNode page = getPage(textBlocks.get(0).getPage(), context);
|
||||
HeaderNode header = HeaderNode.builder().tableOfContents(context.tableOfContents()).build();
|
||||
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(textBlocks),
|
||||
header,
|
||||
context,
|
||||
0,
|
||||
page);
|
||||
List<Integer> tocId = context.tableOfContents().createNewEntryAndReturnId(NodeType.HEADER, header);
|
||||
header.setTocId(tocId);
|
||||
header.setTerminalTextBlock(textBlock);
|
||||
page.setHeader(header);
|
||||
}
|
||||
|
||||
|
||||
private void addEmptyFooter(int pageIndex, Context context) {
|
||||
|
||||
PageNode page = getPage(pageIndex, context);
|
||||
FooterNode footer = FooterNode.builder().tableOfContents(context.tableOfContents()).build();
|
||||
AtomicTextBlock textBlock = context.textBlockFactory.emptyTextBlock(footer, context, page);
|
||||
List<Integer> tocId = context.tableOfContents().createNewEntryAndReturnId(NodeType.FOOTER, footer);
|
||||
footer.setTocId(tocId);
|
||||
footer.setTerminalTextBlock(textBlock);
|
||||
page.setFooter(footer);
|
||||
}
|
||||
|
||||
|
||||
/**
 * Attaches a placeholder HeaderNode with an empty terminal text block to the page with the
 * given number and registers it in the table of contents.
 */
private void addEmptyHeader(int pageIndex, Context context) {

    PageNode page = getPage(pageIndex, context);
    HeaderNode header = HeaderNode.builder().tableOfContents(context.tableOfContents()).build();
    // NOTE(review): addEmptyFooter calls emptyTextBlock(footer, context, page) whereas this
    // call passes a literal 0 where the footer variant passes the context — confirm this
    // overload choice is intentional and not a copy/paste slip.
    AtomicTextBlock textBlock = context.textBlockFactory.emptyTextBlock(header, 0, page);
    List<Integer> tocId = context.tableOfContents().createNewEntryAndReturnId(NodeType.HEADER, header);
    header.setTocId(tocId);
    header.setTerminalTextBlock(textBlock);
    page.setHeader(header);
}
|
||||
|
||||
|
||||
private PageNode buildPage(ClassificationPage p) {
|
||||
|
||||
return PageNode.builder()
|
||||
.height((int) p.getPageHeight())
|
||||
.width((int) p.getPageWidth())
|
||||
.number(p.getPageNumber())
|
||||
.rotation(p.getRotation())
|
||||
.mainBody(new LinkedList<>())
|
||||
.build();
|
||||
}
|
||||
|
||||
|
||||
private PageNode getPage(int pageIndex, Context context) {
|
||||
|
||||
return context.pages.keySet()
|
||||
.stream()
|
||||
.filter(page -> page.getNumber() == pageIndex)
|
||||
.findFirst()
|
||||
.orElseThrow(() -> new NoSuchElementException(format("ClassificationPage with number %d not found", pageIndex)));
|
||||
}
|
||||
|
||||
|
||||
/**
 * Shared state threaded through all tree-building steps.
 *
 * @param tableOfContents  table of contents every created node is registered in
 * @param pages            maps each built PageNode to a counter — purpose of the counter is
 *                         not visible in this part of the file; TODO confirm
 * @param sections         section nodes collected so far
 * @param images           classified images of the document
 * @param textBlockFactory factory used to build terminal text blocks
 */
record Context(
        TableOfContents tableOfContents, Map<PageNode, AtomicInteger> pages, List<SectionNode> sections, List<ClassifiedImage> images, TextBlockFactory textBlockFactory) {

}
|
||||
|
||||
}
|
||||
@ -0,0 +1,132 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.factory;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.AbstractTextContainer;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationSection;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.Table;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.TableCell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock;
|
||||
|
||||
@Service
|
||||
public class ImageSortService {
|
||||
|
||||
public SortedImages sortImagesIntoStructure(ClassificationDocument document) {
|
||||
|
||||
SortedImages sortedImages = new SortedImages(new HashMap<>(), new HashMap<>(), new HashMap<>(), new HashMap<>(), new HashMap<>());
|
||||
|
||||
Map<Integer, List<ClassifiedImage>> imagesByPage = document.getSections()
|
||||
.stream()
|
||||
.flatMap(section -> section.getImages().stream())
|
||||
.distinct()
|
||||
.collect(Collectors.groupingBy(ClassifiedImage::getPage));
|
||||
|
||||
for (int pageNumber : imagesByPage.keySet()) {
|
||||
List<AbstractTextContainer> textContainersOnPage = document.getSections()
|
||||
.stream()
|
||||
.flatMap(section -> section.getPageBlocks().stream())
|
||||
.filter(abstractTextContainer -> abstractTextContainer.getPage() == pageNumber)
|
||||
.toList();
|
||||
|
||||
List<ClassificationSection> sectionsOnPage = document.getSections()
|
||||
.stream()
|
||||
.filter(section -> section.getPageBlocks().stream().anyMatch(block -> block.getPage() == pageNumber))
|
||||
.toList();
|
||||
|
||||
for (ClassifiedImage image : imagesByPage.get(pageNumber)) {
|
||||
sortImage(textContainersOnPage, sectionsOnPage, image, sortedImages);
|
||||
}
|
||||
}
|
||||
return sortedImages;
|
||||
}
|
||||
|
||||
|
||||
private void sortImage(List<AbstractTextContainer> textContainersOnPage, List<ClassificationSection> sectionsOnPage, ClassifiedImage image, SortedImages sortedImages) {
|
||||
|
||||
Optional<AbstractTextContainer> containingTextContainer = getContainingTextContainer(image, textContainersOnPage);
|
||||
Optional<ClassificationSection> sectionContainingTextContainer = getContainingSection(image, sectionsOnPage);
|
||||
List<AbstractTextContainer> containedTextContainers = getContainedTextContainers(image, textContainersOnPage);
|
||||
List<ClassificationSection> containedSections = getContainedSections(image, sectionsOnPage);
|
||||
if (containingTextContainer.isPresent()) {
|
||||
if (sortImageIntoTextContainerOrCell(image, sortedImages, containingTextContainer.get())) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private static boolean sortImageIntoTextContainerOrCell(ClassifiedImage image, SortedImages sortedImages, AbstractTextContainer containingTextContainer) {
|
||||
|
||||
if (containingTextContainer instanceof ClassificationTextBlock) {
|
||||
sortedImages.containedInTextContainer().computeIfAbsent(containingTextContainer, sortedImage -> new ArrayList<>()).add(image);
|
||||
return true;
|
||||
}
|
||||
if (containingTextContainer instanceof Table) {
|
||||
Optional<TableCell> containingCell = getContainingCell((Table) containingTextContainer, image);
|
||||
if (containingCell.isPresent()) {
|
||||
sortedImages.containedInCell().computeIfAbsent(containingCell.get(), sortedImage -> new ArrayList<>()).add(image);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
private static Optional<TableCell> getContainingCell(Table table, ClassifiedImage image) {
|
||||
|
||||
return table.getRows().stream().flatMap(List::stream).filter(cell -> cell.contains(image.getPosition())).findFirst();
|
||||
}
|
||||
|
||||
|
||||
private List<ClassificationSection> getContainedSections(ClassifiedImage image, List<ClassificationSection> sectionsOnPage) {
|
||||
|
||||
return sectionsOnPage.stream()
|
||||
.filter(section -> image.getPosition().contains(RectangleTransformations.bBoxUnionAbstractTextContainer(section.getPageBlocks()
|
||||
.stream()
|
||||
.filter(block -> block.getPage() == image.getPage())
|
||||
.toList())))
|
||||
.toList();
|
||||
}
|
||||
|
||||
|
||||
private List<AbstractTextContainer> getContainedTextContainers(ClassifiedImage image, List<AbstractTextContainer> textContainersOnPage) {
|
||||
|
||||
return textContainersOnPage.stream().filter(textContainer -> image.getPosition().contains(RectangleTransformations.toRectangle2D(textContainer))).toList();
|
||||
}
|
||||
|
||||
|
||||
private Optional<ClassificationSection> getContainingSection(ClassifiedImage image, List<ClassificationSection> sectionsOnPage) {
|
||||
|
||||
return sectionsOnPage.stream()//
|
||||
.filter(section -> //
|
||||
RectangleTransformations.bBoxUnionAbstractTextContainer(section.getPageBlocks().stream().filter(block -> block.getPage() == image.getPage()).toList())//
|
||||
.contains(image.getPosition())).findFirst();
|
||||
}
|
||||
|
||||
|
||||
private Optional<AbstractTextContainer> getContainingTextContainer(ClassifiedImage image, List<AbstractTextContainer> textContainersOnPage) {
|
||||
|
||||
return textContainersOnPage.stream().filter(textContainer -> RectangleTransformations.toRectangle2D(textContainer).contains(image.getPosition())).findFirst();
|
||||
}
|
||||
|
||||
|
||||
public record SortedImages(
|
||||
Map<TableCell, List<ClassifiedImage>> containedInCell,
|
||||
Map<AbstractTextContainer, List<ClassifiedImage>> containedInTextContainer,
|
||||
Map<ClassificationSection, List<ClassifiedImage>> containedInSection,
|
||||
Map<ClassifiedImage, List<AbstractTextContainer>> containedByImage,
|
||||
Map<ClassifiedImage, List<ClassificationSection>> sectionContainedByImage) {
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,105 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.factory;
|
||||
|
||||
import static java.lang.String.format;
|
||||
|
||||
import java.awt.geom.Area;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.function.BiConsumer;
|
||||
import java.util.function.BinaryOperator;
|
||||
import java.util.function.Function;
|
||||
import java.util.function.Supplier;
|
||||
import java.util.stream.Collector;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.AbstractTextContainer;
|
||||
|
||||
public class RectangleTransformations {
|
||||
|
||||
public static Rectangle2D pad(Rectangle2D rectangle2D, int deltaX, int deltaY) {
|
||||
|
||||
return new Rectangle2D.Double(rectangle2D.getMinX() - deltaX, rectangle2D.getMinY() - deltaY, rectangle2D.getWidth() + 2 * deltaX, rectangle2D.getHeight() + 2 * deltaY);
|
||||
}
|
||||
|
||||
|
||||
public static Rectangle2D bBoxUnionAbstractTextContainer(List<AbstractTextContainer> abstractTextContainers) {
|
||||
|
||||
return abstractTextContainers.stream().map(RectangleTransformations::toRectangle2D).collect(new Rectangle2DUnion());
|
||||
}
|
||||
|
||||
public static Rectangle2D rectangleUnion(List<Rectangle2D> rectangle2DList) {
|
||||
|
||||
return rectangle2DList.stream().collect(new Rectangle2DUnion());
|
||||
}
|
||||
|
||||
|
||||
public static Rectangle2D toRectangle2D(AbstractTextContainer abstractTextContainer) {
|
||||
|
||||
return new Rectangle2D.Float(abstractTextContainer.getMinX(), abstractTextContainer.getMinY(), abstractTextContainer.getWidth(), abstractTextContainer.getHeight());
|
||||
}
|
||||
|
||||
|
||||
public static Rectangle2D toRectangle2D(PDRectangle rectangle) {
|
||||
|
||||
return new Rectangle2D.Double(rectangle.getLowerLeftX(), rectangle.getLowerLeftY(), rectangle.getWidth(), rectangle.getHeight());
|
||||
}
|
||||
|
||||
|
||||
public static String toString(Rectangle2D rectangle2D) {
|
||||
|
||||
return format("%f,%f,%f,%f", rectangle2D.getX(), rectangle2D.getY(), rectangle2D.getWidth(), rectangle2D.getHeight());
|
||||
}
|
||||
|
||||
|
||||
public static Rectangle2D parseRectangle2D(String bBox) {
|
||||
|
||||
List<Float> floats = Arrays.stream(bBox.split(",")).map(Float::parseFloat).toList();
|
||||
return new Rectangle2D.Float(floats.get(0), floats.get(1), floats.get(2), floats.get(3));
|
||||
}
|
||||
|
||||
|
||||
private static class Rectangle2DUnion implements Collector<Rectangle2D, Area, Rectangle2D> {
|
||||
|
||||
@Override
|
||||
public Supplier<Area> supplier() {
|
||||
|
||||
return Area::new;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public BiConsumer<Area, Rectangle2D> accumulator() {
|
||||
|
||||
return (area, rectangle2D) -> area.add(new Area(rectangle2D));
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public BinaryOperator<Area> combiner() {
|
||||
|
||||
return (area1, area2) -> {
|
||||
area1.add(area2);
|
||||
return area1;
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Function<Area, Rectangle2D> finisher() {
|
||||
|
||||
return Area::getBounds2D;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Set<Characteristics> characteristics() {
|
||||
|
||||
return Set.of(Characteristics.CONCURRENT, Characteristics.UNORDERED);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,156 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.factory;
|
||||
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Collections;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.RedTextPosition;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.TextDirection;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.TextPositionSequence;
|
||||
|
||||
public class SearchTextWithTextPositionFactory {
|
||||
|
||||
public static final int HEIGHT_PADDING = 2;
|
||||
|
||||
|
||||
public static SearchTextWithTextPositionModel buildSearchTextToTextPositionModel(List<TextPositionSequence> sequences) {
|
||||
|
||||
if (sequences.isEmpty() || sequences.stream().allMatch(sequence -> sequence.getTextPositions().isEmpty())) {
|
||||
return SearchTextWithTextPositionModel.builder()
|
||||
.searchText("")
|
||||
.lineBreaks(Collections.emptyList())
|
||||
.positions(Collections.emptyList())
|
||||
.stringCoordsToPositionCoords(Collections.emptyList())
|
||||
.build();
|
||||
}
|
||||
|
||||
List<Integer> stringIdxToPositionIdx = new LinkedList<>();
|
||||
List<Integer> lineBreaksStringIdx = new LinkedList<>();
|
||||
StringBuilder sb = new StringBuilder();
|
||||
|
||||
int stringIdx = 0;
|
||||
int positionIdx = 0;
|
||||
int lastHyphenIdx = -3;
|
||||
|
||||
RedTextPosition currentTextPosition = sequences.get(0).getTextPositions().get(0);
|
||||
RedTextPosition previousTextPosition = RedTextPosition.builder().unicode(" ").position(currentTextPosition.getPosition()).build();
|
||||
|
||||
for (TextPositionSequence word : sequences) {
|
||||
for (int i = 0; i < word.getTextPositions().size(); ++i) {
|
||||
|
||||
currentTextPosition = word.getTextPositions().get(i);
|
||||
|
||||
if (isLineBreak(currentTextPosition, previousTextPosition)) {
|
||||
|
||||
if (stringIdx - lastHyphenIdx < 3) {
|
||||
sb.delete(lastHyphenIdx, sb.length());
|
||||
stringIdxToPositionIdx = stringIdxToPositionIdx.subList(0, lastHyphenIdx);
|
||||
stringIdx = lastHyphenIdx;
|
||||
lastHyphenIdx = -3;
|
||||
}
|
||||
lineBreaksStringIdx.add(stringIdx);
|
||||
}
|
||||
if (!isRepeatedWhitespace(currentTextPosition.getUnicode(), previousTextPosition.getUnicode())) {
|
||||
|
||||
if (isHyphen(currentTextPosition.getUnicode())) {
|
||||
lastHyphenIdx = stringIdx;
|
||||
}
|
||||
sb.append(currentTextPosition.getUnicode());
|
||||
stringIdxToPositionIdx.add(positionIdx);
|
||||
++stringIdx;
|
||||
}
|
||||
|
||||
previousTextPosition = currentTextPosition;
|
||||
|
||||
++positionIdx;
|
||||
}
|
||||
|
||||
previousTextPosition = RedTextPosition.builder().unicode(" ").position(previousTextPosition.getPosition()).build();
|
||||
sb.append(previousTextPosition.getUnicode());
|
||||
stringIdxToPositionIdx.add(positionIdx);
|
||||
++stringIdx;
|
||||
}
|
||||
|
||||
assert sb.length() == stringIdxToPositionIdx.size();
|
||||
|
||||
List<Rectangle2D> positions = sequences.stream()
|
||||
.flatMap(sequence -> sequence.getTextPositions().stream().map(textPosition -> mapRedTextPositionToInitialUserSpace(textPosition, sequence)))
|
||||
.toList();
|
||||
|
||||
return SearchTextWithTextPositionModel.builder()
|
||||
.searchText(sb.toString())
|
||||
.lineBreaks(lineBreaksStringIdx)
|
||||
.stringCoordsToPositionCoords(stringIdxToPositionIdx)
|
||||
.positions(positions)
|
||||
.build();
|
||||
}
|
||||
|
||||
|
||||
private static boolean isLineBreak(RedTextPosition currentTextPosition, RedTextPosition previousTextPosition) {
|
||||
|
||||
return Objects.equals(currentTextPosition.getUnicode(), "\n") || isDeltaYLargerThanTextHeight(currentTextPosition, previousTextPosition);
|
||||
}
|
||||
|
||||
|
||||
private static boolean isDeltaYLargerThanTextHeight(RedTextPosition currentPosition, RedTextPosition previousPosition) {
|
||||
|
||||
if (previousPosition == null) {
|
||||
return false;
|
||||
}
|
||||
|
||||
float deltaY = Math.abs(currentPosition.getYDirAdj() - previousPosition.getYDirAdj());
|
||||
return deltaY >= currentPosition.getHeightDir();
|
||||
}
|
||||
|
||||
|
||||
private static boolean isRepeatedWhitespace(String currentUnicode, String previousUnicode) {
|
||||
|
||||
return Objects.equals(previousUnicode, " ") && Objects.equals(currentUnicode, " ");
|
||||
}
|
||||
|
||||
|
||||
private static boolean isHyphen(String unicodeCharacter) {
|
||||
|
||||
return Objects.equals(unicodeCharacter, "-") || //
|
||||
Objects.equals(unicodeCharacter, "~") || //
|
||||
Objects.equals(unicodeCharacter, "‐") || //
|
||||
Objects.equals(unicodeCharacter, "‒") || //
|
||||
Objects.equals(unicodeCharacter, "⁻") || //
|
||||
Objects.equals(unicodeCharacter, "−") || //
|
||||
Objects.equals(unicodeCharacter, "﹣") || //
|
||||
Objects.equals(unicodeCharacter, "゠") || //
|
||||
Objects.equals(unicodeCharacter, "⁓") || //
|
||||
Objects.equals(unicodeCharacter, "‑") || //
|
||||
Objects.equals(unicodeCharacter, "\u00AD");
|
||||
}
|
||||
|
||||
|
||||
private static Rectangle2D mapRedTextPositionToInitialUserSpace(RedTextPosition textPosition, TextPositionSequence sequence) {
|
||||
|
||||
float textHeight = sequence.getTextHeight() + HEIGHT_PADDING;
|
||||
Rectangle2D rectangle2D = new Rectangle2D.Double(textPosition.getXDirAdj(),
|
||||
textPosition.getYDirAdj() - textHeight,
|
||||
textPosition.getWidthDirAdj(),
|
||||
textHeight + HEIGHT_PADDING);
|
||||
|
||||
AffineTransform transform = new AffineTransform();
|
||||
|
||||
if (sequence.getDir() == TextDirection.ZERO || sequence.getDir() == TextDirection.HALF_CIRCLE) {
|
||||
transform.rotate(sequence.getDir().getRadians(), sequence.getPageWidth() / 2f, sequence.getPageHeight() / 2f);
|
||||
transform.translate(0f, sequence.getPageHeight());
|
||||
} else if (sequence.getDir() == TextDirection.QUARTER_CIRCLE) {
|
||||
transform.rotate(sequence.getDir().getRadians(), sequence.getPageWidth() / 2f, sequence.getPageWidth() / 2f);
|
||||
transform.translate(0f, sequence.getPageWidth());
|
||||
} else {
|
||||
transform.rotate(sequence.getDir().getRadians(), sequence.getPageHeight() / 2f, sequence.getPageHeight() / 2f);
|
||||
transform.translate(0f, sequence.getPageWidth());
|
||||
}
|
||||
transform.scale(1., -1.);
|
||||
|
||||
return transform.createTransformedShape(rectangle2D).getBounds2D();
|
||||
}
|
||||
|
||||
}
|
||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user