Compare commits

..

No commits in common. "main" and "taas-41-1" have entirely different histories.

398 changed files with 9135 additions and 55519 deletions

1
.gitattributes vendored
View File

@ -1 +0,0 @@
*.pdf filter=lfs diff=lfs merge=lfs -text

7
.gitignore vendored
View File

@ -18,7 +18,6 @@ target/
.settings
.springBeans
.sts4-cache
.gradle
### IntelliJ IDEA ###
.idea
@ -38,9 +37,3 @@ build/
### VS Code ###
.vscode/
gradlew.bat
gradlew
gradle.properties
gradle/
.DS_Store
.DS_Store/

View File

@ -1,26 +1,4 @@
variables:
# SONAR_PROJECT_KEY: 'fforesight_layout-parser_AYd5quv2mRkBOCG22hvF'
GIT_SUBMODULE_STRATEGY: recursive
GIT_SUBMODULE_FORCE_HTTPS: 'true'
include:
- project: 'gitlab/gitlab'
ref: 'main'
file: 'ci-templates/gradle_java.yml'
deploy:
stage: deploy
tags:
- dind
script:
- echo "Building with gradle version ${BUILDVERSION}"
- gradle -Pversion=${BUILDVERSION} publish
- gradle bootBuildImage --publishImage -PbuildbootDockerHostNetwork=true -Pversion=${BUILDVERSION}
- echo "BUILDVERSION=$BUILDVERSION" >> version.env
artifacts:
reports:
dotenv: version.env
rules:
- if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH
- if: $CI_COMMIT_BRANCH =~ /^feature/ && $CI_COMMIT_TAG == ""
- if: $CI_COMMIT_BRANCH =~ /^release/
- if: $CI_COMMIT_TAG
file: 'ci-templates/maven_java.yml'

8
.gitmodules vendored
View File

@ -1,8 +0,0 @@
[submodule "layoutparser-service/layoutparser-service-server/src/test/resources/files/basf"]
path = layoutparser-service/layoutparser-service-server/src/test/resources/files/basf
url = ssh://git@git.knecon.com:22222/fforesight/documents/basf.git
update = merge
[submodule "layoutparser-service/layoutparser-service-server/src/test/resources/files/syngenta"]
path = layoutparser-service/layoutparser-service-server/src/test/resources/files/syngenta
url = ssh://git@git.knecon.com:22222/fforesight/documents/syngenta.git
update = merge

BIN
.mvn/wrapper/maven-wrapper.jar vendored Normal file

Binary file not shown.

18
.mvn/wrapper/maven-wrapper.properties vendored Normal file
View File

@ -0,0 +1,18 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
distributionUrl=https://repo.maven.apache.org/maven2/org/apache/maven/apache-maven/3.8.7/apache-maven-3.8.7-bin.zip
wrapperUrl=https://repo.maven.apache.org/maven2/org/apache/maven/wrapper/maven-wrapper/3.1.1/maven-wrapper-3.1.1.jar

View File

@ -1,89 +1 @@
# PDF Layout Parser Micro-Service: layout-parser
## Introduction
The layout-parser micro-service is a powerful tool designed to efficiently extract structured information from PDF documents. Written in Java and utilizing Spring Boot 3, Apache PDFBox, and RabbitMQ, this micro-service excels at parsing PDFs and organizing their content into a meaningful and coherent layout structure. Notably, the layout-parser micro-service distinguishes itself by relying solely on advanced algorithms, rather than machine learning techniques.
### Key Steps in the PDF Layout Parsing Process:
* **Text Position Extraction:**
The micro-service leverages Apache PDFBox to extract precise text positions for each individual character within the PDF document.
* **Word Segmentation and Text Block Formation:**
Employing an array of diverse algorithms, the micro-service initially identifies and segments words, creating distinct text blocks.
* **Text Block Classification:**
The segmented text blocks are then subjected to classification algorithms. These algorithms categorize the text blocks based on their content and visual properties, distinguishing between sections, subsections, headlines, paragraphs, images, tables, table cells, headers, and footers.
* **Layout Coherence Establishment:**
The classified text blocks are subsequently orchestrated into a cohesive layout structure. This process involves arranging sections, subsections, paragraphs, images, and other elements in a logical and structured manner.
* **Output Generation in Various Formats:**
Once the layout structure is established, the micro-service generates output in multiple formats. These formats are designed for seamless integration with downstream micro-services. The supported formats include JSON, XML, and others, ensuring flexibility in downstream data consumption.
### Optional Enhancements:
* **ML-Based Table Extraction:**
For enhanced results, users have the option to incorporate machine learning-based table extraction. This feature can be activated by providing ML-generated results as a JSON file, which are then integrated seamlessly into the layout structure.
* **Image Classification using ML:**
Additionally, for more accurate image classification, users can optionally feed ML-generated image classification results into the micro-service. Similar to the table extraction option, the micro-service processes the pre-parsed results in JSON format, thus optimizing the accuracy of image content identification.
In conclusion, the layout-parser micro-service is a versatile PDF layout parsing solution crafted entirely around advanced algorithms, without reliance on machine learning. It proficiently extracts text positions, segments content into meaningful blocks, classifies these blocks, arranges them coherently, and outputs structured data for downstream micro-services. Optional integration with ML-generated table extractions and image classifications further enhances its capabilities.
## Installation
### Prerequisites
Before building and using the layout-parser micro-service, please ensure you have the following software and tools installed:
* Java Development Kit (JDK) 17 or later
* Gradle build tool (preinstalled)
### Build and Test
To build and test the micro-service, follow these steps:
### Clone the Repository:
```bash
git clone ssh://git@git.knecon.com:22222/fforesight/layout-parser.git
cd layout-parser
```
### Build the Project:
Use the following command to build the project using Gradle:
```
gradle clean build
```
### Run Tests:
Run the test suite using the following command:
```
gradle test
```
## Building a Custom Docker Image
To create a custom Docker image for the layout-parser micro-service, execute the provided script:
### Ensure Docker is Installed:
Ensure that Docker is installed and running on your system.
### Run the Image Building Script:
Execute the publish-custom-image script in the project directory:
```
./publish-custom-image
```
## Publishing to Internal Maven Repository
To publish the layout-parser micro-service to your internal Maven repository, execute the following command:
```
gradle -Pversion=buildVersion publish
```
Replace buildVersion with the desired version number.
## Additional Notes
Make sure to configure any necessary application properties before deploying the micro-service.
For advanced usage and configurations, refer to Kilian or Dom or preferably the source code.

View File

@ -1,7 +0,0 @@
// Build script of the build-logic project (presumably buildSrc — confirm against the repository layout).
// Applying `kotlin-dsl` allows build logic to be written as Kotlin (*.gradle.kts) scripts.
plugins {
`kotlin-dsl`
}
// Plugins required by the build logic are resolved from the Gradle Plugin Portal.
repositories {
gradlePluginPortal()
}

View File

@ -1,93 +0,0 @@
// Shared Java build conventions: applies the Java library, publishing and code-quality plugins
// and configures them identically for every module that applies this script.
plugins {
`java-library`
`maven-publish`
pmd
checkstyle
jacoco
}
group = "com.knecon.fforesight"
// Exposes "documentVersion" as an extra property on the root project so that module build
// scripts can look it up via rootProject.extra.
val documentVersion by rootProject.extra { "4.433.0" }
// Source and bytecode level are both pinned to Java 17.
java.sourceCompatibility = JavaVersion.VERSION_17
java.targetCompatibility = JavaVersion.VERSION_17
// Main and test sources are checked with separate PMD rule sets.
tasks.pmdMain {
pmd.ruleSetFiles = files("${rootDir}/config/pmd/pmd.xml")
}
tasks.pmdTest {
pmd.ruleSetFiles = files("${rootDir}/config/pmd/test_pmd.xml")
}
// Tests run on the JUnit Platform; the JUnit XML reports are written to build/reports/junit.
tasks.named<Test>("test") {
useJUnitPlatform()
reports {
junitXml.outputLocation.set(layout.buildDirectory.dir("reports/junit"))
}
// Heap bounds of the forked test JVM.
minHeapSize = "512m"
maxHeapSize = "2048m"
}
tasks.test {
finalizedBy(tasks.jacocoTestReport) // report is always generated after tests run
}
// Coverage report: XML enabled, CSV disabled, HTML written to build/jacocoHtml.
tasks.jacocoTestReport {
dependsOn(tasks.test) // tests are required to run before generating the report
reports {
xml.required.set(true)
csv.required.set(false)
html.outputLocation.set(layout.buildDirectory.dir("jacocoHtml"))
}
}
// NOTE(review): allprojects is evaluated from wherever this script is applied; confirm that
// configuring all projects from a per-module convention script is intended.
allprojects {
// Javadoc: doclint is switched off and the number of reported warnings is capped at 1.
tasks.withType<Javadoc> {
options {
this as StandardJavadocDocletOptions
addBooleanOption("Xdoclint:none", true)
addStringOption("Xmaxwarns", "1")
}
}
pmd {
setConsoleOutput(true)
}
// Publishes the "java" component as a Maven publication named after the project.
publishing {
publications {
create<MavenPublication>(name) {
from(components["java"])
}
}
repositories {
maven {
url = uri("https://nexus.knecon.com/repository/red-platform-releases/")
// Credentials come from the Gradle properties mavenUser / mavenPassword and are null when unset.
credentials {
username = providers.gradleProperty("mavenUser").getOrNull();
password = providers.gradleProperty("mavenPassword").getOrNull();
}
}
}
}
}
// Additionally build a Javadoc jar for the java component.
java {
withJavadocJar()
}
// Dependency resolution order: local Maven repository, Maven Central, then the internal Nexus repository.
repositories {
mavenLocal()
mavenCentral()
maven {
url = uri("https://nexus.knecon.com/repository/gindev/")
// Same Gradle-property based credentials as used for publishing.
credentials {
username = providers.gradleProperty("mavenUser").getOrNull();
password = providers.gradleProperty("mavenPassword").getOrNull();
}
}
}

View File

@ -1,39 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE module PUBLIC "-//Puppy Crawl//DTD Check Configuration 1.3//EN"
"http://www.puppycrawl.com/dtds/configuration_1_3.dtd">
<module name="Checker">
<property
name="severity"
value="error"/>
<module name="TreeWalker">
<module name="SuppressWarningsHolder"/>
<module name="MissingDeprecated"/>
<module name="MissingOverride"/>
<module name="AnnotationLocation"/>
<module name="JavadocStyle"/>
<module name="NonEmptyAtclauseDescription"/>
<module name="IllegalImport"/>
<module name="RedundantImport"/>
<module name="RedundantModifier"/>
<module name="EmptyBlock"/>
<module name="DefaultComesLast"/>
<module name="EmptyStatement"/>
<module name="EqualsHashCode"/>
<module name="ExplicitInitialization"/>
<module name="IllegalInstantiation"/>
<module name="ModifiedControlVariable"/>
<module name="MultipleVariableDeclarations"/>
<module name="PackageDeclaration"/>
<module name="ParameterAssignment"/>
<module name="SimplifyBooleanExpression"/>
<module name="SimplifyBooleanReturn"/>
<module name="StringLiteralEquality"/>
<module name="OneStatementPerLine"/>
<module name="FinalClass"/>
<module name="ArrayTypeStyle"/>
<module name="UpperEll"/>
<module name="OuterTypeFilename"/>
</module>
<module name="FileTabCharacter"/>
<module name="SuppressWarningsFilter"/>
</module>

View File

@ -1,21 +0,0 @@
<?xml version="1.0"?>
<ruleset name="Custom ruleset"
xmlns="http://pmd.sourceforge.net/ruleset/2.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://pmd.sourceforge.net/ruleset/2.0.0 http://pmd.sourceforge.net/ruleset_2_0_0.xsd">
<description>
Knecon ruleset checks the code for bad stuff
</description>
<rule ref="category/java/errorprone.xml">
<exclude name="MissingSerialVersionUID"/>
<exclude name="AvoidLiteralsInIfCondition"/>
<exclude name="AvoidDuplicateLiterals"/>
<exclude name="NullAssignment"/>
<exclude name="AssignmentInOperand"/>
<exclude name="BeanMembersShouldSerialize"/>
</rule>
</ruleset>

View File

@ -1,23 +0,0 @@
<?xml version="1.0"?>
<ruleset name="Custom ruleset"
xmlns="http://pmd.sourceforge.net/ruleset/2.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://pmd.sourceforge.net/ruleset/2.0.0 http://pmd.sourceforge.net/ruleset_2_0_0.xsd">
<description>
Knecon test ruleset checks the code for bad stuff
</description>
<rule ref="category/java/errorprone.xml">
<exclude name="MissingSerialVersionUID"/>
<exclude name="AvoidLiteralsInIfCondition"/>
<exclude name="AvoidDuplicateLiterals"/>
<exclude name="AvoidFieldNameMatchingMethodName"/>
<exclude name="NullAssignment"/>
<exclude name="AssignmentInOperand"/>
<exclude name="TestClassWithoutTestCases"/>
<exclude name="BeanMembersShouldSerialize"/>
</rule>
</ruleset>

View File

@ -1 +0,0 @@
version = 0.1-SNAPSHOT

View File

@ -0,0 +1,99 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>com.knecon.fforesight</groupId>
<artifactId>platform-docker-dependency</artifactId>
<version>0.1.0</version>
<relativePath/>
</parent>
<groupId>com.knecon.fforesight</groupId>
<artifactId>layoutparser-service-image</artifactId>
<version>0.1-SNAPSHOT</version>
<packaging>pom</packaging>
<properties>
<service.server>layoutparser-service-server</service.server>
<platform.jar>${service.server}.jar</platform.jar>
<docker.skip.push>false</docker.skip.push>
<docker.image.prefix>ff</docker.image.prefix>
<docker.image.name>${docker.image.prefix}/${service.server}</docker.image.name>
</properties>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-dependency-plugin</artifactId>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-resources-plugin</artifactId>
</plugin>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>exec-maven-plugin</artifactId>
</plugin>
<plugin>
<groupId>io.fabric8</groupId>
<artifactId>docker-maven-plugin</artifactId>
</plugin>
</plugins>
<pluginManagement>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-dependency-plugin</artifactId>
<executions>
<execution>
<id>download-platform-jar</id>
<phase>prepare-package</phase>
<goals>
<goal>copy</goal>
</goals>
<configuration>
<artifactItems>
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>${service.server}</artifactId>
<version>${project.version}</version>
<type>jar</type>
<overWrite>true</overWrite>
<destFileName>${platform.jar}</destFileName>
</dependency>
</artifactItems>
<outputDirectory>${docker.build.directory}</outputDirectory>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<groupId>io.fabric8</groupId>
<artifactId>docker-maven-plugin</artifactId>
<configuration>
<images>
<image>
<name>${docker.image.name}</name>
<build>
<dockerFileDir>${docker.build.directory}</dockerFileDir>
<args>
<PLATFORM_JAR>${platform.jar}</PLATFORM_JAR>
</args>
<tags>
<tag>${docker.image.version}</tag>
<tag>latest</tag>
</tags>
</build>
</image>
</images>
</configuration>
</plugin>
</plugins>
</pluginManagement>
</build>
</project>

View File

@ -0,0 +1,9 @@
# Service image: copies the service jar into the root of the base image.
FROM red/base-image:2.0.2

# File name of the service jar inside the build context, supplied as a build argument.
ARG PLATFORM_JAR

# Re-export the build argument so it is also available in the running container.
# The key=value form replaces the legacy whitespace-separated ENV syntax.
ENV PLATFORM_JAR=${PLATFORM_JAR}
ENV USES_ELASTICSEARCH=false

COPY ["${PLATFORM_JAR}", "/"]

View File

@ -1,10 +0,0 @@
// Build script of the layoutparser-service-internal-api module.
plugins {
// Shared Java build conventions.
id("com.knecon.fforesight.java-conventions")
// Lombok support through the freefair Gradle plugin.
id("io.freefair.lombok") version "8.4"
}
description = "layoutparser-service-internal-api"
dependencies {
// Swagger/OpenAPI annotations (io.swagger.v3.oas.annotations, e.g. @Schema).
implementation("io.swagger.core.v3:swagger-annotations:2.2.15")
}

View File

@ -0,0 +1,24 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xmlns="http://maven.apache.org/POM/4.0.0"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>com.knecon.fforesight</groupId>
<artifactId>layoutparser-service</artifactId>
<version>0.1-SNAPSHOT</version>
</parent>
<artifactId>layoutparser-service-internal-api</artifactId>
<dependencies>
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>${guava.version}</version>
</dependency>
</dependencies>
</project>

View File

@ -0,0 +1,19 @@
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.experimental.FieldDefaults;
/**
 * Immutable holder for the position data of one atomic block.
 * <p>
 * {@code @FieldDefaults} makes every field {@code private final}; Lombok generates the all-args
 * constructor, the builder and the accessors.
 */
@Data
@Builder
@AllArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class AtomicPositionBlockData {
// Identifier of this block; presumably matches the id of the related AtomicTextBlockData — TODO confirm.
Long id;
// Presumably maps each index of the block's text to an index into positions — TODO confirm.
int[] stringIdxToPositionIdx;
// One float[] per position; the layout of the inner arrays is not visible here — TODO confirm.
float[][] positions;
}

View File

@ -0,0 +1,27 @@
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.experimental.FieldDefaults;
/**
 * Immutable holder for the text data of one atomic block.
 * <p>
 * {@code @FieldDefaults} makes every field {@code private final}; Lombok generates the all-args
 * constructor, the builder and the accessors.
 */
@Data
@Builder
@AllArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class AtomicTextBlockData {
// Identifier of this block.
Long id;
// Page this block belongs to; presumably a page number — TODO confirm.
Long page;
// Text of this block.
String searchText;
// Presumably the ordinal of this block within its page — TODO confirm.
int numberOnPage;
// Start and end offsets of this block; whether end is exclusive is not visible here — TODO confirm.
int start;
int end;
// Presumably the offsets of line breaks within the text — TODO confirm against the producer.
int[] lineBreaks;
}

View File

@ -0,0 +1,21 @@
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.experimental.FieldDefaults;
/**
 * Immutable aggregate bundling the parts of a parsed document: its pages, atomic text blocks,
 * atomic position blocks and the document tree.
 * <p>
 * {@code @FieldDefaults} makes every field {@code private final}; Lombok generates the all-args
 * constructor, the builder and the accessors.
 */
@Data
@Builder
@AllArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class DocumentData {
// Page entries of the document.
PageData[] pages;
// Text blocks; presumably paired with atomicPositionBlocks through their ids — TODO confirm.
AtomicTextBlockData[] atomicTextBlocks;
AtomicPositionBlockData[] atomicPositionBlocks;
// Tree of entries describing the document structure.
DocumentTreeData documentTreeData;
}

View File

@ -0,0 +1,90 @@
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
import java.util.List;
import java.util.Map;
import java.util.stream.Stream;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.Getter;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
/**
 * Tree of {@link EntryData} nodes rooted at {@link #root}.
 * <p>
 * Entries are addressed by a path of child indices (see {@link #get(List)}). Lombok generates
 * the constructors, builder and accessors; {@code toString()} is written by hand.
 */
@Data
@Builder
@AllArgsConstructor
@NoArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class DocumentTreeData {

    /** Root entry of the tree; all other entries are reachable through its children. */
    EntryData root;


    /**
     * Resolves an entry by its path of child indices.
     *
     * @param tocId child indices to follow, starting at the root; an empty list addresses the root
     * @return the entry at the given path
     * @throws NullPointerException if the path is non-empty and the root (or a child list on the path) is {@code null}
     */
    public EntryData get(List<Integer> tocId) {

        // Walking the whole path from the root covers the empty path too: the loop simply does not run.
        EntryData entry = root;
        for (int id : tocId) {
            entry = entry.children.get(id);
        }
        return entry;
    }


    /**
     * Streams the root followed by all of its descendants in depth-first pre-order.
     * Every entry is emitted exactly once.
     *
     * @return a stream over all entries of the tree
     */
    public Stream<EntryData> streamAllEntries() {

        // flatten(root) already covers the complete tree. Flattening the root's children a second
        // time, as was done previously, emitted every non-root entry twice.
        return flatten(root);
    }


    /**
     * Renders one line per entry, in the order of {@link #streamAllEntries()}.
     */
    @Override
    public String toString() {

        return String.join("\n", streamAllEntries().map(EntryData::toString).toList());
    }


    /** Returns the given entry followed by all of its descendants (depth-first, pre-order). */
    private static Stream<EntryData> flatten(EntryData entry) {

        return Stream.concat(Stream.of(entry), entry.children.stream().flatMap(DocumentTreeData::flatten));
    }


    /**
     * A single node of the tree. All fields are {@code private final}; Lombok generates the
     * all-args constructor, the builder and the getters.
     */
    @Builder
    @Getter
    @AllArgsConstructor
    @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
    public static class EntryData {

        NodeType type;
        // Presumably the path of child indices leading to this entry — TODO confirm against the producer.
        int[] treeId;
        Long[] atomicBlockIds;
        Long[] pageNumbers;
        Map<String, String> properties;
        List<EntryData> children;


        /**
         * Formats the entry as {@code [<treeId, comma separated>]: <type> atbs = <number of atomic block ids>}.
         * An empty {@code treeId} is rendered as {@code []}.
         */
        @Override
        public String toString() {

            StringBuilder sb = new StringBuilder("[");
            for (int i = 0; i < treeId.length; i++) {
                // Separator goes before every element but the first, so an empty treeId keeps its "[".
                if (i > 0) {
                    sb.append(",");
                }
                sb.append(treeId[i]);
            }
            sb.append("]: ");
            sb.append(type);
            sb.append(" atbs = ");
            sb.append(atomicBlockIds.length);
            return sb.toString();
        }

    }

}

View File

@ -0,0 +1,21 @@
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
import java.util.Locale;
/**
 * Kinds of nodes that can occur in a document tree.
 */
public enum NodeType {
    DOCUMENT,
    SECTION,
    HEADLINE,
    PARAGRAPH,
    TABLE,
    TABLE_CELL,
    IMAGE,
    HEADER,
    FOOTER;


    /**
     * Returns the constant name with its first character kept as is and the remainder
     * lower-cased using {@link Locale#ROOT}, e.g. {@code TABLE_CELL} becomes {@code "Table_cell"}.
     */
    @Override
    public String toString() {

        String constantName = name();
        // Locale.ROOT keeps the lower-casing independent of the JVM's default locale.
        return constantName.charAt(0) + constantName.substring(1).toLowerCase(Locale.ROOT);
    }
}

View File

@ -0,0 +1,20 @@
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.experimental.FieldDefaults;
/**
 * Immutable holder for the basic properties of one page.
 * <p>
 * {@code @FieldDefaults} makes every field {@code private final}; Lombok generates the all-args
 * constructor, the builder and the accessors.
 */
@Data
@Builder
@AllArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class PageData {
// Page number; whether numbering starts at 0 or 1 is not visible here — TODO confirm.
int number;
// Page dimensions; the unit is not visible here — TODO confirm.
int height;
int width;
// Page rotation; presumably in degrees — TODO confirm.
int rotation;
}

View File

@ -1,21 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
import io.swagger.v3.oas.annotations.media.Schema;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * Mutable Lombok data class pairing a section number with the text of that section.
 * The field-level contract is given by the {@code @Schema} descriptions below.
 */
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
@Schema(description = "Object containing a simplified version, which contains almost exclusively text, of the document structure Section class.")
public class SimplifiedSectionText {
@Schema(description = "The number of this Section. This is used to map the simplified section text back to the original Section.")
private String sectionNumber;
@Schema(description = "The text in this Section.")
private String text;
}

View File

@ -1,34 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
import java.util.ArrayList;
import java.util.List;
import io.swagger.v3.oas.annotations.media.Schema;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * Mutable Lombok data class holding the simplified, text-only view of a document.
 * <p>
 * All list fields are annotated with {@code @Builder.Default}, so instances created through the
 * builder start with empty, mutable {@link ArrayList}s instead of {@code null}.
 */
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
@Schema(description = "Object containing a simplified version, which contains almost exclusively text, of the document structure.")
public class SimplifiedText {
@Schema(description = "Number of pages in the entire document.")
private int numberOfPages;
@Schema(description = "A List of simplified Sections, which contains almost exclusively text.")
@Builder.Default
private List<SimplifiedSectionText> sectionTexts = new ArrayList<>();
// The three lists below hold section numbers; presumably they refer to
// SimplifiedSectionText.sectionNumber values — TODO confirm.
@Schema(description = "A list of the main section numbers ")
@Builder.Default
private List<String> mainSectionNumbers = new ArrayList<>();
@Schema(description = "A list of the header section numbers ")
@Builder.Default
private List<String> headerSectionNumbers = new ArrayList<>();
@Schema(description = "A list of the footer section numbers ")
@Builder.Default
private List<String> footerSectionNumbers = new ArrayList<>();
}

View File

@ -2,29 +2,20 @@ package com.knecon.fforesight.service.layoutparser.internal.api.data.taas;
import java.util.List;
import io.swagger.v3.oas.annotations.media.Schema;
import lombok.Builder;
import lombok.Data;
/**
 * Lombok data class describing the text of one paragraph-like element together with its
 * formatting ranges, line breaks, classification and orientation.
 * The field-level contract is given by the {@code @Schema} descriptions below.
 */
@Data
@Builder
@Schema(description = "Object containing information about a Paragraph/Headline/Header/Footer.")
public class ParagraphData {
@Schema(description = "The text of this Semantic Node, without any linebreaks.", example = "This is some text.")
private String text;
// NOTE(review): the three list fields below are package-private while the other fields are private.
@Schema(description = "A list of text ranges in string offsets. Every character in any of the ranges is bold.", example = "[0, 15]")
List<Range> boldTextBoundaries;
@Schema(description = "A list of text ranges in string offsets. Every character in any of the ranges is italic.", example = "[0, 15]")
List<Range> italicTextBoundaries;
@Schema(description = "The line breaks in the text of this semantic node in string offsets. They are exclusive end. At the end of each semantic node there is an implicit linebreak.", example = "[5, 10]")
List<Integer> linebreaks;
@Schema(description = "The classification of this Paragraph.", allowableValues = "{paragraph, headline, header, footer}")
private String classification;
@Schema(description = "Describes the text orientation of this semantic node. Any semantic node only has a single text orientation.", allowableValues = "{ZERO, QUARTER_CIRCLE, HALF_CIRCLE, THREE_QUARTER_CIRCLE}")
private String orientation;
@Schema(description = "Describes the text direction in degrees of this semantic node. Any semantic node only has a single text direction.", minimum = "0", maximum = "359")
private int textDirection;
}

View File

@ -1,8 +1,5 @@
package com.knecon.fforesight.service.layoutparser.internal.api.data.taas;
import io.swagger.v3.oas.annotations.media.Schema;
/**
 * Text range given as a pair of string offsets.
 * No validation is performed; whether {@code end} is inclusive or exclusive is not defined here — TODO confirm.
 *
 * @param start start offset of the range
 * @param end end offset of the range
 */
@Schema(description = "Object specifying the start and end offsets of a text range in string offsets.")
public record Range(int start, int end) {
}

View File

@ -2,7 +2,6 @@ package com.knecon.fforesight.service.layoutparser.internal.api.data.taas;
import java.util.List;
import io.swagger.v3.oas.annotations.media.Schema;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
@ -10,12 +9,8 @@ import lombok.Data;
/**
 * Lombok data class holding the name of the original file together with the flat list of
 * structure objects parsed from it.
 */
@Builder
@Data
@AllArgsConstructor
@Schema(description = "Object containing a simplified version of the document structure. This simplified form only knows Paragraphs and Tables. The Paragraph Objects might be a Paragraph, Headline, Header or Footer.")
public class ResearchDocumentData {
@Schema(description = "File name of the original uploaded file.")
String originalFile;
@Schema(description = "A List of all paragraphs/headline or table objects, that have been parsed in this document.")
List<StructureObject> structureObjects;
}

View File

@ -2,19 +2,14 @@ package com.knecon.fforesight.service.layoutparser.internal.api.data.taas;
import java.util.List;
import io.swagger.v3.oas.annotations.media.Schema;
import lombok.AllArgsConstructor;
import lombok.Data;
/**
 * Lombok data class describing a single table row: whether it is a header row, the text of
 * each of its cells and its bounding box.
 */
@Data
@AllArgsConstructor
@Schema(description = "Object containing information about a Table Row.")
public class RowData {

    @Schema(description = "Boolean indicating whether this table row is classified as a header row.")
    boolean header;

    @Schema(description = "A list of Objects containing information about the text in each cell of this row.")
    List<ParagraphData> cellText;

    // The description previously referred to "this StructureObject" (copied from StructureObject.boundingBox).
    @Schema(description = "The bounding box of this table row. Is always exactly 4 values representing x, y, w, h, where x, y specify the lower left corner.")
    float[] bBox;

}

View File

@ -1,8 +1,5 @@
package com.knecon.fforesight.service.layoutparser.internal.api.data.taas;
import java.util.List;
import io.swagger.v3.oas.annotations.media.Schema;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
@ -10,22 +7,13 @@ import lombok.Data;
/**
 * Lombok data class for one element of the simplified document structure. An instance carries
 * either a {@link ParagraphData} or a {@link TableData}; per the schema the other one is {@code null}.
 */
@Data
@Builder
@AllArgsConstructor
@Schema(description = "Object containing information about either a Paragraph/Headline/Header/Footer or a Table.")
public class StructureObject {

    @Schema(description = "The ID of this StructureObject.")
    Integer structureObjectNumber;

    @Schema(description = "The Tree ID of this StructureObject.")
    List<Integer> treeId;

    // The description was previously a copy of the stringOffset description.
    // NOTE(review): confirm whether page numbers are 0- or 1-based and extend the description accordingly.
    @Schema(description = "The number of the page this StructureObject is located on.")
    int page;

    @Schema(description = "This stringOffset indicates the start of the string offsets in this Object, with respect to the reading order of the entire document. It is equal to the previous' StructureObject stringOffset + its length.")
    int stringOffset;

    @Schema(description = "The bounding box of this StructureObject. Is always exactly 4 values representing x, y, w, h, where x, y specify the lower left corner.", example = "[100, 100, 50, 50]")
    float[] boundingBox;

    @Schema(description = "Object containing information about a Paragraph/Headline/Header/Footer. Either this or table is null.")
    ParagraphData paragraph;

    @Schema(description = "Object containing information about a Table. Either this or paragraph is null.")
    TableData table;

}

View File

@ -2,20 +2,14 @@ package com.knecon.fforesight.service.layoutparser.internal.api.data.taas;
import java.util.List;
import io.swagger.v3.oas.annotations.media.Schema;
import lombok.AllArgsConstructor;
import lombok.Data;
/**
 * Lombok data class describing a table: its rows plus the column and row counts.
 * NOTE(review): nothing here enforces that numberOfRows equals rowData.size() — confirm the producer keeps them in sync.
 */
@Data
@AllArgsConstructor
@Schema(description = "Object containing information about a Table.")
public class TableData {
@Schema(description = "A list of Objects containing information about all rows in this table.")
List<RowData> rowData;
@Schema(description = "Number of columns in this table.")
Integer numberOfCols;
@Schema(description = "Number of rows in this table.")
Integer numberOfRows;
}

View File

@ -2,26 +2,9 @@ package com.knecon.fforesight.service.layoutparser.internal.api.queue;
import java.util.Map;
import io.swagger.v3.oas.annotations.media.Schema;
import lombok.Builder;
@Builder
@Schema(description = "Object containing information about the layout parsing.")
public record LayoutParsingFinishedEvent(
@Schema(description = "General purpose identifier. It is returned exactly the same way it is inserted with the LayoutParsingRequest.") //
Map<String, String> identifier,
@Schema(description = "The duration of a single layout parsing in ms.") //
long duration,
@Schema(description = "The number of pages of the parsed document.") //
int numberOfPages,
@Schema(description = "A general message. It contains some information useful for a developer, like the paths where the files are stored. Not meant to be machine readable.") //
String message,
@Schema(description = "The app version of the layout parser.") //
String layoutParserVersion
) {
public record LayoutParsingFinishedEvent(Map<String, String> identifier, long duration, int numberOfPages, String message) {
}

View File

@ -2,9 +2,7 @@ package com.knecon.fforesight.service.layoutparser.internal.api.queue;
public class LayoutParsingQueueNames {
public static final String LAYOUT_PARSING_REQUEST_QUEUE_PREFIX = "layout_parsing_request";
public static final String LAYOUT_PARSING_REQUEST_EXCHANGE = "layout_parsing_request_exchange";
public static final String LAYOUT_PARSING_RESPONSE_QUEUE_PREFIX = "layout_parsing_response";
public static final String LAYOUT_PARSING_RESPONSE_EXCHANGE = "layout_parsing_response_exchange";
public static final String LAYOUT_PARSING_DLQ = "layout_parsing_error";
public static final String LAYOUT_PARSING_REQUEST_QUEUE = "layout_parsing_request_queue";
public static final String LAYOUT_PARSING_DLQ = "layout_parsing_dead_letter_queue";
public static final String LAYOUT_PARSING_FINISHED_EVENT_QUEUE = "layout_parsing_response_queue";
}

View File

@ -3,45 +3,19 @@ package com.knecon.fforesight.service.layoutparser.internal.api.queue;
import java.util.Map;
import java.util.Optional;
import io.swagger.v3.oas.annotations.media.Schema;
import lombok.Builder;
import lombok.NonNull;
@Builder
@Schema(description = "Object containing all storage paths the service needs to know.")
public record LayoutParsingRequest(
@Schema(description = "Enum specifying the type of layout parsing to be performed.", allowableValues = "{RedactManager, DocuMine, TAAS}")//
@NonNull LayoutParsingType layoutParsingType,
@Schema(description = "General purpose identifiers. They are not changed by the service at all and are returned as is in the response queue.")//
Map<String, String> identifier,
@Schema(description = "Path to the original PDF file.")//
@NonNull String originFileStorageId,//
@Schema(description = "Optional Path to the table extraction file.")//
Optional<String> tablesFileStorageId,//
@Schema(description = "Optional Path to the image classification file.")//
Optional<String> imagesFileStorageId,//
@Schema(description = "Optional Path to the the visual layout parsing service file") Optional<String> visualLayoutParsingFileId,//
@Schema(description = "Path where the Document Structure File will be stored.")//
@NonNull String structureFileStorageId,//
@Schema(description = "Path where the Research Data File will be stored.")//
String researchDocumentStorageId,//
@Schema(description = "Path where the Document Text File will be stored.")//
@NonNull String textBlockFileStorageId,//
@Schema(description = "Path where the Document Positions File will be stored.")//
@NonNull String positionBlockFileStorageId,//
@Schema(description = "Path where the Document Pages File will be stored.")//
@NonNull String pageFileStorageId,//
@Schema(description = "Path where the Document Markdown File will be stored.")//
Optional<String> documentMarkdownFileStorageId,//
@Schema(description = "Path where the Simplified Text File will be stored.")//
@NonNull String simplifiedTextStorageId,//
@Schema(description = "Path where the Viewer Document PDF will be stored.")//
@NonNull String viewerDocumentStorageId
) {
String originFileStorageId,
Optional<String> tablesFileStorageId,
Optional<String> imagesFileStorageId,
String structureFileStorageId,
String researchDocumentStorageId,
String textBlockFileStorageId,
String positionBlockFileStorageId,
String pageFileStorageId) {
}

View File

@ -1,12 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.queue;
/**
 * Selects the layout parsing variant that is applied to a document.
 *
 * <p>The constants are declared in a fixed order; do not reorder them, since the declaration
 * order defines each constant's ordinal.
 */
public enum LayoutParsingType {

    REDACT_MANAGER,
    REDACT_MANAGER_OLD,
    REDACT_MANAGER_PARAGRAPH_DEBUG,
    REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH,
    DOCUMINE,
    DOCUMINE_OLD,
    CLARIFYND,
    CLARIFYND_PARAGRAPH_DEBUG;

}

View File

@ -1,16 +0,0 @@
<!-- Log4j2 configuration with a single console appender. -->
<Configuration>
<Appenders>
<!-- Writes to standard output; each line carries time, thread, level, logger name and message. -->
<Console name="CONSOLE" target="SYSTEM_OUT">
<PatternLayout pattern="%d{HH:mm:ss.SSS} [%t] %-5level %logger{36} - %msg%n"/>
</Console>
</Appenders>
<Loggers>
<!-- Default threshold for all loggers is warn. -->
<Root level="warn">
<AppenderRef ref="CONSOLE"/>
</Root>
<!-- Loggers under com.iqser are lowered to info. No appender is declared here, so output presumably reaches CONSOLE through the root logger (confirm additivity). -->
<Logger name="com.iqser" level="info"/>
</Loggers>
</Configuration>

View File

@ -1,40 +0,0 @@
// Gradle build script of the layoutparser-service-processor module.
plugins {
id("com.knecon.fforesight.java-conventions")
id("io.freefair.lombok") version "8.4"
}
description = "layoutparser-service-processor"
// Versions shared by several artifacts of the same library family below.
val jacksonVersion = "2.15.2"
val pdfBoxVersion = "3.0.0"
dependencies {
// Sibling modules of this build.
implementation(project(":layoutparser-service-internal-api"))
implementation(project(":viewer-doc-processor"))
// Version is read from the root project's extra property "documentVersion".
implementation("com.knecon.fforesight:document:${rootProject.extra.get("documentVersion")}")
// The Spring Boot security and validation starters are excluded from this dependency.
implementation("com.iqser.red.service:persistence-service-shared-api-v1:2.564.0-RED9010.0") {
exclude("org.springframework.boot", "spring-boot-starter-security")
exclude("org.springframework.boot", "spring-boot-starter-validation")
}
// storage-commons is excluded here and declared explicitly with a fixed version right below.
implementation("com.knecon.fforesight:tenant-commons:0.30.0") {
exclude("com.iqser.red.commons", "storage-commons")
}
implementation("com.iqser.red.commons:storage-commons:2.50.0")
implementation("org.apache.pdfbox:pdfbox:${pdfBoxVersion}")
implementation("org.apache.pdfbox:pdfbox-tools:${pdfBoxVersion}")
implementation("com.fasterxml.jackson.module:jackson-module-afterburner:${jacksonVersion}")
implementation("com.fasterxml.jackson.datatype:jackson-datatype-jsr310:${jacksonVersion}")
implementation("org.springframework.boot:spring-boot-starter-web:3.1.3")
implementation("org.jgrapht:jgrapht-core:1.5.2")
// Image codec plugins (JBIG2, JPEG 2000); presumably needed to decode images embedded in PDFs - confirm.
implementation("org.apache.pdfbox:jbig2-imageio:3.0.4")
implementation("com.github.jai-imageio:jai-imageio-core:1.4.0")
implementation("com.github.jai-imageio:jai-imageio-jpeg2000:1.4.0")
implementation("org.tinspin:tinspin-indexes:2.1.3")
implementation("org.commonmark:commonmark:0.22.0")
implementation("org.commonmark:commonmark-ext-gfm-tables:0.22.0")
implementation("com.pdftron:PDFNet:10.11.0")
implementation("org.apache.commons:commons-text:1.12.0")
}

View File

@ -0,0 +1,65 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build descriptor of the layoutparser-service-processor module. -->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<!-- Group id and version are not declared in this file; they come from this parent. -->
<parent>
<groupId>com.knecon.fforesight</groupId>
<artifactId>layoutparser-service</artifactId>
<version>0.1-SNAPSHOT</version>
</parent>
<artifactId>layoutparser-service-processor</artifactId>
<dependencies>
<!-- The only dependency in this file with a hard-coded version. -->
<dependency>
<groupId>com.iqser.red.service</groupId>
<artifactId>persistence-service-shared-api-v1</artifactId>
<version>2.36.0</version>
</dependency>
<!-- NOTE(review): the property is spelled "tennat-commons.version"; it has to match the property declared outside this file (presumably in the parent POM). Confirm the spelling there before renaming. -->
<dependency>
<groupId>com.knecon.fforesight</groupId>
<artifactId>tenant-commons</artifactId>
<version>${tennat-commons.version}</version>
</dependency>
<!-- Sibling module, always taken in the same version as this module. -->
<dependency>
<groupId>com.knecon.fforesight</groupId>
<artifactId>layoutparser-service-internal-api</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>com.iqser.red.commons</groupId>
<artifactId>storage-commons</artifactId>
<version>${storage-commons.version}</version>
</dependency>
<!-- Both PDFBox artifacts share the pdfbox.version property. -->
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>${pdfbox.version}</version>
</dependency>
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox-tools</artifactId>
<version>${pdfbox.version}</version>
</dependency>
<!-- Both Jackson artifacts share the jackson.version property. -->
<dependency>
<groupId>com.fasterxml.jackson.module</groupId>
<artifactId>jackson-module-afterburner</artifactId>
<version>${jackson.version}</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.datatype</groupId>
<artifactId>jackson-datatype-jsr310</artifactId>
<version>${jackson.version}</version>
</dependency>
<!-- No version is given for the Spring Boot starters; presumably it is managed by the parent POM - confirm. -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-web</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-amqp</artifactId>
</dependency>
</dependencies>
</project>

View File

@ -1,20 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor;
import org.springframework.boot.context.properties.ConfigurationProperties;
import org.springframework.context.annotation.Configuration;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import lombok.AccessLevel;
import lombok.Data;
import lombok.experimental.FieldDefaults;
/**
 * Settings of the layout parser, bound from configuration properties with the
 * {@code layoutparser} prefix.
 */
@Data
@FieldDefaults(level = AccessLevel.PRIVATE)
@Configuration
@ConfigurationProperties("layoutparser")
public class LayoutParserSettings {

    /** Global debug switch. */
    boolean debug;

    /** Optional layout parsing type configured on the service side; {@code null} when not set. */
    LayoutParsingType layoutParsingTypeOverride;

}

View File

@ -1,474 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor;
import static java.lang.String.format;
import java.awt.geom.AffineTransform;
import java.awt.geom.Rectangle2D;
import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicReference;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Service;
import com.iqser.red.service.redaction.v1.server.mapper.DocumentDataMapper;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.ImageType;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.NodeType;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Character;
import com.knecon.fforesight.service.layoutparser.processor.model.DocumentWithVisualization;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
import com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.MarkdownMapper;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineExtractorService;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTreeBuilderService;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTreeEnhancementService;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTree;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.CvTableParsingAdapter;
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.ImageServiceResponseAdapter;
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.VisualLayoutParsingAdapter;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableCells;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse;
import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService;
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
import com.knecon.fforesight.service.layoutparser.processor.services.SimplifiedSectionTextService;
import com.knecon.fforesight.service.layoutparser.processor.services.TableExtractionService;
import com.knecon.fforesight.service.layoutparser.processor.services.TextRulingsClassifier;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.BlockificationPostprocessingService;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocstrumBlockificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocuMineBlockificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.RedactManagerBlockificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
import com.knecon.fforesight.service.layoutparser.processor.services.graphics.Box;
import com.knecon.fforesight.service.layoutparser.processor.services.graphics.GraphicExtractorService;
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.TaasDocumentDataMapper;
import com.knecon.fforesight.service.layoutparser.processor.services.parsing.PDFLinesTextStripper;
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService;
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionOperations;
import io.micrometer.observation.Observation;
import io.micrometer.observation.ObservationRegistry;
import io.micrometer.observation.annotation.Observed;
import lombok.AccessLevel;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
import lombok.extern.slf4j.Slf4j;
@SuppressWarnings("PMD.CloseResource")
@Slf4j
@Service
@RequiredArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class LayoutParsingPipeline {
final ImageServiceResponseAdapter imageServiceResponseAdapter;
final CvTableParsingAdapter cvTableParsingAdapter;
final LayoutParsingStorageService layoutParsingStorageService;
final SectionsBuilderService sectionsBuilderService;
final SimplifiedSectionTextService simplifiedSectionTextService;
final RulingCleaningService rulingCleaningService;
final TableExtractionService tableExtractionService;
final DocuMineBlockificationService docuMineBlockificationService;
final RedactManagerBlockificationService redactManagerBlockificationService;
final BlockificationPostprocessingService blockificationPostprocessingService;
final DocstrumBlockificationService docstrumBlockificationService;
final LayoutGridService layoutGridService;
final ObservationRegistry observationRegistry;
final VisualLayoutParsingAdapter visualLayoutParsingAdapter;
final GraphicExtractorService graphicExtractorService;
final OutlineExtractorService outlineExtractorService;
final SectionTreeBuilderService sectionTreeBuilderService;
final SectionTreeEnhancementService sectionTreeEnhancementService;
final LayoutParserSettings settings;
final ClassificationService classificationService;
@Value("${LAYOUT_PARSER_VERSION:}")
private String layoutParserVersion;
/**
 * Runs the complete layout parsing flow for one request: loads the origin file and the optional
 * helper files from storage, parses the layout, builds the document graph, adds the layout grid to
 * the viewer document and stores all resulting files.
 *
 * <p>The local copies of the origin file and the viewer document are removed when the method
 * leaves, whether it succeeds or fails. The deletion is done with regular statements rather than
 * inside {@code assert}, because assert expressions are not evaluated when assertions are disabled.
 *
 * @param layoutParsingRequest the request holding the storage ids to read from and write to
 * @return the event describing the finished parsing (identifier, page count, duration, summary message)
 * @throws IOException declared by this method; see the storage and parsing calls in the body
 */
public LayoutParsingFinishedEvent parseLayoutAndSaveFilesToStorage(LayoutParsingRequest layoutParsingRequest) throws IOException {

    long start = System.currentTimeMillis();
    log.info("Starting layout parsing for {}", layoutParsingRequest.identifier());

    File originFile = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId());
    File viewerDocumentFile = null;
    try {
        // Without a stored viewer document the origin file itself is used as viewer document.
        viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId())
                .orElse(originFile);

        // The helper files are optional; an empty response object is only created when an id is missing.
        VisualLayoutParsingResponse visualLayoutParsingResponse = layoutParsingRequest.visualLayoutParsingFileId()
                .map(layoutParsingStorageService::getVisualLayoutParsingFile)
                .orElseGet(VisualLayoutParsingResponse::new);
        ImageServiceResponse imageServiceResponse = layoutParsingRequest.imagesFileStorageId()
                .map(layoutParsingStorageService::getImagesFile)
                .orElseGet(ImageServiceResponse::new);
        TableServiceResponse tableServiceResponse = layoutParsingRequest.tablesFileStorageId()
                .map(layoutParsingStorageService::getTablesFile)
                .orElseGet(TableServiceResponse::new);

        // A configured override wins over the type requested by the caller.
        LayoutParsingType layoutParsingType = settings.getLayoutParsingTypeOverride() == null //
                ? layoutParsingRequest.layoutParsingType() : settings.getLayoutParsingTypeOverride();

        ClassificationDocument classificationDocument = parseLayout(layoutParsingType,
                                                                    originFile,
                                                                    imageServiceResponse,
                                                                    tableServiceResponse,
                                                                    visualLayoutParsingResponse,
                                                                    layoutParsingRequest.identifier());

        log.info("Building document graph for {}", layoutParsingRequest.identifier());

        DocumentWithVisualization documentWithVisualization = observeBuildDocumentGraph(layoutParsingType, classificationDocument);

        log.info("Creating viewer document for {}", layoutParsingRequest.identifier());

        layoutGridService.addLayoutGrid(viewerDocumentFile, documentWithVisualization, viewerDocumentFile, layoutParsingType, layoutParserVersion, false);

        log.info("Storing resulting files for {}", layoutParsingRequest.identifier());

        layoutParsingStorageService.storeDocumentData(layoutParsingRequest, DocumentDataMapper.toDocumentData(documentWithVisualization.document()));
        if (layoutParsingRequest.documentMarkdownFileStorageId()
                .isPresent()) {
            layoutParsingStorageService.storeMarkdownFile(layoutParsingRequest.documentMarkdownFileStorageId()
                                                                  .get(), new MarkdownMapper().toMarkdownContent(documentWithVisualization.document()));
        }
        layoutParsingStorageService.storeSimplifiedText(layoutParsingRequest, simplifiedSectionTextService.toSimplifiedText(documentWithVisualization.document()));
        layoutParsingStorageService.storeViewerDocument(layoutParsingRequest, viewerDocumentFile);

        // Research data is only produced when the request names a target for it.
        if (layoutParsingRequest.researchDocumentStorageId() != null) {
            log.info("Building research document data for {}", layoutParsingRequest.identifier());
            var researchDocumentData = TaasDocumentDataMapper.fromDocument(documentWithVisualization.document());
            layoutParsingStorageService.storeResearchDocumentData(layoutParsingRequest, researchDocumentData);
        }

        return LayoutParsingFinishedEvent.builder()
                .identifier(layoutParsingRequest.identifier())
                .numberOfPages(documentWithVisualization.document().getNumberOfPages())
                .duration(System.currentTimeMillis() - start)
                .message(format("""
                                Layout parsing has finished in %.02f s.
                                identifiers: %s
                                %s
                                Files have been saved with Ids:
                                Structure: %s
                                Text: %s
                                Positions: %s
                                PageData: %s
                                Simplified Text: %s
                                Viewer Doc: %s""",
                                ((float) (System.currentTimeMillis() - start)) / 1000,
                                layoutParsingRequest.identifier(),
                                buildSemanticNodeCountMessage(documentWithVisualization.document().getNumberOfPages(), documentWithVisualization.buildSemanticNodeCounts()),
                                layoutParsingRequest.structureFileStorageId(),
                                layoutParsingRequest.textBlockFileStorageId(),
                                layoutParsingRequest.positionBlockFileStorageId(),
                                layoutParsingRequest.pageFileStorageId(),
                                layoutParsingRequest.simplifiedTextStorageId(),
                                layoutParsingRequest.viewerDocumentStorageId()))
                .layoutParserVersion(layoutParserVersion)
                .build();
    } finally {
        // The viewer document may be the origin file itself; delete each file once.
        if (viewerDocumentFile != null && !viewerDocumentFile.equals(originFile)) {
            deleteLocalFile(viewerDocumentFile);
        }
        deleteLocalFile(originFile);
    }
}


/** Deletes a local working file if it exists; a failed deletion is logged instead of aborting the request. */
private static void deleteLocalFile(File file) {

    if (file.exists() && !file.delete()) {
        log.warn("Could not delete local file {}", file.getAbsolutePath());
    }
}
/**
 * Builds the document graph inside an observation named "LayoutParsingPipeline" with the
 * contextual name "build-document-graph".
 *
 * @param layoutParsingType      the parsing variant passed on to the graph factory
 * @param classificationDocument the classified document the graph is built from
 * @return the graph produced by {@code DocumentGraphFactory.buildDocumentGraph}
 */
private DocumentWithVisualization observeBuildDocumentGraph(LayoutParsingType layoutParsingType, ClassificationDocument classificationDocument) {

    // The observed action is a Runnable, so the result is handed out through a holder.
    AtomicReference<DocumentWithVisualization> graphHolder = new AtomicReference<>();
    Observation graphObservation = Observation.createNotStarted("LayoutParsingPipeline", observationRegistry)
            .contextualName("build-document-graph");
    graphObservation.observe(() -> graphHolder.set(DocumentGraphFactory.buildDocumentGraph(layoutParsingType, classificationDocument)));
    return graphHolder.get();
}
/**
 * Formats a one-line summary of how many semantic nodes of each reported type were parsed.
 *
 * @param numberOfPages      number of pages of the document
 * @param semanticNodeCounts node counts per type; a type without a count is reported as 0
 * @return the formatted summary line
 */
private String buildSemanticNodeCountMessage(int numberOfPages, Map<NodeType, Long> semanticNodeCounts) {

    // Order matches the placeholders of the format string below.
    NodeType[] reportedTypes = {NodeType.SECTION, NodeType.HEADLINE, NodeType.PARAGRAPH, NodeType.TABLE, NodeType.TABLE_CELL, NodeType.HEADER, NodeType.FOOTER};

    Object[] arguments = new Object[reportedTypes.length + 1];
    arguments[0] = numberOfPages;
    for (int i = 0; i < reportedTypes.length; i++) {
        Long count = semanticNodeCounts.get(reportedTypes[i]);
        arguments[i + 1] = count == null ? 0L : count;
    }

    return format("%d pages with %d sections, %d headlines, %d paragraphs, %d tables with %d cells, %d headers, and %d footers parsed", arguments);
}
/**
 * Parses the layout of a PDF file page by page and classifies the result.
 *
 * <p>For every page the text, rulings, graphics, images, signatures and tables are extracted and
 * collected in a {@link ClassificationPage}. Afterwards the whole document is classified, the
 * section tree is created and the sections are built.
 *
 * <p>The PDF document is closed in a {@code finally} block so that it is also released when the
 * extraction of a page fails.
 *
 * @param layoutParsingType           the parsing variant; selects the blockification and section building strategy
 * @param originFile                  the PDF file to parse
 * @param imageServiceResponse        image classification results (may be an empty response)
 * @param tableServiceResponse        table detection results (may be an empty response)
 * @param visualLayoutParsingResponse visual layout parsing results (may be an empty response)
 * @param identifier                  request identifiers, used for logging; a "debug" key activates the debug layer
 * @return the classified document
 */
@SneakyThrows
@Observed(name = "LayoutParsingPipeline", contextualName = "parse-layout")
public ClassificationDocument parseLayout(LayoutParsingType layoutParsingType,
                                          File originFile,
                                          ImageServiceResponse imageServiceResponse,
                                          TableServiceResponse tableServiceResponse,
                                          VisualLayoutParsingResponse visualLayoutParsingResponse,
                                          Map<String, String> identifier) {

    ClassificationDocument classificationDocument;
    PDDocument originDocument = openDocument(originFile);
    try {
        addNumberOfPagesToTrace(originDocument.getNumberOfPages(), Files.size(originFile.toPath()));

        Map<Integer, List<TableCells>> pdfTableCells = cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse);
        Map<Integer, List<ClassifiedImage>> pdfImages = imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse);
        Map<Integer, List<ClassifiedImage>> signatures = visualLayoutParsingAdapter.buildExtractedSignaturesPerPage(visualLayoutParsingResponse);

        classificationDocument = new ClassificationDocument();

        if (settings.isDebug() || identifier.containsKey("debug")) {
            classificationDocument.getLayoutDebugLayer().setActive(true);
        }

        List<ClassificationPage> classificationPages = new ArrayList<>();

        classificationDocument.setOutlineObjectTree(outlineExtractorService.getOutlineObjectTree(originDocument));

        long pageCount = originDocument.getNumberOfPages();

        for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++) {

            if (pageNumber % 100 == 0) {
                // re-open document every once in a while to save on RAM. This has no significant performance impact.
                // This is due to PDFBox caching all images and some other stuff with Soft References. This dereferences them and forces the freeing of memory.
                originDocument.close();
                originDocument = openDocument(originFile);
            }

            if (pageNumber % 100 == 0 || pageNumber == pageCount || pageNumber == 1) {
                log.info("Extracting text on Page {} for {}", pageNumber, identifier);
            }

            classificationDocument.setPages(classificationPages);
            PDFLinesTextStripper stripper = new PDFLinesTextStripper();
            PDPage pdPage = originDocument.getPage(pageNumber - 1);
            stripper.setPageNumber(pageNumber);
            stripper.setStartPage(pageNumber);
            stripper.setEndPage(pageNumber);
            stripper.setPdpage(pdPage);
            stripper.getText(originDocument);
            List<Word> words = stripper.getWords();

            // rotateDirAdjExactly(words, pdPage); // works really well for many highly rotated documents (e.g. VV-331340.pdf), but it decreases the headline performance by 1.3%, so I am leaving it out for now

            if (layoutParsingType.equals(LayoutParsingType.DOCUMINE_OLD)) {
                var lines = TextPositionOperations.groupByLine(new HashSet<>(words));
                classificationDocument.getLayoutDebugLayer().addLineVisualizationsFromNestedTextPosition(lines, pageNumber);
                words = TextPositionOperations.sortWords(lines);
            }
            classificationDocument.getLayoutDebugLayer().addTextVisualizations(words, pageNumber);

            PDRectangle pdr = pdPage.getMediaBox();

            List<Ruling> rulings = stripper.getRulings();
            classificationDocument.getLayoutDebugLayer().addRulingVisualization(rulings, pageNumber);
            CleanRulings cleanRulings = rulingCleaningService.deduplicateAndStraightenRulings(pdfTableCells.get(pageNumber), rulings);

            PageInformation pageInformation = PageInformation.fromPDPage(pageNumber, pdPage);
            List<Cell> emptyTableCells = TableExtractionService.findCells(cleanRulings.getHorizontals(), cleanRulings.getVerticals(), pageInformation);
            classificationDocument.getLayoutDebugLayer().addCellVisualizations(emptyTableCells, pageNumber);

            TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(words, cleanRulings);

            // Graphics found on the page are added to the page's images with type GRAPHIC.
            List<Box> graphics = graphicExtractorService.extractPathElementGraphics(originDocument, pdPage, pageNumber, cleanRulings, stripper.getWords(), false);

            pdfImages.computeIfAbsent(pageNumber, x -> new ArrayList<>())
                    .addAll(graphics.stream()
                                    .map(g -> new ClassifiedImage(new Rectangle2D.Double(g.x1, g.y1, g.width(), g.height()),
                                                                  ImageType.GRAPHIC,
                                                                  false,
                                                                  stripper.getPageNumber(),
                                                                  ""))
                                    .toList());

            ClassificationPage classificationPage = switch (layoutParsingType) {
                case REDACT_MANAGER_OLD -> redactManagerBlockificationService.blockify(stripper.getWords(), cleanRulings, classificationDocument.getLayoutDebugLayer());
                case DOCUMINE_OLD -> docuMineBlockificationService.blockify(words, cleanRulings);
                case DOCUMINE, REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH ->
                        docstrumBlockificationService.blockify(words, cleanRulings, true, classificationDocument.getLayoutDebugLayer(), layoutParsingType);
                case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG ->
                        docstrumBlockificationService.blockify(words, cleanRulings, false, classificationDocument.getLayoutDebugLayer(), layoutParsingType);
            };

            updateClassificationPage(pdPage, pdr, classificationPage, cleanRulings, pageNumber, pageInformation);

            blockificationPostprocessingService.findHeadlinesFromOutline(classificationDocument, pageNumber, classificationPage, pageInformation);

            classificationDocument.getLayoutDebugLayer().addMarkedContentVisualizations(stripper.getMarkedContents(), pageNumber);
            // MarkedContent needs to be converted at this point, otherwise it leads to GC Problems in Pdfbox.
            classificationPage.setMarkedContentBboxPerType(convertMarkedContents(stripper.getMarkedContents()));

            // If images is ocr needs to be calculated before textBlocks are moved into tables, otherwise findOcr algorithm needs to be adopted.
            if (pdfImages.containsKey(pageNumber)) {
                classificationPage.setImages(pdfImages.get(pageNumber));
                imageServiceResponseAdapter.findOcr(classificationPage);
            }

            if (signatures.containsKey(pageNumber)) {
                if (classificationPage.getImages() == null || classificationPage.getImages().isEmpty()) {
                    classificationPage.setImages(signatures.get(pageNumber));
                } else {
                    classificationPage.getImages().addAll(signatures.get(pageNumber));
                }
            }

            tableExtractionService.extractTables(emptyTableCells, classificationPage);

            buildPageStatistics(classificationPage);
            increaseDocumentStatistics(classificationPage, classificationDocument);

            classificationPages.add(classificationPage);
        }
    } finally {
        // Also reached when a page fails, so the document is never left open.
        originDocument.close();
    }

    classificationService.classify(classificationDocument, layoutParsingType, identifier);

    SectionTree sectionTree = sectionTreeBuilderService.createSectionTree(classificationDocument);
    classificationDocument.setSectionTree(sectionTree);

    log.info("Building Sections for {}", identifier);

    switch (layoutParsingType) {
        case CLARIFYND_PARAGRAPH_DEBUG, REDACT_MANAGER_PARAGRAPH_DEBUG -> sectionsBuilderService.buildParagraphDebugSections(classificationDocument);
        default -> sectionTreeEnhancementService.assignSectionBlocksAndImages(classificationDocument);
    }

    return classificationDocument;
}
/**
 * Copies page level information (rulings, rotation, orientation, number and size) onto the
 * classification page.
 *
 * <p>A page counts as landscape when its media box is wider than tall at a rotation of 0 or 180
 * degrees, or taller than wide at a rotation of 90 or 270 degrees.
 */
private static void updateClassificationPage(PDPage pdPage,
                                             PDRectangle pdr,
                                             ClassificationPage classificationPage,
                                             CleanRulings cleanRulings,
                                             int pageNumber,
                                             PageInformation pageInformation) {

    int rotation = pdPage.getRotation();

    boolean widerThanTall = pdr.getWidth() > pdr.getHeight();
    boolean tallerThanWide = pdr.getHeight() > pdr.getWidth();
    boolean halfTurn = rotation == 0 || rotation == 180;
    boolean quarterTurn = rotation == 90 || rotation == 270;
    boolean isLandscape = (widerThanTall && halfTurn) || (tallerThanWide && quarterTurn);

    classificationPage.setCleanRulings(cleanRulings);
    classificationPage.setRotation(rotation);
    classificationPage.setLandscape(isLandscape);
    classificationPage.setPageNumber(pageNumber);
    classificationPage.setPageWidth((float) pageInformation.width());
    classificationPage.setPageHeight((float) pageInformation.height());
}
/**
 * Rotates the words of each text direction by the average exact direction of their characters,
 * around the center of the page's media box. Directions whose average is 0 are left untouched.
 */
private static void rotateDirAdjExactly(List<Word> words, PDPage pdPage) {

    for (TextDirection dir : TextDirection.values()) {

        double averageRotation = words.stream()
                .flatMap(word -> word.getCharacters().stream())
                .map(Character::getTextPosition)
                .filter(pos -> pos.getDir().equals(dir))
                .mapToDouble(RedTextPosition::getExactDir)
                .average()
                .orElse(0);

        if (averageRotation != 0) {
            AffineTransform rotateInstance = AffineTransform.getRotateInstance(averageRotation, pdPage.getMediaBox().getWidth() / 2, pdPage.getMediaBox().getHeight() / 2);
            words.stream()
                    .filter(word -> dir.equals(word.getDir()))
                    .forEach(word -> word.transform(rotateInstance));
        }
    }
}
/**
 * Adds the page count and the file size as high cardinality key values to the current
 * observation. Does nothing when there is no current observation.
 */
private void addNumberOfPagesToTrace(int numberOfPages, long size) {

    Observation currentObservation = observationRegistry.getCurrentObservation();
    if (currentObservation == null) {
        return;
    }
    currentObservation.highCardinalityKeyValue("numberOfPages", String.valueOf(numberOfPages));
    currentObservation.highCardinalityKeyValue("fileSize", String.valueOf(size));
}
/**
 * Loads the PDF from the given file and marks all security to be removed.
 * The caller is responsible for closing the returned document.
 */
@SneakyThrows
private PDDocument openDocument(File originFile) {

    PDDocument loadedDocument = Loader.loadPDF(originFile);
    loadedDocument.setAllSecurityToBeRemoved(true);
    return loadedDocument;
}
/**
 * Converts the marked contents of a page into bounding boxes per line, keyed by marked content
 * type. Only the header and footer types are converted.
 */
private Map<String, List<Rectangle2D>> convertMarkedContents(List<PDMarkedContent> pdMarkedContents) {

    Map<String, List<Rectangle2D>> bboxesPerType = new HashMap<>();
    for (String contentType : new String[]{MarkedContentUtils.HEADER, MarkedContentUtils.FOOTER}) {
        bboxesPerType.put(contentType, MarkedContentUtils.getMarkedContentBboxPerLine(pdMarkedContents, contentType));
    }
    return bboxesPerType;
}
/**
 * Adds the font size, font, text height and font style counts of one page to the counters of
 * the document.
 */
private void increaseDocumentStatistics(ClassificationPage classificationPage, ClassificationDocument document) {

    var pageFontSizes = classificationPage.getFontSizeCounter().getCountPerValue();
    document.getFontSizeCounter().addAll(pageFontSizes);

    var pageFonts = classificationPage.getFontCounter().getCountPerValue();
    document.getFontCounter().addAll(pageFonts);

    var pageTextHeights = classificationPage.getTextHeightCounter().getCountPerValue();
    document.getTextHeightCounter().addAll(pageTextHeights);

    var pageFontStyles = classificationPage.getFontStyleCounter().getCountPerValue();
    document.getFontStyleCounter().addAll(pageFontStyles);
}
/**
 * Counts text height, font, font size and font style of every word in the page's text blocks.
 */
private void buildPageStatistics(ClassificationPage classificationPage) {

    // Blocks inside tables are deliberately left out of the statistics, as tables will always be added to BodyTextFrame.
    for (AbstractPageBlock pageBlock : classificationPage.getTextBlocks()) {
        if (!(pageBlock instanceof TextPageBlock textPageBlock) || textPageBlock.getWords() == null) {
            continue;
        }
        for (Word word : textPageBlock.getWords()) {
            classificationPage.getTextHeightCounter().add(word.getTextHeight());
            classificationPage.getFontCounter().add(word.getFont());
            classificationPage.getFontSizeCounter().add(word.getFontSize());
            classificationPage.getFontStyleCounter().add(word.getFontStyle());
        }
    }
}
}

View File

@ -0,0 +1,114 @@
package com.knecon.fforesight.service.layoutparser.processor;
import static java.lang.String.format;
import java.io.IOException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.ResearchDocumentData;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
import com.knecon.fforesight.service.layoutparser.processor.adapter.CvTableParsingAdapter;
import com.knecon.fforesight.service.layoutparser.processor.adapter.ImageServiceResponseAdapter;
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.image.ImageServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.classification.service.ClassificationService;
import com.knecon.fforesight.service.layoutparser.processor.classification.service.PdfParsingService;
import com.knecon.fforesight.service.layoutparser.processor.classification.service.SectionsBuilderService;
import com.knecon.fforesight.service.layoutparser.processor.factory.DocumentGraphFactory;
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.mapper.redaction.DocumentDataMapper;
import com.knecon.fforesight.service.layoutparser.processor.mapper.taas.TaasDocumentDataMapper;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@Service
@RequiredArgsConstructor
public class LayoutParsingService {
private final ImageServiceResponseAdapter imageServiceResponseAdapter;
private final CvTableParsingAdapter cvTableParsingAdapter;
private final LayoutParsingStorageService layoutParsingStorageService;
private final PdfParsingService pdfParsingService;
private final ClassificationService classificationService;
private final SectionsBuilderService sectionsBuilderService;
/**
 * Loads the origin PDF and the optional image and table service results from storage, parses
 * the layout and stores the resulting document data.
 *
 * <p>The image and table results are read with the ids given in {@code imagesFileStorageId} and
 * {@code tablesFileStorageId}. The PDF document is opened in a try-with-resources block so it is
 * closed even when parsing fails.
 *
 * @param layoutParsingRequest the request holding the storage ids to read from and write to
 * @return the event describing the finished parsing (identifier, page count, duration, message)
 * @throws IOException if closing the PDF document fails; other calls in the body may throw it as well
 */
public LayoutParsingFinishedEvent parseLayoutAndSaveFilesToStorage(LayoutParsingRequest layoutParsingRequest) throws IOException {

    long start = System.currentTimeMillis();

    Document documentGraph;
    int numberOfPages;
    try (PDDocument originDocument = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId())) {

        // Both helper files are optional; an empty response is used when no id is given.
        ImageServiceResponse imageServiceResponse = new ImageServiceResponse();
        if (layoutParsingRequest.imagesFileStorageId().isPresent()) {
            imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId().get());
        }

        TableServiceResponse tableServiceResponse = new TableServiceResponse();
        if (layoutParsingRequest.tablesFileStorageId().isPresent()) {
            tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId().get());
        }

        documentGraph = parseLayout(originDocument, imageServiceResponse, tableServiceResponse);
        numberOfPages = originDocument.getNumberOfPages();
    }

    var researchDocumentData = TaasDocumentDataMapper.fromDocument(documentGraph);

    layoutParsingStorageService.storeDocumentData(layoutParsingRequest, researchDocumentData, DocumentDataMapper.toDocumentData(documentGraph));

    return LayoutParsingFinishedEvent.builder()
            .identifier(layoutParsingRequest.identifier())
            .numberOfPages(numberOfPages)
            .duration(System.currentTimeMillis() - start)
            .message(format("Layout parsing is finished and files have been saved with Ids:\n Structure: %s\nText: %s\nPositions: %s\nPageData: %s",
                            layoutParsingRequest.structureFileStorageId(),
                            layoutParsingRequest.textBlockFileStorageId(),
                            layoutParsingRequest.positionBlockFileStorageId(),
                            layoutParsingRequest.pageFileStorageId()))
            .build();
}
/**
 * Turns a loaded PDF plus the upstream service responses into a document graph.
 *
 * <p>Steps, in order: adapt the table and image responses to per-page maps, parse the PDF,
 * classify the parsed document, build its sections, then build the graph.
 *
 * @param originDocument       the PDF to parse; not closed by this method
 * @param imageServiceResponse image-service result, adapted to classified images per page
 * @param tableServiceResponse table-service result, adapted to table cells per page
 * @return the document graph built from the classified document
 */
public Document parseLayout(PDDocument originDocument, ImageServiceResponse imageServiceResponse, TableServiceResponse tableServiceResponse) {
    var cvTablesPerPage = cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse);
    var classifiedImagesPerPage = imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse);

    ClassificationDocument parsedDocument = pdfParsingService.parseDocument(originDocument, cvTablesPerPage, classifiedImagesPerPage);

    // Both services work on the same ClassificationDocument instance that is then handed to the graph factory.
    classificationService.classifyDocument(parsedDocument);
    sectionsBuilderService.buildSections(parsedDocument);

    return DocumentGraphFactory.buildDocumentGraph(parsedDocument);
}
/**
 * Same pipeline as {@link #parseLayout}, but prints the wall-clock duration of every stage
 * (parse, classify, build sections, build graph) to standard output on a single line.
 *
 * @param originDocument       the PDF to parse; not closed by this method
 * @param imageServiceResponse image-service result, adapted to classified images per page
 * @param tableServiceResponse table-service result, adapted to table cells per page
 * @return the document graph built from the classified document
 */
public Document parseLayoutWithTimer(PDDocument originDocument, ImageServiceResponse imageServiceResponse, TableServiceResponse tableServiceResponse) {
    // Stage 1: adapt the service responses and parse the PDF.
    long parseStartedAt = System.currentTimeMillis();
    var cvTablesPerPage = cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse);
    var classifiedImagesPerPage = imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse);
    ClassificationDocument parsedDocument = pdfParsingService.parseDocument(originDocument, cvTablesPerPage, classifiedImagesPerPage);
    System.out.printf("parsed %d ms", System.currentTimeMillis() - parseStartedAt);

    // Stage 2: classification.
    long classifyStartedAt = System.currentTimeMillis();
    classificationService.classifyDocument(parsedDocument);
    System.out.printf(", classified %d ms", System.currentTimeMillis() - classifyStartedAt);

    // Stage 3: section building.
    long sectionsStartedAt = System.currentTimeMillis();
    sectionsBuilderService.buildSections(parsedDocument);
    System.out.printf(", sections built %d ms", System.currentTimeMillis() - sectionsStartedAt);

    // Stage 4: graph construction.
    long graphStartedAt = System.currentTimeMillis();
    Document graph = DocumentGraphFactory.buildDocumentGraph(parsedDocument);
    System.out.printf(", graph constructed %d ms", System.currentTimeMillis() - graphStartedAt);

    return graph;
}
}

View File

@ -1,23 +1,10 @@
package com.knecon.fforesight.service.layoutparser.processor;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.ComponentScan;
import org.springframework.context.annotation.Configuration;
import com.knecon.fforesight.service.viewerdoc.service.PDFTronViewerDocumentService;
import io.micrometer.observation.ObservationRegistry;
@Configuration
@ComponentScan
public class LayoutParsingServiceProcessorConfiguration {
@Bean
@Autowired
public PDFTronViewerDocumentService viewerDocumentService(ObservationRegistry registry) {
return new PDFTronViewerDocumentService(registry);
}
}

View File

@ -1,36 +1,31 @@
package com.knecon.fforesight.service.layoutparser.processor;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardOpenOption;
import java.util.Optional;
import java.util.concurrent.CompletableFuture;
import org.springframework.core.task.TaskExecutor;
import org.apache.commons.io.IOUtils;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.io.MemoryUsageSetting;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.springframework.stereotype.Service;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.iqser.red.service.redaction.v1.server.data.DocumentData;
import com.iqser.red.storage.commons.service.StorageService;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.SimplifiedText;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.AtomicPositionBlockData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.AtomicTextBlockData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTreeData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.PageData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.ResearchDocumentData;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse;
import com.knecon.fforesight.service.viewerdoc.service.ViewerDocVersioningUtility;
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.image.ImageServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableServiceResponse;
import com.knecon.fforesight.tenantcommons.TenantContext;
import io.micrometer.observation.annotation.Observed;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@ -41,104 +36,68 @@ public class LayoutParsingStorageService {
private final StorageService storageService;
private final ObjectMapper objectMapper;
private final TaskExecutor taskExecutor;
public PDDocument getOriginFile(String storageId) throws IOException {
@Observed(name = "LayoutParsingStorageService", contextualName = "get-origin-file")
public File getOriginFile(String storageId) throws IOException {
File tempFile = createTempFile("document", ".pdf");
storageService.downloadTo(TenantContext.getTenantId(), storageId, tempFile);
return tempFile;
}
@Observed(name = "LayoutParsingStorageService", contextualName = "get-viewer-doc-file")
public Optional<File> getViewerDocFile(String storageId) throws IOException {
if (!storageService.objectExists(TenantContext.getTenantId(), storageId)) {
return Optional.empty();
}
File tempFile = createTempFile("viewerDocument", ".pdf");
storageService.downloadTo(TenantContext.getTenantId(), storageId, tempFile);
if (!ViewerDocVersioningUtility.isCurrentVersion(tempFile)) {
assert tempFile.delete();
return Optional.empty();
}
return Optional.of(tempFile);
}
@SneakyThrows
public ImageServiceResponse getImagesFile(String storageId) {
try (InputStream inputStream = getObject(storageId)) {
ImageServiceResponse imageServiceResponse = objectMapper.readValue(inputStream, ImageServiceResponse.class);
inputStream.close();
return imageServiceResponse;
try (var originDocumentInputStream = storageService.getObject(TenantContext.getTenantId(), storageId).getInputStream()) {
File tempFile = createTempFile("document", ".pdf");
try (var tempFileOutputStream = new FileOutputStream(tempFile)) {
IOUtils.copy(originDocumentInputStream, tempFileOutputStream);
}
return Loader.loadPDF(tempFile, MemoryUsageSetting.setupMixed(67108864L));
}
}
@SneakyThrows
public TableServiceResponse getTablesFile(String storageId) {
public ImageServiceResponse getImagesFile(String storageId) throws IOException {
try (var tableClassificationStream = getObject(storageId)) {
try (InputStream inputStream = storageService.getObject(TenantContext.getTenantId(), storageId).getInputStream()) {
TableServiceResponse tableServiceResponse = objectMapper.readValue(tableClassificationStream, TableServiceResponse.class);
tableClassificationStream.close();
return tableServiceResponse;
return objectMapper.readValue(inputStream, ImageServiceResponse.class);
}
}
@SneakyThrows
public VisualLayoutParsingResponse getVisualLayoutParsingFile(String storageId) {
public TableServiceResponse getTablesFile(String storageId) throws IOException {
try (var tableClassificationStream = storageService.getObject(TenantContext.getTenantId(), storageId).getInputStream()) {
return objectMapper.readValue(tableClassificationStream, TableServiceResponse.class);
try (InputStream inputStream = getObject(storageId)) {
return objectMapper.readValue(inputStream, VisualLayoutParsingResponse.class);
}
}
@SneakyThrows
@Observed(name = "LayoutParsingStorageService", contextualName = "store-document-data")
public void storeDocumentData(LayoutParsingRequest layoutParsingRequest, DocumentData documentData) {
Runnable storeDocumentStructureRunnable = () -> storageService.storeProtoObject(TenantContext.getTenantId(),
layoutParsingRequest.structureFileStorageId(),
documentData.getDocumentStructure());
CompletableFuture<Void> storeDocumentStructureFuture = CompletableFuture.runAsync(storeDocumentStructureRunnable, taskExecutor);
Runnable storeDocumentTextDataRunnable = () -> storageService.storeProtoObject(TenantContext.getTenantId(),
layoutParsingRequest.textBlockFileStorageId(),
documentData.getDocumentTextData());
CompletableFuture<Void> storeDocumentTextDataFuture = CompletableFuture.runAsync(storeDocumentTextDataRunnable, taskExecutor);
Runnable storeDocumentPositionsRunnable = () -> storageService.storeProtoObject(TenantContext.getTenantId(),
layoutParsingRequest.positionBlockFileStorageId(),
documentData.getDocumentPositionData());
CompletableFuture<Void> storeDocumentPositionsFuture = CompletableFuture.runAsync(storeDocumentPositionsRunnable, taskExecutor);
Runnable storeDocumentPagesRunnable = () -> storageService.storeProtoObject(TenantContext.getTenantId(),
layoutParsingRequest.pageFileStorageId(),
documentData.getDocumentPages());
CompletableFuture<Void> storeDocumentPagesFuture = CompletableFuture.runAsync(storeDocumentPagesRunnable, taskExecutor);
CompletableFuture.allOf(storeDocumentStructureFuture, storeDocumentTextDataFuture, storeDocumentPositionsFuture, storeDocumentPagesFuture).join();
}
public void storeResearchDocumentData(LayoutParsingRequest layoutParsingRequest, ResearchDocumentData researchDocumentData) {
public void storeDocumentData(LayoutParsingRequest layoutParsingRequest, ResearchDocumentData researchDocumentData, DocumentData documentData) {
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.researchDocumentStorageId(), researchDocumentData);
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.structureFileStorageId(), documentData.getDocumentTreeData());
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.textBlockFileStorageId(), documentData.getAtomicTextBlocks());
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.positionBlockFileStorageId(), documentData.getAtomicPositionBlocks());
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.pageFileStorageId(), documentData.getPages());
}
/**
 * Reads the four JSON parts of a stored document (pages, text blocks, position blocks and
 * document tree) for the current tenant and assembles them into one {@link DocumentData}.
 *
 * @param layoutParsingRequest request providing the storage id of each part
 * @return the reassembled document data
 * @throws IOException declared for callers; propagated from the storage reads if they raise it
 */
public DocumentData readDocumentData(LayoutParsingRequest layoutParsingRequest) throws IOException {
    PageData[] pages = storageService.readJSONObject(TenantContext.getTenantId(), layoutParsingRequest.pageFileStorageId(), PageData[].class);

    AtomicTextBlockData[] textBlocks = storageService.readJSONObject(TenantContext.getTenantId(),
            layoutParsingRequest.textBlockFileStorageId(),
            AtomicTextBlockData[].class);

    AtomicPositionBlockData[] positionBlocks = storageService.readJSONObject(TenantContext.getTenantId(),
            layoutParsingRequest.positionBlockFileStorageId(),
            AtomicPositionBlockData[].class);

    // The document tree is stored under the structure-file id.
    DocumentTreeData documentTree = storageService.readJSONObject(TenantContext.getTenantId(),
            layoutParsingRequest.structureFileStorageId(),
            DocumentTreeData.class);

    var assembled = DocumentData.builder();
    assembled.documentTreeData(documentTree);
    assembled.atomicPositionBlocks(positionBlocks);
    assembled.atomicTextBlocks(textBlocks);
    assembled.pages(pages);
    return assembled.build();
}
@ -166,43 +125,4 @@ public class LayoutParsingStorageService {
}
}
@Observed(name = "LayoutParsingStorageService", contextualName = "store-simplified-text")
public void storeSimplifiedText(LayoutParsingRequest layoutParsingRequest, SimplifiedText simplifiedText) {
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.simplifiedTextStorageId(), simplifiedText);
}
@SneakyThrows
private InputStream getObject(String storageId) {
File tempFile = File.createTempFile("temp", ".data");
storageService.downloadTo(TenantContext.getTenantId(), storageId, tempFile);
Path path = Paths.get(tempFile.getPath());
return Files.newInputStream(path, StandardOpenOption.DELETE_ON_CLOSE);
}
@SneakyThrows
@Observed(name = "LayoutParsingStorageService", contextualName = "store-viewer-document")
public void storeViewerDocument(LayoutParsingRequest layoutParsingRequest, File out) {
try (var in = new FileInputStream(out)) {
storageService.storeObject(TenantContext.getTenantId(), layoutParsingRequest.viewerDocumentStorageId(), in);
}
}
@SneakyThrows
@Observed(name = "LayoutParsingStorageService", contextualName = "store-markdown-file")
public void storeMarkdownFile(String markdownFileStorageId, String markdownContent) {
try (InputStream inputStream = new ByteArrayInputStream(markdownContent.getBytes(StandardCharsets.UTF_8))) {
storageService.storeObject(TenantContext.getTenantId(), markdownFileStorageId, inputStream);
}
}
}

View File

@ -0,0 +1,49 @@
package com.knecon.fforesight.service.layoutparser.processor.adapter;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableCells;
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableServiceResponse;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
/**
 * Adapts a {@link TableServiceResponse} into the per-page table-cell map consumed by the PDF parser.
 */
@Slf4j
@Service
@RequiredArgsConstructor
public class CvTableParsingAdapter {

    /**
     * Groups all table cells of the response by the page number reported in each table's page info.
     *
     * @param tableServiceResponse the table-service result to adapt
     * @return mutable map from page number to the (copied) cells found on that page;
     *         empty when the response contains no table data
     */
    public Map<Integer, List<TableCells>> buildCvParsedTablesPerPage(TableServiceResponse tableServiceResponse) {

        Map<Integer, List<TableCells>> cellsByPage = new HashMap<>();
        for (var tableData : tableServiceResponse.getData()) {
            List<TableCells> cellsOnPage = cellsByPage.computeIfAbsent(tableData.getPageInfo().getNumber(), pageNumber -> new ArrayList<>());
            cellsOnPage.addAll(convertTableCells(tableData.getTableCells()));
        }
        return cellsByPage;
    }


    /**
     * Produces a fresh list holding one copy of every given cell, in the same order.
     */
    private Collection<TableCells> convertTableCells(List<TableCells> tableCells) {

        List<TableCells> copies = new ArrayList<>();
        for (TableCells source : tableCells) {
            copies.add(copyOf(source));
        }
        return copies;
    }


    /**
     * Copies the coordinates and size of a single cell into a new instance.
     */
    private TableCells copyOf(TableCells source) {

        return TableCells.builder()
                .x0(source.getX0())
                .y0(source.getY0())
                .x1(source.getX1())
                .y1(source.getY1())
                .width(source.getWidth())
                .height(source.getHeight())
                .build();
    }

}

View File

@ -1,4 +1,4 @@
package com.knecon.fforesight.service.layoutparser.processor.python_api.adapter;
package com.knecon.fforesight.service.layoutparser.processor.adapter;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
@ -9,11 +9,10 @@ import java.util.Map;
import org.springframework.stereotype.Service;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.ImageType;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.image.ImageServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.ImageType;
import lombok.RequiredArgsConstructor;
@ -21,7 +20,8 @@ import lombok.RequiredArgsConstructor;
@RequiredArgsConstructor
public class ImageServiceResponseAdapter {
public Map<Integer, List<ClassifiedImage>> buildClassifiedImagesPerPage(ImageServiceResponse imageServiceResponse) {
public Map<Integer, List<ClassifiedImage>> buildClassifiedImagesPerPage(ImageServiceResponse imageServiceResponse ) {
Map<Integer, List<ClassifiedImage>> images = new HashMap<>();
imageServiceResponse.getData().forEach(imageMetadata -> {
@ -32,7 +32,7 @@ public class ImageServiceResponseAdapter {
.add(new ClassifiedImage(new Rectangle2D.Double(imageMetadata.getPosition().getX1(),
imageMetadata.getPosition().getY1(),
imageMetadata.getGeometry().getWidth(),
imageMetadata.getGeometry().getHeight()), classification, imageMetadata.isAlpha(), imageMetadata.getPosition().getPageNumber(),imageMetadata.getRepresentation()));
imageMetadata.getGeometry().getHeight()), classification, imageMetadata.isAlpha(), imageMetadata.getPosition().getPageNumber()));
});
// Currently This is a copy but, it will be changed later because i don' t think that we should unclassified images.
@ -44,7 +44,7 @@ public class ImageServiceResponseAdapter {
.add(new ClassifiedImage(new Rectangle2D.Double(imageMetadata.getPosition().getX1(),
imageMetadata.getPosition().getY1(),
imageMetadata.getGeometry().getWidth(),
imageMetadata.getGeometry().getHeight()), classification, imageMetadata.isAlpha(), imageMetadata.getPosition().getPageNumber(),imageMetadata.getRepresentation()));
imageMetadata.getGeometry().getHeight()), classification, imageMetadata.isAlpha(), imageMetadata.getPosition().getPageNumber()));
});
return images;
@ -55,12 +55,11 @@ public class ImageServiceResponseAdapter {
classificationPage.getImages().forEach(image -> {
if (image.getImageType().equals(ImageType.OTHER)) {
for (AbstractPageBlock textblock : classificationPage.getTextBlocks()) {
if (image.getPosition().contains(textblock.getBBoxPdf())) {
classificationPage.getTextBlocks().forEach(textblock -> {
if (image.getPosition().contains(textblock.getMinX(), textblock.getMinY(), textblock.getWidth(), textblock.getHeight())) {
image.setImageType(ImageType.OCR);
return;
}
}
});
}
});
}

View File

@ -0,0 +1,14 @@
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image;
import java.util.HashMap;
import java.util.Map;
import lombok.Data;
/**
 * Classification part of an image's metadata: a label plus a probability map.
 * Accessors are generated by Lombok's {@code @Data}.
 */
@Data
public class Classification {
// Probability per key; presumably keyed by label name - TODO confirm against the image service schema.
private Map<String, Float> probabilities = new HashMap<>();
// The label assigned to the image.
private String label;
}

View File

@ -0,0 +1,11 @@
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image;
import lombok.Data;
/**
 * Geometry-related filter results of an image, split into a size part and a format part.
 * Accessors are generated by Lombok's {@code @Data}.
 */
@Data
public class FilterGeometry {
// Size-related filter result (see ImageSize).
private ImageSize imageSize;
// Format-related filter result (see ImageFormat).
private ImageFormat imageFormat;
}

View File

@ -0,0 +1,12 @@
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image;
import lombok.Data;
/**
 * Filter results attached to an image's metadata: geometry filters, probability filter
 * and an overall pass flag. Accessors are generated by Lombok's {@code @Data}.
 */
@Data
public class Filters {
// Geometry filter results (size and format).
private FilterGeometry geometry;
// Probability filter result.
private Probability probability;
// Overall flag; presumably true when every filter passed - TODO confirm.
private boolean allPassed;
}

View File

@ -0,0 +1,11 @@
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image;
import lombok.Data;
/**
 * Width and height of an image as reported in its metadata.
 * Accessors are generated by Lombok's {@code @Data}.
 */
@Data
public class Geometry {
// Image width; units are not visible here - TODO confirm.
private float width;
// Image height; units are not visible here - TODO confirm.
private float height;
}

View File

@ -0,0 +1,12 @@
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image;
import lombok.Data;
/**
 * Format-related filter result of an image: a quotient and two flags.
 * Accessors are generated by Lombok's {@code @Data}.
 */
@Data
public class ImageFormat {
// Presumably the aspect ratio the flags below are derived from - TODO confirm.
private float quotient;
// Flag reported by the filter: image considered too tall.
private boolean tooTall;
// Flag reported by the filter: image considered too wide.
private boolean tooWide;
}

View File

@ -0,0 +1,14 @@
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image;
import lombok.Data;
/**
 * Metadata of a single image: its classification, position, geometry, filter results
 * and an alpha flag. Accessors are generated by Lombok's {@code @Data}.
 */
@Data
public class ImageMetadata {
// Label and probabilities assigned to the image.
private Classification classification;
// Coordinates and page number of the image.
private Position position;
// Width and height of the image.
private Geometry geometry;
// Filter results computed for the image.
private Filters filters;
// Alpha flag; presumably whether the image has an alpha channel - TODO confirm.
private boolean alpha;
}

View File

@ -1,4 +1,4 @@
package com.knecon.fforesight.service.layoutparser.processor.python_api.model.image;
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image;
import java.util.ArrayList;
import java.util.List;
@ -6,15 +6,9 @@ import java.util.List;
import com.fasterxml.jackson.annotation.JsonAlias;
import com.fasterxml.jackson.annotation.JsonProperty;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class ImageServiceResponse {
private String dossierId;
@ -22,10 +16,8 @@ public class ImageServiceResponse {
@JsonProperty(value = "imageMetadata")
@JsonAlias("data")
@Builder.Default
private List<ImageMetadata> data = new ArrayList<>();
@Builder.Default
private List<ImageMetadata> dataCV = new ArrayList<>();

View File

@ -0,0 +1,12 @@
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image;
import lombok.Data;
/**
 * Size-related filter result of an image: a quotient and two flags.
 * Accessors are generated by Lombok's {@code @Data}.
 */
@Data
public class ImageSize {
// Presumably the size ratio the flags below are derived from - TODO confirm.
private float quotient;
// Flag reported by the filter: image considered too large.
private boolean tooLarge;
// Flag reported by the filter: image considered too small.
private boolean tooSmall;
}

View File

@ -0,0 +1,14 @@
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image;
import lombok.Data;
/**
 * Position of an image: two x coordinates, two y coordinates and the page number.
 * Accessors are generated by Lombok's {@code @Data}.
 */
@Data
public class Position {
// First x coordinate.
private float x1;
// Second x coordinate.
private float x2;
// First y coordinate.
private float y1;
// Second y coordinate.
private float y2;
// Page the image is located on; whether numbering is 0- or 1-based is not visible here - TODO confirm.
private int pageNumber;
}

View File

@ -0,0 +1,10 @@
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image;
import lombok.Data;
/**
 * Probability filter result of an image, consisting of a single flag.
 * Accessors are generated by Lombok's {@code @Data}.
 */
@Data
public class Probability {
// Presumably set when the classification confidence is too low - TODO confirm.
private boolean unconfident;
}

View File

@ -0,0 +1,13 @@
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.table;
import lombok.Data;
/**
 * Page information attached to a table-service result: page number, rotation and page size.
 * Accessors are generated by Lombok's {@code @Data}.
 */
@Data
public class PageInfo {
// Page number of the page the table data belongs to.
private int number;
// Page rotation; unit (presumably degrees) is not visible here - TODO confirm.
private int rotation;
// Page width.
private float width;
// Page height.
private float height;
}

View File

@ -1,14 +1,14 @@
package com.knecon.fforesight.service.layoutparser.processor.python_api.model.table;
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.table;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.RequiredArgsConstructor;
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
@RequiredArgsConstructor
public class PdfTableCell {
private float x0;

View File

@ -1,14 +1,10 @@
package com.knecon.fforesight.service.layoutparser.processor.python_api.model.table;
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.table;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class TableCells {
private float x0;

View File

@ -0,0 +1,14 @@
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.table;
import java.util.ArrayList;
import java.util.List;
import lombok.Data;
/**
 * Table-service result for one page: the page information and the table cells found on it.
 * Accessors are generated by Lombok's {@code @Data}.
 */
@Data
public class TableData {
// Information about the page these cells belong to.
private PageInfo pageInfo;
// Cells found on the page; initialised to an empty list so it is non-null by default.
private List<TableCells> tableCells = new ArrayList<>();
}

View File

@ -1,17 +1,11 @@
package com.knecon.fforesight.service.layoutparser.processor.python_api.model.table;
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.table;
import java.util.ArrayList;
import java.util.List;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class TableServiceResponse {
private String dossierId;
@ -19,7 +13,7 @@ public class TableServiceResponse {
private String operation;
private String targetFileExtension;
private String responseFileExtension;
@Builder.Default
private List<TableData> data = new ArrayList<>();
}

View File

@ -0,0 +1,80 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.model;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * Base type for blocks placed on a page. A block carries an axis-aligned bounding box
 * ({@code minX/maxX/minY/maxY}), an optional classification, its page number and an orientation.
 * Accessors and constructors are generated by Lombok; all fields are excluded from JSON.
 */
@Data
@AllArgsConstructor
@NoArgsConstructor
public abstract class AbstractPageBlock {

    @JsonIgnore
    protected float minX;
    @JsonIgnore
    protected float maxX;
    @JsonIgnore
    protected float minY;
    @JsonIgnore
    protected float maxY;

    @JsonIgnore
    protected PageBlockType classification;
    @JsonIgnore
    protected int page;

    @JsonIgnore
    private Orientation orientation = Orientation.NONE;


    /**
     * @return the textual content of this block
     */
    public abstract String getText();


    /**
     * @return {@code true} only for text blocks whose classification is set and is a headline type
     */
    public boolean isHeadline() {

        if (!(this instanceof TextPageBlock)) {
            return false;
        }
        PageBlockType blockType = this.getClassification();
        return blockType != null && blockType.isHeadline();
    }


    /**
     * Compares this block's box with the PDF coordinates of a text block.
     * NOTE(review): the Y comparisons point the opposite way from the X comparisons (and from
     * {@link #contains(Rectangle)}); presumably a coordinate-system convention - confirm before changing.
     */
    public boolean containsBlock(TextPageBlock other) {

        boolean spansHorizontally = this.minX <= other.getPdfMinX() && this.maxX >= other.getPdfMaxX();
        return spansHorizontally && this.minY >= other.getPdfMinY() && this.maxY <= other.getPdfMaxY();
    }


    /**
     * Compares this block's box with another block's box.
     * NOTE(review): same Y-direction peculiarity as {@link #containsBlock(TextPageBlock)} - confirm intent.
     */
    public boolean contains(AbstractPageBlock other) {

        boolean spansHorizontally = this.minX <= other.minX && this.maxX >= other.maxX;
        return spansHorizontally && this.minY >= other.minY && this.maxY <= other.maxY;
    }


    /**
     * @return {@code true} when the rectangle is on the same page and lies fully inside this block's box,
     *         taking the rectangle's top-left point plus width/height as its extent
     */
    public boolean contains(Rectangle other) {

        if (page != other.getPage()) {
            return false;
        }
        var corner = other.getTopLeft();
        boolean spansHorizontally = this.minX <= corner.getX() && this.maxX >= corner.getX() + other.getWidth();
        return spansHorizontally && this.minY <= corner.getY() && this.maxY >= corner.getY() + other.getHeight();
    }


    /**
     * @return vertical extent of the box ({@code maxY - minY})
     */
    @JsonIgnore
    public float getHeight() {

        return maxY - minY;
    }


    /**
     * @return horizontal extent of the box ({@code maxX - minX})
     */
    @JsonIgnore
    public float getWidth() {

        return maxX - minX;
    }


    /**
     * @return {@code true} when the Y ranges of this block and {@code atc} overlap or touch
     */
    public boolean intersectsY(AbstractPageBlock atc) {

        boolean startsBeforeOtherEnds = this.minY <= atc.getMaxY();
        return startsBeforeOtherEnds && this.maxY >= atc.getMinY();
    }

}

View File

@ -1,13 +1,11 @@
package com.knecon.fforesight.service.layoutparser.processor.model;
package com.knecon.fforesight.service.layoutparser.processor.classification.model;
import java.util.ArrayList;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObjectTree;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTree;
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.model.text.UnclassifiedText;
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutDebugLayer;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.section.SectionGrid;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.StringFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.UnclassifiedText;
import lombok.Data;
import lombok.NoArgsConstructor;
@ -25,12 +23,9 @@ public class ClassificationDocument {
private FloatFrequencyCounter fontSizeCounter = new FloatFrequencyCounter();
private StringFrequencyCounter fontCounter = new StringFrequencyCounter();
private StringFrequencyCounter fontStyleCounter = new StringFrequencyCounter();
private LayoutDebugLayer layoutDebugLayer = new LayoutDebugLayer();
private boolean headlines;
private SectionGrid sectionGrid = new SectionGrid();
private long rulesVersion;
private OutlineObjectTree outlineObjectTree;
private SectionTree sectionTree;
}

View File

@ -1,8 +1,8 @@
package com.knecon.fforesight.service.layoutparser.processor.model;
package com.knecon.fforesight.service.layoutparser.processor.classification.model;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
import lombok.AllArgsConstructor;
import lombok.Data;

View File

@ -1,8 +1,8 @@
package com.knecon.fforesight.service.layoutparser.processor.model;
package com.knecon.fforesight.service.layoutparser.processor.classification.model;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
import lombok.AllArgsConstructor;
import lombok.Data;

View File

@ -1,16 +1,11 @@
package com.knecon.fforesight.service.layoutparser.processor.model;
package com.knecon.fforesight.service.layoutparser.processor.classification.model;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.StringFrequencyCounter;
import lombok.Data;
import lombok.NonNull;
@ -18,16 +13,11 @@ import lombok.RequiredArgsConstructor;
@Data
@RequiredArgsConstructor
public class ClassificationPage {
@NonNull
private List<AbstractPageBlock> textBlocks;
private List<OutlineObject> outlineObjects = new ArrayList<>();
private List<AbstractPageBlock> headlines = new ArrayList<>();
private List<ClassifiedImage> images = new ArrayList<>();
private Rectangle bodyTextFrame;
@ -45,8 +35,4 @@ public class ClassificationPage {
private float pageWidth;
private float pageHeight;
private CleanRulings cleanRulings;
private Map<String, List<Rectangle2D>> markedContentBboxPerType = new HashMap<>();
}

View File

@ -1,18 +1,16 @@
package com.knecon.fforesight.service.layoutparser.processor.model;
package com.knecon.fforesight.service.layoutparser.processor.classification.model;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.TablePageBlock;
import lombok.Data;
import lombok.NoArgsConstructor;
@Data
@NoArgsConstructor
@Deprecated
public class ClassificationSection {
private List<AbstractPageBlock> pageBlocks = new ArrayList<>();
@ -31,10 +29,4 @@ public class ClassificationSection {
return tables;
}
public List<AbstractPageBlock> getNonEmptyPageBlocks() {
return pageBlocks.stream().filter(pageBlock -> !pageBlock.isEmpty()).collect(Collectors.toList());
}
}

View File

@ -0,0 +1,77 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.model;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import lombok.Getter;
@Getter
public class FloatFrequencyCounter {

    // Histogram of float values: maps each observed value to the number of times it was counted.
    Map<Float, Integer> countPerValue = new HashMap<>();


    /**
     * Counts one more occurrence of {@code value}.
     */
    public void add(float value) {

        int seenSoFar = countPerValue.containsKey(value) ? countPerValue.get(value) : 0;
        countPerValue.put(value, seenSoFar + 1);
    }


    /**
     * Adds every count of {@code otherCounter} onto this counter's count for the same value.
     */
    public void addAll(Map<Float, Integer> otherCounter) {

        otherCounter.forEach((value, occurrences) -> {
            Integer total = occurrences;
            if (countPerValue.containsKey(value)) {
                total = countPerValue.get(value) + occurrences;
            }
            countPerValue.put(value, total);
        });
    }


    /**
     * @return the value with the highest count, or {@code null} when nothing has been counted;
     *         on equal counts the entry encountered last in map iteration order wins
     */
    public Float getMostPopular() {

        return countPerValue.entrySet()
                .stream()
                .reduce((best, next) -> next.getValue() >= best.getValue() ? next : best)
                .map(Map.Entry::getKey)
                .orElse(null);
    }


    /**
     * @return all counted values strictly greater than the most popular one, largest first
     */
    public List<Float> getHighterThanMostPopular() {

        Float mostPopular = getMostPopular();
        // mostPopular is only unboxed inside the filter, so an empty counter yields an empty list.
        return countPerValue.keySet()
                .stream()
                .filter(value -> value > mostPopular)
                .sorted(Collections.reverseOrder())
                .collect(Collectors.toList());
    }


    /**
     * @return the largest counted value, or {@code null} when nothing has been counted
     */
    public Float getHighest() {

        Float highest = null;
        for (Float value : countPerValue.keySet()) {
            highest = (highest == null || value > highest) ? value : highest;
        }
        return highest;
    }

}

View File

@ -0,0 +1,8 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.model;
/**
 * Orientation that can be attached to a page block.
 */
public enum Orientation {
// No orientation.
NONE,
// NOTE(review): exact meaning of LEFT/RIGHT (e.g. rotation vs. alignment) is not visible here - confirm.
LEFT,
RIGHT
}

View File

@ -1,4 +1,4 @@
package com.knecon.fforesight.service.layoutparser.processor.model;
package com.knecon.fforesight.service.layoutparser.processor.classification.model;
public enum PageBlockType {
H1,
@ -9,14 +9,12 @@ public enum PageBlockType {
H6,
HEADER,
FOOTER,
TITLE,
PARAGRAPH,
PARAGRAPH_BOLD,
PARAGRAPH_ITALIC,
PARAGRAPH_UNKNOWN,
OTHER,
TABLE_OF_CONTENTS_HEADLINE,
TABLE_OF_CONTENTS_ITEM,
LIST_ITEM,
TABLE;
@ -33,21 +31,8 @@ public enum PageBlockType {
}
public static int getHeadlineNumber(PageBlockType pageBlockType) {
return switch (pageBlockType) {
case H1, TABLE_OF_CONTENTS_HEADLINE -> 1;
case H2 -> 2;
case H3 -> 3;
case H4 -> 4;
case H5 -> 5;
default -> 6;
};
}
public boolean isHeadline() {
return this.equals(H1) || this.equals(H2) || this.equals(H3) || this.equals(H4) || this.equals(H5) || this.equals(H6) || this.equals(TABLE_OF_CONTENTS_HEADLINE);
return this.equals(H1) || this.equals(H2) || this.equals(H3) || this.equals(H4) || this.equals(H5) || this.equals(H6);
}
}

View File

@ -0,0 +1,25 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.model.image;
import java.awt.geom.Rectangle2D;
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.ImageType;
import lombok.Data;
import lombok.NonNull;
import lombok.RequiredArgsConstructor;
/**
 * Mutable data holder for an image: its position rectangle, its {@link ImageType},
 * the page it belongs to and two boolean flags.
 * <p>
 * Accessors, equals/hashCode/toString and the constructor are generated by Lombok
 * ({@code @Data}, {@code @RequiredArgsConstructor}).
 */
@Data
@RequiredArgsConstructor
public class ClassifiedImage {
// Rectangle describing where the image is located.
// NOTE(review): the coordinate system of this rectangle is not visible here — confirm with the producer.
@NonNull
private Rectangle2D position;
// Type assigned to the image.
@NonNull
private ImageType imageType;
// Not marked @NonNull and not final, so it is not a constructor argument; defaults to false.
private boolean isAppendedToSection;
// NOTE(review): @NonNull cannot trigger a null check on a primitive; presumably it is here so that
// @RequiredArgsConstructor includes this field as a constructor argument — confirm before removing.
@NonNull
private boolean hasTransparency;
// NOTE(review): same remark as for hasTransparency regarding @NonNull on a primitive.
// Whether page numbers are 0- or 1-based is not visible here.
@NonNull
private int page;
}

View File

@ -1,16 +1,13 @@
package com.knecon.fforesight.service.layoutparser.processor.model.table;
package com.knecon.fforesight.service.layoutparser.processor.classification.model.table;
import java.awt.geom.AffineTransform;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.TextNormalizationUtilities;
import lombok.Data;
import lombok.EqualsAndHashCode;
@ -20,7 +17,7 @@ import lombok.NoArgsConstructor;
@Data
@EqualsAndHashCode(callSuper = true)
@NoArgsConstructor
public class Cell extends BoundingBox {
public class Cell extends Rectangle {
private List<TextPageBlock> textBlocks = new ArrayList<>();
@ -35,24 +32,7 @@ public class Cell extends BoundingBox {
public Cell(Point2D topLeft, Point2D bottomRight) {
this.bBoxPdf = new Rectangle2D.Double(topLeft.getX(), topLeft.getY(), (bottomRight.getX() - topLeft.getX()), (bottomRight.getY() - topLeft.getY()));
this.bBox = bBoxPdf;
}
public Cell(Rectangle2D bBoxInitialUserSpace, AffineTransform initialUserSpaceToJava) {
this.bBoxPdf = bBoxInitialUserSpace;
this.bBox = initialUserSpaceToJava.createTransformedShape(bBoxInitialUserSpace).getBounds2D();
}
public static Cell copy(Cell cell) {
Cell copy = new Cell();
copy.bBoxPdf = cell.bBoxPdf;
copy.bBox = cell.bBox;
return copy;
super((float) topLeft.getY(), (float) topLeft.getX(), (float) (bottomRight.getX() - topLeft.getX()), (float) (bottomRight.getY() - topLeft.getY()));
}
@ -68,12 +48,12 @@ public class Cell extends BoundingBox {
StringBuilder sb = new StringBuilder();
Iterator<TextPageBlock> itty = textBlocks.iterator();
Word previous = null;
TextPositionSequence previous = null;
while (itty.hasNext()) {
TextPageBlock textBlock = itty.next();
for (Word word : textBlock.getWords()) {
for (TextPositionSequence word : textBlock.getSequences()) {
if (previous != null) {
if (Math.abs(previous.getMaxYDirAdj() - word.getMaxYDirAdj()) > word.getTextHeight()) {
sb.append('\n');
@ -87,7 +67,7 @@ public class Cell extends BoundingBox {
}
return TextNormalizationUtilities.cleanString(sb.toString());
return TextNormalizationUtilities.removeHyphenLineBreaks(sb.toString()).replaceAll("\n", " ").replaceAll(" {2}", " ");
}

View File

@ -1,4 +1,4 @@
package com.knecon.fforesight.service.layoutparser.processor.model.table;
package com.knecon.fforesight.service.layoutparser.processor.classification.model.table;
import lombok.RequiredArgsConstructor;
import lombok.Value;

View File

@ -0,0 +1,15 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.model.table;
import java.util.List;
import lombok.Builder;
import lombok.Data;
/**
 * Holder for two lists of {@link Ruling}s, one named horizontal and one named vertical.
 * <p>
 * Accessors and a builder are generated by Lombok ({@code @Data}, {@code @Builder}).
 * NOTE(review): nothing in this class enforces that the rulings in each list actually have
 * the named direction — presumably the producer guarantees it; confirm.
 */
@Data
@Builder
public class CleanRulings {
// Rulings classified as horizontal.
List<Ruling> horizontal;
// Rulings classified as vertical.
List<Ruling> vertical;
}

View File

@ -0,0 +1,218 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.model.table;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.util.Comparator;
import java.util.List;
/**
 * Float-precision rectangle that adds top/left/bottom/right accessors and overlap helpers
 * on top of {@link Rectangle2D.Float}.
 * <p>
 * "Top" is the minimum y and "bottom" the maximum y of the rectangle; "left" is the minimum x
 * and "right" the maximum x.
 * <p>
 * Note: inside this class the simple name {@code Float} refers to the inherited nested class
 * {@link Rectangle2D.Float}, which is why {@code java.lang.Float} is always fully qualified.
 */
@SuppressWarnings("all")
public class Rectangle extends Rectangle2D.Float {

    protected static final float VERTICAL_COMPARISON_THRESHOLD = 0.4f;

    /**
     * Ill-defined comparator, from when Rectangle was Comparable.
     * <p>
     * see https://github.com/tabulapdf/tabula-java/issues/116
     *
     * @deprecated with no replacement
     */
    @Deprecated
    public static final Comparator<Rectangle> ILL_DEFINED_ORDER = new Comparator<Rectangle>() {
        @Override
        public int compare(Rectangle o1, Rectangle o2) {

            if (o1.equals(o2)) {
                return 0;
            }
            if (o1.verticalOverlap(o2) > VERTICAL_COMPARISON_THRESHOLD) {
                // Sufficient vertical overlap: order by x, reversed when both report -1 from isLtrDominant().
                return o1.isLtrDominant() == -1 && o2.isLtrDominant() == -1 ? -java.lang.Double.compare(o1.getX(), o2.getX()) : java.lang.Double.compare(o1.getX(), o2.getX());
            } else {
                return java.lang.Float.compare(o1.getBottom(), o2.getBottom());
            }
        }
    };


    /**
     * Creates a rectangle with all coordinates set to zero.
     */
    public Rectangle() {

        super();
    }


    /**
     * Creates a rectangle from its top-left corner and size.
     * Note the argument order: {@code top} (y) comes before {@code left} (x).
     *
     * @param top    y coordinate of the upper edge
     * @param left   x coordinate of the left edge
     * @param width  width of the rectangle
     * @param height height of the rectangle
     */
    public Rectangle(float top, float left, float width, float height) {

        super();
        this.setRect(left, top, width, height);
    }


    /**
     * Computes the minimum bounding box of the given rectangles.
     *
     * @param rectangles the rectangles to enclose
     * @return minimum bounding box that contains all the rectangles; an empty rectangle at the
     * origin if {@code rectangles} is empty
     */
    public static Rectangle boundingBoxOf(List<? extends Rectangle> rectangles) {

        if (rectangles.isEmpty()) {
            // There is nothing to enclose; avoid building a rectangle from the sentinel values below.
            return new Rectangle();
        }

        float minx = java.lang.Float.MAX_VALUE;
        float miny = java.lang.Float.MAX_VALUE;
        // Float.MIN_VALUE is the smallest POSITIVE float, so it must not be used as the seed for a
        // maximum: rectangles lying entirely at negative coordinates would never replace it.
        float maxx = -java.lang.Float.MAX_VALUE;
        float maxy = -java.lang.Float.MAX_VALUE;

        for (Rectangle r : rectangles) {
            minx = (float) Math.min(r.getMinX(), minx);
            miny = (float) Math.min(r.getMinY(), miny);
            maxx = (float) Math.max(r.getMaxX(), maxx);
            maxy = (float) Math.max(r.getMaxY(), maxy);
        }
        return new Rectangle(miny, minx, maxx - minx, maxy - miny);
    }


    /**
     * Compares this rectangle to another one using {@link #ILL_DEFINED_ORDER}.
     *
     * @param other the rectangle to compare with
     * @return the result of {@link #ILL_DEFINED_ORDER}
     */
    public int compareTo(Rectangle other) {

        return ILL_DEFINED_ORDER.compare(this, other);
    }


    // I'm bad at Java and need this for fancy sorting in
    // technology.tabula.TextChunk.
    public int isLtrDominant() {

        return 0;
    }


    /**
     * @return width multiplied by height
     */
    public float getArea() {

        return this.width * this.height;
    }


    /**
     * @param other the rectangle to compare with
     * @return length of the shared vertical extent of both rectangles, or 0 if there is none
     */
    public float verticalOverlap(Rectangle other) {

        return Math.max(0, Math.min(this.getBottom(), other.getBottom()) - Math.max(this.getTop(), other.getTop()));
    }


    /**
     * @param other the rectangle to compare with
     * @return {@code true} if {@link #verticalOverlap(Rectangle)} is greater than zero
     */
    public boolean verticallyOverlaps(Rectangle other) {

        return verticalOverlap(other) > 0;
    }


    /**
     * @param other the rectangle to compare with
     * @return length of the shared horizontal extent of both rectangles, or 0 if there is none
     */
    public float horizontalOverlap(Rectangle other) {

        return Math.max(0, Math.min(this.getRight(), other.getRight()) - Math.max(this.getLeft(), other.getLeft()));
    }


    /**
     * @param other the rectangle to compare with
     * @return {@code true} if {@link #horizontalOverlap(Rectangle)} is greater than zero
     */
    public boolean horizontallyOverlaps(Rectangle other) {

        return horizontalOverlap(other) > 0;
    }


    /**
     * Computes the vertical overlap relative to the height of the smaller of the two rectangles.
     * <p>
     * The four branches cover: other overlapping the upper part of this, other overlapping the
     * lower part of this, other vertically contained in this, and this vertically contained in other.
     * If the smaller height is zero the division can yield NaN or infinity.
     *
     * @param other the rectangle to compare with
     * @return the overlap ratio, or 0 if none of the four cases applies
     */
    public float verticalOverlapRatio(Rectangle other) {

        float rv = 0, delta = Math.min(this.getBottom() - this.getTop(), other.getBottom() - other.getTop());

        if (other.getTop() <= this.getTop() && this.getTop() <= other.getBottom() && other.getBottom() <= this.getBottom()) {
            rv = (other.getBottom() - this.getTop()) / delta;
        } else if (this.getTop() <= other.getTop() && other.getTop() <= this.getBottom() && this.getBottom() <= other.getBottom()) {
            rv = (this.getBottom() - other.getTop()) / delta;
        } else if (this.getTop() <= other.getTop() && other.getTop() <= other.getBottom() && other.getBottom() <= this.getBottom()) {
            rv = (other.getBottom() - other.getTop()) / delta;
        } else if (other.getTop() <= this.getTop() && this.getTop() <= this.getBottom() && this.getBottom() <= other.getBottom()) {
            rv = (this.getBottom() - this.getTop()) / delta;
        }

        return rv;
    }


    /**
     * Computes intersection area divided by union area of both rectangles.
     * If the union area is zero the division yields NaN.
     *
     * @param other the rectangle to compare with
     * @return intersection area divided by union area
     */
    public float overlapRatio(Rectangle other) {

        double intersectionWidth = Math.max(0, Math.min(this.getRight(), other.getRight()) - Math.max(this.getLeft(), other.getLeft()));
        double intersectionHeight = Math.max(0, Math.min(this.getBottom(), other.getBottom()) - Math.max(this.getTop(), other.getTop()));
        double intersectionArea = Math.max(0, intersectionWidth * intersectionHeight);
        double unionArea = this.getArea() + other.getArea() - intersectionArea;

        return (float) (intersectionArea / unionArea);
    }


    /**
     * Grows this rectangle in place so that it also covers {@code other}.
     *
     * @param other the rectangle to include
     * @return this rectangle
     */
    public Rectangle merge(Rectangle other) {

        this.setRect(this.createUnion(other));
        return this;
    }


    public float getTop() {

        return (float) this.getMinY();
    }


    /**
     * Moves the upper edge; the height is adjusted so that the lower edge stays where it is.
     */
    public void setTop(float top) {

        float deltaHeight = top - this.y;
        this.setRect(this.x, top, this.width, this.height - deltaHeight);
    }


    public float getRight() {

        return (float) this.getMaxX();
    }


    /**
     * Moves the right edge by changing the width; the left edge stays where it is.
     */
    public void setRight(float right) {

        this.setRect(this.x, this.y, right - this.x, this.height);
    }


    public float getLeft() {

        return (float) this.getMinX();
    }


    /**
     * Moves the left edge; the width is adjusted so that the right edge stays where it is.
     */
    public void setLeft(float left) {

        float deltaWidth = left - this.x;
        this.setRect(left, this.y, this.width - deltaWidth, this.height);
    }


    public float getBottom() {

        return (float) this.getMaxY();
    }


    /**
     * Moves the lower edge by changing the height; the upper edge stays where it is.
     */
    public void setBottom(float bottom) {

        this.setRect(this.x, this.y, this.width, bottom - this.y);
    }


    /**
     * @return the four corners in the order (left, top), (right, top), (right, bottom), (left, bottom)
     */
    public Point2D[] getPoints() {

        return new Point2D[]{new Point2D.Float(this.getLeft(), this.getTop()), new Point2D.Float(this.getRight(), this.getTop()), new Point2D.Float(this.getRight(),
                this.getBottom()), new Point2D.Float(this.getLeft(), this.getBottom())};
    }


    /**
     * @return the superclass representation with the bottom and right coordinates appended
     */
    @Override
    public String toString() {

        StringBuilder sb = new StringBuilder();
        String s = super.toString();
        // Drop the closing bracket of the superclass output so the extra fields can be appended.
        sb.append(s.substring(0, s.length() - 1));
        sb.append(String.format(",bottom=%f,right=%f]", this.getBottom(), this.getRight()));
        return sb.toString();
    }

}

View File

@ -0,0 +1,437 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.model.table;
import java.awt.geom.Line2D;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.Formatter;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.CohenSutherlandClipping;
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.DoubleComparisons;
import lombok.extern.slf4j.Slf4j;
/**
 * A straight line segment ("ruling") with helpers for axis-aligned lines: orientation checks,
 * start/end accessors along the line direction, expansion, clipping and intersection detection.
 * <p>
 * {@code getTop()/getLeft()} return the coordinates of the first point and
 * {@code getBottom()/getRight()} those of the second point; the class does not reorder the points.
 * NOTE(review): callers presumably create rulings with the first point at the top/left — confirm.
 * <p>
 * Note: inside this class the simple name {@code Float} refers to the inherited nested class
 * {@link Line2D.Float}, which is why {@code java.lang.Float}/{@code java.lang.Double} are fully qualified.
 */
@Slf4j
@SuppressWarnings("all")
public class Ruling extends Line2D.Float {

    // Amount by which rulings are lengthened at both ends before intersection tests.
    private static final int PERPENDICULAR_PIXEL_EXPAND_AMOUNT = 2;


    /**
     * Creates a ruling from the first point to the second point.
     */
    public Ruling(Point2D p1, Point2D p2) {

        super(p1, p2);
    }


    /**
     * Keeps only the rulings that intersect the area and clips each of them to it.
     *
     * @param rulings the candidate rulings
     * @param area    the clipping area
     * @return a new list with the clipped rulings
     */
    public static List<Ruling> cropRulingsToArea(List<Ruling> rulings, Rectangle2D area) {

        ArrayList<Ruling> rv = new ArrayList<>();
        for (Ruling r : rulings) {
            if (r.intersects(area)) {
                rv.add(r.intersect(area));
            }
        }
        return rv;
    }


    // log(n) implementation of find_intersections
    // based on http://people.csail.mit.edu/indyk/6.838-old/handouts/lec2.pdf

    /**
     * Finds the intersection points between horizontal and vertical rulings with a sweep from
     * left to right: horizontal rulings are "open" between their (expanded) left and right ends,
     * and every vertical ruling is tested against the currently open horizontals.
     * <p>
     * NOTE(review): the set of open horizontals is keyed by {@code getTop()} only, so two
     * horizontals with the same top that are open at the same time share one entry — verify
     * that callers merge such rulings beforehand.
     *
     * @param horizontals the horizontal rulings
     * @param verticals   the vertical rulings
     * @return map from intersection point (ordered by y, then x) to the expanded horizontal and
     * vertical ruling that meet there
     */
    public static Map<Point2D, Ruling[]> findIntersections(List<Ruling> horizontals, List<Ruling> verticals) {

        // Sweep event: the start or end of a horizontal ruling, or a vertical ruling.
        class SortObject {

            protected SOType type;
            protected float position;
            protected Ruling ruling;


            public SortObject(SOType type, float position, Ruling ruling) {

                this.type = type;
                this.position = position;
                this.ruling = ruling;
            }

        }

        List<SortObject> sos = new ArrayList<>();

        // Horizontals that are currently crossed by the sweep position.
        TreeMap<Ruling, Boolean> tree = new TreeMap<>(new Comparator<Ruling>() {
            @Override
            public int compare(Ruling o1, Ruling o2) {

                return java.lang.Double.compare(o1.getTop(), o2.getTop());
            }
        });

        TreeMap<Point2D, Ruling[]> rv = new TreeMap<>(new Comparator<Point2D>() {
            @Override
            public int compare(Point2D o1, Point2D o2) {

                if (o1.getY() > o2.getY()) {
                    return 1;
                }
                if (o1.getY() < o2.getY()) {
                    return -1;
                }
                if (o1.getX() > o2.getX()) {
                    return 1;
                }
                if (o1.getX() < o2.getX()) {
                    return -1;
                }
                return 0;
            }
        });

        for (Ruling h : horizontals) {
            sos.add(new SortObject(SOType.HLEFT, h.getLeft() - PERPENDICULAR_PIXEL_EXPAND_AMOUNT, h));
            sos.add(new SortObject(SOType.HRIGHT, h.getRight() + PERPENDICULAR_PIXEL_EXPAND_AMOUNT, h));
        }

        for (Ruling v : verticals) {
            sos.add(new SortObject(SOType.VERTICAL, v.getLeft(), v));
        }

        Collections.sort(sos, new Comparator<SortObject>() {
            @Override
            public int compare(SortObject a, SortObject b) {

                int rv;
                if (DoubleComparisons.feq(a.position, b.position)) {
                    // At the same position a horizontal is opened before and closed after a vertical is tested.
                    if (a.type == SOType.VERTICAL && b.type == SOType.HLEFT) {
                        rv = 1;
                    } else if (a.type == SOType.VERTICAL && b.type == SOType.HRIGHT) {
                        rv = -1;
                    } else if (a.type == SOType.HLEFT && b.type == SOType.VERTICAL) {
                        rv = -1;
                    } else if (a.type == SOType.HRIGHT && b.type == SOType.VERTICAL) {
                        rv = 1;
                    } else {
                        rv = java.lang.Double.compare(a.position, b.position);
                    }
                } else {
                    return java.lang.Double.compare(a.position, b.position);
                }
                return rv;
            }
        });

        for (SortObject so : sos) {
            switch (so.type) {
                case VERTICAL:
                    for (Map.Entry<Ruling, Boolean> h : tree.entrySet()) {
                        try {
                            Point2D i = h.getKey().intersectionPoint(so.ruling);
                            if (i == null) {
                                continue;
                            }
                            rv.put(i, new Ruling[]{h.getKey().expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT), so.ruling.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT)});
                        } catch (UnsupportedOperationException e) {
                            log.info("Some line are oblique, ignoring...");
                            continue;
                        }
                    }
                    break;
                case HRIGHT:
                    tree.remove(so.ruling);
                    break;
                case HLEFT:
                    tree.put(so.ruling, true);
                    break;
            }
        }

        return rv;

    }


    /**
     * @return {@code true} if the line has a non-zero length and both x coordinates are equal
     * according to {@code DoubleComparisons.feq}
     */
    public boolean vertical() {

        return this.length() > 0 && DoubleComparisons.feq(this.x1, this.x2); //diff < ORIENTATION_CHECK_THRESHOLD;
    }


    /**
     * @return {@code true} if the line has a non-zero length and both y coordinates are equal
     * according to {@code DoubleComparisons.feq}
     */
    public boolean horizontal() {

        return this.length() > 0 && DoubleComparisons.feq(this.y1, this.y2); //diff < ORIENTATION_CHECK_THRESHOLD;
    }

    // attributes that make sense only for non-oblique lines
    // these are used to have a single collapse method (in page, currently)


    /**
     * @return {@code true} if the line is neither vertical nor horizontal
     */
    public boolean oblique() {

        return !(this.vertical() || this.horizontal());
    }


    /**
     * @return the coordinate perpendicular to the line direction: left for vertical lines, top otherwise
     * @throws UnsupportedOperationException if the line is oblique
     */
    public float getPosition() {

        if (this.oblique()) {
            throw new UnsupportedOperationException();
        }
        return this.vertical() ? this.getLeft() : this.getTop();
    }


    /**
     * @return the start coordinate along the line direction: top for vertical lines, left otherwise
     * @throws UnsupportedOperationException if the line is oblique
     */
    public float getStart() {

        if (this.oblique()) {
            throw new UnsupportedOperationException();
        }
        return this.vertical() ? this.getTop() : this.getLeft();
    }


    /**
     * Sets the start coordinate along the line direction (top for vertical lines, left otherwise).
     *
     * @throws UnsupportedOperationException if the line is oblique
     */
    public void setStart(float v) {

        if (this.oblique()) {
            throw new UnsupportedOperationException();
        }
        if (this.vertical()) {
            this.setTop(v);
        } else {
            this.setLeft(v);
        }
    }


    /**
     * @return the end coordinate along the line direction: bottom for vertical lines, right otherwise
     * @throws UnsupportedOperationException if the line is oblique
     */
    public float getEnd() {

        if (this.oblique()) {
            throw new UnsupportedOperationException();
        }
        return this.vertical() ? this.getBottom() : this.getRight();
    }


    /**
     * Sets the end coordinate along the line direction (bottom for vertical lines, right otherwise).
     *
     * @throws UnsupportedOperationException if the line is oblique
     */
    public void setEnd(float v) {

        if (this.oblique()) {
            throw new UnsupportedOperationException();
        }
        if (this.vertical()) {
            this.setBottom(v);
        } else {
            this.setRight(v);
        }
    }


    /**
     * Sets start and end along the line direction in one call.
     *
     * @throws UnsupportedOperationException if the line is oblique
     */
    public void setStartEnd(float start, float end) {

        if (this.oblique()) {
            throw new UnsupportedOperationException();
        }
        if (this.vertical()) {
            this.setTop(start);
            this.setBottom(end);
        } else {
            this.setLeft(start);
            this.setRight(end);
        }
    }


    /**
     * Compares {@code this.vertical()} with {@code other.horizontal()}.
     * NOTE(review): for a horizontal line and a vertical argument both sides are {@code false},
     * so the result is {@code true}; the same holds for two oblique lines — confirm this is intended.
     */
    public boolean perpendicularTo(Ruling other) {

        return this.vertical() == other.horizontal();
    }


    /**
     * Checks whether this ruling intersects another one, allowing for small gaps.
     *
     * @param another                        the other ruling
     * @param colinearOrParallelExpandAmount expansion applied to both rulings when they are not
     *                                       perpendicular according to {@link #perpendicularTo(Ruling)}
     * @return {@code true} if the rulings intersect directly or after expansion
     */
    public boolean nearlyIntersects(Ruling another, int colinearOrParallelExpandAmount) {

        if (this.intersectsLine(another)) {
            return true;
        }

        boolean rv = false;

        if (this.perpendicularTo(another)) {
            rv = this.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT).intersectsLine(another);
        } else {
            rv = this.expand(colinearOrParallelExpandAmount).intersectsLine(another.expand(colinearOrParallelExpandAmount));
        }

        return rv;
    }


    /**
     * @return the Euclidean distance between both end points
     */
    public double length() {

        return Math.sqrt(Math.pow(this.x1 - this.x2, 2) + Math.pow(this.y1 - this.y2, 2));
    }


    /**
     * Clips this ruling to the given rectangle.
     *
     * @param clip the clipping rectangle
     * @return a new clipped ruling if the clipper reports success, otherwise this ruling unchanged
     */
    public Ruling intersect(Rectangle2D clip) {

        Float clipee = (Float) this.clone();
        boolean clipped = new CohenSutherlandClipping(clip).clip(clipee);

        if (clipped) {
            return new Ruling(clipee.getP1(), clipee.getP2());
        } else {
            return this;
        }
    }


    /**
     * Returns a copy whose start is moved back and whose end is moved forward by {@code amount}.
     * For oblique lines a warning is logged and an unmodified copy is returned.
     *
     * @param amount the expansion applied at each end
     * @return the expanded copy
     */
    public Ruling expand(float amount) {

        Ruling r = (Ruling) this.clone();
        try {
            r.setStart(this.getStart() - amount);
            r.setEnd(this.getEnd() + amount);
        } catch (UnsupportedOperationException e) {
            log.warn("Could not expand ruling!");
        }
        return r;
    }


    /**
     * Computes the intersection point of this ruling with another one after expanding both.
     *
     * @param other the other ruling
     * @return the point (x of the vertical ruling, y of the horizontal ruling), or {@code null} if
     * the expanded rulings do not intersect or are not one horizontal and one vertical line
     */
    public Point2D intersectionPoint(Ruling other) {

        Ruling this_l = this.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT);
        Ruling other_l = other.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT);
        Ruling horizontal, vertical;

        if (!this_l.intersectsLine(other_l)) {
            return null;
        }

        if (this_l.horizontal() && other_l.vertical()) {
            horizontal = this_l;
            vertical = other_l;
        } else if (this_l.vertical() && other_l.horizontal()) {
            vertical = this_l;
            horizontal = other_l;
        } else {
            log.warn("lines must be orthogonal, vertical and horizontal");
            return null;
        }
        return new Point2D.Float(vertical.getLeft(), horizontal.getTop());
    }


    /**
     * Two rulings are equal if their first points are equal and their second points are equal.
     */
    @Override
    public boolean equals(Object other) {

        if (this == other) {
            return true;
        }

        if (!(other instanceof Ruling)) {
            return false;
        }

        Ruling o = (Ruling) other;
        return this.getP1().equals(o.getP1()) && this.getP2().equals(o.getP2());
    }


    /**
     * Hash code derived from the same four coordinates that {@link #equals(Object)} compares, so
     * that equal rulings have equal hash codes.
     * <p>
     * Because rulings are mutable, a ruling must not be modified while it is stored in a
     * hash-based collection.
     */
    @Override
    public int hashCode() {

        // Adding 0.0f turns -0.0f into 0.0f; equals() treats both as the same coordinate.
        int result = java.lang.Float.hashCode(this.x1 + 0.0f);
        result = 31 * result + java.lang.Float.hashCode(this.y1 + 0.0f);
        result = 31 * result + java.lang.Float.hashCode(this.x2 + 0.0f);
        result = 31 * result + java.lang.Float.hashCode(this.y2 + 0.0f);
        return result;
    }


    /**
     * @return the y coordinate of the first point
     */
    public float getTop() {

        return this.y1;
    }


    public void setTop(float v) {

        setLine(this.getLeft(), v, this.getRight(), this.getBottom());
    }


    /**
     * @return the x coordinate of the first point
     */
    public float getLeft() {

        return this.x1;
    }


    public void setLeft(float v) {

        setLine(v, this.getTop(), this.getRight(), this.getBottom());
    }


    /**
     * @return the y coordinate of the second point
     */
    public float getBottom() {

        return this.y2;
    }


    public void setBottom(float v) {

        setLine(this.getLeft(), this.getTop(), this.getRight(), v);
    }


    /**
     * @return the x coordinate of the second point
     */
    public float getRight() {

        return this.x2;
    }


    public void setRight(float v) {

        setLine(this.getLeft(), this.getTop(), v, this.getBottom());
    }


    /**
     * @return right minus left; negative if the second point lies left of the first
     */
    public float getWidth() {

        return this.getRight() - this.getLeft();
    }


    /**
     * @return bottom minus top; negative if the second point lies above the first
     */
    public float getHeight() {

        return this.getBottom() - this.getTop();
    }


    /**
     * @return the direction from the first to the second point in degrees, normalized to be non-negative
     */
    public double getAngle() {

        double angle = Math.toDegrees(Math.atan2(this.getP2().getY() - this.getP1().getY(), this.getP2().getX() - this.getP1().getX()));

        if (angle < 0) {
            angle += 360;
        }
        return angle;
    }


    @Override
    public String toString() {

        StringBuilder sb = new StringBuilder();
        Formatter formatter = new Formatter(sb);
        String rv = formatter.format("%s[minX=%f minY=%f maxX=%f maxY=%f]", this.getClass().toString(), this.x1, this.y1, this.x2, this.y2).toString();
        formatter.close();
        return rv;
    }


    // Event types of the sweep in findIntersections.
    private enum SOType {
        VERTICAL,
        HRIGHT,
        HLEFT
    }

}

View File

@ -1,18 +1,16 @@
package com.knecon.fforesight.service.layoutparser.processor.model.table;
package com.knecon.fforesight.service.layoutparser.processor.classification.model.table;
import java.awt.geom.Point2D;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.TreeMap;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
import lombok.Getter;
import lombok.Setter;
@ -21,8 +19,7 @@ import lombok.extern.slf4j.Slf4j;
@Slf4j
public class TablePageBlock extends AbstractPageBlock {
public static final double CELL_AREA_CONTAINED_THRESHOLD = 0.98;
private final TreeMap<CellPosition, Cell> cellTreeMap = new TreeMap<>();
private final TreeMap<CellPosition, Cell> cells = new TreeMap<>();
private final int rotation;
@Getter
@ -31,28 +28,20 @@ public class TablePageBlock extends AbstractPageBlock {
private int unrotatedRowCount;
private int unrotatedColCount;
private List<List<Cell>> rows;
@Getter
@Setter
private List<Cell> cells;
public TablePageBlock(List<Cell> cells, int rotation) {
public TablePageBlock(List<Cell> cells, Rectangle area, int rotation) {
setToBBoxOfComponents(cells);
this.cells = cells;
addCells(cells);
minX = area.getLeft();
minY = area.getBottom();
maxX = area.getRight();
maxY = area.getTop();
classification = PageBlockType.TABLE;
this.rotation = rotation;
}
@Override
public boolean isEmpty() {
return getColCount() == 0 || getRowCount() == 0;
}
public List<List<Cell>> getRows() {
if (rows == null) {
@ -83,17 +72,14 @@ public class TablePageBlock extends AbstractPageBlock {
public int getColCount() {
return getRows().stream()
.mapToInt(List::size)
.max()
.orElse(0);
return getRows().stream().mapToInt(List::size).max().orElse(0);
}
/**
* Detect header cells (either first row or first column):
* Column is marked as header if originalCell text is bold and row originalCell text is not bold.
* Column is marked as header if cell text is bold and row cell text is not bold.
* Defaults to row.
*/
private void computeHeaders() {
@ -101,7 +87,7 @@ public class TablePageBlock extends AbstractPageBlock {
if (rows == null) {
rows = computeRows();
}
// A bold originalCell is a header originalCell as long as every originalCell to the left/top is bold, too
// A bold cell is a header cell as long as every cell to the left/top is bold, too
// we move from left to right and top to bottom
for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) {
List<Cell> rowCells = rows.get(rowIndex);
@ -126,8 +112,7 @@ public class TablePageBlock extends AbstractPageBlock {
List<Cell> cellsToTheTop = new ArrayList<>();
for (int i = 0; i < rowIndex; i++) {
try {
cellsToTheTop.add(rows.get(i)
.get(colIndex));
cellsToTheTop.add(rows.get(i).get(colIndex));
} catch (IndexOutOfBoundsException e) {
log.debug("No cell {} in row {}, ignoring.", colIndex, rowIndex);
}
@ -142,8 +127,7 @@ public class TablePageBlock extends AbstractPageBlock {
if (lastHeaderCell != null) {
cell.getHeaderCells().add(lastHeaderCell);
}
if (!cell.getTextBlocks().isEmpty() && cell.getTextBlocks()
.get(0).getMostPopularWordStyle().equals("bold")) {
if (!cell.getTextBlocks().isEmpty() && cell.getTextBlocks().get(0).getMostPopularWordStyle().equals("bold")) {
cell.setHeaderCell(true);
}
}
@ -159,7 +143,7 @@ public class TablePageBlock extends AbstractPageBlock {
for (int i = 0; i < unrotatedColCount; i++) { // rows
List<Cell> lastRow = new ArrayList<>();
for (int j = unrotatedRowCount - 1; j >= 0; j--) { // cols
Cell cell = cellTreeMap.get(new CellPosition(j, i));
Cell cell = cells.get(new CellPosition(j, i));
if (cell != null) {
lastRow.add(cell);
}
@ -170,7 +154,7 @@ public class TablePageBlock extends AbstractPageBlock {
for (int i = unrotatedColCount - 1; i >= 0; i--) { // rows
List<Cell> lastRow = new ArrayList<>();
for (int j = 0; j < unrotatedRowCount; j++) { // cols
Cell cell = cellTreeMap.get(new CellPosition(j, i));
Cell cell = cells.get(new CellPosition(j, i));
if (cell != null) {
lastRow.add(cell);
}
@ -181,7 +165,7 @@ public class TablePageBlock extends AbstractPageBlock {
for (int i = 0; i < unrotatedRowCount; i++) {
List<Cell> lastRow = new ArrayList<>();
for (int j = 0; j < unrotatedColCount; j++) {
Cell cell = cellTreeMap.get(new CellPosition(i, j)); // JAVA_8 use getOrDefault()
Cell cell = cells.get(new CellPosition(i, j)); // JAVA_8 use getOrDefault()
if (cell != null) {
lastRow.add(cell);
}
@ -195,6 +179,17 @@ public class TablePageBlock extends AbstractPageBlock {
}
private void add(Cell chunk, int row, int col) {
unrotatedRowCount = Math.max(unrotatedRowCount, row + 1);
unrotatedColCount = Math.max(unrotatedColCount, col + 1);
CellPosition cp = new CellPosition(row, col);
cells.put(cp, chunk);
}
private void addCells(List<Cell> cells) {
if (cells.isEmpty()) {
@ -203,12 +198,11 @@ public class TablePageBlock extends AbstractPageBlock {
cells.removeIf(cell -> cell.getWidth() < 1.1 || cell.getHeight() < 1.1);
List<List<Cell>> rowsOfCellsMatrix = calculateTableStructure(cells);
List<List<Cell>> rowsOfCells = calculateStructure(cells);
for (int i = 0; i < rowsOfCellsMatrix.size(); i++) {
for (int j = 0; j < rowsOfCellsMatrix.get(i).size(); j++) {
addCellToRowAndCol(rowsOfCellsMatrix.get(i)
.get(j), i, j);
for (int i = 0; i < rowsOfCells.size(); i++) {
for (int j = 0; j < rowsOfCells.get(i).size(); j++) {
add(rowsOfCells.get(i).get(j), i, j);
}
}
@ -219,125 +213,57 @@ public class TablePageBlock extends AbstractPageBlock {
* Calculates the structure of the table. For spanning rows and columns multiple cells with the same values will be inserted.
*
* @param cells The found cells
* @return TablePageBlock Structure as a rows of cells matrix
* @return TablePageBlock Structure
*/
private List<List<Cell>> calculateTableStructure(List<Cell> cells) {
private List<List<Cell>> calculateStructure(List<Cell> cells) {
List<List<Cell>> matrix = new ArrayList<>();
if (cells.isEmpty()) {
return new ArrayList<>();
return matrix;
}
Set<Double> uniqueX = new HashSet<>();
Set<Double> uniqueY = new HashSet<>();
cells.stream()
.filter(c -> !c.getTextBlocks().isEmpty() || c.getHeight() > 3 && c.getWidth() > 3)
.forEach(c -> {
uniqueX.add(c.getPdfMinX());
uniqueX.add(c.getPdfMaxX());
uniqueY.add(c.getPdfMinY());
uniqueY.add(c.getPdfMaxY());
});
Set<Float> uniqueX = new HashSet<>();
Set<Float> uniqueY = new HashSet<>();
cells.stream().filter(c -> !c.getTextBlocks().isEmpty() || c.getHeight() > 3 && c.getWidth() > 3).forEach(c -> {
uniqueX.add(c.getLeft());
uniqueX.add(c.getRight());
uniqueY.add(c.getBottom());
uniqueY.add(c.getTop());
});
var sortedUniqueX = uniqueX.stream()
.sorted()
.toList();
var sortedUniqueY = uniqueY.stream()
.sorted()
.toList();
var sortedUniqueX = uniqueX.stream().sorted().toList();
var sortedUniqueY = uniqueY.stream().sorted().toList();
List<List<Cell>> rowsOfCells = new ArrayList<>();
Double prevY = null;
for (Double y : sortedUniqueY) {
Float prevY = null;
for (Float y : sortedUniqueY) {
List<Cell> row = new ArrayList<>();
Double prevX = null;
for (Double x : sortedUniqueX) {
Float prevX = null;
for (Float x : sortedUniqueX) {
if (prevY != null && prevX != null) {
var cellFromGridStructure = new Cell(new Point2D.Double(prevX, prevY), new Point2D.Double(x, y));
var cell = new Cell(new Point2D.Float(prevX, prevY), new Point2D.Float(x, y));
if (cellFromGridStructure.hasMinimumSize()) {
cells.stream()
.map(originalCell -> new CellWithIntersection(originalCell,
RectangleTransformations.calculateIntersectedArea(cellFromGridStructure.getBBoxPdf(),
originalCell.getBBoxPdf())))
.filter(cellWithIntersection -> cellWithIntersection.intersectedArea > 0)
.filter(cellWithIntersection -> cellWithIntersection.originalCell.getArea() > cellWithIntersection.intersectedArea * CELL_AREA_CONTAINED_THRESHOLD)
.max(Comparator.comparing(CellWithIntersection::intersectedArea))
.map(CellWithIntersection::originalCell)
.ifPresent(matchingCell -> cellFromGridStructure.getTextBlocks().addAll(matchingCell.getTextBlocks()));
row.add(cellFromGridStructure);
var intersectionCell = cells.stream().filter(c -> cell.intersects(c) && cell.overlapRatio(c) > 0.1f).findFirst();
intersectionCell.ifPresent(value -> cell.getTextBlocks().addAll(value.getTextBlocks()));
if (cell.hasMinimumSize()) {
row.add(cell);
}
}
prevX = x;
}
// exclude empty rows and rows where all text blocks are empty
if (prevY != null && prevX != null && !row.isEmpty() && !row.stream()
.allMatch(cell -> cell.getTextBlocks().isEmpty())) {
rowsOfCells.add(row);
if (prevY != null && prevX != null && !row.isEmpty()) {
matrix.add(row);
}
prevY = y;
}
Collections.reverse(rowsOfCells);
// now cells are removed which are part of a column without any text blocks
// this is done by first computing the inverse matrix which contains call columns of cells
// then the column indices that have to be removed are determined
List<List<Cell>> columnsOfCells = new ArrayList<>();
int maxRowLength = rowsOfCells.stream()
.map(List::size)
.max(java.util.Comparator.naturalOrder())
.orElse(0);
for (int i = 0; i < maxRowLength; i++) {
columnsOfCells.add(new ArrayList<>());
}
for (List<Cell> row : rowsOfCells) {
for (int j = 0; j < row.size(); j++) {
columnsOfCells.get(j).add(row.get(j));
}
}
List<Integer> columnIndicesToRemove = new ArrayList<>();
int columnIndex = 0;
for (List<Cell> col : columnsOfCells) {
if (col.stream()
.allMatch(cell -> cell.getTextBlocks().isEmpty())) {
columnIndicesToRemove.add(columnIndex);
}
columnIndex++;
}
columnIndicesToRemove.sort(Collections.reverseOrder());
// update all rows so that the values of the empty columns get removed
var rowsOfCellsBefore = new ArrayList<>(rowsOfCells);
rowsOfCells = new ArrayList<>();
for (List<Cell> row : rowsOfCellsBefore) {
var updatedRow = new ArrayList<>(row);
columnIndicesToRemove.forEach(idxToRemove -> updatedRow.remove(updatedRow.get(idxToRemove)));
rowsOfCells.add(updatedRow);
}
return rowsOfCells;
}
private void addCellToRowAndCol(Cell cell, int row, int col) {
unrotatedRowCount = Math.max(unrotatedRowCount, row + 1);
unrotatedColCount = Math.max(unrotatedColCount, col + 1);
CellPosition cp = new CellPosition(row, col);
cellTreeMap.put(cp, cell);
Collections.reverse(matrix);
return matrix;
}
@ -364,7 +290,7 @@ public class TablePageBlock extends AbstractPageBlock {
if (!first) {
sb.append("\n");
}
sb.append('\"').append(textBlock.getText().replaceAll("\"", "\"")).append('\"');
sb.append('\"').append(textBlock.getText().replaceAll("\"", "\\\"")).append('\"');
first = false;
}
}
@ -411,9 +337,4 @@ public class TablePageBlock extends AbstractPageBlock {
return sb.toString();
}
record CellWithIntersection(Cell originalCell, double intersectedArea) {
}
}

View File

@ -0,0 +1,100 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.model.text;
import org.apache.pdfbox.text.TextPosition;
import org.springframework.beans.BeanUtils;
import com.fasterxml.jackson.annotation.JsonIgnore;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.SneakyThrows;
/**
 * Serializable snapshot of a PDFBox {@link TextPosition}.
 * <p>
 * The direction-adjusted x, y, width and the height are kept together in the {@code position}
 * array; all fields and accessors annotated with {@link JsonIgnore} are left out of the JSON form.
 */
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class RedTextPosition {

    // Slots of the position array.
    private static final int X_INDEX = 0;
    private static final int Y_INDEX = 1;
    private static final int WIDTH_INDEX = 2;
    private static final int HEIGHT_INDEX = 3;

    private String textMatrix;
    private float[] position;

    @JsonIgnore
    private int rotation;

    @JsonIgnore
    private float pageHeight;

    @JsonIgnore
    private float pageWidth;

    private String unicode;

    @JsonIgnore
    private float dir;

    // not used in reanalysis
    @JsonIgnore
    private float widthOfSpace;

    // not used in reanalysis
    @JsonIgnore
    private float fontSizeInPt;

    // not used in reanalysis
    @JsonIgnore
    private String fontName;


    /**
     * Builds a snapshot from a PDFBox text position.
     * <p>
     * Bean properties are copied via {@link BeanUtils#copyProperties(Object, Object)} first; font
     * name, font size, the text matrix (as string) and the position array are then set explicitly.
     *
     * @param textPosition the PDFBox text position to copy from
     * @return the populated snapshot
     */
    @SneakyThrows
    public static RedTextPosition fromTextPosition(TextPosition textPosition) {

        RedTextPosition snapshot = new RedTextPosition();
        BeanUtils.copyProperties(textPosition, snapshot);

        snapshot.setFontName(textPosition.getFont().getName());
        snapshot.setFontSizeInPt(textPosition.getFontSizeInPt());
        snapshot.setTextMatrix(textPosition.getTextMatrix().toString());

        float[] coordinates = {textPosition.getXDirAdj(), textPosition.getYDirAdj(), textPosition.getWidthDirAdj(), textPosition.getHeightDir()};
        snapshot.setPosition(coordinates);

        return snapshot;
    }


    /**
     * @return the first entry of the position array
     */
    @JsonIgnore
    public float getXDirAdj() {

        return position[X_INDEX];
    }


    /**
     * @return the second entry of the position array
     */
    @JsonIgnore
    public float getYDirAdj() {

        return position[Y_INDEX];
    }


    /**
     * @return the third entry of the position array
     */
    @JsonIgnore
    public float getWidthDirAdj() {

        return position[WIDTH_INDEX];
    }


    /**
     * @return the fourth entry of the position array
     */
    @JsonIgnore
    public float getHeightDir() {

        return position[HEIGHT_INDEX];
    }

}

View File

@ -0,0 +1,48 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.model.text;
import java.util.ArrayList;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.TextNormalizationUtilities;
import lombok.Getter;
/**
 * Collects {@link TextPositionSequence}s in insertion order and renders them as one normalized string.
 */
@Getter
public class SearchableText {

    private final List<TextPositionSequence> sequences = new ArrayList<>();


    /**
     * Appends a single sequence.
     *
     * @param textPositionSequence the sequence to append
     */
    public void add(TextPositionSequence textPositionSequence) {

        sequences.add(textPositionSequence);
    }


    /**
     * Appends all given sequences in list order.
     *
     * @param textPositionSequences the sequences to append
     */
    public void addAll(List<TextPositionSequence> textPositionSequences) {

        sequences.addAll(textPositionSequences);
    }


    /**
     * @return the result of {@link #buildString(List)} for the collected sequences
     */
    @Override
    public String toString() {

        return buildString(sequences);
    }


    /**
     * Concatenates the sequences, each followed by a single space, and then applies the
     * normalization steps of {@link TextNormalizationUtilities} in this order: hyphen line breaks,
     * line breaks, repeating whitespaces.
     *
     * @param sequences the sequences to render
     * @return the normalized text
     */
    public static String buildString(List<TextPositionSequence> sequences) {

        StringBuilder joined = new StringBuilder();
        sequences.forEach(sequence -> joined.append(sequence).append(' '));

        String withoutHyphenBreaks = TextNormalizationUtilities.removeHyphenLineBreaks(joined.toString());
        String withoutLineBreaks = TextNormalizationUtilities.removeLineBreaks(withoutHyphenBreaks);
        return TextNormalizationUtilities.removeRepeatingWhitespaces(withoutLineBreaks);
    }

}

View File

@ -0,0 +1,17 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.model.text;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * Data holder pairing a section number with that section's text.
 * Accessors, builder and constructors are generated by Lombok.
 */
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class SimplifiedSectionText {
// Number identifying the section.
private int sectionNumber;
// Text of the section.
private String text;
}

View File

@ -0,0 +1,20 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.model.text;
import java.util.ArrayList;
import java.util.List;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class SimplifiedText {
private int numberOfPages;
private List<SimplifiedSectionText> sectionTexts = new ArrayList<>();
}

View File

@ -1,4 +1,4 @@
package com.knecon.fforesight.service.layoutparser.processor.model.text;
package com.knecon.fforesight.service.layoutparser.processor.classification.model.text;
import java.util.HashMap;
import java.util.Map;
@ -9,14 +9,10 @@ public class StringFrequencyCounter {
@Getter
private final Map<String, Integer> countPerValue = new HashMap<>();
boolean changed;
String mostPopularCache;
public void add(String value) {
changed = true;
if (!countPerValue.containsKey(value)) {
countPerValue.put(value, 1);
} else {
@ -27,8 +23,6 @@ public class StringFrequencyCounter {
public void addAll(Map<String, Integer> otherCounter) {
changed = true;
for (Map.Entry<String, Integer> entry : otherCounter.entrySet()) {
if (countPerValue.containsKey(entry.getKey())) {
countPerValue.put(entry.getKey(), countPerValue.get(entry.getKey()) + entry.getValue());
@ -41,18 +35,13 @@ public class StringFrequencyCounter {
public String getMostPopular() {
if (changed || mostPopularCache == null) {
Map.Entry<String, Integer> mostPopular = null;
for (Map.Entry<String, Integer> entry : countPerValue.entrySet()) {
if (mostPopular == null || entry.getValue() >= mostPopular.getValue()) {
mostPopular = entry;
}
Map.Entry<String, Integer> mostPopular = null;
for (Map.Entry<String, Integer> entry : countPerValue.entrySet()) {
if (mostPopular == null || entry.getValue() > mostPopular.getValue()) {
mostPopular = entry;
}
mostPopularCache = mostPopular != null ? mostPopular.getKey() : null;
changed = false;
}
return mostPopularCache;
return mostPopular != null ? mostPopular.getKey() : null;
}
}

View File

@ -1,4 +1,4 @@
package com.knecon.fforesight.service.layoutparser.processor.model.text;
package com.knecon.fforesight.service.layoutparser.processor.classification.model.text;
import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonValue;
@ -44,15 +44,4 @@ public enum TextDirection {
throw new IllegalArgumentException(String.format("A value of %f is not supported by TextDirection", degrees));
}
/**
 * Maps this direction to an integer index:
 * ZERO -> 0, QUARTER_CIRCLE -> 1, HALF_CIRCLE -> 2, THREE_QUARTER_CIRCLE -> 3.
 *
 * @return the index of this direction
 */
public int getRotation() {
    final int index = switch (this) {
        case ZERO -> 0;
        case QUARTER_CIRCLE -> 1;
        case HALF_CIRCLE -> 2;
        case THREE_QUARTER_CIRCLE -> 3;
    };
    return index;
}
}

View File

@ -0,0 +1,367 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.model.text;
import static java.util.stream.Collectors.toSet;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.FloatFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.DoubleComparisons;
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.TextNormalizationUtilities;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
@EqualsAndHashCode(callSuper = true)
@Data
@AllArgsConstructor
@Builder
@NoArgsConstructor
public class TextPageBlock extends AbstractPageBlock {
@Builder.Default
private List<TextPositionSequence> sequences = new ArrayList<>();
@JsonIgnore
private int rotation;
@JsonIgnore
private String mostPopularWordFont;
@JsonIgnore
private String mostPopularWordStyle;
@JsonIgnore
private float mostPopularWordFontSize;
@JsonIgnore
private float mostPopularWordHeight;
@JsonIgnore
private float mostPopularWordSpaceWidth;
@JsonIgnore
private float highestFontSize;
@JsonIgnore
private PageBlockType classification;
// The three accessors below delegate to the FIRST sequence of this block.
// They throw if sequences is empty (or null).

/** @return the text direction of the first sequence in this block */
@JsonIgnore
public TextDirection getDir() {
return sequences.get(0).getDir();
}
/** @return the page height recorded on the first sequence in this block */
@JsonIgnore
private float getPageHeight() {
return sequences.get(0).getPageHeight();
}
/** @return the page width recorded on the first sequence in this block */
@JsonIgnore
private float getPageWidth() {
return sequences.get(0).getPageWidth();
}
/**
 * Merges several text blocks into one by concatenating their sequences (in block order) into a fresh
 * mutable list and building a new block from it via {@link #fromTextPositionSequences(List)}.
 *
 * @param textBlocksToMerge the blocks whose sequences are combined
 * @return the block built from all sequences, or {@code null} if there are no sequences at all
 */
public static TextPageBlock merge(List<TextPageBlock> textBlocksToMerge) {
    List<TextPositionSequence> combined = new ArrayList<>();
    for (TextPageBlock block : textBlocksToMerge) {
        combined.addAll(block.getSequences());
    }
    return fromTextPositionSequences(combined);
}
/**
 * Builds a text block spanning all given words.
 * <p>
 * The block's bounds start at the first word and are grown to include every following word. While iterating,
 * text height, font size, space width, font and font style of each word are counted; the most frequent
 * values (and the highest font size) are stored on the block. The given list itself becomes the block's
 * sequence list. If all words share the same minY (rounded via {@code DoubleComparisons.round(.., 3)}),
 * that list is sorted in place by minX.
 *
 * @param wordBlockList the words forming the block
 * @return the resulting block, or {@code null} if the list is empty
 */
public static TextPageBlock fromTextPositionSequences(List<TextPositionSequence> wordBlockList) {
    FloatFrequencyCounter heights = new FloatFrequencyCounter();
    FloatFrequencyCounter fontSizes = new FloatFrequencyCounter();
    FloatFrequencyCounter spaceWidths = new FloatFrequencyCounter();
    StringFrequencyCounter fonts = new StringFrequencyCounter();
    StringFrequencyCounter styles = new StringFrequencyCounter();

    TextPageBlock block = null;
    for (TextPositionSequence word : wordBlockList) {
        heights.add(word.getTextHeight());
        fontSizes.add(word.getFontSize());
        spaceWidths.add(word.getSpaceWidth());
        fonts.add(word.getFont());
        styles.add(word.getFontStyle());

        if (block != null) {
            // Grow the existing bounds so they also enclose this word.
            TextPageBlock grown = block.union(word);
            block.resize(grown.getMinX(), grown.getMinY(), grown.getWidth(), grown.getHeight());
            continue;
        }
        // First word: it defines the initial bounds and rotation.
        block = new TextPageBlock(word.getMinXDirAdj(), word.getMaxXDirAdj(), word.getMinYDirAdj(), word.getMaxYDirAdj(), wordBlockList, word.getRotation());
    }

    if (block == null) {
        return null;
    }

    block.setMostPopularWordFont(fonts.getMostPopular());
    block.setMostPopularWordStyle(styles.getMostPopular());
    block.setMostPopularWordFontSize(fontSizes.getMostPopular());
    block.setMostPopularWordHeight(heights.getMostPopular());
    block.setMostPopularWordSpaceWidth(spaceWidths.getMostPopular());
    block.setHighestFontSize(fontSizes.getHighest());

    List<TextPositionSequence> blockSequences = block.getSequences();
    if (blockSequences != null) {
        boolean sameRoundedMinY = blockSequences.stream()
                .map(t -> DoubleComparisons.round(t.getMinYDirAdj(), 3))
                .collect(toSet())
                .size() == 1;
        if (sameRoundedMinY) {
            blockSequences.sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj));
        }
    }
    return block;
}
/**
 * Returns the minX value of this block in the PDF coordinate system, whose origin {0,0} rotates with the page:
 * 0 -> LowerLeft
 * 90 -> UpperLeft
 * 180 -> UpperRight
 * 270 -> LowerRight
 * <p>
 * The case is selected by the degrees of {@link #getDir()}.
 *
 * @return the minX value in pdf coordinate system
 */
@JsonIgnore
public float getPdfMinX() {
    var degrees = getDir().getDegrees();
    if (degrees == 90) {
        return minY;
    }
    if (degrees == 180) {
        return getPageWidth() - maxX;
    }
    if (degrees == 270) {
        return getPageWidth() - maxY;
    }
    return minX;
}
/**
 * Returns the maxX value of this block in the PDF coordinate system, whose origin {0,0} rotates with the page:
 * 0 -> LowerLeft
 * 90 -> UpperLeft
 * 180 -> UpperRight
 * 270 -> LowerRight
 * <p>
 * The case is selected by the degrees of {@link #getDir()}.
 *
 * @return the maxX value in pdf coordinate system
 */
@JsonIgnore
public float getPdfMaxX() {
    var degrees = getDir().getDegrees();
    if (degrees == 90) {
        return maxY;
    }
    if (degrees == 180) {
        return getPageWidth() - minX;
    }
    if (degrees == 270) {
        return getPageWidth() - minY;
    }
    return maxX;
}
/**
 * Returns the minY value of this block in the PDF coordinate system, whose origin {0,0} rotates with the page:
 * 0 -> LowerLeft
 * 90 -> UpperLeft
 * 180 -> UpperRight
 * 270 -> LowerRight
 * <p>
 * The case is selected by the degrees of {@link #getDir()}.
 *
 * @return the minY value in pdf coordinate system
 */
@JsonIgnore
public float getPdfMinY() {
    var degrees = getDir().getDegrees();
    if (degrees == 90) {
        return minX;
    }
    if (degrees == 180) {
        return maxY;
    }
    if (degrees == 270) {
        return getPageHeight() - maxX;
    }
    return getPageHeight() - maxY;
}
/**
 * Returns the maxY value of this block in the PDF coordinate system, whose origin {0,0} rotates with the page:
 * 0 -> LowerLeft
 * 90 -> UpperLeft
 * 180 -> UpperRight
 * 270 -> LowerRight
 * <p>
 * The case is selected by the degrees of {@link #getDir()}.
 *
 * @return the maxY value in pdf coordinate system
 */
@JsonIgnore
public float getPdfMaxY() {
    var degrees = getDir().getDegrees();
    if (degrees == 90) {
        return maxX;
    }
    if (degrees == 180) {
        return minY;
    }
    if (degrees == 270) {
        return getPageHeight() - minX;
    }
    return getPageHeight() - minY;
}
/**
 * Creates a block with explicit bounds. Note the parameter order: both x bounds come before both y bounds.
 * The given list is stored as-is (no copy), so the block and the caller share the same list instance.
 *
 * @param minX      minimum x of the block
 * @param maxX      maximum x of the block
 * @param minY      minimum y of the block
 * @param maxY      maximum y of the block
 * @param sequences the sequences belonging to the block (stored by reference)
 * @param rotation  the rotation value stored on the block
 */
public TextPageBlock(float minX, float maxX, float minY, float maxY, List<TextPositionSequence> sequences, int rotation) {
this.minX = minX;
this.maxX = maxX;
this.minY = minY;
this.maxY = maxY;
this.sequences = sequences;
this.rotation = rotation;
}
/**
 * Returns a copy of this block whose bounds are grown to also enclose the given sequence.
 * Only the bounds differ: add(TextPositionSequence) does not append {@code r} to the sequence list,
 * and the copy shares this block's sequence list (see copy()).
 *
 * @param r the sequence to enclose
 * @return a new block with the enlarged bounds; this block is left unchanged
 */
public TextPageBlock union(TextPositionSequence r) {
TextPageBlock union = this.copy();
union.add(r);
return union;
}
/**
 * Returns a new block covering both this block and {@code r}: its bounds enclose both, and its sequence
 * list holds this block's sequences followed by those of {@code r}.
 * <p>
 * The result gets its own sequence list. Building it on {@code copy()} would share this block's list, so
 * appending {@code r}'s sequences would silently modify this block as well.
 *
 * @param r the block to unite with
 * @return the united block; neither this block nor {@code r} is modified
 */
public TextPageBlock union(TextPageBlock r) {
    TextPageBlock union = new TextPageBlock(minX, maxX, minY, maxY, new ArrayList<>(sequences), rotation);
    union.add(r);
    return union;
}
/**
 * Grows this block in place so that it also encloses {@code r}, and appends all of {@code r}'s sequences
 * to this block's sequence list.
 *
 * @param r the block to absorb
 */
public void add(TextPageBlock r) {
    // Each bound only moves outwards; a bound of r that lies inside the current bounds changes nothing.
    minX = r.getMinX() < minX ? r.getMinX() : minX;
    maxX = r.getMaxX() > maxX ? r.getMaxX() : maxX;
    minY = r.getMinY() < minY ? r.getMinY() : minY;
    maxY = r.getMaxY() > maxY ? r.getMaxY() : maxY;
    sequences.addAll(r.getSequences());
}
/**
 * Grows this block's bounds in place so that they also enclose the given sequence.
 * The sequence itself is NOT appended to the sequence list; only the bounds change.
 *
 * @param r the sequence whose direction-adjusted bounds should be enclosed
 */
public void add(TextPositionSequence r) {
    float left = r.getMinXDirAdj();
    if (left < minX) {
        minX = left;
    }
    float right = r.getMaxXDirAdj();
    if (right > maxX) {
        maxX = right;
    }
    float top = r.getMinYDirAdj();
    if (top < minY) {
        minY = top;
    }
    float bottom = r.getMaxYDirAdj();
    if (bottom > maxY) {
        maxY = bottom;
    }
}
/**
 * Creates a shallow copy carrying this block's bounds, sequence list and rotation.
 * The sequence list is passed by reference, so the copy and this block share the same list instance;
 * the remaining fields (font statistics, classification) are not carried over.
 *
 * @return the shallow copy
 */
public TextPageBlock copy() {
return new TextPageBlock(minX, maxX, minY, maxY, sequences, rotation);
}
/**
 * Sets the bounds from an origin corner plus extent; delegates to set(x1, y1, x1 + width, y1 + height).
 *
 * @param x1     x of the origin corner
 * @param y1     y of the origin corner
 * @param width  extent along x
 * @param height extent along y
 */
public void resize(float x1, float y1, float width, float height) {
set(x1, y1, x1 + width, y1 + height);
}
/**
 * Sets the bounds from two corner points. The corners may be given in any order: the smaller x/y
 * becomes the minimum and the larger x/y becomes the maximum.
 *
 * @param x1 x of the first corner
 * @param y1 y of the first corner
 * @param x2 x of the second corner
 * @param y2 y of the second corner
 */
public void set(float x1, float y1, float x2, float y2) {
this.minX = Math.min(x1, x2);
this.maxX = Math.max(x1, x2);
this.minY = Math.min(y1, y2);
this.maxY = Math.max(y1, y2);
}
/**
 * Concatenates the string forms of all sequences, inserting a single space between two neighbours when
 * neither the text written so far ends with a space nor the next sequence starts with one.
 * <p>
 * Sequences without any characters are skipped; inspecting their first or last character would
 * otherwise throw.
 *
 * @return the text of this block
 */
@Override
public String toString() {
    StringBuilder builder = new StringBuilder();
    for (TextPositionSequence sequence : sequences) {
        String sequenceAsString = sequence.toString();
        if (sequenceAsString.isEmpty()) {
            continue;
        }
        // Fix for missing whitespace between sequences (see PDFTextStripper line 1730).
        boolean previousEndsWithSpace = builder.length() == 0 || builder.charAt(builder.length() - 1) == ' ';
        if (!previousEndsWithSpace && sequenceAsString.charAt(0) != ' ') {
            builder.append(' ');
        }
        builder.append(sequenceAsString);
    }
    return builder.toString();
}
/**
 * Renders the block's words as text. Consecutive words are separated by a line break when their maxY
 * values differ by more than the height of the later word, and by a single space otherwise. The result
 * is passed through {@code TextNormalizationUtilities.removeHyphenLineBreaks}.
 *
 * @return the text of this block
 */
@Override
@JsonIgnore
public String getText() {
    StringBuilder text = new StringBuilder();
    TextPositionSequence last = null;
    for (TextPositionSequence current : sequences) {
        if (last != null) {
            boolean onNewLine = Math.abs(last.getMaxYDirAdj() - current.getMaxYDirAdj()) > current.getTextHeight();
            text.append(onNewLine ? '\n' : ' ');
        }
        text.append(current.toString());
        last = current;
    }
    return TextNormalizationUtilities.removeHyphenLineBreaks(text.toString());
}
}

View File

@ -0,0 +1,315 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.model.text;
import java.awt.geom.AffineTransform;
import java.awt.geom.Point2D;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.pdfbox.text.TextPosition;
import com.dslplatform.json.JsonAttribute;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class TextPositionSequence implements CharSequence {
public static final int HEIGHT_PADDING = 2;
private int page;
private List<RedTextPosition> textPositions = new ArrayList<>();
private TextDirection dir;
private int rotation;
private float pageHeight;
private float pageWidth;
private boolean isParagraphStart;
public TextPositionSequence(int page) {
this.page = page;
}
public TextPositionSequence(List<TextPosition> textPositions, int page, boolean isParagraphStart) {
this.textPositions = textPositions.stream().map(RedTextPosition::fromTextPosition).collect(Collectors.toList());
this.page = page;
this.dir = TextDirection.fromDegrees(textPositions.get(0).getDir());
this.rotation = textPositions.get(0).getRotation();
this.pageHeight = textPositions.get(0).getPageHeight();
this.pageWidth = textPositions.get(0).getPageWidth();
this.isParagraphStart = isParagraphStart;
}
/**
 * @return the number of text positions in this sequence (one {@code char} is exposed per text position)
 */
@Override
public int length() {
return textPositions.size();
}
/**
 * Returns the first {@code char} of the unicode string of the text position at {@code index}.
 * Any further chars of that unicode string are not exposed.
 *
 * @param index index of the text position
 * @return the first char of that position's unicode string
 */
@Override
public char charAt(int index) {
    return textPositionAt(index).getUnicode().charAt(0);
}
/**
 * Like {@link #charAt(int)}, but optionally lower-cases the unicode string before taking its first char.
 *
 * @param index           index of the text position
 * @param caseInSensitive if {@code true}, the unicode string is lower-cased first
 * @return the (optionally lower-cased) first char of that position's unicode string
 */
public char charAt(int index, boolean caseInSensitive) {
    String unicode = textPositionAt(index).getUnicode();
    if (caseInSensitive) {
        // NOTE(review): toLowerCase() uses the JVM default locale — confirm the compared side does the same.
        return unicode.toLowerCase().charAt(0);
    }
    return unicode.charAt(0);
}
/**
 * Returns a sequence over the text positions in {@code [start, end)}, carrying over page, direction,
 * rotation and page dimensions. The paragraph-start flag is not carried over. The result's position list
 * is a {@code subList} view of this sequence's list, not an independent copy.
 *
 * @param start index of the first position, inclusive
 * @param end   index after the last position, exclusive
 * @return the sub-sequence
 */
@Override
public TextPositionSequence subSequence(int start, int end) {
    TextPositionSequence slice = new TextPositionSequence();
    slice.page = page;
    slice.dir = dir;
    slice.rotation = rotation;
    slice.pageHeight = pageHeight;
    slice.pageWidth = pageWidth;
    slice.textPositions = textPositions.subList(start, end);
    return slice;
}
/**
 * @return the string made of {@link #charAt(int)} for every index of this sequence
 */
@Override
public String toString() {
    final int size = length();
    StringBuilder text = new StringBuilder(size);
    for (int position = 0; position < size; position++) {
        text.append(charAt(position));
    }
    return text.toString();
}
/**
 * @param index index of the requested text position
 * @return the text position stored at {@code index}
 */
public RedTextPosition textPositionAt(int index) {
return textPositions.get(index);
}
/**
 * Appends a text position and overwrites this sequence's page, direction, rotation and page dimensions
 * with those of the given sequence.
 *
 * @param textPositionSequence the sequence whose page, direction, rotation and page size are adopted
 * @param textPosition         the text position to append
 */
public void add(TextPositionSequence textPositionSequence, RedTextPosition textPosition) {
this.textPositions.add(textPosition);
this.page = textPositionSequence.getPage();
this.dir = textPositionSequence.getDir();
this.rotation = textPositionSequence.getRotation();
this.pageHeight = textPositionSequence.getPageHeight();
this.pageWidth = textPositionSequence.getPageWidth();
}
/**
 * Converts and appends a PDFBox text position, then refreshes direction, rotation and page dimensions
 * from the FIRST position of the sequence. The page number is left untouched.
 *
 * @param textPosition the PDFBox text position to append
 */
public void add(TextPosition textPosition) {
    textPositions.add(RedTextPosition.fromTextPosition(textPosition));
    RedTextPosition first = textPositions.get(0);
    dir = TextDirection.fromDegrees(first.getDir());
    rotation = first.getRotation();
    pageHeight = first.getPageHeight();
    pageWidth = first.getPageWidth();
}
/**
 * Returns the direction-adjusted x of the first text position.
 * The value is adjusted so that 0,0 is upper left and it is adjusted based on the text direction;
 * the page rotation is ignored, the text rotation is taken into account (awt coordinates).
 * Throws if the sequence is empty.
 *
 * @return the text direction adjusted minX value
 */
@JsonIgnore
@JsonAttribute(ignore = true)
public float getMinXDirAdj() {
return textPositions.get(0).getXDirAdj();
}
/**
 * Returns the right edge of the last text position: its direction-adjusted x plus its width plus
 * {@code HEIGHT_PADDING}.
 * The value is adjusted so that 0,0 is upper left and it is adjusted based on the text direction;
 * the page rotation is ignored, the text rotation is taken into account (awt coordinates).
 *
 * @return the text direction adjusted maxX value
 */
@JsonIgnore
@JsonAttribute(ignore = true)
public float getMaxXDirAdj() {
    RedTextPosition last = textPositions.get(textPositions.size() - 1);
    return last.getXDirAdj() + last.getWidthDirAdj() + HEIGHT_PADDING;
}
/**
 * Returns the upper border of the word's bounding box: the direction-adjusted y of the first text
 * position minus getTextHeight().
 * The value is adjusted so that 0,0 is upper left and it is adjusted based on the text direction;
 * the page rotation is ignored, the text rotation is taken into account (awt coordinates).
 *
 * @return the text direction adjusted minY value. The upper border of the bounding box of the word.
 */
@JsonIgnore
@JsonAttribute(ignore = true)
public float getMinYDirAdj() {
return textPositions.get(0).getYDirAdj() - getTextHeight();
}
/**
 * Returns the lower border of the word's bounding box: the direction-adjusted y of the first text position.
 * The value is adjusted so that 0,0 is upper left and it is adjusted based on the text direction;
 * the page rotation is ignored, the text rotation is taken into account (awt coordinates).
 *
 * @return the text direction adjusted maxY value. The lower border of the bounding box of the word.
 */
@JsonIgnore
@JsonAttribute(ignore = true)
public float getMaxYDirAdj() {
return textPositions.get(0).getYDirAdj();
}
/**
 * @return the height of the first text position plus {@code HEIGHT_PADDING}
 */
@JsonIgnore
@JsonAttribute(ignore = true)
public float getTextHeight() {
return textPositions.get(0).getHeightDir() + HEIGHT_PADDING;
}
/**
 * @return the difference between getMaxYDirAdj() and getMinYDirAdj()
 */
@JsonIgnore
@JsonAttribute(ignore = true)
public float getHeight() {
return getMaxYDirAdj() - getMinYDirAdj();
}
/**
 * @return the difference between getMaxXDirAdj() and getMinXDirAdj()
 */
@JsonIgnore
@JsonAttribute(ignore = true)
public float getWidth() {
return getMaxXDirAdj() - getMinXDirAdj();
}
/**
 * Returns the lower-cased font name of the first text position with the style suffixes {@code ",bold"}
 * and {@code ",italic"} removed.
 * <p>
 * Lower-casing uses {@code Locale.ROOT} so the ASCII suffix match does not depend on the JVM default
 * locale (under a Turkish locale "ITALIC" would otherwise lower-case to a dotless-i form and not match).
 * The suffixes are literal text, hence {@code replace} rather than the regex-based {@code replaceAll}.
 *
 * @return the normalized font name without style suffixes
 */
@JsonIgnore
@JsonAttribute(ignore = true)
public String getFont() {
    String lowercaseFontName = textPositions.get(0).getFontName().toLowerCase(java.util.Locale.ROOT);
    return lowercaseFontName.replace(",bold", "").replace(",italic", "");
}
/**
 * Derives a style label from the font name of the first text position: {@code "bold, italic"},
 * {@code "bold"}, {@code "italic"} or {@code "standard"}, depending on which of the words "bold" and
 * "italic" occur in the lower-cased name.
 * <p>
 * Lower-casing uses {@code Locale.ROOT} so the ASCII keyword match does not depend on the JVM default
 * locale (under a Turkish locale "ITALIC" would otherwise lower-case to a dotless-i form and not match).
 *
 * @return the style label
 */
@JsonIgnore
@JsonAttribute(ignore = true)
public String getFontStyle() {
    String lowercaseFontName = textPositions.get(0).getFontName().toLowerCase(java.util.Locale.ROOT);
    boolean bold = lowercaseFontName.contains("bold");
    boolean italic = lowercaseFontName.contains("italic");
    if (bold && italic) {
        return "bold, italic";
    }
    if (bold) {
        return "bold";
    }
    if (italic) {
        return "italic";
    }
    return "standard";
}
/**
 * @return the font size in pt of the first text position
 */
@JsonIgnore
@JsonAttribute(ignore = true)
public float getFontSize() {
return textPositions.get(0).getFontSizeInPt();
}
/**
 * @return the width of a space as recorded on the first text position
 */
@JsonIgnore
@JsonAttribute(ignore = true)
public float getSpaceWidth() {
return textPositions.get(0).getWidthOfSpace();
}
/**
 * This returns the bounding box of the word in Pdf Coordinate System where {0,0} rotated with the page rotation.
 * 0 -> LowerLeft
 * 90 -> UpperLeft
 * 180 -> UpperRight
 * 270 -> LowerRight
 * <p>
 * Two corner points are derived from the first and last text position (in direction-adjusted coordinates),
 * mapped through an affine transform chosen by the text direction, and turned into a rectangle whose
 * origin is the transformed first corner and whose width/height are the coordinate differences to the
 * transformed second corner.
 *
 * @return bounding box of the word in Pdf Coordinate System
 */
@JsonIgnore
@JsonAttribute(ignore = true)
@SneakyThrows
public Rectangle getRectangle() {
// Passing `this` logs the word via toString().
log.debug("Page: '{}', Word: '{}', Rotation: '{}', textRotation {}", page, this, rotation, dir);
float textHeight = getTextHeight();
RedTextPosition firstTextPos = textPositions.get(0);
RedTextPosition lastTextPos = textPositions.get(textPositions.size() - 1);
// Corner at the start of the word: x of the first position, its y reduced by the padding.
Point2D bottomLeft = new Point2D.Double(firstTextPos.getXDirAdj(), firstTextPos.getYDirAdj() - HEIGHT_PADDING);
// Opposite corner at the end of the word: right edge of the last position, its y raised by text height and padding.
Point2D topRight = new Point2D.Double(lastTextPos.getXDirAdj() + lastTextPos.getWidthDirAdj(), lastTextPos.getYDirAdj() + textHeight + HEIGHT_PADDING);
// Each branch builds rotate -> translate -> scale(1, -1); the scale flips the y axis.
// NOTE(review): AffineTransform concatenates, so points should see these in reverse call order (flip, shift, rotate) — confirm.
AffineTransform transform = new AffineTransform();
if (dir == TextDirection.ZERO || dir == TextDirection.HALF_CIRCLE) {
// 0 / 180 degrees: rotate around the page centre, shift by page height plus text height.
transform.rotate(dir.getRadians(), pageWidth / 2f, pageHeight / 2f);
transform.translate(0f, pageHeight + textHeight);
transform.scale(1., -1.);
} else if (dir == TextDirection.QUARTER_CIRCLE) {
// Quarter circle: rotate around (pageWidth/2, pageWidth/2), shift by page width plus text height.
transform.rotate(dir.getRadians(), pageWidth / 2f, pageWidth / 2f);
transform.translate(0f, pageWidth + textHeight);
transform.scale(1., -1.);
} else {
// Remaining direction(s): rotate around (pageHeight/2, pageHeight/2), shift by page width plus text height.
transform.rotate(dir.getRadians(), pageHeight / 2f, pageHeight / 2f);
transform.translate(0f, pageWidth + textHeight);
transform.scale(1., -1.);
}
bottomLeft = transform.transform(bottomLeft, null);
topRight = transform.transform(topRight, null);
// Origin = transformed first corner; width and height = differences to the transformed second corner.
return new Rectangle( //
new Point((float) bottomLeft.getX(), (float) bottomLeft.getY()),
(float) (topRight.getX() - bottomLeft.getX()),
(float) (topRight.getY() - bottomLeft.getY()),
page);
}
}

View File

@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.knecon.fforesight.service.layoutparser.processor.services.parsing;
package com.knecon.fforesight.service.layoutparser.processor.classification.parsing;
import java.io.IOException;
import java.io.InputStream;
@ -92,28 +92,28 @@ public class LegacyPDFStreamEngine extends PDFStreamEngine {
*/
LegacyPDFStreamEngine() throws IOException {
addOperator(new BeginText(this));
addOperator(new Concatenate(this));
addOperator(new DrawObject(this)); // special text version
addOperator(new EndText(this));
addOperator(new SetGraphicsStateParameters(this));
addOperator(new Save(this));
addOperator(new Restore(this));
addOperator(new NextLine(this));
addOperator(new SetCharSpacing(this));
addOperator(new MoveText(this));
addOperator(new MoveTextSetLeading(this));
addOperator(new SetFontAndSize(this));
addOperator(new ShowText(this));
addOperator(new ShowTextAdjusted(this));
addOperator(new SetTextLeading(this));
addOperator(new SetMatrix(this));
addOperator(new SetTextRenderingMode(this));
addOperator(new SetTextRise(this));
addOperator(new SetWordSpacing(this));
addOperator(new SetTextHorizontalScaling(this));
addOperator(new ShowTextLine(this));
addOperator(new ShowTextLineAndSpace(this));
addOperator(new BeginText());
addOperator(new Concatenate());
addOperator(new DrawObject()); // special text version
addOperator(new EndText());
addOperator(new SetGraphicsStateParameters());
addOperator(new Save());
addOperator(new Restore());
addOperator(new NextLine());
addOperator(new SetCharSpacing());
addOperator(new MoveText());
addOperator(new MoveTextSetLeading());
addOperator(new SetFontAndSize());
addOperator(new ShowText());
addOperator(new ShowTextAdjusted());
addOperator(new SetTextLeading());
addOperator(new SetMatrix());
addOperator(new SetTextRenderingMode());
addOperator(new SetTextRise());
addOperator(new SetWordSpacing());
addOperator(new SetTextHorizontalScaling());
addOperator(new ShowTextLine());
addOperator(new ShowTextLineAndSpace());
// load additional glyph list for Unicode mapping
String path = "/org/apache/pdfbox/resources/glyphlist/additional.txt";
@ -264,6 +264,7 @@ public class LegacyPDFStreamEngine extends PDFStreamEngine {
return;
}
}
// adjust for cropbox if needed
Matrix translatedTextRenderingMatrix;
if (translateMatrix == null) {
@ -277,49 +278,49 @@ public class LegacyPDFStreamEngine extends PDFStreamEngine {
// This is a hack for unicode letter with 2 chars e.g. RA see unicodeProblem.pdf
if (unicodeMapping.length() == 2) {
processTextPosition(new TextPosition(pageRotation,
pageSize.getWidth(),
pageSize.getHeight(),
translatedTextRenderingMatrix,
nextX,
nextY,
Math.abs(dyDisplay),
dxDisplay,
Math.abs(spaceWidthDisplay),
Character.toString(unicodeMapping.charAt(0)),
new int[]{code},
font,
fontSize,
(int) (fontSize * textMatrix.getScalingFactorX() * textMatrix.getScalingFactorY())));
pageSize.getWidth(),
pageSize.getHeight(),
translatedTextRenderingMatrix,
nextX,
nextY,
Math.abs(dyDisplay),
dxDisplay,
Math.abs(spaceWidthDisplay),
Character.toString(unicodeMapping.charAt(0)),
new int[]{code},
font,
fontSize,
(int) (fontSize * textMatrix.getScalingFactorX())));
processTextPosition(new TextPosition(pageRotation,
pageSize.getWidth(),
pageSize.getHeight(),
translatedTextRenderingMatrix,
nextX,
nextY,
Math.abs(dyDisplay),
dxDisplay,
Math.abs(spaceWidthDisplay),
Character.toString(unicodeMapping.charAt(1)),
new int[]{code},
font,
fontSize,
(int) (fontSize * textMatrix.getScalingFactorX() * textMatrix.getScalingFactorY())));
pageSize.getWidth(),
pageSize.getHeight(),
translatedTextRenderingMatrix,
nextX,
nextY,
Math.abs(dyDisplay),
dxDisplay,
Math.abs(spaceWidthDisplay),
Character.toString(unicodeMapping.charAt(1)),
new int[]{code},
font,
fontSize,
(int) (fontSize * textMatrix.getScalingFactorX())));
} else {
processTextPosition(new TextPosition(pageRotation,
pageSize.getWidth(),
pageSize.getHeight(),
translatedTextRenderingMatrix,
nextX,
nextY,
Math.abs(dyDisplay),
dxDisplay,
Math.abs(spaceWidthDisplay),
unicodeMapping,
new int[]{code},
font,
fontSize,
(int) (fontSize * textMatrix.getScalingFactorX() * textMatrix.getScalingFactorY())));
pageSize.getWidth(),
pageSize.getHeight(),
translatedTextRenderingMatrix,
nextX,
nextY,
Math.abs(dyDisplay),
dxDisplay,
Math.abs(spaceWidthDisplay),
unicodeMapping,
new int[]{code},
font,
fontSize,
(int) (fontSize * textMatrix.getScalingFactorX())));
}
}

View File

@ -1,11 +1,10 @@
package com.knecon.fforesight.service.layoutparser.processor.services.parsing;
package com.knecon.fforesight.service.layoutparser.processor.classification.parsing;
import java.awt.color.CMMException;
import java.awt.geom.Point2D;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import org.apache.pdfbox.contentstream.operator.Operator;
import org.apache.pdfbox.contentstream.operator.OperatorName;
@ -21,8 +20,6 @@ import org.apache.pdfbox.contentstream.operator.color.SetStrokingColorSpace;
import org.apache.pdfbox.contentstream.operator.color.SetStrokingDeviceCMYKColor;
import org.apache.pdfbox.contentstream.operator.color.SetStrokingDeviceGrayColor;
import org.apache.pdfbox.contentstream.operator.color.SetStrokingDeviceRGBColor;
import org.apache.pdfbox.contentstream.operator.markedcontent.BeginMarkedContentSequenceWithProperties;
import org.apache.pdfbox.contentstream.operator.markedcontent.EndMarkedContentSequence;
import org.apache.pdfbox.contentstream.operator.state.SetFlatness;
import org.apache.pdfbox.contentstream.operator.state.SetLineCapStyle;
import org.apache.pdfbox.contentstream.operator.state.SetLineDashPattern;
@ -38,9 +35,9 @@ import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.graphics.color.PDColor;
import org.apache.pdfbox.text.TextPosition;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.RedTextPosition;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPositionSequence;
import lombok.Getter;
import lombok.Setter;
@ -51,12 +48,15 @@ import lombok.extern.slf4j.Slf4j;
@Slf4j
public class PDFLinesTextStripper extends PDFTextStripper {
private final static Set<String> DOT_LIKE_CHARACTERS = Set.of(".", "·", "", "", "", "", "", "", "", "", "", "", "", "");
private final List<Word> words = new ArrayList<>();
private final List<TextPositionSequence> textPositionSequences = new ArrayList<>();
private final List<Ruling> rulings = new ArrayList<>();
private final List<Ruling> graphicsPath = new ArrayList<>();
@Setter
protected PDPage pdpage;
private int minCharWidth;
private int maxCharWidth;
private int minCharHeight;
private int maxCharHeight;
private float path_x;
private float path_y;
@ -68,30 +68,26 @@ public class PDFLinesTextStripper extends PDFTextStripper {
public PDFLinesTextStripper() throws IOException {
super();
this.addOperator(new SetStrokingColorSpace(this));
this.addOperator(new SetNonStrokingColorSpace(this));
this.addOperator(new SetLineDashPattern(this));
this.addOperator(new SetStrokingDeviceGrayColor(this));
this.addOperator(new SetNonStrokingDeviceGrayColor(this));
this.addOperator(new SetFlatness(this));
this.addOperator(new SetLineJoinStyle(this));
this.addOperator(new SetLineCapStyle(this));
this.addOperator(new SetStrokingDeviceCMYKColor(this));
this.addOperator(new SetNonStrokingDeviceCMYKColor(this));
this.addOperator(new SetLineMiterLimit(this));
this.addOperator(new SetStrokingDeviceRGBColor(this));
this.addOperator(new SetNonStrokingDeviceRGBColor(this));
this.addOperator(new SetRenderingIntent(this));
this.addOperator(new SetStrokingColor(this));
this.addOperator(new SetNonStrokingColor(this));
this.addOperator(new SetStrokingColorN(this));
this.addOperator(new SetNonStrokingColorN(this));
this.addOperator(new SetFontAndSize(this));
this.addOperator(new SetLineWidth(this));
addOperator(new BeginMarkedContentSequenceWithProperties(this));
// addOperator(new BeginMarkedContentSequence(this));
addOperator(new EndMarkedContentSequence(this));
this.addOperator(new SetStrokingColorSpace());
this.addOperator(new SetNonStrokingColorSpace());
this.addOperator(new SetLineDashPattern());
this.addOperator(new SetStrokingDeviceGrayColor());
this.addOperator(new SetNonStrokingDeviceGrayColor());
this.addOperator(new SetFlatness());
this.addOperator(new SetLineJoinStyle());
this.addOperator(new SetLineCapStyle());
this.addOperator(new SetStrokingDeviceCMYKColor());
this.addOperator(new SetNonStrokingDeviceCMYKColor());
this.addOperator(new SetLineMiterLimit());
this.addOperator(new SetStrokingDeviceRGBColor());
this.addOperator(new SetNonStrokingDeviceRGBColor());
this.addOperator(new SetRenderingIntent());
this.addOperator(new SetStrokingColor());
this.addOperator(new SetNonStrokingColor());
this.addOperator(new SetStrokingColorN());
this.addOperator(new SetNonStrokingColorN());
this.addOperator(new SetFontAndSize());
this.addOperator(new SetLineWidth());
}
@ -102,7 +98,6 @@ public class PDFLinesTextStripper extends PDFTextStripper {
//move
switch (operation) {
case OperatorName.MOVE_TO:
if (arguments.size() == 2) {
Point2D.Float pos = transformPosition(floatValue(arguments.get(0)), floatValue(arguments.get(1)));
@ -203,7 +198,7 @@ public class PDFLinesTextStripper extends PDFTextStripper {
try {
if (stroke && !getGraphicsState().getStrokingColor().isPattern() && isBlack(getGraphicsState().getStrokingColor()) || //
!stroke && !getGraphicsState().getNonStrokingColor().isPattern() && isBlack(getGraphicsState().getNonStrokingColor())) {
!stroke && !getGraphicsState().getNonStrokingColor().isPattern() && isBlack(getGraphicsState().getNonStrokingColor())) {
rulings.addAll(path);
}
} catch (UnsupportedOperationException e) {
@ -224,11 +219,7 @@ public class PDFLinesTextStripper extends PDFTextStripper {
// This is a quick and dirty hack
// Happens for file 216.pdf
log.debug(e.getMessage());
var result = true;
for (var component : color.getComponents()) {
result = result && component == 0;
}
return result;
return color.getComponents()[0] == 0 && color.getComponents()[1] == 0 && color.getComponents()[2] == 0 && color.getComponents()[1] == 1;
}
}
@ -239,17 +230,28 @@ public class PDFLinesTextStripper extends PDFTextStripper {
int startIndex = 0;
RedTextPosition previous = null;
float direction = -1;
for (int i = 0; i <= textPositions.size() - 1; i++) {
if (direction == -1) {
direction = textPositions.get(i).getDir();
if (!textPositionSequences.isEmpty()) {
previous = textPositionSequences.get(textPositionSequences.size() - 1)
.getTextPositions()
.get(textPositionSequences.get(textPositionSequences.size() - 1).getTextPositions().size() - 1);
}
if (!words.isEmpty()) {
previous = words.get(words.size() - 1)
.getCharacters()
.get(words.get(words.size() - 1).getCharacters().size() - 1).getTextPosition();
int charWidth = (int) textPositions.get(i).getWidthDirAdj();
if (charWidth < minCharWidth) {
minCharWidth = charWidth;
}
if (charWidth > maxCharWidth) {
maxCharWidth = charWidth;
}
int charHeight = (int) textPositions.get(i).getHeightDir();
if (charHeight < minCharHeight) {
minCharHeight = charHeight;
}
if (charWidth > maxCharHeight) {
maxCharHeight = charHeight;
}
if (i == 0 && (textPositions.get(i).getUnicode().equals(" ") || textPositions.get(i).getUnicode().equals("\u00A0") || textPositions.get(i).getUnicode().equals("\t"))) {
@ -257,142 +259,80 @@ public class PDFLinesTextStripper extends PDFTextStripper {
continue;
}
if (textPositions.get(i).getDir() != direction && startIndex != i) {
List<TextPosition> sublist = textPositions.subList(startIndex, i);
words.add(new Word(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart));
startIndex = i;
direction = textPositions.get(i).getDir();
}
// Strange but sometimes this is happening, for example: Metolachlor2.pdf
if (checkIfCurrentPositionIsToTheRightOfPreviousPosition(i, textPositions)) {
if (i > 0 && textPositions.get(i).getXDirAdj() < textPositions.get(i - 1).getXDirAdj()) {
List<TextPosition> sublist = textPositions.subList(startIndex, i);
if (checkIfSequenceContainsOnlyWhitespaces(sublist)) {
words.add(new Word(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart));
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0)
.getUnicode()
.equals("\t")))) {
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart));
}
startIndex = i;
}
if (textPositions.get(i).getRotation() == 0 && i > 0 && textPositions.get(i).getX() > textPositions.get(i - 1).getEndX() + 1) {
List<TextPosition> sublist = textPositions.subList(startIndex, i);
if (checkIfSequenceContainsOnlyWhitespaces(sublist)) {
words.add(new Word(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart));
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0)
.getUnicode()
.equals("\t")))) {
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart));
}
startIndex = i;
}
if (i > 0
&& (textPositions.get(i).getUnicode().equals(" ") || textPositions.get(i).getUnicode().equals("\u00A0") || textPositions.get(i).getUnicode().equals("\t"))
&& i <= textPositions.size() - 2) {
if (i > 0 && (textPositions.get(i).getUnicode().equals(" ") || textPositions.get(i).getUnicode().equals("\u00A0") || textPositions.get(i)
.getUnicode()
.equals("\t")) && i <= textPositions.size() - 2) {
List<TextPosition> sublist = textPositions.subList(startIndex, i);
if (checkIfSequenceContainsOnlyWhitespaces(sublist)) {
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0)
.getUnicode()
.equals("\t")))) {
// Remove false sequence ends (whitespaces)
if (checkIfGapSizeBetweenCharactersSmallerThanMaximum(previous, sublist, 0.01f)) {
if (previous != null && sublist.get(0).getYDirAdj() == previous.getYDirAdj() && sublist.get(0)
.getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()) < 0.01) {
for (TextPosition t : sublist) {
words.get(words.size() - 1).add(t);
textPositionSequences.get(textPositionSequences.size() - 1).add(t);
}
} else {
words.add(new Word(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart));
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart));
}
}
startIndex = i + 1;
}
if (isDottedLineFollowedByWord(textPositions, i, startIndex)) {
List<TextPosition> sublist = textPositions.subList(startIndex, i);
words.add(new Word(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart));
startIndex = i;
}
if (isWordFollowedByDottedLine(textPositions, i, startIndex)) {
List<TextPosition> sublist = textPositions.subList(startIndex, i - 2);
words.add(new Word(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart));
startIndex = i - 2;
}
}
List<TextPosition> sublist = textPositions.subList(startIndex, textPositions.size());
if (!sublist.isEmpty() && (sublist.get(sublist.size() - 1).getUnicode().equals(" ")
|| sublist.get(sublist.size() - 1).getUnicode().equals("\u00A0")
|| sublist.get(sublist.size() - 1).getUnicode().equals("\t"))) {
if (!sublist.isEmpty() && (sublist.get(sublist.size() - 1).getUnicode().equals(" ") || sublist.get(sublist.size() - 1)
.getUnicode()
.equals("\u00A0") || sublist.get(sublist.size() - 1).getUnicode().equals("\t"))) {
sublist = sublist.subList(0, sublist.size() - 1);
}
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0)
.getUnicode()
.equals("\t")))) {
if (previous != null
&& sublist.get(0).getYDirAdj() == previous.getYDirAdj()
&& sublist.get(0).getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()) < 0.01) {
if (previous != null && sublist.get(0).getYDirAdj() == previous.getYDirAdj() && sublist.get(0)
.getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()) < 0.01) {
for (TextPosition t : sublist) {
words.get(words.size() - 1).add(t);
textPositionSequences.get(textPositionSequences.size() - 1).add(t);
}
} else {
words.add(new Word(sublist, pageNumber, isParagraphStart));
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, isParagraphStart));
}
}
super.writeString(text);
}
/**
 * Tells whether the characters at {@code i - 2 .. i} are three dot-like characters that directly
 * follow an alphanumeric character at {@code i - 3}.
 *
 * @param textPositions the characters being split into words
 * @param i             index of the character currently inspected
 * @param startIndex    index at which the word currently being collected starts
 * @return {@code true} if at least four characters were collected since {@code startIndex} and the
 * pattern "alphanumeric, dot, dot, dot" ends at {@code i}
 */
private boolean isWordFollowedByDottedLine(List<TextPosition> textPositions, int i, int startIndex) {

    if (i - startIndex < 4) {
        return false;
    }
    boolean endsWithThreeDots = isDot(textPositions, i) && isDot(textPositions, i - 1) && isDot(textPositions, i - 2);
    return endsWithThreeDots && alphanumeric(textPositions, i - 3);
}
/**
 * Tells whether the character at {@code i} is alphanumeric and directly preceded by three
 * dot-like characters at {@code i - 3 .. i - 1}.
 *
 * @param textPositions the characters being split into words
 * @param i             index of the character currently inspected
 * @param startIndex    index at which the word currently being collected starts
 * @return {@code true} if at least four characters were collected since {@code startIndex} and the
 * pattern "dot, dot, dot, alphanumeric" ends at {@code i}
 */
private static boolean isDottedLineFollowedByWord(List<TextPosition> textPositions, int i, int startIndex) {

    if (i - startIndex < 4) {
        return false;
    }
    if (!alphanumeric(textPositions, i)) {
        return false;
    }
    return isDot(textPositions, i - 1) && isDot(textPositions, i - 2) && isDot(textPositions, i - 3);
}
/**
 * Checks whether the text of the character at index {@code i} is contained in
 * {@code DOT_LIKE_CHARACTERS}.
 *
 * @param textPositions the characters to look into
 * @param i             index of the character to test
 * @return {@code true} if the character's unicode string is one of the dot-like characters
 */
private static boolean isDot(List<TextPosition> textPositions, int i) {

    String unicode = textPositions.get(i).getUnicode();
    return DOT_LIKE_CHARACTERS.contains(unicode);
}
/**
 * Checks whether the character at index {@code i} starts with a letter or a digit.
 * <p>
 * Only the first code point of the character's unicode string is inspected. A {@code null} or
 * empty unicode string is treated as "not alphanumeric" instead of raising an exception, and the
 * first code point (rather than the first UTF-16 unit) is used so that letters outside the Basic
 * Multilingual Plane are classified correctly.
 *
 * @param textPositions the characters to look into
 * @param i             index of the character to test
 * @return {@code true} if the first code point is alphabetic or a digit
 */
private static boolean alphanumeric(List<TextPosition> textPositions, int i) {

    String unicode = textPositions.get(i).getUnicode();
    if (unicode == null || unicode.isEmpty()) {
        return false;
    }
    int firstCodePoint = unicode.codePointAt(0);
    return Character.isAlphabetic(firstCodePoint) || Character.isDigit(firstCodePoint);
}
/**
 * Compares the direction-adjusted X coordinate of the character at {@code i} with that of its
 * predecessor.
 * <p>
 * NOTE: despite the method name, the result is {@code true} when the current character's X
 * coordinate is <em>smaller</em> than the previous one, i.e. when the text jumps backwards.
 *
 * @param i             index of the current character
 * @param textPositions the characters of the string being written
 * @return {@code true} if {@code i > 0} and the current X is less than the previous X
 */
public boolean checkIfCurrentPositionIsToTheRightOfPreviousPosition(int i, List<TextPosition> textPositions) {

    if (i <= 0) {
        return false;
    }
    float currentX = textPositions.get(i).getXDirAdj();
    float previousX = textPositions.get(i - 1).getXDirAdj();
    return currentX < previousX;
}
/**
 * Decides whether a character sequence is worth keeping as a word.
 * <p>
 * NOTE: despite the method name, the result is {@code false} for an empty list and for a list
 * holding exactly one space, non-breaking space or tab, and {@code true} for everything else.
 *
 * @param sublist the candidate character sequence
 * @return {@code true} if the sequence is neither empty nor a single whitespace character
 */
public boolean checkIfSequenceContainsOnlyWhitespaces(List<TextPosition> sublist) {

    if (sublist.isEmpty()) {
        return false;
    }
    if (sublist.size() != 1) {
        return true;
    }
    String onlyCharacter = sublist.get(0).getUnicode();
    boolean isWhitespace = onlyCharacter.equals(" ") || onlyCharacter.equals("\u00A0") || onlyCharacter.equals("\t");
    return !isWhitespace;
}
/**
 * Checks whether a sequence starts on the same line as, and (almost) directly after, a previous
 * character.
 *
 * @param previous       the last character written before the sequence, may be {@code null}
 * @param sublist        the sequence to test; its first element is read, so it must not be empty
 * @param maximumGapSize exclusive upper bound for the horizontal gap
 * @return {@code true} if {@code previous} is present, both share the same direction-adjusted Y
 * coordinate and the gap between the end of {@code previous} and the start of the sequence is
 * smaller than {@code maximumGapSize}
 */
public boolean checkIfGapSizeBetweenCharactersSmallerThanMaximum(RedTextPosition previous, List<TextPosition> sublist, float maximumGapSize) {

    if (previous == null) {
        return false;
    }
    TextPosition first = sublist.get(0);
    if (first.getYDirAdj() != previous.getYDirAdj()) {
        return false;
    }
    var gap = first.getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj());
    return gap < maximumGapSize;
}
@Override
public String getText(PDDocument doc) throws IOException {
words.clear();
minCharWidth = Integer.MAX_VALUE;
maxCharWidth = 0;
minCharHeight = Integer.MAX_VALUE;
maxCharHeight = 0;
textPositionSequences.clear();
rulings.clear();
graphicsPath.clear();
path_x = 0.0f;
@ -404,4 +344,3 @@ public class PDFLinesTextStripper extends PDFTextStripper {
}

View File

@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.knecon.fforesight.service.layoutparser.processor.services.parsing;
package com.knecon.fforesight.service.layoutparser.processor.classification.parsing;
import java.io.BufferedInputStream;
import java.io.IOException;
@ -25,10 +25,9 @@ import java.io.StringWriter;
import java.io.Writer;
import java.text.Bidi;
import java.text.Normalizer;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Deque;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
@ -43,22 +42,15 @@ import java.util.regex.Pattern;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.pdfbox.cos.COSDictionary;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageTree;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
import org.apache.pdfbox.pdmodel.interactive.pagenavigation.PDThreadBead;
import org.apache.pdfbox.text.TextPosition;
import org.apache.pdfbox.text.TextPositionComparator;
import com.knecon.fforesight.service.layoutparser.processor.utils.QuickSort;
import lombok.Getter;
import org.apache.pdfbox.util.QuickSort;
/**
* This is just a copy, except that I adjusted lines 594-607 to work around a bug in PDFBox.
@ -71,10 +63,6 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
private static float defaultDropThreshold = 2.5f;
private static final boolean useCustomQuickSort;
@Getter
protected final List<PDMarkedContent> markedContents = new ArrayList<>();
protected final Deque<PDMarkedContent> currentMarkedContents = new ArrayDeque<>();
private static final Log LOG = LogFactory.getLog(PDFTextStripper.class);
// enable the ability to set the default indent/drop thresholds
@ -208,38 +196,6 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
}
/**
 * Opens a new marked-content sequence.
 * <p>
 * The created {@link PDMarkedContent} becomes a top-level entry of {@code markedContents} when no
 * sequence is currently open; otherwise it is attached to the innermost open sequence. In both
 * cases it is pushed as the new innermost open sequence.
 *
 * @param tag        the tag of the marked-content sequence
 * @param properties the property dictionary of the marked-content sequence
 */
public void beginMarkedContentSequence(COSName tag, COSDictionary properties) {

    PDMarkedContent opened = PDMarkedContent.create(tag, properties);
    // peek() yields null exactly when the deque is empty, i.e. when no sequence is open
    PDMarkedContent enclosing = this.currentMarkedContents.peek();
    if (enclosing == null) {
        this.markedContents.add(opened);
    } else {
        enclosing.addMarkedContent(opened);
    }
    this.currentMarkedContents.push(opened);
}
/**
 * Closes the innermost open marked-content sequence, if there is one; does nothing otherwise.
 */
@Override
public void endMarkedContentSequence() {

    // poll() removes the head like pop(), but simply returns null when nothing is open
    this.currentMarkedContents.poll();
}
/**
 * Attaches an XObject to the innermost open marked-content sequence; ignored when no sequence
 * is open.
 *
 * @param xobject the XObject encountered in the content stream
 */
public void xobject(PDXObject xobject) {

    PDMarkedContent innermost = this.currentMarkedContents.peek();
    if (innermost != null) {
        innermost.addXObject(xobject);
    }
}
/**
* This will return the text of a document. See writeText. <br>
* NOTE: The document must not be encrypted when coming into this method.
@ -320,11 +276,7 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
endBookmarkPageNumber = -1;
}
if (startBookmarkPageNumber == -1
&& startBookmark != null
&& endBookmarkPageNumber == -1
&& endBookmark != null
&& startBookmark.getCOSObject() == endBookmark.getCOSObject()) {
if (startBookmarkPageNumber == -1 && startBookmark != null && endBookmarkPageNumber == -1 && endBookmark != null && startBookmark.getCOSObject() == endBookmark.getCOSObject()) {
// this is a special case where both the start and end bookmark
// are the same but point to nothing. In this case
// we will not extract any text.
@ -371,9 +323,7 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
@Override
public void processPage(PDPage page) throws IOException {
if (currentPageNo >= startPage && currentPageNo <= endPage && (startBookmarkPageNumber == -1 || currentPageNo >= startBookmarkPageNumber) && (endBookmarkPageNumber == -1
|| currentPageNo
<= endBookmarkPageNumber)) {
if (currentPageNo >= startPage && currentPageNo <= endPage && (startBookmarkPageNumber == -1 || currentPageNo >= startBookmarkPageNumber) && (endBookmarkPageNumber == -1 || currentPageNo <= endBookmarkPageNumber)) {
startPage(page);
int numberOfArticleSections = 1;
@ -648,6 +598,7 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
var normalized = normalize(line);
// normalized.stream().filter(l -> System.out.println(l.getText().contains("Plenarprotokoll 20/24")).findFirst().isPresent()
lastLineStartPosition = handleLineSeparation(current, lastPosition, lastLineStartPosition, maxHeightForLine);
writeLine(normalized, current.isParagraphStart);
line.clear();
@ -659,8 +610,8 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
}
// test if our TextPosition starts after a new word would be expected to start
if (expectedStartOfNextWordX != EXPECTED_START_OF_NEXT_WORD_X_RESET_VALUE && expectedStartOfNextWordX < positionX
// only bother adding a word separator if the last character was not a word separator
&& (wordSeparator.isEmpty() || //
// only bother adding a word separator if the last character was not a word separator
&& (wordSeparator.isEmpty() || //
(lastPosition.getTextPosition().getUnicode() != null && !lastPosition.getTextPosition().getUnicode().endsWith(wordSeparator)))) {
line.add(LineItem.getWordSeparator());
}
@ -926,11 +877,7 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
textList.add(text);
}
}
if (!this.currentMarkedContents.isEmpty()) {
this.currentMarkedContents.peek().addText(text);
}
}
}
@ -1018,7 +965,7 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
/**
* Set the desired word separator for output text. The PDFBox text extraction algorithm will output a space
* character if there is enough space between two textPositions. By default a space character is used. If you need an
* character if there is enough space between two words. By default a space character is used. If you need an
* accurate count of characters that are found in a PDF document then you might want to set the word separator to
* the empty string.
*
@ -1714,7 +1661,7 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
/**
* Write a list of string containing a whole line of a document.
*
* @param line a list with the textPositions of the given line
* @param line a list with the words of the given line
* @throws IOException if something went wrong
*/
private void writeLine(List<WordWithTextPositions> line, boolean isParagraphEnd) throws IOException {
@ -1722,6 +1669,7 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
int numberOfStrings = line.size();
for (int i = 0; i < numberOfStrings; i++) {
WordWithTextPositions word = line.get(i);
word.getTextPositions().sort(Comparator.comparing(TextPosition::getX));
writeString(word.getText(), word.getTextPositions(), isParagraphEnd && i == numberOfStrings - 1);
if (i < numberOfStrings - 1) {
writeWordSeparator();
@ -1754,9 +1702,9 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
/**
* Handles the LTR and RTL direction of the given textPositions. The whole implementation stands and falls with the given
* word. If the word is a full line, the results will be the best. If the word consists of single textPositions or
* characters, the order of the characters in a word or textPositions in a line may be wrong, due to RTL and LTR marks and
* Handles the LTR and RTL direction of the given words. The whole implementation stands and falls with the given
* word. If the word is a full line, the results will be the best. If the word consists of single words or
* characters, the order of the characters in a word or words in a line may be wrong, due to RTL and LTR marks and
* characters!
* <p>
* Based on http://www.nesterovsky-bros.com/weblog/2013/07/28/VisualToLogicalConversionInJava.aspx
@ -2112,9 +2060,7 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
return endParagraphWritten;
}
public void setEndParagraphWritten() {
public void setEndParagraphWritten(){
endParagraphWritten = true;
}

View File

@ -0,0 +1,280 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.service;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.Orientation;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.RulingTextDirAdjustUtil;
/**
 * Groups the words of a page into text blocks.
 * <p>
 * Words are first clustered into fine-granular blocks based on gaps, rulings, font changes and
 * list identifiers; consecutive blocks that are right-aligned and vertically close are then merged.
 */
@Service
@SuppressWarnings("all")
public class BlockificationService {

    /** Maximum difference for two coordinates to be considered equal. */
    private static final float THRESHOLD = 1f;
    /** A vertical gap larger than the word height times this factor starts a new block. */
    private static final float Y_GAP_SPLIT_HEIGHT_MODIFIER = 1.25f;
    /** A horizontal gap on the same line larger than this value starts a new block. */
    private static final int X_GAP_SPLIT_CONSTANT = 50;
    /** Matches list identifiers such as "1.", "12)", "iv." or "a)"; compiled once instead of per call. */
    private static final Pattern LIST_IDENTIFIER = Pattern.compile("\\b(?:[1-9]|1\\d|20|[ivxlc]|[a-z])\\s*(?:[.)])", Pattern.CASE_INSENSITIVE);


    /**
     * This method is building blocks by expanding the minX/maxX and minY/maxY value on each word that is not split by the conditions.
     * This method must use text direction adjusted positions (DirAdj). Where {0,0} is on the upper left. Never try to change this!
     * Rulings (Table lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling.
     *
     * @param textPositions         The words of a page.
     * @param horizontalRulingLines Horizontal table lines.
     * @param verticalRulingLines   Vertical table lines.
     * @return ClassificationPage object that contains the Textblock and text statistics.
     */
    public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {

        List<TextPageBlock> fineGranularBlocks = constructFineGranularTextPageBlocks(textPositions, horizontalRulingLines, verticalRulingLines);
        List<TextPageBlock> mergedBlocks = mergeFineGranularTextPageBlocks(fineGranularBlocks);
        // the page receives its own mutable copy of the block list
        return new ClassificationPage(new ArrayList<AbstractPageBlock>(mergedBlocks));
    }


    /**
     * Merges runs of consecutive blocks whose right edges align (difference below 1) and whose
     * vertical distance (top of the current block to bottom of the previous one, in PDF
     * coordinates) is below 5.
     *
     * @param classificationTextBlocks the fine-granular blocks in reading order
     * @return one merged block per run; an empty list if the input is empty
     */
    private List<TextPageBlock> mergeFineGranularTextPageBlocks(List<TextPageBlock> classificationTextBlocks) {

        if (classificationTextBlocks.isEmpty()) {
            return new ArrayList<>();
        }

        List<List<TextPageBlock>> mergeGroups = new ArrayList<>();
        List<TextPageBlock> currentGroup = new ArrayList<>();
        mergeGroups.add(currentGroup);

        TextPageBlock previousTextBlock = null;
        for (TextPageBlock currentTextBlock : classificationTextBlocks) {
            if (previousTextBlock != null) {
                boolean alignsXRight = Math.abs(currentTextBlock.getPdfMaxX() - previousTextBlock.getPdfMaxX()) < 1;
                boolean smallYGap = Math.abs(currentTextBlock.getPdfMaxY() - previousTextBlock.getPdfMinY()) < 5;
                if (!(alignsXRight && smallYGap)) {
                    // the run is broken: start a new group with the current block
                    currentGroup = new ArrayList<>();
                    mergeGroups.add(currentGroup);
                }
            }
            currentGroup.add(currentTextBlock);
            previousTextBlock = currentTextBlock;
        }

        return mergeGroups.stream().map(TextPageBlock::merge).toList();
    }


    /**
     * Merges blocks that belong to the same left or right column into their predecessor and removes
     * the merged blocks from the given list.
     * <p>
     * NOTE(review): this method is not invoked anywhere within this class.
     *
     * @param classificationTextBlocks the blocks to process; modified in place
     */
    private void assignOrientations(List<TextPageBlock> classificationTextBlocks) {

        Iterator<TextPageBlock> iterator = classificationTextBlocks.iterator();

        TextPageBlock previousLeft = null;
        TextPageBlock previousRight = null;
        while (iterator.hasNext()) {
            TextPageBlock block = iterator.next();

            if (previousLeft != null && block.getOrientation().equals(Orientation.LEFT)) {
                if (previousLeft.getMinY() > block.getMinY() && block.getMaxY() + block.getMostPopularWordHeight() > previousLeft.getMinY()) {
                    previousLeft.add(block);
                    iterator.remove();
                    continue;
                }
            }

            if (previousRight != null && block.getOrientation().equals(Orientation.RIGHT)) {
                if (previousRight.getMinY() > block.getMinY() && block.getMaxY() + block.getMostPopularWordHeight() > previousRight.getMinY()) {
                    previousRight.add(block);
                    iterator.remove();
                    continue;
                }
            }

            if (block.getOrientation().equals(Orientation.LEFT)) {
                previousLeft = block;
            } else if (block.getOrientation().equals(Orientation.RIGHT)) {
                previousRight = block;
            }
        }

        iterator = classificationTextBlocks.iterator();
        TextPageBlock previous = null;
        while (iterator.hasNext()) {
            TextPageBlock block = iterator.next();

            // a LEFT block absorbs a following LEFT or RIGHT block that ends at the same height
            if (previous != null
                    && previous.getOrientation().equals(Orientation.LEFT)
                    && (block.getOrientation().equals(Orientation.LEFT) || block.getOrientation().equals(Orientation.RIGHT))
                    && equalsWithThreshold(block.getMaxY(), previous.getMaxY())) {
                previous.add(block);
                iterator.remove();
                continue;
            }

            previous = block;
        }
    }


    /**
     * Clusters the words of a page into fine-granular text blocks.
     * <p>
     * A new block is started in front of a word when the previous word is flagged as paragraph
     * start, when the word jumps left of, far right of, below or above the current cluster, when a
     * ruling separates it from the cluster, or when it starts a new line together with a font
     * change or a list identifier.
     *
     * @param textPositions         the words of a page in reading order
     * @param horizontalRulingLines horizontal table lines
     * @param verticalRulingLines   vertical table lines
     * @return the resulting blocks in reading order
     */
    private List<TextPageBlock> constructFineGranularTextPageBlocks(List<TextPositionSequence> textPositions,
                                                                    List<Ruling> horizontalRulingLines,
                                                                    List<Ruling> verticalRulingLines) {

        List<TextPositionSequence> wordClusterToCombine = new ArrayList<>();
        List<TextPageBlock> classificationTextBlocks = new ArrayList<>();

        // Bounding box of the current cluster. The sentinels are the extreme float values so that
        // the first word of a cluster always defines the box, also for coordinates above 1000 or below 0.
        float minX = Float.MAX_VALUE;
        float maxX = -Float.MAX_VALUE;
        float minY = Float.MAX_VALUE;
        float maxY = -Float.MAX_VALUE;

        TextPositionSequence prev = null;

        // wasSplitted marks that the last split was caused by an inline x-gap; splitX1 is only
        // meaningful while wasSplitted is true and holds the x position where the right part started.
        boolean wasSplitted = false;
        float splitX1 = 0;

        for (TextPositionSequence word : textPositions) {

            // every split condition requires a previous word, so nothing is evaluated for the first word of a cluster
            if (prev != null) {
                boolean yGap = word.getMinYDirAdj() - maxY > word.getHeight() * Y_GAP_SPLIT_HEIGHT_MODIFIER;
                boolean sameLine = equalsWithThreshold(prev.getMinYDirAdj(), word.getMinYDirAdj());
                boolean positiveXGapInline = maxX + X_GAP_SPLIT_CONSTANT < word.getMinXDirAdj() && sameLine;
                boolean negativeXGap = word.getMinXDirAdj() - minX < -5;
                boolean startFromTop = word.getMinYDirAdj() < prev.getMinYDirAdj() - prev.getTextHeight();
                boolean newLineAfterSplit = word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj();
                boolean splitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, horizontalRulingLines, verticalRulingLines);
                boolean fontChange = !word.getFont().equals(prev.getFont()) || !word.getFontStyle().equals(prev.getFontStyle()) || word.getFontSize() != prev.getFontSize();
                boolean newline = Math.abs(word.getMinYDirAdj() - prev.getMinYDirAdj()) > word.getHeight();
                boolean isListIdentifier = LIST_IDENTIFIER.matcher(word.toString()).matches();

                if (prev.isParagraphStart() || negativeXGap || positiveXGapInline || yGap || startFromTop || splitByRuling || (newline && (fontChange || isListIdentifier))) {

                    Orientation prevOrientation = null;
                    if (!classificationTextBlocks.isEmpty()) {
                        prevOrientation = classificationTextBlocks.get(classificationTextBlocks.size() - 1).getOrientation();
                    }

                    TextPageBlock classificationTextBlock = TextPageBlock.fromTextPositionSequences(wordClusterToCombine);
                    classificationTextBlocks.add(classificationTextBlock);
                    wordClusterToCombine = new ArrayList<>();

                    if (positiveXGapInline && !splitByRuling) {
                        wasSplitted = true;
                        classificationTextBlock.setOrientation(Orientation.LEFT);
                        splitX1 = word.getMinXDirAdj();
                    } else if (newLineAfterSplit && !splitByRuling) {
                        wasSplitted = false;
                        classificationTextBlock.setOrientation(Orientation.RIGHT);
                    } else if (prevOrientation != null && prevOrientation.equals(Orientation.RIGHT) && (yGap || !startFromTop || !positiveXGapInline || !newLineAfterSplit || !splitByRuling)) {
                        classificationTextBlock.setOrientation(Orientation.LEFT);
                    }

                    // start a fresh bounding box for the next cluster
                    minX = Float.MAX_VALUE;
                    maxX = -Float.MAX_VALUE;
                    minY = Float.MAX_VALUE;
                    maxY = -Float.MAX_VALUE;
                }
            }

            wordClusterToCombine.add(word);
            prev = word;

            if (word.getMinXDirAdj() < minX) {
                minX = word.getMinXDirAdj();
            }
            if (word.getMaxXDirAdj() > maxX) {
                maxX = word.getMaxXDirAdj();
            }
            if (word.getMinYDirAdj() < minY) {
                minY = word.getMinYDirAdj();
            }
            if (word.getMaxYDirAdj() > maxY) {
                maxY = word.getMaxYDirAdj();
            }
        }

        // the factory result is null-checked because the remaining cluster is empty for an empty page
        TextPageBlock lastBlock = TextPageBlock.fromTextPositionSequences(wordClusterToCombine);
        if (lastBlock != null) {
            classificationTextBlocks.add(lastBlock);
        }

        return classificationTextBlocks;
    }


    /**
     * @return {@code true} if the two values differ by less than {@link #THRESHOLD}
     */
    private boolean equalsWithThreshold(float f1, float f2) {

        return Math.abs(f1 - f2) < THRESHOLD;
    }


    /**
     * Checks whether a ruling lies between the current cluster and the given word.
     * <p>
     * Two connecting segments are tested against both the vertical and the horizontal rulings:
     * from (maxX, minY) of the cluster to the word's (minX, minY), and from (minX, minY) of the
     * cluster to the word's (minX, maxY).
     *
     * @param minX                  left edge of the current cluster
     * @param minY                  top edge of the current cluster
     * @param maxX                  right edge of the current cluster
     * @param maxY                  bottom edge of the current cluster (not used by the check)
     * @param word                  the word that is about to be added
     * @param horizontalRulingLines horizontal table lines
     * @param verticalRulingLines   vertical table lines
     * @return {@code true} if any of the segments intersects any ruling
     */
    private boolean isSplitByRuling(float minX,
                                    float minY,
                                    float maxX,
                                    float maxY,
                                    TextPositionSequence word,
                                    List<Ruling> horizontalRulingLines,
                                    List<Ruling> verticalRulingLines) {

        float wordMinX = word.getMinXDirAdj();
        float wordMinY = word.getMinYDirAdj();
        float wordMaxY = word.getMaxYDirAdj();
        float dir = word.getDir().getDegrees();
        float pageWidth = word.getPageWidth();
        float pageHeight = word.getPageHeight();

        return isSplitByRuling(maxX, minY, wordMinX, wordMinY, verticalRulingLines, dir, pageWidth, pageHeight) //
                || isSplitByRuling(minX, minY, wordMinX, wordMaxY, horizontalRulingLines, dir, pageWidth, pageHeight) //
                || isSplitByRuling(maxX, minY, wordMinX, wordMinY, horizontalRulingLines, dir, pageWidth, pageHeight) //
                || isSplitByRuling(minX, minY, wordMinX, wordMaxY, verticalRulingLines, dir, pageWidth, pageHeight);
    }


    /**
     * Checks whether the segment from (previousX2, previousY1) to (currentX1, currentY1) intersects
     * any of the given rulings after these were converted to text-direction-adjusted coordinates.
     *
     * @param previousX2  x coordinate of the segment start
     * @param previousY1  y coordinate of the segment start
     * @param currentX1   x coordinate of the segment end
     * @param currentY1   y coordinate of the segment end
     * @param rulingLines the rulings to test against
     * @param dir         text direction in degrees
     * @param pageWidth   width of the page
     * @param pageHeight  height of the page
     * @return {@code true} if at least one ruling intersects the segment
     */
    private boolean isSplitByRuling(float previousX2, float previousY1, float currentX1, float currentY1, List<Ruling> rulingLines, float dir, float pageWidth, float pageHeight) {

        for (Ruling ruling : rulingLines) {
            var line = RulingTextDirAdjustUtil.convertToDirAdj(ruling, dir, pageWidth, pageHeight);
            if (line.intersectsLine(previousX2, previousY1, currentX1, currentY1)) {
                return true;
            }
        }
        return false;
    }

}

View File

@ -0,0 +1,164 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.service;
import java.util.List;
import org.springframework.stereotype.Service;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.FloatFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.PositionUtils;
/**
 * Calculates the frame that surrounds the main text of a document and assigns it, adjusted to the
 * page rotation, to the individual pages.
 */
@Service
public class BodyTextFrameService {

    /** Text blocks with fewer approximate lines than this are ignored when calculating the frame. */
    private static final float APPROXIMATE_HEADER_LINE_COUNT = 2.0f;


    /**
     * Adjusts and sets the body text frame to a page.
     * Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation.
     * 0 -> LowerLeft
     * 90 -> UpperLeft
     * 180 -> UpperRight
     * 270 -> LowerRight
     * The aspect ratio of the page is also regarded.
     *
     * @param page                   The page
     * @param bodyTextFrame          frame that contains the main text on portrait pages
     * @param landscapeBodyTextFrame frame that contains the main text on landscape pages
     */
    public void setBodyTextFrameAdjustedToPage(ClassificationPage page, Rectangle bodyTextFrame, Rectangle landscapeBodyTextFrame) {

        Rectangle textFrame = page.isLandscape() ? landscapeBodyTextFrame : bodyTextFrame;
        boolean widerThanHigh = page.getPageWidth() > page.getPageHeight();

        // NOTE(review): the last constructor argument is 0 in two branches and the page number in
        // the third one; kept as is, confirm whether this difference is intended.
        if (widerThanHigh && page.getRotation() == 270) {
            // swap the axes and mirror the frame along the page height
            textFrame = new Rectangle(new Point(textFrame.getTopLeft().getY(), page.getPageHeight() - textFrame.getTopLeft().getX() - textFrame.getWidth()),
                    textFrame.getHeight(),
                    textFrame.getWidth(),
                    0);
        } else if (widerThanHigh && page.getRotation() != 0) {
            // swap the axes only
            textFrame = new Rectangle(new Point(textFrame.getTopLeft().getY(), textFrame.getTopLeft().getX()), textFrame.getHeight(), textFrame.getWidth(), page.getPageNumber());
        } else if (page.getRotation() == 180) {
            // keep the axes, mirror the frame along the page height
            textFrame = new Rectangle(new Point(textFrame.getTopLeft().getX(), page.getPageHeight() - textFrame.getTopLeft().getY() - textFrame.getHeight()),
                    textFrame.getWidth(),
                    textFrame.getHeight(),
                    0);
        }
        page.setBodyTextFrame(textFrame);
    }


    /**
     * Calculates the frame that contains the main text, text outside the frame will be e.g. headers or footers.
     * Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation.
     * 0 -> LowerLeft
     * 90 -> UpperLeft
     * 180 -> UpperRight
     * 270 -> LowerRight
     * The aspect ratio of the page is also regarded.
     * <p>
     * Text blocks only contribute if they have a most popular font and style, span at least
     * {@link #APPROXIMATE_HEADER_LINE_COUNT} approximate lines and use at least the document's most
     * popular font size. Text blocks inside table cells always contribute.
     *
     * @param pages                   List of all pages
     * @param documentFontSizeCounter Statistics of the document
     * @param landscape               Calculate for landscape or portrait
     * @return Rectangle of the text frame
     */
    public Rectangle calculateBodyTextFrame(List<ClassificationPage> pages, FloatFrequencyCounter documentFontSizeCounter, boolean landscape) {

        BodyTextFrameExpansionsRectangle expansionsRectangle = new BodyTextFrameExpansionsRectangle();

        for (ClassificationPage page : pages) {

            if (page.getTextBlocks().isEmpty() || landscape != page.isLandscape()) {
                continue;
            }

            for (AbstractPageBlock container : page.getTextBlocks()) {

                if (container instanceof TextPageBlock textBlock) {
                    if (textBlock.getMostPopularWordFont() == null || textBlock.getMostPopularWordStyle() == null) {
                        continue;
                    }

                    float approxLineCount = PositionUtils.getApproxLineCount(textBlock);
                    if (approxLineCount < APPROXIMATE_HEADER_LINE_COUNT) {
                        continue;
                    }

                    if (documentFontSizeCounter.getMostPopular() != null && textBlock.getMostPopularWordFontSize() >= documentFontSizeCounter.getMostPopular()) {
                        expandRectangle(textBlock, page, expansionsRectangle);
                    }
                }

                if (container instanceof TablePageBlock table) {
                    for (List<Cell> row : table.getRows()) {
                        for (Cell cell : row) {
                            if (cell == null || cell.getTextBlocks() == null) {
                                continue;
                            }
                            for (TextPageBlock textBlock : cell.getTextBlocks()) {
                                expandRectangle(textBlock, page, expansionsRectangle);
                            }
                        }
                    }
                }
            }
        }

        return new Rectangle(new Point(expansionsRectangle.minX, expansionsRectangle.minY),
                expansionsRectangle.maxX - expansionsRectangle.minX,
                expansionsRectangle.maxY - expansionsRectangle.minY,
                0);
    }


    /**
     * Grows the accumulated rectangle so that it contains the given text block. On pages that are
     * wider than high and rotated, the block's x and y coordinates are swapped first.
     *
     * @param textBlock           the block that must be contained
     * @param page                the page the block belongs to
     * @param expansionsRectangle the accumulated rectangle; modified in place
     */
    private void expandRectangle(TextPageBlock textBlock, ClassificationPage page, BodyTextFrameExpansionsRectangle expansionsRectangle) {

        boolean swapAxes = page.getPageWidth() > page.getPageHeight() && page.getRotation() != 0;
        if (swapAxes) {
            expansionsRectangle.include(textBlock.getPdfMinY(), textBlock.getPdfMaxY(), textBlock.getPdfMinX(), textBlock.getPdfMaxX());
        } else {
            expansionsRectangle.include(textBlock.getPdfMinX(), textBlock.getPdfMaxX(), textBlock.getPdfMinY(), textBlock.getPdfMaxY());
        }
    }


    /**
     * Mutable accumulator for the extent of the body text. Declared static because it does not
     * need access to the enclosing service instance.
     */
    private static final class BodyTextFrameExpansionsRectangle {

        float minX = 10000;
        float maxX = -100;
        float minY = 10000;
        float maxY = -100;


        /**
         * Widens the rectangle so that it contains the given horizontal and vertical ranges.
         */
        void include(float lowX, float highX, float lowY, float highY) {

            if (lowX < minX) {
                minX = lowX;
            }
            if (highX > maxX) {
                maxX = highX;
            }
            if (lowY < minY) {
                minY = lowY;
            }
            if (highY > maxY) {
                maxY = highY;
            }
        }

    }

}

View File

@ -0,0 +1,112 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.service;
import java.util.List;
import java.util.regex.Pattern;
import org.springframework.stereotype.Service;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.PositionUtils;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@Service
@RequiredArgsConstructor
public class ClassificationService {
private final BodyTextFrameService bodyTextFrameService;
/**
 * Classifies the text blocks of every page of the document.
 * <p>
 * Two body text frames are calculated up front from all pages and the document font size
 * counter: one with the last argument {@code false} and one with {@code true} (kept as the
 * landscape frame). For each page the frame is then adjusted to the page and the page's blocks
 * are classified.
 *
 * @param document document whose pages are classified in place
 */
public void classifyDocument(ClassificationDocument document) {
    var pages = document.getPages();
    var fontSizeCounter = document.getFontSizeCounter();

    Rectangle bodyTextFrame = bodyTextFrameService.calculateBodyTextFrame(pages, fontSizeCounter, false);
    Rectangle landscapeBodyTextFrame = bodyTextFrameService.calculateBodyTextFrame(pages, fontSizeCounter, true);
    List<Float> headlineFontSizes = fontSizeCounter.getHighterThanMostPopular();

    log.debug("Document FontSize counters are: {}", fontSizeCounter.getCountPerValue());

    pages.forEach(page -> {
        bodyTextFrameService.setBodyTextFrameAdjustedToPage(page, bodyTextFrame, landscapeBodyTextFrame);
        classifyPage(page, document, headlineFontSizes);
    });
}
/**
 * Classifies every {@link TextPageBlock} of the page; blocks of any other type are skipped.
 *
 * @param page              page whose blocks are classified
 * @param document          document providing the font statistics
 * @param headlineFontSizes font sizes that are treated as headline sizes
 */
public void classifyPage(ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes) {
    page.getTextBlocks()
            .stream()
            .filter(TextPageBlock.class::isInstance)
            .map(TextPageBlock.class::cast)
            .forEach(textPageBlock -> classifyBlock(textPageBlock, page, document, headlineFontSizes));
}
/**
 * Assigns a {@link PageBlockType} to a single text block.
 * <p>
 * The rules below are evaluated in order; the first rule whose condition holds decides:
 * <ol>
 * <li>the document has no most popular font size: {@code OTHER}</li>
 * <li>the block lies over / under the page's body text frame: {@code HEADER} / {@code FOOTER}</li>
 * <li>first page, and either (word height difference to the document &gt; 2.5 and highest font size above the
 * document's most popular size) or the page has exactly one block: {@code TITLE}, unless the block text is digits only</li>
 * <li>font size above the document's most popular size, approx. line count below 4.9, bold (or, when the document
 * has no bold text at all, more than one unit larger) and the first glyph is at least as large as the block's most
 * popular word size: headline type {@code i}, where {@code i} is the 1-based index of the equal entry in
 * {@code headlineFontSizes}</li>
 * <li>bold block within the body text frame, not starting with "Figure ", document not predominantly bold,
 * approx. line count below 2.9: headline type {@code headlineFontSizes.size() + 1}</li>
 * <li>blocks within the body text frame: {@code PARAGRAPH_BOLD}, {@code PARAGRAPH}, {@code PARAGRAPH_ITALIC}
 * or {@code PARAGRAPH_UNKNOWN}</li>
 * <li>anything else: {@code OTHER}</li>
 * </ol>
 * Whenever a headline type is assigned, {@code document.setHeadlines(true)} is called as well.
 *
 * @param textBlock         block to classify; its classification is set in place
 * @param page              page the block belongs to (body text frame, rotation, page number)
 * @param document          document providing font size, font, style and text height statistics
 * @param headlineFontSizes font sizes treated as headline sizes; the index determines the headline type
 */
public void classifyBlock(TextPageBlock textBlock, ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes) {
var bodyTextFrame = page.getBodyTextFrame();
// Without a most popular font size none of the size based rules can be evaluated.
if (document.getFontSizeCounter().getMostPopular() == null) {
textBlock.setClassification(PageBlockType.OTHER);
return;
}
if (PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation())) {
textBlock.setClassification(PageBlockType.HEADER);
} else if (PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation())) {
textBlock.setClassification(PageBlockType.FOOTER);
} else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock,
document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks()
.size() == 1)) {
// Title candidate on the first page. A block consisting only of digits is not a title.
// NOTE(review): such a digits-only block leaves this branch without any classification being set here.
if (!Pattern.matches("[0-9]+", textBlock.toString())) {
textBlock.setClassification(PageBlockType.TITLE);
}
} else if (textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter()
.getMostPopular() && PositionUtils.getApproxLineCount(textBlock) < 4.9 && (textBlock.getMostPopularWordStyle().equals("bold") || !document.getFontStyleCounter()
.getCountPerValue()
.containsKey("bold") && textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular() + 1) && textBlock.getSequences()
.get(0)
.getTextPositions()
.get(0)
.getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) {
// Font size based headline: the headline type is the 1-based index of the equal headline font size.
// NOTE(review): if no entry of headlineFontSizes equals the block's font size, no classification is set here.
for (int i = 1; i <= headlineFontSizes.size(); i++) {
if (textBlock.getMostPopularWordFontSize() == headlineFontSizes.get(i - 1)) {
textBlock.setClassification(PageBlockType.getHeadlineType(i));
document.setHeadlines(true);
}
}
} else if (!textBlock.getText().startsWith("Figure ") && PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordStyle()
.equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold") && PositionUtils.getApproxLineCount(textBlock) < 2.9 && textBlock.getSequences()
.get(0)
.getTextPositions()
.get(0)
.getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) {
// Short bold block in a document that is not predominantly bold: headline type one past the font size based ones.
textBlock.setClassification(PageBlockType.getHeadlineType(headlineFontSizes.size() + 1));
document.setHeadlines(true);
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
.getMostPopular() && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold")) {
textBlock.setClassification(PageBlockType.PARAGRAPH_BOLD);
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFont()
.equals(document.getFontCounter().getMostPopular()) && textBlock.getMostPopularWordStyle()
.equals(document.getFontStyleCounter().getMostPopular()) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()) {
// Font, style and size all equal the document's most popular values.
textBlock.setClassification(PageBlockType.PARAGRAPH);
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
.getMostPopular() && textBlock.getMostPopularWordStyle().equals("italic") && !document.getFontStyleCounter()
.getMostPopular()
.equals("italic") && PositionUtils.getApproxLineCount(textBlock) < 2.9) {
textBlock.setClassification(PageBlockType.PARAGRAPH_ITALIC);
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)) {
textBlock.setClassification(PageBlockType.PARAGRAPH_UNKNOWN);
} else {
textBlock.setClassification(PageBlockType.OTHER);
}
}
}

View File

@ -0,0 +1,136 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.service;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.adapter.ImageServiceResponseAdapter;
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableCells;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.classification.parsing.PDFLinesTextStripper;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@Service
@RequiredArgsConstructor
public class PdfParsingService {
private final RulingCleaningService rulingCleaningService;
private final TableExtractionService tableExtractionService;
private final BlockificationService blockificationService;
private final ImageServiceResponseAdapter imageServiceResponseAdapter;
/**
 * Parses every page of the PDF into a {@link ClassificationPage} and collects them in a new
 * {@link ClassificationDocument}.
 *
 * @param originDocument PDF to parse; all security is marked to be removed on it
 * @param pdfTableCells  table cells per 1-based page number, used when building the rulings
 * @param pdfImages      classified images per 1-based page number, may be {@code null}
 * @return the document holding one classification page per PDF page
 */
public ClassificationDocument parseDocument(PDDocument originDocument, Map<Integer, List<TableCells>> pdfTableCells, Map<Integer, List<ClassifiedImage>> pdfImages) {
    ClassificationDocument parsedDocument = new ClassificationDocument();
    List<ClassificationPage> parsedPages = new ArrayList<>();

    originDocument.setAllSecurityToBeRemoved(true);

    // Page numbers are 1-based throughout the parsing services.
    final int lastPageNumber = originDocument.getNumberOfPages();
    int currentPageNumber = 1;
    while (currentPageNumber <= lastPageNumber) {
        parsePage(pdfImages, originDocument, pdfTableCells, parsedDocument, parsedPages, currentPageNumber);
        currentPageNumber++;
    }

    parsedDocument.setPages(parsedPages);
    return parsedDocument;
}
/**
 * Parses a single page: extracts text and rulings, blockifies the text, attaches images,
 * extracts tables and updates the page and document statistics. The finished page is appended
 * to {@code classificationPages}.
 *
 * @param pdfImages           classified images per 1-based page number, may be {@code null}
 * @param pdDocument          PDF the page is read from
 * @param pdfTableCells       table cells per 1-based page number
 * @param document            document whose statistics are increased
 * @param classificationPages list the parsed page is added to
 * @param pageNumber          1-based number of the page to parse
 */
@SneakyThrows
private void parsePage(Map<Integer, List<ClassifiedImage>> pdfImages,
        PDDocument pdDocument,
        Map<Integer, List<TableCells>> pdfTableCells,
        ClassificationDocument document,
        List<ClassificationPage> classificationPages,
        int pageNumber) {

    PDFLinesTextStripper textStripper = new PDFLinesTextStripper();
    PDPage currentPage = pdDocument.getPage(pageNumber - 1);
    textStripper.setPageNumber(pageNumber);
    textStripper.setStartPage(pageNumber);
    textStripper.setEndPage(pageNumber);
    textStripper.setPdpage(currentPage);
    // The returned text is not needed; the call is made so the stripper processes the page.
    textStripper.getText(pdDocument);

    PDRectangle mediaBox = currentPage.getMediaBox();
    int rotation = currentPage.getRotation();

    boolean widerThanHigh = mediaBox.getWidth() > mediaBox.getHeight();
    boolean higherThanWide = mediaBox.getHeight() > mediaBox.getWidth();
    boolean uprightRotation = rotation == 0 || rotation == 180;
    boolean sidewaysRotation = rotation == 90 || rotation == 270;
    boolean isLandscape = widerThanHigh && uprightRotation || higherThanWide && sidewaysRotation;

    PDRectangle cropBox = currentPage.getCropBox();

    CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(pdfTableCells.get(pageNumber),
            textStripper.getRulings(),
            textStripper.getMinCharWidth(),
            textStripper.getMaxCharHeight());

    ClassificationPage parsedPage = blockificationService.blockify(textStripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
    parsedPage.setRotation(rotation);
    parsedPage.setLandscape(isLandscape);
    parsedPage.setPageNumber(pageNumber);
    parsedPage.setPageWidth(cropBox.getWidth());
    parsedPage.setPageHeight(cropBox.getHeight());

    // OCR detection on images has to run before text blocks are moved into tables; otherwise the
    // findOcr algorithm would need to be adapted.
    boolean pageHasImages = pdfImages != null && pdfImages.containsKey(pageNumber);
    if (pageHasImages) {
        parsedPage.setImages(pdfImages.get(pageNumber));
        imageServiceResponseAdapter.findOcr(parsedPage);
    }

    tableExtractionService.extractTables(cleanRulings, parsedPage);
    buildPageStatistics(parsedPage);
    increaseDocumentStatistics(parsedPage, document);

    classificationPages.add(parsedPage);
}
/**
 * Adds the page's counters to the document's counters.
 * <p>
 * Font, text height and font style counts are always added. Font size counts are added only for
 * pages that are not flagged as landscape.
 *
 * @param classificationPage page whose statistics were already built
 * @param document           document whose counters are increased
 */
private void increaseDocumentStatistics(ClassificationPage classificationPage, ClassificationDocument document) {
    boolean portraitPage = !classificationPage.isLandscape();
    if (portraitPage) {
        var pageFontSizes = classificationPage.getFontSizeCounter().getCountPerValue();
        document.getFontSizeCounter().addAll(pageFontSizes);
    }

    var pageFonts = classificationPage.getFontCounter().getCountPerValue();
    document.getFontCounter().addAll(pageFonts);

    var pageTextHeights = classificationPage.getTextHeightCounter().getCountPerValue();
    document.getTextHeightCounter().addAll(pageTextHeights);

    var pageFontStyles = classificationPage.getFontStyleCounter().getCountPerValue();
    document.getFontStyleCounter().addAll(pageFontStyles);
}
/**
 * Fills the page's text height, font, font size and font style counters from the words of all
 * {@link TextPageBlock}s that are direct children of the page.
 * <p>
 * Blocks of other types (such as tables) and text blocks without sequences are skipped; tables
 * are left out on purpose because they are always added to the body text frame.
 *
 * @param classificationPage page whose counters are filled
 */
private void buildPageStatistics(ClassificationPage classificationPage) {
    for (AbstractPageBlock pageBlock : classificationPage.getTextBlocks()) {
        if (!(pageBlock instanceof TextPageBlock textPageBlock)) {
            continue;
        }
        if (textPageBlock.getSequences() == null) {
            continue;
        }
        textPageBlock.getSequences().forEach(word -> {
            classificationPage.getTextHeightCounter().add(word.getTextHeight());
            classificationPage.getFontCounter().add(word.getFont());
            classificationPage.getFontSizeCounter().add(word.getFontSize());
            classificationPage.getFontStyleCounter().add(word.getFontStyle());
        });
    }
}
}

View File

@ -0,0 +1,231 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.service;
import java.awt.geom.Line2D;
import java.awt.geom.Point2D;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableCells;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.DoubleComparisons;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@Service
@RequiredArgsConstructor
public class RulingCleaningService {
/**
 * Builds the cleaned vertical and horizontal ruling lines of a page.
 * <p>
 * The rulings are snapped first (if there are any). Vertical and horizontal rulings are then
 * selected separately; when an orientation has no rulings at all, the edges of the given table
 * cells are used instead. Each orientation is finally collapsed into merged lines.
 *
 * @param tableCells    table cells used as a fallback source of rulings, may be {@code null}
 * @param rulings       rulings found on the page; modified in place by the snapping
 * @param minCharWidth  snapping threshold in x direction
 * @param maxCharHeight snapping threshold in y direction
 * @return the collapsed vertical and horizontal rulings
 */
public CleanRulings getCleanRulings(List<TableCells> tableCells, List<Ruling> rulings, float minCharWidth, float maxCharHeight) {
    boolean hasRulings = !rulings.isEmpty();
    if (hasRulings) {
        snapPoints(rulings, minCharWidth, maxCharHeight);
    }

    List<Ruling> verticalCandidates = new ArrayList<>();
    rulings.stream().filter(Ruling::vertical).forEach(verticalCandidates::add);
    if (verticalCandidates.isEmpty()) {
        verticalCandidates.addAll(extractVerticalRulings(tableCells));
    }
    List<Ruling> collapsedVertical = collapseOrientedRulings(verticalCandidates);

    List<Ruling> horizontalCandidates = new ArrayList<>();
    rulings.stream().filter(Ruling::horizontal).forEach(horizontalCandidates::add);
    if (horizontalCandidates.isEmpty()) {
        horizontalCandidates.addAll(extractHorizontalRulings(tableCells));
    }
    List<Ruling> collapsedHorizontal = collapseOrientedRulings(horizontalCandidates);

    return CleanRulings.builder().vertical(collapsedVertical).horizontal(collapsedHorizontal).build();
}
/**
 * Snaps the endpoints of the given lines onto shared coordinates.
 * <p>
 * All endpoints are sorted by x and split into groups: a point joins the current group when its
 * x differs from the x of the group's first point by less than {@code xThreshold}, otherwise it
 * opens a new group. Every point of a group then gets the group's average x. The same is done
 * for y with {@code yThreshold}, and finally the lines are updated with the moved endpoints.
 * <p>
 * Every endpoint takes part in the grouping, including the last one in sort order. Each line is
 * tracked by its position in the list, so lines that are {@code equals} to each other are all
 * updated.
 *
 * @param rulings    lines to snap; modified in place. {@code null} or an empty list is a no-op.
 * @param xThreshold maximum x distance (exclusive) to the first point of a group
 * @param yThreshold maximum y distance (exclusive) to the first point of a group
 */
public void snapPoints(List<? extends Line2D.Float> rulings, float xThreshold, float yThreshold) {
    if (rulings == null || rulings.isEmpty()) {
        return;
    }

    // Collect the endpoints and remember, per line, which two points belong to it.
    List<Point2D[]> endpointsPerRuling = new ArrayList<>(rulings.size());
    List<Point2D> points = new ArrayList<>(rulings.size() * 2);
    for (Line2D.Float ruling : rulings) {
        Point2D[] endpoints = {ruling.getP1(), ruling.getP2()};
        endpointsPerRuling.add(endpoints);
        points.add(endpoints[0]);
        points.add(endpoints[1]);
    }

    snapAlongAxis(points, xThreshold, true);
    snapAlongAxis(points, yThreshold, false);

    // Finally write the moved endpoints back into the lines.
    int rulingIndex = 0;
    for (Line2D.Float ruling : rulings) {
        Point2D[] endpoints = endpointsPerRuling.get(rulingIndex++);
        ruling.setLine(endpoints[0], endpoints[1]);
    }
}

/**
 * Sorts the points along one axis, groups neighbouring points and moves each point of a group
 * onto the group's average coordinate on that axis.
 */
private void snapAlongAxis(List<Point2D> points, float threshold, boolean alongX) {
    Comparator<Point2D> byCoordinate = alongX ? Comparator.comparingDouble(Point2D::getX) : Comparator.comparingDouble(Point2D::getY);
    points.sort(byCoordinate);

    // Grouping is finished before any point is moved, so comparisons use the original coordinates.
    List<List<Point2D>> groups = new ArrayList<>();
    List<Point2D> currentGroup = null;
    for (Point2D point : points) {
        boolean joinsCurrentGroup = currentGroup != null && Math.abs(coordinateOf(point, alongX) - coordinateOf(currentGroup.get(0), alongX)) < threshold;
        if (joinsCurrentGroup) {
            currentGroup.add(point);
        } else {
            currentGroup = new ArrayList<>();
            currentGroup.add(point);
            groups.add(currentGroup);
        }
    }

    for (List<Point2D> group : groups) {
        float average = 0;
        for (Point2D point : group) {
            average += coordinateOf(point, alongX);
        }
        average /= group.size();
        for (Point2D point : group) {
            if (alongX) {
                point.setLocation(average, point.getY());
            } else {
                point.setLocation(point.getX(), average);
            }
        }
    }
}

/** Returns the x or the y coordinate of the point, depending on the axis flag. */
private double coordinateOf(Point2D point, boolean alongX) {
    return alongX ? point.getX() : point.getY();
}
/**
 * Derives vertical rulings from table cells: for every cell one ruling at x0 and one at x1, each
 * spanning from y0 to y1.
 *
 * @param cvParsedTableCells table cells, may be {@code null}
 * @return the rulings (x0 edge before x1 edge per cell); empty when no cells are given
 */
private Collection<? extends Ruling> extractVerticalRulings(List<TableCells> cvParsedTableCells) {
    List<Ruling> verticalEdges = new ArrayList<>();
    if (cvParsedTableCells == null) {
        return verticalEdges;
    }
    for (TableCells tableCell : cvParsedTableCells) {
        float x0 = tableCell.getX0();
        float x1 = tableCell.getX1();
        float y0 = tableCell.getY0();
        float y1 = tableCell.getY1();
        verticalEdges.add(createRuling(x0, x0, y0, y1));
        verticalEdges.add(createRuling(x1, x1, y0, y1));
    }
    return verticalEdges;
}
/**
 * Derives horizontal rulings from table cells: for every cell one ruling at y1 and one at y0,
 * each spanning from x0 to x1.
 *
 * @param cvParsedTableCells table cells, may be {@code null}
 * @return the rulings (y1 edge before y0 edge per cell); empty when no cells are given
 */
private Collection<? extends Ruling> extractHorizontalRulings(List<TableCells> cvParsedTableCells) {
    List<Ruling> horizontalEdges = new ArrayList<>();
    if (cvParsedTableCells == null) {
        return horizontalEdges;
    }
    for (TableCells tableCell : cvParsedTableCells) {
        float x0 = tableCell.getX0();
        float x1 = tableCell.getX1();
        float y0 = tableCell.getY0();
        float y1 = tableCell.getY1();
        horizontalEdges.add(createRuling(x0, x1, y1, y1));
        horizontalEdges.add(createRuling(x0, x1, y0, y0));
    }
    return horizontalEdges;
}
/**
 * Creates a ruling between the given coordinates, ordered so that the first point carries the
 * smaller x and the smaller y and the second point the larger ones.
 *
 * @param tableCellX0 one x coordinate
 * @param tableCellX1 the other x coordinate
 * @param tableCellY0 one y coordinate
 * @param tableCellY1 the other y coordinate
 * @return ruling from (smaller x, smaller y) to (larger x, larger y)
 */
private Ruling createRuling(float tableCellX0, float tableCellX1, float tableCellY0, float tableCellY1) {
    boolean swapX = tableCellX1 < tableCellX0;
    boolean swapY = tableCellY1 < tableCellY0;

    float startX = swapX ? tableCellX1 : tableCellX0;
    float endX = swapX ? tableCellX0 : tableCellX1;
    float startY = swapY ? tableCellY1 : tableCellY0;
    float endY = swapY ? tableCellY0 : tableCellY1;

    Point2D.Float start = new Point2D.Float(startX, startY);
    Point2D.Float end = new Point2D.Float(endX, endY);
    return new Ruling(start, end);
}
/**
 * Collapses rulings of one orientation using the default expand amount of 1 for the
 * "nearly intersects" check of colinear or parallel lines.
 *
 * @param lines rulings of one orientation; sorted in place by the delegate
 * @return the collapsed rulings
 */
private List<Ruling> collapseOrientedRulings(List<Ruling> lines) {
    final int defaultExpandAmount = 1;
    return collapseOrientedRulings(lines, defaultExpandAmount);
}
/**
 * Merges rulings of one orientation that lie on the same position and nearly intersect.
 * <p>
 * The input list is sorted in place by position and, for equal positions, by start. Each ruling
 * is then either merged into the previously kept ruling (which is extended in place via
 * {@code setStartEnd}), dropped because its length is 0, or kept as a new entry.
 *
 * @param lines        rulings of one orientation; the list is sorted and kept rulings are modified in place
 * @param expandAmount amount passed to {@code nearlyIntersects} when testing whether two rulings are close enough
 * @return the kept (and possibly extended) rulings in sort order
 */
private List<Ruling> collapseOrientedRulings(List<Ruling> lines, int expandAmount) {
ArrayList<Ruling> rv = new ArrayList<>();
// Order by position first; only when the positions are exactly equal does the start decide.
lines.sort((a, b) -> {
final float diff = a.getPosition() - b.getPosition();
return Float.compare(diff == 0 ? a.getStart() - b.getStart() : diff, 0f);
});
for (Ruling next_line : lines) {
// Only the most recently kept ruling is a merge candidate.
Ruling last = rv.isEmpty() ? null : rv.get(rv.size() - 1);
// if current line colinear with next, and are "close enough": expand current line
if (last != null && DoubleComparisons.feq(next_line.getPosition(), last.getPosition()) && last.nearlyIntersects(next_line, expandAmount)) {
final float lastStart = last.getStart();
final float lastEnd = last.getEnd();
// A ruling counts as flipped when its start is greater than its end.
final boolean lastFlipped = lastStart > lastEnd;
final boolean nextFlipped = next_line.getStart() > next_line.getEnd();
boolean differentDirections = nextFlipped != lastFlipped;
// Bring the next ruling's start/end into the same direction as the kept ruling.
float nextS = differentDirections ? next_line.getEnd() : next_line.getStart();
float nextE = differentDirections ? next_line.getStart() : next_line.getEnd();
// Extend the kept ruling so it covers both, respecting its direction.
final float newStart = lastFlipped ? Math.max(nextS, lastStart) : Math.min(nextS, lastStart);
final float newEnd = lastFlipped ? Math.min(nextE, lastEnd) : Math.max(nextE, lastEnd);
last.setStartEnd(newStart, newEnd);
assert !last.oblique();
} else if (next_line.length() == 0) {
// Zero-length rulings that cannot be merged are dropped.
continue;
} else {
rv.add(next_line);
}
}
return rv;
}
}

View File

@ -1,4 +1,4 @@
package com.knecon.fforesight.service.layoutparser.processor.services;
package com.knecon.fforesight.service.layoutparser.processor.classification.service;
import java.util.ArrayList;
import java.util.Collections;
@ -7,30 +7,27 @@ import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import org.apache.logging.log4j.util.Strings;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationFooter;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationHeader;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationSection;
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.UnclassifiedText;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationFooter;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationHeader;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationSection;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.UnclassifiedText;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@Service
@Deprecated
public class SectionsBuilderService {
public void buildSections(ClassificationDocument document) {
List<AbstractPageBlock> chunkWords = new ArrayList<>();
@ -46,6 +43,7 @@ public class SectionsBuilderService {
for (ClassificationPage page : document.getPages()) {
List<TextPageBlock> header = new ArrayList<>();
List<TextPageBlock> footer = new ArrayList<>();
List<TextPageBlock> unclassifiedText = new ArrayList<>();
for (AbstractPageBlock current : page.getTextBlocks()) {
if (current.getClassification() == null) {
@ -64,6 +62,11 @@ public class SectionsBuilderService {
continue;
}
if (current.getClassification().equals(PageBlockType.OTHER)) {
unclassifiedText.add((TextPageBlock) current);
continue;
}
if (prev != null && current.getClassification().isHeadline() && !prev.getClassification().isHeadline() || !document.isHeadlines()) {
ClassificationSection chunkBlock = buildTextBlock(chunkWords, lastHeadline);
chunkBlock.setHeadline(lastHeadline);
@ -73,8 +76,7 @@ public class SectionsBuilderService {
chunkBlockList.add(chunkBlock);
chunkWords = new ArrayList<>();
if (!chunkBlock.getTables().isEmpty()) {
previousTable = chunkBlock.getTables()
.get(chunkBlock.getTables().size() - 1);
previousTable = chunkBlock.getTables().get(chunkBlock.getTables().size() - 1);
}
}
if (current instanceof TablePageBlock table) {
@ -92,6 +94,9 @@ public class SectionsBuilderService {
if (!footer.isEmpty()) {
footers.add(new ClassificationFooter(footer));
}
if (!unclassifiedText.isEmpty()) {
unclassifiedTexts.add(new UnclassifiedText(unclassifiedText));
}
}
ClassificationSection chunkBlock = buildTextBlock(chunkWords, lastHeadline);
@ -105,21 +110,6 @@ public class SectionsBuilderService {
}
public void buildParagraphDebugSections(ClassificationDocument document) {
List<ClassificationSection> sections = new ArrayList<>();
for (var page : document.getPages()) {
page.getTextBlocks()
.forEach(block -> {
block.setPage(page.getPageNumber());
var section = buildTextBlock(List.of(block), Strings.EMPTY);
sections.add(section);
});
}
document.setSections(sections);
}
public void addImagesToSections(ClassificationDocument document) {
Map<Integer, List<ClassificationSection>> sectionMap = new HashMap<>();
@ -159,10 +149,10 @@ public class SectionsBuilderService {
}
}
for (ClassificationSection section : sectionsOnPage) {
Double xMin = null;
Double yMin = null;
Double xMax = null;
Double yMax = null;
Float xMin = null;
Float yMin = null;
Float xMax = null;
Float yMax = null;
for (AbstractPageBlock abs : section.getPageBlocks()) {
if (abs.getPage() != page.getPageNumber()) {
@ -206,17 +196,10 @@ public class SectionsBuilderService {
log.debug("Image position x: {}, y: {}", image.getPosition().getX(), image.getPosition().getY());
log.debug("Paragraph position xMin: {}, xMax: {}, yMin: {}, yMax: {}", xMin, xMax, yMin, yMax);
if (xMin != null
&& xMax != null
&& yMin != null
&& yMax != null
&& image.getPosition().getX() >= xMin
&& image.getPosition().getX() <= xMax
&& image.getPosition().getY() >= yMin
&& image.getPosition().getY() <= yMax) {
if (xMin != null && xMax != null && yMin != null && yMax != null && image.getPosition().getX() >= xMin && image.getPosition()
.getX() <= xMax && image.getPosition().getY() >= yMin && image.getPosition().getY() <= yMax) {
section.getImages().add(image);
image.setAppendedToSection(true);
break;
}
}
if (!image.isAppendedToSection()) {
@ -236,26 +219,17 @@ public class SectionsBuilderService {
List<Cell> previousTableNonHeaderRow = getRowWithNonHeaderCells(previousTable);
List<Cell> tableNonHeaderRow = getRowWithNonHeaderCells(currentTable);
// Allow merging of tables if header row is separated from first logical non-header row
if (previousTableNonHeaderRow.isEmpty()
&& previousTable.getRowCount() == 1
&& previousTable.getRows()
.get(0).size() == tableNonHeaderRow.size()) {
previousTableNonHeaderRow = previousTable.getRows()
.get(0)
.stream()
.map(cell -> {
Cell fakeCell = Cell.copy(cell);
fakeCell.setHeaderCells(Collections.singletonList(cell));
return fakeCell;
})
.collect(Collectors.toList());
if (previousTableNonHeaderRow.isEmpty() && previousTable.getRowCount() == 1 && previousTable.getRows().get(0).size() == tableNonHeaderRow.size()) {
previousTableNonHeaderRow = previousTable.getRows().get(0).stream().map(cell -> {
Cell fakeCell = new Cell(cell.getPoints()[0], cell.getPoints()[2]);
fakeCell.setHeaderCells(Collections.singletonList(cell));
return fakeCell;
}).collect(Collectors.toList());
}
if (previousTableNonHeaderRow.size() == tableNonHeaderRow.size()) {
for (int i = currentTable.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
List<Cell> row = currentTable.getRows()
.get(i);
if (row.size() == tableNonHeaderRow.size() && row.stream()
.allMatch(cell -> cell.getHeaderCells().isEmpty())) {
List<Cell> row = currentTable.getRows().get(i);
if (row.size() == tableNonHeaderRow.size() && row.stream().allMatch(cell -> cell.getHeaderCells().isEmpty())) {
for (int j = 0; j < row.size(); j++) {
row.get(j).setHeaderCells(previousTableNonHeaderRow.get(j).getHeaderCells());
}
@ -298,11 +272,7 @@ public class SectionsBuilderService {
private boolean hasInvalidHeaderInformation(TablePageBlock table) {
return table.getRows()
.stream()
.flatMap(row -> row.stream()
.filter(cell -> !cell.getHeaderCells().isEmpty()))
.findAny().isEmpty();
return table.getRows().stream().flatMap(row -> row.stream().filter(cell -> !cell.getHeaderCells().isEmpty())).findAny().isEmpty();
}
@ -310,8 +280,7 @@ public class SectionsBuilderService {
private List<Cell> getRowWithNonHeaderCells(TablePageBlock table) {
for (int i = table.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
List<Cell> row = table.getRows()
.get(i);
List<Cell> row = table.getRows().get(i);
if (row.size() == 1) {
continue;
}

View File

@ -0,0 +1,344 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.service;
import java.awt.geom.Point2D;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Rectangle;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.DoubleComparisons;
@Service
public class TableExtractionService {
/**
 * Orders points by x and, for equal x, by y. Both coordinates are compared after being passed
 * through {@code DoubleComparisons.round(value, 2)}.
 */
private static final Comparator<Point2D> X_FIRST_POINT_COMPARATOR = (first, second) -> {
    float firstX = DoubleComparisons.round(first.getX(), 2);
    float firstY = DoubleComparisons.round(first.getY(), 2);
    float secondX = DoubleComparisons.round(second.getX(), 2);
    float secondY = DoubleComparisons.round(second.getY(), 2);

    if (firstX > secondX) {
        return 1;
    }
    if (firstX < secondX) {
        return -1;
    }
    if (firstY > secondY) {
        return 1;
    }
    if (firstY < secondY) {
        return -1;
    }
    return 0;
};
/**
 * Orders points by y and, for equal y, by x. Both coordinates are compared after being passed
 * through {@code DoubleComparisons.round(value, 2)}.
 */
private static final Comparator<Point2D> POINT_COMPARATOR = (first, second) -> {
    float firstX = DoubleComparisons.round(first.getX(), 2);
    float firstY = DoubleComparisons.round(first.getY(), 2);
    float secondX = DoubleComparisons.round(second.getX(), 2);
    float secondY = DoubleComparisons.round(second.getY(), 2);

    if (firstY > secondY) {
        return 1;
    }
    if (firstY < secondY) {
        return -1;
    }
    if (firstX > secondX) {
        return 1;
    }
    if (firstX < secondX) {
        return -1;
    }
    return 0;
};
/**
 * Finds tables on a page and moves text blocks into the cells of the found tables.
 * <p>
 * Steps:
 * <ol>
 * <li>cells are built from the rulings; every text block is added to the first cell of minimum size it intersects</li>
 * <li>the cells are de-duplicated, sorted and merged into spreadsheet areas; areas without width or height are dropped</li>
 * <li>one table is created per area and inserted into the page's block list at the index of the first block
 * the table contains; a table that contains no block of the page is not inserted</li>
 * <li>the text blocks that were moved into cells are removed from the page's block list</li>
 * </ol>
 * Note: This algorithm uses the Pdf Coordinate System where {0,0} is rotated with the page rotation.
 * 0 -> LowerLeft
 * 90 -> UpperLeft
 * 180 -> UpperRight
 * 270 -> LowerRight
 * <p>
 * DirAdj (Text direction adjusted) values can not be used here.
 *
 * @param cleanRulings The lines used to build the table.
 * @param page         Page object that contains textblocks and statistics.
 */
public void extractTables(CleanRulings cleanRulings, ClassificationPage page) {
    List<Cell> cells = findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical());

    List<TextPageBlock> toBeRemoved = new ArrayList<>();
    for (AbstractPageBlock abstractPageBlock : page.getTextBlocks()) {
        // Only text blocks can be moved into cells; anything else stays on the page untouched.
        if (!(abstractPageBlock instanceof TextPageBlock textBlock)) {
            continue;
        }
        for (Cell cell : cells) {
            if (cell.hasMinimumSize() && cell.intersects(textBlock.getPdfMinX(),
                    textBlock.getPdfMinY(),
                    textBlock.getPdfMaxX() - textBlock.getPdfMinX(),
                    textBlock.getPdfMaxY() - textBlock.getPdfMinY())) {
                cell.addTextBlock(textBlock);
                toBeRemoved.add(textBlock);
                break;
            }
        }
    }

    cells = new ArrayList<>(new HashSet<>(cells));
    DoubleComparisons.sort(cells, Rectangle.ILL_DEFINED_ORDER);

    List<Rectangle> spreadsheetAreas = findSpreadsheetsFromCells(cells).stream().filter(r -> r.getWidth() > 0f && r.getHeight() > 0f).toList();

    List<TablePageBlock> tables = new ArrayList<>();
    for (Rectangle area : spreadsheetAreas) {
        List<Cell> overlappingCells = new ArrayList<>();
        for (Cell c : cells) {
            if (c.hasMinimumSize() && c.intersects(area)) {
                overlappingCells.add(c);
            }
        }
        tables.add(new TablePageBlock(overlappingCells, area, page.getRotation()));
    }

    for (TablePageBlock table : tables) {
        // Insert the table where its first contained block sits. The previous condition
        // "isText ? containsBlock(..) : contains(..) && position == -1" applied the first-match
        // guard only to non-text blocks because && binds tighter than ?:, so for text blocks the
        // index of the last contained block was used instead.
        List<AbstractPageBlock> pageBlocks = page.getTextBlocks();
        int position = -1;
        for (int index = 0; index < pageBlocks.size(); index++) {
            AbstractPageBlock pageBlock = pageBlocks.get(index);
            boolean insideTable = pageBlock instanceof TextPageBlock textPageBlock ? table.containsBlock(textPageBlock) : table.contains(pageBlock);
            if (insideTable) {
                position = index;
                break;
            }
        }
        if (position != -1) {
            pageBlocks.add(position, table);
        }
    }

    page.getTextBlocks().removeAll(toBeRemoved);
}
/**
 * Builds table cells from the intersection points of horizontal and vertical rulings.
 * <p>
 * Every intersection point is tried as the top-left corner of a cell. For such a corner the method looks for a
 * point with the same x and a greater y that shares the vertical ruling, and a point with the same y and a greater x
 * that shares the horizontal ruling. If the resulting fourth corner is an intersection point as well and lies on
 * the matching rulings, a cell is created; at most one cell is created per top-left corner.
 * <p>
 * Side effect: horizontal rulings whose x2 is smaller than x1 get their x1/x2 swapped in place.
 *
 * @param horizontalRulingLines horizontal rulings; may be modified (see above)
 * @param verticalRulingLines   vertical rulings
 * @return the cells found, in the order their top-left corners are visited
 */
public List<Cell> findCells(List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
// Normalise horizontal rulings so that x1 <= x2.
for (Ruling r : horizontalRulingLines) {
if (r.getX2() < r.getX1()) {
double a = r.getX2();
r.x2 = (float) r.getX1();
r.x1 = (float) a;
}
}
List<Cell> cellsFound = new ArrayList<>();
// Maps each intersection point to the pair of rulings crossing there: [0] horizontal, [1] vertical.
Map<Point2D, Ruling[]> intersectionPoints = Ruling.findIntersections(horizontalRulingLines, verticalRulingLines);
List<Point2D> intersectionPointsList = new ArrayList<>(intersectionPoints.keySet());
// Sorted by y first, then x, so only points from index i onwards need to be considered for a corner at i.
intersectionPointsList.sort(POINT_COMPARATOR);
for (int i = 0; i < intersectionPointsList.size(); i++) {
Point2D topLeft = intersectionPointsList.get(i);
Ruling[] hv = intersectionPoints.get(topLeft);
// Intersection points with the same x as topLeft and a greater y.
List<Point2D> xPoints = new ArrayList<>();
// Intersection points with the same y as topLeft and a greater x.
List<Point2D> yPoints = new ArrayList<>();
for (Point2D p : intersectionPointsList.subList(i, intersectionPointsList.size())) {
if (p.getX() == topLeft.getX() && p.getY() > topLeft.getY()) {
xPoints.add(p);
}
if (p.getY() == topLeft.getY() && p.getX() > topLeft.getX()) {
yPoints.add(p);
}
}
outer:
for (Point2D xPoint : xPoints) {
// is there a vertical edge b/w topLeft and xPoint?
if (!hv[1].equals(intersectionPoints.get(xPoint)[1])) {
continue;
}
for (Point2D yPoint : yPoints) {
// is there an horizontal edge b/w topLeft and yPoint ?
if (!hv[0].equals(intersectionPoints.get(yPoint)[0])) {
continue;
}
// The fourth corner must be an intersection that shares xPoint's horizontal and yPoint's vertical ruling.
Point2D btmRight = new Point2D.Float((float) yPoint.getX(), (float) xPoint.getY());
if (intersectionPoints.containsKey(btmRight) && intersectionPoints.get(btmRight)[0].equals(intersectionPoints.get(xPoint)[0]) && intersectionPoints.get(btmRight)[1].equals(
intersectionPoints.get(yPoint)[1])) {
cellsFound.add(new Cell(topLeft, btmRight));
// Only the first cell found for this top-left corner is kept.
break outer;
}
}
}
}
// TODO create cells for vertical ruling lines with aligned endpoints at the top/bottom of a grid
// that aren't connected with an horizontal ruler?
// see: https://github.com/jazzido/tabula-extractor/issues/78#issuecomment-41481207
return cellsFound;
}
/**
 * Merges the given cells into polygons and returns the grid-aligned bounding rectangle of each
 * polygon.
 *
 * <p>Steps, as implemented below:
 * <ol>
 *   <li>collect the corner points of all cells, dropping every point that occurs an even number
 *       of times;</li>
 *   <li>pair up the remaining points along rows and along columns to form horizontal and vertical
 *       edges;</li>
 *   <li>walk the edges, alternating between vertical and horizontal, until each polygon closes;</li>
 *   <li>compute the min/max extent of every polygon.</li>
 * </ol>
 *
 * <p>NOTE(review): the pairing loops read {@code get(i + 1)} while only checking
 * {@code i < size}, so they rely on every row/column contributing an even number of points —
 * confirm that callers guarantee this.
 *
 * @param cells the cells to merge
 * @return one bounding rectangle per polygon that was traced; empty if there are no points
 */
private List<Rectangle> findSpreadsheetsFromCells(List<? extends Rectangle> cells) {
    // via: http://stackoverflow.com/questions/13746284/merging-multiple-adjacent-rectangles-into-one-polygon
    List<Rectangle> rectangles = new ArrayList<>();
    Set<Point2D> pointSet = new HashSet<>();
    Map<Point2D, Point2D> edgesH = new HashMap<>();
    Map<Point2D, Point2D> edgesV = new HashMap<>();

    for (Rectangle cell : cells) {
        for (Point2D pt : cell.getPoints()) {
            // Toggle membership: a vertex that is already present is shared, so remove it.
            if (!pointSet.add(pt)) {
                pointSet.remove(pt);
            }
        }
    }

    // X first sort
    List<Point2D> pointsSortX = new ArrayList<>(pointSet);
    pointsSortX.sort(X_FIRST_POINT_COMPARATOR);
    // Y first sort
    List<Point2D> pointsSortY = new ArrayList<>(pointSet);
    pointsSortY.sort(POINT_COMPARATOR);

    // Horizontal edges: consecutive pairs of points that share (approximately) the same Y.
    int i = 0;
    while (i < pointSet.size()) {
        float currY = (float) pointsSortY.get(i).getY();
        while (i < pointSet.size() && DoubleComparisons.feq(pointsSortY.get(i).getY(), currY)) {
            edgesH.put(pointsSortY.get(i), pointsSortY.get(i + 1));
            edgesH.put(pointsSortY.get(i + 1), pointsSortY.get(i));
            i += 2;
        }
    }

    // Vertical edges: consecutive pairs of points that share (approximately) the same X.
    i = 0;
    while (i < pointSet.size()) {
        float currX = (float) pointsSortX.get(i).getX();
        while (i < pointSet.size() && DoubleComparisons.feq(pointsSortX.get(i).getX(), currX)) {
            edgesV.put(pointsSortX.get(i), pointsSortX.get(i + 1));
            edgesV.put(pointsSortX.get(i + 1), pointsSortX.get(i));
            i += 2;
        }
    }

    // Get all the polygons
    List<List<PolygonVertex>> polygons = new ArrayList<>();
    while (!edgesH.isEmpty()) {
        List<PolygonVertex> polygon = new ArrayList<>();
        Point2D first = edgesH.keySet().iterator().next();
        polygon.add(new PolygonVertex(first, Direction.HORIZONTAL));
        edgesH.remove(first);

        while (true) {
            PolygonVertex curr = polygon.get(polygon.size() - 1);
            Point2D nextVertex;
            PolygonVertex lastAddedVertex;
            // Alternate: a HORIZONTAL vertex is left via a vertical edge and vice versa.
            if (curr.direction == Direction.HORIZONTAL) {
                nextVertex = edgesV.get(curr.point);
                edgesV.remove(curr.point);
                lastAddedVertex = new PolygonVertex(nextVertex, Direction.VERTICAL);
            } else {
                nextVertex = edgesH.get(curr.point);
                edgesH.remove(curr.point);
                lastAddedVertex = new PolygonVertex(nextVertex, Direction.HORIZONTAL);
            }
            polygon.add(lastAddedVertex);

            if (lastAddedVertex.equals(polygon.get(0))) {
                // closed polygon: the start vertex was added a second time, drop the duplicate
                polygon.remove(polygon.size() - 1);
                break;
            }
        }

        // Edges starting at any vertex of the finished polygon must not seed another polygon.
        for (PolygonVertex vertex : polygon) {
            edgesH.remove(vertex.point);
            edgesV.remove(vertex.point);
        }
        polygons.add(polygon);
    }

    // calculate grid-aligned minimum area rectangles for each found polygon
    for (List<PolygonVertex> poly : polygons) {
        float top = Float.MAX_VALUE;
        float left = Float.MAX_VALUE;
        // Float.MIN_VALUE is the smallest POSITIVE float, so it is not a valid seed for a maximum:
        // with it, polygons lying entirely at coordinates <= 0 would get a wrong bottom/right.
        float bottom = -Float.MAX_VALUE;
        float right = -Float.MAX_VALUE;
        for (PolygonVertex pt : poly) {
            top = (float) Math.min(top, pt.point.getY());
            left = (float) Math.min(left, pt.point.getX());
            bottom = (float) Math.max(bottom, pt.point.getY());
            right = (float) Math.max(right, pt.point.getX());
        }
        // Arguments: top, left, width, height.
        rectangles.add(new Rectangle(top, left, right - left, bottom - top));
    }
    return rectangles;
}
/**
 * Tag carried by a {@link PolygonVertex} while polygons are traced in
 * {@code findSpreadsheetsFromCells}. The tag of the most recently added vertex decides which edge
 * map is consulted for the next vertex.
 */
private enum Direction {
// From a vertex tagged HORIZONTAL the trace continues along a vertical edge.
HORIZONTAL,
// From a vertex tagged VERTICAL the trace continues along a horizontal edge.
VERTICAL
}
/**
 * A corner of a traced polygon: a point plus the {@link Direction} tag it was reached with.
 *
 * <p>Equality and hash code are based on the point alone; the direction is ignored. This lets the
 * polygon trace recognise its start vertex again even when it is reached with a different tag.
 */
static class PolygonVertex {

    Point2D point;
    Direction direction;

    PolygonVertex(Point2D point, Direction direction) {
        this.point = point;
        this.direction = direction;
    }

    /** Two vertices are equal when their points are equal, whatever their direction. */
    @Override
    public boolean equals(Object other) {
        return this == other
                || (other instanceof PolygonVertex && point.equals(((PolygonVertex) other).point));
    }

    /** Matches {@link #equals(Object)}: derived from the point only. */
    @Override
    public int hashCode() {
        return point.hashCode();
    }

    @Override
    public String toString() {
        return getClass().getName() + "[point=" + point.toString() + ",direction=" + direction.toString() + "]";
    }
}
}

View File

@ -9,7 +9,7 @@
* This program is free software under the LGPL (>=v2.1)
* Read the file LICENSE.txt coming with the sources for details.
*/
package com.knecon.fforesight.service.layoutparser.processor.utils;
package com.knecon.fforesight.service.layoutparser.processor.classification.utils;
import java.awt.geom.Line2D;
import java.awt.geom.Rectangle2D;

View File

@ -1,4 +1,4 @@
package com.knecon.fforesight.service.layoutparser.processor.utils;
package com.knecon.fforesight.service.layoutparser.processor.classification.utils;
import java.math.BigDecimal;
import java.util.Comparator;

View File

@ -1,7 +1,7 @@
package com.knecon.fforesight.service.layoutparser.processor.utils;
package com.knecon.fforesight.service.layoutparser.processor.classification.utils;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
import lombok.experimental.UtilityClass;
@ -19,10 +19,9 @@ public final class PositionUtils {
double threshold = textBlock.getMostPopularWordHeight() * 3;
if (textBlock.getPdfMinX() + threshold > btf.getTopLeft().getX()
&& textBlock.getPdfMaxX() - threshold < btf.getTopLeft().getX() + btf.getWidth()
&& textBlock.getPdfMinY() + threshold > btf.getTopLeft().getY()
&& textBlock.getPdfMaxY() - threshold < btf.getTopLeft().getY() + btf.getHeight()) {
if (textBlock.getPdfMinX() + threshold > btf.getTopLeft().getX() && textBlock.getPdfMaxX() - threshold < btf.getTopLeft()
.getX() + btf.getWidth() && textBlock.getPdfMinY() + threshold > btf.getTopLeft().getY() && textBlock.getPdfMaxY() - threshold < btf.getTopLeft()
.getY() + btf.getHeight()) {
return true;
} else {
return false;
@ -108,15 +107,15 @@ public final class PositionUtils {
}
public double getHeightDifferenceBetweenChunkWordAndDocumentWord(TextPageBlock textBlock, Double documentMostPopularWordHeight) {
public float getHeightDifferenceBetweenChunkWordAndDocumentWord(TextPageBlock textBlock, Float documentMostPopularWordHeight) {
return textBlock.getMostPopularWordHeight() - documentMostPopularWordHeight;
}
public double getApproxLineCount(TextPageBlock textBlock) {
public Float getApproxLineCount(TextPageBlock textBlock) {
return textBlock.getBBoxDirAdj().getHeight() / textBlock.getMostPopularWordHeight();
return textBlock.getHeight() / textBlock.getMostPopularWordHeight();
}
}

View File

@ -1,4 +1,4 @@
package com.knecon.fforesight.service.layoutparser.processor.utils;
package com.knecon.fforesight.service.layoutparser.processor.classification.utils;
import java.util.ArrayDeque;
import java.util.Comparator;

View File

@ -1,9 +1,9 @@
package com.knecon.fforesight.service.layoutparser.processor.utils;
package com.knecon.fforesight.service.layoutparser.processor.classification.utils;
import java.awt.geom.Line2D;
import java.awt.geom.Point2D;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Ruling;
import lombok.experimental.UtilityClass;

Some files were not shown because too many files have changed in this diff Show More