RED-6009: Document Tree Structure

* squashed commits
This commit is contained in:
Kilian Schuettler 2023-06-06 11:19:34 +02:00
parent a6a6fd8180
commit 1f9e151092
212 changed files with 43279 additions and 7761 deletions

View File

@ -23,6 +23,7 @@
<properties>
<pdfbox.version>2.0.24</pdfbox.version>
<lombok.version>1.18.26</lombok.version>
</properties>
@ -88,5 +89,26 @@
</plugin>
</plugins>
</pluginManagement>
<plugins>
<plugin>
<groupId>org.projectlombok</groupId>
<artifactId>lombok-maven-plugin</artifactId>
<version>1.18.20.0</version>
<executions>
<execution>
<id>delombok</id>
<phase>generate-sources</phase>
<goals>
<goal>delombok</goal>
</goals>
<configuration>
<addOutputDirectory>false</addOutputDirectory>
<sourceDirectory>src/main/java</sourceDirectory>
<outputDirectory>${delomboked.sources}</outputDirectory>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>

View File

@ -39,7 +39,6 @@
</exclusion>
</exclusions>
</dependency>
</dependencies>
<build>

View File

@ -12,7 +12,7 @@
<artifactId>redaction-service-server-v1</artifactId>
<properties>
<drools.version>7.73.0.Final</drools.version>
<drools.version>8.37.0.Final</drools.version>
<kie.version>7.73.0.Final</kie.version>
<locationtech.version>1.19.0</locationtech.version>
<javaassist.version>3.29.2-GA</javaassist.version>
@ -64,7 +64,12 @@
</dependency>
<dependency>
<groupId>org.drools</groupId>
<artifactId>drools-core</artifactId>
<artifactId>drools-engine</artifactId>
<version>${drools.version}</version>
</dependency>
<dependency>
<groupId>org.drools</groupId>
<artifactId>drools-mvel</artifactId>
<version>${drools.version}</version>
</dependency>
<dependency>
@ -198,5 +203,4 @@
</plugin>
</plugins>
</build>
</project>

View File

@ -1,26 +0,0 @@
package com.iqser.red.service.redaction.v1.server.classification.model;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
import lombok.AllArgsConstructor;
import lombok.Data;
import java.util.List;
/**
 * Lombok data holder named {@code Footer} that wraps a list of {@link TextBlock}s.
 */
@Data
@AllArgsConstructor
public class Footer {

    private List<TextBlock> textBlocks;

    /**
     * Builds a new {@link SearchableText} holding the sequences of every text block, in list order.
     * Annotated with {@link JsonIgnore}, so it is not serialized as a JSON property.
     *
     * @return a freshly created SearchableText aggregated from all text blocks
     */
    @JsonIgnore
    public SearchableText getSearchableText() {
        SearchableText aggregated = new SearchableText();
        for (TextBlock textBlock : textBlocks) {
            aggregated.addAll(textBlock.getSequences());
        }
        return aggregated;
    }
}

View File

@ -1,26 +0,0 @@
package com.iqser.red.service.redaction.v1.server.classification.model;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
import lombok.AllArgsConstructor;
import lombok.Data;
import java.util.List;
/**
 * Lombok data holder named {@code Header} that wraps a list of {@link TextBlock}s.
 */
@Data
@AllArgsConstructor
public class Header {

    private List<TextBlock> textBlocks;

    /**
     * Collects the sequences of all text blocks, in list order, into a new {@link SearchableText}.
     * Annotated with {@link JsonIgnore}, so it is not serialized as a JSON property.
     *
     * @return a freshly created SearchableText aggregated from all text blocks
     */
    @JsonIgnore
    public SearchableText getSearchableText() {
        SearchableText combined = new SearchableText();
        for (TextBlock current : textBlocks) {
            combined.addAll(current.getSequences());
        }
        return combined;
    }
}

View File

@ -1,8 +0,0 @@
package com.iqser.red.service.redaction.v1.server.classification.model;
/**
 * Enumerates the three orientation values {@code NONE}, {@code LEFT} and {@code RIGHT}.
 * NOTE(review): presumably describes a rotation/direction of page content - confirm against usages.
 */
public enum Orientation {
NONE,
LEFT,
RIGHT
}

View File

@ -1,65 +0,0 @@
package com.iqser.red.service.redaction.v1.server.classification.model;
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
import lombok.Data;
import lombok.NoArgsConstructor;
import java.util.ArrayList;
import java.util.List;
/**
 * Groups page blocks (text blocks and tables), images and a headline under the name {@code Paragraph}.
 */
@Data
@NoArgsConstructor
public class Paragraph implements Comparable {

    private List<AbstractTextContainer> pageBlocks = new ArrayList<>();
    private List<PdfImage> images = new ArrayList<>();
    private String headline;

    /**
     * Aggregates the sequences of every {@link TextBlock} among the page blocks, in list order.
     * Page blocks of any other type are ignored.
     *
     * @return a new SearchableText built from the text blocks
     */
    public SearchableText getSearchableText() {
        SearchableText aggregated = new SearchableText();
        for (AbstractTextContainer pageBlock : pageBlocks) {
            if (pageBlock instanceof TextBlock textBlock) {
                aggregated.addAll(textBlock.getSequences());
            }
        }
        return aggregated;
    }

    /**
     * Returns the page blocks that are {@link Table}s, in list order.
     *
     * @return a new mutable list of the contained tables; empty if there are none
     */
    public List<Table> getTables() {
        List<Table> result = new ArrayList<>();
        for (AbstractTextContainer pageBlock : pageBlocks) {
            if (pageBlock instanceof Table table) {
                result.add(table);
            }
        }
        return result;
    }

    /**
     * Returns the page blocks that are {@link TextBlock}s, in list order.
     *
     * @return a new mutable list of the contained text blocks; empty if there are none
     */
    public List<TextBlock> getTextBlocks() {
        List<TextBlock> result = new ArrayList<>();
        for (AbstractTextContainer pageBlock : pageBlocks) {
            if (pageBlock instanceof TextBlock textBlock) {
                result.add(textBlock);
            }
        }
        return result;
    }

    /**
     * Always returns {@code 0}, i.e. every object compares as equal to this instance.
     */
    @Override
    public int compareTo(Object o) {
        return 0;
    }
}

View File

@ -1,64 +0,0 @@
package com.iqser.red.service.redaction.v1.server.classification.model;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.section.SectionArea;
import com.iqser.red.service.redaction.v1.server.redaction.model.CellValue;
import com.iqser.red.service.redaction.v1.server.redaction.model.Image;
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * Lombok data/builder class carrying the text of one section together with its areas, images,
 * text blocks, tabular data and cell start offsets. Collection fields default to empty mutable collections.
 */
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class SectionText {

    private int sectionNumber;
    private String text;
    private boolean isTable;
    private String headline;

    @Builder.Default
    private List<SectionArea> sectionAreas = new ArrayList<>();

    @Builder.Default
    private Set<Image> images = new HashSet<>();

    @Builder.Default
    private List<TextBlock> textBlocks = new ArrayList<>();

    @Builder.Default
    private Map<String, CellValue> tabularData = new HashMap<>();

    @Builder.Default
    private List<Integer> cellStarts = new ArrayList<>();

    /**
     * Stores the given map after removing its {@code null} key entry.
     * The removal is performed on the passed-in map itself, so the caller's map is modified.
     *
     * @param tabularData the tabular data to store; must be non-null and must support {@code remove(null)}
     */
    public void setTabularData(Map<String, CellValue> tabularData) {
        tabularData.remove(null);
        this.tabularData = tabularData;
    }

    /**
     * Aggregates the sequences of all non-null text blocks, in list order.
     * Annotated with {@link JsonIgnore}, so it is not serialized as a JSON property.
     *
     * @return a new SearchableText built from the text blocks
     */
    @JsonIgnore
    public SearchableText getSearchableText() {
        SearchableText aggregated = new SearchableText();
        for (TextBlock textBlock : textBlocks) {
            if (textBlock == null) {
                continue;
            }
            aggregated.addAll(textBlock.getSequences());
        }
        return aggregated;
    }
}

View File

@ -1,19 +0,0 @@
package com.iqser.red.service.redaction.v1.server.classification.model;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import java.util.ArrayList;
import java.util.List;
/**
 * Data holder pairing a page count with a list of {@link SectionText} entries.
 * Lombok generates the accessors as well as a no-args and an all-args constructor.
 */
@Data
@NoArgsConstructor
@AllArgsConstructor
public class Text {
// NOTE(review): presumably the page count of the source document - confirm against callers.
private int numberOfPages;
// Initialized to an empty mutable list; the all-args constructor replaces it with the passed list.
private List<SectionText> sectionTexts = new ArrayList<>();
}

View File

@ -1,26 +0,0 @@
package com.iqser.red.service.redaction.v1.server.classification.model;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
import lombok.AllArgsConstructor;
import lombok.Data;
import java.util.List;
/**
 * Lombok data holder named {@code UnclassifiedText} that wraps a list of {@link TextBlock}s.
 */
@Data
@AllArgsConstructor
public class UnclassifiedText {

    private List<TextBlock> textBlocks;

    /**
     * Gathers the sequences of all text blocks, in list order, into a new {@link SearchableText}.
     * Annotated with {@link JsonIgnore}, so it is not serialized as a JSON property.
     *
     * @return a freshly created SearchableText aggregated from all text blocks
     */
    @JsonIgnore
    public SearchableText getSearchableText() {
        SearchableText result = new SearchableText();
        for (TextBlock block : textBlocks) {
            result.addAll(block.getSequences());
        }
        return result;
    }
}

View File

@ -10,7 +10,7 @@ import lombok.NoArgsConstructor;
@Builder
@AllArgsConstructor
@NoArgsConstructor
public class EntityRecogintionEntity {
public class EntityRecognitionEntity {
private String value;
private int startOffset;

View File

@ -4,7 +4,6 @@ import java.util.HashMap;
import java.util.List;
import java.util.Map;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
@ -12,8 +11,8 @@ import lombok.NoArgsConstructor;
@Data
@NoArgsConstructor
@AllArgsConstructor
public class NerEntities {
public class NerEntitiesModel {
private Map<Integer, List<EntityRecogintionEntity>> data = new HashMap<>();
private Map<Integer, List<EntityRecognitionEntity>> data = new HashMap<>();
}

View File

@ -1,13 +1,13 @@
package com.iqser.red.service.redaction.v1.server.controller;
import org.springframework.web.bind.annotation.RestController;
import com.iqser.red.service.redaction.v1.model.RuleBuilderModel;
import com.iqser.red.service.redaction.v1.resources.RuleBuilderResource;
import com.iqser.red.service.redaction.v1.server.redaction.rulebuilder.RuleBuilderModelService;
import com.iqser.red.service.redaction.v1.server.redaction.service.RuleBuilderModelService;
import lombok.RequiredArgsConstructor;
import org.springframework.web.bind.annotation.RestController;
@RestController
@RequiredArgsConstructor
public class RuleBuilderController implements RuleBuilderResource {

View File

@ -1,5 +1,6 @@
package com.iqser.red.service.redaction.v1.server.segmentation;
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
@ -10,11 +11,10 @@ import org.springframework.stereotype.Service;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.iqser.red.service.persistence.service.v1.api.shared.model.dossiertemplate.dossier.file.FileType;
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
import com.iqser.red.service.redaction.v1.server.redaction.model.ImageType;
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
import com.iqser.red.service.redaction.v1.server.redaction.model.RedRectangle2D;
import com.iqser.red.service.redaction.v1.server.redaction.model.image.ImageServiceResponse;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.image.ImageServiceResponse;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationPage;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.image.ClassifiedImage;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.ImageType;
import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService;
import lombok.RequiredArgsConstructor;
@ -22,26 +22,26 @@ import lombok.SneakyThrows;
@Service
@RequiredArgsConstructor
public class ImageService {
public class ImageServiceResponseAdapter {
private final ObjectMapper objectMapper;
private final RedactionStorageService redactionStorageService;
@SneakyThrows
public Map<Integer, List<PdfImage>> convertImages(String dossierId, String fileId) {
public Map<Integer, List<ClassifiedImage>> convertImages(String dossierId, String fileId) {
var imageClassificationStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(dossierId, fileId, FileType.IMAGE_INFO));
ImageServiceResponse imageServiceResponse = objectMapper.readValue(imageClassificationStream, ImageServiceResponse.class);
Map<Integer, List<PdfImage>> images = new HashMap<>();
Map<Integer, List<ClassifiedImage>> images = new HashMap<>();
imageServiceResponse.getData().forEach(imageMetadata -> {
var classification = imageMetadata.getFilters().isAllPassed() ? ImageType.valueOf(imageMetadata.getClassification()
.getLabel()
.toUpperCase(Locale.ROOT)) : ImageType.OTHER;
images.computeIfAbsent(imageMetadata.getPosition().getPageNumber(), x -> new ArrayList<>())
.add(new PdfImage(new RedRectangle2D(imageMetadata.getPosition().getX1(),
.add(new ClassifiedImage(new Rectangle2D.Double(imageMetadata.getPosition().getX1(),
imageMetadata.getPosition().getY1(),
imageMetadata.getGeometry().getWidth(),
imageMetadata.getGeometry().getHeight()), classification, imageMetadata.isAlpha(), imageMetadata.getPosition().getPageNumber()));
@ -53,7 +53,7 @@ public class ImageService {
.getLabel()
.toUpperCase(Locale.ROOT)) : ImageType.OTHER;
images.computeIfAbsent(imageMetadata.getPosition().getPageNumber(), x -> new ArrayList<>())
.add(new PdfImage(new RedRectangle2D(imageMetadata.getPosition().getX1(),
.add(new ClassifiedImage(new Rectangle2D.Double(imageMetadata.getPosition().getX1(),
imageMetadata.getPosition().getY1(),
imageMetadata.getGeometry().getWidth(),
imageMetadata.getGeometry().getHeight()), classification, imageMetadata.isAlpha(), imageMetadata.getPosition().getPageNumber()));
@ -63,7 +63,7 @@ public class ImageService {
}
public void findOcr(Page page) {
public void findOcr(ClassificationPage page) {
page.getImages().forEach(image -> {
if (image.getImageType().equals(ImageType.OTHER)) {

View File

@ -0,0 +1,164 @@
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter;
import static java.lang.String.format;
import static java.util.stream.Collectors.groupingBy;
import java.awt.geom.Rectangle2D;
import java.util.Collection;
import java.util.Comparator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.springframework.stereotype.Service;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.RedactionLog;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.RedactionLogEntry;
import com.iqser.red.service.redaction.v1.server.exception.NotFoundException;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.entity.EntityType;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.entity.RedactionEntity;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.entity.RedactionPosition;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Page;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.SemanticNode;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.services.EntityCreationService;
import com.iqser.red.service.redaction.v1.server.redaction.model.dictionary.SearchImplementation;
import lombok.RequiredArgsConstructor;
/**
 * Converts the entries of a {@link RedactionLog} into {@link RedactionEntity} instances attached to a
 * {@link SemanticNode}, by searching the node's text for the logged values and matching the found
 * occurrences to the logged rectangles.
 */
@Service
@RequiredArgsConstructor
public class RedactionLogEntryAdapter {

    /** Largest summed coordinate distance (see {@link #calculateDistance}) still accepted as a position match. */
    private static final double MATCH_THRESHOLD = 1;

    private final EntityCreationService entityCreationService;


    /**
     * Creates one entity per redaction log entry on the given node.
     * <p>
     * All entry values are searched in the node's text, a temporary entity is created for every hit, and for
     * each log entry the temporary entity with matching pages and the smallest rectangle distance is used as
     * the template for the final entity. All temporary entities are removed from the graph afterwards.
     *
     * @param redactionLog the redaction log whose entries are converted
     * @param node         the node in which the entries are located
     * @return a stream of the created entities, in the order of the log entries
     * @throws IllegalArgumentException if the log references a page the node is not on
     * @throws NotFoundException        if no occurrence close enough to a log entry's position is found
     */
    public Stream<RedactionEntity> toRedactionEntity(RedactionLog redactionLog, SemanticNode node) {

        List<Integer> pageNumbers = redactionLog.getRedactionLogEntry().stream().flatMap(entry -> entry.getPositions().stream().map(Rectangle::getPage)).distinct().toList();
        if (!pageNumbers.stream().allMatch(node::isOnPage)) {
            throw new IllegalArgumentException(format("SemanticNode %s does not contain these pages %s present in the redaction log",
                    node,
                    pageNumbers.stream().filter(pageNumber -> !node.isOnPage(pageNumber)).toList()));
        }

        // Lower-case with Locale.ROOT so these keys agree with the grouping keys and the lookup below,
        // independent of the JVM default locale.
        Set<String> entryValues = redactionLog.getRedactionLogEntry()
                .stream()
                .map(RedactionLogEntry::getValue)
                .map(value -> value.toLowerCase(Locale.ROOT))
                .collect(Collectors.toSet());
        // NOTE(review): the boolean flag presumably enables case-insensitive search - confirm in SearchImplementation.
        SearchImplementation searchImplementation = new SearchImplementation(entryValues, true);
        Map<String, List<RedactionEntity>> tempEntitiesByValue = findAllPossibleEntitiesAndGroupByValueIgnoringCase(node, searchImplementation);

        assert allValuesFound(tempEntitiesByValue, entryValues);

        List<RedactionEntity> entities = redactionLog.getRedactionLogEntry()
                .stream()
                .map(entry -> findClosestRedactionEntity(entry, tempEntitiesByValue.get(entry.getValue().toLowerCase(Locale.ROOT)), node))
                .toList();

        // The temporary candidates were only needed for position matching.
        tempEntitiesByValue.values().stream().flatMap(Collection::stream).forEach(RedactionEntity::removeFromGraph);
        return entities.stream();
    }


    /** Checks that the set of found (lower-cased) values equals the set of requested values. */
    private static boolean allValuesFound(Map<String, List<RedactionEntity>> entitiesByValue, Set<String> entryValues) {

        return entitiesByValue.keySet().equals(entryValues);
    }


    /** Creates a temporary entity for every search hit in the node's text and groups them by lower-cased value. */
    private Map<String, List<RedactionEntity>> findAllPossibleEntitiesAndGroupByValueIgnoringCase(SemanticNode node, SearchImplementation searchImplementation) {

        return searchImplementation.getBoundaries(node.getTextBlock(), node.getBoundary())
                .stream()
                .map(boundary -> entityCreationService.byBoundary(boundary, "temp", EntityType.ENTITY, node))
                .collect(groupingBy(entity -> entity.getValue().toLowerCase(Locale.ROOT)));
    }


    /**
     * Picks, among the candidates with the same value, the one on the same pages with the smallest distance to
     * the log entry's rectangles and creates the final entity from it.
     *
     * @throws NotFoundException if there is no candidate at all, none on matching pages, or the closest one
     *                           is further away than {@link #MATCH_THRESHOLD}
     */
    private RedactionEntity findClosestRedactionEntity(RedactionLogEntry redactionLogEntry, List<RedactionEntity> entitiesWithSameValue, SemanticNode node) {

        // The map lookup yields null when the value was not found in the text; report that like any other
        // unmatched entry instead of failing with a NullPointerException when assertions are disabled.
        if (entitiesWithSameValue == null) {
            throw new NotFoundException(format("No entity with similar position found for %s", redactionLogEntry));
        }

        RedactionEntity closestEntity = entitiesWithSameValue.stream()
                .filter(entity -> pagesMatch(entity, redactionLogEntry))
                .min(Comparator.comparingDouble(entity -> calculateMinDistance(redactionLogEntry, entity)))
                .orElseThrow(() -> new NotFoundException(format("No entity with similar position found for %s", redactionLogEntry)));

        double distance = calculateMinDistance(redactionLogEntry, closestEntity);
        if (distance > MATCH_THRESHOLD) {
            throw new NotFoundException(format("Distance to closest found entity is %.2f for \n%s \n%s",
                    distance,
                    redactionLogEntry.getPositions(),
                    closestEntity.getRedactionPositionsPerPage()));
        }

        return createCorrectEntity(redactionLogEntry, node, closestEntity);
    }


    /** Creates the final entity on the closest candidate's boundary and copies the log entry's attributes onto it. */
    private RedactionEntity createCorrectEntity(RedactionLogEntry redactionLogEntry, SemanticNode node, RedactionEntity closestEntity) {

        RedactionEntity correctEntity = entityCreationService.byBoundary(closestEntity.getBoundary(),
                redactionLogEntry.getType(),
                redactionLogEntry.isRecommendation() ? EntityType.RECOMMENDATION : EntityType.ENTITY,
                node);
        correctEntity.setLegalBasis(redactionLogEntry.getLegalBasis());
        correctEntity.setRedactionReason(redactionLogEntry.getReason());
        correctEntity.addMatchedRule(redactionLogEntry.getMatchedRule());
        correctEntity.setRedaction(redactionLogEntry.isRedacted());
        correctEntity.setDictionaryEntry(redactionLogEntry.isDictionaryEntry());
        correctEntity.setDossierDictionaryEntry(redactionLogEntry.isDossierDictionaryEntry());
        return correctEntity;
    }


    /** Returns whether the entity and the log entry cover exactly the same set of page numbers. */
    private static boolean pagesMatch(RedactionEntity entity, RedactionLogEntry redactionLogEntry) {

        Set<Integer> entityPageNumbers = entity.getRedactionPositionsPerPage().stream().map(RedactionPosition::getPage).map(Page::getNumber).collect(Collectors.toSet());
        Set<Integer> redactionLogEntryPageNumbers = redactionLogEntry.getPositions().stream().map(Rectangle::getPage).collect(Collectors.toSet());
        return entityPageNumbers.equals(redactionLogEntryPageNumbers);
    }


    /**
     * Sums, over all rectangles of the log entry, the distance to the closest entity rectangle on the same page.
     * Returns {@link Double#MAX_VALUE} when the number of rectangles differs between log entry and entity.
     */
    private double calculateMinDistance(RedactionLogEntry redactionLogEntry, RedactionEntity entity) {

        if (redactionLogEntry.getPositions().size() != countRectangles(entity)) {
            return Double.MAX_VALUE;
        }
        return redactionLogEntry.getPositions().stream().mapToDouble(redactionLogEntryRectangle -> calculateMinDistancePerRectangle(entity, redactionLogEntryRectangle)).sum();
    }


    /** Counts the per-line rectangles of the entity across all of its pages. */
    private static long countRectangles(RedactionEntity entity) {

        return entity.getRedactionPositionsPerPage().stream().mapToLong(redactionPosition -> redactionPosition.getRectanglePerLine().size()).sum();
    }


    /**
     * Returns the smallest distance between the given log rectangle and any entity rectangle on the same page,
     * or {@link Double#MAX_VALUE} if the entity has no rectangle on that page.
     */
    private double calculateMinDistancePerRectangle(RedactionEntity entity, Rectangle redactionLogEntryRectangle) {

        return entity.getRedactionPositionsPerPage()
                .stream()
                .filter(redactionPosition -> redactionPosition.getPage().getNumber() == redactionLogEntryRectangle.getPage())
                .map(RedactionPosition::getRectanglePerLine)
                .flatMap(Collection::stream)
                .mapToDouble(rectangle -> calculateDistance(rectangle, toRectangle2D(redactionLogEntryRectangle)))
                .min()
                .orElse(Double.MAX_VALUE);
    }


    /** Sums the absolute differences of the min/max x and y coordinates of both rectangles. */
    private double calculateDistance(Rectangle2D rectangle, Rectangle2D rectangle2D) {

        return Math.abs(rectangle.getMinX() - rectangle2D.getMinX()) //
                + Math.abs(rectangle.getMinY() - rectangle2D.getMinY()) //
                + Math.abs(rectangle.getMaxX() - rectangle2D.getMaxX()) //
                + Math.abs(rectangle.getMaxY() - rectangle2D.getMaxY());
    }


    /**
     * Converts a redaction log rectangle into a {@link Rectangle2D}, shifting y by the height and negating the height.
     * NOTE(review): presumably compensates for the log storing rectangles relative to the opposite vertical
     * edge (possibly with negative heights) - confirm against the producer of the redaction log.
     */
    private Rectangle2D toRectangle2D(Rectangle rectangle) {

        return new Rectangle2D.Float(rectangle.getTopLeft().getX(), rectangle.getTopLeft().getY() + rectangle.getHeight(), rectangle.getWidth(), -rectangle.getHeight());
    }

}

View File

@ -1,4 +1,4 @@
package com.iqser.red.service.redaction.v1.server.segmentation;
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter;
import java.util.ArrayList;
import java.util.Collection;
@ -10,9 +10,9 @@ import org.springframework.stereotype.Service;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.iqser.red.service.persistence.service.v1.api.shared.model.dossiertemplate.dossier.file.FileType;
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfTableCell;
import com.iqser.red.service.redaction.v1.server.redaction.model.table.TableCells;
import com.iqser.red.service.redaction.v1.server.redaction.model.table.TableServiceResponse;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.table.PdfTableCell;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.table.TableCells;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.table.TableServiceResponse;
import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService;
import lombok.RequiredArgsConstructor;
@ -22,7 +22,7 @@ import lombok.extern.slf4j.Slf4j;
@Slf4j
@Service
@RequiredArgsConstructor
public class TableService {
public class TableServiceResponseAdapter {
private final ObjectMapper objectMapper;
private final RedactionStorageService redactionStorageService;

View File

@ -1,9 +1,8 @@
package com.iqser.red.service.redaction.v1.server.redaction.model.image;
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.image;
import java.util.HashMap;
import java.util.Map;
import lombok.Data;
@Data

View File

@ -1,5 +1,4 @@
package com.iqser.red.service.redaction.v1.server.redaction.model.image;
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.image;
import lombok.Data;

View File

@ -1,5 +1,4 @@
package com.iqser.red.service.redaction.v1.server.redaction.model.image;
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.image;
import lombok.Data;

View File

@ -1,5 +1,4 @@
package com.iqser.red.service.redaction.v1.server.redaction.model.image;
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.image;
import lombok.Data;

View File

@ -1,5 +1,4 @@
package com.iqser.red.service.redaction.v1.server.redaction.model.image;
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.image;
import lombok.Data;

View File

@ -1,5 +1,4 @@
package com.iqser.red.service.redaction.v1.server.redaction.model.image;
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.image;
import lombok.Data;

View File

@ -1,13 +1,13 @@
package com.iqser.red.service.redaction.v1.server.redaction.model.image;
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.image;
import java.util.ArrayList;
import java.util.List;
import com.fasterxml.jackson.annotation.JsonAlias;
import com.fasterxml.jackson.annotation.JsonProperty;
import lombok.Data;
import java.util.ArrayList;
import java.util.List;
@Data
public class ImageServiceResponse {

View File

@ -1,4 +1,4 @@
package com.iqser.red.service.redaction.v1.server.redaction.model.image;
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.image;
import lombok.Data;

View File

@ -1,5 +1,4 @@
package com.iqser.red.service.redaction.v1.server.redaction.model.image;
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.image;
import lombok.Data;

View File

@ -0,0 +1,10 @@
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.image;
import lombok.Data;
/**
 * Lombok data class with a single boolean flag.
 * NOTE(review): sits in the image adapter package, so it is presumably part of the image service
 * response payload - confirm against ImageServiceResponse.
 */
@Data
public class Probability {
// NOTE(review): presumably marks a classification whose confidence was too low - confirm.
private boolean unconfident;
}

View File

@ -1,5 +1,4 @@
package com.iqser.red.service.redaction.v1.server.redaction.model.table;
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.table;
import lombok.Data;

View File

@ -1,4 +1,4 @@
package com.iqser.red.service.redaction.v1.server.redaction.model;
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.table;
import lombok.AllArgsConstructor;
import lombok.Builder;

View File

@ -1,5 +1,4 @@
package com.iqser.red.service.redaction.v1.server.redaction.model.table;
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.table;
import lombok.Data;

View File

@ -1,9 +1,8 @@
package com.iqser.red.service.redaction.v1.server.redaction.model.table;
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.table;
import java.util.ArrayList;
import java.util.List;
import lombok.Data;
@Data

View File

@ -1,9 +1,8 @@
package com.iqser.red.service.redaction.v1.server.redaction.model.table;
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.table;
import java.util.ArrayList;
import java.util.List;
import lombok.Data;
@Data

View File

@ -1,9 +1,8 @@
package com.iqser.red.service.redaction.v1.server.tableextraction.model;
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
import com.iqser.red.service.redaction.v1.server.classification.model.Orientation;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPageBlock;
import lombok.AllArgsConstructor;
import lombok.Data;
@ -12,7 +11,7 @@ import lombok.NoArgsConstructor;
@Data
@AllArgsConstructor
@NoArgsConstructor
public abstract class AbstractTextContainer {
public abstract class AbstractPageBlock {
@JsonIgnore
protected float minX;
@ -23,7 +22,7 @@ public abstract class AbstractTextContainer {
@JsonIgnore
protected float maxY;
@JsonIgnore
protected String classification;
protected PageBlockType classification;
@JsonIgnore
protected int page;
@ -34,13 +33,19 @@ public abstract class AbstractTextContainer {
public abstract String getText();
public boolean containsBlock(TextBlock other) {
public boolean isHeadline() {
return this instanceof TextPageBlock && this.getClassification() != null && this.getClassification().isHeadline();
}
public boolean containsBlock(TextPageBlock other) {
return this.minX <= other.getPdfMinX() && this.maxX >= other.getPdfMaxX() && this.minY >= other.getPdfMinY() && this.maxY <= other.getPdfMaxY();
}
public boolean contains(AbstractTextContainer other) {
public boolean contains(AbstractPageBlock other) {
return this.minX <= other.minX && this.maxX >= other.maxX && this.minY >= other.minY && this.maxY <= other.maxY;
}
@ -66,4 +71,10 @@ public abstract class AbstractTextContainer {
return maxX - minX;
}
public boolean intersectsY(AbstractPageBlock atc) {
return this.minY <= atc.getMaxY() && this.maxY >= atc.getMinY();
}
}

View File

@ -1,22 +1,24 @@
package com.iqser.red.service.redaction.v1.server.classification.model;
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model;
import java.util.ArrayList;
import java.util.List;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.section.SectionGrid;
import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryVersion;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.StringFrequencyCounter;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.UnclassifiedText;
import com.iqser.red.service.redaction.v1.server.redaction.model.dictionary.DictionaryVersion;
import lombok.Data;
import lombok.NoArgsConstructor;
@Data
@NoArgsConstructor
public class Document {
public class ClassificationDocument {
private List<Page> pages = new ArrayList<>();
private List<Paragraph> paragraphs = new ArrayList<>();
private List<Header> headers = new ArrayList<>();
private List<Footer> footers = new ArrayList<>();
private List<ClassificationPage> pages = new ArrayList<>();
private List<ClassificationSection> sections = new ArrayList<>();
private List<ClassificationHeader> headers = new ArrayList<>();
private List<ClassificationFooter> footers = new ArrayList<>();
private List<UnclassifiedText> unclassifiedTexts = new ArrayList<>();
private FloatFrequencyCounter textHeightCounter = new FloatFrequencyCounter();
private FloatFrequencyCounter fontSizeCounter = new FloatFrequencyCounter();

View File

@ -0,0 +1,16 @@
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model;
import java.util.List;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPageBlock;
import lombok.AllArgsConstructor;
import lombok.Data;
/**
 * Lombok data holder named {@code ClassificationFooter} that wraps a list of {@link TextPageBlock}s.
 * An all-args constructor taking that list is generated.
 */
@Data
@AllArgsConstructor
public class ClassificationFooter {
// The text page blocks held by this object; no default value is assigned.
private List<TextPageBlock> textBlocks;
}

View File

@ -0,0 +1,16 @@
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model;
import java.util.List;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPageBlock;
import lombok.AllArgsConstructor;
import lombok.Data;
/**
 * Lombok data holder named {@code ClassificationHeader} that wraps a list of {@link TextPageBlock}s.
 * An all-args constructor taking that list is generated.
 */
@Data
@AllArgsConstructor
public class ClassificationHeader {
// The text page blocks held by this object; no default value is assigned.
private List<TextPageBlock> textBlocks;
}

View File

@ -1,11 +1,11 @@
package com.iqser.red.service.redaction.v1.server.classification.model;
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model;
import java.util.ArrayList;
import java.util.List;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.image.ClassifiedImage;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.StringFrequencyCounter;
import lombok.Data;
import lombok.NonNull;
@ -13,12 +13,12 @@ import lombok.RequiredArgsConstructor;
@Data
@RequiredArgsConstructor
public class Page {
public class ClassificationPage {
@NonNull
private List<AbstractTextContainer> textBlocks;
private List<AbstractPageBlock> textBlocks;
private List<PdfImage> images = new ArrayList<>();
private List<ClassifiedImage> images = new ArrayList<>();
private Rectangle bodyTextFrame;
@ -35,10 +35,4 @@ public class Page {
private float pageWidth;
private float pageHeight;
public boolean isRotated() {
return rotation != 0;
}
}

View File

@ -0,0 +1,32 @@
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model;
import java.util.ArrayList;
import java.util.List;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.image.ClassifiedImage;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table.TablePageBlock;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * Groups page blocks, classified images and a headline under the name {@code ClassificationSection}.
 * Both lists default to empty mutable lists.
 */
@Data
@NoArgsConstructor
public class ClassificationSection {

    private List<AbstractPageBlock> pageBlocks = new ArrayList<>();
    private List<ClassifiedImage> images = new ArrayList<>();
    private String headline;


    /**
     * Returns the page blocks that are {@link TablePageBlock}s, in list order.
     *
     * @return a new mutable list of the contained tables; empty if there are none
     */
    public List<TablePageBlock> getTables() {

        List<TablePageBlock> result = new ArrayList<>();
        for (AbstractPageBlock pageBlock : pageBlocks) {
            if (pageBlock instanceof TablePageBlock tablePageBlock) {
                result.add(tablePageBlock);
            }
        }
        return result;
    }

}

View File

@ -1,6 +1,4 @@
package com.iqser.red.service.redaction.v1.server.classification.model;
import lombok.Getter;
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model;
import java.util.ArrayList;
import java.util.Collections;
@ -9,6 +7,8 @@ import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import lombok.Getter;
public class FloatFrequencyCounter {
@Getter

View File

@ -0,0 +1,8 @@
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model;
/**
 * Orientation marker with three possible values.
 * The declaration order is kept stable so that ordinals and serialized names do not change.
 */
public enum Orientation {

    /** No orientation assigned. */
    NONE,

    /** Left orientation. */
    LEFT,

    /** Right orientation. */
    RIGHT

}

View File

@ -0,0 +1,38 @@
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model;
/**
 * Classification labels for page blocks: six headline levels, header/footer, title,
 * several paragraph flavours, a catch-all and tables.
 */
public enum PageBlockType {

    H1,
    H2,
    H3,
    H4,
    H5,
    H6,
    HEADER,
    FOOTER,
    TITLE,
    PARAGRAPH,
    PARAGRAPH_BOLD,
    PARAGRAPH_ITALIC,
    PARAGRAPH_UNKNOWN,
    OTHER,
    TABLE;


    /**
     * Maps a numeric headline level to its headline type.
     *
     * @param i the headline level
     * @return {@link #H1} to {@link #H5} for the levels 1 to 5; {@link #H6} for every other value,
     *         including zero, negative numbers and levels above 6
     */
    public static PageBlockType getHeadlineType(int i) {

        return switch (i) {
            case 1 -> H1;
            case 2 -> H2;
            case 3 -> H3;
            case 4 -> H4;
            case 5 -> H5;
            default -> H6;
        };
    }


    /**
     * Tells whether this type is one of the six headline levels.
     *
     * @return {@code true} for {@link #H1} through {@link #H6}, {@code false} for all other types
     */
    public boolean isHeadline() {

        return switch (this) {
            case H1, H2, H3, H4, H5, H6 -> true;
            default -> false;
        };
    }

}

View File

@ -0,0 +1,25 @@
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.image;
import java.awt.geom.Rectangle2D;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.ImageType;
import lombok.Data;
import lombok.NonNull;
import lombok.RequiredArgsConstructor;
/**
 * Data holder for an image: its position rectangle, its image type, a transparency flag,
 * a page number and a flag recording whether it has been appended to a section.
 * Accessors and the constructor are generated by Lombok; the field order below determines
 * the parameter order of the generated constructor, so do not reorder fields casually.
 */
@Data
@RequiredArgsConstructor
public class ClassifiedImage {
// Position rectangle of the image; must not be null.
@NonNull
private Rectangle2D position;
// Type assigned to the image; must not be null.
@NonNull
private ImageType imageType;
// Not annotated and not final, so it is not a constructor argument and starts out as false.
private boolean isAppendedToSection;
// NOTE(review): @NonNull cannot trigger a null check on a primitive; presumably it is only here
// so that Lombok's @RequiredArgsConstructor treats the field as required - confirm before removing.
@NonNull
private boolean hasTransparency;
// NOTE(review): same as above - @NonNull on a primitive int, presumably only to make the field a
// constructor argument - confirm before removing.
@NonNull
private int page;
}

View File

@ -1,25 +1,25 @@
package com.iqser.red.service.redaction.v1.server.tableextraction.model;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table;
import java.awt.geom.Point2D;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPageBlock;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
@SuppressWarnings("serial")
@Data
@EqualsAndHashCode(callSuper = true)
@NoArgsConstructor
public class Cell extends Rectangle {
private List<TextBlock> textBlocks = new ArrayList<>();
private List<TextPageBlock> textBlocks = new ArrayList<>();
private List<Cell> headerCells = new ArrayList<>();
@ -27,6 +27,8 @@ public class Cell extends Rectangle {
private static final int MIN_SIZE = 1;
private int pageNumber;
public Cell(Point2D topLeft, Point2D bottomRight) {
@ -34,7 +36,7 @@ public class Cell extends Rectangle {
}
public void addTextBlock(TextBlock textBlock) {
public void addTextBlock(TextPageBlock textBlock) {
textBlocks.add(textBlock);
}
@ -45,11 +47,11 @@ public class Cell extends Rectangle {
StringBuilder sb = new StringBuilder();
Iterator<TextBlock> itty = textBlocks.iterator();
Iterator<TextPageBlock> itty = textBlocks.iterator();
TextPositionSequence previous = null;
while (itty.hasNext()) {
TextBlock textBlock = itty.next();
TextPageBlock textBlock = itty.next();
for (TextPositionSequence word : textBlock.getSequences()) {
if (previous != null) {

View File

@ -1,4 +1,4 @@
package com.iqser.red.service.redaction.v1.server.tableextraction.model;
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table;
import lombok.RequiredArgsConstructor;
import lombok.Value;

View File

@ -1,10 +1,10 @@
package com.iqser.red.service.redaction.v1.server.tableextraction.model;
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table;
import java.util.List;
import lombok.Builder;
import lombok.Data;
import java.util.List;
@Data
@Builder
public class CleanRulings {

View File

@ -1,4 +1,4 @@
package com.iqser.red.service.redaction.v1.server.tableextraction.model;
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;

View File

@ -1,14 +1,20 @@
package com.iqser.red.service.redaction.v1.server.tableextraction.model;
import com.iqser.red.service.redaction.v1.server.tableextraction.utils.CohenSutherlandClipping;
import com.iqser.red.service.redaction.v1.server.tableextraction.utils.Utils;
import lombok.extern.slf4j.Slf4j;
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table;
import java.awt.geom.Line2D;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.util.*;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.Formatter;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.utils.CohenSutherlandClipping;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.utils.Utils;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@SuppressWarnings("all")

View File

@ -1,25 +1,25 @@
package com.iqser.red.service.redaction.v1.server.tableextraction.model;
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table;
import java.awt.geom.Point2D;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.TreeMap;
import java.util.stream.Collectors;
import org.apache.commons.collections4.CollectionUtils;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.AbstractPageBlock;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.PageBlockType;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPageBlock;
import lombok.Getter;
import lombok.Setter;
import lombok.extern.slf4j.Slf4j;
@Slf4j
public class Table extends AbstractTextContainer {
public class TablePageBlock extends AbstractPageBlock {
private final TreeMap<CellPosition, Cell> cells = new TreeMap<>();
@ -29,21 +29,18 @@ public class Table extends AbstractTextContainer {
private String headline;
private int unrotatedRowCount;
private int unrotatedColCount;
private int rowCount = -1;
private int colCount = -1;
private List<List<Cell>> rows;
public Table(List<Cell> cells, Rectangle area, int rotation) {
public TablePageBlock(List<Cell> cells, Rectangle area, int rotation) {
addCells(cells);
minX = area.getLeft();
minY = area.getBottom();
maxX = area.getRight();
maxY = area.getTop();
classification = "Table";
classification = PageBlockType.TABLE;
this.rotation = rotation;
}
@ -71,19 +68,13 @@ public class Table extends AbstractTextContainer {
public int getRowCount() {
if (rowCount == -1) {
rowCount = getRows().size();
}
return rowCount;
return getRows().size();
}
public int getColCount() {
if (colCount == -1) {
colCount = getRows().stream().mapToInt(List::size).max().orElse(0);
}
return colCount;
return getRows().stream().mapToInt(List::size).max().orElse(0);
}
@ -224,7 +215,7 @@ public class Table extends AbstractTextContainer {
* Calculates the structure of the table. For spanning rows and columns multiple cells with the same values will be inserted.
*
* @param cells The found cells
* @return Table Structure
* @return TablePageBlock Structure
*/
private List<List<Cell>> calculateStructure(List<Cell> cells) {
@ -243,8 +234,8 @@ public class Table extends AbstractTextContainer {
uniqueY.add(c.getTop());
});
var sortedUniqueX = uniqueX.stream().sorted().collect(Collectors.toList());
var sortedUniqueY = uniqueY.stream().sorted().collect(Collectors.toList());
var sortedUniqueX = uniqueX.stream().sorted().toList();
var sortedUniqueY = uniqueY.stream().sorted().toList();
Float prevY = null;
for (Float y : sortedUniqueY) {
@ -258,9 +249,7 @@ public class Table extends AbstractTextContainer {
var cell = new Cell(new Point2D.Float(prevX, prevY), new Point2D.Float(x, y));
var intersectionCell = cells.stream().filter(c -> cell.intersects(c) && cell.overlapRatio(c) > 0.1f).findFirst();
if (intersectionCell.isPresent()) {
cell.getTextBlocks().addAll(intersectionCell.get().getTextBlocks());
}
intersectionCell.ifPresent(value -> cell.getTextBlocks().addAll(value.getTextBlocks()));
if (cell.hasMinimumSize()) {
row.add(cell);
}
@ -268,7 +257,7 @@ public class Table extends AbstractTextContainer {
prevX = x;
}
if (prevY != null && prevX != null) {
if (prevY != null && prevX != null && !row.isEmpty()) {
matrix.add(row);
}
prevY = y;
@ -299,7 +288,7 @@ public class Table extends AbstractTextContainer {
}
if (column != null && column.getTextBlocks() != null) {
boolean first = true;
for (TextBlock textBlock : column.getTextBlocks()) {
for (TextPageBlock textBlock : column.getTextBlocks()) {
if (!first) {
sb.append("\n");
}
@ -331,7 +320,7 @@ public class Table extends AbstractTextContainer {
sb.append(i == 0 ? "\n<th>" : "\n<td>");
if (column != null && column.getTextBlocks() != null) {
boolean first = true;
for (TextBlock textBlock : column.getTextBlocks()) {
for (TextPageBlock textBlock : column.getTextBlocks()) {
if (!first) {
sb.append("<br />");
}

View File

@ -1,4 +1,4 @@
package com.iqser.red.service.redaction.v1.server.parsing.model;
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text;
import org.apache.pdfbox.text.TextPosition;
import org.springframework.beans.BeanUtils;

View File

@ -0,0 +1,49 @@
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text;
import java.util.ArrayList;
import java.util.List;
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
import lombok.Getter;
/**
 * Collects {@link TextPositionSequence}s and renders them as a single normalized string.
 */
public class SearchableText {

    @Getter
    private final List<TextPositionSequence> sequences = new ArrayList<>();


    /**
     * Appends one sequence to the end of the collected sequences.
     *
     * @param textPositionSequence the sequence to append
     */
    public void add(TextPositionSequence textPositionSequence) {

        sequences.add(textPositionSequence);
    }


    /**
     * Appends all given sequences, in their list order, to the end of the collected sequences.
     *
     * @param textPositionSequences the sequences to append
     */
    public void addAll(List<TextPositionSequence> textPositionSequences) {

        sequences.addAll(textPositionSequences);
    }


    /**
     * @return the normalized text of all collected sequences, as produced by {@link #buildString(List)}
     */
    @Override
    public String toString() {

        return buildString(sequences);
    }


    /**
     * Concatenates the string form of every sequence, each followed by a single space, and then
     * passes the result through {@code TextNormalizationUtilities}: hyphenated line breaks are
     * removed first, then line breaks, then repeating whitespace.
     *
     * @param sequences the sequences to render, in output order
     * @return the normalized text
     */
    public static String buildString(List<TextPositionSequence> sequences) {

        StringBuilder joined = new StringBuilder();
        sequences.forEach(sequence -> joined.append(sequence).append(' '));

        // The order of the three normalization steps is kept exactly as before.
        String withoutHyphenBreaks = TextNormalizationUtilities.removeHyphenLineBreaks(joined.toString());
        String withoutLineBreaks = TextNormalizationUtilities.removeLineBreaks(withoutHyphenBreaks);
        return TextNormalizationUtilities.removeRepeatingWhitespaces(withoutLineBreaks);
    }

}

View File

@ -1,5 +1,4 @@
package com.iqser.red.service.redaction.v1.server.classification.model;
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text;
import lombok.AllArgsConstructor;
import lombok.Builder;

View File

@ -1,4 +1,4 @@
package com.iqser.red.service.redaction.v1.server.classification.model;
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text;
import java.util.ArrayList;
import java.util.List;

View File

@ -1,10 +1,10 @@
package com.iqser.red.service.redaction.v1.server.classification.model;
import lombok.Getter;
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text;
import java.util.HashMap;
import java.util.Map;
import lombok.Getter;
public class StringFrequencyCounter {
@Getter

View File

@ -1,6 +1,4 @@
package com.iqser.red.service.redaction.v1.server.parsing.model;
import java.util.Objects;
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text;
import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonValue;
@ -46,18 +44,4 @@ public enum TextDirection {
throw new IllegalArgumentException(String.format("A value of %f is not supported by TextDirection", degrees));
}
public static TextDirection fromString(String degreesAsString) {
Objects.requireNonNull(degreesAsString, "Cannot construct a text direction from a null value");
String value = degreesAsString.strip();
if (degreesAsString.endsWith(VALUE_STRING_SUFFIX)) {
value = degreesAsString.replace(VALUE_STRING_SUFFIX + "$", "");
}
return fromDegrees(Float.parseFloat(value));
}
}

View File

@ -1,13 +1,12 @@
package com.iqser.red.service.redaction.v1.server.classification.model;
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text;
import java.util.ArrayList;
import java.util.List;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextDirection;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.AbstractPageBlock;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.PageBlockType;
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
import lombok.AllArgsConstructor;
import lombok.Builder;
@ -18,7 +17,7 @@ import lombok.NoArgsConstructor;
@Builder
@Data
@NoArgsConstructor
public class TextBlock extends AbstractTextContainer {
public class TextPageBlock extends AbstractPageBlock {
@Builder.Default
private List<TextPositionSequence> sequences = new ArrayList<>();
@ -45,7 +44,7 @@ public class TextBlock extends AbstractTextContainer {
private float highestFontSize;
@JsonIgnore
private String classification;
private PageBlockType classification;
@JsonIgnore
@ -95,6 +94,7 @@ public class TextBlock extends AbstractTextContainer {
}
}
/**
* Returns the maxX value in pdf coordinate system.
* Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation.
@ -174,7 +174,7 @@ public class TextBlock extends AbstractTextContainer {
}
public TextBlock(float minX, float maxX, float minY, float maxY, List<TextPositionSequence> sequences, int rotation) {
public TextPageBlock(float minX, float maxX, float minY, float maxY, List<TextPositionSequence> sequences, int rotation) {
this.minX = minX;
this.maxX = maxX;
@ -185,23 +185,23 @@ public class TextBlock extends AbstractTextContainer {
}
public TextBlock union(TextPositionSequence r) {
public TextPageBlock union(TextPositionSequence r) {
TextBlock union = this.copy();
TextPageBlock union = this.copy();
union.add(r);
return union;
}
public TextBlock union(TextBlock r) {
public TextPageBlock union(TextPageBlock r) {
TextBlock union = this.copy();
TextPageBlock union = this.copy();
union.add(r);
return union;
}
public void add(TextBlock r) {
public void add(TextPageBlock r) {
if (r.getMinX() < minX) {
minX = r.getMinX();
@ -236,9 +236,9 @@ public class TextBlock extends AbstractTextContainer {
}
public TextBlock copy() {
public TextPageBlock copy() {
return new TextBlock(minX, maxX, minY, maxY, sequences, rotation);
return new TextPageBlock(minX, maxX, minY, maxY, sequences, rotation);
}

View File

@ -1,4 +1,4 @@
package com.iqser.red.service.redaction.v1.server.parsing.model;
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text;
import java.awt.geom.AffineTransform;
import java.awt.geom.Point2D;

View File

@ -0,0 +1,14 @@
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text;
import java.util.List;
import lombok.AllArgsConstructor;
import lombok.Data;
/**
 * Wrapper around a list of {@link TextPageBlock}s.
 * Lombok generates the accessors and a constructor taking the list.
 */
@Data
@AllArgsConstructor
public class UnclassifiedText {
// The wrapped text blocks; the reference is stored as passed in, without copying.
private List<TextPageBlock> textBlocks;
}

View File

@ -14,34 +14,18 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.iqser.red.service.redaction.v1.server.parsing;
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.parsing;
import java.io.InputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Map;
import java.util.WeakHashMap;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.fontbox.ttf.TrueTypeFont;
import org.apache.fontbox.util.BoundingBox;
import org.apache.pdfbox.contentstream.PDFStreamEngine;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.font.encoding.GlyphList;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.font.PDCIDFont;
import org.apache.pdfbox.pdmodel.font.PDCIDFontType2;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.font.PDSimpleFont;
import org.apache.pdfbox.pdmodel.font.PDTrueTypeFont;
import org.apache.pdfbox.pdmodel.font.PDType0Font;
import org.apache.pdfbox.pdmodel.font.PDType3Font;
import org.apache.pdfbox.pdmodel.graphics.state.PDGraphicsState;
import org.apache.pdfbox.text.TextPosition;
import org.apache.pdfbox.util.Matrix;
import org.apache.pdfbox.util.Vector;
import org.apache.pdfbox.contentstream.operator.DrawObject;
import org.apache.pdfbox.contentstream.operator.state.Concatenate;
import org.apache.pdfbox.contentstream.operator.state.Restore;
@ -50,22 +34,36 @@ import org.apache.pdfbox.contentstream.operator.state.SetGraphicsStateParameters
import org.apache.pdfbox.contentstream.operator.state.SetMatrix;
import org.apache.pdfbox.contentstream.operator.text.BeginText;
import org.apache.pdfbox.contentstream.operator.text.EndText;
import org.apache.pdfbox.contentstream.operator.text.SetFontAndSize;
import org.apache.pdfbox.contentstream.operator.text.SetTextHorizontalScaling;
import org.apache.pdfbox.contentstream.operator.text.ShowTextAdjusted;
import org.apache.pdfbox.contentstream.operator.text.ShowTextLine;
import org.apache.pdfbox.contentstream.operator.text.ShowTextLineAndSpace;
import org.apache.pdfbox.contentstream.operator.text.MoveText;
import org.apache.pdfbox.contentstream.operator.text.MoveTextSetLeading;
import org.apache.pdfbox.contentstream.operator.text.NextLine;
import org.apache.pdfbox.contentstream.operator.text.SetCharSpacing;
import org.apache.pdfbox.contentstream.operator.text.SetFontAndSize;
import org.apache.pdfbox.contentstream.operator.text.SetTextHorizontalScaling;
import org.apache.pdfbox.contentstream.operator.text.SetTextLeading;
import org.apache.pdfbox.contentstream.operator.text.SetTextRenderingMode;
import org.apache.pdfbox.contentstream.operator.text.SetTextRise;
import org.apache.pdfbox.contentstream.operator.text.SetWordSpacing;
import org.apache.pdfbox.contentstream.operator.text.ShowText;
import org.apache.pdfbox.contentstream.operator.text.ShowTextAdjusted;
import org.apache.pdfbox.contentstream.operator.text.ShowTextLine;
import org.apache.pdfbox.contentstream.operator.text.ShowTextLineAndSpace;
import org.apache.pdfbox.cos.COSDictionary;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.font.PDCIDFont;
import org.apache.pdfbox.pdmodel.font.PDCIDFontType2;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.font.PDFontDescriptor;
import org.apache.pdfbox.pdmodel.font.PDSimpleFont;
import org.apache.pdfbox.pdmodel.font.PDTrueTypeFont;
import org.apache.pdfbox.pdmodel.font.PDType0Font;
import org.apache.pdfbox.pdmodel.font.PDType3Font;
import org.apache.pdfbox.pdmodel.font.encoding.GlyphList;
import org.apache.pdfbox.pdmodel.graphics.state.PDGraphicsState;
import org.apache.pdfbox.text.TextPosition;
import org.apache.pdfbox.util.Matrix;
import org.apache.pdfbox.util.Vector;
/**
* LEGACY text calculations which are known to be incorrect but are depended on by PDFTextStripper.

View File

@ -1,17 +1,32 @@
package com.iqser.red.service.redaction.v1.server.parsing;
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.parsing;
import com.iqser.red.service.redaction.v1.server.parsing.model.RedTextPosition;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Ruling;
import lombok.Getter;
import lombok.Setter;
import lombok.extern.slf4j.Slf4j;
import java.awt.geom.Point2D;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import org.apache.pdfbox.contentstream.operator.Operator;
import org.apache.pdfbox.contentstream.operator.OperatorName;
import org.apache.pdfbox.contentstream.operator.color.*;
import org.apache.pdfbox.contentstream.operator.state.*;
import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingColor;
import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingColorN;
import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingColorSpace;
import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingDeviceCMYKColor;
import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingDeviceGrayColor;
import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingDeviceRGBColor;
import org.apache.pdfbox.contentstream.operator.color.SetStrokingColor;
import org.apache.pdfbox.contentstream.operator.color.SetStrokingColorN;
import org.apache.pdfbox.contentstream.operator.color.SetStrokingColorSpace;
import org.apache.pdfbox.contentstream.operator.color.SetStrokingDeviceCMYKColor;
import org.apache.pdfbox.contentstream.operator.color.SetStrokingDeviceGrayColor;
import org.apache.pdfbox.contentstream.operator.color.SetStrokingDeviceRGBColor;
import org.apache.pdfbox.contentstream.operator.state.SetFlatness;
import org.apache.pdfbox.contentstream.operator.state.SetLineCapStyle;
import org.apache.pdfbox.contentstream.operator.state.SetLineDashPattern;
import org.apache.pdfbox.contentstream.operator.state.SetLineJoinStyle;
import org.apache.pdfbox.contentstream.operator.state.SetLineMiterLimit;
import org.apache.pdfbox.contentstream.operator.state.SetLineWidth;
import org.apache.pdfbox.contentstream.operator.state.SetRenderingIntent;
import org.apache.pdfbox.contentstream.operator.text.SetFontAndSize;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSNumber;
@ -19,11 +34,13 @@ import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.text.TextPosition;
import java.awt.geom.Point2D;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table.Ruling;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.RedTextPosition;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPositionSequence;
import lombok.Getter;
import lombok.Setter;
import lombok.extern.slf4j.Slf4j;
@Slf4j
public class PDFLinesTextStripper extends PDFTextStripper {
@ -264,8 +281,8 @@ public class PDFLinesTextStripper extends PDFTextStripper {
// Remove false sequence ends (whitespaces)
if (previous != null && sublist.get(0).getYDirAdj() == previous.getYDirAdj() && sublist.get(0)
.getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()) < 0.01) {
for (TextPosition t : sublist) {
textPositionSequences.get(textPositionSequences.size() - 1).add(t);
for (TextPosition textPosition : sublist) {
textPositionSequences.get(textPositionSequences.size() - 1).add(textPosition);
}
} else {
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber));

View File

@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.iqser.red.service.redaction.v1.server.parsing;
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.parsing;
import java.io.BufferedInputStream;
import java.io.IOException;

View File

@ -1,4 +1,4 @@
package com.iqser.red.service.redaction.v1.server.classification.service;
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.service;
import static java.util.stream.Collectors.toSet;
@ -9,15 +9,15 @@ import java.util.List;
import org.springframework.stereotype.Service;
import com.iqser.red.service.redaction.v1.server.classification.model.FloatFrequencyCounter;
import com.iqser.red.service.redaction.v1.server.classification.model.Orientation;
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
import com.iqser.red.service.redaction.v1.server.classification.model.StringFrequencyCounter;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.classification.utils.RulingTextDirAdjustUtil;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Ruling;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.AbstractPageBlock;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationPage;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.FloatFrequencyCounter;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.Orientation;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table.Ruling;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.StringFrequencyCounter;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPageBlock;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPositionSequence;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.utils.RulingTextDirAdjustUtil;
@Service
@SuppressWarnings("all")
@ -29,16 +29,18 @@ public class BlockificationService {
/**
* This method is building blocks by expanding the minX/maxX and minY/maxY value on each word that is not split by the conditions.
* This method must use text direction adjusted positions (DirAdj), where {0,0} is on the upper left. Never try to change this!
* Rulings (Table lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling.
* @param textPositions The words of a page.
* Rulings (table lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling.
*
* @param textPositions The words of a page.
* @param horizontalRulingLines Horizontal table lines.
* @param verticalRulingLines Vertical table lines.
* @param verticalRulingLines Vertical table lines.
* @return ClassificationPage object that contains the text blocks and text statistics.
*/
public Page blockify(List<TextPositionSequence> textPositions, List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
int indexOnPage = 0;
List<TextPositionSequence> chunkWords = new ArrayList<>();
List<AbstractTextContainer> chunkBlockList1 = new ArrayList<>();
List<AbstractPageBlock> chunkBlockList = new ArrayList<>();
float minX = 1000, maxX = 0, minY = 1000, maxY = 0;
TextPositionSequence prev = null;
@ -58,12 +60,14 @@ public class BlockificationService {
if (prev != null && (lineSeparation || startFromTop || splitByX || splitByDir || isSplitByRuling)) {
Orientation prevOrientation = null;
if (!chunkBlockList1.isEmpty()) {
prevOrientation = chunkBlockList1.get(chunkBlockList1.size() - 1).getOrientation();
if (!chunkBlockList.isEmpty()) {
prevOrientation = chunkBlockList.get(chunkBlockList.size() - 1).getOrientation();
}
TextBlock cb1 = buildTextBlock(chunkWords);
chunkBlockList1.add(cb1);
TextPageBlock cb1 = buildTextBlock(chunkWords, indexOnPage);
indexOnPage++;
chunkBlockList.add(cb1);
chunkWords = new ArrayList<>();
if (splitByX && !isSplitByRuling) {
@ -102,17 +106,17 @@ public class BlockificationService {
}
}
TextBlock cb1 = buildTextBlock(chunkWords);
TextPageBlock cb1 = buildTextBlock(chunkWords, indexOnPage);
if (cb1 != null) {
chunkBlockList1.add(cb1);
chunkBlockList.add(cb1);
}
Iterator<AbstractTextContainer> itty = chunkBlockList1.iterator();
Iterator<AbstractPageBlock> itty = chunkBlockList.iterator();
TextBlock previousLeft = null;
TextBlock previousRight = null;
TextPageBlock previousLeft = null;
TextPageBlock previousRight = null;
while (itty.hasNext()) {
TextBlock block = (TextBlock) itty.next();
TextPageBlock block = (TextPageBlock) itty.next();
if (previousLeft != null && block.getOrientation().equals(Orientation.LEFT)) {
if (previousLeft.getMinY() > block.getMinY() && block.getMaxY() + block.getMostPopularWordHeight() > previousLeft.getMinY()) {
@ -137,10 +141,10 @@ public class BlockificationService {
}
}
itty = chunkBlockList1.iterator();
TextBlock previous = null;
itty = chunkBlockList.iterator();
TextPageBlock previous = null;
while (itty.hasNext()) {
TextBlock block = (TextBlock) itty.next();
TextPageBlock block = (TextPageBlock) itty.next();
if (previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation().equals(Orientation.LEFT) && equalsWithThreshold(block.getMaxY(),
previous.getMaxY()) || previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation()
@ -153,7 +157,7 @@ public class BlockificationService {
previous = block;
}
return new Page(chunkBlockList1);
return new ClassificationPage(chunkBlockList);
}
@ -163,9 +167,9 @@ public class BlockificationService {
}
private TextBlock buildTextBlock(List<TextPositionSequence> wordBlockList) {
private TextPageBlock buildTextBlock(List<TextPositionSequence> wordBlockList, int indexOnPage) {
TextBlock textBlock = null;
TextPageBlock textBlock = null;
FloatFrequencyCounter lineHeightFrequencyCounter = new FloatFrequencyCounter();
FloatFrequencyCounter fontSizeFrequencyCounter = new FloatFrequencyCounter();
@ -182,9 +186,14 @@ public class BlockificationService {
styleFrequencyCounter.add(wordBlock.getFontStyle());
if (textBlock == null) {
textBlock = new TextBlock(wordBlock.getMinXDirAdj(), wordBlock.getMaxXDirAdj(), wordBlock.getMinYDirAdj(), wordBlock.getMaxYDirAdj(), wordBlockList, wordBlock.getRotation());
textBlock = new TextPageBlock(wordBlock.getMinXDirAdj(),
wordBlock.getMaxXDirAdj(),
wordBlock.getMinYDirAdj(),
wordBlock.getMaxYDirAdj(),
wordBlockList,
wordBlock.getRotation());
} else {
TextBlock spatialEntity = textBlock.union(wordBlock);
TextPageBlock spatialEntity = textBlock.union(wordBlock);
textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight());
}
}
@ -213,10 +222,38 @@ public class BlockificationService {
List<Ruling> horizontalRulingLines,
List<Ruling> verticalRulingLines) {
return isSplitByRuling(maxX, minY, word.getMinXDirAdj(), word.getMinYDirAdj(), verticalRulingLines, word.getDir().getDegrees(), word.getPageWidth(), word.getPageHeight()) //
|| isSplitByRuling(minX, minY, word.getMinXDirAdj(), word.getMaxYDirAdj(), horizontalRulingLines, word.getDir().getDegrees(), word.getPageWidth(), word.getPageHeight()) //
|| isSplitByRuling(maxX, minY, word.getMinXDirAdj(), word.getMinYDirAdj(), horizontalRulingLines, word.getDir().getDegrees(), word.getPageWidth(), word.getPageHeight()) //
|| isSplitByRuling(minX, minY, word.getMinXDirAdj(), word.getMaxYDirAdj(), verticalRulingLines, word.getDir().getDegrees(), word.getPageWidth(), word.getPageHeight()); //
return isSplitByRuling(maxX,
minY,
word.getMinXDirAdj(),
word.getMinYDirAdj(),
verticalRulingLines,
word.getDir().getDegrees(),
word.getPageWidth(),
word.getPageHeight()) //
|| isSplitByRuling(minX,
minY,
word.getMinXDirAdj(),
word.getMaxYDirAdj(),
horizontalRulingLines,
word.getDir().getDegrees(),
word.getPageWidth(),
word.getPageHeight()) //
|| isSplitByRuling(maxX,
minY,
word.getMinXDirAdj(),
word.getMinYDirAdj(),
horizontalRulingLines,
word.getDir().getDegrees(),
word.getPageWidth(),
word.getPageHeight()) //
|| isSplitByRuling(minX,
minY,
word.getMinXDirAdj(),
word.getMaxYDirAdj(),
verticalRulingLines,
word.getDir().getDegrees(),
word.getPageWidth(),
word.getPageHeight());
}

View File

@ -1,4 +1,4 @@
package com.iqser.red.service.redaction.v1.server.classification.service;
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.service;
import java.util.List;
@ -6,17 +6,20 @@ import org.springframework.stereotype.Service;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
import com.iqser.red.service.redaction.v1.server.classification.model.FloatFrequencyCounter;
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.classification.utils.PositionUtils;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.AbstractPageBlock;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationPage;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.FloatFrequencyCounter;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table.Cell;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table.TablePageBlock;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPageBlock;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.utils.PositionUtils;
@Service
public class BodyTextFrameService {
private static final float APPROXIMATE_HEADER_LINE_COUNT = 2.9f;
/**
* Adjusts and sets the body text frame to a page.
* Note: This needs to use the PDF coordinate system, where {0,0} is rotated with the page rotation.
@ -30,7 +33,7 @@ public class BodyTextFrameService {
* @param bodyTextFrame frame that contains the main text on portrait pages
* @param landscapeBodyTextFrame frame that contains the main text on landscape pages
*/
public void setBodyTextFrameAdjustedToPage(Page page, Rectangle bodyTextFrame, Rectangle landscapeBodyTextFrame) {
public void setBodyTextFrameAdjustedToPage(ClassificationPage page, Rectangle bodyTextFrame, Rectangle landscapeBodyTextFrame) {
Rectangle textFrame = page.isLandscape() ? landscapeBodyTextFrame : bodyTextFrame;
@ -65,26 +68,26 @@ public class BodyTextFrameService {
* @param landscape Calculate for landscape or portrait
* @return Rectangle of the text frame
*/
public Rectangle calculateBodyTextFrame(List<Page> pages, FloatFrequencyCounter documentFontSizeCounter, boolean landscape) {
public Rectangle calculateBodyTextFrame(List<ClassificationPage> pages, FloatFrequencyCounter documentFontSizeCounter, boolean landscape) {
BodyTextFrameExpansionsRectangle expansionsRectangle = new BodyTextFrameExpansionsRectangle();
for (Page page : pages) {
for (ClassificationPage page : pages) {
if (page.getTextBlocks().isEmpty() || landscape != page.isLandscape()) {
continue;
}
for (AbstractTextContainer container : page.getTextBlocks()) {
for (AbstractPageBlock container : page.getTextBlocks()) {
if (container instanceof TextBlock) {
TextBlock textBlock = (TextBlock) container;
if (container instanceof TextPageBlock) {
TextPageBlock textBlock = (TextPageBlock) container;
if (textBlock.getMostPopularWordFont() == null || textBlock.getMostPopularWordStyle() == null) {
continue;
}
float approxLineCount = PositionUtils.getApproxLineCount(textBlock);
if (approxLineCount < 2.9f) {
if (approxLineCount < APPROXIMATE_HEADER_LINE_COUNT) {
continue;
}
@ -94,15 +97,15 @@ public class BodyTextFrameService {
}
}
if (container instanceof Table) {
Table table = (Table) container;
if (container instanceof TablePageBlock) {
TablePageBlock table = (TablePageBlock) container;
for (List<Cell> row : table.getRows()) {
for (Cell cell : row) {
if (cell == null || cell.getTextBlocks() == null) {
continue;
}
for (TextBlock textBlock : cell.getTextBlocks()) {
for (TextPageBlock textBlock : cell.getTextBlocks()) {
expandRectangle(textBlock, page, expansionsRectangle);
}
}
@ -117,7 +120,7 @@ public class BodyTextFrameService {
}
private void expandRectangle(TextBlock textBlock, Page page, BodyTextFrameExpansionsRectangle expansionsRectangle) {
private void expandRectangle(TextPageBlock textBlock, ClassificationPage page, BodyTextFrameExpansionsRectangle expansionsRectangle) {
if (page.getPageWidth() > page.getPageHeight() && page.getRotation() != 0) {
if (textBlock.getPdfMinY() < expansionsRectangle.minX) {

View File

@ -1,4 +1,4 @@
package com.iqser.red.service.redaction.v1.server.classification.service;
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.service;
import java.util.List;
import java.util.regex.Pattern;
@ -6,11 +6,12 @@ import java.util.regex.Pattern;
import org.springframework.stereotype.Service;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.classification.utils.PositionUtils;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.AbstractPageBlock;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationDocument;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationPage;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.PageBlockType;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPageBlock;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.utils.PositionUtils;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
@ -23,7 +24,7 @@ public class ClassificationService {
private final BodyTextFrameService bodyTextFrameService;
public void classifyDocument(Document document) {
public void classifyDocument(ClassificationDocument document) {
Rectangle bodyTextFrame = bodyTextFrameService.calculateBodyTextFrame(document.getPages(), document.getFontSizeCounter(), false);
Rectangle landscapeBodyTextFrame = bodyTextFrameService.calculateBodyTextFrame(document.getPages(), document.getFontSizeCounter(), true);
@ -31,43 +32,43 @@ public class ClassificationService {
log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());
for (Page page : document.getPages()) {
for (ClassificationPage page : document.getPages()) {
bodyTextFrameService.setBodyTextFrameAdjustedToPage(page, bodyTextFrame, landscapeBodyTextFrame);
classifyPage(page, document, headlineFontSizes);
}
}
public void classifyPage(Page page, Document document, List<Float> headlineFontSizes) {
public void classifyPage(ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes) {
for (AbstractTextContainer textBlock : page.getTextBlocks()) {
if (textBlock instanceof TextBlock) {
classifyBlock((TextBlock) textBlock, page, document, headlineFontSizes);
for (AbstractPageBlock textBlock : page.getTextBlocks()) {
if (textBlock instanceof TextPageBlock) {
classifyBlock((TextPageBlock) textBlock, page, document, headlineFontSizes);
}
}
}
public void classifyBlock(TextBlock textBlock, Page page, Document document, List<Float> headlineFontSizes) {
public void classifyBlock(TextPageBlock textBlock, ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes) {
var bodyTextFrame = page.getBodyTextFrame();
if (document.getFontSizeCounter().getMostPopular() == null) {
textBlock.setClassification("Other");
textBlock.setClassification(PageBlockType.OTHER);
return;
}
if (PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) {
textBlock.setClassification("Header");
textBlock.setClassification(PageBlockType.HEADER);
} else if (PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) {
textBlock.setClassification("Footer");
textBlock.setClassification(PageBlockType.FOOTER);
} else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock,
document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks()
.size() == 1)) {
if (!Pattern.matches("[0-9]+", textBlock.toString())) {
textBlock.setClassification("Title");
textBlock.setClassification(PageBlockType.TITLE);
}
} else if (textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter()
.getMostPopular() && PositionUtils.getApproxLineCount(textBlock) < 4.9 && (textBlock.getMostPopularWordStyle().equals("bold") || !document.getFontStyleCounter()
@ -80,36 +81,34 @@ public class ClassificationService {
for (int i = 1; i <= headlineFontSizes.size(); i++) {
if (textBlock.getMostPopularWordFontSize() == headlineFontSizes.get(i - 1)) {
textBlock.setClassification("H " + i);
textBlock.setClassification(PageBlockType.getHeadlineType(i));
document.setHeadlines(true);
}
}
} else if (!textBlock.getText().startsWith("Table ") && !textBlock.getText().startsWith("Figure ") && PositionUtils.isWithinBodyTextFrame(bodyTextFrame,
textBlock) && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter()
.getMostPopular()
.equals("bold") && PositionUtils.getApproxLineCount(textBlock) < 2.9 && textBlock.getSequences()
} else if (!textBlock.getText().startsWith("Figure ") && PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordStyle()
.equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold") && PositionUtils.getApproxLineCount(textBlock) < 2.9 && textBlock.getSequences()
.get(0)
.getTextPositions()
.get(0)
.getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) {
textBlock.setClassification("H " + (headlineFontSizes.size() + 1));
textBlock.setClassification(PageBlockType.getHeadlineType(headlineFontSizes.size() + 1));
document.setHeadlines(true);
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
.getMostPopular() && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold")) {
textBlock.setClassification("TextBlock Bold");
textBlock.setClassification(PageBlockType.PARAGRAPH_BOLD);
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFont()
.equals(document.getFontCounter().getMostPopular()) && textBlock.getMostPopularWordStyle()
.equals(document.getFontStyleCounter().getMostPopular()) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()) {
textBlock.setClassification("TextBlock");
textBlock.setClassification(PageBlockType.PARAGRAPH);
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
.getMostPopular() && textBlock.getMostPopularWordStyle().equals("italic") && !document.getFontStyleCounter()
.getMostPopular()
.equals("italic") && PositionUtils.getApproxLineCount(textBlock) < 2.9) {
textBlock.setClassification("TextBlock Italic");
textBlock.setClassification(PageBlockType.PARAGRAPH_ITALIC);
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)) {
textBlock.setClassification("TextBlock Unknown");
textBlock.setClassification(PageBlockType.PARAGRAPH_UNKNOWN);
} else {
textBlock.setClassification("Other");
textBlock.setClassification(PageBlockType.OTHER);
}
}

View File

@ -1,4 +1,4 @@
package com.iqser.red.service.redaction.v1.server.segmentation;
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.service;
import java.io.File;
import java.io.FileOutputStream;
@ -16,21 +16,19 @@ import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.springframework.stereotype.Service;
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.classification.service.BlockificationService;
import com.iqser.red.service.redaction.v1.server.classification.service.ClassificationService;
import com.iqser.red.service.redaction.v1.server.parsing.PDFLinesTextStripper;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfTableCell;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.ImageServiceResponseAdapter;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.TableServiceResponseAdapter;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.table.PdfTableCell;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.AbstractPageBlock;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationDocument;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationPage;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.image.ClassifiedImage;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table.CleanRulings;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPageBlock;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPositionSequence;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.parsing.PDFLinesTextStripper;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.utils.FileUtils;
import com.iqser.red.service.redaction.v1.server.settings.RedactionServiceSettings;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.CleanRulings;
import com.iqser.red.service.redaction.v1.server.tableextraction.service.RulingCleaningService;
import com.iqser.red.service.redaction.v1.server.tableextraction.service.TableExtractionService;
import com.iqser.red.service.redaction.v1.server.tableextraction.utils.FileUtils;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
@ -46,18 +44,21 @@ public class PdfSegmentationService {
private final BlockificationService blockificationService;
private final ClassificationService classificationService;
private final SectionsBuilderService sectionsBuilderService;
private final ImageService imageService;
private final TableService tableService;
private final ImageServiceResponseAdapter imageServiceResponseAdapter;
private final TableServiceResponseAdapter tableServiceResponseAdapter;
public Document parseDocument(String dossierId, String fileId, InputStream documentInputStream, Map<Integer, List<PdfImage>> pdfImages) throws IOException {
public ClassificationDocument parseDocument(String dossierId,
String fileId,
InputStream documentInputStream,
Map<Integer, List<ClassifiedImage>> pdfImages) throws IOException {
PDDocument pdDocument = null;
File tempFile = null;
try {
Map<Integer, List<PdfTableCell>> pdfTableCells = new HashMap<>();
if (redactionServiceSettings.isCvTableParsingEnabled()) {
pdfTableCells = tableService.convertTables(dossierId, fileId);
pdfTableCells = tableServiceResponseAdapter.convertTables(dossierId, fileId);
}
tempFile = FileUtils.createTempFile("document", ".pdf");
@ -65,8 +66,8 @@ public class PdfSegmentationService {
IOUtils.copy(documentInputStream, fos);
// initialize required variables
Document document = new Document();
List<Page> pages = new ArrayList<>();
ClassificationDocument document = new ClassificationDocument();
List<ClassificationPage> pages = new ArrayList<>();
pdDocument = PDDocument.load(tempFile, MemoryUsageSetting.setupMixed(67108864L));
pdDocument.setAllSecurityToBeRemoved(true);
@ -94,12 +95,12 @@ public class PdfSegmentationService {
}
private void processPage(Map<Integer, List<PdfImage>> pdfImages,
PDDocument pdDocument,
Map<Integer, List<PdfTableCell>> pdfTableCells,
Document document,
List<Page> pages,
int pageNumber) throws IOException {
private void processPage(Map<Integer, List<ClassifiedImage>> pdfImages,
PDDocument pdDocument,
Map<Integer, List<PdfTableCell>> pdfTableCells,
ClassificationDocument document,
List<ClassificationPage> pages,
int pageNumber) throws IOException {
PDFLinesTextStripper stripper = new PDFLinesTextStripper();
PDPage pdPage = pdDocument.getPage(pageNumber - 1);
@ -119,7 +120,7 @@ public class PdfSegmentationService {
stripper.getRulings(),
stripper.getMinCharWidth(),
stripper.getMaxCharHeight());
Page page = blockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
ClassificationPage page = blockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
page.setRotation(rotation);
page.setLandscape(isLandscape);
@ -130,7 +131,7 @@ public class PdfSegmentationService {
// Whether an image is OCR needs to be calculated before textBlocks are moved into tables; otherwise the findOcr algorithm needs to be adapted.
if (pdfImages != null && pdfImages.containsKey(pageNumber)) {
page.setImages(pdfImages.get(pageNumber));
imageService.findOcr(page);
imageServiceResponseAdapter.findOcr(page);
}
tableExtractionService.extractTables(cleanRulings, page);
@ -141,7 +142,7 @@ public class PdfSegmentationService {
}
private void increaseDocumentStatistics(Page page, Document document) {
private void increaseDocumentStatistics(ClassificationPage page, ClassificationDocument document) {
if (!page.isLandscape()) {
document.getFontSizeCounter().addAll(page.getFontSizeCounter().getCountPerValue());
@ -152,15 +153,15 @@ public class PdfSegmentationService {
}
private void buildPageStatistics(Page page) {
private void buildPageStatistics(ClassificationPage page) {
// Collect all statistics for the page, except from blocks inside tables, as tables will always be added to BodyTextFrame.
for (AbstractTextContainer textBlock : page.getTextBlocks()) {
if (textBlock instanceof TextBlock) {
if (((TextBlock) textBlock).getSequences() == null) {
for (AbstractPageBlock textBlock : page.getTextBlocks()) {
if (textBlock instanceof TextPageBlock) {
if (((TextPageBlock) textBlock).getSequences() == null) {
continue;
}
for (TextPositionSequence word : ((TextBlock) textBlock).getSequences()) {
for (TextPositionSequence word : ((TextPageBlock) textBlock).getSequences()) {
page.getTextHeightCounter().add(word.getTextHeight());
page.getFontCounter().add(word.getFont());
page.getFontSizeCounter().add(word.getFontSize());

View File

@ -1,4 +1,4 @@
package com.iqser.red.service.redaction.v1.server.tableextraction.service;
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.service;
import java.awt.geom.Line2D;
import java.awt.geom.Point2D;
@ -12,11 +12,11 @@ import java.util.Map;
import org.springframework.stereotype.Service;
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfTableCell;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.table.PdfTableCell;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table.CleanRulings;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table.Ruling;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.utils.Utils;
import com.iqser.red.service.redaction.v1.server.settings.RedactionServiceSettings;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.CleanRulings;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Ruling;
import com.iqser.red.service.redaction.v1.server.tableextraction.utils.Utils;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;

View File

@ -1,9 +1,8 @@
package com.iqser.red.service.redaction.v1.server.segmentation;
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.service;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
@ -11,17 +10,18 @@ import java.util.stream.Collectors;
import org.apache.commons.collections4.CollectionUtils;
import org.springframework.stereotype.Service;
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
import com.iqser.red.service.redaction.v1.server.classification.model.Footer;
import com.iqser.red.service.redaction.v1.server.classification.model.Header;
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
import com.iqser.red.service.redaction.v1.server.classification.model.Paragraph;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.classification.model.UnclassifiedText;
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.AbstractPageBlock;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationDocument;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationFooter;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationHeader;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationPage;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationSection;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.PageBlockType;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.image.ClassifiedImage;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table.Cell;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table.TablePageBlock;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPageBlock;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.UnclassifiedText;
import lombok.extern.slf4j.Slf4j;
@ -29,23 +29,23 @@ import lombok.extern.slf4j.Slf4j;
@Service
public class SectionsBuilderService {
public void buildSections(Document document) {
public void buildSections(ClassificationDocument document) {
List<AbstractTextContainer> chunkWords = new ArrayList<>();
List<Paragraph> chunkBlockList = new ArrayList<>();
List<Header> headers = new ArrayList<>();
List<Footer> footers = new ArrayList<>();
List<AbstractPageBlock> chunkWords = new ArrayList<>();
List<ClassificationSection> chunkBlockList = new ArrayList<>();
List<ClassificationHeader> headers = new ArrayList<>();
List<ClassificationFooter> footers = new ArrayList<>();
List<UnclassifiedText> unclassifiedTexts = new ArrayList<>();
AbstractTextContainer prev = null;
AbstractPageBlock prev = null;
String lastHeadline = "";
Table previousTable = null;
for (Page page : document.getPages()) {
List<TextBlock> header = new ArrayList<>();
List<TextBlock> footer = new ArrayList<>();
List<TextBlock> unclassifiedText = new ArrayList<>();
for (AbstractTextContainer current : page.getTextBlocks()) {
TablePageBlock previousTable = null;
for (ClassificationPage page : document.getPages()) {
List<TextPageBlock> header = new ArrayList<>();
List<TextPageBlock> footer = new ArrayList<>();
List<TextPageBlock> unclassifiedText = new ArrayList<>();
for (AbstractPageBlock current : page.getTextBlocks()) {
if (current.getClassification() == null) {
continue;
@ -53,23 +53,23 @@ public class SectionsBuilderService {
current.setPage(page.getPageNumber());
if (current.getClassification().equals("Header")) {
header.add((TextBlock) current);
if (current.getClassification().equals(PageBlockType.HEADER)) {
header.add((TextPageBlock) current);
continue;
}
if (current.getClassification().equals("Footer")) {
footer.add((TextBlock) current);
if (current.getClassification().equals(PageBlockType.FOOTER)) {
footer.add((TextPageBlock) current);
continue;
}
if (current.getClassification().equals("Other")) {
unclassifiedText.add((TextBlock) current);
if (current.getClassification().equals(PageBlockType.OTHER)) {
unclassifiedText.add((TextPageBlock) current);
continue;
}
if (prev != null && current.getClassification().startsWith("H ") && !prev.getClassification().startsWith("H ") || !document.isHeadlines()) {
Paragraph chunkBlock = buildTextBlock(chunkWords, lastHeadline);
if (prev != null && current.getClassification().isHeadline() && !prev.getClassification().isHeadline() || !document.isHeadlines()) {
ClassificationSection chunkBlock = buildTextBlock(chunkWords, lastHeadline);
chunkBlock.setHeadline(lastHeadline);
if (document.isHeadlines()) {
lastHeadline = current.getText();
@ -80,8 +80,7 @@ public class SectionsBuilderService {
previousTable = chunkBlock.getTables().get(chunkBlock.getTables().size() - 1);
}
}
if (current instanceof Table) {
Table table = (Table) current;
if (current instanceof TablePageBlock table) {
// Distribute header information for subsequent tables
mergeTableMetadata(table, previousTable);
previousTable = table;
@ -91,69 +90,72 @@ public class SectionsBuilderService {
}
if (!header.isEmpty()) {
headers.add(new Header(header));
headers.add(new ClassificationHeader(header));
}
if (!footer.isEmpty()) {
footers.add(new Footer(footer));
footers.add(new ClassificationFooter(footer));
}
if (!unclassifiedText.isEmpty()) {
unclassifiedTexts.add(new UnclassifiedText(unclassifiedText));
}
}
Paragraph chunkBlock = buildTextBlock(chunkWords, lastHeadline);
ClassificationSection chunkBlock = buildTextBlock(chunkWords, lastHeadline);
chunkBlock.setHeadline(lastHeadline);
chunkBlockList.add(chunkBlock);
document.setParagraphs(chunkBlockList);
document.setSections(chunkBlockList);
document.setHeaders(headers);
document.setFooters(footers);
document.setUnclassifiedTexts(unclassifiedTexts);
}
public void addImagesToSections(Document document) {
public void addImagesToSections(ClassificationDocument document) {
Map<Integer, List<Paragraph>> paragraphMap = new HashMap<>();
for (Paragraph paragraph : document.getParagraphs()) {
for (AbstractTextContainer container : paragraph.getPageBlocks()) {
paragraphMap.computeIfAbsent(container.getPage(), c -> new ArrayList<>()).add(paragraph);
Map<Integer, List<ClassificationSection>> sectionMap = new HashMap<>();
for (ClassificationSection section : document.getSections()) {
for (AbstractPageBlock container : section.getPageBlocks()) {
List<ClassificationSection> sectionsOnPage = sectionMap.computeIfAbsent(container.getPage(), c -> new ArrayList<>());
if (sectionsOnPage.contains(section)) {
continue;
}
sectionsOnPage.add(section);
}
}
if (paragraphMap.isEmpty()) {
Paragraph paragraph = new Paragraph();
document.getParagraphs().add(paragraph);
paragraphMap.computeIfAbsent(1, x -> new ArrayList<>()).add(paragraph);
if (sectionMap.isEmpty()) {
ClassificationSection section = new ClassificationSection();
document.getSections().add(section);
sectionMap.computeIfAbsent(1, x -> new ArrayList<>()).add(section);
}
// The first page must always have a paragraph; otherwise we can't process pages 1..N,
// where N is the first page found with a paragraph
if (paragraphMap.get(1) == null) {
Paragraph paragraph = new Paragraph();
document.getParagraphs().add(paragraph);
paragraphMap.computeIfAbsent(1, x -> new ArrayList<>()).add(paragraph);
if (sectionMap.get(1) == null) {
ClassificationSection section = new ClassificationSection();
document.getSections().add(section);
sectionMap.computeIfAbsent(1, x -> new ArrayList<>()).add(section);
}
for (Page page : document.getPages()) {
for (PdfImage image : page.getImages()) {
List<Paragraph> paragraphsOnPage = paragraphMap.get(page.getPageNumber());
if (paragraphsOnPage == null) {
for (ClassificationPage page : document.getPages()) {
for (ClassifiedImage image : page.getImages()) {
List<ClassificationSection> sectionsOnPage = sectionMap.get(page.getPageNumber());
if (sectionsOnPage == null) {
int i = page.getPageNumber();
while (paragraphsOnPage == null) {
paragraphsOnPage = paragraphMap.get(i);
while (sectionsOnPage == null) {
sectionsOnPage = sectionMap.get(i);
i--;
}
}
for (Paragraph paragraph : paragraphsOnPage) {
for (ClassificationSection section : sectionsOnPage) {
Float xMin = null;
Float yMin = null;
Float xMax = null;
Float yMax = null;
for (AbstractTextContainer abs : paragraph.getPageBlocks()) {
for (AbstractPageBlock abs : section.getPageBlocks()) {
if (abs.getPage() != page.getPageNumber()) {
continue;
}
@ -197,21 +199,21 @@ public class SectionsBuilderService {
if (xMin != null && xMax != null && yMin != null && yMax != null && image.getPosition().getX() >= xMin && image.getPosition()
.getX() <= xMax && image.getPosition().getY() >= yMin && image.getPosition().getY() <= yMax) {
paragraph.getImages().add(image);
image.setAppendedToParagraph(true);
section.getImages().add(image);
image.setAppendedToSection(true);
}
}
if (!image.isAppendedToParagraph()) {
if (!image.isAppendedToSection()) {
log.debug("Image uses first paragraph");
paragraphsOnPage.get(0).getImages().add(image);
image.setAppendedToParagraph(true);
sectionsOnPage.get(0).getImages().add(image);
image.setAppendedToSection(true);
}
}
}
}
private void mergeTableMetadata(Table currentTable, Table previousTable) {
private void mergeTableMetadata(TablePageBlock currentTable, TablePageBlock previousTable) {
// Distribute header information for subsequent tables
if (previousTable != null && hasInvalidHeaderInformation(currentTable) && hasValidHeaderInformation(previousTable)) {
@ -239,86 +241,44 @@ public class SectionsBuilderService {
}
private Paragraph buildTextBlock(List<AbstractTextContainer> wordBlockList, String lastHeadline) {
private ClassificationSection buildTextBlock(List<AbstractPageBlock> wordBlockList, String lastHeadline) {
Paragraph paragraph = new Paragraph();
TextBlock textBlock = null;
ClassificationSection section = new ClassificationSection();
int pageBefore = -1;
boolean splitByTable = false;
for (AbstractPageBlock container : wordBlockList) {
if (container instanceof TablePageBlock table) {
Iterator<AbstractTextContainer> itty = wordBlockList.iterator();
boolean alreadyAdded = false;
AbstractTextContainer previous = null;
while (itty.hasNext()) {
AbstractTextContainer container = itty.next();
if (container instanceof Table) {
Table table = (Table) container;
splitByTable = true;
if (previous != null && previous.getText().startsWith("Table ")) {
table.setHeadline(previous.getText());
if (lastHeadline == null || lastHeadline.isEmpty()) {
table.setHeadline("Text in table");
} else {
if (lastHeadline == null || lastHeadline.isEmpty()) {
table.setHeadline("Text in table");
} else {
table.setHeadline("Table in: " + lastHeadline);
}
table.setHeadline("TablePageBlock in: " + lastHeadline);
}
if (textBlock != null && !alreadyAdded) {
paragraph.getPageBlocks().add(textBlock);
alreadyAdded = true;
}
paragraph.getPageBlocks().add(table);
section.getPageBlocks().add(table);
continue;
}
TextBlock wordBlock = (TextBlock) container;
if (textBlock == null) {
textBlock = new TextBlock(wordBlock.getMinX(), wordBlock.getMaxX(), wordBlock.getMinY(), wordBlock.getMaxY(), wordBlock.getSequences(), wordBlock.getRotation());
textBlock.setPage(wordBlock.getPage());
} else if (splitByTable) {
textBlock = new TextBlock(wordBlock.getMinX(), wordBlock.getMaxX(), wordBlock.getMinY(), wordBlock.getMaxY(), wordBlock.getSequences(), wordBlock.getRotation());
textBlock.setPage(wordBlock.getPage());
alreadyAdded = false;
} else if (pageBefore != -1 && wordBlock.getPage() != pageBefore) {
textBlock.setPage(pageBefore);
paragraph.getPageBlocks().add(textBlock);
textBlock = new TextBlock(wordBlock.getMinX(), wordBlock.getMaxX(), wordBlock.getMinY(), wordBlock.getMaxY(), wordBlock.getSequences(), wordBlock.getRotation());
textBlock.setPage(wordBlock.getPage());
} else {
TextBlock spatialEntity = textBlock.union(wordBlock);
textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight());
}
pageBefore = wordBlock.getPage();
splitByTable = false;
previous = container;
TextPageBlock wordBlock = (TextPageBlock) container;
section.getPageBlocks().add(wordBlock);
}
if (textBlock != null && !alreadyAdded) {
paragraph.getPageBlocks().add(textBlock);
}
return paragraph;
return section;
}
private boolean hasValidHeaderInformation(Table table) {
private boolean hasValidHeaderInformation(TablePageBlock table) {
return !hasInvalidHeaderInformation(table);
}
private boolean hasInvalidHeaderInformation(Table table) {
private boolean hasInvalidHeaderInformation(TablePageBlock table) {
return table.getRows().stream().flatMap(row -> row.stream().filter(cell -> CollectionUtils.isNotEmpty(cell.getHeaderCells()))).findAny().isEmpty();
}
private List<Cell> getRowWithNonHeaderCells(Table table) {
private List<Cell> getRowWithNonHeaderCells(TablePageBlock table) {
for (int i = table.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
List<Cell> row = table.getRows().get(i);

View File

@ -1,4 +1,4 @@
package com.iqser.red.service.redaction.v1.server.tableextraction.service;
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.service;
import java.awt.geom.Point2D;
import java.util.ArrayList;
@ -13,15 +13,15 @@ import java.util.stream.Collectors;
import org.springframework.stereotype.Service;
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.CleanRulings;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Rectangle;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Ruling;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
import com.iqser.red.service.redaction.v1.server.tableextraction.utils.Utils;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.AbstractPageBlock;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationPage;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table.Cell;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table.CleanRulings;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table.Rectangle;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table.Ruling;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table.TablePageBlock;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPageBlock;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.utils.Utils;
@Service
public class TableExtractionService {
@ -73,20 +73,20 @@ public class TableExtractionService {
* 90 -> UpperLeft
* 180 -> UpperRight
* 270 -> LowerRight
*
* <p>
* DirAdj (Text direction adjusted) values can not be used here.
*
* @param cleanRulings The lines used to build the table.
* @param page Page object that contains textblocks and statistics.
* @param page Page object that contains textblocks and statistics.
*/
public void extractTables(CleanRulings cleanRulings, Page page) {
public void extractTables(CleanRulings cleanRulings, ClassificationPage page) {
List<Cell> cells = findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical());
List<TextBlock> toBeRemoved = new ArrayList<>();
List<TextPageBlock> toBeRemoved = new ArrayList<>();
for (AbstractTextContainer abstractTextContainer : page.getTextBlocks()) {
TextBlock textBlock = (TextBlock) abstractTextContainer;
for (AbstractPageBlock abstractPageBlock : page.getTextBlocks()) {
TextPageBlock textBlock = (TextPageBlock) abstractPageBlock;
for (Cell cell : cells) {
if (cell.hasMinimumSize() && cell.intersects(textBlock.getPdfMinX(),
textBlock.getPdfMinY(),
@ -104,7 +104,7 @@ public class TableExtractionService {
List<Rectangle> spreadsheetAreas = findSpreadsheetsFromCells(cells).stream().filter(r -> r.getWidth() > 0f && r.getHeight() > 0f).collect(Collectors.toList());
List<Table> tables = new ArrayList<>();
List<TablePageBlock> tables = new ArrayList<>();
for (Rectangle area : spreadsheetAreas) {
List<Cell> overlappingCells = new ArrayList<>();
@ -113,16 +113,16 @@ public class TableExtractionService {
overlappingCells.add(c);
}
}
tables.add(new Table(overlappingCells, area, page.getRotation()));
tables.add(new TablePageBlock(overlappingCells, area, page.getRotation()));
}
for (Table table : tables) {
for (TablePageBlock table : tables) {
int position = -1;
Iterator<AbstractTextContainer> itty = page.getTextBlocks().iterator();
Iterator<AbstractPageBlock> itty = page.getTextBlocks().iterator();
while (itty.hasNext()) {
AbstractTextContainer textBlock = itty.next();
if (textBlock instanceof TextBlock ? table.containsBlock((TextBlock) textBlock) : table.contains(textBlock) && position == -1) {
AbstractPageBlock textBlock = itty.next();
if (textBlock instanceof TextPageBlock ? table.containsBlock((TextPageBlock) textBlock) : table.contains(textBlock) && position == -1) {
position = page.getTextBlocks().indexOf(textBlock);
}
}

View File

@ -9,7 +9,7 @@
* This program is free software under the LGPL (>=v2.1)
* Read the file LICENSE.txt coming with the sources for details.
*/
package com.iqser.red.service.redaction.v1.server.tableextraction.utils;
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.utils;
import java.awt.geom.Line2D;
import java.awt.geom.Rectangle2D;

View File

@ -1,4 +1,4 @@
package com.iqser.red.service.redaction.v1.server.tableextraction.utils;
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.utils;
import java.io.File;
import java.io.IOException;
@ -23,9 +23,11 @@ public class FileUtils {
/**
* Deletes a file; logs a message with the reason if the deletion fails.
* This method is null-safe.
*
* @param file The file to delete. Can be null.
*/
public void deleteFile(File file) {
if (file != null) {
try {
Files.deleteIfExists(file.toPath());

View File

@ -1,7 +1,7 @@
package com.iqser.red.service.redaction.v1.server.classification.utils;
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.utils;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPageBlock;
import lombok.experimental.UtilityClass;
@ -11,7 +11,7 @@ public final class PositionUtils {
// TODO This currently uses the PDF coordinate system. In the future this should use the Java coordinate system.
// Note: DirAdj (TextDirection Adjusted) cannot be used for this.
public boolean isWithinBodyTextFrame(Rectangle btf, TextBlock textBlock) {
public boolean isWithinBodyTextFrame(Rectangle btf, TextPageBlock textBlock) {
if (btf == null || textBlock == null) {
return false;
@ -32,7 +32,7 @@ public final class PositionUtils {
// TODO This currently uses the PDF coordinate system. In the future this should use the Java coordinate system.
// Note: DirAdj (TextDirection Adjusted) cannot be used for this.
public boolean isOverBodyTextFrame(Rectangle btf, TextBlock textBlock, int rotation) {
public boolean isOverBodyTextFrame(Rectangle btf, TextPageBlock textBlock, int rotation) {
if (btf == null || textBlock == null) {
return false;
@ -58,9 +58,10 @@ public final class PositionUtils {
}
// TODO This currently uses the PDF coordinate system. In the future this should use the Java coordinate system.
// Note: DirAdj (TextDirection Adjusted) cannot be used for this.
public boolean isUnderBodyTextFrame(Rectangle btf, TextBlock textBlock, int rotation) {
public boolean isUnderBodyTextFrame(Rectangle btf, TextPageBlock textBlock, int rotation) {
if (btf == null || textBlock == null) {
return false;
@ -86,9 +87,10 @@ public final class PositionUtils {
}
// TODO This currently uses the PDF coordinate system. In the future this should use the Java coordinate system.
// Note: DirAdj (TextDirection Adjusted) cannot be used for this.
public boolean isTouchingUnderBodyTextFrame(Rectangle btf, TextBlock textBlock) {
public boolean isTouchingUnderBodyTextFrame(Rectangle btf, TextPageBlock textBlock) {
//TODO Currently this is not working for rotated pages.
@ -105,13 +107,13 @@ public final class PositionUtils {
}
public float getHeightDifferenceBetweenChunkWordAndDocumentWord(TextBlock textBlock, Float documentMostPopularWordHeight) {
public float getHeightDifferenceBetweenChunkWordAndDocumentWord(TextPageBlock textBlock, Float documentMostPopularWordHeight) {
return textBlock.getMostPopularWordHeight() - documentMostPopularWordHeight;
}
public Float getApproxLineCount(TextBlock textBlock) {
public Float getApproxLineCount(TextPageBlock textBlock) {
return textBlock.getHeight() / textBlock.getMostPopularWordHeight();
}

View File

@ -1,4 +1,4 @@
package com.iqser.red.service.redaction.v1.server.tableextraction.utils;
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.utils;
import java.util.ArrayDeque;
import java.util.Comparator;

View File

@ -1,9 +1,9 @@
package com.iqser.red.service.redaction.v1.server.classification.utils;
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.utils;
import java.awt.geom.Line2D;
import java.awt.geom.Point2D;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Ruling;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table.Ruling;
import lombok.experimental.UtilityClass;
@ -13,7 +13,7 @@ public final class RulingTextDirAdjustUtil {
/**
* Converts a ruling (line of a table) the same way TextPositions are converted in PDFBox.
* This will get the y position of the text, adjusted so that 0,0 is upper left and it is adjusted based on the text direction.
*
* <p>
* See org.apache.pdfbox.text.TextPosition
*/
public Line2D.Float convertToDirAdj(Ruling ruling, float dir, float pageWidth, float pageHeight) {

View File

@ -1,4 +1,4 @@
package com.iqser.red.service.redaction.v1.server.tableextraction.utils;
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.utils;
import java.math.BigDecimal;
import java.util.Comparator;

View File

@ -0,0 +1,50 @@
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.data;
import java.awt.geom.Rectangle2D;
import java.util.List;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.textblock.AtomicTextBlock;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.experimental.FieldDefaults;
/**
 * Flat, array-based snapshot of the position information of one {@link AtomicTextBlock}.
 * Each row of {@code positions} holds {@code [minX, minY, width, height]} of one rectangle.
 */
@Data
@Builder
@AllArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class AtomicPositionBlockData {

    Long id;
    int[] stringIdxToPositionIdx;
    float[][] positions;


    /**
     * Copies id, positions and the string-index-to-position-index mapping of the given block
     * into primitive arrays.
     *
     * @param atomicTextBlock the block to snapshot
     * @return the array-based representation of the block's positions
     */
    public static AtomicPositionBlockData fromAtomicTextBlock(AtomicTextBlock atomicTextBlock) {

        Long blockId = atomicTextBlock.getId();
        float[][] rectangles = toPrimitiveFloatMatrix(atomicTextBlock.getPositions());
        int[] indexMapping = atomicTextBlock.getStringIdxToPositionIdx().stream().mapToInt(Integer::intValue).toArray();

        return AtomicPositionBlockData.builder().id(blockId).positions(rectangles).stringIdxToPositionIdx(indexMapping).build();
    }


    /** Converts every rectangle into a four-element row, keeping the list order. */
    private static float[][] toPrimitiveFloatMatrix(List<Rectangle2D> positions) {

        return positions.stream().map(AtomicPositionBlockData::toArray).toArray(float[][]::new);
    }


    /** Narrows a rectangle to {@code [minX, minY, width, height]} in single precision. */
    private static float[] toArray(Rectangle2D positions) {

        float minX = (float) positions.getMinX();
        float minY = (float) positions.getMinY();
        float width = (float) positions.getWidth();
        float height = (float) positions.getHeight();
        return new float[]{minX, minY, width, height};
    }

}

View File

@ -0,0 +1,39 @@
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.data;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.textblock.AtomicTextBlock;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.experimental.FieldDefaults;
/**
 * Flat snapshot of the textual information of one {@link AtomicTextBlock}:
 * its id, page number, search text, number on the page, boundary and line breaks.
 */
@Data
@Builder
@AllArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class AtomicTextBlockData {

    Long id;
    Long page;
    String searchText;
    int numberOnPage;
    int start;
    int end;
    int[] lineBreaks;


    /**
     * Copies the textual attributes of the given block into a data object.
     *
     * @param atomicTextBlock the block to snapshot
     * @return the data representation of the block
     */
    public static AtomicTextBlockData fromAtomicTextBlock(AtomicTextBlock atomicTextBlock) {

        Long blockId = atomicTextBlock.getId();
        Long pageNumber = atomicTextBlock.getPage().getNumber().longValue();
        String text = atomicTextBlock.getSearchText();
        int positionOnPage = atomicTextBlock.getNumberOnPage();
        int boundaryStart = atomicTextBlock.getBoundary().start();
        int boundaryEnd = atomicTextBlock.getBoundary().end();
        int[] lineBreakOffsets = atomicTextBlock.getLineBreaks().stream().mapToInt(Integer::intValue).toArray();

        return AtomicTextBlockData.builder()
                .id(blockId)
                .page(pageNumber)
                .searchText(text)
                .numberOnPage(positionOnPage)
                .start(boundaryStart)
                .end(boundaryEnd)
                .lineBreaks(lineBreakOffsets)
                .build();
    }

}

View File

@ -0,0 +1,43 @@
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.data;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Document;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.experimental.FieldDefaults;
/**
 * Flat representation of a {@link Document}: its pages, the text and position data of all
 * atomic text blocks, and the document tree.
 */
@Data
@Builder
@AllArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class DocumentData {

    PageData[] pages;
    AtomicTextBlockData[] atomicTextBlocks;
    AtomicPositionBlockData[] atomicPositionBlocks;
    DocumentTreeData documentTreeData;


    /**
     * Builds the flat representation of the given document.
     *
     * @param document the document graph to convert
     * @return the data object holding pages, atomic block data and tree data
     */
    public static DocumentData fromDocument(Document document) {

        // Collect the distinct atomic text blocks once, so the text array and the position array
        // are derived from the very same sequence and are guaranteed to be index-aligned.
        var distinctAtomicBlocks = document.streamTerminalTextBlocksInOrder()
                .flatMap(textBlock -> textBlock.getAtomicTextBlocks().stream())
                .distinct()
                .toList();

        var atomicPositionBlocks = distinctAtomicBlocks.stream()
                .map(AtomicPositionBlockData::fromAtomicTextBlock)
                .toArray(AtomicPositionBlockData[]::new);

        var atomicTextBlocks = distinctAtomicBlocks.stream()
                .map(AtomicTextBlockData::fromAtomicTextBlock)
                .toArray(AtomicTextBlockData[]::new);

        var pages = document.getPages().stream().map(PageData::fromPage).toArray(PageData[]::new);

        var documentTreeData = new DocumentTreeData(DocumentTreeData.EntryData.fromEntry(document.getDocumentTree().getRoot()));

        return new DocumentData(pages, atomicTextBlocks, atomicPositionBlocks, documentTreeData);
    }

}

View File

@ -0,0 +1,128 @@
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.data;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Stream;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.data.mapper.PropertiesMapper;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.DocumentTree;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Image;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.NodeType;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Page;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Table;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.TableCell;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.textblock.AtomicTextBlock;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.Getter;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
/**
 * Flat representation of a document tree: a hierarchy of {@link EntryData} starting at {@code root}.
 */
@Data
@Builder
@AllArgsConstructor
@NoArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class DocumentTreeData {

    EntryData root;


    /**
     * Resolves an entry by following the given child indices from the root.
     *
     * @param tocId child index per tree level; an empty list addresses the root
     * @return the entry found at that path
     */
    public EntryData get(List<Integer> tocId) {

        EntryData entry = root;
        for (int id : tocId) {
            entry = entry.children.get(id);
        }
        return entry;
    }


    /**
     * Streams the root and all of its descendants exactly once, parents before their children.
     *
     * @return a depth-first stream of all entries
     */
    public Stream<EntryData> streamAllEntries() {

        // flatten(root) already covers the root's children; concatenating them again
        // would emit every non-root entry twice.
        return flatten(root);
    }


    /** Renders one line per entry, in the order of {@link #streamAllEntries()}. */
    @Override
    public String toString() {

        return String.join("\n", streamAllEntries().map(EntryData::toString).toList());
    }


    /** Returns the entry followed by all of its descendants, depth-first. */
    private static Stream<EntryData> flatten(EntryData entry) {

        return Stream.concat(Stream.of(entry), entry.children.stream().flatMap(DocumentTreeData::flatten));
    }


    /**
     * Flat representation of a single tree entry: its node type, tree id, referenced atomic
     * text block ids, page numbers, type-specific properties and child entries.
     */
    @Builder
    @Getter
    @AllArgsConstructor
    @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
    public static class EntryData {

        NodeType type;
        int[] treeId;
        Long[] atomicBlockIds;
        Long[] pageNumbers;
        Map<String, String> properties;
        List<EntryData> children;


        /**
         * Recursively converts a tree entry and all of its children.
         * Tables, table cells and images carry additional properties; all other types get an empty map.
         *
         * @param entry the tree entry to convert
         * @return the data representation of the entry including its subtree
         */
        public static EntryData fromEntry(DocumentTree.Entry entry) {

            Long[] atomicBlockIds = toAtomicTextBlockIds(entry);

            Map<String, String> properties = switch (entry.getType()) {
                case TABLE -> PropertiesMapper.buildTableProperties((Table) entry.getNode());
                case TABLE_CELL -> PropertiesMapper.buildTableCellProperties((TableCell) entry.getNode());
                case IMAGE -> PropertiesMapper.buildImageProperties((Image) entry.getNode());
                default -> new HashMap<>();
            };

            var treeId = entry.getTreeId().stream().mapToInt(Integer::intValue).toArray();
            var pageNumbers = entry.getNode().getPages().stream().map(Page::getNumber).map(Integer::longValue).toArray(Long[]::new);
            var subEntries = entry.getChildren().stream().map(EntryData::fromEntry).toList();

            return new EntryData(entry.getType(), treeId, atomicBlockIds, pageNumbers, properties, subEntries);
        }


        /** Returns the atomic text block ids of a leaf node, or an empty array for inner nodes. */
        private static Long[] toAtomicTextBlockIds(DocumentTree.Entry entry) {

            if (entry.getNode().isLeaf()) {
                return entry.getNode().getLeafTextBlock().getAtomicTextBlocks().stream().map(AtomicTextBlock::getId).toArray(Long[]::new);
            } else {
                return new Long[]{};
            }
        }


        /**
         * Formats the entry as {@code [i,j,k]: TYPE atbs = n}, where {@code n} is the number of
         * referenced atomic text blocks. An empty tree id is rendered as {@code []}.
         */
        @Override
        public String toString() {

            StringBuilder sb = new StringBuilder();
            sb.append("[");
            // Put the separator before every element but the first, so an empty tree id
            // never leads to removing the opening bracket.
            for (int i = 0; i < treeId.length; i++) {
                if (i > 0) {
                    sb.append(",");
                }
                sb.append(treeId[i]);
            }
            sb.append("]: ");
            sb.append(type);
            sb.append(" atbs = ");
            sb.append(atomicBlockIds.length);
            return sb.toString();
        }

    }

}

View File

@ -0,0 +1,28 @@
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.data;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Page;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.experimental.FieldDefaults;
/**
 * Flat snapshot of a {@link Page}: its number, height, width and rotation.
 */
@Data
@Builder
@AllArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class PageData {

    int number;
    int height;
    int width;
    int rotation;


    /**
     * Copies number, height, width and rotation of the given page.
     *
     * @param page the page to snapshot
     * @return the data representation of the page
     */
    public static PageData fromPage(Page page) {

        return PageData.builder()
                .number(page.getNumber())
                .height(page.getHeight())
                .width(page.getWidth())
                .rotation(page.getRotation())
                .build();
    }

}

View File

@ -0,0 +1,198 @@
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.data.mapper;
import java.util.Arrays;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.data.AtomicPositionBlockData;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.data.AtomicTextBlockData;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.data.DocumentData;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.data.DocumentTreeData;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.data.PageData;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.DocumentTree;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Document;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Footer;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Header;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Headline;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Image;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Page;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Paragraph;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Section;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.SemanticNode;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Table;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.TableCell;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.textblock.AtomicTextBlock;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.textblock.TextBlock;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.textblock.TextBlockCollector;
import lombok.experimental.UtilityClass;
/**
 * Rebuilds a {@link Document} graph from its flat {@link DocumentData} representation.
 */
@UtilityClass
public class DocumentGraphMapper {

    /**
     * Converts the given data into a document graph: pages are rebuilt first, then the tree
     * entries below the root are rebuilt recursively and attached to the document tree.
     *
     * @param documentData the flat document representation
     * @return the rebuilt document
     */
    public Document toDocumentGraph(DocumentData documentData) {

        Document document = new Document();
        DocumentTree documentTree = new DocumentTree(document);
        Context context = new Context(documentData, documentTree);

        context.pages.addAll(Arrays.stream(documentData.getPages()).map(DocumentGraphMapper::buildPage).toList());

        context.documentTree.getRoot().getChildren().addAll(buildEntries(documentData.getDocumentTreeData().getRoot().getChildren(), context));

        document.setDocumentTree(context.documentTree);
        document.setPages(new HashSet<>(context.pages));
        document.setNumberOfPages(documentData.getPages().length);
        // NOTE(review): this stores the getter's own result; presumably getTextBlock() derives the
        // text block from the tree and this call caches it — confirm against Document.
        document.setTextBlock(document.getTextBlock());
        return document;
    }


    /**
     * Recursively rebuilds tree entries and their nodes from the given entry data.
     * Header and footer nodes are set on their pages; every other node is appended to the
     * main body of each page it occurs on.
     *
     * @throws UnsupportedOperationException if an entry has a node type that is not handled here
     */
    private List<DocumentTree.Entry> buildEntries(List<DocumentTreeData.EntryData> entries, Context context) {

        List<DocumentTree.Entry> newEntries = new LinkedList<>();
        for (DocumentTreeData.EntryData entryData : entries) {

            List<Page> pages = Arrays.stream(entryData.getPageNumbers()).map(pageNumber -> getPage(pageNumber, context)).toList();

            SemanticNode node = switch (entryData.getType()) {
                case SECTION -> buildSection(context);
                case PARAGRAPH -> buildParagraph(context);
                case HEADLINE -> buildHeadline(context);
                case HEADER -> buildHeader(context);
                case FOOTER -> buildFooter(context);
                case TABLE -> buildTable(context, entryData.getProperties());
                case TABLE_CELL -> buildTableCell(context, entryData.getProperties());
                case IMAGE -> buildImage(context, entryData.getProperties(), entryData.getPageNumbers());
                default -> throw new UnsupportedOperationException("Not yet implemented for type " + entryData.getType());
            };

            // Only entries that reference atomic text blocks get a leaf text block.
            if (entryData.getAtomicBlockIds().length > 0) {
                TextBlock textBlock = toTextBlock(entryData.getAtomicBlockIds(), context, node);
                node.setLeafTextBlock(textBlock);
            }
            List<Integer> treeId = Arrays.stream(entryData.getTreeId()).boxed().toList();
            node.setTreeId(treeId);

            switch (entryData.getType()) {
                case HEADER -> pages.forEach(page -> page.setHeader((Header) node));
                case FOOTER -> pages.forEach(page -> page.setFooter((Footer) node));
                default -> pages.forEach(page -> page.getMainBody().add(node));
            }

            newEntries.add(DocumentTree.Entry.builder().treeId(treeId).children(buildEntries(entryData.getChildren(), context)).node(node).build());
        }
        return newEntries;
    }


    private Headline buildHeadline(Context context) {

        return Headline.builder().documentTree(context.documentTree).build();
    }


    /**
     * Builds an image node from its serialized properties.
     *
     * @throws IllegalArgumentException if the image is not assigned to exactly one page
     */
    private Image buildImage(Context context, Map<String, String> properties, Long[] pageNumbers) {

        // Explicit check instead of `assert`: assertions are disabled by default at runtime,
        // which would let invalid data through or fail later with an unhelpful index error.
        if (pageNumbers.length != 1) {
            throw new IllegalArgumentException(String.format("Image must occur on exactly one page, but got %d page numbers", pageNumbers.length));
        }
        Page page = getPage(pageNumbers[0], context);
        var builder = Image.builder();
        PropertiesMapper.parseImageProperties(properties, builder);
        return builder.documentTree(context.documentTree).page(page).build();
    }


    private TableCell buildTableCell(Context context, Map<String, String> properties) {

        TableCell.TableCellBuilder builder = TableCell.builder();
        PropertiesMapper.parseTableCellProperties(properties, builder);
        return builder.documentTree(context.documentTree).build();
    }


    private Table buildTable(Context context, Map<String, String> properties) {

        Table.TableBuilder builder = Table.builder();
        PropertiesMapper.parseTableProperties(properties, builder);
        return builder.documentTree(context.documentTree).build();
    }


    private Footer buildFooter(Context context) {

        return Footer.builder().documentTree(context.documentTree).build();
    }


    private Header buildHeader(Context context) {

        return Header.builder().documentTree(context.documentTree).build();
    }


    private Section buildSection(Context context) {

        return Section.builder().documentTree(context.documentTree).build();
    }


    private Paragraph buildParagraph(Context context) {

        return Paragraph.builder().documentTree(context.documentTree).build();
    }


    /** Rebuilds the atomic text blocks with the given ids and collects them into one text block. */
    private TextBlock toTextBlock(Long[] atomicTextBlockIds, Context context, SemanticNode parent) {

        return Arrays.stream(atomicTextBlockIds).map(atomicTextBlockId -> getAtomicTextBlock(context, parent, atomicTextBlockId)).collect(new TextBlockCollector());
    }


    /**
     * Rebuilds a single atomic text block from its text data and position data.
     */
    private AtomicTextBlock getAtomicTextBlock(Context context, SemanticNode parent, Long atomicTextBlockId) {

        // NOTE(review): the id is used directly as list index into both data lists — this assumes
        // ids are contiguous, zero-based and match the array order; confirm against the writer side.
        int index = Math.toIntExact(atomicTextBlockId);
        AtomicTextBlockData textData = context.atomicTextBlockData.get(index);
        AtomicPositionBlockData positionData = context.atomicPositionBlockData.get(index);
        return AtomicTextBlock.fromAtomicTextBlockData(textData, positionData, parent, getPage(textData.getPage(), context));
    }


    private Page buildPage(PageData p) {

        return Page.builder().rotation(p.getRotation()).height(p.getHeight()).width(p.getWidth()).number(p.getNumber()).mainBody(new LinkedList<>()).build();
    }


    /**
     * Looks up an already built page by its page number.
     *
     * @throws NoSuchElementException if no page with that number exists in the context
     */
    private Page getPage(Long pageIndex, Context context) {

        return context.pages.stream()
                .filter(page -> page.getNumber() == Math.toIntExact(pageIndex))
                .findFirst()
                .orElseThrow(() -> new NoSuchElementException(String.format("Page with number %d not found", pageIndex)));
    }


    /** Shared state while rebuilding one document: the tree, the built pages and the raw block data. */
    static final class Context {

        private final DocumentTree documentTree;
        private final List<Page> pages;
        private final List<AtomicTextBlockData> atomicTextBlockData;
        private final List<AtomicPositionBlockData> atomicPositionBlockData;


        Context(DocumentData documentData, DocumentTree documentTree) {

            this.documentTree = documentTree;
            this.pages = new LinkedList<>();
            this.atomicTextBlockData = Arrays.stream(documentData.getAtomicTextBlocks()).toList();
            this.atomicPositionBlockData = Arrays.stream(documentData.getAtomicPositionBlocks()).toList();
        }

    }

}

View File

@ -0,0 +1,110 @@
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.data.mapper;
import static java.lang.String.format;
import java.awt.geom.Rectangle2D;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Image;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.ImageType;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Table;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.TableCell;
import lombok.AccessLevel;
import lombok.experimental.FieldDefaults;
import lombok.experimental.UtilityClass;
@UtilityClass
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class PropertiesMapper {
String imageType = "imageType";
private final String transparency = "transparency";
private final String position = "position";
String id = "id";
String row = "row";
String col = "col";
String header = "header";
String bBox = "bBox";
String numberOfRows = "numberOfRows";
String numberOfCols = "numberOfCols";
public Map<String, String> buildImageProperties(Image image) {
Map<String, String> properties = new HashMap<>();
properties.put(imageType, image.getImageType().toString());
properties.put(transparency, String.valueOf(image.isTransparent()));
properties.put(position, toString(image.getPosition()));
properties.put(id, image.getId());
return properties;
}
/**
 * Serializes a rectangle as {@code x,y,width,height}.
 * The numbers are formatted with {@code Locale.ROOT} so the decimal separator is always a dot.
 * With the default locale, comma-decimal locales would produce output that cannot be split
 * on "," again when parsing.
 */
private String toString(Rectangle2D rectangle2D) {

    return format(java.util.Locale.ROOT, "%f,%f,%f,%f", rectangle2D.getX(), rectangle2D.getY(), rectangle2D.getWidth(), rectangle2D.getHeight());
}
public Map<String, String> buildTableCellProperties(TableCell tableCell) {
Map<String, String> properties = new HashMap<>();
properties.put(row, String.valueOf(tableCell.getRow()));
properties.put(col, String.valueOf(tableCell.getCol()));
properties.put(header, String.valueOf(tableCell.isHeader()));
if (tableCell.getPages().size() > 1 || tableCell.getBBox().keySet().size() > 1) {
throw new IllegalArgumentException("TableCell can only occur on a single page!");
}
String bBoxString = toString(tableCell.getBBox().get(tableCell.getPages().stream().findFirst().get()));
properties.put(bBox, bBoxString);
return properties;
}
public Map<String, String> buildTableProperties(Table table) {
Map<String, String> properties = new HashMap<>();
properties.put(numberOfRows, String.valueOf(table.getNumberOfRows()));
properties.put(numberOfCols, String.valueOf(table.getNumberOfCols()));
return properties;
}
public void parseImageProperties(Map<String, String> properties, Image.ImageBuilder builder) {
builder.imageType(ImageType.fromString(properties.get(imageType)));
builder.transparent(Boolean.parseBoolean(properties.get(transparency)));
builder.position(parseRectangle2D(properties.get(position)));
builder.id(properties.get(id));
}
public void parseTableCellProperties(Map<String, String> properties, TableCell.TableCellBuilder builder) {
builder.row(Integer.parseInt(properties.get(row)));
builder.col(Integer.parseInt(properties.get(col)));
builder.header(Boolean.parseBoolean(properties.get(header)));
builder.bBox(parseRectangle2D(properties.get(bBox)));
}
public void parseTableProperties(Map<String, String> properties, Table.TableBuilder builder) {
builder.numberOfRows(Integer.parseInt(properties.get(numberOfRows)));
builder.numberOfCols(Integer.parseInt(properties.get(numberOfCols)));
}
private Rectangle2D parseRectangle2D(String bBox) {
List<Float> floats = Arrays.stream(bBox.split(",")).map(Float::parseFloat).toList();
return new Rectangle2D.Float(floats.get(0), floats.get(1), floats.get(2), floats.get(3));
}
}

View File

@ -0,0 +1,246 @@
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.factory;
import static java.lang.String.format;
import static java.util.stream.Collectors.groupingBy;
import static java.util.stream.Collectors.toList;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Set;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.AbstractPageBlock;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationDocument;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationFooter;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationHeader;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationPage;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.image.ClassifiedImage;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPageBlock;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.DocumentTree;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Document;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Footer;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.GenericSemanticNode;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Header;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Headline;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Image;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Page;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Paragraph;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Section;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.textblock.AtomicTextBlock;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.utils.TextPositionOperations;
import com.iqser.red.service.redaction.v1.server.redaction.utils.IdBuilder;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Getter;
import lombok.experimental.FieldDefaults;
import lombok.experimental.UtilityClass;
/**
 * Builds a {@link Document} together with its {@link DocumentTree} from a {@link ClassificationDocument}.
 */
@UtilityClass
public class DocumentGraphFactory {

    /**
     * Converts the classified document into a document graph.
     * <p>
     * Pages are registered first, then the images of all sections are collected, then sections are added
     * and finally one header and one footer is created per page.
     *
     * @param document the classification result to convert
     * @return the populated document graph
     */
    public Document buildDocumentGraph(ClassificationDocument document) {

        Document graph = new Document();
        Context graphContext = new Context(graph);

        document.getPages().forEach(classificationPage -> graphContext.buildAndAddPageWithCounter(classificationPage));
        document.getSections().forEach(section -> graphContext.getImages().addAll(section.getImages()));
        // top-level sections have no parent node
        document.getSections().forEach(section -> SectionNodeFactory.addSection(null, section.getPageBlocks(), section.getImages(), graphContext));
        addHeaderAndFooterToEachPage(document, graphContext);

        Map<Page, Integer> pageCounters = graphContext.getPages();
        graph.setNumberOfPages(pageCounters.size());
        graph.setPages(pageCounters.keySet());
        graph.setDocumentTree(graphContext.getDocumentTree());
        // NOTE(review): presumably getTextBlock() derives the text from the tree and this stores the result - confirm
        graph.setTextBlock(graph.getTextBlock());
        return graph;
    }


    /**
     * Creates a headline or paragraph node from a text block plus the blocks merged into it, adds it to the
     * main body of its page and registers it in the document tree below {@code parentNode}.
     *
     * @param parentNode        node that becomes the parent in the document tree
     * @param originalTextBlock block deciding the page and the node type (headline or paragraph)
     * @param context           graph building context
     * @param textBlocksToMerge further blocks whose text is merged into the same node
     */
    public void addParagraphOrHeadline(GenericSemanticNode parentNode, TextPageBlock originalTextBlock, Context context, List<TextPageBlock> textBlocksToMerge) {

        Page page = context.getPage(originalTextBlock.getPage());
        GenericSemanticNode node = newHeadlineOrParagraph(originalTextBlock, context);
        page.getMainBody().add(node);

        List<TextPageBlock> allBlocks = new ArrayList<>(textBlocksToMerge);
        allBlocks.add(originalTextBlock);

        AtomicTextBlock leafTextBlock = context.getTextBlockFactory()
                .buildAtomicTextBlock(TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(allBlocks), node, context, page);
        List<Integer> treeId = context.getDocumentTree().createNewChildEntryAndReturnId(parentNode, node);
        node.setLeafTextBlock(leafTextBlock);
        node.setTreeId(treeId);
    }


    /** Creates an empty {@link Headline} if the block is a headline, otherwise an empty {@link Paragraph}. */
    private GenericSemanticNode newHeadlineOrParagraph(TextPageBlock textBlock, Context context) {

        if (textBlock.isHeadline()) {
            return Headline.builder().documentTree(context.getDocumentTree()).build();
        }
        return Paragraph.builder().documentTree(context.getDocumentTree()).build();
    }


    /**
     * Creates an image node, adds it to the main body of its page and registers it in the document tree
     * below the given section.
     *
     * @param section parent of the image in the document tree
     * @param image   the classified image to convert
     * @param context graph building context
     */
    public void addImage(Section section, ClassifiedImage image, Context context) {

        Rectangle2D imagePosition = image.getPosition();
        Page page = context.getPage(image.getPage());
        String imageId = IdBuilder.buildId(Set.of(page), List.of(imagePosition));

        Image imageNode = Image.builder()
                .id(imageId)
                .imageType(image.getImageType())
                .position(imagePosition)
                .transparent(image.isHasTransparency())
                .page(page)
                .documentTree(context.getDocumentTree())
                .build();
        page.getMainBody().add(imageNode);

        List<Integer> treeId = context.getDocumentTree().createNewChildEntryAndReturnId(section, imageNode);
        imageNode.setTreeId(treeId);
    }


    /**
     * Gives every page number from 1 to the page count a header and a footer. Pages without classified
     * header/footer text get an empty one. All headers are created before the first footer.
     */
    private void addHeaderAndFooterToEachPage(ClassificationDocument document, Context context) {

        Map<Integer, List<TextPageBlock>> headerBlocksByPage = document.getHeaders()
                .stream()
                .flatMap((ClassificationHeader classificationHeader) -> classificationHeader.getTextBlocks().stream())
                .collect(groupingBy(AbstractPageBlock::getPage, toList()));
        Map<Integer, List<TextPageBlock>> footerBlocksByPage = document.getFooters()
                .stream()
                .flatMap((ClassificationFooter classificationFooter) -> classificationFooter.getTextBlocks().stream())
                .collect(groupingBy(AbstractPageBlock::getPage, toList()));

        int pageCount = document.getPages().size();

        for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++) {
            List<TextPageBlock> headerBlocks = headerBlocksByPage.get(pageNumber);
            if (headerBlocks == null) {
                addEmptyHeader(pageNumber, context);
            } else {
                addHeader(headerBlocks, context);
            }
        }

        for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++) {
            List<TextPageBlock> footerBlocks = footerBlocksByPage.get(pageNumber);
            if (footerBlocks == null) {
                addEmptyFooter(pageNumber, context);
            } else {
                addFooter(footerBlocks, context);
            }
        }
    }


    /** Builds the footer of the page the first block lies on from the given (non-empty) blocks. */
    private void addFooter(List<TextPageBlock> textBlocks, Context context) {

        Page page = context.getPage(textBlocks.get(0).getPage());
        Footer footer = Footer.builder().documentTree(context.getDocumentTree()).build();
        AtomicTextBlock footerText = context.getTextBlockFactory()
                .buildAtomicTextBlock(TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(textBlocks), footer, context, page);
        registerFooter(footer, footerText, page, context);
    }


    /**
     * Builds the header of the page the first block lies on from the given (non-empty) blocks.
     * The header text block always gets number 0 on its page.
     *
     * @param textBlocks header text blocks of a single page
     * @param context    graph building context
     */
    public void addHeader(List<TextPageBlock> textBlocks, Context context) {

        Page page = context.getPage(textBlocks.get(0).getPage());
        Header header = Header.builder().documentTree(context.getDocumentTree()).build();
        AtomicTextBlock headerText = context.getTextBlockFactory()
                .buildAtomicTextBlock(TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(textBlocks), header, 0, page);
        registerHeader(header, headerText, page, context);
    }


    /** Adds a footer with an empty text block to the page with the given number. */
    private void addEmptyFooter(int pageIndex, Context context) {

        Page page = context.getPage(pageIndex);
        Footer footer = Footer.builder().documentTree(context.getDocumentTree()).build();
        AtomicTextBlock emptyText = context.getTextBlockFactory().emptyTextBlock(footer, context, page);
        registerFooter(footer, emptyText, page, context);
    }


    /** Adds a header with an empty text block (number 0 on the page) to the page with the given number. */
    private void addEmptyHeader(int pageIndex, Context context) {

        Page page = context.getPage(pageIndex);
        Header header = Header.builder().documentTree(context.getDocumentTree()).build();
        AtomicTextBlock emptyText = context.getTextBlockFactory().emptyTextBlock(header, 0, page);
        registerHeader(header, emptyText, page, context);
    }


    /** Registers the footer as a main entry of the tree, attaches its text block and sets it on the page. */
    private void registerFooter(Footer footer, AtomicTextBlock textBlock, Page page, Context context) {

        List<Integer> treeId = context.getDocumentTree().createNewMainEntryAndReturnId(footer);
        footer.setTreeId(treeId);
        footer.setLeafTextBlock(textBlock);
        page.setFooter(footer);
    }


    /** Registers the header as a main entry of the tree, attaches its text block and sets it on the page. */
    private void registerHeader(Header header, AtomicTextBlock textBlock, Page page, Context context) {

        List<Integer> treeId = context.getDocumentTree().createNewMainEntryAndReturnId(header);
        header.setTreeId(treeId);
        header.setLeafTextBlock(textBlock);
        page.setHeader(header);
    }


    /**
     * Mutable state shared by the factories while one document graph is built.
     */
    @Getter
    @Builder
    @AllArgsConstructor
    @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
    public final class Context {

        DocumentTree documentTree;
        // page -> number the next text block on that page will receive
        Map<Page, Integer> pages;
        List<Section> sections;
        List<ClassifiedImage> images;
        TextBlockFactory textBlockFactory;


        /**
         * Creates a fresh context with a new document tree for the given document and empty collections.
         *
         * @param document the document the new tree is created for
         */
        public Context(Document document) {

            documentTree = new DocumentTree(document);
            pages = new HashMap<>();
            sections = new LinkedList<>();
            images = new LinkedList<>();
            textBlockFactory = new TextBlockFactory();
        }


        /**
         * Converts the classification page into a {@link Page} and registers it with a text block counter.
         *
         * @param classificationPage the page to convert
         */
        public void buildAndAddPageWithCounter(ClassificationPage classificationPage) {

            // The counter numbers the text blocks of a page. It starts at 1 because 0 is reserved for the header.
            pages.put(Page.fromClassificationPage(classificationPage), 1);
        }


        /**
         * Returns the current text block number of the page and advances the counter by one.
         *
         * @param page a page registered via {@link #buildAndAddPageWithCounter(ClassificationPage)}
         * @return the number before the increment
         */
        public int getAndIncrementTextBlockNumberOnPage(Page page) {

            Integer numberBeforeIncrement = pages.get(page);
            pages.merge(page, 1, Integer::sum);
            return numberBeforeIncrement;
        }


        /**
         * Looks up a registered page by its page number.
         *
         * @param pageIndex the page number to look for
         * @return the first registered page with that number
         * @throws NoSuchElementException if no registered page has that number
         */
        public Page getPage(int pageIndex) {

            for (Page candidate : pages.keySet()) {
                if (candidate.getNumber() == pageIndex) {
                    return candidate;
                }
            }
            throw new NoSuchElementException(format("ClassificationPage with number %d not found", pageIndex));
        }

    }

}

View File

@ -0,0 +1,33 @@
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.factory;
import java.awt.geom.Rectangle2D;
import java.util.Collections;
import java.util.List;
import lombok.AccessLevel;
import lombok.Builder;
import lombok.Getter;
import lombok.experimental.FieldDefaults;
/**
 * Value holder for a search text together with the index lists and rectangles that belong to it.
 */
@Builder
@Getter
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class SearchTextWithTextPositionDto {

    String searchText;
    List<Integer> lineBreaks;
    List<Integer> stringCoordsToPositionCoords;
    List<Rectangle2D> positions;


    /**
     * Creates the DTO for "no text": an empty search text and three empty lists.
     *
     * @return a DTO without any content
     */
    public static SearchTextWithTextPositionDto empty() {

        return builder()
                .searchText("")
                .lineBreaks(Collections.emptyList())
                .stringCoordsToPositionCoords(Collections.emptyList())
                .positions(Collections.emptyList())
                .build();
    }

}

View File

@ -0,0 +1,185 @@
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.factory;
import java.awt.geom.AffineTransform;
import java.awt.geom.Rectangle2D;
import java.util.LinkedList;
import java.util.List;
import java.util.Objects;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.RedTextPosition;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextDirection;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPositionSequence;
import lombok.experimental.UtilityClass;
/**
 * Builds the search text of a list of {@link TextPositionSequence}s together with the mapping from string
 * indices to text position indices, the detected line breaks and the rectangle of every text position.
 */
@UtilityClass
public class SearchTextWithTextPositionFactory {

    public final int HEIGHT_PADDING = 2;

    // A hyphen only counts as a hyphenated line break if it is fewer than this many characters in front of the
    // line break: it sits 1 position in front of a "\n", or 2 positions in front of the first character of the
    // next line. The "last hyphen" index therefore starts at -MAX_HYPHEN_LINEBREAK_DISTANCE; otherwise a line
    // break at the very first character would be taken for a hyphenated one. Integer.MIN_VALUE is not used as
    // start value because the distance is computed by subtraction and would overflow.
    public final int MAX_HYPHEN_LINEBREAK_DISTANCE = 3;


    /**
     * Concatenates the characters of all sequences into one search text.
     * <p>
     * Repeated blanks are collapsed, one blank is appended after every sequence, and a hyphen directly in
     * front of a line break is removed together with everything appended after it.
     *
     * @param sequences the words to concatenate, in reading order
     * @return the search text with its index mappings; {@link SearchTextWithTextPositionDto#empty()} if no
     * sequence contains a text position
     */
    public SearchTextWithTextPositionDto buildSearchTextToTextPositionModel(List<TextPositionSequence> sequences) {

        // Look for the first text position across ALL sequences: the first sequence may be empty while a later
        // one is not, in which case reading sequences.get(0).getTextPositions().get(0) would fail.
        RedTextPosition firstTextPosition = sequences.stream()
                .flatMap(sequence -> sequence.getTextPositions().stream())
                .findFirst()
                .orElse(null);
        if (firstTextPosition == null) {
            return SearchTextWithTextPositionDto.empty();
        }

        Context context = new Context();

        // artificial blank in front of the text, so a leading blank is treated as repeated whitespace
        RedTextPosition previousTextPosition = RedTextPosition.builder().unicode(" ").position(firstTextPosition.getPosition()).build();

        for (TextPositionSequence word : sequences) {
            for (RedTextPosition currentTextPosition : word.getTextPositions()) {
                if (isLineBreak(currentTextPosition, previousTextPosition)) {
                    removeHyphenLinebreaks(context);
                    context.lineBreaksStringIdx.add(context.stringIdx);
                }
                if (!isRepeatedWhitespace(currentTextPosition.getUnicode(), previousTextPosition.getUnicode())) {
                    if (isHyphen(currentTextPosition.getUnicode())) {
                        context.lastHyphenIdx = context.stringIdx;
                    }
                    appendCurrentTextPosition(context, currentTextPosition);
                }
                previousTextPosition = currentTextPosition;
                ++context.positionIdx;
            }

            // separate the words by a single blank
            previousTextPosition = RedTextPosition.builder().unicode(" ").position(previousTextPosition.getPosition()).build();
            context.stringBuilder.append(" ");
            context.stringIdxToPositionIdx.add(context.positionIdx);
            ++context.stringIdx;
        }

        assert context.stringBuilder.length() == context.stringIdxToPositionIdx.size();

        List<Rectangle2D> positions = sequences.stream()
                .flatMap(sequence -> sequence.getTextPositions().stream().map(textPosition -> mapRedTextPositionToInitialUserSpace(textPosition, sequence)))
                .toList();

        return SearchTextWithTextPositionDto.builder()
                .searchText(context.stringBuilder.toString())
                .lineBreaks(context.lineBreaksStringIdx)
                .stringCoordsToPositionCoords(context.stringIdxToPositionIdx)
                .positions(positions)
                .build();
    }


    /** Appends the unicode of the text position and maps every appended char to the current position index. */
    private void appendCurrentTextPosition(Context context, RedTextPosition currentTextPosition) {

        String unicode = currentTextPosition.getUnicode();
        context.stringBuilder.append(unicode);
        // unicode characters with more than 16-bit encoding have a length > 1 in java strings
        for (int j = 0; j < unicode.length(); j++) {
            context.stringIdxToPositionIdx.add(context.positionIdx);
        }
        context.stringIdx += unicode.length();
    }


    /**
     * If the last hyphen is directly in front of the current line break, cuts the search text and the index
     * mapping back to the position of that hyphen.
     */
    private void removeHyphenLinebreaks(Context context) {

        if (lastHyphenDirectlyBeforeLineBreak(context)) {
            context.stringBuilder.delete(context.lastHyphenIdx, context.stringBuilder.length());
            // Truncate in place. Re-assigning the field to subList(0, idx) would nest one more view per removed
            // hyphen and keep the cut-off tail alive in the backing list.
            context.stringIdxToPositionIdx.subList(context.lastHyphenIdx, context.stringIdxToPositionIdx.size()).clear();
            context.stringIdx = context.lastHyphenIdx;
            context.lastHyphenIdx = -MAX_HYPHEN_LINEBREAK_DISTANCE;
        }
    }


    private boolean lastHyphenDirectlyBeforeLineBreak(Context context) {

        return context.stringIdx - context.lastHyphenIdx < MAX_HYPHEN_LINEBREAK_DISTANCE;
    }


    /** A line break is either an explicit "\n" or a vertical jump of at least one text height. */
    private boolean isLineBreak(RedTextPosition currentTextPosition, RedTextPosition previousTextPosition) {

        return Objects.equals(currentTextPosition.getUnicode(), "\n") || isDeltaYLargerThanTextHeight(currentTextPosition, previousTextPosition);
    }


    private boolean isDeltaYLargerThanTextHeight(RedTextPosition currentPosition, RedTextPosition previousPosition) {

        if (previousPosition == null) {
            return false;
        }
        float deltaY = Math.abs(currentPosition.getYDirAdj() - previousPosition.getYDirAdj());
        return deltaY >= currentPosition.getHeightDir();
    }


    private boolean isRepeatedWhitespace(String currentUnicode, String previousUnicode) {

        return Objects.equals(previousUnicode, " ") && Objects.equals(currentUnicode, " ");
    }


    /**
     * Checks whether the character is one of the hyphen-like characters that may end a hyphenated line.
     * <p>
     * NOTE(review): eight of the literals had degraded to identical empty strings (non-ASCII characters lost
     * to a wrong encoding), which made every text position with empty unicode count as a hyphen. They are
     * restored as unicode escapes so the source no longer depends on the file encoding. The exact set below
     * is a reconstruction - confirm it against the intended list.
     */
    private boolean isHyphen(String unicodeCharacter) {

        return Objects.equals(unicodeCharacter, "-") || //
                Objects.equals(unicodeCharacter, "~") || //
                Objects.equals(unicodeCharacter, "\u2010") || // U+2010 hyphen
                Objects.equals(unicodeCharacter, "\u2012") || // U+2012 figure dash
                Objects.equals(unicodeCharacter, "\u207B") || // U+207B superscript minus
                Objects.equals(unicodeCharacter, "\u2212") || // U+2212 minus sign
                Objects.equals(unicodeCharacter, "\uFE63") || // U+FE63 small hyphen-minus
                Objects.equals(unicodeCharacter, "\u30A0") || // U+30A0 katakana-hiragana double hyphen
                Objects.equals(unicodeCharacter, "\u2053") || // U+2053 swung dash
                Objects.equals(unicodeCharacter, "\u2011") || // U+2011 non-breaking hyphen
                Objects.equals(unicodeCharacter, "\u00AD"); // U+00AD soft hyphen
    }


    /**
     * Computes the padded bounding box of a text position and transforms it according to the text direction
     * of its sequence, including a flip of the y-axis.
     */
    private Rectangle2D mapRedTextPositionToInitialUserSpace(RedTextPosition textPosition, TextPositionSequence sequence) {

        float textHeight = sequence.getTextHeight() + HEIGHT_PADDING;
        // the box starts one padded text height above the y-coordinate and is padded once more in height
        Rectangle2D rectangle2D = new Rectangle2D.Double(textPosition.getXDirAdj(),
                textPosition.getYDirAdj() - textHeight,
                textPosition.getWidthDirAdj(),
                textHeight + HEIGHT_PADDING);

        AffineTransform transform = new AffineTransform();
        if (sequence.getDir() == TextDirection.ZERO || sequence.getDir() == TextDirection.HALF_CIRCLE) {
            transform.rotate(sequence.getDir().getRadians(), sequence.getPageWidth() / 2f, sequence.getPageHeight() / 2f);
            transform.translate(0f, sequence.getPageHeight());
        } else if (sequence.getDir() == TextDirection.QUARTER_CIRCLE) {
            transform.rotate(sequence.getDir().getRadians(), sequence.getPageWidth() / 2f, sequence.getPageWidth() / 2f);
            transform.translate(0f, sequence.getPageWidth());
        } else {
            transform.rotate(sequence.getDir().getRadians(), sequence.getPageHeight() / 2f, sequence.getPageHeight() / 2f);
            transform.translate(0f, sequence.getPageWidth());
        }
        transform.scale(1., -1.);
        return transform.createTransformedShape(rectangle2D).getBounds2D();
    }


    /** Mutable state of one run of {@code buildSearchTextToTextPositionModel}. */
    private class Context {

        // for every char of the search text: index of the text position it stems from
        List<Integer> stringIdxToPositionIdx = new LinkedList<>();
        // string indices at which a line break was detected
        List<Integer> lineBreaksStringIdx = new LinkedList<>();
        StringBuilder stringBuilder = new StringBuilder();
        // current length of the search text
        int stringIdx;
        // number of text positions visited so far
        int positionIdx;
        // string index of the last appended hyphen
        int lastHyphenIdx = -MAX_HYPHEN_LINEBREAK_DISTANCE;

    }

}

View File

@ -0,0 +1,183 @@
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.factory;
import static java.lang.String.format;
import static java.util.Collections.emptyList;
import static java.util.stream.Collectors.groupingBy;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.AbstractPageBlock;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.image.ClassifiedImage;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table.TablePageBlock;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPageBlock;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.GenericSemanticNode;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Page;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Section;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.utils.TableMergingUtility;
import lombok.experimental.UtilityClass;
/**
 * Creates {@link Section} nodes from lists of page blocks and fills them with headlines, paragraphs,
 * tables, images and sub-sections.
 */
@UtilityClass
public class SectionNodeFactory {

    /**
     * Creates a section from the page blocks and registers it in the document tree. Nothing happens if
     * {@code pageBlocks} is empty.
     * <p>
     * A leading headline is added directly to the section. If the remaining blocks contain both tables and
     * text blocks, they are split into sub-sections; otherwise they are added directly to the section.
     * The given list is not modified.
     *
     * @param parentNode parent in the document tree, or {@code null} to create a main entry
     * @param pageBlocks the text and table blocks of the section, in reading order
     * @param images     images to add to the section; duplicates are added once
     * @param context    graph building context
     */
    public void addSection(GenericSemanticNode parentNode, List<AbstractPageBlock> pageBlocks, List<ClassifiedImage> images, DocumentGraphFactory.Context context) {

        if (pageBlocks.isEmpty()) {
            return;
        }

        // Work on a private, mutable copy: the leading headline is removed below. Removing it from the caller's
        // list would alter the classification model and fails for unmodifiable lists (e.g. Stream.toList()).
        List<AbstractPageBlock> sectionBlocks = new LinkedList<>(pageBlocks);

        Map<Integer, List<AbstractPageBlock>> blocksPerPage = sectionBlocks.stream().collect(groupingBy(AbstractPageBlock::getPage));
        Section section = Section.builder().documentTree(context.getDocumentTree()).build();

        context.getSections().add(section);
        blocksPerPage.keySet().forEach(pageNumber -> addSectionNodeToPageNode(context, section, pageNumber));

        section.setTreeId(getTreeId(parentNode, context, section));

        addFirstHeadlineDirectlyToSection(sectionBlocks, context, section);
        if (containsTablesAndTextBlocks(sectionBlocks)) {
            splitPageBlocksIntoSubSections(sectionBlocks).forEach(subSectionPageBlocks -> addSection(section, subSectionPageBlocks, emptyList(), context));
        } else {
            addTablesAndParagraphsAndHeadlinesToSection(sectionBlocks, context, section);
        }

        images.stream().distinct().forEach(image -> DocumentGraphFactory.addImage(section, image, context));
    }


    /** Registers the section as a main entry (no parent) or as a child of the parent and returns its tree id. */
    private List<Integer> getTreeId(GenericSemanticNode parentNode, DocumentGraphFactory.Context context, Section section) {

        if (parentNode == null) {
            return context.getDocumentTree().createNewMainEntryAndReturnId(section);
        } else {
            return context.getDocumentTree().createNewChildEntryAndReturnId(parentNode, section);
        }
    }


    /**
     * If the first block is a headline, adds it to the section and removes it from {@code pageBlocks}.
     *
     * @param pageBlocks non-empty, mutable list; modified in place
     */
    private void addFirstHeadlineDirectlyToSection(List<AbstractPageBlock> pageBlocks, DocumentGraphFactory.Context context, Section section) {

        if (pageBlocks.get(0).isHeadline()) {
            addTablesAndParagraphsAndHeadlinesToSection(List.of(pageBlocks.get(0)), context, section);
            pageBlocks.remove(0);
        }
    }


    /**
     * Adds every block to the section. Text blocks are merged with the text blocks found by
     * {@code findTextBlocksWithSameClassificationAndAlignsY}, tables with the tables found by the
     * {@link TableMergingUtility}. Blocks merged into an earlier block are skipped.
     */
    private void addTablesAndParagraphsAndHeadlinesToSection(List<AbstractPageBlock> pageBlocks, DocumentGraphFactory.Context context, Section section) {

        Set<AbstractPageBlock> alreadyMerged = new HashSet<>();
        List<AbstractPageBlock> remainingBlocks = new LinkedList<>(pageBlocks);
        for (AbstractPageBlock abstractPageBlock : pageBlocks) {

            if (alreadyMerged.contains(abstractPageBlock)) {
                continue;
            }

            remainingBlocks.removeAll(alreadyMerged);

            if (abstractPageBlock instanceof TextPageBlock textPageBlock) {
                List<TextPageBlock> textBlocks = findTextBlocksWithSameClassificationAndAlignsY(textPageBlock, remainingBlocks);
                alreadyMerged.addAll(textBlocks);
                DocumentGraphFactory.addParagraphOrHeadline(section, textPageBlock, context, textBlocks);
            } else if (abstractPageBlock instanceof TablePageBlock tablePageBlock) {
                List<TablePageBlock> tablesToMerge = TableMergingUtility.findConsecutiveTablesWithSameColCountAndSameHeaders(tablePageBlock, remainingBlocks);
                alreadyMerged.addAll(tablesToMerge);
                TableNodeFactory.addTable(section, tablesToMerge, context);
            } else {
                throw new RuntimeException(format("Unhandled AbstractPageBlockType %s!", abstractPageBlock.getClass()));
            }
        }
    }


    private boolean containsTablesAndTextBlocks(List<AbstractPageBlock> pageBlocks) {

        return pageBlocks.stream().anyMatch(pageBlock -> pageBlock instanceof TablePageBlock) && pageBlocks.stream().anyMatch(pageBlock -> pageBlock instanceof TextPageBlock);
    }


    /**
     * This function splits the list of PageBlocks around TablePageBlocks, such that SubSections can be created, that don't include tables.
     * This is needed so we can execute rules on sections, that do not contain tables.
     * See: <a href="https://knecon.atlassian.net/wiki/spaces/RED/pages/14765218/Document+Structure">document structure wiki</a>
     *
     * @param pageBlocks a List of AbstractPageBlocks, which have at least one TablePageBlock and one ClassificationTextBlock
     * @return List of Lists of AbstractPageBlocks, which include either a single Headline ClassificationTextBlock and a TablePageBlock or only ClassificationTextBlocks.
     */
    private List<List<AbstractPageBlock>> splitPageBlocksIntoSubSections(List<AbstractPageBlock> pageBlocks) {

        List<List<AbstractPageBlock>> splitList = splitIntoCoherentList(pageBlocks);
        movePrecedingHeadlineToTableList(splitList);
        // a text list that consisted only of the moved headline is empty now
        return splitList.stream().filter(list -> !list.isEmpty()).toList();
    }


    /**
     * If the list in front of a tables-only list ends with a headline, moves that headline to the front of
     * the table list.
     */
    private void movePrecedingHeadlineToTableList(List<List<AbstractPageBlock>> splitList) {

        for (int i = 0; i < splitList.size(); i++) {
            if (listIsTablesOnly(splitList.get(i)) && i > 0) {
                List<AbstractPageBlock> previousList = splitList.get(i - 1);
                int lastIndexInPreviousList = previousList.size() - 1;
                AbstractPageBlock lastPageBlockInPreviousList = previousList.get(lastIndexInPreviousList);
                if (lastPageBlockInPreviousList.isHeadline()) {
                    // remove exactly the block that is moved; i is an index into splitList, not into previousList
                    previousList.remove(lastIndexInPreviousList);
                    splitList.get(i).add(0, lastPageBlockInPreviousList);
                }
            }
        }
    }


    private boolean listIsTablesOnly(List<AbstractPageBlock> abstractPageBlocks) {

        return abstractPageBlocks.stream().allMatch(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock);
    }


    /**
     * @param pageBlocks a List of AbstractPageBlocks, which have at least one TablePageBlock and one ClassificationTextBlock
     * @return List of Lists of AbstractPageBlocks, which are exclusively of type ClassificationTextBlock or TablePageBlock
     */
    private List<List<AbstractPageBlock>> splitIntoCoherentList(List<AbstractPageBlock> pageBlocks) {

        List<List<AbstractPageBlock>> splitList = new LinkedList<>();
        List<AbstractPageBlock> currentList = new LinkedList<>();
        splitList.add(currentList);
        Class<? extends AbstractPageBlock> lastPageBlockClass = pageBlocks.get(0).getClass();
        for (AbstractPageBlock pageBlock : pageBlocks) {
            if (lastPageBlockClass.isInstance(pageBlock)) {
                currentList.add(pageBlock);
            } else {
                // the block type changed: start a new run
                currentList = new LinkedList<>();
                currentList.add(pageBlock);
                splitList.add(currentList);
                lastPageBlockClass = pageBlock.getClass();
            }
        }
        return splitList;
    }


    /** Returns all other text blocks on the same page that intersect the given block in y-direction. */
    private List<TextPageBlock> findTextBlocksWithSameClassificationAndAlignsY(AbstractPageBlock atc, List<AbstractPageBlock> pageBlocks) {

        return pageBlocks.stream()
                .filter(abstractTextContainer -> !abstractTextContainer.equals(atc))
                .filter(abstractTextContainer -> abstractTextContainer.getPage() == atc.getPage())
                .filter(abstractTextContainer -> abstractTextContainer instanceof TextPageBlock)
                .filter(abstractTextContainer -> abstractTextContainer.intersectsY(atc))
                .map(abstractTextContainer -> (TextPageBlock) abstractTextContainer)
                .toList();
    }


    private void addSectionNodeToPageNode(DocumentGraphFactory.Context context, Section section, Integer pageNumber) {

        Page page = context.getPage(pageNumber);
        page.getMainBody().add(section);
    }

}

View File

@ -0,0 +1,136 @@
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.factory;
import static java.util.Collections.emptyList;
import java.util.Collection;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.AbstractPageBlock;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table.Cell;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table.TablePageBlock;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPositionSequence;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.GenericSemanticNode;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Page;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.SemanticNode;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Table;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.TableCell;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.textblock.TextBlock;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.utils.TextPositionOperations;
import lombok.experimental.UtilityClass;
@UtilityClass
public class TableNodeFactory {
public final double TABLE_CELL_MERGE_CONTENTS_SIZE_THRESHOLD = 0.05;
public void addTable(GenericSemanticNode parentNode, List<TablePageBlock> tablesToMerge, DocumentGraphFactory.Context context) {
setPageNumberInCells(tablesToMerge);
Set<Page> pages = tablesToMerge.stream().map(AbstractPageBlock::getPage).map(context::getPage).collect(Collectors.toSet());
List<List<Cell>> mergedRows = tablesToMerge.stream().map(TablePageBlock::getRows).flatMap(Collection::stream).toList();
Table table = Table.builder().documentTree(context.getDocumentTree()).numberOfCols(mergedRows.get(0).size()).numberOfRows(mergedRows.size()).build();
pages.forEach(page -> addTableToPage(page, parentNode, table));
List<Integer> treeId = context.getDocumentTree().createNewChildEntryAndReturnId(parentNode, table);
table.setTreeId(treeId);
addTableCells(mergedRows, table, context);
ifTableHasNoHeadersSetFirstRowAsHeaders(table);
}
private void setPageNumberInCells(List<TablePageBlock> tablesToMerge) {
// For some reason I can't figure out, in some table cells, the ClassificationTextBlocks have 0 as page number
// So I am fixing this here, but this should actually be fixed upstream.
tablesToMerge.forEach(table -> table.getRows()
.stream()
.flatMap(Collection::stream)
.peek(cell -> cell.setPageNumber(table.getPage()))
.forEach(cell -> setPageNumberInTextBlocksWithPageNumberSetTo0(table, cell)));
}
private void setPageNumberInTextBlocksWithPageNumberSetTo0(TablePageBlock table, Cell cell) {
cell.getTextBlocks().stream()//
.filter(tb -> tb.getPage() == 0)//
.forEach(tb -> tb.setPage(table.getPage()));
}
@SuppressWarnings("PMD.UnusedPrivateMethod") // PMD actually flags this wrong
private void addTableToPage(Page page, SemanticNode parentNode, Table table) {
if (!page.getMainBody().contains(parentNode)) {
parentNode.getPages().add(page);
}
page.getMainBody().add(table);
}
private void ifTableHasNoHeadersSetFirstRowAsHeaders(Table table) {
if (table.streamHeaders().findAny().isEmpty()) {
table.streamRow(0).forEach(tableCellNode -> tableCellNode.setHeader(true));
}
}
private void addTableCells(List<List<Cell>> rows, Table table, DocumentGraphFactory.Context context) {
for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) {
for (int colIndex = 0; colIndex < rows.get(rowIndex).size(); colIndex++) {
addTableCell(rows.get(rowIndex).get(colIndex), rowIndex, colIndex, table, context);
}
}
}
@SuppressWarnings("PMD.UnusedPrivateMethod") // PMD actually flags this wrong
private void addTableCell(Cell cell, int rowIndex, int colIndex, Table tableNode, DocumentGraphFactory.Context context) {
Page page = context.getPage(cell.getPageNumber());
TableCell tableCell = TableCell.builder().documentTree(context.getDocumentTree()).row(rowIndex).col(colIndex).header(cell.isHeaderCell()).bBox(cell.getBounds2D()).build();
page.getMainBody().add(tableCell);
List<Integer> treeId = context.getDocumentTree().createNewTableChildEntryAndReturnId(tableNode, tableCell);
tableCell.setTreeId(treeId);
TextBlock textBlock;
if (cell.getTextBlocks().isEmpty()) {
tableCell.setLeafTextBlock(context.getTextBlockFactory().emptyTextBlock(tableNode, context, page));
} else if (cell.getTextBlocks().size() == 1) {
textBlock = context.getTextBlockFactory().buildAtomicTextBlock(cell.getTextBlocks().get(0).getSequences(), tableCell, context, page);
tableCell.setLeafTextBlock(textBlock);
} else if (firstTextBlockIsHeadline(cell)) {
SectionNodeFactory.addSection(tableCell, cell.getTextBlocks().stream().map(tb -> (AbstractPageBlock) tb).toList(), emptyList(), context);
} else if (cellAreaIsSmallerThanPageAreaTimesThreshold(cell, page)) {
List<TextPositionSequence> sequences = TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(cell.getTextBlocks());
textBlock = context.getTextBlockFactory().buildAtomicTextBlock(sequences, tableCell, context, page);
tableCell.setLeafTextBlock(textBlock);
} else {
cell.getTextBlocks().forEach(tb -> DocumentGraphFactory.addParagraphOrHeadline(tableCell, tb, context, emptyList()));
}
}
/**
 * Checks whether the cell covers less than the configured fraction
 * ({@code TABLE_CELL_MERGE_CONTENTS_SIZE_THRESHOLD}) of the page area.
 *
 * @param cell cell whose area is tested
 * @param page page providing height and width
 * @return {@code true} if the cell area is strictly below threshold * page height * page width
 */
private boolean cellAreaIsSmallerThanPageAreaTimesThreshold(Cell cell, Page page) {
    var cellArea = cell.getArea();
    var mergeLimit = TABLE_CELL_MERGE_CONTENTS_SIZE_THRESHOLD * page.getHeight() * page.getWidth();
    return cellArea < mergeLimit;
}
/**
 * Tells whether the first text block of the cell is a headline.
 * The cell must contain at least one text block.
 *
 * @param cell cell to inspect
 * @return {@code true} if the first text block reports itself as headline
 */
private boolean firstTextBlockIsHeadline(Cell cell) {
    var firstBlock = cell.getTextBlocks().get(0);
    return firstBlock.isHeadline();
}
}

View File

@ -0,0 +1,53 @@
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.factory;
import java.util.List;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPositionSequence;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Page;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.SemanticNode;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.textblock.AtomicTextBlock;
import lombok.AccessLevel;
import lombok.experimental.FieldDefaults;
/**
 * Stateful factory for {@link AtomicTextBlock}s.
 * <p>
 * The factory keeps two running counters: the character offset at which the next block starts
 * (advanced by the search-text length of every non-empty block built) and a consecutive
 * text block index (advanced for every block, including empty ones).
 */
@FieldDefaults(level = AccessLevel.PRIVATE)
public class TextBlockFactory {

    // offset handed to the next text block; sum of the search-text lengths built so far
    int stringOffset;
    // index handed to the next text block
    long textBlockIdx;


    /**
     * Builds a text block and takes its number on the page from the context.
     *
     * @param sequences text position sequences forming the block
     * @param parent    node owning the block
     * @param context   context providing (and incrementing) the per-page block number
     * @param page      page the block is located on
     * @return the new text block
     */
    public AtomicTextBlock buildAtomicTextBlock(List<TextPositionSequence> sequences, SemanticNode parent, DocumentGraphFactory.Context context, Page page) {

        return buildAtomicTextBlock(sequences, parent, context.getAndIncrementTextBlockNumberOnPage(page), page);
    }


    /**
     * Builds a text block with an explicitly given number on the page and advances both counters.
     *
     * @param sequences    text position sequences forming the block
     * @param parent       node owning the block
     * @param numberOnPage number of the block on its page
     * @param page         page the block is located on
     * @return the new text block
     */
    public AtomicTextBlock buildAtomicTextBlock(List<TextPositionSequence> sequences, SemanticNode parent, Integer numberOnPage, Page page) {

        SearchTextWithTextPositionDto searchText = SearchTextWithTextPositionFactory.buildSearchTextToTextPositionModel(sequences);
        int blockOffset = stringOffset;
        stringOffset = blockOffset + searchText.getSearchText().length();
        long blockIdx = nextTextBlockIdx();
        return AtomicTextBlock.fromSearchTextWithTextPositionDto(searchText, parent, blockOffset, blockIdx, numberOnPage, page);
    }


    /**
     * Creates an empty text block at the current offset and takes its number on the page from the context.
     *
     * @param parent  node owning the block
     * @param context context providing (and incrementing) the per-page block number
     * @param page    page the block is located on
     * @return the new, empty text block
     */
    public AtomicTextBlock emptyTextBlock(SemanticNode parent, DocumentGraphFactory.Context context, Page page) {

        long blockIdx = nextTextBlockIdx();
        return AtomicTextBlock.empty(blockIdx, stringOffset, page, context.getAndIncrementTextBlockNumberOnPage(page), parent);
    }


    /**
     * Creates an empty text block at the current offset with an explicitly given number on the page.
     *
     * @param parent       node owning the block
     * @param numberOnPage number of the block on its page
     * @param page         page the block is located on
     * @return the new, empty text block
     */
    public AtomicTextBlock emptyTextBlock(SemanticNode parent, Integer numberOnPage, Page page) {

        long blockIdx = nextTextBlockIdx();
        return AtomicTextBlock.empty(blockIdx, stringOffset, page, numberOnPage, parent);
    }


    // Returns the current text block index and advances the counter by one.
    private long nextTextBlockIdx() {

        return textBlockIdx++;
    }

}

View File

@ -0,0 +1,163 @@
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph;
import static java.lang.String.format;
import java.util.Collection;
import java.util.LinkedList;
import java.util.List;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.textblock.TextBlock;
import lombok.EqualsAndHashCode;
import lombok.Setter;
/**
 * Half-open integer interval {@code [start, end)}: {@code start} is inclusive, {@code end} is exclusive.
 * <p>
 * The constructor enforces {@code start <= end}. Note that the Lombok-generated setters do not
 * re-validate this invariant.
 */
@Setter
@EqualsAndHashCode
public class Boundary implements Comparable<Boundary> {

    private int start;
    private int end;


    /**
     * @param start inclusive start index
     * @param end   exclusive end index
     * @throws IllegalArgumentException if {@code start > end}
     */
    public Boundary(int start, int end) {

        if (start > end) {
            throw new IllegalArgumentException(format("start: %d > end: %d", start, end));
        }
        this.start = start;
        this.end = end;
    }


    /** @return number of indices covered, i.e. {@code end - start} */
    public int length() {

        return end - start;
    }


    /** @return inclusive start index */
    public int start() {

        return start;
    }


    /** @return exclusive end index */
    public int end() {

        return end;
    }


    /** @return {@code true} if the given boundary lies completely within this boundary */
    public boolean contains(Boundary boundary) {

        return start <= boundary.start() && boundary.end() <= end;
    }


    /** @return {@code true} if this boundary lies completely within the given boundary */
    public boolean containedBy(Boundary boundary) {

        return boundary.contains(this);
    }


    /**
     * @return {@code true} if the range {@code [start, end)} lies completely within this boundary
     * @throws IllegalArgumentException if {@code start > end}
     */
    public boolean contains(int start, int end) {

        if (start > end) {
            throw new IllegalArgumentException(format("start: %d > end: %d", start, end));
        }
        return this.start <= start && end <= this.end;
    }


    /**
     * @return {@code true} if this boundary lies completely within the range {@code [start, end)}
     * @throws IllegalArgumentException if {@code start > end}
     */
    public boolean containedBy(int start, int end) {

        if (start > end) {
            throw new IllegalArgumentException(format("start: %d > end: %d", start, end));
        }
        return start <= this.start && this.end <= end;
    }


    /** @return {@code true} if {@code start <= index < end} */
    public boolean contains(int index) {

        return start <= index && index < end;
    }


    /** @return {@code true} if both boundaries share at least one index */
    public boolean intersects(Boundary boundary) {

        return boundary.start() < this.end && this.start < boundary.end();
    }


    /**
     * Splits this boundary at the given indices into consecutive, non-empty pieces, followed by
     * the remainder up to {@code end}.
     *
     * @param splitIndices indices inside {@code [start, end)}; they are processed in list order,
     *                     so a descending pair makes the {@link Boundary} constructor throw
     *                     {@link IllegalArgumentException}
     * @return the resulting boundaries in order
     * @throws IndexOutOfBoundsException if any index is not contained in this boundary
     */
    public List<Boundary> split(List<Integer> splitIndices) {

        if (splitIndices.stream().anyMatch(idx -> !this.contains(idx))) {
            throw new IndexOutOfBoundsException(format("%s splitting indices are out of range for %s", splitIndices.stream().filter(idx -> !this.contains(idx)).toList(), this));
        }
        List<Boundary> splitBoundaries = new LinkedList<>();
        int previousIndex = start;
        for (int splitIndex : splitIndices) {

            // skip split if it would produce a boundary of length 0
            if (splitIndex == previousIndex) {
                continue;
            }
            splitBoundaries.add(new Boundary(previousIndex, splitIndex));
            previousIndex = splitIndex;
        }
        splitBoundaries.add(new Boundary(previousIndex, end));
        return splitBoundaries;
    }


    /**
     * Builds the smallest boundary spanning all given boundaries.
     *
     * @param boundaries boundaries to merge, must not be empty
     * @return boundary from the smallest start to the largest end
     * @throws IllegalArgumentException if the collection is empty
     */
    public static Boundary merge(Collection<Boundary> boundaries) {

        int minStart = boundaries.stream().mapToInt(Boundary::start).min().orElseThrow(IllegalArgumentException::new);
        int maxEnd = boundaries.stream().mapToInt(Boundary::end).max().orElseThrow(IllegalArgumentException::new);
        return new Boundary(minStart, maxEnd);
    }


    @Override
    public String toString() {

        return format("Boundary [%d|%d)", start, end);
    }


    /**
     * Orders boundaries only when both start and end are strictly smaller (-1) or strictly
     * greater (1); in every other case - including nested boundaries and boundaries sharing a
     * start or an end - the result is 0.
     * <p>
     * NOTE(review): this is therefore not consistent with {@code equals} and is not a total order.
     */
    @Override
    public int compareTo(Boundary boundary) {

        if (end < boundary.end() && start < boundary.start()) {
            return -1;
        }
        if (start > boundary.start() && end > boundary.end()) {
            return 1;
        }
        return 0;
    }


    /**
     * Shrinks the boundary such that textBlock.subSequence(boundary) returns a string without
     * leading or trailing whitespace.
     * <p>
     * The scan never leaves {@code [start, end)}: if the covered text is empty or consists of
     * whitespace only, the empty boundary {@code [end, end)} is returned.
     *
     * @param textBlock TextBlock to check whitespaces against
     * @return the trimmed boundary
     */
    public Boundary trim(TextBlock textBlock) {

        int trimmedStart = this.start;
        // stop at end, otherwise whitespace following the boundary would be consumed as well
        while (trimmedStart < this.end && Character.isWhitespace(textBlock.charAt(trimmedStart))) {
            trimmedStart++;
        }
        int trimmedEnd = this.end;
        // stop at trimmedStart, otherwise an all-whitespace range would be scanned below start
        while (trimmedEnd > trimmedStart && Character.isWhitespace(textBlock.charAt(trimmedEnd - 1))) {
            trimmedEnd--;
        }
        return new Boundary(trimmedStart, trimmedEnd);
    }

}

View File

@ -0,0 +1,217 @@
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph;
import static java.lang.String.format;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.stream.Stream;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Document;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.GenericSemanticNode;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.NodeType;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.SemanticNode;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Table;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.TableCell;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.textblock.TextBlock;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.textblock.TextBlockCollector;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.Getter;
import lombok.experimental.FieldDefaults;
/**
 * Tree of {@link Entry}s describing the structure of a document.
 * <p>
 * Every entry is addressed by its tree id: the list of child indices leading from the root to
 * the entry. The root (the document itself) has the empty id.
 */
@Data
@EqualsAndHashCode
public class DocumentTree {

    private final Entry root;


    /**
     * Creates a tree consisting only of the root entry.
     *
     * @param document node stored in the root entry
     */
    public DocumentTree(Document document) {

        root = Entry.builder().treeId(Collections.emptyList()).children(new LinkedList<>()).node(document).build();
    }


    /** @return the leaf text blocks of all leaf nodes, collected in tree order */
    public TextBlock buildTextBlock() {

        return allEntriesInOrder().map(Entry::getNode).filter(SemanticNode::isLeaf).map(SemanticNode::getLeafTextBlock).collect(new TextBlockCollector());
    }


    /**
     * Appends a new entry directly below the root.
     *
     * @return tree id of the new entry
     */
    public List<Integer> createNewMainEntryAndReturnId(GenericSemanticNode node) {

        return createNewChildEntryAndReturnIdImpl(Collections.emptyList(), node);
    }


    /**
     * Appends a new entry below the entry of {@code parentNode}.
     *
     * @return tree id of the new entry
     * @throws IllegalArgumentException if the parent's tree id does not exist in this tree
     */
    public List<Integer> createNewChildEntryAndReturnId(GenericSemanticNode parentNode, GenericSemanticNode node) {

        return createNewChildEntryAndReturnIdImpl(parentNode.getTreeId(), node);
    }


    /**
     * Appends a new table entry below the entry of {@code parentNode}.
     *
     * @return tree id of the new entry
     * @throws IllegalArgumentException if the parent's tree id does not exist in this tree
     */
    public List<Integer> createNewChildEntryAndReturnId(GenericSemanticNode parentNode, Table node) {

        return createNewChildEntryAndReturnIdImpl(parentNode.getTreeId(), node);
    }


    /**
     * Appends a new table cell entry below the entry of {@code parentTable}.
     *
     * @return tree id of the new entry
     * @throws IllegalArgumentException if the table's tree id does not exist in this tree
     */
    public List<Integer> createNewTableChildEntryAndReturnId(Table parentTable, TableCell tableCell) {

        return createNewChildEntryAndReturnIdImpl(parentTable.getTreeId(), tableCell);
    }


    // Appends the node as last child of the parent entry; the new id is the parent id plus the child's index.
    @SuppressWarnings("PMD.UnusedPrivateMethod") // PMD actually flags this wrong
    private List<Integer> createNewChildEntryAndReturnIdImpl(List<Integer> parentId, SemanticNode node) {

        if (!entryExists(parentId)) {
            throw new IllegalArgumentException(format("parentId %s does not exist!", parentId));
        }

        Entry parent = getEntryById(parentId);
        List<Integer> newId = new LinkedList<>(parentId);
        newId.add(parent.children.size());
        parent.children.add(Entry.builder().treeId(newId).node(node).build());

        return newId;
    }


    // Walks the id from the root and bounds-checks every level, including the first one.
    private boolean entryExists(List<Integer> treeId) {

        Entry entry = root;
        for (int id : treeId) {
            if (id < 0 || id >= entry.children.size()) {
                return false;
            }
            entry = entry.children.get(id);
        }
        return true;
    }


    /**
     * @return the entry one level above the given id
     * @throws UnsupportedOperationException if the id is empty, i.e. denotes the root
     */
    public Entry getParentEntryById(List<Integer> treeId) {

        return getEntryById(getParentId(treeId));
    }


    /** @return {@code true} for every id except the empty (root) id */
    public boolean hasParentById(List<Integer> treeId) {

        return !treeId.isEmpty();
    }


    /** @return the nodes of the direct children of the given entry */
    public Stream<SemanticNode> childNodes(List<Integer> treeId) {

        return getEntryById(treeId).children.stream().map(Entry::getNode);
    }


    /** @return the nodes of the direct children of the given entry that have the given type */
    public Stream<SemanticNode> childNodesOfType(List<Integer> treeId, NodeType nodeType) {

        return getEntryById(treeId).children.stream().filter(entry -> entry.node.getType().equals(nodeType)).map(Entry::getNode);
    }


    // Drops the last index of the id; the result may be a view on the given list.
    private static List<Integer> getParentId(List<Integer> treeId) {

        if (treeId.isEmpty()) {
            throw new UnsupportedOperationException("Root has no parent!");
        }
        if (treeId.size() < 2) {
            return Collections.emptyList();
        }
        return treeId.subList(0, treeId.size() - 1);
    }


    /**
     * Resolves a tree id to its entry.
     *
     * @param treeId child indices from the root; empty for the root itself
     * @return the addressed entry
     * @throws IndexOutOfBoundsException if the id does not address an existing entry
     */
    public Entry getEntryById(List<Integer> treeId) {

        Entry entry = root;
        for (int id : treeId) {
            entry = entry.children.get(id);
        }
        return entry;
    }


    /** @return the direct children of the root */
    public Stream<Entry> mainEntries() {

        return root.children.stream();
    }


    /** @return all entries including the root, each parent before its children */
    public Stream<Entry> allEntriesInOrder() {

        return Stream.of(root).flatMap(DocumentTree::flatten);
    }


    /** @return all entries below (and excluding) the given entry, each parent before its children */
    public Stream<Entry> allSubEntriesInOrder(List<Integer> parentId) {

        return getEntryById(parentId).children.stream().flatMap(DocumentTree::flatten);
    }


    @Override
    public String toString() {

        return String.join("\n", allEntriesInOrder().map(Entry::toString).toList());
    }


    // Pre-order traversal: the entry itself followed by its flattened children.
    private static Stream<Entry> flatten(Entry entry) {

        return Stream.concat(Stream.of(entry), entry.children.stream().flatMap(DocumentTree::flatten));
    }


    /**
     * @return the node of the top-level entry (direct child of the root) the id belongs to,
     * or the root node for the empty id
     */
    public SemanticNode getHighestParentById(List<Integer> treeId) {

        if (treeId.isEmpty()) {
            return root.node;
        }
        return root.children.get(treeId.get(0)).node;
    }


    /** Single tree element: a node, its tree id and its child entries. */
    @Builder
    @Getter
    @AllArgsConstructor
    @FieldDefaults(level = AccessLevel.PRIVATE, makeFinal = true)
    public static class Entry {

        List<Integer> treeId;
        SemanticNode node;
        @Builder.Default
        List<Entry> children = new LinkedList<>();


        @Override
        public String toString() {

            return node.toString();
        }


        /** @return the type of the wrapped node */
        public NodeType getType() {

            return node.getType();
        }

    }

}

View File

@ -1,4 +1,4 @@
package com.iqser.red.service.redaction.v1.server.redaction.model;
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.entity;
public enum EntityType {
ENTITY,

View File

@ -0,0 +1,229 @@
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.entity;
import java.awt.geom.Rectangle2D;
import java.util.Collection;
import java.util.Comparator;
import java.util.Deque;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Engine;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.Boundary;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Page;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.SemanticNode;
import com.iqser.red.service.redaction.v1.server.redaction.utils.IdBuilder;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults;
/**
 * An entity found in the document text, identified by its boundary, type, entity type and value
 * (only these four fields take part in {@code equals}/{@code hashCode}).
 */
@Data
@Builder
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
@EqualsAndHashCode(onlyExplicitlyIncluded = true)
public class RedactionEntity {

    // initial values
    @EqualsAndHashCode.Include
    final Boundary boundary;
    @EqualsAndHashCode.Include
    final String type;
    @EqualsAndHashCode.Include
    final EntityType entityType;

    // empty defaults
    boolean redaction;
    boolean removed;
    boolean ignored;
    boolean resized;
    boolean skipRemoveEntitiesContainedInLarger;
    boolean dictionaryEntry;
    boolean dossierDictionaryEntry;
    Set<Engine> engines;
    Set<RedactionEntity> references;
    @Builder.Default
    Deque<Integer> matchedRules = new LinkedList<>();
    String redactionReason;
    String legalBasis;

    // inferred on graph insertion
    @EqualsAndHashCode.Include
    String value;
    String textBefore;
    String textAfter;
    @Builder.Default
    Set<Page> pages = new HashSet<>();
    List<RedactionPosition> redactionPositionsPerPage;
    @Builder.Default
    List<SemanticNode> intersectingNodes = new LinkedList<>();
    SemanticNode deepestFullyContainingNode;


    /**
     * Creates an entity with only the initial values set and empty engine and reference sets.
     *
     * @param boundary   text range of the entity
     * @param type       type of the entity
     * @param entityType kind of the entity
     * @return the new entity
     */
    public static RedactionEntity initialEntityNode(Boundary boundary, String type, EntityType entityType) {

        return RedactionEntity.builder().type(type).entityType(entityType).boundary(boundary).engines(new HashSet<>()).references(new HashSet<>()).build();
    }


    /** @return {@code true} if any intersecting node is an instance of the given class */
    public boolean occursInNodeOfType(Class<? extends SemanticNode> clazz) {

        return intersectingNodes.stream().anyMatch(clazz::isInstance);
    }


    /** @return {@code true} if any intersecting node equals the given node */
    public boolean occursInNode(SemanticNode semanticNode) {

        return intersectingNodes.stream().anyMatch(node -> node.equals(semanticNode));
    }


    /** @return {@code true} if this entity's type equals the given type */
    public boolean isType(String type) {

        return this.type.equals(type);
    }


    /** @return {@code true} if this entity's type is contained in the given list */
    public boolean isAnyType(List<String> types) {

        return types.contains(type);
    }


    /** Registers a node this entity intersects with. */
    public void addIntersectingNode(SemanticNode containingNode) {

        intersectingNodes.add(containingNode);
    }


    /**
     * Removes this entity from the entity sets of all intersecting nodes and pages, clears its
     * own graph references and marks it as removed and ignored.
     */
    public void removeFromGraph() {

        intersectingNodes.forEach(node -> node.getEntities().remove(this));
        pages.forEach(page -> page.getEntities().remove(this));
        intersectingNodes = new LinkedList<>();
        deepestFullyContainingNode = null;
        pages = new HashSet<>();
        removed = true;
        ignored = true;
    }


    /** Appends a rule number to the matched rules. */
    public void addMatchedRule(int ruleNumber) {

        matchedRules.add(ruleNumber);
    }


    /** @return the most recently added rule number, or 0 if no rule matched */
    public int getMatchedRule() {

        if (matchedRules.isEmpty()) {
            return 0;
        }
        return matchedRules.getLast();
    }


    /**
     * Returns the positions of this entity, one per page, computing and caching them on first use.
     * <p>
     * The position on the lowest-numbered page carries the plain id; positions on other pages
     * carry the id suffixed with {@code "-" + pageNumber}.
     * <p>
     * The computation dereferences {@code deepestFullyContainingNode}, which is {@code null}
     * after {@link #removeFromGraph()}.
     *
     * @return the positions per page
     * @throws RuntimeException if the text block yields no positions on any page
     */
    public List<RedactionPosition> getRedactionPositionsPerPage() {

        if (redactionPositionsPerPage == null || redactionPositionsPerPage.isEmpty()) {
            Map<Page, List<Rectangle2D>> rectanglesPerLinePerPage = deepestFullyContainingNode.getTextBlock().getPositionsPerPage(boundary);

            Page firstPage = rectanglesPerLinePerPage.keySet()
                    .stream()
                    .min(Comparator.comparingInt(Page::getNumber))
                    .orElseThrow(() -> new RuntimeException("No Positions found on any page!"));

            String id = IdBuilder.buildId(pages, rectanglesPerLinePerPage.values().stream().flatMap(Collection::stream).toList());
            redactionPositionsPerPage = rectanglesPerLinePerPage.entrySet().stream().map(entry -> buildRedactionPosition(firstPage, id, entry)).toList();
        }
        return redactionPositionsPerPage;
    }


    // The first page keeps the plain id, every other page gets the id suffixed with its page number.
    private static RedactionPosition buildRedactionPosition(Page firstPage, String id, Map.Entry<Page, List<Rectangle2D>> entry) {

        if (entry.getKey().equals(firstPage)) {
            return new RedactionPosition(id, entry.getKey(), entry.getValue());
        } else {
            return new RedactionPosition(id + "-" + entry.getKey().getNumber(), entry.getKey(), entry.getValue());
        }
    }


    /** @return {@code true} if this entity's boundary lies within the other entity's boundary */
    public boolean containedBy(RedactionEntity redactionEntity) {

        return this.boundary.containedBy(redactionEntity.getBoundary());
    }


    /** @return {@code true} if the other entity's boundary lies within this entity's boundary */
    public boolean contains(RedactionEntity redactionEntity) {

        return this.boundary.contains(redactionEntity.getBoundary());
    }


    /** @return {@code true} if both boundaries overlap */
    public boolean intersects(RedactionEntity redactionEntity) {

        return this.boundary.intersects(redactionEntity.getBoundary());
    }


    /** Adds an engine to the set of engines. */
    public void addEngine(Engine engine) {

        engines.add(engine);
    }


    /** Adds all given engines to the set of engines. */
    public void addEngines(Set<Engine> engines) {

        this.engines.addAll(engines);
    }


    /** Adds a referenced entity. */
    public void addReference(RedactionEntity reference) {

        references.add(reference);
    }


    /** Adds all given referenced entities. */
    public void addReferences(List<RedactionEntity> references) {

        this.references.addAll(references);
    }


    /** @return {@code true} if any of this entity's positions carries the given id */
    public boolean matchesAnnotationId(String manualRedactionId) {

        return getRedactionPositionsPerPage().stream().anyMatch(entityPosition -> entityPosition.getId().equals(manualRedactionId));
    }


    /**
     * Renders value, boundary, page numbers, type and entity type. An entity without pages
     * (e.g. after {@link #removeFromGraph()}) is rendered with an empty {@code pages[]} list.
     */
    @Override
    public String toString() {

        // joining instead of "append separator, then delete the last two chars": the latter
        // cut into the "pages[" label whenever there were no pages
        String pageNumbers = String.join(", ", pages.stream().map(page -> String.valueOf(page.getNumber())).toList());
        return "Entity[\"" + value + "\", " + boundary + ", pages[" + pageNumbers + "], type = \"" + type + "\", EntityType." + entityType + "]";
    }

}

View File

@ -0,0 +1,24 @@
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.entity;
import java.awt.geom.Rectangle2D;
import java.util.List;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Page;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.experimental.FieldDefaults;
/**
 * Position of an entity on a single page: an id, the page and the rectangles covering the entity
 * on that page.
 */
@Data
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class RedactionPosition {
// id of this position; the only final field, so it cannot be changed after construction
final String id;
// page the rectangles are located on
Page page;
// Each entry in this list corresponds to an entry in the redaction log. This means that
// a single entity might be represented by multiple redaction log entries.
List<Rectangle2D> rectanglePerLine;
}

View File

@ -0,0 +1,119 @@
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes;
import java.awt.geom.Rectangle2D;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import com.iqser.red.service.redaction.v1.server.exception.NotFoundException;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.DocumentTree;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.entity.RedactionEntity;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.textblock.TextBlock;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.textblock.TextBlockCollector;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
/**
 * Root node of the document tree. Its tree id is always the empty list.
 */
@Data
@Builder
@AllArgsConstructor
@NoArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class Document implements GenericSemanticNode {

    Set<Page> pages;
    DocumentTree documentTree;
    Integer numberOfPages;
    // lazily built by getTextBlock()
    TextBlock textBlock;
    @Builder.Default
    Set<RedactionEntity> entities = new HashSet<>();


    @Override
    public NodeType getType() {

        return NodeType.DOCUMENT;
    }


    /**
     * @return the text block of the whole document; collected from all leaf text blocks in
     * tree order on first access and cached afterwards
     */
    public TextBlock getTextBlock() {

        if (textBlock == null) {
            textBlock = streamTerminalTextBlocksInOrder().collect(new TextBlockCollector());
        }
        return textBlock;
    }


    /** @return the direct children of type {@link NodeType#SECTION} */
    public List<Section> getMainSections() {

        return streamChildrenOfType(NodeType.SECTION).map(node -> (Section) node).collect(Collectors.toList());
    }


    /** @return the leaf text blocks of all leaf nodes in tree order */
    public Stream<TextBlock> streamTerminalTextBlocksInOrder() {

        return streamAllNodes().filter(SemanticNode::isLeaf).map(SemanticNode::getLeafTextBlock);
    }


    /** @return always the empty list, as the document is the root of the tree */
    @Override
    public List<Integer> getTreeId() {

        return Collections.emptyList();
    }


    /**
     * Not supported: the tree id of the document is fixed.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public void setTreeId(List<Integer> tocId) {

        throw new UnsupportedOperationException("Document is always the root of the document tree");
    }


    /**
     * @return the first headline found among the sub nodes
     * @throws NotFoundException if the document contains no headline
     */
    @Override
    public Headline getHeadline() {

        return streamAllSubNodesOfType(NodeType.HEADLINE).map(node -> (Headline) node).findFirst().orElseThrow(() -> new NotFoundException("No Headlines found in this document!"));
    }


    // All nodes of the tree, including this document, in tree order.
    private Stream<SemanticNode> streamAllNodes() {

        return documentTree.allEntriesInOrder().map(DocumentTree.Entry::getNode);
    }


    /** @return all sub nodes of type {@link NodeType#IMAGE} */
    public Stream<Image> streamAllImages() {

        return streamAllSubNodesOfType(NodeType.IMAGE).map(node -> (Image) node);
    }


    @Override
    public String toString() {

        return NodeType.DOCUMENT + ": " + this.getTextBlock().buildSummary();
    }


    /** @return for every page a rectangle from the origin spanning the full page width and height */
    @Override
    public Map<Page, Rectangle2D> getBBox() {

        Map<Page, Rectangle2D> bBox = new HashMap<>();
        for (Page page : pages) {
            bBox.put(page, new Rectangle2D.Double(0, 0, page.getWidth(), page.getHeight()));
        }
        return bBox;
    }

}

View File

@ -0,0 +1,64 @@
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.DocumentTree;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.entity.RedactionEntity;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.textblock.TextBlock;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
/**
 * Leaf node holding the text block of a page footer.
 */
@Data
@Builder
@AllArgsConstructor
@NoArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class Footer implements GenericSemanticNode {

    List<Integer> treeId;
    TextBlock leafTextBlock;

    @EqualsAndHashCode.Exclude
    DocumentTree documentTree;

    @Builder.Default
    @EqualsAndHashCode.Exclude
    Set<RedactionEntity> entities = new HashSet<>();


    /** @return always {@code true}, a footer has no children */
    @Override
    public boolean isLeaf() {

        return true;
    }


    @Override
    public NodeType getType() {

        return NodeType.FOOTER;
    }


    /** @return the leaf text block of this footer */
    @Override
    public TextBlock getTextBlock() {

        return leafTextBlock;
    }


    /** @return tree id, node type and a summary of the text, separated by {@code ": "} */
    @Override
    public String toString() {

        return String.format("%s: %s: %s", treeId, NodeType.FOOTER, leafTextBlock.buildSummary());
    }

}

View File

@ -0,0 +1,5 @@
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes;
/**
 * Marker sub-interface of {@link SemanticNode}; it declares no members of its own.
 * NOTE(review): presumably used to distinguish general-purpose nodes from tables and table
 * cells in method signatures - confirm against the implementing classes.
 */
public interface GenericSemanticNode extends SemanticNode {
}

View File

@ -0,0 +1,64 @@
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.DocumentTree;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.entity.RedactionEntity;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.textblock.TextBlock;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
/**
 * Leaf node holding the text block of a page header.
 */
@Data
@Builder
@AllArgsConstructor
@NoArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class Header implements GenericSemanticNode {

    List<Integer> treeId;
    TextBlock leafTextBlock;

    @EqualsAndHashCode.Exclude
    DocumentTree documentTree;

    @Builder.Default
    @EqualsAndHashCode.Exclude
    Set<RedactionEntity> entities = new HashSet<>();


    @Override
    public NodeType getType() {

        return NodeType.HEADER;
    }


    /** @return the leaf text block of this header */
    @Override
    public TextBlock getTextBlock() {

        return leafTextBlock;
    }


    /** @return always {@code true}, a header has no children */
    @Override
    public boolean isLeaf() {

        return true;
    }


    /** @return tree id, node type and a summary of the text, separated by {@code ": "} */
    @Override
    public String toString() {

        return String.format("%s: %s: %s", treeId, NodeType.HEADER, leafTextBlock.buildSummary());
    }

}

View File

@ -0,0 +1,71 @@
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.DocumentTree;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.entity.RedactionEntity;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.textblock.TextBlock;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
/**
 * Leaf node holding the text block of a headline.
 */
@Data
@Builder
@AllArgsConstructor
@NoArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class Headline implements GenericSemanticNode {

    List<Integer> treeId;
    TextBlock leafTextBlock;

    @EqualsAndHashCode.Exclude
    DocumentTree documentTree;

    @Builder.Default
    @EqualsAndHashCode.Exclude
    Set<RedactionEntity> entities = new HashSet<>();


    /** @return always {@code true}, a headline has no children */
    @Override
    public boolean isLeaf() {

        return true;
    }


    @Override
    public NodeType getType() {

        return NodeType.HEADLINE;
    }


    /** @return this node itself, as it is a headline */
    @Override
    public Headline getHeadline() {

        return this;
    }


    /** @return the leaf text block of this headline */
    @Override
    public TextBlock getTextBlock() {

        return leafTextBlock;
    }


    /** @return tree id, node type and a summary of the text, separated by {@code ": "} */
    @Override
    public String toString() {

        return String.format("%s: %s: %s", treeId, NodeType.HEADLINE, leafTextBlock.buildSummary());
    }

}

View File

@ -0,0 +1,94 @@
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes;
import java.awt.geom.Rectangle2D;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.DocumentTree;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.entity.RedactionEntity;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.textblock.TextBlock;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.textblock.TextBlockCollector;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
/**
 * Node representing an image located at a fixed position on a single page.
 */
@Data
@Builder
@AllArgsConstructor
@NoArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class Image implements GenericSemanticNode {

    List<Integer> treeId;
    String id;

    ImageType imageType;
    boolean transparent;
    Rectangle2D position;

    boolean redaction;
    boolean ignored;
    @Builder.Default
    String redactionReason = "";
    @Builder.Default
    String legalBasis = "";
    @Builder.Default
    int matchedRule = -1;

    @EqualsAndHashCode.Exclude
    Page page;

    @EqualsAndHashCode.Exclude
    DocumentTree documentTree;

    @Builder.Default
    @EqualsAndHashCode.Exclude
    Set<RedactionEntity> entities = new HashSet<>();


    @Override
    public NodeType getType() {

        return NodeType.IMAGE;
    }


    /** @return a set containing only the page of this image */
    @Override
    public Set<Page> getPages() {

        return Collections.singleton(page);
    }


    /** @return a map with the image position as bounding box on the image's page */
    @Override
    public Map<Page, Rectangle2D> getBBox() {

        Map<Page, Rectangle2D> positionByPage = new HashMap<>();
        positionByPage.put(page, position);
        return positionByPage;
    }


    /** @return the leaf text blocks of all leaf sub nodes, combined into one text block */
    @Override
    public TextBlock getTextBlock() {

        return streamAllSubNodes()
                .filter(node -> node.isLeaf())
                .map(node -> node.getLeafTextBlock())
                .collect(new TextBlockCollector());
    }


    /** @return tree id, node type, image type and position */
    @Override
    public String toString() {

        return String.format("%s: %s: %s %s", treeId, NodeType.IMAGE, imageType.toString(), position);
    }

}

View File

@ -0,0 +1,21 @@
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes;
/**
 * Classification of an image.
 */
public enum ImageType {
    LOGO,
    FORMULA,
    SIGNATURE,
    OTHER,
    OCR;

    /**
     * Maps a type name to its constant, ignoring case.
     *
     * @param imageType type name, must not be {@code null}
     * @return the matching constant, or {@link #OTHER} for any unknown name
     */
    public static ImageType fromString(String imageType) {

        // Locale.ROOT keeps the mapping independent of the JVM default locale
        // (with a Turkish default locale "SIGNATURE" would otherwise lower-case to "sıgnature").
        return switch (imageType.toLowerCase(java.util.Locale.ROOT)) {
            case "logo" -> ImageType.LOGO;
            case "formula" -> ImageType.FORMULA;
            case "signature" -> ImageType.SIGNATURE;
            case "ocr" -> ImageType.OCR;
            default -> ImageType.OTHER;
        };
    }
}

View File

@ -0,0 +1,21 @@
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes;
import java.util.Locale;
/**
 * Types of nodes in the document tree.
 */
public enum NodeType {
    DOCUMENT,
    SECTION,
    HEADLINE,
    PARAGRAPH,
    TABLE,
    TABLE_CELL,
    IMAGE,
    HEADER,
    FOOTER;

    /**
     * @return the constant name with its first character unchanged and the rest lower-cased,
     * e.g. {@code "Table_cell"} for {@link #TABLE_CELL}
     */
    @Override
    public String toString() {

        String constantName = name();
        String head = constantName.substring(0, 1);
        String tail = constantName.substring(1).toLowerCase(Locale.ROOT);
        return head + tail;
    }
}

View File

@ -0,0 +1,87 @@
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationPage;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.entity.RedactionEntity;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.textblock.TextBlock;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.textblock.TextBlockCollector;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.EqualsAndHashCode;
import lombok.Getter;
import lombok.Setter;
import lombok.experimental.FieldDefaults;
/**
 * A page of the document. Two pages are equal if and only if their page numbers are equal.
 * <p>
 * Since {@code number} is settable, changing it while the page is stored in a hash-based
 * collection changes its hash code.
 */
@Getter
@Setter
@Builder
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class Page {

    Integer number;
    Integer height;
    Integer width;
    Integer rotation;

    @EqualsAndHashCode.Exclude
    List<SemanticNode> mainBody;
    @EqualsAndHashCode.Exclude
    Header header;
    @EqualsAndHashCode.Exclude
    Footer footer;

    @Builder.Default
    @EqualsAndHashCode.Exclude
    Set<RedactionEntity> entities = new HashSet<>();

    @Builder.Default
    @EqualsAndHashCode.Exclude
    Set<Image> images = new HashSet<>();


    /**
     * Creates a page from a classification page. Height and width are truncated to integers;
     * the main body starts out empty.
     *
     * @param classificationPage source page
     * @return the new page
     */
    public static Page fromClassificationPage(ClassificationPage classificationPage) {

        return Page.builder()
                .height((int) classificationPage.getPageHeight())
                .width((int) classificationPage.getPageWidth())
                .number(classificationPage.getPageNumber())
                .rotation(classificationPage.getRotation())
                .mainBody(new LinkedList<>())
                .build();
    }


    /** @return the leaf text blocks of all leaf nodes in the main body, combined into one text block */
    public TextBlock getMainBodyTextBlock() {

        return mainBody.stream().filter(SemanticNode::isLeaf).map(SemanticNode::getLeafTextBlock).collect(new TextBlockCollector());
    }


    @Override
    public String toString() {

        return String.valueOf(number);
    }


    /** @return the page number, or 0 if no number is set */
    @Override
    public int hashCode() {

        return java.util.Objects.hashCode(number);
    }


    /** @return {@code true} if the other object is a page with the same page number */
    @Override
    public boolean equals(Object o) {

        if (this == o) {
            return true;
        }
        if (!(o instanceof Page)) {
            return false;
        }
        // compare the numbers directly instead of going through hashCode(); also safe for unset numbers
        return java.util.Objects.equals(number, ((Page) o).number);
    }

}

View File

@ -0,0 +1,62 @@
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.DocumentTree;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.entity.RedactionEntity;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.textblock.TextBlock;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults;
/**
 * Leaf node holding the text block of a paragraph.
 */
@Data
@Builder
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class Paragraph implements GenericSemanticNode {

    List<Integer> treeId;
    TextBlock leafTextBlock;

    @EqualsAndHashCode.Exclude
    DocumentTree documentTree;

    @Builder.Default
    @EqualsAndHashCode.Exclude
    Set<RedactionEntity> entities = new HashSet<>();


    /** @return always {@code true}, a paragraph has no children */
    @Override
    public boolean isLeaf() {

        return true;
    }


    @Override
    public NodeType getType() {

        return NodeType.PARAGRAPH;
    }


    /** @return the leaf text block of this paragraph */
    @Override
    public TextBlock getTextBlock() {

        return leafTextBlock;
    }


    /** @return tree id, node type and a summary of the text, separated by {@code ": "} */
    @Override
    public String toString() {

        return String.format("%s: %s: %s", treeId, NodeType.PARAGRAPH, leafTextBlock.buildSummary());
    }

}

Some files were not shown because too many files have changed in this diff Show More