RED-6009: Document Tree Structure
* squashed commits
This commit is contained in:
parent
a6a6fd8180
commit
1f9e151092
@ -23,6 +23,7 @@
|
||||
|
||||
<properties>
|
||||
<pdfbox.version>2.0.24</pdfbox.version>
|
||||
<lombok.version>1.18.26</lombok.version>
|
||||
</properties>
|
||||
|
||||
|
||||
@ -88,5 +89,26 @@
|
||||
</plugin>
|
||||
</plugins>
|
||||
</pluginManagement>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>org.projectlombok</groupId>
|
||||
<artifactId>lombok-maven-plugin</artifactId>
|
||||
<version>1.18.20.0</version>
|
||||
<executions>
|
||||
<execution>
|
||||
<id>delombok</id>
|
||||
<phase>generate-sources</phase>
|
||||
<goals>
|
||||
<goal>delombok</goal>
|
||||
</goals>
|
||||
<configuration>
|
||||
<addOutputDirectory>false</addOutputDirectory>
|
||||
<sourceDirectory>src/main/java</sourceDirectory>
|
||||
<outputDirectory>${delomboked.sources}</outputDirectory>
|
||||
</configuration>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
</project>
|
||||
|
||||
@ -39,7 +39,6 @@
|
||||
</exclusion>
|
||||
</exclusions>
|
||||
</dependency>
|
||||
|
||||
</dependencies>
|
||||
|
||||
<build>
|
||||
|
||||
@ -12,7 +12,7 @@
|
||||
<artifactId>redaction-service-server-v1</artifactId>
|
||||
|
||||
<properties>
|
||||
<drools.version>7.73.0.Final</drools.version>
|
||||
<drools.version>8.37.0.Final</drools.version>
|
||||
<kie.version>7.73.0.Final</kie.version>
|
||||
<locationtech.version>1.19.0</locationtech.version>
|
||||
<javaassist.version>3.29.2-GA</javaassist.version>
|
||||
@ -64,7 +64,12 @@
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.drools</groupId>
|
||||
<artifactId>drools-core</artifactId>
|
||||
<artifactId>drools-engine</artifactId>
|
||||
<version>${drools.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.drools</groupId>
|
||||
<artifactId>drools-mvel</artifactId>
|
||||
<version>${drools.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
@ -198,5 +203,4 @@
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
|
||||
</project>
|
||||
|
||||
@ -1,26 +0,0 @@
|
||||
package com.iqser.red.service.redaction.v1.server.classification.model;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
@Data
|
||||
@AllArgsConstructor
|
||||
public class Footer {
|
||||
|
||||
private List<TextBlock> textBlocks;
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public SearchableText getSearchableText() {
|
||||
|
||||
SearchableText searchableText = new SearchableText();
|
||||
textBlocks.forEach(block -> searchableText.addAll(block.getSequences()));
|
||||
return searchableText;
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,26 +0,0 @@
|
||||
package com.iqser.red.service.redaction.v1.server.classification.model;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
@Data
|
||||
@AllArgsConstructor
|
||||
public class Header {
|
||||
|
||||
private List<TextBlock> textBlocks;
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public SearchableText getSearchableText() {
|
||||
|
||||
SearchableText searchableText = new SearchableText();
|
||||
textBlocks.forEach(block -> searchableText.addAll(block.getSequences()));
|
||||
return searchableText;
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,8 +0,0 @@
|
||||
package com.iqser.red.service.redaction.v1.server.classification.model;
|
||||
|
||||
public enum Orientation {
|
||||
|
||||
NONE,
|
||||
LEFT,
|
||||
RIGHT
|
||||
}
|
||||
@ -1,65 +0,0 @@
|
||||
package com.iqser.red.service.redaction.v1.server.classification.model;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
@Data
|
||||
@NoArgsConstructor
|
||||
public class Paragraph implements Comparable {
|
||||
|
||||
private List<AbstractTextContainer> pageBlocks = new ArrayList<>();
|
||||
private List<PdfImage> images = new ArrayList<>();
|
||||
private String headline;
|
||||
|
||||
|
||||
public SearchableText getSearchableText() {
|
||||
|
||||
SearchableText searchableText = new SearchableText();
|
||||
pageBlocks.forEach(block -> {
|
||||
if (block instanceof TextBlock) {
|
||||
searchableText.addAll(((TextBlock) block).getSequences());
|
||||
}
|
||||
});
|
||||
return searchableText;
|
||||
}
|
||||
|
||||
|
||||
public List<Table> getTables() {
|
||||
|
||||
List<Table> tables = new ArrayList<>();
|
||||
pageBlocks.forEach(block -> {
|
||||
if (block instanceof Table) {
|
||||
tables.add((Table) block);
|
||||
}
|
||||
});
|
||||
return tables;
|
||||
}
|
||||
|
||||
|
||||
public List<TextBlock> getTextBlocks() {
|
||||
|
||||
List<TextBlock> textBlocks = new ArrayList<>();
|
||||
pageBlocks.forEach(block -> {
|
||||
if (block instanceof TextBlock) {
|
||||
textBlocks.add((TextBlock) block);
|
||||
}
|
||||
});
|
||||
return textBlocks;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public int compareTo(Object o) {
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,64 +0,0 @@
|
||||
package com.iqser.red.service.redaction.v1.server.classification.model;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.section.SectionArea;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.CellValue;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Image;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
public class SectionText {
|
||||
|
||||
private int sectionNumber;
|
||||
private String text;
|
||||
|
||||
private boolean isTable;
|
||||
private String headline;
|
||||
|
||||
@Builder.Default
|
||||
private List<SectionArea> sectionAreas = new ArrayList<>();
|
||||
@Builder.Default
|
||||
private Set<Image> images = new HashSet<>();
|
||||
@Builder.Default
|
||||
private List<TextBlock> textBlocks = new ArrayList<>();
|
||||
@Builder.Default
|
||||
private Map<String, CellValue> tabularData = new HashMap<>();
|
||||
@Builder.Default
|
||||
private List<Integer> cellStarts = new ArrayList<>();
|
||||
|
||||
|
||||
public void setTabularData(Map<String, CellValue> tabularData) {
|
||||
|
||||
tabularData.remove(null);
|
||||
this.tabularData = tabularData;
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public SearchableText getSearchableText() {
|
||||
|
||||
SearchableText searchableText = new SearchableText();
|
||||
textBlocks.forEach(block -> {
|
||||
if (block != null) {
|
||||
searchableText.addAll(block.getSequences());
|
||||
}
|
||||
});
|
||||
return searchableText;
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,19 +0,0 @@
|
||||
package com.iqser.red.service.redaction.v1.server.classification.model;
|
||||
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
@Data
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
public class Text {
|
||||
|
||||
private int numberOfPages;
|
||||
private List<SectionText> sectionTexts = new ArrayList<>();
|
||||
|
||||
}
|
||||
@ -1,26 +0,0 @@
|
||||
package com.iqser.red.service.redaction.v1.server.classification.model;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
@Data
|
||||
@AllArgsConstructor
|
||||
public class UnclassifiedText {
|
||||
|
||||
private List<TextBlock> textBlocks;
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public SearchableText getSearchableText() {
|
||||
|
||||
SearchableText searchableText = new SearchableText();
|
||||
textBlocks.forEach(block -> searchableText.addAll(block.getSequences()));
|
||||
return searchableText;
|
||||
}
|
||||
|
||||
}
|
||||
@ -10,7 +10,7 @@ import lombok.NoArgsConstructor;
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@NoArgsConstructor
|
||||
public class EntityRecogintionEntity {
|
||||
public class EntityRecognitionEntity {
|
||||
|
||||
private String value;
|
||||
private int startOffset;
|
||||
@ -4,7 +4,6 @@ import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
@ -12,8 +11,8 @@ import lombok.NoArgsConstructor;
|
||||
@Data
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
public class NerEntities {
|
||||
public class NerEntitiesModel {
|
||||
|
||||
private Map<Integer, List<EntityRecogintionEntity>> data = new HashMap<>();
|
||||
private Map<Integer, List<EntityRecognitionEntity>> data = new HashMap<>();
|
||||
|
||||
}
|
||||
@ -1,13 +1,13 @@
|
||||
package com.iqser.red.service.redaction.v1.server.controller;
|
||||
|
||||
import org.springframework.web.bind.annotation.RestController;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.model.RuleBuilderModel;
|
||||
import com.iqser.red.service.redaction.v1.resources.RuleBuilderResource;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.rulebuilder.RuleBuilderModelService;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.service.RuleBuilderModelService;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
|
||||
import org.springframework.web.bind.annotation.RestController;
|
||||
|
||||
@RestController
|
||||
@RequiredArgsConstructor
|
||||
public class RuleBuilderController implements RuleBuilderResource {
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
package com.iqser.red.service.redaction.v1.server.segmentation;
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
@ -10,11 +11,10 @@ import org.springframework.stereotype.Service;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.dossiertemplate.dossier.file.FileType;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.ImageType;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.RedRectangle2D;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.image.ImageServiceResponse;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.image.ImageServiceResponse;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationPage;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.image.ClassifiedImage;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.ImageType;
|
||||
import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
@ -22,26 +22,26 @@ import lombok.SneakyThrows;
|
||||
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
public class ImageService {
|
||||
public class ImageServiceResponseAdapter {
|
||||
|
||||
private final ObjectMapper objectMapper;
|
||||
private final RedactionStorageService redactionStorageService;
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public Map<Integer, List<PdfImage>> convertImages(String dossierId, String fileId) {
|
||||
public Map<Integer, List<ClassifiedImage>> convertImages(String dossierId, String fileId) {
|
||||
|
||||
var imageClassificationStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(dossierId, fileId, FileType.IMAGE_INFO));
|
||||
|
||||
ImageServiceResponse imageServiceResponse = objectMapper.readValue(imageClassificationStream, ImageServiceResponse.class);
|
||||
|
||||
Map<Integer, List<PdfImage>> images = new HashMap<>();
|
||||
Map<Integer, List<ClassifiedImage>> images = new HashMap<>();
|
||||
imageServiceResponse.getData().forEach(imageMetadata -> {
|
||||
var classification = imageMetadata.getFilters().isAllPassed() ? ImageType.valueOf(imageMetadata.getClassification()
|
||||
.getLabel()
|
||||
.toUpperCase(Locale.ROOT)) : ImageType.OTHER;
|
||||
images.computeIfAbsent(imageMetadata.getPosition().getPageNumber(), x -> new ArrayList<>())
|
||||
.add(new PdfImage(new RedRectangle2D(imageMetadata.getPosition().getX1(),
|
||||
.add(new ClassifiedImage(new Rectangle2D.Double(imageMetadata.getPosition().getX1(),
|
||||
imageMetadata.getPosition().getY1(),
|
||||
imageMetadata.getGeometry().getWidth(),
|
||||
imageMetadata.getGeometry().getHeight()), classification, imageMetadata.isAlpha(), imageMetadata.getPosition().getPageNumber()));
|
||||
@ -53,7 +53,7 @@ public class ImageService {
|
||||
.getLabel()
|
||||
.toUpperCase(Locale.ROOT)) : ImageType.OTHER;
|
||||
images.computeIfAbsent(imageMetadata.getPosition().getPageNumber(), x -> new ArrayList<>())
|
||||
.add(new PdfImage(new RedRectangle2D(imageMetadata.getPosition().getX1(),
|
||||
.add(new ClassifiedImage(new Rectangle2D.Double(imageMetadata.getPosition().getX1(),
|
||||
imageMetadata.getPosition().getY1(),
|
||||
imageMetadata.getGeometry().getWidth(),
|
||||
imageMetadata.getGeometry().getHeight()), classification, imageMetadata.isAlpha(), imageMetadata.getPosition().getPageNumber()));
|
||||
@ -63,7 +63,7 @@ public class ImageService {
|
||||
}
|
||||
|
||||
|
||||
public void findOcr(Page page) {
|
||||
public void findOcr(ClassificationPage page) {
|
||||
|
||||
page.getImages().forEach(image -> {
|
||||
if (image.getImageType().equals(ImageType.OTHER)) {
|
||||
@ -0,0 +1,164 @@
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter;
|
||||
|
||||
import static java.lang.String.format;
|
||||
import static java.util.stream.Collectors.groupingBy;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Collection;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.RedactionLog;
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.RedactionLogEntry;
|
||||
import com.iqser.red.service.redaction.v1.server.exception.NotFoundException;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.entity.EntityType;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.entity.RedactionEntity;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.entity.RedactionPosition;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Page;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.SemanticNode;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.services.EntityCreationService;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.dictionary.SearchImplementation;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
public class RedactionLogEntryAdapter {
|
||||
|
||||
private static final double MATCH_THRESHOLD = 1;
|
||||
private final EntityCreationService entityCreationService;
|
||||
|
||||
|
||||
public Stream<RedactionEntity> toRedactionEntity(RedactionLog redactionLog, SemanticNode node) {
|
||||
|
||||
List<Integer> pageNumbers = redactionLog.getRedactionLogEntry().stream().flatMap(entry -> entry.getPositions().stream().map(Rectangle::getPage)).distinct().toList();
|
||||
if (!pageNumbers.stream().allMatch(node::isOnPage)) {
|
||||
throw new IllegalArgumentException(format("SemanticNode %s does not contain these pages %s present in the redaction log",
|
||||
node,
|
||||
pageNumbers.stream().filter(pageNumber -> !node.isOnPage(pageNumber)).toList()));
|
||||
}
|
||||
Set<String> entryValues = redactionLog.getRedactionLogEntry().stream().map(RedactionLogEntry::getValue).map(String::toLowerCase).collect(Collectors.toSet());
|
||||
SearchImplementation searchImplementation = new SearchImplementation(entryValues, true);
|
||||
|
||||
Map<String, List<RedactionEntity>> tempEntitiesByValue = findAllPossibleEntitiesAndGroupByValueIgnoringCase(node, searchImplementation);
|
||||
|
||||
assert allValuesFound(tempEntitiesByValue, entryValues);
|
||||
|
||||
List<RedactionEntity> entities = redactionLog.getRedactionLogEntry()
|
||||
.stream()
|
||||
.map(entry -> findClosestRedactionEntity(entry, tempEntitiesByValue.get(entry.getValue().toLowerCase(Locale.ROOT)), node))
|
||||
.toList();
|
||||
tempEntitiesByValue.values().stream().flatMap(Collection::stream).forEach(RedactionEntity::removeFromGraph);
|
||||
return entities.stream();
|
||||
}
|
||||
|
||||
|
||||
private static boolean allValuesFound(Map<String, List<RedactionEntity>> entitiesByValue, Set<String> entryValues) {
|
||||
|
||||
return entitiesByValue.keySet().equals(entryValues);
|
||||
}
|
||||
|
||||
|
||||
private Map<String, List<RedactionEntity>> findAllPossibleEntitiesAndGroupByValueIgnoringCase(SemanticNode node, SearchImplementation searchImplementation) {
|
||||
|
||||
return searchImplementation.getBoundaries(node.getTextBlock(), node.getBoundary())
|
||||
.stream()
|
||||
.map(boundary -> entityCreationService.byBoundary(boundary, "temp", EntityType.ENTITY, node))
|
||||
.collect(groupingBy(entity -> entity.getValue().toLowerCase(Locale.ROOT)));
|
||||
}
|
||||
|
||||
|
||||
private RedactionEntity findClosestRedactionEntity(RedactionLogEntry redactionLogEntry, List<RedactionEntity> entitiesWithSameValue, SemanticNode node) {
|
||||
|
||||
RedactionEntity closestEntity = entitiesWithSameValue.stream()
|
||||
.filter(entity -> pagesMatch(entity, redactionLogEntry))
|
||||
.min(Comparator.comparingDouble(entity -> calculateMinDistance(redactionLogEntry, entity)))
|
||||
.orElseThrow(() -> new NotFoundException(format("No entity with similar position found for %s", redactionLogEntry)));
|
||||
|
||||
double distance = calculateMinDistance(redactionLogEntry, closestEntity);
|
||||
if (distance > MATCH_THRESHOLD) {
|
||||
throw new NotFoundException(format("Distance to closest found entity is %.2f for \n%s \n%s",
|
||||
distance,
|
||||
redactionLogEntry.getPositions(),
|
||||
closestEntity.getRedactionPositionsPerPage()));
|
||||
}
|
||||
|
||||
return createCorrectEntity(redactionLogEntry, node, closestEntity);
|
||||
}
|
||||
|
||||
|
||||
private RedactionEntity createCorrectEntity(RedactionLogEntry redactionLogEntry, SemanticNode node, RedactionEntity closestEntity) {
|
||||
|
||||
RedactionEntity correctEntity = entityCreationService.byBoundary(closestEntity.getBoundary(),
|
||||
redactionLogEntry.getType(),
|
||||
redactionLogEntry.isRecommendation() ? EntityType.RECOMMENDATION : EntityType.ENTITY,
|
||||
node);
|
||||
correctEntity.setLegalBasis(redactionLogEntry.getLegalBasis());
|
||||
correctEntity.setRedactionReason(redactionLogEntry.getReason());
|
||||
correctEntity.addMatchedRule(redactionLogEntry.getMatchedRule());
|
||||
correctEntity.setRedaction(redactionLogEntry.isRedacted());
|
||||
correctEntity.setDictionaryEntry(redactionLogEntry.isDictionaryEntry());
|
||||
correctEntity.setDossierDictionaryEntry(redactionLogEntry.isDossierDictionaryEntry());
|
||||
return correctEntity;
|
||||
}
|
||||
|
||||
|
||||
private static boolean pagesMatch(RedactionEntity entity, RedactionLogEntry redactionLogEntry) {
|
||||
|
||||
Set<Integer> entityPageNumbers = entity.getRedactionPositionsPerPage().stream().map(RedactionPosition::getPage).map(Page::getNumber).collect(Collectors.toSet());
|
||||
Set<Integer> redactionLogEntryPageNumbers = redactionLogEntry.getPositions().stream().map(Rectangle::getPage).collect(Collectors.toSet());
|
||||
return entityPageNumbers.equals(redactionLogEntryPageNumbers);
|
||||
}
|
||||
|
||||
|
||||
private double calculateMinDistance(RedactionLogEntry redactionLogEntry, RedactionEntity entity) {
|
||||
|
||||
if (redactionLogEntry.getPositions().size() != countRectangles(entity)) {
|
||||
return Double.MAX_VALUE;
|
||||
}
|
||||
return redactionLogEntry.getPositions().stream().mapToDouble(redactionLogEntryRectangle -> calculateMinDistancePerRectangle(entity, redactionLogEntryRectangle)).sum();
|
||||
}
|
||||
|
||||
|
||||
private static long countRectangles(RedactionEntity entity) {
|
||||
|
||||
return entity.getRedactionPositionsPerPage().stream().mapToLong(redactionPosition -> redactionPosition.getRectanglePerLine().size()).sum();
|
||||
}
|
||||
|
||||
|
||||
private double calculateMinDistancePerRectangle(RedactionEntity entity, Rectangle redactionLogEntryRectangle) {
|
||||
|
||||
return entity.getRedactionPositionsPerPage()
|
||||
.stream()
|
||||
.filter(redactionPosition -> redactionPosition.getPage().getNumber() == redactionLogEntryRectangle.getPage())
|
||||
.map(RedactionPosition::getRectanglePerLine)
|
||||
.flatMap(Collection::stream)
|
||||
.mapToDouble(rectangle -> calculateDistance(rectangle, toRectangle2D(redactionLogEntryRectangle)))
|
||||
.min()
|
||||
.orElse(Double.MAX_VALUE);
|
||||
}
|
||||
|
||||
|
||||
private double calculateDistance(Rectangle2D rectangle, Rectangle2D rectangle2D) {
|
||||
|
||||
return Math.abs(rectangle.getMinX() - rectangle2D.getMinX()) //
|
||||
+ Math.abs(rectangle.getMinY() - rectangle2D.getMinY()) //
|
||||
+ Math.abs(rectangle.getMaxX() - rectangle2D.getMaxX()) //
|
||||
+ Math.abs(rectangle.getMaxY() - rectangle2D.getMaxY());
|
||||
}
|
||||
|
||||
|
||||
private Rectangle2D toRectangle2D(Rectangle rectangle) {
|
||||
|
||||
return new Rectangle2D.Float(rectangle.getTopLeft().getX(), rectangle.getTopLeft().getY() + rectangle.getHeight(), rectangle.getWidth(), -rectangle.getHeight());
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,4 +1,4 @@
|
||||
package com.iqser.red.service.redaction.v1.server.segmentation;
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
@ -10,9 +10,9 @@ import org.springframework.stereotype.Service;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.dossiertemplate.dossier.file.FileType;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfTableCell;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.table.TableCells;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.table.TableServiceResponse;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.table.PdfTableCell;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.table.TableCells;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.table.TableServiceResponse;
|
||||
import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
@ -22,7 +22,7 @@ import lombok.extern.slf4j.Slf4j;
|
||||
@Slf4j
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
public class TableService {
|
||||
public class TableServiceResponseAdapter {
|
||||
|
||||
private final ObjectMapper objectMapper;
|
||||
private final RedactionStorageService redactionStorageService;
|
||||
@ -1,9 +1,8 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.model.image;
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.image;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
@ -1,5 +1,4 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.model.image;
|
||||
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.image;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@ -1,5 +1,4 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.model.image;
|
||||
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.image;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@ -1,5 +1,4 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.model.image;
|
||||
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.image;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@ -1,5 +1,4 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.model.image;
|
||||
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.image;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@ -1,5 +1,4 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.model.image;
|
||||
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.image;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.model.image;
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.image;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonAlias;
|
||||
import com.fasterxml.jackson.annotation.JsonProperty;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
@Data
|
||||
public class ImageServiceResponse {
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.model.image;
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.image;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@ -1,5 +1,4 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.model.image;
|
||||
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.image;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@ -0,0 +1,10 @@
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.image;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
public class Probability {
|
||||
|
||||
private boolean unconfident;
|
||||
|
||||
}
|
||||
@ -1,5 +1,4 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.model.table;
|
||||
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.table;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.model;
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.table;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
@ -1,5 +1,4 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.model.table;
|
||||
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.table;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@ -1,9 +1,8 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.model.table;
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.table;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
@ -1,9 +1,8 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.model.table;
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.table;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
@ -1,9 +1,8 @@
|
||||
package com.iqser.red.service.redaction.v1.server.tableextraction.model;
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Orientation;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPageBlock;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
@ -12,7 +11,7 @@ import lombok.NoArgsConstructor;
|
||||
@Data
|
||||
@AllArgsConstructor
|
||||
@NoArgsConstructor
|
||||
public abstract class AbstractTextContainer {
|
||||
public abstract class AbstractPageBlock {
|
||||
|
||||
@JsonIgnore
|
||||
protected float minX;
|
||||
@ -23,7 +22,7 @@ public abstract class AbstractTextContainer {
|
||||
@JsonIgnore
|
||||
protected float maxY;
|
||||
@JsonIgnore
|
||||
protected String classification;
|
||||
protected PageBlockType classification;
|
||||
@JsonIgnore
|
||||
protected int page;
|
||||
|
||||
@ -34,13 +33,19 @@ public abstract class AbstractTextContainer {
|
||||
public abstract String getText();
|
||||
|
||||
|
||||
public boolean containsBlock(TextBlock other) {
|
||||
public boolean isHeadline() {
|
||||
|
||||
return this instanceof TextPageBlock && this.getClassification() != null && this.getClassification().isHeadline();
|
||||
}
|
||||
|
||||
|
||||
public boolean containsBlock(TextPageBlock other) {
|
||||
|
||||
return this.minX <= other.getPdfMinX() && this.maxX >= other.getPdfMaxX() && this.minY >= other.getPdfMinY() && this.maxY <= other.getPdfMaxY();
|
||||
}
|
||||
|
||||
|
||||
public boolean contains(AbstractTextContainer other) {
|
||||
public boolean contains(AbstractPageBlock other) {
|
||||
|
||||
return this.minX <= other.minX && this.maxX >= other.maxX && this.minY >= other.minY && this.maxY <= other.maxY;
|
||||
}
|
||||
@ -66,4 +71,10 @@ public abstract class AbstractTextContainer {
|
||||
return maxX - minX;
|
||||
}
|
||||
|
||||
|
||||
public boolean intersectsY(AbstractPageBlock atc) {
|
||||
|
||||
return this.minY <= atc.getMaxY() && this.maxY >= atc.getMinY();
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,22 +1,24 @@
|
||||
package com.iqser.red.service.redaction.v1.server.classification.model;
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.section.SectionGrid;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryVersion;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.StringFrequencyCounter;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.UnclassifiedText;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.dictionary.DictionaryVersion;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
@Data
|
||||
@NoArgsConstructor
|
||||
public class Document {
|
||||
public class ClassificationDocument {
|
||||
|
||||
private List<Page> pages = new ArrayList<>();
|
||||
private List<Paragraph> paragraphs = new ArrayList<>();
|
||||
private List<Header> headers = new ArrayList<>();
|
||||
private List<Footer> footers = new ArrayList<>();
|
||||
private List<ClassificationPage> pages = new ArrayList<>();
|
||||
private List<ClassificationSection> sections = new ArrayList<>();
|
||||
private List<ClassificationHeader> headers = new ArrayList<>();
|
||||
private List<ClassificationFooter> footers = new ArrayList<>();
|
||||
private List<UnclassifiedText> unclassifiedTexts = new ArrayList<>();
|
||||
private FloatFrequencyCounter textHeightCounter = new FloatFrequencyCounter();
|
||||
private FloatFrequencyCounter fontSizeCounter = new FloatFrequencyCounter();
|
||||
@ -0,0 +1,16 @@
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPageBlock;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
@AllArgsConstructor
|
||||
public class ClassificationFooter {
|
||||
|
||||
private List<TextPageBlock> textBlocks;
|
||||
|
||||
}
|
||||
@ -0,0 +1,16 @@
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPageBlock;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
@AllArgsConstructor
|
||||
public class ClassificationHeader {
|
||||
|
||||
private List<TextPageBlock> textBlocks;
|
||||
|
||||
}
|
||||
@ -1,11 +1,11 @@
|
||||
package com.iqser.red.service.redaction.v1.server.classification.model;
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.image.ClassifiedImage;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.StringFrequencyCounter;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.NonNull;
|
||||
@ -13,12 +13,12 @@ import lombok.RequiredArgsConstructor;
|
||||
|
||||
@Data
|
||||
@RequiredArgsConstructor
|
||||
public class Page {
|
||||
public class ClassificationPage {
|
||||
|
||||
@NonNull
|
||||
private List<AbstractTextContainer> textBlocks;
|
||||
private List<AbstractPageBlock> textBlocks;
|
||||
|
||||
private List<PdfImage> images = new ArrayList<>();
|
||||
private List<ClassifiedImage> images = new ArrayList<>();
|
||||
|
||||
private Rectangle bodyTextFrame;
|
||||
|
||||
@ -35,10 +35,4 @@ public class Page {
|
||||
private float pageWidth;
|
||||
private float pageHeight;
|
||||
|
||||
|
||||
public boolean isRotated() {
|
||||
|
||||
return rotation != 0;
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,32 @@
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.image.ClassifiedImage;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table.TablePageBlock;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
@Data
|
||||
@NoArgsConstructor
|
||||
public class ClassificationSection {
|
||||
|
||||
private List<AbstractPageBlock> pageBlocks = new ArrayList<>();
|
||||
private List<ClassifiedImage> images = new ArrayList<>();
|
||||
private String headline;
|
||||
|
||||
|
||||
public List<TablePageBlock> getTables() {
|
||||
|
||||
List<TablePageBlock> tables = new ArrayList<>();
|
||||
pageBlocks.forEach(block -> {
|
||||
if (block instanceof TablePageBlock) {
|
||||
tables.add((TablePageBlock) block);
|
||||
}
|
||||
});
|
||||
return tables;
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,6 +1,4 @@
|
||||
package com.iqser.red.service.redaction.v1.server.classification.model;
|
||||
|
||||
import lombok.Getter;
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
@ -9,6 +7,8 @@ import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import lombok.Getter;
|
||||
|
||||
public class FloatFrequencyCounter {
|
||||
|
||||
@Getter
|
||||
@ -0,0 +1,8 @@
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model;
|
||||
|
||||
public enum Orientation {
|
||||
|
||||
NONE,
|
||||
LEFT,
|
||||
RIGHT
|
||||
}
|
||||
@ -0,0 +1,38 @@
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model;
|
||||
|
||||
public enum PageBlockType {
|
||||
H1,
|
||||
H2,
|
||||
H3,
|
||||
H4,
|
||||
H5,
|
||||
H6,
|
||||
HEADER,
|
||||
FOOTER,
|
||||
TITLE,
|
||||
PARAGRAPH,
|
||||
PARAGRAPH_BOLD,
|
||||
PARAGRAPH_ITALIC,
|
||||
PARAGRAPH_UNKNOWN,
|
||||
OTHER,
|
||||
TABLE;
|
||||
|
||||
|
||||
public static PageBlockType getHeadlineType(int i) {
|
||||
|
||||
return switch (i) {
|
||||
case 1 -> PageBlockType.H1;
|
||||
case 2 -> PageBlockType.H2;
|
||||
case 3 -> PageBlockType.H3;
|
||||
case 4 -> PageBlockType.H4;
|
||||
case 5 -> PageBlockType.H5;
|
||||
default -> PageBlockType.H6;
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
public boolean isHeadline() {
|
||||
|
||||
return this.equals(H1) || this.equals(H2) || this.equals(H3) || this.equals(H4) || this.equals(H5) || this.equals(H6);
|
||||
}
|
||||
}
|
||||
@ -0,0 +1,25 @@
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.image;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.ImageType;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.NonNull;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
|
||||
@Data
|
||||
@RequiredArgsConstructor
|
||||
public class ClassifiedImage {
|
||||
|
||||
@NonNull
|
||||
private Rectangle2D position;
|
||||
@NonNull
|
||||
private ImageType imageType;
|
||||
private boolean isAppendedToSection;
|
||||
@NonNull
|
||||
private boolean hasTransparency;
|
||||
@NonNull
|
||||
private int page;
|
||||
|
||||
}
|
||||
@ -1,25 +1,25 @@
|
||||
package com.iqser.red.service.redaction.v1.server.tableextraction.model;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.NoArgsConstructor;
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPageBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPositionSequence;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
@SuppressWarnings("serial")
|
||||
@Data
|
||||
@EqualsAndHashCode(callSuper = true)
|
||||
@NoArgsConstructor
|
||||
public class Cell extends Rectangle {
|
||||
|
||||
private List<TextBlock> textBlocks = new ArrayList<>();
|
||||
private List<TextPageBlock> textBlocks = new ArrayList<>();
|
||||
|
||||
private List<Cell> headerCells = new ArrayList<>();
|
||||
|
||||
@ -27,6 +27,8 @@ public class Cell extends Rectangle {
|
||||
|
||||
private static final int MIN_SIZE = 1;
|
||||
|
||||
private int pageNumber;
|
||||
|
||||
|
||||
public Cell(Point2D topLeft, Point2D bottomRight) {
|
||||
|
||||
@ -34,7 +36,7 @@ public class Cell extends Rectangle {
|
||||
}
|
||||
|
||||
|
||||
public void addTextBlock(TextBlock textBlock) {
|
||||
public void addTextBlock(TextPageBlock textBlock) {
|
||||
|
||||
textBlocks.add(textBlock);
|
||||
}
|
||||
@ -45,11 +47,11 @@ public class Cell extends Rectangle {
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
|
||||
Iterator<TextBlock> itty = textBlocks.iterator();
|
||||
Iterator<TextPageBlock> itty = textBlocks.iterator();
|
||||
TextPositionSequence previous = null;
|
||||
while (itty.hasNext()) {
|
||||
|
||||
TextBlock textBlock = itty.next();
|
||||
TextPageBlock textBlock = itty.next();
|
||||
|
||||
for (TextPositionSequence word : textBlock.getSequences()) {
|
||||
if (previous != null) {
|
||||
@ -1,4 +1,4 @@
|
||||
package com.iqser.red.service.redaction.v1.server.tableextraction.model;
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.Value;
|
||||
@ -1,10 +1,10 @@
|
||||
package com.iqser.red.service.redaction.v1.server.tableextraction.model;
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
public class CleanRulings {
|
||||
@ -1,4 +1,4 @@
|
||||
package com.iqser.red.service.redaction.v1.server.tableextraction.model;
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
@ -1,14 +1,20 @@
|
||||
package com.iqser.red.service.redaction.v1.server.tableextraction.model;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.utils.CohenSutherlandClipping;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.utils.Utils;
|
||||
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table;
|
||||
|
||||
import java.awt.geom.Line2D;
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.*;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.Formatter;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.TreeMap;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.utils.CohenSutherlandClipping;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.utils.Utils;
|
||||
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@SuppressWarnings("all")
|
||||
@ -1,25 +1,25 @@
|
||||
package com.iqser.red.service.redaction.v1.server.tableextraction.model;
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.TreeMap;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.collections4.CollectionUtils;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.AbstractPageBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.PageBlockType;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPageBlock;
|
||||
|
||||
import lombok.Getter;
|
||||
import lombok.Setter;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
public class Table extends AbstractTextContainer {
|
||||
public class TablePageBlock extends AbstractPageBlock {
|
||||
|
||||
private final TreeMap<CellPosition, Cell> cells = new TreeMap<>();
|
||||
|
||||
@ -29,21 +29,18 @@ public class Table extends AbstractTextContainer {
|
||||
private String headline;
|
||||
private int unrotatedRowCount;
|
||||
private int unrotatedColCount;
|
||||
private int rowCount = -1;
|
||||
private int colCount = -1;
|
||||
private List<List<Cell>> rows;
|
||||
|
||||
|
||||
public Table(List<Cell> cells, Rectangle area, int rotation) {
|
||||
public TablePageBlock(List<Cell> cells, Rectangle area, int rotation) {
|
||||
|
||||
addCells(cells);
|
||||
minX = area.getLeft();
|
||||
minY = area.getBottom();
|
||||
maxX = area.getRight();
|
||||
maxY = area.getTop();
|
||||
classification = "Table";
|
||||
classification = PageBlockType.TABLE;
|
||||
this.rotation = rotation;
|
||||
|
||||
}
|
||||
|
||||
|
||||
@ -71,19 +68,13 @@ public class Table extends AbstractTextContainer {
|
||||
|
||||
public int getRowCount() {
|
||||
|
||||
if (rowCount == -1) {
|
||||
rowCount = getRows().size();
|
||||
}
|
||||
return rowCount;
|
||||
return getRows().size();
|
||||
}
|
||||
|
||||
|
||||
public int getColCount() {
|
||||
|
||||
if (colCount == -1) {
|
||||
colCount = getRows().stream().mapToInt(List::size).max().orElse(0);
|
||||
}
|
||||
return colCount;
|
||||
return getRows().stream().mapToInt(List::size).max().orElse(0);
|
||||
|
||||
}
|
||||
|
||||
@ -224,7 +215,7 @@ public class Table extends AbstractTextContainer {
|
||||
* Calculates the structure of the table. For spanning rows and columns multiple cells with the same values will be inserted.
|
||||
*
|
||||
* @param cells The found cells
|
||||
* @return Table Structure
|
||||
* @return TablePageBlock Structure
|
||||
*/
|
||||
private List<List<Cell>> calculateStructure(List<Cell> cells) {
|
||||
|
||||
@ -243,8 +234,8 @@ public class Table extends AbstractTextContainer {
|
||||
uniqueY.add(c.getTop());
|
||||
});
|
||||
|
||||
var sortedUniqueX = uniqueX.stream().sorted().collect(Collectors.toList());
|
||||
var sortedUniqueY = uniqueY.stream().sorted().collect(Collectors.toList());
|
||||
var sortedUniqueX = uniqueX.stream().sorted().toList();
|
||||
var sortedUniqueY = uniqueY.stream().sorted().toList();
|
||||
|
||||
Float prevY = null;
|
||||
for (Float y : sortedUniqueY) {
|
||||
@ -258,9 +249,7 @@ public class Table extends AbstractTextContainer {
|
||||
var cell = new Cell(new Point2D.Float(prevX, prevY), new Point2D.Float(x, y));
|
||||
|
||||
var intersectionCell = cells.stream().filter(c -> cell.intersects(c) && cell.overlapRatio(c) > 0.1f).findFirst();
|
||||
if (intersectionCell.isPresent()) {
|
||||
cell.getTextBlocks().addAll(intersectionCell.get().getTextBlocks());
|
||||
}
|
||||
intersectionCell.ifPresent(value -> cell.getTextBlocks().addAll(value.getTextBlocks()));
|
||||
if (cell.hasMinimumSize()) {
|
||||
row.add(cell);
|
||||
}
|
||||
@ -268,7 +257,7 @@ public class Table extends AbstractTextContainer {
|
||||
prevX = x;
|
||||
}
|
||||
|
||||
if (prevY != null && prevX != null) {
|
||||
if (prevY != null && prevX != null && !row.isEmpty()) {
|
||||
matrix.add(row);
|
||||
}
|
||||
prevY = y;
|
||||
@ -299,7 +288,7 @@ public class Table extends AbstractTextContainer {
|
||||
}
|
||||
if (column != null && column.getTextBlocks() != null) {
|
||||
boolean first = true;
|
||||
for (TextBlock textBlock : column.getTextBlocks()) {
|
||||
for (TextPageBlock textBlock : column.getTextBlocks()) {
|
||||
if (!first) {
|
||||
sb.append("\n");
|
||||
}
|
||||
@ -331,7 +320,7 @@ public class Table extends AbstractTextContainer {
|
||||
sb.append(i == 0 ? "\n<th>" : "\n<td>");
|
||||
if (column != null && column.getTextBlocks() != null) {
|
||||
boolean first = true;
|
||||
for (TextBlock textBlock : column.getTextBlocks()) {
|
||||
for (TextPageBlock textBlock : column.getTextBlocks()) {
|
||||
if (!first) {
|
||||
sb.append("<br />");
|
||||
}
|
||||
@ -1,4 +1,4 @@
|
||||
package com.iqser.red.service.redaction.v1.server.parsing.model;
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text;
|
||||
|
||||
import org.apache.pdfbox.text.TextPosition;
|
||||
import org.springframework.beans.BeanUtils;
|
||||
@ -0,0 +1,49 @@
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
|
||||
|
||||
import lombok.Getter;
|
||||
|
||||
public class SearchableText {
|
||||
|
||||
@Getter
|
||||
private final List<TextPositionSequence> sequences = new ArrayList<>();
|
||||
|
||||
|
||||
public void add(TextPositionSequence textPositionSequence) {
|
||||
|
||||
sequences.add(textPositionSequence);
|
||||
}
|
||||
|
||||
|
||||
public void addAll(List<TextPositionSequence> textPositionSequences) {
|
||||
|
||||
sequences.addAll(textPositionSequences);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return buildString(sequences);
|
||||
}
|
||||
|
||||
|
||||
public static String buildString(List<TextPositionSequence> sequences) {
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
for (TextPositionSequence word : sequences) {
|
||||
sb.append(word);
|
||||
sb.append(' ');
|
||||
}
|
||||
String text = sb.toString();
|
||||
text = TextNormalizationUtilities.removeHyphenLineBreaks(text);
|
||||
text = TextNormalizationUtilities.removeLineBreaks(text);
|
||||
text = TextNormalizationUtilities.removeRepeatingWhitespaces(text);
|
||||
return text;
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,5 +1,4 @@
|
||||
package com.iqser.red.service.redaction.v1.server.classification.model;
|
||||
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
@ -1,4 +1,4 @@
|
||||
package com.iqser.red.service.redaction.v1.server.classification.model;
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
@ -1,10 +1,10 @@
|
||||
package com.iqser.red.service.redaction.v1.server.classification.model;
|
||||
|
||||
import lombok.Getter;
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import lombok.Getter;
|
||||
|
||||
public class StringFrequencyCounter {
|
||||
|
||||
@Getter
|
||||
@ -1,6 +1,4 @@
|
||||
package com.iqser.red.service.redaction.v1.server.parsing.model;
|
||||
|
||||
import java.util.Objects;
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonCreator;
|
||||
import com.fasterxml.jackson.annotation.JsonValue;
|
||||
@ -46,18 +44,4 @@ public enum TextDirection {
|
||||
|
||||
throw new IllegalArgumentException(String.format("A value of %f is not supported by TextDirection", degrees));
|
||||
}
|
||||
|
||||
|
||||
public static TextDirection fromString(String degreesAsString) {
|
||||
|
||||
Objects.requireNonNull(degreesAsString, "Cannot construct a text direction from a null value");
|
||||
|
||||
String value = degreesAsString.strip();
|
||||
|
||||
if (degreesAsString.endsWith(VALUE_STRING_SUFFIX)) {
|
||||
value = degreesAsString.replace(VALUE_STRING_SUFFIX + "$", "");
|
||||
}
|
||||
|
||||
return fromDegrees(Float.parseFloat(value));
|
||||
}
|
||||
}
|
||||
@ -1,13 +1,12 @@
|
||||
package com.iqser.red.service.redaction.v1.server.classification.model;
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.model.TextDirection;
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.AbstractPageBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.PageBlockType;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
@ -18,7 +17,7 @@ import lombok.NoArgsConstructor;
|
||||
@Builder
|
||||
@Data
|
||||
@NoArgsConstructor
|
||||
public class TextBlock extends AbstractTextContainer {
|
||||
public class TextPageBlock extends AbstractPageBlock {
|
||||
|
||||
@Builder.Default
|
||||
private List<TextPositionSequence> sequences = new ArrayList<>();
|
||||
@ -45,7 +44,7 @@ public class TextBlock extends AbstractTextContainer {
|
||||
private float highestFontSize;
|
||||
|
||||
@JsonIgnore
|
||||
private String classification;
|
||||
private PageBlockType classification;
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
@ -95,6 +94,7 @@ public class TextBlock extends AbstractTextContainer {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Returns the maxX value in pdf coordinate system.
|
||||
* Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation.
|
||||
@ -174,7 +174,7 @@ public class TextBlock extends AbstractTextContainer {
|
||||
}
|
||||
|
||||
|
||||
public TextBlock(float minX, float maxX, float minY, float maxY, List<TextPositionSequence> sequences, int rotation) {
|
||||
public TextPageBlock(float minX, float maxX, float minY, float maxY, List<TextPositionSequence> sequences, int rotation) {
|
||||
|
||||
this.minX = minX;
|
||||
this.maxX = maxX;
|
||||
@ -185,23 +185,23 @@ public class TextBlock extends AbstractTextContainer {
|
||||
}
|
||||
|
||||
|
||||
public TextBlock union(TextPositionSequence r) {
|
||||
public TextPageBlock union(TextPositionSequence r) {
|
||||
|
||||
TextBlock union = this.copy();
|
||||
TextPageBlock union = this.copy();
|
||||
union.add(r);
|
||||
return union;
|
||||
}
|
||||
|
||||
|
||||
public TextBlock union(TextBlock r) {
|
||||
public TextPageBlock union(TextPageBlock r) {
|
||||
|
||||
TextBlock union = this.copy();
|
||||
TextPageBlock union = this.copy();
|
||||
union.add(r);
|
||||
return union;
|
||||
}
|
||||
|
||||
|
||||
public void add(TextBlock r) {
|
||||
public void add(TextPageBlock r) {
|
||||
|
||||
if (r.getMinX() < minX) {
|
||||
minX = r.getMinX();
|
||||
@ -236,9 +236,9 @@ public class TextBlock extends AbstractTextContainer {
|
||||
}
|
||||
|
||||
|
||||
public TextBlock copy() {
|
||||
public TextPageBlock copy() {
|
||||
|
||||
return new TextBlock(minX, maxX, minY, maxY, sequences, rotation);
|
||||
return new TextPageBlock(minX, maxX, minY, maxY, sequences, rotation);
|
||||
}
|
||||
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
package com.iqser.red.service.redaction.v1.server.parsing.model;
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text;
|
||||
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.Point2D;
|
||||
@ -0,0 +1,14 @@
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
@AllArgsConstructor
|
||||
public class UnclassifiedText {
|
||||
|
||||
private List<TextPageBlock> textBlocks;
|
||||
|
||||
}
|
||||
@ -14,34 +14,18 @@
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package com.iqser.red.service.redaction.v1.server.parsing;
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.parsing;
|
||||
|
||||
import java.io.InputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.Map;
|
||||
import java.util.WeakHashMap;
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
import org.apache.fontbox.ttf.TrueTypeFont;
|
||||
import org.apache.fontbox.util.BoundingBox;
|
||||
|
||||
import org.apache.pdfbox.contentstream.PDFStreamEngine;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.font.encoding.GlyphList;
|
||||
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
||||
import org.apache.pdfbox.pdmodel.font.PDCIDFont;
|
||||
import org.apache.pdfbox.pdmodel.font.PDCIDFontType2;
|
||||
import org.apache.pdfbox.pdmodel.font.PDFont;
|
||||
import org.apache.pdfbox.pdmodel.font.PDSimpleFont;
|
||||
import org.apache.pdfbox.pdmodel.font.PDTrueTypeFont;
|
||||
import org.apache.pdfbox.pdmodel.font.PDType0Font;
|
||||
import org.apache.pdfbox.pdmodel.font.PDType3Font;
|
||||
import org.apache.pdfbox.pdmodel.graphics.state.PDGraphicsState;
|
||||
import org.apache.pdfbox.text.TextPosition;
|
||||
import org.apache.pdfbox.util.Matrix;
|
||||
import org.apache.pdfbox.util.Vector;
|
||||
import org.apache.pdfbox.contentstream.operator.DrawObject;
|
||||
import org.apache.pdfbox.contentstream.operator.state.Concatenate;
|
||||
import org.apache.pdfbox.contentstream.operator.state.Restore;
|
||||
@ -50,22 +34,36 @@ import org.apache.pdfbox.contentstream.operator.state.SetGraphicsStateParameters
|
||||
import org.apache.pdfbox.contentstream.operator.state.SetMatrix;
|
||||
import org.apache.pdfbox.contentstream.operator.text.BeginText;
|
||||
import org.apache.pdfbox.contentstream.operator.text.EndText;
|
||||
import org.apache.pdfbox.contentstream.operator.text.SetFontAndSize;
|
||||
import org.apache.pdfbox.contentstream.operator.text.SetTextHorizontalScaling;
|
||||
import org.apache.pdfbox.contentstream.operator.text.ShowTextAdjusted;
|
||||
import org.apache.pdfbox.contentstream.operator.text.ShowTextLine;
|
||||
import org.apache.pdfbox.contentstream.operator.text.ShowTextLineAndSpace;
|
||||
import org.apache.pdfbox.contentstream.operator.text.MoveText;
|
||||
import org.apache.pdfbox.contentstream.operator.text.MoveTextSetLeading;
|
||||
import org.apache.pdfbox.contentstream.operator.text.NextLine;
|
||||
import org.apache.pdfbox.contentstream.operator.text.SetCharSpacing;
|
||||
import org.apache.pdfbox.contentstream.operator.text.SetFontAndSize;
|
||||
import org.apache.pdfbox.contentstream.operator.text.SetTextHorizontalScaling;
|
||||
import org.apache.pdfbox.contentstream.operator.text.SetTextLeading;
|
||||
import org.apache.pdfbox.contentstream.operator.text.SetTextRenderingMode;
|
||||
import org.apache.pdfbox.contentstream.operator.text.SetTextRise;
|
||||
import org.apache.pdfbox.contentstream.operator.text.SetWordSpacing;
|
||||
import org.apache.pdfbox.contentstream.operator.text.ShowText;
|
||||
import org.apache.pdfbox.contentstream.operator.text.ShowTextAdjusted;
|
||||
import org.apache.pdfbox.contentstream.operator.text.ShowTextLine;
|
||||
import org.apache.pdfbox.contentstream.operator.text.ShowTextLineAndSpace;
|
||||
import org.apache.pdfbox.cos.COSDictionary;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
||||
import org.apache.pdfbox.pdmodel.font.PDCIDFont;
|
||||
import org.apache.pdfbox.pdmodel.font.PDCIDFontType2;
|
||||
import org.apache.pdfbox.pdmodel.font.PDFont;
|
||||
import org.apache.pdfbox.pdmodel.font.PDFontDescriptor;
|
||||
import org.apache.pdfbox.pdmodel.font.PDSimpleFont;
|
||||
import org.apache.pdfbox.pdmodel.font.PDTrueTypeFont;
|
||||
import org.apache.pdfbox.pdmodel.font.PDType0Font;
|
||||
import org.apache.pdfbox.pdmodel.font.PDType3Font;
|
||||
import org.apache.pdfbox.pdmodel.font.encoding.GlyphList;
|
||||
import org.apache.pdfbox.pdmodel.graphics.state.PDGraphicsState;
|
||||
import org.apache.pdfbox.text.TextPosition;
|
||||
import org.apache.pdfbox.util.Matrix;
|
||||
import org.apache.pdfbox.util.Vector;
|
||||
|
||||
/**
|
||||
* LEGACY text calculations which are known to be incorrect but are depended on by PDFTextStripper.
|
||||
@ -1,17 +1,32 @@
|
||||
package com.iqser.red.service.redaction.v1.server.parsing;
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.parsing;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.model.RedTextPosition;
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Ruling;
|
||||
|
||||
import lombok.Getter;
|
||||
import lombok.Setter;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import java.awt.geom.Point2D;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.pdfbox.contentstream.operator.Operator;
|
||||
import org.apache.pdfbox.contentstream.operator.OperatorName;
|
||||
import org.apache.pdfbox.contentstream.operator.color.*;
|
||||
import org.apache.pdfbox.contentstream.operator.state.*;
|
||||
import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingColor;
|
||||
import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingColorN;
|
||||
import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingColorSpace;
|
||||
import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingDeviceCMYKColor;
|
||||
import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingDeviceGrayColor;
|
||||
import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingDeviceRGBColor;
|
||||
import org.apache.pdfbox.contentstream.operator.color.SetStrokingColor;
|
||||
import org.apache.pdfbox.contentstream.operator.color.SetStrokingColorN;
|
||||
import org.apache.pdfbox.contentstream.operator.color.SetStrokingColorSpace;
|
||||
import org.apache.pdfbox.contentstream.operator.color.SetStrokingDeviceCMYKColor;
|
||||
import org.apache.pdfbox.contentstream.operator.color.SetStrokingDeviceGrayColor;
|
||||
import org.apache.pdfbox.contentstream.operator.color.SetStrokingDeviceRGBColor;
|
||||
import org.apache.pdfbox.contentstream.operator.state.SetFlatness;
|
||||
import org.apache.pdfbox.contentstream.operator.state.SetLineCapStyle;
|
||||
import org.apache.pdfbox.contentstream.operator.state.SetLineDashPattern;
|
||||
import org.apache.pdfbox.contentstream.operator.state.SetLineJoinStyle;
|
||||
import org.apache.pdfbox.contentstream.operator.state.SetLineMiterLimit;
|
||||
import org.apache.pdfbox.contentstream.operator.state.SetLineWidth;
|
||||
import org.apache.pdfbox.contentstream.operator.state.SetRenderingIntent;
|
||||
import org.apache.pdfbox.contentstream.operator.text.SetFontAndSize;
|
||||
import org.apache.pdfbox.cos.COSBase;
|
||||
import org.apache.pdfbox.cos.COSNumber;
|
||||
@ -19,11 +34,13 @@ import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.text.TextPosition;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table.Ruling;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.RedTextPosition;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPositionSequence;
|
||||
|
||||
import lombok.Getter;
|
||||
import lombok.Setter;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
@ -264,8 +281,8 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
// Remove false sequence ends (whitespaces)
|
||||
if (previous != null && sublist.get(0).getYDirAdj() == previous.getYDirAdj() && sublist.get(0)
|
||||
.getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()) < 0.01) {
|
||||
for (TextPosition t : sublist) {
|
||||
textPositionSequences.get(textPositionSequences.size() - 1).add(t);
|
||||
for (TextPosition textPosition : sublist) {
|
||||
textPositionSequences.get(textPositionSequences.size() - 1).add(textPosition);
|
||||
}
|
||||
} else {
|
||||
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber));
|
||||
@ -14,7 +14,7 @@
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package com.iqser.red.service.redaction.v1.server.parsing;
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.parsing;
|
||||
|
||||
import java.io.BufferedInputStream;
|
||||
import java.io.IOException;
|
||||
@ -1,4 +1,4 @@
|
||||
package com.iqser.red.service.redaction.v1.server.classification.service;
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.service;
|
||||
|
||||
import static java.util.stream.Collectors.toSet;
|
||||
|
||||
@ -9,15 +9,15 @@ import java.util.List;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.FloatFrequencyCounter;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Orientation;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.StringFrequencyCounter;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.utils.RulingTextDirAdjustUtil;
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Ruling;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.AbstractPageBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationPage;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.FloatFrequencyCounter;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.Orientation;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table.Ruling;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.StringFrequencyCounter;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPageBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPositionSequence;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.utils.RulingTextDirAdjustUtil;
|
||||
|
||||
@Service
|
||||
@SuppressWarnings("all")
|
||||
@ -29,16 +29,18 @@ public class BlockificationService {
|
||||
/**
|
||||
* This method is building blocks by expanding the minX/maxX and minY/maxY value on each word that is not split by the conditions.
|
||||
* This method must use text direction adjusted postions (DirAdj). Where {0,0} is on the upper left. Never try to change this!
|
||||
* Rulings (Table lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling.
|
||||
* @param textPositions The words of a page.
|
||||
* Rulings (TablePageBlock lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling.
|
||||
*
|
||||
* @param textPositions The words of a page.
|
||||
* @param horizontalRulingLines Horizontal table lines.
|
||||
* @param verticalRulingLines Vertical table lines.
|
||||
* @param verticalRulingLines Vertical table lines.
|
||||
* @return Page object that contains the Textblock and text statistics.
|
||||
*/
|
||||
public Page blockify(List<TextPositionSequence> textPositions, List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
|
||||
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
|
||||
|
||||
int indexOnPage = 0;
|
||||
List<TextPositionSequence> chunkWords = new ArrayList<>();
|
||||
List<AbstractTextContainer> chunkBlockList1 = new ArrayList<>();
|
||||
List<AbstractPageBlock> chunkBlockList = new ArrayList<>();
|
||||
|
||||
float minX = 1000, maxX = 0, minY = 1000, maxY = 0;
|
||||
TextPositionSequence prev = null;
|
||||
@ -58,12 +60,14 @@ public class BlockificationService {
|
||||
if (prev != null && (lineSeparation || startFromTop || splitByX || splitByDir || isSplitByRuling)) {
|
||||
|
||||
Orientation prevOrientation = null;
|
||||
if (!chunkBlockList1.isEmpty()) {
|
||||
prevOrientation = chunkBlockList1.get(chunkBlockList1.size() - 1).getOrientation();
|
||||
if (!chunkBlockList.isEmpty()) {
|
||||
prevOrientation = chunkBlockList.get(chunkBlockList.size() - 1).getOrientation();
|
||||
}
|
||||
|
||||
TextBlock cb1 = buildTextBlock(chunkWords);
|
||||
chunkBlockList1.add(cb1);
|
||||
TextPageBlock cb1 = buildTextBlock(chunkWords, indexOnPage);
|
||||
indexOnPage++;
|
||||
|
||||
chunkBlockList.add(cb1);
|
||||
chunkWords = new ArrayList<>();
|
||||
|
||||
if (splitByX && !isSplitByRuling) {
|
||||
@ -102,17 +106,17 @@ public class BlockificationService {
|
||||
}
|
||||
}
|
||||
|
||||
TextBlock cb1 = buildTextBlock(chunkWords);
|
||||
TextPageBlock cb1 = buildTextBlock(chunkWords, indexOnPage);
|
||||
if (cb1 != null) {
|
||||
chunkBlockList1.add(cb1);
|
||||
chunkBlockList.add(cb1);
|
||||
}
|
||||
|
||||
Iterator<AbstractTextContainer> itty = chunkBlockList1.iterator();
|
||||
Iterator<AbstractPageBlock> itty = chunkBlockList.iterator();
|
||||
|
||||
TextBlock previousLeft = null;
|
||||
TextBlock previousRight = null;
|
||||
TextPageBlock previousLeft = null;
|
||||
TextPageBlock previousRight = null;
|
||||
while (itty.hasNext()) {
|
||||
TextBlock block = (TextBlock) itty.next();
|
||||
TextPageBlock block = (TextPageBlock) itty.next();
|
||||
|
||||
if (previousLeft != null && block.getOrientation().equals(Orientation.LEFT)) {
|
||||
if (previousLeft.getMinY() > block.getMinY() && block.getMaxY() + block.getMostPopularWordHeight() > previousLeft.getMinY()) {
|
||||
@ -137,10 +141,10 @@ public class BlockificationService {
|
||||
}
|
||||
}
|
||||
|
||||
itty = chunkBlockList1.iterator();
|
||||
TextBlock previous = null;
|
||||
itty = chunkBlockList.iterator();
|
||||
TextPageBlock previous = null;
|
||||
while (itty.hasNext()) {
|
||||
TextBlock block = (TextBlock) itty.next();
|
||||
TextPageBlock block = (TextPageBlock) itty.next();
|
||||
|
||||
if (previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation().equals(Orientation.LEFT) && equalsWithThreshold(block.getMaxY(),
|
||||
previous.getMaxY()) || previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation()
|
||||
@ -153,7 +157,7 @@ public class BlockificationService {
|
||||
previous = block;
|
||||
}
|
||||
|
||||
return new Page(chunkBlockList1);
|
||||
return new ClassificationPage(chunkBlockList);
|
||||
}
|
||||
|
||||
|
||||
@ -163,9 +167,9 @@ public class BlockificationService {
|
||||
}
|
||||
|
||||
|
||||
private TextBlock buildTextBlock(List<TextPositionSequence> wordBlockList) {
|
||||
private TextPageBlock buildTextBlock(List<TextPositionSequence> wordBlockList, int indexOnPage) {
|
||||
|
||||
TextBlock textBlock = null;
|
||||
TextPageBlock textBlock = null;
|
||||
|
||||
FloatFrequencyCounter lineHeightFrequencyCounter = new FloatFrequencyCounter();
|
||||
FloatFrequencyCounter fontSizeFrequencyCounter = new FloatFrequencyCounter();
|
||||
@ -182,9 +186,14 @@ public class BlockificationService {
|
||||
styleFrequencyCounter.add(wordBlock.getFontStyle());
|
||||
|
||||
if (textBlock == null) {
|
||||
textBlock = new TextBlock(wordBlock.getMinXDirAdj(), wordBlock.getMaxXDirAdj(), wordBlock.getMinYDirAdj(), wordBlock.getMaxYDirAdj(), wordBlockList, wordBlock.getRotation());
|
||||
textBlock = new TextPageBlock(wordBlock.getMinXDirAdj(),
|
||||
wordBlock.getMaxXDirAdj(),
|
||||
wordBlock.getMinYDirAdj(),
|
||||
wordBlock.getMaxYDirAdj(),
|
||||
wordBlockList,
|
||||
wordBlock.getRotation());
|
||||
} else {
|
||||
TextBlock spatialEntity = textBlock.union(wordBlock);
|
||||
TextPageBlock spatialEntity = textBlock.union(wordBlock);
|
||||
textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight());
|
||||
}
|
||||
}
|
||||
@ -213,10 +222,38 @@ public class BlockificationService {
|
||||
List<Ruling> horizontalRulingLines,
|
||||
List<Ruling> verticalRulingLines) {
|
||||
|
||||
return isSplitByRuling(maxX, minY, word.getMinXDirAdj(), word.getMinYDirAdj(), verticalRulingLines, word.getDir().getDegrees(), word.getPageWidth(), word.getPageHeight()) //
|
||||
|| isSplitByRuling(minX, minY, word.getMinXDirAdj(), word.getMaxYDirAdj(), horizontalRulingLines, word.getDir().getDegrees(), word.getPageWidth(), word.getPageHeight()) //
|
||||
|| isSplitByRuling(maxX, minY, word.getMinXDirAdj(), word.getMinYDirAdj(), horizontalRulingLines, word.getDir().getDegrees(), word.getPageWidth(), word.getPageHeight()) //
|
||||
|| isSplitByRuling(minX, minY, word.getMinXDirAdj(), word.getMaxYDirAdj(), verticalRulingLines, word.getDir().getDegrees(), word.getPageWidth(), word.getPageHeight()); //
|
||||
return isSplitByRuling(maxX,
|
||||
minY,
|
||||
word.getMinXDirAdj(),
|
||||
word.getMinYDirAdj(),
|
||||
verticalRulingLines,
|
||||
word.getDir().getDegrees(),
|
||||
word.getPageWidth(),
|
||||
word.getPageHeight()) //
|
||||
|| isSplitByRuling(minX,
|
||||
minY,
|
||||
word.getMinXDirAdj(),
|
||||
word.getMaxYDirAdj(),
|
||||
horizontalRulingLines,
|
||||
word.getDir().getDegrees(),
|
||||
word.getPageWidth(),
|
||||
word.getPageHeight()) //
|
||||
|| isSplitByRuling(maxX,
|
||||
minY,
|
||||
word.getMinXDirAdj(),
|
||||
word.getMinYDirAdj(),
|
||||
horizontalRulingLines,
|
||||
word.getDir().getDegrees(),
|
||||
word.getPageWidth(),
|
||||
word.getPageHeight()) //
|
||||
|| isSplitByRuling(minX,
|
||||
minY,
|
||||
word.getMinXDirAdj(),
|
||||
word.getMaxYDirAdj(),
|
||||
verticalRulingLines,
|
||||
word.getDir().getDegrees(),
|
||||
word.getPageWidth(),
|
||||
word.getPageHeight());
|
||||
}
|
||||
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
package com.iqser.red.service.redaction.v1.server.classification.service;
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.service;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
@ -6,17 +6,20 @@ import org.springframework.stereotype.Service;
|
||||
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point;
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.FloatFrequencyCounter;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.utils.PositionUtils;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.AbstractPageBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationPage;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.FloatFrequencyCounter;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table.Cell;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table.TablePageBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPageBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.utils.PositionUtils;
|
||||
|
||||
@Service
|
||||
public class BodyTextFrameService {
|
||||
|
||||
private static final float APPROXIMATE_HEADER_LINE_COUNT = 2.9f;
|
||||
|
||||
|
||||
/**
|
||||
* Adjusts and sets the body text frame to a page.
|
||||
* Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation.
|
||||
@ -30,7 +33,7 @@ public class BodyTextFrameService {
|
||||
* @param bodyTextFrame frame that contains the main text on portrait pages
|
||||
* @param landscapeBodyTextFrame frame that contains the main text on landscape pages
|
||||
*/
|
||||
public void setBodyTextFrameAdjustedToPage(Page page, Rectangle bodyTextFrame, Rectangle landscapeBodyTextFrame) {
|
||||
public void setBodyTextFrameAdjustedToPage(ClassificationPage page, Rectangle bodyTextFrame, Rectangle landscapeBodyTextFrame) {
|
||||
|
||||
Rectangle textFrame = page.isLandscape() ? landscapeBodyTextFrame : bodyTextFrame;
|
||||
|
||||
@ -65,26 +68,26 @@ public class BodyTextFrameService {
|
||||
* @param landscape Calculate for landscape or portrait
|
||||
* @return Rectangle of the text frame
|
||||
*/
|
||||
public Rectangle calculateBodyTextFrame(List<Page> pages, FloatFrequencyCounter documentFontSizeCounter, boolean landscape) {
|
||||
public Rectangle calculateBodyTextFrame(List<ClassificationPage> pages, FloatFrequencyCounter documentFontSizeCounter, boolean landscape) {
|
||||
|
||||
BodyTextFrameExpansionsRectangle expansionsRectangle = new BodyTextFrameExpansionsRectangle();
|
||||
|
||||
for (Page page : pages) {
|
||||
for (ClassificationPage page : pages) {
|
||||
|
||||
if (page.getTextBlocks().isEmpty() || landscape != page.isLandscape()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
for (AbstractTextContainer container : page.getTextBlocks()) {
|
||||
for (AbstractPageBlock container : page.getTextBlocks()) {
|
||||
|
||||
if (container instanceof TextBlock) {
|
||||
TextBlock textBlock = (TextBlock) container;
|
||||
if (container instanceof TextPageBlock) {
|
||||
TextPageBlock textBlock = (TextPageBlock) container;
|
||||
if (textBlock.getMostPopularWordFont() == null || textBlock.getMostPopularWordStyle() == null) {
|
||||
continue;
|
||||
}
|
||||
|
||||
float approxLineCount = PositionUtils.getApproxLineCount(textBlock);
|
||||
if (approxLineCount < 2.9f) {
|
||||
if (approxLineCount < APPROXIMATE_HEADER_LINE_COUNT) {
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -94,15 +97,15 @@ public class BodyTextFrameService {
|
||||
}
|
||||
}
|
||||
|
||||
if (container instanceof Table) {
|
||||
Table table = (Table) container;
|
||||
if (container instanceof TablePageBlock) {
|
||||
TablePageBlock table = (TablePageBlock) container;
|
||||
for (List<Cell> row : table.getRows()) {
|
||||
for (Cell cell : row) {
|
||||
|
||||
if (cell == null || cell.getTextBlocks() == null) {
|
||||
continue;
|
||||
}
|
||||
for (TextBlock textBlock : cell.getTextBlocks()) {
|
||||
for (TextPageBlock textBlock : cell.getTextBlocks()) {
|
||||
expandRectangle(textBlock, page, expansionsRectangle);
|
||||
}
|
||||
}
|
||||
@ -117,7 +120,7 @@ public class BodyTextFrameService {
|
||||
}
|
||||
|
||||
|
||||
private void expandRectangle(TextBlock textBlock, Page page, BodyTextFrameExpansionsRectangle expansionsRectangle) {
|
||||
private void expandRectangle(TextPageBlock textBlock, ClassificationPage page, BodyTextFrameExpansionsRectangle expansionsRectangle) {
|
||||
|
||||
if (page.getPageWidth() > page.getPageHeight() && page.getRotation() != 0) {
|
||||
if (textBlock.getPdfMinY() < expansionsRectangle.minX) {
|
||||
@ -1,4 +1,4 @@
|
||||
package com.iqser.red.service.redaction.v1.server.classification.service;
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.service;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.regex.Pattern;
|
||||
@ -6,11 +6,12 @@ import java.util.regex.Pattern;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.utils.PositionUtils;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.AbstractPageBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationDocument;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationPage;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.PageBlockType;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPageBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.utils.PositionUtils;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
@ -23,7 +24,7 @@ public class ClassificationService {
|
||||
private final BodyTextFrameService bodyTextFrameService;
|
||||
|
||||
|
||||
public void classifyDocument(Document document) {
|
||||
public void classifyDocument(ClassificationDocument document) {
|
||||
|
||||
Rectangle bodyTextFrame = bodyTextFrameService.calculateBodyTextFrame(document.getPages(), document.getFontSizeCounter(), false);
|
||||
Rectangle landscapeBodyTextFrame = bodyTextFrameService.calculateBodyTextFrame(document.getPages(), document.getFontSizeCounter(), true);
|
||||
@ -31,43 +32,43 @@ public class ClassificationService {
|
||||
|
||||
log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());
|
||||
|
||||
for (Page page : document.getPages()) {
|
||||
for (ClassificationPage page : document.getPages()) {
|
||||
bodyTextFrameService.setBodyTextFrameAdjustedToPage(page, bodyTextFrame, landscapeBodyTextFrame);
|
||||
classifyPage(page, document, headlineFontSizes);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public void classifyPage(Page page, Document document, List<Float> headlineFontSizes) {
|
||||
public void classifyPage(ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes) {
|
||||
|
||||
for (AbstractTextContainer textBlock : page.getTextBlocks()) {
|
||||
if (textBlock instanceof TextBlock) {
|
||||
classifyBlock((TextBlock) textBlock, page, document, headlineFontSizes);
|
||||
for (AbstractPageBlock textBlock : page.getTextBlocks()) {
|
||||
if (textBlock instanceof TextPageBlock) {
|
||||
classifyBlock((TextPageBlock) textBlock, page, document, headlineFontSizes);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public void classifyBlock(TextBlock textBlock, Page page, Document document, List<Float> headlineFontSizes) {
|
||||
public void classifyBlock(TextPageBlock textBlock, ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes) {
|
||||
|
||||
var bodyTextFrame = page.getBodyTextFrame();
|
||||
|
||||
if (document.getFontSizeCounter().getMostPopular() == null) {
|
||||
textBlock.setClassification("Other");
|
||||
textBlock.setClassification(PageBlockType.OTHER);
|
||||
return;
|
||||
}
|
||||
if (PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
|
||||
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) {
|
||||
textBlock.setClassification("Header");
|
||||
textBlock.setClassification(PageBlockType.HEADER);
|
||||
|
||||
} else if (PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
|
||||
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) {
|
||||
textBlock.setClassification("Footer");
|
||||
textBlock.setClassification(PageBlockType.FOOTER);
|
||||
} else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock,
|
||||
document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks()
|
||||
.size() == 1)) {
|
||||
if (!Pattern.matches("[0-9]+", textBlock.toString())) {
|
||||
textBlock.setClassification("Title");
|
||||
textBlock.setClassification(PageBlockType.TITLE);
|
||||
}
|
||||
} else if (textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter()
|
||||
.getMostPopular() && PositionUtils.getApproxLineCount(textBlock) < 4.9 && (textBlock.getMostPopularWordStyle().equals("bold") || !document.getFontStyleCounter()
|
||||
@ -80,36 +81,34 @@ public class ClassificationService {
|
||||
|
||||
for (int i = 1; i <= headlineFontSizes.size(); i++) {
|
||||
if (textBlock.getMostPopularWordFontSize() == headlineFontSizes.get(i - 1)) {
|
||||
textBlock.setClassification("H " + i);
|
||||
textBlock.setClassification(PageBlockType.getHeadlineType(i));
|
||||
document.setHeadlines(true);
|
||||
}
|
||||
}
|
||||
} else if (!textBlock.getText().startsWith("Table ") && !textBlock.getText().startsWith("Figure ") && PositionUtils.isWithinBodyTextFrame(bodyTextFrame,
|
||||
textBlock) && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter()
|
||||
.getMostPopular()
|
||||
.equals("bold") && PositionUtils.getApproxLineCount(textBlock) < 2.9 && textBlock.getSequences()
|
||||
} else if (!textBlock.getText().startsWith("Figure ") && PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordStyle()
|
||||
.equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold") && PositionUtils.getApproxLineCount(textBlock) < 2.9 && textBlock.getSequences()
|
||||
.get(0)
|
||||
.getTextPositions()
|
||||
.get(0)
|
||||
.getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) {
|
||||
textBlock.setClassification("H " + (headlineFontSizes.size() + 1));
|
||||
textBlock.setClassification(PageBlockType.getHeadlineType(headlineFontSizes.size() + 1));
|
||||
document.setHeadlines(true);
|
||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
|
||||
.getMostPopular() && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold")) {
|
||||
textBlock.setClassification("TextBlock Bold");
|
||||
textBlock.setClassification(PageBlockType.PARAGRAPH_BOLD);
|
||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFont()
|
||||
.equals(document.getFontCounter().getMostPopular()) && textBlock.getMostPopularWordStyle()
|
||||
.equals(document.getFontStyleCounter().getMostPopular()) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()) {
|
||||
textBlock.setClassification("TextBlock");
|
||||
textBlock.setClassification(PageBlockType.PARAGRAPH);
|
||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
|
||||
.getMostPopular() && textBlock.getMostPopularWordStyle().equals("italic") && !document.getFontStyleCounter()
|
||||
.getMostPopular()
|
||||
.equals("italic") && PositionUtils.getApproxLineCount(textBlock) < 2.9) {
|
||||
textBlock.setClassification("TextBlock Italic");
|
||||
textBlock.setClassification(PageBlockType.PARAGRAPH_ITALIC);
|
||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)) {
|
||||
textBlock.setClassification("TextBlock Unknown");
|
||||
textBlock.setClassification(PageBlockType.PARAGRAPH_UNKNOWN);
|
||||
} else {
|
||||
textBlock.setClassification("Other");
|
||||
textBlock.setClassification(PageBlockType.OTHER);
|
||||
}
|
||||
}
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
package com.iqser.red.service.redaction.v1.server.segmentation;
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.service;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileOutputStream;
|
||||
@ -16,21 +16,19 @@ import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.service.BlockificationService;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.service.ClassificationService;
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.PDFLinesTextStripper;
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfTableCell;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.ImageServiceResponseAdapter;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.TableServiceResponseAdapter;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.table.PdfTableCell;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.AbstractPageBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationDocument;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationPage;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.image.ClassifiedImage;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table.CleanRulings;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPageBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPositionSequence;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.parsing.PDFLinesTextStripper;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.utils.FileUtils;
|
||||
import com.iqser.red.service.redaction.v1.server.settings.RedactionServiceSettings;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.CleanRulings;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.service.RulingCleaningService;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.service.TableExtractionService;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.utils.FileUtils;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
@ -46,18 +44,21 @@ public class PdfSegmentationService {
|
||||
private final BlockificationService blockificationService;
|
||||
private final ClassificationService classificationService;
|
||||
private final SectionsBuilderService sectionsBuilderService;
|
||||
private final ImageService imageService;
|
||||
private final TableService tableService;
|
||||
private final ImageServiceResponseAdapter imageServiceResponseAdapter;
|
||||
private final TableServiceResponseAdapter tableServiceResponseAdapter;
|
||||
|
||||
|
||||
public Document parseDocument(String dossierId, String fileId, InputStream documentInputStream, Map<Integer, List<PdfImage>> pdfImages) throws IOException {
|
||||
public ClassificationDocument parseDocument(String dossierId,
|
||||
String fileId,
|
||||
InputStream documentInputStream,
|
||||
Map<Integer, List<ClassifiedImage>> pdfImages) throws IOException {
|
||||
|
||||
PDDocument pdDocument = null;
|
||||
File tempFile = null;
|
||||
try {
|
||||
Map<Integer, List<PdfTableCell>> pdfTableCells = new HashMap<>();
|
||||
if (redactionServiceSettings.isCvTableParsingEnabled()) {
|
||||
pdfTableCells = tableService.convertTables(dossierId, fileId);
|
||||
pdfTableCells = tableServiceResponseAdapter.convertTables(dossierId, fileId);
|
||||
}
|
||||
|
||||
tempFile = FileUtils.createTempFile("document", ".pdf");
|
||||
@ -65,8 +66,8 @@ public class PdfSegmentationService {
|
||||
IOUtils.copy(documentInputStream, fos);
|
||||
|
||||
// initialize required variables
|
||||
Document document = new Document();
|
||||
List<Page> pages = new ArrayList<>();
|
||||
ClassificationDocument document = new ClassificationDocument();
|
||||
List<ClassificationPage> pages = new ArrayList<>();
|
||||
|
||||
pdDocument = PDDocument.load(tempFile, MemoryUsageSetting.setupMixed(67108864L));
|
||||
pdDocument.setAllSecurityToBeRemoved(true);
|
||||
@ -94,12 +95,12 @@ public class PdfSegmentationService {
|
||||
}
|
||||
|
||||
|
||||
private void processPage(Map<Integer, List<PdfImage>> pdfImages,
|
||||
PDDocument pdDocument,
|
||||
Map<Integer, List<PdfTableCell>> pdfTableCells,
|
||||
Document document,
|
||||
List<Page> pages,
|
||||
int pageNumber) throws IOException {
|
||||
private void processPage(Map<Integer, List<ClassifiedImage>> pdfImages,
|
||||
PDDocument pdDocument,
|
||||
Map<Integer, List<PdfTableCell>> pdfTableCells,
|
||||
ClassificationDocument document,
|
||||
List<ClassificationPage> pages,
|
||||
int pageNumber) throws IOException {
|
||||
|
||||
PDFLinesTextStripper stripper = new PDFLinesTextStripper();
|
||||
PDPage pdPage = pdDocument.getPage(pageNumber - 1);
|
||||
@ -119,7 +120,7 @@ public class PdfSegmentationService {
|
||||
stripper.getRulings(),
|
||||
stripper.getMinCharWidth(),
|
||||
stripper.getMaxCharHeight());
|
||||
Page page = blockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
ClassificationPage page = blockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
|
||||
page.setRotation(rotation);
|
||||
page.setLandscape(isLandscape);
|
||||
@ -130,7 +131,7 @@ public class PdfSegmentationService {
|
||||
// If images is ocr needs to be calculated before textBlocks are moved into tables, otherwise findOcr algorithm needs to be adopted.
|
||||
if (pdfImages != null && pdfImages.containsKey(pageNumber)) {
|
||||
page.setImages(pdfImages.get(pageNumber));
|
||||
imageService.findOcr(page);
|
||||
imageServiceResponseAdapter.findOcr(page);
|
||||
}
|
||||
|
||||
tableExtractionService.extractTables(cleanRulings, page);
|
||||
@ -141,7 +142,7 @@ public class PdfSegmentationService {
|
||||
}
|
||||
|
||||
|
||||
private void increaseDocumentStatistics(Page page, Document document) {
|
||||
private void increaseDocumentStatistics(ClassificationPage page, ClassificationDocument document) {
|
||||
|
||||
if (!page.isLandscape()) {
|
||||
document.getFontSizeCounter().addAll(page.getFontSizeCounter().getCountPerValue());
|
||||
@ -152,15 +153,15 @@ public class PdfSegmentationService {
|
||||
}
|
||||
|
||||
|
||||
private void buildPageStatistics(Page page) {
|
||||
private void buildPageStatistics(ClassificationPage page) {
|
||||
|
||||
// Collect all statistics for the page, except from blocks inside tables, as tables will always be added to BodyTextFrame.
|
||||
for (AbstractTextContainer textBlock : page.getTextBlocks()) {
|
||||
if (textBlock instanceof TextBlock) {
|
||||
if (((TextBlock) textBlock).getSequences() == null) {
|
||||
for (AbstractPageBlock textBlock : page.getTextBlocks()) {
|
||||
if (textBlock instanceof TextPageBlock) {
|
||||
if (((TextPageBlock) textBlock).getSequences() == null) {
|
||||
continue;
|
||||
}
|
||||
for (TextPositionSequence word : ((TextBlock) textBlock).getSequences()) {
|
||||
for (TextPositionSequence word : ((TextPageBlock) textBlock).getSequences()) {
|
||||
page.getTextHeightCounter().add(word.getTextHeight());
|
||||
page.getFontCounter().add(word.getFont());
|
||||
page.getFontSizeCounter().add(word.getFontSize());
|
||||
@ -1,4 +1,4 @@
|
||||
package com.iqser.red.service.redaction.v1.server.tableextraction.service;
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.service;
|
||||
|
||||
import java.awt.geom.Line2D;
|
||||
import java.awt.geom.Point2D;
|
||||
@ -12,11 +12,11 @@ import java.util.Map;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfTableCell;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.table.PdfTableCell;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table.CleanRulings;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table.Ruling;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.utils.Utils;
|
||||
import com.iqser.red.service.redaction.v1.server.settings.RedactionServiceSettings;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.CleanRulings;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Ruling;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.utils.Utils;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
@ -1,9 +1,8 @@
|
||||
package com.iqser.red.service.redaction.v1.server.segmentation;
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.service;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
@ -11,17 +10,18 @@ import java.util.stream.Collectors;
|
||||
import org.apache.commons.collections4.CollectionUtils;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Footer;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Header;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Paragraph;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.UnclassifiedText;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.AbstractPageBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationDocument;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationFooter;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationHeader;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationPage;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationSection;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.PageBlockType;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.image.ClassifiedImage;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table.Cell;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table.TablePageBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPageBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.UnclassifiedText;
|
||||
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@ -29,23 +29,23 @@ import lombok.extern.slf4j.Slf4j;
|
||||
@Service
|
||||
public class SectionsBuilderService {
|
||||
|
||||
public void buildSections(Document document) {
|
||||
public void buildSections(ClassificationDocument document) {
|
||||
|
||||
List<AbstractTextContainer> chunkWords = new ArrayList<>();
|
||||
List<Paragraph> chunkBlockList = new ArrayList<>();
|
||||
List<Header> headers = new ArrayList<>();
|
||||
List<Footer> footers = new ArrayList<>();
|
||||
List<AbstractPageBlock> chunkWords = new ArrayList<>();
|
||||
List<ClassificationSection> chunkBlockList = new ArrayList<>();
|
||||
List<ClassificationHeader> headers = new ArrayList<>();
|
||||
List<ClassificationFooter> footers = new ArrayList<>();
|
||||
List<UnclassifiedText> unclassifiedTexts = new ArrayList<>();
|
||||
|
||||
AbstractTextContainer prev = null;
|
||||
AbstractPageBlock prev = null;
|
||||
|
||||
String lastHeadline = "";
|
||||
Table previousTable = null;
|
||||
for (Page page : document.getPages()) {
|
||||
List<TextBlock> header = new ArrayList<>();
|
||||
List<TextBlock> footer = new ArrayList<>();
|
||||
List<TextBlock> unclassifiedText = new ArrayList<>();
|
||||
for (AbstractTextContainer current : page.getTextBlocks()) {
|
||||
TablePageBlock previousTable = null;
|
||||
for (ClassificationPage page : document.getPages()) {
|
||||
List<TextPageBlock> header = new ArrayList<>();
|
||||
List<TextPageBlock> footer = new ArrayList<>();
|
||||
List<TextPageBlock> unclassifiedText = new ArrayList<>();
|
||||
for (AbstractPageBlock current : page.getTextBlocks()) {
|
||||
|
||||
if (current.getClassification() == null) {
|
||||
continue;
|
||||
@ -53,23 +53,23 @@ public class SectionsBuilderService {
|
||||
|
||||
current.setPage(page.getPageNumber());
|
||||
|
||||
if (current.getClassification().equals("Header")) {
|
||||
header.add((TextBlock) current);
|
||||
if (current.getClassification().equals(PageBlockType.HEADER)) {
|
||||
header.add((TextPageBlock) current);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (current.getClassification().equals("Footer")) {
|
||||
footer.add((TextBlock) current);
|
||||
if (current.getClassification().equals(PageBlockType.FOOTER)) {
|
||||
footer.add((TextPageBlock) current);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (current.getClassification().equals("Other")) {
|
||||
unclassifiedText.add((TextBlock) current);
|
||||
if (current.getClassification().equals(PageBlockType.OTHER)) {
|
||||
unclassifiedText.add((TextPageBlock) current);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (prev != null && current.getClassification().startsWith("H ") && !prev.getClassification().startsWith("H ") || !document.isHeadlines()) {
|
||||
Paragraph chunkBlock = buildTextBlock(chunkWords, lastHeadline);
|
||||
if (prev != null && current.getClassification().isHeadline() && !prev.getClassification().isHeadline() || !document.isHeadlines()) {
|
||||
ClassificationSection chunkBlock = buildTextBlock(chunkWords, lastHeadline);
|
||||
chunkBlock.setHeadline(lastHeadline);
|
||||
if (document.isHeadlines()) {
|
||||
lastHeadline = current.getText();
|
||||
@ -80,8 +80,7 @@ public class SectionsBuilderService {
|
||||
previousTable = chunkBlock.getTables().get(chunkBlock.getTables().size() - 1);
|
||||
}
|
||||
}
|
||||
if (current instanceof Table) {
|
||||
Table table = (Table) current;
|
||||
if (current instanceof TablePageBlock table) {
|
||||
// Distribute header information for subsequent tables
|
||||
mergeTableMetadata(table, previousTable);
|
||||
previousTable = table;
|
||||
@ -91,69 +90,72 @@ public class SectionsBuilderService {
|
||||
}
|
||||
|
||||
if (!header.isEmpty()) {
|
||||
headers.add(new Header(header));
|
||||
headers.add(new ClassificationHeader(header));
|
||||
}
|
||||
if (!footer.isEmpty()) {
|
||||
footers.add(new Footer(footer));
|
||||
footers.add(new ClassificationFooter(footer));
|
||||
}
|
||||
if (!unclassifiedText.isEmpty()) {
|
||||
unclassifiedTexts.add(new UnclassifiedText(unclassifiedText));
|
||||
}
|
||||
}
|
||||
|
||||
Paragraph chunkBlock = buildTextBlock(chunkWords, lastHeadline);
|
||||
ClassificationSection chunkBlock = buildTextBlock(chunkWords, lastHeadline);
|
||||
chunkBlock.setHeadline(lastHeadline);
|
||||
chunkBlockList.add(chunkBlock);
|
||||
|
||||
document.setParagraphs(chunkBlockList);
|
||||
document.setSections(chunkBlockList);
|
||||
document.setHeaders(headers);
|
||||
document.setFooters(footers);
|
||||
document.setUnclassifiedTexts(unclassifiedTexts);
|
||||
}
|
||||
|
||||
|
||||
public void addImagesToSections(Document document) {
|
||||
public void addImagesToSections(ClassificationDocument document) {
|
||||
|
||||
Map<Integer, List<Paragraph>> paragraphMap = new HashMap<>();
|
||||
for (Paragraph paragraph : document.getParagraphs()) {
|
||||
for (AbstractTextContainer container : paragraph.getPageBlocks()) {
|
||||
|
||||
paragraphMap.computeIfAbsent(container.getPage(), c -> new ArrayList<>()).add(paragraph);
|
||||
Map<Integer, List<ClassificationSection>> sectionMap = new HashMap<>();
|
||||
for (ClassificationSection section : document.getSections()) {
|
||||
for (AbstractPageBlock container : section.getPageBlocks()) {
|
||||
|
||||
List<ClassificationSection> sectionsOnPage = sectionMap.computeIfAbsent(container.getPage(), c -> new ArrayList<>());
|
||||
if (sectionsOnPage.contains(section)) {
|
||||
continue;
|
||||
}
|
||||
sectionsOnPage.add(section);
|
||||
}
|
||||
}
|
||||
|
||||
if (paragraphMap.isEmpty()) {
|
||||
Paragraph paragraph = new Paragraph();
|
||||
document.getParagraphs().add(paragraph);
|
||||
paragraphMap.computeIfAbsent(1, x -> new ArrayList<>()).add(paragraph);
|
||||
if (sectionMap.isEmpty()) {
|
||||
ClassificationSection section = new ClassificationSection();
|
||||
document.getSections().add(section);
|
||||
sectionMap.computeIfAbsent(1, x -> new ArrayList<>()).add(section);
|
||||
}
|
||||
|
||||
// first page is always a paragraph, else we can't process pages 1..N,
|
||||
// where N is the first found page with a paragraph
|
||||
if (paragraphMap.get(1) == null) {
|
||||
Paragraph paragraph = new Paragraph();
|
||||
document.getParagraphs().add(paragraph);
|
||||
paragraphMap.computeIfAbsent(1, x -> new ArrayList<>()).add(paragraph);
|
||||
if (sectionMap.get(1) == null) {
|
||||
ClassificationSection section = new ClassificationSection();
|
||||
document.getSections().add(section);
|
||||
sectionMap.computeIfAbsent(1, x -> new ArrayList<>()).add(section);
|
||||
}
|
||||
|
||||
for (Page page : document.getPages()) {
|
||||
for (PdfImage image : page.getImages()) {
|
||||
List<Paragraph> paragraphsOnPage = paragraphMap.get(page.getPageNumber());
|
||||
if (paragraphsOnPage == null) {
|
||||
for (ClassificationPage page : document.getPages()) {
|
||||
for (ClassifiedImage image : page.getImages()) {
|
||||
List<ClassificationSection> sectionsOnPage = sectionMap.get(page.getPageNumber());
|
||||
if (sectionsOnPage == null) {
|
||||
int i = page.getPageNumber();
|
||||
while (paragraphsOnPage == null) {
|
||||
paragraphsOnPage = paragraphMap.get(i);
|
||||
while (sectionsOnPage == null) {
|
||||
sectionsOnPage = sectionMap.get(i);
|
||||
i--;
|
||||
}
|
||||
}
|
||||
for (Paragraph paragraph : paragraphsOnPage) {
|
||||
for (ClassificationSection section : sectionsOnPage) {
|
||||
Float xMin = null;
|
||||
Float yMin = null;
|
||||
Float xMax = null;
|
||||
Float yMax = null;
|
||||
|
||||
for (AbstractTextContainer abs : paragraph.getPageBlocks()) {
|
||||
for (AbstractPageBlock abs : section.getPageBlocks()) {
|
||||
if (abs.getPage() != page.getPageNumber()) {
|
||||
continue;
|
||||
}
|
||||
@ -197,21 +199,21 @@ public class SectionsBuilderService {
|
||||
|
||||
if (xMin != null && xMax != null && yMin != null && yMax != null && image.getPosition().getX() >= xMin && image.getPosition()
|
||||
.getX() <= xMax && image.getPosition().getY() >= yMin && image.getPosition().getY() <= yMax) {
|
||||
paragraph.getImages().add(image);
|
||||
image.setAppendedToParagraph(true);
|
||||
section.getImages().add(image);
|
||||
image.setAppendedToSection(true);
|
||||
}
|
||||
}
|
||||
if (!image.isAppendedToParagraph()) {
|
||||
if (!image.isAppendedToSection()) {
|
||||
log.debug("Image uses first paragraph");
|
||||
paragraphsOnPage.get(0).getImages().add(image);
|
||||
image.setAppendedToParagraph(true);
|
||||
sectionsOnPage.get(0).getImages().add(image);
|
||||
image.setAppendedToSection(true);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void mergeTableMetadata(Table currentTable, Table previousTable) {
|
||||
private void mergeTableMetadata(TablePageBlock currentTable, TablePageBlock previousTable) {
|
||||
|
||||
// Distribute header information for subsequent tables
|
||||
if (previousTable != null && hasInvalidHeaderInformation(currentTable) && hasValidHeaderInformation(previousTable)) {
|
||||
@ -239,86 +241,44 @@ public class SectionsBuilderService {
|
||||
}
|
||||
|
||||
|
||||
private Paragraph buildTextBlock(List<AbstractTextContainer> wordBlockList, String lastHeadline) {
|
||||
private ClassificationSection buildTextBlock(List<AbstractPageBlock> wordBlockList, String lastHeadline) {
|
||||
|
||||
Paragraph paragraph = new Paragraph();
|
||||
TextBlock textBlock = null;
|
||||
ClassificationSection section = new ClassificationSection();
|
||||
|
||||
int pageBefore = -1;
|
||||
boolean splitByTable = false;
|
||||
for (AbstractPageBlock container : wordBlockList) {
|
||||
if (container instanceof TablePageBlock table) {
|
||||
|
||||
Iterator<AbstractTextContainer> itty = wordBlockList.iterator();
|
||||
boolean alreadyAdded = false;
|
||||
AbstractTextContainer previous = null;
|
||||
while (itty.hasNext()) {
|
||||
AbstractTextContainer container = itty.next();
|
||||
|
||||
if (container instanceof Table) {
|
||||
Table table = (Table) container;
|
||||
splitByTable = true;
|
||||
|
||||
if (previous != null && previous.getText().startsWith("Table ")) {
|
||||
table.setHeadline(previous.getText());
|
||||
if (lastHeadline == null || lastHeadline.isEmpty()) {
|
||||
table.setHeadline("Text in table");
|
||||
} else {
|
||||
if (lastHeadline == null || lastHeadline.isEmpty()) {
|
||||
table.setHeadline("Text in table");
|
||||
} else {
|
||||
table.setHeadline("Table in: " + lastHeadline);
|
||||
}
|
||||
table.setHeadline("TablePageBlock in: " + lastHeadline);
|
||||
}
|
||||
|
||||
if (textBlock != null && !alreadyAdded) {
|
||||
paragraph.getPageBlocks().add(textBlock);
|
||||
alreadyAdded = true;
|
||||
}
|
||||
paragraph.getPageBlocks().add(table);
|
||||
section.getPageBlocks().add(table);
|
||||
continue;
|
||||
}
|
||||
|
||||
TextBlock wordBlock = (TextBlock) container;
|
||||
|
||||
if (textBlock == null) {
|
||||
textBlock = new TextBlock(wordBlock.getMinX(), wordBlock.getMaxX(), wordBlock.getMinY(), wordBlock.getMaxY(), wordBlock.getSequences(), wordBlock.getRotation());
|
||||
textBlock.setPage(wordBlock.getPage());
|
||||
} else if (splitByTable) {
|
||||
textBlock = new TextBlock(wordBlock.getMinX(), wordBlock.getMaxX(), wordBlock.getMinY(), wordBlock.getMaxY(), wordBlock.getSequences(), wordBlock.getRotation());
|
||||
textBlock.setPage(wordBlock.getPage());
|
||||
alreadyAdded = false;
|
||||
} else if (pageBefore != -1 && wordBlock.getPage() != pageBefore) {
|
||||
textBlock.setPage(pageBefore);
|
||||
paragraph.getPageBlocks().add(textBlock);
|
||||
textBlock = new TextBlock(wordBlock.getMinX(), wordBlock.getMaxX(), wordBlock.getMinY(), wordBlock.getMaxY(), wordBlock.getSequences(), wordBlock.getRotation());
|
||||
textBlock.setPage(wordBlock.getPage());
|
||||
} else {
|
||||
TextBlock spatialEntity = textBlock.union(wordBlock);
|
||||
textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight());
|
||||
}
|
||||
pageBefore = wordBlock.getPage();
|
||||
splitByTable = false;
|
||||
previous = container;
|
||||
TextPageBlock wordBlock = (TextPageBlock) container;
|
||||
section.getPageBlocks().add(wordBlock);
|
||||
}
|
||||
|
||||
if (textBlock != null && !alreadyAdded) {
|
||||
paragraph.getPageBlocks().add(textBlock);
|
||||
}
|
||||
return paragraph;
|
||||
return section;
|
||||
}
|
||||
|
||||
|
||||
private boolean hasValidHeaderInformation(Table table) {
|
||||
private boolean hasValidHeaderInformation(TablePageBlock table) {
|
||||
|
||||
return !hasInvalidHeaderInformation(table);
|
||||
}
|
||||
|
||||
|
||||
private boolean hasInvalidHeaderInformation(Table table) {
|
||||
private boolean hasInvalidHeaderInformation(TablePageBlock table) {
|
||||
|
||||
return table.getRows().stream().flatMap(row -> row.stream().filter(cell -> CollectionUtils.isNotEmpty(cell.getHeaderCells()))).findAny().isEmpty();
|
||||
|
||||
}
|
||||
|
||||
|
||||
private List<Cell> getRowWithNonHeaderCells(Table table) {
|
||||
private List<Cell> getRowWithNonHeaderCells(TablePageBlock table) {
|
||||
|
||||
for (int i = table.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
|
||||
List<Cell> row = table.getRows().get(i);
|
||||
@ -1,4 +1,4 @@
|
||||
package com.iqser.red.service.redaction.v1.server.tableextraction.service;
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.service;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
import java.util.ArrayList;
|
||||
@ -13,15 +13,15 @@ import java.util.stream.Collectors;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.CleanRulings;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Rectangle;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Ruling;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.utils.Utils;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.AbstractPageBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationPage;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table.Cell;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table.CleanRulings;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table.Rectangle;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table.Ruling;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table.TablePageBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPageBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.utils.Utils;
|
||||
|
||||
@Service
|
||||
public class TableExtractionService {
|
||||
@ -73,20 +73,20 @@ public class TableExtractionService {
|
||||
* 90 -> UpperLeft
|
||||
* 180 -> UpperRight
|
||||
* 270 -> LowerRight
|
||||
*
|
||||
* <p>
|
||||
* DirAdj (Text direction adjusted) values can not be used here.
|
||||
*
|
||||
* @param cleanRulings The lines used to build the table.
|
||||
* @param page Page object that contains textblocks and statistics.
|
||||
* @param page Page object that contains textblocks and statistics.
|
||||
*/
|
||||
public void extractTables(CleanRulings cleanRulings, Page page) {
|
||||
public void extractTables(CleanRulings cleanRulings, ClassificationPage page) {
|
||||
|
||||
List<Cell> cells = findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
|
||||
List<TextBlock> toBeRemoved = new ArrayList<>();
|
||||
List<TextPageBlock> toBeRemoved = new ArrayList<>();
|
||||
|
||||
for (AbstractTextContainer abstractTextContainer : page.getTextBlocks()) {
|
||||
TextBlock textBlock = (TextBlock) abstractTextContainer;
|
||||
for (AbstractPageBlock abstractPageBlock : page.getTextBlocks()) {
|
||||
TextPageBlock textBlock = (TextPageBlock) abstractPageBlock;
|
||||
for (Cell cell : cells) {
|
||||
if (cell.hasMinimumSize() && cell.intersects(textBlock.getPdfMinX(),
|
||||
textBlock.getPdfMinY(),
|
||||
@ -104,7 +104,7 @@ public class TableExtractionService {
|
||||
|
||||
List<Rectangle> spreadsheetAreas = findSpreadsheetsFromCells(cells).stream().filter(r -> r.getWidth() > 0f && r.getHeight() > 0f).collect(Collectors.toList());
|
||||
|
||||
List<Table> tables = new ArrayList<>();
|
||||
List<TablePageBlock> tables = new ArrayList<>();
|
||||
for (Rectangle area : spreadsheetAreas) {
|
||||
|
||||
List<Cell> overlappingCells = new ArrayList<>();
|
||||
@ -113,16 +113,16 @@ public class TableExtractionService {
|
||||
overlappingCells.add(c);
|
||||
}
|
||||
}
|
||||
tables.add(new Table(overlappingCells, area, page.getRotation()));
|
||||
tables.add(new TablePageBlock(overlappingCells, area, page.getRotation()));
|
||||
}
|
||||
|
||||
for (Table table : tables) {
|
||||
for (TablePageBlock table : tables) {
|
||||
int position = -1;
|
||||
|
||||
Iterator<AbstractTextContainer> itty = page.getTextBlocks().iterator();
|
||||
Iterator<AbstractPageBlock> itty = page.getTextBlocks().iterator();
|
||||
while (itty.hasNext()) {
|
||||
AbstractTextContainer textBlock = itty.next();
|
||||
if (textBlock instanceof TextBlock ? table.containsBlock((TextBlock) textBlock) : table.contains(textBlock) && position == -1) {
|
||||
AbstractPageBlock textBlock = itty.next();
|
||||
if (textBlock instanceof TextPageBlock ? table.containsBlock((TextPageBlock) textBlock) : table.contains(textBlock) && position == -1) {
|
||||
position = page.getTextBlocks().indexOf(textBlock);
|
||||
}
|
||||
}
|
||||
@ -9,7 +9,7 @@
|
||||
* This program is free software under the LGPL (>=v2.1)
|
||||
* Read the file LICENSE.txt coming with the sources for details.
|
||||
*/
|
||||
package com.iqser.red.service.redaction.v1.server.tableextraction.utils;
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.utils;
|
||||
|
||||
import java.awt.geom.Line2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
@ -1,4 +1,4 @@
|
||||
package com.iqser.red.service.redaction.v1.server.tableextraction.utils;
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.utils;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
@ -23,9 +23,11 @@ public class FileUtils {
|
||||
/**
|
||||
* Deletes a file; logs a message with the reason if the deletion fails.
|
||||
* This method is null-safe.
|
||||
*
|
||||
* @param file The file to delete. Can be null.
|
||||
*/
|
||||
public void deleteFile(File file) {
|
||||
|
||||
if (file != null) {
|
||||
try {
|
||||
Files.deleteIfExists(file.toPath());
|
||||
@ -1,7 +1,7 @@
|
||||
package com.iqser.red.service.redaction.v1.server.classification.utils;
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.utils;
|
||||
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPageBlock;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@ -11,7 +11,7 @@ public final class PositionUtils {
|
||||
|
||||
// TODO This currently uses pdf coord system. In the futher this should use java coord system.
|
||||
// Note: DirAdj (TextDirection Adjusted) can not be user for this.
|
||||
public boolean isWithinBodyTextFrame(Rectangle btf, TextBlock textBlock) {
|
||||
public boolean isWithinBodyTextFrame(Rectangle btf, TextPageBlock textBlock) {
|
||||
|
||||
if (btf == null || textBlock == null) {
|
||||
return false;
|
||||
@ -32,7 +32,7 @@ public final class PositionUtils {
|
||||
|
||||
// TODO This currently uses pdf coord system. In the futher this should use java coord system.
|
||||
// Note: DirAdj (TextDirection Adjusted) can not be user for this.
|
||||
public boolean isOverBodyTextFrame(Rectangle btf, TextBlock textBlock, int rotation) {
|
||||
public boolean isOverBodyTextFrame(Rectangle btf, TextPageBlock textBlock, int rotation) {
|
||||
|
||||
if (btf == null || textBlock == null) {
|
||||
return false;
|
||||
@ -58,9 +58,10 @@ public final class PositionUtils {
|
||||
|
||||
}
|
||||
|
||||
|
||||
// TODO This currently uses pdf coord system. In the futher this should use java coord system.
|
||||
// Note: DirAdj (TextDirection Adjusted) can not be user for this.
|
||||
public boolean isUnderBodyTextFrame(Rectangle btf, TextBlock textBlock, int rotation) {
|
||||
public boolean isUnderBodyTextFrame(Rectangle btf, TextPageBlock textBlock, int rotation) {
|
||||
|
||||
if (btf == null || textBlock == null) {
|
||||
return false;
|
||||
@ -86,9 +87,10 @@ public final class PositionUtils {
|
||||
|
||||
}
|
||||
|
||||
|
||||
// TODO This currently uses pdf coord system. In the futher this should use java coord system.
|
||||
// Note: DirAdj (TextDirection Adjusted) can not be user for this.
|
||||
public boolean isTouchingUnderBodyTextFrame(Rectangle btf, TextBlock textBlock) {
|
||||
public boolean isTouchingUnderBodyTextFrame(Rectangle btf, TextPageBlock textBlock) {
|
||||
|
||||
//TODO Currently this is not working for rotated pages.
|
||||
|
||||
@ -105,13 +107,13 @@ public final class PositionUtils {
|
||||
}
|
||||
|
||||
|
||||
public float getHeightDifferenceBetweenChunkWordAndDocumentWord(TextBlock textBlock, Float documentMostPopularWordHeight) {
|
||||
public float getHeightDifferenceBetweenChunkWordAndDocumentWord(TextPageBlock textBlock, Float documentMostPopularWordHeight) {
|
||||
|
||||
return textBlock.getMostPopularWordHeight() - documentMostPopularWordHeight;
|
||||
}
|
||||
|
||||
|
||||
public Float getApproxLineCount(TextBlock textBlock) {
|
||||
public Float getApproxLineCount(TextPageBlock textBlock) {
|
||||
|
||||
return textBlock.getHeight() / textBlock.getMostPopularWordHeight();
|
||||
}
|
||||
@ -1,4 +1,4 @@
|
||||
package com.iqser.red.service.redaction.v1.server.tableextraction.utils;
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.utils;
|
||||
|
||||
import java.util.ArrayDeque;
|
||||
import java.util.Comparator;
|
||||
@ -1,9 +1,9 @@
|
||||
package com.iqser.red.service.redaction.v1.server.classification.utils;
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.utils;
|
||||
|
||||
import java.awt.geom.Line2D;
|
||||
import java.awt.geom.Point2D;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Ruling;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table.Ruling;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@ -13,7 +13,7 @@ public final class RulingTextDirAdjustUtil {
|
||||
/**
|
||||
* Converts a ruling (line of a table) the same way TextPositions are converted in PDFBox.
|
||||
* This will get the y position of the text, adjusted so that 0,0 is upper left and it is adjusted based on the text direction.
|
||||
*
|
||||
* <p>
|
||||
* See org.apache.pdfbox.text.TextPosition
|
||||
*/
|
||||
public Line2D.Float convertToDirAdj(Ruling ruling, float dir, float pageWidth, float pageHeight) {
|
||||
@ -1,4 +1,4 @@
|
||||
package com.iqser.red.service.redaction.v1.server.tableextraction.utils;
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.utils;
|
||||
|
||||
import java.math.BigDecimal;
|
||||
import java.util.Comparator;
|
||||
@ -0,0 +1,50 @@
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.data;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.List;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.textblock.AtomicTextBlock;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class AtomicPositionBlockData {
|
||||
|
||||
Long id;
|
||||
int[] stringIdxToPositionIdx;
|
||||
float[][] positions;
|
||||
|
||||
|
||||
public static AtomicPositionBlockData fromAtomicTextBlock(AtomicTextBlock atomicTextBlock) {
|
||||
|
||||
return AtomicPositionBlockData.builder()
|
||||
.id(atomicTextBlock.getId())
|
||||
.positions(toPrimitiveFloatMatrix(atomicTextBlock.getPositions()))
|
||||
.stringIdxToPositionIdx(atomicTextBlock.getStringIdxToPositionIdx().stream().mapToInt(Integer::intValue).toArray())
|
||||
.build();
|
||||
}
|
||||
|
||||
|
||||
private static float[][] toPrimitiveFloatMatrix(List<Rectangle2D> positions) {
|
||||
|
||||
float[][] positionMatrix = new float[positions.size()][];
|
||||
for (int i = 0; i < positions.size(); i++) {
|
||||
positionMatrix[i] = toArray(positions.get(i));
|
||||
}
|
||||
return positionMatrix;
|
||||
}
|
||||
|
||||
|
||||
private static float[] toArray(Rectangle2D positions) {
|
||||
|
||||
return new float[]{(float) positions.getMinX(), (float) positions.getMinY(), (float) positions.getWidth(), (float) positions.getHeight()};
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,39 @@
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.data;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.textblock.AtomicTextBlock;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class AtomicTextBlockData {
|
||||
|
||||
Long id;
|
||||
Long page;
|
||||
String searchText;
|
||||
int numberOnPage;
|
||||
int start;
|
||||
int end;
|
||||
int[] lineBreaks;
|
||||
|
||||
|
||||
public static AtomicTextBlockData fromAtomicTextBlock(AtomicTextBlock atomicTextBlock) {
|
||||
|
||||
return AtomicTextBlockData.builder()
|
||||
.id(atomicTextBlock.getId())
|
||||
.page(atomicTextBlock.getPage().getNumber().longValue())
|
||||
.searchText(atomicTextBlock.getSearchText())
|
||||
.numberOnPage(atomicTextBlock.getNumberOnPage())
|
||||
.start(atomicTextBlock.getBoundary().start())
|
||||
.end(atomicTextBlock.getBoundary().end())
|
||||
.lineBreaks(atomicTextBlock.getLineBreaks().stream().mapToInt(Integer::intValue).toArray())
|
||||
.build();
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,43 @@
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.data;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Document;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class DocumentData {
|
||||
|
||||
PageData[] pages;
|
||||
AtomicTextBlockData[] atomicTextBlocks;
|
||||
AtomicPositionBlockData[] atomicPositionBlocks;
|
||||
DocumentTreeData documentTreeData;
|
||||
|
||||
|
||||
public static DocumentData fromDocument(Document document) {
|
||||
|
||||
var atomicPositionBlocks = document.streamTerminalTextBlocksInOrder()
|
||||
.flatMap(textBlock -> textBlock.getAtomicTextBlocks().stream())
|
||||
.distinct()
|
||||
.map(AtomicPositionBlockData::fromAtomicTextBlock)
|
||||
.toArray(AtomicPositionBlockData[]::new);
|
||||
|
||||
var atomicTextBlocks = document.streamTerminalTextBlocksInOrder()
|
||||
.flatMap(textBlock -> textBlock.getAtomicTextBlocks().stream())
|
||||
.distinct()
|
||||
.map(AtomicTextBlockData::fromAtomicTextBlock)
|
||||
.toArray(AtomicTextBlockData[]::new);
|
||||
|
||||
var pages = document.getPages().stream().map(PageData::fromPage).toArray(PageData[]::new);
|
||||
|
||||
var documentTreeData = new DocumentTreeData(DocumentTreeData.EntryData.fromEntry(document.getDocumentTree().getRoot()));
|
||||
return new DocumentData(pages, atomicTextBlocks, atomicPositionBlocks, documentTreeData);
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,128 @@
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.data;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.data.mapper.PropertiesMapper;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.DocumentTree;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Image;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.NodeType;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Page;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Table;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.TableCell;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.textblock.AtomicTextBlock;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.Getter;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@NoArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class DocumentTreeData {
|
||||
|
||||
EntryData root;
|
||||
|
||||
|
||||
public EntryData get(List<Integer> tocId) {
|
||||
|
||||
if (tocId.isEmpty()) {
|
||||
return root;
|
||||
}
|
||||
EntryData entry = root.children.get(tocId.get(0));
|
||||
for (int id : tocId.subList(1, tocId.size())) {
|
||||
entry = entry.children.get(id);
|
||||
}
|
||||
return entry;
|
||||
}
|
||||
|
||||
|
||||
public Stream<EntryData> streamAllEntries() {
|
||||
|
||||
return Stream.concat(Stream.of(root), root.children.stream()).flatMap(DocumentTreeData::flatten);
|
||||
}
|
||||
|
||||
|
||||
public String toString() {
|
||||
|
||||
return String.join("\n", streamAllEntries().map(EntryData::toString).toList());
|
||||
}
|
||||
|
||||
|
||||
private static Stream<EntryData> flatten(EntryData entry) {
|
||||
|
||||
return Stream.concat(Stream.of(entry), entry.children.stream().flatMap(DocumentTreeData::flatten));
|
||||
}
|
||||
|
||||
|
||||
@Builder
|
||||
@Getter
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public static class EntryData {
|
||||
|
||||
NodeType type;
|
||||
int[] treeId;
|
||||
Long[] atomicBlockIds;
|
||||
Long[] pageNumbers;
|
||||
Map<String, String> properties;
|
||||
List<EntryData> children;
|
||||
|
||||
|
||||
public static EntryData fromEntry(DocumentTree.Entry entry) {
|
||||
|
||||
Long[] atomicBlockIds = toAtomicTextBlockIds(entry);
|
||||
|
||||
Map<String, String> properties = switch (entry.getType()) {
|
||||
case TABLE -> PropertiesMapper.buildTableProperties((Table) entry.getNode());
|
||||
case TABLE_CELL -> PropertiesMapper.buildTableCellProperties((TableCell) entry.getNode());
|
||||
case IMAGE -> PropertiesMapper.buildImageProperties((Image) entry.getNode());
|
||||
default -> new HashMap<>();
|
||||
};
|
||||
var treeId = entry.getTreeId().stream().mapToInt(Integer::intValue).toArray();
|
||||
var pageNumbers = entry.getNode().getPages().stream().map(Page::getNumber).map(Integer::longValue).toArray(Long[]::new);
|
||||
var subEntries = entry.getChildren().stream().map(EntryData::fromEntry).toList();
|
||||
return new EntryData(entry.getType(), treeId, atomicBlockIds, pageNumbers, properties, subEntries);
|
||||
}
|
||||
|
||||
|
||||
private static Long[] toAtomicTextBlockIds(DocumentTree.Entry entry) {
|
||||
|
||||
if (entry.getNode().isLeaf()) {
|
||||
return entry.getNode().getLeafTextBlock().getAtomicTextBlocks().stream().map(AtomicTextBlock::getId).toArray(Long[]::new);
|
||||
} else {
|
||||
return new Long[]{};
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
sb.append("[");
|
||||
for (int i : treeId) {
|
||||
sb.append(i);
|
||||
sb.append(",");
|
||||
}
|
||||
sb.delete(sb.length() - 1, sb.length());
|
||||
sb.append("]: ");
|
||||
|
||||
sb.append(type);
|
||||
sb.append(" atbs = ");
|
||||
sb.append(atomicBlockIds.length);
|
||||
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,28 @@
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.data;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Page;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class PageData {
|
||||
|
||||
int number;
|
||||
int height;
|
||||
int width;
|
||||
int rotation;
|
||||
|
||||
|
||||
public static PageData fromPage(Page page) {
|
||||
|
||||
return new PageData(page.getNumber(), page.getHeight(), page.getWidth(), page.getRotation());
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,198 @@
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.data.mapper;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.HashSet;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.NoSuchElementException;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.data.AtomicPositionBlockData;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.data.AtomicTextBlockData;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.data.DocumentData;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.data.DocumentTreeData;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.data.PageData;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.DocumentTree;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Document;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Footer;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Header;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Headline;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Image;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Page;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Paragraph;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Section;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.SemanticNode;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Table;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.TableCell;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.textblock.AtomicTextBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.textblock.TextBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.textblock.TextBlockCollector;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class DocumentGraphMapper {
|
||||
|
||||
public Document toDocumentGraph(DocumentData documentData) {
|
||||
|
||||
Document document = new Document();
|
||||
DocumentTree documentTree = new DocumentTree(document);
|
||||
Context context = new Context(documentData, documentTree);
|
||||
|
||||
context.pages.addAll(Arrays.stream(documentData.getPages()).map(DocumentGraphMapper::buildPage).toList());
|
||||
|
||||
context.documentTree.getRoot().getChildren().addAll(buildEntries(documentData.getDocumentTreeData().getRoot().getChildren(), context));
|
||||
|
||||
document.setDocumentTree(context.documentTree);
|
||||
document.setPages(new HashSet<>(context.pages));
|
||||
document.setNumberOfPages(documentData.getPages().length);
|
||||
|
||||
document.setTextBlock(document.getTextBlock());
|
||||
return document;
|
||||
}
|
||||
|
||||
|
||||
private List<DocumentTree.Entry> buildEntries(List<DocumentTreeData.EntryData> entries, Context context) {
|
||||
|
||||
List<DocumentTree.Entry> newEntries = new LinkedList<>();
|
||||
for (DocumentTreeData.EntryData entryData : entries) {
|
||||
|
||||
List<Page> pages = Arrays.stream(entryData.getPageNumbers()).map(pageNumber -> getPage(pageNumber, context)).toList();
|
||||
|
||||
SemanticNode node = switch (entryData.getType()) {
|
||||
case SECTION -> buildSection(context);
|
||||
case PARAGRAPH -> buildParagraph(context);
|
||||
case HEADLINE -> buildHeadline(context);
|
||||
case HEADER -> buildHeader(context);
|
||||
case FOOTER -> buildFooter(context);
|
||||
case TABLE -> buildTable(context, entryData.getProperties());
|
||||
case TABLE_CELL -> buildTableCell(context, entryData.getProperties());
|
||||
case IMAGE -> buildImage(context, entryData.getProperties(), entryData.getPageNumbers());
|
||||
default -> throw new UnsupportedOperationException("Not yet implemented for type " + entryData.getType());
|
||||
};
|
||||
|
||||
if (entryData.getAtomicBlockIds().length > 0) {
|
||||
TextBlock textBlock = toTextBlock(entryData.getAtomicBlockIds(), context, node);
|
||||
node.setLeafTextBlock(textBlock);
|
||||
}
|
||||
List<Integer> treeId = Arrays.stream(entryData.getTreeId()).boxed().toList();
|
||||
node.setTreeId(treeId);
|
||||
|
||||
switch (entryData.getType()) {
|
||||
case HEADER -> pages.forEach(page -> page.setHeader((Header) node));
|
||||
case FOOTER -> pages.forEach(page -> page.setFooter((Footer) node));
|
||||
default -> pages.forEach(page -> page.getMainBody().add(node));
|
||||
}
|
||||
|
||||
newEntries.add(DocumentTree.Entry.builder().treeId(treeId).children(buildEntries(entryData.getChildren(), context)).node(node).build());
|
||||
}
|
||||
return newEntries;
|
||||
}
|
||||
|
||||
|
||||
private Headline buildHeadline(Context context) {
|
||||
|
||||
return Headline.builder().documentTree(context.documentTree).build();
|
||||
}
|
||||
|
||||
|
||||
private Image buildImage(Context context, Map<String, String> properties, Long[] pageNumbers) {
|
||||
|
||||
assert pageNumbers.length == 1;
|
||||
Page page = getPage(pageNumbers[0], context);
|
||||
var builder = Image.builder();
|
||||
PropertiesMapper.parseImageProperties(properties, builder);
|
||||
return builder.documentTree(context.documentTree).page(page).build();
|
||||
}
|
||||
|
||||
|
||||
private TableCell buildTableCell(Context context, Map<String, String> properties) {
|
||||
|
||||
TableCell.TableCellBuilder builder = TableCell.builder();
|
||||
PropertiesMapper.parseTableCellProperties(properties, builder);
|
||||
return builder.documentTree(context.documentTree).build();
|
||||
}
|
||||
|
||||
|
||||
private Table buildTable(Context context, Map<String, String> properties) {
|
||||
|
||||
Table.TableBuilder builder = Table.builder();
|
||||
PropertiesMapper.parseTableProperties(properties, builder);
|
||||
return builder.documentTree(context.documentTree).build();
|
||||
}
|
||||
|
||||
|
||||
private Footer buildFooter(Context context) {
|
||||
|
||||
return Footer.builder().documentTree(context.documentTree).build();
|
||||
}
|
||||
|
||||
|
||||
private Header buildHeader(Context context) {
|
||||
|
||||
return Header.builder().documentTree(context.documentTree).build();
|
||||
}
|
||||
|
||||
|
||||
private Section buildSection(Context context) {
|
||||
|
||||
return Section.builder().documentTree(context.documentTree).build();
|
||||
}
|
||||
|
||||
|
||||
private Paragraph buildParagraph(Context context) {
|
||||
|
||||
return Paragraph.builder().documentTree(context.documentTree).build();
|
||||
}
|
||||
|
||||
|
||||
private TextBlock toTextBlock(Long[] atomicTextBlockIds, Context context, SemanticNode parent) {
|
||||
|
||||
return Arrays.stream(atomicTextBlockIds).map(atomicTextBlockId -> getAtomicTextBlock(context, parent, atomicTextBlockId)).collect(new TextBlockCollector());
|
||||
}
|
||||
|
||||
|
||||
private AtomicTextBlock getAtomicTextBlock(Context context, SemanticNode parent, Long atomicTextBlockId) {
|
||||
|
||||
return AtomicTextBlock.fromAtomicTextBlockData(context.atomicTextBlockData.get(Math.toIntExact(atomicTextBlockId)),
|
||||
context.atomicPositionBlockData.get(Math.toIntExact(atomicTextBlockId)),
|
||||
parent,
|
||||
getPage(context.atomicTextBlockData.get(Math.toIntExact(atomicTextBlockId)).getPage(), context));
|
||||
}
|
||||
|
||||
|
||||
private Page buildPage(PageData p) {
|
||||
|
||||
return Page.builder().rotation(p.getRotation()).height(p.getHeight()).width(p.getWidth()).number(p.getNumber()).mainBody(new LinkedList<>()).build();
|
||||
}
|
||||
|
||||
|
||||
private Page getPage(Long pageIndex, Context context) {
|
||||
|
||||
return context.pages.stream()
|
||||
.filter(page -> page.getNumber() == Math.toIntExact(pageIndex))
|
||||
.findFirst()
|
||||
.orElseThrow(() -> new NoSuchElementException(String.format("ClassificationPage with number %d not found", pageIndex)));
|
||||
}
|
||||
|
||||
|
||||
static final class Context {
|
||||
|
||||
private final DocumentTree documentTree;
|
||||
private final List<Page> pages;
|
||||
private final List<AtomicTextBlockData> atomicTextBlockData;
|
||||
private final List<AtomicPositionBlockData> atomicPositionBlockData;
|
||||
|
||||
|
||||
Context(DocumentData documentData, DocumentTree documentTree) {
|
||||
|
||||
this.documentTree = documentTree;
|
||||
this.pages = new LinkedList<>();
|
||||
this.atomicTextBlockData = Arrays.stream(documentData.getAtomicTextBlocks()).toList();
|
||||
this.atomicPositionBlockData = Arrays.stream(documentData.getAtomicPositionBlocks()).toList();
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,110 @@
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.data.mapper;
|
||||
|
||||
import static java.lang.String.format;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Image;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.ImageType;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Table;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.TableCell;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class PropertiesMapper {
|
||||
|
||||
String imageType = "imageType";
|
||||
private final String transparency = "transparency";
|
||||
private final String position = "position";
|
||||
String id = "id";
|
||||
String row = "row";
|
||||
String col = "col";
|
||||
String header = "header";
|
||||
String bBox = "bBox";
|
||||
String numberOfRows = "numberOfRows";
|
||||
String numberOfCols = "numberOfCols";
|
||||
|
||||
|
||||
public Map<String, String> buildImageProperties(Image image) {
|
||||
|
||||
Map<String, String> properties = new HashMap<>();
|
||||
properties.put(imageType, image.getImageType().toString());
|
||||
properties.put(transparency, String.valueOf(image.isTransparent()));
|
||||
properties.put(position, toString(image.getPosition()));
|
||||
properties.put(id, image.getId());
|
||||
return properties;
|
||||
}
|
||||
|
||||
|
||||
private String toString(Rectangle2D rectangle2D) {
|
||||
|
||||
return format("%f,%f,%f,%f", rectangle2D.getX(), rectangle2D.getY(), rectangle2D.getWidth(), rectangle2D.getHeight());
|
||||
}
|
||||
|
||||
|
||||
public Map<String, String> buildTableCellProperties(TableCell tableCell) {
|
||||
|
||||
Map<String, String> properties = new HashMap<>();
|
||||
properties.put(row, String.valueOf(tableCell.getRow()));
|
||||
properties.put(col, String.valueOf(tableCell.getCol()));
|
||||
properties.put(header, String.valueOf(tableCell.isHeader()));
|
||||
|
||||
if (tableCell.getPages().size() > 1 || tableCell.getBBox().keySet().size() > 1) {
|
||||
throw new IllegalArgumentException("TableCell can only occur on a single page!");
|
||||
}
|
||||
String bBoxString = toString(tableCell.getBBox().get(tableCell.getPages().stream().findFirst().get()));
|
||||
properties.put(bBox, bBoxString);
|
||||
|
||||
return properties;
|
||||
}
|
||||
|
||||
|
||||
public Map<String, String> buildTableProperties(Table table) {
|
||||
|
||||
Map<String, String> properties = new HashMap<>();
|
||||
properties.put(numberOfRows, String.valueOf(table.getNumberOfRows()));
|
||||
properties.put(numberOfCols, String.valueOf(table.getNumberOfCols()));
|
||||
return properties;
|
||||
}
|
||||
|
||||
|
||||
public void parseImageProperties(Map<String, String> properties, Image.ImageBuilder builder) {
|
||||
|
||||
builder.imageType(ImageType.fromString(properties.get(imageType)));
|
||||
builder.transparent(Boolean.parseBoolean(properties.get(transparency)));
|
||||
builder.position(parseRectangle2D(properties.get(position)));
|
||||
builder.id(properties.get(id));
|
||||
}
|
||||
|
||||
|
||||
public void parseTableCellProperties(Map<String, String> properties, TableCell.TableCellBuilder builder) {
|
||||
|
||||
builder.row(Integer.parseInt(properties.get(row)));
|
||||
builder.col(Integer.parseInt(properties.get(col)));
|
||||
builder.header(Boolean.parseBoolean(properties.get(header)));
|
||||
builder.bBox(parseRectangle2D(properties.get(bBox)));
|
||||
}
|
||||
|
||||
|
||||
public void parseTableProperties(Map<String, String> properties, Table.TableBuilder builder) {
|
||||
|
||||
builder.numberOfRows(Integer.parseInt(properties.get(numberOfRows)));
|
||||
builder.numberOfCols(Integer.parseInt(properties.get(numberOfCols)));
|
||||
}
|
||||
|
||||
|
||||
private Rectangle2D parseRectangle2D(String bBox) {
|
||||
|
||||
List<Float> floats = Arrays.stream(bBox.split(",")).map(Float::parseFloat).toList();
|
||||
return new Rectangle2D.Float(floats.get(0), floats.get(1), floats.get(2), floats.get(3));
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,246 @@
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.factory;
|
||||
|
||||
import static java.lang.String.format;
|
||||
import static java.util.stream.Collectors.groupingBy;
|
||||
import static java.util.stream.Collectors.toList;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.NoSuchElementException;
|
||||
import java.util.Set;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.AbstractPageBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationDocument;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationFooter;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationHeader;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationPage;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.image.ClassifiedImage;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPageBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.DocumentTree;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Document;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Footer;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.GenericSemanticNode;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Header;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Headline;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Image;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Page;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Paragraph;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Section;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.textblock.AtomicTextBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.utils.TextPositionOperations;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.IdBuilder;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Getter;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class DocumentGraphFactory {
|
||||
|
||||
public Document buildDocumentGraph(ClassificationDocument document) {
|
||||
|
||||
Document documentGraph = new Document();
|
||||
Context context = new Context(documentGraph);
|
||||
|
||||
document.getPages().forEach(context::buildAndAddPageWithCounter);
|
||||
document.getSections().stream().flatMap(section -> section.getImages().stream()).forEach(image -> context.getImages().add(image));
|
||||
addSections(document, context);
|
||||
addHeaderAndFooterToEachPage(document, context);
|
||||
|
||||
documentGraph.setNumberOfPages(context.pages.size());
|
||||
documentGraph.setPages(context.pages.keySet());
|
||||
documentGraph.setDocumentTree(context.documentTree);
|
||||
documentGraph.setTextBlock(documentGraph.getTextBlock());
|
||||
return documentGraph;
|
||||
}
|
||||
|
||||
|
||||
private void addSections(ClassificationDocument document, Context context) {
|
||||
|
||||
document.getSections().forEach(section -> SectionNodeFactory.addSection(null, section.getPageBlocks(), section.getImages(), context));
|
||||
}
|
||||
|
||||
|
||||
public void addParagraphOrHeadline(GenericSemanticNode parentNode, TextPageBlock originalTextBlock, Context context, List<TextPageBlock> textBlocksToMerge) {
|
||||
|
||||
Page page = context.getPage(originalTextBlock.getPage());
|
||||
|
||||
GenericSemanticNode node;
|
||||
if (originalTextBlock.isHeadline()) {
|
||||
node = Headline.builder().documentTree(context.getDocumentTree()).build();
|
||||
} else {
|
||||
node = Paragraph.builder().documentTree(context.getDocumentTree()).build();
|
||||
}
|
||||
|
||||
page.getMainBody().add(node);
|
||||
|
||||
List<TextPageBlock> textBlocks = new ArrayList<>(textBlocksToMerge);
|
||||
textBlocks.add(originalTextBlock);
|
||||
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(textBlocks), node, context, page);
|
||||
List<Integer> treeId = context.documentTree.createNewChildEntryAndReturnId(parentNode, node);
|
||||
node.setLeafTextBlock(textBlock);
|
||||
node.setTreeId(treeId);
|
||||
}
|
||||
|
||||
|
||||
public void addImage(Section section, ClassifiedImage image, Context context) {
|
||||
|
||||
Rectangle2D position = image.getPosition();
|
||||
Page page = context.getPage(image.getPage());
|
||||
Image imageNode = Image.builder()
|
||||
.id(IdBuilder.buildId(Set.of(page), List.of(position)))
|
||||
.imageType(image.getImageType())
|
||||
.position(position)
|
||||
.transparent(image.isHasTransparency())
|
||||
.page(page)
|
||||
.documentTree(context.getDocumentTree())
|
||||
.build();
|
||||
page.getMainBody().add(imageNode);
|
||||
|
||||
List<Integer> tocId = context.getDocumentTree().createNewChildEntryAndReturnId(section, imageNode);
|
||||
imageNode.setTreeId(tocId);
|
||||
}
|
||||
|
||||
|
||||
private void addHeaderAndFooterToEachPage(ClassificationDocument document, Context context) {
|
||||
|
||||
Map<Integer, List<TextPageBlock>> headers = document.getHeaders()
|
||||
.stream()
|
||||
.map(ClassificationHeader::getTextBlocks)
|
||||
.flatMap(List::stream)
|
||||
.collect(groupingBy(AbstractPageBlock::getPage, toList()));
|
||||
|
||||
Map<Integer, List<TextPageBlock>> footers = document.getFooters()
|
||||
.stream()
|
||||
.map(ClassificationFooter::getTextBlocks)
|
||||
.flatMap(List::stream)
|
||||
.collect(groupingBy(AbstractPageBlock::getPage, toList()));
|
||||
|
||||
for (int pageIndex = 1; pageIndex <= document.getPages().size(); pageIndex++) {
|
||||
if (headers.containsKey(pageIndex)) {
|
||||
addHeader(headers.get(pageIndex), context);
|
||||
} else {
|
||||
addEmptyHeader(pageIndex, context);
|
||||
}
|
||||
}
|
||||
|
||||
for (int pageIndex = 1; pageIndex <= document.getPages().size(); pageIndex++) {
|
||||
if (footers.containsKey(pageIndex)) {
|
||||
addFooter(footers.get(pageIndex), context);
|
||||
} else {
|
||||
addEmptyFooter(pageIndex, context);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void addFooter(List<TextPageBlock> textBlocks, Context context) {
|
||||
|
||||
Page page = context.getPage(textBlocks.get(0).getPage());
|
||||
Footer footer = Footer.builder().documentTree(context.getDocumentTree()).build();
|
||||
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(textBlocks),
|
||||
footer,
|
||||
context,
|
||||
page);
|
||||
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(footer);
|
||||
footer.setTreeId(tocId);
|
||||
footer.setLeafTextBlock(textBlock);
|
||||
page.setFooter(footer);
|
||||
}
|
||||
|
||||
|
||||
public void addHeader(List<TextPageBlock> textBlocks, Context context) {
|
||||
|
||||
Page page = context.getPage(textBlocks.get(0).getPage());
|
||||
Header header = Header.builder().documentTree(context.getDocumentTree()).build();
|
||||
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(textBlocks), header, 0, page);
|
||||
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(header);
|
||||
header.setTreeId(tocId);
|
||||
header.setLeafTextBlock(textBlock);
|
||||
page.setHeader(header);
|
||||
}
|
||||
|
||||
|
||||
private void addEmptyFooter(int pageIndex, Context context) {
|
||||
|
||||
Page page = context.getPage(pageIndex);
|
||||
Footer footer = Footer.builder().documentTree(context.getDocumentTree()).build();
|
||||
AtomicTextBlock textBlock = context.textBlockFactory.emptyTextBlock(footer, context, page);
|
||||
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(footer);
|
||||
footer.setTreeId(tocId);
|
||||
footer.setLeafTextBlock(textBlock);
|
||||
page.setFooter(footer);
|
||||
}
|
||||
|
||||
|
||||
private void addEmptyHeader(int pageIndex, Context context) {
|
||||
|
||||
Page page = context.getPage(pageIndex);
|
||||
Header header = Header.builder().documentTree(context.getDocumentTree()).build();
|
||||
AtomicTextBlock textBlock = context.textBlockFactory.emptyTextBlock(header, 0, page);
|
||||
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(header);
|
||||
header.setTreeId(tocId);
|
||||
header.setLeafTextBlock(textBlock);
|
||||
page.setHeader(header);
|
||||
}
|
||||
|
||||
|
||||
@Getter
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public final class Context {
|
||||
|
||||
DocumentTree documentTree;
|
||||
Map<Page, Integer> pages;
|
||||
List<Section> sections;
|
||||
List<ClassifiedImage> images;
|
||||
TextBlockFactory textBlockFactory;
|
||||
|
||||
|
||||
public Context(Document document) {
|
||||
|
||||
documentTree = new DocumentTree(document);
|
||||
pages = new HashMap<>();
|
||||
sections = new LinkedList<>();
|
||||
images = new LinkedList<>();
|
||||
textBlockFactory = new TextBlockFactory();
|
||||
}
|
||||
|
||||
|
||||
public void buildAndAddPageWithCounter(ClassificationPage classificationPage) {
|
||||
|
||||
Page page = Page.fromClassificationPage(classificationPage);
|
||||
//this counter counts the TextBlocks per page
|
||||
//initial value is set to 1, because 0 is reserved for Header
|
||||
pages.put(page, 1);
|
||||
}
|
||||
|
||||
|
||||
public int getAndIncrementTextBlockNumberOnPage(Page page) {
|
||||
|
||||
Integer textBlockNumberOnPage = pages.get(page);
|
||||
pages.merge(page, 1, Integer::sum);
|
||||
return textBlockNumberOnPage;
|
||||
}
|
||||
|
||||
|
||||
public Page getPage(int pageIndex) {
|
||||
|
||||
return pages.keySet()
|
||||
.stream()
|
||||
.filter(page -> page.getNumber() == pageIndex)
|
||||
.findFirst()
|
||||
.orElseThrow(() -> new NoSuchElementException(format("ClassificationPage with number %d not found", pageIndex)));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,33 @@
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.factory;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.Builder;
|
||||
import lombok.Getter;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Builder
|
||||
@Getter
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class SearchTextWithTextPositionDto {
|
||||
|
||||
String searchText;
|
||||
List<Integer> lineBreaks;
|
||||
List<Integer> stringCoordsToPositionCoords;
|
||||
List<Rectangle2D> positions;
|
||||
|
||||
|
||||
public static SearchTextWithTextPositionDto empty() {
|
||||
|
||||
return SearchTextWithTextPositionDto.builder()
|
||||
.searchText("")
|
||||
.lineBreaks(Collections.emptyList())
|
||||
.positions(Collections.emptyList())
|
||||
.stringCoordsToPositionCoords(Collections.emptyList())
|
||||
.build();
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,185 @@
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.factory;
|
||||
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.RedTextPosition;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextDirection;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPositionSequence;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class SearchTextWithTextPositionFactory {
|
||||
|
||||
public final int HEIGHT_PADDING = 2;
|
||||
// when checking for a hyphen linebreak, we need to check after a linebreak if the last hyphen was less than three symbols away.
|
||||
// We detect a linebreak as either a "\n" character or if two adjacent symbol's position differ in y-coordinates by at least one character height.
|
||||
// If there is a hyphen linebreak, the hyphen will be 1 position in front of a "\n" or 2 positions in front of the character which has a lower y-coordinate
|
||||
// This is why, we need to initialize this to < -2, otherwise, if the very first symbol is a \n we would detect a hyphen linebreak that isn't there.
|
||||
// Also, Integer.MIN_VALUE is a bad idea due to potential overflow during arithmetic operations. This is why the default should be -3.
|
||||
public final int MAX_HYPHEN_LINEBREAK_DISTANCE = 3;
|
||||
|
||||
|
||||
public SearchTextWithTextPositionDto buildSearchTextToTextPositionModel(List<TextPositionSequence> sequences) {
|
||||
|
||||
if (sequences.isEmpty() || sequences.stream().allMatch(sequence -> sequence.getTextPositions().isEmpty())) {
|
||||
return SearchTextWithTextPositionDto.empty();
|
||||
}
|
||||
|
||||
Context context = new Context();
|
||||
|
||||
RedTextPosition currentTextPosition = sequences.get(0).getTextPositions().get(0);
|
||||
RedTextPosition previousTextPosition = RedTextPosition.builder().unicode(" ").position(currentTextPosition.getPosition()).build();
|
||||
|
||||
for (TextPositionSequence word : sequences) {
|
||||
for (int i = 0; i < word.getTextPositions().size(); ++i) {
|
||||
|
||||
currentTextPosition = word.getTextPositions().get(i);
|
||||
if (isLineBreak(currentTextPosition, previousTextPosition)) {
|
||||
removeHyphenLinebreaks(context);
|
||||
context.lineBreaksStringIdx.add(context.stringIdx);
|
||||
}
|
||||
if (!isRepeatedWhitespace(currentTextPosition.getUnicode(), previousTextPosition.getUnicode())) {
|
||||
if (isHyphen(currentTextPosition.getUnicode())) {
|
||||
context.lastHyphenIdx = context.stringIdx;
|
||||
}
|
||||
appendCurrentTextPosition(context, currentTextPosition);
|
||||
}
|
||||
|
||||
previousTextPosition = currentTextPosition;
|
||||
++context.positionIdx;
|
||||
}
|
||||
|
||||
previousTextPosition = RedTextPosition.builder().unicode(" ").position(previousTextPosition.getPosition()).build();
|
||||
context.stringBuilder.append(" ");
|
||||
context.stringIdxToPositionIdx.add(context.positionIdx);
|
||||
++context.stringIdx;
|
||||
}
|
||||
|
||||
assert context.stringBuilder.length() == context.stringIdxToPositionIdx.size();
|
||||
|
||||
List<Rectangle2D> positions = sequences.stream()
|
||||
.flatMap(sequence -> sequence.getTextPositions().stream().map(textPosition -> mapRedTextPositionToInitialUserSpace(textPosition, sequence)))
|
||||
.toList();
|
||||
|
||||
return SearchTextWithTextPositionDto.builder()
|
||||
.searchText(context.stringBuilder.toString())
|
||||
.lineBreaks(context.lineBreaksStringIdx)
|
||||
.stringCoordsToPositionCoords(context.stringIdxToPositionIdx)
|
||||
.positions(positions)
|
||||
.build();
|
||||
}
|
||||
|
||||
|
||||
private void appendCurrentTextPosition(Context context, RedTextPosition currentTextPosition) {
|
||||
|
||||
context.stringBuilder.append(currentTextPosition.getUnicode());
|
||||
|
||||
// unicode characters with more than 16-bit encoding have a length > 1 in java strings
|
||||
for (int j = 0; j < currentTextPosition.getUnicode().length(); j++) {
|
||||
context.stringIdxToPositionIdx.add(context.positionIdx);
|
||||
}
|
||||
context.stringIdx += currentTextPosition.getUnicode().length();
|
||||
}
|
||||
|
||||
|
||||
private void removeHyphenLinebreaks(Context context) {
|
||||
|
||||
if (lastHyphenDirectlyBeforeLineBreak(context)) {
|
||||
context.stringBuilder.delete(context.lastHyphenIdx, context.stringBuilder.length());
|
||||
context.stringIdxToPositionIdx = context.stringIdxToPositionIdx.subList(0, context.lastHyphenIdx);
|
||||
context.stringIdx = context.lastHyphenIdx;
|
||||
context.lastHyphenIdx = -MAX_HYPHEN_LINEBREAK_DISTANCE;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private boolean lastHyphenDirectlyBeforeLineBreak(Context context) {
|
||||
|
||||
return context.stringIdx - context.lastHyphenIdx < MAX_HYPHEN_LINEBREAK_DISTANCE;
|
||||
}
|
||||
|
||||
|
||||
private boolean isLineBreak(RedTextPosition currentTextPosition, RedTextPosition previousTextPosition) {
|
||||
|
||||
return Objects.equals(currentTextPosition.getUnicode(), "\n") || isDeltaYLargerThanTextHeight(currentTextPosition, previousTextPosition);
|
||||
}
|
||||
|
||||
|
||||
private boolean isDeltaYLargerThanTextHeight(RedTextPosition currentPosition, RedTextPosition previousPosition) {
|
||||
|
||||
if (previousPosition == null) {
|
||||
return false;
|
||||
}
|
||||
|
||||
float deltaY = Math.abs(currentPosition.getYDirAdj() - previousPosition.getYDirAdj());
|
||||
return deltaY >= currentPosition.getHeightDir();
|
||||
}
|
||||
|
||||
|
||||
private boolean isRepeatedWhitespace(String currentUnicode, String previousUnicode) {
|
||||
|
||||
return Objects.equals(previousUnicode, " ") && Objects.equals(currentUnicode, " ");
|
||||
}
|
||||
|
||||
|
||||
private boolean isHyphen(String unicodeCharacter) {
|
||||
|
||||
return Objects.equals(unicodeCharacter, "-") || //
|
||||
Objects.equals(unicodeCharacter, "~") || //
|
||||
Objects.equals(unicodeCharacter, "‐") || //
|
||||
Objects.equals(unicodeCharacter, "‒") || //
|
||||
Objects.equals(unicodeCharacter, "⁻") || //
|
||||
Objects.equals(unicodeCharacter, "−") || //
|
||||
Objects.equals(unicodeCharacter, "﹣") || //
|
||||
Objects.equals(unicodeCharacter, "゠") || //
|
||||
Objects.equals(unicodeCharacter, "⁓") || //
|
||||
Objects.equals(unicodeCharacter, "‑") || //
|
||||
Objects.equals(unicodeCharacter, "\u00AD");
|
||||
}
|
||||
|
||||
|
||||
private Rectangle2D mapRedTextPositionToInitialUserSpace(RedTextPosition textPosition, TextPositionSequence sequence) {
|
||||
|
||||
float textHeight = sequence.getTextHeight() + HEIGHT_PADDING;
|
||||
Rectangle2D rectangle2D = new Rectangle2D.Double(textPosition.getXDirAdj(),
|
||||
textPosition.getYDirAdj() - textHeight,
|
||||
textPosition.getWidthDirAdj(),
|
||||
textHeight + HEIGHT_PADDING);
|
||||
|
||||
AffineTransform transform = new AffineTransform();
|
||||
|
||||
if (sequence.getDir() == TextDirection.ZERO || sequence.getDir() == TextDirection.HALF_CIRCLE) {
|
||||
transform.rotate(sequence.getDir().getRadians(), sequence.getPageWidth() / 2f, sequence.getPageHeight() / 2f);
|
||||
transform.translate(0f, sequence.getPageHeight());
|
||||
} else if (sequence.getDir() == TextDirection.QUARTER_CIRCLE) {
|
||||
transform.rotate(sequence.getDir().getRadians(), sequence.getPageWidth() / 2f, sequence.getPageWidth() / 2f);
|
||||
transform.translate(0f, sequence.getPageWidth());
|
||||
} else {
|
||||
transform.rotate(sequence.getDir().getRadians(), sequence.getPageHeight() / 2f, sequence.getPageHeight() / 2f);
|
||||
transform.translate(0f, sequence.getPageWidth());
|
||||
}
|
||||
transform.scale(1., -1.);
|
||||
|
||||
return transform.createTransformedShape(rectangle2D).getBounds2D();
|
||||
}
|
||||
|
||||
|
||||
private class Context {
|
||||
|
||||
List<Integer> stringIdxToPositionIdx = new LinkedList<>();
|
||||
List<Integer> lineBreaksStringIdx = new LinkedList<>();
|
||||
StringBuilder stringBuilder = new StringBuilder();
|
||||
|
||||
int stringIdx;
|
||||
int positionIdx;
|
||||
|
||||
int lastHyphenIdx = -MAX_HYPHEN_LINEBREAK_DISTANCE;
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,183 @@
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.factory;
|
||||
|
||||
import static java.lang.String.format;
|
||||
import static java.util.Collections.emptyList;
|
||||
import static java.util.stream.Collectors.groupingBy;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.AbstractPageBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.image.ClassifiedImage;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table.TablePageBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPageBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.GenericSemanticNode;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Page;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Section;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.utils.TableMergingUtility;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class SectionNodeFactory {
|
||||
|
||||
public void addSection(GenericSemanticNode parentNode, List<AbstractPageBlock> pageBlocks, List<ClassifiedImage> images, DocumentGraphFactory.Context context) {
|
||||
|
||||
if (pageBlocks.isEmpty()) {
|
||||
return;
|
||||
}
|
||||
Map<Integer, List<AbstractPageBlock>> blocksPerPage = pageBlocks.stream().collect(groupingBy(AbstractPageBlock::getPage));
|
||||
Section section = Section.builder().documentTree(context.getDocumentTree()).build();
|
||||
|
||||
context.getSections().add(section);
|
||||
blocksPerPage.keySet().forEach(pageNumber -> addSectionNodeToPageNode(context, section, pageNumber));
|
||||
|
||||
section.setTreeId(getTreeId(parentNode, context, section));
|
||||
|
||||
addFirstHeadlineDirectlyToSection(pageBlocks, context, section);
|
||||
if (containsTablesAndTextBlocks(pageBlocks)) {
|
||||
splitPageBlocksIntoSubSections(pageBlocks).forEach(subSectionPageBlocks -> addSection(section, subSectionPageBlocks, emptyList(), context));
|
||||
} else {
|
||||
addTablesAndParagraphsAndHeadlinesToSection(pageBlocks, context, section);
|
||||
}
|
||||
|
||||
images.stream().distinct().forEach(image -> DocumentGraphFactory.addImage(section, image, context));
|
||||
}
|
||||
|
||||
|
||||
private List<Integer> getTreeId(GenericSemanticNode parentNode, DocumentGraphFactory.Context context, Section section) {
|
||||
|
||||
if (parentNode == null) {
|
||||
return context.getDocumentTree().createNewMainEntryAndReturnId(section);
|
||||
} else {
|
||||
return context.getDocumentTree().createNewChildEntryAndReturnId(parentNode, section);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void addFirstHeadlineDirectlyToSection(List<AbstractPageBlock> pageBlocks, DocumentGraphFactory.Context context, Section section) {
|
||||
|
||||
if (pageBlocks.get(0).isHeadline()) {
|
||||
addTablesAndParagraphsAndHeadlinesToSection(List.of(pageBlocks.get(0)), context, section);
|
||||
pageBlocks.remove(0);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void addTablesAndParagraphsAndHeadlinesToSection(List<AbstractPageBlock> pageBlocks, DocumentGraphFactory.Context context, Section section) {
|
||||
|
||||
Set<AbstractPageBlock> alreadyMerged = new HashSet<>();
|
||||
List<AbstractPageBlock> remainingBlocks = new LinkedList<>(pageBlocks);
|
||||
for (AbstractPageBlock abstractPageBlock : pageBlocks) {
|
||||
|
||||
if (alreadyMerged.contains(abstractPageBlock)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
remainingBlocks.removeAll(alreadyMerged);
|
||||
|
||||
if (abstractPageBlock instanceof TextPageBlock) {
|
||||
List<TextPageBlock> textBlocks = findTextBlocksWithSameClassificationAndAlignsY(abstractPageBlock, remainingBlocks);
|
||||
alreadyMerged.addAll(textBlocks);
|
||||
DocumentGraphFactory.addParagraphOrHeadline(section, (TextPageBlock) abstractPageBlock, context, textBlocks);
|
||||
} else if (abstractPageBlock instanceof TablePageBlock tablePageBlock) {
|
||||
List<TablePageBlock> tablesToMerge = TableMergingUtility.findConsecutiveTablesWithSameColCountAndSameHeaders(tablePageBlock, remainingBlocks);
|
||||
alreadyMerged.addAll(tablesToMerge);
|
||||
TableNodeFactory.addTable(section, tablesToMerge, context);
|
||||
} else {
|
||||
throw new RuntimeException(format("Unhandled AbstractPageBlockType %s!", abstractPageBlock.getClass()));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private boolean containsTablesAndTextBlocks(List<AbstractPageBlock> pageBlocks) {
|
||||
|
||||
return pageBlocks.stream().anyMatch(pageBlock -> pageBlock instanceof TablePageBlock) && pageBlocks.stream().anyMatch(pageBlock -> pageBlock instanceof TextPageBlock);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* This function splits the list of PageBlocks around TablePageBlocks, such that SubSections can be created, that don't include tables.
|
||||
* This is needed so we can execute rules on sections, that do not contain tables.
|
||||
* See: <a href="https://knecon.atlassian.net/wiki/spaces/RED/pages/14765218/Document+Structure">document structure wiki</a>
|
||||
*
|
||||
* @param pageBlocks a List of AbstractPageBlocks, which have at least one TablePageBlock and one ClassificationTextBlock
|
||||
* @return List of Lists of AbstractPageBlocks, which include either a single Headline ClassificationTextBlock and a TablePageBlock or only ClassificationTextBlocks.
|
||||
*/
|
||||
private List<List<AbstractPageBlock>> splitPageBlocksIntoSubSections(List<AbstractPageBlock> pageBlocks) {
|
||||
|
||||
List<List<AbstractPageBlock>> splitList = splitIntoCoherentList(pageBlocks);
|
||||
movePrecedingHeadlineToTableList(splitList);
|
||||
return splitList.stream().filter(list -> !list.isEmpty()).toList();
|
||||
}
|
||||
|
||||
|
||||
private void movePrecedingHeadlineToTableList(List<List<AbstractPageBlock>> splitList) {
|
||||
|
||||
for (int i = 0; i < splitList.size(); i++) {
|
||||
if (listIsTablesOnly(splitList.get(i)) && i > 0) {
|
||||
List<AbstractPageBlock> previousList = splitList.get(i - 1);
|
||||
AbstractPageBlock lastPageBlockInPreviousList = previousList.get(previousList.size() - 1);
|
||||
if (lastPageBlockInPreviousList.isHeadline()) {
|
||||
previousList.remove(i - 1);
|
||||
splitList.get(i).add(0, lastPageBlockInPreviousList);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private boolean listIsTablesOnly(List<AbstractPageBlock> abstractPageBlocks) {
|
||||
|
||||
return abstractPageBlocks.stream().allMatch(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @param pageBlocks a List of AbstractPageBlocks, which have at least one TablePageBlock and one ClassificationTextBlock
|
||||
* @return List of Lists of AbstractPageBlocks, which are exclusively of type ClassificationTextBlock or TablePageBlock
|
||||
*/
|
||||
private List<List<AbstractPageBlock>> splitIntoCoherentList(List<AbstractPageBlock> pageBlocks) {
|
||||
|
||||
List<List<AbstractPageBlock>> splitList = new LinkedList<>();
|
||||
List<AbstractPageBlock> currentList = new LinkedList<>();
|
||||
splitList.add(currentList);
|
||||
|
||||
Class<? extends AbstractPageBlock> lastPageBlockClass = pageBlocks.get(0).getClass();
|
||||
for (AbstractPageBlock pageBlock : pageBlocks) {
|
||||
if (lastPageBlockClass.isInstance(pageBlock)) {
|
||||
currentList.add(pageBlock);
|
||||
} else {
|
||||
currentList = new LinkedList<>();
|
||||
currentList.add(pageBlock);
|
||||
splitList.add(currentList);
|
||||
lastPageBlockClass = pageBlock.getClass();
|
||||
}
|
||||
}
|
||||
return splitList;
|
||||
}
|
||||
|
||||
|
||||
private List<TextPageBlock> findTextBlocksWithSameClassificationAndAlignsY(AbstractPageBlock atc, List<AbstractPageBlock> pageBlocks) {
|
||||
|
||||
return pageBlocks.stream()
|
||||
.filter(abstractTextContainer -> !abstractTextContainer.equals(atc))
|
||||
.filter(abstractTextContainer -> abstractTextContainer.getPage() == atc.getPage())
|
||||
.filter(abstractTextContainer -> abstractTextContainer instanceof TextPageBlock)
|
||||
.filter(abstractTextContainer -> abstractTextContainer.intersectsY(atc))
|
||||
.map(abstractTextContainer -> (TextPageBlock) abstractTextContainer)
|
||||
.toList();
|
||||
}
|
||||
|
||||
|
||||
private void addSectionNodeToPageNode(DocumentGraphFactory.Context context, Section section, Integer pageNumber) {
|
||||
|
||||
Page page = context.getPage(pageNumber);
|
||||
page.getMainBody().add(section);
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,136 @@
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.factory;
|
||||
|
||||
import static java.util.Collections.emptyList;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.AbstractPageBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table.Cell;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table.TablePageBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPositionSequence;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.GenericSemanticNode;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Page;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.SemanticNode;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Table;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.TableCell;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.textblock.TextBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.utils.TextPositionOperations;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class TableNodeFactory {
|
||||
|
||||
public final double TABLE_CELL_MERGE_CONTENTS_SIZE_THRESHOLD = 0.05;
|
||||
|
||||
|
||||
public void addTable(GenericSemanticNode parentNode, List<TablePageBlock> tablesToMerge, DocumentGraphFactory.Context context) {
|
||||
|
||||
setPageNumberInCells(tablesToMerge);
|
||||
Set<Page> pages = tablesToMerge.stream().map(AbstractPageBlock::getPage).map(context::getPage).collect(Collectors.toSet());
|
||||
List<List<Cell>> mergedRows = tablesToMerge.stream().map(TablePageBlock::getRows).flatMap(Collection::stream).toList();
|
||||
Table table = Table.builder().documentTree(context.getDocumentTree()).numberOfCols(mergedRows.get(0).size()).numberOfRows(mergedRows.size()).build();
|
||||
|
||||
pages.forEach(page -> addTableToPage(page, parentNode, table));
|
||||
|
||||
List<Integer> treeId = context.getDocumentTree().createNewChildEntryAndReturnId(parentNode, table);
|
||||
table.setTreeId(treeId);
|
||||
addTableCells(mergedRows, table, context);
|
||||
|
||||
ifTableHasNoHeadersSetFirstRowAsHeaders(table);
|
||||
}
|
||||
|
||||
|
||||
private void setPageNumberInCells(List<TablePageBlock> tablesToMerge) {
|
||||
|
||||
// For some reason I can't figure out, in some table cells, the ClassificationTextBlocks have 0 as page number
|
||||
// So I am fixing this here, but this should actually be fixed upstream.
|
||||
tablesToMerge.forEach(table -> table.getRows()
|
||||
.stream()
|
||||
.flatMap(Collection::stream)
|
||||
.peek(cell -> cell.setPageNumber(table.getPage()))
|
||||
.forEach(cell -> setPageNumberInTextBlocksWithPageNumberSetTo0(table, cell)));
|
||||
}
|
||||
|
||||
|
||||
private void setPageNumberInTextBlocksWithPageNumberSetTo0(TablePageBlock table, Cell cell) {
|
||||
|
||||
cell.getTextBlocks().stream()//
|
||||
.filter(tb -> tb.getPage() == 0)//
|
||||
.forEach(tb -> tb.setPage(table.getPage()));
|
||||
}
|
||||
|
||||
|
||||
@SuppressWarnings("PMD.UnusedPrivateMethod") // PMD actually flags this wrong
|
||||
private void addTableToPage(Page page, SemanticNode parentNode, Table table) {
|
||||
|
||||
if (!page.getMainBody().contains(parentNode)) {
|
||||
parentNode.getPages().add(page);
|
||||
}
|
||||
|
||||
page.getMainBody().add(table);
|
||||
}
|
||||
|
||||
|
||||
private void ifTableHasNoHeadersSetFirstRowAsHeaders(Table table) {
|
||||
|
||||
if (table.streamHeaders().findAny().isEmpty()) {
|
||||
table.streamRow(0).forEach(tableCellNode -> tableCellNode.setHeader(true));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void addTableCells(List<List<Cell>> rows, Table table, DocumentGraphFactory.Context context) {
|
||||
|
||||
for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) {
|
||||
for (int colIndex = 0; colIndex < rows.get(rowIndex).size(); colIndex++) {
|
||||
addTableCell(rows.get(rowIndex).get(colIndex), rowIndex, colIndex, table, context);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SuppressWarnings("PMD.UnusedPrivateMethod") // PMD actually flags this wrong
|
||||
private void addTableCell(Cell cell, int rowIndex, int colIndex, Table tableNode, DocumentGraphFactory.Context context) {
|
||||
|
||||
Page page = context.getPage(cell.getPageNumber());
|
||||
|
||||
TableCell tableCell = TableCell.builder().documentTree(context.getDocumentTree()).row(rowIndex).col(colIndex).header(cell.isHeaderCell()).bBox(cell.getBounds2D()).build();
|
||||
page.getMainBody().add(tableCell);
|
||||
|
||||
List<Integer> treeId = context.getDocumentTree().createNewTableChildEntryAndReturnId(tableNode, tableCell);
|
||||
tableCell.setTreeId(treeId);
|
||||
|
||||
TextBlock textBlock;
|
||||
if (cell.getTextBlocks().isEmpty()) {
|
||||
tableCell.setLeafTextBlock(context.getTextBlockFactory().emptyTextBlock(tableNode, context, page));
|
||||
} else if (cell.getTextBlocks().size() == 1) {
|
||||
textBlock = context.getTextBlockFactory().buildAtomicTextBlock(cell.getTextBlocks().get(0).getSequences(), tableCell, context, page);
|
||||
tableCell.setLeafTextBlock(textBlock);
|
||||
} else if (firstTextBlockIsHeadline(cell)) {
|
||||
SectionNodeFactory.addSection(tableCell, cell.getTextBlocks().stream().map(tb -> (AbstractPageBlock) tb).toList(), emptyList(), context);
|
||||
} else if (cellAreaIsSmallerThanPageAreaTimesThreshold(cell, page)) {
|
||||
List<TextPositionSequence> sequences = TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(cell.getTextBlocks());
|
||||
textBlock = context.getTextBlockFactory().buildAtomicTextBlock(sequences, tableCell, context, page);
|
||||
tableCell.setLeafTextBlock(textBlock);
|
||||
} else {
|
||||
cell.getTextBlocks().forEach(tb -> DocumentGraphFactory.addParagraphOrHeadline(tableCell, tb, context, emptyList()));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private boolean cellAreaIsSmallerThanPageAreaTimesThreshold(Cell cell, Page page) {
|
||||
|
||||
return cell.getArea() < TABLE_CELL_MERGE_CONTENTS_SIZE_THRESHOLD * page.getHeight() * page.getWidth();
|
||||
}
|
||||
|
||||
|
||||
private boolean firstTextBlockIsHeadline(Cell cell) {
|
||||
|
||||
return cell.getTextBlocks().get(0).isHeadline();
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,53 @@
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.factory;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPositionSequence;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Page;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.SemanticNode;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.textblock.AtomicTextBlock;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class TextBlockFactory {
|
||||
|
||||
int stringOffset;
|
||||
long textBlockIdx;
|
||||
|
||||
|
||||
public AtomicTextBlock buildAtomicTextBlock(List<TextPositionSequence> sequences, SemanticNode parent, DocumentGraphFactory.Context context, Page page) {
|
||||
|
||||
Integer numberOnPage = context.getAndIncrementTextBlockNumberOnPage(page);
|
||||
return buildAtomicTextBlock(sequences, parent, numberOnPage, page);
|
||||
}
|
||||
|
||||
|
||||
public AtomicTextBlock buildAtomicTextBlock(List<TextPositionSequence> sequences, SemanticNode parent, Integer numberOnPage, Page page) {
|
||||
|
||||
SearchTextWithTextPositionDto searchTextWithTextPositionDto = SearchTextWithTextPositionFactory.buildSearchTextToTextPositionModel(sequences);
|
||||
int offset = stringOffset;
|
||||
stringOffset += searchTextWithTextPositionDto.getSearchText().length();
|
||||
long idx = textBlockIdx;
|
||||
textBlockIdx++;
|
||||
return AtomicTextBlock.fromSearchTextWithTextPositionDto(searchTextWithTextPositionDto, parent, offset, idx, numberOnPage, page);
|
||||
}
|
||||
|
||||
|
||||
public AtomicTextBlock emptyTextBlock(SemanticNode parent, DocumentGraphFactory.Context context, Page page) {
|
||||
|
||||
long idx = textBlockIdx;
|
||||
textBlockIdx++;
|
||||
return AtomicTextBlock.empty(idx, stringOffset, page, context.getAndIncrementTextBlockNumberOnPage(page), parent);
|
||||
}
|
||||
|
||||
|
||||
public AtomicTextBlock emptyTextBlock(SemanticNode parent, Integer numberOnPage, Page page) {
|
||||
|
||||
long idx = textBlockIdx;
|
||||
textBlockIdx++;
|
||||
return AtomicTextBlock.empty(idx, stringOffset, page, numberOnPage, parent);
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,163 @@
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph;
|
||||
|
||||
import static java.lang.String.format;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.textblock.TextBlock;
|
||||
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.Setter;
|
||||
|
||||
@Setter
|
||||
@EqualsAndHashCode
|
||||
public class Boundary implements Comparable<Boundary> {
|
||||
|
||||
private int start;
|
||||
private int end;
|
||||
|
||||
|
||||
public Boundary(int start, int end) {
|
||||
|
||||
if (start > end) {
|
||||
throw new IllegalArgumentException(format("start: %d > end: %d", start, end));
|
||||
}
|
||||
this.start = start;
|
||||
this.end = end;
|
||||
}
|
||||
|
||||
|
||||
public int length() {
|
||||
|
||||
return end - start;
|
||||
}
|
||||
|
||||
|
||||
public int start() {
|
||||
|
||||
return start;
|
||||
}
|
||||
|
||||
|
||||
public int end() {
|
||||
|
||||
return end;
|
||||
}
|
||||
|
||||
|
||||
public boolean contains(Boundary boundary) {
|
||||
|
||||
return start <= boundary.start() && boundary.end() <= end;
|
||||
}
|
||||
|
||||
|
||||
public boolean containedBy(Boundary boundary) {
|
||||
|
||||
return boundary.contains(this);
|
||||
}
|
||||
|
||||
|
||||
public boolean contains(int start, int end) {
|
||||
|
||||
if (start > end) {
|
||||
throw new IllegalArgumentException(format("start: %d > end: %d", start, end));
|
||||
}
|
||||
return this.start <= start && end <= this.end;
|
||||
}
|
||||
|
||||
|
||||
public boolean containedBy(int start, int end) {
|
||||
|
||||
if (start > end) {
|
||||
throw new IllegalArgumentException(format("start: %d > end: %d", start, end));
|
||||
}
|
||||
return start <= this.start && this.end <= end;
|
||||
}
|
||||
|
||||
|
||||
public boolean contains(int index) {
|
||||
|
||||
return start <= index && index < end;
|
||||
}
|
||||
|
||||
|
||||
public boolean intersects(Boundary boundary) {
|
||||
|
||||
return boundary.start() < this.end && this.start < boundary.end();
|
||||
}
|
||||
|
||||
|
||||
public List<Boundary> split(List<Integer> splitIndices) {
|
||||
|
||||
if (splitIndices.stream().anyMatch(idx -> !this.contains(idx))) {
|
||||
throw new IndexOutOfBoundsException(format("%s splitting indices are out of range for %s", splitIndices.stream().filter(idx -> !this.contains(idx)).toList(), this));
|
||||
}
|
||||
List<Boundary> splitBoundaries = new LinkedList<>();
|
||||
int previousIndex = start;
|
||||
for (int splitIndex : splitIndices) {
|
||||
|
||||
// skip split if it would produce a boundary of length 0
|
||||
if (splitIndex == previousIndex) {
|
||||
continue;
|
||||
}
|
||||
splitBoundaries.add(new Boundary(previousIndex, splitIndex));
|
||||
previousIndex = splitIndex;
|
||||
}
|
||||
splitBoundaries.add(new Boundary(previousIndex, end));
|
||||
return splitBoundaries;
|
||||
}
|
||||
|
||||
|
||||
public static Boundary merge(Collection<Boundary> boundaries) {
|
||||
|
||||
int minStart = boundaries.stream().mapToInt(Boundary::start).min().orElseThrow(IllegalArgumentException::new);
|
||||
int maxEnd = boundaries.stream().mapToInt(Boundary::end).max().orElseThrow(IllegalArgumentException::new);
|
||||
return new Boundary(minStart, maxEnd);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return format("Boundary [%d|%d)", start, end);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public int compareTo(Boundary boundary) {
|
||||
|
||||
if (end < boundary.end() && start < boundary.start()) {
|
||||
return -1;
|
||||
}
|
||||
if (start > boundary.start() && end > boundary.end()) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* shrinks the boundary, such that textBlock.subSequence(boundary) returns a string without whitespaces.
|
||||
*
|
||||
* @param textBlock TextBlock to check whitespaces against
|
||||
* @return boundary
|
||||
*/
|
||||
public Boundary trim(TextBlock textBlock) {
|
||||
|
||||
int trimmedStart = this.start;
|
||||
while (Character.isWhitespace(textBlock.charAt(trimmedStart))) {
|
||||
trimmedStart++;
|
||||
}
|
||||
|
||||
int trimmedEnd = this.end;
|
||||
while (Character.isWhitespace(textBlock.charAt(trimmedEnd - 1))) {
|
||||
trimmedEnd--;
|
||||
}
|
||||
|
||||
return new Boundary(trimmedStart, Math.max(trimmedEnd, trimmedStart));
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,217 @@
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph;
|
||||
|
||||
import static java.lang.String.format;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Document;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.GenericSemanticNode;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.NodeType;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.SemanticNode;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Table;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.TableCell;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.textblock.TextBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.textblock.TextBlockCollector;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.Getter;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Data
|
||||
@EqualsAndHashCode
|
||||
public class DocumentTree {
|
||||
|
||||
private final Entry root;
|
||||
|
||||
|
||||
public DocumentTree(Document document) {
|
||||
|
||||
root = Entry.builder().treeId(Collections.emptyList()).children(new LinkedList<>()).node(document).build();
|
||||
}
|
||||
|
||||
|
||||
public TextBlock buildTextBlock() {
|
||||
|
||||
return allEntriesInOrder().map(Entry::getNode).filter(SemanticNode::isLeaf).map(SemanticNode::getLeafTextBlock).collect(new TextBlockCollector());
|
||||
}
|
||||
|
||||
|
||||
public List<Integer> createNewMainEntryAndReturnId(GenericSemanticNode node) {
|
||||
|
||||
return createNewChildEntryAndReturnIdImpl(Collections.emptyList(), node);
|
||||
}
|
||||
|
||||
|
||||
public List<Integer> createNewChildEntryAndReturnId(GenericSemanticNode parentNode, GenericSemanticNode node) {
|
||||
|
||||
return createNewChildEntryAndReturnIdImpl(parentNode.getTreeId(), node);
|
||||
}
|
||||
|
||||
|
||||
public List<Integer> createNewChildEntryAndReturnId(GenericSemanticNode parentNode, Table node) {
|
||||
|
||||
return createNewChildEntryAndReturnIdImpl(parentNode.getTreeId(), node);
|
||||
}
|
||||
|
||||
|
||||
public List<Integer> createNewTableChildEntryAndReturnId(Table parentTable, TableCell tableCell) {
|
||||
|
||||
return createNewChildEntryAndReturnIdImpl(parentTable.getTreeId(), tableCell);
|
||||
}
|
||||
|
||||
|
||||
@SuppressWarnings("PMD.UnusedPrivateMethod") // PMD actually flags this wrong
|
||||
private List<Integer> createNewChildEntryAndReturnIdImpl(List<Integer> parentId, SemanticNode node) {
|
||||
|
||||
if (!entryExists(parentId)) {
|
||||
throw new IllegalArgumentException(format("parentId %s does not exist!", parentId));
|
||||
}
|
||||
|
||||
Entry parent = getEntryById(parentId);
|
||||
List<Integer> newId = new LinkedList<>(parentId);
|
||||
newId.add(parent.children.size());
|
||||
parent.children.add(Entry.builder().treeId(newId).node(node).build());
|
||||
|
||||
return newId;
|
||||
}
|
||||
|
||||
|
||||
private boolean entryExists(List<Integer> treeId) {
|
||||
|
||||
if (treeId.isEmpty()) {
|
||||
return root != null;
|
||||
}
|
||||
Entry entry = root.children.get(treeId.get(0));
|
||||
for (int id : treeId.subList(1, treeId.size())) {
|
||||
if (id >= entry.children.size() || 0 > id) {
|
||||
return false;
|
||||
}
|
||||
entry = entry.children.get(id);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
public Entry getParentEntryById(List<Integer> treeId) {
|
||||
|
||||
return getEntryById(getParentId(treeId));
|
||||
}
|
||||
|
||||
|
||||
public boolean hasParentById(List<Integer> treeId) {
|
||||
|
||||
return !treeId.isEmpty();
|
||||
}
|
||||
|
||||
|
||||
public Stream<SemanticNode> childNodes(List<Integer> treeId) {
|
||||
|
||||
return getEntryById(treeId).children.stream().map(Entry::getNode);
|
||||
}
|
||||
|
||||
|
||||
public Stream<SemanticNode> childNodesOfType(List<Integer> treeId, NodeType nodeType) {
|
||||
|
||||
return getEntryById(treeId).children.stream().filter(entry -> entry.node.getType().equals(nodeType)).map(Entry::getNode);
|
||||
}
|
||||
|
||||
|
||||
private static List<Integer> getParentId(List<Integer> treeId) {
|
||||
|
||||
if (treeId.isEmpty()) {
|
||||
throw new UnsupportedOperationException("Root has no parent!");
|
||||
}
|
||||
if (treeId.size() < 2) {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
return treeId.subList(0, treeId.size() - 1);
|
||||
}
|
||||
|
||||
|
||||
public Entry getEntryById(List<Integer> treeId) {
|
||||
|
||||
if (treeId.isEmpty()) {
|
||||
return root;
|
||||
}
|
||||
Entry entry = root.children.get(treeId.get(0));
|
||||
for (int id : treeId.subList(1, treeId.size())) {
|
||||
entry = entry.children.get(id);
|
||||
}
|
||||
return entry;
|
||||
}
|
||||
|
||||
|
||||
public Stream<Entry> mainEntries() {
|
||||
|
||||
return root.children.stream();
|
||||
}
|
||||
|
||||
|
||||
public Stream<Entry> allEntriesInOrder() {
|
||||
|
||||
return Stream.of(root).flatMap(DocumentTree::flatten);
|
||||
}
|
||||
|
||||
|
||||
public Stream<Entry> allSubEntriesInOrder(List<Integer> parentId) {
|
||||
|
||||
return getEntryById(parentId).children.stream().flatMap(DocumentTree::flatten);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return String.join("\n", allEntriesInOrder().map(Entry::toString).toList());
|
||||
}
|
||||
|
||||
|
||||
private static Stream<Entry> flatten(Entry entry) {
|
||||
|
||||
return Stream.concat(Stream.of(entry), entry.children.stream().flatMap(DocumentTree::flatten));
|
||||
}
|
||||
|
||||
|
||||
public SemanticNode getHighestParentById(List<Integer> treeId) {
|
||||
|
||||
if (treeId.isEmpty()) {
|
||||
return root.node;
|
||||
}
|
||||
return root.children.get(treeId.get(0)).node;
|
||||
}
|
||||
|
||||
|
||||
@Builder
|
||||
@Getter
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE, makeFinal = true)
|
||||
public static class Entry {
|
||||
|
||||
List<Integer> treeId;
|
||||
SemanticNode node;
|
||||
@Builder.Default
|
||||
List<Entry> children = new LinkedList<>();
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return node.toString();
|
||||
}
|
||||
|
||||
|
||||
public NodeType getType() {
|
||||
|
||||
return node.getType();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,4 +1,4 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.model;
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.entity;
|
||||
|
||||
public enum EntityType {
|
||||
ENTITY,
|
||||
@ -0,0 +1,229 @@
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.entity;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Collection;
|
||||
import java.util.Comparator;
|
||||
import java.util.Deque;
|
||||
import java.util.HashSet;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Engine;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.Boundary;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Page;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.SemanticNode;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.IdBuilder;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
@EqualsAndHashCode(onlyExplicitlyIncluded = true)
|
||||
public class RedactionEntity {
|
||||
|
||||
// initial values
|
||||
@EqualsAndHashCode.Include
|
||||
final Boundary boundary;
|
||||
@EqualsAndHashCode.Include
|
||||
final String type;
|
||||
@EqualsAndHashCode.Include
|
||||
final EntityType entityType;
|
||||
|
||||
// empty defaults
|
||||
boolean redaction;
|
||||
boolean removed;
|
||||
boolean ignored;
|
||||
boolean resized;
|
||||
boolean skipRemoveEntitiesContainedInLarger;
|
||||
boolean dictionaryEntry;
|
||||
boolean dossierDictionaryEntry;
|
||||
Set<Engine> engines;
|
||||
Set<RedactionEntity> references;
|
||||
@Builder.Default
|
||||
Deque<Integer> matchedRules = new LinkedList<>();
|
||||
String redactionReason;
|
||||
String legalBasis;
|
||||
|
||||
// inferred on graph insertion
|
||||
@EqualsAndHashCode.Include
|
||||
String value;
|
||||
String textBefore;
|
||||
String textAfter;
|
||||
@Builder.Default
|
||||
Set<Page> pages = new HashSet<>();
|
||||
List<RedactionPosition> redactionPositionsPerPage;
|
||||
@Builder.Default
|
||||
List<SemanticNode> intersectingNodes = new LinkedList<>();
|
||||
SemanticNode deepestFullyContainingNode;
|
||||
|
||||
|
||||
public static RedactionEntity initialEntityNode(Boundary boundary, String type, EntityType entityType) {
|
||||
|
||||
return RedactionEntity.builder().type(type).entityType(entityType).boundary(boundary).engines(new HashSet<>()).references(new HashSet<>()).build();
|
||||
}
|
||||
|
||||
|
||||
public boolean occursInNodeOfType(Class<? extends SemanticNode> clazz) {
|
||||
|
||||
return intersectingNodes.stream().anyMatch(clazz::isInstance);
|
||||
}
|
||||
|
||||
|
||||
public boolean occursInNode(SemanticNode semanticNode) {
|
||||
|
||||
return intersectingNodes.stream().anyMatch(node -> node.equals(semanticNode));
|
||||
}
|
||||
|
||||
|
||||
public boolean isType(String type) {
|
||||
|
||||
return this.type.equals(type);
|
||||
}
|
||||
|
||||
|
||||
public boolean isAnyType(List<String> types) {
|
||||
|
||||
return types.contains(type);
|
||||
}
|
||||
|
||||
|
||||
public void addIntersectingNode(SemanticNode containingNode) {
|
||||
|
||||
intersectingNodes.add(containingNode);
|
||||
}
|
||||
|
||||
|
||||
public void removeFromGraph() {
|
||||
|
||||
intersectingNodes.forEach(node -> node.getEntities().remove(this));
|
||||
pages.forEach(page -> page.getEntities().remove(this));
|
||||
intersectingNodes = new LinkedList<>();
|
||||
deepestFullyContainingNode = null;
|
||||
pages = new HashSet<>();
|
||||
removed = true;
|
||||
ignored = true;
|
||||
}
|
||||
|
||||
|
||||
public void addMatchedRule(int ruleNumber) {
|
||||
|
||||
matchedRules.add(ruleNumber);
|
||||
}
|
||||
|
||||
|
||||
public int getMatchedRule() {
|
||||
|
||||
if (matchedRules.isEmpty()) {
|
||||
return 0;
|
||||
}
|
||||
return matchedRules.getLast();
|
||||
}
|
||||
|
||||
|
||||
public List<RedactionPosition> getRedactionPositionsPerPage() {
|
||||
|
||||
if (redactionPositionsPerPage == null || redactionPositionsPerPage.isEmpty()) {
|
||||
Map<Page, List<Rectangle2D>> rectanglesPerLinePerPage = deepestFullyContainingNode.getTextBlock().getPositionsPerPage(boundary);
|
||||
|
||||
Page firstPage = rectanglesPerLinePerPage.keySet()
|
||||
.stream()
|
||||
.min(Comparator.comparingInt(Page::getNumber))
|
||||
.orElseThrow(() -> new RuntimeException("No Positions found on any page!"));
|
||||
|
||||
String id = IdBuilder.buildId(pages, rectanglesPerLinePerPage.values().stream().flatMap(Collection::stream).toList());
|
||||
redactionPositionsPerPage = rectanglesPerLinePerPage.entrySet().stream().map(entry -> buildRedactionPosition(firstPage, id, entry)).toList();
|
||||
}
|
||||
return redactionPositionsPerPage;
|
||||
}
|
||||
|
||||
|
||||
private static RedactionPosition buildRedactionPosition(Page firstPage, String id, Map.Entry<Page, List<Rectangle2D>> entry) {
|
||||
|
||||
if (entry.getKey().equals(firstPage)) {
|
||||
return new RedactionPosition(id, entry.getKey(), entry.getValue());
|
||||
} else {
|
||||
return new RedactionPosition(id + "-" + entry.getKey().getNumber(), entry.getKey(), entry.getValue());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public boolean containedBy(RedactionEntity redactionEntity) {
|
||||
|
||||
return this.boundary.containedBy(redactionEntity.getBoundary());
|
||||
}
|
||||
|
||||
|
||||
public boolean contains(RedactionEntity redactionEntity) {
|
||||
|
||||
return this.boundary.contains(redactionEntity.getBoundary());
|
||||
}
|
||||
|
||||
|
||||
public boolean intersects(RedactionEntity redactionEntity) {
|
||||
|
||||
return this.boundary.intersects(redactionEntity.getBoundary());
|
||||
}
|
||||
|
||||
|
||||
public void addEngine(Engine engine) {
|
||||
|
||||
engines.add(engine);
|
||||
}
|
||||
|
||||
|
||||
public void addEngines(Set<Engine> engines) {
|
||||
|
||||
this.engines.addAll(engines);
|
||||
}
|
||||
|
||||
|
||||
public void addReference(RedactionEntity reference) {
|
||||
|
||||
references.add(reference);
|
||||
}
|
||||
|
||||
|
||||
public void addReferences(List<RedactionEntity> references) {
|
||||
|
||||
this.references.addAll(references);
|
||||
}
|
||||
|
||||
|
||||
public boolean matchesAnnotationId(String manualRedactionId) {
|
||||
|
||||
return getRedactionPositionsPerPage().stream().anyMatch(entityPosition -> entityPosition.getId().equals(manualRedactionId));
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
sb.append("Entity[\"");
|
||||
sb.append(value);
|
||||
sb.append("\", ");
|
||||
sb.append(boundary);
|
||||
sb.append(", pages[");
|
||||
pages.forEach(page -> {
|
||||
sb.append(page.getNumber());
|
||||
sb.append(", ");
|
||||
});
|
||||
sb.delete(sb.length() - 2, sb.length());
|
||||
sb.append("], type = \"");
|
||||
sb.append(type);
|
||||
sb.append("\", EntityType.");
|
||||
sb.append(entityType);
|
||||
sb.append("]");
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,24 @@
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.entity;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.List;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Page;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Data
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class RedactionPosition {
|
||||
|
||||
final String id;
|
||||
Page page;
|
||||
// Each entry in this list corresponds to an entry in the redaction log, this means:
|
||||
// An entity might be represented by multiple redaction log entries
|
||||
List<Rectangle2D> rectanglePerLine;
|
||||
|
||||
}
|
||||
@ -0,0 +1,119 @@
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.exception.NotFoundException;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.DocumentTree;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.entity.RedactionEntity;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.textblock.TextBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.textblock.TextBlockCollector;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@NoArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class Document implements GenericSemanticNode {
|
||||
|
||||
Set<Page> pages;
|
||||
DocumentTree documentTree;
|
||||
Integer numberOfPages;
|
||||
TextBlock textBlock;
|
||||
@Builder.Default
|
||||
Set<RedactionEntity> entities = new HashSet<>();
|
||||
|
||||
|
||||
@Override
|
||||
public NodeType getType() {
|
||||
|
||||
return NodeType.DOCUMENT;
|
||||
}
|
||||
|
||||
|
||||
public TextBlock getTextBlock() {
|
||||
|
||||
if (textBlock == null) {
|
||||
textBlock = streamTerminalTextBlocksInOrder().collect(new TextBlockCollector());
|
||||
}
|
||||
return textBlock;
|
||||
}
|
||||
|
||||
|
||||
public List<Section> getMainSections() {
|
||||
|
||||
return streamChildrenOfType(NodeType.SECTION).map(node -> (Section) node).collect(Collectors.toList());
|
||||
}
|
||||
|
||||
|
||||
public Stream<TextBlock> streamTerminalTextBlocksInOrder() {
|
||||
|
||||
return streamAllNodes().filter(SemanticNode::isLeaf).map(SemanticNode::getLeafTextBlock);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public List<Integer> getTreeId() {
|
||||
|
||||
return Collections.emptyList();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void setTreeId(List<Integer> tocId) {
|
||||
|
||||
throw new UnsupportedOperationException("Document is always the root of the TablePageBlock of Contents");
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Headline getHeadline() {
|
||||
|
||||
return streamAllSubNodesOfType(NodeType.HEADLINE).map(node -> (Headline) node).findFirst().orElseThrow(() -> new NotFoundException("No Headlines found in this document!"));
|
||||
}
|
||||
|
||||
|
||||
private Stream<SemanticNode> streamAllNodes() {
|
||||
|
||||
return documentTree.allEntriesInOrder().map(DocumentTree.Entry::getNode);
|
||||
}
|
||||
|
||||
|
||||
public Stream<Image> streamAllImages() {
|
||||
|
||||
return streamAllSubNodesOfType(NodeType.IMAGE).map(node -> (Image) node);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return NodeType.DOCUMENT + ": " + this.getTextBlock().buildSummary();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Map<Page, Rectangle2D> getBBox() {
|
||||
|
||||
Map<Page, Rectangle2D> bBox = new HashMap<>();
|
||||
for (Page page : pages) {
|
||||
bBox.put(page, new Rectangle2D.Double(0, 0, page.getWidth(), page.getHeight()));
|
||||
}
|
||||
return bBox;
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,64 @@
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.DocumentTree;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.entity.RedactionEntity;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.textblock.TextBlock;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@NoArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class Footer implements GenericSemanticNode {
|
||||
|
||||
List<Integer> treeId;
|
||||
TextBlock leafTextBlock;
|
||||
|
||||
@EqualsAndHashCode.Exclude
|
||||
DocumentTree documentTree;
|
||||
|
||||
@Builder.Default
|
||||
@EqualsAndHashCode.Exclude
|
||||
Set<RedactionEntity> entities = new HashSet<>();
|
||||
|
||||
|
||||
@Override
|
||||
public NodeType getType() {
|
||||
|
||||
return NodeType.FOOTER;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public boolean isLeaf() {
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public TextBlock getTextBlock() {
|
||||
|
||||
return leafTextBlock;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return treeId + ": " + NodeType.FOOTER + ": " + leafTextBlock.buildSummary();
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,5 @@
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes;
|
||||
|
||||
public interface GenericSemanticNode extends SemanticNode {
|
||||
|
||||
}
|
||||
@ -0,0 +1,64 @@
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.DocumentTree;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.entity.RedactionEntity;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.textblock.TextBlock;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@NoArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class Header implements GenericSemanticNode {
|
||||
|
||||
List<Integer> treeId;
|
||||
TextBlock leafTextBlock;
|
||||
|
||||
@EqualsAndHashCode.Exclude
|
||||
DocumentTree documentTree;
|
||||
|
||||
@Builder.Default
|
||||
@EqualsAndHashCode.Exclude
|
||||
Set<RedactionEntity> entities = new HashSet<>();
|
||||
|
||||
|
||||
@Override
|
||||
public boolean isLeaf() {
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public NodeType getType() {
|
||||
|
||||
return NodeType.HEADER;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public TextBlock getTextBlock() {
|
||||
|
||||
return leafTextBlock;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return treeId + ": " + NodeType.HEADER + ": " + leafTextBlock.buildSummary();
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,71 @@
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.DocumentTree;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.entity.RedactionEntity;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.textblock.TextBlock;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@NoArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class Headline implements GenericSemanticNode {
|
||||
|
||||
List<Integer> treeId;
|
||||
TextBlock leafTextBlock;
|
||||
|
||||
@EqualsAndHashCode.Exclude
|
||||
DocumentTree documentTree;
|
||||
|
||||
@Builder.Default
|
||||
@EqualsAndHashCode.Exclude
|
||||
Set<RedactionEntity> entities = new HashSet<>();
|
||||
|
||||
|
||||
@Override
|
||||
public NodeType getType() {
|
||||
|
||||
return NodeType.HEADLINE;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public boolean isLeaf() {
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public TextBlock getTextBlock() {
|
||||
|
||||
return leafTextBlock;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return treeId + ": " + NodeType.HEADLINE + ": " + leafTextBlock.buildSummary();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Headline getHeadline() {
|
||||
|
||||
return this;
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,94 @@
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.DocumentTree;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.entity.RedactionEntity;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.textblock.TextBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.textblock.TextBlockCollector;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@NoArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class Image implements GenericSemanticNode {
|
||||
|
||||
List<Integer> treeId;
|
||||
String id;
|
||||
|
||||
ImageType imageType;
|
||||
boolean transparent;
|
||||
Rectangle2D position;
|
||||
|
||||
boolean redaction;
|
||||
boolean ignored;
|
||||
@Builder.Default
|
||||
String redactionReason = "";
|
||||
@Builder.Default
|
||||
String legalBasis = "";
|
||||
@Builder.Default
|
||||
int matchedRule = -1;
|
||||
|
||||
@EqualsAndHashCode.Exclude
|
||||
Page page;
|
||||
|
||||
@EqualsAndHashCode.Exclude
|
||||
DocumentTree documentTree;
|
||||
|
||||
@Builder.Default
|
||||
@EqualsAndHashCode.Exclude
|
||||
Set<RedactionEntity> entities = new HashSet<>();
|
||||
|
||||
|
||||
@Override
|
||||
public NodeType getType() {
|
||||
|
||||
return NodeType.IMAGE;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public TextBlock getTextBlock() {
|
||||
|
||||
return streamAllSubNodes().filter(SemanticNode::isLeaf).map(SemanticNode::getLeafTextBlock).collect(new TextBlockCollector());
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Set<Page> getPages() {
|
||||
|
||||
return Collections.singleton(page);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return treeId + ": " + NodeType.IMAGE + ": " + imageType.toString() + " " + position;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Map<Page, Rectangle2D> getBBox() {
|
||||
|
||||
Map<Page, Rectangle2D> bBoxPerPage = new HashMap<>();
|
||||
bBoxPerPage.put(page, position);
|
||||
return bBoxPerPage;
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,21 @@
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes;
|
||||
|
||||
public enum ImageType {
|
||||
LOGO,
|
||||
FORMULA,
|
||||
SIGNATURE,
|
||||
OTHER,
|
||||
OCR;
|
||||
|
||||
|
||||
public static ImageType fromString(String imageType) {
|
||||
|
||||
return switch (imageType.toLowerCase()) {
|
||||
case "logo" -> ImageType.LOGO;
|
||||
case "formula" -> ImageType.FORMULA;
|
||||
case "signature" -> ImageType.SIGNATURE;
|
||||
case "ocr" -> ImageType.OCR;
|
||||
default -> ImageType.OTHER;
|
||||
};
|
||||
}
|
||||
}
|
||||
@ -0,0 +1,21 @@
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes;
|
||||
|
||||
import java.util.Locale;
|
||||
|
||||
public enum NodeType {
|
||||
DOCUMENT,
|
||||
SECTION,
|
||||
HEADLINE,
|
||||
PARAGRAPH,
|
||||
TABLE,
|
||||
TABLE_CELL,
|
||||
IMAGE,
|
||||
HEADER,
|
||||
FOOTER;
|
||||
|
||||
|
||||
public String toString() {
|
||||
|
||||
return this.name().charAt(0) + this.name().substring(1).toLowerCase(Locale.ROOT);
|
||||
}
|
||||
}
|
||||
@ -0,0 +1,87 @@
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationPage;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.entity.RedactionEntity;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.textblock.TextBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.textblock.TextBlockCollector;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.Getter;
|
||||
import lombok.Setter;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Getter
|
||||
@Setter
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class Page {
|
||||
|
||||
Integer number;
|
||||
Integer height;
|
||||
Integer width;
|
||||
Integer rotation;
|
||||
|
||||
@EqualsAndHashCode.Exclude
|
||||
List<SemanticNode> mainBody;
|
||||
@EqualsAndHashCode.Exclude
|
||||
Header header;
|
||||
@EqualsAndHashCode.Exclude
|
||||
Footer footer;
|
||||
|
||||
@Builder.Default
|
||||
@EqualsAndHashCode.Exclude
|
||||
Set<RedactionEntity> entities = new HashSet<>();
|
||||
|
||||
@Builder.Default
|
||||
@EqualsAndHashCode.Exclude
|
||||
Set<Image> images = new HashSet<>();
|
||||
|
||||
|
||||
public static Page fromClassificationPage(ClassificationPage classificationPage) {
|
||||
|
||||
return Page.builder()
|
||||
.height((int) classificationPage.getPageHeight())
|
||||
.width((int) classificationPage.getPageWidth())
|
||||
.number(classificationPage.getPageNumber())
|
||||
.rotation(classificationPage.getRotation())
|
||||
.mainBody(new LinkedList<>())
|
||||
.build();
|
||||
}
|
||||
|
||||
|
||||
public TextBlock getMainBodyTextBlock() {
|
||||
|
||||
return mainBody.stream().filter(SemanticNode::isLeaf).map(SemanticNode::getLeafTextBlock).collect(new TextBlockCollector());
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return String.valueOf(number);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
|
||||
return number;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public boolean equals(Object o) {
|
||||
|
||||
return o instanceof Page && o.hashCode() == this.hashCode();
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,62 @@
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.DocumentTree;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.entity.RedactionEntity;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.textblock.TextBlock;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class Paragraph implements GenericSemanticNode {
|
||||
|
||||
List<Integer> treeId;
|
||||
TextBlock leafTextBlock;
|
||||
|
||||
@EqualsAndHashCode.Exclude
|
||||
DocumentTree documentTree;
|
||||
|
||||
@Builder.Default
|
||||
@EqualsAndHashCode.Exclude
|
||||
Set<RedactionEntity> entities = new HashSet<>();
|
||||
|
||||
|
||||
@Override
|
||||
public NodeType getType() {
|
||||
|
||||
return NodeType.PARAGRAPH;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public boolean isLeaf() {
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public TextBlock getTextBlock() {
|
||||
|
||||
return leafTextBlock;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return treeId + ": " + NodeType.PARAGRAPH + ": " + leafTextBlock.buildSummary();
|
||||
}
|
||||
|
||||
}
|
||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user