RED-6009: Document Tree Structure
* squashed commits
This commit is contained in:
parent
a6a6fd8180
commit
1f9e151092
@ -23,6 +23,7 @@
|
|||||||
|
|
||||||
<properties>
|
<properties>
|
||||||
<pdfbox.version>2.0.24</pdfbox.version>
|
<pdfbox.version>2.0.24</pdfbox.version>
|
||||||
|
<lombok.version>1.18.26</lombok.version>
|
||||||
</properties>
|
</properties>
|
||||||
|
|
||||||
|
|
||||||
@ -88,5 +89,26 @@
|
|||||||
</plugin>
|
</plugin>
|
||||||
</plugins>
|
</plugins>
|
||||||
</pluginManagement>
|
</pluginManagement>
|
||||||
|
<plugins>
|
||||||
|
<plugin>
|
||||||
|
<groupId>org.projectlombok</groupId>
|
||||||
|
<artifactId>lombok-maven-plugin</artifactId>
|
||||||
|
<version>1.18.20.0</version>
|
||||||
|
<executions>
|
||||||
|
<execution>
|
||||||
|
<id>delombok</id>
|
||||||
|
<phase>generate-sources</phase>
|
||||||
|
<goals>
|
||||||
|
<goal>delombok</goal>
|
||||||
|
</goals>
|
||||||
|
<configuration>
|
||||||
|
<addOutputDirectory>false</addOutputDirectory>
|
||||||
|
<sourceDirectory>src/main/java</sourceDirectory>
|
||||||
|
<outputDirectory>${delomboked.sources}</outputDirectory>
|
||||||
|
</configuration>
|
||||||
|
</execution>
|
||||||
|
</executions>
|
||||||
|
</plugin>
|
||||||
|
</plugins>
|
||||||
</build>
|
</build>
|
||||||
</project>
|
</project>
|
||||||
|
|||||||
@ -39,7 +39,6 @@
|
|||||||
</exclusion>
|
</exclusion>
|
||||||
</exclusions>
|
</exclusions>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
</dependencies>
|
</dependencies>
|
||||||
|
|
||||||
<build>
|
<build>
|
||||||
|
|||||||
@ -12,7 +12,7 @@
|
|||||||
<artifactId>redaction-service-server-v1</artifactId>
|
<artifactId>redaction-service-server-v1</artifactId>
|
||||||
|
|
||||||
<properties>
|
<properties>
|
||||||
<drools.version>7.73.0.Final</drools.version>
|
<drools.version>8.37.0.Final</drools.version>
|
||||||
<kie.version>7.73.0.Final</kie.version>
|
<kie.version>7.73.0.Final</kie.version>
|
||||||
<locationtech.version>1.19.0</locationtech.version>
|
<locationtech.version>1.19.0</locationtech.version>
|
||||||
<javaassist.version>3.29.2-GA</javaassist.version>
|
<javaassist.version>3.29.2-GA</javaassist.version>
|
||||||
@ -64,7 +64,12 @@
|
|||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.drools</groupId>
|
<groupId>org.drools</groupId>
|
||||||
<artifactId>drools-core</artifactId>
|
<artifactId>drools-engine</artifactId>
|
||||||
|
<version>${drools.version}</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.drools</groupId>
|
||||||
|
<artifactId>drools-mvel</artifactId>
|
||||||
<version>${drools.version}</version>
|
<version>${drools.version}</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
@ -198,5 +203,4 @@
|
|||||||
</plugin>
|
</plugin>
|
||||||
</plugins>
|
</plugins>
|
||||||
</build>
|
</build>
|
||||||
|
|
||||||
</project>
|
</project>
|
||||||
|
|||||||
@ -1,26 +0,0 @@
|
|||||||
package com.iqser.red.service.redaction.v1.server.classification.model;
|
|
||||||
|
|
||||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
|
||||||
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
|
|
||||||
|
|
||||||
import lombok.AllArgsConstructor;
|
|
||||||
import lombok.Data;
|
|
||||||
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
@Data
|
|
||||||
@AllArgsConstructor
|
|
||||||
public class Footer {
|
|
||||||
|
|
||||||
private List<TextBlock> textBlocks;
|
|
||||||
|
|
||||||
|
|
||||||
@JsonIgnore
|
|
||||||
public SearchableText getSearchableText() {
|
|
||||||
|
|
||||||
SearchableText searchableText = new SearchableText();
|
|
||||||
textBlocks.forEach(block -> searchableText.addAll(block.getSequences()));
|
|
||||||
return searchableText;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
@ -1,26 +0,0 @@
|
|||||||
package com.iqser.red.service.redaction.v1.server.classification.model;
|
|
||||||
|
|
||||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
|
||||||
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
|
|
||||||
|
|
||||||
import lombok.AllArgsConstructor;
|
|
||||||
import lombok.Data;
|
|
||||||
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
@Data
|
|
||||||
@AllArgsConstructor
|
|
||||||
public class Header {
|
|
||||||
|
|
||||||
private List<TextBlock> textBlocks;
|
|
||||||
|
|
||||||
|
|
||||||
@JsonIgnore
|
|
||||||
public SearchableText getSearchableText() {
|
|
||||||
|
|
||||||
SearchableText searchableText = new SearchableText();
|
|
||||||
textBlocks.forEach(block -> searchableText.addAll(block.getSequences()));
|
|
||||||
return searchableText;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
@ -1,8 +0,0 @@
|
|||||||
package com.iqser.red.service.redaction.v1.server.classification.model;
|
|
||||||
|
|
||||||
/**
 * Orientation values used by the classification model.
 * NOTE(review): presumably describes the rotation/direction of text on a page - confirm against usages.
 */
public enum Orientation {

    NONE, LEFT, RIGHT

}
|
|
||||||
@ -1,65 +0,0 @@
|
|||||||
package com.iqser.red.service.redaction.v1.server.classification.model;
|
|
||||||
|
|
||||||
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
|
|
||||||
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
|
|
||||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
|
|
||||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
|
|
||||||
|
|
||||||
import lombok.Data;
|
|
||||||
import lombok.NoArgsConstructor;
|
|
||||||
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
@Data
|
|
||||||
@NoArgsConstructor
|
|
||||||
public class Paragraph implements Comparable {
|
|
||||||
|
|
||||||
private List<AbstractTextContainer> pageBlocks = new ArrayList<>();
|
|
||||||
private List<PdfImage> images = new ArrayList<>();
|
|
||||||
private String headline;
|
|
||||||
|
|
||||||
|
|
||||||
public SearchableText getSearchableText() {
|
|
||||||
|
|
||||||
SearchableText searchableText = new SearchableText();
|
|
||||||
pageBlocks.forEach(block -> {
|
|
||||||
if (block instanceof TextBlock) {
|
|
||||||
searchableText.addAll(((TextBlock) block).getSequences());
|
|
||||||
}
|
|
||||||
});
|
|
||||||
return searchableText;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public List<Table> getTables() {
|
|
||||||
|
|
||||||
List<Table> tables = new ArrayList<>();
|
|
||||||
pageBlocks.forEach(block -> {
|
|
||||||
if (block instanceof Table) {
|
|
||||||
tables.add((Table) block);
|
|
||||||
}
|
|
||||||
});
|
|
||||||
return tables;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public List<TextBlock> getTextBlocks() {
|
|
||||||
|
|
||||||
List<TextBlock> textBlocks = new ArrayList<>();
|
|
||||||
pageBlocks.forEach(block -> {
|
|
||||||
if (block instanceof TextBlock) {
|
|
||||||
textBlocks.add((TextBlock) block);
|
|
||||||
}
|
|
||||||
});
|
|
||||||
return textBlocks;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public int compareTo(Object o) {
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
@ -1,64 +0,0 @@
|
|||||||
package com.iqser.red.service.redaction.v1.server.classification.model;
|
|
||||||
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.HashMap;
|
|
||||||
import java.util.HashSet;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Map;
|
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
|
||||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.section.SectionArea;
|
|
||||||
import com.iqser.red.service.redaction.v1.server.redaction.model.CellValue;
|
|
||||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Image;
|
|
||||||
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
|
|
||||||
|
|
||||||
import lombok.AllArgsConstructor;
|
|
||||||
import lombok.Builder;
|
|
||||||
import lombok.Data;
|
|
||||||
import lombok.NoArgsConstructor;
|
|
||||||
|
|
||||||
@Data
|
|
||||||
@Builder
|
|
||||||
@NoArgsConstructor
|
|
||||||
@AllArgsConstructor
|
|
||||||
public class SectionText {
|
|
||||||
|
|
||||||
private int sectionNumber;
|
|
||||||
private String text;
|
|
||||||
|
|
||||||
private boolean isTable;
|
|
||||||
private String headline;
|
|
||||||
|
|
||||||
@Builder.Default
|
|
||||||
private List<SectionArea> sectionAreas = new ArrayList<>();
|
|
||||||
@Builder.Default
|
|
||||||
private Set<Image> images = new HashSet<>();
|
|
||||||
@Builder.Default
|
|
||||||
private List<TextBlock> textBlocks = new ArrayList<>();
|
|
||||||
@Builder.Default
|
|
||||||
private Map<String, CellValue> tabularData = new HashMap<>();
|
|
||||||
@Builder.Default
|
|
||||||
private List<Integer> cellStarts = new ArrayList<>();
|
|
||||||
|
|
||||||
|
|
||||||
public void setTabularData(Map<String, CellValue> tabularData) {
|
|
||||||
|
|
||||||
tabularData.remove(null);
|
|
||||||
this.tabularData = tabularData;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@JsonIgnore
|
|
||||||
public SearchableText getSearchableText() {
|
|
||||||
|
|
||||||
SearchableText searchableText = new SearchableText();
|
|
||||||
textBlocks.forEach(block -> {
|
|
||||||
if (block != null) {
|
|
||||||
searchableText.addAll(block.getSequences());
|
|
||||||
}
|
|
||||||
});
|
|
||||||
return searchableText;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
@ -1,19 +0,0 @@
|
|||||||
package com.iqser.red.service.redaction.v1.server.classification.model;
|
|
||||||
|
|
||||||
|
|
||||||
import lombok.AllArgsConstructor;
|
|
||||||
import lombok.Data;
|
|
||||||
import lombok.NoArgsConstructor;
|
|
||||||
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
@Data
|
|
||||||
@NoArgsConstructor
|
|
||||||
@AllArgsConstructor
|
|
||||||
public class Text {
|
|
||||||
|
|
||||||
private int numberOfPages;
|
|
||||||
private List<SectionText> sectionTexts = new ArrayList<>();
|
|
||||||
|
|
||||||
}
|
|
||||||
@ -1,26 +0,0 @@
|
|||||||
package com.iqser.red.service.redaction.v1.server.classification.model;
|
|
||||||
|
|
||||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
|
||||||
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
|
|
||||||
|
|
||||||
import lombok.AllArgsConstructor;
|
|
||||||
import lombok.Data;
|
|
||||||
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
@Data
|
|
||||||
@AllArgsConstructor
|
|
||||||
public class UnclassifiedText {
|
|
||||||
|
|
||||||
private List<TextBlock> textBlocks;
|
|
||||||
|
|
||||||
|
|
||||||
@JsonIgnore
|
|
||||||
public SearchableText getSearchableText() {
|
|
||||||
|
|
||||||
SearchableText searchableText = new SearchableText();
|
|
||||||
textBlocks.forEach(block -> searchableText.addAll(block.getSequences()));
|
|
||||||
return searchableText;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
@ -10,7 +10,7 @@ import lombok.NoArgsConstructor;
|
|||||||
@Builder
|
@Builder
|
||||||
@AllArgsConstructor
|
@AllArgsConstructor
|
||||||
@NoArgsConstructor
|
@NoArgsConstructor
|
||||||
public class EntityRecogintionEntity {
|
public class EntityRecognitionEntity {
|
||||||
|
|
||||||
private String value;
|
private String value;
|
||||||
private int startOffset;
|
private int startOffset;
|
||||||
@ -4,7 +4,6 @@ import java.util.HashMap;
|
|||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
|
|
||||||
import lombok.AllArgsConstructor;
|
import lombok.AllArgsConstructor;
|
||||||
import lombok.Data;
|
import lombok.Data;
|
||||||
import lombok.NoArgsConstructor;
|
import lombok.NoArgsConstructor;
|
||||||
@ -12,8 +11,8 @@ import lombok.NoArgsConstructor;
|
|||||||
@Data
|
@Data
|
||||||
@NoArgsConstructor
|
@NoArgsConstructor
|
||||||
@AllArgsConstructor
|
@AllArgsConstructor
|
||||||
public class NerEntities {
|
public class NerEntitiesModel {
|
||||||
|
|
||||||
private Map<Integer, List<EntityRecogintionEntity>> data = new HashMap<>();
|
private Map<Integer, List<EntityRecognitionEntity>> data = new HashMap<>();
|
||||||
|
|
||||||
}
|
}
|
||||||
@ -1,13 +1,13 @@
|
|||||||
package com.iqser.red.service.redaction.v1.server.controller;
|
package com.iqser.red.service.redaction.v1.server.controller;
|
||||||
|
|
||||||
|
import org.springframework.web.bind.annotation.RestController;
|
||||||
|
|
||||||
import com.iqser.red.service.redaction.v1.model.RuleBuilderModel;
|
import com.iqser.red.service.redaction.v1.model.RuleBuilderModel;
|
||||||
import com.iqser.red.service.redaction.v1.resources.RuleBuilderResource;
|
import com.iqser.red.service.redaction.v1.resources.RuleBuilderResource;
|
||||||
import com.iqser.red.service.redaction.v1.server.redaction.rulebuilder.RuleBuilderModelService;
|
import com.iqser.red.service.redaction.v1.server.redaction.service.RuleBuilderModelService;
|
||||||
|
|
||||||
import lombok.RequiredArgsConstructor;
|
import lombok.RequiredArgsConstructor;
|
||||||
|
|
||||||
import org.springframework.web.bind.annotation.RestController;
|
|
||||||
|
|
||||||
@RestController
|
@RestController
|
||||||
@RequiredArgsConstructor
|
@RequiredArgsConstructor
|
||||||
public class RuleBuilderController implements RuleBuilderResource {
|
public class RuleBuilderController implements RuleBuilderResource {
|
||||||
|
|||||||
@ -1,5 +1,6 @@
|
|||||||
package com.iqser.red.service.redaction.v1.server.segmentation;
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter;
|
||||||
|
|
||||||
|
import java.awt.geom.Rectangle2D;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
@ -10,11 +11,10 @@ import org.springframework.stereotype.Service;
|
|||||||
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.dossiertemplate.dossier.file.FileType;
|
import com.iqser.red.service.persistence.service.v1.api.shared.model.dossiertemplate.dossier.file.FileType;
|
||||||
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.image.ImageServiceResponse;
|
||||||
import com.iqser.red.service.redaction.v1.server.redaction.model.ImageType;
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationPage;
|
||||||
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.image.ClassifiedImage;
|
||||||
import com.iqser.red.service.redaction.v1.server.redaction.model.RedRectangle2D;
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.ImageType;
|
||||||
import com.iqser.red.service.redaction.v1.server.redaction.model.image.ImageServiceResponse;
|
|
||||||
import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService;
|
import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService;
|
||||||
|
|
||||||
import lombok.RequiredArgsConstructor;
|
import lombok.RequiredArgsConstructor;
|
||||||
@ -22,26 +22,26 @@ import lombok.SneakyThrows;
|
|||||||
|
|
||||||
@Service
|
@Service
|
||||||
@RequiredArgsConstructor
|
@RequiredArgsConstructor
|
||||||
public class ImageService {
|
public class ImageServiceResponseAdapter {
|
||||||
|
|
||||||
private final ObjectMapper objectMapper;
|
private final ObjectMapper objectMapper;
|
||||||
private final RedactionStorageService redactionStorageService;
|
private final RedactionStorageService redactionStorageService;
|
||||||
|
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public Map<Integer, List<PdfImage>> convertImages(String dossierId, String fileId) {
|
public Map<Integer, List<ClassifiedImage>> convertImages(String dossierId, String fileId) {
|
||||||
|
|
||||||
var imageClassificationStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(dossierId, fileId, FileType.IMAGE_INFO));
|
var imageClassificationStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(dossierId, fileId, FileType.IMAGE_INFO));
|
||||||
|
|
||||||
ImageServiceResponse imageServiceResponse = objectMapper.readValue(imageClassificationStream, ImageServiceResponse.class);
|
ImageServiceResponse imageServiceResponse = objectMapper.readValue(imageClassificationStream, ImageServiceResponse.class);
|
||||||
|
|
||||||
Map<Integer, List<PdfImage>> images = new HashMap<>();
|
Map<Integer, List<ClassifiedImage>> images = new HashMap<>();
|
||||||
imageServiceResponse.getData().forEach(imageMetadata -> {
|
imageServiceResponse.getData().forEach(imageMetadata -> {
|
||||||
var classification = imageMetadata.getFilters().isAllPassed() ? ImageType.valueOf(imageMetadata.getClassification()
|
var classification = imageMetadata.getFilters().isAllPassed() ? ImageType.valueOf(imageMetadata.getClassification()
|
||||||
.getLabel()
|
.getLabel()
|
||||||
.toUpperCase(Locale.ROOT)) : ImageType.OTHER;
|
.toUpperCase(Locale.ROOT)) : ImageType.OTHER;
|
||||||
images.computeIfAbsent(imageMetadata.getPosition().getPageNumber(), x -> new ArrayList<>())
|
images.computeIfAbsent(imageMetadata.getPosition().getPageNumber(), x -> new ArrayList<>())
|
||||||
.add(new PdfImage(new RedRectangle2D(imageMetadata.getPosition().getX1(),
|
.add(new ClassifiedImage(new Rectangle2D.Double(imageMetadata.getPosition().getX1(),
|
||||||
imageMetadata.getPosition().getY1(),
|
imageMetadata.getPosition().getY1(),
|
||||||
imageMetadata.getGeometry().getWidth(),
|
imageMetadata.getGeometry().getWidth(),
|
||||||
imageMetadata.getGeometry().getHeight()), classification, imageMetadata.isAlpha(), imageMetadata.getPosition().getPageNumber()));
|
imageMetadata.getGeometry().getHeight()), classification, imageMetadata.isAlpha(), imageMetadata.getPosition().getPageNumber()));
|
||||||
@ -53,7 +53,7 @@ public class ImageService {
|
|||||||
.getLabel()
|
.getLabel()
|
||||||
.toUpperCase(Locale.ROOT)) : ImageType.OTHER;
|
.toUpperCase(Locale.ROOT)) : ImageType.OTHER;
|
||||||
images.computeIfAbsent(imageMetadata.getPosition().getPageNumber(), x -> new ArrayList<>())
|
images.computeIfAbsent(imageMetadata.getPosition().getPageNumber(), x -> new ArrayList<>())
|
||||||
.add(new PdfImage(new RedRectangle2D(imageMetadata.getPosition().getX1(),
|
.add(new ClassifiedImage(new Rectangle2D.Double(imageMetadata.getPosition().getX1(),
|
||||||
imageMetadata.getPosition().getY1(),
|
imageMetadata.getPosition().getY1(),
|
||||||
imageMetadata.getGeometry().getWidth(),
|
imageMetadata.getGeometry().getWidth(),
|
||||||
imageMetadata.getGeometry().getHeight()), classification, imageMetadata.isAlpha(), imageMetadata.getPosition().getPageNumber()));
|
imageMetadata.getGeometry().getHeight()), classification, imageMetadata.isAlpha(), imageMetadata.getPosition().getPageNumber()));
|
||||||
@ -63,7 +63,7 @@ public class ImageService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public void findOcr(Page page) {
|
public void findOcr(ClassificationPage page) {
|
||||||
|
|
||||||
page.getImages().forEach(image -> {
|
page.getImages().forEach(image -> {
|
||||||
if (image.getImageType().equals(ImageType.OTHER)) {
|
if (image.getImageType().equals(ImageType.OTHER)) {
|
||||||
@ -0,0 +1,164 @@
|
|||||||
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter;
|
||||||
|
|
||||||
|
import static java.lang.String.format;
|
||||||
|
import static java.util.stream.Collectors.groupingBy;
|
||||||
|
|
||||||
|
import java.awt.geom.Rectangle2D;
|
||||||
|
import java.util.Collection;
|
||||||
|
import java.util.Comparator;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Locale;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
|
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
||||||
|
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.RedactionLog;
|
||||||
|
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.RedactionLogEntry;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.exception.NotFoundException;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.entity.EntityType;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.entity.RedactionEntity;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.entity.RedactionPosition;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Page;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.SemanticNode;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.services.EntityCreationService;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.redaction.model.dictionary.SearchImplementation;
|
||||||
|
|
||||||
|
import lombok.RequiredArgsConstructor;
|
||||||
|
|
||||||
|
@Service
|
||||||
|
@RequiredArgsConstructor
|
||||||
|
public class RedactionLogEntryAdapter {
|
||||||
|
|
||||||
|
private static final double MATCH_THRESHOLD = 1;
|
||||||
|
private final EntityCreationService entityCreationService;
|
||||||
|
|
||||||
|
|
||||||
|
public Stream<RedactionEntity> toRedactionEntity(RedactionLog redactionLog, SemanticNode node) {
|
||||||
|
|
||||||
|
List<Integer> pageNumbers = redactionLog.getRedactionLogEntry().stream().flatMap(entry -> entry.getPositions().stream().map(Rectangle::getPage)).distinct().toList();
|
||||||
|
if (!pageNumbers.stream().allMatch(node::isOnPage)) {
|
||||||
|
throw new IllegalArgumentException(format("SemanticNode %s does not contain these pages %s present in the redaction log",
|
||||||
|
node,
|
||||||
|
pageNumbers.stream().filter(pageNumber -> !node.isOnPage(pageNumber)).toList()));
|
||||||
|
}
|
||||||
|
Set<String> entryValues = redactionLog.getRedactionLogEntry().stream().map(RedactionLogEntry::getValue).map(String::toLowerCase).collect(Collectors.toSet());
|
||||||
|
SearchImplementation searchImplementation = new SearchImplementation(entryValues, true);
|
||||||
|
|
||||||
|
Map<String, List<RedactionEntity>> tempEntitiesByValue = findAllPossibleEntitiesAndGroupByValueIgnoringCase(node, searchImplementation);
|
||||||
|
|
||||||
|
assert allValuesFound(tempEntitiesByValue, entryValues);
|
||||||
|
|
||||||
|
List<RedactionEntity> entities = redactionLog.getRedactionLogEntry()
|
||||||
|
.stream()
|
||||||
|
.map(entry -> findClosestRedactionEntity(entry, tempEntitiesByValue.get(entry.getValue().toLowerCase(Locale.ROOT)), node))
|
||||||
|
.toList();
|
||||||
|
tempEntitiesByValue.values().stream().flatMap(Collection::stream).forEach(RedactionEntity::removeFromGraph);
|
||||||
|
return entities.stream();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static boolean allValuesFound(Map<String, List<RedactionEntity>> entitiesByValue, Set<String> entryValues) {
|
||||||
|
|
||||||
|
return entitiesByValue.keySet().equals(entryValues);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private Map<String, List<RedactionEntity>> findAllPossibleEntitiesAndGroupByValueIgnoringCase(SemanticNode node, SearchImplementation searchImplementation) {
|
||||||
|
|
||||||
|
return searchImplementation.getBoundaries(node.getTextBlock(), node.getBoundary())
|
||||||
|
.stream()
|
||||||
|
.map(boundary -> entityCreationService.byBoundary(boundary, "temp", EntityType.ENTITY, node))
|
||||||
|
.collect(groupingBy(entity -> entity.getValue().toLowerCase(Locale.ROOT)));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private RedactionEntity findClosestRedactionEntity(RedactionLogEntry redactionLogEntry, List<RedactionEntity> entitiesWithSameValue, SemanticNode node) {
|
||||||
|
|
||||||
|
RedactionEntity closestEntity = entitiesWithSameValue.stream()
|
||||||
|
.filter(entity -> pagesMatch(entity, redactionLogEntry))
|
||||||
|
.min(Comparator.comparingDouble(entity -> calculateMinDistance(redactionLogEntry, entity)))
|
||||||
|
.orElseThrow(() -> new NotFoundException(format("No entity with similar position found for %s", redactionLogEntry)));
|
||||||
|
|
||||||
|
double distance = calculateMinDistance(redactionLogEntry, closestEntity);
|
||||||
|
if (distance > MATCH_THRESHOLD) {
|
||||||
|
throw new NotFoundException(format("Distance to closest found entity is %.2f for \n%s \n%s",
|
||||||
|
distance,
|
||||||
|
redactionLogEntry.getPositions(),
|
||||||
|
closestEntity.getRedactionPositionsPerPage()));
|
||||||
|
}
|
||||||
|
|
||||||
|
return createCorrectEntity(redactionLogEntry, node, closestEntity);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private RedactionEntity createCorrectEntity(RedactionLogEntry redactionLogEntry, SemanticNode node, RedactionEntity closestEntity) {
|
||||||
|
|
||||||
|
RedactionEntity correctEntity = entityCreationService.byBoundary(closestEntity.getBoundary(),
|
||||||
|
redactionLogEntry.getType(),
|
||||||
|
redactionLogEntry.isRecommendation() ? EntityType.RECOMMENDATION : EntityType.ENTITY,
|
||||||
|
node);
|
||||||
|
correctEntity.setLegalBasis(redactionLogEntry.getLegalBasis());
|
||||||
|
correctEntity.setRedactionReason(redactionLogEntry.getReason());
|
||||||
|
correctEntity.addMatchedRule(redactionLogEntry.getMatchedRule());
|
||||||
|
correctEntity.setRedaction(redactionLogEntry.isRedacted());
|
||||||
|
correctEntity.setDictionaryEntry(redactionLogEntry.isDictionaryEntry());
|
||||||
|
correctEntity.setDossierDictionaryEntry(redactionLogEntry.isDossierDictionaryEntry());
|
||||||
|
return correctEntity;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static boolean pagesMatch(RedactionEntity entity, RedactionLogEntry redactionLogEntry) {
|
||||||
|
|
||||||
|
Set<Integer> entityPageNumbers = entity.getRedactionPositionsPerPage().stream().map(RedactionPosition::getPage).map(Page::getNumber).collect(Collectors.toSet());
|
||||||
|
Set<Integer> redactionLogEntryPageNumbers = redactionLogEntry.getPositions().stream().map(Rectangle::getPage).collect(Collectors.toSet());
|
||||||
|
return entityPageNumbers.equals(redactionLogEntryPageNumbers);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private double calculateMinDistance(RedactionLogEntry redactionLogEntry, RedactionEntity entity) {
|
||||||
|
|
||||||
|
if (redactionLogEntry.getPositions().size() != countRectangles(entity)) {
|
||||||
|
return Double.MAX_VALUE;
|
||||||
|
}
|
||||||
|
return redactionLogEntry.getPositions().stream().mapToDouble(redactionLogEntryRectangle -> calculateMinDistancePerRectangle(entity, redactionLogEntryRectangle)).sum();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static long countRectangles(RedactionEntity entity) {
|
||||||
|
|
||||||
|
return entity.getRedactionPositionsPerPage().stream().mapToLong(redactionPosition -> redactionPosition.getRectanglePerLine().size()).sum();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private double calculateMinDistancePerRectangle(RedactionEntity entity, Rectangle redactionLogEntryRectangle) {
|
||||||
|
|
||||||
|
return entity.getRedactionPositionsPerPage()
|
||||||
|
.stream()
|
||||||
|
.filter(redactionPosition -> redactionPosition.getPage().getNumber() == redactionLogEntryRectangle.getPage())
|
||||||
|
.map(RedactionPosition::getRectanglePerLine)
|
||||||
|
.flatMap(Collection::stream)
|
||||||
|
.mapToDouble(rectangle -> calculateDistance(rectangle, toRectangle2D(redactionLogEntryRectangle)))
|
||||||
|
.min()
|
||||||
|
.orElse(Double.MAX_VALUE);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private double calculateDistance(Rectangle2D rectangle, Rectangle2D rectangle2D) {
|
||||||
|
|
||||||
|
return Math.abs(rectangle.getMinX() - rectangle2D.getMinX()) //
|
||||||
|
+ Math.abs(rectangle.getMinY() - rectangle2D.getMinY()) //
|
||||||
|
+ Math.abs(rectangle.getMaxX() - rectangle2D.getMaxX()) //
|
||||||
|
+ Math.abs(rectangle.getMaxY() - rectangle2D.getMaxY());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private Rectangle2D toRectangle2D(Rectangle rectangle) {
|
||||||
|
|
||||||
|
return new Rectangle2D.Float(rectangle.getTopLeft().getX(), rectangle.getTopLeft().getY() + rectangle.getHeight(), rectangle.getWidth(), -rectangle.getHeight());
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -1,4 +1,4 @@
|
|||||||
package com.iqser.red.service.redaction.v1.server.segmentation;
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Collection;
|
import java.util.Collection;
|
||||||
@ -10,9 +10,9 @@ import org.springframework.stereotype.Service;
|
|||||||
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.dossiertemplate.dossier.file.FileType;
|
import com.iqser.red.service.persistence.service.v1.api.shared.model.dossiertemplate.dossier.file.FileType;
|
||||||
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfTableCell;
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.table.PdfTableCell;
|
||||||
import com.iqser.red.service.redaction.v1.server.redaction.model.table.TableCells;
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.table.TableCells;
|
||||||
import com.iqser.red.service.redaction.v1.server.redaction.model.table.TableServiceResponse;
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.table.TableServiceResponse;
|
||||||
import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService;
|
import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService;
|
||||||
|
|
||||||
import lombok.RequiredArgsConstructor;
|
import lombok.RequiredArgsConstructor;
|
||||||
@ -22,7 +22,7 @@ import lombok.extern.slf4j.Slf4j;
|
|||||||
@Slf4j
|
@Slf4j
|
||||||
@Service
|
@Service
|
||||||
@RequiredArgsConstructor
|
@RequiredArgsConstructor
|
||||||
public class TableService {
|
public class TableServiceResponseAdapter {
|
||||||
|
|
||||||
private final ObjectMapper objectMapper;
|
private final ObjectMapper objectMapper;
|
||||||
private final RedactionStorageService redactionStorageService;
|
private final RedactionStorageService redactionStorageService;
|
||||||
@ -1,9 +1,8 @@
|
|||||||
package com.iqser.red.service.redaction.v1.server.redaction.model.image;
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.image;
|
||||||
|
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
|
|
||||||
import lombok.Data;
|
import lombok.Data;
|
||||||
|
|
||||||
@Data
|
@Data
|
||||||
@ -1,5 +1,4 @@
|
|||||||
package com.iqser.red.service.redaction.v1.server.redaction.model.image;
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.image;
|
||||||
|
|
||||||
|
|
||||||
import lombok.Data;
|
import lombok.Data;
|
||||||
|
|
||||||
@ -1,5 +1,4 @@
|
|||||||
package com.iqser.red.service.redaction.v1.server.redaction.model.image;
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.image;
|
||||||
|
|
||||||
|
|
||||||
import lombok.Data;
|
import lombok.Data;
|
||||||
|
|
||||||
@ -1,5 +1,4 @@
|
|||||||
package com.iqser.red.service.redaction.v1.server.redaction.model.image;
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.image;
|
||||||
|
|
||||||
|
|
||||||
import lombok.Data;
|
import lombok.Data;
|
||||||
|
|
||||||
@ -1,5 +1,4 @@
|
|||||||
package com.iqser.red.service.redaction.v1.server.redaction.model.image;
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.image;
|
||||||
|
|
||||||
|
|
||||||
import lombok.Data;
|
import lombok.Data;
|
||||||
|
|
||||||
@ -1,5 +1,4 @@
|
|||||||
package com.iqser.red.service.redaction.v1.server.redaction.model.image;
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.image;
|
||||||
|
|
||||||
|
|
||||||
import lombok.Data;
|
import lombok.Data;
|
||||||
|
|
||||||
@ -1,13 +1,13 @@
|
|||||||
package com.iqser.red.service.redaction.v1.server.redaction.model.image;
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.image;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
import com.fasterxml.jackson.annotation.JsonAlias;
|
import com.fasterxml.jackson.annotation.JsonAlias;
|
||||||
import com.fasterxml.jackson.annotation.JsonProperty;
|
import com.fasterxml.jackson.annotation.JsonProperty;
|
||||||
|
|
||||||
import lombok.Data;
|
import lombok.Data;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
@Data
|
@Data
|
||||||
public class ImageServiceResponse {
|
public class ImageServiceResponse {
|
||||||
|
|
||||||
@ -1,4 +1,4 @@
|
|||||||
package com.iqser.red.service.redaction.v1.server.redaction.model.image;
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.image;
|
||||||
|
|
||||||
import lombok.Data;
|
import lombok.Data;
|
||||||
|
|
||||||
@ -1,5 +1,4 @@
|
|||||||
package com.iqser.red.service.redaction.v1.server.redaction.model.image;
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.image;
|
||||||
|
|
||||||
|
|
||||||
import lombok.Data;
|
import lombok.Data;
|
||||||
|
|
||||||
@ -0,0 +1,10 @@
|
|||||||
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.image;
|
||||||
|
|
||||||
|
import lombok.Data;
|
||||||
|
|
||||||
|
@Data
|
||||||
|
public class Probability {
|
||||||
|
|
||||||
|
private boolean unconfident;
|
||||||
|
|
||||||
|
}
|
||||||
@ -1,5 +1,4 @@
|
|||||||
package com.iqser.red.service.redaction.v1.server.redaction.model.table;
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.table;
|
||||||
|
|
||||||
|
|
||||||
import lombok.Data;
|
import lombok.Data;
|
||||||
|
|
||||||
@ -1,4 +1,4 @@
|
|||||||
package com.iqser.red.service.redaction.v1.server.redaction.model;
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.table;
|
||||||
|
|
||||||
import lombok.AllArgsConstructor;
|
import lombok.AllArgsConstructor;
|
||||||
import lombok.Builder;
|
import lombok.Builder;
|
||||||
@ -1,5 +1,4 @@
|
|||||||
package com.iqser.red.service.redaction.v1.server.redaction.model.table;
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.table;
|
||||||
|
|
||||||
|
|
||||||
import lombok.Data;
|
import lombok.Data;
|
||||||
|
|
||||||
@ -1,9 +1,8 @@
|
|||||||
package com.iqser.red.service.redaction.v1.server.redaction.model.table;
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.table;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
|
|
||||||
import lombok.Data;
|
import lombok.Data;
|
||||||
|
|
||||||
@Data
|
@Data
|
||||||
@ -1,9 +1,8 @@
|
|||||||
package com.iqser.red.service.redaction.v1.server.redaction.model.table;
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.table;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
|
|
||||||
import lombok.Data;
|
import lombok.Data;
|
||||||
|
|
||||||
@Data
|
@Data
|
||||||
@ -1,9 +1,8 @@
|
|||||||
package com.iqser.red.service.redaction.v1.server.tableextraction.model;
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model;
|
||||||
|
|
||||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
||||||
import com.iqser.red.service.redaction.v1.server.classification.model.Orientation;
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPageBlock;
|
||||||
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
|
||||||
|
|
||||||
import lombok.AllArgsConstructor;
|
import lombok.AllArgsConstructor;
|
||||||
import lombok.Data;
|
import lombok.Data;
|
||||||
@ -12,7 +11,7 @@ import lombok.NoArgsConstructor;
|
|||||||
@Data
|
@Data
|
||||||
@AllArgsConstructor
|
@AllArgsConstructor
|
||||||
@NoArgsConstructor
|
@NoArgsConstructor
|
||||||
public abstract class AbstractTextContainer {
|
public abstract class AbstractPageBlock {
|
||||||
|
|
||||||
@JsonIgnore
|
@JsonIgnore
|
||||||
protected float minX;
|
protected float minX;
|
||||||
@ -23,7 +22,7 @@ public abstract class AbstractTextContainer {
|
|||||||
@JsonIgnore
|
@JsonIgnore
|
||||||
protected float maxY;
|
protected float maxY;
|
||||||
@JsonIgnore
|
@JsonIgnore
|
||||||
protected String classification;
|
protected PageBlockType classification;
|
||||||
@JsonIgnore
|
@JsonIgnore
|
||||||
protected int page;
|
protected int page;
|
||||||
|
|
||||||
@ -34,13 +33,19 @@ public abstract class AbstractTextContainer {
|
|||||||
public abstract String getText();
|
public abstract String getText();
|
||||||
|
|
||||||
|
|
||||||
public boolean containsBlock(TextBlock other) {
|
public boolean isHeadline() {
|
||||||
|
|
||||||
|
return this instanceof TextPageBlock && this.getClassification() != null && this.getClassification().isHeadline();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean containsBlock(TextPageBlock other) {
|
||||||
|
|
||||||
return this.minX <= other.getPdfMinX() && this.maxX >= other.getPdfMaxX() && this.minY >= other.getPdfMinY() && this.maxY <= other.getPdfMaxY();
|
return this.minX <= other.getPdfMinX() && this.maxX >= other.getPdfMaxX() && this.minY >= other.getPdfMinY() && this.maxY <= other.getPdfMaxY();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public boolean contains(AbstractTextContainer other) {
|
public boolean contains(AbstractPageBlock other) {
|
||||||
|
|
||||||
return this.minX <= other.minX && this.maxX >= other.maxX && this.minY >= other.minY && this.maxY <= other.maxY;
|
return this.minX <= other.minX && this.maxX >= other.maxX && this.minY >= other.minY && this.maxY <= other.maxY;
|
||||||
}
|
}
|
||||||
@ -66,4 +71,10 @@ public abstract class AbstractTextContainer {
|
|||||||
return maxX - minX;
|
return maxX - minX;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean intersectsY(AbstractPageBlock atc) {
|
||||||
|
|
||||||
|
return this.minY <= atc.getMaxY() && this.maxY >= atc.getMinY();
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
@ -1,22 +1,24 @@
|
|||||||
package com.iqser.red.service.redaction.v1.server.classification.model;
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.section.SectionGrid;
|
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.section.SectionGrid;
|
||||||
import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryVersion;
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.StringFrequencyCounter;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.UnclassifiedText;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.redaction.model.dictionary.DictionaryVersion;
|
||||||
|
|
||||||
import lombok.Data;
|
import lombok.Data;
|
||||||
import lombok.NoArgsConstructor;
|
import lombok.NoArgsConstructor;
|
||||||
|
|
||||||
@Data
|
@Data
|
||||||
@NoArgsConstructor
|
@NoArgsConstructor
|
||||||
public class Document {
|
public class ClassificationDocument {
|
||||||
|
|
||||||
private List<Page> pages = new ArrayList<>();
|
private List<ClassificationPage> pages = new ArrayList<>();
|
||||||
private List<Paragraph> paragraphs = new ArrayList<>();
|
private List<ClassificationSection> sections = new ArrayList<>();
|
||||||
private List<Header> headers = new ArrayList<>();
|
private List<ClassificationHeader> headers = new ArrayList<>();
|
||||||
private List<Footer> footers = new ArrayList<>();
|
private List<ClassificationFooter> footers = new ArrayList<>();
|
||||||
private List<UnclassifiedText> unclassifiedTexts = new ArrayList<>();
|
private List<UnclassifiedText> unclassifiedTexts = new ArrayList<>();
|
||||||
private FloatFrequencyCounter textHeightCounter = new FloatFrequencyCounter();
|
private FloatFrequencyCounter textHeightCounter = new FloatFrequencyCounter();
|
||||||
private FloatFrequencyCounter fontSizeCounter = new FloatFrequencyCounter();
|
private FloatFrequencyCounter fontSizeCounter = new FloatFrequencyCounter();
|
||||||
@ -0,0 +1,16 @@
|
|||||||
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPageBlock;
|
||||||
|
|
||||||
|
import lombok.AllArgsConstructor;
|
||||||
|
import lombok.Data;
|
||||||
|
|
||||||
|
@Data
|
||||||
|
@AllArgsConstructor
|
||||||
|
public class ClassificationFooter {
|
||||||
|
|
||||||
|
private List<TextPageBlock> textBlocks;
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,16 @@
|
|||||||
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPageBlock;
|
||||||
|
|
||||||
|
import lombok.AllArgsConstructor;
|
||||||
|
import lombok.Data;
|
||||||
|
|
||||||
|
@Data
|
||||||
|
@AllArgsConstructor
|
||||||
|
public class ClassificationHeader {
|
||||||
|
|
||||||
|
private List<TextPageBlock> textBlocks;
|
||||||
|
|
||||||
|
}
|
||||||
@ -1,11 +1,11 @@
|
|||||||
package com.iqser.red.service.redaction.v1.server.classification.model;
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
||||||
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.image.ClassifiedImage;
|
||||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.StringFrequencyCounter;
|
||||||
|
|
||||||
import lombok.Data;
|
import lombok.Data;
|
||||||
import lombok.NonNull;
|
import lombok.NonNull;
|
||||||
@ -13,12 +13,12 @@ import lombok.RequiredArgsConstructor;
|
|||||||
|
|
||||||
@Data
|
@Data
|
||||||
@RequiredArgsConstructor
|
@RequiredArgsConstructor
|
||||||
public class Page {
|
public class ClassificationPage {
|
||||||
|
|
||||||
@NonNull
|
@NonNull
|
||||||
private List<AbstractTextContainer> textBlocks;
|
private List<AbstractPageBlock> textBlocks;
|
||||||
|
|
||||||
private List<PdfImage> images = new ArrayList<>();
|
private List<ClassifiedImage> images = new ArrayList<>();
|
||||||
|
|
||||||
private Rectangle bodyTextFrame;
|
private Rectangle bodyTextFrame;
|
||||||
|
|
||||||
@ -35,10 +35,4 @@ public class Page {
|
|||||||
private float pageWidth;
|
private float pageWidth;
|
||||||
private float pageHeight;
|
private float pageHeight;
|
||||||
|
|
||||||
|
|
||||||
public boolean isRotated() {
|
|
||||||
|
|
||||||
return rotation != 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
@ -0,0 +1,32 @@
|
|||||||
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.image.ClassifiedImage;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table.TablePageBlock;
|
||||||
|
|
||||||
|
import lombok.Data;
|
||||||
|
import lombok.NoArgsConstructor;
|
||||||
|
|
||||||
|
@Data
|
||||||
|
@NoArgsConstructor
|
||||||
|
public class ClassificationSection {
|
||||||
|
|
||||||
|
private List<AbstractPageBlock> pageBlocks = new ArrayList<>();
|
||||||
|
private List<ClassifiedImage> images = new ArrayList<>();
|
||||||
|
private String headline;
|
||||||
|
|
||||||
|
|
||||||
|
public List<TablePageBlock> getTables() {
|
||||||
|
|
||||||
|
List<TablePageBlock> tables = new ArrayList<>();
|
||||||
|
pageBlocks.forEach(block -> {
|
||||||
|
if (block instanceof TablePageBlock) {
|
||||||
|
tables.add((TablePageBlock) block);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
return tables;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -1,6 +1,4 @@
|
|||||||
package com.iqser.red.service.redaction.v1.server.classification.model;
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model;
|
||||||
|
|
||||||
import lombok.Getter;
|
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
@ -9,6 +7,8 @@ import java.util.List;
|
|||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import lombok.Getter;
|
||||||
|
|
||||||
public class FloatFrequencyCounter {
|
public class FloatFrequencyCounter {
|
||||||
|
|
||||||
@Getter
|
@Getter
|
||||||
@ -0,0 +1,8 @@
|
|||||||
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model;
|
||||||
|
|
||||||
|
/**
 * Orientation values: {@link #NONE}, {@link #LEFT} and {@link #RIGHT}.
 * NOTE(review): the exact meaning of each value (e.g. text rotation vs. alignment) is not visible here - confirm with the users of this enum.
 */
public enum Orientation {

    NONE,
    LEFT,
    RIGHT

}
|
||||||
@ -0,0 +1,38 @@
|
|||||||
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model;
|
||||||
|
|
||||||
|
/**
 * Classification assigned to a page block: six headline levels plus header, footer, title,
 * paragraph variants, table and a catch-all {@link #OTHER}.
 */
public enum PageBlockType {
    H1,
    H2,
    H3,
    H4,
    H5,
    H6,
    HEADER,
    FOOTER,
    TITLE,
    PARAGRAPH,
    PARAGRAPH_BOLD,
    PARAGRAPH_ITALIC,
    PARAGRAPH_UNKNOWN,
    OTHER,
    TABLE;


    /**
     * Maps a numeric headline level to its headline constant.
     *
     * @param i the headline level
     * @return {@link #H1} to {@link #H5} for levels 1 to 5; {@link #H6} for every other value,
     *         including 6, zero, negative numbers and levels above 6
     */
    public static PageBlockType getHeadlineType(int i) {

        return switch (i) {
            case 1 -> H1;
            case 2 -> H2;
            case 3 -> H3;
            case 4 -> H4;
            case 5 -> H5;
            default -> H6;
        };
    }


    /**
     * Tells whether this constant is one of the headline levels.
     *
     * @return {@code true} for {@link #H1} through {@link #H6}, {@code false} otherwise
     */
    public boolean isHeadline() {

        // Enum constants are singletons, so a switch on identity replaces the equals() chain.
        return switch (this) {
            case H1, H2, H3, H4, H5, H6 -> true;
            default -> false;
        };
    }

}
|
||||||
@ -0,0 +1,25 @@
|
|||||||
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.image;
|
||||||
|
|
||||||
|
import java.awt.geom.Rectangle2D;
|
||||||
|
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.ImageType;
|
||||||
|
|
||||||
|
import lombok.Data;
|
||||||
|
import lombok.NonNull;
|
||||||
|
import lombok.RequiredArgsConstructor;
|
||||||
|
|
||||||
|
@Data
|
||||||
|
@RequiredArgsConstructor
|
||||||
|
public class ClassifiedImage {
|
||||||
|
|
||||||
|
@NonNull
|
||||||
|
private Rectangle2D position;
|
||||||
|
@NonNull
|
||||||
|
private ImageType imageType;
|
||||||
|
private boolean isAppendedToSection;
|
||||||
|
@NonNull
|
||||||
|
private boolean hasTransparency;
|
||||||
|
@NonNull
|
||||||
|
private int page;
|
||||||
|
|
||||||
|
}
|
||||||
@ -1,25 +1,25 @@
|
|||||||
package com.iqser.red.service.redaction.v1.server.tableextraction.model;
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table;
|
||||||
|
|
||||||
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
|
||||||
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
|
|
||||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
|
|
||||||
|
|
||||||
import lombok.Data;
|
|
||||||
import lombok.EqualsAndHashCode;
|
|
||||||
import lombok.NoArgsConstructor;
|
|
||||||
|
|
||||||
import java.awt.geom.Point2D;
|
import java.awt.geom.Point2D;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPageBlock;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPositionSequence;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
|
||||||
|
|
||||||
|
import lombok.Data;
|
||||||
|
import lombok.EqualsAndHashCode;
|
||||||
|
import lombok.NoArgsConstructor;
|
||||||
|
|
||||||
@SuppressWarnings("serial")
|
@SuppressWarnings("serial")
|
||||||
@Data
|
@Data
|
||||||
@EqualsAndHashCode(callSuper = true)
|
@EqualsAndHashCode(callSuper = true)
|
||||||
@NoArgsConstructor
|
@NoArgsConstructor
|
||||||
public class Cell extends Rectangle {
|
public class Cell extends Rectangle {
|
||||||
|
|
||||||
private List<TextBlock> textBlocks = new ArrayList<>();
|
private List<TextPageBlock> textBlocks = new ArrayList<>();
|
||||||
|
|
||||||
private List<Cell> headerCells = new ArrayList<>();
|
private List<Cell> headerCells = new ArrayList<>();
|
||||||
|
|
||||||
@ -27,6 +27,8 @@ public class Cell extends Rectangle {
|
|||||||
|
|
||||||
private static final int MIN_SIZE = 1;
|
private static final int MIN_SIZE = 1;
|
||||||
|
|
||||||
|
private int pageNumber;
|
||||||
|
|
||||||
|
|
||||||
public Cell(Point2D topLeft, Point2D bottomRight) {
|
public Cell(Point2D topLeft, Point2D bottomRight) {
|
||||||
|
|
||||||
@ -34,7 +36,7 @@ public class Cell extends Rectangle {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public void addTextBlock(TextBlock textBlock) {
|
public void addTextBlock(TextPageBlock textBlock) {
|
||||||
|
|
||||||
textBlocks.add(textBlock);
|
textBlocks.add(textBlock);
|
||||||
}
|
}
|
||||||
@ -45,11 +47,11 @@ public class Cell extends Rectangle {
|
|||||||
|
|
||||||
StringBuilder sb = new StringBuilder();
|
StringBuilder sb = new StringBuilder();
|
||||||
|
|
||||||
Iterator<TextBlock> itty = textBlocks.iterator();
|
Iterator<TextPageBlock> itty = textBlocks.iterator();
|
||||||
TextPositionSequence previous = null;
|
TextPositionSequence previous = null;
|
||||||
while (itty.hasNext()) {
|
while (itty.hasNext()) {
|
||||||
|
|
||||||
TextBlock textBlock = itty.next();
|
TextPageBlock textBlock = itty.next();
|
||||||
|
|
||||||
for (TextPositionSequence word : textBlock.getSequences()) {
|
for (TextPositionSequence word : textBlock.getSequences()) {
|
||||||
if (previous != null) {
|
if (previous != null) {
|
||||||
@ -1,4 +1,4 @@
|
|||||||
package com.iqser.red.service.redaction.v1.server.tableextraction.model;
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table;
|
||||||
|
|
||||||
import lombok.RequiredArgsConstructor;
|
import lombok.RequiredArgsConstructor;
|
||||||
import lombok.Value;
|
import lombok.Value;
|
||||||
@ -1,10 +1,10 @@
|
|||||||
package com.iqser.red.service.redaction.v1.server.tableextraction.model;
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
import lombok.Builder;
|
import lombok.Builder;
|
||||||
import lombok.Data;
|
import lombok.Data;
|
||||||
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
@Data
|
@Data
|
||||||
@Builder
|
@Builder
|
||||||
public class CleanRulings {
|
public class CleanRulings {
|
||||||
@ -1,4 +1,4 @@
|
|||||||
package com.iqser.red.service.redaction.v1.server.tableextraction.model;
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table;
|
||||||
|
|
||||||
import java.awt.geom.Point2D;
|
import java.awt.geom.Point2D;
|
||||||
import java.awt.geom.Rectangle2D;
|
import java.awt.geom.Rectangle2D;
|
||||||
@ -1,14 +1,20 @@
|
|||||||
package com.iqser.red.service.redaction.v1.server.tableextraction.model;
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table;
|
||||||
|
|
||||||
import com.iqser.red.service.redaction.v1.server.tableextraction.utils.CohenSutherlandClipping;
|
|
||||||
import com.iqser.red.service.redaction.v1.server.tableextraction.utils.Utils;
|
|
||||||
|
|
||||||
import lombok.extern.slf4j.Slf4j;
|
|
||||||
|
|
||||||
import java.awt.geom.Line2D;
|
import java.awt.geom.Line2D;
|
||||||
import java.awt.geom.Point2D;
|
import java.awt.geom.Point2D;
|
||||||
import java.awt.geom.Rectangle2D;
|
import java.awt.geom.Rectangle2D;
|
||||||
import java.util.*;
|
import java.util.ArrayList;
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.Comparator;
|
||||||
|
import java.util.Formatter;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.TreeMap;
|
||||||
|
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.utils.CohenSutherlandClipping;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.utils.Utils;
|
||||||
|
|
||||||
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
|
||||||
@Slf4j
|
@Slf4j
|
||||||
@SuppressWarnings("all")
|
@SuppressWarnings("all")
|
||||||
@ -1,25 +1,25 @@
|
|||||||
package com.iqser.red.service.redaction.v1.server.tableextraction.model;
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table;
|
||||||
|
|
||||||
import java.awt.geom.Point2D;
|
import java.awt.geom.Point2D;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
import java.util.Comparator;
|
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.TreeMap;
|
import java.util.TreeMap;
|
||||||
import java.util.stream.Collectors;
|
|
||||||
|
|
||||||
import org.apache.commons.collections4.CollectionUtils;
|
import org.apache.commons.collections4.CollectionUtils;
|
||||||
|
|
||||||
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.AbstractPageBlock;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.PageBlockType;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPageBlock;
|
||||||
|
|
||||||
import lombok.Getter;
|
import lombok.Getter;
|
||||||
import lombok.Setter;
|
import lombok.Setter;
|
||||||
import lombok.extern.slf4j.Slf4j;
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
|
||||||
@Slf4j
|
@Slf4j
|
||||||
public class Table extends AbstractTextContainer {
|
public class TablePageBlock extends AbstractPageBlock {
|
||||||
|
|
||||||
private final TreeMap<CellPosition, Cell> cells = new TreeMap<>();
|
private final TreeMap<CellPosition, Cell> cells = new TreeMap<>();
|
||||||
|
|
||||||
@ -29,21 +29,18 @@ public class Table extends AbstractTextContainer {
|
|||||||
private String headline;
|
private String headline;
|
||||||
private int unrotatedRowCount;
|
private int unrotatedRowCount;
|
||||||
private int unrotatedColCount;
|
private int unrotatedColCount;
|
||||||
private int rowCount = -1;
|
|
||||||
private int colCount = -1;
|
|
||||||
private List<List<Cell>> rows;
|
private List<List<Cell>> rows;
|
||||||
|
|
||||||
|
|
||||||
public Table(List<Cell> cells, Rectangle area, int rotation) {
|
public TablePageBlock(List<Cell> cells, Rectangle area, int rotation) {
|
||||||
|
|
||||||
addCells(cells);
|
addCells(cells);
|
||||||
minX = area.getLeft();
|
minX = area.getLeft();
|
||||||
minY = area.getBottom();
|
minY = area.getBottom();
|
||||||
maxX = area.getRight();
|
maxX = area.getRight();
|
||||||
maxY = area.getTop();
|
maxY = area.getTop();
|
||||||
classification = "Table";
|
classification = PageBlockType.TABLE;
|
||||||
this.rotation = rotation;
|
this.rotation = rotation;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -71,19 +68,13 @@ public class Table extends AbstractTextContainer {
|
|||||||
|
|
||||||
public int getRowCount() {
|
public int getRowCount() {
|
||||||
|
|
||||||
if (rowCount == -1) {
|
return getRows().size();
|
||||||
rowCount = getRows().size();
|
|
||||||
}
|
|
||||||
return rowCount;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public int getColCount() {
|
public int getColCount() {
|
||||||
|
|
||||||
if (colCount == -1) {
|
return getRows().stream().mapToInt(List::size).max().orElse(0);
|
||||||
colCount = getRows().stream().mapToInt(List::size).max().orElse(0);
|
|
||||||
}
|
|
||||||
return colCount;
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -224,7 +215,7 @@ public class Table extends AbstractTextContainer {
|
|||||||
* Calculates the structure of the table. For spanning rows and columns multiple cells with the same values will be inserted.
|
* Calculates the structure of the table. For spanning rows and columns multiple cells with the same values will be inserted.
|
||||||
*
|
*
|
||||||
* @param cells The found cells
|
* @param cells The found cells
|
||||||
* @return Table Structure
|
* @return TablePageBlock Structure
|
||||||
*/
|
*/
|
||||||
private List<List<Cell>> calculateStructure(List<Cell> cells) {
|
private List<List<Cell>> calculateStructure(List<Cell> cells) {
|
||||||
|
|
||||||
@ -243,8 +234,8 @@ public class Table extends AbstractTextContainer {
|
|||||||
uniqueY.add(c.getTop());
|
uniqueY.add(c.getTop());
|
||||||
});
|
});
|
||||||
|
|
||||||
var sortedUniqueX = uniqueX.stream().sorted().collect(Collectors.toList());
|
var sortedUniqueX = uniqueX.stream().sorted().toList();
|
||||||
var sortedUniqueY = uniqueY.stream().sorted().collect(Collectors.toList());
|
var sortedUniqueY = uniqueY.stream().sorted().toList();
|
||||||
|
|
||||||
Float prevY = null;
|
Float prevY = null;
|
||||||
for (Float y : sortedUniqueY) {
|
for (Float y : sortedUniqueY) {
|
||||||
@ -258,9 +249,7 @@ public class Table extends AbstractTextContainer {
|
|||||||
var cell = new Cell(new Point2D.Float(prevX, prevY), new Point2D.Float(x, y));
|
var cell = new Cell(new Point2D.Float(prevX, prevY), new Point2D.Float(x, y));
|
||||||
|
|
||||||
var intersectionCell = cells.stream().filter(c -> cell.intersects(c) && cell.overlapRatio(c) > 0.1f).findFirst();
|
var intersectionCell = cells.stream().filter(c -> cell.intersects(c) && cell.overlapRatio(c) > 0.1f).findFirst();
|
||||||
if (intersectionCell.isPresent()) {
|
intersectionCell.ifPresent(value -> cell.getTextBlocks().addAll(value.getTextBlocks()));
|
||||||
cell.getTextBlocks().addAll(intersectionCell.get().getTextBlocks());
|
|
||||||
}
|
|
||||||
if (cell.hasMinimumSize()) {
|
if (cell.hasMinimumSize()) {
|
||||||
row.add(cell);
|
row.add(cell);
|
||||||
}
|
}
|
||||||
@ -268,7 +257,7 @@ public class Table extends AbstractTextContainer {
|
|||||||
prevX = x;
|
prevX = x;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (prevY != null && prevX != null) {
|
if (prevY != null && prevX != null && !row.isEmpty()) {
|
||||||
matrix.add(row);
|
matrix.add(row);
|
||||||
}
|
}
|
||||||
prevY = y;
|
prevY = y;
|
||||||
@ -299,7 +288,7 @@ public class Table extends AbstractTextContainer {
|
|||||||
}
|
}
|
||||||
if (column != null && column.getTextBlocks() != null) {
|
if (column != null && column.getTextBlocks() != null) {
|
||||||
boolean first = true;
|
boolean first = true;
|
||||||
for (TextBlock textBlock : column.getTextBlocks()) {
|
for (TextPageBlock textBlock : column.getTextBlocks()) {
|
||||||
if (!first) {
|
if (!first) {
|
||||||
sb.append("\n");
|
sb.append("\n");
|
||||||
}
|
}
|
||||||
@ -331,7 +320,7 @@ public class Table extends AbstractTextContainer {
|
|||||||
sb.append(i == 0 ? "\n<th>" : "\n<td>");
|
sb.append(i == 0 ? "\n<th>" : "\n<td>");
|
||||||
if (column != null && column.getTextBlocks() != null) {
|
if (column != null && column.getTextBlocks() != null) {
|
||||||
boolean first = true;
|
boolean first = true;
|
||||||
for (TextBlock textBlock : column.getTextBlocks()) {
|
for (TextPageBlock textBlock : column.getTextBlocks()) {
|
||||||
if (!first) {
|
if (!first) {
|
||||||
sb.append("<br />");
|
sb.append("<br />");
|
||||||
}
|
}
|
||||||
@ -1,4 +1,4 @@
|
|||||||
package com.iqser.red.service.redaction.v1.server.parsing.model;
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text;
|
||||||
|
|
||||||
import org.apache.pdfbox.text.TextPosition;
|
import org.apache.pdfbox.text.TextPosition;
|
||||||
import org.springframework.beans.BeanUtils;
|
import org.springframework.beans.BeanUtils;
|
||||||
@ -0,0 +1,49 @@
|
|||||||
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
|
||||||
|
|
||||||
|
import lombok.Getter;
|
||||||
|
|
||||||
|
public class SearchableText {
|
||||||
|
|
||||||
|
@Getter
|
||||||
|
private final List<TextPositionSequence> sequences = new ArrayList<>();
|
||||||
|
|
||||||
|
|
||||||
|
public void add(TextPositionSequence textPositionSequence) {
|
||||||
|
|
||||||
|
sequences.add(textPositionSequence);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void addAll(List<TextPositionSequence> textPositionSequences) {
|
||||||
|
|
||||||
|
sequences.addAll(textPositionSequences);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
|
||||||
|
return buildString(sequences);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public static String buildString(List<TextPositionSequence> sequences) {
|
||||||
|
|
||||||
|
StringBuilder sb = new StringBuilder();
|
||||||
|
for (TextPositionSequence word : sequences) {
|
||||||
|
sb.append(word);
|
||||||
|
sb.append(' ');
|
||||||
|
}
|
||||||
|
String text = sb.toString();
|
||||||
|
text = TextNormalizationUtilities.removeHyphenLineBreaks(text);
|
||||||
|
text = TextNormalizationUtilities.removeLineBreaks(text);
|
||||||
|
text = TextNormalizationUtilities.removeRepeatingWhitespaces(text);
|
||||||
|
return text;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -1,5 +1,4 @@
|
|||||||
package com.iqser.red.service.redaction.v1.server.classification.model;
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text;
|
||||||
|
|
||||||
|
|
||||||
import lombok.AllArgsConstructor;
|
import lombok.AllArgsConstructor;
|
||||||
import lombok.Builder;
|
import lombok.Builder;
|
||||||
@ -1,4 +1,4 @@
|
|||||||
package com.iqser.red.service.redaction.v1.server.classification.model;
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
@ -1,10 +1,10 @@
|
|||||||
package com.iqser.red.service.redaction.v1.server.classification.model;
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text;
|
||||||
|
|
||||||
import lombok.Getter;
|
|
||||||
|
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
|
import lombok.Getter;
|
||||||
|
|
||||||
public class StringFrequencyCounter {
|
public class StringFrequencyCounter {
|
||||||
|
|
||||||
@Getter
|
@Getter
|
||||||
@ -1,6 +1,4 @@
|
|||||||
package com.iqser.red.service.redaction.v1.server.parsing.model;
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text;
|
||||||
|
|
||||||
import java.util.Objects;
|
|
||||||
|
|
||||||
import com.fasterxml.jackson.annotation.JsonCreator;
|
import com.fasterxml.jackson.annotation.JsonCreator;
|
||||||
import com.fasterxml.jackson.annotation.JsonValue;
|
import com.fasterxml.jackson.annotation.JsonValue;
|
||||||
@ -46,18 +44,4 @@ public enum TextDirection {
|
|||||||
|
|
||||||
throw new IllegalArgumentException(String.format("A value of %f is not supported by TextDirection", degrees));
|
throw new IllegalArgumentException(String.format("A value of %f is not supported by TextDirection", degrees));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public static TextDirection fromString(String degreesAsString) {
|
|
||||||
|
|
||||||
Objects.requireNonNull(degreesAsString, "Cannot construct a text direction from a null value");
|
|
||||||
|
|
||||||
String value = degreesAsString.strip();
|
|
||||||
|
|
||||||
if (degreesAsString.endsWith(VALUE_STRING_SUFFIX)) {
|
|
||||||
value = degreesAsString.replace(VALUE_STRING_SUFFIX + "$", "");
|
|
||||||
}
|
|
||||||
|
|
||||||
return fromDegrees(Float.parseFloat(value));
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
@ -1,13 +1,12 @@
|
|||||||
package com.iqser.red.service.redaction.v1.server.classification.model;
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||||
import com.iqser.red.service.redaction.v1.server.parsing.model.TextDirection;
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.AbstractPageBlock;
|
||||||
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.PageBlockType;
|
||||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
|
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
|
||||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
|
|
||||||
|
|
||||||
import lombok.AllArgsConstructor;
|
import lombok.AllArgsConstructor;
|
||||||
import lombok.Builder;
|
import lombok.Builder;
|
||||||
@ -18,7 +17,7 @@ import lombok.NoArgsConstructor;
|
|||||||
@Builder
|
@Builder
|
||||||
@Data
|
@Data
|
||||||
@NoArgsConstructor
|
@NoArgsConstructor
|
||||||
public class TextBlock extends AbstractTextContainer {
|
public class TextPageBlock extends AbstractPageBlock {
|
||||||
|
|
||||||
@Builder.Default
|
@Builder.Default
|
||||||
private List<TextPositionSequence> sequences = new ArrayList<>();
|
private List<TextPositionSequence> sequences = new ArrayList<>();
|
||||||
@ -45,7 +44,7 @@ public class TextBlock extends AbstractTextContainer {
|
|||||||
private float highestFontSize;
|
private float highestFontSize;
|
||||||
|
|
||||||
@JsonIgnore
|
@JsonIgnore
|
||||||
private String classification;
|
private PageBlockType classification;
|
||||||
|
|
||||||
|
|
||||||
@JsonIgnore
|
@JsonIgnore
|
||||||
@ -95,6 +94,7 @@ public class TextBlock extends AbstractTextContainer {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns the maxX value in pdf coordinate system.
|
* Returns the maxX value in pdf coordinate system.
|
||||||
* Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation.
|
* Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation.
|
||||||
@ -174,7 +174,7 @@ public class TextBlock extends AbstractTextContainer {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public TextBlock(float minX, float maxX, float minY, float maxY, List<TextPositionSequence> sequences, int rotation) {
|
public TextPageBlock(float minX, float maxX, float minY, float maxY, List<TextPositionSequence> sequences, int rotation) {
|
||||||
|
|
||||||
this.minX = minX;
|
this.minX = minX;
|
||||||
this.maxX = maxX;
|
this.maxX = maxX;
|
||||||
@ -185,23 +185,23 @@ public class TextBlock extends AbstractTextContainer {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public TextBlock union(TextPositionSequence r) {
|
public TextPageBlock union(TextPositionSequence r) {
|
||||||
|
|
||||||
TextBlock union = this.copy();
|
TextPageBlock union = this.copy();
|
||||||
union.add(r);
|
union.add(r);
|
||||||
return union;
|
return union;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public TextBlock union(TextBlock r) {
|
public TextPageBlock union(TextPageBlock r) {
|
||||||
|
|
||||||
TextBlock union = this.copy();
|
TextPageBlock union = this.copy();
|
||||||
union.add(r);
|
union.add(r);
|
||||||
return union;
|
return union;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public void add(TextBlock r) {
|
public void add(TextPageBlock r) {
|
||||||
|
|
||||||
if (r.getMinX() < minX) {
|
if (r.getMinX() < minX) {
|
||||||
minX = r.getMinX();
|
minX = r.getMinX();
|
||||||
@ -236,9 +236,9 @@ public class TextBlock extends AbstractTextContainer {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public TextBlock copy() {
|
public TextPageBlock copy() {
|
||||||
|
|
||||||
return new TextBlock(minX, maxX, minY, maxY, sequences, rotation);
|
return new TextPageBlock(minX, maxX, minY, maxY, sequences, rotation);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -1,4 +1,4 @@
|
|||||||
package com.iqser.red.service.redaction.v1.server.parsing.model;
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text;
|
||||||
|
|
||||||
import java.awt.geom.AffineTransform;
|
import java.awt.geom.AffineTransform;
|
||||||
import java.awt.geom.Point2D;
|
import java.awt.geom.Point2D;
|
||||||
@ -0,0 +1,14 @@
|
|||||||
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import lombok.AllArgsConstructor;
|
||||||
|
import lombok.Data;
|
||||||
|
|
||||||
|
@Data
|
||||||
|
@AllArgsConstructor
|
||||||
|
public class UnclassifiedText {
|
||||||
|
|
||||||
|
private List<TextPageBlock> textBlocks;
|
||||||
|
|
||||||
|
}
|
||||||
@ -14,34 +14,18 @@
|
|||||||
* See the License for the specific language governing permissions and
|
* See the License for the specific language governing permissions and
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
package com.iqser.red.service.redaction.v1.server.parsing;
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.parsing;
|
||||||
|
|
||||||
import java.io.InputStream;
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.WeakHashMap;
|
import java.util.WeakHashMap;
|
||||||
|
|
||||||
import org.apache.commons.logging.Log;
|
import org.apache.commons.logging.Log;
|
||||||
import org.apache.commons.logging.LogFactory;
|
import org.apache.commons.logging.LogFactory;
|
||||||
|
|
||||||
import org.apache.fontbox.ttf.TrueTypeFont;
|
import org.apache.fontbox.ttf.TrueTypeFont;
|
||||||
import org.apache.fontbox.util.BoundingBox;
|
import org.apache.fontbox.util.BoundingBox;
|
||||||
|
|
||||||
import org.apache.pdfbox.contentstream.PDFStreamEngine;
|
import org.apache.pdfbox.contentstream.PDFStreamEngine;
|
||||||
import org.apache.pdfbox.pdmodel.PDPage;
|
|
||||||
import org.apache.pdfbox.pdmodel.font.encoding.GlyphList;
|
|
||||||
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
|
||||||
import org.apache.pdfbox.pdmodel.font.PDCIDFont;
|
|
||||||
import org.apache.pdfbox.pdmodel.font.PDCIDFontType2;
|
|
||||||
import org.apache.pdfbox.pdmodel.font.PDFont;
|
|
||||||
import org.apache.pdfbox.pdmodel.font.PDSimpleFont;
|
|
||||||
import org.apache.pdfbox.pdmodel.font.PDTrueTypeFont;
|
|
||||||
import org.apache.pdfbox.pdmodel.font.PDType0Font;
|
|
||||||
import org.apache.pdfbox.pdmodel.font.PDType3Font;
|
|
||||||
import org.apache.pdfbox.pdmodel.graphics.state.PDGraphicsState;
|
|
||||||
import org.apache.pdfbox.text.TextPosition;
|
|
||||||
import org.apache.pdfbox.util.Matrix;
|
|
||||||
import org.apache.pdfbox.util.Vector;
|
|
||||||
import org.apache.pdfbox.contentstream.operator.DrawObject;
|
import org.apache.pdfbox.contentstream.operator.DrawObject;
|
||||||
import org.apache.pdfbox.contentstream.operator.state.Concatenate;
|
import org.apache.pdfbox.contentstream.operator.state.Concatenate;
|
||||||
import org.apache.pdfbox.contentstream.operator.state.Restore;
|
import org.apache.pdfbox.contentstream.operator.state.Restore;
|
||||||
@ -50,22 +34,36 @@ import org.apache.pdfbox.contentstream.operator.state.SetGraphicsStateParameters
|
|||||||
import org.apache.pdfbox.contentstream.operator.state.SetMatrix;
|
import org.apache.pdfbox.contentstream.operator.state.SetMatrix;
|
||||||
import org.apache.pdfbox.contentstream.operator.text.BeginText;
|
import org.apache.pdfbox.contentstream.operator.text.BeginText;
|
||||||
import org.apache.pdfbox.contentstream.operator.text.EndText;
|
import org.apache.pdfbox.contentstream.operator.text.EndText;
|
||||||
import org.apache.pdfbox.contentstream.operator.text.SetFontAndSize;
|
|
||||||
import org.apache.pdfbox.contentstream.operator.text.SetTextHorizontalScaling;
|
|
||||||
import org.apache.pdfbox.contentstream.operator.text.ShowTextAdjusted;
|
|
||||||
import org.apache.pdfbox.contentstream.operator.text.ShowTextLine;
|
|
||||||
import org.apache.pdfbox.contentstream.operator.text.ShowTextLineAndSpace;
|
|
||||||
import org.apache.pdfbox.contentstream.operator.text.MoveText;
|
import org.apache.pdfbox.contentstream.operator.text.MoveText;
|
||||||
import org.apache.pdfbox.contentstream.operator.text.MoveTextSetLeading;
|
import org.apache.pdfbox.contentstream.operator.text.MoveTextSetLeading;
|
||||||
import org.apache.pdfbox.contentstream.operator.text.NextLine;
|
import org.apache.pdfbox.contentstream.operator.text.NextLine;
|
||||||
import org.apache.pdfbox.contentstream.operator.text.SetCharSpacing;
|
import org.apache.pdfbox.contentstream.operator.text.SetCharSpacing;
|
||||||
|
import org.apache.pdfbox.contentstream.operator.text.SetFontAndSize;
|
||||||
|
import org.apache.pdfbox.contentstream.operator.text.SetTextHorizontalScaling;
|
||||||
import org.apache.pdfbox.contentstream.operator.text.SetTextLeading;
|
import org.apache.pdfbox.contentstream.operator.text.SetTextLeading;
|
||||||
import org.apache.pdfbox.contentstream.operator.text.SetTextRenderingMode;
|
import org.apache.pdfbox.contentstream.operator.text.SetTextRenderingMode;
|
||||||
import org.apache.pdfbox.contentstream.operator.text.SetTextRise;
|
import org.apache.pdfbox.contentstream.operator.text.SetTextRise;
|
||||||
import org.apache.pdfbox.contentstream.operator.text.SetWordSpacing;
|
import org.apache.pdfbox.contentstream.operator.text.SetWordSpacing;
|
||||||
import org.apache.pdfbox.contentstream.operator.text.ShowText;
|
import org.apache.pdfbox.contentstream.operator.text.ShowText;
|
||||||
|
import org.apache.pdfbox.contentstream.operator.text.ShowTextAdjusted;
|
||||||
|
import org.apache.pdfbox.contentstream.operator.text.ShowTextLine;
|
||||||
|
import org.apache.pdfbox.contentstream.operator.text.ShowTextLineAndSpace;
|
||||||
import org.apache.pdfbox.cos.COSDictionary;
|
import org.apache.pdfbox.cos.COSDictionary;
|
||||||
|
import org.apache.pdfbox.pdmodel.PDPage;
|
||||||
|
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
||||||
|
import org.apache.pdfbox.pdmodel.font.PDCIDFont;
|
||||||
|
import org.apache.pdfbox.pdmodel.font.PDCIDFontType2;
|
||||||
|
import org.apache.pdfbox.pdmodel.font.PDFont;
|
||||||
import org.apache.pdfbox.pdmodel.font.PDFontDescriptor;
|
import org.apache.pdfbox.pdmodel.font.PDFontDescriptor;
|
||||||
|
import org.apache.pdfbox.pdmodel.font.PDSimpleFont;
|
||||||
|
import org.apache.pdfbox.pdmodel.font.PDTrueTypeFont;
|
||||||
|
import org.apache.pdfbox.pdmodel.font.PDType0Font;
|
||||||
|
import org.apache.pdfbox.pdmodel.font.PDType3Font;
|
||||||
|
import org.apache.pdfbox.pdmodel.font.encoding.GlyphList;
|
||||||
|
import org.apache.pdfbox.pdmodel.graphics.state.PDGraphicsState;
|
||||||
|
import org.apache.pdfbox.text.TextPosition;
|
||||||
|
import org.apache.pdfbox.util.Matrix;
|
||||||
|
import org.apache.pdfbox.util.Vector;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* LEGACY text calculations which are known to be incorrect but are depended on by PDFTextStripper.
|
* LEGACY text calculations which are known to be incorrect but are depended on by PDFTextStripper.
|
||||||
@ -1,17 +1,32 @@
|
|||||||
package com.iqser.red.service.redaction.v1.server.parsing;
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.parsing;
|
||||||
|
|
||||||
import com.iqser.red.service.redaction.v1.server.parsing.model.RedTextPosition;
|
import java.awt.geom.Point2D;
|
||||||
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
|
import java.io.IOException;
|
||||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Ruling;
|
import java.util.ArrayList;
|
||||||
|
import java.util.Comparator;
|
||||||
import lombok.Getter;
|
import java.util.List;
|
||||||
import lombok.Setter;
|
|
||||||
import lombok.extern.slf4j.Slf4j;
|
|
||||||
|
|
||||||
import org.apache.pdfbox.contentstream.operator.Operator;
|
import org.apache.pdfbox.contentstream.operator.Operator;
|
||||||
import org.apache.pdfbox.contentstream.operator.OperatorName;
|
import org.apache.pdfbox.contentstream.operator.OperatorName;
|
||||||
import org.apache.pdfbox.contentstream.operator.color.*;
|
import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingColor;
|
||||||
import org.apache.pdfbox.contentstream.operator.state.*;
|
import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingColorN;
|
||||||
|
import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingColorSpace;
|
||||||
|
import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingDeviceCMYKColor;
|
||||||
|
import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingDeviceGrayColor;
|
||||||
|
import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingDeviceRGBColor;
|
||||||
|
import org.apache.pdfbox.contentstream.operator.color.SetStrokingColor;
|
||||||
|
import org.apache.pdfbox.contentstream.operator.color.SetStrokingColorN;
|
||||||
|
import org.apache.pdfbox.contentstream.operator.color.SetStrokingColorSpace;
|
||||||
|
import org.apache.pdfbox.contentstream.operator.color.SetStrokingDeviceCMYKColor;
|
||||||
|
import org.apache.pdfbox.contentstream.operator.color.SetStrokingDeviceGrayColor;
|
||||||
|
import org.apache.pdfbox.contentstream.operator.color.SetStrokingDeviceRGBColor;
|
||||||
|
import org.apache.pdfbox.contentstream.operator.state.SetFlatness;
|
||||||
|
import org.apache.pdfbox.contentstream.operator.state.SetLineCapStyle;
|
||||||
|
import org.apache.pdfbox.contentstream.operator.state.SetLineDashPattern;
|
||||||
|
import org.apache.pdfbox.contentstream.operator.state.SetLineJoinStyle;
|
||||||
|
import org.apache.pdfbox.contentstream.operator.state.SetLineMiterLimit;
|
||||||
|
import org.apache.pdfbox.contentstream.operator.state.SetLineWidth;
|
||||||
|
import org.apache.pdfbox.contentstream.operator.state.SetRenderingIntent;
|
||||||
import org.apache.pdfbox.contentstream.operator.text.SetFontAndSize;
|
import org.apache.pdfbox.contentstream.operator.text.SetFontAndSize;
|
||||||
import org.apache.pdfbox.cos.COSBase;
|
import org.apache.pdfbox.cos.COSBase;
|
||||||
import org.apache.pdfbox.cos.COSNumber;
|
import org.apache.pdfbox.cos.COSNumber;
|
||||||
@ -19,11 +34,13 @@ import org.apache.pdfbox.pdmodel.PDDocument;
|
|||||||
import org.apache.pdfbox.pdmodel.PDPage;
|
import org.apache.pdfbox.pdmodel.PDPage;
|
||||||
import org.apache.pdfbox.text.TextPosition;
|
import org.apache.pdfbox.text.TextPosition;
|
||||||
|
|
||||||
import java.awt.geom.Point2D;
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table.Ruling;
|
||||||
import java.io.IOException;
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.RedTextPosition;
|
||||||
import java.util.ArrayList;
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPositionSequence;
|
||||||
import java.util.Comparator;
|
|
||||||
import java.util.List;
|
import lombok.Getter;
|
||||||
|
import lombok.Setter;
|
||||||
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
|
||||||
@Slf4j
|
@Slf4j
|
||||||
public class PDFLinesTextStripper extends PDFTextStripper {
|
public class PDFLinesTextStripper extends PDFTextStripper {
|
||||||
@ -264,8 +281,8 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
|||||||
// Remove false sequence ends (whitespaces)
|
// Remove false sequence ends (whitespaces)
|
||||||
if (previous != null && sublist.get(0).getYDirAdj() == previous.getYDirAdj() && sublist.get(0)
|
if (previous != null && sublist.get(0).getYDirAdj() == previous.getYDirAdj() && sublist.get(0)
|
||||||
.getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()) < 0.01) {
|
.getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()) < 0.01) {
|
||||||
for (TextPosition t : sublist) {
|
for (TextPosition textPosition : sublist) {
|
||||||
textPositionSequences.get(textPositionSequences.size() - 1).add(t);
|
textPositionSequences.get(textPositionSequences.size() - 1).add(textPosition);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber));
|
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber));
|
||||||
@ -14,7 +14,7 @@
|
|||||||
* See the License for the specific language governing permissions and
|
* See the License for the specific language governing permissions and
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
package com.iqser.red.service.redaction.v1.server.parsing;
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.parsing;
|
||||||
|
|
||||||
import java.io.BufferedInputStream;
|
import java.io.BufferedInputStream;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
@ -1,4 +1,4 @@
|
|||||||
package com.iqser.red.service.redaction.v1.server.classification.service;
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.service;
|
||||||
|
|
||||||
import static java.util.stream.Collectors.toSet;
|
import static java.util.stream.Collectors.toSet;
|
||||||
|
|
||||||
@ -9,15 +9,15 @@ import java.util.List;
|
|||||||
|
|
||||||
import org.springframework.stereotype.Service;
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
import com.iqser.red.service.redaction.v1.server.classification.model.FloatFrequencyCounter;
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.AbstractPageBlock;
|
||||||
import com.iqser.red.service.redaction.v1.server.classification.model.Orientation;
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationPage;
|
||||||
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.FloatFrequencyCounter;
|
||||||
import com.iqser.red.service.redaction.v1.server.classification.model.StringFrequencyCounter;
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.Orientation;
|
||||||
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table.Ruling;
|
||||||
import com.iqser.red.service.redaction.v1.server.classification.utils.RulingTextDirAdjustUtil;
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.StringFrequencyCounter;
|
||||||
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPageBlock;
|
||||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPositionSequence;
|
||||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Ruling;
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.utils.RulingTextDirAdjustUtil;
|
||||||
|
|
||||||
@Service
|
@Service
|
||||||
@SuppressWarnings("all")
|
@SuppressWarnings("all")
|
||||||
@ -29,16 +29,18 @@ public class BlockificationService {
|
|||||||
/**
|
/**
|
||||||
* This method is building blocks by expanding the minX/maxX and minY/maxY value on each word that is not split by the conditions.
|
* This method is building blocks by expanding the minX/maxX and minY/maxY value on each word that is not split by the conditions.
|
||||||
* This method must use text direction adjusted postions (DirAdj). Where {0,0} is on the upper left. Never try to change this!
|
* This method must use text direction adjusted positions (DirAdj), where {0,0} is on the upper left. Never try to change this!
|
||||||
* Rulings (Table lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling.
|
* Rulings (table lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling.
|
||||||
* @param textPositions The words of a page.
|
*
|
||||||
|
* @param textPositions The words of a page.
|
||||||
* @param horizontalRulingLines Horizontal table lines.
|
* @param horizontalRulingLines Horizontal table lines.
|
||||||
* @param verticalRulingLines Vertical table lines.
|
* @param verticalRulingLines Vertical table lines.
|
||||||
* @return Page object that contains the Textblock and text statistics.
|
* @return ClassificationPage object that contains the text blocks and text statistics.
|
||||||
*/
|
*/
|
||||||
public Page blockify(List<TextPositionSequence> textPositions, List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
|
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
|
||||||
|
|
||||||
|
int indexOnPage = 0;
|
||||||
List<TextPositionSequence> chunkWords = new ArrayList<>();
|
List<TextPositionSequence> chunkWords = new ArrayList<>();
|
||||||
List<AbstractTextContainer> chunkBlockList1 = new ArrayList<>();
|
List<AbstractPageBlock> chunkBlockList = new ArrayList<>();
|
||||||
|
|
||||||
float minX = 1000, maxX = 0, minY = 1000, maxY = 0;
|
float minX = 1000, maxX = 0, minY = 1000, maxY = 0;
|
||||||
TextPositionSequence prev = null;
|
TextPositionSequence prev = null;
|
||||||
@ -58,12 +60,14 @@ public class BlockificationService {
|
|||||||
if (prev != null && (lineSeparation || startFromTop || splitByX || splitByDir || isSplitByRuling)) {
|
if (prev != null && (lineSeparation || startFromTop || splitByX || splitByDir || isSplitByRuling)) {
|
||||||
|
|
||||||
Orientation prevOrientation = null;
|
Orientation prevOrientation = null;
|
||||||
if (!chunkBlockList1.isEmpty()) {
|
if (!chunkBlockList.isEmpty()) {
|
||||||
prevOrientation = chunkBlockList1.get(chunkBlockList1.size() - 1).getOrientation();
|
prevOrientation = chunkBlockList.get(chunkBlockList.size() - 1).getOrientation();
|
||||||
}
|
}
|
||||||
|
|
||||||
TextBlock cb1 = buildTextBlock(chunkWords);
|
TextPageBlock cb1 = buildTextBlock(chunkWords, indexOnPage);
|
||||||
chunkBlockList1.add(cb1);
|
indexOnPage++;
|
||||||
|
|
||||||
|
chunkBlockList.add(cb1);
|
||||||
chunkWords = new ArrayList<>();
|
chunkWords = new ArrayList<>();
|
||||||
|
|
||||||
if (splitByX && !isSplitByRuling) {
|
if (splitByX && !isSplitByRuling) {
|
||||||
@ -102,17 +106,17 @@ public class BlockificationService {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
TextBlock cb1 = buildTextBlock(chunkWords);
|
TextPageBlock cb1 = buildTextBlock(chunkWords, indexOnPage);
|
||||||
if (cb1 != null) {
|
if (cb1 != null) {
|
||||||
chunkBlockList1.add(cb1);
|
chunkBlockList.add(cb1);
|
||||||
}
|
}
|
||||||
|
|
||||||
Iterator<AbstractTextContainer> itty = chunkBlockList1.iterator();
|
Iterator<AbstractPageBlock> itty = chunkBlockList.iterator();
|
||||||
|
|
||||||
TextBlock previousLeft = null;
|
TextPageBlock previousLeft = null;
|
||||||
TextBlock previousRight = null;
|
TextPageBlock previousRight = null;
|
||||||
while (itty.hasNext()) {
|
while (itty.hasNext()) {
|
||||||
TextBlock block = (TextBlock) itty.next();
|
TextPageBlock block = (TextPageBlock) itty.next();
|
||||||
|
|
||||||
if (previousLeft != null && block.getOrientation().equals(Orientation.LEFT)) {
|
if (previousLeft != null && block.getOrientation().equals(Orientation.LEFT)) {
|
||||||
if (previousLeft.getMinY() > block.getMinY() && block.getMaxY() + block.getMostPopularWordHeight() > previousLeft.getMinY()) {
|
if (previousLeft.getMinY() > block.getMinY() && block.getMaxY() + block.getMostPopularWordHeight() > previousLeft.getMinY()) {
|
||||||
@ -137,10 +141,10 @@ public class BlockificationService {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
itty = chunkBlockList1.iterator();
|
itty = chunkBlockList.iterator();
|
||||||
TextBlock previous = null;
|
TextPageBlock previous = null;
|
||||||
while (itty.hasNext()) {
|
while (itty.hasNext()) {
|
||||||
TextBlock block = (TextBlock) itty.next();
|
TextPageBlock block = (TextPageBlock) itty.next();
|
||||||
|
|
||||||
if (previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation().equals(Orientation.LEFT) && equalsWithThreshold(block.getMaxY(),
|
if (previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation().equals(Orientation.LEFT) && equalsWithThreshold(block.getMaxY(),
|
||||||
previous.getMaxY()) || previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation()
|
previous.getMaxY()) || previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation()
|
||||||
@ -153,7 +157,7 @@ public class BlockificationService {
|
|||||||
previous = block;
|
previous = block;
|
||||||
}
|
}
|
||||||
|
|
||||||
return new Page(chunkBlockList1);
|
return new ClassificationPage(chunkBlockList);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -163,9 +167,9 @@ public class BlockificationService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private TextBlock buildTextBlock(List<TextPositionSequence> wordBlockList) {
|
private TextPageBlock buildTextBlock(List<TextPositionSequence> wordBlockList, int indexOnPage) {
|
||||||
|
|
||||||
TextBlock textBlock = null;
|
TextPageBlock textBlock = null;
|
||||||
|
|
||||||
FloatFrequencyCounter lineHeightFrequencyCounter = new FloatFrequencyCounter();
|
FloatFrequencyCounter lineHeightFrequencyCounter = new FloatFrequencyCounter();
|
||||||
FloatFrequencyCounter fontSizeFrequencyCounter = new FloatFrequencyCounter();
|
FloatFrequencyCounter fontSizeFrequencyCounter = new FloatFrequencyCounter();
|
||||||
@ -182,9 +186,14 @@ public class BlockificationService {
|
|||||||
styleFrequencyCounter.add(wordBlock.getFontStyle());
|
styleFrequencyCounter.add(wordBlock.getFontStyle());
|
||||||
|
|
||||||
if (textBlock == null) {
|
if (textBlock == null) {
|
||||||
textBlock = new TextBlock(wordBlock.getMinXDirAdj(), wordBlock.getMaxXDirAdj(), wordBlock.getMinYDirAdj(), wordBlock.getMaxYDirAdj(), wordBlockList, wordBlock.getRotation());
|
textBlock = new TextPageBlock(wordBlock.getMinXDirAdj(),
|
||||||
|
wordBlock.getMaxXDirAdj(),
|
||||||
|
wordBlock.getMinYDirAdj(),
|
||||||
|
wordBlock.getMaxYDirAdj(),
|
||||||
|
wordBlockList,
|
||||||
|
wordBlock.getRotation());
|
||||||
} else {
|
} else {
|
||||||
TextBlock spatialEntity = textBlock.union(wordBlock);
|
TextPageBlock spatialEntity = textBlock.union(wordBlock);
|
||||||
textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight());
|
textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -213,10 +222,38 @@ public class BlockificationService {
|
|||||||
List<Ruling> horizontalRulingLines,
|
List<Ruling> horizontalRulingLines,
|
||||||
List<Ruling> verticalRulingLines) {
|
List<Ruling> verticalRulingLines) {
|
||||||
|
|
||||||
return isSplitByRuling(maxX, minY, word.getMinXDirAdj(), word.getMinYDirAdj(), verticalRulingLines, word.getDir().getDegrees(), word.getPageWidth(), word.getPageHeight()) //
|
return isSplitByRuling(maxX,
|
||||||
|| isSplitByRuling(minX, minY, word.getMinXDirAdj(), word.getMaxYDirAdj(), horizontalRulingLines, word.getDir().getDegrees(), word.getPageWidth(), word.getPageHeight()) //
|
minY,
|
||||||
|| isSplitByRuling(maxX, minY, word.getMinXDirAdj(), word.getMinYDirAdj(), horizontalRulingLines, word.getDir().getDegrees(), word.getPageWidth(), word.getPageHeight()) //
|
word.getMinXDirAdj(),
|
||||||
|| isSplitByRuling(minX, minY, word.getMinXDirAdj(), word.getMaxYDirAdj(), verticalRulingLines, word.getDir().getDegrees(), word.getPageWidth(), word.getPageHeight()); //
|
word.getMinYDirAdj(),
|
||||||
|
verticalRulingLines,
|
||||||
|
word.getDir().getDegrees(),
|
||||||
|
word.getPageWidth(),
|
||||||
|
word.getPageHeight()) //
|
||||||
|
|| isSplitByRuling(minX,
|
||||||
|
minY,
|
||||||
|
word.getMinXDirAdj(),
|
||||||
|
word.getMaxYDirAdj(),
|
||||||
|
horizontalRulingLines,
|
||||||
|
word.getDir().getDegrees(),
|
||||||
|
word.getPageWidth(),
|
||||||
|
word.getPageHeight()) //
|
||||||
|
|| isSplitByRuling(maxX,
|
||||||
|
minY,
|
||||||
|
word.getMinXDirAdj(),
|
||||||
|
word.getMinYDirAdj(),
|
||||||
|
horizontalRulingLines,
|
||||||
|
word.getDir().getDegrees(),
|
||||||
|
word.getPageWidth(),
|
||||||
|
word.getPageHeight()) //
|
||||||
|
|| isSplitByRuling(minX,
|
||||||
|
minY,
|
||||||
|
word.getMinXDirAdj(),
|
||||||
|
word.getMaxYDirAdj(),
|
||||||
|
verticalRulingLines,
|
||||||
|
word.getDir().getDegrees(),
|
||||||
|
word.getPageWidth(),
|
||||||
|
word.getPageHeight());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -1,4 +1,4 @@
|
|||||||
package com.iqser.red.service.redaction.v1.server.classification.service;
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.service;
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
@ -6,17 +6,20 @@ import org.springframework.stereotype.Service;
|
|||||||
|
|
||||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point;
|
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point;
|
||||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
||||||
import com.iqser.red.service.redaction.v1.server.classification.model.FloatFrequencyCounter;
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.AbstractPageBlock;
|
||||||
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationPage;
|
||||||
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.FloatFrequencyCounter;
|
||||||
import com.iqser.red.service.redaction.v1.server.classification.utils.PositionUtils;
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table.Cell;
|
||||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table.TablePageBlock;
|
||||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPageBlock;
|
||||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.utils.PositionUtils;
|
||||||
|
|
||||||
@Service
|
@Service
|
||||||
public class BodyTextFrameService {
|
public class BodyTextFrameService {
|
||||||
|
|
||||||
|
private static final float APPROXIMATE_HEADER_LINE_COUNT = 2.9f;
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Adjusts and sets the body text frame to a page.
|
* Adjusts and sets the body text frame to a page.
|
||||||
* Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation.
|
* Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation.
|
||||||
@ -30,7 +33,7 @@ public class BodyTextFrameService {
|
|||||||
* @param bodyTextFrame frame that contains the main text on portrait pages
|
* @param bodyTextFrame frame that contains the main text on portrait pages
|
||||||
* @param landscapeBodyTextFrame frame that contains the main text on landscape pages
|
* @param landscapeBodyTextFrame frame that contains the main text on landscape pages
|
||||||
*/
|
*/
|
||||||
public void setBodyTextFrameAdjustedToPage(Page page, Rectangle bodyTextFrame, Rectangle landscapeBodyTextFrame) {
|
public void setBodyTextFrameAdjustedToPage(ClassificationPage page, Rectangle bodyTextFrame, Rectangle landscapeBodyTextFrame) {
|
||||||
|
|
||||||
Rectangle textFrame = page.isLandscape() ? landscapeBodyTextFrame : bodyTextFrame;
|
Rectangle textFrame = page.isLandscape() ? landscapeBodyTextFrame : bodyTextFrame;
|
||||||
|
|
||||||
@ -65,26 +68,26 @@ public class BodyTextFrameService {
|
|||||||
* @param landscape Calculate for landscape or portrait
|
* @param landscape Calculate for landscape or portrait
|
||||||
* @return Rectangle of the text frame
|
* @return Rectangle of the text frame
|
||||||
*/
|
*/
|
||||||
public Rectangle calculateBodyTextFrame(List<Page> pages, FloatFrequencyCounter documentFontSizeCounter, boolean landscape) {
|
public Rectangle calculateBodyTextFrame(List<ClassificationPage> pages, FloatFrequencyCounter documentFontSizeCounter, boolean landscape) {
|
||||||
|
|
||||||
BodyTextFrameExpansionsRectangle expansionsRectangle = new BodyTextFrameExpansionsRectangle();
|
BodyTextFrameExpansionsRectangle expansionsRectangle = new BodyTextFrameExpansionsRectangle();
|
||||||
|
|
||||||
for (Page page : pages) {
|
for (ClassificationPage page : pages) {
|
||||||
|
|
||||||
if (page.getTextBlocks().isEmpty() || landscape != page.isLandscape()) {
|
if (page.getTextBlocks().isEmpty() || landscape != page.isLandscape()) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
for (AbstractTextContainer container : page.getTextBlocks()) {
|
for (AbstractPageBlock container : page.getTextBlocks()) {
|
||||||
|
|
||||||
if (container instanceof TextBlock) {
|
if (container instanceof TextPageBlock) {
|
||||||
TextBlock textBlock = (TextBlock) container;
|
TextPageBlock textBlock = (TextPageBlock) container;
|
||||||
if (textBlock.getMostPopularWordFont() == null || textBlock.getMostPopularWordStyle() == null) {
|
if (textBlock.getMostPopularWordFont() == null || textBlock.getMostPopularWordStyle() == null) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
float approxLineCount = PositionUtils.getApproxLineCount(textBlock);
|
float approxLineCount = PositionUtils.getApproxLineCount(textBlock);
|
||||||
if (approxLineCount < 2.9f) {
|
if (approxLineCount < APPROXIMATE_HEADER_LINE_COUNT) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -94,15 +97,15 @@ public class BodyTextFrameService {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (container instanceof Table) {
|
if (container instanceof TablePageBlock) {
|
||||||
Table table = (Table) container;
|
TablePageBlock table = (TablePageBlock) container;
|
||||||
for (List<Cell> row : table.getRows()) {
|
for (List<Cell> row : table.getRows()) {
|
||||||
for (Cell cell : row) {
|
for (Cell cell : row) {
|
||||||
|
|
||||||
if (cell == null || cell.getTextBlocks() == null) {
|
if (cell == null || cell.getTextBlocks() == null) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
for (TextBlock textBlock : cell.getTextBlocks()) {
|
for (TextPageBlock textBlock : cell.getTextBlocks()) {
|
||||||
expandRectangle(textBlock, page, expansionsRectangle);
|
expandRectangle(textBlock, page, expansionsRectangle);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -117,7 +120,7 @@ public class BodyTextFrameService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private void expandRectangle(TextBlock textBlock, Page page, BodyTextFrameExpansionsRectangle expansionsRectangle) {
|
private void expandRectangle(TextPageBlock textBlock, ClassificationPage page, BodyTextFrameExpansionsRectangle expansionsRectangle) {
|
||||||
|
|
||||||
if (page.getPageWidth() > page.getPageHeight() && page.getRotation() != 0) {
|
if (page.getPageWidth() > page.getPageHeight() && page.getRotation() != 0) {
|
||||||
if (textBlock.getPdfMinY() < expansionsRectangle.minX) {
|
if (textBlock.getPdfMinY() < expansionsRectangle.minX) {
|
||||||
@ -1,4 +1,4 @@
|
|||||||
package com.iqser.red.service.redaction.v1.server.classification.service;
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.service;
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
@ -6,11 +6,12 @@ import java.util.regex.Pattern;
|
|||||||
import org.springframework.stereotype.Service;
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
||||||
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.AbstractPageBlock;
|
||||||
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationDocument;
|
||||||
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationPage;
|
||||||
import com.iqser.red.service.redaction.v1.server.classification.utils.PositionUtils;
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.PageBlockType;
|
||||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPageBlock;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.utils.PositionUtils;
|
||||||
|
|
||||||
import lombok.RequiredArgsConstructor;
|
import lombok.RequiredArgsConstructor;
|
||||||
import lombok.extern.slf4j.Slf4j;
|
import lombok.extern.slf4j.Slf4j;
|
||||||
@ -23,7 +24,7 @@ public class ClassificationService {
|
|||||||
private final BodyTextFrameService bodyTextFrameService;
|
private final BodyTextFrameService bodyTextFrameService;
|
||||||
|
|
||||||
|
|
||||||
public void classifyDocument(Document document) {
|
public void classifyDocument(ClassificationDocument document) {
|
||||||
|
|
||||||
Rectangle bodyTextFrame = bodyTextFrameService.calculateBodyTextFrame(document.getPages(), document.getFontSizeCounter(), false);
|
Rectangle bodyTextFrame = bodyTextFrameService.calculateBodyTextFrame(document.getPages(), document.getFontSizeCounter(), false);
|
||||||
Rectangle landscapeBodyTextFrame = bodyTextFrameService.calculateBodyTextFrame(document.getPages(), document.getFontSizeCounter(), true);
|
Rectangle landscapeBodyTextFrame = bodyTextFrameService.calculateBodyTextFrame(document.getPages(), document.getFontSizeCounter(), true);
|
||||||
@ -31,43 +32,43 @@ public class ClassificationService {
|
|||||||
|
|
||||||
log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());
|
log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());
|
||||||
|
|
||||||
for (Page page : document.getPages()) {
|
for (ClassificationPage page : document.getPages()) {
|
||||||
bodyTextFrameService.setBodyTextFrameAdjustedToPage(page, bodyTextFrame, landscapeBodyTextFrame);
|
bodyTextFrameService.setBodyTextFrameAdjustedToPage(page, bodyTextFrame, landscapeBodyTextFrame);
|
||||||
classifyPage(page, document, headlineFontSizes);
|
classifyPage(page, document, headlineFontSizes);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public void classifyPage(Page page, Document document, List<Float> headlineFontSizes) {
|
public void classifyPage(ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes) {
|
||||||
|
|
||||||
for (AbstractTextContainer textBlock : page.getTextBlocks()) {
|
for (AbstractPageBlock textBlock : page.getTextBlocks()) {
|
||||||
if (textBlock instanceof TextBlock) {
|
if (textBlock instanceof TextPageBlock) {
|
||||||
classifyBlock((TextBlock) textBlock, page, document, headlineFontSizes);
|
classifyBlock((TextPageBlock) textBlock, page, document, headlineFontSizes);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public void classifyBlock(TextBlock textBlock, Page page, Document document, List<Float> headlineFontSizes) {
|
public void classifyBlock(TextPageBlock textBlock, ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes) {
|
||||||
|
|
||||||
var bodyTextFrame = page.getBodyTextFrame();
|
var bodyTextFrame = page.getBodyTextFrame();
|
||||||
|
|
||||||
if (document.getFontSizeCounter().getMostPopular() == null) {
|
if (document.getFontSizeCounter().getMostPopular() == null) {
|
||||||
textBlock.setClassification("Other");
|
textBlock.setClassification(PageBlockType.OTHER);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if (PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
|
if (PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
|
||||||
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) {
|
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) {
|
||||||
textBlock.setClassification("Header");
|
textBlock.setClassification(PageBlockType.HEADER);
|
||||||
|
|
||||||
} else if (PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
|
} else if (PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
|
||||||
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) {
|
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) {
|
||||||
textBlock.setClassification("Footer");
|
textBlock.setClassification(PageBlockType.FOOTER);
|
||||||
} else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock,
|
} else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock,
|
||||||
document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks()
|
document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks()
|
||||||
.size() == 1)) {
|
.size() == 1)) {
|
||||||
if (!Pattern.matches("[0-9]+", textBlock.toString())) {
|
if (!Pattern.matches("[0-9]+", textBlock.toString())) {
|
||||||
textBlock.setClassification("Title");
|
textBlock.setClassification(PageBlockType.TITLE);
|
||||||
}
|
}
|
||||||
} else if (textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter()
|
} else if (textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter()
|
||||||
.getMostPopular() && PositionUtils.getApproxLineCount(textBlock) < 4.9 && (textBlock.getMostPopularWordStyle().equals("bold") || !document.getFontStyleCounter()
|
.getMostPopular() && PositionUtils.getApproxLineCount(textBlock) < 4.9 && (textBlock.getMostPopularWordStyle().equals("bold") || !document.getFontStyleCounter()
|
||||||
@ -80,36 +81,34 @@ public class ClassificationService {
|
|||||||
|
|
||||||
for (int i = 1; i <= headlineFontSizes.size(); i++) {
|
for (int i = 1; i <= headlineFontSizes.size(); i++) {
|
||||||
if (textBlock.getMostPopularWordFontSize() == headlineFontSizes.get(i - 1)) {
|
if (textBlock.getMostPopularWordFontSize() == headlineFontSizes.get(i - 1)) {
|
||||||
textBlock.setClassification("H " + i);
|
textBlock.setClassification(PageBlockType.getHeadlineType(i));
|
||||||
document.setHeadlines(true);
|
document.setHeadlines(true);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else if (!textBlock.getText().startsWith("Table ") && !textBlock.getText().startsWith("Figure ") && PositionUtils.isWithinBodyTextFrame(bodyTextFrame,
|
} else if (!textBlock.getText().startsWith("Figure ") && PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordStyle()
|
||||||
textBlock) && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter()
|
.equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold") && PositionUtils.getApproxLineCount(textBlock) < 2.9 && textBlock.getSequences()
|
||||||
.getMostPopular()
|
|
||||||
.equals("bold") && PositionUtils.getApproxLineCount(textBlock) < 2.9 && textBlock.getSequences()
|
|
||||||
.get(0)
|
.get(0)
|
||||||
.getTextPositions()
|
.getTextPositions()
|
||||||
.get(0)
|
.get(0)
|
||||||
.getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) {
|
.getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) {
|
||||||
textBlock.setClassification("H " + (headlineFontSizes.size() + 1));
|
textBlock.setClassification(PageBlockType.getHeadlineType(headlineFontSizes.size() + 1));
|
||||||
document.setHeadlines(true);
|
document.setHeadlines(true);
|
||||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
|
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
|
||||||
.getMostPopular() && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold")) {
|
.getMostPopular() && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold")) {
|
||||||
textBlock.setClassification("TextBlock Bold");
|
textBlock.setClassification(PageBlockType.PARAGRAPH_BOLD);
|
||||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFont()
|
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFont()
|
||||||
.equals(document.getFontCounter().getMostPopular()) && textBlock.getMostPopularWordStyle()
|
.equals(document.getFontCounter().getMostPopular()) && textBlock.getMostPopularWordStyle()
|
||||||
.equals(document.getFontStyleCounter().getMostPopular()) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()) {
|
.equals(document.getFontStyleCounter().getMostPopular()) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()) {
|
||||||
textBlock.setClassification("TextBlock");
|
textBlock.setClassification(PageBlockType.PARAGRAPH);
|
||||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
|
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
|
||||||
.getMostPopular() && textBlock.getMostPopularWordStyle().equals("italic") && !document.getFontStyleCounter()
|
.getMostPopular() && textBlock.getMostPopularWordStyle().equals("italic") && !document.getFontStyleCounter()
|
||||||
.getMostPopular()
|
.getMostPopular()
|
||||||
.equals("italic") && PositionUtils.getApproxLineCount(textBlock) < 2.9) {
|
.equals("italic") && PositionUtils.getApproxLineCount(textBlock) < 2.9) {
|
||||||
textBlock.setClassification("TextBlock Italic");
|
textBlock.setClassification(PageBlockType.PARAGRAPH_ITALIC);
|
||||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)) {
|
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)) {
|
||||||
textBlock.setClassification("TextBlock Unknown");
|
textBlock.setClassification(PageBlockType.PARAGRAPH_UNKNOWN);
|
||||||
} else {
|
} else {
|
||||||
textBlock.setClassification("Other");
|
textBlock.setClassification(PageBlockType.OTHER);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1,4 +1,4 @@
|
|||||||
package com.iqser.red.service.redaction.v1.server.segmentation;
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.service;
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.FileOutputStream;
|
import java.io.FileOutputStream;
|
||||||
@ -16,21 +16,19 @@ import org.apache.pdfbox.pdmodel.PDPage;
|
|||||||
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
||||||
import org.springframework.stereotype.Service;
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.ImageServiceResponseAdapter;
|
||||||
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.TableServiceResponseAdapter;
|
||||||
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.table.PdfTableCell;
|
||||||
import com.iqser.red.service.redaction.v1.server.classification.service.BlockificationService;
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.AbstractPageBlock;
|
||||||
import com.iqser.red.service.redaction.v1.server.classification.service.ClassificationService;
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationDocument;
|
||||||
import com.iqser.red.service.redaction.v1.server.parsing.PDFLinesTextStripper;
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationPage;
|
||||||
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.image.ClassifiedImage;
|
||||||
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table.CleanRulings;
|
||||||
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfTableCell;
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPageBlock;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPositionSequence;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.parsing.PDFLinesTextStripper;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.utils.FileUtils;
|
||||||
import com.iqser.red.service.redaction.v1.server.settings.RedactionServiceSettings;
|
import com.iqser.red.service.redaction.v1.server.settings.RedactionServiceSettings;
|
||||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
|
|
||||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.CleanRulings;
|
|
||||||
import com.iqser.red.service.redaction.v1.server.tableextraction.service.RulingCleaningService;
|
|
||||||
import com.iqser.red.service.redaction.v1.server.tableextraction.service.TableExtractionService;
|
|
||||||
import com.iqser.red.service.redaction.v1.server.tableextraction.utils.FileUtils;
|
|
||||||
|
|
||||||
import lombok.RequiredArgsConstructor;
|
import lombok.RequiredArgsConstructor;
|
||||||
import lombok.extern.slf4j.Slf4j;
|
import lombok.extern.slf4j.Slf4j;
|
||||||
@ -46,18 +44,21 @@ public class PdfSegmentationService {
|
|||||||
private final BlockificationService blockificationService;
|
private final BlockificationService blockificationService;
|
||||||
private final ClassificationService classificationService;
|
private final ClassificationService classificationService;
|
||||||
private final SectionsBuilderService sectionsBuilderService;
|
private final SectionsBuilderService sectionsBuilderService;
|
||||||
private final ImageService imageService;
|
private final ImageServiceResponseAdapter imageServiceResponseAdapter;
|
||||||
private final TableService tableService;
|
private final TableServiceResponseAdapter tableServiceResponseAdapter;
|
||||||
|
|
||||||
|
|
||||||
public Document parseDocument(String dossierId, String fileId, InputStream documentInputStream, Map<Integer, List<PdfImage>> pdfImages) throws IOException {
|
public ClassificationDocument parseDocument(String dossierId,
|
||||||
|
String fileId,
|
||||||
|
InputStream documentInputStream,
|
||||||
|
Map<Integer, List<ClassifiedImage>> pdfImages) throws IOException {
|
||||||
|
|
||||||
PDDocument pdDocument = null;
|
PDDocument pdDocument = null;
|
||||||
File tempFile = null;
|
File tempFile = null;
|
||||||
try {
|
try {
|
||||||
Map<Integer, List<PdfTableCell>> pdfTableCells = new HashMap<>();
|
Map<Integer, List<PdfTableCell>> pdfTableCells = new HashMap<>();
|
||||||
if (redactionServiceSettings.isCvTableParsingEnabled()) {
|
if (redactionServiceSettings.isCvTableParsingEnabled()) {
|
||||||
pdfTableCells = tableService.convertTables(dossierId, fileId);
|
pdfTableCells = tableServiceResponseAdapter.convertTables(dossierId, fileId);
|
||||||
}
|
}
|
||||||
|
|
||||||
tempFile = FileUtils.createTempFile("document", ".pdf");
|
tempFile = FileUtils.createTempFile("document", ".pdf");
|
||||||
@ -65,8 +66,8 @@ public class PdfSegmentationService {
|
|||||||
IOUtils.copy(documentInputStream, fos);
|
IOUtils.copy(documentInputStream, fos);
|
||||||
|
|
||||||
// initialize required variables
|
// initialize required variables
|
||||||
Document document = new Document();
|
ClassificationDocument document = new ClassificationDocument();
|
||||||
List<Page> pages = new ArrayList<>();
|
List<ClassificationPage> pages = new ArrayList<>();
|
||||||
|
|
||||||
pdDocument = PDDocument.load(tempFile, MemoryUsageSetting.setupMixed(67108864L));
|
pdDocument = PDDocument.load(tempFile, MemoryUsageSetting.setupMixed(67108864L));
|
||||||
pdDocument.setAllSecurityToBeRemoved(true);
|
pdDocument.setAllSecurityToBeRemoved(true);
|
||||||
@ -94,12 +95,12 @@ public class PdfSegmentationService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private void processPage(Map<Integer, List<PdfImage>> pdfImages,
|
private void processPage(Map<Integer, List<ClassifiedImage>> pdfImages,
|
||||||
PDDocument pdDocument,
|
PDDocument pdDocument,
|
||||||
Map<Integer, List<PdfTableCell>> pdfTableCells,
|
Map<Integer, List<PdfTableCell>> pdfTableCells,
|
||||||
Document document,
|
ClassificationDocument document,
|
||||||
List<Page> pages,
|
List<ClassificationPage> pages,
|
||||||
int pageNumber) throws IOException {
|
int pageNumber) throws IOException {
|
||||||
|
|
||||||
PDFLinesTextStripper stripper = new PDFLinesTextStripper();
|
PDFLinesTextStripper stripper = new PDFLinesTextStripper();
|
||||||
PDPage pdPage = pdDocument.getPage(pageNumber - 1);
|
PDPage pdPage = pdDocument.getPage(pageNumber - 1);
|
||||||
@ -119,7 +120,7 @@ public class PdfSegmentationService {
|
|||||||
stripper.getRulings(),
|
stripper.getRulings(),
|
||||||
stripper.getMinCharWidth(),
|
stripper.getMinCharWidth(),
|
||||||
stripper.getMaxCharHeight());
|
stripper.getMaxCharHeight());
|
||||||
Page page = blockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
ClassificationPage page = blockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||||
|
|
||||||
page.setRotation(rotation);
|
page.setRotation(rotation);
|
||||||
page.setLandscape(isLandscape);
|
page.setLandscape(isLandscape);
|
||||||
@ -130,7 +131,7 @@ public class PdfSegmentationService {
|
|||||||
// If images is ocr needs to be calculated before textBlocks are moved into tables, otherwise findOcr algorithm needs to be adopted.
|
// If images is ocr needs to be calculated before textBlocks are moved into tables, otherwise findOcr algorithm needs to be adopted.
|
||||||
if (pdfImages != null && pdfImages.containsKey(pageNumber)) {
|
if (pdfImages != null && pdfImages.containsKey(pageNumber)) {
|
||||||
page.setImages(pdfImages.get(pageNumber));
|
page.setImages(pdfImages.get(pageNumber));
|
||||||
imageService.findOcr(page);
|
imageServiceResponseAdapter.findOcr(page);
|
||||||
}
|
}
|
||||||
|
|
||||||
tableExtractionService.extractTables(cleanRulings, page);
|
tableExtractionService.extractTables(cleanRulings, page);
|
||||||
@ -141,7 +142,7 @@ public class PdfSegmentationService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private void increaseDocumentStatistics(Page page, Document document) {
|
private void increaseDocumentStatistics(ClassificationPage page, ClassificationDocument document) {
|
||||||
|
|
||||||
if (!page.isLandscape()) {
|
if (!page.isLandscape()) {
|
||||||
document.getFontSizeCounter().addAll(page.getFontSizeCounter().getCountPerValue());
|
document.getFontSizeCounter().addAll(page.getFontSizeCounter().getCountPerValue());
|
||||||
@ -152,15 +153,15 @@ public class PdfSegmentationService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private void buildPageStatistics(Page page) {
|
private void buildPageStatistics(ClassificationPage page) {
|
||||||
|
|
||||||
// Collect all statistics for the page, except from blocks inside tables, as tables will always be added to BodyTextFrame.
|
// Collect all statistics for the page, except from blocks inside tables, as tables will always be added to BodyTextFrame.
|
||||||
for (AbstractTextContainer textBlock : page.getTextBlocks()) {
|
for (AbstractPageBlock textBlock : page.getTextBlocks()) {
|
||||||
if (textBlock instanceof TextBlock) {
|
if (textBlock instanceof TextPageBlock) {
|
||||||
if (((TextBlock) textBlock).getSequences() == null) {
|
if (((TextPageBlock) textBlock).getSequences() == null) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
for (TextPositionSequence word : ((TextBlock) textBlock).getSequences()) {
|
for (TextPositionSequence word : ((TextPageBlock) textBlock).getSequences()) {
|
||||||
page.getTextHeightCounter().add(word.getTextHeight());
|
page.getTextHeightCounter().add(word.getTextHeight());
|
||||||
page.getFontCounter().add(word.getFont());
|
page.getFontCounter().add(word.getFont());
|
||||||
page.getFontSizeCounter().add(word.getFontSize());
|
page.getFontSizeCounter().add(word.getFontSize());
|
||||||
@ -1,4 +1,4 @@
|
|||||||
package com.iqser.red.service.redaction.v1.server.tableextraction.service;
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.service;
|
||||||
|
|
||||||
import java.awt.geom.Line2D;
|
import java.awt.geom.Line2D;
|
||||||
import java.awt.geom.Point2D;
|
import java.awt.geom.Point2D;
|
||||||
@ -12,11 +12,11 @@ import java.util.Map;
|
|||||||
|
|
||||||
import org.springframework.stereotype.Service;
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfTableCell;
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.table.PdfTableCell;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table.CleanRulings;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table.Ruling;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.utils.Utils;
|
||||||
import com.iqser.red.service.redaction.v1.server.settings.RedactionServiceSettings;
|
import com.iqser.red.service.redaction.v1.server.settings.RedactionServiceSettings;
|
||||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.CleanRulings;
|
|
||||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Ruling;
|
|
||||||
import com.iqser.red.service.redaction.v1.server.tableextraction.utils.Utils;
|
|
||||||
|
|
||||||
import lombok.RequiredArgsConstructor;
|
import lombok.RequiredArgsConstructor;
|
||||||
import lombok.extern.slf4j.Slf4j;
|
import lombok.extern.slf4j.Slf4j;
|
||||||
@ -1,9 +1,8 @@
|
|||||||
package com.iqser.red.service.redaction.v1.server.segmentation;
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.service;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.Iterator;
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
@ -11,17 +10,18 @@ import java.util.stream.Collectors;
|
|||||||
import org.apache.commons.collections4.CollectionUtils;
|
import org.apache.commons.collections4.CollectionUtils;
|
||||||
import org.springframework.stereotype.Service;
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.AbstractPageBlock;
|
||||||
import com.iqser.red.service.redaction.v1.server.classification.model.Footer;
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationDocument;
|
||||||
import com.iqser.red.service.redaction.v1.server.classification.model.Header;
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationFooter;
|
||||||
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationHeader;
|
||||||
import com.iqser.red.service.redaction.v1.server.classification.model.Paragraph;
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationPage;
|
||||||
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationSection;
|
||||||
import com.iqser.red.service.redaction.v1.server.classification.model.UnclassifiedText;
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.PageBlockType;
|
||||||
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.image.ClassifiedImage;
|
||||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table.Cell;
|
||||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table.TablePageBlock;
|
||||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPageBlock;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.UnclassifiedText;
|
||||||
|
|
||||||
import lombok.extern.slf4j.Slf4j;
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
|
||||||
@ -29,23 +29,23 @@ import lombok.extern.slf4j.Slf4j;
|
|||||||
@Service
|
@Service
|
||||||
public class SectionsBuilderService {
|
public class SectionsBuilderService {
|
||||||
|
|
||||||
public void buildSections(Document document) {
|
public void buildSections(ClassificationDocument document) {
|
||||||
|
|
||||||
List<AbstractTextContainer> chunkWords = new ArrayList<>();
|
List<AbstractPageBlock> chunkWords = new ArrayList<>();
|
||||||
List<Paragraph> chunkBlockList = new ArrayList<>();
|
List<ClassificationSection> chunkBlockList = new ArrayList<>();
|
||||||
List<Header> headers = new ArrayList<>();
|
List<ClassificationHeader> headers = new ArrayList<>();
|
||||||
List<Footer> footers = new ArrayList<>();
|
List<ClassificationFooter> footers = new ArrayList<>();
|
||||||
List<UnclassifiedText> unclassifiedTexts = new ArrayList<>();
|
List<UnclassifiedText> unclassifiedTexts = new ArrayList<>();
|
||||||
|
|
||||||
AbstractTextContainer prev = null;
|
AbstractPageBlock prev = null;
|
||||||
|
|
||||||
String lastHeadline = "";
|
String lastHeadline = "";
|
||||||
Table previousTable = null;
|
TablePageBlock previousTable = null;
|
||||||
for (Page page : document.getPages()) {
|
for (ClassificationPage page : document.getPages()) {
|
||||||
List<TextBlock> header = new ArrayList<>();
|
List<TextPageBlock> header = new ArrayList<>();
|
||||||
List<TextBlock> footer = new ArrayList<>();
|
List<TextPageBlock> footer = new ArrayList<>();
|
||||||
List<TextBlock> unclassifiedText = new ArrayList<>();
|
List<TextPageBlock> unclassifiedText = new ArrayList<>();
|
||||||
for (AbstractTextContainer current : page.getTextBlocks()) {
|
for (AbstractPageBlock current : page.getTextBlocks()) {
|
||||||
|
|
||||||
if (current.getClassification() == null) {
|
if (current.getClassification() == null) {
|
||||||
continue;
|
continue;
|
||||||
@ -53,23 +53,23 @@ public class SectionsBuilderService {
|
|||||||
|
|
||||||
current.setPage(page.getPageNumber());
|
current.setPage(page.getPageNumber());
|
||||||
|
|
||||||
if (current.getClassification().equals("Header")) {
|
if (current.getClassification().equals(PageBlockType.HEADER)) {
|
||||||
header.add((TextBlock) current);
|
header.add((TextPageBlock) current);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (current.getClassification().equals("Footer")) {
|
if (current.getClassification().equals(PageBlockType.FOOTER)) {
|
||||||
footer.add((TextBlock) current);
|
footer.add((TextPageBlock) current);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (current.getClassification().equals("Other")) {
|
if (current.getClassification().equals(PageBlockType.OTHER)) {
|
||||||
unclassifiedText.add((TextBlock) current);
|
unclassifiedText.add((TextPageBlock) current);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (prev != null && current.getClassification().startsWith("H ") && !prev.getClassification().startsWith("H ") || !document.isHeadlines()) {
|
if (prev != null && current.getClassification().isHeadline() && !prev.getClassification().isHeadline() || !document.isHeadlines()) {
|
||||||
Paragraph chunkBlock = buildTextBlock(chunkWords, lastHeadline);
|
ClassificationSection chunkBlock = buildTextBlock(chunkWords, lastHeadline);
|
||||||
chunkBlock.setHeadline(lastHeadline);
|
chunkBlock.setHeadline(lastHeadline);
|
||||||
if (document.isHeadlines()) {
|
if (document.isHeadlines()) {
|
||||||
lastHeadline = current.getText();
|
lastHeadline = current.getText();
|
||||||
@ -80,8 +80,7 @@ public class SectionsBuilderService {
|
|||||||
previousTable = chunkBlock.getTables().get(chunkBlock.getTables().size() - 1);
|
previousTable = chunkBlock.getTables().get(chunkBlock.getTables().size() - 1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (current instanceof Table) {
|
if (current instanceof TablePageBlock table) {
|
||||||
Table table = (Table) current;
|
|
||||||
// Distribute header information for subsequent tables
|
// Distribute header information for subsequent tables
|
||||||
mergeTableMetadata(table, previousTable);
|
mergeTableMetadata(table, previousTable);
|
||||||
previousTable = table;
|
previousTable = table;
|
||||||
@ -91,69 +90,72 @@ public class SectionsBuilderService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (!header.isEmpty()) {
|
if (!header.isEmpty()) {
|
||||||
headers.add(new Header(header));
|
headers.add(new ClassificationHeader(header));
|
||||||
}
|
}
|
||||||
if (!footer.isEmpty()) {
|
if (!footer.isEmpty()) {
|
||||||
footers.add(new Footer(footer));
|
footers.add(new ClassificationFooter(footer));
|
||||||
}
|
}
|
||||||
if (!unclassifiedText.isEmpty()) {
|
if (!unclassifiedText.isEmpty()) {
|
||||||
unclassifiedTexts.add(new UnclassifiedText(unclassifiedText));
|
unclassifiedTexts.add(new UnclassifiedText(unclassifiedText));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Paragraph chunkBlock = buildTextBlock(chunkWords, lastHeadline);
|
ClassificationSection chunkBlock = buildTextBlock(chunkWords, lastHeadline);
|
||||||
chunkBlock.setHeadline(lastHeadline);
|
chunkBlock.setHeadline(lastHeadline);
|
||||||
chunkBlockList.add(chunkBlock);
|
chunkBlockList.add(chunkBlock);
|
||||||
|
|
||||||
document.setParagraphs(chunkBlockList);
|
document.setSections(chunkBlockList);
|
||||||
document.setHeaders(headers);
|
document.setHeaders(headers);
|
||||||
document.setFooters(footers);
|
document.setFooters(footers);
|
||||||
document.setUnclassifiedTexts(unclassifiedTexts);
|
document.setUnclassifiedTexts(unclassifiedTexts);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public void addImagesToSections(Document document) {
|
public void addImagesToSections(ClassificationDocument document) {
|
||||||
|
|
||||||
Map<Integer, List<Paragraph>> paragraphMap = new HashMap<>();
|
Map<Integer, List<ClassificationSection>> sectionMap = new HashMap<>();
|
||||||
for (Paragraph paragraph : document.getParagraphs()) {
|
for (ClassificationSection section : document.getSections()) {
|
||||||
for (AbstractTextContainer container : paragraph.getPageBlocks()) {
|
for (AbstractPageBlock container : section.getPageBlocks()) {
|
||||||
|
|
||||||
paragraphMap.computeIfAbsent(container.getPage(), c -> new ArrayList<>()).add(paragraph);
|
|
||||||
|
|
||||||
|
List<ClassificationSection> sectionsOnPage = sectionMap.computeIfAbsent(container.getPage(), c -> new ArrayList<>());
|
||||||
|
if (sectionsOnPage.contains(section)) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
sectionsOnPage.add(section);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (paragraphMap.isEmpty()) {
|
if (sectionMap.isEmpty()) {
|
||||||
Paragraph paragraph = new Paragraph();
|
ClassificationSection section = new ClassificationSection();
|
||||||
document.getParagraphs().add(paragraph);
|
document.getSections().add(section);
|
||||||
paragraphMap.computeIfAbsent(1, x -> new ArrayList<>()).add(paragraph);
|
sectionMap.computeIfAbsent(1, x -> new ArrayList<>()).add(section);
|
||||||
}
|
}
|
||||||
|
|
||||||
// first page is always a paragraph, else we can't process pages 1..N,
|
// first page is always a paragraph, else we can't process pages 1..N,
|
||||||
// where N is the first found page with a paragraph
|
// where N is the first found page with a paragraph
|
||||||
if (paragraphMap.get(1) == null) {
|
if (sectionMap.get(1) == null) {
|
||||||
Paragraph paragraph = new Paragraph();
|
ClassificationSection section = new ClassificationSection();
|
||||||
document.getParagraphs().add(paragraph);
|
document.getSections().add(section);
|
||||||
paragraphMap.computeIfAbsent(1, x -> new ArrayList<>()).add(paragraph);
|
sectionMap.computeIfAbsent(1, x -> new ArrayList<>()).add(section);
|
||||||
}
|
}
|
||||||
|
|
||||||
for (Page page : document.getPages()) {
|
for (ClassificationPage page : document.getPages()) {
|
||||||
for (PdfImage image : page.getImages()) {
|
for (ClassifiedImage image : page.getImages()) {
|
||||||
List<Paragraph> paragraphsOnPage = paragraphMap.get(page.getPageNumber());
|
List<ClassificationSection> sectionsOnPage = sectionMap.get(page.getPageNumber());
|
||||||
if (paragraphsOnPage == null) {
|
if (sectionsOnPage == null) {
|
||||||
int i = page.getPageNumber();
|
int i = page.getPageNumber();
|
||||||
while (paragraphsOnPage == null) {
|
while (sectionsOnPage == null) {
|
||||||
paragraphsOnPage = paragraphMap.get(i);
|
sectionsOnPage = sectionMap.get(i);
|
||||||
i--;
|
i--;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
for (Paragraph paragraph : paragraphsOnPage) {
|
for (ClassificationSection section : sectionsOnPage) {
|
||||||
Float xMin = null;
|
Float xMin = null;
|
||||||
Float yMin = null;
|
Float yMin = null;
|
||||||
Float xMax = null;
|
Float xMax = null;
|
||||||
Float yMax = null;
|
Float yMax = null;
|
||||||
|
|
||||||
for (AbstractTextContainer abs : paragraph.getPageBlocks()) {
|
for (AbstractPageBlock abs : section.getPageBlocks()) {
|
||||||
if (abs.getPage() != page.getPageNumber()) {
|
if (abs.getPage() != page.getPageNumber()) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@ -197,21 +199,21 @@ public class SectionsBuilderService {
|
|||||||
|
|
||||||
if (xMin != null && xMax != null && yMin != null && yMax != null && image.getPosition().getX() >= xMin && image.getPosition()
|
if (xMin != null && xMax != null && yMin != null && yMax != null && image.getPosition().getX() >= xMin && image.getPosition()
|
||||||
.getX() <= xMax && image.getPosition().getY() >= yMin && image.getPosition().getY() <= yMax) {
|
.getX() <= xMax && image.getPosition().getY() >= yMin && image.getPosition().getY() <= yMax) {
|
||||||
paragraph.getImages().add(image);
|
section.getImages().add(image);
|
||||||
image.setAppendedToParagraph(true);
|
image.setAppendedToSection(true);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (!image.isAppendedToParagraph()) {
|
if (!image.isAppendedToSection()) {
|
||||||
log.debug("Image uses first paragraph");
|
log.debug("Image uses first paragraph");
|
||||||
paragraphsOnPage.get(0).getImages().add(image);
|
sectionsOnPage.get(0).getImages().add(image);
|
||||||
image.setAppendedToParagraph(true);
|
image.setAppendedToSection(true);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private void mergeTableMetadata(Table currentTable, Table previousTable) {
|
private void mergeTableMetadata(TablePageBlock currentTable, TablePageBlock previousTable) {
|
||||||
|
|
||||||
// Distribute header information for subsequent tables
|
// Distribute header information for subsequent tables
|
||||||
if (previousTable != null && hasInvalidHeaderInformation(currentTable) && hasValidHeaderInformation(previousTable)) {
|
if (previousTable != null && hasInvalidHeaderInformation(currentTable) && hasValidHeaderInformation(previousTable)) {
|
||||||
@ -239,86 +241,44 @@ public class SectionsBuilderService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private Paragraph buildTextBlock(List<AbstractTextContainer> wordBlockList, String lastHeadline) {
|
private ClassificationSection buildTextBlock(List<AbstractPageBlock> wordBlockList, String lastHeadline) {
|
||||||
|
|
||||||
Paragraph paragraph = new Paragraph();
|
ClassificationSection section = new ClassificationSection();
|
||||||
TextBlock textBlock = null;
|
|
||||||
|
|
||||||
int pageBefore = -1;
|
for (AbstractPageBlock container : wordBlockList) {
|
||||||
boolean splitByTable = false;
|
if (container instanceof TablePageBlock table) {
|
||||||
|
|
||||||
Iterator<AbstractTextContainer> itty = wordBlockList.iterator();
|
if (lastHeadline == null || lastHeadline.isEmpty()) {
|
||||||
boolean alreadyAdded = false;
|
table.setHeadline("Text in table");
|
||||||
AbstractTextContainer previous = null;
|
|
||||||
while (itty.hasNext()) {
|
|
||||||
AbstractTextContainer container = itty.next();
|
|
||||||
|
|
||||||
if (container instanceof Table) {
|
|
||||||
Table table = (Table) container;
|
|
||||||
splitByTable = true;
|
|
||||||
|
|
||||||
if (previous != null && previous.getText().startsWith("Table ")) {
|
|
||||||
table.setHeadline(previous.getText());
|
|
||||||
} else {
|
} else {
|
||||||
if (lastHeadline == null || lastHeadline.isEmpty()) {
|
table.setHeadline("TablePageBlock in: " + lastHeadline);
|
||||||
table.setHeadline("Text in table");
|
|
||||||
} else {
|
|
||||||
table.setHeadline("Table in: " + lastHeadline);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (textBlock != null && !alreadyAdded) {
|
section.getPageBlocks().add(table);
|
||||||
paragraph.getPageBlocks().add(textBlock);
|
|
||||||
alreadyAdded = true;
|
|
||||||
}
|
|
||||||
paragraph.getPageBlocks().add(table);
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
TextBlock wordBlock = (TextBlock) container;
|
TextPageBlock wordBlock = (TextPageBlock) container;
|
||||||
|
section.getPageBlocks().add(wordBlock);
|
||||||
if (textBlock == null) {
|
|
||||||
textBlock = new TextBlock(wordBlock.getMinX(), wordBlock.getMaxX(), wordBlock.getMinY(), wordBlock.getMaxY(), wordBlock.getSequences(), wordBlock.getRotation());
|
|
||||||
textBlock.setPage(wordBlock.getPage());
|
|
||||||
} else if (splitByTable) {
|
|
||||||
textBlock = new TextBlock(wordBlock.getMinX(), wordBlock.getMaxX(), wordBlock.getMinY(), wordBlock.getMaxY(), wordBlock.getSequences(), wordBlock.getRotation());
|
|
||||||
textBlock.setPage(wordBlock.getPage());
|
|
||||||
alreadyAdded = false;
|
|
||||||
} else if (pageBefore != -1 && wordBlock.getPage() != pageBefore) {
|
|
||||||
textBlock.setPage(pageBefore);
|
|
||||||
paragraph.getPageBlocks().add(textBlock);
|
|
||||||
textBlock = new TextBlock(wordBlock.getMinX(), wordBlock.getMaxX(), wordBlock.getMinY(), wordBlock.getMaxY(), wordBlock.getSequences(), wordBlock.getRotation());
|
|
||||||
textBlock.setPage(wordBlock.getPage());
|
|
||||||
} else {
|
|
||||||
TextBlock spatialEntity = textBlock.union(wordBlock);
|
|
||||||
textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight());
|
|
||||||
}
|
|
||||||
pageBefore = wordBlock.getPage();
|
|
||||||
splitByTable = false;
|
|
||||||
previous = container;
|
|
||||||
}
|
}
|
||||||
|
return section;
|
||||||
if (textBlock != null && !alreadyAdded) {
|
|
||||||
paragraph.getPageBlocks().add(textBlock);
|
|
||||||
}
|
|
||||||
return paragraph;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private boolean hasValidHeaderInformation(Table table) {
|
private boolean hasValidHeaderInformation(TablePageBlock table) {
|
||||||
|
|
||||||
return !hasInvalidHeaderInformation(table);
|
return !hasInvalidHeaderInformation(table);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private boolean hasInvalidHeaderInformation(Table table) {
|
private boolean hasInvalidHeaderInformation(TablePageBlock table) {
|
||||||
|
|
||||||
return table.getRows().stream().flatMap(row -> row.stream().filter(cell -> CollectionUtils.isNotEmpty(cell.getHeaderCells()))).findAny().isEmpty();
|
return table.getRows().stream().flatMap(row -> row.stream().filter(cell -> CollectionUtils.isNotEmpty(cell.getHeaderCells()))).findAny().isEmpty();
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private List<Cell> getRowWithNonHeaderCells(Table table) {
|
private List<Cell> getRowWithNonHeaderCells(TablePageBlock table) {
|
||||||
|
|
||||||
for (int i = table.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
|
for (int i = table.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
|
||||||
List<Cell> row = table.getRows().get(i);
|
List<Cell> row = table.getRows().get(i);
|
||||||
@ -1,4 +1,4 @@
|
|||||||
package com.iqser.red.service.redaction.v1.server.tableextraction.service;
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.service;
|
||||||
|
|
||||||
import java.awt.geom.Point2D;
|
import java.awt.geom.Point2D;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
@ -13,15 +13,15 @@ import java.util.stream.Collectors;
|
|||||||
|
|
||||||
import org.springframework.stereotype.Service;
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.AbstractPageBlock;
|
||||||
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationPage;
|
||||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table.Cell;
|
||||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table.CleanRulings;
|
||||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.CleanRulings;
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table.Rectangle;
|
||||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Rectangle;
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table.Ruling;
|
||||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Ruling;
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table.TablePageBlock;
|
||||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPageBlock;
|
||||||
import com.iqser.red.service.redaction.v1.server.tableextraction.utils.Utils;
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.utils.Utils;
|
||||||
|
|
||||||
@Service
|
@Service
|
||||||
public class TableExtractionService {
|
public class TableExtractionService {
|
||||||
@ -73,20 +73,20 @@ public class TableExtractionService {
|
|||||||
* 90 -> UpperLeft
|
* 90 -> UpperLeft
|
||||||
* 180 -> UpperRight
|
* 180 -> UpperRight
|
||||||
* 270 -> LowerRight
|
* 270 -> LowerRight
|
||||||
*
|
* <p>
|
||||||
* DirAdj (Text direction adjusted) values can not be used here.
|
* DirAdj (Text direction adjusted) values can not be used here.
|
||||||
*
|
*
|
||||||
* @param cleanRulings The lines used to build the table.
|
* @param cleanRulings The lines used to build the table.
|
||||||
* @param page Page object that contains textblocks and statistics.
|
* @param page Page object that contains textblocks and statistics.
|
||||||
*/
|
*/
|
||||||
public void extractTables(CleanRulings cleanRulings, Page page) {
|
public void extractTables(CleanRulings cleanRulings, ClassificationPage page) {
|
||||||
|
|
||||||
List<Cell> cells = findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
List<Cell> cells = findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||||
|
|
||||||
List<TextBlock> toBeRemoved = new ArrayList<>();
|
List<TextPageBlock> toBeRemoved = new ArrayList<>();
|
||||||
|
|
||||||
for (AbstractTextContainer abstractTextContainer : page.getTextBlocks()) {
|
for (AbstractPageBlock abstractPageBlock : page.getTextBlocks()) {
|
||||||
TextBlock textBlock = (TextBlock) abstractTextContainer;
|
TextPageBlock textBlock = (TextPageBlock) abstractPageBlock;
|
||||||
for (Cell cell : cells) {
|
for (Cell cell : cells) {
|
||||||
if (cell.hasMinimumSize() && cell.intersects(textBlock.getPdfMinX(),
|
if (cell.hasMinimumSize() && cell.intersects(textBlock.getPdfMinX(),
|
||||||
textBlock.getPdfMinY(),
|
textBlock.getPdfMinY(),
|
||||||
@ -104,7 +104,7 @@ public class TableExtractionService {
|
|||||||
|
|
||||||
List<Rectangle> spreadsheetAreas = findSpreadsheetsFromCells(cells).stream().filter(r -> r.getWidth() > 0f && r.getHeight() > 0f).collect(Collectors.toList());
|
List<Rectangle> spreadsheetAreas = findSpreadsheetsFromCells(cells).stream().filter(r -> r.getWidth() > 0f && r.getHeight() > 0f).collect(Collectors.toList());
|
||||||
|
|
||||||
List<Table> tables = new ArrayList<>();
|
List<TablePageBlock> tables = new ArrayList<>();
|
||||||
for (Rectangle area : spreadsheetAreas) {
|
for (Rectangle area : spreadsheetAreas) {
|
||||||
|
|
||||||
List<Cell> overlappingCells = new ArrayList<>();
|
List<Cell> overlappingCells = new ArrayList<>();
|
||||||
@ -113,16 +113,16 @@ public class TableExtractionService {
|
|||||||
overlappingCells.add(c);
|
overlappingCells.add(c);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
tables.add(new Table(overlappingCells, area, page.getRotation()));
|
tables.add(new TablePageBlock(overlappingCells, area, page.getRotation()));
|
||||||
}
|
}
|
||||||
|
|
||||||
for (Table table : tables) {
|
for (TablePageBlock table : tables) {
|
||||||
int position = -1;
|
int position = -1;
|
||||||
|
|
||||||
Iterator<AbstractTextContainer> itty = page.getTextBlocks().iterator();
|
Iterator<AbstractPageBlock> itty = page.getTextBlocks().iterator();
|
||||||
while (itty.hasNext()) {
|
while (itty.hasNext()) {
|
||||||
AbstractTextContainer textBlock = itty.next();
|
AbstractPageBlock textBlock = itty.next();
|
||||||
if (textBlock instanceof TextBlock ? table.containsBlock((TextBlock) textBlock) : table.contains(textBlock) && position == -1) {
|
if (textBlock instanceof TextPageBlock ? table.containsBlock((TextPageBlock) textBlock) : table.contains(textBlock) && position == -1) {
|
||||||
position = page.getTextBlocks().indexOf(textBlock);
|
position = page.getTextBlocks().indexOf(textBlock);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -9,7 +9,7 @@
|
|||||||
* This program is free software under the LGPL (>=v2.1)
|
* This program is free software under the LGPL (>=v2.1)
|
||||||
* Read the file LICENSE.txt coming with the sources for details.
|
* Read the file LICENSE.txt coming with the sources for details.
|
||||||
*/
|
*/
|
||||||
package com.iqser.red.service.redaction.v1.server.tableextraction.utils;
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.utils;
|
||||||
|
|
||||||
import java.awt.geom.Line2D;
|
import java.awt.geom.Line2D;
|
||||||
import java.awt.geom.Rectangle2D;
|
import java.awt.geom.Rectangle2D;
|
||||||
@ -1,4 +1,4 @@
|
|||||||
package com.iqser.red.service.redaction.v1.server.tableextraction.utils;
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.utils;
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
@ -23,9 +23,11 @@ public class FileUtils {
|
|||||||
/**
|
/**
|
||||||
* Deletes a file; logs a message with the reason if the deletion fails.
|
* Deletes a file; logs a message with the reason if the deletion fails.
|
||||||
* This method is null-safe.
|
* This method is null-safe.
|
||||||
|
*
|
||||||
* @param file The file to delete. Can be null.
|
* @param file The file to delete. Can be null.
|
||||||
*/
|
*/
|
||||||
public void deleteFile(File file) {
|
public void deleteFile(File file) {
|
||||||
|
|
||||||
if (file != null) {
|
if (file != null) {
|
||||||
try {
|
try {
|
||||||
Files.deleteIfExists(file.toPath());
|
Files.deleteIfExists(file.toPath());
|
||||||
@ -1,7 +1,7 @@
|
|||||||
package com.iqser.red.service.redaction.v1.server.classification.utils;
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.utils;
|
||||||
|
|
||||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
||||||
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPageBlock;
|
||||||
|
|
||||||
import lombok.experimental.UtilityClass;
|
import lombok.experimental.UtilityClass;
|
||||||
|
|
||||||
@ -11,7 +11,7 @@ public final class PositionUtils {
|
|||||||
|
|
||||||
// TODO This currently uses pdf coord system. In the future this should use java coord system.
|
// TODO This currently uses pdf coord system. In the future this should use java coord system.
|
||||||
// Note: DirAdj (TextDirection Adjusted) cannot be used for this.
|
// Note: DirAdj (TextDirection Adjusted) cannot be used for this.
|
||||||
public boolean isWithinBodyTextFrame(Rectangle btf, TextBlock textBlock) {
|
public boolean isWithinBodyTextFrame(Rectangle btf, TextPageBlock textBlock) {
|
||||||
|
|
||||||
if (btf == null || textBlock == null) {
|
if (btf == null || textBlock == null) {
|
||||||
return false;
|
return false;
|
||||||
@ -32,7 +32,7 @@ public final class PositionUtils {
|
|||||||
|
|
||||||
// TODO This currently uses pdf coord system. In the future this should use java coord system.
|
// TODO This currently uses pdf coord system. In the future this should use java coord system.
|
||||||
// Note: DirAdj (TextDirection Adjusted) cannot be used for this.
|
// Note: DirAdj (TextDirection Adjusted) cannot be used for this.
|
||||||
public boolean isOverBodyTextFrame(Rectangle btf, TextBlock textBlock, int rotation) {
|
public boolean isOverBodyTextFrame(Rectangle btf, TextPageBlock textBlock, int rotation) {
|
||||||
|
|
||||||
if (btf == null || textBlock == null) {
|
if (btf == null || textBlock == null) {
|
||||||
return false;
|
return false;
|
||||||
@ -58,9 +58,10 @@ public final class PositionUtils {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// TODO This currently uses pdf coord system. In the future this should use java coord system.
|
// TODO This currently uses pdf coord system. In the future this should use java coord system.
|
||||||
// Note: DirAdj (TextDirection Adjusted) cannot be used for this.
|
// Note: DirAdj (TextDirection Adjusted) cannot be used for this.
|
||||||
public boolean isUnderBodyTextFrame(Rectangle btf, TextBlock textBlock, int rotation) {
|
public boolean isUnderBodyTextFrame(Rectangle btf, TextPageBlock textBlock, int rotation) {
|
||||||
|
|
||||||
if (btf == null || textBlock == null) {
|
if (btf == null || textBlock == null) {
|
||||||
return false;
|
return false;
|
||||||
@ -86,9 +87,10 @@ public final class PositionUtils {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// TODO This currently uses pdf coord system. In the future this should use java coord system.
|
// TODO This currently uses pdf coord system. In the future this should use java coord system.
|
||||||
// Note: DirAdj (TextDirection Adjusted) cannot be used for this.
|
// Note: DirAdj (TextDirection Adjusted) cannot be used for this.
|
||||||
public boolean isTouchingUnderBodyTextFrame(Rectangle btf, TextBlock textBlock) {
|
public boolean isTouchingUnderBodyTextFrame(Rectangle btf, TextPageBlock textBlock) {
|
||||||
|
|
||||||
//TODO Currently this is not working for rotated pages.
|
//TODO Currently this is not working for rotated pages.
|
||||||
|
|
||||||
@ -105,13 +107,13 @@ public final class PositionUtils {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public float getHeightDifferenceBetweenChunkWordAndDocumentWord(TextBlock textBlock, Float documentMostPopularWordHeight) {
|
public float getHeightDifferenceBetweenChunkWordAndDocumentWord(TextPageBlock textBlock, Float documentMostPopularWordHeight) {
|
||||||
|
|
||||||
return textBlock.getMostPopularWordHeight() - documentMostPopularWordHeight;
|
return textBlock.getMostPopularWordHeight() - documentMostPopularWordHeight;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public Float getApproxLineCount(TextBlock textBlock) {
|
public Float getApproxLineCount(TextPageBlock textBlock) {
|
||||||
|
|
||||||
return textBlock.getHeight() / textBlock.getMostPopularWordHeight();
|
return textBlock.getHeight() / textBlock.getMostPopularWordHeight();
|
||||||
}
|
}
|
||||||
@ -1,4 +1,4 @@
|
|||||||
package com.iqser.red.service.redaction.v1.server.tableextraction.utils;
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.utils;
|
||||||
|
|
||||||
import java.util.ArrayDeque;
|
import java.util.ArrayDeque;
|
||||||
import java.util.Comparator;
|
import java.util.Comparator;
|
||||||
@ -1,9 +1,9 @@
|
|||||||
package com.iqser.red.service.redaction.v1.server.classification.utils;
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.utils;
|
||||||
|
|
||||||
import java.awt.geom.Line2D;
|
import java.awt.geom.Line2D;
|
||||||
import java.awt.geom.Point2D;
|
import java.awt.geom.Point2D;
|
||||||
|
|
||||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Ruling;
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table.Ruling;
|
||||||
|
|
||||||
import lombok.experimental.UtilityClass;
|
import lombok.experimental.UtilityClass;
|
||||||
|
|
||||||
@ -13,7 +13,7 @@ public final class RulingTextDirAdjustUtil {
|
|||||||
/**
|
/**
|
||||||
* Converts a ruling (line of a table) the same way TextPositions are converted in PDFBox.
|
* Converts a ruling (line of a table) the same way TextPositions are converted in PDFBox.
|
||||||
* This will get the y position of the text, adjusted so that 0,0 is upper left and it is adjusted based on the text direction.
|
* This will get the y position of the text, adjusted so that 0,0 is upper left and it is adjusted based on the text direction.
|
||||||
*
|
* <p>
|
||||||
* See org.apache.pdfbox.text.TextPosition
|
* See org.apache.pdfbox.text.TextPosition
|
||||||
*/
|
*/
|
||||||
public Line2D.Float convertToDirAdj(Ruling ruling, float dir, float pageWidth, float pageHeight) {
|
public Line2D.Float convertToDirAdj(Ruling ruling, float dir, float pageWidth, float pageHeight) {
|
||||||
@ -1,4 +1,4 @@
|
|||||||
package com.iqser.red.service.redaction.v1.server.tableextraction.utils;
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.utils;
|
||||||
|
|
||||||
import java.math.BigDecimal;
|
import java.math.BigDecimal;
|
||||||
import java.util.Comparator;
|
import java.util.Comparator;
|
||||||
@ -0,0 +1,50 @@
|
|||||||
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.data;
|
||||||
|
|
||||||
|
import java.awt.geom.Rectangle2D;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.textblock.AtomicTextBlock;
|
||||||
|
|
||||||
|
import lombok.AccessLevel;
|
||||||
|
import lombok.AllArgsConstructor;
|
||||||
|
import lombok.Builder;
|
||||||
|
import lombok.Data;
|
||||||
|
import lombok.experimental.FieldDefaults;
|
||||||
|
|
||||||
|
@Data
|
||||||
|
@Builder
|
||||||
|
@AllArgsConstructor
|
||||||
|
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||||
|
public class AtomicPositionBlockData {
|
||||||
|
|
||||||
|
Long id;
|
||||||
|
int[] stringIdxToPositionIdx;
|
||||||
|
float[][] positions;
|
||||||
|
|
||||||
|
|
||||||
|
public static AtomicPositionBlockData fromAtomicTextBlock(AtomicTextBlock atomicTextBlock) {
|
||||||
|
|
||||||
|
return AtomicPositionBlockData.builder()
|
||||||
|
.id(atomicTextBlock.getId())
|
||||||
|
.positions(toPrimitiveFloatMatrix(atomicTextBlock.getPositions()))
|
||||||
|
.stringIdxToPositionIdx(atomicTextBlock.getStringIdxToPositionIdx().stream().mapToInt(Integer::intValue).toArray())
|
||||||
|
.build();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static float[][] toPrimitiveFloatMatrix(List<Rectangle2D> positions) {
|
||||||
|
|
||||||
|
float[][] positionMatrix = new float[positions.size()][];
|
||||||
|
for (int i = 0; i < positions.size(); i++) {
|
||||||
|
positionMatrix[i] = toArray(positions.get(i));
|
||||||
|
}
|
||||||
|
return positionMatrix;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static float[] toArray(Rectangle2D positions) {
|
||||||
|
|
||||||
|
return new float[]{(float) positions.getMinX(), (float) positions.getMinY(), (float) positions.getWidth(), (float) positions.getHeight()};
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,39 @@
|
|||||||
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.data;
|
||||||
|
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.textblock.AtomicTextBlock;
|
||||||
|
|
||||||
|
import lombok.AccessLevel;
|
||||||
|
import lombok.AllArgsConstructor;
|
||||||
|
import lombok.Builder;
|
||||||
|
import lombok.Data;
|
||||||
|
import lombok.experimental.FieldDefaults;
|
||||||
|
|
||||||
|
@Data
|
||||||
|
@Builder
|
||||||
|
@AllArgsConstructor
|
||||||
|
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||||
|
public class AtomicTextBlockData {
|
||||||
|
|
||||||
|
Long id;
|
||||||
|
Long page;
|
||||||
|
String searchText;
|
||||||
|
int numberOnPage;
|
||||||
|
int start;
|
||||||
|
int end;
|
||||||
|
int[] lineBreaks;
|
||||||
|
|
||||||
|
|
||||||
|
public static AtomicTextBlockData fromAtomicTextBlock(AtomicTextBlock atomicTextBlock) {
|
||||||
|
|
||||||
|
return AtomicTextBlockData.builder()
|
||||||
|
.id(atomicTextBlock.getId())
|
||||||
|
.page(atomicTextBlock.getPage().getNumber().longValue())
|
||||||
|
.searchText(atomicTextBlock.getSearchText())
|
||||||
|
.numberOnPage(atomicTextBlock.getNumberOnPage())
|
||||||
|
.start(atomicTextBlock.getBoundary().start())
|
||||||
|
.end(atomicTextBlock.getBoundary().end())
|
||||||
|
.lineBreaks(atomicTextBlock.getLineBreaks().stream().mapToInt(Integer::intValue).toArray())
|
||||||
|
.build();
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,43 @@
|
|||||||
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.data;
|
||||||
|
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Document;
|
||||||
|
|
||||||
|
import lombok.AccessLevel;
|
||||||
|
import lombok.AllArgsConstructor;
|
||||||
|
import lombok.Builder;
|
||||||
|
import lombok.Data;
|
||||||
|
import lombok.experimental.FieldDefaults;
|
||||||
|
|
||||||
|
@Data
|
||||||
|
@Builder
|
||||||
|
@AllArgsConstructor
|
||||||
|
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||||
|
public class DocumentData {
|
||||||
|
|
||||||
|
PageData[] pages;
|
||||||
|
AtomicTextBlockData[] atomicTextBlocks;
|
||||||
|
AtomicPositionBlockData[] atomicPositionBlocks;
|
||||||
|
DocumentTreeData documentTreeData;
|
||||||
|
|
||||||
|
|
||||||
|
public static DocumentData fromDocument(Document document) {
|
||||||
|
|
||||||
|
var atomicPositionBlocks = document.streamTerminalTextBlocksInOrder()
|
||||||
|
.flatMap(textBlock -> textBlock.getAtomicTextBlocks().stream())
|
||||||
|
.distinct()
|
||||||
|
.map(AtomicPositionBlockData::fromAtomicTextBlock)
|
||||||
|
.toArray(AtomicPositionBlockData[]::new);
|
||||||
|
|
||||||
|
var atomicTextBlocks = document.streamTerminalTextBlocksInOrder()
|
||||||
|
.flatMap(textBlock -> textBlock.getAtomicTextBlocks().stream())
|
||||||
|
.distinct()
|
||||||
|
.map(AtomicTextBlockData::fromAtomicTextBlock)
|
||||||
|
.toArray(AtomicTextBlockData[]::new);
|
||||||
|
|
||||||
|
var pages = document.getPages().stream().map(PageData::fromPage).toArray(PageData[]::new);
|
||||||
|
|
||||||
|
var documentTreeData = new DocumentTreeData(DocumentTreeData.EntryData.fromEntry(document.getDocumentTree().getRoot()));
|
||||||
|
return new DocumentData(pages, atomicTextBlocks, atomicPositionBlocks, documentTreeData);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,128 @@
|
|||||||
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.data;
|
||||||
|
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.data.mapper.PropertiesMapper;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.DocumentTree;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Image;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.NodeType;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Page;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Table;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.TableCell;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.textblock.AtomicTextBlock;
|
||||||
|
|
||||||
|
import lombok.AccessLevel;
|
||||||
|
import lombok.AllArgsConstructor;
|
||||||
|
import lombok.Builder;
|
||||||
|
import lombok.Data;
|
||||||
|
import lombok.Getter;
|
||||||
|
import lombok.NoArgsConstructor;
|
||||||
|
import lombok.experimental.FieldDefaults;
|
||||||
|
|
||||||
|
@Data
|
||||||
|
@Builder
|
||||||
|
@AllArgsConstructor
|
||||||
|
@NoArgsConstructor
|
||||||
|
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||||
|
public class DocumentTreeData {
|
||||||
|
|
||||||
|
EntryData root;
|
||||||
|
|
||||||
|
|
||||||
|
public EntryData get(List<Integer> tocId) {
|
||||||
|
|
||||||
|
if (tocId.isEmpty()) {
|
||||||
|
return root;
|
||||||
|
}
|
||||||
|
EntryData entry = root.children.get(tocId.get(0));
|
||||||
|
for (int id : tocId.subList(1, tocId.size())) {
|
||||||
|
entry = entry.children.get(id);
|
||||||
|
}
|
||||||
|
return entry;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public Stream<EntryData> streamAllEntries() {
|
||||||
|
|
||||||
|
return Stream.concat(Stream.of(root), root.children.stream()).flatMap(DocumentTreeData::flatten);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public String toString() {
|
||||||
|
|
||||||
|
return String.join("\n", streamAllEntries().map(EntryData::toString).toList());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static Stream<EntryData> flatten(EntryData entry) {
|
||||||
|
|
||||||
|
return Stream.concat(Stream.of(entry), entry.children.stream().flatMap(DocumentTreeData::flatten));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Builder
|
||||||
|
@Getter
|
||||||
|
@AllArgsConstructor
|
||||||
|
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||||
|
public static class EntryData {
|
||||||
|
|
||||||
|
NodeType type;
|
||||||
|
int[] treeId;
|
||||||
|
Long[] atomicBlockIds;
|
||||||
|
Long[] pageNumbers;
|
||||||
|
Map<String, String> properties;
|
||||||
|
List<EntryData> children;
|
||||||
|
|
||||||
|
|
||||||
|
public static EntryData fromEntry(DocumentTree.Entry entry) {
|
||||||
|
|
||||||
|
Long[] atomicBlockIds = toAtomicTextBlockIds(entry);
|
||||||
|
|
||||||
|
Map<String, String> properties = switch (entry.getType()) {
|
||||||
|
case TABLE -> PropertiesMapper.buildTableProperties((Table) entry.getNode());
|
||||||
|
case TABLE_CELL -> PropertiesMapper.buildTableCellProperties((TableCell) entry.getNode());
|
||||||
|
case IMAGE -> PropertiesMapper.buildImageProperties((Image) entry.getNode());
|
||||||
|
default -> new HashMap<>();
|
||||||
|
};
|
||||||
|
var treeId = entry.getTreeId().stream().mapToInt(Integer::intValue).toArray();
|
||||||
|
var pageNumbers = entry.getNode().getPages().stream().map(Page::getNumber).map(Integer::longValue).toArray(Long[]::new);
|
||||||
|
var subEntries = entry.getChildren().stream().map(EntryData::fromEntry).toList();
|
||||||
|
return new EntryData(entry.getType(), treeId, atomicBlockIds, pageNumbers, properties, subEntries);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static Long[] toAtomicTextBlockIds(DocumentTree.Entry entry) {
|
||||||
|
|
||||||
|
if (entry.getNode().isLeaf()) {
|
||||||
|
return entry.getNode().getLeafTextBlock().getAtomicTextBlocks().stream().map(AtomicTextBlock::getId).toArray(Long[]::new);
|
||||||
|
} else {
|
||||||
|
return new Long[]{};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
|
||||||
|
StringBuilder sb = new StringBuilder();
|
||||||
|
sb.append("[");
|
||||||
|
for (int i : treeId) {
|
||||||
|
sb.append(i);
|
||||||
|
sb.append(",");
|
||||||
|
}
|
||||||
|
sb.delete(sb.length() - 1, sb.length());
|
||||||
|
sb.append("]: ");
|
||||||
|
|
||||||
|
sb.append(type);
|
||||||
|
sb.append(" atbs = ");
|
||||||
|
sb.append(atomicBlockIds.length);
|
||||||
|
|
||||||
|
return sb.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,28 @@
|
|||||||
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.data;
|
||||||
|
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Page;
|
||||||
|
|
||||||
|
import lombok.AccessLevel;
|
||||||
|
import lombok.AllArgsConstructor;
|
||||||
|
import lombok.Builder;
|
||||||
|
import lombok.Data;
|
||||||
|
import lombok.experimental.FieldDefaults;
|
||||||
|
|
||||||
|
@Data
|
||||||
|
@Builder
|
||||||
|
@AllArgsConstructor
|
||||||
|
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||||
|
public class PageData {
|
||||||
|
|
||||||
|
int number;
|
||||||
|
int height;
|
||||||
|
int width;
|
||||||
|
int rotation;
|
||||||
|
|
||||||
|
|
||||||
|
public static PageData fromPage(Page page) {
|
||||||
|
|
||||||
|
return new PageData(page.getNumber(), page.getHeight(), page.getWidth(), page.getRotation());
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,198 @@
|
|||||||
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.data.mapper;
|
||||||
|
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.LinkedList;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.NoSuchElementException;
|
||||||
|
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.data.AtomicPositionBlockData;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.data.AtomicTextBlockData;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.data.DocumentData;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.data.DocumentTreeData;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.data.PageData;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.DocumentTree;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Document;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Footer;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Header;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Headline;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Image;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Page;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Paragraph;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Section;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.SemanticNode;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Table;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.TableCell;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.textblock.AtomicTextBlock;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.textblock.TextBlock;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.textblock.TextBlockCollector;
|
||||||
|
|
||||||
|
import lombok.experimental.UtilityClass;
|
||||||
|
|
||||||
|
@UtilityClass
|
||||||
|
public class DocumentGraphMapper {
|
||||||
|
|
||||||
|
public Document toDocumentGraph(DocumentData documentData) {
|
||||||
|
|
||||||
|
Document document = new Document();
|
||||||
|
DocumentTree documentTree = new DocumentTree(document);
|
||||||
|
Context context = new Context(documentData, documentTree);
|
||||||
|
|
||||||
|
context.pages.addAll(Arrays.stream(documentData.getPages()).map(DocumentGraphMapper::buildPage).toList());
|
||||||
|
|
||||||
|
context.documentTree.getRoot().getChildren().addAll(buildEntries(documentData.getDocumentTreeData().getRoot().getChildren(), context));
|
||||||
|
|
||||||
|
document.setDocumentTree(context.documentTree);
|
||||||
|
document.setPages(new HashSet<>(context.pages));
|
||||||
|
document.setNumberOfPages(documentData.getPages().length);
|
||||||
|
|
||||||
|
document.setTextBlock(document.getTextBlock());
|
||||||
|
return document;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private List<DocumentTree.Entry> buildEntries(List<DocumentTreeData.EntryData> entries, Context context) {
|
||||||
|
|
||||||
|
List<DocumentTree.Entry> newEntries = new LinkedList<>();
|
||||||
|
for (DocumentTreeData.EntryData entryData : entries) {
|
||||||
|
|
||||||
|
List<Page> pages = Arrays.stream(entryData.getPageNumbers()).map(pageNumber -> getPage(pageNumber, context)).toList();
|
||||||
|
|
||||||
|
SemanticNode node = switch (entryData.getType()) {
|
||||||
|
case SECTION -> buildSection(context);
|
||||||
|
case PARAGRAPH -> buildParagraph(context);
|
||||||
|
case HEADLINE -> buildHeadline(context);
|
||||||
|
case HEADER -> buildHeader(context);
|
||||||
|
case FOOTER -> buildFooter(context);
|
||||||
|
case TABLE -> buildTable(context, entryData.getProperties());
|
||||||
|
case TABLE_CELL -> buildTableCell(context, entryData.getProperties());
|
||||||
|
case IMAGE -> buildImage(context, entryData.getProperties(), entryData.getPageNumbers());
|
||||||
|
default -> throw new UnsupportedOperationException("Not yet implemented for type " + entryData.getType());
|
||||||
|
};
|
||||||
|
|
||||||
|
if (entryData.getAtomicBlockIds().length > 0) {
|
||||||
|
TextBlock textBlock = toTextBlock(entryData.getAtomicBlockIds(), context, node);
|
||||||
|
node.setLeafTextBlock(textBlock);
|
||||||
|
}
|
||||||
|
List<Integer> treeId = Arrays.stream(entryData.getTreeId()).boxed().toList();
|
||||||
|
node.setTreeId(treeId);
|
||||||
|
|
||||||
|
switch (entryData.getType()) {
|
||||||
|
case HEADER -> pages.forEach(page -> page.setHeader((Header) node));
|
||||||
|
case FOOTER -> pages.forEach(page -> page.setFooter((Footer) node));
|
||||||
|
default -> pages.forEach(page -> page.getMainBody().add(node));
|
||||||
|
}
|
||||||
|
|
||||||
|
newEntries.add(DocumentTree.Entry.builder().treeId(treeId).children(buildEntries(entryData.getChildren(), context)).node(node).build());
|
||||||
|
}
|
||||||
|
return newEntries;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private Headline buildHeadline(Context context) {
|
||||||
|
|
||||||
|
return Headline.builder().documentTree(context.documentTree).build();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private Image buildImage(Context context, Map<String, String> properties, Long[] pageNumbers) {
|
||||||
|
|
||||||
|
assert pageNumbers.length == 1;
|
||||||
|
Page page = getPage(pageNumbers[0], context);
|
||||||
|
var builder = Image.builder();
|
||||||
|
PropertiesMapper.parseImageProperties(properties, builder);
|
||||||
|
return builder.documentTree(context.documentTree).page(page).build();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private TableCell buildTableCell(Context context, Map<String, String> properties) {
|
||||||
|
|
||||||
|
TableCell.TableCellBuilder builder = TableCell.builder();
|
||||||
|
PropertiesMapper.parseTableCellProperties(properties, builder);
|
||||||
|
return builder.documentTree(context.documentTree).build();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private Table buildTable(Context context, Map<String, String> properties) {
|
||||||
|
|
||||||
|
Table.TableBuilder builder = Table.builder();
|
||||||
|
PropertiesMapper.parseTableProperties(properties, builder);
|
||||||
|
return builder.documentTree(context.documentTree).build();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private Footer buildFooter(Context context) {
|
||||||
|
|
||||||
|
return Footer.builder().documentTree(context.documentTree).build();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private Header buildHeader(Context context) {
|
||||||
|
|
||||||
|
return Header.builder().documentTree(context.documentTree).build();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private Section buildSection(Context context) {
|
||||||
|
|
||||||
|
return Section.builder().documentTree(context.documentTree).build();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private Paragraph buildParagraph(Context context) {
|
||||||
|
|
||||||
|
return Paragraph.builder().documentTree(context.documentTree).build();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private TextBlock toTextBlock(Long[] atomicTextBlockIds, Context context, SemanticNode parent) {
|
||||||
|
|
||||||
|
return Arrays.stream(atomicTextBlockIds).map(atomicTextBlockId -> getAtomicTextBlock(context, parent, atomicTextBlockId)).collect(new TextBlockCollector());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private AtomicTextBlock getAtomicTextBlock(Context context, SemanticNode parent, Long atomicTextBlockId) {
|
||||||
|
|
||||||
|
return AtomicTextBlock.fromAtomicTextBlockData(context.atomicTextBlockData.get(Math.toIntExact(atomicTextBlockId)),
|
||||||
|
context.atomicPositionBlockData.get(Math.toIntExact(atomicTextBlockId)),
|
||||||
|
parent,
|
||||||
|
getPage(context.atomicTextBlockData.get(Math.toIntExact(atomicTextBlockId)).getPage(), context));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private Page buildPage(PageData p) {
|
||||||
|
|
||||||
|
return Page.builder().rotation(p.getRotation()).height(p.getHeight()).width(p.getWidth()).number(p.getNumber()).mainBody(new LinkedList<>()).build();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private Page getPage(Long pageIndex, Context context) {
|
||||||
|
|
||||||
|
return context.pages.stream()
|
||||||
|
.filter(page -> page.getNumber() == Math.toIntExact(pageIndex))
|
||||||
|
.findFirst()
|
||||||
|
.orElseThrow(() -> new NoSuchElementException(String.format("ClassificationPage with number %d not found", pageIndex)));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
static final class Context {
|
||||||
|
|
||||||
|
private final DocumentTree documentTree;
|
||||||
|
private final List<Page> pages;
|
||||||
|
private final List<AtomicTextBlockData> atomicTextBlockData;
|
||||||
|
private final List<AtomicPositionBlockData> atomicPositionBlockData;
|
||||||
|
|
||||||
|
|
||||||
|
Context(DocumentData documentData, DocumentTree documentTree) {
|
||||||
|
|
||||||
|
this.documentTree = documentTree;
|
||||||
|
this.pages = new LinkedList<>();
|
||||||
|
this.atomicTextBlockData = Arrays.stream(documentData.getAtomicTextBlocks()).toList();
|
||||||
|
this.atomicPositionBlockData = Arrays.stream(documentData.getAtomicPositionBlocks()).toList();
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,110 @@
|
|||||||
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.data.mapper;
|
||||||
|
|
||||||
|
import static java.lang.String.format;
|
||||||
|
|
||||||
|
import java.awt.geom.Rectangle2D;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Image;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.ImageType;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Table;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.TableCell;
|
||||||
|
|
||||||
|
import lombok.AccessLevel;
|
||||||
|
import lombok.experimental.FieldDefaults;
|
||||||
|
import lombok.experimental.UtilityClass;
|
||||||
|
|
||||||
|
@UtilityClass
|
||||||
|
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||||
|
public class PropertiesMapper {
|
||||||
|
|
||||||
|
String imageType = "imageType";
|
||||||
|
private final String transparency = "transparency";
|
||||||
|
private final String position = "position";
|
||||||
|
String id = "id";
|
||||||
|
String row = "row";
|
||||||
|
String col = "col";
|
||||||
|
String header = "header";
|
||||||
|
String bBox = "bBox";
|
||||||
|
String numberOfRows = "numberOfRows";
|
||||||
|
String numberOfCols = "numberOfCols";
|
||||||
|
|
||||||
|
|
||||||
|
public Map<String, String> buildImageProperties(Image image) {
|
||||||
|
|
||||||
|
Map<String, String> properties = new HashMap<>();
|
||||||
|
properties.put(imageType, image.getImageType().toString());
|
||||||
|
properties.put(transparency, String.valueOf(image.isTransparent()));
|
||||||
|
properties.put(position, toString(image.getPosition()));
|
||||||
|
properties.put(id, image.getId());
|
||||||
|
return properties;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private String toString(Rectangle2D rectangle2D) {
|
||||||
|
|
||||||
|
return format("%f,%f,%f,%f", rectangle2D.getX(), rectangle2D.getY(), rectangle2D.getWidth(), rectangle2D.getHeight());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public Map<String, String> buildTableCellProperties(TableCell tableCell) {
|
||||||
|
|
||||||
|
Map<String, String> properties = new HashMap<>();
|
||||||
|
properties.put(row, String.valueOf(tableCell.getRow()));
|
||||||
|
properties.put(col, String.valueOf(tableCell.getCol()));
|
||||||
|
properties.put(header, String.valueOf(tableCell.isHeader()));
|
||||||
|
|
||||||
|
if (tableCell.getPages().size() > 1 || tableCell.getBBox().keySet().size() > 1) {
|
||||||
|
throw new IllegalArgumentException("TableCell can only occur on a single page!");
|
||||||
|
}
|
||||||
|
String bBoxString = toString(tableCell.getBBox().get(tableCell.getPages().stream().findFirst().get()));
|
||||||
|
properties.put(bBox, bBoxString);
|
||||||
|
|
||||||
|
return properties;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public Map<String, String> buildTableProperties(Table table) {
|
||||||
|
|
||||||
|
Map<String, String> properties = new HashMap<>();
|
||||||
|
properties.put(numberOfRows, String.valueOf(table.getNumberOfRows()));
|
||||||
|
properties.put(numberOfCols, String.valueOf(table.getNumberOfCols()));
|
||||||
|
return properties;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void parseImageProperties(Map<String, String> properties, Image.ImageBuilder builder) {
|
||||||
|
|
||||||
|
builder.imageType(ImageType.fromString(properties.get(imageType)));
|
||||||
|
builder.transparent(Boolean.parseBoolean(properties.get(transparency)));
|
||||||
|
builder.position(parseRectangle2D(properties.get(position)));
|
||||||
|
builder.id(properties.get(id));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void parseTableCellProperties(Map<String, String> properties, TableCell.TableCellBuilder builder) {
|
||||||
|
|
||||||
|
builder.row(Integer.parseInt(properties.get(row)));
|
||||||
|
builder.col(Integer.parseInt(properties.get(col)));
|
||||||
|
builder.header(Boolean.parseBoolean(properties.get(header)));
|
||||||
|
builder.bBox(parseRectangle2D(properties.get(bBox)));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void parseTableProperties(Map<String, String> properties, Table.TableBuilder builder) {
|
||||||
|
|
||||||
|
builder.numberOfRows(Integer.parseInt(properties.get(numberOfRows)));
|
||||||
|
builder.numberOfCols(Integer.parseInt(properties.get(numberOfCols)));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private Rectangle2D parseRectangle2D(String bBox) {
|
||||||
|
|
||||||
|
List<Float> floats = Arrays.stream(bBox.split(",")).map(Float::parseFloat).toList();
|
||||||
|
return new Rectangle2D.Float(floats.get(0), floats.get(1), floats.get(2), floats.get(3));
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,246 @@
|
|||||||
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.factory;
|
||||||
|
|
||||||
|
import static java.lang.String.format;
|
||||||
|
import static java.util.stream.Collectors.groupingBy;
|
||||||
|
import static java.util.stream.Collectors.toList;
|
||||||
|
|
||||||
|
import java.awt.geom.Rectangle2D;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.LinkedList;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.NoSuchElementException;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.AbstractPageBlock;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationDocument;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationFooter;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationHeader;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationPage;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.image.ClassifiedImage;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPageBlock;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.DocumentTree;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Document;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Footer;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.GenericSemanticNode;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Header;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Headline;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Image;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Page;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Paragraph;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Section;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.textblock.AtomicTextBlock;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.utils.TextPositionOperations;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.redaction.utils.IdBuilder;
|
||||||
|
|
||||||
|
import lombok.AccessLevel;
|
||||||
|
import lombok.AllArgsConstructor;
|
||||||
|
import lombok.Builder;
|
||||||
|
import lombok.Getter;
|
||||||
|
import lombok.experimental.FieldDefaults;
|
||||||
|
import lombok.experimental.UtilityClass;
|
||||||
|
|
||||||
|
@UtilityClass
|
||||||
|
public class DocumentGraphFactory {
|
||||||
|
|
||||||
|
public Document buildDocumentGraph(ClassificationDocument document) {
|
||||||
|
|
||||||
|
Document documentGraph = new Document();
|
||||||
|
Context context = new Context(documentGraph);
|
||||||
|
|
||||||
|
document.getPages().forEach(context::buildAndAddPageWithCounter);
|
||||||
|
document.getSections().stream().flatMap(section -> section.getImages().stream()).forEach(image -> context.getImages().add(image));
|
||||||
|
addSections(document, context);
|
||||||
|
addHeaderAndFooterToEachPage(document, context);
|
||||||
|
|
||||||
|
documentGraph.setNumberOfPages(context.pages.size());
|
||||||
|
documentGraph.setPages(context.pages.keySet());
|
||||||
|
documentGraph.setDocumentTree(context.documentTree);
|
||||||
|
documentGraph.setTextBlock(documentGraph.getTextBlock());
|
||||||
|
return documentGraph;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private void addSections(ClassificationDocument document, Context context) {
|
||||||
|
|
||||||
|
document.getSections().forEach(section -> SectionNodeFactory.addSection(null, section.getPageBlocks(), section.getImages(), context));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void addParagraphOrHeadline(GenericSemanticNode parentNode, TextPageBlock originalTextBlock, Context context, List<TextPageBlock> textBlocksToMerge) {
|
||||||
|
|
||||||
|
Page page = context.getPage(originalTextBlock.getPage());
|
||||||
|
|
||||||
|
GenericSemanticNode node;
|
||||||
|
if (originalTextBlock.isHeadline()) {
|
||||||
|
node = Headline.builder().documentTree(context.getDocumentTree()).build();
|
||||||
|
} else {
|
||||||
|
node = Paragraph.builder().documentTree(context.getDocumentTree()).build();
|
||||||
|
}
|
||||||
|
|
||||||
|
page.getMainBody().add(node);
|
||||||
|
|
||||||
|
List<TextPageBlock> textBlocks = new ArrayList<>(textBlocksToMerge);
|
||||||
|
textBlocks.add(originalTextBlock);
|
||||||
|
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(textBlocks), node, context, page);
|
||||||
|
List<Integer> treeId = context.documentTree.createNewChildEntryAndReturnId(parentNode, node);
|
||||||
|
node.setLeafTextBlock(textBlock);
|
||||||
|
node.setTreeId(treeId);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void addImage(Section section, ClassifiedImage image, Context context) {
|
||||||
|
|
||||||
|
Rectangle2D position = image.getPosition();
|
||||||
|
Page page = context.getPage(image.getPage());
|
||||||
|
Image imageNode = Image.builder()
|
||||||
|
.id(IdBuilder.buildId(Set.of(page), List.of(position)))
|
||||||
|
.imageType(image.getImageType())
|
||||||
|
.position(position)
|
||||||
|
.transparent(image.isHasTransparency())
|
||||||
|
.page(page)
|
||||||
|
.documentTree(context.getDocumentTree())
|
||||||
|
.build();
|
||||||
|
page.getMainBody().add(imageNode);
|
||||||
|
|
||||||
|
List<Integer> tocId = context.getDocumentTree().createNewChildEntryAndReturnId(section, imageNode);
|
||||||
|
imageNode.setTreeId(tocId);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private void addHeaderAndFooterToEachPage(ClassificationDocument document, Context context) {
|
||||||
|
|
||||||
|
Map<Integer, List<TextPageBlock>> headers = document.getHeaders()
|
||||||
|
.stream()
|
||||||
|
.map(ClassificationHeader::getTextBlocks)
|
||||||
|
.flatMap(List::stream)
|
||||||
|
.collect(groupingBy(AbstractPageBlock::getPage, toList()));
|
||||||
|
|
||||||
|
Map<Integer, List<TextPageBlock>> footers = document.getFooters()
|
||||||
|
.stream()
|
||||||
|
.map(ClassificationFooter::getTextBlocks)
|
||||||
|
.flatMap(List::stream)
|
||||||
|
.collect(groupingBy(AbstractPageBlock::getPage, toList()));
|
||||||
|
|
||||||
|
for (int pageIndex = 1; pageIndex <= document.getPages().size(); pageIndex++) {
|
||||||
|
if (headers.containsKey(pageIndex)) {
|
||||||
|
addHeader(headers.get(pageIndex), context);
|
||||||
|
} else {
|
||||||
|
addEmptyHeader(pageIndex, context);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int pageIndex = 1; pageIndex <= document.getPages().size(); pageIndex++) {
|
||||||
|
if (footers.containsKey(pageIndex)) {
|
||||||
|
addFooter(footers.get(pageIndex), context);
|
||||||
|
} else {
|
||||||
|
addEmptyFooter(pageIndex, context);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private void addFooter(List<TextPageBlock> textBlocks, Context context) {
|
||||||
|
|
||||||
|
Page page = context.getPage(textBlocks.get(0).getPage());
|
||||||
|
Footer footer = Footer.builder().documentTree(context.getDocumentTree()).build();
|
||||||
|
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(textBlocks),
|
||||||
|
footer,
|
||||||
|
context,
|
||||||
|
page);
|
||||||
|
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(footer);
|
||||||
|
footer.setTreeId(tocId);
|
||||||
|
footer.setLeafTextBlock(textBlock);
|
||||||
|
page.setFooter(footer);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void addHeader(List<TextPageBlock> textBlocks, Context context) {
|
||||||
|
|
||||||
|
Page page = context.getPage(textBlocks.get(0).getPage());
|
||||||
|
Header header = Header.builder().documentTree(context.getDocumentTree()).build();
|
||||||
|
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(textBlocks), header, 0, page);
|
||||||
|
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(header);
|
||||||
|
header.setTreeId(tocId);
|
||||||
|
header.setLeafTextBlock(textBlock);
|
||||||
|
page.setHeader(header);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private void addEmptyFooter(int pageIndex, Context context) {
|
||||||
|
|
||||||
|
Page page = context.getPage(pageIndex);
|
||||||
|
Footer footer = Footer.builder().documentTree(context.getDocumentTree()).build();
|
||||||
|
AtomicTextBlock textBlock = context.textBlockFactory.emptyTextBlock(footer, context, page);
|
||||||
|
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(footer);
|
||||||
|
footer.setTreeId(tocId);
|
||||||
|
footer.setLeafTextBlock(textBlock);
|
||||||
|
page.setFooter(footer);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private void addEmptyHeader(int pageIndex, Context context) {
|
||||||
|
|
||||||
|
Page page = context.getPage(pageIndex);
|
||||||
|
Header header = Header.builder().documentTree(context.getDocumentTree()).build();
|
||||||
|
AtomicTextBlock textBlock = context.textBlockFactory.emptyTextBlock(header, 0, page);
|
||||||
|
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(header);
|
||||||
|
header.setTreeId(tocId);
|
||||||
|
header.setLeafTextBlock(textBlock);
|
||||||
|
page.setHeader(header);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Getter
|
||||||
|
@Builder
|
||||||
|
@AllArgsConstructor
|
||||||
|
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||||
|
public final class Context {
|
||||||
|
|
||||||
|
DocumentTree documentTree;
|
||||||
|
Map<Page, Integer> pages;
|
||||||
|
List<Section> sections;
|
||||||
|
List<ClassifiedImage> images;
|
||||||
|
TextBlockFactory textBlockFactory;
|
||||||
|
|
||||||
|
|
||||||
|
public Context(Document document) {
|
||||||
|
|
||||||
|
documentTree = new DocumentTree(document);
|
||||||
|
pages = new HashMap<>();
|
||||||
|
sections = new LinkedList<>();
|
||||||
|
images = new LinkedList<>();
|
||||||
|
textBlockFactory = new TextBlockFactory();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void buildAndAddPageWithCounter(ClassificationPage classificationPage) {
|
||||||
|
|
||||||
|
Page page = Page.fromClassificationPage(classificationPage);
|
||||||
|
//this counter counts the TextBlocks per page
|
||||||
|
//initial value is set to 1, because 0 is reserved for Header
|
||||||
|
pages.put(page, 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public int getAndIncrementTextBlockNumberOnPage(Page page) {
|
||||||
|
|
||||||
|
Integer textBlockNumberOnPage = pages.get(page);
|
||||||
|
pages.merge(page, 1, Integer::sum);
|
||||||
|
return textBlockNumberOnPage;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public Page getPage(int pageIndex) {
|
||||||
|
|
||||||
|
return pages.keySet()
|
||||||
|
.stream()
|
||||||
|
.filter(page -> page.getNumber() == pageIndex)
|
||||||
|
.findFirst()
|
||||||
|
.orElseThrow(() -> new NoSuchElementException(format("ClassificationPage with number %d not found", pageIndex)));
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,33 @@
|
|||||||
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.factory;
|
||||||
|
|
||||||
|
import java.awt.geom.Rectangle2D;
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import lombok.AccessLevel;
|
||||||
|
import lombok.Builder;
|
||||||
|
import lombok.Getter;
|
||||||
|
import lombok.experimental.FieldDefaults;
|
||||||
|
|
||||||
|
@Builder
|
||||||
|
@Getter
|
||||||
|
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||||
|
public class SearchTextWithTextPositionDto {
|
||||||
|
|
||||||
|
String searchText;
|
||||||
|
List<Integer> lineBreaks;
|
||||||
|
List<Integer> stringCoordsToPositionCoords;
|
||||||
|
List<Rectangle2D> positions;
|
||||||
|
|
||||||
|
|
||||||
|
public static SearchTextWithTextPositionDto empty() {
|
||||||
|
|
||||||
|
return SearchTextWithTextPositionDto.builder()
|
||||||
|
.searchText("")
|
||||||
|
.lineBreaks(Collections.emptyList())
|
||||||
|
.positions(Collections.emptyList())
|
||||||
|
.stringCoordsToPositionCoords(Collections.emptyList())
|
||||||
|
.build();
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,185 @@
|
|||||||
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.factory;
|
||||||
|
|
||||||
|
import java.awt.geom.AffineTransform;
|
||||||
|
import java.awt.geom.Rectangle2D;
|
||||||
|
import java.util.LinkedList;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Objects;
|
||||||
|
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.RedTextPosition;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextDirection;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPositionSequence;
|
||||||
|
|
||||||
|
import lombok.experimental.UtilityClass;
|
||||||
|
|
||||||
|
@UtilityClass
|
||||||
|
public class SearchTextWithTextPositionFactory {
|
||||||
|
|
||||||
|
public final int HEIGHT_PADDING = 2;
|
||||||
|
// When checking for a hyphen linebreak, we need to check after a linebreak whether the last hyphen was fewer than three symbols away.
// We detect a linebreak either as a "\n" character or when the positions of two adjacent symbols differ in their y-coordinates by at least one character height.
// If there is a hyphen linebreak, the hyphen will be 1 position in front of a "\n" or 2 positions in front of the character which has a lower y-coordinate.
// This is why we need to initialize the last hyphen index to a value < -2; otherwise, if the very first symbol is a "\n", we would detect a hyphen linebreak that isn't there.
// Also, Integer.MIN_VALUE is a bad idea due to potential overflow during arithmetic operations. This is why the default should be -3.
|
||||||
|
public final int MAX_HYPHEN_LINEBREAK_DISTANCE = 3;
|
||||||
|
|
||||||
|
|
||||||
|
public SearchTextWithTextPositionDto buildSearchTextToTextPositionModel(List<TextPositionSequence> sequences) {
|
||||||
|
|
||||||
|
if (sequences.isEmpty() || sequences.stream().allMatch(sequence -> sequence.getTextPositions().isEmpty())) {
|
||||||
|
return SearchTextWithTextPositionDto.empty();
|
||||||
|
}
|
||||||
|
|
||||||
|
Context context = new Context();
|
||||||
|
|
||||||
|
RedTextPosition currentTextPosition = sequences.get(0).getTextPositions().get(0);
|
||||||
|
RedTextPosition previousTextPosition = RedTextPosition.builder().unicode(" ").position(currentTextPosition.getPosition()).build();
|
||||||
|
|
||||||
|
for (TextPositionSequence word : sequences) {
|
||||||
|
for (int i = 0; i < word.getTextPositions().size(); ++i) {
|
||||||
|
|
||||||
|
currentTextPosition = word.getTextPositions().get(i);
|
||||||
|
if (isLineBreak(currentTextPosition, previousTextPosition)) {
|
||||||
|
removeHyphenLinebreaks(context);
|
||||||
|
context.lineBreaksStringIdx.add(context.stringIdx);
|
||||||
|
}
|
||||||
|
if (!isRepeatedWhitespace(currentTextPosition.getUnicode(), previousTextPosition.getUnicode())) {
|
||||||
|
if (isHyphen(currentTextPosition.getUnicode())) {
|
||||||
|
context.lastHyphenIdx = context.stringIdx;
|
||||||
|
}
|
||||||
|
appendCurrentTextPosition(context, currentTextPosition);
|
||||||
|
}
|
||||||
|
|
||||||
|
previousTextPosition = currentTextPosition;
|
||||||
|
++context.positionIdx;
|
||||||
|
}
|
||||||
|
|
||||||
|
previousTextPosition = RedTextPosition.builder().unicode(" ").position(previousTextPosition.getPosition()).build();
|
||||||
|
context.stringBuilder.append(" ");
|
||||||
|
context.stringIdxToPositionIdx.add(context.positionIdx);
|
||||||
|
++context.stringIdx;
|
||||||
|
}
|
||||||
|
|
||||||
|
assert context.stringBuilder.length() == context.stringIdxToPositionIdx.size();
|
||||||
|
|
||||||
|
List<Rectangle2D> positions = sequences.stream()
|
||||||
|
.flatMap(sequence -> sequence.getTextPositions().stream().map(textPosition -> mapRedTextPositionToInitialUserSpace(textPosition, sequence)))
|
||||||
|
.toList();
|
||||||
|
|
||||||
|
return SearchTextWithTextPositionDto.builder()
|
||||||
|
.searchText(context.stringBuilder.toString())
|
||||||
|
.lineBreaks(context.lineBreaksStringIdx)
|
||||||
|
.stringCoordsToPositionCoords(context.stringIdxToPositionIdx)
|
||||||
|
.positions(positions)
|
||||||
|
.build();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private void appendCurrentTextPosition(Context context, RedTextPosition currentTextPosition) {
|
||||||
|
|
||||||
|
context.stringBuilder.append(currentTextPosition.getUnicode());
|
||||||
|
|
||||||
|
// unicode characters with more than 16-bit encoding have a length > 1 in java strings
|
||||||
|
for (int j = 0; j < currentTextPosition.getUnicode().length(); j++) {
|
||||||
|
context.stringIdxToPositionIdx.add(context.positionIdx);
|
||||||
|
}
|
||||||
|
context.stringIdx += currentTextPosition.getUnicode().length();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private void removeHyphenLinebreaks(Context context) {
|
||||||
|
|
||||||
|
if (lastHyphenDirectlyBeforeLineBreak(context)) {
|
||||||
|
context.stringBuilder.delete(context.lastHyphenIdx, context.stringBuilder.length());
|
||||||
|
context.stringIdxToPositionIdx = context.stringIdxToPositionIdx.subList(0, context.lastHyphenIdx);
|
||||||
|
context.stringIdx = context.lastHyphenIdx;
|
||||||
|
context.lastHyphenIdx = -MAX_HYPHEN_LINEBREAK_DISTANCE;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private boolean lastHyphenDirectlyBeforeLineBreak(Context context) {
|
||||||
|
|
||||||
|
return context.stringIdx - context.lastHyphenIdx < MAX_HYPHEN_LINEBREAK_DISTANCE;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private boolean isLineBreak(RedTextPosition currentTextPosition, RedTextPosition previousTextPosition) {
|
||||||
|
|
||||||
|
return Objects.equals(currentTextPosition.getUnicode(), "\n") || isDeltaYLargerThanTextHeight(currentTextPosition, previousTextPosition);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private boolean isDeltaYLargerThanTextHeight(RedTextPosition currentPosition, RedTextPosition previousPosition) {
|
||||||
|
|
||||||
|
if (previousPosition == null) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
float deltaY = Math.abs(currentPosition.getYDirAdj() - previousPosition.getYDirAdj());
|
||||||
|
return deltaY >= currentPosition.getHeightDir();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private boolean isRepeatedWhitespace(String currentUnicode, String previousUnicode) {
|
||||||
|
|
||||||
|
return Objects.equals(previousUnicode, " ") && Objects.equals(currentUnicode, " ");
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private boolean isHyphen(String unicodeCharacter) {
|
||||||
|
|
||||||
|
return Objects.equals(unicodeCharacter, "-") || //
|
||||||
|
Objects.equals(unicodeCharacter, "~") || //
|
||||||
|
Objects.equals(unicodeCharacter, "‐") || //
|
||||||
|
Objects.equals(unicodeCharacter, "‒") || //
|
||||||
|
Objects.equals(unicodeCharacter, "⁻") || //
|
||||||
|
Objects.equals(unicodeCharacter, "−") || //
|
||||||
|
Objects.equals(unicodeCharacter, "﹣") || //
|
||||||
|
Objects.equals(unicodeCharacter, "゠") || //
|
||||||
|
Objects.equals(unicodeCharacter, "⁓") || //
|
||||||
|
Objects.equals(unicodeCharacter, "‑") || //
|
||||||
|
Objects.equals(unicodeCharacter, "\u00AD");
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private Rectangle2D mapRedTextPositionToInitialUserSpace(RedTextPosition textPosition, TextPositionSequence sequence) {
|
||||||
|
|
||||||
|
float textHeight = sequence.getTextHeight() + HEIGHT_PADDING;
|
||||||
|
Rectangle2D rectangle2D = new Rectangle2D.Double(textPosition.getXDirAdj(),
|
||||||
|
textPosition.getYDirAdj() - textHeight,
|
||||||
|
textPosition.getWidthDirAdj(),
|
||||||
|
textHeight + HEIGHT_PADDING);
|
||||||
|
|
||||||
|
AffineTransform transform = new AffineTransform();
|
||||||
|
|
||||||
|
if (sequence.getDir() == TextDirection.ZERO || sequence.getDir() == TextDirection.HALF_CIRCLE) {
|
||||||
|
transform.rotate(sequence.getDir().getRadians(), sequence.getPageWidth() / 2f, sequence.getPageHeight() / 2f);
|
||||||
|
transform.translate(0f, sequence.getPageHeight());
|
||||||
|
} else if (sequence.getDir() == TextDirection.QUARTER_CIRCLE) {
|
||||||
|
transform.rotate(sequence.getDir().getRadians(), sequence.getPageWidth() / 2f, sequence.getPageWidth() / 2f);
|
||||||
|
transform.translate(0f, sequence.getPageWidth());
|
||||||
|
} else {
|
||||||
|
transform.rotate(sequence.getDir().getRadians(), sequence.getPageHeight() / 2f, sequence.getPageHeight() / 2f);
|
||||||
|
transform.translate(0f, sequence.getPageWidth());
|
||||||
|
}
|
||||||
|
transform.scale(1., -1.);
|
||||||
|
|
||||||
|
return transform.createTransformedShape(rectangle2D).getBounds2D();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private class Context {
|
||||||
|
|
||||||
|
List<Integer> stringIdxToPositionIdx = new LinkedList<>();
|
||||||
|
List<Integer> lineBreaksStringIdx = new LinkedList<>();
|
||||||
|
StringBuilder stringBuilder = new StringBuilder();
|
||||||
|
|
||||||
|
int stringIdx;
|
||||||
|
int positionIdx;
|
||||||
|
|
||||||
|
int lastHyphenIdx = -MAX_HYPHEN_LINEBREAK_DISTANCE;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,183 @@
|
|||||||
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.factory;
|
||||||
|
|
||||||
|
import static java.lang.String.format;
|
||||||
|
import static java.util.Collections.emptyList;
|
||||||
|
import static java.util.stream.Collectors.groupingBy;
|
||||||
|
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.LinkedList;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.AbstractPageBlock;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.image.ClassifiedImage;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table.TablePageBlock;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPageBlock;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.GenericSemanticNode;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Page;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Section;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.utils.TableMergingUtility;
|
||||||
|
|
||||||
|
import lombok.experimental.UtilityClass;
|
||||||
|
|
||||||
|
@UtilityClass
|
||||||
|
public class SectionNodeFactory {
|
||||||
|
|
||||||
|
public void addSection(GenericSemanticNode parentNode, List<AbstractPageBlock> pageBlocks, List<ClassifiedImage> images, DocumentGraphFactory.Context context) {
|
||||||
|
|
||||||
|
if (pageBlocks.isEmpty()) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
Map<Integer, List<AbstractPageBlock>> blocksPerPage = pageBlocks.stream().collect(groupingBy(AbstractPageBlock::getPage));
|
||||||
|
Section section = Section.builder().documentTree(context.getDocumentTree()).build();
|
||||||
|
|
||||||
|
context.getSections().add(section);
|
||||||
|
blocksPerPage.keySet().forEach(pageNumber -> addSectionNodeToPageNode(context, section, pageNumber));
|
||||||
|
|
||||||
|
section.setTreeId(getTreeId(parentNode, context, section));
|
||||||
|
|
||||||
|
addFirstHeadlineDirectlyToSection(pageBlocks, context, section);
|
||||||
|
if (containsTablesAndTextBlocks(pageBlocks)) {
|
||||||
|
splitPageBlocksIntoSubSections(pageBlocks).forEach(subSectionPageBlocks -> addSection(section, subSectionPageBlocks, emptyList(), context));
|
||||||
|
} else {
|
||||||
|
addTablesAndParagraphsAndHeadlinesToSection(pageBlocks, context, section);
|
||||||
|
}
|
||||||
|
|
||||||
|
images.stream().distinct().forEach(image -> DocumentGraphFactory.addImage(section, image, context));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private List<Integer> getTreeId(GenericSemanticNode parentNode, DocumentGraphFactory.Context context, Section section) {
|
||||||
|
|
||||||
|
if (parentNode == null) {
|
||||||
|
return context.getDocumentTree().createNewMainEntryAndReturnId(section);
|
||||||
|
} else {
|
||||||
|
return context.getDocumentTree().createNewChildEntryAndReturnId(parentNode, section);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private void addFirstHeadlineDirectlyToSection(List<AbstractPageBlock> pageBlocks, DocumentGraphFactory.Context context, Section section) {
|
||||||
|
|
||||||
|
if (pageBlocks.get(0).isHeadline()) {
|
||||||
|
addTablesAndParagraphsAndHeadlinesToSection(List.of(pageBlocks.get(0)), context, section);
|
||||||
|
pageBlocks.remove(0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private void addTablesAndParagraphsAndHeadlinesToSection(List<AbstractPageBlock> pageBlocks, DocumentGraphFactory.Context context, Section section) {
|
||||||
|
|
||||||
|
Set<AbstractPageBlock> alreadyMerged = new HashSet<>();
|
||||||
|
List<AbstractPageBlock> remainingBlocks = new LinkedList<>(pageBlocks);
|
||||||
|
for (AbstractPageBlock abstractPageBlock : pageBlocks) {
|
||||||
|
|
||||||
|
if (alreadyMerged.contains(abstractPageBlock)) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
remainingBlocks.removeAll(alreadyMerged);
|
||||||
|
|
||||||
|
if (abstractPageBlock instanceof TextPageBlock) {
|
||||||
|
List<TextPageBlock> textBlocks = findTextBlocksWithSameClassificationAndAlignsY(abstractPageBlock, remainingBlocks);
|
||||||
|
alreadyMerged.addAll(textBlocks);
|
||||||
|
DocumentGraphFactory.addParagraphOrHeadline(section, (TextPageBlock) abstractPageBlock, context, textBlocks);
|
||||||
|
} else if (abstractPageBlock instanceof TablePageBlock tablePageBlock) {
|
||||||
|
List<TablePageBlock> tablesToMerge = TableMergingUtility.findConsecutiveTablesWithSameColCountAndSameHeaders(tablePageBlock, remainingBlocks);
|
||||||
|
alreadyMerged.addAll(tablesToMerge);
|
||||||
|
TableNodeFactory.addTable(section, tablesToMerge, context);
|
||||||
|
} else {
|
||||||
|
throw new RuntimeException(format("Unhandled AbstractPageBlockType %s!", abstractPageBlock.getClass()));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private boolean containsTablesAndTextBlocks(List<AbstractPageBlock> pageBlocks) {
|
||||||
|
|
||||||
|
return pageBlocks.stream().anyMatch(pageBlock -> pageBlock instanceof TablePageBlock) && pageBlocks.stream().anyMatch(pageBlock -> pageBlock instanceof TextPageBlock);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This function splits the list of PageBlocks around TablePageBlocks, such that SubSections can be created, that don't include tables.
|
||||||
|
* This is needed so we can execute rules on sections, that do not contain tables.
|
||||||
|
* See: <a href="https://knecon.atlassian.net/wiki/spaces/RED/pages/14765218/Document+Structure">document structure wiki</a>
|
||||||
|
*
|
||||||
|
* @param pageBlocks a List of AbstractPageBlocks, which have at least one TablePageBlock and one ClassificationTextBlock
|
||||||
|
* @return List of Lists of AbstractPageBlocks, which include either a single Headline ClassificationTextBlock and a TablePageBlock or only ClassificationTextBlocks.
|
||||||
|
*/
|
||||||
|
private List<List<AbstractPageBlock>> splitPageBlocksIntoSubSections(List<AbstractPageBlock> pageBlocks) {
|
||||||
|
|
||||||
|
List<List<AbstractPageBlock>> splitList = splitIntoCoherentList(pageBlocks);
|
||||||
|
movePrecedingHeadlineToTableList(splitList);
|
||||||
|
return splitList.stream().filter(list -> !list.isEmpty()).toList();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private void movePrecedingHeadlineToTableList(List<List<AbstractPageBlock>> splitList) {
|
||||||
|
|
||||||
|
for (int i = 0; i < splitList.size(); i++) {
|
||||||
|
if (listIsTablesOnly(splitList.get(i)) && i > 0) {
|
||||||
|
List<AbstractPageBlock> previousList = splitList.get(i - 1);
|
||||||
|
AbstractPageBlock lastPageBlockInPreviousList = previousList.get(previousList.size() - 1);
|
||||||
|
if (lastPageBlockInPreviousList.isHeadline()) {
|
||||||
|
previousList.remove(i - 1);
|
||||||
|
splitList.get(i).add(0, lastPageBlockInPreviousList);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private boolean listIsTablesOnly(List<AbstractPageBlock> abstractPageBlocks) {
|
||||||
|
|
||||||
|
return abstractPageBlocks.stream().allMatch(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @param pageBlocks a List of AbstractPageBlocks, which have at least one TablePageBlock and one ClassificationTextBlock
|
||||||
|
* @return List of Lists of AbstractPageBlocks, which are exclusively of type ClassificationTextBlock or TablePageBlock
|
||||||
|
*/
|
||||||
|
private List<List<AbstractPageBlock>> splitIntoCoherentList(List<AbstractPageBlock> pageBlocks) {
|
||||||
|
|
||||||
|
List<List<AbstractPageBlock>> splitList = new LinkedList<>();
|
||||||
|
List<AbstractPageBlock> currentList = new LinkedList<>();
|
||||||
|
splitList.add(currentList);
|
||||||
|
|
||||||
|
Class<? extends AbstractPageBlock> lastPageBlockClass = pageBlocks.get(0).getClass();
|
||||||
|
for (AbstractPageBlock pageBlock : pageBlocks) {
|
||||||
|
if (lastPageBlockClass.isInstance(pageBlock)) {
|
||||||
|
currentList.add(pageBlock);
|
||||||
|
} else {
|
||||||
|
currentList = new LinkedList<>();
|
||||||
|
currentList.add(pageBlock);
|
||||||
|
splitList.add(currentList);
|
||||||
|
lastPageBlockClass = pageBlock.getClass();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return splitList;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private List<TextPageBlock> findTextBlocksWithSameClassificationAndAlignsY(AbstractPageBlock atc, List<AbstractPageBlock> pageBlocks) {
|
||||||
|
|
||||||
|
return pageBlocks.stream()
|
||||||
|
.filter(abstractTextContainer -> !abstractTextContainer.equals(atc))
|
||||||
|
.filter(abstractTextContainer -> abstractTextContainer.getPage() == atc.getPage())
|
||||||
|
.filter(abstractTextContainer -> abstractTextContainer instanceof TextPageBlock)
|
||||||
|
.filter(abstractTextContainer -> abstractTextContainer.intersectsY(atc))
|
||||||
|
.map(abstractTextContainer -> (TextPageBlock) abstractTextContainer)
|
||||||
|
.toList();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private void addSectionNodeToPageNode(DocumentGraphFactory.Context context, Section section, Integer pageNumber) {
|
||||||
|
|
||||||
|
Page page = context.getPage(pageNumber);
|
||||||
|
page.getMainBody().add(section);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,136 @@
|
|||||||
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.factory;
|
||||||
|
|
||||||
|
import static java.util.Collections.emptyList;
|
||||||
|
|
||||||
|
import java.util.Collection;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.AbstractPageBlock;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table.Cell;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table.TablePageBlock;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPositionSequence;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.GenericSemanticNode;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Page;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.SemanticNode;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Table;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.TableCell;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.textblock.TextBlock;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.utils.TextPositionOperations;
|
||||||
|
|
||||||
|
import lombok.experimental.UtilityClass;
|
||||||
|
|
||||||
|
@UtilityClass
|
||||||
|
public class TableNodeFactory {
|
||||||
|
|
||||||
|
public final double TABLE_CELL_MERGE_CONTENTS_SIZE_THRESHOLD = 0.05;
|
||||||
|
|
||||||
|
|
||||||
|
public void addTable(GenericSemanticNode parentNode, List<TablePageBlock> tablesToMerge, DocumentGraphFactory.Context context) {
|
||||||
|
|
||||||
|
setPageNumberInCells(tablesToMerge);
|
||||||
|
Set<Page> pages = tablesToMerge.stream().map(AbstractPageBlock::getPage).map(context::getPage).collect(Collectors.toSet());
|
||||||
|
List<List<Cell>> mergedRows = tablesToMerge.stream().map(TablePageBlock::getRows).flatMap(Collection::stream).toList();
|
||||||
|
Table table = Table.builder().documentTree(context.getDocumentTree()).numberOfCols(mergedRows.get(0).size()).numberOfRows(mergedRows.size()).build();
|
||||||
|
|
||||||
|
pages.forEach(page -> addTableToPage(page, parentNode, table));
|
||||||
|
|
||||||
|
List<Integer> treeId = context.getDocumentTree().createNewChildEntryAndReturnId(parentNode, table);
|
||||||
|
table.setTreeId(treeId);
|
||||||
|
addTableCells(mergedRows, table, context);
|
||||||
|
|
||||||
|
ifTableHasNoHeadersSetFirstRowAsHeaders(table);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private void setPageNumberInCells(List<TablePageBlock> tablesToMerge) {
|
||||||
|
|
||||||
|
// For some reason I can't figure out, in some table cells, the ClassificationTextBlocks have 0 as page number
|
||||||
|
// So I am fixing this here, but this should actually be fixed upstream.
|
||||||
|
tablesToMerge.forEach(table -> table.getRows()
|
||||||
|
.stream()
|
||||||
|
.flatMap(Collection::stream)
|
||||||
|
.peek(cell -> cell.setPageNumber(table.getPage()))
|
||||||
|
.forEach(cell -> setPageNumberInTextBlocksWithPageNumberSetTo0(table, cell)));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private void setPageNumberInTextBlocksWithPageNumberSetTo0(TablePageBlock table, Cell cell) {
|
||||||
|
|
||||||
|
cell.getTextBlocks().stream()//
|
||||||
|
.filter(tb -> tb.getPage() == 0)//
|
||||||
|
.forEach(tb -> tb.setPage(table.getPage()));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@SuppressWarnings("PMD.UnusedPrivateMethod") // PMD actually flags this wrong
|
||||||
|
private void addTableToPage(Page page, SemanticNode parentNode, Table table) {
|
||||||
|
|
||||||
|
if (!page.getMainBody().contains(parentNode)) {
|
||||||
|
parentNode.getPages().add(page);
|
||||||
|
}
|
||||||
|
|
||||||
|
page.getMainBody().add(table);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private void ifTableHasNoHeadersSetFirstRowAsHeaders(Table table) {
|
||||||
|
|
||||||
|
if (table.streamHeaders().findAny().isEmpty()) {
|
||||||
|
table.streamRow(0).forEach(tableCellNode -> tableCellNode.setHeader(true));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private void addTableCells(List<List<Cell>> rows, Table table, DocumentGraphFactory.Context context) {
|
||||||
|
|
||||||
|
for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) {
|
||||||
|
for (int colIndex = 0; colIndex < rows.get(rowIndex).size(); colIndex++) {
|
||||||
|
addTableCell(rows.get(rowIndex).get(colIndex), rowIndex, colIndex, table, context);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@SuppressWarnings("PMD.UnusedPrivateMethod") // PMD actually flags this wrong
|
||||||
|
private void addTableCell(Cell cell, int rowIndex, int colIndex, Table tableNode, DocumentGraphFactory.Context context) {
|
||||||
|
|
||||||
|
Page page = context.getPage(cell.getPageNumber());
|
||||||
|
|
||||||
|
TableCell tableCell = TableCell.builder().documentTree(context.getDocumentTree()).row(rowIndex).col(colIndex).header(cell.isHeaderCell()).bBox(cell.getBounds2D()).build();
|
||||||
|
page.getMainBody().add(tableCell);
|
||||||
|
|
||||||
|
List<Integer> treeId = context.getDocumentTree().createNewTableChildEntryAndReturnId(tableNode, tableCell);
|
||||||
|
tableCell.setTreeId(treeId);
|
||||||
|
|
||||||
|
TextBlock textBlock;
|
||||||
|
if (cell.getTextBlocks().isEmpty()) {
|
||||||
|
tableCell.setLeafTextBlock(context.getTextBlockFactory().emptyTextBlock(tableNode, context, page));
|
||||||
|
} else if (cell.getTextBlocks().size() == 1) {
|
||||||
|
textBlock = context.getTextBlockFactory().buildAtomicTextBlock(cell.getTextBlocks().get(0).getSequences(), tableCell, context, page);
|
||||||
|
tableCell.setLeafTextBlock(textBlock);
|
||||||
|
} else if (firstTextBlockIsHeadline(cell)) {
|
||||||
|
SectionNodeFactory.addSection(tableCell, cell.getTextBlocks().stream().map(tb -> (AbstractPageBlock) tb).toList(), emptyList(), context);
|
||||||
|
} else if (cellAreaIsSmallerThanPageAreaTimesThreshold(cell, page)) {
|
||||||
|
List<TextPositionSequence> sequences = TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(cell.getTextBlocks());
|
||||||
|
textBlock = context.getTextBlockFactory().buildAtomicTextBlock(sequences, tableCell, context, page);
|
||||||
|
tableCell.setLeafTextBlock(textBlock);
|
||||||
|
} else {
|
||||||
|
cell.getTextBlocks().forEach(tb -> DocumentGraphFactory.addParagraphOrHeadline(tableCell, tb, context, emptyList()));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private boolean cellAreaIsSmallerThanPageAreaTimesThreshold(Cell cell, Page page) {
|
||||||
|
|
||||||
|
return cell.getArea() < TABLE_CELL_MERGE_CONTENTS_SIZE_THRESHOLD * page.getHeight() * page.getWidth();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private boolean firstTextBlockIsHeadline(Cell cell) {
|
||||||
|
|
||||||
|
return cell.getTextBlocks().get(0).isHeadline();
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,53 @@
|
|||||||
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.factory;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPositionSequence;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Page;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.SemanticNode;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.textblock.AtomicTextBlock;
|
||||||
|
|
||||||
|
import lombok.AccessLevel;
|
||||||
|
import lombok.experimental.FieldDefaults;
|
||||||
|
|
||||||
|
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||||
|
public class TextBlockFactory {
|
||||||
|
|
||||||
|
int stringOffset;
|
||||||
|
long textBlockIdx;
|
||||||
|
|
||||||
|
|
||||||
|
public AtomicTextBlock buildAtomicTextBlock(List<TextPositionSequence> sequences, SemanticNode parent, DocumentGraphFactory.Context context, Page page) {
|
||||||
|
|
||||||
|
Integer numberOnPage = context.getAndIncrementTextBlockNumberOnPage(page);
|
||||||
|
return buildAtomicTextBlock(sequences, parent, numberOnPage, page);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public AtomicTextBlock buildAtomicTextBlock(List<TextPositionSequence> sequences, SemanticNode parent, Integer numberOnPage, Page page) {
|
||||||
|
|
||||||
|
SearchTextWithTextPositionDto searchTextWithTextPositionDto = SearchTextWithTextPositionFactory.buildSearchTextToTextPositionModel(sequences);
|
||||||
|
int offset = stringOffset;
|
||||||
|
stringOffset += searchTextWithTextPositionDto.getSearchText().length();
|
||||||
|
long idx = textBlockIdx;
|
||||||
|
textBlockIdx++;
|
||||||
|
return AtomicTextBlock.fromSearchTextWithTextPositionDto(searchTextWithTextPositionDto, parent, offset, idx, numberOnPage, page);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public AtomicTextBlock emptyTextBlock(SemanticNode parent, DocumentGraphFactory.Context context, Page page) {
|
||||||
|
|
||||||
|
long idx = textBlockIdx;
|
||||||
|
textBlockIdx++;
|
||||||
|
return AtomicTextBlock.empty(idx, stringOffset, page, context.getAndIncrementTextBlockNumberOnPage(page), parent);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public AtomicTextBlock emptyTextBlock(SemanticNode parent, Integer numberOnPage, Page page) {
|
||||||
|
|
||||||
|
long idx = textBlockIdx;
|
||||||
|
textBlockIdx++;
|
||||||
|
return AtomicTextBlock.empty(idx, stringOffset, page, numberOnPage, parent);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,163 @@
|
|||||||
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph;
|
||||||
|
|
||||||
|
import static java.lang.String.format;
|
||||||
|
|
||||||
|
import java.util.Collection;
|
||||||
|
import java.util.LinkedList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.textblock.TextBlock;
|
||||||
|
|
||||||
|
import lombok.EqualsAndHashCode;
|
||||||
|
import lombok.Setter;
|
||||||
|
|
||||||
|
@Setter
|
||||||
|
@EqualsAndHashCode
|
||||||
|
public class Boundary implements Comparable<Boundary> {
|
||||||
|
|
||||||
|
private int start;
|
||||||
|
private int end;
|
||||||
|
|
||||||
|
|
||||||
|
public Boundary(int start, int end) {
|
||||||
|
|
||||||
|
if (start > end) {
|
||||||
|
throw new IllegalArgumentException(format("start: %d > end: %d", start, end));
|
||||||
|
}
|
||||||
|
this.start = start;
|
||||||
|
this.end = end;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public int length() {
|
||||||
|
|
||||||
|
return end - start;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public int start() {
|
||||||
|
|
||||||
|
return start;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public int end() {
|
||||||
|
|
||||||
|
return end;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean contains(Boundary boundary) {
|
||||||
|
|
||||||
|
return start <= boundary.start() && boundary.end() <= end;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean containedBy(Boundary boundary) {
|
||||||
|
|
||||||
|
return boundary.contains(this);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean contains(int start, int end) {
|
||||||
|
|
||||||
|
if (start > end) {
|
||||||
|
throw new IllegalArgumentException(format("start: %d > end: %d", start, end));
|
||||||
|
}
|
||||||
|
return this.start <= start && end <= this.end;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean containedBy(int start, int end) {
|
||||||
|
|
||||||
|
if (start > end) {
|
||||||
|
throw new IllegalArgumentException(format("start: %d > end: %d", start, end));
|
||||||
|
}
|
||||||
|
return start <= this.start && this.end <= end;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean contains(int index) {
|
||||||
|
|
||||||
|
return start <= index && index < end;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean intersects(Boundary boundary) {
|
||||||
|
|
||||||
|
return boundary.start() < this.end && this.start < boundary.end();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public List<Boundary> split(List<Integer> splitIndices) {
|
||||||
|
|
||||||
|
if (splitIndices.stream().anyMatch(idx -> !this.contains(idx))) {
|
||||||
|
throw new IndexOutOfBoundsException(format("%s splitting indices are out of range for %s", splitIndices.stream().filter(idx -> !this.contains(idx)).toList(), this));
|
||||||
|
}
|
||||||
|
List<Boundary> splitBoundaries = new LinkedList<>();
|
||||||
|
int previousIndex = start;
|
||||||
|
for (int splitIndex : splitIndices) {
|
||||||
|
|
||||||
|
// skip split if it would produce a boundary of length 0
|
||||||
|
if (splitIndex == previousIndex) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
splitBoundaries.add(new Boundary(previousIndex, splitIndex));
|
||||||
|
previousIndex = splitIndex;
|
||||||
|
}
|
||||||
|
splitBoundaries.add(new Boundary(previousIndex, end));
|
||||||
|
return splitBoundaries;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public static Boundary merge(Collection<Boundary> boundaries) {
|
||||||
|
|
||||||
|
int minStart = boundaries.stream().mapToInt(Boundary::start).min().orElseThrow(IllegalArgumentException::new);
|
||||||
|
int maxEnd = boundaries.stream().mapToInt(Boundary::end).max().orElseThrow(IllegalArgumentException::new);
|
||||||
|
return new Boundary(minStart, maxEnd);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
|
||||||
|
return format("Boundary [%d|%d)", start, end);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int compareTo(Boundary boundary) {
|
||||||
|
|
||||||
|
if (end < boundary.end() && start < boundary.start()) {
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
if (start > boundary.start() && end > boundary.end()) {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* shrinks the boundary, such that textBlock.subSequence(boundary) returns a string without whitespaces.
|
||||||
|
*
|
||||||
|
* @param textBlock TextBlock to check whitespaces against
|
||||||
|
* @return boundary
|
||||||
|
*/
|
||||||
|
public Boundary trim(TextBlock textBlock) {
|
||||||
|
|
||||||
|
int trimmedStart = this.start;
|
||||||
|
while (Character.isWhitespace(textBlock.charAt(trimmedStart))) {
|
||||||
|
trimmedStart++;
|
||||||
|
}
|
||||||
|
|
||||||
|
int trimmedEnd = this.end;
|
||||||
|
while (Character.isWhitespace(textBlock.charAt(trimmedEnd - 1))) {
|
||||||
|
trimmedEnd--;
|
||||||
|
}
|
||||||
|
|
||||||
|
return new Boundary(trimmedStart, Math.max(trimmedEnd, trimmedStart));
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,217 @@
|
|||||||
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph;
|
||||||
|
|
||||||
|
import static java.lang.String.format;
|
||||||
|
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.LinkedList;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Document;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.GenericSemanticNode;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.NodeType;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.SemanticNode;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Table;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.TableCell;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.textblock.TextBlock;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.textblock.TextBlockCollector;
|
||||||
|
|
||||||
|
import lombok.AccessLevel;
|
||||||
|
import lombok.AllArgsConstructor;
|
||||||
|
import lombok.Builder;
|
||||||
|
import lombok.Data;
|
||||||
|
import lombok.EqualsAndHashCode;
|
||||||
|
import lombok.Getter;
|
||||||
|
import lombok.experimental.FieldDefaults;
|
||||||
|
|
||||||
|
@Data
|
||||||
|
@EqualsAndHashCode
|
||||||
|
public class DocumentTree {
|
||||||
|
|
||||||
|
private final Entry root;
|
||||||
|
|
||||||
|
|
||||||
|
public DocumentTree(Document document) {
|
||||||
|
|
||||||
|
root = Entry.builder().treeId(Collections.emptyList()).children(new LinkedList<>()).node(document).build();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public TextBlock buildTextBlock() {
|
||||||
|
|
||||||
|
return allEntriesInOrder().map(Entry::getNode).filter(SemanticNode::isLeaf).map(SemanticNode::getLeafTextBlock).collect(new TextBlockCollector());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public List<Integer> createNewMainEntryAndReturnId(GenericSemanticNode node) {
|
||||||
|
|
||||||
|
return createNewChildEntryAndReturnIdImpl(Collections.emptyList(), node);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public List<Integer> createNewChildEntryAndReturnId(GenericSemanticNode parentNode, GenericSemanticNode node) {
|
||||||
|
|
||||||
|
return createNewChildEntryAndReturnIdImpl(parentNode.getTreeId(), node);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public List<Integer> createNewChildEntryAndReturnId(GenericSemanticNode parentNode, Table node) {
|
||||||
|
|
||||||
|
return createNewChildEntryAndReturnIdImpl(parentNode.getTreeId(), node);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public List<Integer> createNewTableChildEntryAndReturnId(Table parentTable, TableCell tableCell) {
|
||||||
|
|
||||||
|
return createNewChildEntryAndReturnIdImpl(parentTable.getTreeId(), tableCell);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@SuppressWarnings("PMD.UnusedPrivateMethod") // PMD actually flags this wrong
|
||||||
|
private List<Integer> createNewChildEntryAndReturnIdImpl(List<Integer> parentId, SemanticNode node) {
|
||||||
|
|
||||||
|
if (!entryExists(parentId)) {
|
||||||
|
throw new IllegalArgumentException(format("parentId %s does not exist!", parentId));
|
||||||
|
}
|
||||||
|
|
||||||
|
Entry parent = getEntryById(parentId);
|
||||||
|
List<Integer> newId = new LinkedList<>(parentId);
|
||||||
|
newId.add(parent.children.size());
|
||||||
|
parent.children.add(Entry.builder().treeId(newId).node(node).build());
|
||||||
|
|
||||||
|
return newId;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private boolean entryExists(List<Integer> treeId) {
|
||||||
|
|
||||||
|
if (treeId.isEmpty()) {
|
||||||
|
return root != null;
|
||||||
|
}
|
||||||
|
Entry entry = root.children.get(treeId.get(0));
|
||||||
|
for (int id : treeId.subList(1, treeId.size())) {
|
||||||
|
if (id >= entry.children.size() || 0 > id) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
entry = entry.children.get(id);
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public Entry getParentEntryById(List<Integer> treeId) {
|
||||||
|
|
||||||
|
return getEntryById(getParentId(treeId));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean hasParentById(List<Integer> treeId) {
|
||||||
|
|
||||||
|
return !treeId.isEmpty();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public Stream<SemanticNode> childNodes(List<Integer> treeId) {
|
||||||
|
|
||||||
|
return getEntryById(treeId).children.stream().map(Entry::getNode);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public Stream<SemanticNode> childNodesOfType(List<Integer> treeId, NodeType nodeType) {
|
||||||
|
|
||||||
|
return getEntryById(treeId).children.stream().filter(entry -> entry.node.getType().equals(nodeType)).map(Entry::getNode);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static List<Integer> getParentId(List<Integer> treeId) {
|
||||||
|
|
||||||
|
if (treeId.isEmpty()) {
|
||||||
|
throw new UnsupportedOperationException("Root has no parent!");
|
||||||
|
}
|
||||||
|
if (treeId.size() < 2) {
|
||||||
|
return Collections.emptyList();
|
||||||
|
}
|
||||||
|
return treeId.subList(0, treeId.size() - 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public Entry getEntryById(List<Integer> treeId) {
|
||||||
|
|
||||||
|
if (treeId.isEmpty()) {
|
||||||
|
return root;
|
||||||
|
}
|
||||||
|
Entry entry = root.children.get(treeId.get(0));
|
||||||
|
for (int id : treeId.subList(1, treeId.size())) {
|
||||||
|
entry = entry.children.get(id);
|
||||||
|
}
|
||||||
|
return entry;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public Stream<Entry> mainEntries() {
|
||||||
|
|
||||||
|
return root.children.stream();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public Stream<Entry> allEntriesInOrder() {
|
||||||
|
|
||||||
|
return Stream.of(root).flatMap(DocumentTree::flatten);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public Stream<Entry> allSubEntriesInOrder(List<Integer> parentId) {
|
||||||
|
|
||||||
|
return getEntryById(parentId).children.stream().flatMap(DocumentTree::flatten);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
|
||||||
|
return String.join("\n", allEntriesInOrder().map(Entry::toString).toList());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static Stream<Entry> flatten(Entry entry) {
|
||||||
|
|
||||||
|
return Stream.concat(Stream.of(entry), entry.children.stream().flatMap(DocumentTree::flatten));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public SemanticNode getHighestParentById(List<Integer> treeId) {
|
||||||
|
|
||||||
|
if (treeId.isEmpty()) {
|
||||||
|
return root.node;
|
||||||
|
}
|
||||||
|
return root.children.get(treeId.get(0)).node;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Builder
|
||||||
|
@Getter
|
||||||
|
@AllArgsConstructor
|
||||||
|
@FieldDefaults(level = AccessLevel.PRIVATE, makeFinal = true)
|
||||||
|
public static class Entry {
|
||||||
|
|
||||||
|
List<Integer> treeId;
|
||||||
|
SemanticNode node;
|
||||||
|
@Builder.Default
|
||||||
|
List<Entry> children = new LinkedList<>();
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
|
||||||
|
return node.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public NodeType getType() {
|
||||||
|
|
||||||
|
return node.getType();
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -1,4 +1,4 @@
|
|||||||
package com.iqser.red.service.redaction.v1.server.redaction.model;
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.entity;
|
||||||
|
|
||||||
public enum EntityType {
|
public enum EntityType {
|
||||||
ENTITY,
|
ENTITY,
|
||||||
@ -0,0 +1,229 @@
|
|||||||
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.entity;
|
||||||
|
|
||||||
|
import java.awt.geom.Rectangle2D;
|
||||||
|
import java.util.Collection;
|
||||||
|
import java.util.Comparator;
|
||||||
|
import java.util.Deque;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.LinkedList;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Engine;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.Boundary;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Page;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.SemanticNode;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.redaction.utils.IdBuilder;
|
||||||
|
|
||||||
|
import lombok.AccessLevel;
|
||||||
|
import lombok.AllArgsConstructor;
|
||||||
|
import lombok.Builder;
|
||||||
|
import lombok.Data;
|
||||||
|
import lombok.EqualsAndHashCode;
|
||||||
|
import lombok.experimental.FieldDefaults;
|
||||||
|
|
||||||
|
@Data
|
||||||
|
@Builder
|
||||||
|
@AllArgsConstructor
|
||||||
|
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||||
|
@EqualsAndHashCode(onlyExplicitlyIncluded = true)
|
||||||
|
public class RedactionEntity {
|
||||||
|
|
||||||
|
// initial values
|
||||||
|
@EqualsAndHashCode.Include
|
||||||
|
final Boundary boundary;
|
||||||
|
@EqualsAndHashCode.Include
|
||||||
|
final String type;
|
||||||
|
@EqualsAndHashCode.Include
|
||||||
|
final EntityType entityType;
|
||||||
|
|
||||||
|
// empty defaults
|
||||||
|
boolean redaction;
|
||||||
|
boolean removed;
|
||||||
|
boolean ignored;
|
||||||
|
boolean resized;
|
||||||
|
boolean skipRemoveEntitiesContainedInLarger;
|
||||||
|
boolean dictionaryEntry;
|
||||||
|
boolean dossierDictionaryEntry;
|
||||||
|
Set<Engine> engines;
|
||||||
|
Set<RedactionEntity> references;
|
||||||
|
@Builder.Default
|
||||||
|
Deque<Integer> matchedRules = new LinkedList<>();
|
||||||
|
String redactionReason;
|
||||||
|
String legalBasis;
|
||||||
|
|
||||||
|
// inferred on graph insertion
|
||||||
|
@EqualsAndHashCode.Include
|
||||||
|
String value;
|
||||||
|
String textBefore;
|
||||||
|
String textAfter;
|
||||||
|
@Builder.Default
|
||||||
|
Set<Page> pages = new HashSet<>();
|
||||||
|
List<RedactionPosition> redactionPositionsPerPage;
|
||||||
|
@Builder.Default
|
||||||
|
List<SemanticNode> intersectingNodes = new LinkedList<>();
|
||||||
|
SemanticNode deepestFullyContainingNode;
|
||||||
|
|
||||||
|
|
||||||
|
public static RedactionEntity initialEntityNode(Boundary boundary, String type, EntityType entityType) {
|
||||||
|
|
||||||
|
return RedactionEntity.builder().type(type).entityType(entityType).boundary(boundary).engines(new HashSet<>()).references(new HashSet<>()).build();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean occursInNodeOfType(Class<? extends SemanticNode> clazz) {
|
||||||
|
|
||||||
|
return intersectingNodes.stream().anyMatch(clazz::isInstance);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean occursInNode(SemanticNode semanticNode) {
|
||||||
|
|
||||||
|
return intersectingNodes.stream().anyMatch(node -> node.equals(semanticNode));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean isType(String type) {
|
||||||
|
|
||||||
|
return this.type.equals(type);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean isAnyType(List<String> types) {
|
||||||
|
|
||||||
|
return types.contains(type);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void addIntersectingNode(SemanticNode containingNode) {
|
||||||
|
|
||||||
|
intersectingNodes.add(containingNode);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void removeFromGraph() {
|
||||||
|
|
||||||
|
intersectingNodes.forEach(node -> node.getEntities().remove(this));
|
||||||
|
pages.forEach(page -> page.getEntities().remove(this));
|
||||||
|
intersectingNodes = new LinkedList<>();
|
||||||
|
deepestFullyContainingNode = null;
|
||||||
|
pages = new HashSet<>();
|
||||||
|
removed = true;
|
||||||
|
ignored = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void addMatchedRule(int ruleNumber) {
|
||||||
|
|
||||||
|
matchedRules.add(ruleNumber);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public int getMatchedRule() {
|
||||||
|
|
||||||
|
if (matchedRules.isEmpty()) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
return matchedRules.getLast();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public List<RedactionPosition> getRedactionPositionsPerPage() {
|
||||||
|
|
||||||
|
if (redactionPositionsPerPage == null || redactionPositionsPerPage.isEmpty()) {
|
||||||
|
Map<Page, List<Rectangle2D>> rectanglesPerLinePerPage = deepestFullyContainingNode.getTextBlock().getPositionsPerPage(boundary);
|
||||||
|
|
||||||
|
Page firstPage = rectanglesPerLinePerPage.keySet()
|
||||||
|
.stream()
|
||||||
|
.min(Comparator.comparingInt(Page::getNumber))
|
||||||
|
.orElseThrow(() -> new RuntimeException("No Positions found on any page!"));
|
||||||
|
|
||||||
|
String id = IdBuilder.buildId(pages, rectanglesPerLinePerPage.values().stream().flatMap(Collection::stream).toList());
|
||||||
|
redactionPositionsPerPage = rectanglesPerLinePerPage.entrySet().stream().map(entry -> buildRedactionPosition(firstPage, id, entry)).toList();
|
||||||
|
}
|
||||||
|
return redactionPositionsPerPage;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static RedactionPosition buildRedactionPosition(Page firstPage, String id, Map.Entry<Page, List<Rectangle2D>> entry) {
|
||||||
|
|
||||||
|
if (entry.getKey().equals(firstPage)) {
|
||||||
|
return new RedactionPosition(id, entry.getKey(), entry.getValue());
|
||||||
|
} else {
|
||||||
|
return new RedactionPosition(id + "-" + entry.getKey().getNumber(), entry.getKey(), entry.getValue());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean containedBy(RedactionEntity redactionEntity) {
|
||||||
|
|
||||||
|
return this.boundary.containedBy(redactionEntity.getBoundary());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean contains(RedactionEntity redactionEntity) {
|
||||||
|
|
||||||
|
return this.boundary.contains(redactionEntity.getBoundary());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean intersects(RedactionEntity redactionEntity) {
|
||||||
|
|
||||||
|
return this.boundary.intersects(redactionEntity.getBoundary());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void addEngine(Engine engine) {
|
||||||
|
|
||||||
|
engines.add(engine);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void addEngines(Set<Engine> engines) {
|
||||||
|
|
||||||
|
this.engines.addAll(engines);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void addReference(RedactionEntity reference) {
|
||||||
|
|
||||||
|
references.add(reference);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void addReferences(List<RedactionEntity> references) {
|
||||||
|
|
||||||
|
this.references.addAll(references);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean matchesAnnotationId(String manualRedactionId) {
|
||||||
|
|
||||||
|
return getRedactionPositionsPerPage().stream().anyMatch(entityPosition -> entityPosition.getId().equals(manualRedactionId));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
|
||||||
|
StringBuilder sb = new StringBuilder();
|
||||||
|
sb.append("Entity[\"");
|
||||||
|
sb.append(value);
|
||||||
|
sb.append("\", ");
|
||||||
|
sb.append(boundary);
|
||||||
|
sb.append(", pages[");
|
||||||
|
pages.forEach(page -> {
|
||||||
|
sb.append(page.getNumber());
|
||||||
|
sb.append(", ");
|
||||||
|
});
|
||||||
|
sb.delete(sb.length() - 2, sb.length());
|
||||||
|
sb.append("], type = \"");
|
||||||
|
sb.append(type);
|
||||||
|
sb.append("\", EntityType.");
|
||||||
|
sb.append(entityType);
|
||||||
|
sb.append("]");
|
||||||
|
return sb.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,24 @@
|
|||||||
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.entity;
|
||||||
|
|
||||||
|
import java.awt.geom.Rectangle2D;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Page;
|
||||||
|
|
||||||
|
import lombok.AccessLevel;
|
||||||
|
import lombok.AllArgsConstructor;
|
||||||
|
import lombok.Data;
|
||||||
|
import lombok.experimental.FieldDefaults;
|
||||||
|
|
||||||
|
@Data
|
||||||
|
@AllArgsConstructor
|
||||||
|
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||||
|
public class RedactionPosition {
|
||||||
|
|
||||||
|
final String id;
|
||||||
|
Page page;
|
||||||
|
// Each entry in this list corresponds to an entry in the redaction log, this means:
|
||||||
|
// An entity might be represented by multiple redaction log entries
|
||||||
|
List<Rectangle2D> rectanglePerLine;
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,119 @@
|
|||||||
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes;
|
||||||
|
|
||||||
|
import java.awt.geom.Rectangle2D;
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
|
import com.iqser.red.service.redaction.v1.server.exception.NotFoundException;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.DocumentTree;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.entity.RedactionEntity;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.textblock.TextBlock;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.textblock.TextBlockCollector;
|
||||||
|
|
||||||
|
import lombok.AccessLevel;
|
||||||
|
import lombok.AllArgsConstructor;
|
||||||
|
import lombok.Builder;
|
||||||
|
import lombok.Data;
|
||||||
|
import lombok.NoArgsConstructor;
|
||||||
|
import lombok.experimental.FieldDefaults;
|
||||||
|
|
||||||
|
@Data
|
||||||
|
@Builder
|
||||||
|
@AllArgsConstructor
|
||||||
|
@NoArgsConstructor
|
||||||
|
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||||
|
public class Document implements GenericSemanticNode {
|
||||||
|
|
||||||
|
Set<Page> pages;
|
||||||
|
DocumentTree documentTree;
|
||||||
|
Integer numberOfPages;
|
||||||
|
TextBlock textBlock;
|
||||||
|
@Builder.Default
|
||||||
|
Set<RedactionEntity> entities = new HashSet<>();
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public NodeType getType() {
|
||||||
|
|
||||||
|
return NodeType.DOCUMENT;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public TextBlock getTextBlock() {
|
||||||
|
|
||||||
|
if (textBlock == null) {
|
||||||
|
textBlock = streamTerminalTextBlocksInOrder().collect(new TextBlockCollector());
|
||||||
|
}
|
||||||
|
return textBlock;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public List<Section> getMainSections() {
|
||||||
|
|
||||||
|
return streamChildrenOfType(NodeType.SECTION).map(node -> (Section) node).collect(Collectors.toList());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public Stream<TextBlock> streamTerminalTextBlocksInOrder() {
|
||||||
|
|
||||||
|
return streamAllNodes().filter(SemanticNode::isLeaf).map(SemanticNode::getLeafTextBlock);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public List<Integer> getTreeId() {
|
||||||
|
|
||||||
|
return Collections.emptyList();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void setTreeId(List<Integer> tocId) {
|
||||||
|
|
||||||
|
throw new UnsupportedOperationException("Document is always the root of the TablePageBlock of Contents");
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Headline getHeadline() {
|
||||||
|
|
||||||
|
return streamAllSubNodesOfType(NodeType.HEADLINE).map(node -> (Headline) node).findFirst().orElseThrow(() -> new NotFoundException("No Headlines found in this document!"));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private Stream<SemanticNode> streamAllNodes() {
|
||||||
|
|
||||||
|
return documentTree.allEntriesInOrder().map(DocumentTree.Entry::getNode);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public Stream<Image> streamAllImages() {
|
||||||
|
|
||||||
|
return streamAllSubNodesOfType(NodeType.IMAGE).map(node -> (Image) node);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
|
||||||
|
return NodeType.DOCUMENT + ": " + this.getTextBlock().buildSummary();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Map<Page, Rectangle2D> getBBox() {
|
||||||
|
|
||||||
|
Map<Page, Rectangle2D> bBox = new HashMap<>();
|
||||||
|
for (Page page : pages) {
|
||||||
|
bBox.put(page, new Rectangle2D.Double(0, 0, page.getWidth(), page.getHeight()));
|
||||||
|
}
|
||||||
|
return bBox;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,64 @@
|
|||||||
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes;
|
||||||
|
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.DocumentTree;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.entity.RedactionEntity;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.textblock.TextBlock;
|
||||||
|
|
||||||
|
import lombok.AccessLevel;
|
||||||
|
import lombok.AllArgsConstructor;
|
||||||
|
import lombok.Builder;
|
||||||
|
import lombok.Data;
|
||||||
|
import lombok.EqualsAndHashCode;
|
||||||
|
import lombok.NoArgsConstructor;
|
||||||
|
import lombok.experimental.FieldDefaults;
|
||||||
|
|
||||||
|
@Data
|
||||||
|
@Builder
|
||||||
|
@AllArgsConstructor
|
||||||
|
@NoArgsConstructor
|
||||||
|
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||||
|
public class Footer implements GenericSemanticNode {
|
||||||
|
|
||||||
|
List<Integer> treeId;
|
||||||
|
TextBlock leafTextBlock;
|
||||||
|
|
||||||
|
@EqualsAndHashCode.Exclude
|
||||||
|
DocumentTree documentTree;
|
||||||
|
|
||||||
|
@Builder.Default
|
||||||
|
@EqualsAndHashCode.Exclude
|
||||||
|
Set<RedactionEntity> entities = new HashSet<>();
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public NodeType getType() {
|
||||||
|
|
||||||
|
return NodeType.FOOTER;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean isLeaf() {
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public TextBlock getTextBlock() {
|
||||||
|
|
||||||
|
return leafTextBlock;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
|
||||||
|
return treeId + ": " + NodeType.FOOTER + ": " + leafTextBlock.buildSummary();
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,5 @@
|
|||||||
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes;
|
||||||
|
|
||||||
|
public interface GenericSemanticNode extends SemanticNode {
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,64 @@
|
|||||||
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes;
|
||||||
|
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.DocumentTree;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.entity.RedactionEntity;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.textblock.TextBlock;
|
||||||
|
|
||||||
|
import lombok.AccessLevel;
|
||||||
|
import lombok.AllArgsConstructor;
|
||||||
|
import lombok.Builder;
|
||||||
|
import lombok.Data;
|
||||||
|
import lombok.EqualsAndHashCode;
|
||||||
|
import lombok.NoArgsConstructor;
|
||||||
|
import lombok.experimental.FieldDefaults;
|
||||||
|
|
||||||
|
@Data
|
||||||
|
@Builder
|
||||||
|
@AllArgsConstructor
|
||||||
|
@NoArgsConstructor
|
||||||
|
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||||
|
public class Header implements GenericSemanticNode {
|
||||||
|
|
||||||
|
List<Integer> treeId;
|
||||||
|
TextBlock leafTextBlock;
|
||||||
|
|
||||||
|
@EqualsAndHashCode.Exclude
|
||||||
|
DocumentTree documentTree;
|
||||||
|
|
||||||
|
@Builder.Default
|
||||||
|
@EqualsAndHashCode.Exclude
|
||||||
|
Set<RedactionEntity> entities = new HashSet<>();
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean isLeaf() {
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public NodeType getType() {
|
||||||
|
|
||||||
|
return NodeType.HEADER;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public TextBlock getTextBlock() {
|
||||||
|
|
||||||
|
return leafTextBlock;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
|
||||||
|
return treeId + ": " + NodeType.HEADER + ": " + leafTextBlock.buildSummary();
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,71 @@
|
|||||||
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes;
|
||||||
|
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.DocumentTree;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.entity.RedactionEntity;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.textblock.TextBlock;
|
||||||
|
|
||||||
|
import lombok.AccessLevel;
|
||||||
|
import lombok.AllArgsConstructor;
|
||||||
|
import lombok.Builder;
|
||||||
|
import lombok.Data;
|
||||||
|
import lombok.EqualsAndHashCode;
|
||||||
|
import lombok.NoArgsConstructor;
|
||||||
|
import lombok.experimental.FieldDefaults;
|
||||||
|
|
||||||
|
@Data
|
||||||
|
@Builder
|
||||||
|
@AllArgsConstructor
|
||||||
|
@NoArgsConstructor
|
||||||
|
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||||
|
public class Headline implements GenericSemanticNode {
|
||||||
|
|
||||||
|
List<Integer> treeId;
|
||||||
|
TextBlock leafTextBlock;
|
||||||
|
|
||||||
|
@EqualsAndHashCode.Exclude
|
||||||
|
DocumentTree documentTree;
|
||||||
|
|
||||||
|
@Builder.Default
|
||||||
|
@EqualsAndHashCode.Exclude
|
||||||
|
Set<RedactionEntity> entities = new HashSet<>();
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public NodeType getType() {
|
||||||
|
|
||||||
|
return NodeType.HEADLINE;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean isLeaf() {
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public TextBlock getTextBlock() {
|
||||||
|
|
||||||
|
return leafTextBlock;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
|
||||||
|
return treeId + ": " + NodeType.HEADLINE + ": " + leafTextBlock.buildSummary();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Headline getHeadline() {
|
||||||
|
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,94 @@
|
|||||||
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes;
|
||||||
|
|
||||||
|
import java.awt.geom.Rectangle2D;
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.DocumentTree;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.entity.RedactionEntity;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.textblock.TextBlock;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.textblock.TextBlockCollector;
|
||||||
|
|
||||||
|
import lombok.AccessLevel;
|
||||||
|
import lombok.AllArgsConstructor;
|
||||||
|
import lombok.Builder;
|
||||||
|
import lombok.Data;
|
||||||
|
import lombok.EqualsAndHashCode;
|
||||||
|
import lombok.NoArgsConstructor;
|
||||||
|
import lombok.experimental.FieldDefaults;
|
||||||
|
|
||||||
|
@Data
|
||||||
|
@Builder
|
||||||
|
@AllArgsConstructor
|
||||||
|
@NoArgsConstructor
|
||||||
|
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||||
|
public class Image implements GenericSemanticNode {
|
||||||
|
|
||||||
|
List<Integer> treeId;
|
||||||
|
String id;
|
||||||
|
|
||||||
|
ImageType imageType;
|
||||||
|
boolean transparent;
|
||||||
|
Rectangle2D position;
|
||||||
|
|
||||||
|
boolean redaction;
|
||||||
|
boolean ignored;
|
||||||
|
@Builder.Default
|
||||||
|
String redactionReason = "";
|
||||||
|
@Builder.Default
|
||||||
|
String legalBasis = "";
|
||||||
|
@Builder.Default
|
||||||
|
int matchedRule = -1;
|
||||||
|
|
||||||
|
@EqualsAndHashCode.Exclude
|
||||||
|
Page page;
|
||||||
|
|
||||||
|
@EqualsAndHashCode.Exclude
|
||||||
|
DocumentTree documentTree;
|
||||||
|
|
||||||
|
@Builder.Default
|
||||||
|
@EqualsAndHashCode.Exclude
|
||||||
|
Set<RedactionEntity> entities = new HashSet<>();
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public NodeType getType() {
|
||||||
|
|
||||||
|
return NodeType.IMAGE;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public TextBlock getTextBlock() {
|
||||||
|
|
||||||
|
return streamAllSubNodes().filter(SemanticNode::isLeaf).map(SemanticNode::getLeafTextBlock).collect(new TextBlockCollector());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Set<Page> getPages() {
|
||||||
|
|
||||||
|
return Collections.singleton(page);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
|
||||||
|
return treeId + ": " + NodeType.IMAGE + ": " + imageType.toString() + " " + position;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Map<Page, Rectangle2D> getBBox() {
|
||||||
|
|
||||||
|
Map<Page, Rectangle2D> bBoxPerPage = new HashMap<>();
|
||||||
|
bBoxPerPage.put(page, position);
|
||||||
|
return bBoxPerPage;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,21 @@
|
|||||||
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes;
|
||||||
|
|
||||||
|
/**
 * Kinds of images distinguished in the document graph.
 */
public enum ImageType {
    LOGO,
    FORMULA,
    SIGNATURE,
    OTHER,
    OCR;


    /**
     * Maps a textual image type to its enum constant, ignoring case.
     *
     * @param imageType the name to resolve; may be {@code null}
     * @return the matching constant, or {@link #OTHER} if the name is {@code null} or not recognised
     */
    public static ImageType fromString(String imageType) {

        if (imageType == null) {
            return ImageType.OTHER;
        }
        // Locale.ROOT keeps the lower-casing independent of the JVM default locale; with a Turkish
        // default, "SIGNATURE" would otherwise turn into "sıgnature" and silently map to OTHER.
        return switch (imageType.toLowerCase(java.util.Locale.ROOT)) {
            case "logo" -> ImageType.LOGO;
            case "formula" -> ImageType.FORMULA;
            case "signature" -> ImageType.SIGNATURE;
            case "ocr" -> ImageType.OCR;
            default -> ImageType.OTHER;
        };
    }
}
|
||||||
@ -0,0 +1,21 @@
|
|||||||
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes;
|
||||||
|
|
||||||
|
import java.util.Locale;
|
||||||
|
|
||||||
|
/**
 * Types of nodes that can occur in the document graph.
 */
public enum NodeType {
    DOCUMENT,
    SECTION,
    HEADLINE,
    PARAGRAPH,
    TABLE,
    TABLE_CELL,
    IMAGE,
    HEADER,
    FOOTER;


    /**
     * Renders the constant name with its first character kept as is and the remainder
     * lower-cased, e.g. {@code TABLE_CELL} becomes {@code "Table_cell"}.
     *
     * @return the display form of this constant
     */
    @Override
    public String toString() {

        final String constantName = name();
        return constantName.charAt(0) + constantName.substring(1).toLowerCase(Locale.ROOT);
    }
}
|
||||||
@ -0,0 +1,87 @@
|
|||||||
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes;
|
||||||
|
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.LinkedList;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationPage;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.entity.RedactionEntity;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.textblock.TextBlock;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.textblock.TextBlockCollector;
|
||||||
|
|
||||||
|
import lombok.AccessLevel;
|
||||||
|
import lombok.AllArgsConstructor;
|
||||||
|
import lombok.Builder;
|
||||||
|
import lombok.EqualsAndHashCode;
|
||||||
|
import lombok.Getter;
|
||||||
|
import lombok.Setter;
|
||||||
|
import lombok.experimental.FieldDefaults;
|
||||||
|
|
||||||
|
@Getter
|
||||||
|
@Setter
|
||||||
|
@Builder
|
||||||
|
@AllArgsConstructor
|
||||||
|
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||||
|
public class Page {
|
||||||
|
|
||||||
|
Integer number;
|
||||||
|
Integer height;
|
||||||
|
Integer width;
|
||||||
|
Integer rotation;
|
||||||
|
|
||||||
|
@EqualsAndHashCode.Exclude
|
||||||
|
List<SemanticNode> mainBody;
|
||||||
|
@EqualsAndHashCode.Exclude
|
||||||
|
Header header;
|
||||||
|
@EqualsAndHashCode.Exclude
|
||||||
|
Footer footer;
|
||||||
|
|
||||||
|
@Builder.Default
|
||||||
|
@EqualsAndHashCode.Exclude
|
||||||
|
Set<RedactionEntity> entities = new HashSet<>();
|
||||||
|
|
||||||
|
@Builder.Default
|
||||||
|
@EqualsAndHashCode.Exclude
|
||||||
|
Set<Image> images = new HashSet<>();
|
||||||
|
|
||||||
|
|
||||||
|
public static Page fromClassificationPage(ClassificationPage classificationPage) {
|
||||||
|
|
||||||
|
return Page.builder()
|
||||||
|
.height((int) classificationPage.getPageHeight())
|
||||||
|
.width((int) classificationPage.getPageWidth())
|
||||||
|
.number(classificationPage.getPageNumber())
|
||||||
|
.rotation(classificationPage.getRotation())
|
||||||
|
.mainBody(new LinkedList<>())
|
||||||
|
.build();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public TextBlock getMainBodyTextBlock() {
|
||||||
|
|
||||||
|
return mainBody.stream().filter(SemanticNode::isLeaf).map(SemanticNode::getLeafTextBlock).collect(new TextBlockCollector());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
|
||||||
|
return String.valueOf(number);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int hashCode() {
|
||||||
|
|
||||||
|
return number;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean equals(Object o) {
|
||||||
|
|
||||||
|
return o instanceof Page && o.hashCode() == this.hashCode();
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,62 @@
|
|||||||
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes;
|
||||||
|
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.DocumentTree;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.entity.RedactionEntity;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.textblock.TextBlock;
|
||||||
|
|
||||||
|
import lombok.AccessLevel;
|
||||||
|
import lombok.AllArgsConstructor;
|
||||||
|
import lombok.Builder;
|
||||||
|
import lombok.Data;
|
||||||
|
import lombok.EqualsAndHashCode;
|
||||||
|
import lombok.experimental.FieldDefaults;
|
||||||
|
|
||||||
|
@Data
|
||||||
|
@Builder
|
||||||
|
@AllArgsConstructor
|
||||||
|
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||||
|
public class Paragraph implements GenericSemanticNode {
|
||||||
|
|
||||||
|
List<Integer> treeId;
|
||||||
|
TextBlock leafTextBlock;
|
||||||
|
|
||||||
|
@EqualsAndHashCode.Exclude
|
||||||
|
DocumentTree documentTree;
|
||||||
|
|
||||||
|
@Builder.Default
|
||||||
|
@EqualsAndHashCode.Exclude
|
||||||
|
Set<RedactionEntity> entities = new HashSet<>();
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public NodeType getType() {
|
||||||
|
|
||||||
|
return NodeType.PARAGRAPH;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean isLeaf() {
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public TextBlock getTextBlock() {
|
||||||
|
|
||||||
|
return leafTextBlock;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
|
||||||
|
return treeId + ": " + NodeType.PARAGRAPH + ": " + leafTextBlock.buildSummary();
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user