RED-7141: Implemented docstrum layout parsing
This commit is contained in:
parent
f146beeb44
commit
79239b751d
@ -55,6 +55,13 @@ public class DocumentStructure implements Serializable {
|
||||
|
||||
}
|
||||
|
||||
@Schema(description = "Object containing the extra field names, a duplicate paragraph has in its properties field.")
|
||||
public static class DuplicateParagraphProperties implements Serializable {
|
||||
|
||||
public static final String UNSORTED_TEXTBLOCK_ID = "utbid";
|
||||
|
||||
}
|
||||
|
||||
public static final String RECTANGLE_DELIMITER = ";";
|
||||
|
||||
|
||||
|
||||
@ -2,6 +2,9 @@ package com.knecon.fforesight.service.layoutparser.internal.api.queue;
|
||||
|
||||
public enum LayoutParsingType {
|
||||
REDACT_MANAGER,
|
||||
TAAS,
|
||||
DOCUMINE
|
||||
REDACT_MANAGER_OLD,
|
||||
REDACT_MANAGER_PARAGRAPH_DEBUG,
|
||||
DOCUMINE,
|
||||
CLARIFYND,
|
||||
CLARIFYND_PARAGRAPH_DEBUG
|
||||
}
|
||||
|
||||
@ -24,4 +24,5 @@ dependencies {
|
||||
implementation("com.fasterxml.jackson.module:jackson-module-afterburner:${jacksonVersion}")
|
||||
implementation("com.fasterxml.jackson.datatype:jackson-datatype-jsr310:${jacksonVersion}")
|
||||
implementation("org.springframework.boot:spring-boot-starter-web:3.1.3")
|
||||
implementation("org.jgrapht:jgrapht-core:1.5.2")
|
||||
}
|
||||
|
||||
@ -28,6 +28,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
@ -43,12 +44,11 @@ import com.knecon.fforesight.service.layoutparser.processor.services.RulingClean
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.SimplifiedSectionTextService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.TableExtractionService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocstrumBlockificationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocuMineBlockificationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.RedactManagerBlockificationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.TaasBlockificationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.classification.DocuMineClassificationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.classification.TaasClassificationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.TaasDocumentDataMapper;
|
||||
@ -76,16 +76,15 @@ public class LayoutParsingPipeline {
|
||||
CvTableParsingAdapter cvTableParsingAdapter;
|
||||
LayoutParsingStorageService layoutParsingStorageService;
|
||||
SectionsBuilderService sectionsBuilderService;
|
||||
TaasClassificationService taasClassificationService;
|
||||
RedactManagerClassificationService redactManagerClassificationService;
|
||||
DocuMineClassificationService docuMineClassificationService;
|
||||
SimplifiedSectionTextService simplifiedSectionTextService;
|
||||
BodyTextFrameService bodyTextFrameService;
|
||||
RulingCleaningService rulingCleaningService;
|
||||
TableExtractionService tableExtractionService;
|
||||
TaasBlockificationService taasBlockificationService;
|
||||
DocuMineBlockificationService docuMineBlockificationService;
|
||||
RedactManagerBlockificationService redactManagerBlockificationService;
|
||||
DocstrumBlockificationService docstrumBlockificationService;
|
||||
LayoutGridService layoutGridService;
|
||||
ObservationRegistry observationRegistry;
|
||||
VisualLayoutParsingAdapter visualLayoutParsingAdapter;
|
||||
@ -97,40 +96,33 @@ public class LayoutParsingPipeline {
|
||||
log.info("Starting layout parsing for {}", layoutParsingRequest.identifier());
|
||||
|
||||
File originFile = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId());
|
||||
File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId())
|
||||
.orElse(originFile);
|
||||
File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId()).orElse(originFile);
|
||||
|
||||
VisualLayoutParsingResponse visualLayoutParsingResponse = new VisualLayoutParsingResponse();
|
||||
if (layoutParsingRequest.visualLayoutParsingFileId()
|
||||
.isPresent()) {
|
||||
visualLayoutParsingResponse = layoutParsingStorageService.getVisualLayoutParsingFile(layoutParsingRequest.visualLayoutParsingFileId()
|
||||
.get());
|
||||
if (layoutParsingRequest.visualLayoutParsingFileId().isPresent()) {
|
||||
visualLayoutParsingResponse = layoutParsingStorageService.getVisualLayoutParsingFile(layoutParsingRequest.visualLayoutParsingFileId().get());
|
||||
}
|
||||
|
||||
ImageServiceResponse imageServiceResponse = new ImageServiceResponse();
|
||||
if (layoutParsingRequest.imagesFileStorageId()
|
||||
.isPresent()) {
|
||||
imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId()
|
||||
.get());
|
||||
if (layoutParsingRequest.imagesFileStorageId().isPresent()) {
|
||||
imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId().get());
|
||||
}
|
||||
|
||||
TableServiceResponse tableServiceResponse = new TableServiceResponse();
|
||||
if (layoutParsingRequest.tablesFileStorageId()
|
||||
.isPresent()) {
|
||||
tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId()
|
||||
.get());
|
||||
if (layoutParsingRequest.tablesFileStorageId().isPresent()) {
|
||||
tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId().get());
|
||||
}
|
||||
|
||||
ClassificationDocument classificationDocument = parseLayout(layoutParsingRequest.layoutParsingType(),
|
||||
originFile,
|
||||
imageServiceResponse,
|
||||
tableServiceResponse,
|
||||
visualLayoutParsingResponse,
|
||||
layoutParsingRequest.identifier().toString());
|
||||
originFile,
|
||||
imageServiceResponse,
|
||||
tableServiceResponse,
|
||||
visualLayoutParsingResponse,
|
||||
layoutParsingRequest.identifier().toString());
|
||||
|
||||
log.info("Building document graph for {}", layoutParsingRequest.identifier());
|
||||
|
||||
Document documentGraph = observeBuildDocumentGraph(classificationDocument);
|
||||
Document documentGraph = observeBuildDocumentGraph(layoutParsingRequest.layoutParsingType(), classificationDocument);
|
||||
|
||||
log.info("Creating viewer document for {}", layoutParsingRequest.identifier());
|
||||
|
||||
@ -142,7 +134,7 @@ public class LayoutParsingPipeline {
|
||||
layoutParsingStorageService.storeSimplifiedText(layoutParsingRequest, simplifiedSectionTextService.toSimplifiedText(documentGraph));
|
||||
layoutParsingStorageService.storeViewerDocument(layoutParsingRequest, viewerDocumentFile);
|
||||
|
||||
if (layoutParsingRequest.layoutParsingType().equals(LayoutParsingType.TAAS)) {
|
||||
if (layoutParsingRequest.layoutParsingType().equals(LayoutParsingType.CLARIFYND)) {
|
||||
log.info("Building research document data for {}", layoutParsingRequest.identifier());
|
||||
var researchDocumentData = TaasDocumentDataMapper.fromDocument(documentGraph);
|
||||
layoutParsingStorageService.storeResearchDocumentData(layoutParsingRequest, researchDocumentData);
|
||||
@ -158,37 +150,37 @@ public class LayoutParsingPipeline {
|
||||
.numberOfPages(documentGraph.getNumberOfPages())
|
||||
.duration(System.currentTimeMillis() - start)
|
||||
.message(format("""
|
||||
Layout parsing has finished in %.02f s.
|
||||
identifiers: %s
|
||||
%s
|
||||
Files have been saved with Ids:
|
||||
Structure: %s
|
||||
Text: %s
|
||||
Positions: %s
|
||||
PageData: %s
|
||||
Simplified Text: %s
|
||||
Viewer Doc: %s""",
|
||||
((float) (System.currentTimeMillis() - start)) / 1000,
|
||||
layoutParsingRequest.identifier(),
|
||||
buildSemanticNodeCountMessage(documentGraph.getNumberOfPages(), documentGraph.buildSemanticNodeCounts()),
|
||||
layoutParsingRequest.structureFileStorageId(),
|
||||
layoutParsingRequest.textBlockFileStorageId(),
|
||||
layoutParsingRequest.positionBlockFileStorageId(),
|
||||
layoutParsingRequest.pageFileStorageId(),
|
||||
layoutParsingRequest.simplifiedTextStorageId(),
|
||||
layoutParsingRequest.viewerDocumentStorageId()))
|
||||
Layout parsing has finished in %.02f s.
|
||||
identifiers: %s
|
||||
%s
|
||||
Files have been saved with Ids:
|
||||
Structure: %s
|
||||
Text: %s
|
||||
Positions: %s
|
||||
PageData: %s
|
||||
Simplified Text: %s
|
||||
Viewer Doc: %s""",
|
||||
((float) (System.currentTimeMillis() - start)) / 1000,
|
||||
layoutParsingRequest.identifier(),
|
||||
buildSemanticNodeCountMessage(documentGraph.getNumberOfPages(), documentGraph.buildSemanticNodeCounts()),
|
||||
layoutParsingRequest.structureFileStorageId(),
|
||||
layoutParsingRequest.textBlockFileStorageId(),
|
||||
layoutParsingRequest.positionBlockFileStorageId(),
|
||||
layoutParsingRequest.pageFileStorageId(),
|
||||
layoutParsingRequest.simplifiedTextStorageId(),
|
||||
layoutParsingRequest.viewerDocumentStorageId()))
|
||||
.build();
|
||||
|
||||
}
|
||||
|
||||
|
||||
private Document observeBuildDocumentGraph(ClassificationDocument classificationDocument) {
|
||||
private Document observeBuildDocumentGraph(LayoutParsingType layoutParsingType, ClassificationDocument classificationDocument) {
|
||||
|
||||
AtomicReference<Document> documentReference = new AtomicReference<>();
|
||||
|
||||
Observation.createNotStarted("LayoutParsingPipeline", observationRegistry)
|
||||
.contextualName("build-document-graph")
|
||||
.observe(() -> documentReference.set(DocumentGraphFactory.buildDocumentGraph(classificationDocument)));
|
||||
.observe(() -> documentReference.set(DocumentGraphFactory.buildDocumentGraph(layoutParsingType, classificationDocument)));
|
||||
|
||||
return documentReference.get();
|
||||
}
|
||||
@ -197,14 +189,14 @@ public class LayoutParsingPipeline {
|
||||
private String buildSemanticNodeCountMessage(int numberOfPages, Map<NodeType, Long> semanticNodeCounts) {
|
||||
|
||||
return String.format("%d pages with %d sections, %d headlines, %d paragraphs, %d tables with %d cells, %d headers, and %d footers parsed",
|
||||
numberOfPages,
|
||||
semanticNodeCounts.get(NodeType.SECTION) == null ? 0 : semanticNodeCounts.get(NodeType.SECTION),
|
||||
semanticNodeCounts.get(NodeType.HEADLINE) == null ? 0 : semanticNodeCounts.get(NodeType.HEADLINE),
|
||||
semanticNodeCounts.get(NodeType.PARAGRAPH) == null ? 0 : semanticNodeCounts.get(NodeType.PARAGRAPH),
|
||||
semanticNodeCounts.get(NodeType.TABLE) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE),
|
||||
semanticNodeCounts.get(NodeType.TABLE_CELL) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE_CELL),
|
||||
semanticNodeCounts.get(NodeType.HEADER) == null ? 0 : semanticNodeCounts.get(NodeType.HEADER),
|
||||
semanticNodeCounts.get(NodeType.FOOTER) == null ? 0 : semanticNodeCounts.get(NodeType.FOOTER));
|
||||
numberOfPages,
|
||||
semanticNodeCounts.get(NodeType.SECTION) == null ? 0 : semanticNodeCounts.get(NodeType.SECTION),
|
||||
semanticNodeCounts.get(NodeType.HEADLINE) == null ? 0 : semanticNodeCounts.get(NodeType.HEADLINE),
|
||||
semanticNodeCounts.get(NodeType.PARAGRAPH) == null ? 0 : semanticNodeCounts.get(NodeType.PARAGRAPH),
|
||||
semanticNodeCounts.get(NodeType.TABLE) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE),
|
||||
semanticNodeCounts.get(NodeType.TABLE_CELL) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE_CELL),
|
||||
semanticNodeCounts.get(NodeType.HEADER) == null ? 0 : semanticNodeCounts.get(NodeType.HEADER),
|
||||
semanticNodeCounts.get(NodeType.FOOTER) == null ? 0 : semanticNodeCounts.get(NodeType.FOOTER));
|
||||
}
|
||||
|
||||
|
||||
@ -260,11 +252,16 @@ public class LayoutParsingPipeline {
|
||||
PDRectangle cropbox = pdPage.getCropBox();
|
||||
CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(pdfTableCells.get(pageNumber), stripper.getRulings());
|
||||
|
||||
List<Cell> emptyTableCells = TableExtractionService.findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
|
||||
ClassificationPage classificationPage = switch (layoutParsingType) {
|
||||
case REDACT_MANAGER -> redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
case TAAS -> taasBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
case REDACT_MANAGER_OLD ->
|
||||
redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
case DOCUMINE -> docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
case REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, true);
|
||||
case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, false);
|
||||
};
|
||||
|
||||
classificationPage.setCleanRulings(cleanRulings);
|
||||
classificationPage.setRotation(rotation);
|
||||
classificationPage.setLandscape(isLandscape);
|
||||
@ -289,7 +286,13 @@ public class LayoutParsingPipeline {
|
||||
}
|
||||
}
|
||||
|
||||
tableExtractionService.extractTables(cleanRulings, classificationPage);
|
||||
tableExtractionService.extractTables(emptyTableCells, classificationPage);
|
||||
|
||||
if (layoutParsingType == LayoutParsingType.REDACT_MANAGER) {
|
||||
docstrumBlockificationService.combineBlocks(classificationPage);
|
||||
} else if (layoutParsingType == LayoutParsingType.CLARIFYND) {
|
||||
docstrumBlockificationService.mergeZones(classificationPage.getTextBlocks());
|
||||
}
|
||||
|
||||
buildPageStatistics(classificationPage);
|
||||
increaseDocumentStatistics(classificationPage, classificationDocument);
|
||||
@ -303,14 +306,21 @@ public class LayoutParsingPipeline {
|
||||
bodyTextFrameService.setBodyTextFrames(classificationDocument, layoutParsingType);
|
||||
log.info("Classify TextBlocks for {}", identifier);
|
||||
switch (layoutParsingType) {
|
||||
case TAAS -> taasClassificationService.classifyDocument(classificationDocument);
|
||||
case REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_OLD, CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG ->
|
||||
redactManagerClassificationService.classifyDocument(classificationDocument);
|
||||
case DOCUMINE -> docuMineClassificationService.classifyDocument(classificationDocument);
|
||||
case REDACT_MANAGER -> redactManagerClassificationService.classifyDocument(classificationDocument);
|
||||
}
|
||||
|
||||
log.info("Building Sections for {}", identifier);
|
||||
sectionsBuilderService.buildSections(classificationDocument);
|
||||
sectionsBuilderService.addImagesToSections(classificationDocument);
|
||||
|
||||
switch (layoutParsingType) {
|
||||
case CLARIFYND_PARAGRAPH_DEBUG, REDACT_MANAGER_PARAGRAPH_DEBUG -> sectionsBuilderService.buildParagraphDebugSections(classificationDocument);
|
||||
default -> {
|
||||
sectionsBuilderService.buildSections(classificationDocument);
|
||||
sectionsBuilderService.addImagesToSections(classificationDocument);
|
||||
}
|
||||
}
|
||||
|
||||
return classificationDocument;
|
||||
}
|
||||
|
||||
|
||||
@ -96,7 +96,7 @@ public abstract class AbstractPageBlock extends Rectangle {
|
||||
|
||||
return this.minX - threshold <= apb.getMaxX() && this.maxX + threshold >= apb.getMinX();
|
||||
}
|
||||
|
||||
|
||||
|
||||
public abstract boolean isEmpty();
|
||||
|
||||
|
||||
@ -15,7 +15,6 @@ import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.No
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlockCollector;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
@ -52,7 +51,7 @@ public class Document implements GenericSemanticNode {
|
||||
public TextBlock getTextBlock() {
|
||||
|
||||
if (textBlock == null) {
|
||||
textBlock = streamTerminalTextBlocksInOrder().collect(new TextBlockCollector());
|
||||
textBlock = GenericSemanticNode.super.getTextBlock();
|
||||
}
|
||||
return textBlock;
|
||||
}
|
||||
@ -67,8 +66,7 @@ public class Document implements GenericSemanticNode {
|
||||
|
||||
public Stream<TextBlock> streamTerminalTextBlocksInOrder() {
|
||||
|
||||
return streamAllNodes().filter(SemanticNode::isLeaf)
|
||||
.map(SemanticNode::getLeafTextBlock);
|
||||
return streamAllNodes().filter(SemanticNode::isLeaf).map(SemanticNode::getTextBlock);
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -0,0 +1,34 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
|
||||
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlockCollector;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.experimental.SuperBuilder;
|
||||
|
||||
@Data
|
||||
@EqualsAndHashCode(callSuper = true)
|
||||
@SuperBuilder
|
||||
public class DuplicatedParagraph extends Paragraph {
|
||||
|
||||
TextBlock unsortedLeafTextBlock;
|
||||
|
||||
|
||||
@Override
|
||||
public TextBlock getTextBlock() {
|
||||
|
||||
return Stream.of(leafTextBlock, unsortedLeafTextBlock).collect(new TextBlockCollector());
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return super.toString();
|
||||
}
|
||||
|
||||
}
|
||||
@ -18,11 +18,12 @@ import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.experimental.SuperBuilder;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@SuperBuilder
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
@FieldDefaults(level = AccessLevel.PROTECTED)
|
||||
public class Paragraph implements GenericSemanticNode {
|
||||
|
||||
@Builder.Default
|
||||
|
||||
@ -11,7 +11,6 @@ import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.No
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlockCollector;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
@ -62,9 +61,7 @@ public class Section implements GenericSemanticNode {
|
||||
public TextBlock getTextBlock() {
|
||||
|
||||
if (textBlock == null) {
|
||||
textBlock = streamAllSubNodes().filter(SemanticNode::isLeaf)
|
||||
.map(SemanticNode::getLeafTextBlock)
|
||||
.collect(new TextBlockCollector());
|
||||
textBlock = GenericSemanticNode.super.getTextBlock();
|
||||
}
|
||||
return textBlock;
|
||||
}
|
||||
|
||||
@ -20,6 +20,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.E
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlockCollector;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
|
||||
public interface SemanticNode {
|
||||
@ -39,7 +40,10 @@ public interface SemanticNode {
|
||||
*
|
||||
* @return TextBlock containing all AtomicTextBlocks that are located under this Node.
|
||||
*/
|
||||
TextBlock getTextBlock();
|
||||
default TextBlock getTextBlock() {
|
||||
|
||||
return streamAllSubNodes().filter(SemanticNode::isLeaf).map(SemanticNode::getTextBlock).collect(new TextBlockCollector());
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
|
||||
@ -48,7 +48,6 @@ public class Table implements SemanticNode {
|
||||
@EqualsAndHashCode.Exclude
|
||||
Map<Page, Rectangle2D> bBoxCache;
|
||||
|
||||
|
||||
/**
|
||||
* Streams all entities in this table, that appear in a row, which contains any of the provided strings.
|
||||
*
|
||||
@ -332,9 +331,7 @@ public class Table implements SemanticNode {
|
||||
public TextBlock getTextBlock() {
|
||||
|
||||
if (textBlock == null) {
|
||||
textBlock = streamAllSubNodes().filter(SemanticNode::isLeaf)
|
||||
.map(SemanticNode::getLeafTextBlock)
|
||||
.collect(new TextBlockCollector());
|
||||
textBlock = SemanticNode.super.getTextBlock();
|
||||
}
|
||||
return textBlock;
|
||||
}
|
||||
|
||||
@ -53,6 +53,9 @@ public class TextPageBlock extends AbstractPageBlock {
|
||||
@JsonIgnore
|
||||
private PageBlockType classification;
|
||||
|
||||
@JsonIgnore
|
||||
private boolean toDuplicate;
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public TextDirection getDir() {
|
||||
@ -73,7 +76,7 @@ public class TextPageBlock extends AbstractPageBlock {
|
||||
|
||||
return sequences.get(0).getPageWidth();
|
||||
}
|
||||
|
||||
|
||||
|
||||
public static TextPageBlock merge(List<TextPageBlock> textBlocksToMerge) {
|
||||
|
||||
@ -82,6 +85,7 @@ public class TextPageBlock extends AbstractPageBlock {
|
||||
return fromTextPositionSequences(sequences);
|
||||
}
|
||||
|
||||
|
||||
public static TextPageBlock fromTextPositionSequences(List<TextPositionSequence> wordBlockList) {
|
||||
|
||||
TextPageBlock textBlock = null;
|
||||
@ -133,7 +137,6 @@ public class TextPageBlock extends AbstractPageBlock {
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Returns the minX value in pdf coordinate system.
|
||||
* Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation.
|
||||
@ -362,7 +365,22 @@ public class TextPageBlock extends AbstractPageBlock {
|
||||
}
|
||||
|
||||
return TextNormalizationUtilities.removeHyphenLineBreaks(sb.toString());
|
||||
}
|
||||
|
||||
|
||||
public int getNumberOfLines() {
|
||||
|
||||
int numberOfLines = 1;
|
||||
TextPositionSequence previous = null;
|
||||
for (TextPositionSequence word : sequences) {
|
||||
if (previous != null) {
|
||||
if (word.getMaxYDirAdj() - previous.getMaxYDirAdj() > word.getTextHeight()) {
|
||||
numberOfLines++;
|
||||
}
|
||||
}
|
||||
previous = word;
|
||||
}
|
||||
return numberOfLines;
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -55,6 +55,17 @@ public class TextPositionSequence implements CharSequence {
|
||||
}
|
||||
|
||||
|
||||
public TextPositionSequence(List<RedTextPosition> textPositions, int page) {
|
||||
|
||||
this.textPositions = textPositions;
|
||||
this.page = page;
|
||||
this.dir = TextDirection.fromDegrees(textPositions.get(0).getDir());
|
||||
this.rotation = textPositions.get(0).getRotation();
|
||||
this.pageHeight = textPositions.get(0).getPageHeight();
|
||||
this.pageWidth = textPositions.get(0).getPageWidth();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public int length() {
|
||||
|
||||
|
||||
@ -25,6 +25,7 @@ public class BodyTextFrameService {
|
||||
private static final float RULING_HEIGHT_THRESHOLD = 0.15f; // multiplied with page height. Header/Footer Rulings must be within that border of the page.
|
||||
private static final float RULING_WIDTH_THRESHOLD = 0.75f; // multiplied with page width. Header/Footer Rulings must be at least that wide.
|
||||
|
||||
|
||||
public void setBodyTextFrames(ClassificationDocument classificationDocument, LayoutParsingType layoutParsingType) {
|
||||
|
||||
Rectangle bodyTextFrame = calculateBodyTextFrame(classificationDocument.getPages(), classificationDocument.getFontSizeCounter(), false, layoutParsingType);
|
||||
@ -132,12 +133,7 @@ public class BodyTextFrameService {
|
||||
boolean landscape,
|
||||
LayoutParsingType layoutParsingType) {
|
||||
|
||||
float approximateHeaderLineCount;
|
||||
if (layoutParsingType.equals(LayoutParsingType.TAAS)) {
|
||||
approximateHeaderLineCount = 3.3f;
|
||||
} else {
|
||||
approximateHeaderLineCount = 2.9f;
|
||||
}
|
||||
float approximateHeaderLineCount = 2.9f;
|
||||
|
||||
BodyTextFrameExpansionsRectangle expansionsRectangle = new BodyTextFrameExpansionsRectangle();
|
||||
|
||||
@ -155,8 +151,9 @@ public class BodyTextFrameService {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER)
|
||||
|| MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER)) {
|
||||
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) || MarkedContentUtils.intersects(textBlock,
|
||||
page.getMarkedContentBboxPerType(),
|
||||
MarkedContentUtils.FOOTER)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
@ -7,6 +7,7 @@ import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.logging.log4j.util.Strings;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
@ -110,6 +111,20 @@ public class SectionsBuilderService {
|
||||
}
|
||||
|
||||
|
||||
public void buildParagraphDebugSections(ClassificationDocument document) {
|
||||
|
||||
List<ClassificationSection> sections = new ArrayList<>();
|
||||
for (var page : document.getPages()) {
|
||||
page.getTextBlocks().forEach(block -> {
|
||||
block.setPage(page.getPageNumber());
|
||||
var section = buildTextBlock(List.of(block), Strings.EMPTY);
|
||||
sections.add(section);
|
||||
});
|
||||
}
|
||||
document.setSections(sections);
|
||||
}
|
||||
|
||||
|
||||
public void addImagesToSections(ClassificationDocument document) {
|
||||
|
||||
Map<Integer, List<ClassificationSection>> sectionMap = new HashMap<>();
|
||||
|
||||
@ -14,7 +14,6 @@ import org.springframework.stereotype.Service;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Rectangle;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||
@ -41,19 +40,18 @@ public class TableExtractionService {
|
||||
* <p>
|
||||
* DirAdj (Text direction adjusted) values can not be used here.
|
||||
*
|
||||
* @param cleanRulings The lines used to build the table.
|
||||
* @param page Page object that contains textblocks and statistics.
|
||||
* @param emptyCells The cells used to build the table.
|
||||
* @param page Page object that contains textblocks and statistics.
|
||||
*/
|
||||
|
||||
public void extractTables(CleanRulings cleanRulings, ClassificationPage page) {
|
||||
public void extractTables(List<Cell> emptyCells, ClassificationPage page) {
|
||||
|
||||
List<Cell> cells = findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
// sort cells by size (height * width) ascending so that textBlocks are always assigned to the smallest cells that contain them
|
||||
cells.sort(CELL_SIZE_COMPARATOR);
|
||||
emptyCells.sort(CELL_SIZE_COMPARATOR);
|
||||
|
||||
for (AbstractPageBlock abstractPageBlock : page.getTextBlocks()) {
|
||||
TextPageBlock textBlock = (TextPageBlock) abstractPageBlock;
|
||||
for (Cell cell : cells) {
|
||||
for (Cell cell : emptyCells) {
|
||||
if (cell.hasMinimumSize() && doesCellContainTextBlock(cell, textBlock)) {
|
||||
cell.addTextBlock(textBlock);
|
||||
break;
|
||||
@ -61,7 +59,7 @@ public class TableExtractionService {
|
||||
}
|
||||
}
|
||||
|
||||
cells = new ArrayList<>(new HashSet<>(cells));
|
||||
var cells = new ArrayList<>(new HashSet<>(emptyCells));
|
||||
DoubleComparisons.sort(cells, Rectangle.ILL_DEFINED_ORDER);
|
||||
|
||||
List<Rectangle> spreadsheetAreas = SpreadsheetFinder.findSpreadsheetsFromCells(cells);
|
||||
@ -79,9 +77,7 @@ public class TableExtractionService {
|
||||
}
|
||||
}
|
||||
|
||||
var containedCellsWithText = containedCells.stream()
|
||||
.filter(cell -> !cell.getTextBlocks().isEmpty())
|
||||
.toList();
|
||||
var containedCellsWithText = containedCells.stream().filter(cell -> !cell.getTextBlocks().isEmpty()).toList();
|
||||
|
||||
// verify if table would contain fewer cells with text than the threshold allows
|
||||
if (containedCellsWithText.size() >= MAX_TABLE_CONTAINED_CELLS_WITH_TEXT && checkIfTableCellsAreUniform(containedCells)) {
|
||||
@ -101,11 +97,7 @@ public class TableExtractionService {
|
||||
if (position != -1) {
|
||||
page.getTextBlocks().add(position, table);
|
||||
|
||||
var toBeRemoved = table.getCells()
|
||||
.stream()
|
||||
.map(Cell::getTextBlocks)
|
||||
.flatMap(List::stream)
|
||||
.toList();
|
||||
var toBeRemoved = table.getCells().stream().map(Cell::getTextBlocks).flatMap(List::stream).toList();
|
||||
// remove text blocks from the page that were also added with the table (from its contained cells)
|
||||
page.getTextBlocks().removeAll(toBeRemoved);
|
||||
}
|
||||
@ -115,7 +107,7 @@ public class TableExtractionService {
|
||||
|
||||
private boolean checkIfTableCellsAreUniform(List<Cell> containedCells) {
|
||||
|
||||
if(containedCells.size() <= 2) {
|
||||
if (containedCells.size() <= 2) {
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -139,19 +131,13 @@ public class TableExtractionService {
|
||||
}
|
||||
double x0 = cell.getX();
|
||||
double y0 = cell.getY();
|
||||
return (x >= x0 - TEXT_BLOCK_CONTAINMENT_TOLERANCE
|
||||
&& y >= y0 - TEXT_BLOCK_CONTAINMENT_TOLERANCE
|
||||
&& (x + w) <= x0 + cell.getWidth() + 2 * TEXT_BLOCK_CONTAINMENT_TOLERANCE
|
||||
&& (y + h) <= y0 + cell.getHeight() + 2 * TEXT_BLOCK_CONTAINMENT_TOLERANCE);
|
||||
return (x >= x0 - TEXT_BLOCK_CONTAINMENT_TOLERANCE && y >= y0 - TEXT_BLOCK_CONTAINMENT_TOLERANCE && (x + w) <= x0 + cell.getWidth() + 2 * TEXT_BLOCK_CONTAINMENT_TOLERANCE && (y + h) <= y0 + cell.getHeight() + 2 * TEXT_BLOCK_CONTAINMENT_TOLERANCE);
|
||||
}
|
||||
|
||||
|
||||
public static List<Cell> findCells(List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
|
||||
|
||||
return RectangularIntersectionFinder.find(horizontalRulingLines, verticalRulingLines)
|
||||
.stream()
|
||||
.map(Cell::new)
|
||||
.collect(Collectors.toList());
|
||||
return RectangularIntersectionFinder.find(horizontalRulingLines, verticalRulingLines).stream().map(Cell::new).collect(Collectors.toList());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -0,0 +1,408 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.blockification;
|
||||
|
||||
import static java.util.stream.Collectors.toSet;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.ListIterator;
|
||||
import java.util.Set;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.DocstrumSegmentationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.QuickSort;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionSequenceComparator;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
|
||||
@SuppressWarnings("all")
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
public class DocstrumBlockificationService {
|
||||
|
||||
private final DocstrumSegmentationService docstrumSegmentationService;
|
||||
|
||||
static final float THRESHOLD = 1f;
|
||||
|
||||
|
||||
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Cell> cells, boolean xyOrder) {
|
||||
|
||||
// Underlined or strikethrough are also in rulings but we dont want to split blocks with them so we use cells.
|
||||
List<Ruling> usedHorizonalRulings = new ArrayList<>();
|
||||
List<Ruling> usedVerticalRulings = new ArrayList<>();
|
||||
|
||||
cells.forEach(cell -> {
|
||||
usedHorizonalRulings.add(new Ruling(new Point2D.Float(cell.x, cell.y), new Point2D.Float(cell.x + cell.width, cell.y)));
|
||||
usedHorizonalRulings.add(new Ruling(new Point2D.Float(cell.x, cell.y + cell.height), new Point2D.Float(cell.x + cell.width, cell.y + cell.height)));
|
||||
usedVerticalRulings.add(new Ruling(new Point2D.Float(cell.x, cell.y), new Point2D.Float(cell.x, cell.y + cell.height)));
|
||||
usedVerticalRulings.add(new Ruling(new Point2D.Float(cell.x + cell.width, cell.y), new Point2D.Float(cell.x + cell.width, cell.y + cell.height)));
|
||||
});
|
||||
|
||||
List<AbstractPageBlock> abstractPageBlocks = new ArrayList<>();
|
||||
var zones = docstrumSegmentationService.segmentPage(textPositions, xyOrder);
|
||||
zones.forEach(zone -> {
|
||||
|
||||
List<TextPositionSequence> textPositionSequences = new ArrayList<>();
|
||||
zone.getLines().forEach(line -> {
|
||||
line.getWords().forEach(word -> {
|
||||
textPositionSequences.add(new TextPositionSequence(word.getTextPositions(), word.getPage()));
|
||||
});
|
||||
});
|
||||
|
||||
abstractPageBlocks.addAll(splitZonesAtRulings(textPositionSequences, usedHorizonalRulings, usedVerticalRulings));
|
||||
});
|
||||
|
||||
return new ClassificationPage(abstractPageBlocks);
|
||||
}
|
||||
|
||||
|
||||
public void combineBlocks(ClassificationPage page) {
|
||||
|
||||
mergeZones(page.getTextBlocks());
|
||||
|
||||
TextPageBlock previous = new TextPageBlock();
|
||||
ListIterator<AbstractPageBlock> itty = page.getTextBlocks().listIterator();
|
||||
while (itty.hasNext()) {
|
||||
AbstractPageBlock block = itty.next();
|
||||
if (block instanceof TablePageBlock) {
|
||||
continue;
|
||||
}
|
||||
TextPageBlock current = (TextPageBlock) block;
|
||||
|
||||
if (previous != null && !previous.getSequences().isEmpty()) {
|
||||
|
||||
if (current.getDir() == previous.getDir() //
|
||||
&& previous.getNumberOfLines() >= 2 && current.getNumberOfLines() >= 2 //
|
||||
&& previous.intersectsY(current) //
|
||||
&& !hasBetween(current, previous, page.getTextBlocks()) //
|
||||
&& numberOfYIntersections(current, previous, page.getTextBlocks()) == 0) {
|
||||
|
||||
previous.getSequences().addAll(current.getSequences());
|
||||
previous = buildTextBlock(previous.getSequences(), 0);
|
||||
previous.setToDuplicate(true);
|
||||
itty.remove();
|
||||
itty.previous();
|
||||
itty.set(previous);
|
||||
itty.next();
|
||||
continue;
|
||||
}
|
||||
|
||||
if (current.getDir() == previous.getDir() && (previous.almostIntersects(current, 0, 0))) {
|
||||
|
||||
previous.getSequences().addAll(current.getSequences());
|
||||
boolean toDuplicate = previous.isToDuplicate();
|
||||
previous = buildTextBlock(previous.getSequences(), 0);
|
||||
previous.setToDuplicate(toDuplicate);
|
||||
itty.remove();
|
||||
itty.previous();
|
||||
itty.set(previous);
|
||||
itty.next();
|
||||
continue;
|
||||
}
|
||||
|
||||
if (current.getDir() == previous.getDir() //
|
||||
&& (Math.abs(previous.getMaxY() - current.getMaxY()) < THRESHOLD || Math.abs(previous.getMinY() - current.getMinY()) < THRESHOLD) //
|
||||
&& (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1 || previous.getNumberOfLines() == 2 && current.getNumberOfLines() == 1) //
|
||||
&& !hasBetween(current, previous, page.getTextBlocks()) && numberOfYIntersections(current, previous, page.getTextBlocks()) <= 4) {
|
||||
|
||||
previous.getSequences().addAll(current.getSequences());
|
||||
previous = buildTextBlock(previous.getSequences(), 0);
|
||||
itty.remove();
|
||||
itty.previous();
|
||||
itty.set(previous);
|
||||
itty.next();
|
||||
continue;
|
||||
}
|
||||
|
||||
if (current.getDir() == previous.getDir() //
|
||||
&& current.intersectsY(previous) //
|
||||
&& (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1 || previous.getNumberOfLines() == 2 && current.getNumberOfLines() == 1) //
|
||||
&& !hasBetween(current, previous, page.getTextBlocks()) //
|
||||
&& numberOfYIntersections(current, previous, page.getTextBlocks()) <= 0) {
|
||||
previous.getSequences().addAll(current.getSequences());
|
||||
previous = buildTextBlock(previous.getSequences(), 0);
|
||||
itty.remove();
|
||||
itty.previous();
|
||||
itty.set(previous);
|
||||
itty.next();
|
||||
continue;
|
||||
}
|
||||
|
||||
}
|
||||
previous = current;
|
||||
}
|
||||
|
||||
mergeZones(page.getTextBlocks());
|
||||
|
||||
}
|
||||
|
||||
|
||||
private boolean hasBetween(TextPageBlock block, TextPageBlock other, List<AbstractPageBlock> allBlocks) {
|
||||
|
||||
for (AbstractPageBlock current : allBlocks) {
|
||||
|
||||
if (current == other || current == block) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (other.intersectsY(current) && other.getMaxX() <= current.getMinX() && current.getMaxX() <= block.getMinX()) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
private int numberOfYIntersections(TextPageBlock block, TextPageBlock other, List<AbstractPageBlock> allBlocks) {
|
||||
|
||||
double minY = Math.min(block.getMinY(), other.getMinY());
|
||||
double maxY = Math.min(block.getMaxY(), other.getMaxY());
|
||||
|
||||
int numberOfYIntersections = 0;
|
||||
for (AbstractPageBlock current : allBlocks) {
|
||||
|
||||
if (current == other || current == block) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (minY <= current.getMaxY() && maxY >= current.getMinY()) {
|
||||
numberOfYIntersections++;
|
||||
}
|
||||
}
|
||||
|
||||
return numberOfYIntersections;
|
||||
}
|
||||
|
||||
|
||||
public void mergeZones(List<AbstractPageBlock> zones) {
|
||||
|
||||
ListIterator<AbstractPageBlock> itty = zones.listIterator();
|
||||
Set<AbstractPageBlock> toRemove = new HashSet<>();
|
||||
while (itty.hasNext()) {
|
||||
AbstractPageBlock block = itty.next();
|
||||
if (block instanceof TablePageBlock) {
|
||||
continue;
|
||||
}
|
||||
|
||||
TextPageBlock current = (TextPageBlock) block;
|
||||
|
||||
if (current.isToDuplicate()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
for (int i = 0; i < zones.size(); i++) {
|
||||
|
||||
if (toRemove.contains(zones.get(i))) {
|
||||
continue;
|
||||
}
|
||||
if (zones.get(i) == current) {
|
||||
continue;
|
||||
}
|
||||
if (zones.get(i) instanceof TablePageBlock) {
|
||||
continue;
|
||||
}
|
||||
|
||||
TextPageBlock inner = (TextPageBlock) zones.get(i);
|
||||
|
||||
if (inner.isToDuplicate()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (current.getDir() == inner.getDir() && current.almostIntersects(inner, 0, 0)) {
|
||||
|
||||
current.getSequences().addAll(inner.getSequences());
|
||||
QuickSort.sort(current.getSequences(), new TextPositionSequenceComparator());
|
||||
current = buildTextBlock(current.getSequences(), 0);
|
||||
toRemove.add(inner);
|
||||
itty.set(current);
|
||||
}
|
||||
}
|
||||
}
|
||||
zones.removeAll(toRemove);
|
||||
}
|
||||
|
||||
|
||||
public List<AbstractPageBlock> splitZonesAtRulings(List<TextPositionSequence> textPositions, List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
|
||||
|
||||
int indexOnPage = 0;
|
||||
List<TextPositionSequence> chunkWords = new ArrayList<>();
|
||||
List<AbstractPageBlock> chunkBlockList = new ArrayList<>();
|
||||
|
||||
float minX = 1000, maxX = 0, minY = 1000, maxY = 0;
|
||||
TextPositionSequence prev = null;
|
||||
|
||||
for (TextPositionSequence word : textPositions) {
|
||||
|
||||
boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, horizontalRulingLines, verticalRulingLines);
|
||||
boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir());
|
||||
|
||||
if (prev != null && (splitByDir || isSplitByRuling)) {
|
||||
|
||||
TextPageBlock cb1 = buildTextBlock(chunkWords, indexOnPage);
|
||||
indexOnPage++;
|
||||
|
||||
chunkBlockList.add(cb1);
|
||||
chunkWords = new ArrayList<>();
|
||||
|
||||
minX = 1000;
|
||||
maxX = 0;
|
||||
minY = 1000;
|
||||
maxY = 0;
|
||||
prev = null;
|
||||
}
|
||||
|
||||
chunkWords.add(word);
|
||||
|
||||
prev = word;
|
||||
if (word.getMinXDirAdj() < minX) {
|
||||
minX = word.getMinXDirAdj();
|
||||
}
|
||||
if (word.getMaxXDirAdj() > maxX) {
|
||||
maxX = word.getMaxXDirAdj();
|
||||
}
|
||||
if (word.getMinYDirAdj() < minY) {
|
||||
minY = word.getMinYDirAdj();
|
||||
}
|
||||
if (word.getMaxYDirAdj() > maxY) {
|
||||
maxY = word.getMaxYDirAdj();
|
||||
}
|
||||
}
|
||||
|
||||
TextPageBlock cb1 = buildTextBlock(chunkWords, indexOnPage);
|
||||
if (cb1 != null) {
|
||||
chunkBlockList.add(cb1);
|
||||
}
|
||||
|
||||
return chunkBlockList;
|
||||
}
|
||||
|
||||
|
||||
private boolean equalsWithThreshold(float f1, float f2) {
|
||||
|
||||
return Math.abs(f1 - f2) < THRESHOLD;
|
||||
}
|
||||
|
||||
|
||||
private TextPageBlock buildTextBlock(List<TextPositionSequence> wordBlockList, int indexOnPage) {
|
||||
|
||||
TextPageBlock textBlock = null;
|
||||
|
||||
FloatFrequencyCounter lineHeightFrequencyCounter = new FloatFrequencyCounter();
|
||||
FloatFrequencyCounter fontSizeFrequencyCounter = new FloatFrequencyCounter();
|
||||
FloatFrequencyCounter spaceFrequencyCounter = new FloatFrequencyCounter();
|
||||
StringFrequencyCounter fontFrequencyCounter = new StringFrequencyCounter();
|
||||
StringFrequencyCounter styleFrequencyCounter = new StringFrequencyCounter();
|
||||
|
||||
for (TextPositionSequence wordBlock : wordBlockList) {
|
||||
|
||||
lineHeightFrequencyCounter.add(wordBlock.getTextHeight());
|
||||
fontSizeFrequencyCounter.add(wordBlock.getFontSize());
|
||||
spaceFrequencyCounter.add(wordBlock.getSpaceWidth());
|
||||
fontFrequencyCounter.add(wordBlock.getFont());
|
||||
styleFrequencyCounter.add(wordBlock.getFontStyle());
|
||||
|
||||
if (textBlock == null) {
|
||||
textBlock = new TextPageBlock(wordBlock.getMinXDirAdj(),
|
||||
wordBlock.getMaxXDirAdj(),
|
||||
wordBlock.getMinYDirAdj(),
|
||||
wordBlock.getMaxYDirAdj(),
|
||||
wordBlockList,
|
||||
wordBlock.getRotation());
|
||||
} else {
|
||||
TextPageBlock spatialEntity = textBlock.union(wordBlock);
|
||||
textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight());
|
||||
}
|
||||
}
|
||||
|
||||
if (textBlock != null) {
|
||||
textBlock.setMostPopularWordFont(fontFrequencyCounter.getMostPopular());
|
||||
textBlock.setMostPopularWordStyle(styleFrequencyCounter.getMostPopular());
|
||||
textBlock.setMostPopularWordFontSize(fontSizeFrequencyCounter.getMostPopular());
|
||||
textBlock.setMostPopularWordHeight(lineHeightFrequencyCounter.getMostPopular());
|
||||
textBlock.setMostPopularWordSpaceWidth(spaceFrequencyCounter.getMostPopular());
|
||||
textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest());
|
||||
}
|
||||
|
||||
if (textBlock != null && textBlock.getSequences() != null && textBlock.getSequences().stream().map(t -> round(t.getMinYDirAdj(), 3)).collect(toSet()).size() == 1) {
|
||||
textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj));
|
||||
}
|
||||
return textBlock;
|
||||
}
|
||||
|
||||
|
||||
private boolean isSplitByRuling(float minX,
|
||||
float minY,
|
||||
float maxX,
|
||||
float maxY,
|
||||
TextPositionSequence word,
|
||||
List<Ruling> horizontalRulingLines,
|
||||
List<Ruling> verticalRulingLines) {
|
||||
|
||||
return isSplitByRuling(maxX,
|
||||
minY,
|
||||
word.getMinXDirAdj(),
|
||||
word.getMinYDirAdj(),
|
||||
verticalRulingLines,
|
||||
word.getDir().getDegrees(),
|
||||
word.getPageWidth(),
|
||||
word.getPageHeight()) //
|
||||
|| isSplitByRuling(minX,
|
||||
minY,
|
||||
word.getMinXDirAdj(),
|
||||
word.getMaxYDirAdj(),
|
||||
horizontalRulingLines,
|
||||
word.getDir().getDegrees(),
|
||||
word.getPageWidth(),
|
||||
word.getPageHeight()) //
|
||||
|| isSplitByRuling(maxX,
|
||||
minY,
|
||||
word.getMinXDirAdj(),
|
||||
word.getMinYDirAdj(),
|
||||
horizontalRulingLines,
|
||||
word.getDir().getDegrees(),
|
||||
word.getPageWidth(),
|
||||
word.getPageHeight()) //
|
||||
|| isSplitByRuling(minX,
|
||||
minY,
|
||||
word.getMinXDirAdj(),
|
||||
word.getMaxYDirAdj(),
|
||||
verticalRulingLines,
|
||||
word.getDir().getDegrees(),
|
||||
word.getPageWidth(),
|
||||
word.getPageHeight());
|
||||
}
|
||||
|
||||
|
||||
private boolean isSplitByRuling(float previousX2, float previousY1, float currentX1, float currentY1, List<Ruling> rulingLines, float dir, float pageWidth, float pageHeight) {
|
||||
|
||||
for (Ruling ruling : rulingLines) {
|
||||
var line = RulingTextDirAdjustUtil.convertToDirAdj(ruling, dir, pageWidth, pageHeight);
|
||||
if (line.intersectsLine(previousX2, previousY1, currentX1, currentY1)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
private double round(float value, int decimalPoints) {
|
||||
|
||||
var d = Math.pow(10, decimalPoints);
|
||||
return Math.round(value * d) / d;
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,330 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.blockification;
|
||||
|
||||
|
||||
// TODO: figure out, why this fails the build
|
||||
// import static com.knecon.fforesight.service.layoutparser.processor.services.factory.SearchTextWithTextPositionFactory.HEIGHT_PADDING;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.Orientation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
@Service
|
||||
@SuppressWarnings("all")
|
||||
public class TaasBlockificationService {
|
||||
|
||||
private static final float THRESHOLD = 1f;
|
||||
private static final float Y_GAP_SPLIT_HEIGHT_MODIFIER = 1.25f; // multiplied with text height
|
||||
private static final float INTERSECTS_Y_THRESHOLD = 4;// 2 * HEIGHT_PADDING // This is exactly 2 times our position height padding. This is required to find boxes that are visually intersecting.
|
||||
private static final int X_GAP_SPLIT_CONSTANT = 50;
|
||||
public static final int X_ALIGNMENT_THRESHOLD = 1;
|
||||
public static final int NEGATIVE_X_GAP_THRESHOLD = -5;
|
||||
|
||||
private Pattern listIdentifier = Pattern.compile("^(?:(?:[1-9]|1\\d|20|[ivxlc]|[a-z])\\s*(?:[.)]))|\\uF0B7", Pattern.CASE_INSENSITIVE);
|
||||
|
||||
|
||||
/**
|
||||
* This method is building blocks by expanding the minX/maxX and minY/maxY value on each word that is not split by the conditions.
|
||||
* This method must use text direction adjusted postions (DirAdj). Where {0,0} is on the upper left. Never try to change this!
|
||||
* Rulings (Table lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling.
|
||||
*
|
||||
* @param textPositions The words of a page.
|
||||
* @param horizontalRulingLines Horizontal table lines.
|
||||
* @param verticalRulingLines Vertical table lines.
|
||||
* @return ClassificationPage object that contains the Textblock and text statistics.
|
||||
*/
|
||||
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
|
||||
|
||||
List<TextPageBlock> classificationTextBlocks = constructFineGranularTextPageBlocks(textPositions, horizontalRulingLines, verticalRulingLines);
|
||||
classificationTextBlocks = mergeTextPageBlocksAligningX(classificationTextBlocks);
|
||||
classificationTextBlocks = mergeIntersectingTextBlocksUntilConvergence(classificationTextBlocks);
|
||||
|
||||
return new ClassificationPage(new ArrayList<>(classificationTextBlocks.stream().map(classificationTextBlock -> (AbstractPageBlock) classificationTextBlock).toList()));
|
||||
}
|
||||
|
||||
|
||||
private List<TextPageBlock> mergeIntersectingTextBlocksUntilConvergence(List<TextPageBlock> classificationTextBlocks) {
|
||||
|
||||
int currentSize = classificationTextBlocks.size();
|
||||
while (true) {
|
||||
classificationTextBlocks = mergeTextPageBlocksAlmostIntersecting(classificationTextBlocks);
|
||||
if (classificationTextBlocks.size() == currentSize) {
|
||||
break;
|
||||
}
|
||||
currentSize = classificationTextBlocks.size();
|
||||
}
|
||||
return classificationTextBlocks;
|
||||
}
|
||||
|
||||
|
||||
private List<TextPageBlock> mergeTextPageBlocksAligningX(List<TextPageBlock> classificationTextBlocks) {
|
||||
|
||||
if (classificationTextBlocks.isEmpty()) {
|
||||
return new ArrayList<>();
|
||||
}
|
||||
List<List<TextPageBlock>> textBlocksToMerge = new LinkedList<>();
|
||||
List<TextPageBlock> currentTextBlocksToMerge = new LinkedList<>();
|
||||
textBlocksToMerge.add(currentTextBlocksToMerge);
|
||||
TextPageBlock previousTextBlock = null;
|
||||
Float lastLineGap = null;
|
||||
for (TextPageBlock currentTextBlock : classificationTextBlocks) {
|
||||
if (previousTextBlock == null) {
|
||||
currentTextBlocksToMerge.add(currentTextBlock);
|
||||
previousTextBlock = currentTextBlock;
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
Matcher listIdentifierPattern = listIdentifier.matcher(currentTextBlock.getText());
|
||||
boolean isListIdentifier = listIdentifierPattern.find();
|
||||
|
||||
boolean yGap = Math.abs(currentTextBlock.getPdfMaxY() - previousTextBlock.getPdfMinY()) < previousTextBlock.getMostPopularWordHeight() * Y_GAP_SPLIT_HEIGHT_MODIFIER;
|
||||
|
||||
boolean sameFont = previousTextBlock.getMostPopularWordFont().equals(currentTextBlock.getMostPopularWordFont()) && previousTextBlock.getMostPopularWordFontSize() == currentTextBlock.getMostPopularWordFontSize();
|
||||
// boolean yGap = previousTextBlock != null && currentTextBlock.getMinYDirAdj() - maxY > Math.min(word.getHeight(), prev.getHeight()) * Y_GAP_SPLIT_HEIGHT_MODIFIER;
|
||||
|
||||
boolean alignsXRight = Math.abs(currentTextBlock.getPdfMaxX() - previousTextBlock.getPdfMaxX()) < X_ALIGNMENT_THRESHOLD;
|
||||
boolean alignsXLeft = Math.abs(currentTextBlock.getPdfMinX() - previousTextBlock.getPdfMinX()) < X_ALIGNMENT_THRESHOLD;
|
||||
// boolean smallYGap = Math.abs(currentTextBlock.getPdfMaxY() - previousTextBlock.getPdfMinY()) < yGap;
|
||||
if (yGap && sameFont && !isListIdentifier) {
|
||||
currentTextBlocksToMerge.add(currentTextBlock);
|
||||
|
||||
} else {
|
||||
currentTextBlocksToMerge = new LinkedList<>();
|
||||
currentTextBlocksToMerge.add(currentTextBlock);
|
||||
textBlocksToMerge.add(currentTextBlocksToMerge);
|
||||
}
|
||||
previousTextBlock = currentTextBlock;
|
||||
}
|
||||
return textBlocksToMerge.stream().map(TextPageBlock::merge).toList();
|
||||
}
|
||||
|
||||
|
||||
private List<TextPageBlock> mergeTextPageBlocksAlmostIntersecting(List<TextPageBlock> textPageBlocks) {
|
||||
|
||||
Set<TextPageBlock> alreadyMerged = new HashSet<>();
|
||||
List<List<TextPageBlock>> textBlocksToMerge = new LinkedList<>();
|
||||
for (TextPageBlock textPageBlock : textPageBlocks) {
|
||||
if (alreadyMerged.contains(textPageBlock)) {
|
||||
continue;
|
||||
}
|
||||
alreadyMerged.add(textPageBlock);
|
||||
textBlocksToMerge.add(Stream.concat(Stream.of(textPageBlock),
|
||||
textPageBlocks.stream().filter(textPageBlock2 -> textPageBlock.almostIntersects(textPageBlock2, INTERSECTS_Y_THRESHOLD, 0) && !alreadyMerged.contains(textPageBlock2)).peek(alreadyMerged::add))
|
||||
.toList());
|
||||
}
|
||||
return textBlocksToMerge.stream().map(TextPageBlock::merge).toList();
|
||||
}
|
||||
|
||||
|
||||
private void assignOrientations(List<TextPageBlock> classificationTextBlocks) {
|
||||
|
||||
Iterator<TextPageBlock> itty = classificationTextBlocks.iterator();
|
||||
|
||||
TextPageBlock previousLeft = null;
|
||||
TextPageBlock previousRight = null;
|
||||
while (itty.hasNext()) {
|
||||
TextPageBlock block = (TextPageBlock) itty.next();
|
||||
|
||||
if (previousLeft != null && block.getOrientation().equals(Orientation.LEFT)) {
|
||||
if (previousLeft.getMinY() > block.getMinY() && block.getMaxY() + block.getMostPopularWordHeight() > previousLeft.getMinY()) {
|
||||
previousLeft.add(block);
|
||||
itty.remove();
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
if (previousRight != null && block.getOrientation().equals(Orientation.RIGHT)) {
|
||||
if (previousRight.getMinY() > block.getMinY() && block.getMaxY() + block.getMostPopularWordHeight() > previousRight.getMinY()) {
|
||||
previousRight.add(block);
|
||||
itty.remove();
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
if (block.getOrientation().equals(Orientation.LEFT)) {
|
||||
previousLeft = block;
|
||||
} else if (block.getOrientation().equals(Orientation.RIGHT)) {
|
||||
previousRight = block;
|
||||
}
|
||||
}
|
||||
|
||||
itty = classificationTextBlocks.iterator();
|
||||
TextPageBlock previous = null;
|
||||
while (itty.hasNext()) {
|
||||
TextPageBlock block = (TextPageBlock) itty.next();
|
||||
|
||||
if (previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation().equals(Orientation.LEFT) && equalsWithThreshold(
|
||||
block.getMaxY(),
|
||||
previous.getMaxY()) || previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation()
|
||||
.equals(Orientation.RIGHT) && equalsWithThreshold(block.getMaxY(), previous.getMaxY())) {
|
||||
previous.add(block);
|
||||
itty.remove();
|
||||
continue;
|
||||
}
|
||||
|
||||
previous = block;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private List<TextPageBlock> constructFineGranularTextPageBlocks(List<TextPositionSequence> textPositions,
|
||||
List<Ruling> horizontalRulingLines,
|
||||
List<Ruling> verticalRulingLines) {
|
||||
|
||||
int indexOnPage = 0;
|
||||
List<TextPositionSequence> wordClusterToCombine = new ArrayList<>();
|
||||
List<TextPageBlock> classificationTextBlocks = new ArrayList<>();
|
||||
|
||||
float minX = 1000, maxX = 0, minY = 1000, maxY = 0;
|
||||
TextPositionSequence prev = null;
|
||||
// TODO: make static final constant
|
||||
|
||||
|
||||
boolean wasSplitted = false;
|
||||
Float splitX1 = null;
|
||||
for (TextPositionSequence word : textPositions) {
|
||||
|
||||
Matcher listIdentifierPattern = listIdentifier.matcher(word.toString());
|
||||
|
||||
boolean yGap = prev != null && word.getMinYDirAdj() - maxY > Math.min(word.getHeight(), prev.getHeight()) * Y_GAP_SPLIT_HEIGHT_MODIFIER;
|
||||
boolean sameLine = prev != null && equalsWithThreshold(prev.getMinYDirAdj(), word.getMinYDirAdj());
|
||||
boolean positiveXGapInline = prev != null && maxX + X_GAP_SPLIT_CONSTANT < word.getMinXDirAdj() && sameLine;
|
||||
boolean negativeXGap = prev != null && word.getMinXDirAdj() - minX < NEGATIVE_X_GAP_THRESHOLD;
|
||||
boolean startFromTop = prev != null && word.getMinYDirAdj() < prev.getMinYDirAdj() - prev.getTextHeight();
|
||||
boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj();
|
||||
boolean splitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, horizontalRulingLines, verticalRulingLines);
|
||||
boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir());
|
||||
boolean fontChange = prev != null && (!word.getFont().equals(prev.getFont()) || !word.getFontStyle()
|
||||
.equals(prev.getFontStyle()) || word.getFontSize() != prev.getFontSize());
|
||||
boolean newline = prev != null && Math.abs(word.getMinYDirAdj() - prev.getMinYDirAdj()) > word.getHeight();
|
||||
boolean isListIdentifier = listIdentifierPattern.matches();
|
||||
|
||||
if (prev != null && (prev.isParagraphStart() || negativeXGap || positiveXGapInline || yGap || startFromTop || splitByRuling || (newline && (fontChange || isListIdentifier)))) {
|
||||
// if (prev != null && (lineSeparation || startFromTop || splitByX || splitByDir || isSplitByRuling)) {
|
||||
|
||||
Orientation prevOrientation = null;
|
||||
if (!classificationTextBlocks.isEmpty()) {
|
||||
prevOrientation = classificationTextBlocks.get(classificationTextBlocks.size() - X_ALIGNMENT_THRESHOLD).getOrientation();
|
||||
}
|
||||
|
||||
TextPageBlock classificationTextBlock = TextPageBlock.fromTextPositionSequences(wordClusterToCombine);
|
||||
|
||||
classificationTextBlocks.add(classificationTextBlock);
|
||||
wordClusterToCombine = new ArrayList<>();
|
||||
|
||||
if (positiveXGapInline && !splitByRuling) {
|
||||
wasSplitted = true;
|
||||
classificationTextBlock.setOrientation(Orientation.LEFT);
|
||||
splitX1 = word.getMinXDirAdj();
|
||||
} else if (newLineAfterSplit && !splitByRuling) {
|
||||
wasSplitted = false;
|
||||
classificationTextBlock.setOrientation(Orientation.RIGHT);
|
||||
splitX1 = null;
|
||||
} else if (prevOrientation != null && prevOrientation.equals(Orientation.RIGHT) && (yGap || !startFromTop || !positiveXGapInline || !newLineAfterSplit || !splitByRuling)) {
|
||||
classificationTextBlock.setOrientation(Orientation.LEFT);
|
||||
}
|
||||
|
||||
minX = 1000;
|
||||
maxX = 0;
|
||||
minY = 1000;
|
||||
maxY = 0;
|
||||
prev = null;
|
||||
}
|
||||
|
||||
wordClusterToCombine.add(word);
|
||||
|
||||
prev = word;
|
||||
if (word.getMinXDirAdj() < minX) {
|
||||
minX = word.getMinXDirAdj();
|
||||
}
|
||||
if (word.getMaxXDirAdj() > maxX) {
|
||||
maxX = word.getMaxXDirAdj();
|
||||
}
|
||||
if (word.getMinYDirAdj() < minY) {
|
||||
minY = word.getMinYDirAdj();
|
||||
}
|
||||
if (word.getMaxYDirAdj() > maxY) {
|
||||
maxY = word.getMaxYDirAdj();
|
||||
}
|
||||
}
|
||||
|
||||
TextPageBlock classificationTextBlock = TextPageBlock.fromTextPositionSequences(wordClusterToCombine);
|
||||
if (classificationTextBlock != null) {
|
||||
classificationTextBlocks.add(classificationTextBlock);
|
||||
}
|
||||
return classificationTextBlocks;
|
||||
}
|
||||
|
||||
|
||||
private boolean equalsWithThreshold(float f1, float f2) {
|
||||
|
||||
return Math.abs(f1 - f2) < THRESHOLD;
|
||||
}
|
||||
|
||||
|
||||
private boolean isSplitByRuling(float minX,
|
||||
float minY,
|
||||
float maxX,
|
||||
float maxY,
|
||||
TextPositionSequence word,
|
||||
List<Ruling> horizontalRulingLines,
|
||||
List<Ruling> verticalRulingLines) {
|
||||
|
||||
return isSplitByRuling(maxX,
|
||||
minY,
|
||||
word.getMinXDirAdj(),
|
||||
word.getMinYDirAdj(),
|
||||
verticalRulingLines,
|
||||
word.getDir().getDegrees(),
|
||||
word.getPageWidth(),
|
||||
word.getPageHeight()) //
|
||||
|| isSplitByRuling(minX,
|
||||
minY,
|
||||
word.getMinXDirAdj(),
|
||||
word.getMaxYDirAdj(),
|
||||
horizontalRulingLines,
|
||||
word.getDir().getDegrees(),
|
||||
word.getPageWidth(),
|
||||
word.getPageHeight()) //
|
||||
|| isSplitByRuling(maxX,
|
||||
minY,
|
||||
word.getMinXDirAdj(),
|
||||
word.getMinYDirAdj(),
|
||||
horizontalRulingLines,
|
||||
word.getDir().getDegrees(),
|
||||
word.getPageWidth(),
|
||||
word.getPageHeight()) //
|
||||
|| isSplitByRuling(minX,
|
||||
minY,
|
||||
word.getMinXDirAdj(),
|
||||
word.getMaxYDirAdj(),
|
||||
verticalRulingLines,
|
||||
word.getDir().getDegrees(),
|
||||
word.getPageWidth(),
|
||||
word.getPageHeight()); //
|
||||
}
|
||||
|
||||
|
||||
private boolean isSplitByRuling(float previousX2, float previousY1, float currentX1, float currentY1, List<Ruling> rulingLines, float dir, float pageWidth, float pageHeight) {
|
||||
|
||||
for (Ruling ruling : rulingLines) {
|
||||
var line = RulingTextDirAdjustUtil.convertToDirAdj(ruling, dir, pageWidth, pageHeight);
|
||||
if (line.intersectsLine(previousX2, previousY1, currentX1, currentY1)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,114 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.classification;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.BodyTextFrameService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
@Service
@RequiredArgsConstructor
public class TaasClassificationService {

    // NOTE(review): injected but not referenced anywhere in this class — confirm it is still needed.
    private final BodyTextFrameService bodyTextFrameService;


    /**
     * Classifies every text block of every page of the given document in place, using
     * heuristics based on the document-wide font statistics (most popular font size,
     * style and family) collected on the document.
     *
     * @param document document whose pages get their text blocks classified
     */
    public void classifyDocument(ClassificationDocument document) {

        // Font sizes larger than the document's dominant one are headline candidates.
        // NOTE(review): "getHighterThanMostPopular" is the upstream API's spelling.
        List<Float> headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular();

        log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());

        for (ClassificationPage page : document.getPages()) {

            classifyPage(page, document, headlineFontSizes);
        }
    }


    /**
     * Classifies all text blocks of a single page; non-text blocks are left untouched.
     *
     * @param headlineFontSizes font sizes larger than the document's most popular size
     */
    public void classifyPage(ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes) {

        for (AbstractPageBlock textBlock : page.getTextBlocks()) {
            if (textBlock instanceof TextPageBlock) {
                classifyBlock((TextPageBlock) textBlock, page, document, headlineFontSizes);
            }
        }
    }


    /**
     * Assigns a {@link PageBlockType} to one text block. The branch order is significant:
     * the first matching heuristic wins (header/footer before title, title before
     * headlines, headlines before the paragraph variants).
     */
    public void classifyBlock(TextPageBlock textBlock, ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes) {

        var bodyTextFrame = page.getBodyTextFrame();

        // Without a dominant font size none of the heuristics below is meaningful.
        if (document.getFontSizeCounter().getMostPopular() == null) {
            textBlock.setClassification(PageBlockType.OTHER);
            return;
        }
        if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER)
                || PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation())) {
            // Tagged header content, or positioned above the body text frame.
            textBlock.setClassification(PageBlockType.HEADER);
        } else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER)
                || PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation())) {
            // Tagged footer content, or positioned below the body text frame.
            textBlock.setClassification(PageBlockType.FOOTER);
        } else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock,
                document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks()
                .size() == 1)) {
            // Title heuristic: first page with noticeably taller and larger text, or the
            // page's only text block.
            // NOTE(review): getTextHeightCounter().getMostPopular() is not null-checked like
            // the font-size counter above — confirm it can never be null here.
            if (!Pattern.matches("[0-9]+", textBlock.toString())) {
                // A block consisting solely of digits (e.g. a page number) is not a title.
                textBlock.setClassification(PageBlockType.TITLE);
            }
        } else if (textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter()
                .getMostPopular() && PositionUtils.getApproxLineCount(textBlock) < 4.9 && (textBlock.getMostPopularWordStyle().equals("bold") || !document.getFontStyleCounter()
                .getCountPerValue()
                .containsKey("bold") && textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular() + 1) && textBlock.getSequences()
                .get(0)
                .getTextPositions()
                .get(0)
                .getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) {
            // Numbered headline: short block, larger than body text, and bold (or clearly
            // larger when the document contains no bold style at all). The headline level
            // is the 1-based index of the matching size in headlineFontSizes.
            for (int i = 1; i <= headlineFontSizes.size(); i++) {
                if (textBlock.getMostPopularWordFontSize() == headlineFontSizes.get(i - 1)) {
                    textBlock.setClassification(PageBlockType.getHeadlineType(i));
                    document.setHeadlines(true);
                }
            }
        } else if (!textBlock.getText().startsWith("Figure ") && PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordStyle()
                .equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold") && PositionUtils.getApproxLineCount(textBlock) < 2.9 && textBlock.getSequences()
                .get(0)
                .getTextPositions()
                .get(0)
                .getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) {
            // Lowest headline level: short bold text at body font size; figure captions excluded.
            textBlock.setClassification(PageBlockType.getHeadlineType(headlineFontSizes.size() + 1));
            document.setHeadlines(true);
        } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
                .getMostPopular() && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold")) {
            // Bold body-size text that did not qualify as a headline.
            textBlock.setClassification(PageBlockType.PARAGRAPH_BOLD);
        } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFont()
                .equals(document.getFontCounter().getMostPopular()) && textBlock.getMostPopularWordStyle()
                .equals(document.getFontStyleCounter().getMostPopular()) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()) {
            // Matches the document's dominant font family, style and size exactly.
            textBlock.setClassification(PageBlockType.PARAGRAPH);
        } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
                .getMostPopular() && textBlock.getMostPopularWordStyle().equals("italic") && !document.getFontStyleCounter()
                .getMostPopular()
                .equals("italic") && PositionUtils.getApproxLineCount(textBlock) < 2.9) {
            // Short italic text at body font size.
            textBlock.setClassification(PageBlockType.PARAGRAPH_ITALIC);
        } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)) {
            // Inside the body frame but matching no specific rule.
            textBlock.setClassification(PageBlockType.PARAGRAPH_UNKNOWN);
        } else {
            textBlock.setClassification(PageBlockType.OTHER);
        }
    }
}
|
||||
@ -0,0 +1,59 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Character;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Zone;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service.LineBuilderService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service.NearestNeighbourService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service.ReadingOrderService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service.SpacingService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service.ZoneBuilderService;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
public class DocstrumSegmentationService {
|
||||
|
||||
private final NearestNeighbourService nearestNeighbourService;
|
||||
private final SpacingService spacingService;
|
||||
private final LineBuilderService lineBuilderService;
|
||||
private final ZoneBuilderService zoneBuilderService;
|
||||
private final ReadingOrderService readingOrderService;
|
||||
|
||||
|
||||
public List<Zone> segmentPage(List<TextPositionSequence> textPositions, boolean xyOrder) {
|
||||
|
||||
List<Zone> zones = new ArrayList<>();
|
||||
zones.addAll(computeZones(textPositions, TextDirection.ZERO));
|
||||
zones.addAll(computeZones(textPositions, TextDirection.QUARTER_CIRCLE));
|
||||
zones.addAll(computeZones(textPositions, TextDirection.HALF_CIRCLE));
|
||||
zones.addAll(computeZones(textPositions, TextDirection.THREE_QUARTER_CIRCLE));
|
||||
|
||||
return readingOrderService.resolve(zones, xyOrder);
|
||||
}
|
||||
|
||||
|
||||
private List<Zone> computeZones(List<TextPositionSequence> textPositions, TextDirection direction) {
|
||||
|
||||
var positions = textPositions.stream().filter(t -> t.getDir() == direction).map(TextPositionSequence::getTextPositions).flatMap(List::stream).toList();
|
||||
|
||||
var characters = positions.stream().map(Character::new).collect(Collectors.toList());
|
||||
|
||||
nearestNeighbourService.findNearestNeighbors(characters);
|
||||
|
||||
var characterSpacing = spacingService.computeCharacterSpacing(characters);
|
||||
var lineSpacing = Math.min(spacingService.computeLineSpacing(characters), 20);
|
||||
|
||||
var lines = lineBuilderService.buildLines(characters, characterSpacing, lineSpacing);
|
||||
return zoneBuilderService.buildZones(lines, characterSpacing, lineSpacing);
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,25 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model;
|
||||
|
||||
public class AngleFilter {
|
||||
|
||||
protected double lowerAngle;
|
||||
protected double upperAngle;
|
||||
|
||||
|
||||
public AngleFilter(double lowerAngle, double upperAngle) {
|
||||
|
||||
this.lowerAngle = lowerAngle < -Math.PI / 2 ? lowerAngle + Math.PI : lowerAngle;
|
||||
this.upperAngle = upperAngle >= Math.PI / 2 ? upperAngle - Math.PI : upperAngle;
|
||||
}
|
||||
|
||||
|
||||
public boolean matches(Neighbor neighbor) {
|
||||
|
||||
if (lowerAngle <= upperAngle) {
|
||||
return lowerAngle <= neighbor.getAngle() && neighbor.getAngle() < upperAngle;
|
||||
} else {
|
||||
return lowerAngle <= neighbor.getAngle() || neighbor.getAngle() < upperAngle;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,57 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
public abstract class BoundingBox {
|
||||
|
||||
private Rectangle2D bBox;
|
||||
|
||||
|
||||
public double getX() {
|
||||
|
||||
return bBox.getX();
|
||||
}
|
||||
|
||||
|
||||
public double getY() {
|
||||
|
||||
return bBox.getY();
|
||||
}
|
||||
|
||||
|
||||
public double getWidth() {
|
||||
|
||||
return bBox.getWidth();
|
||||
}
|
||||
|
||||
|
||||
public double getHeight() {
|
||||
|
||||
return bBox.getHeight();
|
||||
}
|
||||
|
||||
|
||||
public double getArea() {
|
||||
|
||||
return (bBox.getHeight() * bBox.getWidth());
|
||||
}
|
||||
|
||||
|
||||
public boolean contains(Rectangle2D contained, double tolerance) {
|
||||
|
||||
return bBox.getX() <= contained.getX() + tolerance
|
||||
&& bBox.getY() <= contained.getY() + tolerance
|
||||
&& bBox.getX() + bBox.getWidth() >= contained.getX() + contained.getWidth() - tolerance
|
||||
&& bBox.getY() + bBox.getHeight() >= contained.getY() + contained.getHeight() - tolerance;
|
||||
}
|
||||
|
||||
|
||||
public boolean intersectsY(BoundingBox other) {
|
||||
|
||||
return this.getBBox().getMinY() <= other.getBBox().getMaxY() && this.getBBox().getMaxY() >= other.getBBox().getMinY();
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,85 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
public class Character {
|
||||
|
||||
private final double x;
|
||||
private final double y;
|
||||
private final RedTextPosition textPosition;
|
||||
|
||||
private List<Neighbor> neighbors = new ArrayList<>();
|
||||
|
||||
|
||||
public Character(RedTextPosition chunk) {
|
||||
|
||||
this.x = chunk.getXDirAdj() + chunk.getWidthDirAdj() / 2;
|
||||
this.y = chunk.getYDirAdj() + chunk.getHeightDir() / 2;
|
||||
this.textPosition = chunk;
|
||||
}
|
||||
|
||||
|
||||
public double getHeight() {
|
||||
|
||||
return textPosition.getHeightDir();
|
||||
}
|
||||
|
||||
|
||||
public double distance(Character character) {
|
||||
|
||||
double dx = getX() - character.getX();
|
||||
double dy = getY() - character.getY();
|
||||
return Math.sqrt(dx * dx + dy * dy);
|
||||
}
|
||||
|
||||
|
||||
public double horizontalDistance(Character character) {
|
||||
|
||||
return Math.abs(getX() - character.getX());
|
||||
}
|
||||
|
||||
|
||||
public double verticalDistance(Character character) {
|
||||
|
||||
return Math.abs(getY() - character.getY());
|
||||
}
|
||||
|
||||
|
||||
public double overlappingDistance(Character other) {
|
||||
|
||||
double[] xs = new double[4];
|
||||
double s = Math.sin(-0);
|
||||
double c = Math.cos(-0);
|
||||
xs[0] = c * x - s * y;
|
||||
xs[1] = c * (x + textPosition.getWidthDirAdj()) - s * (y + textPosition.getHeightDir());
|
||||
xs[2] = c * other.x - s * other.y;
|
||||
xs[3] = c * (other.x + other.textPosition.getWidthDirAdj()) - s * (other.y + other.textPosition.getHeightDir());
|
||||
boolean overlapping = xs[1] >= xs[2] && xs[3] >= xs[0];
|
||||
Arrays.sort(xs);
|
||||
return Math.abs(xs[2] - xs[1]) * (overlapping ? 1 : -1);
|
||||
}
|
||||
|
||||
|
||||
public void setNeighbors(List<Neighbor> neighbors) {
|
||||
|
||||
this.neighbors = neighbors;
|
||||
}
|
||||
|
||||
|
||||
public double angle(Character character) {
|
||||
|
||||
if (getX() > character.getX()) {
|
||||
return Math.atan2(getY() - character.getY(), getX() - character.getX());
|
||||
} else {
|
||||
return Math.atan2(character.getY() - getY(), character.getX() - getX());
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,90 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model;
|
||||
|
||||
/**
 * Fixed-resolution histogram over a closed value range, with optional Gaussian
 * smoothing and plateau-aware peak extraction. The range is padded by a small epsilon
 * on both sides so that min and max values fall into valid bins.
 */
public class Histogram {

    private static final double EPSILON = 1.0e-6;
    private final double min;
    private final double resolution;
    private double[] frequencies;


    /**
     * @param minValue   smallest value that will be added
     * @param maxValue   largest value that will be added
     * @param resolution requested bin width; the effective width is adjusted so the
     *                   padded range divides into a whole number of bins
     */
    public Histogram(double minValue, double maxValue, double resolution) {

        this.min = minValue - EPSILON;
        double span = maxValue - minValue + 2 * EPSILON;
        int binCount = Math.max(1, (int) Math.round((maxValue - minValue) / resolution));
        this.resolution = span / binCount;
        this.frequencies = new double[binCount];
    }


    /** Convolves the bin frequencies with the given kernel (edges are truncated). */
    public void kernelSmooth(double[] kernel) {

        double[] smoothed = new double[frequencies.length];
        int shift = (kernel.length - 1) / 2;
        for (int k = 0; k < kernel.length; k++) {
            int from = Math.max(0, k - shift);
            int to = Math.min(frequencies.length, frequencies.length + k - shift);
            for (int bin = from; bin < to; bin++) {
                smoothed[bin - k + shift] += kernel[k] * frequencies[bin];
            }
        }
        frequencies = smoothed;
    }


    /**
     * Builds a normalized Gaussian kernel covering roughly {@code length} value units.
     *
     * @return odd-sized kernel whose entries sum to 1
     */
    public double[] createGaussianKernel(double length, double stdDeviation) {

        int radius = (int) Math.round(length / resolution) / 2;

        int size = 2 * radius + 1;
        double[] kernel = new double[size];
        double sum = 0;
        double b = 2 * (stdDeviation / resolution) * (stdDeviation / resolution);
        double a = 1 / Math.sqrt(Math.PI * b);
        for (int i = 0; i < size; i++) {
            kernel[i] = a * Math.exp(-(i - radius) * (i - radius) / b);
            sum += kernel[i];
        }
        for (int i = 0; i < size; i++) {
            kernel[i] /= sum;
        }
        return kernel;
    }


    /** Convenience wrapper: smooths with a Gaussian kernel of the given window length. */
    public void gaussianSmooth(double windowLength, double stdDeviation) {

        kernelSmooth(createGaussianKernel(windowLength, stdDeviation));
    }


    /**
     * Adds one observation. The value is expected to lie within the range given at
     * construction time; values outside it would index past the frequency array.
     */
    public void add(double value) {

        frequencies[(int) ((value - min) / resolution)] += 1.0;
    }


    /** Number of bins. */
    public int getSize() {

        return frequencies.length;
    }


    /**
     * @return value at the centre of the highest-frequency plateau (consecutive bins
     * whose frequencies differ by less than a small epsilon from the peak bin)
     */
    public double getPeakValue() {

        int peakStart = 0;
        for (int i = 1; i < frequencies.length; i++) {
            if (frequencies[i] > frequencies[peakStart]) {
                peakStart = i;
            }
        }
        final double EPS = 0.0001;
        int peakEnd = peakStart + 1;
        while (peakEnd < frequencies.length && Math.abs(frequencies[peakEnd] - frequencies[peakStart]) < EPS) {
            peakEnd++;
        }
        return ((double) peakStart + peakEnd) / 2 * resolution + min;
    }

}
|
||||
@ -0,0 +1,164 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
public class Line extends BoundingBox {
|
||||
|
||||
private static final double WORD_DISTANCE_MULTIPLIER = 0.18;
|
||||
|
||||
private final double x0;
|
||||
private final double y0;
|
||||
|
||||
private final double x1;
|
||||
private final double y1;
|
||||
|
||||
private final double height;
|
||||
|
||||
private final List<Character> characters;
|
||||
private final List<TextPositionSequence> words = new ArrayList<>();
|
||||
|
||||
|
||||
public Line(List<Character> characters, double wordSpacing) {
|
||||
|
||||
this.characters = characters;
|
||||
|
||||
if (characters.size() >= 2) {
|
||||
// linear regression
|
||||
double sx = 0.0;
|
||||
double sxx = 0.0;
|
||||
double sxy = 0.0;
|
||||
double sy = 0.0;
|
||||
for (Character character : characters) {
|
||||
sx += character.getX();
|
||||
sxx += character.getX() * character.getX();
|
||||
sxy += character.getX() * character.getY();
|
||||
sy += character.getY();
|
||||
}
|
||||
double b = (characters.size() * sxy - sx * sy) / (characters.size() * sxx - sx * sx);
|
||||
double a = (sy - b * sx) / characters.size();
|
||||
|
||||
this.x0 = characters.get(0).getX();
|
||||
this.y0 = a + b * this.x0;
|
||||
this.x1 = characters.get(characters.size() - 1).getX();
|
||||
this.y1 = a + b * this.x1;
|
||||
} else {
|
||||
Character character = characters.get(0);
|
||||
double dx = character.getTextPosition().getWidthDirAdj() / 3;
|
||||
double dy = dx * Math.tan(0);
|
||||
this.x0 = character.getX() - dx;
|
||||
this.x1 = character.getX() + dx;
|
||||
this.y0 = character.getY() - dy;
|
||||
this.y1 = character.getY() + dy;
|
||||
}
|
||||
height = computeHeight();
|
||||
computeWords(wordSpacing * WORD_DISTANCE_MULTIPLIER);
|
||||
buildBBox();
|
||||
}
|
||||
|
||||
|
||||
public double getAngle() {
|
||||
|
||||
return Math.atan2(y1 - y0, x1 - x0);
|
||||
}
|
||||
|
||||
|
||||
public double getLength() {
|
||||
|
||||
return Math.sqrt((x0 - x1) * (x0 - x1) + (y0 - y1) * (y0 - y1));
|
||||
}
|
||||
|
||||
|
||||
private double computeHeight() {
|
||||
|
||||
return characters.stream().map(Character::getHeight).reduce(0d, Double::sum) / characters.size();
|
||||
}
|
||||
|
||||
|
||||
public double angularDifference(Line j) {
|
||||
|
||||
double diff = Math.abs(getAngle() - j.getAngle());
|
||||
if (diff <= Math.PI / 2) {
|
||||
return diff;
|
||||
} else {
|
||||
return Math.PI - diff;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public double horizontalDistance(Line other) {
|
||||
|
||||
double[] xs = new double[4];
|
||||
xs[0] = x0;
|
||||
xs[1] = x1;
|
||||
xs[2] = other.x0;
|
||||
xs[3] = other.x1;
|
||||
boolean overlapping = xs[1] >= xs[2] && xs[3] >= xs[0];
|
||||
Arrays.sort(xs);
|
||||
return Math.abs(xs[2] - xs[1]) * (overlapping ? 1 : -1);
|
||||
}
|
||||
|
||||
|
||||
public double verticalDistance(Line other) {
|
||||
|
||||
double ym = (y0 + y1) / 2;
|
||||
double yn = (other.y0 + other.y1) / 2;
|
||||
return Math.abs(ym - yn) / Math.sqrt(1);
|
||||
}
|
||||
|
||||
|
||||
private void computeWords(double wordSpacing) {
|
||||
|
||||
TextPositionSequence word = new TextPositionSequence();
|
||||
Character previous = null;
|
||||
for (Character current : characters) {
|
||||
if (previous != null) {
|
||||
double dist = current.getTextPosition().getXDirAdj() - previous.getTextPosition().getXDirAdj() - previous.getTextPosition().getWidthDirAdj();
|
||||
if (dist > wordSpacing) {
|
||||
words.add(word);
|
||||
word = new TextPositionSequence();
|
||||
}
|
||||
}
|
||||
word.getTextPositions().add(current.getTextPosition());
|
||||
previous = current;
|
||||
}
|
||||
words.add(word);
|
||||
}
|
||||
|
||||
|
||||
private void buildBBox() {
|
||||
|
||||
double minX = Double.POSITIVE_INFINITY;
|
||||
double minY = Double.POSITIVE_INFINITY;
|
||||
double maxX = Double.NEGATIVE_INFINITY;
|
||||
double maxY = Double.NEGATIVE_INFINITY;
|
||||
|
||||
for (Character character : characters) {
|
||||
|
||||
minX = Math.min(minX, character.getTextPosition().getXDirAdj());
|
||||
minY = Math.min(minY, character.getTextPosition().getYDirAdj());
|
||||
maxX = Math.max(maxX, character.getTextPosition().getXDirAdj() + character.getTextPosition().getWidthDirAdj());
|
||||
maxY = Math.max(maxY, character.getTextPosition().getYDirAdj() + character.getTextPosition().getHeightDir());
|
||||
|
||||
}
|
||||
|
||||
this.setBBox(new Rectangle2D.Double(minX, minY, maxX - minX, maxY - minY));
|
||||
}
|
||||
|
||||
|
||||
public String toString() {
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
words.forEach(word -> sb.append(word.toString()).append(" "));
|
||||
return sb.toString().trim();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -0,0 +1,36 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model;
|
||||
|
||||
import lombok.Getter;
|
||||
|
||||
public class Neighbor {
|
||||
|
||||
@Getter
|
||||
private final double distance;
|
||||
@Getter
|
||||
private final double angle;
|
||||
private final Character originCharacter;
|
||||
@Getter
|
||||
private final Character character;
|
||||
|
||||
|
||||
public Neighbor(Character neighbor, Character origin) {
|
||||
|
||||
this.distance = neighbor.distance(origin);
|
||||
this.angle = neighbor.angle(origin);
|
||||
this.character = neighbor;
|
||||
this.originCharacter = origin;
|
||||
}
|
||||
|
||||
|
||||
public double getHorizontalDistance() {
|
||||
|
||||
return character.horizontalDistance(originCharacter);
|
||||
}
|
||||
|
||||
|
||||
public double getVerticalDistance() {
|
||||
|
||||
return character.verticalDistance(originCharacter);
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,31 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.LinkedHashSet;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
public class UnionFind<T> extends org.jgrapht.alg.util.UnionFind<T> {
|
||||
|
||||
public UnionFind(Set<T> elements) {
|
||||
|
||||
super(elements);
|
||||
}
|
||||
|
||||
|
||||
public Collection<Set<T>> getGroups() {
|
||||
|
||||
Map<T, Set<T>> setRep = new LinkedHashMap<>();
|
||||
for (T t : getParentMap().keySet()) {
|
||||
T representative = find(t);
|
||||
if (!setRep.containsKey(representative)) {
|
||||
setRep.put(representative, new LinkedHashSet<>());
|
||||
}
|
||||
setRep.get(representative).add(t);
|
||||
}
|
||||
|
||||
return setRep.values();
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,51 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
public class Zone extends BoundingBox {
|
||||
|
||||
private List<Line> lines;
|
||||
|
||||
|
||||
@SuppressWarnings("PMD.ConstructorCallsOverridableMethod")
|
||||
public Zone(List<Line> lines) {
|
||||
|
||||
lines.sort(Comparator.comparingDouble(Line::getY));
|
||||
this.lines = lines;
|
||||
buildBBox();
|
||||
}
|
||||
|
||||
|
||||
public void buildBBox() {
|
||||
|
||||
double minX = Double.POSITIVE_INFINITY;
|
||||
double minY = Double.POSITIVE_INFINITY;
|
||||
double maxX = Double.NEGATIVE_INFINITY;
|
||||
double maxY = Double.NEGATIVE_INFINITY;
|
||||
|
||||
for (Line line : lines) {
|
||||
|
||||
minX = Math.min(minX, line.getX());
|
||||
minY = Math.min(minY, line.getY());
|
||||
maxX = Math.max(maxX, line.getX() + line.getWidth());
|
||||
maxY = Math.max(maxY, line.getY() + line.getHeight());
|
||||
|
||||
}
|
||||
|
||||
this.setBBox(new Rectangle2D.Double(minX, minY, maxX - minX, maxY - minY));
|
||||
}
|
||||
|
||||
|
||||
public String toString() {
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
lines.forEach(line -> sb.append(line.toString()).append("\n"));
|
||||
return sb.toString().trim();
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,53 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.AngleFilter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Character;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Line;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.UnionFind;
|
||||
|
||||
@Service
|
||||
public class LineBuilderService {
|
||||
|
||||
private static final double CHARACTER_SPACING_DISTANCE_MULTIPLIER = 3.5;
|
||||
private static final double MAX_VERTICAL_CHARACTER_DISTANCE = 0.67;
|
||||
private static final double ANGLE_TOLERANCE = Math.PI / 6;
|
||||
|
||||
|
||||
public List<Line> buildLines(List<Character> characters, double characterSpacing, double lineSpacing) {
|
||||
|
||||
double maxHorizontalDistance = characterSpacing * CHARACTER_SPACING_DISTANCE_MULTIPLIER;
|
||||
double maxVerticalDistance = lineSpacing * MAX_VERTICAL_CHARACTER_DISTANCE;
|
||||
|
||||
UnionFind<Character> unionFind = new UnionFind<>(new HashSet<>(characters));
|
||||
|
||||
AngleFilter filter = new AngleFilter(-ANGLE_TOLERANCE, ANGLE_TOLERANCE);
|
||||
|
||||
characters.forEach(character -> {
|
||||
character.getNeighbors().forEach(neighbor -> {
|
||||
double x = neighbor.getHorizontalDistance() / maxHorizontalDistance;
|
||||
double y = neighbor.getVerticalDistance() / maxVerticalDistance;
|
||||
if (character.getTextPosition().getDir() == neighbor.getCharacter().getTextPosition().getDir() && filter.matches(neighbor) && Math.pow(x, 2) + Math.pow(y,
|
||||
2) <= 1) {
|
||||
unionFind.union(character, neighbor.getCharacter());
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
List<Line> lines = new ArrayList<>();
|
||||
unionFind.getGroups().forEach(group -> {
|
||||
List<Character> lineCharacters = new ArrayList<>(group);
|
||||
lineCharacters.sort(Comparator.comparingDouble(Character::getX));
|
||||
lines.add(new Line(lineCharacters, characterSpacing));
|
||||
});
|
||||
|
||||
return lines;
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,78 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Character;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Neighbor;
|
||||
|
||||
@Service
|
||||
public class NearestNeighbourService {
|
||||
|
||||
private static final int NUMBER_OF_NEIGHBOURS = 8;
|
||||
private static final double STEP = 16.0;
|
||||
|
||||
|
||||
public void findNearestNeighbors(List<Character> characters) {
|
||||
|
||||
if (characters.isEmpty() || characters.size() == 1) {
|
||||
return;
|
||||
}
|
||||
|
||||
characters.sort(Comparator.comparingDouble(Character::getX));
|
||||
|
||||
int maxNeighborCount = NUMBER_OF_NEIGHBOURS;
|
||||
if (characters.size() <= NUMBER_OF_NEIGHBOURS) {
|
||||
maxNeighborCount = characters.size() - 1;
|
||||
}
|
||||
|
||||
for (int i = 0; i < characters.size(); i++) {
|
||||
|
||||
List<Neighbor> candidates = new ArrayList<>();
|
||||
|
||||
int start = i;
|
||||
int end = i + 1;
|
||||
|
||||
double distance = Double.POSITIVE_INFINITY;
|
||||
|
||||
for (double searchDistance = 0; searchDistance < distance; ) {
|
||||
|
||||
searchDistance += STEP;
|
||||
boolean newCandidatesFound = false;
|
||||
|
||||
while (start > 0 && characters.get(i).getX() - characters.get(start - 1).getX() < searchDistance) {
|
||||
start--;
|
||||
candidates.add(new Neighbor(characters.get(start), characters.get(i)));
|
||||
clearLeastDistant(candidates, maxNeighborCount);
|
||||
newCandidatesFound = true;
|
||||
}
|
||||
|
||||
while (end < characters.size() && characters.get(end).getX() - characters.get(i).getX() < searchDistance) {
|
||||
candidates.add(new Neighbor(characters.get(end), characters.get(i)));
|
||||
clearLeastDistant(candidates, maxNeighborCount);
|
||||
end++;
|
||||
newCandidatesFound = true;
|
||||
}
|
||||
|
||||
if (newCandidatesFound && candidates.size() >= maxNeighborCount) {
|
||||
distance = candidates.get(maxNeighborCount - 1).getDistance();
|
||||
}
|
||||
}
|
||||
clearLeastDistant(candidates, maxNeighborCount);
|
||||
characters.get(i).setNeighbors(new ArrayList<>(candidates));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void clearLeastDistant(List<Neighbor> candidates, int maxNeighborCount) {
|
||||
|
||||
if (candidates.size() > maxNeighborCount) {
|
||||
candidates.sort(Comparator.comparingDouble(Neighbor::getDistance));
|
||||
candidates.remove(candidates.remove(candidates.size() - 1));
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,165 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.ListIterator;
|
||||
import java.util.Map;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.BoundingBox;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Zone;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.utils.DoubleUtils;
|
||||
|
||||
@Service
|
||||
public class ReadingOrderService {
|
||||
|
||||
private static final double THRESHOLD = 5;
|
||||
public static final double MULTI_COLUMN_DETECTION_THRESHOLD = 1.5;
|
||||
|
||||
|
||||
public List<Zone> resolve(List<Zone> zones, boolean xyReadingOrder) {
|
||||
|
||||
if (zones.isEmpty() || zones.size() == 1) {
|
||||
return zones;
|
||||
}
|
||||
|
||||
if (xyReadingOrder) {
|
||||
return resolveSingleColumnReadingOrder(zones);
|
||||
}
|
||||
|
||||
Map<Long, Integer> histogram = new HashMap<>();
|
||||
for (Zone zone : zones) {
|
||||
long minY = Math.round(zone.getBBox().getMinY());
|
||||
long maxY = Math.round(zone.getBBox().getMaxY());
|
||||
for (long i = minY; i <= maxY; i++) {
|
||||
histogram.put(i, histogram.getOrDefault(i, 0) + 1);
|
||||
}
|
||||
}
|
||||
|
||||
if (histogram.values().stream().mapToInt(Integer::intValue).average().orElse(1) < MULTI_COLUMN_DETECTION_THRESHOLD) {
|
||||
return resolveSingleColumnReadingOrder(zones);
|
||||
} else {
|
||||
|
||||
return resolveMultiColumnReadingOder(zones);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
private static List<Zone> resolveSingleColumnReadingOrder(List<Zone> zones) {
|
||||
|
||||
zones.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
|
||||
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
|
||||
return zones;
|
||||
}
|
||||
|
||||
|
||||
private List<Zone> resolveMultiColumnReadingOder(List<Zone> zones) {
|
||||
|
||||
// Simple reading order resolver for multi column page layout as described here : https://pub.towardsai.net/advanced-rag-02-unveiling-pdf-parsing-b84ae866344e
|
||||
// TODO implement a more fancy reading order resolver see https://github.com/BobLd/DocumentLayoutAnalysis/blob/master/README.md#reading-order
|
||||
|
||||
double minX = Double.POSITIVE_INFINITY;
|
||||
double maxX = Double.NEGATIVE_INFINITY;
|
||||
|
||||
for (Zone zone : zones) {
|
||||
if (zone.getX() < minX) {
|
||||
minX = zone.getX();
|
||||
}
|
||||
if (zone.getX() + zone.getWidth() > maxX) {
|
||||
maxX = zone.getX() + zone.getWidth();
|
||||
}
|
||||
}
|
||||
|
||||
double midLineXCoordinate = (minX + maxX) / 2;
|
||||
|
||||
List<Zone> leftOf = new ArrayList<>();
|
||||
List<Zone> rightOf = new ArrayList<>();
|
||||
List<Zone> middle = new ArrayList<>();
|
||||
for (Zone zone : zones) {
|
||||
if (zone.getX() < midLineXCoordinate && zone.getX() + zone.getWidth() < midLineXCoordinate) {
|
||||
leftOf.add(zone);
|
||||
} else if (zone.getX() > midLineXCoordinate && zone.getX() + zone.getWidth() > midLineXCoordinate) {
|
||||
rightOf.add(zone);
|
||||
} else {
|
||||
middle.add(zone);
|
||||
}
|
||||
}
|
||||
|
||||
leftOf.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
|
||||
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
|
||||
|
||||
rightOf.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
|
||||
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
|
||||
|
||||
middle.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
|
||||
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
|
||||
|
||||
List<Zone> leftNotIntersecting = new ArrayList<>();
|
||||
for (Zone leftZone : leftOf) {
|
||||
boolean intersects = false;
|
||||
for (Zone rightZone : rightOf) {
|
||||
if (leftZone.intersectsY(rightZone)) {
|
||||
intersects = true;
|
||||
break;
|
||||
}
|
||||
// early stopping
|
||||
if (rightZone.getBBox().getMinY() > leftZone.getBBox().getMaxY()) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!intersects) {
|
||||
leftNotIntersecting.add(leftZone);
|
||||
}
|
||||
}
|
||||
|
||||
List<Zone> rightNotIntersecting = new ArrayList<>();
|
||||
for (Zone rightZone : rightOf) {
|
||||
boolean intersects = false;
|
||||
for (Zone leftZone : leftOf) {
|
||||
if (rightZone.intersectsY(leftZone)) {
|
||||
intersects = true;
|
||||
break;
|
||||
}
|
||||
// early stopping
|
||||
if (leftZone.getBBox().getMinY() > rightZone.getBBox().getMaxY()) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!intersects) {
|
||||
rightNotIntersecting.add(rightZone);
|
||||
}
|
||||
}
|
||||
|
||||
leftOf.removeAll(leftNotIntersecting);
|
||||
rightOf.removeAll(rightNotIntersecting);
|
||||
|
||||
middle.addAll(leftNotIntersecting);
|
||||
middle.addAll(rightNotIntersecting);
|
||||
|
||||
List<Zone> sortedZones = new ArrayList<>();
|
||||
sortedZones.addAll(leftOf);
|
||||
sortedZones.addAll(rightOf);
|
||||
|
||||
ListIterator<Zone> itty = middle.listIterator();
|
||||
|
||||
while (itty.hasNext()) {
|
||||
Zone current = itty.next();
|
||||
for (int i = 0; i < sortedZones.size(); i++) {
|
||||
if (current.getY() < sortedZones.get(i).getY()) {
|
||||
sortedZones.add(i, current);
|
||||
itty.remove();
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
sortedZones.addAll(middle);
|
||||
|
||||
return sortedZones;
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,56 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.AngleFilter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Character;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Histogram;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Neighbor;
|
||||
|
||||
@Service
|
||||
public class SpacingService {
|
||||
|
||||
private static final double SPACING_HISTOGRAM_RESOLUTION = 0.5;
|
||||
private static final double SPACING_HISTOGRAM_SMOOTHING_LENGTH = 2.5;
|
||||
private static final double SPACING_HIST_SMOOTHING_STANDARD_DEVIATION = 0.5;
|
||||
private static final double ANGLE_TOLERANCE = Math.PI / 6;
|
||||
|
||||
|
||||
public double computeCharacterSpacing(List<Character> characters) {
|
||||
|
||||
return computeSpacing(characters, 0);
|
||||
}
|
||||
|
||||
|
||||
public double computeLineSpacing(List<Character> characters) {
|
||||
|
||||
return computeSpacing(characters, Math.PI / 2);
|
||||
}
|
||||
|
||||
|
||||
private double computeSpacing(List<Character> characters, double angle) {
|
||||
|
||||
double maxDistance = Double.NEGATIVE_INFINITY;
|
||||
|
||||
for (Character character : characters) {
|
||||
for (Neighbor neighbor : character.getNeighbors()) {
|
||||
maxDistance = Math.max(maxDistance, neighbor.getDistance());
|
||||
}
|
||||
}
|
||||
Histogram histogram = new Histogram(0, maxDistance, SPACING_HISTOGRAM_RESOLUTION);
|
||||
AngleFilter angleFilter = new AngleFilter(angle - ANGLE_TOLERANCE, angle + ANGLE_TOLERANCE);
|
||||
for (Character character : characters) {
|
||||
for (Neighbor neighbor : character.getNeighbors()) {
|
||||
if (angleFilter.matches(neighbor)) {
|
||||
histogram.add(neighbor.getDistance());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
histogram.gaussianSmooth(SPACING_HISTOGRAM_SMOOTHING_LENGTH, SPACING_HIST_SMOOTHING_STANDARD_DEVIATION);
|
||||
return histogram.getPeakValue();
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,152 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Character;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Line;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.UnionFind;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Zone;
|
||||
|
||||
@Service
|
||||
public class ZoneBuilderService {
|
||||
|
||||
private static final double MIN_HORIZONTAL_DISTANCE_MULTIPLIER = -0.5;
|
||||
private static final double MAX_VERTICAL_DISTANCE_MULTIPLIER = 1.2;
|
||||
|
||||
private static final double MIN_HORIZONTAL_MERGE_DISTANCE_MULTIPLIER = -3.0;
|
||||
|
||||
private static final double MAX_VERTICAL_MERGE_DISTANCE_MULTIPLIER = 0.5;
|
||||
|
||||
private static final double MIN_LINE_SIZE_SCALE = 0.9;
|
||||
|
||||
private static final double MAX_LINE_SIZE_SCALE = 2.5;
|
||||
|
||||
private static final double ANGLE_TOLERANCE = Math.PI / 6;
|
||||
|
||||
private static final int MAX_ZONES = 300;
|
||||
|
||||
private static final double MAX_VERTICAL_MERGE_DISTANCE = 0.5;
|
||||
|
||||
|
||||
public List<Zone> buildZones(List<Line> lines, double characterSpacing, double lineSpacing) {
|
||||
|
||||
double minHorizontalDistance = characterSpacing * MIN_HORIZONTAL_DISTANCE_MULTIPLIER;
|
||||
double maxVerticalDistance = lineSpacing * MAX_VERTICAL_DISTANCE_MULTIPLIER;
|
||||
double minHorizontalMergeDistance = characterSpacing * MIN_HORIZONTAL_MERGE_DISTANCE_MULTIPLIER;
|
||||
double maxVerticalMergeDistance = lineSpacing * MAX_VERTICAL_MERGE_DISTANCE_MULTIPLIER;
|
||||
|
||||
UnionFind<Line> unionFind = new UnionFind<>(new HashSet<>(lines));
|
||||
|
||||
double meanHeight = calculateMeanHeight(lines);
|
||||
|
||||
lines.forEach(outerLine -> //
|
||||
lines.forEach(innerLine -> {
|
||||
|
||||
double scale = Math.min(outerLine.getHeight(), innerLine.getHeight()) / meanHeight;
|
||||
scale = Math.max(MIN_LINE_SIZE_SCALE, Math.min(scale, MAX_LINE_SIZE_SCALE));
|
||||
|
||||
if (!unionFind.inSameSet(outerLine, innerLine) && outerLine.angularDifference(innerLine) <= ANGLE_TOLERANCE) {
|
||||
|
||||
double horizontalDistance = outerLine.horizontalDistance(innerLine) / scale;
|
||||
double verticalDistance = outerLine.verticalDistance(innerLine) / scale;
|
||||
|
||||
if (minHorizontalDistance <= horizontalDistance && verticalDistance <= maxVerticalDistance //
|
||||
|| minHorizontalMergeDistance <= horizontalDistance && verticalDistance <= maxVerticalMergeDistance) {
|
||||
unionFind.union(outerLine, innerLine);
|
||||
}
|
||||
}
|
||||
}));
|
||||
|
||||
List<Zone> zones = new ArrayList<>();
|
||||
unionFind.getGroups().forEach(group -> {
|
||||
zones.add(new Zone(new ArrayList<>(group)));
|
||||
});
|
||||
|
||||
if (zones.size() > MAX_ZONES) {
|
||||
List<Line> oneZoneLines = new ArrayList<>();
|
||||
for (Zone zone : zones) {
|
||||
oneZoneLines.addAll(zone.getLines());
|
||||
}
|
||||
return List.of(mergeLinesInZone(oneZoneLines, characterSpacing, lineSpacing));
|
||||
}
|
||||
|
||||
return zones;
|
||||
}
|
||||
|
||||
|
||||
private double calculateMeanHeight(List<Line> lines) {
|
||||
|
||||
double meanHeight = 0.0;
|
||||
double weights = 0.0;
|
||||
for (Line line : lines) {
|
||||
double weight = line.getLength();
|
||||
meanHeight += line.getHeight() * weight;
|
||||
weights += weight;
|
||||
}
|
||||
meanHeight /= weights;
|
||||
return meanHeight;
|
||||
}
|
||||
|
||||
|
||||
private Zone mergeLinesInZone(List<Line> lines, double characterSpacing, double lineSpacing) {
|
||||
|
||||
double maxHorizontalDistance = 0;
|
||||
double minVerticalDistance = 0;
|
||||
double maxVerticalDistance = lineSpacing * MAX_VERTICAL_MERGE_DISTANCE;
|
||||
|
||||
UnionFind<Line> unionFind = new UnionFind<>(new HashSet<>(lines));
|
||||
|
||||
lines.forEach(outer -> {
|
||||
|
||||
lines.forEach(inner -> {
|
||||
if (inner != outer) {
|
||||
|
||||
double horizontalDistance = outer.horizontalDistance(inner);
|
||||
double verticalDistance = outer.verticalDistance(inner);
|
||||
|
||||
if (horizontalDistance <= maxHorizontalDistance && minVerticalDistance <= verticalDistance && verticalDistance <= maxVerticalDistance) {
|
||||
unionFind.union(outer, inner);
|
||||
} else if (minVerticalDistance <= verticalDistance && verticalDistance <= maxVerticalDistance && Math.abs(horizontalDistance - Math.min(outer.getLength(),
|
||||
inner.getLength())) < 0.1) {
|
||||
boolean characterOverlap = false;
|
||||
int overlappingCount = 0;
|
||||
for (Character outerCharacter : outer.getCharacters()) {
|
||||
for (Character innerCharacter : inner.getCharacters()) {
|
||||
double characterOverlapDistance = outerCharacter.overlappingDistance(innerCharacter);
|
||||
if (characterOverlapDistance > 2) {
|
||||
characterOverlap = true;
|
||||
}
|
||||
if (characterOverlapDistance > 0) {
|
||||
overlappingCount++;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!characterOverlap && overlappingCount <= 2) {
|
||||
unionFind.union(outer, inner);
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
List<Line> outputZone = new ArrayList<>();
|
||||
for (Set<Line> group : unionFind.getGroups()) {
|
||||
List<Character> characters = new ArrayList<>();
|
||||
for (Line line : group) {
|
||||
characters.addAll(line.getCharacters());
|
||||
}
|
||||
characters.sort(Comparator.comparingDouble(Character::getX));
|
||||
|
||||
outputZone.add(new Line(characters, characterSpacing));
|
||||
}
|
||||
|
||||
return new Zone(outputZone);
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,15 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.utils;
|
||||
|
||||
public final class DoubleUtils {

    private DoubleUtils() {
        // utility class, not meant to be instantiated
    }


    /**
     * Compares two doubles with a tolerance: both values are snapped to the nearest
     * multiple of {@code precision} before being compared, so values closer together than
     * roughly half a precision step compare as equal.
     *
     * @param d1        first value
     * @param d2        second value
     * @param precision bucket size; 0 falls back to a step of 1, a negative value is
     *                  treated like its absolute value
     * @return a negative integer, zero or a positive integer, analogous to
     *         {@link Double#compare(double, double)}; NaN values are delegated to
     *         {@code Double.compare} (NaN sorts greater than any number)
     */
    public static int compareDouble(double d1, double d2, double precision) {

        if (Double.isNaN(d1) || Double.isNaN(d2)) {
            return Double.compare(d1, d2);
        }
        // Hoisted and normalized: the original evaluated the zero-check ternary twice, and
        // a negative precision would have flipped the sign of both quotients and thereby
        // inverted the comparison result.
        double step = Math.abs(precision);
        if (step == 0) {
            step = 1;
        }
        return Long.compare(Math.round(d1 / step), Math.round(d2 / step));
    }

}
|
||||
@ -13,8 +13,10 @@ import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.NoSuchElementException;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationFooter;
|
||||
@ -22,6 +24,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.DuplicatedParagraph;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Footer;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.GenericSemanticNode;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Header;
|
||||
@ -46,14 +49,14 @@ import lombok.experimental.UtilityClass;
|
||||
@UtilityClass
|
||||
public class DocumentGraphFactory {
|
||||
|
||||
public Document buildDocumentGraph(ClassificationDocument document) {
|
||||
public Document buildDocumentGraph(LayoutParsingType layoutParsingType, ClassificationDocument document) {
|
||||
|
||||
Document documentGraph = new Document();
|
||||
Context context = new Context(documentGraph);
|
||||
|
||||
document.getPages().forEach(context::buildAndAddPageWithCounter);
|
||||
document.getSections().stream().flatMap(section -> section.getImages().stream()).forEach(image -> context.getImages().add(image));
|
||||
addSections(document, context);
|
||||
addSections(layoutParsingType, document, context);
|
||||
addHeaderAndFooterToEachPage(document, context);
|
||||
|
||||
documentGraph.setNumberOfPages(context.pages.size());
|
||||
@ -64,9 +67,9 @@ public class DocumentGraphFactory {
|
||||
}
|
||||
|
||||
|
||||
private void addSections(ClassificationDocument document, Context context) {
|
||||
private void addSections(LayoutParsingType layoutParsingType, ClassificationDocument document, Context context) {
|
||||
|
||||
document.getSections().forEach(section -> SectionNodeFactory.addSection(null, section.getNonEmptyPageBlocks(), section.getImages(), context));
|
||||
document.getSections().forEach(section -> SectionNodeFactory.addSection(layoutParsingType, null, section.getNonEmptyPageBlocks(), section.getImages(), context));
|
||||
}
|
||||
|
||||
|
||||
@ -77,6 +80,8 @@ public class DocumentGraphFactory {
|
||||
GenericSemanticNode node;
|
||||
if (originalTextBlock.isHeadline()) {
|
||||
node = Headline.builder().documentTree(context.getDocumentTree()).build();
|
||||
} else if (originalTextBlock.isToDuplicate()) {
|
||||
node = DuplicatedParagraph.builder().documentTree(context.getDocumentTree()).build();
|
||||
} else {
|
||||
node = Paragraph.builder().documentTree(context.getDocumentTree()).build();
|
||||
}
|
||||
@ -86,7 +91,16 @@ public class DocumentGraphFactory {
|
||||
List<TextPageBlock> textBlocks = new ArrayList<>();
|
||||
textBlocks.add(originalTextBlock);
|
||||
textBlocks.addAll(textBlocksToMerge);
|
||||
|
||||
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(textBlocks), node, context, page);
|
||||
|
||||
if (node instanceof DuplicatedParagraph duplicatedParagraph) {
|
||||
AtomicTextBlock unsortedTextBlock = context.textBlockFactory.buildAtomicTextBlock(textBlocks.stream()
|
||||
.flatMap(tb -> tb.getSequences().stream())
|
||||
.collect(Collectors.toList()), node, context, page);
|
||||
duplicatedParagraph.setUnsortedLeafTextBlock(unsortedTextBlock);
|
||||
}
|
||||
|
||||
List<Integer> treeId = context.documentTree.createNewChildEntryAndReturnId(parentNode, node);
|
||||
node.setLeafTextBlock(textBlock);
|
||||
node.setTreeId(treeId);
|
||||
|
||||
@ -4,19 +4,21 @@ import static java.lang.String.format;
|
||||
import static java.util.Collections.emptyList;
|
||||
import static java.util.stream.Collectors.groupingBy;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.GenericSemanticNode;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.TableMergingUtility;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
@ -24,7 +26,11 @@ import lombok.experimental.UtilityClass;
|
||||
@UtilityClass
|
||||
public class SectionNodeFactory {
|
||||
|
||||
public void addSection(GenericSemanticNode parentNode, List<AbstractPageBlock> pageBlocks, List<ClassifiedImage> images, DocumentGraphFactory.Context context) {
|
||||
public void addSection(LayoutParsingType layoutParsingType,
|
||||
GenericSemanticNode parentNode,
|
||||
List<AbstractPageBlock> pageBlocks,
|
||||
List<ClassifiedImage> images,
|
||||
DocumentGraphFactory.Context context) {
|
||||
|
||||
if (pageBlocks.isEmpty()) {
|
||||
return;
|
||||
@ -37,11 +43,11 @@ public class SectionNodeFactory {
|
||||
|
||||
section.setTreeId(getTreeId(parentNode, context, section));
|
||||
|
||||
addFirstHeadlineDirectlyToSection(pageBlocks, context, section);
|
||||
addFirstHeadlineDirectlyToSection(layoutParsingType, pageBlocks, context, section);
|
||||
if (containsTablesAndTextBlocks(pageBlocks)) {
|
||||
splitPageBlocksIntoSubSections(pageBlocks).forEach(subSectionPageBlocks -> addSection(section, subSectionPageBlocks, emptyList(), context));
|
||||
splitPageBlocksIntoSubSections(pageBlocks).forEach(subSectionPageBlocks -> addSection(layoutParsingType, section, subSectionPageBlocks, emptyList(), context));
|
||||
} else {
|
||||
addTablesAndParagraphsAndHeadlinesToSection(pageBlocks, context, section);
|
||||
addTablesAndParagraphsAndHeadlinesToSection(layoutParsingType, pageBlocks, context, section);
|
||||
}
|
||||
|
||||
images.stream().distinct().forEach(image -> DocumentGraphFactory.addImage(section, image, context));
|
||||
@ -58,16 +64,19 @@ public class SectionNodeFactory {
|
||||
}
|
||||
|
||||
|
||||
private void addFirstHeadlineDirectlyToSection(List<AbstractPageBlock> pageBlocks, DocumentGraphFactory.Context context, Section section) {
|
||||
private void addFirstHeadlineDirectlyToSection(LayoutParsingType layoutParsingType, List<AbstractPageBlock> pageBlocks, DocumentGraphFactory.Context context, Section section) {
|
||||
|
||||
if (pageBlocks.get(0).isHeadline()) {
|
||||
addTablesAndParagraphsAndHeadlinesToSection(List.of(pageBlocks.get(0)), context, section);
|
||||
addTablesAndParagraphsAndHeadlinesToSection(layoutParsingType, List.of(pageBlocks.get(0)), context, section);
|
||||
pageBlocks.remove(0);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void addTablesAndParagraphsAndHeadlinesToSection(List<AbstractPageBlock> pageBlocks, DocumentGraphFactory.Context context, Section section) {
|
||||
private void addTablesAndParagraphsAndHeadlinesToSection(LayoutParsingType layoutParsingType,
|
||||
List<AbstractPageBlock> pageBlocks,
|
||||
DocumentGraphFactory.Context context,
|
||||
Section section) {
|
||||
|
||||
Set<AbstractPageBlock> alreadyMerged = new HashSet<>();
|
||||
List<AbstractPageBlock> remainingBlocks = new LinkedList<>(pageBlocks);
|
||||
@ -80,13 +89,23 @@ public class SectionNodeFactory {
|
||||
remainingBlocks.removeAll(alreadyMerged);
|
||||
|
||||
if (abstractPageBlock instanceof TextPageBlock) {
|
||||
List<TextPageBlock> textBlocks = findTextBlocksWithSameClassificationAndAlignsY((TextPageBlock) abstractPageBlock, remainingBlocks);
|
||||
alreadyMerged.addAll(textBlocks);
|
||||
DocumentGraphFactory.addParagraphOrHeadline(section, (TextPageBlock) abstractPageBlock, context, textBlocks);
|
||||
|
||||
switch (layoutParsingType) {
|
||||
case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> {
|
||||
alreadyMerged.add(abstractPageBlock);
|
||||
remainingBlocks.remove(abstractPageBlock);
|
||||
DocumentGraphFactory.addParagraphOrHeadline(section, (TextPageBlock) abstractPageBlock, context, new ArrayList<>());
|
||||
}
|
||||
default -> {
|
||||
List<TextPageBlock> textBlocks = findTextBlocksWithSameClassificationAndAlignsY((TextPageBlock) abstractPageBlock, remainingBlocks);
|
||||
alreadyMerged.addAll(textBlocks);
|
||||
DocumentGraphFactory.addParagraphOrHeadline(section, (TextPageBlock) abstractPageBlock, context, textBlocks);
|
||||
}
|
||||
}
|
||||
} else if (abstractPageBlock instanceof TablePageBlock tablePageBlock) {
|
||||
List<TablePageBlock> tablesToMerge = TableMergingUtility.findConsecutiveTablesWithSameColCountAndSameHeaders(tablePageBlock, remainingBlocks);
|
||||
alreadyMerged.addAll(tablesToMerge);
|
||||
TableNodeFactory.addTable(section, tablesToMerge, context);
|
||||
TableNodeFactory.addTable(layoutParsingType, section, tablesToMerge, context);
|
||||
} else {
|
||||
throw new RuntimeException(format("Unhandled AbstractPageBlockType %s!", abstractPageBlock.getClass()));
|
||||
}
|
||||
@ -171,6 +190,7 @@ public class SectionNodeFactory {
|
||||
.filter(abstractTextContainer -> abstractTextContainer.intersectsY(atc))
|
||||
.map(abstractTextContainer -> (TextPageBlock) abstractTextContainer)
|
||||
.filter(abstractTextContainer -> abstractTextContainer.getDir() == atc.getDir())
|
||||
.filter(abstractTextContainer -> !abstractTextContainer.isToDuplicate())
|
||||
.toList();
|
||||
}
|
||||
|
||||
|
||||
@ -7,16 +7,17 @@ import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.GenericSemanticNode;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionOperations;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
@ -27,7 +28,7 @@ public class TableNodeFactory {
|
||||
public final double TABLE_CELL_MERGE_CONTENTS_SIZE_THRESHOLD = 0.05;
|
||||
|
||||
|
||||
public void addTable(GenericSemanticNode parentNode, List<TablePageBlock> tablesToMerge, DocumentGraphFactory.Context context) {
|
||||
public void addTable(LayoutParsingType layoutParsingType, GenericSemanticNode parentNode, List<TablePageBlock> tablesToMerge, DocumentGraphFactory.Context context) {
|
||||
|
||||
setPageNumberInCells(tablesToMerge);
|
||||
Set<Page> pages = tablesToMerge.stream().map(AbstractPageBlock::getPage).map(context::getPage).collect(Collectors.toSet());
|
||||
@ -43,7 +44,7 @@ public class TableNodeFactory {
|
||||
|
||||
List<Integer> treeId = context.getDocumentTree().createNewChildEntryAndReturnId(parentNode, table);
|
||||
table.setTreeId(treeId);
|
||||
addTableCells(mergedRows, table, context);
|
||||
addTableCells(layoutParsingType, mergedRows, table, context);
|
||||
|
||||
ifTableHasNoHeadersSetFirstRowAsHeaders(table);
|
||||
}
|
||||
@ -88,18 +89,18 @@ public class TableNodeFactory {
|
||||
}
|
||||
|
||||
|
||||
private void addTableCells(List<List<Cell>> rows, Table table, DocumentGraphFactory.Context context) {
|
||||
private void addTableCells(LayoutParsingType layoutParsingType, List<List<Cell>> rows, Table table, DocumentGraphFactory.Context context) {
|
||||
|
||||
for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) {
|
||||
for (int colIndex = 0; colIndex < rows.get(rowIndex).size(); colIndex++) {
|
||||
addTableCell(rows.get(rowIndex).get(colIndex), rowIndex, colIndex, table, context);
|
||||
addTableCell(layoutParsingType, rows.get(rowIndex).get(colIndex), rowIndex, colIndex, table, context);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SuppressWarnings("PMD.UnusedPrivateMethod") // PMD actually flags this wrong
|
||||
private void addTableCell(Cell cell, int rowIndex, int colIndex, Table tableNode, DocumentGraphFactory.Context context) {
|
||||
private void addTableCell(LayoutParsingType layoutParsingType, Cell cell, int rowIndex, int colIndex, Table tableNode, DocumentGraphFactory.Context context) {
|
||||
|
||||
Page page = context.getPage(cell.getPageNumber());
|
||||
|
||||
@ -116,7 +117,7 @@ public class TableNodeFactory {
|
||||
textBlock = context.getTextBlockFactory().buildAtomicTextBlock(cell.getTextBlocks().get(0).getSequences(), tableCell, context, page);
|
||||
tableCell.setLeafTextBlock(textBlock);
|
||||
} else if (firstTextBlockIsHeadline(cell)) {
|
||||
SectionNodeFactory.addSection(tableCell, cell.getTextBlocks().stream().map(tb -> (AbstractPageBlock) tb).toList(), emptyList(), context);
|
||||
SectionNodeFactory.addSection(layoutParsingType, tableCell, cell.getTextBlocks().stream().map(tb -> (AbstractPageBlock) tb).toList(), emptyList(), context);
|
||||
} else if (cellAreaIsSmallerThanPageAreaTimesThreshold(cell, page)) {
|
||||
List<TextPositionSequence> sequences = TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(cell.getTextBlocks());
|
||||
textBlock = context.getTextBlockFactory().buildAtomicTextBlock(sequences, tableCell, context, page);
|
||||
|
||||
@ -8,8 +8,6 @@ import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import javax.xml.parsers.DocumentBuilder;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPage;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData;
|
||||
@ -18,6 +16,7 @@ import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.Do
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.DuplicatedParagraph;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Image;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
|
||||
@ -33,27 +32,20 @@ public class DocumentDataMapper {
|
||||
public DocumentData toDocumentData(Document document) {
|
||||
|
||||
List<DocumentTextData> documentTextData = document.streamTerminalTextBlocksInOrder()
|
||||
.flatMap(textBlock -> textBlock.getAtomicTextBlocks()
|
||||
.stream())
|
||||
.flatMap(textBlock -> textBlock.getAtomicTextBlocks().stream())
|
||||
.distinct()
|
||||
.map(DocumentDataMapper::toAtomicTextBlockData)
|
||||
.toList();
|
||||
|
||||
List<DocumentPositionData> atomicPositionBlockData = document.streamTerminalTextBlocksInOrder()
|
||||
.flatMap(textBlock -> textBlock.getAtomicTextBlocks()
|
||||
.stream())
|
||||
.flatMap(textBlock -> textBlock.getAtomicTextBlocks().stream())
|
||||
.distinct()
|
||||
.map(DocumentDataMapper::toAtomicPositionBlockData)
|
||||
.toList();
|
||||
|
||||
Set<Long> nonEmptyTextBlocks = documentTextData.stream()
|
||||
.mapToLong(DocumentTextData::getId).boxed()
|
||||
.collect(Collectors.toSet());
|
||||
Set<Long> nonEmptyTextBlocks = documentTextData.stream().mapToLong(DocumentTextData::getId).boxed().collect(Collectors.toSet());
|
||||
|
||||
List<DocumentPage> documentPageData = document.getPages()
|
||||
.stream()
|
||||
.map(DocumentDataMapper::toPageData)
|
||||
.toList();
|
||||
List<DocumentPage> documentPageData = document.getPages().stream().map(DocumentDataMapper::toPageData).toList();
|
||||
DocumentStructure tableOfContentsData = toDocumentTreeData(document.getDocumentTree());
|
||||
return DocumentData.builder()
|
||||
.documentTextData(documentTextData.toArray(new DocumentTextData[0]))
|
||||
@ -84,22 +76,17 @@ public class DocumentDataMapper {
|
||||
case TABLE -> PropertiesMapper.buildTableProperties((Table) entry.getNode());
|
||||
case TABLE_CELL -> PropertiesMapper.buildTableCellProperties((TableCell) entry.getNode());
|
||||
case IMAGE -> PropertiesMapper.buildImageProperties((Image) entry.getNode());
|
||||
case PARAGRAPH ->
|
||||
entry.getNode() instanceof DuplicatedParagraph duplicatedParagraph ? PropertiesMapper.buildDuplicateParagraphProperties(duplicatedParagraph) : new HashMap<>();
|
||||
default -> new HashMap<>();
|
||||
};
|
||||
|
||||
DocumentStructure.EntryData.EntryDataBuilder documentBuilder = DocumentStructure.EntryData.builder()
|
||||
.treeId(toPrimitiveIntArray(entry.getTreeId()))
|
||||
.children(entry.getChildren()
|
||||
.stream()
|
||||
.map(DocumentDataMapper::toEntryData)
|
||||
.toList())
|
||||
.children(entry.getChildren().stream().map(DocumentDataMapper::toEntryData).toList())
|
||||
.type(entry.getType())
|
||||
.atomicBlockIds(atomicTextBlocks)
|
||||
.pageNumbers(entry.getNode().getPages()
|
||||
.stream()
|
||||
.map(Page::getNumber)
|
||||
.map(Integer::longValue)
|
||||
.toArray(Long[]::new))
|
||||
.pageNumbers(entry.getNode().getPages().stream().map(Page::getNumber).map(Integer::longValue).toArray(Long[]::new))
|
||||
.properties(properties);
|
||||
if (entry.getNode() != null) {
|
||||
documentBuilder.engines(entry.getNode().getEngines());
|
||||
@ -112,10 +99,7 @@ public class DocumentDataMapper {
|
||||
|
||||
private Long[] toAtomicTextBlockIds(TextBlock textBlock) {
|
||||
|
||||
return textBlock.getAtomicTextBlocks()
|
||||
.stream()
|
||||
.map(AtomicTextBlock::getId)
|
||||
.toArray(Long[]::new);
|
||||
return textBlock.getAtomicTextBlocks().stream().map(AtomicTextBlock::getId).toArray(Long[]::new);
|
||||
}
|
||||
|
||||
|
||||
@ -167,9 +151,7 @@ public class DocumentDataMapper {
|
||||
|
||||
private int[] toPrimitiveIntArray(List<Integer> list) {
|
||||
|
||||
return list.stream()
|
||||
.mapToInt(Integer::intValue)
|
||||
.toArray();
|
||||
return list.stream().mapToInt(Integer::intValue).toArray();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -7,13 +7,14 @@ import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.NoSuchElementException;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPage;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.DuplicatedParagraph;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Footer;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Header;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Headline;
|
||||
@ -61,7 +62,7 @@ public class DocumentGraphMapper {
|
||||
|
||||
SemanticNode node = switch (entryData.getType()) {
|
||||
case SECTION -> buildSection(context);
|
||||
case PARAGRAPH -> buildParagraph(context);
|
||||
case PARAGRAPH -> buildParagraph(context, entryData.getProperties());
|
||||
case HEADLINE -> buildHeadline(context);
|
||||
case HEADER -> buildHeader(context);
|
||||
case FOOTER -> buildFooter(context);
|
||||
@ -140,7 +141,17 @@ public class DocumentGraphMapper {
|
||||
}
|
||||
|
||||
|
||||
private Paragraph buildParagraph(Context context) {
|
||||
private Paragraph buildParagraph(Context context, Map<String, String> properties) {
|
||||
|
||||
if (PropertiesMapper.isDuplicateParagraph(properties)) {
|
||||
|
||||
DuplicatedParagraph duplicatedParagraph = DuplicatedParagraph.builder().documentTree(context.documentTree).build();
|
||||
|
||||
Long[] unsortedTextblockIds = PropertiesMapper.getUnsortedTextblockIds(properties);
|
||||
duplicatedParagraph.setUnsortedLeafTextBlock(toTextBlock(unsortedTextblockIds, context, duplicatedParagraph));
|
||||
return duplicatedParagraph;
|
||||
|
||||
}
|
||||
|
||||
return Paragraph.builder().documentTree(context.documentTree).build();
|
||||
}
|
||||
|
||||
@ -1,17 +1,19 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.mapper;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Collections;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.DuplicatedParagraph;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Image;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||
|
||||
public class PropertiesMapper {
|
||||
|
||||
@ -76,6 +78,32 @@ public class PropertiesMapper {
|
||||
}
|
||||
|
||||
|
||||
public static Map<String, String> buildDuplicateParagraphProperties(DuplicatedParagraph duplicatedParagraph) {
|
||||
|
||||
Map<String, String> properties = new HashMap<>();
|
||||
properties.put(DocumentStructure.DuplicateParagraphProperties.UNSORTED_TEXTBLOCK_ID, Arrays.toString(toAtomicTextBlockIds(duplicatedParagraph.getUnsortedLeafTextBlock())));
|
||||
return properties;
|
||||
}
|
||||
|
||||
|
||||
public static boolean isDuplicateParagraph(Map<String, String> properties) {
|
||||
|
||||
return properties.containsKey(DocumentStructure.DuplicateParagraphProperties.UNSORTED_TEXTBLOCK_ID);
|
||||
}
|
||||
|
||||
|
||||
public static Long[] getUnsortedTextblockIds(Map<String, String> properties) {
|
||||
|
||||
return toLongArray(properties.get(DocumentStructure.DuplicateParagraphProperties.UNSORTED_TEXTBLOCK_ID));
|
||||
}
|
||||
|
||||
|
||||
public static Long[] toLongArray(String ids) {
|
||||
|
||||
return Arrays.stream(ids.substring(1, ids.length() - 1).trim().split(",")).map(Long::valueOf).toArray(Long[]::new);
|
||||
}
|
||||
|
||||
|
||||
private static ImageType parseImageType(String imageType) {
|
||||
|
||||
return switch (imageType) {
|
||||
@ -101,4 +129,10 @@ public class PropertiesMapper {
|
||||
rectangle2D.getHeight());
|
||||
}
|
||||
|
||||
|
||||
private static Long[] toAtomicTextBlockIds(TextBlock textBlock) {
|
||||
|
||||
return textBlock.getAtomicTextBlocks().stream().map(AtomicTextBlock::getId).toArray(Long[]::new);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -237,8 +237,13 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
int startIndex = 0;
|
||||
RedTextPosition previous = null;
|
||||
|
||||
float direction = -1;
|
||||
for (int i = 0; i <= textPositions.size() - 1; i++) {
|
||||
|
||||
if (direction == -1) {
|
||||
direction = textPositions.get(i).getDir();
|
||||
}
|
||||
|
||||
if (!textPositionSequences.isEmpty()) {
|
||||
previous = textPositionSequences.get(textPositionSequences.size() - 1)
|
||||
.getTextPositions()
|
||||
@ -250,6 +255,13 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (textPositions.get(i).getDir() != direction && startIndex != i) {
|
||||
List<TextPosition> sublist = textPositions.subList(startIndex, i);
|
||||
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart));
|
||||
startIndex = i;
|
||||
direction = textPositions.get(i).getDir();
|
||||
}
|
||||
|
||||
// Strange but sometimes this is happening, for example: Metolachlor2.pdf
|
||||
if (checkIfCurrentPositionIsToTheRightOfPreviousPosition(i, textPositions)) {
|
||||
List<TextPosition> sublist = textPositions.subList(startIndex, i);
|
||||
@ -329,6 +341,7 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
.getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()) < maximumGapSize;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String getText(PDDocument doc) throws IOException {
|
||||
|
||||
|
||||
@ -20,6 +20,7 @@ import org.springframework.stereotype.Service;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.DuplicatedParagraph;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
|
||||
@ -53,6 +54,8 @@ public class LayoutGridService {
|
||||
|
||||
static Color INNER_LINES_COLOR = new Color(255, 175, 175);
|
||||
static Color PARAGRAPH_COLOR = new Color(70, 130, 180);
|
||||
|
||||
static Color DUPLICATE_PARAGRAPH_COLOR = new Color(70, 180, 101);
|
||||
static Color TABLE_COLOR = new Color(102, 205, 170);
|
||||
static Color SECTION_COLOR = new Color(50, 50, 50);
|
||||
static Color HEADLINE_COLOR = new Color(162, 56, 56);
|
||||
@ -100,6 +103,11 @@ public class LayoutGridService {
|
||||
case IMAGE -> IMAGE_COLOR;
|
||||
default -> null;
|
||||
};
|
||||
|
||||
if (semanticNode instanceof DuplicatedParagraph) {
|
||||
color = DUPLICATE_PARAGRAPH_COLOR;
|
||||
}
|
||||
|
||||
if (isNotSectionOrTableCellOrDocument(semanticNode)) {
|
||||
addAsRectangle(semanticNode, layoutGrid, color);
|
||||
}
|
||||
|
||||
@ -3,7 +3,6 @@ package com.knecon.fforesight.service.layoutparser.processor.utils;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
|
||||
|
||||
@ -37,7 +37,7 @@ public class MessageHandler {
|
||||
|
||||
LayoutParsingRequest layoutParsingRequest = objectMapper.readValue(message.getBody(), LayoutParsingRequest.class);
|
||||
|
||||
if (layoutParsingRequest.layoutParsingType().equals(LayoutParsingType.TAAS) && layoutParsingRequest.researchDocumentStorageId() == null) {
|
||||
if (layoutParsingRequest.layoutParsingType().equals(LayoutParsingType.CLARIFYND) && layoutParsingRequest.researchDocumentStorageId() == null) {
|
||||
throw new IllegalArgumentException("ResearchDocumentDataStorageId is null!");
|
||||
}
|
||||
log.info("Layout parsing request received {}", layoutParsingRequest.identifier());
|
||||
|
||||
@ -48,12 +48,13 @@ public class BdrJsonBuildTest extends AbstractTest {
|
||||
@SneakyThrows
|
||||
protected Document buildGraph(File file) {
|
||||
|
||||
return DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.TAAS,
|
||||
file,
|
||||
new ImageServiceResponse(),
|
||||
new TableServiceResponse(),
|
||||
new VisualLayoutParsingResponse(),
|
||||
file.toString()));
|
||||
return DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.CLARIFYND,
|
||||
layoutParsingPipeline.parseLayout(LayoutParsingType.CLARIFYND,
|
||||
file,
|
||||
new ImageServiceResponse(),
|
||||
new TableServiceResponse(),
|
||||
new VisualLayoutParsingResponse(),
|
||||
file.toString()));
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -95,12 +95,13 @@ public class HeadlinesGoldStandardIntegrationTest {
|
||||
goldStandardLog.getRedactionLogEntry().removeIf(r -> !r.isRedacted() || r.getChanges().get(r.getChanges().size() - 1).getType().equals(ChangeType.REMOVED));
|
||||
goldStandardLog.getRedactionLogEntry().forEach(e -> goldStandardHeadlines.add(new Headline(e.getPositions().get(0).getPage(), e.getValue())));
|
||||
|
||||
Document documentGraph = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||
pdfFileResource.getFile(),
|
||||
new ImageServiceResponse(),
|
||||
new TableServiceResponse(),
|
||||
new VisualLayoutParsingResponse(),
|
||||
filePath));
|
||||
Document documentGraph = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_OLD,
|
||||
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_OLD,
|
||||
pdfFileResource.getFile(),
|
||||
new ImageServiceResponse(),
|
||||
new TableServiceResponse(),
|
||||
new VisualLayoutParsingResponse(),
|
||||
filePath));
|
||||
|
||||
var foundHeadlines = documentGraph.streamAllSubNodes()
|
||||
.map(SemanticNode::getHeadline)
|
||||
|
||||
@ -26,7 +26,7 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
|
||||
public void testLayoutParserEndToEnd() {
|
||||
|
||||
prepareStorage("files/bdr/Wie weiter bei Kristeneinrichtungen.pdf");
|
||||
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER);
|
||||
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER_OLD);
|
||||
LayoutParsingFinishedEvent finishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest);
|
||||
Arrays.stream(finishedEvent.message().split("\n")).forEach(log::info);
|
||||
}
|
||||
|
||||
@ -55,12 +55,13 @@ public class DocumentGraphJsonWritingTest extends BuildDocumentTest {
|
||||
@SneakyThrows
|
||||
private void writeJsons(Path filename) {
|
||||
|
||||
Document documentGraph = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||
filename.toFile(),
|
||||
new ImageServiceResponse(),
|
||||
new TableServiceResponse(),
|
||||
new VisualLayoutParsingResponse(),
|
||||
filename.toFile().toString()));
|
||||
Document documentGraph = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_OLD,
|
||||
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_OLD,
|
||||
filename.toFile(),
|
||||
new ImageServiceResponse(),
|
||||
new TableServiceResponse(),
|
||||
new VisualLayoutParsingResponse(),
|
||||
filename.toFile().toString()));
|
||||
|
||||
DocumentData documentData = DocumentDataMapper.toDocumentData(documentGraph);
|
||||
ObjectMapper mapper = ObjectMapperFactory.create();
|
||||
|
||||
@ -26,7 +26,7 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
||||
@SneakyThrows
|
||||
public void testViewerDocument() {
|
||||
|
||||
String fileName = "files/SinglePages/T5 VV-640252-Page16.pdf";
|
||||
String fileName = "files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
|
||||
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
|
||||
|
||||
var documentFile = new ClassPathResource(fileName).getFile();
|
||||
@ -54,13 +54,14 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
||||
var documentFile = new ClassPathResource(fileName).getFile();
|
||||
|
||||
var classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE,
|
||||
documentFile,
|
||||
new ImageServiceResponse(),
|
||||
tableResponse,
|
||||
new VisualLayoutParsingResponse(),Path.of(fileName).getFileName().toFile().toString());
|
||||
documentFile,
|
||||
new ImageServiceResponse(),
|
||||
tableResponse,
|
||||
new VisualLayoutParsingResponse(),
|
||||
Path.of(fileName).getFileName().toFile().toString());
|
||||
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null);
|
||||
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
|
||||
Document document = DocumentGraphFactory.buildDocumentGraph(classificationDocument);
|
||||
Document document = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.DOCUMINE, classificationDocument);
|
||||
|
||||
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true);
|
||||
}
|
||||
|
||||
@ -56,12 +56,12 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
@SneakyThrows
|
||||
public ClassificationDocument buildClassificationDocument(File originDocument, TableServiceResponse tableServiceResponse) {
|
||||
|
||||
ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||
originDocument,
|
||||
new ImageServiceResponse(),
|
||||
tableServiceResponse,
|
||||
new VisualLayoutParsingResponse(),
|
||||
"document");
|
||||
ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_OLD,
|
||||
originDocument,
|
||||
new ImageServiceResponse(),
|
||||
tableServiceResponse,
|
||||
new VisualLayoutParsingResponse(),
|
||||
"document");
|
||||
|
||||
redactManagerClassificationService.classifyDocument(classificationDocument);
|
||||
|
||||
@ -112,16 +112,8 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
var tableServiceResponse = objectMapper.readValue(cvTablesResource.getInputStream(), TableServiceResponse.class);
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile(), tableServiceResponse);
|
||||
assertThat(document.getSections()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.collect(Collectors.toList())).isNotEmpty();
|
||||
var tables = document.getSections()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.toList();
|
||||
assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
|
||||
var tables = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList();
|
||||
|
||||
// Quality of the table parsing is not good, because the file is rotated at scanning.
|
||||
// We only asset that the table border is not the page border.
|
||||
@ -143,12 +135,12 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
imageServiceResponse.getData()
|
||||
.forEach(imageMetadata -> images.computeIfAbsent(imageMetadata.getPosition().getPageNumber(), x -> new ArrayList<>())
|
||||
.add(new ClassifiedImage(new Rectangle2D.Double(imageMetadata.getPosition().getX1(),
|
||||
imageMetadata.getPosition().getY1(),
|
||||
imageMetadata.getGeometry().getWidth(),
|
||||
imageMetadata.getGeometry().getHeight()),
|
||||
ImageType.valueOf(imageMetadata.getClassification().getLabel().toUpperCase(Locale.ROOT)),
|
||||
imageMetadata.isAlpha(),
|
||||
imageMetadata.getPosition().getPageNumber())));
|
||||
imageMetadata.getPosition().getY1(),
|
||||
imageMetadata.getGeometry().getWidth(),
|
||||
imageMetadata.getGeometry().getHeight()),
|
||||
ImageType.valueOf(imageMetadata.getClassification().getLabel().toUpperCase(Locale.ROOT)),
|
||||
imageMetadata.isAlpha(),
|
||||
imageMetadata.getPosition().getPageNumber())));
|
||||
|
||||
System.out.println("object");
|
||||
}
|
||||
@ -160,22 +152,11 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Spanning Cells.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
assertThat(document.getSections()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.collect(Collectors.toList())).isNotEmpty();
|
||||
TablePageBlock table = document.getSections()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.toList()
|
||||
.get(0);
|
||||
assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
|
||||
TablePageBlock table = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(0);
|
||||
assertThat(table.getColCount()).isEqualTo(6);
|
||||
assertThat(table.getRowCount()).isEqualTo(13);
|
||||
assertThat(table.getRows()
|
||||
.stream()
|
||||
.mapToInt(List::size).sum()).isEqualTo(6 * 13);
|
||||
assertThat(table.getRows().stream().mapToInt(List::size).sum()).isEqualTo(6 * 13);
|
||||
}
|
||||
|
||||
|
||||
@ -185,37 +166,15 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Merge Table.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
assertThat(document.getSections()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.collect(Collectors.toList())).isNotEmpty();
|
||||
TablePageBlock firstTable = document.getSections()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.toList()
|
||||
.get(0);
|
||||
assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
|
||||
TablePageBlock firstTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(0);
|
||||
assertThat(firstTable.getColCount()).isEqualTo(8);
|
||||
assertThat(firstTable.getRowCount()).isEqualTo(1);
|
||||
TablePageBlock secondTable = document.getSections()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.toList()
|
||||
.get(1);
|
||||
TablePageBlock secondTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(1);
|
||||
assertThat(secondTable.getColCount()).isEqualTo(8);
|
||||
assertThat(secondTable.getRowCount()).isEqualTo(2);
|
||||
List<List<Cell>> firstTableHeaderCells = firstTable.getRows()
|
||||
.get(0)
|
||||
.stream()
|
||||
.map(Collections::singletonList)
|
||||
.collect(Collectors.toList());
|
||||
assertThat(secondTable.getRows()
|
||||
.stream()
|
||||
.allMatch(row -> row.stream()
|
||||
.map(Cell::getHeaderCells)
|
||||
.toList().equals(firstTableHeaderCells))).isTrue();
|
||||
List<List<Cell>> firstTableHeaderCells = firstTable.getRows().get(0).stream().map(Collections::singletonList).collect(Collectors.toList());
|
||||
assertThat(secondTable.getRows().stream().allMatch(row -> row.stream().map(Cell::getHeaderCells).toList().equals(firstTableHeaderCells))).isTrue();
|
||||
}
|
||||
|
||||
|
||||
@ -225,37 +184,15 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Merge Multi Page Table.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
assertThat(document.getSections()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.collect(Collectors.toList())).isNotEmpty();
|
||||
TablePageBlock firstTable = document.getSections()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.toList()
|
||||
.get(0);
|
||||
assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
|
||||
TablePageBlock firstTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(0);
|
||||
assertThat(firstTable.getColCount()).isEqualTo(9);
|
||||
assertThat(firstTable.getRowCount()).isEqualTo(5);
|
||||
TablePageBlock secondTable = document.getSections()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.toList()
|
||||
.get(1);
|
||||
TablePageBlock secondTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(1);
|
||||
assertThat(secondTable.getColCount()).isEqualTo(9);
|
||||
assertThat(secondTable.getRowCount()).isEqualTo(6);
|
||||
List<List<Cell>> firstTableHeaderCells = firstTable.getRows()
|
||||
.get(firstTable.getRowCount() - 1)
|
||||
.stream()
|
||||
.map(Cell::getHeaderCells)
|
||||
.collect(Collectors.toList());
|
||||
assertThat(secondTable.getRows()
|
||||
.stream()
|
||||
.allMatch(row -> row.stream()
|
||||
.map(Cell::getHeaderCells)
|
||||
.toList().equals(firstTableHeaderCells))).isTrue();
|
||||
List<List<Cell>> firstTableHeaderCells = firstTable.getRows().get(firstTable.getRowCount() - 1).stream().map(Cell::getHeaderCells).collect(Collectors.toList());
|
||||
assertThat(secondTable.getRows().stream().allMatch(row -> row.stream().map(Cell::getHeaderCells).toList().equals(firstTableHeaderCells))).isTrue();
|
||||
}
|
||||
|
||||
|
||||
@ -265,37 +202,15 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Rotated Table Headers.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
assertThat(document.getSections()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.collect(Collectors.toList())).isNotEmpty();
|
||||
TablePageBlock firstTable = document.getSections()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.toList()
|
||||
.get(0);
|
||||
assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
|
||||
TablePageBlock firstTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(0);
|
||||
assertThat(firstTable.getColCount()).isEqualTo(8);
|
||||
assertThat(firstTable.getRowCount()).isEqualTo(1);
|
||||
TablePageBlock secondTable = document.getSections()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.toList()
|
||||
.get(1);
|
||||
TablePageBlock secondTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(1);
|
||||
assertThat(secondTable.getColCount()).isEqualTo(8);
|
||||
assertThat(secondTable.getRowCount()).isEqualTo(6);
|
||||
List<List<Cell>> firstTableHeaderCells = firstTable.getRows()
|
||||
.get(0)
|
||||
.stream()
|
||||
.map(Collections::singletonList)
|
||||
.collect(Collectors.toList());
|
||||
assertThat(secondTable.getRows()
|
||||
.stream()
|
||||
.allMatch(row -> row.stream()
|
||||
.map(Cell::getHeaderCells)
|
||||
.toList().equals(firstTableHeaderCells))).isTrue();
|
||||
List<List<Cell>> firstTableHeaderCells = firstTable.getRows().get(0).stream().map(Collections::singletonList).collect(Collectors.toList());
|
||||
assertThat(secondTable.getRows().stream().allMatch(row -> row.stream().map(Cell::getHeaderCells).toList().equals(firstTableHeaderCells))).isTrue();
|
||||
}
|
||||
|
||||
|
||||
@ -345,30 +260,29 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
validateTable(document, 0, 8, 8, 0, 0);
|
||||
|
||||
List<List<String>> values = Arrays.asList(Arrays.asList("Annex point Reference within DAR/RAR",
|
||||
"Author, date",
|
||||
"Study title",
|
||||
"Analytical method Author, date, No.",
|
||||
"Technique, LOQ of the method, validated working range",
|
||||
"Method meets analytical validation criteria",
|
||||
"Remarks (in case validation criteria are not met)",
|
||||
"Acceptability of the method"),
|
||||
Arrays.asList(
|
||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies"),
|
||||
Arrays.asList("CA 7.1.2.1.1 DAR (2009)",
|
||||
"Evans P.G. 2001 TMJ4569B, VV-323245",
|
||||
"Azoxystrobin Laboratory Degradation Study in Three Soil Types, Sampled from Holland and the United Kingdom",
|
||||
"Method: RAM 269 Johnson R.I., Tummon O.J., Earl M. 1995 RJ1864B, VV-377731 Johnson R.I., Tummon O.J., Earl M. 1998 RAM 269/02, VV-124072 Johnson R.I., Tummon O.J., Earl M. 2000 RAM 269/03, VV-123986 Validation: Robinson N.J. 2001 TMJ4617B, VV-895845",
|
||||
"LC-MS/MS LOQ: 0.01 mg/kg (R401553 (SYN50165 7), R402173 (SYN501114 )) or 0.02 mg/kg (azoxystrobin, R230310, R234886) Working range: 0.02-1.0 or 0.01-0.5 mg/kg (depending on analyte) Other supporting quantificati on methods: HPLC-UV GC-MSD",
|
||||
"Y",
|
||||
"N/A",
|
||||
"Y"));
|
||||
"Author, date",
|
||||
"Study title",
|
||||
"Analytical method Author, date, No.",
|
||||
"Technique, LOQ of the method, validated working range",
|
||||
"Method meets analytical validation criteria",
|
||||
"Remarks (in case validation criteria are not met)",
|
||||
"Acceptability of the method"),
|
||||
Arrays.asList("Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies"),
|
||||
Arrays.asList("CA 7.1.2.1.1 DAR (2009)",
|
||||
"Evans P.G. 2001 TMJ4569B, VV-323245",
|
||||
"Azoxystrobin Laboratory Degradation Study in Three Soil Types, Sampled from Holland and the United Kingdom",
|
||||
"Method: RAM 269 Johnson R.I., Tummon O.J., Earl M. 1995 RJ1864B, VV-377731 Johnson R.I., Tummon O.J., Earl M. 1998 RAM 269/02, VV-124072 Johnson R.I., Tummon O.J., Earl M. 2000 RAM 269/03, VV-123986 Validation: Robinson N.J. 2001 TMJ4617B, VV-895845",
|
||||
"LC-MS/MS LOQ: 0.01 mg/kg (R401553 (SYN50165 7), R402173 (SYN501114 )) or 0.02 mg/kg (azoxystrobin, R230310, R234886) Working range: 0.02-1.0 or 0.01-0.5 mg/kg (depending on analyte) Other supporting quantificati on methods: HPLC-UV GC-MSD",
|
||||
"Y",
|
||||
"N/A",
|
||||
"Y"));
|
||||
|
||||
validateTable(document, 0, values);
|
||||
|
||||
@ -757,11 +671,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
@SneakyThrows
|
||||
private void toHtml(ClassificationDocument document, String filename) {
|
||||
|
||||
var tables = document.getSections()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.toList();
|
||||
var tables = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList();
|
||||
StringBuilder sb = new StringBuilder();
|
||||
|
||||
int currentPage = 1;
|
||||
@ -782,19 +692,9 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
private void validateTable(ClassificationDocument document, int tableIndex, int colCount, int rowCount, int emptyCellsCountCorrect, int emptyCellsCountIncorrect) {
|
||||
|
||||
TablePageBlock table = document.getSections()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.toList()
|
||||
.get(tableIndex);
|
||||
TablePageBlock table = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(tableIndex);
|
||||
List<List<Cell>> rows = table.getRows();
|
||||
int emptyCellsFoundFound = rows.stream()
|
||||
.flatMap(List::stream)
|
||||
.toList()
|
||||
.stream()
|
||||
.filter(f -> f.toString().isEmpty())
|
||||
.toList().size();
|
||||
int emptyCellsFoundFound = rows.stream().flatMap(List::stream).toList().stream().filter(f -> f.toString().isEmpty()).toList().size();
|
||||
|
||||
for (List<Cell> row : table.getRows()) {
|
||||
row.forEach(r -> System.out.println(r.toString()));
|
||||
@ -809,20 +709,11 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
private void validateTable(ClassificationDocument document, int tableIndex, List<List<String>> values) {
|
||||
|
||||
TablePageBlock table = document.getSections()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.toList()
|
||||
.get(tableIndex);
|
||||
TablePageBlock table = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(tableIndex);
|
||||
List<List<Cell>> rows = table.getRows();
|
||||
|
||||
List<Cell> rowsFlattened = rows.stream()
|
||||
.flatMap(List::stream)
|
||||
.toList();
|
||||
List<String> valuesFlattened = values.stream()
|
||||
.flatMap(List::stream)
|
||||
.toList();
|
||||
List<Cell> rowsFlattened = rows.stream().flatMap(List::stream).toList();
|
||||
List<String> valuesFlattened = values.stream().flatMap(List::stream).toList();
|
||||
|
||||
for (int i = 0; i < valuesFlattened.size(); i++) {
|
||||
Cell cell = rowsFlattened.get(i);
|
||||
@ -835,11 +726,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
private void validateTableSize(ClassificationDocument document, int tableSize) {
|
||||
|
||||
assertThat(document.getSections()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.toList().size()).isEqualTo(tableSize);
|
||||
assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().size()).isEqualTo(tableSize);
|
||||
|
||||
}
|
||||
|
||||
|
||||
@ -21,7 +21,7 @@ class BodyTextFrameServiceTest extends BuildDocumentTest {
|
||||
|
||||
String filename = "files/211.pdf";
|
||||
String outputFilename = "/tmp/" + Path.of(filename).getFileName() + "_MAINBODY.pdf";
|
||||
ClassificationDocument document = parseLayout(filename, LayoutParsingType.TAAS);
|
||||
ClassificationDocument document = parseLayout(filename, LayoutParsingType.CLARIFYND);
|
||||
PdfDraw.drawRectanglesPerPage(filename,
|
||||
document.getPages().stream().map(page -> List.of(RectangleTransformations.toRectangle2D(page.getBodyTextFrame()))).toList(),
|
||||
outputFilename);
|
||||
|
||||
@ -74,7 +74,7 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
|
||||
cleanRulingsPerPage.add(rulingCleaningService.getCleanRulings(Collections.emptyList(), pageContent.getRulings()));
|
||||
}
|
||||
var cleanRulings = cleanRulingsPerPage.stream().map(CleanRulings::getVertical).collect(Collectors.toList());
|
||||
PdfDraw.drawLinesPerPage(fileName, cleanRulings, lineFileName);
|
||||
PdfDraw.drawLinesPerPage(fileName, cleanRulings, lineFileName);
|
||||
|
||||
}
|
||||
|
||||
@ -99,18 +99,20 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
|
||||
@SneakyThrows
|
||||
private void writeJsons(Path filename) {
|
||||
|
||||
Document documentGraphBefore = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||
filename.toFile(),
|
||||
new ImageServiceResponse(),
|
||||
new TableServiceResponse(),
|
||||
new VisualLayoutParsingResponse(),
|
||||
filename.toFile().toString()));
|
||||
Document documentGraphAfter = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||
filename.toFile(),
|
||||
new ImageServiceResponse(),
|
||||
new TableServiceResponse(),
|
||||
new VisualLayoutParsingResponse(),
|
||||
filename.toFile().toString()));
|
||||
Document documentGraphBefore = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_OLD,
|
||||
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_OLD,
|
||||
filename.toFile(),
|
||||
new ImageServiceResponse(),
|
||||
new TableServiceResponse(),
|
||||
new VisualLayoutParsingResponse(),
|
||||
filename.toFile().toString()));
|
||||
Document documentGraphAfter = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_OLD,
|
||||
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_OLD,
|
||||
filename.toFile(),
|
||||
new ImageServiceResponse(),
|
||||
new TableServiceResponse(),
|
||||
new VisualLayoutParsingResponse(),
|
||||
filename.toFile().toString()));
|
||||
DocumentData documentDataBefore = DocumentDataMapper.toDocumentData(documentGraphBefore);
|
||||
DocumentData documentDataAfter = DocumentDataMapper.toDocumentData(documentGraphAfter);
|
||||
if (!compareStructures(documentDataBefore.getDocumentStructure(), documentDataAfter.getDocumentStructure())) {
|
||||
|
||||
@ -20,7 +20,6 @@ import org.springframework.context.annotation.Import;
|
||||
import org.springframework.context.annotation.Primary;
|
||||
import org.springframework.core.io.ClassPathResource;
|
||||
import org.springframework.test.context.junit.jupiter.SpringExtension;
|
||||
import org.xmlunit.builder.Input;
|
||||
|
||||
import com.iqser.red.commons.jackson.ObjectMapperFactory;
|
||||
import com.iqser.red.storage.commons.service.StorageService;
|
||||
@ -68,7 +67,7 @@ public abstract class AbstractTest {
|
||||
protected LayoutParsingRequest buildStandardLayoutParsingRequest() {
|
||||
|
||||
return LayoutParsingRequest.builder()
|
||||
.layoutParsingType(LayoutParsingType.REDACT_MANAGER)
|
||||
.layoutParsingType(LayoutParsingType.REDACT_MANAGER_OLD)
|
||||
.originFileStorageId(ORIGIN_FILE_ID)
|
||||
.tablesFileStorageId(Optional.of(TABLE_FILE_ID))
|
||||
.imagesFileStorageId(Optional.of(IMAGE_FILE_ID))
|
||||
@ -99,7 +98,7 @@ public abstract class AbstractTest {
|
||||
@SneakyThrows
|
||||
protected LayoutParsingRequest prepareStorage(String file) {
|
||||
|
||||
return prepareStorage(file, "cv_table_parsing_response/empty.json", "image_service_response/empty.json","visual_layout_parsing_response/empty.json");
|
||||
return prepareStorage(file, "cv_table_parsing_response/empty.json", "image_service_response/empty.json", "visual_layout_parsing_response/empty.json");
|
||||
}
|
||||
|
||||
|
||||
@ -107,7 +106,7 @@ public abstract class AbstractTest {
|
||||
protected LayoutParsingRequest prepareStorage(InputStream fileInputStream) {
|
||||
|
||||
storageService.storeObject(TenantContext.getTenantId(), ORIGIN_FILE_ID, fileInputStream);
|
||||
return buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER);
|
||||
return buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER_OLD);
|
||||
}
|
||||
|
||||
|
||||
@ -140,6 +139,7 @@ public abstract class AbstractTest {
|
||||
return prepareStorage(pdfFileResource.getInputStream(), cvServiceResponseFileResource.getInputStream(), imageInfoFileResource.getInputStream());
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
protected LayoutParsingRequest prepareStorage(String file, String cvServiceResponseFile, String imageInfoFile, String visualLayoutParsingResponseFile) {
|
||||
|
||||
@ -148,9 +148,13 @@ public abstract class AbstractTest {
|
||||
ClassPathResource imageInfoFileResource = new ClassPathResource(imageInfoFile);
|
||||
ClassPathResource visualLayoutParsingResponseResource = new ClassPathResource(visualLayoutParsingResponseFile);
|
||||
|
||||
return prepareStorage(pdfFileResource.getInputStream(), cvServiceResponseFileResource.getInputStream(), imageInfoFileResource.getInputStream(), visualLayoutParsingResponseResource.getInputStream());
|
||||
return prepareStorage(pdfFileResource.getInputStream(),
|
||||
cvServiceResponseFileResource.getInputStream(),
|
||||
imageInfoFileResource.getInputStream(),
|
||||
visualLayoutParsingResponseResource.getInputStream());
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
protected LayoutParsingRequest prepareStorage(InputStream fileStream, InputStream cvServiceResponseFileStream, InputStream imageInfoStream) {
|
||||
|
||||
@ -158,18 +162,22 @@ public abstract class AbstractTest {
|
||||
storageService.storeObject(TenantContext.getTenantId(), TABLE_FILE_ID, cvServiceResponseFileStream);
|
||||
storageService.storeObject(TenantContext.getTenantId(), ORIGIN_FILE_ID, fileStream);
|
||||
|
||||
return buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER);
|
||||
return buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER_OLD);
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
protected LayoutParsingRequest prepareStorage(InputStream fileStream, InputStream cvServiceResponseFileStream, InputStream imageInfoStream, InputStream visualLayoutParsingResponseFileStream) {
|
||||
protected LayoutParsingRequest prepareStorage(InputStream fileStream,
|
||||
InputStream cvServiceResponseFileStream,
|
||||
InputStream imageInfoStream,
|
||||
InputStream visualLayoutParsingResponseFileStream) {
|
||||
|
||||
storageService.storeObject(TenantContext.getTenantId(), IMAGE_FILE_ID, imageInfoStream);
|
||||
storageService.storeObject(TenantContext.getTenantId(), TABLE_FILE_ID, cvServiceResponseFileStream);
|
||||
storageService.storeObject(TenantContext.getTenantId(), ORIGIN_FILE_ID, fileStream);
|
||||
storageService.storeObject(TenantContext.getTenantId(),VISUAL_LAYOUT_FILE,visualLayoutParsingResponseFileStream );
|
||||
storageService.storeObject(TenantContext.getTenantId(), VISUAL_LAYOUT_FILE, visualLayoutParsingResponseFileStream);
|
||||
|
||||
return buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER);
|
||||
return buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER_OLD);
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -26,14 +26,19 @@ public abstract class BuildDocumentTest extends AbstractTest {
|
||||
|
||||
File fileResource = new ClassPathResource(filename).getFile();
|
||||
prepareStorage(filename);
|
||||
return layoutParsingPipeline.parseLayout(layoutParsingType, fileResource, layoutParsingStorageService.getImagesFile(IMAGE_FILE_ID), new TableServiceResponse(), new VisualLayoutParsingResponse(),filename);
|
||||
return layoutParsingPipeline.parseLayout(layoutParsingType,
|
||||
fileResource,
|
||||
layoutParsingStorageService.getImagesFile(IMAGE_FILE_ID),
|
||||
new TableServiceResponse(),
|
||||
new VisualLayoutParsingResponse(),
|
||||
filename);
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
protected Document buildGraph(String filename) {
|
||||
|
||||
return buildGraph(filename, LayoutParsingType.REDACT_MANAGER);
|
||||
return buildGraph(filename, LayoutParsingType.REDACT_MANAGER_OLD);
|
||||
}
|
||||
|
||||
|
||||
@ -46,7 +51,7 @@ public abstract class BuildDocumentTest extends AbstractTest {
|
||||
prepareStorage(filename);
|
||||
}
|
||||
|
||||
return DocumentGraphFactory.buildDocumentGraph(parseLayout(filename, layoutParsingType));
|
||||
return DocumentGraphFactory.buildDocumentGraph(layoutParsingType, parseLayout(filename, layoutParsingType));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
Binary file not shown.
Binary file not shown.
@ -1,6 +1,6 @@
|
||||
plugins {
|
||||
id("com.knecon.fforesight.java-conventions")
|
||||
id("io.freefair.lombok") version "8.2.2"
|
||||
id("io.freefair.lombok") version "8.4"
|
||||
}
|
||||
|
||||
description = "Library for adding/removing layers in the viewer document"
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user