RED-7141: Implemented docstrum layout parsing

This commit is contained in:
Dominique Eifländer 2024-02-22 11:02:50 +01:00
parent f146beeb44
commit 79239b751d
57 changed files with 1998 additions and 850 deletions

View File

@ -55,6 +55,13 @@ public class DocumentStructure implements Serializable {
}
@Schema(description = "Object containing the extra field names, a duplicate paragraph has in its properties field.")
public static class DuplicateParagraphProperties implements Serializable {
public static final String UNSORTED_TEXTBLOCK_ID = "utbid";
}
public static final String RECTANGLE_DELIMITER = ";";

View File

@ -2,6 +2,9 @@ package com.knecon.fforesight.service.layoutparser.internal.api.queue;
public enum LayoutParsingType {
REDACT_MANAGER,
TAAS,
DOCUMINE
REDACT_MANAGER_OLD,
REDACT_MANAGER_PARAGRAPH_DEBUG,
DOCUMINE,
CLARIFYND,
CLARIFYND_PARAGRAPH_DEBUG
}

View File

@ -24,4 +24,5 @@ dependencies {
implementation("com.fasterxml.jackson.module:jackson-module-afterburner:${jacksonVersion}")
implementation("com.fasterxml.jackson.datatype:jackson-datatype-jsr310:${jacksonVersion}")
implementation("org.springframework.boot:spring-boot-starter-web:3.1.3")
implementation("org.jgrapht:jgrapht-core:1.5.2")
}

View File

@ -28,6 +28,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
@ -43,12 +44,11 @@ import com.knecon.fforesight.service.layoutparser.processor.services.RulingClean
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
import com.knecon.fforesight.service.layoutparser.processor.services.SimplifiedSectionTextService;
import com.knecon.fforesight.service.layoutparser.processor.services.TableExtractionService;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocstrumBlockificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocuMineBlockificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.RedactManagerBlockificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.TaasBlockificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.classification.DocuMineClassificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.classification.TaasClassificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper;
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.TaasDocumentDataMapper;
@ -76,16 +76,15 @@ public class LayoutParsingPipeline {
CvTableParsingAdapter cvTableParsingAdapter;
LayoutParsingStorageService layoutParsingStorageService;
SectionsBuilderService sectionsBuilderService;
TaasClassificationService taasClassificationService;
RedactManagerClassificationService redactManagerClassificationService;
DocuMineClassificationService docuMineClassificationService;
SimplifiedSectionTextService simplifiedSectionTextService;
BodyTextFrameService bodyTextFrameService;
RulingCleaningService rulingCleaningService;
TableExtractionService tableExtractionService;
TaasBlockificationService taasBlockificationService;
DocuMineBlockificationService docuMineBlockificationService;
RedactManagerBlockificationService redactManagerBlockificationService;
DocstrumBlockificationService docstrumBlockificationService;
LayoutGridService layoutGridService;
ObservationRegistry observationRegistry;
VisualLayoutParsingAdapter visualLayoutParsingAdapter;
@ -97,40 +96,33 @@ public class LayoutParsingPipeline {
log.info("Starting layout parsing for {}", layoutParsingRequest.identifier());
File originFile = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId());
File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId())
.orElse(originFile);
File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId()).orElse(originFile);
VisualLayoutParsingResponse visualLayoutParsingResponse = new VisualLayoutParsingResponse();
if (layoutParsingRequest.visualLayoutParsingFileId()
.isPresent()) {
visualLayoutParsingResponse = layoutParsingStorageService.getVisualLayoutParsingFile(layoutParsingRequest.visualLayoutParsingFileId()
.get());
if (layoutParsingRequest.visualLayoutParsingFileId().isPresent()) {
visualLayoutParsingResponse = layoutParsingStorageService.getVisualLayoutParsingFile(layoutParsingRequest.visualLayoutParsingFileId().get());
}
ImageServiceResponse imageServiceResponse = new ImageServiceResponse();
if (layoutParsingRequest.imagesFileStorageId()
.isPresent()) {
imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId()
.get());
if (layoutParsingRequest.imagesFileStorageId().isPresent()) {
imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId().get());
}
TableServiceResponse tableServiceResponse = new TableServiceResponse();
if (layoutParsingRequest.tablesFileStorageId()
.isPresent()) {
tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId()
.get());
if (layoutParsingRequest.tablesFileStorageId().isPresent()) {
tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId().get());
}
ClassificationDocument classificationDocument = parseLayout(layoutParsingRequest.layoutParsingType(),
originFile,
imageServiceResponse,
tableServiceResponse,
visualLayoutParsingResponse,
layoutParsingRequest.identifier().toString());
originFile,
imageServiceResponse,
tableServiceResponse,
visualLayoutParsingResponse,
layoutParsingRequest.identifier().toString());
log.info("Building document graph for {}", layoutParsingRequest.identifier());
Document documentGraph = observeBuildDocumentGraph(classificationDocument);
Document documentGraph = observeBuildDocumentGraph(layoutParsingRequest.layoutParsingType(), classificationDocument);
log.info("Creating viewer document for {}", layoutParsingRequest.identifier());
@ -142,7 +134,7 @@ public class LayoutParsingPipeline {
layoutParsingStorageService.storeSimplifiedText(layoutParsingRequest, simplifiedSectionTextService.toSimplifiedText(documentGraph));
layoutParsingStorageService.storeViewerDocument(layoutParsingRequest, viewerDocumentFile);
if (layoutParsingRequest.layoutParsingType().equals(LayoutParsingType.TAAS)) {
if (layoutParsingRequest.layoutParsingType().equals(LayoutParsingType.CLARIFYND)) {
log.info("Building research document data for {}", layoutParsingRequest.identifier());
var researchDocumentData = TaasDocumentDataMapper.fromDocument(documentGraph);
layoutParsingStorageService.storeResearchDocumentData(layoutParsingRequest, researchDocumentData);
@ -158,37 +150,37 @@ public class LayoutParsingPipeline {
.numberOfPages(documentGraph.getNumberOfPages())
.duration(System.currentTimeMillis() - start)
.message(format("""
Layout parsing has finished in %.02f s.
identifiers: %s
%s
Files have been saved with Ids:
Structure: %s
Text: %s
Positions: %s
PageData: %s
Simplified Text: %s
Viewer Doc: %s""",
((float) (System.currentTimeMillis() - start)) / 1000,
layoutParsingRequest.identifier(),
buildSemanticNodeCountMessage(documentGraph.getNumberOfPages(), documentGraph.buildSemanticNodeCounts()),
layoutParsingRequest.structureFileStorageId(),
layoutParsingRequest.textBlockFileStorageId(),
layoutParsingRequest.positionBlockFileStorageId(),
layoutParsingRequest.pageFileStorageId(),
layoutParsingRequest.simplifiedTextStorageId(),
layoutParsingRequest.viewerDocumentStorageId()))
Layout parsing has finished in %.02f s.
identifiers: %s
%s
Files have been saved with Ids:
Structure: %s
Text: %s
Positions: %s
PageData: %s
Simplified Text: %s
Viewer Doc: %s""",
((float) (System.currentTimeMillis() - start)) / 1000,
layoutParsingRequest.identifier(),
buildSemanticNodeCountMessage(documentGraph.getNumberOfPages(), documentGraph.buildSemanticNodeCounts()),
layoutParsingRequest.structureFileStorageId(),
layoutParsingRequest.textBlockFileStorageId(),
layoutParsingRequest.positionBlockFileStorageId(),
layoutParsingRequest.pageFileStorageId(),
layoutParsingRequest.simplifiedTextStorageId(),
layoutParsingRequest.viewerDocumentStorageId()))
.build();
}
private Document observeBuildDocumentGraph(ClassificationDocument classificationDocument) {
private Document observeBuildDocumentGraph(LayoutParsingType layoutParsingType, ClassificationDocument classificationDocument) {
AtomicReference<Document> documentReference = new AtomicReference<>();
Observation.createNotStarted("LayoutParsingPipeline", observationRegistry)
.contextualName("build-document-graph")
.observe(() -> documentReference.set(DocumentGraphFactory.buildDocumentGraph(classificationDocument)));
.observe(() -> documentReference.set(DocumentGraphFactory.buildDocumentGraph(layoutParsingType, classificationDocument)));
return documentReference.get();
}
@ -197,14 +189,14 @@ public class LayoutParsingPipeline {
private String buildSemanticNodeCountMessage(int numberOfPages, Map<NodeType, Long> semanticNodeCounts) {
return String.format("%d pages with %d sections, %d headlines, %d paragraphs, %d tables with %d cells, %d headers, and %d footers parsed",
numberOfPages,
semanticNodeCounts.get(NodeType.SECTION) == null ? 0 : semanticNodeCounts.get(NodeType.SECTION),
semanticNodeCounts.get(NodeType.HEADLINE) == null ? 0 : semanticNodeCounts.get(NodeType.HEADLINE),
semanticNodeCounts.get(NodeType.PARAGRAPH) == null ? 0 : semanticNodeCounts.get(NodeType.PARAGRAPH),
semanticNodeCounts.get(NodeType.TABLE) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE),
semanticNodeCounts.get(NodeType.TABLE_CELL) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE_CELL),
semanticNodeCounts.get(NodeType.HEADER) == null ? 0 : semanticNodeCounts.get(NodeType.HEADER),
semanticNodeCounts.get(NodeType.FOOTER) == null ? 0 : semanticNodeCounts.get(NodeType.FOOTER));
numberOfPages,
semanticNodeCounts.get(NodeType.SECTION) == null ? 0 : semanticNodeCounts.get(NodeType.SECTION),
semanticNodeCounts.get(NodeType.HEADLINE) == null ? 0 : semanticNodeCounts.get(NodeType.HEADLINE),
semanticNodeCounts.get(NodeType.PARAGRAPH) == null ? 0 : semanticNodeCounts.get(NodeType.PARAGRAPH),
semanticNodeCounts.get(NodeType.TABLE) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE),
semanticNodeCounts.get(NodeType.TABLE_CELL) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE_CELL),
semanticNodeCounts.get(NodeType.HEADER) == null ? 0 : semanticNodeCounts.get(NodeType.HEADER),
semanticNodeCounts.get(NodeType.FOOTER) == null ? 0 : semanticNodeCounts.get(NodeType.FOOTER));
}
@ -260,11 +252,16 @@ public class LayoutParsingPipeline {
PDRectangle cropbox = pdPage.getCropBox();
CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(pdfTableCells.get(pageNumber), stripper.getRulings());
List<Cell> emptyTableCells = TableExtractionService.findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical());
ClassificationPage classificationPage = switch (layoutParsingType) {
case REDACT_MANAGER -> redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
case TAAS -> taasBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
case REDACT_MANAGER_OLD ->
redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
case DOCUMINE -> docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
case REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, true);
case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, false);
};
classificationPage.setCleanRulings(cleanRulings);
classificationPage.setRotation(rotation);
classificationPage.setLandscape(isLandscape);
@ -289,7 +286,13 @@ public class LayoutParsingPipeline {
}
}
tableExtractionService.extractTables(cleanRulings, classificationPage);
tableExtractionService.extractTables(emptyTableCells, classificationPage);
if (layoutParsingType == LayoutParsingType.REDACT_MANAGER) {
docstrumBlockificationService.combineBlocks(classificationPage);
} else if (layoutParsingType == LayoutParsingType.CLARIFYND) {
docstrumBlockificationService.mergeZones(classificationPage.getTextBlocks());
}
buildPageStatistics(classificationPage);
increaseDocumentStatistics(classificationPage, classificationDocument);
@ -303,14 +306,21 @@ public class LayoutParsingPipeline {
bodyTextFrameService.setBodyTextFrames(classificationDocument, layoutParsingType);
log.info("Classify TextBlocks for {}", identifier);
switch (layoutParsingType) {
case TAAS -> taasClassificationService.classifyDocument(classificationDocument);
case REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_OLD, CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG ->
redactManagerClassificationService.classifyDocument(classificationDocument);
case DOCUMINE -> docuMineClassificationService.classifyDocument(classificationDocument);
case REDACT_MANAGER -> redactManagerClassificationService.classifyDocument(classificationDocument);
}
log.info("Building Sections for {}", identifier);
sectionsBuilderService.buildSections(classificationDocument);
sectionsBuilderService.addImagesToSections(classificationDocument);
switch (layoutParsingType) {
case CLARIFYND_PARAGRAPH_DEBUG, REDACT_MANAGER_PARAGRAPH_DEBUG -> sectionsBuilderService.buildParagraphDebugSections(classificationDocument);
default -> {
sectionsBuilderService.buildSections(classificationDocument);
sectionsBuilderService.addImagesToSections(classificationDocument);
}
}
return classificationDocument;
}

View File

@ -96,7 +96,7 @@ public abstract class AbstractPageBlock extends Rectangle {
return this.minX - threshold <= apb.getMaxX() && this.maxX + threshold >= apb.getMinX();
}
public abstract boolean isEmpty();

View File

@ -15,7 +15,6 @@ import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.No
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlockCollector;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
@ -52,7 +51,7 @@ public class Document implements GenericSemanticNode {
public TextBlock getTextBlock() {
if (textBlock == null) {
textBlock = streamTerminalTextBlocksInOrder().collect(new TextBlockCollector());
textBlock = GenericSemanticNode.super.getTextBlock();
}
return textBlock;
}
@ -67,8 +66,7 @@ public class Document implements GenericSemanticNode {
public Stream<TextBlock> streamTerminalTextBlocksInOrder() {
return streamAllNodes().filter(SemanticNode::isLeaf)
.map(SemanticNode::getLeafTextBlock);
return streamAllNodes().filter(SemanticNode::isLeaf).map(SemanticNode::getTextBlock);
}

View File

@ -0,0 +1,34 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
import java.util.stream.Stream;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlockCollector;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.SuperBuilder;
@Data
@EqualsAndHashCode(callSuper = true)
@SuperBuilder
public class DuplicatedParagraph extends Paragraph {
TextBlock unsortedLeafTextBlock;
@Override
public TextBlock getTextBlock() {
return Stream.of(leafTextBlock, unsortedLeafTextBlock).collect(new TextBlockCollector());
}
@Override
public String toString() {
return super.toString();
}
}

View File

@ -18,11 +18,12 @@ import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
@Data
@Builder
@SuperBuilder
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
@FieldDefaults(level = AccessLevel.PROTECTED)
public class Paragraph implements GenericSemanticNode {
@Builder.Default

View File

@ -11,7 +11,6 @@ import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.No
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlockCollector;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
@ -62,9 +61,7 @@ public class Section implements GenericSemanticNode {
public TextBlock getTextBlock() {
if (textBlock == null) {
textBlock = streamAllSubNodes().filter(SemanticNode::isLeaf)
.map(SemanticNode::getLeafTextBlock)
.collect(new TextBlockCollector());
textBlock = GenericSemanticNode.super.getTextBlock();
}
return textBlock;
}

View File

@ -20,6 +20,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.E
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlockCollector;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
public interface SemanticNode {
@ -39,7 +40,10 @@ public interface SemanticNode {
*
* @return TextBlock containing all AtomicTextBlocks that are located under this Node.
*/
TextBlock getTextBlock();
default TextBlock getTextBlock() {
return streamAllSubNodes().filter(SemanticNode::isLeaf).map(SemanticNode::getTextBlock).collect(new TextBlockCollector());
}
/**

View File

@ -48,7 +48,6 @@ public class Table implements SemanticNode {
@EqualsAndHashCode.Exclude
Map<Page, Rectangle2D> bBoxCache;
/**
* Streams all entities in this table, that appear in a row, which contains any of the provided strings.
*
@ -332,9 +331,7 @@ public class Table implements SemanticNode {
public TextBlock getTextBlock() {
if (textBlock == null) {
textBlock = streamAllSubNodes().filter(SemanticNode::isLeaf)
.map(SemanticNode::getLeafTextBlock)
.collect(new TextBlockCollector());
textBlock = SemanticNode.super.getTextBlock();
}
return textBlock;
}

View File

@ -53,6 +53,9 @@ public class TextPageBlock extends AbstractPageBlock {
@JsonIgnore
private PageBlockType classification;
@JsonIgnore
private boolean toDuplicate;
@JsonIgnore
public TextDirection getDir() {
@ -73,7 +76,7 @@ public class TextPageBlock extends AbstractPageBlock {
return sequences.get(0).getPageWidth();
}
public static TextPageBlock merge(List<TextPageBlock> textBlocksToMerge) {
@ -82,6 +85,7 @@ public class TextPageBlock extends AbstractPageBlock {
return fromTextPositionSequences(sequences);
}
public static TextPageBlock fromTextPositionSequences(List<TextPositionSequence> wordBlockList) {
TextPageBlock textBlock = null;
@ -133,7 +137,6 @@ public class TextPageBlock extends AbstractPageBlock {
}
/**
* Returns the minX value in pdf coordinate system.
* Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation.
@ -362,7 +365,22 @@ public class TextPageBlock extends AbstractPageBlock {
}
return TextNormalizationUtilities.removeHyphenLineBreaks(sb.toString());
}
public int getNumberOfLines() {
int numberOfLines = 1;
TextPositionSequence previous = null;
for (TextPositionSequence word : sequences) {
if (previous != null) {
if (word.getMaxYDirAdj() - previous.getMaxYDirAdj() > word.getTextHeight()) {
numberOfLines++;
}
}
previous = word;
}
return numberOfLines;
}

View File

@ -55,6 +55,17 @@ public class TextPositionSequence implements CharSequence {
}
public TextPositionSequence(List<RedTextPosition> textPositions, int page) {
this.textPositions = textPositions;
this.page = page;
this.dir = TextDirection.fromDegrees(textPositions.get(0).getDir());
this.rotation = textPositions.get(0).getRotation();
this.pageHeight = textPositions.get(0).getPageHeight();
this.pageWidth = textPositions.get(0).getPageWidth();
}
@Override
public int length() {

View File

@ -25,6 +25,7 @@ public class BodyTextFrameService {
private static final float RULING_HEIGHT_THRESHOLD = 0.15f; // multiplied with page height. Header/Footer Rulings must be within that border of the page.
private static final float RULING_WIDTH_THRESHOLD = 0.75f; // multiplied with page width. Header/Footer Rulings must be at least that wide.
public void setBodyTextFrames(ClassificationDocument classificationDocument, LayoutParsingType layoutParsingType) {
Rectangle bodyTextFrame = calculateBodyTextFrame(classificationDocument.getPages(), classificationDocument.getFontSizeCounter(), false, layoutParsingType);
@ -132,12 +133,7 @@ public class BodyTextFrameService {
boolean landscape,
LayoutParsingType layoutParsingType) {
float approximateHeaderLineCount;
if (layoutParsingType.equals(LayoutParsingType.TAAS)) {
approximateHeaderLineCount = 3.3f;
} else {
approximateHeaderLineCount = 2.9f;
}
float approximateHeaderLineCount = 2.9f;
BodyTextFrameExpansionsRectangle expansionsRectangle = new BodyTextFrameExpansionsRectangle();
@ -155,8 +151,9 @@ public class BodyTextFrameService {
continue;
}
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER)
|| MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER)) {
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) || MarkedContentUtils.intersects(textBlock,
page.getMarkedContentBboxPerType(),
MarkedContentUtils.FOOTER)) {
continue;
}

View File

@ -7,6 +7,7 @@ import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import org.apache.logging.log4j.util.Strings;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
@ -110,6 +111,20 @@ public class SectionsBuilderService {
}
public void buildParagraphDebugSections(ClassificationDocument document) {
List<ClassificationSection> sections = new ArrayList<>();
for (var page : document.getPages()) {
page.getTextBlocks().forEach(block -> {
block.setPage(page.getPageNumber());
var section = buildTextBlock(List.of(block), Strings.EMPTY);
sections.add(section);
});
}
document.setSections(sections);
}
public void addImagesToSections(ClassificationDocument document) {
Map<Integer, List<ClassificationSection>> sectionMap = new HashMap<>();

View File

@ -14,7 +14,6 @@ import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Rectangle;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
@ -41,19 +40,18 @@ public class TableExtractionService {
* <p>
* DirAdj (Text direction adjusted) values can not be used here.
*
* @param cleanRulings The lines used to build the table.
* @param page Page object that contains textblocks and statistics.
* @param emptyCells The cells used to build the table.
* @param page Page object that contains textblocks and statistics.
*/
public void extractTables(CleanRulings cleanRulings, ClassificationPage page) {
public void extractTables(List<Cell> emptyCells, ClassificationPage page) {
List<Cell> cells = findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical());
// sort cells by size (height * width) ascending so that textBlocks are always assigned to the smallest cells that contain them
cells.sort(CELL_SIZE_COMPARATOR);
emptyCells.sort(CELL_SIZE_COMPARATOR);
for (AbstractPageBlock abstractPageBlock : page.getTextBlocks()) {
TextPageBlock textBlock = (TextPageBlock) abstractPageBlock;
for (Cell cell : cells) {
for (Cell cell : emptyCells) {
if (cell.hasMinimumSize() && doesCellContainTextBlock(cell, textBlock)) {
cell.addTextBlock(textBlock);
break;
@ -61,7 +59,7 @@ public class TableExtractionService {
}
}
cells = new ArrayList<>(new HashSet<>(cells));
var cells = new ArrayList<>(new HashSet<>(emptyCells));
DoubleComparisons.sort(cells, Rectangle.ILL_DEFINED_ORDER);
List<Rectangle> spreadsheetAreas = SpreadsheetFinder.findSpreadsheetsFromCells(cells);
@ -79,9 +77,7 @@ public class TableExtractionService {
}
}
var containedCellsWithText = containedCells.stream()
.filter(cell -> !cell.getTextBlocks().isEmpty())
.toList();
var containedCellsWithText = containedCells.stream().filter(cell -> !cell.getTextBlocks().isEmpty()).toList();
// verify if table would contain fewer cells with text than the threshold allows
if (containedCellsWithText.size() >= MAX_TABLE_CONTAINED_CELLS_WITH_TEXT && checkIfTableCellsAreUniform(containedCells)) {
@ -101,11 +97,7 @@ public class TableExtractionService {
if (position != -1) {
page.getTextBlocks().add(position, table);
var toBeRemoved = table.getCells()
.stream()
.map(Cell::getTextBlocks)
.flatMap(List::stream)
.toList();
var toBeRemoved = table.getCells().stream().map(Cell::getTextBlocks).flatMap(List::stream).toList();
// remove text blocks from the page that were also added with the table (from its contained cells)
page.getTextBlocks().removeAll(toBeRemoved);
}
@ -115,7 +107,7 @@ public class TableExtractionService {
private boolean checkIfTableCellsAreUniform(List<Cell> containedCells) {
if(containedCells.size() <= 2) {
if (containedCells.size() <= 2) {
return true;
}
@ -139,19 +131,13 @@ public class TableExtractionService {
}
double x0 = cell.getX();
double y0 = cell.getY();
return (x >= x0 - TEXT_BLOCK_CONTAINMENT_TOLERANCE
&& y >= y0 - TEXT_BLOCK_CONTAINMENT_TOLERANCE
&& (x + w) <= x0 + cell.getWidth() + 2 * TEXT_BLOCK_CONTAINMENT_TOLERANCE
&& (y + h) <= y0 + cell.getHeight() + 2 * TEXT_BLOCK_CONTAINMENT_TOLERANCE);
return (x >= x0 - TEXT_BLOCK_CONTAINMENT_TOLERANCE && y >= y0 - TEXT_BLOCK_CONTAINMENT_TOLERANCE && (x + w) <= x0 + cell.getWidth() + 2 * TEXT_BLOCK_CONTAINMENT_TOLERANCE && (y + h) <= y0 + cell.getHeight() + 2 * TEXT_BLOCK_CONTAINMENT_TOLERANCE);
}
public static List<Cell> findCells(List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
return RectangularIntersectionFinder.find(horizontalRulingLines, verticalRulingLines)
.stream()
.map(Cell::new)
.collect(Collectors.toList());
return RectangularIntersectionFinder.find(horizontalRulingLines, verticalRulingLines).stream().map(Cell::new).collect(Collectors.toList());
}
}

View File

@ -0,0 +1,408 @@
package com.knecon.fforesight.service.layoutparser.processor.services.blockification;
import static java.util.stream.Collectors.toSet;
import java.awt.geom.Point2D;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.ListIterator;
import java.util.Set;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.DocstrumSegmentationService;
import com.knecon.fforesight.service.layoutparser.processor.utils.QuickSort;
import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil;
import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionSequenceComparator;
import lombok.RequiredArgsConstructor;
@SuppressWarnings("all")
@Service
@RequiredArgsConstructor
public class DocstrumBlockificationService {
private final DocstrumSegmentationService docstrumSegmentationService;
static final float THRESHOLD = 1f;
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Cell> cells, boolean xyOrder) {
// Underlined or strikethrough are also in rulings but we dont want to split blocks with them so we use cells.
List<Ruling> usedHorizonalRulings = new ArrayList<>();
List<Ruling> usedVerticalRulings = new ArrayList<>();
cells.forEach(cell -> {
usedHorizonalRulings.add(new Ruling(new Point2D.Float(cell.x, cell.y), new Point2D.Float(cell.x + cell.width, cell.y)));
usedHorizonalRulings.add(new Ruling(new Point2D.Float(cell.x, cell.y + cell.height), new Point2D.Float(cell.x + cell.width, cell.y + cell.height)));
usedVerticalRulings.add(new Ruling(new Point2D.Float(cell.x, cell.y), new Point2D.Float(cell.x, cell.y + cell.height)));
usedVerticalRulings.add(new Ruling(new Point2D.Float(cell.x + cell.width, cell.y), new Point2D.Float(cell.x + cell.width, cell.y + cell.height)));
});
List<AbstractPageBlock> abstractPageBlocks = new ArrayList<>();
var zones = docstrumSegmentationService.segmentPage(textPositions, xyOrder);
zones.forEach(zone -> {
List<TextPositionSequence> textPositionSequences = new ArrayList<>();
zone.getLines().forEach(line -> {
line.getWords().forEach(word -> {
textPositionSequences.add(new TextPositionSequence(word.getTextPositions(), word.getPage()));
});
});
abstractPageBlocks.addAll(splitZonesAtRulings(textPositionSequences, usedHorizonalRulings, usedVerticalRulings));
});
return new ClassificationPage(abstractPageBlocks);
}
public void combineBlocks(ClassificationPage page) {
mergeZones(page.getTextBlocks());
TextPageBlock previous = new TextPageBlock();
ListIterator<AbstractPageBlock> itty = page.getTextBlocks().listIterator();
while (itty.hasNext()) {
AbstractPageBlock block = itty.next();
if (block instanceof TablePageBlock) {
continue;
}
TextPageBlock current = (TextPageBlock) block;
if (previous != null && !previous.getSequences().isEmpty()) {
if (current.getDir() == previous.getDir() //
&& previous.getNumberOfLines() >= 2 && current.getNumberOfLines() >= 2 //
&& previous.intersectsY(current) //
&& !hasBetween(current, previous, page.getTextBlocks()) //
&& numberOfYIntersections(current, previous, page.getTextBlocks()) == 0) {
previous.getSequences().addAll(current.getSequences());
previous = buildTextBlock(previous.getSequences(), 0);
previous.setToDuplicate(true);
itty.remove();
itty.previous();
itty.set(previous);
itty.next();
continue;
}
if (current.getDir() == previous.getDir() && (previous.almostIntersects(current, 0, 0))) {
previous.getSequences().addAll(current.getSequences());
boolean toDuplicate = previous.isToDuplicate();
previous = buildTextBlock(previous.getSequences(), 0);
previous.setToDuplicate(toDuplicate);
itty.remove();
itty.previous();
itty.set(previous);
itty.next();
continue;
}
if (current.getDir() == previous.getDir() //
&& (Math.abs(previous.getMaxY() - current.getMaxY()) < THRESHOLD || Math.abs(previous.getMinY() - current.getMinY()) < THRESHOLD) //
&& (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1 || previous.getNumberOfLines() == 2 && current.getNumberOfLines() == 1) //
&& !hasBetween(current, previous, page.getTextBlocks()) && numberOfYIntersections(current, previous, page.getTextBlocks()) <= 4) {
previous.getSequences().addAll(current.getSequences());
previous = buildTextBlock(previous.getSequences(), 0);
itty.remove();
itty.previous();
itty.set(previous);
itty.next();
continue;
}
if (current.getDir() == previous.getDir() //
&& current.intersectsY(previous) //
&& (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1 || previous.getNumberOfLines() == 2 && current.getNumberOfLines() == 1) //
&& !hasBetween(current, previous, page.getTextBlocks()) //
&& numberOfYIntersections(current, previous, page.getTextBlocks()) <= 0) {
previous.getSequences().addAll(current.getSequences());
previous = buildTextBlock(previous.getSequences(), 0);
itty.remove();
itty.previous();
itty.set(previous);
itty.next();
continue;
}
}
previous = current;
}
mergeZones(page.getTextBlocks());
}
private boolean hasBetween(TextPageBlock block, TextPageBlock other, List<AbstractPageBlock> allBlocks) {
for (AbstractPageBlock current : allBlocks) {
if (current == other || current == block) {
continue;
}
if (other.intersectsY(current) && other.getMaxX() <= current.getMinX() && current.getMaxX() <= block.getMinX()) {
return true;
}
}
return false;
}
private int numberOfYIntersections(TextPageBlock block, TextPageBlock other, List<AbstractPageBlock> allBlocks) {
double minY = Math.min(block.getMinY(), other.getMinY());
double maxY = Math.min(block.getMaxY(), other.getMaxY());
int numberOfYIntersections = 0;
for (AbstractPageBlock current : allBlocks) {
if (current == other || current == block) {
continue;
}
if (minY <= current.getMaxY() && maxY >= current.getMinY()) {
numberOfYIntersections++;
}
}
return numberOfYIntersections;
}
public void mergeZones(List<AbstractPageBlock> zones) {
ListIterator<AbstractPageBlock> itty = zones.listIterator();
Set<AbstractPageBlock> toRemove = new HashSet<>();
while (itty.hasNext()) {
AbstractPageBlock block = itty.next();
if (block instanceof TablePageBlock) {
continue;
}
TextPageBlock current = (TextPageBlock) block;
if (current.isToDuplicate()) {
continue;
}
for (int i = 0; i < zones.size(); i++) {
if (toRemove.contains(zones.get(i))) {
continue;
}
if (zones.get(i) == current) {
continue;
}
if (zones.get(i) instanceof TablePageBlock) {
continue;
}
TextPageBlock inner = (TextPageBlock) zones.get(i);
if (inner.isToDuplicate()) {
continue;
}
if (current.getDir() == inner.getDir() && current.almostIntersects(inner, 0, 0)) {
current.getSequences().addAll(inner.getSequences());
QuickSort.sort(current.getSequences(), new TextPositionSequenceComparator());
current = buildTextBlock(current.getSequences(), 0);
toRemove.add(inner);
itty.set(current);
}
}
}
zones.removeAll(toRemove);
}
public List<AbstractPageBlock> splitZonesAtRulings(List<TextPositionSequence> textPositions, List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
int indexOnPage = 0;
List<TextPositionSequence> chunkWords = new ArrayList<>();
List<AbstractPageBlock> chunkBlockList = new ArrayList<>();
float minX = 1000, maxX = 0, minY = 1000, maxY = 0;
TextPositionSequence prev = null;
for (TextPositionSequence word : textPositions) {
boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, horizontalRulingLines, verticalRulingLines);
boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir());
if (prev != null && (splitByDir || isSplitByRuling)) {
TextPageBlock cb1 = buildTextBlock(chunkWords, indexOnPage);
indexOnPage++;
chunkBlockList.add(cb1);
chunkWords = new ArrayList<>();
minX = 1000;
maxX = 0;
minY = 1000;
maxY = 0;
prev = null;
}
chunkWords.add(word);
prev = word;
if (word.getMinXDirAdj() < minX) {
minX = word.getMinXDirAdj();
}
if (word.getMaxXDirAdj() > maxX) {
maxX = word.getMaxXDirAdj();
}
if (word.getMinYDirAdj() < minY) {
minY = word.getMinYDirAdj();
}
if (word.getMaxYDirAdj() > maxY) {
maxY = word.getMaxYDirAdj();
}
}
TextPageBlock cb1 = buildTextBlock(chunkWords, indexOnPage);
if (cb1 != null) {
chunkBlockList.add(cb1);
}
return chunkBlockList;
}
private boolean equalsWithThreshold(float f1, float f2) {
return Math.abs(f1 - f2) < THRESHOLD;
}
private TextPageBlock buildTextBlock(List<TextPositionSequence> wordBlockList, int indexOnPage) {
TextPageBlock textBlock = null;
FloatFrequencyCounter lineHeightFrequencyCounter = new FloatFrequencyCounter();
FloatFrequencyCounter fontSizeFrequencyCounter = new FloatFrequencyCounter();
FloatFrequencyCounter spaceFrequencyCounter = new FloatFrequencyCounter();
StringFrequencyCounter fontFrequencyCounter = new StringFrequencyCounter();
StringFrequencyCounter styleFrequencyCounter = new StringFrequencyCounter();
for (TextPositionSequence wordBlock : wordBlockList) {
lineHeightFrequencyCounter.add(wordBlock.getTextHeight());
fontSizeFrequencyCounter.add(wordBlock.getFontSize());
spaceFrequencyCounter.add(wordBlock.getSpaceWidth());
fontFrequencyCounter.add(wordBlock.getFont());
styleFrequencyCounter.add(wordBlock.getFontStyle());
if (textBlock == null) {
textBlock = new TextPageBlock(wordBlock.getMinXDirAdj(),
wordBlock.getMaxXDirAdj(),
wordBlock.getMinYDirAdj(),
wordBlock.getMaxYDirAdj(),
wordBlockList,
wordBlock.getRotation());
} else {
TextPageBlock spatialEntity = textBlock.union(wordBlock);
textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight());
}
}
if (textBlock != null) {
textBlock.setMostPopularWordFont(fontFrequencyCounter.getMostPopular());
textBlock.setMostPopularWordStyle(styleFrequencyCounter.getMostPopular());
textBlock.setMostPopularWordFontSize(fontSizeFrequencyCounter.getMostPopular());
textBlock.setMostPopularWordHeight(lineHeightFrequencyCounter.getMostPopular());
textBlock.setMostPopularWordSpaceWidth(spaceFrequencyCounter.getMostPopular());
textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest());
}
if (textBlock != null && textBlock.getSequences() != null && textBlock.getSequences().stream().map(t -> round(t.getMinYDirAdj(), 3)).collect(toSet()).size() == 1) {
textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj));
}
return textBlock;
}
private boolean isSplitByRuling(float minX,
float minY,
float maxX,
float maxY,
TextPositionSequence word,
List<Ruling> horizontalRulingLines,
List<Ruling> verticalRulingLines) {
return isSplitByRuling(maxX,
minY,
word.getMinXDirAdj(),
word.getMinYDirAdj(),
verticalRulingLines,
word.getDir().getDegrees(),
word.getPageWidth(),
word.getPageHeight()) //
|| isSplitByRuling(minX,
minY,
word.getMinXDirAdj(),
word.getMaxYDirAdj(),
horizontalRulingLines,
word.getDir().getDegrees(),
word.getPageWidth(),
word.getPageHeight()) //
|| isSplitByRuling(maxX,
minY,
word.getMinXDirAdj(),
word.getMinYDirAdj(),
horizontalRulingLines,
word.getDir().getDegrees(),
word.getPageWidth(),
word.getPageHeight()) //
|| isSplitByRuling(minX,
minY,
word.getMinXDirAdj(),
word.getMaxYDirAdj(),
verticalRulingLines,
word.getDir().getDegrees(),
word.getPageWidth(),
word.getPageHeight());
}
private boolean isSplitByRuling(float previousX2, float previousY1, float currentX1, float currentY1, List<Ruling> rulingLines, float dir, float pageWidth, float pageHeight) {
for (Ruling ruling : rulingLines) {
var line = RulingTextDirAdjustUtil.convertToDirAdj(ruling, dir, pageWidth, pageHeight);
if (line.intersectsLine(previousX2, previousY1, currentX1, currentY1)) {
return true;
}
}
return false;
}
private double round(float value, int decimalPoints) {
var d = Math.pow(10, decimalPoints);
return Math.round(value * d) / d;
}
}

View File

@ -1,330 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.services.blockification;
// TODO: figure out, why this fails the build
// import static com.knecon.fforesight.service.layoutparser.processor.services.factory.SearchTextWithTextPositionFactory.HEIGHT_PADDING;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.Orientation;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil;
import org.springframework.stereotype.Service;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Stream;
@Service
@SuppressWarnings("all")
public class TaasBlockificationService {
private static final float THRESHOLD = 1f;
private static final float Y_GAP_SPLIT_HEIGHT_MODIFIER = 1.25f; // multiplied with text height
private static final float INTERSECTS_Y_THRESHOLD = 4;// 2 * HEIGHT_PADDING // This is exactly 2 times our position height padding. This is required to find boxes that are visually intersecting.
private static final int X_GAP_SPLIT_CONSTANT = 50;
public static final int X_ALIGNMENT_THRESHOLD = 1;
public static final int NEGATIVE_X_GAP_THRESHOLD = -5;
private Pattern listIdentifier = Pattern.compile("^(?:(?:[1-9]|1\\d|20|[ivxlc]|[a-z])\\s*(?:[.)]))|\\uF0B7", Pattern.CASE_INSENSITIVE);
/**
* This method is building blocks by expanding the minX/maxX and minY/maxY value on each word that is not split by the conditions.
* This method must use text direction adjusted postions (DirAdj). Where {0,0} is on the upper left. Never try to change this!
* Rulings (Table lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling.
*
* @param textPositions The words of a page.
* @param horizontalRulingLines Horizontal table lines.
* @param verticalRulingLines Vertical table lines.
* @return ClassificationPage object that contains the Textblock and text statistics.
*/
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
List<TextPageBlock> classificationTextBlocks = constructFineGranularTextPageBlocks(textPositions, horizontalRulingLines, verticalRulingLines);
classificationTextBlocks = mergeTextPageBlocksAligningX(classificationTextBlocks);
classificationTextBlocks = mergeIntersectingTextBlocksUntilConvergence(classificationTextBlocks);
return new ClassificationPage(new ArrayList<>(classificationTextBlocks.stream().map(classificationTextBlock -> (AbstractPageBlock) classificationTextBlock).toList()));
}
private List<TextPageBlock> mergeIntersectingTextBlocksUntilConvergence(List<TextPageBlock> classificationTextBlocks) {
int currentSize = classificationTextBlocks.size();
while (true) {
classificationTextBlocks = mergeTextPageBlocksAlmostIntersecting(classificationTextBlocks);
if (classificationTextBlocks.size() == currentSize) {
break;
}
currentSize = classificationTextBlocks.size();
}
return classificationTextBlocks;
}
private List<TextPageBlock> mergeTextPageBlocksAligningX(List<TextPageBlock> classificationTextBlocks) {
if (classificationTextBlocks.isEmpty()) {
return new ArrayList<>();
}
List<List<TextPageBlock>> textBlocksToMerge = new LinkedList<>();
List<TextPageBlock> currentTextBlocksToMerge = new LinkedList<>();
textBlocksToMerge.add(currentTextBlocksToMerge);
TextPageBlock previousTextBlock = null;
Float lastLineGap = null;
for (TextPageBlock currentTextBlock : classificationTextBlocks) {
if (previousTextBlock == null) {
currentTextBlocksToMerge.add(currentTextBlock);
previousTextBlock = currentTextBlock;
continue;
}
Matcher listIdentifierPattern = listIdentifier.matcher(currentTextBlock.getText());
boolean isListIdentifier = listIdentifierPattern.find();
boolean yGap = Math.abs(currentTextBlock.getPdfMaxY() - previousTextBlock.getPdfMinY()) < previousTextBlock.getMostPopularWordHeight() * Y_GAP_SPLIT_HEIGHT_MODIFIER;
boolean sameFont = previousTextBlock.getMostPopularWordFont().equals(currentTextBlock.getMostPopularWordFont()) && previousTextBlock.getMostPopularWordFontSize() == currentTextBlock.getMostPopularWordFontSize();
// boolean yGap = previousTextBlock != null && currentTextBlock.getMinYDirAdj() - maxY > Math.min(word.getHeight(), prev.getHeight()) * Y_GAP_SPLIT_HEIGHT_MODIFIER;
boolean alignsXRight = Math.abs(currentTextBlock.getPdfMaxX() - previousTextBlock.getPdfMaxX()) < X_ALIGNMENT_THRESHOLD;
boolean alignsXLeft = Math.abs(currentTextBlock.getPdfMinX() - previousTextBlock.getPdfMinX()) < X_ALIGNMENT_THRESHOLD;
// boolean smallYGap = Math.abs(currentTextBlock.getPdfMaxY() - previousTextBlock.getPdfMinY()) < yGap;
if (yGap && sameFont && !isListIdentifier) {
currentTextBlocksToMerge.add(currentTextBlock);
} else {
currentTextBlocksToMerge = new LinkedList<>();
currentTextBlocksToMerge.add(currentTextBlock);
textBlocksToMerge.add(currentTextBlocksToMerge);
}
previousTextBlock = currentTextBlock;
}
return textBlocksToMerge.stream().map(TextPageBlock::merge).toList();
}
private List<TextPageBlock> mergeTextPageBlocksAlmostIntersecting(List<TextPageBlock> textPageBlocks) {
Set<TextPageBlock> alreadyMerged = new HashSet<>();
List<List<TextPageBlock>> textBlocksToMerge = new LinkedList<>();
for (TextPageBlock textPageBlock : textPageBlocks) {
if (alreadyMerged.contains(textPageBlock)) {
continue;
}
alreadyMerged.add(textPageBlock);
textBlocksToMerge.add(Stream.concat(Stream.of(textPageBlock),
textPageBlocks.stream().filter(textPageBlock2 -> textPageBlock.almostIntersects(textPageBlock2, INTERSECTS_Y_THRESHOLD, 0) && !alreadyMerged.contains(textPageBlock2)).peek(alreadyMerged::add))
.toList());
}
return textBlocksToMerge.stream().map(TextPageBlock::merge).toList();
}
private void assignOrientations(List<TextPageBlock> classificationTextBlocks) {
Iterator<TextPageBlock> itty = classificationTextBlocks.iterator();
TextPageBlock previousLeft = null;
TextPageBlock previousRight = null;
while (itty.hasNext()) {
TextPageBlock block = (TextPageBlock) itty.next();
if (previousLeft != null && block.getOrientation().equals(Orientation.LEFT)) {
if (previousLeft.getMinY() > block.getMinY() && block.getMaxY() + block.getMostPopularWordHeight() > previousLeft.getMinY()) {
previousLeft.add(block);
itty.remove();
continue;
}
}
if (previousRight != null && block.getOrientation().equals(Orientation.RIGHT)) {
if (previousRight.getMinY() > block.getMinY() && block.getMaxY() + block.getMostPopularWordHeight() > previousRight.getMinY()) {
previousRight.add(block);
itty.remove();
continue;
}
}
if (block.getOrientation().equals(Orientation.LEFT)) {
previousLeft = block;
} else if (block.getOrientation().equals(Orientation.RIGHT)) {
previousRight = block;
}
}
itty = classificationTextBlocks.iterator();
TextPageBlock previous = null;
while (itty.hasNext()) {
TextPageBlock block = (TextPageBlock) itty.next();
if (previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation().equals(Orientation.LEFT) && equalsWithThreshold(
block.getMaxY(),
previous.getMaxY()) || previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation()
.equals(Orientation.RIGHT) && equalsWithThreshold(block.getMaxY(), previous.getMaxY())) {
previous.add(block);
itty.remove();
continue;
}
previous = block;
}
}
private List<TextPageBlock> constructFineGranularTextPageBlocks(List<TextPositionSequence> textPositions,
List<Ruling> horizontalRulingLines,
List<Ruling> verticalRulingLines) {
int indexOnPage = 0;
List<TextPositionSequence> wordClusterToCombine = new ArrayList<>();
List<TextPageBlock> classificationTextBlocks = new ArrayList<>();
float minX = 1000, maxX = 0, minY = 1000, maxY = 0;
TextPositionSequence prev = null;
// TODO: make static final constant
boolean wasSplitted = false;
Float splitX1 = null;
for (TextPositionSequence word : textPositions) {
Matcher listIdentifierPattern = listIdentifier.matcher(word.toString());
boolean yGap = prev != null && word.getMinYDirAdj() - maxY > Math.min(word.getHeight(), prev.getHeight()) * Y_GAP_SPLIT_HEIGHT_MODIFIER;
boolean sameLine = prev != null && equalsWithThreshold(prev.getMinYDirAdj(), word.getMinYDirAdj());
boolean positiveXGapInline = prev != null && maxX + X_GAP_SPLIT_CONSTANT < word.getMinXDirAdj() && sameLine;
boolean negativeXGap = prev != null && word.getMinXDirAdj() - minX < NEGATIVE_X_GAP_THRESHOLD;
boolean startFromTop = prev != null && word.getMinYDirAdj() < prev.getMinYDirAdj() - prev.getTextHeight();
boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj();
boolean splitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, horizontalRulingLines, verticalRulingLines);
boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir());
boolean fontChange = prev != null && (!word.getFont().equals(prev.getFont()) || !word.getFontStyle()
.equals(prev.getFontStyle()) || word.getFontSize() != prev.getFontSize());
boolean newline = prev != null && Math.abs(word.getMinYDirAdj() - prev.getMinYDirAdj()) > word.getHeight();
boolean isListIdentifier = listIdentifierPattern.matches();
if (prev != null && (prev.isParagraphStart() || negativeXGap || positiveXGapInline || yGap || startFromTop || splitByRuling || (newline && (fontChange || isListIdentifier)))) {
// if (prev != null && (lineSeparation || startFromTop || splitByX || splitByDir || isSplitByRuling)) {
Orientation prevOrientation = null;
if (!classificationTextBlocks.isEmpty()) {
prevOrientation = classificationTextBlocks.get(classificationTextBlocks.size() - X_ALIGNMENT_THRESHOLD).getOrientation();
}
TextPageBlock classificationTextBlock = TextPageBlock.fromTextPositionSequences(wordClusterToCombine);
classificationTextBlocks.add(classificationTextBlock);
wordClusterToCombine = new ArrayList<>();
if (positiveXGapInline && !splitByRuling) {
wasSplitted = true;
classificationTextBlock.setOrientation(Orientation.LEFT);
splitX1 = word.getMinXDirAdj();
} else if (newLineAfterSplit && !splitByRuling) {
wasSplitted = false;
classificationTextBlock.setOrientation(Orientation.RIGHT);
splitX1 = null;
} else if (prevOrientation != null && prevOrientation.equals(Orientation.RIGHT) && (yGap || !startFromTop || !positiveXGapInline || !newLineAfterSplit || !splitByRuling)) {
classificationTextBlock.setOrientation(Orientation.LEFT);
}
minX = 1000;
maxX = 0;
minY = 1000;
maxY = 0;
prev = null;
}
wordClusterToCombine.add(word);
prev = word;
if (word.getMinXDirAdj() < minX) {
minX = word.getMinXDirAdj();
}
if (word.getMaxXDirAdj() > maxX) {
maxX = word.getMaxXDirAdj();
}
if (word.getMinYDirAdj() < minY) {
minY = word.getMinYDirAdj();
}
if (word.getMaxYDirAdj() > maxY) {
maxY = word.getMaxYDirAdj();
}
}
TextPageBlock classificationTextBlock = TextPageBlock.fromTextPositionSequences(wordClusterToCombine);
if (classificationTextBlock != null) {
classificationTextBlocks.add(classificationTextBlock);
}
return classificationTextBlocks;
}
private boolean equalsWithThreshold(float f1, float f2) {
return Math.abs(f1 - f2) < THRESHOLD;
}
private boolean isSplitByRuling(float minX,
float minY,
float maxX,
float maxY,
TextPositionSequence word,
List<Ruling> horizontalRulingLines,
List<Ruling> verticalRulingLines) {
return isSplitByRuling(maxX,
minY,
word.getMinXDirAdj(),
word.getMinYDirAdj(),
verticalRulingLines,
word.getDir().getDegrees(),
word.getPageWidth(),
word.getPageHeight()) //
|| isSplitByRuling(minX,
minY,
word.getMinXDirAdj(),
word.getMaxYDirAdj(),
horizontalRulingLines,
word.getDir().getDegrees(),
word.getPageWidth(),
word.getPageHeight()) //
|| isSplitByRuling(maxX,
minY,
word.getMinXDirAdj(),
word.getMinYDirAdj(),
horizontalRulingLines,
word.getDir().getDegrees(),
word.getPageWidth(),
word.getPageHeight()) //
|| isSplitByRuling(minX,
minY,
word.getMinXDirAdj(),
word.getMaxYDirAdj(),
verticalRulingLines,
word.getDir().getDegrees(),
word.getPageWidth(),
word.getPageHeight()); //
}
private boolean isSplitByRuling(float previousX2, float previousY1, float currentX1, float currentY1, List<Ruling> rulingLines, float dir, float pageWidth, float pageHeight) {
for (Ruling ruling : rulingLines) {
var line = RulingTextDirAdjustUtil.convertToDirAdj(ruling, dir, pageWidth, pageHeight);
if (line.intersectsLine(previousX2, previousY1, currentX1, currentY1)) {
return true;
}
}
return false;
}
}

View File

@ -1,114 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.services.classification;
import java.util.List;
import java.util.regex.Pattern;
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.services.BodyTextFrameService;
import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@Service
@RequiredArgsConstructor
public class TaasClassificationService {
private final BodyTextFrameService bodyTextFrameService;
public void classifyDocument(ClassificationDocument document) {
List<Float> headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular();
log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());
for (ClassificationPage page : document.getPages()) {
classifyPage(page, document, headlineFontSizes);
}
}
public void classifyPage(ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes) {
for (AbstractPageBlock textBlock : page.getTextBlocks()) {
if (textBlock instanceof TextPageBlock) {
classifyBlock((TextPageBlock) textBlock, page, document, headlineFontSizes);
}
}
}
public void classifyBlock(TextPageBlock textBlock, ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes) {
var bodyTextFrame = page.getBodyTextFrame();
if (document.getFontSizeCounter().getMostPopular() == null) {
textBlock.setClassification(PageBlockType.OTHER);
return;
}
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER)
|| PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation())) {
textBlock.setClassification(PageBlockType.HEADER);
} else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER)
|| PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation())) {
textBlock.setClassification(PageBlockType.FOOTER);
} else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock,
document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks()
.size() == 1)) {
if (!Pattern.matches("[0-9]+", textBlock.toString())) {
textBlock.setClassification(PageBlockType.TITLE);
}
} else if (textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter()
.getMostPopular() && PositionUtils.getApproxLineCount(textBlock) < 4.9 && (textBlock.getMostPopularWordStyle().equals("bold") || !document.getFontStyleCounter()
.getCountPerValue()
.containsKey("bold") && textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular() + 1) && textBlock.getSequences()
.get(0)
.getTextPositions()
.get(0)
.getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) {
for (int i = 1; i <= headlineFontSizes.size(); i++) {
if (textBlock.getMostPopularWordFontSize() == headlineFontSizes.get(i - 1)) {
textBlock.setClassification(PageBlockType.getHeadlineType(i));
document.setHeadlines(true);
}
}
} else if (!textBlock.getText().startsWith("Figure ") && PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordStyle()
.equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold") && PositionUtils.getApproxLineCount(textBlock) < 2.9 && textBlock.getSequences()
.get(0)
.getTextPositions()
.get(0)
.getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) {
textBlock.setClassification(PageBlockType.getHeadlineType(headlineFontSizes.size() + 1));
document.setHeadlines(true);
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
.getMostPopular() && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold")) {
textBlock.setClassification(PageBlockType.PARAGRAPH_BOLD);
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFont()
.equals(document.getFontCounter().getMostPopular()) && textBlock.getMostPopularWordStyle()
.equals(document.getFontStyleCounter().getMostPopular()) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()) {
textBlock.setClassification(PageBlockType.PARAGRAPH);
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
.getMostPopular() && textBlock.getMostPopularWordStyle().equals("italic") && !document.getFontStyleCounter()
.getMostPopular()
.equals("italic") && PositionUtils.getApproxLineCount(textBlock) < 2.9) {
textBlock.setClassification(PageBlockType.PARAGRAPH_ITALIC);
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)) {
textBlock.setClassification(PageBlockType.PARAGRAPH_UNKNOWN);
} else {
textBlock.setClassification(PageBlockType.OTHER);
}
}
}

View File

@ -0,0 +1,59 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Character;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Zone;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service.LineBuilderService;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service.NearestNeighbourService;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service.ReadingOrderService;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service.SpacingService;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service.ZoneBuilderService;
import lombok.RequiredArgsConstructor;
@Service
@RequiredArgsConstructor
public class DocstrumSegmentationService {
private final NearestNeighbourService nearestNeighbourService;
private final SpacingService spacingService;
private final LineBuilderService lineBuilderService;
private final ZoneBuilderService zoneBuilderService;
private final ReadingOrderService readingOrderService;
public List<Zone> segmentPage(List<TextPositionSequence> textPositions, boolean xyOrder) {
List<Zone> zones = new ArrayList<>();
zones.addAll(computeZones(textPositions, TextDirection.ZERO));
zones.addAll(computeZones(textPositions, TextDirection.QUARTER_CIRCLE));
zones.addAll(computeZones(textPositions, TextDirection.HALF_CIRCLE));
zones.addAll(computeZones(textPositions, TextDirection.THREE_QUARTER_CIRCLE));
return readingOrderService.resolve(zones, xyOrder);
}
private List<Zone> computeZones(List<TextPositionSequence> textPositions, TextDirection direction) {
var positions = textPositions.stream().filter(t -> t.getDir() == direction).map(TextPositionSequence::getTextPositions).flatMap(List::stream).toList();
var characters = positions.stream().map(Character::new).collect(Collectors.toList());
nearestNeighbourService.findNearestNeighbors(characters);
var characterSpacing = spacingService.computeCharacterSpacing(characters);
var lineSpacing = Math.min(spacingService.computeLineSpacing(characters), 20);
var lines = lineBuilderService.buildLines(characters, characterSpacing, lineSpacing);
return zoneBuilderService.buildZones(lines, characterSpacing, lineSpacing);
}
}

View File

@ -0,0 +1,25 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model;
public class AngleFilter {
protected double lowerAngle;
protected double upperAngle;
public AngleFilter(double lowerAngle, double upperAngle) {
this.lowerAngle = lowerAngle < -Math.PI / 2 ? lowerAngle + Math.PI : lowerAngle;
this.upperAngle = upperAngle >= Math.PI / 2 ? upperAngle - Math.PI : upperAngle;
}
public boolean matches(Neighbor neighbor) {
if (lowerAngle <= upperAngle) {
return lowerAngle <= neighbor.getAngle() && neighbor.getAngle() < upperAngle;
} else {
return lowerAngle <= neighbor.getAngle() || neighbor.getAngle() < upperAngle;
}
}
}

View File

@ -0,0 +1,57 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model;
import java.awt.geom.Rectangle2D;
import lombok.Data;
@Data
public abstract class BoundingBox {
private Rectangle2D bBox;
public double getX() {
return bBox.getX();
}
public double getY() {
return bBox.getY();
}
public double getWidth() {
return bBox.getWidth();
}
public double getHeight() {
return bBox.getHeight();
}
public double getArea() {
return (bBox.getHeight() * bBox.getWidth());
}
public boolean contains(Rectangle2D contained, double tolerance) {
return bBox.getX() <= contained.getX() + tolerance
&& bBox.getY() <= contained.getY() + tolerance
&& bBox.getX() + bBox.getWidth() >= contained.getX() + contained.getWidth() - tolerance
&& bBox.getY() + bBox.getHeight() >= contained.getY() + contained.getHeight() - tolerance;
}
public boolean intersectsY(BoundingBox other) {
return this.getBBox().getMinY() <= other.getBBox().getMaxY() && this.getBBox().getMaxY() >= other.getBBox().getMinY();
}
}

View File

@ -0,0 +1,85 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
import lombok.Data;
@Data
public class Character {
private final double x;
private final double y;
private final RedTextPosition textPosition;
private List<Neighbor> neighbors = new ArrayList<>();
public Character(RedTextPosition chunk) {
this.x = chunk.getXDirAdj() + chunk.getWidthDirAdj() / 2;
this.y = chunk.getYDirAdj() + chunk.getHeightDir() / 2;
this.textPosition = chunk;
}
public double getHeight() {
return textPosition.getHeightDir();
}
public double distance(Character character) {
double dx = getX() - character.getX();
double dy = getY() - character.getY();
return Math.sqrt(dx * dx + dy * dy);
}
public double horizontalDistance(Character character) {
return Math.abs(getX() - character.getX());
}
public double verticalDistance(Character character) {
return Math.abs(getY() - character.getY());
}
public double overlappingDistance(Character other) {
double[] xs = new double[4];
double s = Math.sin(-0);
double c = Math.cos(-0);
xs[0] = c * x - s * y;
xs[1] = c * (x + textPosition.getWidthDirAdj()) - s * (y + textPosition.getHeightDir());
xs[2] = c * other.x - s * other.y;
xs[3] = c * (other.x + other.textPosition.getWidthDirAdj()) - s * (other.y + other.textPosition.getHeightDir());
boolean overlapping = xs[1] >= xs[2] && xs[3] >= xs[0];
Arrays.sort(xs);
return Math.abs(xs[2] - xs[1]) * (overlapping ? 1 : -1);
}
public void setNeighbors(List<Neighbor> neighbors) {
this.neighbors = neighbors;
}
public double angle(Character character) {
if (getX() > character.getX()) {
return Math.atan2(getY() - character.getY(), getX() - character.getX());
} else {
return Math.atan2(character.getY() - getY(), character.getX() - getX());
}
}
}

View File

@ -0,0 +1,90 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model;
public class Histogram {
private static final double EPSILON = 1.0e-6;
private final double min;
private final double resolution;
private double[] frequencies;
public Histogram(double minValue, double maxValue, double resolution) {
this.min = minValue - EPSILON;
double delta = maxValue - minValue + 2 * EPSILON;
int size = Math.max(1, (int) Math.round((maxValue - minValue) / resolution));
this.resolution = delta / size;
this.frequencies = new double[size];
}
public void kernelSmooth(double[] kernel) {
double[] newFrequencies = new double[frequencies.length];
int shift = (kernel.length - 1) / 2;
for (int i = 0; i < kernel.length; i++) {
int jStart = Math.max(0, i - shift);
int jEnd = Math.min(frequencies.length, frequencies.length + i - shift);
for (int j = jStart; j < jEnd; j++) {
newFrequencies[j - i + shift] += kernel[i] * frequencies[j];
}
}
frequencies = newFrequencies;
}
public double[] createGaussianKernel(double length, double stdDeviation) {
int r = (int) Math.round(length / resolution) / 2;
int size = 2 * r + 1;
double[] kernel = new double[size];
double sum = 0;
double b = 2 * (stdDeviation / resolution) * (stdDeviation / resolution);
double a = 1 / Math.sqrt(Math.PI * b);
for (int i = 0; i < size; i++) {
kernel[i] = a * Math.exp(-(i - r) * (i - r) / b);
sum += kernel[i];
}
for (int i = 0; i < size; i++) {
kernel[i] /= sum;
}
return kernel;
}
public void gaussianSmooth(double windowLength, double stdDeviation) {
kernelSmooth(createGaussianKernel(windowLength, stdDeviation));
}
public void add(double value) {
frequencies[(int) ((value - min) / resolution)] += 1.0;
}
public int getSize() {
return frequencies.length;
}
public double getPeakValue() {
int peakIndex = 0;
for (int i = 1; i < frequencies.length; i++) {
if (frequencies[i] > frequencies[peakIndex]) {
peakIndex = i;
}
}
int peakEndIndex = peakIndex + 1;
final double EPS = 0.0001;
while (peakEndIndex < frequencies.length && Math.abs(frequencies[peakEndIndex] - frequencies[peakIndex]) < EPS) {
peakEndIndex++;
}
return ((double) peakIndex + peakEndIndex) / 2 * resolution + min;
}
}

View File

@ -0,0 +1,164 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import lombok.Data;
@Data
public class Line extends BoundingBox {
private static final double WORD_DISTANCE_MULTIPLIER = 0.18;
private final double x0;
private final double y0;
private final double x1;
private final double y1;
private final double height;
private final List<Character> characters;
private final List<TextPositionSequence> words = new ArrayList<>();
public Line(List<Character> characters, double wordSpacing) {
this.characters = characters;
if (characters.size() >= 2) {
// linear regression
double sx = 0.0;
double sxx = 0.0;
double sxy = 0.0;
double sy = 0.0;
for (Character character : characters) {
sx += character.getX();
sxx += character.getX() * character.getX();
sxy += character.getX() * character.getY();
sy += character.getY();
}
double b = (characters.size() * sxy - sx * sy) / (characters.size() * sxx - sx * sx);
double a = (sy - b * sx) / characters.size();
this.x0 = characters.get(0).getX();
this.y0 = a + b * this.x0;
this.x1 = characters.get(characters.size() - 1).getX();
this.y1 = a + b * this.x1;
} else {
Character character = characters.get(0);
double dx = character.getTextPosition().getWidthDirAdj() / 3;
double dy = dx * Math.tan(0);
this.x0 = character.getX() - dx;
this.x1 = character.getX() + dx;
this.y0 = character.getY() - dy;
this.y1 = character.getY() + dy;
}
height = computeHeight();
computeWords(wordSpacing * WORD_DISTANCE_MULTIPLIER);
buildBBox();
}
public double getAngle() {
return Math.atan2(y1 - y0, x1 - x0);
}
public double getLength() {
return Math.sqrt((x0 - x1) * (x0 - x1) + (y0 - y1) * (y0 - y1));
}
private double computeHeight() {
return characters.stream().map(Character::getHeight).reduce(0d, Double::sum) / characters.size();
}
public double angularDifference(Line j) {
double diff = Math.abs(getAngle() - j.getAngle());
if (diff <= Math.PI / 2) {
return diff;
} else {
return Math.PI - diff;
}
}
public double horizontalDistance(Line other) {
double[] xs = new double[4];
xs[0] = x0;
xs[1] = x1;
xs[2] = other.x0;
xs[3] = other.x1;
boolean overlapping = xs[1] >= xs[2] && xs[3] >= xs[0];
Arrays.sort(xs);
return Math.abs(xs[2] - xs[1]) * (overlapping ? 1 : -1);
}
public double verticalDistance(Line other) {
double ym = (y0 + y1) / 2;
double yn = (other.y0 + other.y1) / 2;
return Math.abs(ym - yn) / Math.sqrt(1);
}
private void computeWords(double wordSpacing) {
TextPositionSequence word = new TextPositionSequence();
Character previous = null;
for (Character current : characters) {
if (previous != null) {
double dist = current.getTextPosition().getXDirAdj() - previous.getTextPosition().getXDirAdj() - previous.getTextPosition().getWidthDirAdj();
if (dist > wordSpacing) {
words.add(word);
word = new TextPositionSequence();
}
}
word.getTextPositions().add(current.getTextPosition());
previous = current;
}
words.add(word);
}
private void buildBBox() {
double minX = Double.POSITIVE_INFINITY;
double minY = Double.POSITIVE_INFINITY;
double maxX = Double.NEGATIVE_INFINITY;
double maxY = Double.NEGATIVE_INFINITY;
for (Character character : characters) {
minX = Math.min(minX, character.getTextPosition().getXDirAdj());
minY = Math.min(minY, character.getTextPosition().getYDirAdj());
maxX = Math.max(maxX, character.getTextPosition().getXDirAdj() + character.getTextPosition().getWidthDirAdj());
maxY = Math.max(maxY, character.getTextPosition().getYDirAdj() + character.getTextPosition().getHeightDir());
}
this.setBBox(new Rectangle2D.Double(minX, minY, maxX - minX, maxY - minY));
}
public String toString() {
StringBuilder sb = new StringBuilder();
words.forEach(word -> sb.append(word.toString()).append(" "));
return sb.toString().trim();
}
}

View File

@ -0,0 +1,36 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model;
import lombok.Getter;
public class Neighbor {
@Getter
private final double distance;
@Getter
private final double angle;
private final Character originCharacter;
@Getter
private final Character character;
public Neighbor(Character neighbor, Character origin) {
this.distance = neighbor.distance(origin);
this.angle = neighbor.angle(origin);
this.character = neighbor;
this.originCharacter = origin;
}
public double getHorizontalDistance() {
return character.horizontalDistance(originCharacter);
}
public double getVerticalDistance() {
return character.verticalDistance(originCharacter);
}
}

View File

@ -0,0 +1,31 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model;
import java.util.Collection;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.Set;
public class UnionFind<T> extends org.jgrapht.alg.util.UnionFind<T> {
public UnionFind(Set<T> elements) {
super(elements);
}
public Collection<Set<T>> getGroups() {
Map<T, Set<T>> setRep = new LinkedHashMap<>();
for (T t : getParentMap().keySet()) {
T representative = find(t);
if (!setRep.containsKey(representative)) {
setRep.put(representative, new LinkedHashSet<>());
}
setRep.get(representative).add(t);
}
return setRep.values();
}
}

View File

@ -0,0 +1,51 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model;
import java.awt.geom.Rectangle2D;
import java.util.Comparator;
import java.util.List;
import lombok.Data;
@Data
public class Zone extends BoundingBox {
private List<Line> lines;
@SuppressWarnings("PMD.ConstructorCallsOverridableMethod")
public Zone(List<Line> lines) {
lines.sort(Comparator.comparingDouble(Line::getY));
this.lines = lines;
buildBBox();
}
public void buildBBox() {
double minX = Double.POSITIVE_INFINITY;
double minY = Double.POSITIVE_INFINITY;
double maxX = Double.NEGATIVE_INFINITY;
double maxY = Double.NEGATIVE_INFINITY;
for (Line line : lines) {
minX = Math.min(minX, line.getX());
minY = Math.min(minY, line.getY());
maxX = Math.max(maxX, line.getX() + line.getWidth());
maxY = Math.max(maxY, line.getY() + line.getHeight());
}
this.setBBox(new Rectangle2D.Double(minX, minY, maxX - minX, maxY - minY));
}
public String toString() {
StringBuilder sb = new StringBuilder();
lines.forEach(line -> sb.append(line.toString()).append("\n"));
return sb.toString().trim();
}
}

View File

@ -0,0 +1,53 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.AngleFilter;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Character;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Line;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.UnionFind;
@Service
public class LineBuilderService {
private static final double CHARACTER_SPACING_DISTANCE_MULTIPLIER = 3.5;
private static final double MAX_VERTICAL_CHARACTER_DISTANCE = 0.67;
private static final double ANGLE_TOLERANCE = Math.PI / 6;
public List<Line> buildLines(List<Character> characters, double characterSpacing, double lineSpacing) {
double maxHorizontalDistance = characterSpacing * CHARACTER_SPACING_DISTANCE_MULTIPLIER;
double maxVerticalDistance = lineSpacing * MAX_VERTICAL_CHARACTER_DISTANCE;
UnionFind<Character> unionFind = new UnionFind<>(new HashSet<>(characters));
AngleFilter filter = new AngleFilter(-ANGLE_TOLERANCE, ANGLE_TOLERANCE);
characters.forEach(character -> {
character.getNeighbors().forEach(neighbor -> {
double x = neighbor.getHorizontalDistance() / maxHorizontalDistance;
double y = neighbor.getVerticalDistance() / maxVerticalDistance;
if (character.getTextPosition().getDir() == neighbor.getCharacter().getTextPosition().getDir() && filter.matches(neighbor) && Math.pow(x, 2) + Math.pow(y,
2) <= 1) {
unionFind.union(character, neighbor.getCharacter());
}
});
});
List<Line> lines = new ArrayList<>();
unionFind.getGroups().forEach(group -> {
List<Character> lineCharacters = new ArrayList<>(group);
lineCharacters.sort(Comparator.comparingDouble(Character::getX));
lines.add(new Line(lineCharacters, characterSpacing));
});
return lines;
}
}

View File

@ -0,0 +1,78 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Character;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Neighbor;
@Service
public class NearestNeighbourService {
private static final int NUMBER_OF_NEIGHBOURS = 8;
private static final double STEP = 16.0;
public void findNearestNeighbors(List<Character> characters) {
if (characters.isEmpty() || characters.size() == 1) {
return;
}
characters.sort(Comparator.comparingDouble(Character::getX));
int maxNeighborCount = NUMBER_OF_NEIGHBOURS;
if (characters.size() <= NUMBER_OF_NEIGHBOURS) {
maxNeighborCount = characters.size() - 1;
}
for (int i = 0; i < characters.size(); i++) {
List<Neighbor> candidates = new ArrayList<>();
int start = i;
int end = i + 1;
double distance = Double.POSITIVE_INFINITY;
for (double searchDistance = 0; searchDistance < distance; ) {
searchDistance += STEP;
boolean newCandidatesFound = false;
while (start > 0 && characters.get(i).getX() - characters.get(start - 1).getX() < searchDistance) {
start--;
candidates.add(new Neighbor(characters.get(start), characters.get(i)));
clearLeastDistant(candidates, maxNeighborCount);
newCandidatesFound = true;
}
while (end < characters.size() && characters.get(end).getX() - characters.get(i).getX() < searchDistance) {
candidates.add(new Neighbor(characters.get(end), characters.get(i)));
clearLeastDistant(candidates, maxNeighborCount);
end++;
newCandidatesFound = true;
}
if (newCandidatesFound && candidates.size() >= maxNeighborCount) {
distance = candidates.get(maxNeighborCount - 1).getDistance();
}
}
clearLeastDistant(candidates, maxNeighborCount);
characters.get(i).setNeighbors(new ArrayList<>(candidates));
}
}
private void clearLeastDistant(List<Neighbor> candidates, int maxNeighborCount) {
if (candidates.size() > maxNeighborCount) {
candidates.sort(Comparator.comparingDouble(Neighbor::getDistance));
candidates.remove(candidates.remove(candidates.size() - 1));
}
}
}

View File

@ -0,0 +1,165 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.ListIterator;
import java.util.Map;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.BoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Zone;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.utils.DoubleUtils;
@Service
public class ReadingOrderService {
private static final double THRESHOLD = 5;
public static final double MULTI_COLUMN_DETECTION_THRESHOLD = 1.5;
public List<Zone> resolve(List<Zone> zones, boolean xyReadingOrder) {
if (zones.isEmpty() || zones.size() == 1) {
return zones;
}
if (xyReadingOrder) {
return resolveSingleColumnReadingOrder(zones);
}
Map<Long, Integer> histogram = new HashMap<>();
for (Zone zone : zones) {
long minY = Math.round(zone.getBBox().getMinY());
long maxY = Math.round(zone.getBBox().getMaxY());
for (long i = minY; i <= maxY; i++) {
histogram.put(i, histogram.getOrDefault(i, 0) + 1);
}
}
if (histogram.values().stream().mapToInt(Integer::intValue).average().orElse(1) < MULTI_COLUMN_DETECTION_THRESHOLD) {
return resolveSingleColumnReadingOrder(zones);
} else {
return resolveMultiColumnReadingOder(zones);
}
}
private static List<Zone> resolveSingleColumnReadingOrder(List<Zone> zones) {
zones.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
return zones;
}
private List<Zone> resolveMultiColumnReadingOder(List<Zone> zones) {
// Simple reading order resolver for multi column page layout as described here : https://pub.towardsai.net/advanced-rag-02-unveiling-pdf-parsing-b84ae866344e
// TODO implement a more fancy reading order resolver see https://github.com/BobLd/DocumentLayoutAnalysis/blob/master/README.md#reading-order
double minX = Double.POSITIVE_INFINITY;
double maxX = Double.NEGATIVE_INFINITY;
for (Zone zone : zones) {
if (zone.getX() < minX) {
minX = zone.getX();
}
if (zone.getX() + zone.getWidth() > maxX) {
maxX = zone.getX() + zone.getWidth();
}
}
double midLineXCoordinate = (minX + maxX) / 2;
List<Zone> leftOf = new ArrayList<>();
List<Zone> rightOf = new ArrayList<>();
List<Zone> middle = new ArrayList<>();
for (Zone zone : zones) {
if (zone.getX() < midLineXCoordinate && zone.getX() + zone.getWidth() < midLineXCoordinate) {
leftOf.add(zone);
} else if (zone.getX() > midLineXCoordinate && zone.getX() + zone.getWidth() > midLineXCoordinate) {
rightOf.add(zone);
} else {
middle.add(zone);
}
}
leftOf.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
rightOf.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
middle.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
List<Zone> leftNotIntersecting = new ArrayList<>();
for (Zone leftZone : leftOf) {
boolean intersects = false;
for (Zone rightZone : rightOf) {
if (leftZone.intersectsY(rightZone)) {
intersects = true;
break;
}
// early stopping
if (rightZone.getBBox().getMinY() > leftZone.getBBox().getMaxY()) {
break;
}
}
if (!intersects) {
leftNotIntersecting.add(leftZone);
}
}
List<Zone> rightNotIntersecting = new ArrayList<>();
for (Zone rightZone : rightOf) {
boolean intersects = false;
for (Zone leftZone : leftOf) {
if (rightZone.intersectsY(leftZone)) {
intersects = true;
break;
}
// early stopping
if (leftZone.getBBox().getMinY() > rightZone.getBBox().getMaxY()) {
break;
}
}
if (!intersects) {
rightNotIntersecting.add(rightZone);
}
}
leftOf.removeAll(leftNotIntersecting);
rightOf.removeAll(rightNotIntersecting);
middle.addAll(leftNotIntersecting);
middle.addAll(rightNotIntersecting);
List<Zone> sortedZones = new ArrayList<>();
sortedZones.addAll(leftOf);
sortedZones.addAll(rightOf);
ListIterator<Zone> itty = middle.listIterator();
while (itty.hasNext()) {
Zone current = itty.next();
for (int i = 0; i < sortedZones.size(); i++) {
if (current.getY() < sortedZones.get(i).getY()) {
sortedZones.add(i, current);
itty.remove();
break;
}
}
}
sortedZones.addAll(middle);
return sortedZones;
}
}

View File

@ -0,0 +1,56 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service;
import java.util.List;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.AngleFilter;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Character;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Histogram;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Neighbor;
@Service
public class SpacingService {
private static final double SPACING_HISTOGRAM_RESOLUTION = 0.5;
private static final double SPACING_HISTOGRAM_SMOOTHING_LENGTH = 2.5;
private static final double SPACING_HIST_SMOOTHING_STANDARD_DEVIATION = 0.5;
private static final double ANGLE_TOLERANCE = Math.PI / 6;
public double computeCharacterSpacing(List<Character> characters) {
return computeSpacing(characters, 0);
}
public double computeLineSpacing(List<Character> characters) {
return computeSpacing(characters, Math.PI / 2);
}
private double computeSpacing(List<Character> characters, double angle) {
double maxDistance = Double.NEGATIVE_INFINITY;
for (Character character : characters) {
for (Neighbor neighbor : character.getNeighbors()) {
maxDistance = Math.max(maxDistance, neighbor.getDistance());
}
}
Histogram histogram = new Histogram(0, maxDistance, SPACING_HISTOGRAM_RESOLUTION);
AngleFilter angleFilter = new AngleFilter(angle - ANGLE_TOLERANCE, angle + ANGLE_TOLERANCE);
for (Character character : characters) {
for (Neighbor neighbor : character.getNeighbors()) {
if (angleFilter.matches(neighbor)) {
histogram.add(neighbor.getDistance());
}
}
}
histogram.gaussianSmooth(SPACING_HISTOGRAM_SMOOTHING_LENGTH, SPACING_HIST_SMOOTHING_STANDARD_DEVIATION);
return histogram.getPeakValue();
}
}

View File

@ -0,0 +1,152 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Character;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Line;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.UnionFind;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Zone;
@Service
public class ZoneBuilderService {
private static final double MIN_HORIZONTAL_DISTANCE_MULTIPLIER = -0.5;
private static final double MAX_VERTICAL_DISTANCE_MULTIPLIER = 1.2;
private static final double MIN_HORIZONTAL_MERGE_DISTANCE_MULTIPLIER = -3.0;
private static final double MAX_VERTICAL_MERGE_DISTANCE_MULTIPLIER = 0.5;
private static final double MIN_LINE_SIZE_SCALE = 0.9;
private static final double MAX_LINE_SIZE_SCALE = 2.5;
private static final double ANGLE_TOLERANCE = Math.PI / 6;
private static final int MAX_ZONES = 300;
private static final double MAX_VERTICAL_MERGE_DISTANCE = 0.5;
public List<Zone> buildZones(List<Line> lines, double characterSpacing, double lineSpacing) {
double minHorizontalDistance = characterSpacing * MIN_HORIZONTAL_DISTANCE_MULTIPLIER;
double maxVerticalDistance = lineSpacing * MAX_VERTICAL_DISTANCE_MULTIPLIER;
double minHorizontalMergeDistance = characterSpacing * MIN_HORIZONTAL_MERGE_DISTANCE_MULTIPLIER;
double maxVerticalMergeDistance = lineSpacing * MAX_VERTICAL_MERGE_DISTANCE_MULTIPLIER;
UnionFind<Line> unionFind = new UnionFind<>(new HashSet<>(lines));
double meanHeight = calculateMeanHeight(lines);
lines.forEach(outerLine -> //
lines.forEach(innerLine -> {
double scale = Math.min(outerLine.getHeight(), innerLine.getHeight()) / meanHeight;
scale = Math.max(MIN_LINE_SIZE_SCALE, Math.min(scale, MAX_LINE_SIZE_SCALE));
if (!unionFind.inSameSet(outerLine, innerLine) && outerLine.angularDifference(innerLine) <= ANGLE_TOLERANCE) {
double horizontalDistance = outerLine.horizontalDistance(innerLine) / scale;
double verticalDistance = outerLine.verticalDistance(innerLine) / scale;
if (minHorizontalDistance <= horizontalDistance && verticalDistance <= maxVerticalDistance //
|| minHorizontalMergeDistance <= horizontalDistance && verticalDistance <= maxVerticalMergeDistance) {
unionFind.union(outerLine, innerLine);
}
}
}));
List<Zone> zones = new ArrayList<>();
unionFind.getGroups().forEach(group -> {
zones.add(new Zone(new ArrayList<>(group)));
});
if (zones.size() > MAX_ZONES) {
List<Line> oneZoneLines = new ArrayList<>();
for (Zone zone : zones) {
oneZoneLines.addAll(zone.getLines());
}
return List.of(mergeLinesInZone(oneZoneLines, characterSpacing, lineSpacing));
}
return zones;
}
private double calculateMeanHeight(List<Line> lines) {
double meanHeight = 0.0;
double weights = 0.0;
for (Line line : lines) {
double weight = line.getLength();
meanHeight += line.getHeight() * weight;
weights += weight;
}
meanHeight /= weights;
return meanHeight;
}
private Zone mergeLinesInZone(List<Line> lines, double characterSpacing, double lineSpacing) {
double maxHorizontalDistance = 0;
double minVerticalDistance = 0;
double maxVerticalDistance = lineSpacing * MAX_VERTICAL_MERGE_DISTANCE;
UnionFind<Line> unionFind = new UnionFind<>(new HashSet<>(lines));
lines.forEach(outer -> {
lines.forEach(inner -> {
if (inner != outer) {
double horizontalDistance = outer.horizontalDistance(inner);
double verticalDistance = outer.verticalDistance(inner);
if (horizontalDistance <= maxHorizontalDistance && minVerticalDistance <= verticalDistance && verticalDistance <= maxVerticalDistance) {
unionFind.union(outer, inner);
} else if (minVerticalDistance <= verticalDistance && verticalDistance <= maxVerticalDistance && Math.abs(horizontalDistance - Math.min(outer.getLength(),
inner.getLength())) < 0.1) {
boolean characterOverlap = false;
int overlappingCount = 0;
for (Character outerCharacter : outer.getCharacters()) {
for (Character innerCharacter : inner.getCharacters()) {
double characterOverlapDistance = outerCharacter.overlappingDistance(innerCharacter);
if (characterOverlapDistance > 2) {
characterOverlap = true;
}
if (characterOverlapDistance > 0) {
overlappingCount++;
}
}
}
if (!characterOverlap && overlappingCount <= 2) {
unionFind.union(outer, inner);
}
}
}
});
});
List<Line> outputZone = new ArrayList<>();
for (Set<Line> group : unionFind.getGroups()) {
List<Character> characters = new ArrayList<>();
for (Line line : group) {
characters.addAll(line.getCharacters());
}
characters.sort(Comparator.comparingDouble(Character::getX));
outputZone.add(new Line(characters, characterSpacing));
}
return new Zone(outputZone);
}
}

View File

@ -0,0 +1,15 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.utils;
public class DoubleUtils {
public static int compareDouble(double d1, double d2, double precision) {
if (Double.isNaN(d1) || Double.isNaN(d2)) {
return Double.compare(d1, d2);
}
long i1 = Math.round(d1 / (precision == 0 ? 1 : precision));
long i2 = Math.round(d2 / (precision == 0 ? 1 : precision));
return Long.compare(i1, i2);
}
}

View File

@ -13,8 +13,10 @@ import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Set;
import java.util.stream.Collectors;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationFooter;
@ -22,6 +24,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.DuplicatedParagraph;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Footer;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.GenericSemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Header;
@ -46,14 +49,14 @@ import lombok.experimental.UtilityClass;
@UtilityClass
public class DocumentGraphFactory {
public Document buildDocumentGraph(ClassificationDocument document) {
public Document buildDocumentGraph(LayoutParsingType layoutParsingType, ClassificationDocument document) {
Document documentGraph = new Document();
Context context = new Context(documentGraph);
document.getPages().forEach(context::buildAndAddPageWithCounter);
document.getSections().stream().flatMap(section -> section.getImages().stream()).forEach(image -> context.getImages().add(image));
addSections(document, context);
addSections(layoutParsingType, document, context);
addHeaderAndFooterToEachPage(document, context);
documentGraph.setNumberOfPages(context.pages.size());
@ -64,9 +67,9 @@ public class DocumentGraphFactory {
}
private void addSections(ClassificationDocument document, Context context) {
private void addSections(LayoutParsingType layoutParsingType, ClassificationDocument document, Context context) {
document.getSections().forEach(section -> SectionNodeFactory.addSection(null, section.getNonEmptyPageBlocks(), section.getImages(), context));
document.getSections().forEach(section -> SectionNodeFactory.addSection(layoutParsingType, null, section.getNonEmptyPageBlocks(), section.getImages(), context));
}
@ -77,6 +80,8 @@ public class DocumentGraphFactory {
GenericSemanticNode node;
if (originalTextBlock.isHeadline()) {
node = Headline.builder().documentTree(context.getDocumentTree()).build();
} else if (originalTextBlock.isToDuplicate()) {
node = DuplicatedParagraph.builder().documentTree(context.getDocumentTree()).build();
} else {
node = Paragraph.builder().documentTree(context.getDocumentTree()).build();
}
@ -86,7 +91,16 @@ public class DocumentGraphFactory {
List<TextPageBlock> textBlocks = new ArrayList<>();
textBlocks.add(originalTextBlock);
textBlocks.addAll(textBlocksToMerge);
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(textBlocks), node, context, page);
if (node instanceof DuplicatedParagraph duplicatedParagraph) {
AtomicTextBlock unsortedTextBlock = context.textBlockFactory.buildAtomicTextBlock(textBlocks.stream()
.flatMap(tb -> tb.getSequences().stream())
.collect(Collectors.toList()), node, context, page);
duplicatedParagraph.setUnsortedLeafTextBlock(unsortedTextBlock);
}
List<Integer> treeId = context.documentTree.createNewChildEntryAndReturnId(parentNode, node);
node.setLeafTextBlock(textBlock);
node.setTreeId(treeId);

View File

@ -4,19 +4,21 @@ import static java.lang.String.format;
import static java.util.Collections.emptyList;
import static java.util.stream.Collectors.groupingBy;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.GenericSemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.utils.TableMergingUtility;
import lombok.experimental.UtilityClass;
@ -24,7 +26,11 @@ import lombok.experimental.UtilityClass;
@UtilityClass
public class SectionNodeFactory {
public void addSection(GenericSemanticNode parentNode, List<AbstractPageBlock> pageBlocks, List<ClassifiedImage> images, DocumentGraphFactory.Context context) {
public void addSection(LayoutParsingType layoutParsingType,
GenericSemanticNode parentNode,
List<AbstractPageBlock> pageBlocks,
List<ClassifiedImage> images,
DocumentGraphFactory.Context context) {
if (pageBlocks.isEmpty()) {
return;
@ -37,11 +43,11 @@ public class SectionNodeFactory {
section.setTreeId(getTreeId(parentNode, context, section));
addFirstHeadlineDirectlyToSection(pageBlocks, context, section);
addFirstHeadlineDirectlyToSection(layoutParsingType, pageBlocks, context, section);
if (containsTablesAndTextBlocks(pageBlocks)) {
splitPageBlocksIntoSubSections(pageBlocks).forEach(subSectionPageBlocks -> addSection(section, subSectionPageBlocks, emptyList(), context));
splitPageBlocksIntoSubSections(pageBlocks).forEach(subSectionPageBlocks -> addSection(layoutParsingType, section, subSectionPageBlocks, emptyList(), context));
} else {
addTablesAndParagraphsAndHeadlinesToSection(pageBlocks, context, section);
addTablesAndParagraphsAndHeadlinesToSection(layoutParsingType, pageBlocks, context, section);
}
images.stream().distinct().forEach(image -> DocumentGraphFactory.addImage(section, image, context));
@ -58,16 +64,19 @@ public class SectionNodeFactory {
}
private void addFirstHeadlineDirectlyToSection(List<AbstractPageBlock> pageBlocks, DocumentGraphFactory.Context context, Section section) {
private void addFirstHeadlineDirectlyToSection(LayoutParsingType layoutParsingType, List<AbstractPageBlock> pageBlocks, DocumentGraphFactory.Context context, Section section) {
if (pageBlocks.get(0).isHeadline()) {
addTablesAndParagraphsAndHeadlinesToSection(List.of(pageBlocks.get(0)), context, section);
addTablesAndParagraphsAndHeadlinesToSection(layoutParsingType, List.of(pageBlocks.get(0)), context, section);
pageBlocks.remove(0);
}
}
private void addTablesAndParagraphsAndHeadlinesToSection(List<AbstractPageBlock> pageBlocks, DocumentGraphFactory.Context context, Section section) {
private void addTablesAndParagraphsAndHeadlinesToSection(LayoutParsingType layoutParsingType,
List<AbstractPageBlock> pageBlocks,
DocumentGraphFactory.Context context,
Section section) {
Set<AbstractPageBlock> alreadyMerged = new HashSet<>();
List<AbstractPageBlock> remainingBlocks = new LinkedList<>(pageBlocks);
@ -80,13 +89,23 @@ public class SectionNodeFactory {
remainingBlocks.removeAll(alreadyMerged);
if (abstractPageBlock instanceof TextPageBlock) {
List<TextPageBlock> textBlocks = findTextBlocksWithSameClassificationAndAlignsY((TextPageBlock) abstractPageBlock, remainingBlocks);
alreadyMerged.addAll(textBlocks);
DocumentGraphFactory.addParagraphOrHeadline(section, (TextPageBlock) abstractPageBlock, context, textBlocks);
switch (layoutParsingType) {
case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> {
alreadyMerged.add(abstractPageBlock);
remainingBlocks.remove(abstractPageBlock);
DocumentGraphFactory.addParagraphOrHeadline(section, (TextPageBlock) abstractPageBlock, context, new ArrayList<>());
}
default -> {
List<TextPageBlock> textBlocks = findTextBlocksWithSameClassificationAndAlignsY((TextPageBlock) abstractPageBlock, remainingBlocks);
alreadyMerged.addAll(textBlocks);
DocumentGraphFactory.addParagraphOrHeadline(section, (TextPageBlock) abstractPageBlock, context, textBlocks);
}
}
} else if (abstractPageBlock instanceof TablePageBlock tablePageBlock) {
List<TablePageBlock> tablesToMerge = TableMergingUtility.findConsecutiveTablesWithSameColCountAndSameHeaders(tablePageBlock, remainingBlocks);
alreadyMerged.addAll(tablesToMerge);
TableNodeFactory.addTable(section, tablesToMerge, context);
TableNodeFactory.addTable(layoutParsingType, section, tablesToMerge, context);
} else {
throw new RuntimeException(format("Unhandled AbstractPageBlockType %s!", abstractPageBlock.getClass()));
}
@ -171,6 +190,7 @@ public class SectionNodeFactory {
.filter(abstractTextContainer -> abstractTextContainer.intersectsY(atc))
.map(abstractTextContainer -> (TextPageBlock) abstractTextContainer)
.filter(abstractTextContainer -> abstractTextContainer.getDir() == atc.getDir())
.filter(abstractTextContainer -> !abstractTextContainer.isToDuplicate())
.toList();
}

View File

@ -7,16 +7,17 @@ import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.GenericSemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionOperations;
import lombok.experimental.UtilityClass;
@ -27,7 +28,7 @@ public class TableNodeFactory {
public final double TABLE_CELL_MERGE_CONTENTS_SIZE_THRESHOLD = 0.05;
public void addTable(GenericSemanticNode parentNode, List<TablePageBlock> tablesToMerge, DocumentGraphFactory.Context context) {
public void addTable(LayoutParsingType layoutParsingType, GenericSemanticNode parentNode, List<TablePageBlock> tablesToMerge, DocumentGraphFactory.Context context) {
setPageNumberInCells(tablesToMerge);
Set<Page> pages = tablesToMerge.stream().map(AbstractPageBlock::getPage).map(context::getPage).collect(Collectors.toSet());
@ -43,7 +44,7 @@ public class TableNodeFactory {
List<Integer> treeId = context.getDocumentTree().createNewChildEntryAndReturnId(parentNode, table);
table.setTreeId(treeId);
addTableCells(mergedRows, table, context);
addTableCells(layoutParsingType, mergedRows, table, context);
ifTableHasNoHeadersSetFirstRowAsHeaders(table);
}
@ -88,18 +89,18 @@ public class TableNodeFactory {
}
private void addTableCells(List<List<Cell>> rows, Table table, DocumentGraphFactory.Context context) {
private void addTableCells(LayoutParsingType layoutParsingType, List<List<Cell>> rows, Table table, DocumentGraphFactory.Context context) {
for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) {
for (int colIndex = 0; colIndex < rows.get(rowIndex).size(); colIndex++) {
addTableCell(rows.get(rowIndex).get(colIndex), rowIndex, colIndex, table, context);
addTableCell(layoutParsingType, rows.get(rowIndex).get(colIndex), rowIndex, colIndex, table, context);
}
}
}
@SuppressWarnings("PMD.UnusedPrivateMethod") // PMD actually flags this wrong
private void addTableCell(Cell cell, int rowIndex, int colIndex, Table tableNode, DocumentGraphFactory.Context context) {
private void addTableCell(LayoutParsingType layoutParsingType, Cell cell, int rowIndex, int colIndex, Table tableNode, DocumentGraphFactory.Context context) {
Page page = context.getPage(cell.getPageNumber());
@ -116,7 +117,7 @@ public class TableNodeFactory {
textBlock = context.getTextBlockFactory().buildAtomicTextBlock(cell.getTextBlocks().get(0).getSequences(), tableCell, context, page);
tableCell.setLeafTextBlock(textBlock);
} else if (firstTextBlockIsHeadline(cell)) {
SectionNodeFactory.addSection(tableCell, cell.getTextBlocks().stream().map(tb -> (AbstractPageBlock) tb).toList(), emptyList(), context);
SectionNodeFactory.addSection(layoutParsingType, tableCell, cell.getTextBlocks().stream().map(tb -> (AbstractPageBlock) tb).toList(), emptyList(), context);
} else if (cellAreaIsSmallerThanPageAreaTimesThreshold(cell, page)) {
List<TextPositionSequence> sequences = TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(cell.getTextBlocks());
textBlock = context.getTextBlockFactory().buildAtomicTextBlock(sequences, tableCell, context, page);

View File

@ -8,8 +8,6 @@ import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import javax.xml.parsers.DocumentBuilder;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPage;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData;
@ -18,6 +16,7 @@ import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.Do
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.DuplicatedParagraph;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Image;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
@ -33,27 +32,20 @@ public class DocumentDataMapper {
public DocumentData toDocumentData(Document document) {
List<DocumentTextData> documentTextData = document.streamTerminalTextBlocksInOrder()
.flatMap(textBlock -> textBlock.getAtomicTextBlocks()
.stream())
.flatMap(textBlock -> textBlock.getAtomicTextBlocks().stream())
.distinct()
.map(DocumentDataMapper::toAtomicTextBlockData)
.toList();
List<DocumentPositionData> atomicPositionBlockData = document.streamTerminalTextBlocksInOrder()
.flatMap(textBlock -> textBlock.getAtomicTextBlocks()
.stream())
.flatMap(textBlock -> textBlock.getAtomicTextBlocks().stream())
.distinct()
.map(DocumentDataMapper::toAtomicPositionBlockData)
.toList();
Set<Long> nonEmptyTextBlocks = documentTextData.stream()
.mapToLong(DocumentTextData::getId).boxed()
.collect(Collectors.toSet());
Set<Long> nonEmptyTextBlocks = documentTextData.stream().mapToLong(DocumentTextData::getId).boxed().collect(Collectors.toSet());
List<DocumentPage> documentPageData = document.getPages()
.stream()
.map(DocumentDataMapper::toPageData)
.toList();
List<DocumentPage> documentPageData = document.getPages().stream().map(DocumentDataMapper::toPageData).toList();
DocumentStructure tableOfContentsData = toDocumentTreeData(document.getDocumentTree());
return DocumentData.builder()
.documentTextData(documentTextData.toArray(new DocumentTextData[0]))
@ -84,22 +76,17 @@ public class DocumentDataMapper {
case TABLE -> PropertiesMapper.buildTableProperties((Table) entry.getNode());
case TABLE_CELL -> PropertiesMapper.buildTableCellProperties((TableCell) entry.getNode());
case IMAGE -> PropertiesMapper.buildImageProperties((Image) entry.getNode());
case PARAGRAPH ->
entry.getNode() instanceof DuplicatedParagraph duplicatedParagraph ? PropertiesMapper.buildDuplicateParagraphProperties(duplicatedParagraph) : new HashMap<>();
default -> new HashMap<>();
};
DocumentStructure.EntryData.EntryDataBuilder documentBuilder = DocumentStructure.EntryData.builder()
.treeId(toPrimitiveIntArray(entry.getTreeId()))
.children(entry.getChildren()
.stream()
.map(DocumentDataMapper::toEntryData)
.toList())
.children(entry.getChildren().stream().map(DocumentDataMapper::toEntryData).toList())
.type(entry.getType())
.atomicBlockIds(atomicTextBlocks)
.pageNumbers(entry.getNode().getPages()
.stream()
.map(Page::getNumber)
.map(Integer::longValue)
.toArray(Long[]::new))
.pageNumbers(entry.getNode().getPages().stream().map(Page::getNumber).map(Integer::longValue).toArray(Long[]::new))
.properties(properties);
if (entry.getNode() != null) {
documentBuilder.engines(entry.getNode().getEngines());
@ -112,10 +99,7 @@ public class DocumentDataMapper {
private Long[] toAtomicTextBlockIds(TextBlock textBlock) {
return textBlock.getAtomicTextBlocks()
.stream()
.map(AtomicTextBlock::getId)
.toArray(Long[]::new);
return textBlock.getAtomicTextBlocks().stream().map(AtomicTextBlock::getId).toArray(Long[]::new);
}
@ -167,9 +151,7 @@ public class DocumentDataMapper {
private int[] toPrimitiveIntArray(List<Integer> list) {
return list.stream()
.mapToInt(Integer::intValue)
.toArray();
return list.stream().mapToInt(Integer::intValue).toArray();
}
}

View File

@ -7,13 +7,14 @@ import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPage;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.DuplicatedParagraph;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Footer;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Header;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Headline;
@ -61,7 +62,7 @@ public class DocumentGraphMapper {
SemanticNode node = switch (entryData.getType()) {
case SECTION -> buildSection(context);
case PARAGRAPH -> buildParagraph(context);
case PARAGRAPH -> buildParagraph(context, entryData.getProperties());
case HEADLINE -> buildHeadline(context);
case HEADER -> buildHeader(context);
case FOOTER -> buildFooter(context);
@ -140,7 +141,17 @@ public class DocumentGraphMapper {
}
private Paragraph buildParagraph(Context context) {
private Paragraph buildParagraph(Context context, Map<String, String> properties) {
if (PropertiesMapper.isDuplicateParagraph(properties)) {
DuplicatedParagraph duplicatedParagraph = DuplicatedParagraph.builder().documentTree(context.documentTree).build();
Long[] unsortedTextblockIds = PropertiesMapper.getUnsortedTextblockIds(properties);
duplicatedParagraph.setUnsortedLeafTextBlock(toTextBlock(unsortedTextblockIds, context, duplicatedParagraph));
return duplicatedParagraph;
}
return Paragraph.builder().documentTree(context.documentTree).build();
}

View File

@ -1,17 +1,19 @@
package com.knecon.fforesight.service.layoutparser.processor.services.mapper;
import java.awt.geom.Rectangle2D;
import java.util.Collections;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.DuplicatedParagraph;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Image;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
public class PropertiesMapper {
@ -76,6 +78,32 @@ public class PropertiesMapper {
}
public static Map<String, String> buildDuplicateParagraphProperties(DuplicatedParagraph duplicatedParagraph) {
Map<String, String> properties = new HashMap<>();
properties.put(DocumentStructure.DuplicateParagraphProperties.UNSORTED_TEXTBLOCK_ID, Arrays.toString(toAtomicTextBlockIds(duplicatedParagraph.getUnsortedLeafTextBlock())));
return properties;
}
public static boolean isDuplicateParagraph(Map<String, String> properties) {
return properties.containsKey(DocumentStructure.DuplicateParagraphProperties.UNSORTED_TEXTBLOCK_ID);
}
public static Long[] getUnsortedTextblockIds(Map<String, String> properties) {
return toLongArray(properties.get(DocumentStructure.DuplicateParagraphProperties.UNSORTED_TEXTBLOCK_ID));
}
public static Long[] toLongArray(String ids) {
return Arrays.stream(ids.substring(1, ids.length() - 1).trim().split(",")).map(Long::valueOf).toArray(Long[]::new);
}
private static ImageType parseImageType(String imageType) {
return switch (imageType) {
@ -101,4 +129,10 @@ public class PropertiesMapper {
rectangle2D.getHeight());
}
private static Long[] toAtomicTextBlockIds(TextBlock textBlock) {
return textBlock.getAtomicTextBlocks().stream().map(AtomicTextBlock::getId).toArray(Long[]::new);
}
}

View File

@ -237,8 +237,13 @@ public class PDFLinesTextStripper extends PDFTextStripper {
int startIndex = 0;
RedTextPosition previous = null;
float direction = -1;
for (int i = 0; i <= textPositions.size() - 1; i++) {
if (direction == -1) {
direction = textPositions.get(i).getDir();
}
if (!textPositionSequences.isEmpty()) {
previous = textPositionSequences.get(textPositionSequences.size() - 1)
.getTextPositions()
@ -250,6 +255,13 @@ public class PDFLinesTextStripper extends PDFTextStripper {
continue;
}
if (textPositions.get(i).getDir() != direction && startIndex != i) {
List<TextPosition> sublist = textPositions.subList(startIndex, i);
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart));
startIndex = i;
direction = textPositions.get(i).getDir();
}
// Strange but sometimes this is happening, for example: Metolachlor2.pdf
if (checkIfCurrentPositionIsToTheRightOfPreviousPosition(i, textPositions)) {
List<TextPosition> sublist = textPositions.subList(startIndex, i);
@ -329,6 +341,7 @@ public class PDFLinesTextStripper extends PDFTextStripper {
.getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()) < maximumGapSize;
}
@Override
public String getText(PDDocument doc) throws IOException {

View File

@ -20,6 +20,7 @@ import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.DuplicatedParagraph;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
@ -53,6 +54,8 @@ public class LayoutGridService {
static Color INNER_LINES_COLOR = new Color(255, 175, 175);
static Color PARAGRAPH_COLOR = new Color(70, 130, 180);
static Color DUPLICATE_PARAGRAPH_COLOR = new Color(70, 180, 101);
static Color TABLE_COLOR = new Color(102, 205, 170);
static Color SECTION_COLOR = new Color(50, 50, 50);
static Color HEADLINE_COLOR = new Color(162, 56, 56);
@ -100,6 +103,11 @@ public class LayoutGridService {
case IMAGE -> IMAGE_COLOR;
default -> null;
};
if (semanticNode instanceof DuplicatedParagraph) {
color = DUPLICATE_PARAGRAPH_COLOR;
}
if (isNotSectionOrTableCellOrDocument(semanticNode)) {
addAsRectangle(semanticNode, layoutGrid, color);
}

View File

@ -3,7 +3,6 @@ package com.knecon.fforesight.service.layoutparser.processor.utils;
import java.util.List;
import java.util.stream.Collectors;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;

View File

@ -37,7 +37,7 @@ public class MessageHandler {
LayoutParsingRequest layoutParsingRequest = objectMapper.readValue(message.getBody(), LayoutParsingRequest.class);
if (layoutParsingRequest.layoutParsingType().equals(LayoutParsingType.TAAS) && layoutParsingRequest.researchDocumentStorageId() == null) {
if (layoutParsingRequest.layoutParsingType().equals(LayoutParsingType.CLARIFYND) && layoutParsingRequest.researchDocumentStorageId() == null) {
throw new IllegalArgumentException("ResearchDocumentDataStorageId is null!");
}
log.info("Layout parsing request received {}", layoutParsingRequest.identifier());

View File

@ -48,12 +48,13 @@ public class BdrJsonBuildTest extends AbstractTest {
@SneakyThrows
protected Document buildGraph(File file) {
return DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.TAAS,
file,
new ImageServiceResponse(),
new TableServiceResponse(),
new VisualLayoutParsingResponse(),
file.toString()));
return DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.CLARIFYND,
layoutParsingPipeline.parseLayout(LayoutParsingType.CLARIFYND,
file,
new ImageServiceResponse(),
new TableServiceResponse(),
new VisualLayoutParsingResponse(),
file.toString()));
}

View File

@ -95,12 +95,13 @@ public class HeadlinesGoldStandardIntegrationTest {
goldStandardLog.getRedactionLogEntry().removeIf(r -> !r.isRedacted() || r.getChanges().get(r.getChanges().size() - 1).getType().equals(ChangeType.REMOVED));
goldStandardLog.getRedactionLogEntry().forEach(e -> goldStandardHeadlines.add(new Headline(e.getPositions().get(0).getPage(), e.getValue())));
Document documentGraph = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
pdfFileResource.getFile(),
new ImageServiceResponse(),
new TableServiceResponse(),
new VisualLayoutParsingResponse(),
filePath));
Document documentGraph = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_OLD,
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_OLD,
pdfFileResource.getFile(),
new ImageServiceResponse(),
new TableServiceResponse(),
new VisualLayoutParsingResponse(),
filePath));
var foundHeadlines = documentGraph.streamAllSubNodes()
.map(SemanticNode::getHeadline)

View File

@ -26,7 +26,7 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
public void testLayoutParserEndToEnd() {
prepareStorage("files/bdr/Wie weiter bei Kristeneinrichtungen.pdf");
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER);
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER_OLD);
LayoutParsingFinishedEvent finishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest);
Arrays.stream(finishedEvent.message().split("\n")).forEach(log::info);
}

View File

@ -55,12 +55,13 @@ public class DocumentGraphJsonWritingTest extends BuildDocumentTest {
@SneakyThrows
private void writeJsons(Path filename) {
Document documentGraph = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
filename.toFile(),
new ImageServiceResponse(),
new TableServiceResponse(),
new VisualLayoutParsingResponse(),
filename.toFile().toString()));
Document documentGraph = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_OLD,
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_OLD,
filename.toFile(),
new ImageServiceResponse(),
new TableServiceResponse(),
new VisualLayoutParsingResponse(),
filename.toFile().toString()));
DocumentData documentData = DocumentDataMapper.toDocumentData(documentGraph);
ObjectMapper mapper = ObjectMapperFactory.create();

View File

@ -26,7 +26,7 @@ public class ViewerDocumentTest extends BuildDocumentTest {
@SneakyThrows
public void testViewerDocument() {
String fileName = "files/SinglePages/T5 VV-640252-Page16.pdf";
String fileName = "files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
var documentFile = new ClassPathResource(fileName).getFile();
@ -54,13 +54,14 @@ public class ViewerDocumentTest extends BuildDocumentTest {
var documentFile = new ClassPathResource(fileName).getFile();
var classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE,
documentFile,
new ImageServiceResponse(),
tableResponse,
new VisualLayoutParsingResponse(),Path.of(fileName).getFileName().toFile().toString());
documentFile,
new ImageServiceResponse(),
tableResponse,
new VisualLayoutParsingResponse(),
Path.of(fileName).getFileName().toFile().toString());
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null);
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
Document document = DocumentGraphFactory.buildDocumentGraph(classificationDocument);
Document document = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.DOCUMINE, classificationDocument);
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true);
}

View File

@ -56,12 +56,12 @@ public class PdfSegmentationServiceTest extends AbstractTest {
@SneakyThrows
public ClassificationDocument buildClassificationDocument(File originDocument, TableServiceResponse tableServiceResponse) {
ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
originDocument,
new ImageServiceResponse(),
tableServiceResponse,
new VisualLayoutParsingResponse(),
"document");
ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_OLD,
originDocument,
new ImageServiceResponse(),
tableServiceResponse,
new VisualLayoutParsingResponse(),
"document");
redactManagerClassificationService.classifyDocument(classificationDocument);
@ -112,16 +112,8 @@ public class PdfSegmentationServiceTest extends AbstractTest {
var tableServiceResponse = objectMapper.readValue(cvTablesResource.getInputStream(), TableServiceResponse.class);
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile(), tableServiceResponse);
assertThat(document.getSections()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.collect(Collectors.toList())).isNotEmpty();
var tables = document.getSections()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.toList();
assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
var tables = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList();
// Quality of the table parsing is not good, because the file is rotated at scanning.
// We only asset that the table border is not the page border.
@ -143,12 +135,12 @@ public class PdfSegmentationServiceTest extends AbstractTest {
imageServiceResponse.getData()
.forEach(imageMetadata -> images.computeIfAbsent(imageMetadata.getPosition().getPageNumber(), x -> new ArrayList<>())
.add(new ClassifiedImage(new Rectangle2D.Double(imageMetadata.getPosition().getX1(),
imageMetadata.getPosition().getY1(),
imageMetadata.getGeometry().getWidth(),
imageMetadata.getGeometry().getHeight()),
ImageType.valueOf(imageMetadata.getClassification().getLabel().toUpperCase(Locale.ROOT)),
imageMetadata.isAlpha(),
imageMetadata.getPosition().getPageNumber())));
imageMetadata.getPosition().getY1(),
imageMetadata.getGeometry().getWidth(),
imageMetadata.getGeometry().getHeight()),
ImageType.valueOf(imageMetadata.getClassification().getLabel().toUpperCase(Locale.ROOT)),
imageMetadata.isAlpha(),
imageMetadata.getPosition().getPageNumber())));
System.out.println("object");
}
@ -160,22 +152,11 @@ public class PdfSegmentationServiceTest extends AbstractTest {
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Spanning Cells.pdf");
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
assertThat(document.getSections()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.collect(Collectors.toList())).isNotEmpty();
TablePageBlock table = document.getSections()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.toList()
.get(0);
assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
TablePageBlock table = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(0);
assertThat(table.getColCount()).isEqualTo(6);
assertThat(table.getRowCount()).isEqualTo(13);
assertThat(table.getRows()
.stream()
.mapToInt(List::size).sum()).isEqualTo(6 * 13);
assertThat(table.getRows().stream().mapToInt(List::size).sum()).isEqualTo(6 * 13);
}
@ -185,37 +166,15 @@ public class PdfSegmentationServiceTest extends AbstractTest {
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Merge Table.pdf");
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
assertThat(document.getSections()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.collect(Collectors.toList())).isNotEmpty();
TablePageBlock firstTable = document.getSections()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.toList()
.get(0);
assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
TablePageBlock firstTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(0);
assertThat(firstTable.getColCount()).isEqualTo(8);
assertThat(firstTable.getRowCount()).isEqualTo(1);
TablePageBlock secondTable = document.getSections()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.toList()
.get(1);
TablePageBlock secondTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(1);
assertThat(secondTable.getColCount()).isEqualTo(8);
assertThat(secondTable.getRowCount()).isEqualTo(2);
List<List<Cell>> firstTableHeaderCells = firstTable.getRows()
.get(0)
.stream()
.map(Collections::singletonList)
.collect(Collectors.toList());
assertThat(secondTable.getRows()
.stream()
.allMatch(row -> row.stream()
.map(Cell::getHeaderCells)
.toList().equals(firstTableHeaderCells))).isTrue();
List<List<Cell>> firstTableHeaderCells = firstTable.getRows().get(0).stream().map(Collections::singletonList).collect(Collectors.toList());
assertThat(secondTable.getRows().stream().allMatch(row -> row.stream().map(Cell::getHeaderCells).toList().equals(firstTableHeaderCells))).isTrue();
}
@ -225,37 +184,15 @@ public class PdfSegmentationServiceTest extends AbstractTest {
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Merge Multi Page Table.pdf");
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
assertThat(document.getSections()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.collect(Collectors.toList())).isNotEmpty();
TablePageBlock firstTable = document.getSections()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.toList()
.get(0);
assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
TablePageBlock firstTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(0);
assertThat(firstTable.getColCount()).isEqualTo(9);
assertThat(firstTable.getRowCount()).isEqualTo(5);
TablePageBlock secondTable = document.getSections()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.toList()
.get(1);
TablePageBlock secondTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(1);
assertThat(secondTable.getColCount()).isEqualTo(9);
assertThat(secondTable.getRowCount()).isEqualTo(6);
List<List<Cell>> firstTableHeaderCells = firstTable.getRows()
.get(firstTable.getRowCount() - 1)
.stream()
.map(Cell::getHeaderCells)
.collect(Collectors.toList());
assertThat(secondTable.getRows()
.stream()
.allMatch(row -> row.stream()
.map(Cell::getHeaderCells)
.toList().equals(firstTableHeaderCells))).isTrue();
List<List<Cell>> firstTableHeaderCells = firstTable.getRows().get(firstTable.getRowCount() - 1).stream().map(Cell::getHeaderCells).collect(Collectors.toList());
assertThat(secondTable.getRows().stream().allMatch(row -> row.stream().map(Cell::getHeaderCells).toList().equals(firstTableHeaderCells))).isTrue();
}
@ -265,37 +202,15 @@ public class PdfSegmentationServiceTest extends AbstractTest {
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Rotated Table Headers.pdf");
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
assertThat(document.getSections()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.collect(Collectors.toList())).isNotEmpty();
TablePageBlock firstTable = document.getSections()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.toList()
.get(0);
assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
TablePageBlock firstTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(0);
assertThat(firstTable.getColCount()).isEqualTo(8);
assertThat(firstTable.getRowCount()).isEqualTo(1);
TablePageBlock secondTable = document.getSections()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.toList()
.get(1);
TablePageBlock secondTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(1);
assertThat(secondTable.getColCount()).isEqualTo(8);
assertThat(secondTable.getRowCount()).isEqualTo(6);
List<List<Cell>> firstTableHeaderCells = firstTable.getRows()
.get(0)
.stream()
.map(Collections::singletonList)
.collect(Collectors.toList());
assertThat(secondTable.getRows()
.stream()
.allMatch(row -> row.stream()
.map(Cell::getHeaderCells)
.toList().equals(firstTableHeaderCells))).isTrue();
List<List<Cell>> firstTableHeaderCells = firstTable.getRows().get(0).stream().map(Collections::singletonList).collect(Collectors.toList());
assertThat(secondTable.getRows().stream().allMatch(row -> row.stream().map(Cell::getHeaderCells).toList().equals(firstTableHeaderCells))).isTrue();
}
@ -345,30 +260,29 @@ public class PdfSegmentationServiceTest extends AbstractTest {
validateTable(document, 0, 8, 8, 0, 0);
List<List<String>> values = Arrays.asList(Arrays.asList("Annex point Reference within DAR/RAR",
"Author, date",
"Study title",
"Analytical method Author, date, No.",
"Technique, LOQ of the method, validated working range",
"Method meets analytical validation criteria",
"Remarks (in case validation criteria are not met)",
"Acceptability of the method"),
Arrays.asList(
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies"),
Arrays.asList("CA 7.1.2.1.1 DAR (2009)",
"Evans P.G. 2001 TMJ4569B, VV-323245",
"Azoxystrobin Laboratory Degradation Study in Three Soil Types, Sampled from Holland and the United Kingdom",
"Method: RAM 269 Johnson R.I., Tummon O.J., Earl M. 1995 RJ1864B, VV-377731 Johnson R.I., Tummon O.J., Earl M. 1998 RAM 269/02, VV-124072 Johnson R.I., Tummon O.J., Earl M. 2000 RAM 269/03, VV-123986 Validation: Robinson N.J. 2001 TMJ4617B, VV-895845",
"LC-MS/MS LOQ: 0.01 mg/kg (R401553 (SYN50165 7), R402173 (SYN501114 )) or 0.02mg/kg (azoxystrobin, R230310, R234886) Working range: 0.02-1.0 or 0.01-0.5 mg/kg (depending on analyte) Other supporting quantificati on methods: HPLC-UV GC-MSD",
"Y",
"N/A",
"Y"));
"Author, date",
"Study title",
"Analytical method Author, date, No.",
"Technique, LOQ of the method, validated working range",
"Method meets analytical validation criteria",
"Remarks (in case validation criteria are not met)",
"Acceptability of the method"),
Arrays.asList("Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies"),
Arrays.asList("CA 7.1.2.1.1 DAR (2009)",
"Evans P.G. 2001 TMJ4569B, VV-323245",
"Azoxystrobin Laboratory Degradation Study in Three Soil Types, Sampled from Holland and the United Kingdom",
"Method: RAM 269 Johnson R.I., Tummon O.J., Earl M. 1995 RJ1864B, VV-377731 Johnson R.I., Tummon O.J., Earl M. 1998 RAM 269/02, VV-124072 Johnson R.I., Tummon O.J., Earl M. 2000 RAM 269/03, VV-123986 Validation: Robinson N.J. 2001 TMJ4617B, VV-895845",
"LC-MS/MS LOQ: 0.01 mg/kg (R401553 (SYN50165 7), R402173 (SYN501114 )) or 0.02mg/kg (azoxystrobin, R230310, R234886) Working range: 0.02-1.0 or 0.01-0.5 mg/kg (depending on analyte) Other supporting quantificati on methods: HPLC-UV GC-MSD",
"Y",
"N/A",
"Y"));
validateTable(document, 0, values);
@ -757,11 +671,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
@SneakyThrows
private void toHtml(ClassificationDocument document, String filename) {
var tables = document.getSections()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.toList();
var tables = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList();
StringBuilder sb = new StringBuilder();
int currentPage = 1;
@ -782,19 +692,9 @@ public class PdfSegmentationServiceTest extends AbstractTest {
private void validateTable(ClassificationDocument document, int tableIndex, int colCount, int rowCount, int emptyCellsCountCorrect, int emptyCellsCountIncorrect) {
TablePageBlock table = document.getSections()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.toList()
.get(tableIndex);
TablePageBlock table = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(tableIndex);
List<List<Cell>> rows = table.getRows();
int emptyCellsFoundFound = rows.stream()
.flatMap(List::stream)
.toList()
.stream()
.filter(f -> f.toString().isEmpty())
.toList().size();
int emptyCellsFoundFound = rows.stream().flatMap(List::stream).toList().stream().filter(f -> f.toString().isEmpty()).toList().size();
for (List<Cell> row : table.getRows()) {
row.forEach(r -> System.out.println(r.toString()));
@ -809,20 +709,11 @@ public class PdfSegmentationServiceTest extends AbstractTest {
private void validateTable(ClassificationDocument document, int tableIndex, List<List<String>> values) {
TablePageBlock table = document.getSections()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.toList()
.get(tableIndex);
TablePageBlock table = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(tableIndex);
List<List<Cell>> rows = table.getRows();
List<Cell> rowsFlattened = rows.stream()
.flatMap(List::stream)
.toList();
List<String> valuesFlattened = values.stream()
.flatMap(List::stream)
.toList();
List<Cell> rowsFlattened = rows.stream().flatMap(List::stream).toList();
List<String> valuesFlattened = values.stream().flatMap(List::stream).toList();
for (int i = 0; i < valuesFlattened.size(); i++) {
Cell cell = rowsFlattened.get(i);
@ -835,11 +726,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
private void validateTableSize(ClassificationDocument document, int tableSize) {
assertThat(document.getSections()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.toList().size()).isEqualTo(tableSize);
assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().size()).isEqualTo(tableSize);
}

View File

@ -21,7 +21,7 @@ class BodyTextFrameServiceTest extends BuildDocumentTest {
String filename = "files/211.pdf";
String outputFilename = "/tmp/" + Path.of(filename).getFileName() + "_MAINBODY.pdf";
ClassificationDocument document = parseLayout(filename, LayoutParsingType.TAAS);
ClassificationDocument document = parseLayout(filename, LayoutParsingType.CLARIFYND);
PdfDraw.drawRectanglesPerPage(filename,
document.getPages().stream().map(page -> List.of(RectangleTransformations.toRectangle2D(page.getBodyTextFrame()))).toList(),
outputFilename);

View File

@ -74,7 +74,7 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
cleanRulingsPerPage.add(rulingCleaningService.getCleanRulings(Collections.emptyList(), pageContent.getRulings()));
}
var cleanRulings = cleanRulingsPerPage.stream().map(CleanRulings::getVertical).collect(Collectors.toList());
PdfDraw.drawLinesPerPage(fileName, cleanRulings, lineFileName);
PdfDraw.drawLinesPerPage(fileName, cleanRulings, lineFileName);
}
@ -99,18 +99,20 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
@SneakyThrows
private void writeJsons(Path filename) {
Document documentGraphBefore = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
filename.toFile(),
new ImageServiceResponse(),
new TableServiceResponse(),
new VisualLayoutParsingResponse(),
filename.toFile().toString()));
Document documentGraphAfter = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
filename.toFile(),
new ImageServiceResponse(),
new TableServiceResponse(),
new VisualLayoutParsingResponse(),
filename.toFile().toString()));
Document documentGraphBefore = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_OLD,
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_OLD,
filename.toFile(),
new ImageServiceResponse(),
new TableServiceResponse(),
new VisualLayoutParsingResponse(),
filename.toFile().toString()));
Document documentGraphAfter = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_OLD,
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_OLD,
filename.toFile(),
new ImageServiceResponse(),
new TableServiceResponse(),
new VisualLayoutParsingResponse(),
filename.toFile().toString()));
DocumentData documentDataBefore = DocumentDataMapper.toDocumentData(documentGraphBefore);
DocumentData documentDataAfter = DocumentDataMapper.toDocumentData(documentGraphAfter);
if (!compareStructures(documentDataBefore.getDocumentStructure(), documentDataAfter.getDocumentStructure())) {

View File

@ -20,7 +20,6 @@ import org.springframework.context.annotation.Import;
import org.springframework.context.annotation.Primary;
import org.springframework.core.io.ClassPathResource;
import org.springframework.test.context.junit.jupiter.SpringExtension;
import org.xmlunit.builder.Input;
import com.iqser.red.commons.jackson.ObjectMapperFactory;
import com.iqser.red.storage.commons.service.StorageService;
@ -68,7 +67,7 @@ public abstract class AbstractTest {
protected LayoutParsingRequest buildStandardLayoutParsingRequest() {
return LayoutParsingRequest.builder()
.layoutParsingType(LayoutParsingType.REDACT_MANAGER)
.layoutParsingType(LayoutParsingType.REDACT_MANAGER_OLD)
.originFileStorageId(ORIGIN_FILE_ID)
.tablesFileStorageId(Optional.of(TABLE_FILE_ID))
.imagesFileStorageId(Optional.of(IMAGE_FILE_ID))
@ -99,7 +98,7 @@ public abstract class AbstractTest {
@SneakyThrows
protected LayoutParsingRequest prepareStorage(String file) {
return prepareStorage(file, "cv_table_parsing_response/empty.json", "image_service_response/empty.json","visual_layout_parsing_response/empty.json");
return prepareStorage(file, "cv_table_parsing_response/empty.json", "image_service_response/empty.json", "visual_layout_parsing_response/empty.json");
}
@ -107,7 +106,7 @@ public abstract class AbstractTest {
protected LayoutParsingRequest prepareStorage(InputStream fileInputStream) {
storageService.storeObject(TenantContext.getTenantId(), ORIGIN_FILE_ID, fileInputStream);
return buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER);
return buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER_OLD);
}
@ -140,6 +139,7 @@ public abstract class AbstractTest {
return prepareStorage(pdfFileResource.getInputStream(), cvServiceResponseFileResource.getInputStream(), imageInfoFileResource.getInputStream());
}
@SneakyThrows
protected LayoutParsingRequest prepareStorage(String file, String cvServiceResponseFile, String imageInfoFile, String visualLayoutParsingResponseFile) {
@ -148,9 +148,13 @@ public abstract class AbstractTest {
ClassPathResource imageInfoFileResource = new ClassPathResource(imageInfoFile);
ClassPathResource visualLayoutParsingResponseResource = new ClassPathResource(visualLayoutParsingResponseFile);
return prepareStorage(pdfFileResource.getInputStream(), cvServiceResponseFileResource.getInputStream(), imageInfoFileResource.getInputStream(), visualLayoutParsingResponseResource.getInputStream());
return prepareStorage(pdfFileResource.getInputStream(),
cvServiceResponseFileResource.getInputStream(),
imageInfoFileResource.getInputStream(),
visualLayoutParsingResponseResource.getInputStream());
}
@SneakyThrows
protected LayoutParsingRequest prepareStorage(InputStream fileStream, InputStream cvServiceResponseFileStream, InputStream imageInfoStream) {
@ -158,18 +162,22 @@ public abstract class AbstractTest {
storageService.storeObject(TenantContext.getTenantId(), TABLE_FILE_ID, cvServiceResponseFileStream);
storageService.storeObject(TenantContext.getTenantId(), ORIGIN_FILE_ID, fileStream);
return buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER);
return buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER_OLD);
}
@SneakyThrows
protected LayoutParsingRequest prepareStorage(InputStream fileStream, InputStream cvServiceResponseFileStream, InputStream imageInfoStream, InputStream visualLayoutParsingResponseFileStream) {
protected LayoutParsingRequest prepareStorage(InputStream fileStream,
InputStream cvServiceResponseFileStream,
InputStream imageInfoStream,
InputStream visualLayoutParsingResponseFileStream) {
storageService.storeObject(TenantContext.getTenantId(), IMAGE_FILE_ID, imageInfoStream);
storageService.storeObject(TenantContext.getTenantId(), TABLE_FILE_ID, cvServiceResponseFileStream);
storageService.storeObject(TenantContext.getTenantId(), ORIGIN_FILE_ID, fileStream);
storageService.storeObject(TenantContext.getTenantId(),VISUAL_LAYOUT_FILE,visualLayoutParsingResponseFileStream );
storageService.storeObject(TenantContext.getTenantId(), VISUAL_LAYOUT_FILE, visualLayoutParsingResponseFileStream);
return buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER);
return buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER_OLD);
}

View File

@ -26,14 +26,19 @@ public abstract class BuildDocumentTest extends AbstractTest {
File fileResource = new ClassPathResource(filename).getFile();
prepareStorage(filename);
return layoutParsingPipeline.parseLayout(layoutParsingType, fileResource, layoutParsingStorageService.getImagesFile(IMAGE_FILE_ID), new TableServiceResponse(), new VisualLayoutParsingResponse(),filename);
return layoutParsingPipeline.parseLayout(layoutParsingType,
fileResource,
layoutParsingStorageService.getImagesFile(IMAGE_FILE_ID),
new TableServiceResponse(),
new VisualLayoutParsingResponse(),
filename);
}
@SneakyThrows
protected Document buildGraph(String filename) {
return buildGraph(filename, LayoutParsingType.REDACT_MANAGER);
return buildGraph(filename, LayoutParsingType.REDACT_MANAGER_OLD);
}
@ -46,7 +51,7 @@ public abstract class BuildDocumentTest extends AbstractTest {
prepareStorage(filename);
}
return DocumentGraphFactory.buildDocumentGraph(parseLayout(filename, layoutParsingType));
return DocumentGraphFactory.buildDocumentGraph(layoutParsingType, parseLayout(filename, layoutParsingType));
}
}

View File

@ -1,6 +1,6 @@
plugins {
id("com.knecon.fforesight.java-conventions")
id("io.freefair.lombok") version "8.2.2"
id("io.freefair.lombok") version "8.4"
}
description = "Library for adding/removing layers in the viewer document"