Compare commits

...

8 Commits

Author SHA1 Message Date
Kilian Schüttler
de266dcfe5 Merge branch 'RED-9964' into 'release/0.159.x'
Red 9964: don't merge tables on non-consecutive pages or with tables in between

See merge request fforesight/layout-parser!204
2024-08-30 14:00:50 +02:00
Kilian Schüttler
10e525f0de Red 9964: don't merge tables on non-consecutive pages or with tables in between 2024-08-30 14:00:50 +02:00
Dominique Eifländer
e0e5e35b30 Merge branch 'RED-9974-4.2' into 'release/0.159.x'
RED-9974: Improved headline detection for documine old

See merge request fforesight/layout-parser!203
2024-08-30 10:52:31 +02:00
Dominique Eifländer
e1d8d1ea3b RED-9974: Improved headline detection for documine old 2024-08-30 10:35:24 +02:00
Kilian Schüttler
1546c05dd8 Merge branch 'RED-9975-bp' into 'release/0.159.x'
activate outline detection

See merge request fforesight/layout-parser!200
2024-08-29 14:26:14 +02:00
Kilian Schuettler
7c88c30ca7 RED-9975: activate outline detection 2024-08-29 14:17:20 +02:00
Kilian Schüttler
50427d08dc Merge branch 'RED-9975-bp' into 'release/0.159.x'
RED-9975: activate outline detection

See merge request fforesight/layout-parser!199
2024-08-29 12:43:14 +02:00
Kilian Schuettler
338c6c5dd0 RED-9975: activate outline detection 2024-08-29 12:27:20 +02:00
13 changed files with 296 additions and 118 deletions

View File

@ -119,14 +119,18 @@ public class LayoutParsingPipeline {
log.info("Starting layout parsing for {}", layoutParsingRequest.identifier());
File originFile = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId());
File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId()).orElse(originFile);
File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId())
.orElse(originFile);
VisualLayoutParsingResponse visualLayoutParsingResponse = layoutParsingRequest.visualLayoutParsingFileId()
.map(layoutParsingStorageService::getVisualLayoutParsingFile).orElse(new VisualLayoutParsingResponse());
.map(layoutParsingStorageService::getVisualLayoutParsingFile)
.orElse(new VisualLayoutParsingResponse());
ImageServiceResponse imageServiceResponse = layoutParsingRequest.imagesFileStorageId()
.map(layoutParsingStorageService::getImagesFile).orElse(new ImageServiceResponse());
.map(layoutParsingStorageService::getImagesFile)
.orElse(new ImageServiceResponse());
TableServiceResponse tableServiceResponse = layoutParsingRequest.tablesFileStorageId()
.map(layoutParsingStorageService::getTablesFile).orElse(new TableServiceResponse());
.map(layoutParsingStorageService::getTablesFile)
.orElse(new TableServiceResponse());
ClassificationDocument classificationDocument = parseLayout(settings.getLayoutParsingTypeOverride() == null //
? layoutParsingRequest.layoutParsingType() : settings.getLayoutParsingTypeOverride(),
@ -143,13 +147,20 @@ public class LayoutParsingPipeline {
log.info("Creating viewer document for {}", layoutParsingRequest.identifier());
layoutGridService.addLayoutGrid(viewerDocumentFile, documentGraph, viewerDocumentFile, false, layoutParsingRequest.visualLayoutParsingFileId().isPresent());
layoutGridService.addLayoutGrid(viewerDocumentFile,
documentGraph,
viewerDocumentFile,
false,
layoutParsingRequest.visualLayoutParsingFileId()
.isPresent());
log.info("Storing resulting files for {}", layoutParsingRequest.identifier());
layoutParsingStorageService.storeDocumentData(layoutParsingRequest, DocumentDataMapper.toDocumentData(documentGraph));
if (layoutParsingRequest.documentMarkdownFileStorageId().isPresent()) {
layoutParsingStorageService.storeMarkdownFile(layoutParsingRequest.documentMarkdownFileStorageId().get(), new MarkdownMapper().toMarkdownContent(documentGraph));
if (layoutParsingRequest.documentMarkdownFileStorageId()
.isPresent()) {
layoutParsingStorageService.storeMarkdownFile(layoutParsingRequest.documentMarkdownFileStorageId()
.get(), new MarkdownMapper().toMarkdownContent(documentGraph));
}
layoutParsingStorageService.storeSimplifiedText(layoutParsingRequest, simplifiedSectionTextService.toSimplifiedText(documentGraph));
layoutParsingStorageService.storeViewerDocument(layoutParsingRequest, viewerDocumentFile);
@ -246,7 +257,7 @@ public class LayoutParsingPipeline {
OutlineObject lastProcessedOutlineObject = null;
// parsing the structure elements could be useful as well
if (layoutParsingType != LayoutParsingType.REDACT_MANAGER_OLD && layoutParsingType != LayoutParsingType.DOCUMINE_OLD) {
if (layoutParsingType != LayoutParsingType.REDACT_MANAGER_OLD) {
classificationDocument.setOutlineObjectTree(outlineExtractorService.getOutlineObjectTree(originDocument));
}
@ -324,7 +335,7 @@ public class LayoutParsingPipeline {
classificationPage.setPageWidth(cropbox.getWidth());
classificationPage.setPageHeight(cropbox.getHeight());
if (layoutParsingType != LayoutParsingType.REDACT_MANAGER_OLD && layoutParsingType != LayoutParsingType.DOCUMINE_OLD) {
if (layoutParsingType != LayoutParsingType.REDACT_MANAGER_OLD) {
List<OutlineObject> outlineObjects = classificationDocument.getOutlineObjectTree().getOutlineObjectsPerPage().getOrDefault(pageNumber - 1, new ArrayList<>());
OutlineObject notFoundOutlineObject = null;
@ -379,6 +390,12 @@ public class LayoutParsingPipeline {
case CLARIFYND -> clarifyndClassificationService.classifyDocument(classificationDocument);
}
if (layoutParsingType.equals(LayoutParsingType.DOCUMINE_OLD)) {
for (ClassificationPage page : classificationDocument.getPages()) {
docuMineBlockificationService.mergeblocks(page, page.getCleanRulings().withoutTextRulings(), 0, 10);
}
}
List<TextPageBlock> headlines = classificationDocument.getPages()
.stream()
.flatMap(classificationPage -> classificationPage.getTextBlocks()

View File

@ -133,7 +133,7 @@ public abstract class BoundingBox {
}
private boolean intersectsX(BoundingBox other, float threshold) {
public boolean intersectsX(BoundingBox other, float threshold) {
return this.getX() - threshold <= other.getMaxX() && this.getMaxX() + threshold >= other.getX();
}

View File

@ -1,12 +1,15 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
import java.util.Comparator;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import java.util.stream.Stream;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.TextEntity;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlockCollector;
@ -29,9 +32,8 @@ public class Page {
Integer height;
Integer width;
Integer rotation;
@EqualsAndHashCode.Exclude
List<SemanticNode> mainBody;
List<AtomicTextBlock> textBlocksOnPage;
@EqualsAndHashCode.Exclude
Header header;
@EqualsAndHashCode.Exclude
@ -53,20 +55,43 @@ public class Page {
.width((int) classificationPage.getPageWidth())
.number(classificationPage.getPageNumber())
.rotation(classificationPage.getRotation())
.mainBody(new LinkedList<>())
.textBlocksOnPage(new LinkedList<>())
.build();
}
/**
* Constructs and returns a {@link TextBlock} representing the concatenated text of all leaf semantic nodes in the main body.
*
* @return The main body text block.
*/
public TextBlock getMainBodyTextBlock() {
return mainBody.stream()
.filter(SemanticNode::isLeaf)
.map(SemanticNode::getLeafTextBlock)
return textBlocksOnPage.stream()
.collect(new TextBlockCollector());
}
public List<SemanticNode> getMainBody() {
return textBlocksOnPage.stream()
.map(AtomicTextBlock::getParent)
.map(this::getHighestParentOnPage)
.distinct()
.toList();
}
private SemanticNode getHighestParentOnPage(SemanticNode node) {
SemanticNode currentNode = node;
while (currentNode.getParent().onlyOnPage(this)) {
currentNode = currentNode.getParent();
}
return currentNode;
}
@Override
public String toString() {

View File

@ -74,7 +74,8 @@ public interface SemanticNode {
return getTextBlock().getPages()
.stream()
.min(Comparator.comparingInt(Page::getNumber)).orElseThrow(() -> new IllegalStateException("SemanticNode has no Page!"));
.min(Comparator.comparingInt(Page::getNumber))
.orElseThrow(() -> new IllegalStateException("SemanticNode has no Page!"));
}
@ -504,4 +505,17 @@ public interface SemanticNode {
void accept(NodeVisitor visitor);
/**
* Checks whether this SemanticNode appears on a single page only, and if that page is the provided one.
*
* @param page the page to check
* @return true, when SemanticNode is on a single page only and the page is the provided page. Otherwise, false.
*/
default boolean onlyOnPage(Page page) {
Set<Page> pages = getPages();
return pages.size() == 1 && pages.contains(page);
}
}

View File

@ -2,19 +2,23 @@ package com.knecon.fforesight.service.layoutparser.processor.services.blockifica
import java.util.ArrayList;
import java.util.List;
import java.util.ListIterator;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.Orientation;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
@SuppressWarnings("all")
@Service
public class DocuMineBlockificationService {
@ -57,8 +61,11 @@ public class DocuMineBlockificationService {
boolean isSplitByRuling = prev != null && usedRulings.lineBetween(prev, word);
boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir());
boolean splitByOtherFontAndOtherY = prev != null && Math.abs(prev.getMaxYDirAdj() - word.getMaxYDirAdj()) > word.getTextHeight() * 0.2 //
&& (word.getFontStyle().contains("bold") && !prev.getFontStyle().contains("bold") //
|| prev.getFontStyle().contains("bold") && !word.getFontStyle().contains("bold"));
&& (word.getFontStyle().contains("bold") && !prev.getFontStyle().contains("bold")
//
|| prev.getFontStyle().contains("bold") && !word.getFontStyle().contains("bold")
|| Math.abs(prev.getFontSize() - word.getFontSize()) >= 1
|| Math.abs(word.getTextHeight() - prev.getTextHeight()) > 0.8);
Matcher matcher = pattern.matcher(chunkWords.stream()
.collect(Collectors.joining(" ")).toString());
@ -120,5 +127,77 @@ public class DocuMineBlockificationService {
return new ClassificationPage(textPageBlocks);
}
public void mergeblocks(ClassificationPage page, CleanRulings usedRulings, float xThreshold, float yThreshold) {
var blocks = page.getTextBlocks();
ListIterator<AbstractPageBlock> itty = blocks.listIterator();
while (itty.hasNext()) {
AbstractPageBlock block = itty.next();
if (block == null) {
continue;
}
if (block instanceof TablePageBlock) {
continue;
}
TextPageBlock current = (TextPageBlock) block;
for (int i = 0; i < blocks.size(); i++) {
AbstractPageBlock abstractPageBlock = blocks.get(i);
if (abstractPageBlock == null) {
continue;
}
if (abstractPageBlock == current) {
continue;
}
if (abstractPageBlock instanceof TablePageBlock) {
continue;
}
if (isHeadlineFromOutline(current) || isHeadlineFromOutline(abstractPageBlock)) {
continue;
}
TextPageBlock inner = (TextPageBlock) abstractPageBlock;
if (usedRulings.lineBetween(current, blocks.get(i))) {
continue;
}
if (current.getDir() == inner.getDir() && current.intersects(inner, yThreshold, xThreshold) && (current.getClassification() == null || current.getClassification()
.equals(inner.getClassification()))) {
boolean toDuplicate = current.isToDuplicate() || inner.isToDuplicate();
current.getSequences().addAll(inner.getSequences());
current = buildTextBlock(current.getSequences(), 0);
current.setClassification(inner.getClassification());
current.setToDuplicate(toDuplicate);
blocks.set(i, null);
itty.set(current);
}
}
}
var blocksIterator = blocks.iterator();
while (blocksIterator.hasNext()) {
if (blocksIterator.next() == null) {
blocksIterator.remove();
}
}
}
private boolean isHeadlineFromOutline(AbstractPageBlock abstractPageBlock) {
return abstractPageBlock.getEngines().contains(LayoutEngine.OUTLINE) && abstractPageBlock.getClassification() != null && abstractPageBlock.getClassification().isHeadline();
}
public static TextPageBlock buildTextBlock(List<TextPositionSequence> wordBlockList, int indexOnPage) {
return new TextPageBlock(wordBlockList);
}
}

View File

@ -74,7 +74,7 @@ public class DocuMineClassificationService {
return;
}
if (document.getFontSizeCounter().getMostPopular() == null) {
textBlock.setClassification(PageBlockType.OTHER);
textBlock.setClassification(PageBlockType.PARAGRAPH);
return;
}
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) //
@ -108,7 +108,10 @@ public class DocuMineClassificationService {
&& Character.isDigit(textBlock.toString().charAt(0))
&& atLeast3Matcher.reset().find()
&& !textBlock.toString().contains(":") //
|| textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT)) && atLeast3Matcher.reset().find() && !textBlock.toString().contains(":") //
|| textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT))
&& atLeast3Matcher.reset().find()
&& !textBlock.toString().contains(":")
&& !textBlock.toString().startsWith("(")//
|| textBlock.toString().startsWith("APPENDIX") //
|| textBlock.toString().startsWith("FIGURE") //
|| textBlock.toString().startsWith("Continued TABLE") //
@ -143,9 +146,9 @@ public class DocuMineClassificationService {
&& PositionUtils.getApproxLineCount(textBlock) < 2.9) {
textBlock.setClassification(PageBlockType.PARAGRAPH_ITALIC);
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)) {
textBlock.setClassification(PageBlockType.PARAGRAPH_UNKNOWN);
textBlock.setClassification(PageBlockType.PARAGRAPH);
} else {
textBlock.setClassification(PageBlockType.OTHER);
textBlock.setClassification(PageBlockType.PARAGRAPH);
}
}

View File

@ -6,6 +6,7 @@ import static java.util.stream.Collectors.toList;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
@ -15,6 +16,7 @@ import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
@ -32,7 +34,9 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.He
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Image;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Paragraph;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContentItem;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
@ -68,10 +72,25 @@ public class DocumentGraphFactory {
documentGraph.setPages(context.pages.keySet());
documentGraph.setDocumentTree(context.documentTree);
documentGraph.setTextBlock(documentGraph.getTextBlock());
addTextBlocksToPages(documentGraph);
return documentGraph;
}
private void addTextBlocksToPages(Document documentGraph) {
documentGraph.streamAllSubNodes()
.filter(SemanticNode::isLeaf)
.filter(node -> !node.getType().equals(NodeType.HEADER))
.filter(node -> !node.getType().equals(NodeType.FOOTER))
.map(SemanticNode::getTextBlock)
.map(TextBlock::getAtomicTextBlocks)
.flatMap(Collection::stream)
.forEach(atb -> atb.getPage().getTextBlocksOnPage().add(atb));
}
private void addSections(LayoutParsingType layoutParsingType, ClassificationDocument classificationDocument, Context context, Document document) {
for (TableOfContentItem tocItem : classificationDocument.getTableOfContents()) {
@ -105,8 +124,6 @@ public class DocumentGraphFactory {
node = Paragraph.builder().documentTree(context.getDocumentTree()).build();
}
page.getMainBody().add(node);
List<TextPageBlock> textBlocks = new ArrayList<>();
textBlocks.add(originalTextBlock);
textBlocks.addAll(textBlocksToMerge);
@ -115,9 +132,9 @@ public class DocumentGraphFactory {
if (node instanceof DuplicatedParagraph duplicatedParagraph) {
AtomicTextBlock unsortedTextBlock = context.textBlockFactory.buildAtomicTextBlock2(textBlocks.stream()
.flatMap(tb -> tb.getSequences()
.stream())
.collect(Collectors.toList()), node, context, page);
.flatMap(tb -> tb.getSequences()
.stream())
.collect(Collectors.toList()), node, context, page);
duplicatedParagraph.setUnsortedLeafTextBlock(unsortedTextBlock);
}
@ -141,7 +158,7 @@ public class DocumentGraphFactory {
Rectangle2D position = image.getPosition();
Page page = context.getPage(image.getPage());
Image imageNode = Image.builder()
return Image.builder()
.id(IdBuilder.buildId(Set.of(page), List.of(position)))
.imageType(image.getImageType())
.position(position)
@ -150,8 +167,6 @@ public class DocumentGraphFactory {
.representationHash(image.getRepresentation())
.documentTree(context.getDocumentTree())
.build();
page.getMainBody().add(imageNode);
return imageNode;
}

View File

@ -51,9 +51,6 @@ public class SectionNodeFactory {
return Optional.empty();
}
Map<Integer, List<AbstractPageBlock>> blocksPerPage = pageBlocks.stream()
.collect(groupingBy(AbstractPageBlock::getPage));
AbstractSemanticNode section;
boolean containsTablesAndTextBlocks = containsTablesAndTextBlocks(pageBlocks);
if (isLeaf && !containsTablesAndTextBlocks) {
@ -63,8 +60,6 @@ public class SectionNodeFactory {
}
context.getSections().add(section);
blocksPerPage.keySet()
.forEach(pageNumber -> addSectionNodeToPageNode(context, section, pageNumber));
section.setTreeId(getTreeId(parentNode, context, section));
@ -242,10 +237,5 @@ public class SectionNodeFactory {
}
private void addSectionNodeToPageNode(DocumentGraphFactory.Context context, AbstractSemanticNode section, Integer pageNumber) {
Page page = context.getPage(pageNumber);
page.getMainBody().add(section);
}
}

View File

@ -51,8 +51,6 @@ public class TableNodeFactory {
.numberOfRows(mergedRows.size())
.build();
pages.forEach(page -> addTableToPage(page, parentNode, table));
List<Integer> treeId = context.getDocumentTree().createNewChildEntryAndReturnId(parentNode, table);
table.setTreeId(treeId);
addTableCells(layoutParsingType, mergedRows, table, context, document);
@ -82,17 +80,6 @@ public class TableNodeFactory {
}
@SuppressWarnings("PMD.UnusedPrivateMethod") // PMD actually flags this wrong
private void addTableToPage(Page page, SemanticNode parentNode, Table table) {
if (!page.getMainBody().contains(parentNode)) {
parentNode.getPages().add(page);
}
page.getMainBody().add(table);
}
private void ifTableHasNoHeadersSetFirstRowAsHeaders(Table table) {
if (table.streamHeaders()
@ -107,14 +94,7 @@ public class TableNodeFactory {
for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) {
for (int colIndex = 0; colIndex < rows.get(rowIndex).size(); colIndex++) {
addTableCell(layoutParsingType,
rows.get(rowIndex)
.get(colIndex),
rowIndex,
colIndex,
table,
context,
document);
addTableCell(layoutParsingType, rows.get(rowIndex).get(colIndex), rowIndex, colIndex, table, context, document);
}
}
}
@ -131,14 +111,7 @@ public class TableNodeFactory {
Page page = context.getPage(cell.getPageNumber());
TableCell tableCell = TableCell.builder()
.documentTree(context.getDocumentTree())
.row(rowIndex)
.col(colIndex)
.header(cell.isHeaderCell())
.bBox(cell.getBBoxPdf())
.build();
page.getMainBody().add(tableCell);
TableCell tableCell = TableCell.builder().documentTree(context.getDocumentTree()).row(rowIndex).col(colIndex).header(cell.isHeaderCell()).bBox(cell.getBBoxPdf()).build();
List<Integer> treeId = context.getDocumentTree().createNewTableChildEntryAndReturnId(tableNode, tableCell);
tableCell.setTreeId(treeId);
@ -147,9 +120,7 @@ public class TableNodeFactory {
if (cell.getTextBlocks().isEmpty()) {
tableCell.setLeafTextBlock(context.getTextBlockFactory().emptyTextBlock(tableNode, context, page));
} else if (cell.getTextBlocks().size() == 1) {
textBlock = context.getTextBlockFactory()
.buildAtomicTextBlock2(cell.getTextBlocks()
.get(0).getSequences(), tableCell, context, page);
textBlock = context.getTextBlockFactory().buildAtomicTextBlock2(cell.getTextBlocks().get(0).getSequences(), tableCell, context, page);
tableCell.setLeafTextBlock(textBlock);
} else if (firstTextBlockIsHeadline(cell)) {
SectionNodeFactory.addSection(layoutParsingType,
@ -181,8 +152,7 @@ public class TableNodeFactory {
private boolean firstTextBlockIsHeadline(Cell cell) {
return cell.getTextBlocks()
.get(0).isHeadline();
return cell.getTextBlocks().get(0).isHeadline();
}
}

View File

@ -40,27 +40,26 @@ public class TextBlockFactory {
orientation = sequences.get(0).getDir().toString();
textRotation = sequences.get(0).getDir().getRotation();
}
return AtomicTextBlock.fromSearchTextWithTextPosition(searchTextWithTextPositionDto.getSearchText(),
searchTextWithTextPositionDto.getLineBreaks(),
searchTextWithTextPositionDto.getBoldTextBoundaries(),
searchTextWithTextPositionDto.getItalicTextBoundaries(),
searchTextWithTextPositionDto.getPositions(),
searchTextWithTextPositionDto.getStringIdxToPositionIdx(),
idx,
parent,
numberOnPage,
page,
offset,
orientation,
textRotation);
var atb = AtomicTextBlock.fromSearchTextWithTextPosition(searchTextWithTextPositionDto.getSearchText(),
searchTextWithTextPositionDto.getLineBreaks(),
searchTextWithTextPositionDto.getBoldTextBoundaries(),
searchTextWithTextPositionDto.getItalicTextBoundaries(),
searchTextWithTextPositionDto.getPositions(),
searchTextWithTextPositionDto.getStringIdxToPositionIdx(),
idx,
parent,
numberOnPage,
page,
offset,
orientation,
textRotation);
return atb;
}
public AtomicTextBlock emptyTextBlock(SemanticNode parent, DocumentGraphFactory.Context context, Page page) {
long idx = textBlockIdx;
textBlockIdx++;
return AtomicTextBlock.empty(idx, stringOffset, page, context.getAndIncrementTextBlockNumberOnPage(page), parent);
return emptyTextBlock(parent, context.getAndIncrementTextBlockNumberOnPage(page), page);
}
@ -68,7 +67,8 @@ public class TextBlockFactory {
long idx = textBlockIdx;
textBlockIdx++;
return AtomicTextBlock.empty(idx, stringOffset, page, numberOnPage, parent);
var atb = AtomicTextBlock.empty(idx, stringOffset, page, numberOnPage, parent);
return atb;
}
}

View File

@ -41,7 +41,9 @@ public class DocumentGraphMapper {
DocumentTree documentTree = new DocumentTree(document);
Context context = new Context(documentData, documentTree);
context.pages.addAll(Arrays.stream(documentData.getDocumentPages()).map(DocumentGraphMapper::buildPage).toList());
context.pages.addAll(Arrays.stream(documentData.getDocumentPages())
.map(DocumentGraphMapper::buildPage)
.toList());
context.documentTree.getRoot().getChildren().addAll(buildEntries(documentData.getDocumentStructure().getRoot().getChildren(), context));
@ -59,7 +61,9 @@ public class DocumentGraphMapper {
List<DocumentTree.Entry> newEntries = new LinkedList<>();
for (DocumentStructure.EntryData entryData : entries) {
List<Page> pages = Arrays.stream(entryData.getPageNumbers()).map(pageNumber -> getPage(pageNumber, context)).toList();
List<Page> pages = Arrays.stream(entryData.getPageNumbers())
.map(pageNumber -> getPage(pageNumber, context))
.toList();
SemanticNode node = switch (entryData.getType()) {
case SECTION -> buildSection(context);
@ -77,16 +81,17 @@ public class DocumentGraphMapper {
if (entryData.getAtomicBlockIds().length > 0) {
TextBlock textBlock = toTextBlock(entryData.getAtomicBlockIds(), context, node);
node.setLeafTextBlock(textBlock);
switch (entryData.getType()) {
case HEADER -> pages.forEach(page -> page.setHeader((Header) node));
case FOOTER -> pages.forEach(page -> page.setFooter((Footer) node));
default -> textBlock.getAtomicTextBlocks()
.forEach(atb -> atb.getPage().getTextBlocksOnPage().add(atb));
}
}
List<Integer> treeId = Arrays.stream(entryData.getTreeId()).boxed().toList();
List<Integer> treeId = Arrays.stream(entryData.getTreeId()).boxed()
.toList();
node.setTreeId(treeId);
switch (entryData.getType()) {
case HEADER -> pages.forEach(page -> page.setHeader((Header) node));
case FOOTER -> pages.forEach(page -> page.setFooter((Footer) node));
default -> pages.forEach(page -> page.getMainBody().add(node));
}
newEntries.add(DocumentTree.Entry.builder().treeId(treeId).children(buildEntries(entryData.getChildren(), context)).node(node).build());
}
return newEntries;
@ -142,6 +147,7 @@ public class DocumentGraphMapper {
return Section.builder().documentTree(context.documentTree).build();
}
private SuperSection buildSuperSection(Context context) {
return SuperSection.builder().documentTree(context.documentTree).build();
@ -166,22 +172,24 @@ public class DocumentGraphMapper {
private TextBlock toTextBlock(Long[] atomicTextBlockIds, Context context, SemanticNode parent) {
return Arrays.stream(atomicTextBlockIds).map(atomicTextBlockId -> getAtomicTextBlock(context, parent, atomicTextBlockId)).collect(new TextBlockCollector());
return Arrays.stream(atomicTextBlockIds)
.map(atomicTextBlockId -> getAtomicTextBlock(context, parent, atomicTextBlockId))
.collect(new TextBlockCollector());
}
private AtomicTextBlock getAtomicTextBlock(Context context, SemanticNode parent, Long atomicTextBlockId) {
return AtomicTextBlock.fromAtomicTextBlockData(context.documentTextDataBlockData.get(Math.toIntExact(atomicTextBlockId)),
context.atomicPositionBlockData.get(Math.toIntExact(atomicTextBlockId)),
parent,
getPage(context.documentTextDataBlockData.get(Math.toIntExact(atomicTextBlockId)).getPage(), context));
context.atomicPositionBlockData.get(Math.toIntExact(atomicTextBlockId)),
parent,
getPage(context.documentTextDataBlockData.get(Math.toIntExact(atomicTextBlockId)).getPage(), context));
}
private Page buildPage(DocumentPage p) {
return Page.builder().rotation(p.getRotation()).height(p.getHeight()).width(p.getWidth()).number(p.getNumber()).mainBody(new LinkedList<>()).build();
return Page.builder().rotation(p.getRotation()).height(p.getHeight()).width(p.getWidth()).number(p.getNumber()).textBlocksOnPage(new LinkedList<>()).build();
}
@ -206,8 +214,10 @@ public class DocumentGraphMapper {
this.documentTree = documentTree;
this.pages = new LinkedList<>();
this.documentTextDataBlockData = Arrays.stream(documentData.getDocumentTextData()).toList();
this.atomicPositionBlockData = Arrays.stream(documentData.getDocumentPositions()).toList();
this.documentTextDataBlockData = Arrays.stream(documentData.getDocumentTextData())
.toList();
this.atomicPositionBlockData = Arrays.stream(documentData.getDocumentPositions())
.toList();
}

View File

@ -1,9 +1,10 @@
package com.knecon.fforesight.service.layoutparser.processor.utils;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.LinkedList;
import java.util.List;
import java.util.stream.Stream;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
@ -22,29 +23,83 @@ public class TableMergingUtility {
List<TablePageBlock> consecutiveTables = pageBlocks.stream()
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
.filter(tablePageBlock -> !tablePageBlock.equals(originalTablePageBlock))
.sorted(Comparator.comparingInt(TablePageBlock::getPage).thenComparing(TablePageBlock::getY).thenComparing(TablePageBlock::getX))
.toList();
assert consecutiveTables.size() == pageBlocks.size() - 1;
var currentTable = originalTablePageBlock;
int currentTableIndex = 0;
List<TablePageBlock> consecutiveTablesWithSameColCountAndHeaders = new LinkedList<>();
for (TablePageBlock consecutiveTable : consecutiveTables) {
if (consecutiveTable.getColCount() == originalTablePageBlock.getColCount() && !hasTableHeader(consecutiveTable) && outerBoundaryAlignsX(originalTablePageBlock,
consecutiveTable)) {
consecutiveTablesWithSameColCountAndHeaders.add(originalTablePageBlock);
for (int i = 0; i < consecutiveTables.size(); i++) {
TablePageBlock consecutiveTable = consecutiveTables.get(i);
if (consecutiveTable.getColCount() == originalTablePageBlock.getColCount() //
&& headersMatch(originalTablePageBlock, consecutiveTable) //
&& outerBoundaryAlignsX(originalTablePageBlock, consecutiveTable) //
&& consecutiveOrSamePage(currentTable, consecutiveTable) //
&& !tableBetween(currentTable, consecutiveTable, findTablesBetween(consecutiveTables, currentTableIndex, i))) {
currentTable = consecutiveTable;
currentTableIndex = i;
consecutiveTablesWithSameColCountAndHeaders.add(consecutiveTable);
}
}
return Stream.concat(Stream.of(originalTablePageBlock), consecutiveTablesWithSameColCountAndHeaders.stream()).toList();
return consecutiveTablesWithSameColCountAndHeaders;
}
private static List<TablePageBlock> findTablesBetween(List<TablePageBlock> consecutiveTables, int currentTableIndex, int i) {
if (currentTableIndex + 1 == consecutiveTables.size() || currentTableIndex + 1 >= i) {
return Collections.emptyList();
}
return consecutiveTables.subList(currentTableIndex + 1, i);
}
private static boolean consecutiveOrSamePage(TablePageBlock currentTable, TablePageBlock consecutiveTable) {
return currentTable.getPage() == consecutiveTable.getPage() || currentTable.getPage() + 1 == consecutiveTable.getPage();
}
private static boolean tableBetween(TablePageBlock currentTable, TablePageBlock consecutiveTable, List<TablePageBlock> tablesBetween) {
if (tablesBetween.isEmpty()) {
return false;
}
// assumes the tables are on the same page or on consecutive pages, all tables on pages in between are ignored.
return tablesBetween.stream()
.filter(tableBetween -> tableBetween.getPage() == currentTable.getPage())
.anyMatch(tableBetween -> tableBetween.isBelow(currentTable)) //
|| tablesBetween.stream()
.filter(tableBetween -> tableBetween.getPage() == consecutiveTable.getPage())
.anyMatch(tableBetween -> tableBetween.isAbove(consecutiveTable));
}
private static boolean headersMatch(TablePageBlock originalTable, TablePageBlock consecutiveTable) {
return getHeaders(consecutiveTable).isEmpty() || getHeaders(originalTable).equals(getHeaders(consecutiveTable));
}
private static boolean outerBoundaryAlignsX(TablePageBlock originalTablePageBlock, TablePageBlock consecutiveTable) {
return Math.abs(consecutiveTable.getMinX() - originalTablePageBlock.getMinX()) < TABLE_ALIGNMENT_THRESHOLD && Math.abs(consecutiveTable.getMaxX() - originalTablePageBlock.getMaxX()) < TABLE_ALIGNMENT_THRESHOLD;
return Math.abs(consecutiveTable.getMinX() - originalTablePageBlock.getMinX()) < TABLE_ALIGNMENT_THRESHOLD
&& Math.abs(consecutiveTable.getMaxX() - originalTablePageBlock.getMaxX()) < TABLE_ALIGNMENT_THRESHOLD;
}
private boolean hasTableHeader(TablePageBlock table) {
private List<Cell> getHeaders(TablePageBlock table) {
return table.getRows().stream().flatMap(Collection::stream).anyMatch(Cell::isHeaderCell);
return table.getRows()
.stream()
.flatMap(Collection::stream)
.filter(Cell::isHeaderCell)
.toList();
}
}

View File

@ -27,7 +27,7 @@ import lombok.extern.slf4j.Slf4j;
@Slf4j
public class LayoutparserEnd2EndTest extends AbstractTest {
public static final LayoutParsingType LAYOUT_PARSING_TYPE = LayoutParsingType.DOCUMINE;
public static final LayoutParsingType LAYOUT_PARSING_TYPE = LayoutParsingType.DOCUMINE_OLD;
@Autowired
private LayoutParsingPipeline layoutParsingPipeline;
@ -37,7 +37,7 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
@Disabled
public void testLayoutParserEndToEnd() {
String filePath = "/home/kschuettler/Downloads/55974b3de7ed2915718a10458206bbd8.ORIGIN.pdf";
String filePath = "/home/kschuettler/Dokumente/Ticket Related/RED-9964/17a25133-e098-4610-b553-d1bf11a56d96/560e6ab1ab4754b9a62fd2e6d4d71327/560e6ab1ab4754b9a62fd2e6d4d71327.ORIGIN.pdf";
runForFile(filePath);
}