diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java index 010c985..68c4e41 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java @@ -2,6 +2,7 @@ package com.knecon.fforesight.service.layoutparser.processor; import static java.lang.String.format; +import java.awt.geom.Point2D; import java.awt.geom.Rectangle2D; import java.io.File; import java.io.IOException; @@ -210,15 +211,15 @@ public class LayoutParsingPipeline { private String buildSemanticNodeCountMessage(int numberOfPages, Map semanticNodeCounts) { - return String.format("%d pages with %d sections, %d headlines, %d paragraphs, %d tables with %d cells, %d headers, and %d footers parsed", - numberOfPages, - semanticNodeCounts.get(NodeType.SECTION) == null ? 0 : semanticNodeCounts.get(NodeType.SECTION), - semanticNodeCounts.get(NodeType.HEADLINE) == null ? 0 : semanticNodeCounts.get(NodeType.HEADLINE), - semanticNodeCounts.get(NodeType.PARAGRAPH) == null ? 0 : semanticNodeCounts.get(NodeType.PARAGRAPH), - semanticNodeCounts.get(NodeType.TABLE) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE), - semanticNodeCounts.get(NodeType.TABLE_CELL) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE_CELL), - semanticNodeCounts.get(NodeType.HEADER) == null ? 0 : semanticNodeCounts.get(NodeType.HEADER), - semanticNodeCounts.get(NodeType.FOOTER) == null ? 
0 : semanticNodeCounts.get(NodeType.FOOTER)); + return format("%d pages with %d sections, %d headlines, %d paragraphs, %d tables with %d cells, %d headers, and %d footers parsed", + numberOfPages, + semanticNodeCounts.get(NodeType.SECTION) == null ? 0 : semanticNodeCounts.get(NodeType.SECTION), + semanticNodeCounts.get(NodeType.HEADLINE) == null ? 0 : semanticNodeCounts.get(NodeType.HEADLINE), + semanticNodeCounts.get(NodeType.PARAGRAPH) == null ? 0 : semanticNodeCounts.get(NodeType.PARAGRAPH), + semanticNodeCounts.get(NodeType.TABLE) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE), + semanticNodeCounts.get(NodeType.TABLE_CELL) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE_CELL), + semanticNodeCounts.get(NodeType.HEADER) == null ? 0 : semanticNodeCounts.get(NodeType.HEADER), + semanticNodeCounts.get(NodeType.FOOTER) == null ? 0 : semanticNodeCounts.get(NodeType.FOOTER)); } @@ -239,6 +240,7 @@ public class LayoutParsingPipeline { Map> signatures = visualLayoutParsingAdapter.buildExtractedSignaturesPerPage(visualLayoutParsingResponse); ClassificationDocument classificationDocument = new ClassificationDocument(); List classificationPages = new ArrayList<>(); + OutlineObject lastProcessedOutlineObject = null; // parsing the structure elements could be useful as well classificationDocument.setOutlineObjectTree(outlineExtractorService.getOutlineObjectTree(originDocument)); @@ -307,11 +309,16 @@ public class LayoutParsingPipeline { classificationPage.setPageWidth(cropbox.getWidth()); classificationPage.setPageHeight(cropbox.getHeight()); - List outlineObjects = classificationDocument.getOutlineObjectTree().getOutlineObjectsPerPage() - .get(pageNumber - 1); - if (outlineObjects != null) { + List outlineObjects = classificationDocument.getOutlineObjectTree().getOutlineObjectsPerPage().getOrDefault(pageNumber - 1, new ArrayList<>()); + + OutlineObject notFoundOutlineObject = null; + if (lastProcessedOutlineObject != null && !lastProcessedOutlineObject.isFound()) { 
+ lastProcessedOutlineObject.setPoint(new Point2D.Float(0, cropbox.getHeight())); + notFoundOutlineObject = lastProcessedOutlineObject; + } + if (!outlineObjects.isEmpty()) { classificationPage.setOutlineObjects(outlineObjects); - blockificationPostprocessingService.sanitizeOutlineBlocks(classificationPage); + lastProcessedOutlineObject = blockificationPostprocessingService.sanitizeOutlineBlocks(classificationPage, notFoundOutlineObject); } // MarkedContent needs to be converted at this point, otherwise it leads to GC Problems in Pdfbox. diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationDocument.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationDocument.java index ce151a1..1044b14 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationDocument.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationDocument.java @@ -3,6 +3,7 @@ package com.knecon.fforesight.service.layoutparser.processor.model; import java.util.ArrayList; import java.util.List; +import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject; import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObjectTree; import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContents; import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineObject.java 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineObject.java index 6cc6485..5e9bf6b 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineObject.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineObject.java @@ -5,16 +5,27 @@ import java.awt.geom.Point2D; import lombok.AllArgsConstructor; import lombok.Data; import lombok.NoArgsConstructor; +import lombok.RequiredArgsConstructor; @Data -@NoArgsConstructor +@RequiredArgsConstructor @AllArgsConstructor public class OutlineObject { - private String title; - private int pageNumber; + private final String title; + private final int pageNumber; private Point2D point; - private int treeDepth; + private final int treeDepth; + + private boolean found = false; + + + public OutlineObject(String title, int pageNumber, Point2D point2D, int depth) { + + this(title, pageNumber, depth); + this.point = point2D; + } + @Override public String toString() { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java index 6b41670..cfd5cea 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java @@ -80,7 +80,10 @@ public class TextPageBlock extends AbstractPageBlock { public static TextPageBlock merge(List textBlocksToMerge) { - List sequences = 
textBlocksToMerge.stream().map(TextPageBlock::getSequences).flatMap(java.util.Collection::stream).toList(); + List sequences = textBlocksToMerge.stream() + .map(TextPageBlock::getSequences) + .flatMap(java.util.Collection::stream) + .toList(); sequences = new ArrayList<>(sequences); return fromTextPositionSequences(sequences); } @@ -106,11 +109,11 @@ public class TextPageBlock extends AbstractPageBlock { if (textBlock == null) { textBlock = new TextPageBlock(wordBlock.getMinXDirAdj(), - wordBlock.getMaxXDirAdj(), - wordBlock.getMinYDirAdj(), - wordBlock.getMaxYDirAdj(), - wordBlockList, - wordBlock.getRotation()); + wordBlock.getMaxXDirAdj(), + wordBlock.getMinYDirAdj(), + wordBlock.getMaxYDirAdj(), + wordBlockList, + wordBlock.getRotation()); } else { TextPageBlock spatialEntity = textBlock.union(wordBlock); textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight()); @@ -126,11 +129,12 @@ public class TextPageBlock extends AbstractPageBlock { textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest()); } - if (textBlock != null && textBlock.getSequences() != null && textBlock.getSequences() - .stream() - .map(t -> DoubleComparisons.round(t.getMinYDirAdj(), 3)) - .collect(toSet()) - .size() == 1) { + if (textBlock != null + && textBlock.getSequences() != null + && textBlock.getSequences() + .stream() + .map(t -> DoubleComparisons.round(t.getMinYDirAdj(), 3)) + .collect(toSet()).size() == 1) { textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj)); } return textBlock; @@ -290,18 +294,7 @@ public class TextPageBlock extends AbstractPageBlock { public void add(TextPositionSequence r) { - if (r.getMinXDirAdj() < minX) { - minX = r.getMinXDirAdj(); - } - if (r.getMaxXDirAdj() > maxX) { - maxX = r.getMaxXDirAdj(); - } - if (r.getMinYDirAdj() < minY) { - minY = r.getMinYDirAdj(); - } - if (r.getMaxYDirAdj() > maxY) { - maxY = r.getMaxYDirAdj(); - } + setCoordinates(r); } 
@@ -317,6 +310,33 @@ public class TextPageBlock extends AbstractPageBlock { } + public void resize() { + + minX = Float.MAX_VALUE; + minY = Float.MAX_VALUE; + maxX = -Float.MAX_VALUE; // Float.MIN_VALUE is the smallest positive float, so it cannot seed a maximum search + maxY = -Float.MAX_VALUE; + sequences.forEach(this::setCoordinates); + } + + + private void setCoordinates(TextPositionSequence sequence) { + + if (sequence.getMinXDirAdj() < minX) { + minX = sequence.getMinXDirAdj(); + } + if (sequence.getMaxXDirAdj() > maxX) { + maxX = sequence.getMaxXDirAdj(); + } + if (sequence.getMinYDirAdj() < minY) { + minY = sequence.getMinYDirAdj(); + } + if (sequence.getMaxYDirAdj() > maxY) { + maxY = sequence.getMaxYDirAdj(); + } + } + + public void set(float x1, float y1, float x2, float y2) { this.minX = Math.min(x1, x2); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/BlockificationPostprocessingService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/BlockificationPostprocessingService.java index 063a209..a38780a 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/BlockificationPostprocessingService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/BlockificationPostprocessingService.java @@ -37,50 +37,129 @@ public class BlockificationPostprocessingService { .collect(RectangleTransformations.collectBBox()); - public void sanitizeOutlineBlocks(ClassificationPage classificationPage) { + public OutlineObject sanitizeOutlineBlocks(ClassificationPage classificationPage, OutlineObject notFoundOutlineObject) { List outlineObjects = classificationPage.getOutlineObjects(); - List textBlocks = classificationPage.getTextBlocks() - .stream() - 
.filter(block -> block instanceof TextPageBlock) - .toList() - .stream() - .map(block -> (TextPageBlock) block) - .toList(); - - if (textBlocks.isEmpty() || outlineObjects.isEmpty()) { - return; + if (getTextPageBlocks(classificationPage).isEmpty() || outlineObjects.isEmpty()) { + return null; } float pageHeight = classificationPage.getPageHeight(); - for (OutlineObject outlineObject : outlineObjects) { + ListIterator outlineObjectListIterator = outlineObjects.listIterator(); - OutlineProcessionContext context = new OutlineProcessionContext(outlineObject); + if (notFoundOutlineObject != null) { + OutlineProcessionContext notFoundOutlineObjectProcessionContext = new OutlineProcessionContext(notFoundOutlineObject); + processTextBlocks(getTextPageBlocks(classificationPage), pageHeight, notFoundOutlineObjectProcessionContext); - ListIterator iterator = textBlocks.listIterator(); - while (iterator.hasNext()) { - TextPageBlock pageBlock = iterator.next(); - if (pageHeight - outlineObject.getPoint().getY() - BLOCK_TO_OUTLINE_DISTANCE_THRESHOLD <= pageBlock.getMaxY()) { - break; - } + OutlineObject firstOutlineObject = null; + OutlineProcessionContext firstOutlineObjectProcessionContext = null; + if (outlineObjectListIterator.hasNext()) { + firstOutlineObject = outlineObjectListIterator.next(); + firstOutlineObjectProcessionContext = new OutlineProcessionContext(firstOutlineObject); + processTextBlocks(getTextPageBlocks(classificationPage), pageHeight, firstOutlineObjectProcessionContext); } - if (iterator.hasPrevious()) { - iterator.previous(); - } - boolean earlyStop = false; - while (iterator.hasNext() && !earlyStop) { - TextPageBlock pageBlock = iterator.next(); - earlyStop = processOutlineForTextBlock(pageBlock, context); - } - selectMatch(classificationPage, context); + if (!contextsOverlap(notFoundOutlineObjectProcessionContext, firstOutlineObjectProcessionContext)) { + notFoundOutlineObject.setFound(selectMatch(classificationPage, 
notFoundOutlineObjectProcessionContext)); + } + if (firstOutlineObject != null) { + // re-create the context for the updated blocks + firstOutlineObjectProcessionContext = new OutlineProcessionContext(firstOutlineObject); + processTextBlocks(getTextPageBlocks(classificationPage), pageHeight, firstOutlineObjectProcessionContext); + firstOutlineObject.setFound(selectMatch(classificationPage, firstOutlineObjectProcessionContext)); + } + + } + + outlineObjectListIterator.forEachRemaining(outlineObject -> { + OutlineProcessionContext outlineObjectProcessionContext = new OutlineProcessionContext(outlineObject); + processTextBlocks(getTextPageBlocks(classificationPage), pageHeight, outlineObjectProcessionContext); + outlineObject.setFound(selectMatch(classificationPage, outlineObjectProcessionContext)); + }); + + if (!outlineObjects.isEmpty()) { + return outlineObjects.get(outlineObjects.size() - 1); + } else { + return notFoundOutlineObject; } } - private void selectMatch(ClassificationPage classificationPage, OutlineProcessionContext context) { + private static List getTextPageBlocks(ClassificationPage classificationPage) { + + return classificationPage.getTextBlocks() + .stream() + .filter(block -> block instanceof TextPageBlock) + .map(block -> (TextPageBlock) block) + .toList(); + } + + + private boolean contextsOverlap(OutlineProcessionContext notFoundOutlineObjectProcessionContext, OutlineProcessionContext firstOutlineObjectProcessionContext) { + + if (firstOutlineObjectProcessionContext == null) { + return false; + } + + String notFoundTitle = notFoundOutlineObjectProcessionContext.getOutlineObject().getTitle(); + String firstTitle = firstOutlineObjectProcessionContext.getOutlineObject().getTitle(); + + if (!firstTitle.startsWith(notFoundTitle)) { + return false; + } + + var blocksOfNotFoundOutline = getAllMatchingBlocks(notFoundOutlineObjectProcessionContext); + var blocksOfFirstOutline = getAllMatchingBlocks(firstOutlineObjectProcessionContext); + + double 
maxYFirst = blocksOfFirstOutline.stream() + .mapToDouble(TextPageBlock::getPdfMaxY) + .max() + .orElse(Double.NEGATIVE_INFINITY); + + return blocksOfNotFoundOutline.stream() + .mapToDouble(TextPageBlock::getPdfMaxY) + .anyMatch(y -> y >= maxYFirst); + } + + + private List getAllMatchingBlocks(OutlineProcessionContext context) { + + List blocks = new ArrayList<>(); + if (context.getDirectMatch() != null) { + blocks.add(context.getDirectMatch()); + } + if (context.getSplitCandidate() != null) { + blocks.add(context.getSplitCandidate()); + } + blocks.addAll(context.getMergeCandidates()); + return blocks; + } + + + private void processTextBlocks(List textBlocks, float pageHeight, OutlineProcessionContext context) { + + OutlineObject outlineObject = context.getOutlineObject(); + ListIterator iterator = textBlocks.listIterator(); + while (iterator.hasNext()) { + TextPageBlock pageBlock = iterator.next(); + if (pageHeight - outlineObject.getPoint().getY() - BLOCK_TO_OUTLINE_DISTANCE_THRESHOLD <= pageBlock.getMaxY()) { + break; + } + } + if (iterator.hasPrevious()) { + iterator.previous(); + } + boolean earlyStop = false; + while (iterator.hasNext() && !earlyStop) { + TextPageBlock pageBlock = iterator.next(); + earlyStop = processOutlineForTextBlock(pageBlock, context); + } + } + + + private boolean selectMatch(ClassificationPage classificationPage, OutlineProcessionContext context) { OutlineObject outlineObject = context.outlineObject; TextPageBlock directMatch = context.directMatch; @@ -122,28 +201,39 @@ public class BlockificationPostprocessingService { double minDistance = Math.min(distanceToDirectMatch, Math.min(distanceToSplitCandidate, distanceToBestMergeCandidates)); if (minDistance == Double.MAX_VALUE) { - return; + return false; } if (minDistance == distanceToDirectMatch) { directMatch.setClassification(headlineType); } else if (minDistance == distanceToSplitCandidate) { - List others = splitBlock(classificationPage, splitCandidate, context.sectionIdentifier + 
outlineObject.getTitle()); + List others = splitBlock(classificationPage, splitCandidate, context.sectionIdentifier, outlineObject.getTitle()); splitCandidate.setClassification(headlineType); others.forEach(other -> other.setClassification(null)); } else { var merged = mergeBlocks(classificationPage, bestMergeCandidateCombination); merged.setClassification(headlineType); } + return true; } - private List splitBlock(ClassificationPage classificationPage, TextPageBlock blockToSplit, String text) { + private List splitBlock(ClassificationPage classificationPage, TextPageBlock blockToSplit, SectionIdentifier sectionIdentifier, String title) { List otherBlocks = new ArrayList<>(); int blockToSplitIdx = classificationPage.getTextBlocks().indexOf(blockToSplit); - WordSequenceResult wordSequenceResult = findWordSequence(blockToSplit.getSequences(), text); + + String headline = title; + if (!sectionIdentifier.getFormat().equals(SectionIdentifier.Format.EMPTY) && !title.startsWith(sectionIdentifier.getIdentifierString())) { + headline = sectionIdentifier.getIdentifierString() + headline; // prepend the identifier text explicitly instead of relying on SectionIdentifier.toString() + } + + WordSequenceResult wordSequenceResult = findWordSequence(blockToSplit.getSequences(), headline); + if (wordSequenceResult.inSequence.isEmpty()) { + wordSequenceResult = findWordSequence(blockToSplit.getSequences(), title); + } blockToSplit.setSequences(wordSequenceResult.inSequence); + blockToSplit.resize(); if (!wordSequenceResult.preSequence.isEmpty()) { TextPageBlock block = buildTextBlock(wordSequenceResult.preSequence, 0); @@ -301,6 +391,7 @@ public class BlockificationPostprocessingService { assert firstBlock != null; firstBlock.setToDuplicate(false); + firstBlock.resize(); classificationPage.getTextBlocks().removeAll(mergedBlocks); } @@ -378,13 +469,13 @@ public class BlockificationPostprocessingService { if (blockTextContainsOutlineTitle) { SectionIdentifier sectionIdentifier = SectionIdentifier.fromSearchText(blockText); - if (sectionIdentifier.getFormat() != SectionIdentifier.Format.EMPTY) { + 
if (sectionIdentifier.getFormat() != SectionIdentifier.Format.EMPTY && !outlineTitle.startsWith(sectionIdentifier.getIdentifierString())) { if (blockText.startsWith(sectionIdentifier.getIdentifierString()) && blockText.endsWith(outlineTitle) && context.directMatch == null) { context.directMatch = pageBlock; return true; } else if (context.splitCandidate == null) { - context.sectionIdentifier = sectionIdentifier.getIdentifierString(); + context.sectionIdentifier = sectionIdentifier; } } if (context.splitCandidate == null) { @@ -408,7 +499,7 @@ public class BlockificationPostprocessingService { private OutlineObject outlineObject; private List mergeCandidates; private TextPageBlock splitCandidate; - private String sectionIdentifier; + private SectionIdentifier sectionIdentifier; public OutlineProcessionContext(OutlineObject outlineObject) { @@ -417,7 +508,7 @@ public class BlockificationPostprocessingService { this.directMatch = null; this.mergeCandidates = new ArrayList<>(); this.splitCandidate = null; - this.sectionIdentifier = ""; + this.sectionIdentifier = SectionIdentifier.empty(); } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java index 2ab953d..765249d 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java @@ -58,18 +58,20 @@ public class DocstrumBlockificationService { zones.forEach(zone -> { List textPositionSequences = new 
ArrayList<>(); - zone.getLines().forEach(line -> { - line.getWords().forEach(word -> { - textPositionSequences.add(new TextPositionSequence(word.getTextPositions(), word.getPage())); - }); - }); + zone.getLines() + .forEach(line -> { + line.getWords() + .forEach(word -> { + textPositionSequences.add(new TextPositionSequence(word.getTextPositions(), word.getPage())); + }); + }); abstractPageBlocks.addAll(splitZonesAtRulings(textPositionSequences, horizontalRulings, verticalRulings)); }); if (xyOrder) { abstractPageBlocks.sort(Comparator.comparing(AbstractPageBlock::getMinY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)) - .thenComparing(AbstractPageBlock::getMinX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))); + .thenComparing(AbstractPageBlock::getMinX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))); abstractPageBlocks.sort(new Comparator() { @Override public int compare(AbstractPageBlock o1, AbstractPageBlock o2) { @@ -90,7 +92,7 @@ public class DocstrumBlockificationService { while (itty.hasNext()) { AbstractPageBlock block = itty.next(); - if (block instanceof TablePageBlock || previous.isHeadline()) { + if (block instanceof TablePageBlock) { previous = new TextPageBlock(); continue; } @@ -98,11 +100,21 @@ public class DocstrumBlockificationService { if (previous != null && !previous.getSequences().isEmpty()) { - if (current.getDir() != previous.getDir() || current.isHeadline()) { + if (current.getDir() != previous.getDir()) { previous = current; continue; } + if (current.isHeadline() || previous.isHeadline()) { + if (intersectsYWithPreviousHavingMaxOneLine(previous, current, page)) { + previous = combineBlocksAndResetIterator(previous, current, itty, false); + } else { + previous = current; + } + + continue; + } + if (areTheOnlyTwoBlocksOnHeightsWithBothMoreThanTwoLines(previous, current, page)) { previous = combineBlocksAndResetIterator(previous, current, itty, true); continue; @@ -134,8 +146,8 @@ public class 
DocstrumBlockificationService { private boolean isOnlyIntersectingYAndOnelineOrPrevoiusTwoLines(TextPageBlock previous, TextPageBlock current, ClassificationPage page) { return current.intersectsY(previous) // - && (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1 || previous.getNumberOfLines() == 2 && current.getNumberOfLines() == 1) // - && numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, page.getTextBlocks()) <= 0; + && (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1 || previous.getNumberOfLines() == 2 && current.getNumberOfLines() == 1) // + && numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, page.getTextBlocks()) <= 0; } @@ -144,16 +156,23 @@ public class DocstrumBlockificationService { ClassificationPage page) { return (Math.abs(previous.getMaxY() - current.getMaxY()) < THRESHOLD || Math.abs(previous.getMinY() - current.getMinY()) < THRESHOLD) // - && (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1 || previous.getNumberOfLines() >= 2 && current.getNumberOfLines() == 1) // - && !hasBetween(current, previous, page.getTextBlocks()) && numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, page.getTextBlocks()) <= 4; + && (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1 || previous.getNumberOfLines() >= 2 && current.getNumberOfLines() == 1) // + && !hasBetween(current, previous, page.getTextBlocks()) && numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, page.getTextBlocks()) <= 4; + } + + + private boolean intersectsYWithPreviousHavingMaxOneLine(TextPageBlock previous, TextPageBlock current, ClassificationPage page) { + + return previous.intersectsY(current)//(Math.abs(previous.getMaxY() - current.getMaxY()) < THRESHOLD && Math.abs(previous.getMinY() - current.getMinY()) < THRESHOLD) // + && (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1); } private boolean 
areTheOnlyTwoBlocksOnHeightsWithBothMoreThanTwoLines(TextPageBlock previous, TextPageBlock current, ClassificationPage page) { return previous.getNumberOfLines() >= 2 && current.getNumberOfLines() >= 2 // - && previous.intersectsY(current) // - && numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, page.getTextBlocks()) == 0; + && previous.intersectsY(current) // + && numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, page.getTextBlocks()) == 0; } @@ -162,7 +181,7 @@ public class DocstrumBlockificationService { previous.getSequences().addAll(current.getSequences()); previous = buildTextBlock(previous.getSequences(), 0); previous.setToDuplicate(toDuplicate); - if(current.getClassification() != null && previous.getClassification() == null) { + if (current.getClassification() != null && previous.getClassification() == null) { previous.setClassification(current.getClassification()); } itty.remove(); @@ -216,14 +235,14 @@ public class DocstrumBlockificationService { ListIterator itty = blocks.listIterator(); while (itty.hasNext()) { AbstractPageBlock block = itty.next(); - if(block == null){ + if (block == null) { continue; } if (block instanceof TablePageBlock) { continue; } - if(block.getClassification() != null && block.getClassification().isHeadline()) { + if (block.getClassification() != null && block.getClassification().isHeadline()) { continue; } @@ -232,7 +251,7 @@ public class DocstrumBlockificationService { for (int i = 0; i < blocks.size(); i++) { AbstractPageBlock abstractPageBlock = blocks.get(i); - if(abstractPageBlock == null){ + if (abstractPageBlock == null) { continue; } if (abstractPageBlock == current) { @@ -242,13 +261,12 @@ public class DocstrumBlockificationService { continue; } - if(abstractPageBlock.getClassification() != null && abstractPageBlock.getClassification().isHeadline()) { + if (abstractPageBlock.getClassification() != null && abstractPageBlock.getClassification().isHeadline()) { continue; } 
TextPageBlock inner = (TextPageBlock) abstractPageBlock; - if (current.getDir() == inner.getDir() && current.almostIntersects(inner, yThreshold, xThreshold)) { boolean toDuplicate = current.isToDuplicate() || inner.isToDuplicate(); @@ -262,8 +280,8 @@ public class DocstrumBlockificationService { } } var blocksIterator = blocks.iterator(); - while(blocksIterator.hasNext()){ - if(blocksIterator.next() == null){ + while (blocksIterator.hasNext()) { + if (blocksIterator.next() == null) { blocksIterator.remove(); } } @@ -351,11 +369,11 @@ public class DocstrumBlockificationService { if (textBlock == null) { textBlock = new TextPageBlock(wordBlock.getMinXDirAdj(), - wordBlock.getMaxXDirAdj(), - wordBlock.getMinYDirAdj(), - wordBlock.getMaxYDirAdj(), - wordBlockList, - wordBlock.getRotation()); + wordBlock.getMaxXDirAdj(), + wordBlock.getMinYDirAdj(), + wordBlock.getMaxYDirAdj(), + wordBlockList, + wordBlock.getRotation()); } else { TextPageBlock spatialEntity = textBlock.union(wordBlock); textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight()); @@ -371,7 +389,12 @@ public class DocstrumBlockificationService { textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest()); } - if (textBlock != null && textBlock.getSequences() != null && textBlock.getSequences().stream().map(t -> round(t.getMinYDirAdj(), 3)).collect(toSet()).size() == 1) { + if (textBlock != null + && textBlock.getSequences() != null + && textBlock.getSequences() + .stream() + .map(t -> round(t.getMinYDirAdj(), 3)) + .collect(toSet()).size() == 1) { textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj)); } return textBlock; @@ -386,38 +409,34 @@ public class DocstrumBlockificationService { List horizontalRulingLines, List verticalRulingLines) { - return isSplitByRuling(maxX, - minY, - word.getMinXDirAdj(), - word.getMinYDirAdj(), - verticalRulingLines, - word.getDir().getDegrees(), - word.getPageWidth(), 
- word.getPageHeight()) // - || isSplitByRuling(minX, - minY, - word.getMinXDirAdj(), - word.getMaxYDirAdj(), - horizontalRulingLines, - word.getDir().getDegrees(), - word.getPageWidth(), - word.getPageHeight()) // - || isSplitByRuling(maxX, - minY, - word.getMinXDirAdj(), - word.getMinYDirAdj(), - horizontalRulingLines, - word.getDir().getDegrees(), - word.getPageWidth(), - word.getPageHeight()) // - || isSplitByRuling(minX, - minY, - word.getMinXDirAdj(), - word.getMaxYDirAdj(), - verticalRulingLines, - word.getDir().getDegrees(), - word.getPageWidth(), - word.getPageHeight()); + return isSplitByRuling(maxX, minY, word.getMinXDirAdj(), word.getMinYDirAdj(), verticalRulingLines, word.getDir().getDegrees(), word.getPageWidth(), word.getPageHeight()) + // + || isSplitByRuling(minX, + minY, + word.getMinXDirAdj(), + word.getMaxYDirAdj(), + horizontalRulingLines, + word.getDir().getDegrees(), + word.getPageWidth(), + word.getPageHeight()) + // + || isSplitByRuling(maxX, + minY, + word.getMinXDirAdj(), + word.getMinYDirAdj(), + horizontalRulingLines, + word.getDir().getDegrees(), + word.getPageWidth(), + word.getPageHeight()) + // + || isSplitByRuling(minX, + minY, + word.getMinXDirAdj(), + word.getMaxYDirAdj(), + verticalRulingLines, + word.getDir().getDegrees(), + word.getPageWidth(), + word.getPageHeight()); } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java index 37e02e7..a8ab674 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java @@ -82,13 +82,15 @@ public class 
ViewerDocumentTest extends BuildDocumentTest { @SneakyThrows public void testViewerDocument() { + //String fileName = "files/documine/21_TiltPlus_MutacaoGenicaEmCelulasBacterianas.pdf";//fail here - //String fileName = "files/new/UTT-Books-53.pdf"; - String fileName = "files/documine/A21924A - Acute Oral Toxicity - Rats.pdf"; - - - //String fileName = "files/documine/SOLICITA_VICTRATO-GOLD-II_Item 21_Mutacao_Genica.pdf"; //String fileName = "files/documine/Study Document 1 - Acute Eye IrritationCorrosion - Rabbits.pdf"; + //String fileName = "files/documine/Study Document 3 - Acute Eye IrritationCorrosion - Rabbits.pdf"; + //String fileName = "files/documine/VV-547521_Irritação_Ocular_in_Vivo.pdf"; + //String fileName = "files/documine/SOLICITA_VICTRATO-GOLD-II_Item 21_Mutacao_Genica.pdf"; + //String fileName = "files/new/UTT-Books-53.pdf"; + //String fileName = "files/documine/A21924A - Acute Oral Toxicity - Rats.pdf"; + //String fileName = "files/documine/A16361B - Acute Dermal Irritation Toxicity Study in Rabbits.pdf"; //String fileName = "files/documine/ITEM 20_Sensibilização cutânea.pdf"; //String fileName = "files/documine/VV-547523_LLNA.pdf"; //String fileName = "files/new/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf"; @@ -96,7 +98,7 @@ public class ViewerDocumentTest extends BuildDocumentTest { //String fileName = "files/new/abschlussarbeiten-template-institut-fur-informatik-padagogische-hochschule-karlsruhe.pdf"; //String fileName = "files/new/kaust-official-thesis-template.pdf"; //String fileName = "files/new/$100m Offers.pdf"; - //String fileName = "files/new/18-Curacron_ToxicidadeOcularInVitro.pdf"; + String fileName = "files/new/18-Curacron_ToxicidadeOcularInVitro.pdf"; //String fileName = "files/new/mistitled_outlines_example.pdf"; //String fileName = "files/bdr/Plenarprotokoll 1 (keine Druchsache!) (1) 1.pdf"; String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";