RED-7074: Design Subsection section tree structure algorithm

* improved merging of headlines as well as splitting logic so that more headlines are detected correctly
This commit is contained in:
maverickstuder 2024-05-14 17:41:44 +02:00
parent 2fcaeb3d8c
commit 1856fed640
7 changed files with 294 additions and 143 deletions

View File

@ -2,6 +2,7 @@ package com.knecon.fforesight.service.layoutparser.processor;
import static java.lang.String.format;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.io.File;
import java.io.IOException;
@ -210,15 +211,15 @@ public class LayoutParsingPipeline {
private String buildSemanticNodeCountMessage(int numberOfPages, Map<NodeType, Long> semanticNodeCounts) {
return String.format("%d pages with %d sections, %d headlines, %d paragraphs, %d tables with %d cells, %d headers, and %d footers parsed",
numberOfPages,
semanticNodeCounts.get(NodeType.SECTION) == null ? 0 : semanticNodeCounts.get(NodeType.SECTION),
semanticNodeCounts.get(NodeType.HEADLINE) == null ? 0 : semanticNodeCounts.get(NodeType.HEADLINE),
semanticNodeCounts.get(NodeType.PARAGRAPH) == null ? 0 : semanticNodeCounts.get(NodeType.PARAGRAPH),
semanticNodeCounts.get(NodeType.TABLE) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE),
semanticNodeCounts.get(NodeType.TABLE_CELL) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE_CELL),
semanticNodeCounts.get(NodeType.HEADER) == null ? 0 : semanticNodeCounts.get(NodeType.HEADER),
semanticNodeCounts.get(NodeType.FOOTER) == null ? 0 : semanticNodeCounts.get(NodeType.FOOTER));
return format("%d pages with %d sections, %d headlines, %d paragraphs, %d tables with %d cells, %d headers, and %d footers parsed",
numberOfPages,
semanticNodeCounts.get(NodeType.SECTION) == null ? 0 : semanticNodeCounts.get(NodeType.SECTION),
semanticNodeCounts.get(NodeType.HEADLINE) == null ? 0 : semanticNodeCounts.get(NodeType.HEADLINE),
semanticNodeCounts.get(NodeType.PARAGRAPH) == null ? 0 : semanticNodeCounts.get(NodeType.PARAGRAPH),
semanticNodeCounts.get(NodeType.TABLE) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE),
semanticNodeCounts.get(NodeType.TABLE_CELL) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE_CELL),
semanticNodeCounts.get(NodeType.HEADER) == null ? 0 : semanticNodeCounts.get(NodeType.HEADER),
semanticNodeCounts.get(NodeType.FOOTER) == null ? 0 : semanticNodeCounts.get(NodeType.FOOTER));
}
@ -239,6 +240,7 @@ public class LayoutParsingPipeline {
Map<Integer, List<ClassifiedImage>> signatures = visualLayoutParsingAdapter.buildExtractedSignaturesPerPage(visualLayoutParsingResponse);
ClassificationDocument classificationDocument = new ClassificationDocument();
List<ClassificationPage> classificationPages = new ArrayList<>();
OutlineObject lastProcessedOutlineObject = null;
// parsing the structure elements could be useful as well
classificationDocument.setOutlineObjectTree(outlineExtractorService.getOutlineObjectTree(originDocument));
@ -307,11 +309,16 @@ public class LayoutParsingPipeline {
classificationPage.setPageWidth(cropbox.getWidth());
classificationPage.setPageHeight(cropbox.getHeight());
List<OutlineObject> outlineObjects = classificationDocument.getOutlineObjectTree().getOutlineObjectsPerPage()
.get(pageNumber - 1);
if (outlineObjects != null) {
List<OutlineObject> outlineObjects = classificationDocument.getOutlineObjectTree().getOutlineObjectsPerPage().getOrDefault(pageNumber - 1, new ArrayList<>());
OutlineObject notFoundOutlineObject = null;
if (lastProcessedOutlineObject != null && !lastProcessedOutlineObject.isFound()) {
lastProcessedOutlineObject.setPoint(new Point2D.Float(0, cropbox.getHeight()));
notFoundOutlineObject = lastProcessedOutlineObject;
}
if (!outlineObjects.isEmpty()) {
classificationPage.setOutlineObjects(outlineObjects);
blockificationPostprocessingService.sanitizeOutlineBlocks(classificationPage);
lastProcessedOutlineObject = blockificationPostprocessingService.sanitizeOutlineBlocks(classificationPage, notFoundOutlineObject);
}
// MarkedContent needs to be converted at this point, otherwise it leads to GC Problems in Pdfbox.

View File

@ -3,6 +3,7 @@ package com.knecon.fforesight.service.layoutparser.processor.model;
import java.util.ArrayList;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObjectTree;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContents;
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;

View File

@ -5,16 +5,27 @@ import java.awt.geom.Point2D;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.RequiredArgsConstructor;
@Data
@NoArgsConstructor
@RequiredArgsConstructor
@AllArgsConstructor
public class OutlineObject {
private String title;
private int pageNumber;
private final String title;
private final int pageNumber;
private Point2D point;
private int treeDepth;
private final int treeDepth;
private boolean found = false;
public OutlineObject(String title, int pageNumber, Point2D point2D, int depth) {
this(title, pageNumber, depth);
this.point = point2D;
}
@Override
public String toString() {

View File

@ -80,7 +80,10 @@ public class TextPageBlock extends AbstractPageBlock {
public static TextPageBlock merge(List<TextPageBlock> textBlocksToMerge) {
List<TextPositionSequence> sequences = textBlocksToMerge.stream().map(TextPageBlock::getSequences).flatMap(java.util.Collection::stream).toList();
List<TextPositionSequence> sequences = textBlocksToMerge.stream()
.map(TextPageBlock::getSequences)
.flatMap(java.util.Collection::stream)
.toList();
sequences = new ArrayList<>(sequences);
return fromTextPositionSequences(sequences);
}
@ -106,11 +109,11 @@ public class TextPageBlock extends AbstractPageBlock {
if (textBlock == null) {
textBlock = new TextPageBlock(wordBlock.getMinXDirAdj(),
wordBlock.getMaxXDirAdj(),
wordBlock.getMinYDirAdj(),
wordBlock.getMaxYDirAdj(),
wordBlockList,
wordBlock.getRotation());
wordBlock.getMaxXDirAdj(),
wordBlock.getMinYDirAdj(),
wordBlock.getMaxYDirAdj(),
wordBlockList,
wordBlock.getRotation());
} else {
TextPageBlock spatialEntity = textBlock.union(wordBlock);
textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight());
@ -126,11 +129,12 @@ public class TextPageBlock extends AbstractPageBlock {
textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest());
}
if (textBlock != null && textBlock.getSequences() != null && textBlock.getSequences()
.stream()
.map(t -> DoubleComparisons.round(t.getMinYDirAdj(), 3))
.collect(toSet())
.size() == 1) {
if (textBlock != null
&& textBlock.getSequences() != null
&& textBlock.getSequences()
.stream()
.map(t -> DoubleComparisons.round(t.getMinYDirAdj(), 3))
.collect(toSet()).size() == 1) {
textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj));
}
return textBlock;
@ -290,18 +294,7 @@ public class TextPageBlock extends AbstractPageBlock {
public void add(TextPositionSequence r) {
if (r.getMinXDirAdj() < minX) {
minX = r.getMinXDirAdj();
}
if (r.getMaxXDirAdj() > maxX) {
maxX = r.getMaxXDirAdj();
}
if (r.getMinYDirAdj() < minY) {
minY = r.getMinYDirAdj();
}
if (r.getMaxYDirAdj() > maxY) {
maxY = r.getMaxYDirAdj();
}
setCoordinates(r);
}
@ -317,6 +310,33 @@ public class TextPageBlock extends AbstractPageBlock {
}
/**
 * Recomputes this block's bounding box from scratch so that it encloses all current {@code sequences}.
 * <p>
 * The bounds are first reset to an inverted box (minimum at the largest float, maximum at the most negative
 * float) and then widened once per sequence via {@code setCoordinates}. If {@code sequences} is empty the box
 * stays inverted.
 */
public void resize() {

    minX = Float.MAX_VALUE;
    minY = Float.MAX_VALUE;
    // Float.MIN_VALUE is the smallest POSITIVE float (~1.4e-45), not the most negative one. Seeding the maximum
    // with it would keep maxX/maxY stuck just above zero for sequences with non-positive coordinates.
    maxX = -Float.MAX_VALUE;
    maxY = -Float.MAX_VALUE;
    sequences.forEach(this::setCoordinates);
}
/**
 * Widens this block's bounding box so that it also covers the given sequence.
 * A bound is only overwritten when the sequence lies strictly outside of it; otherwise it is left untouched.
 *
 * @param sequence the sequence whose {@code get{Min,Max}{X,Y}DirAdj} values are merged into the bounds
 */
private void setCoordinates(TextPositionSequence sequence) {

    var sequenceMinX = sequence.getMinXDirAdj();
    if (sequenceMinX < minX) {
        minX = sequenceMinX;
    }

    var sequenceMaxX = sequence.getMaxXDirAdj();
    if (sequenceMaxX > maxX) {
        maxX = sequenceMaxX;
    }

    var sequenceMinY = sequence.getMinYDirAdj();
    if (sequenceMinY < minY) {
        minY = sequenceMinY;
    }

    var sequenceMaxY = sequence.getMaxYDirAdj();
    if (sequenceMaxY > maxY) {
        maxY = sequenceMaxY;
    }
}
public void set(float x1, float y1, float x2, float y2) {
this.minX = Math.min(x1, x2);

View File

@ -37,50 +37,129 @@ public class BlockificationPostprocessingService {
.collect(RectangleTransformations.collectBBox());
public void sanitizeOutlineBlocks(ClassificationPage classificationPage) {
public OutlineObject sanitizeOutlineBlocks(ClassificationPage classificationPage, OutlineObject notFoundOutlineObject) {
List<OutlineObject> outlineObjects = classificationPage.getOutlineObjects();
List<TextPageBlock> textBlocks = classificationPage.getTextBlocks()
.stream()
.filter(block -> block instanceof TextPageBlock)
.toList()
.stream()
.map(block -> (TextPageBlock) block)
.toList();
if (textBlocks.isEmpty() || outlineObjects.isEmpty()) {
return;
if (getTextPageBlocks(classificationPage).isEmpty() || outlineObjects.isEmpty()) {
return null;
}
float pageHeight = classificationPage.getPageHeight();
for (OutlineObject outlineObject : outlineObjects) {
ListIterator<OutlineObject> outlineObjectListIterator = outlineObjects.listIterator();
OutlineProcessionContext context = new OutlineProcessionContext(outlineObject);
if (notFoundOutlineObject != null) {
OutlineProcessionContext notFoundOutlineObjectProcessionContext = new OutlineProcessionContext(notFoundOutlineObject);
processTextBlocks(getTextPageBlocks(classificationPage), pageHeight, notFoundOutlineObjectProcessionContext);
ListIterator<TextPageBlock> iterator = textBlocks.listIterator();
while (iterator.hasNext()) {
TextPageBlock pageBlock = iterator.next();
if (pageHeight - outlineObject.getPoint().getY() - BLOCK_TO_OUTLINE_DISTANCE_THRESHOLD <= pageBlock.getMaxY()) {
break;
}
OutlineObject firstOutlineObject = null;
OutlineProcessionContext firstOutlineObjectProcessionContext = null;
if (outlineObjectListIterator.hasNext()) {
firstOutlineObject = outlineObjectListIterator.next();
firstOutlineObjectProcessionContext = new OutlineProcessionContext(firstOutlineObject);
processTextBlocks(getTextPageBlocks(classificationPage), pageHeight, firstOutlineObjectProcessionContext);
}
if (iterator.hasPrevious()) {
iterator.previous();
}
boolean earlyStop = false;
while (iterator.hasNext() && !earlyStop) {
TextPageBlock pageBlock = iterator.next();
earlyStop = processOutlineForTextBlock(pageBlock, context);
}
selectMatch(classificationPage, context);
if (!contextsOverlap(notFoundOutlineObjectProcessionContext, firstOutlineObjectProcessionContext)) {
notFoundOutlineObject.setFound(selectMatch(classificationPage, notFoundOutlineObjectProcessionContext));
}
if (firstOutlineObject != null) {
// re-create the context for the updated blocks
firstOutlineObjectProcessionContext = new OutlineProcessionContext(firstOutlineObject);
processTextBlocks(getTextPageBlocks(classificationPage), pageHeight, firstOutlineObjectProcessionContext);
firstOutlineObject.setFound(selectMatch(classificationPage, firstOutlineObjectProcessionContext));
}
}
outlineObjectListIterator.forEachRemaining(outlineObject -> {
OutlineProcessionContext outlineObjectProcessionContext = new OutlineProcessionContext(outlineObject);
processTextBlocks(getTextPageBlocks(classificationPage), pageHeight, outlineObjectProcessionContext);
outlineObject.setFound(selectMatch(classificationPage, outlineObjectProcessionContext));
});
if (!outlineObjects.isEmpty()) {
return outlineObjects.get(outlineObjects.size() - 1);
} else {
return notFoundOutlineObject;
}
}
private void selectMatch(ClassificationPage classificationPage, OutlineProcessionContext context) {
/**
 * Returns the page's text blocks that are {@link TextPageBlock} instances, in their original order.
 * Blocks of any other type are left out. The result is the unmodifiable list produced by
 * {@code Stream.toList()}.
 *
 * @param classificationPage the page whose text blocks are filtered
 * @return all {@link TextPageBlock}s of the page
 */
private static List<TextPageBlock> getTextPageBlocks(ClassificationPage classificationPage) {

    return classificationPage.getTextBlocks()
            .stream()
            .filter(TextPageBlock.class::isInstance)
            .map(TextPageBlock.class::cast)
            .toList();
}
/**
 * Decides whether the match candidates of a carried-over (not yet found) outline object collide with those
 * of the first outline object of the current page.
 * <p>
 * The contexts are only considered overlapping when the first outline's title starts with the not-found
 * outline's title and at least one candidate block of the not-found outline has a {@code pdfMaxY} that is
 * greater than or equal to the largest {@code pdfMaxY} among the first outline's candidate blocks. When the
 * first outline has no candidate blocks that largest value is negative infinity.
 *
 * @param notFoundOutlineObjectProcessionContext context of the outline object that was not found earlier
 * @param firstOutlineObjectProcessionContext    context of the first outline object of this page, may be {@code null}
 * @return {@code true} if the two contexts overlap as described above, {@code false} otherwise
 */
private boolean contextsOverlap(OutlineProcessionContext notFoundOutlineObjectProcessionContext, OutlineProcessionContext firstOutlineObjectProcessionContext) {

    if (firstOutlineObjectProcessionContext == null) {
        return false;
    }

    String pendingTitle = notFoundOutlineObjectProcessionContext.getOutlineObject().getTitle();
    String leadingTitle = firstOutlineObjectProcessionContext.getOutlineObject().getTitle();
    boolean titlesRelated = leadingTitle.startsWith(pendingTitle);
    if (!titlesRelated) {
        return false;
    }

    List<TextPageBlock> pendingBlocks = getAllMatchingBlocks(notFoundOutlineObjectProcessionContext);
    List<TextPageBlock> leadingBlocks = getAllMatchingBlocks(firstOutlineObjectProcessionContext);

    // largest pdfMaxY among the first outline's candidates
    double highestLeadingY = Double.NEGATIVE_INFINITY;
    for (TextPageBlock leadingBlock : leadingBlocks) {
        highestLeadingY = Math.max(highestLeadingY, leadingBlock.getPdfMaxY());
    }

    for (TextPageBlock pendingBlock : pendingBlocks) {
        if (pendingBlock.getPdfMaxY() >= highestLeadingY) {
            return true;
        }
    }
    return false;
}
/**
 * Collects every block the given context currently holds as a match candidate.
 * The order is: direct match (if set), split candidate (if set), then all merge candidates.
 *
 * @param context the procession context to read the candidates from
 * @return a new mutable list with the candidate blocks
 */
private List<TextPageBlock> getAllMatchingBlocks(OutlineProcessionContext context) {

    List<TextPageBlock> collected = new ArrayList<>();
    TextPageBlock[] singleCandidates = {context.getDirectMatch(), context.getSplitCandidate()};
    for (TextPageBlock candidate : singleCandidates) {
        if (candidate != null) {
            collected.add(candidate);
        }
    }
    collected.addAll(context.getMergeCandidates());
    return collected;
}
/**
 * Feeds the text blocks of a page into {@code processOutlineForTextBlock} for the context's outline object.
 * <p>
 * First the list is scanned for the first block whose {@code maxY} is at least
 * {@code pageHeight - outlinePoint.y - BLOCK_TO_OUTLINE_DISTANCE_THRESHOLD}. Processing then starts at that
 * block (or at the last block when no block satisfies the condition) and continues in list order until
 * {@code processOutlineForTextBlock} returns {@code true} or the list is exhausted.
 *
 * @param textBlocks the blocks of the page, in the order they should be visited
 * @param pageHeight the height of the page
 * @param context    the procession context that carries the outline object
 */
private void processTextBlocks(List<TextPageBlock> textBlocks, float pageHeight, OutlineProcessionContext context) {

    OutlineObject target = context.getOutlineObject();
    ListIterator<TextPageBlock> cursor = textBlocks.listIterator();

    // seek forward to the first block that reaches the outline position
    boolean reachedOutline = false;
    while (!reachedOutline && cursor.hasNext()) {
        TextPageBlock candidate = cursor.next();
        reachedOutline = pageHeight - target.getPoint().getY() - BLOCK_TO_OUTLINE_DISTANCE_THRESHOLD <= candidate.getMaxY();
    }

    // step back one position so the block visited last by the seek loop is processed as well
    if (cursor.hasPrevious()) {
        cursor.previous();
    }

    while (cursor.hasNext()) {
        if (processOutlineForTextBlock(cursor.next(), context)) {
            break;
        }
    }
}
private boolean selectMatch(ClassificationPage classificationPage, OutlineProcessionContext context) {
OutlineObject outlineObject = context.outlineObject;
TextPageBlock directMatch = context.directMatch;
@ -122,28 +201,39 @@ public class BlockificationPostprocessingService {
double minDistance = Math.min(distanceToDirectMatch, Math.min(distanceToSplitCandidate, distanceToBestMergeCandidates));
if (minDistance == Double.MAX_VALUE) {
return;
return false;
}
if (minDistance == distanceToDirectMatch) {
directMatch.setClassification(headlineType);
} else if (minDistance == distanceToSplitCandidate) {
List<TextPageBlock> others = splitBlock(classificationPage, splitCandidate, context.sectionIdentifier + outlineObject.getTitle());
List<TextPageBlock> others = splitBlock(classificationPage, splitCandidate, context.sectionIdentifier, outlineObject.getTitle());
splitCandidate.setClassification(headlineType);
others.forEach(other -> other.setClassification(null));
} else {
var merged = mergeBlocks(classificationPage, bestMergeCandidateCombination);
merged.setClassification(headlineType);
}
return true;
}
private List<TextPageBlock> splitBlock(ClassificationPage classificationPage, TextPageBlock blockToSplit, String text) {
private List<TextPageBlock> splitBlock(ClassificationPage classificationPage, TextPageBlock blockToSplit, SectionIdentifier sectionIdentifier, String title) {
List<TextPageBlock> otherBlocks = new ArrayList<>();
int blockToSplitIdx = classificationPage.getTextBlocks().indexOf(blockToSplit);
WordSequenceResult wordSequenceResult = findWordSequence(blockToSplit.getSequences(), text);
String headline = title;
if (!sectionIdentifier.getFormat().equals(SectionIdentifier.Format.EMPTY) && !title.startsWith(sectionIdentifier.getIdentifierString())) {
headline = sectionIdentifier + headline;
}
WordSequenceResult wordSequenceResult = findWordSequence(blockToSplit.getSequences(), headline);
if (wordSequenceResult.inSequence.isEmpty()) {
wordSequenceResult = findWordSequence(blockToSplit.getSequences(), title);
}
blockToSplit.setSequences(wordSequenceResult.inSequence);
blockToSplit.resize();
if (!wordSequenceResult.preSequence.isEmpty()) {
TextPageBlock block = buildTextBlock(wordSequenceResult.preSequence, 0);
@ -301,6 +391,7 @@ public class BlockificationPostprocessingService {
assert firstBlock != null;
firstBlock.setToDuplicate(false);
firstBlock.resize();
classificationPage.getTextBlocks().removeAll(mergedBlocks);
}
@ -378,13 +469,13 @@ public class BlockificationPostprocessingService {
if (blockTextContainsOutlineTitle) {
SectionIdentifier sectionIdentifier = SectionIdentifier.fromSearchText(blockText);
if (sectionIdentifier.getFormat() != SectionIdentifier.Format.EMPTY) {
if (sectionIdentifier.getFormat() != SectionIdentifier.Format.EMPTY && !outlineTitle.startsWith(sectionIdentifier.getIdentifierString())) {
if (blockText.startsWith(sectionIdentifier.getIdentifierString()) && blockText.endsWith(outlineTitle) && context.directMatch == null) {
context.directMatch = pageBlock;
return true;
} else if (context.splitCandidate == null) {
context.sectionIdentifier = sectionIdentifier.getIdentifierString();
context.sectionIdentifier = sectionIdentifier;
}
}
if (context.splitCandidate == null) {
@ -408,7 +499,7 @@ public class BlockificationPostprocessingService {
private OutlineObject outlineObject;
private List<TextPageBlock> mergeCandidates;
private TextPageBlock splitCandidate;
private String sectionIdentifier;
private SectionIdentifier sectionIdentifier;
public OutlineProcessionContext(OutlineObject outlineObject) {
@ -417,7 +508,7 @@ public class BlockificationPostprocessingService {
this.directMatch = null;
this.mergeCandidates = new ArrayList<>();
this.splitCandidate = null;
this.sectionIdentifier = "";
this.sectionIdentifier = SectionIdentifier.empty();
}
}

View File

@ -58,18 +58,20 @@ public class DocstrumBlockificationService {
zones.forEach(zone -> {
List<TextPositionSequence> textPositionSequences = new ArrayList<>();
zone.getLines().forEach(line -> {
line.getWords().forEach(word -> {
textPositionSequences.add(new TextPositionSequence(word.getTextPositions(), word.getPage()));
});
});
zone.getLines()
.forEach(line -> {
line.getWords()
.forEach(word -> {
textPositionSequences.add(new TextPositionSequence(word.getTextPositions(), word.getPage()));
});
});
abstractPageBlocks.addAll(splitZonesAtRulings(textPositionSequences, horizontalRulings, verticalRulings));
});
if (xyOrder) {
abstractPageBlocks.sort(Comparator.comparing(AbstractPageBlock::getMinY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
.thenComparing(AbstractPageBlock::getMinX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
.thenComparing(AbstractPageBlock::getMinX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
abstractPageBlocks.sort(new Comparator<AbstractPageBlock>() {
@Override
public int compare(AbstractPageBlock o1, AbstractPageBlock o2) {
@ -90,7 +92,7 @@ public class DocstrumBlockificationService {
while (itty.hasNext()) {
AbstractPageBlock block = itty.next();
if (block instanceof TablePageBlock || previous.isHeadline()) {
if (block instanceof TablePageBlock) {
previous = new TextPageBlock();
continue;
}
@ -98,11 +100,21 @@ public class DocstrumBlockificationService {
if (previous != null && !previous.getSequences().isEmpty()) {
if (current.getDir() != previous.getDir() || current.isHeadline()) {
if (current.getDir() != previous.getDir()) {
previous = current;
continue;
}
if (current.isHeadline() || previous.isHeadline()) {
if (intersectsYWithPreviousHavingMaxOneLine(previous, current, page)) {
previous = combineBlocksAndResetIterator(previous, current, itty, false);
} else {
previous = current;
}
continue;
}
if (areTheOnlyTwoBlocksOnHeightsWithBothMoreThanTwoLines(previous, current, page)) {
previous = combineBlocksAndResetIterator(previous, current, itty, true);
continue;
@ -134,8 +146,8 @@ public class DocstrumBlockificationService {
private boolean isOnlyIntersectingYAndOnelineOrPrevoiusTwoLines(TextPageBlock previous, TextPageBlock current, ClassificationPage page) {
return current.intersectsY(previous) //
&& (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1 || previous.getNumberOfLines() == 2 && current.getNumberOfLines() == 1) //
&& numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, page.getTextBlocks()) <= 0;
&& (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1 || previous.getNumberOfLines() == 2 && current.getNumberOfLines() == 1) //
&& numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, page.getTextBlocks()) <= 0;
}
@ -144,16 +156,23 @@ public class DocstrumBlockificationService {
ClassificationPage page) {
return (Math.abs(previous.getMaxY() - current.getMaxY()) < THRESHOLD || Math.abs(previous.getMinY() - current.getMinY()) < THRESHOLD) //
&& (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1 || previous.getNumberOfLines() >= 2 && current.getNumberOfLines() == 1) //
&& !hasBetween(current, previous, page.getTextBlocks()) && numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, page.getTextBlocks()) <= 4;
&& (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1 || previous.getNumberOfLines() >= 2 && current.getNumberOfLines() == 1) //
&& !hasBetween(current, previous, page.getTextBlocks()) && numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, page.getTextBlocks()) <= 4;
}
/**
 * Checks whether {@code previous.intersectsY(current)} holds, {@code previous} consists of exactly one line
 * and {@code current} has at least one line.
 *
 * @param previous the block that precedes {@code current}
 * @param current  the block currently being looked at
 * @param page     not consulted by this check
 * @return {@code true} if all three conditions hold
 */
private boolean intersectsYWithPreviousHavingMaxOneLine(TextPageBlock previous, TextPageBlock current, ClassificationPage page) {

    if (!previous.intersectsY(current)) {
        return false;
    }
    if (previous.getNumberOfLines() != 1) {
        return false;
    }
    return current.getNumberOfLines() >= 1;
}
private boolean areTheOnlyTwoBlocksOnHeightsWithBothMoreThanTwoLines(TextPageBlock previous, TextPageBlock current, ClassificationPage page) {
return previous.getNumberOfLines() >= 2 && current.getNumberOfLines() >= 2 //
&& previous.intersectsY(current) //
&& numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, page.getTextBlocks()) == 0;
&& previous.intersectsY(current) //
&& numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, page.getTextBlocks()) == 0;
}
@ -162,7 +181,7 @@ public class DocstrumBlockificationService {
previous.getSequences().addAll(current.getSequences());
previous = buildTextBlock(previous.getSequences(), 0);
previous.setToDuplicate(toDuplicate);
if(current.getClassification() != null && previous.getClassification() == null) {
if (current.getClassification() != null && previous.getClassification() == null) {
previous.setClassification(current.getClassification());
}
itty.remove();
@ -216,14 +235,14 @@ public class DocstrumBlockificationService {
ListIterator<AbstractPageBlock> itty = blocks.listIterator();
while (itty.hasNext()) {
AbstractPageBlock block = itty.next();
if(block == null){
if (block == null) {
continue;
}
if (block instanceof TablePageBlock) {
continue;
}
if(block.getClassification() != null && block.getClassification().isHeadline()) {
if (block.getClassification() != null && block.getClassification().isHeadline()) {
continue;
}
@ -232,7 +251,7 @@ public class DocstrumBlockificationService {
for (int i = 0; i < blocks.size(); i++) {
AbstractPageBlock abstractPageBlock = blocks.get(i);
if(abstractPageBlock == null){
if (abstractPageBlock == null) {
continue;
}
if (abstractPageBlock == current) {
@ -242,13 +261,12 @@ public class DocstrumBlockificationService {
continue;
}
if(abstractPageBlock.getClassification() != null && abstractPageBlock.getClassification().isHeadline()) {
if (abstractPageBlock.getClassification() != null && abstractPageBlock.getClassification().isHeadline()) {
continue;
}
TextPageBlock inner = (TextPageBlock) abstractPageBlock;
if (current.getDir() == inner.getDir() && current.almostIntersects(inner, yThreshold, xThreshold)) {
boolean toDuplicate = current.isToDuplicate() || inner.isToDuplicate();
@ -262,8 +280,8 @@ public class DocstrumBlockificationService {
}
}
var blocksIterator = blocks.iterator();
while(blocksIterator.hasNext()){
if(blocksIterator.next() == null){
while (blocksIterator.hasNext()) {
if (blocksIterator.next() == null) {
blocksIterator.remove();
}
}
@ -351,11 +369,11 @@ public class DocstrumBlockificationService {
if (textBlock == null) {
textBlock = new TextPageBlock(wordBlock.getMinXDirAdj(),
wordBlock.getMaxXDirAdj(),
wordBlock.getMinYDirAdj(),
wordBlock.getMaxYDirAdj(),
wordBlockList,
wordBlock.getRotation());
wordBlock.getMaxXDirAdj(),
wordBlock.getMinYDirAdj(),
wordBlock.getMaxYDirAdj(),
wordBlockList,
wordBlock.getRotation());
} else {
TextPageBlock spatialEntity = textBlock.union(wordBlock);
textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight());
@ -371,7 +389,12 @@ public class DocstrumBlockificationService {
textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest());
}
if (textBlock != null && textBlock.getSequences() != null && textBlock.getSequences().stream().map(t -> round(t.getMinYDirAdj(), 3)).collect(toSet()).size() == 1) {
if (textBlock != null
&& textBlock.getSequences() != null
&& textBlock.getSequences()
.stream()
.map(t -> round(t.getMinYDirAdj(), 3))
.collect(toSet()).size() == 1) {
textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj));
}
return textBlock;
@ -386,38 +409,34 @@ public class DocstrumBlockificationService {
List<Ruling> horizontalRulingLines,
List<Ruling> verticalRulingLines) {
return isSplitByRuling(maxX,
minY,
word.getMinXDirAdj(),
word.getMinYDirAdj(),
verticalRulingLines,
word.getDir().getDegrees(),
word.getPageWidth(),
word.getPageHeight()) //
|| isSplitByRuling(minX,
minY,
word.getMinXDirAdj(),
word.getMaxYDirAdj(),
horizontalRulingLines,
word.getDir().getDegrees(),
word.getPageWidth(),
word.getPageHeight()) //
|| isSplitByRuling(maxX,
minY,
word.getMinXDirAdj(),
word.getMinYDirAdj(),
horizontalRulingLines,
word.getDir().getDegrees(),
word.getPageWidth(),
word.getPageHeight()) //
|| isSplitByRuling(minX,
minY,
word.getMinXDirAdj(),
word.getMaxYDirAdj(),
verticalRulingLines,
word.getDir().getDegrees(),
word.getPageWidth(),
word.getPageHeight());
return isSplitByRuling(maxX, minY, word.getMinXDirAdj(), word.getMinYDirAdj(), verticalRulingLines, word.getDir().getDegrees(), word.getPageWidth(), word.getPageHeight())
//
|| isSplitByRuling(minX,
minY,
word.getMinXDirAdj(),
word.getMaxYDirAdj(),
horizontalRulingLines,
word.getDir().getDegrees(),
word.getPageWidth(),
word.getPageHeight())
//
|| isSplitByRuling(maxX,
minY,
word.getMinXDirAdj(),
word.getMinYDirAdj(),
horizontalRulingLines,
word.getDir().getDegrees(),
word.getPageWidth(),
word.getPageHeight())
//
|| isSplitByRuling(minX,
minY,
word.getMinXDirAdj(),
word.getMaxYDirAdj(),
verticalRulingLines,
word.getDir().getDegrees(),
word.getPageWidth(),
word.getPageHeight());
}

View File

@ -82,13 +82,15 @@ public class ViewerDocumentTest extends BuildDocumentTest {
@SneakyThrows
public void testViewerDocument() {
//String fileName = "files/documine/21_TiltPlus_MutacaoGenicaEmCelulasBacterianas.pdf";//fail here
//String fileName = "files/new/UTT-Books-53.pdf";
String fileName = "files/documine/A21924A - Acute Oral Toxicity - Rats.pdf";
//String fileName = "files/documine/SOLICITA_VICTRATO-GOLD-II_Item 21_Mutacao_Genica.pdf";
//String fileName = "files/documine/Study Document 1 - Acute Eye IrritationCorrosion - Rabbits.pdf";
//String fileName = "files/documine/Study Document 3 - Acute Eye IrritationCorrosion - Rabbits.pdf";
//String fileName = "files/documine/VV-547521_Irritação_Ocular_in_Vivo.pdf";
//String fileName = "files/documine/SOLICITA_VICTRATO-GOLD-II_Item 21_Mutacao_Genica.pdf";
//String fileName = "files/new/UTT-Books-53.pdf";
//String fileName = "files/documine/A21924A - Acute Oral Toxicity - Rats.pdf";
//String fileName = "files/documine/A16361B - Acute Dermal Irritation Toxicity Study in Rabbits.pdf";
//String fileName = "files/documine/ITEM 20_Sensibilização cutânea.pdf";
//String fileName = "files/documine/VV-547523_LLNA.pdf";
//String fileName = "files/new/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
@ -96,7 +98,7 @@ public class ViewerDocumentTest extends BuildDocumentTest {
//String fileName = "files/new/abschlussarbeiten-template-institut-fur-informatik-padagogische-hochschule-karlsruhe.pdf";
//String fileName = "files/new/kaust-official-thesis-template.pdf";
//String fileName = "files/new/$100m Offers.pdf";
//String fileName = "files/new/18-Curacron_ToxicidadeOcularInVitro.pdf";
String fileName = "files/new/18-Curacron_ToxicidadeOcularInVitro.pdf";
//String fileName = "files/new/mistitled_outlines_example.pdf";
//String fileName = "files/bdr/Plenarprotokoll 1 (keine Druchsache!) (1) 1.pdf";
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";