RED-7074: Design Subsection section tree structure algorithm

* improved merging of headlines as well as splitting logic so that more headlines are detected correctly
This commit is contained in:
maverickstuder 2024-05-14 17:41:44 +02:00
parent 2fcaeb3d8c
commit 1856fed640
7 changed files with 294 additions and 143 deletions

View File

@ -2,6 +2,7 @@ package com.knecon.fforesight.service.layoutparser.processor;
import static java.lang.String.format; import static java.lang.String.format;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D; import java.awt.geom.Rectangle2D;
import java.io.File; import java.io.File;
import java.io.IOException; import java.io.IOException;
@ -210,7 +211,7 @@ public class LayoutParsingPipeline {
private String buildSemanticNodeCountMessage(int numberOfPages, Map<NodeType, Long> semanticNodeCounts) { private String buildSemanticNodeCountMessage(int numberOfPages, Map<NodeType, Long> semanticNodeCounts) {
return String.format("%d pages with %d sections, %d headlines, %d paragraphs, %d tables with %d cells, %d headers, and %d footers parsed", return format("%d pages with %d sections, %d headlines, %d paragraphs, %d tables with %d cells, %d headers, and %d footers parsed",
numberOfPages, numberOfPages,
semanticNodeCounts.get(NodeType.SECTION) == null ? 0 : semanticNodeCounts.get(NodeType.SECTION), semanticNodeCounts.get(NodeType.SECTION) == null ? 0 : semanticNodeCounts.get(NodeType.SECTION),
semanticNodeCounts.get(NodeType.HEADLINE) == null ? 0 : semanticNodeCounts.get(NodeType.HEADLINE), semanticNodeCounts.get(NodeType.HEADLINE) == null ? 0 : semanticNodeCounts.get(NodeType.HEADLINE),
@ -239,6 +240,7 @@ public class LayoutParsingPipeline {
Map<Integer, List<ClassifiedImage>> signatures = visualLayoutParsingAdapter.buildExtractedSignaturesPerPage(visualLayoutParsingResponse); Map<Integer, List<ClassifiedImage>> signatures = visualLayoutParsingAdapter.buildExtractedSignaturesPerPage(visualLayoutParsingResponse);
ClassificationDocument classificationDocument = new ClassificationDocument(); ClassificationDocument classificationDocument = new ClassificationDocument();
List<ClassificationPage> classificationPages = new ArrayList<>(); List<ClassificationPage> classificationPages = new ArrayList<>();
OutlineObject lastProcessedOutlineObject = null;
// parsing the structure elements could be useful as well // parsing the structure elements could be useful as well
classificationDocument.setOutlineObjectTree(outlineExtractorService.getOutlineObjectTree(originDocument)); classificationDocument.setOutlineObjectTree(outlineExtractorService.getOutlineObjectTree(originDocument));
@ -307,11 +309,16 @@ public class LayoutParsingPipeline {
classificationPage.setPageWidth(cropbox.getWidth()); classificationPage.setPageWidth(cropbox.getWidth());
classificationPage.setPageHeight(cropbox.getHeight()); classificationPage.setPageHeight(cropbox.getHeight());
List<OutlineObject> outlineObjects = classificationDocument.getOutlineObjectTree().getOutlineObjectsPerPage() List<OutlineObject> outlineObjects = classificationDocument.getOutlineObjectTree().getOutlineObjectsPerPage().getOrDefault(pageNumber - 1, new ArrayList<>());
.get(pageNumber - 1);
if (outlineObjects != null) { OutlineObject notFoundOutlineObject = null;
if (lastProcessedOutlineObject != null && !lastProcessedOutlineObject.isFound()) {
lastProcessedOutlineObject.setPoint(new Point2D.Float(0, cropbox.getHeight()));
notFoundOutlineObject = lastProcessedOutlineObject;
}
if (!outlineObjects.isEmpty()) {
classificationPage.setOutlineObjects(outlineObjects); classificationPage.setOutlineObjects(outlineObjects);
blockificationPostprocessingService.sanitizeOutlineBlocks(classificationPage); lastProcessedOutlineObject = blockificationPostprocessingService.sanitizeOutlineBlocks(classificationPage, notFoundOutlineObject);
} }
// MarkedContent needs to be converted at this point, otherwise it leads to GC Problems in Pdfbox. // MarkedContent needs to be converted at this point, otherwise it leads to GC Problems in Pdfbox.

View File

@ -3,6 +3,7 @@ package com.knecon.fforesight.service.layoutparser.processor.model;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObjectTree; import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObjectTree;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContents; import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContents;
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter; import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;

View File

@ -5,16 +5,27 @@ import java.awt.geom.Point2D;
import lombok.AllArgsConstructor; import lombok.AllArgsConstructor;
import lombok.Data; import lombok.Data;
import lombok.NoArgsConstructor; import lombok.NoArgsConstructor;
import lombok.RequiredArgsConstructor;
@Data @Data
@NoArgsConstructor @RequiredArgsConstructor
@AllArgsConstructor @AllArgsConstructor
public class OutlineObject { public class OutlineObject {
private String title; private final String title;
private int pageNumber; private final int pageNumber;
private Point2D point; private Point2D point;
private int treeDepth; private final int treeDepth;
private boolean found = false;
/**
 * Creates an outline object with a known target point.
 *
 * @param title      title of the outline entry
 * @param pageNumber page number the outline entry refers to
 * @param point2D    point the outline entry refers to; stored in {@code point}
 * @param depth      depth of the entry in the outline tree; passed on as {@code treeDepth}
 */
public OutlineObject(String title, int pageNumber, Point2D point2D, int depth) {
// Delegates to the (title, pageNumber, treeDepth) constructor, presumably the one
// Lombok generates for the final fields via @RequiredArgsConstructor.
this(title, pageNumber, depth);
// point is the only non-final field set here; found keeps its default of false.
this.point = point2D;
}
@Override @Override
public String toString() { public String toString() {

View File

@ -80,7 +80,10 @@ public class TextPageBlock extends AbstractPageBlock {
public static TextPageBlock merge(List<TextPageBlock> textBlocksToMerge) { public static TextPageBlock merge(List<TextPageBlock> textBlocksToMerge) {
List<TextPositionSequence> sequences = textBlocksToMerge.stream().map(TextPageBlock::getSequences).flatMap(java.util.Collection::stream).toList(); List<TextPositionSequence> sequences = textBlocksToMerge.stream()
.map(TextPageBlock::getSequences)
.flatMap(java.util.Collection::stream)
.toList();
sequences = new ArrayList<>(sequences); sequences = new ArrayList<>(sequences);
return fromTextPositionSequences(sequences); return fromTextPositionSequences(sequences);
} }
@ -126,11 +129,12 @@ public class TextPageBlock extends AbstractPageBlock {
textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest()); textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest());
} }
if (textBlock != null && textBlock.getSequences() != null && textBlock.getSequences() if (textBlock != null
&& textBlock.getSequences() != null
&& textBlock.getSequences()
.stream() .stream()
.map(t -> DoubleComparisons.round(t.getMinYDirAdj(), 3)) .map(t -> DoubleComparisons.round(t.getMinYDirAdj(), 3))
.collect(toSet()) .collect(toSet()).size() == 1) {
.size() == 1) {
textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj)); textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj));
} }
return textBlock; return textBlock;
@ -290,18 +294,7 @@ public class TextPageBlock extends AbstractPageBlock {
public void add(TextPositionSequence r) { public void add(TextPositionSequence r) {
if (r.getMinXDirAdj() < minX) { setCoordinates(r);
minX = r.getMinXDirAdj();
}
if (r.getMaxXDirAdj() > maxX) {
maxX = r.getMaxXDirAdj();
}
if (r.getMinYDirAdj() < minY) {
minY = r.getMinYDirAdj();
}
if (r.getMaxYDirAdj() > maxY) {
maxY = r.getMaxYDirAdj();
}
} }
@ -317,6 +310,33 @@ public class TextPageBlock extends AbstractPageBlock {
} }
/**
 * Recomputes the bounding box ({@code minX}, {@code minY}, {@code maxX}, {@code maxY}) of this
 * block from its current sequences.
 * <p>
 * The extents are first reset to "empty" sentinels and then widened by every sequence. If the
 * block holds no sequences, the sentinels are left in place.
 */
public void resize() {

    minX = Float.MAX_VALUE;
    minY = Float.MAX_VALUE;
    // Float.MIN_VALUE is the smallest POSITIVE float, not the most negative one. Using it as the
    // seed for a maximum would keep a wrong maxX/maxY whenever all coordinates are <= 0.
    maxX = -Float.MAX_VALUE;
    maxY = -Float.MAX_VALUE;
    sequences.forEach(this::setCoordinates);
}
/**
 * Widens the bounding box of this block so that it also covers the given sequence.
 * Each of the four extents is only replaced when the sequence lies strictly outside of it.
 *
 * @param sequence sequence whose direction-adjusted extents are folded into the box
 */
private void setCoordinates(TextPositionSequence sequence) {

    // Horizontal extent.
    minX = sequence.getMinXDirAdj() < minX ? sequence.getMinXDirAdj() : minX;
    maxX = sequence.getMaxXDirAdj() > maxX ? sequence.getMaxXDirAdj() : maxX;

    // Vertical extent.
    minY = sequence.getMinYDirAdj() < minY ? sequence.getMinYDirAdj() : minY;
    maxY = sequence.getMaxYDirAdj() > maxY ? sequence.getMaxYDirAdj() : maxY;
}
public void set(float x1, float y1, float x2, float y2) { public void set(float x1, float y1, float x2, float y2) {
this.minX = Math.min(x1, x2); this.minX = Math.min(x1, x2);

View File

@ -37,28 +37,110 @@ public class BlockificationPostprocessingService {
.collect(RectangleTransformations.collectBBox()); .collect(RectangleTransformations.collectBBox());
public void sanitizeOutlineBlocks(ClassificationPage classificationPage) { public OutlineObject sanitizeOutlineBlocks(ClassificationPage classificationPage, OutlineObject notFoundOutlineObject) {
List<OutlineObject> outlineObjects = classificationPage.getOutlineObjects(); List<OutlineObject> outlineObjects = classificationPage.getOutlineObjects();
List<TextPageBlock> textBlocks = classificationPage.getTextBlocks() if (getTextPageBlocks(classificationPage).isEmpty() || outlineObjects.isEmpty()) {
.stream() return null;
.filter(block -> block instanceof TextPageBlock)
.toList()
.stream()
.map(block -> (TextPageBlock) block)
.toList();
if (textBlocks.isEmpty() || outlineObjects.isEmpty()) {
return;
} }
float pageHeight = classificationPage.getPageHeight(); float pageHeight = classificationPage.getPageHeight();
for (OutlineObject outlineObject : outlineObjects) { ListIterator<OutlineObject> outlineObjectListIterator = outlineObjects.listIterator();
OutlineProcessionContext context = new OutlineProcessionContext(outlineObject); if (notFoundOutlineObject != null) {
OutlineProcessionContext notFoundOutlineObjectProcessionContext = new OutlineProcessionContext(notFoundOutlineObject);
processTextBlocks(getTextPageBlocks(classificationPage), pageHeight, notFoundOutlineObjectProcessionContext);
OutlineObject firstOutlineObject = null;
OutlineProcessionContext firstOutlineObjectProcessionContext = null;
if (outlineObjectListIterator.hasNext()) {
firstOutlineObject = outlineObjectListIterator.next();
firstOutlineObjectProcessionContext = new OutlineProcessionContext(firstOutlineObject);
processTextBlocks(getTextPageBlocks(classificationPage), pageHeight, firstOutlineObjectProcessionContext);
}
if (!contextsOverlap(notFoundOutlineObjectProcessionContext, firstOutlineObjectProcessionContext)) {
notFoundOutlineObject.setFound(selectMatch(classificationPage, notFoundOutlineObjectProcessionContext));
}
if (firstOutlineObject != null) {
// re-create the context for the updated blocks
firstOutlineObjectProcessionContext = new OutlineProcessionContext(firstOutlineObject);
processTextBlocks(getTextPageBlocks(classificationPage), pageHeight, firstOutlineObjectProcessionContext);
firstOutlineObject.setFound(selectMatch(classificationPage, firstOutlineObjectProcessionContext));
}
}
outlineObjectListIterator.forEachRemaining(outlineObject -> {
OutlineProcessionContext outlineObjectProcessionContext = new OutlineProcessionContext(outlineObject);
processTextBlocks(getTextPageBlocks(classificationPage), pageHeight, outlineObjectProcessionContext);
outlineObject.setFound(selectMatch(classificationPage, outlineObjectProcessionContext));
});
if (!outlineObjects.isEmpty()) {
return outlineObjects.get(outlineObjects.size() - 1);
} else {
return notFoundOutlineObject;
}
}
/**
 * Returns the blocks of the given page that are {@link TextPageBlock}s, in page order.
 * Blocks of any other type are skipped.
 *
 * @param classificationPage page whose text blocks are inspected
 * @return the page's {@code TextPageBlock}s as produced by {@code Stream.toList()}
 */
private static List<TextPageBlock> getTextPageBlocks(ClassificationPage classificationPage) {

    return classificationPage.getTextBlocks()
            .stream()
            .filter(TextPageBlock.class::isInstance)
            .map(TextPageBlock.class::cast)
            .toList();
}
/**
 * Decides whether the candidates collected for a carried-over (not yet found) outline object
 * overlap with those collected for the first outline object of the current page.
 * <p>
 * The contexts are only considered overlapping when the first title starts with the not-found
 * title and at least one block matched for the not-found outline has a {@code pdfMaxY} that is
 * greater than or equal to the largest {@code pdfMaxY} among the blocks matched for the first
 * outline.
 *
 * @param notFoundOutlineObjectProcessionContext context of the carried-over outline object
 * @param firstOutlineObjectProcessionContext    context of the page's first outline object, may be {@code null}
 * @return {@code true} if the two contexts overlap as described above, {@code false} otherwise
 */
private boolean contextsOverlap(OutlineProcessionContext notFoundOutlineObjectProcessionContext, OutlineProcessionContext firstOutlineObjectProcessionContext) {

    if (firstOutlineObjectProcessionContext == null) {
        return false;
    }

    String carriedOverTitle = notFoundOutlineObjectProcessionContext.getOutlineObject().getTitle();
    String currentTitle = firstOutlineObjectProcessionContext.getOutlineObject().getTitle();
    if (!currentTitle.startsWith(carriedOverTitle)) {
        return false;
    }

    List<TextPageBlock> carriedOverBlocks = getAllMatchingBlocks(notFoundOutlineObjectProcessionContext);
    List<TextPageBlock> currentBlocks = getAllMatchingBlocks(firstOutlineObjectProcessionContext);

    // Largest pdfMaxY among the first outline's blocks; negative infinity when it has none.
    double highestOfCurrent = Double.NEGATIVE_INFINITY;
    for (TextPageBlock block : currentBlocks) {
        highestOfCurrent = Math.max(highestOfCurrent, block.getPdfMaxY());
    }

    for (TextPageBlock block : carriedOverBlocks) {
        if (block.getPdfMaxY() >= highestOfCurrent) {
            return true;
        }
    }
    return false;
}
/**
 * Gathers every block a procession context currently refers to.
 *
 * @param context context whose candidates are collected
 * @return a new list holding the direct match (if any), then the split candidate (if any),
 *         followed by all merge candidates
 */
private List<TextPageBlock> getAllMatchingBlocks(OutlineProcessionContext context) {

    List<TextPageBlock> matches = new ArrayList<>();

    // The two single-block candidates are optional; keep only the ones that are set.
    TextPageBlock[] singleCandidates = {context.getDirectMatch(), context.getSplitCandidate()};
    for (TextPageBlock candidate : singleCandidates) {
        if (candidate != null) {
            matches.add(candidate);
        }
    }

    matches.addAll(context.getMergeCandidates());
    return matches;
}
private void processTextBlocks(List<TextPageBlock> textBlocks, float pageHeight, OutlineProcessionContext context) {
OutlineObject outlineObject = context.getOutlineObject();
ListIterator<TextPageBlock> iterator = textBlocks.listIterator(); ListIterator<TextPageBlock> iterator = textBlocks.listIterator();
while (iterator.hasNext()) { while (iterator.hasNext()) {
TextPageBlock pageBlock = iterator.next(); TextPageBlock pageBlock = iterator.next();
@ -74,13 +156,10 @@ public class BlockificationPostprocessingService {
TextPageBlock pageBlock = iterator.next(); TextPageBlock pageBlock = iterator.next();
earlyStop = processOutlineForTextBlock(pageBlock, context); earlyStop = processOutlineForTextBlock(pageBlock, context);
} }
selectMatch(classificationPage, context);
}
} }
private void selectMatch(ClassificationPage classificationPage, OutlineProcessionContext context) { private boolean selectMatch(ClassificationPage classificationPage, OutlineProcessionContext context) {
OutlineObject outlineObject = context.outlineObject; OutlineObject outlineObject = context.outlineObject;
TextPageBlock directMatch = context.directMatch; TextPageBlock directMatch = context.directMatch;
@ -122,28 +201,39 @@ public class BlockificationPostprocessingService {
double minDistance = Math.min(distanceToDirectMatch, Math.min(distanceToSplitCandidate, distanceToBestMergeCandidates)); double minDistance = Math.min(distanceToDirectMatch, Math.min(distanceToSplitCandidate, distanceToBestMergeCandidates));
if (minDistance == Double.MAX_VALUE) { if (minDistance == Double.MAX_VALUE) {
return; return false;
} }
if (minDistance == distanceToDirectMatch) { if (minDistance == distanceToDirectMatch) {
directMatch.setClassification(headlineType); directMatch.setClassification(headlineType);
} else if (minDistance == distanceToSplitCandidate) { } else if (minDistance == distanceToSplitCandidate) {
List<TextPageBlock> others = splitBlock(classificationPage, splitCandidate, context.sectionIdentifier + outlineObject.getTitle()); List<TextPageBlock> others = splitBlock(classificationPage, splitCandidate, context.sectionIdentifier, outlineObject.getTitle());
splitCandidate.setClassification(headlineType); splitCandidate.setClassification(headlineType);
others.forEach(other -> other.setClassification(null)); others.forEach(other -> other.setClassification(null));
} else { } else {
var merged = mergeBlocks(classificationPage, bestMergeCandidateCombination); var merged = mergeBlocks(classificationPage, bestMergeCandidateCombination);
merged.setClassification(headlineType); merged.setClassification(headlineType);
} }
return true;
} }
private List<TextPageBlock> splitBlock(ClassificationPage classificationPage, TextPageBlock blockToSplit, String text) { private List<TextPageBlock> splitBlock(ClassificationPage classificationPage, TextPageBlock blockToSplit, SectionIdentifier sectionIdentifier, String title) {
List<TextPageBlock> otherBlocks = new ArrayList<>(); List<TextPageBlock> otherBlocks = new ArrayList<>();
int blockToSplitIdx = classificationPage.getTextBlocks().indexOf(blockToSplit); int blockToSplitIdx = classificationPage.getTextBlocks().indexOf(blockToSplit);
WordSequenceResult wordSequenceResult = findWordSequence(blockToSplit.getSequences(), text);
String headline = title;
if (!sectionIdentifier.getFormat().equals(SectionIdentifier.Format.EMPTY) && !title.startsWith(sectionIdentifier.getIdentifierString())) {
headline = sectionIdentifier + headline;
}
WordSequenceResult wordSequenceResult = findWordSequence(blockToSplit.getSequences(), headline);
if (wordSequenceResult.inSequence.isEmpty()) {
wordSequenceResult = findWordSequence(blockToSplit.getSequences(), title);
}
blockToSplit.setSequences(wordSequenceResult.inSequence); blockToSplit.setSequences(wordSequenceResult.inSequence);
blockToSplit.resize();
if (!wordSequenceResult.preSequence.isEmpty()) { if (!wordSequenceResult.preSequence.isEmpty()) {
TextPageBlock block = buildTextBlock(wordSequenceResult.preSequence, 0); TextPageBlock block = buildTextBlock(wordSequenceResult.preSequence, 0);
@ -301,6 +391,7 @@ public class BlockificationPostprocessingService {
assert firstBlock != null; assert firstBlock != null;
firstBlock.setToDuplicate(false); firstBlock.setToDuplicate(false);
firstBlock.resize();
classificationPage.getTextBlocks().removeAll(mergedBlocks); classificationPage.getTextBlocks().removeAll(mergedBlocks);
} }
@ -378,13 +469,13 @@ public class BlockificationPostprocessingService {
if (blockTextContainsOutlineTitle) { if (blockTextContainsOutlineTitle) {
SectionIdentifier sectionIdentifier = SectionIdentifier.fromSearchText(blockText); SectionIdentifier sectionIdentifier = SectionIdentifier.fromSearchText(blockText);
if (sectionIdentifier.getFormat() != SectionIdentifier.Format.EMPTY) { if (sectionIdentifier.getFormat() != SectionIdentifier.Format.EMPTY && !outlineTitle.startsWith(sectionIdentifier.getIdentifierString())) {
if (blockText.startsWith(sectionIdentifier.getIdentifierString()) && blockText.endsWith(outlineTitle) && context.directMatch == null) { if (blockText.startsWith(sectionIdentifier.getIdentifierString()) && blockText.endsWith(outlineTitle) && context.directMatch == null) {
context.directMatch = pageBlock; context.directMatch = pageBlock;
return true; return true;
} else if (context.splitCandidate == null) { } else if (context.splitCandidate == null) {
context.sectionIdentifier = sectionIdentifier.getIdentifierString(); context.sectionIdentifier = sectionIdentifier;
} }
} }
if (context.splitCandidate == null) { if (context.splitCandidate == null) {
@ -408,7 +499,7 @@ public class BlockificationPostprocessingService {
private OutlineObject outlineObject; private OutlineObject outlineObject;
private List<TextPageBlock> mergeCandidates; private List<TextPageBlock> mergeCandidates;
private TextPageBlock splitCandidate; private TextPageBlock splitCandidate;
private String sectionIdentifier; private SectionIdentifier sectionIdentifier;
public OutlineProcessionContext(OutlineObject outlineObject) { public OutlineProcessionContext(OutlineObject outlineObject) {
@ -417,7 +508,7 @@ public class BlockificationPostprocessingService {
this.directMatch = null; this.directMatch = null;
this.mergeCandidates = new ArrayList<>(); this.mergeCandidates = new ArrayList<>();
this.splitCandidate = null; this.splitCandidate = null;
this.sectionIdentifier = ""; this.sectionIdentifier = SectionIdentifier.empty();
} }
} }

View File

@ -58,8 +58,10 @@ public class DocstrumBlockificationService {
zones.forEach(zone -> { zones.forEach(zone -> {
List<TextPositionSequence> textPositionSequences = new ArrayList<>(); List<TextPositionSequence> textPositionSequences = new ArrayList<>();
zone.getLines().forEach(line -> { zone.getLines()
line.getWords().forEach(word -> { .forEach(line -> {
line.getWords()
.forEach(word -> {
textPositionSequences.add(new TextPositionSequence(word.getTextPositions(), word.getPage())); textPositionSequences.add(new TextPositionSequence(word.getTextPositions(), word.getPage()));
}); });
}); });
@ -90,7 +92,7 @@ public class DocstrumBlockificationService {
while (itty.hasNext()) { while (itty.hasNext()) {
AbstractPageBlock block = itty.next(); AbstractPageBlock block = itty.next();
if (block instanceof TablePageBlock || previous.isHeadline()) { if (block instanceof TablePageBlock) {
previous = new TextPageBlock(); previous = new TextPageBlock();
continue; continue;
} }
@ -98,11 +100,21 @@ public class DocstrumBlockificationService {
if (previous != null && !previous.getSequences().isEmpty()) { if (previous != null && !previous.getSequences().isEmpty()) {
if (current.getDir() != previous.getDir() || current.isHeadline()) { if (current.getDir() != previous.getDir()) {
previous = current; previous = current;
continue; continue;
} }
if (current.isHeadline() || previous.isHeadline()) {
if (intersectsYWithPreviousHavingMaxOneLine(previous, current, page)) {
previous = combineBlocksAndResetIterator(previous, current, itty, false);
} else {
previous = current;
}
continue;
}
if (areTheOnlyTwoBlocksOnHeightsWithBothMoreThanTwoLines(previous, current, page)) { if (areTheOnlyTwoBlocksOnHeightsWithBothMoreThanTwoLines(previous, current, page)) {
previous = combineBlocksAndResetIterator(previous, current, itty, true); previous = combineBlocksAndResetIterator(previous, current, itty, true);
continue; continue;
@ -149,6 +161,13 @@ public class DocstrumBlockificationService {
} }
/**
 * Checks whether two blocks share a vertical range while the previous block is a single line.
 *
 * @param previous block seen before {@code current}; must consist of exactly one line
 * @param current  block currently inspected; must consist of at least one line
 * @param page     page the blocks belong to (not used by this check)
 * @return {@code true} if {@code previous} intersects {@code current} on the y axis and the
 *         line-count conditions hold
 */
private boolean intersectsYWithPreviousHavingMaxOneLine(TextPageBlock previous, TextPageBlock current, ClassificationPage page) {

    if (!previous.intersectsY(current)) {
        return false;
    }
    return previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1;
}
private boolean areTheOnlyTwoBlocksOnHeightsWithBothMoreThanTwoLines(TextPageBlock previous, TextPageBlock current, ClassificationPage page) { private boolean areTheOnlyTwoBlocksOnHeightsWithBothMoreThanTwoLines(TextPageBlock previous, TextPageBlock current, ClassificationPage page) {
return previous.getNumberOfLines() >= 2 && current.getNumberOfLines() >= 2 // return previous.getNumberOfLines() >= 2 && current.getNumberOfLines() >= 2 //
@ -162,7 +181,7 @@ public class DocstrumBlockificationService {
previous.getSequences().addAll(current.getSequences()); previous.getSequences().addAll(current.getSequences());
previous = buildTextBlock(previous.getSequences(), 0); previous = buildTextBlock(previous.getSequences(), 0);
previous.setToDuplicate(toDuplicate); previous.setToDuplicate(toDuplicate);
if(current.getClassification() != null && previous.getClassification() == null) { if (current.getClassification() != null && previous.getClassification() == null) {
previous.setClassification(current.getClassification()); previous.setClassification(current.getClassification());
} }
itty.remove(); itty.remove();
@ -216,14 +235,14 @@ public class DocstrumBlockificationService {
ListIterator<AbstractPageBlock> itty = blocks.listIterator(); ListIterator<AbstractPageBlock> itty = blocks.listIterator();
while (itty.hasNext()) { while (itty.hasNext()) {
AbstractPageBlock block = itty.next(); AbstractPageBlock block = itty.next();
if(block == null){ if (block == null) {
continue; continue;
} }
if (block instanceof TablePageBlock) { if (block instanceof TablePageBlock) {
continue; continue;
} }
if(block.getClassification() != null && block.getClassification().isHeadline()) { if (block.getClassification() != null && block.getClassification().isHeadline()) {
continue; continue;
} }
@ -232,7 +251,7 @@ public class DocstrumBlockificationService {
for (int i = 0; i < blocks.size(); i++) { for (int i = 0; i < blocks.size(); i++) {
AbstractPageBlock abstractPageBlock = blocks.get(i); AbstractPageBlock abstractPageBlock = blocks.get(i);
if(abstractPageBlock == null){ if (abstractPageBlock == null) {
continue; continue;
} }
if (abstractPageBlock == current) { if (abstractPageBlock == current) {
@ -242,13 +261,12 @@ public class DocstrumBlockificationService {
continue; continue;
} }
if(abstractPageBlock.getClassification() != null && abstractPageBlock.getClassification().isHeadline()) { if (abstractPageBlock.getClassification() != null && abstractPageBlock.getClassification().isHeadline()) {
continue; continue;
} }
TextPageBlock inner = (TextPageBlock) abstractPageBlock; TextPageBlock inner = (TextPageBlock) abstractPageBlock;
if (current.getDir() == inner.getDir() && current.almostIntersects(inner, yThreshold, xThreshold)) { if (current.getDir() == inner.getDir() && current.almostIntersects(inner, yThreshold, xThreshold)) {
boolean toDuplicate = current.isToDuplicate() || inner.isToDuplicate(); boolean toDuplicate = current.isToDuplicate() || inner.isToDuplicate();
@ -262,8 +280,8 @@ public class DocstrumBlockificationService {
} }
} }
var blocksIterator = blocks.iterator(); var blocksIterator = blocks.iterator();
while(blocksIterator.hasNext()){ while (blocksIterator.hasNext()) {
if(blocksIterator.next() == null){ if (blocksIterator.next() == null) {
blocksIterator.remove(); blocksIterator.remove();
} }
} }
@ -371,7 +389,12 @@ public class DocstrumBlockificationService {
textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest()); textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest());
} }
if (textBlock != null && textBlock.getSequences() != null && textBlock.getSequences().stream().map(t -> round(t.getMinYDirAdj(), 3)).collect(toSet()).size() == 1) { if (textBlock != null
&& textBlock.getSequences() != null
&& textBlock.getSequences()
.stream()
.map(t -> round(t.getMinYDirAdj(), 3))
.collect(toSet()).size() == 1) {
textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj)); textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj));
} }
return textBlock; return textBlock;
@ -386,14 +409,8 @@ public class DocstrumBlockificationService {
List<Ruling> horizontalRulingLines, List<Ruling> horizontalRulingLines,
List<Ruling> verticalRulingLines) { List<Ruling> verticalRulingLines) {
return isSplitByRuling(maxX, return isSplitByRuling(maxX, minY, word.getMinXDirAdj(), word.getMinYDirAdj(), verticalRulingLines, word.getDir().getDegrees(), word.getPageWidth(), word.getPageHeight())
minY, //
word.getMinXDirAdj(),
word.getMinYDirAdj(),
verticalRulingLines,
word.getDir().getDegrees(),
word.getPageWidth(),
word.getPageHeight()) //
|| isSplitByRuling(minX, || isSplitByRuling(minX,
minY, minY,
word.getMinXDirAdj(), word.getMinXDirAdj(),
@ -401,7 +418,8 @@ public class DocstrumBlockificationService {
horizontalRulingLines, horizontalRulingLines,
word.getDir().getDegrees(), word.getDir().getDegrees(),
word.getPageWidth(), word.getPageWidth(),
word.getPageHeight()) // word.getPageHeight())
//
|| isSplitByRuling(maxX, || isSplitByRuling(maxX,
minY, minY,
word.getMinXDirAdj(), word.getMinXDirAdj(),
@ -409,7 +427,8 @@ public class DocstrumBlockificationService {
horizontalRulingLines, horizontalRulingLines,
word.getDir().getDegrees(), word.getDir().getDegrees(),
word.getPageWidth(), word.getPageWidth(),
word.getPageHeight()) // word.getPageHeight())
//
|| isSplitByRuling(minX, || isSplitByRuling(minX,
minY, minY,
word.getMinXDirAdj(), word.getMinXDirAdj(),

View File

@ -82,13 +82,15 @@ public class ViewerDocumentTest extends BuildDocumentTest {
@SneakyThrows @SneakyThrows
public void testViewerDocument() { public void testViewerDocument() {
//String fileName = "files/documine/21_TiltPlus_MutacaoGenicaEmCelulasBacterianas.pdf";//fail here
//String fileName = "files/new/UTT-Books-53.pdf";
String fileName = "files/documine/A21924A - Acute Oral Toxicity - Rats.pdf";
//String fileName = "files/documine/SOLICITA_VICTRATO-GOLD-II_Item 21_Mutacao_Genica.pdf";
//String fileName = "files/documine/Study Document 1 - Acute Eye IrritationCorrosion - Rabbits.pdf"; //String fileName = "files/documine/Study Document 1 - Acute Eye IrritationCorrosion - Rabbits.pdf";
//String fileName = "files/documine/Study Document 3 - Acute Eye IrritationCorrosion - Rabbits.pdf";
//String fileName = "files/documine/VV-547521_Irritação_Ocular_in_Vivo.pdf";
//String fileName = "files/documine/SOLICITA_VICTRATO-GOLD-II_Item 21_Mutacao_Genica.pdf";
//String fileName = "files/new/UTT-Books-53.pdf";
//String fileName = "files/documine/A21924A - Acute Oral Toxicity - Rats.pdf";
//String fileName = "files/documine/A16361B - Acute Dermal Irritation Toxicity Study in Rabbits.pdf";
//String fileName = "files/documine/ITEM 20_Sensibilização cutânea.pdf"; //String fileName = "files/documine/ITEM 20_Sensibilização cutânea.pdf";
//String fileName = "files/documine/VV-547523_LLNA.pdf"; //String fileName = "files/documine/VV-547523_LLNA.pdf";
//String fileName = "files/new/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf"; //String fileName = "files/new/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
@ -96,7 +98,7 @@ public class ViewerDocumentTest extends BuildDocumentTest {
//String fileName = "files/new/abschlussarbeiten-template-institut-fur-informatik-padagogische-hochschule-karlsruhe.pdf"; //String fileName = "files/new/abschlussarbeiten-template-institut-fur-informatik-padagogische-hochschule-karlsruhe.pdf";
//String fileName = "files/new/kaust-official-thesis-template.pdf"; //String fileName = "files/new/kaust-official-thesis-template.pdf";
//String fileName = "files/new/$100m Offers.pdf"; //String fileName = "files/new/$100m Offers.pdf";
//String fileName = "files/new/18-Curacron_ToxicidadeOcularInVitro.pdf"; String fileName = "files/new/18-Curacron_ToxicidadeOcularInVitro.pdf";
//String fileName = "files/new/mistitled_outlines_example.pdf"; //String fileName = "files/new/mistitled_outlines_example.pdf";
//String fileName = "files/bdr/Plenarprotokoll 1 (keine Druchsache!) (1) 1.pdf"; //String fileName = "files/bdr/Plenarprotokoll 1 (keine Druchsache!) (1) 1.pdf";
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf"; String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";