RED-7074: Design Subsection section tree structure algorithm
* improved merging of headlines as well as splitting logic so that more headlines are detected correctly
This commit is contained in:
parent
2fcaeb3d8c
commit
1856fed640
@ -2,6 +2,7 @@ package com.knecon.fforesight.service.layoutparser.processor;
|
||||
|
||||
import static java.lang.String.format;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
@ -210,15 +211,15 @@ public class LayoutParsingPipeline {
|
||||
|
||||
private String buildSemanticNodeCountMessage(int numberOfPages, Map<NodeType, Long> semanticNodeCounts) {
|
||||
|
||||
return String.format("%d pages with %d sections, %d headlines, %d paragraphs, %d tables with %d cells, %d headers, and %d footers parsed",
|
||||
numberOfPages,
|
||||
semanticNodeCounts.get(NodeType.SECTION) == null ? 0 : semanticNodeCounts.get(NodeType.SECTION),
|
||||
semanticNodeCounts.get(NodeType.HEADLINE) == null ? 0 : semanticNodeCounts.get(NodeType.HEADLINE),
|
||||
semanticNodeCounts.get(NodeType.PARAGRAPH) == null ? 0 : semanticNodeCounts.get(NodeType.PARAGRAPH),
|
||||
semanticNodeCounts.get(NodeType.TABLE) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE),
|
||||
semanticNodeCounts.get(NodeType.TABLE_CELL) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE_CELL),
|
||||
semanticNodeCounts.get(NodeType.HEADER) == null ? 0 : semanticNodeCounts.get(NodeType.HEADER),
|
||||
semanticNodeCounts.get(NodeType.FOOTER) == null ? 0 : semanticNodeCounts.get(NodeType.FOOTER));
|
||||
return format("%d pages with %d sections, %d headlines, %d paragraphs, %d tables with %d cells, %d headers, and %d footers parsed",
|
||||
numberOfPages,
|
||||
semanticNodeCounts.get(NodeType.SECTION) == null ? 0 : semanticNodeCounts.get(NodeType.SECTION),
|
||||
semanticNodeCounts.get(NodeType.HEADLINE) == null ? 0 : semanticNodeCounts.get(NodeType.HEADLINE),
|
||||
semanticNodeCounts.get(NodeType.PARAGRAPH) == null ? 0 : semanticNodeCounts.get(NodeType.PARAGRAPH),
|
||||
semanticNodeCounts.get(NodeType.TABLE) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE),
|
||||
semanticNodeCounts.get(NodeType.TABLE_CELL) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE_CELL),
|
||||
semanticNodeCounts.get(NodeType.HEADER) == null ? 0 : semanticNodeCounts.get(NodeType.HEADER),
|
||||
semanticNodeCounts.get(NodeType.FOOTER) == null ? 0 : semanticNodeCounts.get(NodeType.FOOTER));
|
||||
}
|
||||
|
||||
|
||||
@ -239,6 +240,7 @@ public class LayoutParsingPipeline {
|
||||
Map<Integer, List<ClassifiedImage>> signatures = visualLayoutParsingAdapter.buildExtractedSignaturesPerPage(visualLayoutParsingResponse);
|
||||
ClassificationDocument classificationDocument = new ClassificationDocument();
|
||||
List<ClassificationPage> classificationPages = new ArrayList<>();
|
||||
OutlineObject lastProcessedOutlineObject = null;
|
||||
|
||||
// parsing the structure elements could be useful as well
|
||||
classificationDocument.setOutlineObjectTree(outlineExtractorService.getOutlineObjectTree(originDocument));
|
||||
@ -307,11 +309,16 @@ public class LayoutParsingPipeline {
|
||||
classificationPage.setPageWidth(cropbox.getWidth());
|
||||
classificationPage.setPageHeight(cropbox.getHeight());
|
||||
|
||||
List<OutlineObject> outlineObjects = classificationDocument.getOutlineObjectTree().getOutlineObjectsPerPage()
|
||||
.get(pageNumber - 1);
|
||||
if (outlineObjects != null) {
|
||||
List<OutlineObject> outlineObjects = classificationDocument.getOutlineObjectTree().getOutlineObjectsPerPage().getOrDefault(pageNumber - 1, new ArrayList<>());
|
||||
|
||||
OutlineObject notFoundOutlineObject = null;
|
||||
if (lastProcessedOutlineObject != null && !lastProcessedOutlineObject.isFound()) {
|
||||
lastProcessedOutlineObject.setPoint(new Point2D.Float(0, cropbox.getHeight()));
|
||||
notFoundOutlineObject = lastProcessedOutlineObject;
|
||||
}
|
||||
if (!outlineObjects.isEmpty()) {
|
||||
classificationPage.setOutlineObjects(outlineObjects);
|
||||
blockificationPostprocessingService.sanitizeOutlineBlocks(classificationPage);
|
||||
lastProcessedOutlineObject = blockificationPostprocessingService.sanitizeOutlineBlocks(classificationPage, notFoundOutlineObject);
|
||||
}
|
||||
|
||||
// MarkedContent needs to be converted at this point, otherwise it leads to GC Problems in Pdfbox.
|
||||
|
||||
@ -3,6 +3,7 @@ package com.knecon.fforesight.service.layoutparser.processor.model;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObjectTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContents;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
|
||||
|
||||
@ -5,16 +5,27 @@ import java.awt.geom.Point2D;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
|
||||
@Data
|
||||
@NoArgsConstructor
|
||||
@RequiredArgsConstructor
|
||||
@AllArgsConstructor
|
||||
public class OutlineObject {
|
||||
|
||||
private String title;
|
||||
private int pageNumber;
|
||||
private final String title;
|
||||
private final int pageNumber;
|
||||
private Point2D point;
|
||||
private int treeDepth;
|
||||
private final int treeDepth;
|
||||
|
||||
private boolean found = false;
|
||||
|
||||
|
||||
public OutlineObject(String title, int pageNumber, Point2D point2D, int depth) {
|
||||
|
||||
this(title, pageNumber, depth);
|
||||
this.point = point2D;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
@ -80,7 +80,10 @@ public class TextPageBlock extends AbstractPageBlock {
|
||||
|
||||
public static TextPageBlock merge(List<TextPageBlock> textBlocksToMerge) {
|
||||
|
||||
List<TextPositionSequence> sequences = textBlocksToMerge.stream().map(TextPageBlock::getSequences).flatMap(java.util.Collection::stream).toList();
|
||||
List<TextPositionSequence> sequences = textBlocksToMerge.stream()
|
||||
.map(TextPageBlock::getSequences)
|
||||
.flatMap(java.util.Collection::stream)
|
||||
.toList();
|
||||
sequences = new ArrayList<>(sequences);
|
||||
return fromTextPositionSequences(sequences);
|
||||
}
|
||||
@ -106,11 +109,11 @@ public class TextPageBlock extends AbstractPageBlock {
|
||||
|
||||
if (textBlock == null) {
|
||||
textBlock = new TextPageBlock(wordBlock.getMinXDirAdj(),
|
||||
wordBlock.getMaxXDirAdj(),
|
||||
wordBlock.getMinYDirAdj(),
|
||||
wordBlock.getMaxYDirAdj(),
|
||||
wordBlockList,
|
||||
wordBlock.getRotation());
|
||||
wordBlock.getMaxXDirAdj(),
|
||||
wordBlock.getMinYDirAdj(),
|
||||
wordBlock.getMaxYDirAdj(),
|
||||
wordBlockList,
|
||||
wordBlock.getRotation());
|
||||
} else {
|
||||
TextPageBlock spatialEntity = textBlock.union(wordBlock);
|
||||
textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight());
|
||||
@ -126,11 +129,12 @@ public class TextPageBlock extends AbstractPageBlock {
|
||||
textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest());
|
||||
}
|
||||
|
||||
if (textBlock != null && textBlock.getSequences() != null && textBlock.getSequences()
|
||||
.stream()
|
||||
.map(t -> DoubleComparisons.round(t.getMinYDirAdj(), 3))
|
||||
.collect(toSet())
|
||||
.size() == 1) {
|
||||
if (textBlock != null
|
||||
&& textBlock.getSequences() != null
|
||||
&& textBlock.getSequences()
|
||||
.stream()
|
||||
.map(t -> DoubleComparisons.round(t.getMinYDirAdj(), 3))
|
||||
.collect(toSet()).size() == 1) {
|
||||
textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj));
|
||||
}
|
||||
return textBlock;
|
||||
@ -290,18 +294,7 @@ public class TextPageBlock extends AbstractPageBlock {
|
||||
|
||||
public void add(TextPositionSequence r) {
|
||||
|
||||
if (r.getMinXDirAdj() < minX) {
|
||||
minX = r.getMinXDirAdj();
|
||||
}
|
||||
if (r.getMaxXDirAdj() > maxX) {
|
||||
maxX = r.getMaxXDirAdj();
|
||||
}
|
||||
if (r.getMinYDirAdj() < minY) {
|
||||
minY = r.getMinYDirAdj();
|
||||
}
|
||||
if (r.getMaxYDirAdj() > maxY) {
|
||||
maxY = r.getMaxYDirAdj();
|
||||
}
|
||||
setCoordinates(r);
|
||||
}
|
||||
|
||||
|
||||
@ -317,6 +310,33 @@ public class TextPageBlock extends AbstractPageBlock {
|
||||
}
|
||||
|
||||
|
||||
public void resize() {
|
||||
|
||||
minX = Float.MAX_VALUE;
|
||||
minY = Float.MAX_VALUE;
|
||||
maxX = Float.MIN_VALUE;
|
||||
maxY = Float.MIN_VALUE;
|
||||
sequences.forEach(this::setCoordinates);
|
||||
}
|
||||
|
||||
|
||||
private void setCoordinates(TextPositionSequence sequence) {
|
||||
|
||||
if (sequence.getMinXDirAdj() < minX) {
|
||||
minX = sequence.getMinXDirAdj();
|
||||
}
|
||||
if (sequence.getMaxXDirAdj() > maxX) {
|
||||
maxX = sequence.getMaxXDirAdj();
|
||||
}
|
||||
if (sequence.getMinYDirAdj() < minY) {
|
||||
minY = sequence.getMinYDirAdj();
|
||||
}
|
||||
if (sequence.getMaxYDirAdj() > maxY) {
|
||||
maxY = sequence.getMaxYDirAdj();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public void set(float x1, float y1, float x2, float y2) {
|
||||
|
||||
this.minX = Math.min(x1, x2);
|
||||
|
||||
@ -37,50 +37,129 @@ public class BlockificationPostprocessingService {
|
||||
.collect(RectangleTransformations.collectBBox());
|
||||
|
||||
|
||||
public void sanitizeOutlineBlocks(ClassificationPage classificationPage) {
|
||||
public OutlineObject sanitizeOutlineBlocks(ClassificationPage classificationPage, OutlineObject notFoundOutlineObject) {
|
||||
|
||||
List<OutlineObject> outlineObjects = classificationPage.getOutlineObjects();
|
||||
|
||||
List<TextPageBlock> textBlocks = classificationPage.getTextBlocks()
|
||||
.stream()
|
||||
.filter(block -> block instanceof TextPageBlock)
|
||||
.toList()
|
||||
.stream()
|
||||
.map(block -> (TextPageBlock) block)
|
||||
.toList();
|
||||
|
||||
if (textBlocks.isEmpty() || outlineObjects.isEmpty()) {
|
||||
return;
|
||||
if (getTextPageBlocks(classificationPage).isEmpty() || outlineObjects.isEmpty()) {
|
||||
return null;
|
||||
}
|
||||
|
||||
float pageHeight = classificationPage.getPageHeight();
|
||||
|
||||
for (OutlineObject outlineObject : outlineObjects) {
|
||||
ListIterator<OutlineObject> outlineObjectListIterator = outlineObjects.listIterator();
|
||||
|
||||
OutlineProcessionContext context = new OutlineProcessionContext(outlineObject);
|
||||
if (notFoundOutlineObject != null) {
|
||||
OutlineProcessionContext notFoundOutlineObjectProcessionContext = new OutlineProcessionContext(notFoundOutlineObject);
|
||||
processTextBlocks(getTextPageBlocks(classificationPage), pageHeight, notFoundOutlineObjectProcessionContext);
|
||||
|
||||
ListIterator<TextPageBlock> iterator = textBlocks.listIterator();
|
||||
while (iterator.hasNext()) {
|
||||
TextPageBlock pageBlock = iterator.next();
|
||||
if (pageHeight - outlineObject.getPoint().getY() - BLOCK_TO_OUTLINE_DISTANCE_THRESHOLD <= pageBlock.getMaxY()) {
|
||||
break;
|
||||
}
|
||||
OutlineObject firstOutlineObject = null;
|
||||
OutlineProcessionContext firstOutlineObjectProcessionContext = null;
|
||||
if (outlineObjectListIterator.hasNext()) {
|
||||
firstOutlineObject = outlineObjectListIterator.next();
|
||||
firstOutlineObjectProcessionContext = new OutlineProcessionContext(firstOutlineObject);
|
||||
processTextBlocks(getTextPageBlocks(classificationPage), pageHeight, firstOutlineObjectProcessionContext);
|
||||
}
|
||||
if (iterator.hasPrevious()) {
|
||||
iterator.previous();
|
||||
}
|
||||
boolean earlyStop = false;
|
||||
while (iterator.hasNext() && !earlyStop) {
|
||||
TextPageBlock pageBlock = iterator.next();
|
||||
earlyStop = processOutlineForTextBlock(pageBlock, context);
|
||||
}
|
||||
selectMatch(classificationPage, context);
|
||||
|
||||
if (!contextsOverlap(notFoundOutlineObjectProcessionContext, firstOutlineObjectProcessionContext)) {
|
||||
notFoundOutlineObject.setFound(selectMatch(classificationPage, notFoundOutlineObjectProcessionContext));
|
||||
}
|
||||
if (firstOutlineObject != null) {
|
||||
// re-create the context for the updated blocks
|
||||
firstOutlineObjectProcessionContext = new OutlineProcessionContext(firstOutlineObject);
|
||||
processTextBlocks(getTextPageBlocks(classificationPage), pageHeight, firstOutlineObjectProcessionContext);
|
||||
firstOutlineObject.setFound(selectMatch(classificationPage, firstOutlineObjectProcessionContext));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
outlineObjectListIterator.forEachRemaining(outlineObject -> {
|
||||
OutlineProcessionContext outlineObjectProcessionContext = new OutlineProcessionContext(outlineObject);
|
||||
processTextBlocks(getTextPageBlocks(classificationPage), pageHeight, outlineObjectProcessionContext);
|
||||
outlineObject.setFound(selectMatch(classificationPage, outlineObjectProcessionContext));
|
||||
});
|
||||
|
||||
if (!outlineObjects.isEmpty()) {
|
||||
return outlineObjects.get(outlineObjects.size() - 1);
|
||||
} else {
|
||||
return notFoundOutlineObject;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void selectMatch(ClassificationPage classificationPage, OutlineProcessionContext context) {
|
||||
private static List<TextPageBlock> getTextPageBlocks(ClassificationPage classificationPage) {
|
||||
|
||||
return classificationPage.getTextBlocks()
|
||||
.stream()
|
||||
.filter(block -> block instanceof TextPageBlock)
|
||||
.map(block -> (TextPageBlock) block)
|
||||
.toList();
|
||||
}
|
||||
|
||||
|
||||
private boolean contextsOverlap(OutlineProcessionContext notFoundOutlineObjectProcessionContext, OutlineProcessionContext firstOutlineObjectProcessionContext) {
|
||||
|
||||
if (firstOutlineObjectProcessionContext == null) {
|
||||
return false;
|
||||
}
|
||||
|
||||
String notFoundTitle = notFoundOutlineObjectProcessionContext.getOutlineObject().getTitle();
|
||||
String firstTitle = firstOutlineObjectProcessionContext.getOutlineObject().getTitle();
|
||||
|
||||
if (!firstTitle.startsWith(notFoundTitle)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
var blocksOfNotFoundOutline = getAllMatchingBlocks(notFoundOutlineObjectProcessionContext);
|
||||
var blocksOfFirstOutline = getAllMatchingBlocks(firstOutlineObjectProcessionContext);
|
||||
|
||||
double maxYFirst = blocksOfFirstOutline.stream()
|
||||
.mapToDouble(TextPageBlock::getPdfMaxY)
|
||||
.max()
|
||||
.orElse(Double.NEGATIVE_INFINITY);
|
||||
|
||||
return blocksOfNotFoundOutline.stream()
|
||||
.mapToDouble(TextPageBlock::getPdfMaxY)
|
||||
.anyMatch(y -> y >= maxYFirst);
|
||||
}
|
||||
|
||||
|
||||
private List<TextPageBlock> getAllMatchingBlocks(OutlineProcessionContext context) {
|
||||
|
||||
List<TextPageBlock> blocks = new ArrayList<>();
|
||||
if (context.getDirectMatch() != null) {
|
||||
blocks.add(context.getDirectMatch());
|
||||
}
|
||||
if (context.getSplitCandidate() != null) {
|
||||
blocks.add(context.getSplitCandidate());
|
||||
}
|
||||
blocks.addAll(context.getMergeCandidates());
|
||||
return blocks;
|
||||
}
|
||||
|
||||
|
||||
private void processTextBlocks(List<TextPageBlock> textBlocks, float pageHeight, OutlineProcessionContext context) {
|
||||
|
||||
OutlineObject outlineObject = context.getOutlineObject();
|
||||
ListIterator<TextPageBlock> iterator = textBlocks.listIterator();
|
||||
while (iterator.hasNext()) {
|
||||
TextPageBlock pageBlock = iterator.next();
|
||||
if (pageHeight - outlineObject.getPoint().getY() - BLOCK_TO_OUTLINE_DISTANCE_THRESHOLD <= pageBlock.getMaxY()) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (iterator.hasPrevious()) {
|
||||
iterator.previous();
|
||||
}
|
||||
boolean earlyStop = false;
|
||||
while (iterator.hasNext() && !earlyStop) {
|
||||
TextPageBlock pageBlock = iterator.next();
|
||||
earlyStop = processOutlineForTextBlock(pageBlock, context);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private boolean selectMatch(ClassificationPage classificationPage, OutlineProcessionContext context) {
|
||||
|
||||
OutlineObject outlineObject = context.outlineObject;
|
||||
TextPageBlock directMatch = context.directMatch;
|
||||
@ -122,28 +201,39 @@ public class BlockificationPostprocessingService {
|
||||
double minDistance = Math.min(distanceToDirectMatch, Math.min(distanceToSplitCandidate, distanceToBestMergeCandidates));
|
||||
|
||||
if (minDistance == Double.MAX_VALUE) {
|
||||
return;
|
||||
return false;
|
||||
}
|
||||
if (minDistance == distanceToDirectMatch) {
|
||||
directMatch.setClassification(headlineType);
|
||||
} else if (minDistance == distanceToSplitCandidate) {
|
||||
List<TextPageBlock> others = splitBlock(classificationPage, splitCandidate, context.sectionIdentifier + outlineObject.getTitle());
|
||||
List<TextPageBlock> others = splitBlock(classificationPage, splitCandidate, context.sectionIdentifier, outlineObject.getTitle());
|
||||
splitCandidate.setClassification(headlineType);
|
||||
others.forEach(other -> other.setClassification(null));
|
||||
} else {
|
||||
var merged = mergeBlocks(classificationPage, bestMergeCandidateCombination);
|
||||
merged.setClassification(headlineType);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
private List<TextPageBlock> splitBlock(ClassificationPage classificationPage, TextPageBlock blockToSplit, String text) {
|
||||
private List<TextPageBlock> splitBlock(ClassificationPage classificationPage, TextPageBlock blockToSplit, SectionIdentifier sectionIdentifier, String title) {
|
||||
|
||||
List<TextPageBlock> otherBlocks = new ArrayList<>();
|
||||
int blockToSplitIdx = classificationPage.getTextBlocks().indexOf(blockToSplit);
|
||||
WordSequenceResult wordSequenceResult = findWordSequence(blockToSplit.getSequences(), text);
|
||||
|
||||
String headline = title;
|
||||
if (!sectionIdentifier.getFormat().equals(SectionIdentifier.Format.EMPTY) && !title.startsWith(sectionIdentifier.getIdentifierString())) {
|
||||
headline = sectionIdentifier + headline;
|
||||
}
|
||||
|
||||
WordSequenceResult wordSequenceResult = findWordSequence(blockToSplit.getSequences(), headline);
|
||||
if (wordSequenceResult.inSequence.isEmpty()) {
|
||||
wordSequenceResult = findWordSequence(blockToSplit.getSequences(), title);
|
||||
}
|
||||
|
||||
blockToSplit.setSequences(wordSequenceResult.inSequence);
|
||||
blockToSplit.resize();
|
||||
|
||||
if (!wordSequenceResult.preSequence.isEmpty()) {
|
||||
TextPageBlock block = buildTextBlock(wordSequenceResult.preSequence, 0);
|
||||
@ -301,6 +391,7 @@ public class BlockificationPostprocessingService {
|
||||
|
||||
assert firstBlock != null;
|
||||
firstBlock.setToDuplicate(false);
|
||||
firstBlock.resize();
|
||||
classificationPage.getTextBlocks().removeAll(mergedBlocks);
|
||||
}
|
||||
|
||||
@ -378,13 +469,13 @@ public class BlockificationPostprocessingService {
|
||||
if (blockTextContainsOutlineTitle) {
|
||||
SectionIdentifier sectionIdentifier = SectionIdentifier.fromSearchText(blockText);
|
||||
|
||||
if (sectionIdentifier.getFormat() != SectionIdentifier.Format.EMPTY) {
|
||||
if (sectionIdentifier.getFormat() != SectionIdentifier.Format.EMPTY && !outlineTitle.startsWith(sectionIdentifier.getIdentifierString())) {
|
||||
|
||||
if (blockText.startsWith(sectionIdentifier.getIdentifierString()) && blockText.endsWith(outlineTitle) && context.directMatch == null) {
|
||||
context.directMatch = pageBlock;
|
||||
return true;
|
||||
} else if (context.splitCandidate == null) {
|
||||
context.sectionIdentifier = sectionIdentifier.getIdentifierString();
|
||||
context.sectionIdentifier = sectionIdentifier;
|
||||
}
|
||||
}
|
||||
if (context.splitCandidate == null) {
|
||||
@ -408,7 +499,7 @@ public class BlockificationPostprocessingService {
|
||||
private OutlineObject outlineObject;
|
||||
private List<TextPageBlock> mergeCandidates;
|
||||
private TextPageBlock splitCandidate;
|
||||
private String sectionIdentifier;
|
||||
private SectionIdentifier sectionIdentifier;
|
||||
|
||||
|
||||
public OutlineProcessionContext(OutlineObject outlineObject) {
|
||||
@ -417,7 +508,7 @@ public class BlockificationPostprocessingService {
|
||||
this.directMatch = null;
|
||||
this.mergeCandidates = new ArrayList<>();
|
||||
this.splitCandidate = null;
|
||||
this.sectionIdentifier = "";
|
||||
this.sectionIdentifier = SectionIdentifier.empty();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -58,18 +58,20 @@ public class DocstrumBlockificationService {
|
||||
zones.forEach(zone -> {
|
||||
|
||||
List<TextPositionSequence> textPositionSequences = new ArrayList<>();
|
||||
zone.getLines().forEach(line -> {
|
||||
line.getWords().forEach(word -> {
|
||||
textPositionSequences.add(new TextPositionSequence(word.getTextPositions(), word.getPage()));
|
||||
});
|
||||
});
|
||||
zone.getLines()
|
||||
.forEach(line -> {
|
||||
line.getWords()
|
||||
.forEach(word -> {
|
||||
textPositionSequences.add(new TextPositionSequence(word.getTextPositions(), word.getPage()));
|
||||
});
|
||||
});
|
||||
|
||||
abstractPageBlocks.addAll(splitZonesAtRulings(textPositionSequences, horizontalRulings, verticalRulings));
|
||||
});
|
||||
|
||||
if (xyOrder) {
|
||||
abstractPageBlocks.sort(Comparator.comparing(AbstractPageBlock::getMinY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
|
||||
.thenComparing(AbstractPageBlock::getMinX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
|
||||
.thenComparing(AbstractPageBlock::getMinX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
|
||||
abstractPageBlocks.sort(new Comparator<AbstractPageBlock>() {
|
||||
@Override
|
||||
public int compare(AbstractPageBlock o1, AbstractPageBlock o2) {
|
||||
@ -90,7 +92,7 @@ public class DocstrumBlockificationService {
|
||||
while (itty.hasNext()) {
|
||||
|
||||
AbstractPageBlock block = itty.next();
|
||||
if (block instanceof TablePageBlock || previous.isHeadline()) {
|
||||
if (block instanceof TablePageBlock) {
|
||||
previous = new TextPageBlock();
|
||||
continue;
|
||||
}
|
||||
@ -98,11 +100,21 @@ public class DocstrumBlockificationService {
|
||||
|
||||
if (previous != null && !previous.getSequences().isEmpty()) {
|
||||
|
||||
if (current.getDir() != previous.getDir() || current.isHeadline()) {
|
||||
if (current.getDir() != previous.getDir()) {
|
||||
previous = current;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (current.isHeadline() || previous.isHeadline()) {
|
||||
if (intersectsYWithPreviousHavingMaxOneLine(previous, current, page)) {
|
||||
previous = combineBlocksAndResetIterator(previous, current, itty, false);
|
||||
} else {
|
||||
previous = current;
|
||||
}
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
if (areTheOnlyTwoBlocksOnHeightsWithBothMoreThanTwoLines(previous, current, page)) {
|
||||
previous = combineBlocksAndResetIterator(previous, current, itty, true);
|
||||
continue;
|
||||
@ -134,8 +146,8 @@ public class DocstrumBlockificationService {
|
||||
private boolean isOnlyIntersectingYAndOnelineOrPrevoiusTwoLines(TextPageBlock previous, TextPageBlock current, ClassificationPage page) {
|
||||
|
||||
return current.intersectsY(previous) //
|
||||
&& (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1 || previous.getNumberOfLines() == 2 && current.getNumberOfLines() == 1) //
|
||||
&& numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, page.getTextBlocks()) <= 0;
|
||||
&& (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1 || previous.getNumberOfLines() == 2 && current.getNumberOfLines() == 1) //
|
||||
&& numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, page.getTextBlocks()) <= 0;
|
||||
}
|
||||
|
||||
|
||||
@ -144,16 +156,23 @@ public class DocstrumBlockificationService {
|
||||
ClassificationPage page) {
|
||||
|
||||
return (Math.abs(previous.getMaxY() - current.getMaxY()) < THRESHOLD || Math.abs(previous.getMinY() - current.getMinY()) < THRESHOLD) //
|
||||
&& (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1 || previous.getNumberOfLines() >= 2 && current.getNumberOfLines() == 1) //
|
||||
&& !hasBetween(current, previous, page.getTextBlocks()) && numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, page.getTextBlocks()) <= 4;
|
||||
&& (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1 || previous.getNumberOfLines() >= 2 && current.getNumberOfLines() == 1) //
|
||||
&& !hasBetween(current, previous, page.getTextBlocks()) && numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, page.getTextBlocks()) <= 4;
|
||||
}
|
||||
|
||||
|
||||
private boolean intersectsYWithPreviousHavingMaxOneLine(TextPageBlock previous, TextPageBlock current, ClassificationPage page) {
|
||||
|
||||
return previous.intersectsY(current)//(Math.abs(previous.getMaxY() - current.getMaxY()) < THRESHOLD && Math.abs(previous.getMinY() - current.getMinY()) < THRESHOLD) //
|
||||
&& (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1);
|
||||
}
|
||||
|
||||
|
||||
private boolean areTheOnlyTwoBlocksOnHeightsWithBothMoreThanTwoLines(TextPageBlock previous, TextPageBlock current, ClassificationPage page) {
|
||||
|
||||
return previous.getNumberOfLines() >= 2 && current.getNumberOfLines() >= 2 //
|
||||
&& previous.intersectsY(current) //
|
||||
&& numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, page.getTextBlocks()) == 0;
|
||||
&& previous.intersectsY(current) //
|
||||
&& numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, page.getTextBlocks()) == 0;
|
||||
}
|
||||
|
||||
|
||||
@ -162,7 +181,7 @@ public class DocstrumBlockificationService {
|
||||
previous.getSequences().addAll(current.getSequences());
|
||||
previous = buildTextBlock(previous.getSequences(), 0);
|
||||
previous.setToDuplicate(toDuplicate);
|
||||
if(current.getClassification() != null && previous.getClassification() == null) {
|
||||
if (current.getClassification() != null && previous.getClassification() == null) {
|
||||
previous.setClassification(current.getClassification());
|
||||
}
|
||||
itty.remove();
|
||||
@ -216,14 +235,14 @@ public class DocstrumBlockificationService {
|
||||
ListIterator<AbstractPageBlock> itty = blocks.listIterator();
|
||||
while (itty.hasNext()) {
|
||||
AbstractPageBlock block = itty.next();
|
||||
if(block == null){
|
||||
if (block == null) {
|
||||
continue;
|
||||
}
|
||||
if (block instanceof TablePageBlock) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if(block.getClassification() != null && block.getClassification().isHeadline()) {
|
||||
if (block.getClassification() != null && block.getClassification().isHeadline()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -232,7 +251,7 @@ public class DocstrumBlockificationService {
|
||||
for (int i = 0; i < blocks.size(); i++) {
|
||||
|
||||
AbstractPageBlock abstractPageBlock = blocks.get(i);
|
||||
if(abstractPageBlock == null){
|
||||
if (abstractPageBlock == null) {
|
||||
continue;
|
||||
}
|
||||
if (abstractPageBlock == current) {
|
||||
@ -242,13 +261,12 @@ public class DocstrumBlockificationService {
|
||||
continue;
|
||||
}
|
||||
|
||||
if(abstractPageBlock.getClassification() != null && abstractPageBlock.getClassification().isHeadline()) {
|
||||
if (abstractPageBlock.getClassification() != null && abstractPageBlock.getClassification().isHeadline()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
TextPageBlock inner = (TextPageBlock) abstractPageBlock;
|
||||
|
||||
|
||||
if (current.getDir() == inner.getDir() && current.almostIntersects(inner, yThreshold, xThreshold)) {
|
||||
|
||||
boolean toDuplicate = current.isToDuplicate() || inner.isToDuplicate();
|
||||
@ -262,8 +280,8 @@ public class DocstrumBlockificationService {
|
||||
}
|
||||
}
|
||||
var blocksIterator = blocks.iterator();
|
||||
while(blocksIterator.hasNext()){
|
||||
if(blocksIterator.next() == null){
|
||||
while (blocksIterator.hasNext()) {
|
||||
if (blocksIterator.next() == null) {
|
||||
blocksIterator.remove();
|
||||
}
|
||||
}
|
||||
@ -351,11 +369,11 @@ public class DocstrumBlockificationService {
|
||||
|
||||
if (textBlock == null) {
|
||||
textBlock = new TextPageBlock(wordBlock.getMinXDirAdj(),
|
||||
wordBlock.getMaxXDirAdj(),
|
||||
wordBlock.getMinYDirAdj(),
|
||||
wordBlock.getMaxYDirAdj(),
|
||||
wordBlockList,
|
||||
wordBlock.getRotation());
|
||||
wordBlock.getMaxXDirAdj(),
|
||||
wordBlock.getMinYDirAdj(),
|
||||
wordBlock.getMaxYDirAdj(),
|
||||
wordBlockList,
|
||||
wordBlock.getRotation());
|
||||
} else {
|
||||
TextPageBlock spatialEntity = textBlock.union(wordBlock);
|
||||
textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight());
|
||||
@ -371,7 +389,12 @@ public class DocstrumBlockificationService {
|
||||
textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest());
|
||||
}
|
||||
|
||||
if (textBlock != null && textBlock.getSequences() != null && textBlock.getSequences().stream().map(t -> round(t.getMinYDirAdj(), 3)).collect(toSet()).size() == 1) {
|
||||
if (textBlock != null
|
||||
&& textBlock.getSequences() != null
|
||||
&& textBlock.getSequences()
|
||||
.stream()
|
||||
.map(t -> round(t.getMinYDirAdj(), 3))
|
||||
.collect(toSet()).size() == 1) {
|
||||
textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj));
|
||||
}
|
||||
return textBlock;
|
||||
@ -386,38 +409,34 @@ public class DocstrumBlockificationService {
|
||||
List<Ruling> horizontalRulingLines,
|
||||
List<Ruling> verticalRulingLines) {
|
||||
|
||||
return isSplitByRuling(maxX,
|
||||
minY,
|
||||
word.getMinXDirAdj(),
|
||||
word.getMinYDirAdj(),
|
||||
verticalRulingLines,
|
||||
word.getDir().getDegrees(),
|
||||
word.getPageWidth(),
|
||||
word.getPageHeight()) //
|
||||
|| isSplitByRuling(minX,
|
||||
minY,
|
||||
word.getMinXDirAdj(),
|
||||
word.getMaxYDirAdj(),
|
||||
horizontalRulingLines,
|
||||
word.getDir().getDegrees(),
|
||||
word.getPageWidth(),
|
||||
word.getPageHeight()) //
|
||||
|| isSplitByRuling(maxX,
|
||||
minY,
|
||||
word.getMinXDirAdj(),
|
||||
word.getMinYDirAdj(),
|
||||
horizontalRulingLines,
|
||||
word.getDir().getDegrees(),
|
||||
word.getPageWidth(),
|
||||
word.getPageHeight()) //
|
||||
|| isSplitByRuling(minX,
|
||||
minY,
|
||||
word.getMinXDirAdj(),
|
||||
word.getMaxYDirAdj(),
|
||||
verticalRulingLines,
|
||||
word.getDir().getDegrees(),
|
||||
word.getPageWidth(),
|
||||
word.getPageHeight());
|
||||
return isSplitByRuling(maxX, minY, word.getMinXDirAdj(), word.getMinYDirAdj(), verticalRulingLines, word.getDir().getDegrees(), word.getPageWidth(), word.getPageHeight())
|
||||
//
|
||||
|| isSplitByRuling(minX,
|
||||
minY,
|
||||
word.getMinXDirAdj(),
|
||||
word.getMaxYDirAdj(),
|
||||
horizontalRulingLines,
|
||||
word.getDir().getDegrees(),
|
||||
word.getPageWidth(),
|
||||
word.getPageHeight())
|
||||
//
|
||||
|| isSplitByRuling(maxX,
|
||||
minY,
|
||||
word.getMinXDirAdj(),
|
||||
word.getMinYDirAdj(),
|
||||
horizontalRulingLines,
|
||||
word.getDir().getDegrees(),
|
||||
word.getPageWidth(),
|
||||
word.getPageHeight())
|
||||
//
|
||||
|| isSplitByRuling(minX,
|
||||
minY,
|
||||
word.getMinXDirAdj(),
|
||||
word.getMaxYDirAdj(),
|
||||
verticalRulingLines,
|
||||
word.getDir().getDegrees(),
|
||||
word.getPageWidth(),
|
||||
word.getPageHeight());
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -82,13 +82,15 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
||||
@SneakyThrows
|
||||
public void testViewerDocument() {
|
||||
|
||||
//String fileName = "files/documine/21_TiltPlus_MutacaoGenicaEmCelulasBacterianas.pdf";//fail here
|
||||
|
||||
//String fileName = "files/new/UTT-Books-53.pdf";
|
||||
String fileName = "files/documine/A21924A - Acute Oral Toxicity - Rats.pdf";
|
||||
|
||||
|
||||
//String fileName = "files/documine/SOLICITA_VICTRATO-GOLD-II_Item 21_Mutacao_Genica.pdf";
|
||||
//String fileName = "files/documine/Study Document 1 - Acute Eye IrritationCorrosion - Rabbits.pdf";
|
||||
//String fileName = "files/documine/Study Document 3 - Acute Eye IrritationCorrosion - Rabbits.pdf";
|
||||
//String fileName = "files/documine/VV-547521_Irritação_Ocular_in_Vivo.pdf";
|
||||
//String fileName = "files/documine/SOLICITA_VICTRATO-GOLD-II_Item 21_Mutacao_Genica.pdf";
|
||||
//String fileName = "files/new/UTT-Books-53.pdf";
|
||||
//String fileName = "files/documine/A21924A - Acute Oral Toxicity - Rats.pdf";
|
||||
//String fileName = "files/documine/A16361B - Acute Dermal Irritation Toxicity Study in Rabbits.pdf";
|
||||
//String fileName = "files/documine/ITEM 20_Sensibilização cutânea.pdf";
|
||||
//String fileName = "files/documine/VV-547523_LLNA.pdf";
|
||||
//String fileName = "files/new/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
|
||||
@ -96,7 +98,7 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
||||
//String fileName = "files/new/abschlussarbeiten-template-institut-fur-informatik-padagogische-hochschule-karlsruhe.pdf";
|
||||
//String fileName = "files/new/kaust-official-thesis-template.pdf";
|
||||
//String fileName = "files/new/$100m Offers.pdf";
|
||||
//String fileName = "files/new/18-Curacron_ToxicidadeOcularInVitro.pdf";
|
||||
String fileName = "files/new/18-Curacron_ToxicidadeOcularInVitro.pdf";
|
||||
//String fileName = "files/new/mistitled_outlines_example.pdf";
|
||||
//String fileName = "files/bdr/Plenarprotokoll 1 (keine Druchsache!) (1) 1.pdf";
|
||||
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user