RED-7074: Design Subsection section tree structure algorithm
* improved merging of headlines as well as splitting logic so that more headlines are detected correctly
This commit is contained in:
parent
2fcaeb3d8c
commit
1856fed640
@ -2,6 +2,7 @@ package com.knecon.fforesight.service.layoutparser.processor;
|
|||||||
|
|
||||||
import static java.lang.String.format;
|
import static java.lang.String.format;
|
||||||
|
|
||||||
|
import java.awt.geom.Point2D;
|
||||||
import java.awt.geom.Rectangle2D;
|
import java.awt.geom.Rectangle2D;
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
@ -210,7 +211,7 @@ public class LayoutParsingPipeline {
|
|||||||
|
|
||||||
private String buildSemanticNodeCountMessage(int numberOfPages, Map<NodeType, Long> semanticNodeCounts) {
|
private String buildSemanticNodeCountMessage(int numberOfPages, Map<NodeType, Long> semanticNodeCounts) {
|
||||||
|
|
||||||
return String.format("%d pages with %d sections, %d headlines, %d paragraphs, %d tables with %d cells, %d headers, and %d footers parsed",
|
return format("%d pages with %d sections, %d headlines, %d paragraphs, %d tables with %d cells, %d headers, and %d footers parsed",
|
||||||
numberOfPages,
|
numberOfPages,
|
||||||
semanticNodeCounts.get(NodeType.SECTION) == null ? 0 : semanticNodeCounts.get(NodeType.SECTION),
|
semanticNodeCounts.get(NodeType.SECTION) == null ? 0 : semanticNodeCounts.get(NodeType.SECTION),
|
||||||
semanticNodeCounts.get(NodeType.HEADLINE) == null ? 0 : semanticNodeCounts.get(NodeType.HEADLINE),
|
semanticNodeCounts.get(NodeType.HEADLINE) == null ? 0 : semanticNodeCounts.get(NodeType.HEADLINE),
|
||||||
@ -239,6 +240,7 @@ public class LayoutParsingPipeline {
|
|||||||
Map<Integer, List<ClassifiedImage>> signatures = visualLayoutParsingAdapter.buildExtractedSignaturesPerPage(visualLayoutParsingResponse);
|
Map<Integer, List<ClassifiedImage>> signatures = visualLayoutParsingAdapter.buildExtractedSignaturesPerPage(visualLayoutParsingResponse);
|
||||||
ClassificationDocument classificationDocument = new ClassificationDocument();
|
ClassificationDocument classificationDocument = new ClassificationDocument();
|
||||||
List<ClassificationPage> classificationPages = new ArrayList<>();
|
List<ClassificationPage> classificationPages = new ArrayList<>();
|
||||||
|
OutlineObject lastProcessedOutlineObject = null;
|
||||||
|
|
||||||
// parsing the structure elements could be useful as well
|
// parsing the structure elements could be useful as well
|
||||||
classificationDocument.setOutlineObjectTree(outlineExtractorService.getOutlineObjectTree(originDocument));
|
classificationDocument.setOutlineObjectTree(outlineExtractorService.getOutlineObjectTree(originDocument));
|
||||||
@ -307,11 +309,16 @@ public class LayoutParsingPipeline {
|
|||||||
classificationPage.setPageWidth(cropbox.getWidth());
|
classificationPage.setPageWidth(cropbox.getWidth());
|
||||||
classificationPage.setPageHeight(cropbox.getHeight());
|
classificationPage.setPageHeight(cropbox.getHeight());
|
||||||
|
|
||||||
List<OutlineObject> outlineObjects = classificationDocument.getOutlineObjectTree().getOutlineObjectsPerPage()
|
List<OutlineObject> outlineObjects = classificationDocument.getOutlineObjectTree().getOutlineObjectsPerPage().getOrDefault(pageNumber - 1, new ArrayList<>());
|
||||||
.get(pageNumber - 1);
|
|
||||||
if (outlineObjects != null) {
|
OutlineObject notFoundOutlineObject = null;
|
||||||
|
if (lastProcessedOutlineObject != null && !lastProcessedOutlineObject.isFound()) {
|
||||||
|
lastProcessedOutlineObject.setPoint(new Point2D.Float(0, cropbox.getHeight()));
|
||||||
|
notFoundOutlineObject = lastProcessedOutlineObject;
|
||||||
|
}
|
||||||
|
if (!outlineObjects.isEmpty()) {
|
||||||
classificationPage.setOutlineObjects(outlineObjects);
|
classificationPage.setOutlineObjects(outlineObjects);
|
||||||
blockificationPostprocessingService.sanitizeOutlineBlocks(classificationPage);
|
lastProcessedOutlineObject = blockificationPostprocessingService.sanitizeOutlineBlocks(classificationPage, notFoundOutlineObject);
|
||||||
}
|
}
|
||||||
|
|
||||||
// MarkedContent needs to be converted at this point, otherwise it leads to GC Problems in Pdfbox.
|
// MarkedContent needs to be converted at this point, otherwise it leads to GC Problems in Pdfbox.
|
||||||
|
|||||||
@ -3,6 +3,7 @@ package com.knecon.fforesight.service.layoutparser.processor.model;
|
|||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObjectTree;
|
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObjectTree;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContents;
|
import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContents;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
|
||||||
|
|||||||
@ -5,16 +5,27 @@ import java.awt.geom.Point2D;
|
|||||||
import lombok.AllArgsConstructor;
|
import lombok.AllArgsConstructor;
|
||||||
import lombok.Data;
|
import lombok.Data;
|
||||||
import lombok.NoArgsConstructor;
|
import lombok.NoArgsConstructor;
|
||||||
|
import lombok.RequiredArgsConstructor;
|
||||||
|
|
||||||
@Data
|
@Data
|
||||||
@NoArgsConstructor
|
@RequiredArgsConstructor
|
||||||
@AllArgsConstructor
|
@AllArgsConstructor
|
||||||
public class OutlineObject {
|
public class OutlineObject {
|
||||||
|
|
||||||
private String title;
|
private final String title;
|
||||||
private int pageNumber;
|
private final int pageNumber;
|
||||||
private Point2D point;
|
private Point2D point;
|
||||||
private int treeDepth;
|
private final int treeDepth;
|
||||||
|
|
||||||
|
private boolean found = false;
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Creates an outline entry whose target point on the page is already known.
 * Delegates to the three-argument constructor (title, page number, depth) and
 * then stores the given point.
 * NOTE(review): the three-argument constructor is presumably the one Lombok
 * generates for the final fields via @RequiredArgsConstructor - confirm.
 *
 * @param title      outline title
 * @param pageNumber page number the outline entry refers to
 * @param point2D    target point of the outline entry
 * @param depth      depth of the entry inside the outline tree
 */
public OutlineObject(String title, int pageNumber, Point2D point2D, int depth) {
|
||||||
|
|
||||||
|
this(title, pageNumber, depth);
|
||||||
|
this.point = point2D;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String toString() {
|
public String toString() {
|
||||||
|
|||||||
@ -80,7 +80,10 @@ public class TextPageBlock extends AbstractPageBlock {
|
|||||||
|
|
||||||
public static TextPageBlock merge(List<TextPageBlock> textBlocksToMerge) {
|
public static TextPageBlock merge(List<TextPageBlock> textBlocksToMerge) {
|
||||||
|
|
||||||
List<TextPositionSequence> sequences = textBlocksToMerge.stream().map(TextPageBlock::getSequences).flatMap(java.util.Collection::stream).toList();
|
List<TextPositionSequence> sequences = textBlocksToMerge.stream()
|
||||||
|
.map(TextPageBlock::getSequences)
|
||||||
|
.flatMap(java.util.Collection::stream)
|
||||||
|
.toList();
|
||||||
sequences = new ArrayList<>(sequences);
|
sequences = new ArrayList<>(sequences);
|
||||||
return fromTextPositionSequences(sequences);
|
return fromTextPositionSequences(sequences);
|
||||||
}
|
}
|
||||||
@ -126,11 +129,12 @@ public class TextPageBlock extends AbstractPageBlock {
|
|||||||
textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest());
|
textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest());
|
||||||
}
|
}
|
||||||
|
|
||||||
if (textBlock != null && textBlock.getSequences() != null && textBlock.getSequences()
|
if (textBlock != null
|
||||||
|
&& textBlock.getSequences() != null
|
||||||
|
&& textBlock.getSequences()
|
||||||
.stream()
|
.stream()
|
||||||
.map(t -> DoubleComparisons.round(t.getMinYDirAdj(), 3))
|
.map(t -> DoubleComparisons.round(t.getMinYDirAdj(), 3))
|
||||||
.collect(toSet())
|
.collect(toSet()).size() == 1) {
|
||||||
.size() == 1) {
|
|
||||||
textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj));
|
textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj));
|
||||||
}
|
}
|
||||||
return textBlock;
|
return textBlock;
|
||||||
@ -290,18 +294,7 @@ public class TextPageBlock extends AbstractPageBlock {
|
|||||||
|
|
||||||
public void add(TextPositionSequence r) {
|
public void add(TextPositionSequence r) {
|
||||||
|
|
||||||
if (r.getMinXDirAdj() < minX) {
|
setCoordinates(r);
|
||||||
minX = r.getMinXDirAdj();
|
|
||||||
}
|
|
||||||
if (r.getMaxXDirAdj() > maxX) {
|
|
||||||
maxX = r.getMaxXDirAdj();
|
|
||||||
}
|
|
||||||
if (r.getMinYDirAdj() < minY) {
|
|
||||||
minY = r.getMinYDirAdj();
|
|
||||||
}
|
|
||||||
if (r.getMaxYDirAdj() > maxY) {
|
|
||||||
maxY = r.getMaxYDirAdj();
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -317,6 +310,33 @@ public class TextPageBlock extends AbstractPageBlock {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void resize() {
|
||||||
|
|
||||||
|
minX = Float.MAX_VALUE;
|
||||||
|
minY = Float.MAX_VALUE;
|
||||||
|
maxX = Float.MIN_VALUE;
|
||||||
|
maxY = Float.MIN_VALUE;
|
||||||
|
sequences.forEach(this::setCoordinates);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private void setCoordinates(TextPositionSequence sequence) {
|
||||||
|
|
||||||
|
if (sequence.getMinXDirAdj() < minX) {
|
||||||
|
minX = sequence.getMinXDirAdj();
|
||||||
|
}
|
||||||
|
if (sequence.getMaxXDirAdj() > maxX) {
|
||||||
|
maxX = sequence.getMaxXDirAdj();
|
||||||
|
}
|
||||||
|
if (sequence.getMinYDirAdj() < minY) {
|
||||||
|
minY = sequence.getMinYDirAdj();
|
||||||
|
}
|
||||||
|
if (sequence.getMaxYDirAdj() > maxY) {
|
||||||
|
maxY = sequence.getMaxYDirAdj();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
public void set(float x1, float y1, float x2, float y2) {
|
public void set(float x1, float y1, float x2, float y2) {
|
||||||
|
|
||||||
this.minX = Math.min(x1, x2);
|
this.minX = Math.min(x1, x2);
|
||||||
|
|||||||
@ -37,28 +37,110 @@ public class BlockificationPostprocessingService {
|
|||||||
.collect(RectangleTransformations.collectBBox());
|
.collect(RectangleTransformations.collectBBox());
|
||||||
|
|
||||||
|
|
||||||
public void sanitizeOutlineBlocks(ClassificationPage classificationPage) {
|
public OutlineObject sanitizeOutlineBlocks(ClassificationPage classificationPage, OutlineObject notFoundOutlineObject) {
|
||||||
|
|
||||||
List<OutlineObject> outlineObjects = classificationPage.getOutlineObjects();
|
List<OutlineObject> outlineObjects = classificationPage.getOutlineObjects();
|
||||||
|
|
||||||
List<TextPageBlock> textBlocks = classificationPage.getTextBlocks()
|
if (getTextPageBlocks(classificationPage).isEmpty() || outlineObjects.isEmpty()) {
|
||||||
.stream()
|
return null;
|
||||||
.filter(block -> block instanceof TextPageBlock)
|
|
||||||
.toList()
|
|
||||||
.stream()
|
|
||||||
.map(block -> (TextPageBlock) block)
|
|
||||||
.toList();
|
|
||||||
|
|
||||||
if (textBlocks.isEmpty() || outlineObjects.isEmpty()) {
|
|
||||||
return;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
float pageHeight = classificationPage.getPageHeight();
|
float pageHeight = classificationPage.getPageHeight();
|
||||||
|
|
||||||
for (OutlineObject outlineObject : outlineObjects) {
|
ListIterator<OutlineObject> outlineObjectListIterator = outlineObjects.listIterator();
|
||||||
|
|
||||||
OutlineProcessionContext context = new OutlineProcessionContext(outlineObject);
|
if (notFoundOutlineObject != null) {
|
||||||
|
OutlineProcessionContext notFoundOutlineObjectProcessionContext = new OutlineProcessionContext(notFoundOutlineObject);
|
||||||
|
processTextBlocks(getTextPageBlocks(classificationPage), pageHeight, notFoundOutlineObjectProcessionContext);
|
||||||
|
|
||||||
|
OutlineObject firstOutlineObject = null;
|
||||||
|
OutlineProcessionContext firstOutlineObjectProcessionContext = null;
|
||||||
|
if (outlineObjectListIterator.hasNext()) {
|
||||||
|
firstOutlineObject = outlineObjectListIterator.next();
|
||||||
|
firstOutlineObjectProcessionContext = new OutlineProcessionContext(firstOutlineObject);
|
||||||
|
processTextBlocks(getTextPageBlocks(classificationPage), pageHeight, firstOutlineObjectProcessionContext);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!contextsOverlap(notFoundOutlineObjectProcessionContext, firstOutlineObjectProcessionContext)) {
|
||||||
|
notFoundOutlineObject.setFound(selectMatch(classificationPage, notFoundOutlineObjectProcessionContext));
|
||||||
|
}
|
||||||
|
if (firstOutlineObject != null) {
|
||||||
|
// re-create the context for the updated blocks
|
||||||
|
firstOutlineObjectProcessionContext = new OutlineProcessionContext(firstOutlineObject);
|
||||||
|
processTextBlocks(getTextPageBlocks(classificationPage), pageHeight, firstOutlineObjectProcessionContext);
|
||||||
|
firstOutlineObject.setFound(selectMatch(classificationPage, firstOutlineObjectProcessionContext));
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
outlineObjectListIterator.forEachRemaining(outlineObject -> {
|
||||||
|
OutlineProcessionContext outlineObjectProcessionContext = new OutlineProcessionContext(outlineObject);
|
||||||
|
processTextBlocks(getTextPageBlocks(classificationPage), pageHeight, outlineObjectProcessionContext);
|
||||||
|
outlineObject.setFound(selectMatch(classificationPage, outlineObjectProcessionContext));
|
||||||
|
});
|
||||||
|
|
||||||
|
if (!outlineObjects.isEmpty()) {
|
||||||
|
return outlineObjects.get(outlineObjects.size() - 1);
|
||||||
|
} else {
|
||||||
|
return notFoundOutlineObject;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static List<TextPageBlock> getTextPageBlocks(ClassificationPage classificationPage) {
|
||||||
|
|
||||||
|
return classificationPage.getTextBlocks()
|
||||||
|
.stream()
|
||||||
|
.filter(block -> block instanceof TextPageBlock)
|
||||||
|
.map(block -> (TextPageBlock) block)
|
||||||
|
.toList();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private boolean contextsOverlap(OutlineProcessionContext notFoundOutlineObjectProcessionContext, OutlineProcessionContext firstOutlineObjectProcessionContext) {
|
||||||
|
|
||||||
|
if (firstOutlineObjectProcessionContext == null) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
String notFoundTitle = notFoundOutlineObjectProcessionContext.getOutlineObject().getTitle();
|
||||||
|
String firstTitle = firstOutlineObjectProcessionContext.getOutlineObject().getTitle();
|
||||||
|
|
||||||
|
if (!firstTitle.startsWith(notFoundTitle)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
var blocksOfNotFoundOutline = getAllMatchingBlocks(notFoundOutlineObjectProcessionContext);
|
||||||
|
var blocksOfFirstOutline = getAllMatchingBlocks(firstOutlineObjectProcessionContext);
|
||||||
|
|
||||||
|
double maxYFirst = blocksOfFirstOutline.stream()
|
||||||
|
.mapToDouble(TextPageBlock::getPdfMaxY)
|
||||||
|
.max()
|
||||||
|
.orElse(Double.NEGATIVE_INFINITY);
|
||||||
|
|
||||||
|
return blocksOfNotFoundOutline.stream()
|
||||||
|
.mapToDouble(TextPageBlock::getPdfMaxY)
|
||||||
|
.anyMatch(y -> y >= maxYFirst);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private List<TextPageBlock> getAllMatchingBlocks(OutlineProcessionContext context) {
|
||||||
|
|
||||||
|
List<TextPageBlock> blocks = new ArrayList<>();
|
||||||
|
if (context.getDirectMatch() != null) {
|
||||||
|
blocks.add(context.getDirectMatch());
|
||||||
|
}
|
||||||
|
if (context.getSplitCandidate() != null) {
|
||||||
|
blocks.add(context.getSplitCandidate());
|
||||||
|
}
|
||||||
|
blocks.addAll(context.getMergeCandidates());
|
||||||
|
return blocks;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private void processTextBlocks(List<TextPageBlock> textBlocks, float pageHeight, OutlineProcessionContext context) {
|
||||||
|
|
||||||
|
OutlineObject outlineObject = context.getOutlineObject();
|
||||||
ListIterator<TextPageBlock> iterator = textBlocks.listIterator();
|
ListIterator<TextPageBlock> iterator = textBlocks.listIterator();
|
||||||
while (iterator.hasNext()) {
|
while (iterator.hasNext()) {
|
||||||
TextPageBlock pageBlock = iterator.next();
|
TextPageBlock pageBlock = iterator.next();
|
||||||
@ -74,13 +156,10 @@ public class BlockificationPostprocessingService {
|
|||||||
TextPageBlock pageBlock = iterator.next();
|
TextPageBlock pageBlock = iterator.next();
|
||||||
earlyStop = processOutlineForTextBlock(pageBlock, context);
|
earlyStop = processOutlineForTextBlock(pageBlock, context);
|
||||||
}
|
}
|
||||||
selectMatch(classificationPage, context);
|
|
||||||
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private void selectMatch(ClassificationPage classificationPage, OutlineProcessionContext context) {
|
private boolean selectMatch(ClassificationPage classificationPage, OutlineProcessionContext context) {
|
||||||
|
|
||||||
OutlineObject outlineObject = context.outlineObject;
|
OutlineObject outlineObject = context.outlineObject;
|
||||||
TextPageBlock directMatch = context.directMatch;
|
TextPageBlock directMatch = context.directMatch;
|
||||||
@ -122,28 +201,39 @@ public class BlockificationPostprocessingService {
|
|||||||
double minDistance = Math.min(distanceToDirectMatch, Math.min(distanceToSplitCandidate, distanceToBestMergeCandidates));
|
double minDistance = Math.min(distanceToDirectMatch, Math.min(distanceToSplitCandidate, distanceToBestMergeCandidates));
|
||||||
|
|
||||||
if (minDistance == Double.MAX_VALUE) {
|
if (minDistance == Double.MAX_VALUE) {
|
||||||
return;
|
return false;
|
||||||
}
|
}
|
||||||
if (minDistance == distanceToDirectMatch) {
|
if (minDistance == distanceToDirectMatch) {
|
||||||
directMatch.setClassification(headlineType);
|
directMatch.setClassification(headlineType);
|
||||||
} else if (minDistance == distanceToSplitCandidate) {
|
} else if (minDistance == distanceToSplitCandidate) {
|
||||||
List<TextPageBlock> others = splitBlock(classificationPage, splitCandidate, context.sectionIdentifier + outlineObject.getTitle());
|
List<TextPageBlock> others = splitBlock(classificationPage, splitCandidate, context.sectionIdentifier, outlineObject.getTitle());
|
||||||
splitCandidate.setClassification(headlineType);
|
splitCandidate.setClassification(headlineType);
|
||||||
others.forEach(other -> other.setClassification(null));
|
others.forEach(other -> other.setClassification(null));
|
||||||
} else {
|
} else {
|
||||||
var merged = mergeBlocks(classificationPage, bestMergeCandidateCombination);
|
var merged = mergeBlocks(classificationPage, bestMergeCandidateCombination);
|
||||||
merged.setClassification(headlineType);
|
merged.setClassification(headlineType);
|
||||||
}
|
}
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private List<TextPageBlock> splitBlock(ClassificationPage classificationPage, TextPageBlock blockToSplit, String text) {
|
private List<TextPageBlock> splitBlock(ClassificationPage classificationPage, TextPageBlock blockToSplit, SectionIdentifier sectionIdentifier, String title) {
|
||||||
|
|
||||||
List<TextPageBlock> otherBlocks = new ArrayList<>();
|
List<TextPageBlock> otherBlocks = new ArrayList<>();
|
||||||
int blockToSplitIdx = classificationPage.getTextBlocks().indexOf(blockToSplit);
|
int blockToSplitIdx = classificationPage.getTextBlocks().indexOf(blockToSplit);
|
||||||
WordSequenceResult wordSequenceResult = findWordSequence(blockToSplit.getSequences(), text);
|
|
||||||
|
String headline = title;
|
||||||
|
if (!sectionIdentifier.getFormat().equals(SectionIdentifier.Format.EMPTY) && !title.startsWith(sectionIdentifier.getIdentifierString())) {
|
||||||
|
headline = sectionIdentifier + headline;
|
||||||
|
}
|
||||||
|
|
||||||
|
WordSequenceResult wordSequenceResult = findWordSequence(blockToSplit.getSequences(), headline);
|
||||||
|
if (wordSequenceResult.inSequence.isEmpty()) {
|
||||||
|
wordSequenceResult = findWordSequence(blockToSplit.getSequences(), title);
|
||||||
|
}
|
||||||
|
|
||||||
blockToSplit.setSequences(wordSequenceResult.inSequence);
|
blockToSplit.setSequences(wordSequenceResult.inSequence);
|
||||||
|
blockToSplit.resize();
|
||||||
|
|
||||||
if (!wordSequenceResult.preSequence.isEmpty()) {
|
if (!wordSequenceResult.preSequence.isEmpty()) {
|
||||||
TextPageBlock block = buildTextBlock(wordSequenceResult.preSequence, 0);
|
TextPageBlock block = buildTextBlock(wordSequenceResult.preSequence, 0);
|
||||||
@ -301,6 +391,7 @@ public class BlockificationPostprocessingService {
|
|||||||
|
|
||||||
assert firstBlock != null;
|
assert firstBlock != null;
|
||||||
firstBlock.setToDuplicate(false);
|
firstBlock.setToDuplicate(false);
|
||||||
|
firstBlock.resize();
|
||||||
classificationPage.getTextBlocks().removeAll(mergedBlocks);
|
classificationPage.getTextBlocks().removeAll(mergedBlocks);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -378,13 +469,13 @@ public class BlockificationPostprocessingService {
|
|||||||
if (blockTextContainsOutlineTitle) {
|
if (blockTextContainsOutlineTitle) {
|
||||||
SectionIdentifier sectionIdentifier = SectionIdentifier.fromSearchText(blockText);
|
SectionIdentifier sectionIdentifier = SectionIdentifier.fromSearchText(blockText);
|
||||||
|
|
||||||
if (sectionIdentifier.getFormat() != SectionIdentifier.Format.EMPTY) {
|
if (sectionIdentifier.getFormat() != SectionIdentifier.Format.EMPTY && !outlineTitle.startsWith(sectionIdentifier.getIdentifierString())) {
|
||||||
|
|
||||||
if (blockText.startsWith(sectionIdentifier.getIdentifierString()) && blockText.endsWith(outlineTitle) && context.directMatch == null) {
|
if (blockText.startsWith(sectionIdentifier.getIdentifierString()) && blockText.endsWith(outlineTitle) && context.directMatch == null) {
|
||||||
context.directMatch = pageBlock;
|
context.directMatch = pageBlock;
|
||||||
return true;
|
return true;
|
||||||
} else if (context.splitCandidate == null) {
|
} else if (context.splitCandidate == null) {
|
||||||
context.sectionIdentifier = sectionIdentifier.getIdentifierString();
|
context.sectionIdentifier = sectionIdentifier;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (context.splitCandidate == null) {
|
if (context.splitCandidate == null) {
|
||||||
@ -408,7 +499,7 @@ public class BlockificationPostprocessingService {
|
|||||||
private OutlineObject outlineObject;
|
private OutlineObject outlineObject;
|
||||||
private List<TextPageBlock> mergeCandidates;
|
private List<TextPageBlock> mergeCandidates;
|
||||||
private TextPageBlock splitCandidate;
|
private TextPageBlock splitCandidate;
|
||||||
private String sectionIdentifier;
|
private SectionIdentifier sectionIdentifier;
|
||||||
|
|
||||||
|
|
||||||
public OutlineProcessionContext(OutlineObject outlineObject) {
|
public OutlineProcessionContext(OutlineObject outlineObject) {
|
||||||
@ -417,7 +508,7 @@ public class BlockificationPostprocessingService {
|
|||||||
this.directMatch = null;
|
this.directMatch = null;
|
||||||
this.mergeCandidates = new ArrayList<>();
|
this.mergeCandidates = new ArrayList<>();
|
||||||
this.splitCandidate = null;
|
this.splitCandidate = null;
|
||||||
this.sectionIdentifier = "";
|
this.sectionIdentifier = SectionIdentifier.empty();
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -58,8 +58,10 @@ public class DocstrumBlockificationService {
|
|||||||
zones.forEach(zone -> {
|
zones.forEach(zone -> {
|
||||||
|
|
||||||
List<TextPositionSequence> textPositionSequences = new ArrayList<>();
|
List<TextPositionSequence> textPositionSequences = new ArrayList<>();
|
||||||
zone.getLines().forEach(line -> {
|
zone.getLines()
|
||||||
line.getWords().forEach(word -> {
|
.forEach(line -> {
|
||||||
|
line.getWords()
|
||||||
|
.forEach(word -> {
|
||||||
textPositionSequences.add(new TextPositionSequence(word.getTextPositions(), word.getPage()));
|
textPositionSequences.add(new TextPositionSequence(word.getTextPositions(), word.getPage()));
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
@ -90,7 +92,7 @@ public class DocstrumBlockificationService {
|
|||||||
while (itty.hasNext()) {
|
while (itty.hasNext()) {
|
||||||
|
|
||||||
AbstractPageBlock block = itty.next();
|
AbstractPageBlock block = itty.next();
|
||||||
if (block instanceof TablePageBlock || previous.isHeadline()) {
|
if (block instanceof TablePageBlock) {
|
||||||
previous = new TextPageBlock();
|
previous = new TextPageBlock();
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@ -98,11 +100,21 @@ public class DocstrumBlockificationService {
|
|||||||
|
|
||||||
if (previous != null && !previous.getSequences().isEmpty()) {
|
if (previous != null && !previous.getSequences().isEmpty()) {
|
||||||
|
|
||||||
if (current.getDir() != previous.getDir() || current.isHeadline()) {
|
if (current.getDir() != previous.getDir()) {
|
||||||
previous = current;
|
previous = current;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (current.isHeadline() || previous.isHeadline()) {
|
||||||
|
if (intersectsYWithPreviousHavingMaxOneLine(previous, current, page)) {
|
||||||
|
previous = combineBlocksAndResetIterator(previous, current, itty, false);
|
||||||
|
} else {
|
||||||
|
previous = current;
|
||||||
|
}
|
||||||
|
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
if (areTheOnlyTwoBlocksOnHeightsWithBothMoreThanTwoLines(previous, current, page)) {
|
if (areTheOnlyTwoBlocksOnHeightsWithBothMoreThanTwoLines(previous, current, page)) {
|
||||||
previous = combineBlocksAndResetIterator(previous, current, itty, true);
|
previous = combineBlocksAndResetIterator(previous, current, itty, true);
|
||||||
continue;
|
continue;
|
||||||
@ -149,6 +161,13 @@ public class DocstrumBlockificationService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private boolean intersectsYWithPreviousHavingMaxOneLine(TextPageBlock previous, TextPageBlock current, ClassificationPage page) {
|
||||||
|
|
||||||
|
return previous.intersectsY(current)//(Math.abs(previous.getMaxY() - current.getMaxY()) < THRESHOLD && Math.abs(previous.getMinY() - current.getMinY()) < THRESHOLD) //
|
||||||
|
&& (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
private boolean areTheOnlyTwoBlocksOnHeightsWithBothMoreThanTwoLines(TextPageBlock previous, TextPageBlock current, ClassificationPage page) {
|
private boolean areTheOnlyTwoBlocksOnHeightsWithBothMoreThanTwoLines(TextPageBlock previous, TextPageBlock current, ClassificationPage page) {
|
||||||
|
|
||||||
return previous.getNumberOfLines() >= 2 && current.getNumberOfLines() >= 2 //
|
return previous.getNumberOfLines() >= 2 && current.getNumberOfLines() >= 2 //
|
||||||
@ -248,7 +267,6 @@ public class DocstrumBlockificationService {
|
|||||||
|
|
||||||
TextPageBlock inner = (TextPageBlock) abstractPageBlock;
|
TextPageBlock inner = (TextPageBlock) abstractPageBlock;
|
||||||
|
|
||||||
|
|
||||||
if (current.getDir() == inner.getDir() && current.almostIntersects(inner, yThreshold, xThreshold)) {
|
if (current.getDir() == inner.getDir() && current.almostIntersects(inner, yThreshold, xThreshold)) {
|
||||||
|
|
||||||
boolean toDuplicate = current.isToDuplicate() || inner.isToDuplicate();
|
boolean toDuplicate = current.isToDuplicate() || inner.isToDuplicate();
|
||||||
@ -371,7 +389,12 @@ public class DocstrumBlockificationService {
|
|||||||
textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest());
|
textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest());
|
||||||
}
|
}
|
||||||
|
|
||||||
if (textBlock != null && textBlock.getSequences() != null && textBlock.getSequences().stream().map(t -> round(t.getMinYDirAdj(), 3)).collect(toSet()).size() == 1) {
|
if (textBlock != null
|
||||||
|
&& textBlock.getSequences() != null
|
||||||
|
&& textBlock.getSequences()
|
||||||
|
.stream()
|
||||||
|
.map(t -> round(t.getMinYDirAdj(), 3))
|
||||||
|
.collect(toSet()).size() == 1) {
|
||||||
textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj));
|
textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj));
|
||||||
}
|
}
|
||||||
return textBlock;
|
return textBlock;
|
||||||
@ -386,14 +409,8 @@ public class DocstrumBlockificationService {
|
|||||||
List<Ruling> horizontalRulingLines,
|
List<Ruling> horizontalRulingLines,
|
||||||
List<Ruling> verticalRulingLines) {
|
List<Ruling> verticalRulingLines) {
|
||||||
|
|
||||||
return isSplitByRuling(maxX,
|
return isSplitByRuling(maxX, minY, word.getMinXDirAdj(), word.getMinYDirAdj(), verticalRulingLines, word.getDir().getDegrees(), word.getPageWidth(), word.getPageHeight())
|
||||||
minY,
|
//
|
||||||
word.getMinXDirAdj(),
|
|
||||||
word.getMinYDirAdj(),
|
|
||||||
verticalRulingLines,
|
|
||||||
word.getDir().getDegrees(),
|
|
||||||
word.getPageWidth(),
|
|
||||||
word.getPageHeight()) //
|
|
||||||
|| isSplitByRuling(minX,
|
|| isSplitByRuling(minX,
|
||||||
minY,
|
minY,
|
||||||
word.getMinXDirAdj(),
|
word.getMinXDirAdj(),
|
||||||
@ -401,7 +418,8 @@ public class DocstrumBlockificationService {
|
|||||||
horizontalRulingLines,
|
horizontalRulingLines,
|
||||||
word.getDir().getDegrees(),
|
word.getDir().getDegrees(),
|
||||||
word.getPageWidth(),
|
word.getPageWidth(),
|
||||||
word.getPageHeight()) //
|
word.getPageHeight())
|
||||||
|
//
|
||||||
|| isSplitByRuling(maxX,
|
|| isSplitByRuling(maxX,
|
||||||
minY,
|
minY,
|
||||||
word.getMinXDirAdj(),
|
word.getMinXDirAdj(),
|
||||||
@ -409,7 +427,8 @@ public class DocstrumBlockificationService {
|
|||||||
horizontalRulingLines,
|
horizontalRulingLines,
|
||||||
word.getDir().getDegrees(),
|
word.getDir().getDegrees(),
|
||||||
word.getPageWidth(),
|
word.getPageWidth(),
|
||||||
word.getPageHeight()) //
|
word.getPageHeight())
|
||||||
|
//
|
||||||
|| isSplitByRuling(minX,
|
|| isSplitByRuling(minX,
|
||||||
minY,
|
minY,
|
||||||
word.getMinXDirAdj(),
|
word.getMinXDirAdj(),
|
||||||
|
|||||||
@ -82,13 +82,15 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
|||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public void testViewerDocument() {
|
public void testViewerDocument() {
|
||||||
|
|
||||||
|
//String fileName = "files/documine/21_TiltPlus_MutacaoGenicaEmCelulasBacterianas.pdf";//fail here
|
||||||
|
|
||||||
//String fileName = "files/new/UTT-Books-53.pdf";
|
|
||||||
String fileName = "files/documine/A21924A - Acute Oral Toxicity - Rats.pdf";
|
|
||||||
|
|
||||||
|
|
||||||
//String fileName = "files/documine/SOLICITA_VICTRATO-GOLD-II_Item 21_Mutacao_Genica.pdf";
|
|
||||||
//String fileName = "files/documine/Study Document 1 - Acute Eye IrritationCorrosion - Rabbits.pdf";
|
//String fileName = "files/documine/Study Document 1 - Acute Eye IrritationCorrosion - Rabbits.pdf";
|
||||||
|
//String fileName = "files/documine/Study Document 3 - Acute Eye IrritationCorrosion - Rabbits.pdf";
|
||||||
|
//String fileName = "files/documine/VV-547521_Irritação_Ocular_in_Vivo.pdf";
|
||||||
|
//String fileName = "files/documine/SOLICITA_VICTRATO-GOLD-II_Item 21_Mutacao_Genica.pdf";
|
||||||
|
//String fileName = "files/new/UTT-Books-53.pdf";
|
||||||
|
//String fileName = "files/documine/A21924A - Acute Oral Toxicity - Rats.pdf";
|
||||||
|
//String fileName = "files/documine/A16361B - Acute Dermal Irritation Toxicity Study in Rabbits.pdf";
|
||||||
//String fileName = "files/documine/ITEM 20_Sensibilização cutânea.pdf";
|
//String fileName = "files/documine/ITEM 20_Sensibilização cutânea.pdf";
|
||||||
//String fileName = "files/documine/VV-547523_LLNA.pdf";
|
//String fileName = "files/documine/VV-547523_LLNA.pdf";
|
||||||
//String fileName = "files/new/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
|
//String fileName = "files/new/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
|
||||||
@ -96,7 +98,7 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
|||||||
//String fileName = "files/new/abschlussarbeiten-template-institut-fur-informatik-padagogische-hochschule-karlsruhe.pdf";
|
//String fileName = "files/new/abschlussarbeiten-template-institut-fur-informatik-padagogische-hochschule-karlsruhe.pdf";
|
||||||
//String fileName = "files/new/kaust-official-thesis-template.pdf";
|
//String fileName = "files/new/kaust-official-thesis-template.pdf";
|
||||||
//String fileName = "files/new/$100m Offers.pdf";
|
//String fileName = "files/new/$100m Offers.pdf";
|
||||||
//String fileName = "files/new/18-Curacron_ToxicidadeOcularInVitro.pdf";
|
String fileName = "files/new/18-Curacron_ToxicidadeOcularInVitro.pdf";
|
||||||
//String fileName = "files/new/mistitled_outlines_example.pdf";
|
//String fileName = "files/new/mistitled_outlines_example.pdf";
|
||||||
//String fileName = "files/bdr/Plenarprotokoll 1 (keine Druchsache!) (1) 1.pdf";
|
//String fileName = "files/bdr/Plenarprotokoll 1 (keine Druchsache!) (1) 1.pdf";
|
||||||
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
|
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user