RED-7074: Design Subsection section tree structure algorithm
* first draft: further implementations
This commit is contained in:
parent
17756f5977
commit
85e3cf0ecc
@ -297,13 +297,6 @@ public class LayoutParsingPipeline {
|
||||
case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, false);
|
||||
};
|
||||
|
||||
List<OutlineObject> outlineObjects = classificationDocument.getOutlineObjectTree().getOutlineObjectsPerPage()
|
||||
.get(pageNumber - 1);
|
||||
if (outlineObjects != null) {
|
||||
classificationPage.setOutlineObjects(outlineObjects);
|
||||
blockificationPostprocessingService.sanitizeOutlineBlocksWithKdTree(classificationPage);
|
||||
}
|
||||
|
||||
classificationPage.setCleanRulings(cleanRulings);
|
||||
classificationPage.setRotation(rotation);
|
||||
classificationPage.setLandscape(isLandscape);
|
||||
@ -311,6 +304,13 @@ public class LayoutParsingPipeline {
|
||||
classificationPage.setPageWidth(cropbox.getWidth());
|
||||
classificationPage.setPageHeight(cropbox.getHeight());
|
||||
|
||||
List<OutlineObject> outlineObjects = classificationDocument.getOutlineObjectTree().getOutlineObjectsPerPage()
|
||||
.get(pageNumber - 1);
|
||||
if (outlineObjects != null) {
|
||||
classificationPage.setOutlineObjects(outlineObjects);
|
||||
blockificationPostprocessingService.sanitizeOutlineBlocks(classificationPage);
|
||||
}
|
||||
|
||||
// MarkedContent needs to be converted at this point, otherwise it leads to GC Problems in Pdfbox.
|
||||
classificationPage.setMarkedContentBboxPerType(convertMarkedContents(stripper.getMarkedContents()));
|
||||
|
||||
|
||||
@ -52,9 +52,10 @@ public class OutlineValidationService {
|
||||
|
||||
private void addItemAtCorrectPosition(TableOfContents toc, TableOfContentItem tocItem, TableOfContentItem lastHeadlineFromOutlines) {
|
||||
|
||||
if(!tocItem.getChildren().isEmpty()) {
|
||||
|
||||
}
|
||||
//if (lastHeadlineFromOutlines == null || tocItem.g)
|
||||
//if(!tocItem.getChildren().isEmpty()) {
|
||||
//
|
||||
//}
|
||||
}
|
||||
|
||||
public TableOfContents createToC(List<TextPageBlock> headlines) {
|
||||
|
||||
@ -65,14 +65,14 @@ public class TextPageBlock extends AbstractPageBlock {
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
private float getPageHeight() {
|
||||
public float getPageHeight() {
|
||||
|
||||
return sequences.get(0).getPageHeight();
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
private float getPageWidth() {
|
||||
public float getPageWidth() {
|
||||
|
||||
return sequences.get(0).getPageWidth();
|
||||
}
|
||||
|
||||
@ -4,7 +4,10 @@ import static com.knecon.fforesight.service.layoutparser.processor.services.bloc
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.ListIterator;
|
||||
import java.util.Locale;
|
||||
import java.util.function.Function;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
@ -36,39 +39,9 @@ public class BlockificationPostprocessingService {
|
||||
.collect(RectangleTransformations.collectBBox());
|
||||
|
||||
|
||||
public void sanitizeOutlineBlocksWithKdTree(ClassificationPage classificationPage) {
|
||||
public void sanitizeOutlineBlocks(ClassificationPage classificationPage) {
|
||||
|
||||
List<OutlineObject> outlineObjects = classificationPage.getOutlineObjects();
|
||||
if (classificationPage.getTextBlocks().isEmpty() || outlineObjects.isEmpty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
KDTree<TextPageBlock> kdTree = createKdTree(classificationPage);
|
||||
|
||||
for (OutlineObject outlineObject : outlineObjects) {
|
||||
|
||||
KDIterator<TextPageBlock> successorIterator = kdTree.query(new double[]{ //
|
||||
0, //
|
||||
outlineObject.getPoint().getY() - BLOCK_TO_OUTLINE_DISTANCE_THRESHOLD //
|
||||
}, //
|
||||
new double[]{Double.MAX_VALUE, Double.MAX_VALUE});
|
||||
|
||||
boolean matchedExactly = false;
|
||||
|
||||
OutlineProcessionContext context = new OutlineProcessionContext(outlineObject);
|
||||
while (successorIterator.hasNext() && !matchedExactly) {
|
||||
TextPageBlock pageBlock = successorIterator.next().value();
|
||||
matchedExactly = processOutlineForTextBlock(pageBlock, context);
|
||||
}
|
||||
|
||||
if (!matchedExactly) {
|
||||
selectMatch(classificationPage, kdTree, context);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private static KDTree<TextPageBlock> createKdTree(ClassificationPage classificationPage) {
|
||||
|
||||
List<TextPageBlock> textBlocks = classificationPage.getTextBlocks()
|
||||
.stream()
|
||||
@ -78,97 +51,149 @@ public class BlockificationPostprocessingService {
|
||||
.map(block -> (TextPageBlock) block)
|
||||
.toList();
|
||||
|
||||
KDTree<TextPageBlock> kdTree = KDTree.create(2);
|
||||
textBlocks.forEach(block -> {
|
||||
var boundingBox = blockToBoundingBox.apply(block);
|
||||
kdTree.insert(new double[]{boundingBox.getMinX(), boundingBox.getMaxY()}, block);
|
||||
});
|
||||
return kdTree;
|
||||
if (textBlocks.isEmpty() || outlineObjects.isEmpty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
float pageHeight = classificationPage.getPageHeight();
|
||||
|
||||
for (OutlineObject outlineObject : outlineObjects) {
|
||||
|
||||
OutlineProcessionContext context = new OutlineProcessionContext(outlineObject);
|
||||
|
||||
ListIterator<TextPageBlock> iterator = textBlocks.listIterator();
|
||||
while (iterator.hasNext()) {
|
||||
TextPageBlock pageBlock = iterator.next();
|
||||
if (pageHeight - outlineObject.getPoint().getY() - BLOCK_TO_OUTLINE_DISTANCE_THRESHOLD <= pageBlock.getMaxY()) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (iterator.hasPrevious()) {
|
||||
iterator.previous();
|
||||
}
|
||||
boolean earlyStop = false;
|
||||
while (iterator.hasNext() && !earlyStop) {
|
||||
TextPageBlock pageBlock = iterator.next();
|
||||
earlyStop = processOutlineForTextBlock(pageBlock, context);
|
||||
}
|
||||
selectMatch(classificationPage, context);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void selectMatch(ClassificationPage classificationPage, KDTree<TextPageBlock> kdTree, OutlineProcessionContext context) {
|
||||
private void selectMatch(ClassificationPage classificationPage, OutlineProcessionContext context) {
|
||||
|
||||
OutlineObject outlineObject = context.outlineObject;
|
||||
TextPageBlock directMatch = context.directMatch;
|
||||
List<TextPageBlock> mergeCandidates = context.mergeCandidates;
|
||||
TextPageBlock splitCandidate = context.splitCandidate;
|
||||
PageBlockType headlineType = PageBlockType.getHeadlineType(outlineObject.getTreeDepth());
|
||||
|
||||
double distanceToDirectMatch = directMatch != null ? calculateDistance(outlineObject, directMatch) : Double.MAX_VALUE;
|
||||
double distanceToSplitCandidate = splitCandidate != null ? calculateDistance(outlineObject, splitCandidate) : Double.MAX_VALUE;
|
||||
|
||||
double distanceToBestMergeCandidates = Double.MAX_VALUE;
|
||||
List<TextPageBlock> bestMergeCandidateCombination = new ArrayList<>();
|
||||
if (!mergeCandidates.isEmpty()) {
|
||||
|
||||
List<TextPageBlock> allMergeCandidates = new ArrayList<>(mergeCandidates);
|
||||
addNeighborsOfCandidate(kdTree, mergeCandidates.get(0), allMergeCandidates);
|
||||
if (mergeCandidates.size() > 1) {
|
||||
addNeighborsOfCandidate(kdTree, mergeCandidates.get(mergeCandidates.size() - 1), allMergeCandidates);
|
||||
}
|
||||
allMergeCandidates = allMergeCandidates.stream()
|
||||
.distinct()
|
||||
.toList();
|
||||
// The commented-out code below also adds the blocks adjacent to the first and last merge candidate; this could be useful for some edge cases:
|
||||
//List<TextPageBlock> allMergeCandidates = new ArrayList<>(mergeCandidates);
|
||||
//addNeighborsOfCandidate(kdTree, mergeCandidates.get(0), allMergeCandidates);
|
||||
//if (mergeCandidates.size() > 1) {
|
||||
// addNeighborsOfCandidate(kdTree, mergeCandidates.get(mergeCandidates.size() - 1), allMergeCandidates);
|
||||
//}
|
||||
//allMergeCandidates = allMergeCandidates.stream()
|
||||
// .distinct()
|
||||
// .toList();
|
||||
|
||||
List<List<TextPageBlock>> combinations = findCombinations(outlineObject.getTitle(), mergeCandidates);
|
||||
|
||||
List<List<TextPageBlock>> combinations = findCombinations(outlineObject.getTitle(), allMergeCandidates);
|
||||
double maxDistance = Double.MAX_VALUE;
|
||||
List<TextPageBlock> bestCombination = new ArrayList<>();
|
||||
for (List<TextPageBlock> combination : combinations) {
|
||||
double averageDistance = combination.stream()
|
||||
.map(block -> calculateDistance(outlineObject, block))
|
||||
.mapToDouble(Double::doubleValue).average()
|
||||
.orElse(Double.MAX_VALUE);
|
||||
if (maxDistance > averageDistance) {
|
||||
maxDistance = averageDistance;
|
||||
bestCombination = combination;
|
||||
if (distanceToBestMergeCandidates > averageDistance) {
|
||||
distanceToBestMergeCandidates = averageDistance;
|
||||
bestMergeCandidateCombination = combination;
|
||||
}
|
||||
}
|
||||
var merged = mergeBlocks(classificationPage, bestCombination);
|
||||
}
|
||||
|
||||
double minDistance = Math.min(distanceToDirectMatch, Math.min(distanceToSplitCandidate, distanceToBestMergeCandidates));
|
||||
|
||||
if(minDistance == Double.MAX_VALUE) {
|
||||
return;
|
||||
}
|
||||
if (minDistance == distanceToDirectMatch) {
|
||||
directMatch.setClassification(headlineType);
|
||||
} else if (minDistance == distanceToSplitCandidate) {
|
||||
List<TextPageBlock> others = splitBlock(classificationPage, splitCandidate, outlineObject.getTitle());
|
||||
splitCandidate.setClassification(headlineType);
|
||||
others.forEach(other -> other.setClassification(headlineType));
|
||||
} else {
|
||||
var merged = mergeBlocks(classificationPage, bestMergeCandidateCombination);
|
||||
merged.setClassification(headlineType);
|
||||
}
|
||||
}
|
||||
|
||||
if (splitCandidate != null) {
|
||||
TextPageBlock other = splitBlock(classificationPage, splitCandidate, outlineObject.getTitle());
|
||||
splitCandidate.setClassification(headlineType);
|
||||
other.setClassification(headlineType);
|
||||
|
||||
private List<TextPageBlock> splitBlock(ClassificationPage classificationPage, TextPageBlock blockToSplit, String text) {
|
||||
|
||||
List<TextPageBlock> otherBlocks = new ArrayList<>();
|
||||
int blockToSplitIdx = classificationPage.getTextBlocks().indexOf(blockToSplit);
|
||||
WordSequenceResult wordSequenceResult = findWordSequence(blockToSplit.getSequences(), text);
|
||||
List<TextPositionSequence> postSequence = blockToSplit.getSequences();
|
||||
postSequence.removeAll(wordSequenceResult.inSequence);
|
||||
postSequence.removeAll(wordSequenceResult.preSequence);
|
||||
|
||||
blockToSplit.setSequences(wordSequenceResult.inSequence);
|
||||
|
||||
if (!wordSequenceResult.preSequence.isEmpty()) {
|
||||
TextPageBlock block = buildTextBlock(wordSequenceResult.preSequence, 0);
|
||||
classificationPage.getTextBlocks().add(blockToSplitIdx, block);
|
||||
otherBlocks.add(block);
|
||||
blockToSplitIdx++;
|
||||
}
|
||||
if (!postSequence.isEmpty()) {
|
||||
TextPageBlock block = buildTextBlock(postSequence, 0);
|
||||
classificationPage.getTextBlocks().add(blockToSplitIdx + 1, block);
|
||||
otherBlocks.add(block);
|
||||
}
|
||||
return otherBlocks;
|
||||
}
|
||||
|
||||
|
||||
private TextPageBlock splitBlock(ClassificationPage classificationPage, TextPageBlock blockToSplit, String text) {
|
||||
private static WordSequenceResult findWordSequence(List<TextPositionSequence> textPositionSequences, String text) {
|
||||
|
||||
List<TextPositionSequence> wordSequence = findWordSequence(blockToSplit.getSequences(), text);
|
||||
List<TextPositionSequence> remaining = blockToSplit.getSequences();
|
||||
remaining.removeAll(wordSequence);
|
||||
|
||||
blockToSplit.setSequences(wordSequence);
|
||||
|
||||
TextPageBlock other = buildTextBlock(remaining, 0);
|
||||
classificationPage.getTextBlocks().add(other);
|
||||
return other;
|
||||
}
|
||||
|
||||
|
||||
private static List<TextPositionSequence> findWordSequence(List<TextPositionSequence> textPositionSequences, String text) {
|
||||
|
||||
String target = text.replaceAll("\\s", "");
|
||||
String target = sanitizeString(text);
|
||||
List<TextPositionSequence> inSequence = new ArrayList<>();
|
||||
List<TextPositionSequence> preSequence = new ArrayList<>();
|
||||
StringBuilder currentSequence = new StringBuilder();
|
||||
|
||||
for (TextPositionSequence sequence : textPositionSequences) {
|
||||
|
||||
if (currentSequence.toString().equals(target)) {
|
||||
return inSequence;
|
||||
}
|
||||
currentSequence.append(sequence.toString());
|
||||
currentSequence.append(sanitizeString(sequence.toString()));
|
||||
inSequence.add(sequence);
|
||||
|
||||
if (currentSequence.length() > target.length()) {
|
||||
TextPositionSequence removed = inSequence.remove(0);
|
||||
currentSequence.delete(0, removed.toString().length());
|
||||
preSequence.add(removed);
|
||||
|
||||
while (currentSequence.length() > target.length()) {
|
||||
removed = inSequence.remove(0);
|
||||
currentSequence.delete(0, removed.toString().length());
|
||||
preSequence.add(removed);
|
||||
}
|
||||
}
|
||||
|
||||
if (currentSequence.toString().equals(target)) {
|
||||
return new WordSequenceResult(inSequence, preSequence);
|
||||
}
|
||||
}
|
||||
return new ArrayList<>();
|
||||
return new WordSequenceResult(new ArrayList<>(), new ArrayList<>());
|
||||
}
|
||||
|
||||
|
||||
@ -209,7 +234,7 @@ public class BlockificationPostprocessingService {
|
||||
|
||||
private static void findCombinations(String title, List<TextPageBlock> blocks, List<TextPageBlock> current, List<List<TextPageBlock>> combinations) {
|
||||
|
||||
String target = title.replaceAll("\\s", "");
|
||||
String target = sanitizeString(title);
|
||||
if (target.isEmpty()) {
|
||||
combinations.add(new ArrayList<>(current));
|
||||
return;
|
||||
@ -219,10 +244,10 @@ public class BlockificationPostprocessingService {
|
||||
.filter(block -> !current.contains(block))
|
||||
.toList();
|
||||
for (TextPageBlock block : remaining) {
|
||||
String prefix = block.getText().replaceAll("\\s", "");
|
||||
String prefix = sanitizeString(block.getText());
|
||||
if (target.startsWith(prefix)) {
|
||||
current.add(block);
|
||||
findCombinations(target.substring(prefix.length()), blocks, current, combinations);
|
||||
findCombinations(target.substring(prefix.length()), blocks.subList(blocks.indexOf(block) + 1, blocks.size()), current, combinations);
|
||||
current.remove(current.size() - 1);
|
||||
}
|
||||
}
|
||||
@ -232,7 +257,7 @@ public class BlockificationPostprocessingService {
|
||||
private double calculateDistance(OutlineObject outlineObject, TextPageBlock pageBlock) {
|
||||
|
||||
double deltaX = outlineObject.getPoint().getX() - pageBlock.getMinX();
|
||||
double deltaY = outlineObject.getPoint().getY() - pageBlock.getMinY();
|
||||
double deltaY = pageBlock.getPageHeight() - outlineObject.getPoint().getY() - pageBlock.getMinY();
|
||||
return Math.sqrt(deltaX * deltaX + deltaY * deltaY);
|
||||
}
|
||||
|
||||
@ -255,8 +280,8 @@ public class BlockificationPostprocessingService {
|
||||
private boolean processOutlineForTextBlock(TextPageBlock pageBlock, OutlineProcessionContext context) {
|
||||
|
||||
OutlineObject outlineObject = context.getOutlineObject();
|
||||
String blockText = pageBlock.getText();
|
||||
String outlineTitle = outlineObject.getTitle();
|
||||
String blockText = sanitizeString(pageBlock.getText());
|
||||
String outlineTitle = sanitizeString(outlineObject.getTitle());
|
||||
|
||||
boolean blockTextContainsOutlineTitle = blockText.contains(outlineTitle);
|
||||
boolean outlineTitleContainsBlockText = outlineTitle.contains(blockText);
|
||||
@ -265,8 +290,8 @@ public class BlockificationPostprocessingService {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (blockText.equals(outlineTitle)) {
|
||||
pageBlock.setClassification(PageBlockType.getHeadlineType(outlineObject.getTreeDepth()));
|
||||
if (blockText.equals(outlineTitle) && context.directMatch == null) {
|
||||
context.directMatch = pageBlock;
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -274,17 +299,27 @@ public class BlockificationPostprocessingService {
|
||||
context.mergeCandidates.add(pageBlock);
|
||||
}
|
||||
|
||||
if (blockTextContainsOutlineTitle && context.splitCandidate != null) {
|
||||
if (blockTextContainsOutlineTitle && context.splitCandidate == null) {
|
||||
context.splitCandidate = pageBlock;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
private static String sanitizeString(String text) {
|
||||
|
||||
return text.replaceAll("\\s", "").toLowerCase(Locale.ROOT);
|
||||
}
|
||||
|
||||
|
||||
private record WordSequenceResult(List<TextPositionSequence> inSequence, List<TextPositionSequence> preSequence) {
|
||||
|
||||
}
|
||||
|
||||
@Data
|
||||
private static class OutlineProcessionContext {
|
||||
|
||||
private TextPageBlock directMatch;
|
||||
private OutlineObject outlineObject;
|
||||
private List<TextPageBlock> mergeCandidates;
|
||||
private TextPageBlock splitCandidate;
|
||||
@ -293,10 +328,65 @@ public class BlockificationPostprocessingService {
|
||||
public OutlineProcessionContext(OutlineObject outlineObject) {
|
||||
|
||||
this.outlineObject = outlineObject;
|
||||
this.directMatch = null;
|
||||
this.mergeCandidates = new ArrayList<>();
|
||||
this.splitCandidate = null;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@Deprecated
|
||||
public void sanitizeOutlineBlocksWithKdTree(ClassificationPage classificationPage) {
|
||||
|
||||
List<OutlineObject> outlineObjects = classificationPage.getOutlineObjects();
|
||||
if (classificationPage.getTextBlocks().isEmpty() || outlineObjects.isEmpty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
KDTree<TextPageBlock> kdTree = createKdTree(classificationPage);
|
||||
float pageHeight = classificationPage.getPageHeight();
|
||||
|
||||
for (OutlineObject outlineObject : outlineObjects) {
|
||||
|
||||
// kd tree contains yx coordinates
|
||||
KDIterator<TextPageBlock> successorIterator = kdTree.query(new double[]{ //
|
||||
pageHeight - outlineObject.getPoint().getY() - BLOCK_TO_OUTLINE_DISTANCE_THRESHOLD, 0, //
|
||||
//
|
||||
}, //
|
||||
new double[]{Double.MAX_VALUE, Double.MAX_VALUE});
|
||||
|
||||
OutlineProcessionContext context = new OutlineProcessionContext(outlineObject);
|
||||
|
||||
boolean earlyStop = false;
|
||||
while (successorIterator.hasNext() && !earlyStop) {
|
||||
TextPageBlock pageBlock = successorIterator.next().value();
|
||||
earlyStop = processOutlineForTextBlock(pageBlock, context);
|
||||
processOutlineForTextBlock(pageBlock, context);
|
||||
}
|
||||
selectMatch(classificationPage, context);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Deprecated
|
||||
private static KDTree<TextPageBlock> createKdTree(ClassificationPage classificationPage) {
|
||||
|
||||
List<TextPageBlock> textBlocks = classificationPage.getTextBlocks()
|
||||
.stream()
|
||||
.filter(block -> block instanceof TextPageBlock)
|
||||
.toList()
|
||||
.stream()
|
||||
.map(block -> (TextPageBlock) block)
|
||||
.toList();
|
||||
|
||||
KDTree<TextPageBlock> kdTree = KDTree.create(2);
|
||||
// insert y first then x, use pdf max y so that the page height is subtracted so that the order is inverted
|
||||
textBlocks.forEach(block -> {
|
||||
//var boundingBox = blockToBoundingBox.apply(block);
|
||||
kdTree.insert(new double[]{block.getMinY(), block.getMinX()}, block);
|
||||
});
|
||||
return kdTree;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -19,6 +19,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageB
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@ -44,15 +45,17 @@ public class RedactManagerClassificationService {
|
||||
.map(tb -> (TextPageBlock) tb))
|
||||
.toList();
|
||||
|
||||
|
||||
HeadLineClassificationContext headLineClassificationContext = new HeadLineClassificationContext();
|
||||
for (ClassificationPage page : document.getPages()) {
|
||||
classifyPage(page, document, headlineFontSizes);
|
||||
classifyPage(page, document, headlineFontSizes, headLineClassificationContext);
|
||||
}
|
||||
|
||||
List<TextPageBlock> allHeadlines = document.getPages()
|
||||
.stream()
|
||||
.flatMap(classificationPage -> classificationPage.getTextBlocks()
|
||||
.stream()
|
||||
.filter(tb -> tb instanceof TextPageBlock && tb.getClassification() != null && tb.getClassification().isHeadline())
|
||||
.filter(tb -> tb instanceof TextPageBlock && tb.getClassification() != null && tb.getClassification().isHeadline())
|
||||
.map(tb -> (TextPageBlock) tb))
|
||||
.toList();
|
||||
|
||||
@ -67,21 +70,26 @@ public class RedactManagerClassificationService {
|
||||
}
|
||||
|
||||
|
||||
private void classifyPage(ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes) {
|
||||
private void classifyPage(ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes, HeadLineClassificationContext headLineClassificationContext) {
|
||||
|
||||
for (AbstractPageBlock textBlock : page.getTextBlocks()) {
|
||||
if (textBlock instanceof TextPageBlock) {
|
||||
classifyBlock((TextPageBlock) textBlock, page, document, headlineFontSizes);
|
||||
classifyBlock((TextPageBlock) textBlock, page, document, headlineFontSizes, headLineClassificationContext);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void classifyBlock(TextPageBlock textBlock, ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes) {
|
||||
private void classifyBlock(TextPageBlock textBlock,
|
||||
ClassificationPage page,
|
||||
ClassificationDocument document,
|
||||
List<Float> headlineFontSizes,
|
||||
HeadLineClassificationContext headLineClassificationContext) {
|
||||
|
||||
var bodyTextFrame = page.getBodyTextFrame();
|
||||
|
||||
if (textBlock.getClassification() != null && textBlock.getClassification().isHeadline()) {
|
||||
headLineClassificationContext.setLastHeadlineFromOutline(textBlock);
|
||||
return;
|
||||
}
|
||||
if (document.getFontSizeCounter().getMostPopular() == null) {
|
||||
@ -122,7 +130,8 @@ public class RedactManagerClassificationService {
|
||||
|
||||
for (int i = 1; i <= headlineFontSizes.size(); i++) {
|
||||
if (textBlock.getMostPopularWordFontSize() == headlineFontSizes.get(i - 1)) {
|
||||
textBlock.setClassification(PageBlockType.getHeadlineType(i));
|
||||
PageBlockType headlineType = PageBlockType.getHeadlineType(i);
|
||||
classifyHeadline(textBlock, headLineClassificationContext, headlineType);
|
||||
document.setHeadlines(true);
|
||||
}
|
||||
}
|
||||
@ -134,7 +143,8 @@ public class RedactManagerClassificationService {
|
||||
&& textBlock.getSequences()
|
||||
.get(0).getTextPositions()
|
||||
.get(0).getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) {
|
||||
textBlock.setClassification(PageBlockType.getHeadlineType(headlineFontSizes.size() + 1));
|
||||
PageBlockType headlineType = PageBlockType.getHeadlineType(headlineFontSizes.size() + 1);
|
||||
classifyHeadline(textBlock, headLineClassificationContext, headlineType);
|
||||
document.setHeadlines(true);
|
||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)
|
||||
&& textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()
|
||||
@ -159,4 +169,66 @@ public class RedactManagerClassificationService {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private static void classifyHeadline(TextPageBlock textBlock, HeadLineClassificationContext headLineClassificationContext, PageBlockType headlineType) {
|
||||
|
||||
TextPageBlock lastHeadline = headLineClassificationContext.getLastHeadline();
|
||||
TextPageBlock lastHeadlineFromOutline = headLineClassificationContext.getLastHeadlineFromOutline();
|
||||
PageBlockType originalClassifiedBlockType = headLineClassificationContext.getOriginalClassifiedBlockType();
|
||||
|
||||
if (lastHeadline != null) {
|
||||
|
||||
if (lastHeadline.equals(lastHeadlineFromOutline)) {
|
||||
|
||||
headlineType = getNextType(lastHeadline.getClassification());
|
||||
|
||||
} else if (originalClassifiedBlockType != null && lastHeadline.getClassification() != originalClassifiedBlockType) {
|
||||
|
||||
PageBlockType lastHeadlineType = lastHeadline.getClassification();
|
||||
int difference = getHeadlineNumber(originalClassifiedBlockType) - getHeadlineNumber(lastHeadlineType);
|
||||
headlineType = PageBlockType.getHeadlineType(getHeadlineNumber(headlineType) + difference);
|
||||
}
|
||||
}
|
||||
|
||||
headLineClassificationContext.setOriginalClassifiedBlockType(headlineType);
|
||||
textBlock.setClassification(headlineType);
|
||||
headLineClassificationContext.setLastHeadline(textBlock);
|
||||
}
|
||||
|
||||
|
||||
private static PageBlockType getNextType(PageBlockType pageBlockType) {
|
||||
|
||||
return PageBlockType.getHeadlineType(getHeadlineNumber(pageBlockType) + 1);
|
||||
}
|
||||
|
||||
|
||||
private static int getHeadlineNumber(PageBlockType pageBlockType) {
|
||||
|
||||
return switch (pageBlockType) {
|
||||
case H1 -> 1;
|
||||
case H2 -> 2;
|
||||
case H3 -> 3;
|
||||
case H4 -> 4;
|
||||
case H5 -> 5;
|
||||
default -> 6;
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
@Data
|
||||
static class HeadLineClassificationContext {
|
||||
|
||||
TextPageBlock lastHeadline;
|
||||
PageBlockType originalClassifiedBlockType;
|
||||
TextPageBlock lastHeadlineFromOutline;
|
||||
|
||||
|
||||
public void setLastHeadlineFromOutline(TextPageBlock lastHeadlineFromOutline) {
|
||||
|
||||
this.lastHeadlineFromOutline = lastHeadlineFromOutline;
|
||||
this.setLastHeadline(lastHeadlineFromOutline);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -32,7 +32,8 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
||||
//String fileName = "files/new/abschlussarbeiten-template-institut-fur-informatik-padagogische-hochschule-karlsruhe.pdf";
|
||||
//String fileName = "files/new/kaust-official-thesis-template.pdf";
|
||||
//String fileName = "files/new/$100m Offers.pdf";
|
||||
String fileName = "files/new/18-Curacron_ToxicidadeOcularInVitro.pdf";
|
||||
//String fileName = "files/new/18-Curacron_ToxicidadeOcularInVitro.pdf";
|
||||
String fileName = "files/new/UTT-Books-53.pdf";
|
||||
//String fileName = "files/new/mistitled_outlines_example.pdf";
|
||||
//String fileName = "files/bdr/Plenarprotokoll 1 (keine Druchsache!) (1) 1.pdf";
|
||||
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
|
||||
|
||||
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user