RED-7074: Design Subsection section tree structure algorithm

* first draft: further implementations
This commit is contained in:
maverickstuder 2024-04-18 17:52:33 +02:00
parent 17756f5977
commit 85e3cf0ecc
7 changed files with 272 additions and 108 deletions

View File

@ -297,13 +297,6 @@ public class LayoutParsingPipeline {
case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, false);
};
List<OutlineObject> outlineObjects = classificationDocument.getOutlineObjectTree().getOutlineObjectsPerPage()
.get(pageNumber - 1);
if (outlineObjects != null) {
classificationPage.setOutlineObjects(outlineObjects);
blockificationPostprocessingService.sanitizeOutlineBlocksWithKdTree(classificationPage);
}
classificationPage.setCleanRulings(cleanRulings);
classificationPage.setRotation(rotation);
classificationPage.setLandscape(isLandscape);
@ -311,6 +304,13 @@ public class LayoutParsingPipeline {
classificationPage.setPageWidth(cropbox.getWidth());
classificationPage.setPageHeight(cropbox.getHeight());
List<OutlineObject> outlineObjects = classificationDocument.getOutlineObjectTree().getOutlineObjectsPerPage()
.get(pageNumber - 1);
if (outlineObjects != null) {
classificationPage.setOutlineObjects(outlineObjects);
blockificationPostprocessingService.sanitizeOutlineBlocks(classificationPage);
}
// MarkedContent needs to be converted at this point, otherwise it leads to GC Problems in Pdfbox.
classificationPage.setMarkedContentBboxPerType(convertMarkedContents(stripper.getMarkedContents()));

View File

@ -52,9 +52,10 @@ public class OutlineValidationService {
private void addItemAtCorrectPosition(TableOfContents toc, TableOfContentItem tocItem, TableOfContentItem lastHeadlineFromOutlines) {
if(!tocItem.getChildren().isEmpty()) {
}
//if (lastHeadlineFromOutlines == null || tocItem.g)
//if(!tocItem.getChildren().isEmpty()) {
//
//}
}
public TableOfContents createToC(List<TextPageBlock> headlines) {

View File

@ -65,14 +65,14 @@ public class TextPageBlock extends AbstractPageBlock {
@JsonIgnore
private float getPageHeight() {
public float getPageHeight() {
return sequences.get(0).getPageHeight();
}
@JsonIgnore
private float getPageWidth() {
public float getPageWidth() {
return sequences.get(0).getPageWidth();
}

View File

@ -4,7 +4,10 @@ import static com.knecon.fforesight.service.layoutparser.processor.services.bloc
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.ListIterator;
import java.util.Locale;
import java.util.function.Function;
import org.springframework.stereotype.Service;
@ -36,39 +39,9 @@ public class BlockificationPostprocessingService {
.collect(RectangleTransformations.collectBBox());
public void sanitizeOutlineBlocksWithKdTree(ClassificationPage classificationPage) {
public void sanitizeOutlineBlocks(ClassificationPage classificationPage) {
List<OutlineObject> outlineObjects = classificationPage.getOutlineObjects();
if (classificationPage.getTextBlocks().isEmpty() || outlineObjects.isEmpty()) {
return;
}
KDTree<TextPageBlock> kdTree = createKdTree(classificationPage);
for (OutlineObject outlineObject : outlineObjects) {
KDIterator<TextPageBlock> successorIterator = kdTree.query(new double[]{ //
0, //
outlineObject.getPoint().getY() - BLOCK_TO_OUTLINE_DISTANCE_THRESHOLD //
}, //
new double[]{Double.MAX_VALUE, Double.MAX_VALUE});
boolean matchedExactly = false;
OutlineProcessionContext context = new OutlineProcessionContext(outlineObject);
while (successorIterator.hasNext() && !matchedExactly) {
TextPageBlock pageBlock = successorIterator.next().value();
matchedExactly = processOutlineForTextBlock(pageBlock, context);
}
if (!matchedExactly) {
selectMatch(classificationPage, kdTree, context);
}
}
}
private static KDTree<TextPageBlock> createKdTree(ClassificationPage classificationPage) {
List<TextPageBlock> textBlocks = classificationPage.getTextBlocks()
.stream()
@ -78,97 +51,149 @@ public class BlockificationPostprocessingService {
.map(block -> (TextPageBlock) block)
.toList();
KDTree<TextPageBlock> kdTree = KDTree.create(2);
textBlocks.forEach(block -> {
var boundingBox = blockToBoundingBox.apply(block);
kdTree.insert(new double[]{boundingBox.getMinX(), boundingBox.getMaxY()}, block);
});
return kdTree;
if (textBlocks.isEmpty() || outlineObjects.isEmpty()) {
return;
}
float pageHeight = classificationPage.getPageHeight();
for (OutlineObject outlineObject : outlineObjects) {
OutlineProcessionContext context = new OutlineProcessionContext(outlineObject);
ListIterator<TextPageBlock> iterator = textBlocks.listIterator();
while (iterator.hasNext()) {
TextPageBlock pageBlock = iterator.next();
if (pageHeight - outlineObject.getPoint().getY() - BLOCK_TO_OUTLINE_DISTANCE_THRESHOLD <= pageBlock.getMaxY()) {
break;
}
}
if (iterator.hasPrevious()) {
iterator.previous();
}
boolean earlyStop = false;
while (iterator.hasNext() && !earlyStop) {
TextPageBlock pageBlock = iterator.next();
earlyStop = processOutlineForTextBlock(pageBlock, context);
}
selectMatch(classificationPage, context);
}
}
private void selectMatch(ClassificationPage classificationPage, KDTree<TextPageBlock> kdTree, OutlineProcessionContext context) {
private void selectMatch(ClassificationPage classificationPage, OutlineProcessionContext context) {
OutlineObject outlineObject = context.outlineObject;
TextPageBlock directMatch = context.directMatch;
List<TextPageBlock> mergeCandidates = context.mergeCandidates;
TextPageBlock splitCandidate = context.splitCandidate;
PageBlockType headlineType = PageBlockType.getHeadlineType(outlineObject.getTreeDepth());
double distanceToDirectMatch = directMatch != null ? calculateDistance(outlineObject, directMatch) : Double.MAX_VALUE;
double distanceToSplitCandidate = splitCandidate != null ? calculateDistance(outlineObject, splitCandidate) : Double.MAX_VALUE;
double distanceToBestMergeCandidates = Double.MAX_VALUE;
List<TextPageBlock> bestMergeCandidateCombination = new ArrayList<>();
if (!mergeCandidates.isEmpty()) {
List<TextPageBlock> allMergeCandidates = new ArrayList<>(mergeCandidates);
addNeighborsOfCandidate(kdTree, mergeCandidates.get(0), allMergeCandidates);
if (mergeCandidates.size() > 1) {
addNeighborsOfCandidate(kdTree, mergeCandidates.get(mergeCandidates.size() - 1), allMergeCandidates);
}
allMergeCandidates = allMergeCandidates.stream()
.distinct()
.toList();
// with this code adjacent blocks to the first and last merge candidate get added, this could be useful for some edge cases:
//List<TextPageBlock> allMergeCandidates = new ArrayList<>(mergeCandidates);
//addNeighborsOfCandidate(kdTree, mergeCandidates.get(0), allMergeCandidates);
//if (mergeCandidates.size() > 1) {
// addNeighborsOfCandidate(kdTree, mergeCandidates.get(mergeCandidates.size() - 1), allMergeCandidates);
//}
//allMergeCandidates = allMergeCandidates.stream()
// .distinct()
// .toList();
List<List<TextPageBlock>> combinations = findCombinations(outlineObject.getTitle(), mergeCandidates);
List<List<TextPageBlock>> combinations = findCombinations(outlineObject.getTitle(), allMergeCandidates);
double maxDistance = Double.MAX_VALUE;
List<TextPageBlock> bestCombination = new ArrayList<>();
for (List<TextPageBlock> combination : combinations) {
double averageDistance = combination.stream()
.map(block -> calculateDistance(outlineObject, block))
.mapToDouble(Double::doubleValue).average()
.orElse(Double.MAX_VALUE);
if (maxDistance > averageDistance) {
maxDistance = averageDistance;
bestCombination = combination;
if (distanceToBestMergeCandidates > averageDistance) {
distanceToBestMergeCandidates = averageDistance;
bestMergeCandidateCombination = combination;
}
}
var merged = mergeBlocks(classificationPage, bestCombination);
}
double minDistance = Math.min(distanceToDirectMatch, Math.min(distanceToSplitCandidate, distanceToBestMergeCandidates));
if(minDistance == Double.MAX_VALUE) {
return;
}
if (minDistance == distanceToDirectMatch) {
directMatch.setClassification(headlineType);
} else if (minDistance == distanceToSplitCandidate) {
List<TextPageBlock> others = splitBlock(classificationPage, splitCandidate, outlineObject.getTitle());
splitCandidate.setClassification(headlineType);
others.forEach(other -> other.setClassification(headlineType));
} else {
var merged = mergeBlocks(classificationPage, bestMergeCandidateCombination);
merged.setClassification(headlineType);
}
}
if (splitCandidate != null) {
TextPageBlock other = splitBlock(classificationPage, splitCandidate, outlineObject.getTitle());
splitCandidate.setClassification(headlineType);
other.setClassification(headlineType);
private List<TextPageBlock> splitBlock(ClassificationPage classificationPage, TextPageBlock blockToSplit, String text) {
List<TextPageBlock> otherBlocks = new ArrayList<>();
int blockToSplitIdx = classificationPage.getTextBlocks().indexOf(blockToSplit);
WordSequenceResult wordSequenceResult = findWordSequence(blockToSplit.getSequences(), text);
List<TextPositionSequence> postSequence = blockToSplit.getSequences();
postSequence.removeAll(wordSequenceResult.inSequence);
postSequence.removeAll(wordSequenceResult.preSequence);
blockToSplit.setSequences(wordSequenceResult.inSequence);
if (!wordSequenceResult.preSequence.isEmpty()) {
TextPageBlock block = buildTextBlock(wordSequenceResult.preSequence, 0);
classificationPage.getTextBlocks().add(blockToSplitIdx, block);
otherBlocks.add(block);
blockToSplitIdx++;
}
if (!postSequence.isEmpty()) {
TextPageBlock block = buildTextBlock(postSequence, 0);
classificationPage.getTextBlocks().add(blockToSplitIdx + 1, block);
otherBlocks.add(block);
}
return otherBlocks;
}
private TextPageBlock splitBlock(ClassificationPage classificationPage, TextPageBlock blockToSplit, String text) {
private static WordSequenceResult findWordSequence(List<TextPositionSequence> textPositionSequences, String text) {
List<TextPositionSequence> wordSequence = findWordSequence(blockToSplit.getSequences(), text);
List<TextPositionSequence> remaining = blockToSplit.getSequences();
remaining.removeAll(wordSequence);
blockToSplit.setSequences(wordSequence);
TextPageBlock other = buildTextBlock(remaining, 0);
classificationPage.getTextBlocks().add(other);
return other;
}
private static List<TextPositionSequence> findWordSequence(List<TextPositionSequence> textPositionSequences, String text) {
String target = text.replaceAll("\\s", "");
String target = sanitizeString(text);
List<TextPositionSequence> inSequence = new ArrayList<>();
List<TextPositionSequence> preSequence = new ArrayList<>();
StringBuilder currentSequence = new StringBuilder();
for (TextPositionSequence sequence : textPositionSequences) {
if (currentSequence.toString().equals(target)) {
return inSequence;
}
currentSequence.append(sequence.toString());
currentSequence.append(sanitizeString(sequence.toString()));
inSequence.add(sequence);
if (currentSequence.length() > target.length()) {
TextPositionSequence removed = inSequence.remove(0);
currentSequence.delete(0, removed.toString().length());
preSequence.add(removed);
while (currentSequence.length() > target.length()) {
removed = inSequence.remove(0);
currentSequence.delete(0, removed.toString().length());
preSequence.add(removed);
}
}
if (currentSequence.toString().equals(target)) {
return new WordSequenceResult(inSequence, preSequence);
}
}
return new ArrayList<>();
return new WordSequenceResult(new ArrayList<>(), new ArrayList<>());
}
@ -209,7 +234,7 @@ public class BlockificationPostprocessingService {
private static void findCombinations(String title, List<TextPageBlock> blocks, List<TextPageBlock> current, List<List<TextPageBlock>> combinations) {
String target = title.replaceAll("\\s", "");
String target = sanitizeString(title);
if (target.isEmpty()) {
combinations.add(new ArrayList<>(current));
return;
@ -219,10 +244,10 @@ public class BlockificationPostprocessingService {
.filter(block -> !current.contains(block))
.toList();
for (TextPageBlock block : remaining) {
String prefix = block.getText().replaceAll("\\s", "");
String prefix = sanitizeString(block.getText());
if (target.startsWith(prefix)) {
current.add(block);
findCombinations(target.substring(prefix.length()), blocks, current, combinations);
findCombinations(target.substring(prefix.length()), blocks.subList(blocks.indexOf(block) + 1, blocks.size()), current, combinations);
current.remove(current.size() - 1);
}
}
@ -232,7 +257,7 @@ public class BlockificationPostprocessingService {
private double calculateDistance(OutlineObject outlineObject, TextPageBlock pageBlock) {
double deltaX = outlineObject.getPoint().getX() - pageBlock.getMinX();
double deltaY = outlineObject.getPoint().getY() - pageBlock.getMinY();
double deltaY = pageBlock.getPageHeight() - outlineObject.getPoint().getY() - pageBlock.getMinY();
return Math.sqrt(deltaX * deltaX + deltaY * deltaY);
}
@ -255,8 +280,8 @@ public class BlockificationPostprocessingService {
private boolean processOutlineForTextBlock(TextPageBlock pageBlock, OutlineProcessionContext context) {
OutlineObject outlineObject = context.getOutlineObject();
String blockText = pageBlock.getText();
String outlineTitle = outlineObject.getTitle();
String blockText = sanitizeString(pageBlock.getText());
String outlineTitle = sanitizeString(outlineObject.getTitle());
boolean blockTextContainsOutlineTitle = blockText.contains(outlineTitle);
boolean outlineTitleContainsBlockText = outlineTitle.contains(blockText);
@ -265,8 +290,8 @@ public class BlockificationPostprocessingService {
return false;
}
if (blockText.equals(outlineTitle)) {
pageBlock.setClassification(PageBlockType.getHeadlineType(outlineObject.getTreeDepth()));
if (blockText.equals(outlineTitle) && context.directMatch == null) {
context.directMatch = pageBlock;
return true;
}
@ -274,17 +299,27 @@ public class BlockificationPostprocessingService {
context.mergeCandidates.add(pageBlock);
}
if (blockTextContainsOutlineTitle && context.splitCandidate != null) {
if (blockTextContainsOutlineTitle && context.splitCandidate == null) {
context.splitCandidate = pageBlock;
}
return false;
}
/**
 * Normalizes a text for comparison: removes every character matched by the regex {@code \s}
 * and lower-cases the remainder using {@link Locale#ROOT}.
 *
 * @param text the text to normalize
 * @return the text without whitespace, in lower case
 */
private static String sanitizeString(String text) {

    String withoutWhitespace = text.replaceAll("\\s", "");
    return withoutWhitespace.toLowerCase(Locale.ROOT);
}
/**
 * Result of searching a list of text position sequences for a given text.
 *
 * @param inSequence  the sequences whose concatenated, sanitized text equals the searched text
 * @param preSequence the sequences that were dropped from the front of the match window during the search
 */
private record WordSequenceResult(List<TextPositionSequence> inSequence, List<TextPositionSequence> preSequence) {}
@Data
private static class OutlineProcessionContext {
private TextPageBlock directMatch;
private OutlineObject outlineObject;
private List<TextPageBlock> mergeCandidates;
private TextPageBlock splitCandidate;
@ -293,10 +328,65 @@ public class BlockificationPostprocessingService {
public OutlineProcessionContext(OutlineObject outlineObject) {
this.outlineObject = outlineObject;
this.directMatch = null;
this.mergeCandidates = new ArrayList<>();
this.splitCandidate = null;
}
}
/**
 * Matches every outline object of the page against the page's text blocks, using a k-d tree
 * to restrict the candidates, and lets {@code selectMatch} pick the final match.
 * <p>
 * Does nothing when the page has no text blocks or no outline objects.
 *
 * @param classificationPage the page whose text blocks are matched against its outline objects
 * @deprecated use {@link #sanitizeOutlineBlocks(ClassificationPage)} instead.
 */
@Deprecated
public void sanitizeOutlineBlocksWithKdTree(ClassificationPage classificationPage) {

    List<OutlineObject> outlineObjects = classificationPage.getOutlineObjects();
    if (classificationPage.getTextBlocks().isEmpty() || outlineObjects.isEmpty()) {
        return;
    }

    KDTree<TextPageBlock> kdTree = createKdTree(classificationPage);
    float pageHeight = classificationPage.getPageHeight();

    for (OutlineObject outlineObject : outlineObjects) {

        // The tree is keyed by (minY, minX) of each block, so the lower bound of the query is given as (y, x).
        // The outline y coordinate is subtracted from the page height to bring it into the block coordinate system.
        double[] lowerBound = new double[]{pageHeight - outlineObject.getPoint().getY() - BLOCK_TO_OUTLINE_DISTANCE_THRESHOLD, 0};
        double[] upperBound = new double[]{Double.MAX_VALUE, Double.MAX_VALUE};
        KDIterator<TextPageBlock> successorIterator = kdTree.query(lowerBound, upperBound);

        OutlineProcessionContext context = new OutlineProcessionContext(outlineObject);
        boolean earlyStop = false;
        while (successorIterator.hasNext() && !earlyStop) {
            TextPageBlock pageBlock = successorIterator.next().value();
            // Each block must be processed exactly once: a second call would register an exact match
            // additionally as merge/split candidate and add partial matches twice.
            earlyStop = processOutlineForTextBlock(pageBlock, context);
        }

        selectMatch(classificationPage, context);
    }
}
/**
 * Builds a two-dimensional k-d tree containing all {@link TextPageBlock}s of the page.
 * Each block is inserted under the key {@code (minY, minX)}, i.e. y first, then x.
 * Blocks of other types are ignored.
 *
 * @param classificationPage the page providing the text blocks
 * @return the populated k-d tree
 */
@Deprecated
private static KDTree<TextPageBlock> createKdTree(ClassificationPage classificationPage) {

    KDTree<TextPageBlock> tree = KDTree.create(2);
    classificationPage.getTextBlocks()
            .stream()
            .filter(TextPageBlock.class::isInstance)
            .map(TextPageBlock.class::cast)
            .forEach(textBlock -> tree.insert(new double[]{textBlock.getMinY(), textBlock.getMinX()}, textBlock));
    return tree;
}
}

View File

@ -19,6 +19,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageB
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
import lombok.Data;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
@ -44,15 +45,17 @@ public class RedactManagerClassificationService {
.map(tb -> (TextPageBlock) tb))
.toList();
HeadLineClassificationContext headLineClassificationContext = new HeadLineClassificationContext();
for (ClassificationPage page : document.getPages()) {
classifyPage(page, document, headlineFontSizes);
classifyPage(page, document, headlineFontSizes, headLineClassificationContext);
}
List<TextPageBlock> allHeadlines = document.getPages()
.stream()
.flatMap(classificationPage -> classificationPage.getTextBlocks()
.stream()
.filter(tb -> tb instanceof TextPageBlock && tb.getClassification() != null && tb.getClassification().isHeadline())
.filter(tb -> tb instanceof TextPageBlock && tb.getClassification() != null && tb.getClassification().isHeadline())
.map(tb -> (TextPageBlock) tb))
.toList();
@ -67,21 +70,26 @@ public class RedactManagerClassificationService {
}
private void classifyPage(ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes) {
private void classifyPage(ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes, HeadLineClassificationContext headLineClassificationContext) {
for (AbstractPageBlock textBlock : page.getTextBlocks()) {
if (textBlock instanceof TextPageBlock) {
classifyBlock((TextPageBlock) textBlock, page, document, headlineFontSizes);
classifyBlock((TextPageBlock) textBlock, page, document, headlineFontSizes, headLineClassificationContext);
}
}
}
private void classifyBlock(TextPageBlock textBlock, ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes) {
private void classifyBlock(TextPageBlock textBlock,
ClassificationPage page,
ClassificationDocument document,
List<Float> headlineFontSizes,
HeadLineClassificationContext headLineClassificationContext) {
var bodyTextFrame = page.getBodyTextFrame();
if (textBlock.getClassification() != null && textBlock.getClassification().isHeadline()) {
headLineClassificationContext.setLastHeadlineFromOutline(textBlock);
return;
}
if (document.getFontSizeCounter().getMostPopular() == null) {
@ -122,7 +130,8 @@ public class RedactManagerClassificationService {
for (int i = 1; i <= headlineFontSizes.size(); i++) {
if (textBlock.getMostPopularWordFontSize() == headlineFontSizes.get(i - 1)) {
textBlock.setClassification(PageBlockType.getHeadlineType(i));
PageBlockType headlineType = PageBlockType.getHeadlineType(i);
classifyHeadline(textBlock, headLineClassificationContext, headlineType);
document.setHeadlines(true);
}
}
@ -134,7 +143,8 @@ public class RedactManagerClassificationService {
&& textBlock.getSequences()
.get(0).getTextPositions()
.get(0).getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) {
textBlock.setClassification(PageBlockType.getHeadlineType(headlineFontSizes.size() + 1));
PageBlockType headlineType = PageBlockType.getHeadlineType(headlineFontSizes.size() + 1);
classifyHeadline(textBlock, headLineClassificationContext, headlineType);
document.setHeadlines(true);
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)
&& textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()
@ -159,4 +169,66 @@ public class RedactManagerClassificationService {
}
}
/**
 * Assigns a headline classification to the given block, adjusting the proposed level relative
 * to the previously classified headline, and records the block as the last headline in the context.
 * <ul>
 * <li>If the last headline is the last headline that came from the outline, the block gets the level
 * following that headline's level.</li>
 * <li>Otherwise, if the level remembered in the context differs from the last headline's classification,
 * the proposed level is shifted by that difference.</li>
 * <li>Otherwise the proposed level is used unchanged.</li>
 * </ul>
 * NOTE(review): the context's "original classified block type" is set to the resolved (possibly adjusted)
 * level, not to the level passed in - confirm that this is intended.
 *
 * @param textBlock                     the block to classify as headline
 * @param headLineClassificationContext state carried between headline classifications
 * @param headlineType                  the headline level proposed by the caller
 */
private static void classifyHeadline(TextPageBlock textBlock, HeadLineClassificationContext headLineClassificationContext, PageBlockType headlineType) {

    PageBlockType resolvedType = headlineType;
    TextPageBlock previousHeadline = headLineClassificationContext.getLastHeadline();

    if (previousHeadline != null) {
        PageBlockType previousType = previousHeadline.getClassification();
        PageBlockType rememberedType = headLineClassificationContext.getOriginalClassifiedBlockType();

        if (previousHeadline.equals(headLineClassificationContext.getLastHeadlineFromOutline())) {
            resolvedType = getNextType(previousType);
        } else if (rememberedType != null && previousType != rememberedType) {
            int levelShift = getHeadlineNumber(rememberedType) - getHeadlineNumber(previousType);
            resolvedType = PageBlockType.getHeadlineType(getHeadlineNumber(headlineType) + levelShift);
        }
    }

    headLineClassificationContext.setOriginalClassifiedBlockType(resolvedType);
    textBlock.setClassification(resolvedType);
    headLineClassificationContext.setLastHeadline(textBlock);
}
/**
 * Returns the headline type whose number is one higher than that of the given type.
 *
 * @param pageBlockType the type to start from
 * @return the result of {@code PageBlockType.getHeadlineType} for the incremented headline number
 */
private static PageBlockType getNextType(PageBlockType pageBlockType) {

    int nextLevel = getHeadlineNumber(pageBlockType) + 1;
    return PageBlockType.getHeadlineType(nextLevel);
}
/**
 * Maps a page block type to its headline number: {@code H1} to {@code H5} map to 1 to 5,
 * every other type maps to 6.
 *
 * @param pageBlockType the type to convert; must not be {@code null}
 * @return the headline number between 1 and 6
 */
private static int getHeadlineNumber(PageBlockType pageBlockType) {

    switch (pageBlockType) {
        case H1:
            return 1;
        case H2:
            return 2;
        case H3:
            return 3;
        case H4:
            return 4;
        case H5:
            return 5;
        default:
            return 6;
    }
}
/**
 * Mutable state shared between consecutive headline classifications.
 */
@Data
static class HeadLineClassificationContext {

    /** The most recently recorded headline block. */
    TextPageBlock lastHeadline;
    /** The headline type stored by the most recent headline classification. */
    PageBlockType originalClassifiedBlockType;
    /** The most recently recorded headline block that was reported via {@link #setLastHeadlineFromOutline}. */
    TextPageBlock lastHeadlineFromOutline;


    /**
     * Records the given block as last headline from the outline and, at the same time, as last headline.
     *
     * @param lastHeadlineFromOutline the headline block to record
     */
    public void setLastHeadlineFromOutline(TextPageBlock lastHeadlineFromOutline) {

        setLastHeadline(lastHeadlineFromOutline);
        this.lastHeadlineFromOutline = lastHeadlineFromOutline;
    }

}
}

View File

@ -32,7 +32,8 @@ public class ViewerDocumentTest extends BuildDocumentTest {
//String fileName = "files/new/abschlussarbeiten-template-institut-fur-informatik-padagogische-hochschule-karlsruhe.pdf";
//String fileName = "files/new/kaust-official-thesis-template.pdf";
//String fileName = "files/new/$100m Offers.pdf";
String fileName = "files/new/18-Curacron_ToxicidadeOcularInVitro.pdf";
//String fileName = "files/new/18-Curacron_ToxicidadeOcularInVitro.pdf";
String fileName = "files/new/UTT-Books-53.pdf";
//String fileName = "files/new/mistitled_outlines_example.pdf";
//String fileName = "files/bdr/Plenarprotokoll 1 (keine Druchsache!) (1) 1.pdf";
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";