diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java index b1696a8..f23dbd3 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java @@ -297,13 +297,6 @@ public class LayoutParsingPipeline { case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, false); }; - List outlineObjects = classificationDocument.getOutlineObjectTree().getOutlineObjectsPerPage() - .get(pageNumber - 1); - if (outlineObjects != null) { - classificationPage.setOutlineObjects(outlineObjects); - blockificationPostprocessingService.sanitizeOutlineBlocksWithKdTree(classificationPage); - } - classificationPage.setCleanRulings(cleanRulings); classificationPage.setRotation(rotation); classificationPage.setLandscape(isLandscape); @@ -311,6 +304,13 @@ public class LayoutParsingPipeline { classificationPage.setPageWidth(cropbox.getWidth()); classificationPage.setPageHeight(cropbox.getHeight()); + List outlineObjects = classificationDocument.getOutlineObjectTree().getOutlineObjectsPerPage() + .get(pageNumber - 1); + if (outlineObjects != null) { + classificationPage.setOutlineObjects(outlineObjects); + blockificationPostprocessingService.sanitizeOutlineBlocks(classificationPage); + } + // MarkedContent needs to be converted at this point, otherwise it leads to GC Problems in Pdfbox. classificationPage.setMarkedContentBboxPerType(convertMarkedContents(stripper.getMarkedContents())); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineValidationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineValidationService.java index b1f0ca4..ea06aa6 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineValidationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineValidationService.java @@ -52,9 +52,10 @@ public class OutlineValidationService { private void addItemAtCorrectPosition(TableOfContents toc, TableOfContentItem tocItem, TableOfContentItem lastHeadlineFromOutlines) { - if(!tocItem.getChildren().isEmpty()) { - - } + //if (lastHeadlineFromOutlines == null || tocItem.g) + //if(!tocItem.getChildren().isEmpty()) { +// + //} } public TableOfContents createToC(List headlines) { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java index 6323205..6b41670 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java @@ -65,14 +65,14 @@ public class TextPageBlock extends AbstractPageBlock { @JsonIgnore - private float getPageHeight() { + public float getPageHeight() { return sequences.get(0).getPageHeight(); } @JsonIgnore - private float getPageWidth() { + public float getPageWidth() { return sequences.get(0).getPageWidth(); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/BlockificationPostprocessingService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/BlockificationPostprocessingService.java index 5ea023e..b967c19 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/BlockificationPostprocessingService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/BlockificationPostprocessingService.java @@ -4,7 +4,10 @@ import static com.knecon.fforesight.service.layoutparser.processor.services.bloc import java.awt.geom.Rectangle2D; import java.util.ArrayList; +import java.util.Collections; import java.util.List; +import java.util.ListIterator; +import java.util.Locale; import java.util.function.Function; import org.springframework.stereotype.Service; @@ -36,39 +39,9 @@ public class BlockificationPostprocessingService { .collect(RectangleTransformations.collectBBox()); - public void sanitizeOutlineBlocksWithKdTree(ClassificationPage classificationPage) { + public void sanitizeOutlineBlocks(ClassificationPage classificationPage) { List outlineObjects = classificationPage.getOutlineObjects(); - if (classificationPage.getTextBlocks().isEmpty() || outlineObjects.isEmpty()) { - return; - } - - KDTree kdTree = createKdTree(classificationPage); - - for (OutlineObject outlineObject : outlineObjects) { - - KDIterator successorIterator = kdTree.query(new double[]{ // - 0, // - outlineObject.getPoint().getY() - BLOCK_TO_OUTLINE_DISTANCE_THRESHOLD // - }, // - new double[]{Double.MAX_VALUE, Double.MAX_VALUE}); - - boolean matchedExactly = false; - - OutlineProcessionContext context = new OutlineProcessionContext(outlineObject); - while (successorIterator.hasNext() && !matchedExactly) { - TextPageBlock pageBlock = successorIterator.next().value(); - matchedExactly = processOutlineForTextBlock(pageBlock, context); - } - - if (!matchedExactly) { - selectMatch(classificationPage, kdTree, context); - } - } - } - - - private static KDTree createKdTree(ClassificationPage classificationPage) { List textBlocks = classificationPage.getTextBlocks() .stream() @@ -78,97 +51,149 @@ public class BlockificationPostprocessingService { .map(block -> (TextPageBlock) block) .toList(); - KDTree kdTree = KDTree.create(2); - textBlocks.forEach(block -> { - var boundingBox = blockToBoundingBox.apply(block); - kdTree.insert(new double[]{boundingBox.getMinX(), boundingBox.getMaxY()}, block); - }); - return kdTree; + if (textBlocks.isEmpty() || outlineObjects.isEmpty()) { + return; + } + + float pageHeight = classificationPage.getPageHeight(); + + for (OutlineObject outlineObject : outlineObjects) { + + OutlineProcessionContext context = new OutlineProcessionContext(outlineObject); + + ListIterator iterator = textBlocks.listIterator(); + while (iterator.hasNext()) { + TextPageBlock pageBlock = iterator.next(); + if (pageHeight - outlineObject.getPoint().getY() - BLOCK_TO_OUTLINE_DISTANCE_THRESHOLD <= pageBlock.getMaxY()) { + break; + } + } + if (iterator.hasPrevious()) { + iterator.previous(); + } + boolean earlyStop = false; + while (iterator.hasNext() && !earlyStop) { + TextPageBlock pageBlock = iterator.next(); + earlyStop = processOutlineForTextBlock(pageBlock, context); + } + selectMatch(classificationPage, context); + + } } - private void selectMatch(ClassificationPage classificationPage, KDTree kdTree, OutlineProcessionContext context) { + private void selectMatch(ClassificationPage classificationPage, OutlineProcessionContext context) { OutlineObject outlineObject = context.outlineObject; + TextPageBlock directMatch = context.directMatch; List mergeCandidates = context.mergeCandidates; TextPageBlock splitCandidate = context.splitCandidate; PageBlockType headlineType = PageBlockType.getHeadlineType(outlineObject.getTreeDepth()); + double distanceToDirectMatch = directMatch != null ? calculateDistance(outlineObject, directMatch) : Double.MAX_VALUE; + double distanceToSplitCandidate = splitCandidate != null ? calculateDistance(outlineObject, splitCandidate) : Double.MAX_VALUE; + + double distanceToBestMergeCandidates = Double.MAX_VALUE; + List bestMergeCandidateCombination = new ArrayList<>(); if (!mergeCandidates.isEmpty()) { - List allMergeCandidates = new ArrayList<>(mergeCandidates); - addNeighborsOfCandidate(kdTree, mergeCandidates.get(0), allMergeCandidates); - if (mergeCandidates.size() > 1) { - addNeighborsOfCandidate(kdTree, mergeCandidates.get(mergeCandidates.size() - 1), allMergeCandidates); - } - allMergeCandidates = allMergeCandidates.stream() - .distinct() - .toList(); + // with this code adjacent blocks to the first and last merge candidate get added, this could be useful for some edge cases: + //List allMergeCandidates = new ArrayList<>(mergeCandidates); + //addNeighborsOfCandidate(kdTree, mergeCandidates.get(0), allMergeCandidates); + //if (mergeCandidates.size() > 1) { + // addNeighborsOfCandidate(kdTree, mergeCandidates.get(mergeCandidates.size() - 1), allMergeCandidates); + //} + //allMergeCandidates = allMergeCandidates.stream() + // .distinct() + // .toList(); + + List> combinations = findCombinations(outlineObject.getTitle(), mergeCandidates); - List> combinations = findCombinations(outlineObject.getTitle(), allMergeCandidates); - double maxDistance = Double.MAX_VALUE; - List bestCombination = new ArrayList<>(); for (List combination : combinations) { double averageDistance = combination.stream() .map(block -> calculateDistance(outlineObject, block)) .mapToDouble(Double::doubleValue).average() .orElse(Double.MAX_VALUE); - if (maxDistance > averageDistance) { - maxDistance = averageDistance; - bestCombination = combination; + if (distanceToBestMergeCandidates > averageDistance) { + distanceToBestMergeCandidates = averageDistance; + bestMergeCandidateCombination = combination; } } - var merged = mergeBlocks(classificationPage, bestCombination); + } + + double minDistance = Math.min(distanceToDirectMatch, Math.min(distanceToSplitCandidate, distanceToBestMergeCandidates)); + + if(minDistance == Double.MAX_VALUE) { + return; + } + if (minDistance == distanceToDirectMatch) { + directMatch.setClassification(headlineType); + } else if (minDistance == distanceToSplitCandidate) { + List others = splitBlock(classificationPage, splitCandidate, outlineObject.getTitle()); + splitCandidate.setClassification(headlineType); + others.forEach(other -> other.setClassification(headlineType)); + } else { + var merged = mergeBlocks(classificationPage, bestMergeCandidateCombination); merged.setClassification(headlineType); } + } - if (splitCandidate != null) { - TextPageBlock other = splitBlock(classificationPage, splitCandidate, outlineObject.getTitle()); - splitCandidate.setClassification(headlineType); - other.setClassification(headlineType); + + private List splitBlock(ClassificationPage classificationPage, TextPageBlock blockToSplit, String text) { + + List otherBlocks = new ArrayList<>(); + int blockToSplitIdx = classificationPage.getTextBlocks().indexOf(blockToSplit); + WordSequenceResult wordSequenceResult = findWordSequence(blockToSplit.getSequences(), text); + List postSequence = blockToSplit.getSequences(); + postSequence.removeAll(wordSequenceResult.inSequence); + postSequence.removeAll(wordSequenceResult.preSequence); + + blockToSplit.setSequences(wordSequenceResult.inSequence); + + if (!wordSequenceResult.preSequence.isEmpty()) { + TextPageBlock block = buildTextBlock(wordSequenceResult.preSequence, 0); + classificationPage.getTextBlocks().add(blockToSplitIdx, block); + otherBlocks.add(block); + blockToSplitIdx++; } + if (!postSequence.isEmpty()) { + TextPageBlock block = buildTextBlock(postSequence, 0); + classificationPage.getTextBlocks().add(blockToSplitIdx + 1, block); + otherBlocks.add(block); + } + return otherBlocks; } - private TextPageBlock splitBlock(ClassificationPage classificationPage, TextPageBlock blockToSplit, String text) { + private static WordSequenceResult findWordSequence(List textPositionSequences, String text) { - List wordSequence = findWordSequence(blockToSplit.getSequences(), text); - List remaining = blockToSplit.getSequences(); - remaining.removeAll(wordSequence); - - blockToSplit.setSequences(wordSequence); - - TextPageBlock other = buildTextBlock(remaining, 0); - classificationPage.getTextBlocks().add(other); - return other; - } - - - private static List findWordSequence(List textPositionSequences, String text) { - - String target = text.replaceAll("\\s", ""); + String target = sanitizeString(text); List inSequence = new ArrayList<>(); + List preSequence = new ArrayList<>(); StringBuilder currentSequence = new StringBuilder(); for (TextPositionSequence sequence : textPositionSequences) { - if (currentSequence.toString().equals(target)) { - return inSequence; - } - currentSequence.append(sequence.toString()); + currentSequence.append(sanitizeString(sequence.toString())); inSequence.add(sequence); if (currentSequence.length() > target.length()) { TextPositionSequence removed = inSequence.remove(0); currentSequence.delete(0, removed.toString().length()); + preSequence.add(removed); while (currentSequence.length() > target.length()) { removed = inSequence.remove(0); currentSequence.delete(0, removed.toString().length()); + preSequence.add(removed); } } + + if (currentSequence.toString().equals(target)) { + return new WordSequenceResult(inSequence, preSequence); + } } - return new ArrayList<>(); + return new WordSequenceResult(new ArrayList<>(), new ArrayList<>()); } @@ -209,7 +234,7 @@ public class BlockificationPostprocessingService { private static void findCombinations(String title, List blocks, List current, List> combinations) { - String target = title.replaceAll("\\s", ""); + String target = sanitizeString(title); if (target.isEmpty()) { combinations.add(new ArrayList<>(current)); return; @@ -219,10 +244,10 @@ public class BlockificationPostprocessingService { .filter(block -> !current.contains(block)) .toList(); for (TextPageBlock block : remaining) { - String prefix = block.getText().replaceAll("\\s", ""); + String prefix = sanitizeString(block.getText()); if (target.startsWith(prefix)) { current.add(block); - findCombinations(target.substring(prefix.length()), blocks, current, combinations); + findCombinations(target.substring(prefix.length()), blocks.subList(blocks.indexOf(block) + 1, blocks.size()), current, combinations); current.remove(current.size() - 1); } } @@ -232,7 +257,7 @@ public class BlockificationPostprocessingService { private double calculateDistance(OutlineObject outlineObject, TextPageBlock pageBlock) { double deltaX = outlineObject.getPoint().getX() - pageBlock.getMinX(); - double deltaY = outlineObject.getPoint().getY() - pageBlock.getMinY(); + double deltaY = pageBlock.getPageHeight() - outlineObject.getPoint().getY() - pageBlock.getMinY(); return Math.sqrt(deltaX * deltaX + deltaY * deltaY); } @@ -255,8 +280,8 @@ public class BlockificationPostprocessingService { private boolean processOutlineForTextBlock(TextPageBlock pageBlock, OutlineProcessionContext context) { OutlineObject outlineObject = context.getOutlineObject(); - String blockText = pageBlock.getText(); - String outlineTitle = outlineObject.getTitle(); + String blockText = sanitizeString(pageBlock.getText()); + String outlineTitle = sanitizeString(outlineObject.getTitle()); boolean blockTextContainsOutlineTitle = blockText.contains(outlineTitle); boolean outlineTitleContainsBlockText = outlineTitle.contains(blockText); @@ -265,8 +290,8 @@ public class BlockificationPostprocessingService { return false; } - if (blockText.equals(outlineTitle)) { - pageBlock.setClassification(PageBlockType.getHeadlineType(outlineObject.getTreeDepth())); + if (blockText.equals(outlineTitle) && context.directMatch == null) { + context.directMatch = pageBlock; return true; } @@ -274,17 +299,27 @@ public class BlockificationPostprocessingService { context.mergeCandidates.add(pageBlock); } - if (blockTextContainsOutlineTitle && context.splitCandidate != null) { + if (blockTextContainsOutlineTitle && context.splitCandidate == null) { context.splitCandidate = pageBlock; } - return false; } + private static String sanitizeString(String text) { + + return text.replaceAll("\\s", "").toLowerCase(Locale.ROOT); + } + + + private record WordSequenceResult(List inSequence, List preSequence) { + + } + @Data private static class OutlineProcessionContext { + private TextPageBlock directMatch; private OutlineObject outlineObject; private List mergeCandidates; private TextPageBlock splitCandidate; @@ -293,10 +328,65 @@ public class BlockificationPostprocessingService { public OutlineProcessionContext(OutlineObject outlineObject) { this.outlineObject = outlineObject; + this.directMatch = null; this.mergeCandidates = new ArrayList<>(); this.splitCandidate = null; } } + @Deprecated + public void sanitizeOutlineBlocksWithKdTree(ClassificationPage classificationPage) { + + List outlineObjects = classificationPage.getOutlineObjects(); + if (classificationPage.getTextBlocks().isEmpty() || outlineObjects.isEmpty()) { + return; + } + + KDTree kdTree = createKdTree(classificationPage); + float pageHeight = classificationPage.getPageHeight(); + + for (OutlineObject outlineObject : outlineObjects) { + + // kd tree contains yx coordinates + KDIterator successorIterator = kdTree.query(new double[]{ // + pageHeight - outlineObject.getPoint().getY() - BLOCK_TO_OUTLINE_DISTANCE_THRESHOLD, 0, // + // + }, // + new double[]{Double.MAX_VALUE, Double.MAX_VALUE}); + + OutlineProcessionContext context = new OutlineProcessionContext(outlineObject); + + boolean earlyStop = false; + while (successorIterator.hasNext() && !earlyStop) { + TextPageBlock pageBlock = successorIterator.next().value(); + earlyStop = processOutlineForTextBlock(pageBlock, context); + processOutlineForTextBlock(pageBlock, context); + } + selectMatch(classificationPage, context); + + } + } + + + @Deprecated + private static KDTree createKdTree(ClassificationPage classificationPage) { + + List textBlocks = classificationPage.getTextBlocks() + .stream() + .filter(block -> block instanceof TextPageBlock) + .toList() + .stream() + .map(block -> (TextPageBlock) block) + .toList(); + + KDTree kdTree = KDTree.create(2); + // insert y first then x, use pdf max y so that the page height is subtracted so that the order is inverted + textBlocks.forEach(block -> { + //var boundingBox = blockToBoundingBox.apply(block); + kdTree.insert(new double[]{block.getMinY(), block.getMinX()}, block); + }); + return kdTree; + } + } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/RedactManagerClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/RedactManagerClassificationService.java index 872dd85..d1b19e0 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/RedactManagerClassificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/RedactManagerClassificationService.java @@ -19,6 +19,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageB import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils; import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils; +import lombok.Data; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; @@ -44,15 +45,17 @@ public class RedactManagerClassificationService { .map(tb -> (TextPageBlock) tb)) .toList(); + + HeadLineClassificationContext headLineClassificationContext = new HeadLineClassificationContext(); for (ClassificationPage page : document.getPages()) { - classifyPage(page, document, headlineFontSizes); + classifyPage(page, document, headlineFontSizes, headLineClassificationContext); } List allHeadlines = document.getPages() .stream() .flatMap(classificationPage -> classificationPage.getTextBlocks() .stream() - .filter(tb -> tb instanceof TextPageBlock && tb.getClassification() != null && tb.getClassification().isHeadline()) + .filter(tb -> tb instanceof TextPageBlock && tb.getClassification() != null && tb.getClassification().isHeadline()) .map(tb -> (TextPageBlock) tb)) .toList(); @@ -67,21 +70,26 @@ public class RedactManagerClassificationService { } - private void classifyPage(ClassificationPage page, ClassificationDocument document, List headlineFontSizes) { + private void classifyPage(ClassificationPage page, ClassificationDocument document, List headlineFontSizes, HeadLineClassificationContext headLineClassificationContext) { for (AbstractPageBlock textBlock : page.getTextBlocks()) { if (textBlock instanceof TextPageBlock) { - classifyBlock((TextPageBlock) textBlock, page, document, headlineFontSizes); + classifyBlock((TextPageBlock) textBlock, page, document, headlineFontSizes, headLineClassificationContext); } } } - private void classifyBlock(TextPageBlock textBlock, ClassificationPage page, ClassificationDocument document, List headlineFontSizes) { + private void classifyBlock(TextPageBlock textBlock, + ClassificationPage page, + ClassificationDocument document, + List headlineFontSizes, + HeadLineClassificationContext headLineClassificationContext) { var bodyTextFrame = page.getBodyTextFrame(); if (textBlock.getClassification() != null && textBlock.getClassification().isHeadline()) { + headLineClassificationContext.setLastHeadlineFromOutline(textBlock); return; } if (document.getFontSizeCounter().getMostPopular() == null) { @@ -122,7 +130,8 @@ public class RedactManagerClassificationService { for (int i = 1; i <= headlineFontSizes.size(); i++) { if (textBlock.getMostPopularWordFontSize() == headlineFontSizes.get(i - 1)) { - textBlock.setClassification(PageBlockType.getHeadlineType(i)); + PageBlockType headlineType = PageBlockType.getHeadlineType(i); + classifyHeadline(textBlock, headLineClassificationContext, headlineType); document.setHeadlines(true); } } @@ -134,7 +143,8 @@ public class RedactManagerClassificationService { && textBlock.getSequences() .get(0).getTextPositions() .get(0).getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) { - textBlock.setClassification(PageBlockType.getHeadlineType(headlineFontSizes.size() + 1)); + PageBlockType headlineType = PageBlockType.getHeadlineType(headlineFontSizes.size() + 1); + classifyHeadline(textBlock, headLineClassificationContext, headlineType); document.setHeadlines(true); } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular() @@ -159,4 +169,66 @@ public class RedactManagerClassificationService { } } + + private static void classifyHeadline(TextPageBlock textBlock, HeadLineClassificationContext headLineClassificationContext, PageBlockType headlineType) { + + TextPageBlock lastHeadline = headLineClassificationContext.getLastHeadline(); + TextPageBlock lastHeadlineFromOutline = headLineClassificationContext.getLastHeadlineFromOutline(); + PageBlockType originalClassifiedBlockType = headLineClassificationContext.getOriginalClassifiedBlockType(); + + if (lastHeadline != null) { + + if (lastHeadline.equals(lastHeadlineFromOutline)) { + + headlineType = getNextType(lastHeadline.getClassification()); + + } else if (originalClassifiedBlockType != null && lastHeadline.getClassification() != originalClassifiedBlockType) { + + PageBlockType lastHeadlineType = lastHeadline.getClassification(); + int difference = getHeadlineNumber(originalClassifiedBlockType) - getHeadlineNumber(lastHeadlineType); + headlineType = PageBlockType.getHeadlineType(getHeadlineNumber(headlineType) + difference); + } + } + + headLineClassificationContext.setOriginalClassifiedBlockType(headlineType); + textBlock.setClassification(headlineType); + headLineClassificationContext.setLastHeadline(textBlock); + } + + + private static PageBlockType getNextType(PageBlockType pageBlockType) { + + return PageBlockType.getHeadlineType(getHeadlineNumber(pageBlockType) + 1); + } + + + private static int getHeadlineNumber(PageBlockType pageBlockType) { + + return switch (pageBlockType) { + case H1 -> 1; + case H2 -> 2; + case H3 -> 3; + case H4 -> 4; + case H5 -> 5; + default -> 6; + }; + } + + + @Data + static class HeadLineClassificationContext { + + TextPageBlock lastHeadline; + PageBlockType originalClassifiedBlockType; + TextPageBlock lastHeadlineFromOutline; + + + public void setLastHeadlineFromOutline(TextPageBlock lastHeadlineFromOutline) { + + this.lastHeadlineFromOutline = lastHeadlineFromOutline; + this.setLastHeadline(lastHeadlineFromOutline); + } + + } + } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java index 8ccfe81..81cebbf 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java @@ -32,7 +32,8 @@ public class ViewerDocumentTest extends BuildDocumentTest { //String fileName = "files/new/abschlussarbeiten-template-institut-fur-informatik-padagogische-hochschule-karlsruhe.pdf"; //String fileName = "files/new/kaust-official-thesis-template.pdf"; //String fileName = "files/new/$100m Offers.pdf"; - String fileName = "files/new/18-Curacron_ToxicidadeOcularInVitro.pdf"; + //String fileName = "files/new/18-Curacron_ToxicidadeOcularInVitro.pdf"; + String fileName = "files/new/UTT-Books-53.pdf"; //String fileName = "files/new/mistitled_outlines_example.pdf"; //String fileName = "files/bdr/Plenarprotokoll 1 (keine Druchsache!) (1) 1.pdf"; String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf"; diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/files/new/UTT-Books-53.pdf b/layoutparser-service/layoutparser-service-server/src/test/resources/files/new/UTT-Books-53.pdf new file mode 100644 index 0000000..c56e8ac Binary files /dev/null and b/layoutparser-service/layoutparser-service-server/src/test/resources/files/new/UTT-Books-53.pdf differ