diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java index c252cbe..010c985 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java @@ -29,7 +29,10 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType; import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage; +import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineExtractorService; import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject; +import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineValidationService; +import com.knecon.fforesight.service.layoutparser.processor.model.outline.TOCEnrichmentService; import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContents; import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; @@ -55,6 +58,7 @@ import com.knecon.fforesight.service.layoutparser.processor.services.classificat import com.knecon.fforesight.service.layoutparser.processor.services.classification.DocuMineClassificationService; import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService; import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory; +import com.knecon.fforesight.service.layoutparser.processor.services.graphics.GraphicExtractorService; import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper; import com.knecon.fforesight.service.layoutparser.processor.services.mapper.TaasDocumentDataMapper; import com.knecon.fforesight.service.layoutparser.processor.services.parsing.PDFLinesTextStripper; @@ -357,7 +361,7 @@ public class LayoutParsingPipeline { .stream() .flatMap(classificationPage -> classificationPage.getTextBlocks() .stream() - .filter(tb -> tb instanceof TextPageBlock && tb.getClassification().isHeadline()) + .filter(tb -> tb instanceof TextPageBlock && tb.getClassification() != null && tb.getClassification().isHeadline()) .map(tb -> (TextPageBlock) tb)) .toList(); TableOfContents tableOfContents = outlineValidationService.createToC(headlines); @@ -368,9 +372,6 @@ public class LayoutParsingPipeline { switch (layoutParsingType) { case CLARIFYND_PARAGRAPH_DEBUG, REDACT_MANAGER_PARAGRAPH_DEBUG -> sectionsBuilderService.buildParagraphDebugSections(classificationDocument); default -> { - sectionsBuilderService.buildSections(classificationDocument); - sectionsBuilderService.addImagesToSections(classificationDocument); - tocEnrichmentService.assignSectionBlocksAndImages(classificationDocument); } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/PageBlockType.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/PageBlockType.java index 1292138..f67127a 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/PageBlockType.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/PageBlockType.java @@ -31,6 +31,19 @@ public enum PageBlockType { } + public static int getHeadlineNumber(PageBlockType pageBlockType) { + + return switch (pageBlockType) { + case H1 -> 1; + case H2 -> 2; + case H3 -> 3; + case H4 -> 4; + case H5 -> 5; + default -> 6; + }; + } + + public boolean isHeadline() { return this.equals(H1) || this.equals(H2) || this.equals(H3) || this.equals(H4) || this.equals(H5) || this.equals(H6); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineExtractorService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineExtractorService.java index 9be8c33..eb3f31b 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineExtractorService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineExtractorService.java @@ -26,10 +26,6 @@ import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocume import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem; import org.springframework.stereotype.Service; -import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject; -import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObjectTree; -import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObjectTreeNode; - import lombok.SneakyThrows; import lombok.extern.slf4j.Slf4j; @@ -53,8 +49,10 @@ public class OutlineExtractorService { PDDocumentOutline documentOutline = document.getDocumentCatalog().getDocumentOutline(); List rootNodes = new ArrayList<>(); - for (PDOutlineItem child : documentOutline.children()) { - rootNodes.add(createOutlineObjectWithChildren(child, document, 1)); + if (documentOutline != null) { + for (PDOutlineItem child : documentOutline.children()) { + rootNodes.add(createOutlineObjectWithChildren(child, document, 1)); + } } return new OutlineObjectTree(rootNodes); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineValidationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineValidationService.java index a1d5838..0931fe6 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineValidationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineValidationService.java @@ -1,16 +1,15 @@ package com.knecon.fforesight.service.layoutparser.processor.model.outline; -import java.awt.geom.Point2D; +import static com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType.getHeadlineNumber; + import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; -import java.util.Objects; import java.util.TreeSet; import org.springframework.stereotype.Service; -import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import lombok.extern.slf4j.Slf4j; @@ -19,79 +18,6 @@ import lombok.extern.slf4j.Slf4j; @Slf4j public class OutlineValidationService { - public TableOfContents validateWithToC(List allHeadlines, List headlinesFromOutlines, List newlyClassifiedHeadlines) { - - TableOfContents validatedToC = createToC(headlinesFromOutlines); - TableOfContents currentToC = createToC(allHeadlines); - - TableOfContentItem lastHeadlineFromOutlines = null; - for (TableOfContentItem tocItem : currentToC.getAllTableOfContentItems()) { - if (!containsItem(validatedToC, tocItem)) { - addItemAtCorrectPosition(validatedToC, tocItem, lastHeadlineFromOutlines); - } else { - lastHeadlineFromOutlines = tocItem; - } - } - return validatedToC; - } - - - private boolean containsBlock(TableOfContents toc, TextPageBlock block) { - - for (TableOfContentItem existingItem : toc.getMainSections()) { - if (existingItem.getHeadline().equals(block) || existingItem.contains(block)) { - return true; - } - } - return false; - } - - - private boolean containsItem(TableOfContents toc, TableOfContentItem tocItem) { - - for (TableOfContentItem existingItem : toc.getMainSections()) { - if (existingItem.equals(tocItem) || existingItem.contains(tocItem)) { - return true; - } - } - return false; - } - - - private void addItemAtCorrectPosition(TableOfContents toc, TableOfContentItem tocItem, TableOfContentItem lastHeadlineFromOutlines) { - - //if (lastHeadlineFromOutlines == null || tocItem.g) - //if(!tocItem.getChildren().isEmpty()) { -// - //} - } - - - public TableOfContents createToCOld(List headlines) { - - List mainSections = new ArrayList<>(); - int parentDepth = 7; // more than 6 (h6) - TableOfContentItem parent = null; - for (TextPageBlock current : headlines) { - int currentDepth = getDepth(current.getClassification()); - if (parentDepth >= currentDepth) { - parentDepth = currentDepth; - parent = new TableOfContentItem(current); - mainSections.add(parent); - } else { - assert (parent != null); - while (parentDepth < currentDepth && parent.getParent() != null) { - parent = parent.getParent(); - parentDepth = getDepth(parent.getHeadline().getClassification()); - } - parent.addChild(new TableOfContentItem(current)); - } - } - return new TableOfContents(mainSections); - - } - - public TableOfContents createToC(List headlines) { List mainSections = new ArrayList<>(); @@ -100,7 +26,7 @@ public class OutlineValidationService { TreeSet depths = new TreeSet<>(); for (TextPageBlock current : headlines) { - int currentDepth = getDepth(current.getClassification()); + int currentDepth = getHeadlineNumber(current.getClassification()); Integer parentDepth = depths.floor(currentDepth - 1); var tocItem = new TableOfContentItem(current); @@ -110,12 +36,12 @@ public class OutlineValidationService { } else { assert last != null; - int lastDepth = getDepth(last.getHeadline().getClassification()); + int lastDepth = getHeadlineNumber(last.getHeadline().getClassification()); if (lastDepth < parentDepth) { parentDepth = lastDepth; } else if (lastDepth == currentDepth && last.getParent() != null) { - parentDepth = getDepth(last.getParent().getHeadline().getClassification()); + parentDepth = getHeadlineNumber(last.getParent().getHeadline().getClassification()); } TableOfContentItem parent = lastItemsPerDepth.get(parentDepth); @@ -130,131 +56,4 @@ public class OutlineValidationService { return new TableOfContents(mainSections); } - - public void updateOutlineObjectTree(OutlineObjectTree outlineObjectTree, List allHeadlines, List newlyClassifiedHeadlines) { - - List newOutlineObjects = newlyClassifiedHeadlines.stream() - .map(textPageBlock -> new OutlineObject(textPageBlock.getText(), - textPageBlock.getPage(), - new Point2D.Double(textPageBlock.getMinX(), textPageBlock.getMinY()), - getDepth(textPageBlock.getClassification()))) - .toList(); - - } - - - private static int getDepth(PageBlockType pageBlockType) { - - return switch (pageBlockType) { - case H1 -> 1; - case H2 -> 2; - case H3 -> 3; - case H4 -> 4; - case H5 -> 5; - default -> 6; - }; - } - - - public void validate(List allHeadlines, List newlyClassifiedHeadlines) { - - if (allHeadlines.size() - newlyClassifiedHeadlines.size() > newlyClassifiedHeadlines.size()) { - - List headlines = allHeadlines.stream() - .map(textPageBlock -> new Headline(textPageBlock, newlyClassifiedHeadlines.contains(textPageBlock))) - .toList(); - for (TextPageBlock newHeadline : newlyClassifiedHeadlines) { - int newHeadlineIndex = headlines.indexOf(newHeadline); - List adjacentNewlyClassified = findAdjacentNewlyClassified(newHeadline, newlyClassifiedHeadlines); - // Find neighboring headlines from outlines - //TextPageBlock previousOutline = findNeighboringOutline(allHeadlines.indexOf(newHeadline), -1); - //TextPageBlock nextOutline = findNeighboringOutline(allHeadlines.indexOf(newHeadline), headlinesFromOutlines, 1); - - // If we have neighboring outlines, perform comparison - //if (previousOutline != null && nextOutline != null) { - // // Compare headline orders - // int orderComparison = compareHeadlineOrder(previousOutline, nextOutline); - // if (orderComparison != 0) { - // // Set classification based on comparison - // setClassification(newHeadline, orderComparison, previousOutline, nextOutline); - // } - //} - } - } - - } - - - private List findAdjacentNewlyClassified(TextPageBlock headline, List newlyClassifiedHeadlines) { - // Find adjacent newly classified headlines - List adjacentNewlyClassified = new ArrayList<>(); - int index = newlyClassifiedHeadlines.indexOf(headline); - if (index != -1) { - adjacentNewlyClassified.add(headline); - for (int i = index - 1; i >= 0; i--) { - if (newlyClassifiedHeadlines.get(i).equals(adjacentNewlyClassified.get(0))) { - adjacentNewlyClassified.add(0, newlyClassifiedHeadlines.get(i)); - } else { - break; - } - } - for (int i = index + 1; i < newlyClassifiedHeadlines.size(); i++) { - if (newlyClassifiedHeadlines.get(i).equals(adjacentNewlyClassified.get(adjacentNewlyClassified.size() - 1))) { - adjacentNewlyClassified.add(newlyClassifiedHeadlines.get(i)); - } else { - break; - } - } - } - return adjacentNewlyClassified; - } - - - private TextPageBlock findNeighboringOutline(TextPageBlock headline, List headlinesFromOutlines, int direction) { - // Find neighboring headline from outlines in the specified direction - int index = headlinesFromOutlines.indexOf(headline); - if (index != -1 && index + direction >= 0 && index + direction < headlinesFromOutlines.size()) { - return headlinesFromOutlines.get(index + direction); - } - return null; - } - - - private int compareHeadlineOrder(TextPageBlock headline1, TextPageBlock headline2) { - // Compare headline orders - // Implement your comparison logic here - return 0; // Placeholder return, implement actual comparison logic - } - - - private void setClassification(TextPageBlock headline, int orderComparison, TextPageBlock previousOutline, TextPageBlock nextOutline) { - // Set classification based on comparison with neighboring outlines - // Implement your classification logic here - } - - - record Headline(TextPageBlock textPageBlock, boolean newlyClassified) { - - @Override - public boolean equals(Object obj) { - - if (this == obj) { - return true; - } - if (obj == null || getClass() != obj.getClass()) { - return false; - } - Headline headline = (Headline) obj; - return Objects.equals(textPageBlock, headline.textPageBlock); - } - - - @Override - public int hashCode() { - - return Objects.hash(textPageBlock); - } - - } - } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/TOCEnrichmentService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/TOCEnrichmentService.java index f5aa06f..95849b4 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/TOCEnrichmentService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/TOCEnrichmentService.java @@ -28,14 +28,9 @@ public class TOCEnrichmentService { TableOfContents toc = document.getTableOfContents(); List startBlocks = new ArrayList<>(); List startImages = new ArrayList<>(); - //Map> sectionsMap = new HashMap<>(); TableOfContentItem currentSection = null; boolean foundFirstHeadline = false; - //for (TableOfContentItem item : toc.getAllTableOfContentItems()) { - // sectionsMap.put(item, new ArrayList<>()); - //} - List headers = new ArrayList<>(); List footers = new ArrayList<>(); TablePageBlock previousTable = null; @@ -90,7 +85,6 @@ public class TOCEnrichmentService { startBlocks.add(current); } else { currentSection.getSectionBlocks().add(current); - //sectionsMap.get(currentSection).add(current); } } } @@ -179,7 +173,6 @@ public class TOCEnrichmentService { unassigned.setImages(startImages); document.getTableOfContents().getMainSections().add(0, unassigned); } - //document.setSectionsMap(sectionsMap); document.setHeaders(headers); document.setFooters(footers); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/TableOfContentItem.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/TableOfContentItem.java index bbbbeac..2d229bf 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/TableOfContentItem.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/TableOfContentItem.java @@ -1,7 +1,6 @@ package com.knecon.fforesight.service.layoutparser.processor.model.outline; import java.util.ArrayList; -import java.util.Collection; import java.util.List; import java.util.stream.Collectors; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/TableOfContents.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/TableOfContents.java index 769e5ac..72ee8a2 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/TableOfContents.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/TableOfContents.java @@ -43,7 +43,9 @@ public class TableOfContents implements Iterable { } } + public List getAllTableOfContentItems() { + List allItems = new ArrayList<>(); for (TableOfContentItem item : mainSections) { collectTableOfContentItems(item, allItems); @@ -51,7 +53,9 @@ public class TableOfContents implements Iterable { return allItems; } + private void collectTableOfContentItems(TableOfContentItem item, List allItems) { + allItems.add(item); for (TableOfContentItem child : item.getChildren()) { collectTableOfContentItems(child, allItems); @@ -59,39 +63,74 @@ public class TableOfContents implements Iterable { } + private boolean containsBlock(TextPageBlock block) { + + for (TableOfContentItem existingItem : this.getMainSections()) { + if (existingItem.getHeadline().equals(block) || existingItem.contains(block)) { + return true; + } + } + return false; + } + + + private boolean containsItem(TableOfContentItem tocItem) { + + for (TableOfContentItem existingItem : this.getMainSections()) { + if (existingItem.equals(tocItem) || existingItem.contains(tocItem)) { + return true; + } + } + return false; + } + + @Override public @NonNull Iterator iterator() { return new TableOfContentItemIterator(mainSections); } + private static class TableOfContentItemIterator implements Iterator { + private final Stack> stack = new Stack<>(); + public TableOfContentItemIterator(List mainSections) { + stack.push(mainSections.iterator()); } + @Override public boolean hasNext() { + ensureStackTopIsCurrent(); return !stack.isEmpty() && stack.peek().hasNext(); } + @Override public TableOfContentItem next() { + ensureStackTopIsCurrent(); TableOfContentItem currentItem = stack.peek().next(); if (currentItem.getChildren() != null && !currentItem.getChildren().isEmpty()) { - stack.push(currentItem.getChildren().iterator()); + stack.push(currentItem.getChildren() + .iterator()); } return currentItem; } + private void ensureStackTopIsCurrent() { + while (!stack.isEmpty() && !stack.peek().hasNext()) { stack.pop(); } } + } + } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/SectionsBuilderService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/SectionsBuilderService.java index a0e9267..e7cfe31 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/SectionsBuilderService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/SectionsBuilderService.java @@ -27,6 +27,7 @@ import lombok.extern.slf4j.Slf4j; @Slf4j @Service +@Deprecated public class SectionsBuilderService { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/BlockificationPostprocessingService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/BlockificationPostprocessingService.java index 4755125..1930ce7 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/BlockificationPostprocessingService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/BlockificationPostprocessingService.java @@ -4,16 +4,12 @@ import static com.knecon.fforesight.service.layoutparser.processor.services.bloc import java.awt.geom.Rectangle2D; import java.util.ArrayList; -import java.util.Collections; import java.util.List; import java.util.ListIterator; import java.util.Locale; import java.util.function.Function; import org.springframework.stereotype.Service; -import org.tinspin.index.Index; -import org.tinspin.index.kdtree.KDIterator; -import org.tinspin.index.kdtree.KDTree; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType; @@ -262,13 +258,6 @@ public class BlockificationPostprocessingService { } - private static void addNeighborsOfCandidate(KDTree kdTree, TextPageBlock mergeCandidate, List allMergeCandidates) { - - var boundingBox = blockToBoundingBox.apply(mergeCandidate); - Index.PointIteratorKnn knnIterator = kdTree.queryKnn(new double[]{boundingBox.getMinX(), boundingBox.getMaxY()}, 4); - knnIterator.forEachRemaining(neighbor -> allMergeCandidates.add(neighbor.value())); - } - // currently only three cases are handled here: // 1. equality @@ -335,58 +324,4 @@ public class BlockificationPostprocessingService { } - @Deprecated - public void sanitizeOutlineBlocksWithKdTree(ClassificationPage classificationPage) { - - List outlineObjects = classificationPage.getOutlineObjects(); - if (classificationPage.getTextBlocks().isEmpty() || outlineObjects.isEmpty()) { - return; - } - - KDTree kdTree = createKdTree(classificationPage); - float pageHeight = classificationPage.getPageHeight(); - - for (OutlineObject outlineObject : outlineObjects) { - - // kd tree contains yx coordinates - KDIterator successorIterator = kdTree.query(new double[]{ // - pageHeight - outlineObject.getPoint().getY() - BLOCK_TO_OUTLINE_DISTANCE_THRESHOLD, 0, // - // - }, // - new double[]{Double.MAX_VALUE, Double.MAX_VALUE}); - - OutlineProcessionContext context = new OutlineProcessionContext(outlineObject); - - boolean earlyStop = false; - while (successorIterator.hasNext() && !earlyStop) { - TextPageBlock pageBlock = successorIterator.next().value(); - earlyStop = processOutlineForTextBlock(pageBlock, context); - processOutlineForTextBlock(pageBlock, context); - } - selectMatch(classificationPage, context); - - } - } - - - @Deprecated - private static KDTree createKdTree(ClassificationPage classificationPage) { - - List textBlocks = classificationPage.getTextBlocks() - .stream() - .filter(block -> block instanceof TextPageBlock) - .toList() - .stream() - .map(block -> (TextPageBlock) block) - .toList(); - - KDTree kdTree = KDTree.create(2); - // insert y first then x, use pdf max y so that the page height is subtracted so that the order is inverted - textBlocks.forEach(block -> { - //var boundingBox = blockToBoundingBox.apply(block); - kdTree.insert(new double[]{block.getMinY(), block.getMinX()}, block); - }); - return kdTree; - } - } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/RedactManagerClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/RedactManagerClassificationService.java index 97b4eea..62c9eef 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/RedactManagerClassificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/RedactManagerClassificationService.java @@ -1,5 +1,7 @@ package com.knecon.fforesight.service.layoutparser.processor.services.classification; +import static com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType.getHeadlineNumber; + import java.util.List; import java.util.regex.Pattern; @@ -169,17 +171,6 @@ public class RedactManagerClassificationService { } - private static int getHeadlineNumber(PageBlockType pageBlockType) { - - return switch (pageBlockType) { - case H1 -> 1; - case H2 -> 2; - case H3 -> 3; - case H4 -> 4; - case H5 -> 5; - default -> 6; - }; - } @Data diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java index 8b21ec0..d15b336 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java @@ -76,9 +76,6 @@ public class DocumentGraphFactory { private void addSections(LayoutParsingType layoutParsingType, ClassificationDocument classificationDocument, Context context, Document document) { - //classificationDocument.getSections() - // .forEach(section -> SectionNodeFactory.addSection(layoutParsingType, null, section.getNonEmptyPageBlocks(), section.getImages(), context, document)); - for (TableOfContentItem tocItem : classificationDocument.getTableOfContents()) { var parent = tocItem.getParent() == null ? null : tocItem.getParent().getSection(); Optional
section = SectionNodeFactory.addSection(layoutParsingType, parent, tocItem.getNonEmptySectionBlocks(), tocItem.getImages(), context, document); diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java index e9a091e..a0246cb 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java @@ -1,10 +1,12 @@ package com.knecon.fforesight.service.layoutparser.server.graph; import java.io.File; +import java.nio.file.Files; import java.nio.file.Path; import java.util.ArrayList; import java.util.List; import java.util.Map; +import java.util.stream.Stream; import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; @@ -25,14 +27,66 @@ import lombok.SneakyThrows; public class ViewerDocumentTest extends BuildDocumentTest { + @Test + @SneakyThrows + public void testViewerDocuments() { + + String directory = "files/syngenta_190_deduplicated/"; + Path dirPath = new ClassPathResource(directory).getFile().toPath(); + + // Ensure the directory exists and is accessible + if (!Files.exists(dirPath) || !Files.isDirectory(dirPath)) { + throw new IllegalArgumentException("The specified path must be a directory and it must exist."); + } + + ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null); + LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService); + + // Use try-with-resources to ensure the stream is closed after use + try (Stream paths = Files.walk(dirPath)) { + paths.filter(Files::isRegularFile) + .filter(path -> path.toString().endsWith(".pdf")) // Filter to process only PDF files + .forEach(path -> processFile(path, layoutGridService)); + } + } + + + private void processFile(Path filePath, LayoutGridService layoutGridService) { + + try { + File documentFile = filePath.toFile(); + String tmpFileName = "/tmp/" + filePath.getFileName().toString() + "_VIEWER.pdf"; + + long start = System.currentTimeMillis(); + var classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER, + documentFile, + new ImageServiceResponse(), + new TableServiceResponse(), + new VisualLayoutParsingResponse(), + Map.of("file", filePath.getFileName().toFile().toString())); + Document document = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER, classificationDocument); + + if (classificationDocument.getOutlineObjectTree().getRootNodes().size() > 1) { + layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true); + System.out.printf("Processed %s in %.2fs%n", filePath, ((float) (System.currentTimeMillis() - start)) / 1000); + } + } catch (Exception exception) + { + System.out.println(exception); + } + } + + @Test @SneakyThrows public void testViewerDocument() { + String fileName = "files/new/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf"; + //String fileName = "files/syngenta_190_deduplicated/1 Abamectin_prr.pdf"; //String fileName = "files/new/abschlussarbeiten-template-institut-fur-informatik-padagogische-hochschule-karlsruhe.pdf"; //String fileName = "files/new/kaust-official-thesis-template.pdf"; //String fileName = "files/new/$100m Offers.pdf"; - String fileName = "files/new/18-Curacron_ToxicidadeOcularInVitro.pdf"; + //String fileName = "files/new/18-Curacron_ToxicidadeOcularInVitro.pdf"; //String fileName = "files/new/mistitled_outlines_example.pdf"; //String fileName = "files/bdr/Plenarprotokoll 1 (keine Druchsache!) (1) 1.pdf"; //String fileName = "files/new/UTT-Books-53.pdf"; @@ -48,6 +102,7 @@ public class ViewerDocumentTest extends BuildDocumentTest { System.out.printf("Total time: %.2fs%n", ((float) (System.currentTimeMillis() - start)) / 1000); } + @Test @SneakyThrows public void testViewerDocumentWithImages() { @@ -90,11 +145,11 @@ public class ViewerDocumentTest extends BuildDocumentTest { var documentFile = new ClassPathResource(fileName).getFile(); var classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE, - documentFile, - new ImageServiceResponse(), - tableResponse, - new VisualLayoutParsingResponse(), - Map.of("file", path.getFileName().toFile().toString())); + documentFile, + new ImageServiceResponse(), + tableResponse, + new VisualLayoutParsingResponse(), + Map.of("file", path.getFileName().toFile().toString())); ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null); LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService); Document document = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.DOCUMINE, classificationDocument);