RED-7074: Design Subsection section tree structure algorithm
* refactoring
This commit is contained in:
parent
9bf2f5c56c
commit
f7aeb9a406
@ -29,7 +29,10 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineExtractorService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineValidationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.TOCEnrichmentService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContents;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
@ -55,6 +58,7 @@ import com.knecon.fforesight.service.layoutparser.processor.services.classificat
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.classification.DocuMineClassificationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.graphics.GraphicExtractorService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.TaasDocumentDataMapper;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.parsing.PDFLinesTextStripper;
|
||||
@ -357,7 +361,7 @@ public class LayoutParsingPipeline {
|
||||
.stream()
|
||||
.flatMap(classificationPage -> classificationPage.getTextBlocks()
|
||||
.stream()
|
||||
.filter(tb -> tb instanceof TextPageBlock && tb.getClassification().isHeadline())
|
||||
.filter(tb -> tb instanceof TextPageBlock && tb.getClassification() != null && tb.getClassification().isHeadline())
|
||||
.map(tb -> (TextPageBlock) tb))
|
||||
.toList();
|
||||
TableOfContents tableOfContents = outlineValidationService.createToC(headlines);
|
||||
@ -368,9 +372,6 @@ public class LayoutParsingPipeline {
|
||||
switch (layoutParsingType) {
|
||||
case CLARIFYND_PARAGRAPH_DEBUG, REDACT_MANAGER_PARAGRAPH_DEBUG -> sectionsBuilderService.buildParagraphDebugSections(classificationDocument);
|
||||
default -> {
|
||||
sectionsBuilderService.buildSections(classificationDocument);
|
||||
sectionsBuilderService.addImagesToSections(classificationDocument);
|
||||
|
||||
tocEnrichmentService.assignSectionBlocksAndImages(classificationDocument);
|
||||
}
|
||||
}
|
||||
|
||||
@ -31,6 +31,19 @@ public enum PageBlockType {
|
||||
}
|
||||
|
||||
|
||||
public static int getHeadlineNumber(PageBlockType pageBlockType) {
|
||||
|
||||
return switch (pageBlockType) {
|
||||
case H1 -> 1;
|
||||
case H2 -> 2;
|
||||
case H3 -> 3;
|
||||
case H4 -> 4;
|
||||
case H5 -> 5;
|
||||
default -> 6;
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
public boolean isHeadline() {
|
||||
|
||||
return this.equals(H1) || this.equals(H2) || this.equals(H3) || this.equals(H4) || this.equals(H5) || this.equals(H6);
|
||||
|
||||
@ -26,10 +26,6 @@ import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocume
|
||||
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObjectTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObjectTreeNode;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@ -53,8 +49,10 @@ public class OutlineExtractorService {
|
||||
PDDocumentOutline documentOutline = document.getDocumentCatalog().getDocumentOutline();
|
||||
|
||||
List<OutlineObjectTreeNode> rootNodes = new ArrayList<>();
|
||||
for (PDOutlineItem child : documentOutline.children()) {
|
||||
rootNodes.add(createOutlineObjectWithChildren(child, document, 1));
|
||||
if (documentOutline != null) {
|
||||
for (PDOutlineItem child : documentOutline.children()) {
|
||||
rootNodes.add(createOutlineObjectWithChildren(child, document, 1));
|
||||
}
|
||||
}
|
||||
|
||||
return new OutlineObjectTree(rootNodes);
|
||||
|
||||
@ -1,16 +1,15 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
import static com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType.getHeadlineNumber;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
import java.util.TreeSet;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
@ -19,79 +18,6 @@ import lombok.extern.slf4j.Slf4j;
|
||||
@Slf4j
|
||||
public class OutlineValidationService {
|
||||
|
||||
public TableOfContents validateWithToC(List<TextPageBlock> allHeadlines, List<TextPageBlock> headlinesFromOutlines, List<TextPageBlock> newlyClassifiedHeadlines) {
|
||||
|
||||
TableOfContents validatedToC = createToC(headlinesFromOutlines);
|
||||
TableOfContents currentToC = createToC(allHeadlines);
|
||||
|
||||
TableOfContentItem lastHeadlineFromOutlines = null;
|
||||
for (TableOfContentItem tocItem : currentToC.getAllTableOfContentItems()) {
|
||||
if (!containsItem(validatedToC, tocItem)) {
|
||||
addItemAtCorrectPosition(validatedToC, tocItem, lastHeadlineFromOutlines);
|
||||
} else {
|
||||
lastHeadlineFromOutlines = tocItem;
|
||||
}
|
||||
}
|
||||
return validatedToC;
|
||||
}
|
||||
|
||||
|
||||
private boolean containsBlock(TableOfContents toc, TextPageBlock block) {
|
||||
|
||||
for (TableOfContentItem existingItem : toc.getMainSections()) {
|
||||
if (existingItem.getHeadline().equals(block) || existingItem.contains(block)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
private boolean containsItem(TableOfContents toc, TableOfContentItem tocItem) {
|
||||
|
||||
for (TableOfContentItem existingItem : toc.getMainSections()) {
|
||||
if (existingItem.equals(tocItem) || existingItem.contains(tocItem)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
private void addItemAtCorrectPosition(TableOfContents toc, TableOfContentItem tocItem, TableOfContentItem lastHeadlineFromOutlines) {
|
||||
|
||||
//if (lastHeadlineFromOutlines == null || tocItem.g)
|
||||
//if(!tocItem.getChildren().isEmpty()) {
|
||||
//
|
||||
//}
|
||||
}
|
||||
|
||||
|
||||
public TableOfContents createToCOld(List<TextPageBlock> headlines) {
|
||||
|
||||
List<TableOfContentItem> mainSections = new ArrayList<>();
|
||||
int parentDepth = 7; // more than 6 (h6)
|
||||
TableOfContentItem parent = null;
|
||||
for (TextPageBlock current : headlines) {
|
||||
int currentDepth = getDepth(current.getClassification());
|
||||
if (parentDepth >= currentDepth) {
|
||||
parentDepth = currentDepth;
|
||||
parent = new TableOfContentItem(current);
|
||||
mainSections.add(parent);
|
||||
} else {
|
||||
assert (parent != null);
|
||||
while (parentDepth < currentDepth && parent.getParent() != null) {
|
||||
parent = parent.getParent();
|
||||
parentDepth = getDepth(parent.getHeadline().getClassification());
|
||||
}
|
||||
parent.addChild(new TableOfContentItem(current));
|
||||
}
|
||||
}
|
||||
return new TableOfContents(mainSections);
|
||||
|
||||
}
|
||||
|
||||
|
||||
public TableOfContents createToC(List<TextPageBlock> headlines) {
|
||||
|
||||
List<TableOfContentItem> mainSections = new ArrayList<>();
|
||||
@ -100,7 +26,7 @@ public class OutlineValidationService {
|
||||
TreeSet<Integer> depths = new TreeSet<>();
|
||||
|
||||
for (TextPageBlock current : headlines) {
|
||||
int currentDepth = getDepth(current.getClassification());
|
||||
int currentDepth = getHeadlineNumber(current.getClassification());
|
||||
Integer parentDepth = depths.floor(currentDepth - 1);
|
||||
|
||||
var tocItem = new TableOfContentItem(current);
|
||||
@ -110,12 +36,12 @@ public class OutlineValidationService {
|
||||
|
||||
} else {
|
||||
assert last != null;
|
||||
int lastDepth = getDepth(last.getHeadline().getClassification());
|
||||
int lastDepth = getHeadlineNumber(last.getHeadline().getClassification());
|
||||
|
||||
if (lastDepth < parentDepth) {
|
||||
parentDepth = lastDepth;
|
||||
} else if (lastDepth == currentDepth && last.getParent() != null) {
|
||||
parentDepth = getDepth(last.getParent().getHeadline().getClassification());
|
||||
parentDepth = getHeadlineNumber(last.getParent().getHeadline().getClassification());
|
||||
}
|
||||
|
||||
TableOfContentItem parent = lastItemsPerDepth.get(parentDepth);
|
||||
@ -130,131 +56,4 @@ public class OutlineValidationService {
|
||||
return new TableOfContents(mainSections);
|
||||
}
|
||||
|
||||
|
||||
public void updateOutlineObjectTree(OutlineObjectTree outlineObjectTree, List<TextPageBlock> allHeadlines, List<TextPageBlock> newlyClassifiedHeadlines) {
|
||||
|
||||
List<OutlineObject> newOutlineObjects = newlyClassifiedHeadlines.stream()
|
||||
.map(textPageBlock -> new OutlineObject(textPageBlock.getText(),
|
||||
textPageBlock.getPage(),
|
||||
new Point2D.Double(textPageBlock.getMinX(), textPageBlock.getMinY()),
|
||||
getDepth(textPageBlock.getClassification())))
|
||||
.toList();
|
||||
|
||||
}
|
||||
|
||||
|
||||
private static int getDepth(PageBlockType pageBlockType) {
|
||||
|
||||
return switch (pageBlockType) {
|
||||
case H1 -> 1;
|
||||
case H2 -> 2;
|
||||
case H3 -> 3;
|
||||
case H4 -> 4;
|
||||
case H5 -> 5;
|
||||
default -> 6;
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
public void validate(List<TextPageBlock> allHeadlines, List<TextPageBlock> newlyClassifiedHeadlines) {
|
||||
|
||||
if (allHeadlines.size() - newlyClassifiedHeadlines.size() > newlyClassifiedHeadlines.size()) {
|
||||
|
||||
List<Headline> headlines = allHeadlines.stream()
|
||||
.map(textPageBlock -> new Headline(textPageBlock, newlyClassifiedHeadlines.contains(textPageBlock)))
|
||||
.toList();
|
||||
for (TextPageBlock newHeadline : newlyClassifiedHeadlines) {
|
||||
int newHeadlineIndex = headlines.indexOf(newHeadline);
|
||||
List<TextPageBlock> adjacentNewlyClassified = findAdjacentNewlyClassified(newHeadline, newlyClassifiedHeadlines);
|
||||
// Find neighboring headlines from outlines
|
||||
//TextPageBlock previousOutline = findNeighboringOutline(allHeadlines.indexOf(newHeadline), -1);
|
||||
//TextPageBlock nextOutline = findNeighboringOutline(allHeadlines.indexOf(newHeadline), headlinesFromOutlines, 1);
|
||||
|
||||
// If we have neighboring outlines, perform comparison
|
||||
//if (previousOutline != null && nextOutline != null) {
|
||||
// // Compare headline orders
|
||||
// int orderComparison = compareHeadlineOrder(previousOutline, nextOutline);
|
||||
// if (orderComparison != 0) {
|
||||
// // Set classification based on comparison
|
||||
// setClassification(newHeadline, orderComparison, previousOutline, nextOutline);
|
||||
// }
|
||||
//}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
private List<TextPageBlock> findAdjacentNewlyClassified(TextPageBlock headline, List<TextPageBlock> newlyClassifiedHeadlines) {
|
||||
// Find adjacent newly classified headlines
|
||||
List<TextPageBlock> adjacentNewlyClassified = new ArrayList<>();
|
||||
int index = newlyClassifiedHeadlines.indexOf(headline);
|
||||
if (index != -1) {
|
||||
adjacentNewlyClassified.add(headline);
|
||||
for (int i = index - 1; i >= 0; i--) {
|
||||
if (newlyClassifiedHeadlines.get(i).equals(adjacentNewlyClassified.get(0))) {
|
||||
adjacentNewlyClassified.add(0, newlyClassifiedHeadlines.get(i));
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
for (int i = index + 1; i < newlyClassifiedHeadlines.size(); i++) {
|
||||
if (newlyClassifiedHeadlines.get(i).equals(adjacentNewlyClassified.get(adjacentNewlyClassified.size() - 1))) {
|
||||
adjacentNewlyClassified.add(newlyClassifiedHeadlines.get(i));
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
return adjacentNewlyClassified;
|
||||
}
|
||||
|
||||
|
||||
private TextPageBlock findNeighboringOutline(TextPageBlock headline, List<TextPageBlock> headlinesFromOutlines, int direction) {
|
||||
// Find neighboring headline from outlines in the specified direction
|
||||
int index = headlinesFromOutlines.indexOf(headline);
|
||||
if (index != -1 && index + direction >= 0 && index + direction < headlinesFromOutlines.size()) {
|
||||
return headlinesFromOutlines.get(index + direction);
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
|
||||
private int compareHeadlineOrder(TextPageBlock headline1, TextPageBlock headline2) {
|
||||
// Compare headline orders
|
||||
// Implement your comparison logic here
|
||||
return 0; // Placeholder return, implement actual comparison logic
|
||||
}
|
||||
|
||||
|
||||
private void setClassification(TextPageBlock headline, int orderComparison, TextPageBlock previousOutline, TextPageBlock nextOutline) {
|
||||
// Set classification based on comparison with neighboring outlines
|
||||
// Implement your classification logic here
|
||||
}
|
||||
|
||||
|
||||
record Headline(TextPageBlock textPageBlock, boolean newlyClassified) {
|
||||
|
||||
@Override
|
||||
public boolean equals(Object obj) {
|
||||
|
||||
if (this == obj) {
|
||||
return true;
|
||||
}
|
||||
if (obj == null || getClass() != obj.getClass()) {
|
||||
return false;
|
||||
}
|
||||
Headline headline = (Headline) obj;
|
||||
return Objects.equals(textPageBlock, headline.textPageBlock);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
|
||||
return Objects.hash(textPageBlock);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -28,14 +28,9 @@ public class TOCEnrichmentService {
|
||||
TableOfContents toc = document.getTableOfContents();
|
||||
List<AbstractPageBlock> startBlocks = new ArrayList<>();
|
||||
List<ClassifiedImage> startImages = new ArrayList<>();
|
||||
//Map<TableOfContentItem, List<AbstractPageBlock>> sectionsMap = new HashMap<>();
|
||||
TableOfContentItem currentSection = null;
|
||||
boolean foundFirstHeadline = false;
|
||||
|
||||
//for (TableOfContentItem item : toc.getAllTableOfContentItems()) {
|
||||
// sectionsMap.put(item, new ArrayList<>());
|
||||
//}
|
||||
|
||||
List<ClassificationHeader> headers = new ArrayList<>();
|
||||
List<ClassificationFooter> footers = new ArrayList<>();
|
||||
TablePageBlock previousTable = null;
|
||||
@ -90,7 +85,6 @@ public class TOCEnrichmentService {
|
||||
startBlocks.add(current);
|
||||
} else {
|
||||
currentSection.getSectionBlocks().add(current);
|
||||
//sectionsMap.get(currentSection).add(current);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -179,7 +173,6 @@ public class TOCEnrichmentService {
|
||||
unassigned.setImages(startImages);
|
||||
document.getTableOfContents().getMainSections().add(0, unassigned);
|
||||
}
|
||||
//document.setSectionsMap(sectionsMap);
|
||||
document.setHeaders(headers);
|
||||
document.setFooters(footers);
|
||||
}
|
||||
|
||||
@ -1,7 +1,6 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
|
||||
@ -43,7 +43,9 @@ public class TableOfContents implements Iterable<TableOfContentItem> {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public List<TableOfContentItem> getAllTableOfContentItems() {
|
||||
|
||||
List<TableOfContentItem> allItems = new ArrayList<>();
|
||||
for (TableOfContentItem item : mainSections) {
|
||||
collectTableOfContentItems(item, allItems);
|
||||
@ -51,7 +53,9 @@ public class TableOfContents implements Iterable<TableOfContentItem> {
|
||||
return allItems;
|
||||
}
|
||||
|
||||
|
||||
private void collectTableOfContentItems(TableOfContentItem item, List<TableOfContentItem> allItems) {
|
||||
|
||||
allItems.add(item);
|
||||
for (TableOfContentItem child : item.getChildren()) {
|
||||
collectTableOfContentItems(child, allItems);
|
||||
@ -59,39 +63,74 @@ public class TableOfContents implements Iterable<TableOfContentItem> {
|
||||
}
|
||||
|
||||
|
||||
private boolean containsBlock(TextPageBlock block) {
|
||||
|
||||
for (TableOfContentItem existingItem : this.getMainSections()) {
|
||||
if (existingItem.getHeadline().equals(block) || existingItem.contains(block)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
private boolean containsItem(TableOfContentItem tocItem) {
|
||||
|
||||
for (TableOfContentItem existingItem : this.getMainSections()) {
|
||||
if (existingItem.equals(tocItem) || existingItem.contains(tocItem)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public @NonNull Iterator<TableOfContentItem> iterator() {
|
||||
|
||||
return new TableOfContentItemIterator(mainSections);
|
||||
}
|
||||
|
||||
|
||||
private static class TableOfContentItemIterator implements Iterator<TableOfContentItem> {
|
||||
|
||||
private final Stack<Iterator<TableOfContentItem>> stack = new Stack<>();
|
||||
|
||||
|
||||
public TableOfContentItemIterator(List<TableOfContentItem> mainSections) {
|
||||
|
||||
stack.push(mainSections.iterator());
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
|
||||
ensureStackTopIsCurrent();
|
||||
return !stack.isEmpty() && stack.peek().hasNext();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public TableOfContentItem next() {
|
||||
|
||||
ensureStackTopIsCurrent();
|
||||
TableOfContentItem currentItem = stack.peek().next();
|
||||
if (currentItem.getChildren() != null && !currentItem.getChildren().isEmpty()) {
|
||||
stack.push(currentItem.getChildren().iterator());
|
||||
stack.push(currentItem.getChildren()
|
||||
.iterator());
|
||||
}
|
||||
return currentItem;
|
||||
}
|
||||
|
||||
|
||||
private void ensureStackTopIsCurrent() {
|
||||
|
||||
while (!stack.isEmpty() && !stack.peek().hasNext()) {
|
||||
stack.pop();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -27,6 +27,7 @@ import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
@Deprecated
|
||||
public class SectionsBuilderService {
|
||||
|
||||
|
||||
|
||||
@ -4,16 +4,12 @@ import static com.knecon.fforesight.service.layoutparser.processor.services.bloc
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.ListIterator;
|
||||
import java.util.Locale;
|
||||
import java.util.function.Function;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
import org.tinspin.index.Index;
|
||||
import org.tinspin.index.kdtree.KDIterator;
|
||||
import org.tinspin.index.kdtree.KDTree;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||
@ -262,13 +258,6 @@ public class BlockificationPostprocessingService {
|
||||
}
|
||||
|
||||
|
||||
private static void addNeighborsOfCandidate(KDTree<TextPageBlock> kdTree, TextPageBlock mergeCandidate, List<TextPageBlock> allMergeCandidates) {
|
||||
|
||||
var boundingBox = blockToBoundingBox.apply(mergeCandidate);
|
||||
Index.PointIteratorKnn<TextPageBlock> knnIterator = kdTree.queryKnn(new double[]{boundingBox.getMinX(), boundingBox.getMaxY()}, 4);
|
||||
knnIterator.forEachRemaining(neighbor -> allMergeCandidates.add(neighbor.value()));
|
||||
}
|
||||
|
||||
|
||||
// currently only three cases are handled here:
|
||||
// 1. equality
|
||||
@ -335,58 +324,4 @@ public class BlockificationPostprocessingService {
|
||||
|
||||
}
|
||||
|
||||
@Deprecated
|
||||
public void sanitizeOutlineBlocksWithKdTree(ClassificationPage classificationPage) {
|
||||
|
||||
List<OutlineObject> outlineObjects = classificationPage.getOutlineObjects();
|
||||
if (classificationPage.getTextBlocks().isEmpty() || outlineObjects.isEmpty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
KDTree<TextPageBlock> kdTree = createKdTree(classificationPage);
|
||||
float pageHeight = classificationPage.getPageHeight();
|
||||
|
||||
for (OutlineObject outlineObject : outlineObjects) {
|
||||
|
||||
// kd tree contains yx coordinates
|
||||
KDIterator<TextPageBlock> successorIterator = kdTree.query(new double[]{ //
|
||||
pageHeight - outlineObject.getPoint().getY() - BLOCK_TO_OUTLINE_DISTANCE_THRESHOLD, 0, //
|
||||
//
|
||||
}, //
|
||||
new double[]{Double.MAX_VALUE, Double.MAX_VALUE});
|
||||
|
||||
OutlineProcessionContext context = new OutlineProcessionContext(outlineObject);
|
||||
|
||||
boolean earlyStop = false;
|
||||
while (successorIterator.hasNext() && !earlyStop) {
|
||||
TextPageBlock pageBlock = successorIterator.next().value();
|
||||
earlyStop = processOutlineForTextBlock(pageBlock, context);
|
||||
processOutlineForTextBlock(pageBlock, context);
|
||||
}
|
||||
selectMatch(classificationPage, context);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Deprecated
|
||||
private static KDTree<TextPageBlock> createKdTree(ClassificationPage classificationPage) {
|
||||
|
||||
List<TextPageBlock> textBlocks = classificationPage.getTextBlocks()
|
||||
.stream()
|
||||
.filter(block -> block instanceof TextPageBlock)
|
||||
.toList()
|
||||
.stream()
|
||||
.map(block -> (TextPageBlock) block)
|
||||
.toList();
|
||||
|
||||
KDTree<TextPageBlock> kdTree = KDTree.create(2);
|
||||
// insert y first then x, use pdf max y so that the page height is subtracted so that the order is inverted
|
||||
textBlocks.forEach(block -> {
|
||||
//var boundingBox = blockToBoundingBox.apply(block);
|
||||
kdTree.insert(new double[]{block.getMinY(), block.getMinX()}, block);
|
||||
});
|
||||
return kdTree;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,5 +1,7 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.classification;
|
||||
|
||||
import static com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType.getHeadlineNumber;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
@ -169,17 +171,6 @@ public class RedactManagerClassificationService {
|
||||
}
|
||||
|
||||
|
||||
private static int getHeadlineNumber(PageBlockType pageBlockType) {
|
||||
|
||||
return switch (pageBlockType) {
|
||||
case H1 -> 1;
|
||||
case H2 -> 2;
|
||||
case H3 -> 3;
|
||||
case H4 -> 4;
|
||||
case H5 -> 5;
|
||||
default -> 6;
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
@Data
|
||||
|
||||
@ -76,9 +76,6 @@ public class DocumentGraphFactory {
|
||||
|
||||
private void addSections(LayoutParsingType layoutParsingType, ClassificationDocument classificationDocument, Context context, Document document) {
|
||||
|
||||
//classificationDocument.getSections()
|
||||
// .forEach(section -> SectionNodeFactory.addSection(layoutParsingType, null, section.getNonEmptyPageBlocks(), section.getImages(), context, document));
|
||||
|
||||
for (TableOfContentItem tocItem : classificationDocument.getTableOfContents()) {
|
||||
var parent = tocItem.getParent() == null ? null : tocItem.getParent().getSection();
|
||||
Optional<Section> section = SectionNodeFactory.addSection(layoutParsingType, parent, tocItem.getNonEmptySectionBlocks(), tocItem.getImages(), context, document);
|
||||
|
||||
@ -1,10 +1,12 @@
|
||||
package com.knecon.fforesight.service.layoutparser.server.graph;
|
||||
|
||||
import java.io.File;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.junit.jupiter.api.Disabled;
|
||||
import org.junit.jupiter.api.Test;
|
||||
@ -25,14 +27,66 @@ import lombok.SneakyThrows;
|
||||
|
||||
public class ViewerDocumentTest extends BuildDocumentTest {
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testViewerDocuments() {
|
||||
|
||||
String directory = "files/syngenta_190_deduplicated/";
|
||||
Path dirPath = new ClassPathResource(directory).getFile().toPath();
|
||||
|
||||
// Ensure the directory exists and is accessible
|
||||
if (!Files.exists(dirPath) || !Files.isDirectory(dirPath)) {
|
||||
throw new IllegalArgumentException("The specified path must be a directory and it must exist.");
|
||||
}
|
||||
|
||||
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null);
|
||||
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
|
||||
|
||||
// Use try-with-resources to ensure the stream is closed after use
|
||||
try (Stream<Path> paths = Files.walk(dirPath)) {
|
||||
paths.filter(Files::isRegularFile)
|
||||
.filter(path -> path.toString().endsWith(".pdf")) // Filter to process only PDF files
|
||||
.forEach(path -> processFile(path, layoutGridService));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void processFile(Path filePath, LayoutGridService layoutGridService) {
|
||||
|
||||
try {
|
||||
File documentFile = filePath.toFile();
|
||||
String tmpFileName = "/tmp/" + filePath.getFileName().toString() + "_VIEWER.pdf";
|
||||
|
||||
long start = System.currentTimeMillis();
|
||||
var classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||
documentFile,
|
||||
new ImageServiceResponse(),
|
||||
new TableServiceResponse(),
|
||||
new VisualLayoutParsingResponse(),
|
||||
Map.of("file", filePath.getFileName().toFile().toString()));
|
||||
Document document = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER, classificationDocument);
|
||||
|
||||
if (classificationDocument.getOutlineObjectTree().getRootNodes().size() > 1) {
|
||||
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true);
|
||||
System.out.printf("Processed %s in %.2fs%n", filePath, ((float) (System.currentTimeMillis() - start)) / 1000);
|
||||
}
|
||||
} catch (Exception exception)
|
||||
{
|
||||
System.out.println(exception);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testViewerDocument() {
|
||||
|
||||
String fileName = "files/new/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
|
||||
//String fileName = "files/syngenta_190_deduplicated/1 Abamectin_prr.pdf";
|
||||
//String fileName = "files/new/abschlussarbeiten-template-institut-fur-informatik-padagogische-hochschule-karlsruhe.pdf";
|
||||
//String fileName = "files/new/kaust-official-thesis-template.pdf";
|
||||
//String fileName = "files/new/$100m Offers.pdf";
|
||||
String fileName = "files/new/18-Curacron_ToxicidadeOcularInVitro.pdf";
|
||||
//String fileName = "files/new/18-Curacron_ToxicidadeOcularInVitro.pdf";
|
||||
//String fileName = "files/new/mistitled_outlines_example.pdf";
|
||||
//String fileName = "files/bdr/Plenarprotokoll 1 (keine Druchsache!) (1) 1.pdf";
|
||||
//String fileName = "files/new/UTT-Books-53.pdf";
|
||||
@ -48,6 +102,7 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
||||
System.out.printf("Total time: %.2fs%n", ((float) (System.currentTimeMillis() - start)) / 1000);
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testViewerDocumentWithImages() {
|
||||
@ -90,11 +145,11 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
||||
var documentFile = new ClassPathResource(fileName).getFile();
|
||||
|
||||
var classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE,
|
||||
documentFile,
|
||||
new ImageServiceResponse(),
|
||||
tableResponse,
|
||||
new VisualLayoutParsingResponse(),
|
||||
Map.of("file", path.getFileName().toFile().toString()));
|
||||
documentFile,
|
||||
new ImageServiceResponse(),
|
||||
tableResponse,
|
||||
new VisualLayoutParsingResponse(),
|
||||
Map.of("file", path.getFileName().toFile().toString()));
|
||||
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null);
|
||||
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
|
||||
Document document = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.DOCUMINE, classificationDocument);
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user