RED-7074: Design Subsection section tree structure algorithm
* first draft: further implementations
This commit is contained in:
parent
7279d0a870
commit
c888746761
@ -299,7 +299,7 @@ public class LayoutParsingPipeline {
|
|||||||
List<OutlineObject> outlineObjects = outlineObjectTree.getOutlineObjectsPerPage()
|
List<OutlineObject> outlineObjects = outlineObjectTree.getOutlineObjectsPerPage()
|
||||||
.get(pageNumber - 1);
|
.get(pageNumber - 1);
|
||||||
if(outlineObjects != null) {
|
if(outlineObjects != null) {
|
||||||
blockificationPostprocessingService.sanitizeOutlineBlocks(classificationPage, outlineObjects);
|
blockificationPostprocessingService.sanitizeOutlineBlocksWithKdTree(classificationPage, outlineObjects);
|
||||||
}
|
}
|
||||||
|
|
||||||
classificationPage.setCleanRulings(cleanRulings);
|
classificationPage.setCleanRulings(cleanRulings);
|
||||||
@ -319,7 +319,7 @@ public class LayoutParsingPipeline {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (signatures.containsKey(pageNumber)) {
|
if (signatures.containsKey(pageNumber)) {
|
||||||
if (classificationPage.getImages() == null || classificationPage.getImages().size() == 0) {
|
if (classificationPage.getImages() == null || classificationPage.getImages().isEmpty()) {
|
||||||
classificationPage.setImages(signatures.get(pageNumber));
|
classificationPage.setImages(signatures.get(pageNumber));
|
||||||
} else {
|
} else {
|
||||||
classificationPage.getImages().addAll(signatures.get(pageNumber));
|
classificationPage.getImages().addAll(signatures.get(pageNumber));
|
||||||
@ -352,6 +352,8 @@ public class LayoutParsingPipeline {
|
|||||||
case CLARIFYND -> clarifyndClassificationService.classifyDocument(classificationDocument);
|
case CLARIFYND -> clarifyndClassificationService.classifyDocument(classificationDocument);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// compute ToC
|
||||||
|
|
||||||
log.info("Building Sections for {}", identifier);
|
log.info("Building Sections for {}", identifier);
|
||||||
|
|
||||||
switch (layoutParsingType) {
|
switch (layoutParsingType) {
|
||||||
|
|||||||
@ -11,7 +11,6 @@ import org.apache.pdfbox.cos.COSArray;
|
|||||||
import org.apache.pdfbox.cos.COSBase;
|
import org.apache.pdfbox.cos.COSBase;
|
||||||
import org.apache.pdfbox.cos.COSDictionary;
|
import org.apache.pdfbox.cos.COSDictionary;
|
||||||
import org.apache.pdfbox.cos.COSName;
|
import org.apache.pdfbox.cos.COSName;
|
||||||
import org.apache.pdfbox.cos.COSNumber;
|
|
||||||
import org.apache.pdfbox.cos.COSString;
|
import org.apache.pdfbox.cos.COSString;
|
||||||
import org.apache.pdfbox.pdmodel.PDDestinationNameTreeNode;
|
import org.apache.pdfbox.pdmodel.PDDestinationNameTreeNode;
|
||||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||||
@ -19,6 +18,10 @@ import org.apache.pdfbox.pdmodel.PDDocumentNameDictionary;
|
|||||||
import org.apache.pdfbox.pdmodel.PDPage;
|
import org.apache.pdfbox.pdmodel.PDPage;
|
||||||
import org.apache.pdfbox.pdmodel.interactive.action.PDAction;
|
import org.apache.pdfbox.pdmodel.interactive.action.PDAction;
|
||||||
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDDestination;
|
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDDestination;
|
||||||
|
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDPageFitHeightDestination;
|
||||||
|
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDPageFitRectangleDestination;
|
||||||
|
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDPageFitWidthDestination;
|
||||||
|
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDPageXYZDestination;
|
||||||
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline;
|
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline;
|
||||||
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
|
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
|
||||||
import org.springframework.stereotype.Service;
|
import org.springframework.stereotype.Service;
|
||||||
@ -34,6 +37,16 @@ import lombok.extern.slf4j.Slf4j;
|
|||||||
@Slf4j
|
@Slf4j
|
||||||
public class OutlineExtractorService {
|
public class OutlineExtractorService {
|
||||||
|
|
||||||
|
private static final String PDDESTINATION_TYPE_FIT = "Fit";
|
||||||
|
private static final String PDDESTINATION_TYPE_FIT_B = "FitB";
|
||||||
|
private static final String PDDESTINATION_TYPE_FIT_H = "FitH";
|
||||||
|
private static final String PDDESTINATION_TYPE_FIT_V = "FitV";
|
||||||
|
private static final String PDDESTINATION_TYPE_FIT_R = "FitR";
|
||||||
|
private static final String PDDESTINATION_TYPE_FIT_BH = "FitBH";
|
||||||
|
private static final String PDDESTINATION_TYPE_FIT_BV = "FitBV";
|
||||||
|
private static final String PDDESTINATION_TYPE_XYZ = "XYZ";
|
||||||
|
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public OutlineObjectTree getOutlineObjectTree(PDDocument document) {
|
public OutlineObjectTree getOutlineObjectTree(PDDocument document) {
|
||||||
|
|
||||||
@ -41,8 +54,7 @@ public class OutlineExtractorService {
|
|||||||
|
|
||||||
List<OutlineObjectTreeNode> rootNodes = new ArrayList<>();
|
List<OutlineObjectTreeNode> rootNodes = new ArrayList<>();
|
||||||
for (PDOutlineItem child : documentOutline.children()) {
|
for (PDOutlineItem child : documentOutline.children()) {
|
||||||
Optional<OutlineObjectTreeNode> outlineObject = createOutlineObjectWithChildren(child, document, 1);
|
rootNodes.add(createOutlineObjectWithChildren(child, document, 1));
|
||||||
outlineObject.ifPresent(rootNodes::add);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return new OutlineObjectTree(rootNodes);
|
return new OutlineObjectTree(rootNodes);
|
||||||
@ -50,85 +62,81 @@ public class OutlineExtractorService {
|
|||||||
|
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
private Optional<OutlineObjectTreeNode> createOutlineObjectWithChildren(PDOutlineItem item, PDDocument document, int depth) {
|
private OutlineObjectTreeNode createOutlineObjectWithChildren(PDOutlineItem item, PDDocument document, int depth) {
|
||||||
|
|
||||||
Optional<OutlineObjectTreeNode> outlineObject = createOutlineObject(item, document, depth);
|
OutlineObjectTreeNode outlineObject = createOutlineObject(item, document, depth);
|
||||||
if (outlineObject.isPresent()) {
|
for (var child : item.children()) {
|
||||||
for (var child : item.children()) {
|
OutlineObjectTreeNode outlineObjectWithChildren = createOutlineObjectWithChildren(child, document, depth + 1);
|
||||||
Optional<OutlineObjectTreeNode> outlineObjectWithChildren = createOutlineObjectWithChildren(child, document, depth + 1);
|
outlineObject.addChild(outlineObjectWithChildren);
|
||||||
outlineObjectWithChildren.ifPresent(outlineObjectTreeNode -> outlineObject.get().addChild(outlineObjectTreeNode));
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return outlineObject;
|
return outlineObject;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private Optional<OutlineObjectTreeNode> createOutlineObject(PDOutlineItem item, PDDocument document, int depth) {
|
@SneakyThrows
|
||||||
|
private OutlineObjectTreeNode createOutlineObject(PDOutlineItem item, PDDocument document, int depth) {
|
||||||
|
|
||||||
|
String title = item.getTitle();
|
||||||
|
|
||||||
|
PDPage page = item.findDestinationPage(document);
|
||||||
|
int pageNumber = document.getPages().indexOf(page);
|
||||||
|
|
||||||
|
Optional<Point2D> outlinePosition = Optional.empty();
|
||||||
|
|
||||||
try {
|
try {
|
||||||
String title = item.getTitle();
|
|
||||||
|
|
||||||
PDPage page = item.findDestinationPage(document);
|
|
||||||
int pageNumber = document.getPages().indexOf(page);
|
|
||||||
|
|
||||||
|
|
||||||
PDDocumentNameDictionary names = document.getDocumentCatalog().getNames();
|
PDDocumentNameDictionary names = document.getDocumentCatalog().getNames();
|
||||||
PDDestinationNameTreeNode destinations = null;
|
PDDestinationNameTreeNode destinations = null;
|
||||||
if (names != null) {
|
if (names != null) {
|
||||||
destinations = names.getDests();
|
destinations = names.getDests();
|
||||||
}
|
}
|
||||||
|
|
||||||
Optional<Point2D> outlinePosition = Optional.empty();
|
|
||||||
|
|
||||||
PDDestination destination = item.getDestination();
|
PDDestination destination = item.getDestination();
|
||||||
if (destination != null) {
|
if (destination != null) {
|
||||||
outlinePosition = getLocationFromCOSBase(page, destinations, destination.getCOSObject());
|
outlinePosition = getLocationFromCOSBase(destinations, destination.getCOSObject());
|
||||||
}
|
}
|
||||||
|
|
||||||
if (outlinePosition.isEmpty()) {
|
if (outlinePosition.isEmpty()) {
|
||||||
|
|
||||||
PDAction action = item.getAction();
|
PDAction action = item.getAction();
|
||||||
COSDictionary cosDictionary = null;
|
|
||||||
if (action != null) {
|
if (action != null) {
|
||||||
cosDictionary = action.getCOSObject();
|
outlinePosition = extractOutlineLocationGoTo(destinations, action.getCOSObject());
|
||||||
}
|
}
|
||||||
|
|
||||||
outlinePosition = extractOutlineLocationGoTo(page, cosDictionary, destinations);
|
|
||||||
}
|
}
|
||||||
return outlinePosition.map(position -> new OutlineObjectTreeNode(new OutlineObject(title, pageNumber, position, depth)));
|
|
||||||
|
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
|
log.info(String.format("Error occurred during position resolution for outline item on page %s with title %s: " + e, pageNumber, title));
|
||||||
log.info("Could not find outline item in document with title: " + item.getTitle());
|
|
||||||
return Optional.empty();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return new OutlineObjectTreeNode(new OutlineObject(title, pageNumber, outlinePosition.orElse(new Point2D.Float(0, 0)), depth));
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
private static Optional<Point2D> extractOutlineLocationGoTo(PDPage page, COSDictionary cosDictionary, PDDestinationNameTreeNode destinations) {
|
private static Optional<Point2D> extractOutlineLocationGoTo(PDDestinationNameTreeNode destinations, COSDictionary cosDictionary) {
|
||||||
|
|
||||||
if (isGoToAction(cosDictionary)) {
|
if (isGoToAction(cosDictionary)) {
|
||||||
COSBase cosBase = cosDictionary.getItem(COSName.D);
|
COSBase cosBase = cosDictionary.getItem(COSName.D);
|
||||||
return getLocationFromCOSBase(page, destinations, cosBase);
|
return getLocationFromCOSBase(destinations, cosBase);
|
||||||
}
|
}
|
||||||
|
|
||||||
return Optional.empty();
|
return Optional.empty();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private static Optional<Point2D> getLocationFromCOSBase(PDPage page, PDDestinationNameTreeNode destinations, COSBase cosBase) throws IOException {
|
private static Optional<Point2D> getLocationFromCOSBase(PDDestinationNameTreeNode destinations, COSBase cosBase) throws IOException {
|
||||||
|
|
||||||
if (cosBase != null) {
|
if (cosBase != null) {
|
||||||
if (cosBase instanceof COSArray cosArray) {
|
if (cosBase instanceof COSArray cosArray) {
|
||||||
return getLocationFromCosArrayWithXYZCoordinates(cosArray, page);
|
return getLocationFromCosArray(cosArray);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (cosBase instanceof COSString cosString) {
|
if (cosBase instanceof COSString cosString) {
|
||||||
String destinationName = cosString.getString();
|
String destinationName = cosString.getString();
|
||||||
COSArray cosArray = destinations.getValue(destinationName).getCOSObject();
|
COSArray cosArray = destinations.getValue(destinationName).getCOSObject();
|
||||||
return getLocationFromCosArrayWithXYZCoordinates(cosArray, page);
|
return getLocationFromCosArray(cosArray);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
@ -136,20 +144,54 @@ public class OutlineExtractorService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private static Optional<Point2D> getLocationFromCosArrayWithXYZCoordinates(COSArray cosArray, PDPage page) {
|
private static Optional<Point2D> getLocationFromCosArray(COSArray cosArray) {
|
||||||
|
|
||||||
if (isXYZDestination(cosArray)) {
|
boolean located = false;
|
||||||
float x = ((COSNumber) cosArray.get(2)).floatValue();
|
float x = 0;
|
||||||
float y = Math.abs(((COSNumber) cosArray.get(3)).floatValue() - page.getMediaBox().getHeight());
|
float y = 0;
|
||||||
return Optional.of(new Point2D.Float(x, y));
|
|
||||||
|
try {
|
||||||
|
|
||||||
|
PDDestination destination = PDDestination.create(cosArray);
|
||||||
|
COSName type = (COSName) cosArray.getObject(1);
|
||||||
|
String typeString = type.getName();
|
||||||
|
|
||||||
|
switch (typeString) {
|
||||||
|
case PDDESTINATION_TYPE_FIT_V:
|
||||||
|
case PDDESTINATION_TYPE_FIT_BV:
|
||||||
|
PDPageFitHeightDestination fitHeightDestination = (PDPageFitHeightDestination) destination;
|
||||||
|
x = fitHeightDestination.getLeft();
|
||||||
|
located = true;
|
||||||
|
break;
|
||||||
|
case PDDESTINATION_TYPE_FIT_R:
|
||||||
|
PDPageFitRectangleDestination fitRectangleDestination = (PDPageFitRectangleDestination) destination;
|
||||||
|
x = fitRectangleDestination.getLeft();
|
||||||
|
y = fitRectangleDestination.getTop();
|
||||||
|
located = true;
|
||||||
|
break;
|
||||||
|
case PDDESTINATION_TYPE_FIT_H:
|
||||||
|
case PDDESTINATION_TYPE_FIT_BH:
|
||||||
|
PDPageFitWidthDestination fitWidthDestination = (PDPageFitWidthDestination) destination;
|
||||||
|
y = fitWidthDestination.getTop();
|
||||||
|
located = true;
|
||||||
|
break;
|
||||||
|
case PDDESTINATION_TYPE_XYZ:
|
||||||
|
PDPageXYZDestination xyzDestination = (PDPageXYZDestination) destination;
|
||||||
|
x = xyzDestination.getLeft();
|
||||||
|
y = xyzDestination.getTop();
|
||||||
|
located = true;
|
||||||
|
break;
|
||||||
|
case PDDESTINATION_TYPE_FIT:
|
||||||
|
case PDDESTINATION_TYPE_FIT_B:
|
||||||
|
default:
|
||||||
|
}
|
||||||
|
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new RuntimeException(e);
|
||||||
}
|
}
|
||||||
return Optional.empty();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
return located ? Optional.of(new Point2D.Float(x, y)) : Optional.empty();
|
||||||
|
|
||||||
private static boolean isXYZDestination(COSArray cosArray) {
|
|
||||||
|
|
||||||
return cosArray != null && cosArray.getName(1).equals("XYZ");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -1,57 +1,298 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.services.blockification;
|
package com.knecon.fforesight.service.layoutparser.processor.services.blockification;
|
||||||
|
|
||||||
|
import static com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocstrumBlockificationService.buildTextBlock;
|
||||||
|
|
||||||
|
import java.awt.geom.Rectangle2D;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Comparator;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.function.Function;
|
||||||
|
|
||||||
import org.springframework.stereotype.Service;
|
import org.springframework.stereotype.Service;
|
||||||
|
import org.tinspin.index.Index;
|
||||||
|
import org.tinspin.index.kdtree.KDIterator;
|
||||||
|
import org.tinspin.index.kdtree.KDTree;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.DoubleUtils;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
|
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
import com.knecon.fforesight.service.layoutparser.processor.services.factory.SearchTextWithTextPositionFactory;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||||
|
|
||||||
@Service
|
@Service
|
||||||
public class BlockificationPostprocessingService {
|
public class BlockificationPostprocessingService {
|
||||||
|
|
||||||
public static final float BLOCK_TO_OUTLINE_DISTANCE_THRESHOLD = 8.0f;
|
private static final float BLOCK_TO_OUTLINE_DISTANCE_THRESHOLD = 5.0f;
|
||||||
|
private static final double BLOCK_COMPARISON_PRECISION = 1.0;
|
||||||
|
|
||||||
|
private static final Function<TextPageBlock, Rectangle2D> blockToBoundingBox = (abstractPageBlock) -> abstractPageBlock.getSequences()
|
||||||
|
.stream()
|
||||||
|
.map(textPositionSequence -> textPositionSequence.getTextPositions()
|
||||||
|
.stream()
|
||||||
|
.map(tp -> SearchTextWithTextPositionFactory.mapRedTextPositionToInitialUserSpace(tp, textPositionSequence))
|
||||||
|
.collect(RectangleTransformations.collectBBox()))
|
||||||
|
.collect(RectangleTransformations.collectBBox());
|
||||||
|
|
||||||
|
|
||||||
public void sanitizeOutlineBlocks(ClassificationPage classificationPage, List<OutlineObject> outlineObjects) {
|
public void sanitizeOutlineBlocks(ClassificationPage classificationPage, List<OutlineObject> outlineObjects) {
|
||||||
|
|
||||||
for (AbstractPageBlock textBlock : classificationPage.getTextBlocks()) {
|
List<TextPageBlock> textBlocks = classificationPage.getTextBlocks()
|
||||||
for (OutlineObject outlineObject : outlineObjects) {
|
.stream()
|
||||||
|
.filter(block -> block instanceof TextPageBlock)
|
||||||
|
.toList()
|
||||||
|
.stream()
|
||||||
|
.map(block -> (TextPageBlock) block)
|
||||||
|
.toList();
|
||||||
|
|
||||||
String blockText = textBlock.getText();
|
textBlocks.sort(Comparator.comparing(TextPageBlock::getMinY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, BLOCK_COMPARISON_PRECISION))
|
||||||
String outlineTitle = outlineObject.getTitle();
|
.thenComparing(TextPageBlock::getMinX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, BLOCK_COMPARISON_PRECISION)));
|
||||||
|
|
||||||
boolean blockTextContainsOutlineTitle = blockText.contains(outlineTitle);
|
for (OutlineObject outlineObject : outlineObjects) {
|
||||||
boolean outlineTitleContainsBlockText = outlineTitle.contains(blockText);
|
|
||||||
if (!isBlockCloseToOutline(textBlock, outlineObject) || !blockTextContainsOutlineTitle && !outlineTitleContainsBlockText) {
|
boolean matchedExactly = false;
|
||||||
continue;
|
List<TextPageBlock> splitCandidates = new ArrayList<>();
|
||||||
}
|
List<TextPageBlock> mergeCandidates = new ArrayList<>();
|
||||||
|
|
||||||
if (blockText.equals(outlineTitle)) {
|
for (TextPageBlock textPageBlock : textBlocks) {
|
||||||
|
matchedExactly = processOutlineObjectForTextBlock(textPageBlock, outlineObject, splitCandidates, mergeCandidates);
|
||||||
textBlock.setClassification(PageBlockType.getHeadlineType(outlineObject.getTreeDepth()));
|
|
||||||
continue;
|
if (matchedExactly) {
|
||||||
}
|
break;
|
||||||
|
|
||||||
if(blockTextContainsOutlineTitle) {
|
|
||||||
splitTextBlock(textBlock, outlineTitle, classificationPage);
|
|
||||||
}
|
|
||||||
|
|
||||||
if(outlineTitleContainsBlockText) {
|
|
||||||
// find other blocks, merge them into current, mark them for deletion after loop
|
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!matchedExactly) {
|
||||||
|
//selectMatch(outlineObject, kdTree, splitCandidates, mergeCandidates);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private boolean isBlockCloseToOutline(AbstractPageBlock textBlock, OutlineObject outlineObject) {
|
public void sanitizeOutlineBlocksWithKdTree(ClassificationPage classificationPage, List<OutlineObject> outlineObjects) {
|
||||||
|
|
||||||
|
List<TextPageBlock> textBlocks = classificationPage.getTextBlocks()
|
||||||
|
.stream()
|
||||||
|
.filter(block -> block instanceof TextPageBlock)
|
||||||
|
.toList()
|
||||||
|
.stream()
|
||||||
|
.map(block -> (TextPageBlock) block)
|
||||||
|
.toList();
|
||||||
|
if (textBlocks.isEmpty() || outlineObjects.isEmpty()) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
KDTree<TextPageBlock> kdTree = KDTree.create(2);
|
||||||
|
textBlocks.forEach(block -> {
|
||||||
|
var boundingBox = blockToBoundingBox.apply(block);
|
||||||
|
kdTree.insert(new double[]{boundingBox.getMinX(), boundingBox.getMaxY()}, block);
|
||||||
|
});
|
||||||
|
|
||||||
|
for (OutlineObject outlineObject : outlineObjects) {
|
||||||
|
|
||||||
|
KDIterator<TextPageBlock> successorIterator = kdTree.query(new double[]{ //
|
||||||
|
0, //
|
||||||
|
outlineObject.getPoint().getY() - BLOCK_TO_OUTLINE_DISTANCE_THRESHOLD //
|
||||||
|
}, //
|
||||||
|
new double[]{Double.MAX_VALUE, Double.MAX_VALUE});
|
||||||
|
|
||||||
|
boolean matchedExactly = false;
|
||||||
|
List<TextPageBlock> splitCandidates = new ArrayList<>();
|
||||||
|
List<TextPageBlock> mergeCandidates = new ArrayList<>();
|
||||||
|
|
||||||
|
while (successorIterator.hasNext() && !matchedExactly) {
|
||||||
|
TextPageBlock pageBlock = successorIterator.next().value();
|
||||||
|
matchedExactly = processOutlineObjectForTextBlock(pageBlock, outlineObject, splitCandidates, mergeCandidates);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!matchedExactly) {
|
||||||
|
selectMatch(classificationPage, outlineObject, kdTree, splitCandidates, mergeCandidates);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private void selectMatch(ClassificationPage classificationPage,
|
||||||
|
OutlineObject outlineObject,
|
||||||
|
KDTree<TextPageBlock> kdTree,
|
||||||
|
List<TextPageBlock> splitCandidates,
|
||||||
|
List<TextPageBlock> mergeCandidates) {
|
||||||
|
|
||||||
|
for (TextPageBlock splitCandidate : splitCandidates) {
|
||||||
|
System.out.println(splitCandidate);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!mergeCandidates.isEmpty()) {
|
||||||
|
|
||||||
|
List<TextPageBlock> allMergeCandidates = new ArrayList<>(mergeCandidates);
|
||||||
|
addNeighborsOfCandidate(kdTree, mergeCandidates.get(0), allMergeCandidates);
|
||||||
|
addNeighborsOfCandidate(kdTree, mergeCandidates.get(mergeCandidates.size() - 1), allMergeCandidates);
|
||||||
|
allMergeCandidates = allMergeCandidates.stream()
|
||||||
|
.distinct()
|
||||||
|
.toList();
|
||||||
|
|
||||||
|
List<List<TextPageBlock>> combinations = findCombinations(outlineObject.getTitle(), allMergeCandidates);
|
||||||
|
double maxDistance = Double.MAX_VALUE;
|
||||||
|
List<TextPageBlock> bestCombination = new ArrayList<>();
|
||||||
|
for (List<TextPageBlock> combination : combinations) {
|
||||||
|
double averageDistance = combination.stream()
|
||||||
|
.map(block -> calculateDistance(outlineObject, block))
|
||||||
|
.mapToDouble(Double::doubleValue).average()
|
||||||
|
.orElse(Double.MAX_VALUE);
|
||||||
|
if (maxDistance > averageDistance) {
|
||||||
|
maxDistance = averageDistance;
|
||||||
|
bestCombination = combination;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
mergeBlocks(classificationPage, bestCombination);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private void mergeBlocks(ClassificationPage classificationPage, List<TextPageBlock> blocksToMerge) {
|
||||||
|
|
||||||
|
if (blocksToMerge.size() <= 1) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
TextPageBlock firstBlock = blocksToMerge.get(0);
|
||||||
|
|
||||||
|
List<TextPageBlock> mergedBlocks = new ArrayList<>();
|
||||||
|
for (TextPageBlock textPageBlock : blocksToMerge.subList(1, blocksToMerge.size())) {
|
||||||
|
|
||||||
|
if (firstBlock != null && !firstBlock.getSequences().isEmpty()) {
|
||||||
|
|
||||||
|
if (textPageBlock.getDir() == firstBlock.getDir()) {
|
||||||
|
firstBlock.getSequences().addAll(textPageBlock.getSequences());
|
||||||
|
mergedBlocks.add(textPageBlock);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
assert firstBlock != null;
|
||||||
|
buildTextBlock(firstBlock.getSequences(), 0);
|
||||||
|
firstBlock.setToDuplicate(false);
|
||||||
|
classificationPage.getTextBlocks().removeAll(mergedBlocks);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static List<List<TextPageBlock>> findCombinations(String title, List<TextPageBlock> blocks) {
|
||||||
|
|
||||||
|
List<List<TextPageBlock>> combinations = new ArrayList<>();
|
||||||
|
findCombinations(title, blocks, new ArrayList<>(), combinations);
|
||||||
|
return combinations;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static void findCombinations(String title, List<TextPageBlock> blocks, List<TextPageBlock> current, List<List<TextPageBlock>> combinations) {
|
||||||
|
|
||||||
|
String target = title.replaceAll("\\s", "");
|
||||||
|
if (target.isEmpty()) {
|
||||||
|
combinations.add(new ArrayList<>(current));
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
List<TextPageBlock> remaining = blocks.stream()
|
||||||
|
.filter(block -> !current.contains(block))
|
||||||
|
.toList();
|
||||||
|
for (TextPageBlock block : remaining) {
|
||||||
|
String prefix = block.getText().replaceAll("\\s", "");
|
||||||
|
if (target.startsWith(prefix)) {
|
||||||
|
current.add(block);
|
||||||
|
findCombinations(target.substring(prefix.length()), blocks, current, combinations);
|
||||||
|
current.remove(current.size() - 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private double calculateDistance(OutlineObject outlineObject, TextPageBlock pageBlock) {
|
||||||
|
|
||||||
|
double deltaX = outlineObject.getPoint().getX() - pageBlock.getMinX();
|
||||||
|
double deltaY = outlineObject.getPoint().getY() - pageBlock.getMinY();
|
||||||
|
return Math.sqrt(deltaX * deltaX + deltaY * deltaY);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static void addNeighborsOfCandidate(KDTree<TextPageBlock> kdTree, TextPageBlock mergeCandidate, List<TextPageBlock> allMergeCandidates) {
|
||||||
|
|
||||||
|
var boundingBox = blockToBoundingBox.apply(mergeCandidate);
|
||||||
|
Index.PointIteratorKnn<TextPageBlock> knnIterator = kdTree.queryKnn(new double[]{boundingBox.getMinX(), boundingBox.getMaxY()}, 4);
|
||||||
|
knnIterator.forEachRemaining(neighbor -> allMergeCandidates.add(neighbor.value()));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private boolean processOutlineObjectForTextBlock(TextPageBlock pageBlock,
|
||||||
|
OutlineObject outlineObject,
|
||||||
|
List<TextPageBlock> splitCandidates,
|
||||||
|
List<TextPageBlock> mergeCandidates) {
|
||||||
|
|
||||||
|
String blockText = pageBlock.getText();
|
||||||
|
String outlineTitle = outlineObject.getTitle();
|
||||||
|
|
||||||
|
boolean blockTextContainsOutlineTitle = blockText.contains(outlineTitle);
|
||||||
|
boolean outlineTitleContainsBlockText = outlineTitle.contains(blockText);
|
||||||
|
|
||||||
|
if (!blockTextContainsOutlineTitle && !outlineTitleContainsBlockText) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (blockText.equals(outlineTitle)) {
|
||||||
|
pageBlock.setClassification(PageBlockType.getHeadlineType(outlineObject.getTreeDepth()));
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (blockTextContainsOutlineTitle) {
|
||||||
|
splitCandidates.add(pageBlock);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (outlineTitleContainsBlockText) {
|
||||||
|
mergeCandidates.add(pageBlock);
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private void processOutlineObjectForTextBlockOld(ClassificationPage classificationPage, TextPageBlock pageBlock, OutlineObject outlineObject) {
|
||||||
|
|
||||||
|
String blockText = pageBlock.getText();
|
||||||
|
String outlineTitle = outlineObject.getTitle();
|
||||||
|
|
||||||
|
boolean blockTextContainsOutlineTitle = blockText.contains(outlineTitle);
|
||||||
|
boolean outlineTitleContainsBlockText = outlineTitle.contains(blockText);
|
||||||
|
|
||||||
|
Rectangle2D boundingBox = pageBlock.getSequences()
|
||||||
|
.stream()
|
||||||
|
.map(textPositionSequence -> textPositionSequence.getTextPositions()
|
||||||
|
.stream()
|
||||||
|
.map(tp -> SearchTextWithTextPositionFactory.mapRedTextPositionToInitialUserSpace(tp, textPositionSequence))
|
||||||
|
.collect(RectangleTransformations.collectBBox()))
|
||||||
|
.collect(RectangleTransformations.collectBBox());
|
||||||
|
|
||||||
|
if (!isCloseToOutline(boundingBox, outlineObject) || !blockTextContainsOutlineTitle && !outlineTitleContainsBlockText) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (blockText.equals(outlineTitle)) {
|
||||||
|
|
||||||
|
pageBlock.setClassification(PageBlockType.getHeadlineType(outlineObject.getTreeDepth()));
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (blockTextContainsOutlineTitle) {
|
||||||
|
splitTextBlock(pageBlock, outlineTitle, classificationPage);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (outlineTitleContainsBlockText) {
|
||||||
|
// find other blocks, merge them into current, mark them for deletion after loop
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private boolean isCloseToOutline(Rectangle2D boundingBox, OutlineObject outlineObject) {
|
||||||
|
|
||||||
float threshold = BLOCK_TO_OUTLINE_DISTANCE_THRESHOLD;
|
float threshold = BLOCK_TO_OUTLINE_DISTANCE_THRESHOLD;
|
||||||
//if (textBlock instanceof TextPageBlock) {
|
//if (textBlock instanceof TextPageBlock) {
|
||||||
@ -65,11 +306,11 @@ public class BlockificationPostprocessingService {
|
|||||||
// }
|
// }
|
||||||
//}
|
//}
|
||||||
|
|
||||||
return textBlock.getMinY() - outlineObject.getPoint().getY() < threshold && textBlock.getMinX() - outlineObject.getPoint().getX() < threshold;
|
return boundingBox.getMinY() - outlineObject.getPoint().getY() < threshold && boundingBox.getMinX() - outlineObject.getPoint().getX() < threshold;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private void splitTextBlock(AbstractPageBlock textBlock, String title, ClassificationPage classificationPage) {
|
private void splitTextBlock(TextPageBlock pageBlock, String title, ClassificationPage classificationPage) {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -318,7 +318,7 @@ public class DocstrumBlockificationService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private TextPageBlock buildTextBlock(List<TextPositionSequence> wordBlockList, int indexOnPage) {
|
public static TextPageBlock buildTextBlock(List<TextPositionSequence> wordBlockList, int indexOnPage) {
|
||||||
|
|
||||||
TextPageBlock textBlock = null;
|
TextPageBlock textBlock = null;
|
||||||
|
|
||||||
@ -420,7 +420,7 @@ public class DocstrumBlockificationService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private double round(float value, int decimalPoints) {
|
private static double round(float value, int decimalPoints) {
|
||||||
|
|
||||||
var d = Math.pow(10, decimalPoints);
|
var d = Math.pow(10, decimalPoints);
|
||||||
return Math.round(value * d) / d;
|
return Math.round(value * d) / d;
|
||||||
|
|||||||
@ -2,6 +2,8 @@ package com.knecon.fforesight.service.layoutparser.server.graph;
|
|||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
import org.junit.jupiter.api.Disabled;
|
import org.junit.jupiter.api.Disabled;
|
||||||
@ -31,6 +33,8 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
|||||||
//String fileName = "files/new/kaust-official-thesis-template.pdf";
|
//String fileName = "files/new/kaust-official-thesis-template.pdf";
|
||||||
//String fileName = "files/new/$100m Offers.pdf";
|
//String fileName = "files/new/$100m Offers.pdf";
|
||||||
String fileName = "files/new/18-Curacron_ToxicidadeOcularInVitro.pdf";
|
String fileName = "files/new/18-Curacron_ToxicidadeOcularInVitro.pdf";
|
||||||
|
//String fileName = "files/new/mistitled_outlines_example.pdf";
|
||||||
|
//String fileName = "files/bdr/Plenarprotokoll 1 (keine Druchsache!) (1) 1.pdf";
|
||||||
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
|
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
|
||||||
|
|
||||||
var documentFile = new ClassPathResource(fileName).getFile();
|
var documentFile = new ClassPathResource(fileName).getFile();
|
||||||
@ -69,6 +73,5 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
|||||||
|
|
||||||
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true);
|
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user