RED-7074: Design Subsection section tree structure algorithm

* first draft
This commit is contained in:
maverickstuder 2024-04-10 12:28:42 +02:00
parent 7f675b41cf
commit a32a43fc62
4 changed files with 147 additions and 26 deletions

View File

@ -13,7 +13,7 @@ public class OutlineObject {
private String title;
private int pageNumber;
//private Point2D point;
private Point2D point;
private int treeDepth;
@Override

View File

@ -1,10 +1,24 @@
package com.knecon.fforesight.service.layoutparser.processor.services;
import java.awt.geom.Point2D;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.Optional;
import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSDictionary;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.cos.COSNumber;
import org.apache.pdfbox.cos.COSString;
import org.apache.pdfbox.pdmodel.PDDestinationNameTreeNode;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentNameDictionary;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.interactive.action.PDAction;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDDestination;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
import org.springframework.stereotype.Service;
@ -14,8 +28,10 @@ import com.knecon.fforesight.service.layoutparser.processor.model.outline.Outlin
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObjectTreeNode;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
@Service
@Slf4j
public class OutlineExtractorService {
@SneakyThrows
@ -25,8 +41,8 @@ public class OutlineExtractorService {
List<OutlineObjectTreeNode> rootNodes = new ArrayList<>();
for (PDOutlineItem child : documentOutline.children()) {
OutlineObjectTreeNode outlineObject = createOutlineObjectWithChildren(child, document, 1);
rootNodes.add(outlineObject);
Optional<OutlineObjectTreeNode> outlineObject = createOutlineObjectWithChildren(child, document, 1);
outlineObject.ifPresent(rootNodes::add);
}
return new OutlineObjectTree(rootNodes);
@ -34,37 +50,112 @@ public class OutlineExtractorService {
@SneakyThrows
private OutlineObjectTreeNode createOutlineObjectWithChildren(PDOutlineItem item, PDDocument document, int depth) {
private Optional<OutlineObjectTreeNode> createOutlineObjectWithChildren(PDOutlineItem item, PDDocument document, int depth) {
OutlineObjectTreeNode outlineObject = createOutlineObject(item, document, depth);
for (var child : item.children()) {
outlineObject.addChild(createOutlineObjectWithChildren(child, document, depth + 1));
Optional<OutlineObjectTreeNode> outlineObject = createOutlineObject(item, document, depth);
if (outlineObject.isPresent()) {
for (var child : item.children()) {
Optional<OutlineObjectTreeNode> outlineObjectWithChildren = createOutlineObjectWithChildren(child, document, depth + 1);
outlineObjectWithChildren.ifPresent(outlineObjectTreeNode -> outlineObject.get().addChild(outlineObjectTreeNode));
}
}
return outlineObject;
}
/**
 * Builds the tree node for a single outline item, including the position it points to.
 *
 * <p>The position is taken from the item's destination first; if that yields nothing, the
 * item's action is inspected as a GoTo action. Items for which no position can be resolved
 * produce an empty result.
 *
 * @param item     the outline item to convert
 * @param document the document the item belongs to; used to resolve the target page and named destinations
 * @param depth    the depth stored in the created {@link OutlineObject}
 * @return the created node, or {@link Optional#empty()} if no target page or position could be
 *         determined or any error occurred while reading the item
 */
private Optional<OutlineObjectTreeNode> createOutlineObject(PDOutlineItem item, PDDocument document, int depth) {

    try {
        String title = item.getTitle();

        PDPage page = item.findDestinationPage(document);
        if (page == null) {
            // The position helpers read page.getMediaBox(); bail out explicitly instead of relying on an NPE.
            log.info("Could not find outline item in document with title: {}", title);
            return Optional.empty();
        }
        int pageNumber = document.getPages().indexOf(page);

        // The name dictionary is optional; destinations stays null when the catalog has none.
        PDDocumentNameDictionary names = document.getDocumentCatalog().getNames();
        PDDestinationNameTreeNode destinations = names != null ? names.getDests() : null;

        Optional<Point2D> outlinePosition = Optional.empty();

        PDDestination destination = item.getDestination();
        if (destination != null) {
            outlinePosition = getLocationFromCOSBase(page, destinations, destination.getCOSObject());
        }

        if (outlinePosition.isEmpty()) {
            // Fall back to the action only when one exists; a null dictionary must not reach the GoTo check.
            PDAction action = item.getAction();
            if (action != null) {
                outlinePosition = extractOutlineLocationGoTo(page, action.getCOSObject(), destinations);
            }
        }

        return outlinePosition.map(position -> new OutlineObjectTreeNode(new OutlineObject(title, pageNumber, position, depth)));

    } catch (Exception e) {
        // Best effort: a broken outline item is skipped, but the cause is kept in the log.
        log.info("Could not find outline item in document with title: {}", item.getTitle(), e);
        return Optional.empty();
    }
}
@SneakyThrows
private OutlineObjectTreeNode createOutlineObject(PDOutlineItem item, PDDocument document, int depth) {
private static Optional<Point2D> extractOutlineLocationGoTo(PDPage page, COSDictionary cosDictionary, PDDestinationNameTreeNode destinations) {
String title = item.getTitle();
if (isGoToAction(cosDictionary)) {
COSBase cosBase = cosDictionary.getItem(COSName.D);
return getLocationFromCOSBase(page, destinations, cosBase);
}
PDPage page = item.findDestinationPage(document);
int pageNumber = document.getPages().indexOf(page);
return Optional.empty();
}
//float x = 0;
//float y = 0;
//COSDictionary cosObject = item.getAction().getCOSObject();
// if (cosObject.getNameAsString("S").toLowerCase(Locale.ROOT).equals("goto")) {
// COSArray cosArray = cosObject.getCOSArray(COSName.D);
// x = ((COSInteger)cosArray.get(2)).floatValue();
// y = ((COSInteger)cosArray.get(3)).floatValue();
//
// }
//return new OutlineObject(title, pageNumber, new Point2D.Float(x, y));
return new OutlineObjectTreeNode(new OutlineObject(title, pageNumber, depth));
/**
 * Resolves a position from the raw COS representation of a destination.
 *
 * <p>A {@link COSArray} is treated as an explicit destination. A {@link COSString} is treated
 * as the name of a destination that is looked up in {@code destinations}. Any other input
 * yields an empty result.
 *
 * @param page         the page the destination points to
 * @param destinations the document's named destinations; may be {@code null}
 * @param cosBase      the destination object; may be {@code null}
 * @return the position, or {@link Optional#empty()} if it cannot be resolved
 * @throws IOException if the named-destination lookup fails
 */
private static Optional<Point2D> getLocationFromCOSBase(PDPage page, PDDestinationNameTreeNode destinations, COSBase cosBase) throws IOException {

    if (cosBase instanceof COSArray cosArray) {
        return getLocationFromCosArrayWithXYZCoordinates(cosArray, page);
    }

    if (cosBase instanceof COSString cosString) {
        if (destinations == null) {
            // No name tree to look the name up in.
            return Optional.empty();
        }
        var namedDestination = destinations.getValue(cosString.getString());
        if (namedDestination == null) {
            // The name is not registered in the tree.
            return Optional.empty();
        }
        return getLocationFromCosArrayWithXYZCoordinates(namedDestination.getCOSObject(), page);
    }

    // Covers null as well as any unsupported COS type.
    return Optional.empty();
}
/**
 * Extracts the target point from an XYZ destination array.
 *
 * <p>The x value is taken from index 2 as is. The y value is the absolute difference between
 * the value at index 3 and the height of the page's media box.
 *
 * @param cosArray the destination array; may be {@code null}
 * @param page     the page whose media box height is used to convert the y value
 * @return the point, or {@link Optional#empty()} if the array is not an XYZ destination or
 *         does not carry numeric values at indices 2 and 3
 */
private static Optional<Point2D> getLocationFromCosArrayWithXYZCoordinates(COSArray cosArray, PDPage page) {

    if (!isXYZDestination(cosArray) || cosArray.size() < 4) {
        return Optional.empty();
    }

    // left/top may be null or otherwise non-numeric; a blind cast would throw ClassCastException.
    if (cosArray.get(2) instanceof COSNumber left && cosArray.get(3) instanceof COSNumber top) {
        float x = left.floatValue();
        float y = Math.abs(top.floatValue() - page.getMediaBox().getHeight());
        return Optional.of(new Point2D.Float(x, y));
    }

    return Optional.empty();
}
/**
 * Checks whether the given destination array has the name {@code XYZ} at index 1.
 *
 * @param cosArray the destination array; may be {@code null}
 * @return {@code true} only if the array is non-null and its element at index 1 is the name {@code XYZ}
 */
private static boolean isXYZDestination(COSArray cosArray) {

    // Constant-first comparison: getName(1) can yield null when the element is missing or not a name.
    return cosArray != null && "XYZ".equals(cosArray.getName(1));
}
/**
 * Checks whether the given action dictionary has an {@code S} entry equal to "goto", ignoring case.
 *
 * @param cosDictionary the action dictionary; may be {@code null}
 * @return {@code true} if the {@code S} entry is present and equals "goto" case-insensitively,
 *         {@code false} otherwise (including for a {@code null} dictionary)
 */
private static boolean isGoToAction(COSDictionary cosDictionary) {

    if (cosDictionary == null) {
        return false;
    }
    // getNameAsString may return null when the entry is absent.
    String actionType = cosDictionary.getNameAsString("S");
    return actionType != null && actionType.toLowerCase(Locale.ROOT).equals("goto");
}
}

View File

@ -8,20 +8,26 @@ import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBl
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
@Service
public class BlockificationPostprocessingService {
public void sanitizeOutlineBlocks(ClassificationPage classificationPage, List<OutlineObject> outlineObjects) {
public static final float BLOCK_TO_OUTLINE_DISTANCE_THRESHOLD = 8.0f;
public void sanitizeOutlineBlocks(ClassificationPage classificationPage, List<OutlineObject> outlineObjects) {
for (AbstractPageBlock textBlock : classificationPage.getTextBlocks()) {
for (OutlineObject outlineObject : outlineObjects) {
String blockText = textBlock.getText();
String outlineTitle = outlineObject.getTitle();
if (!blockText.contains(outlineTitle)) {
boolean blockTextContainsOutlineTitle = blockText.contains(outlineTitle);
boolean outlineTitleContainsBlockText = outlineTitle.contains(blockText);
if (!isBlockCloseToOutline(textBlock, outlineObject) || !blockTextContainsOutlineTitle && !outlineTitleContainsBlockText) {
continue;
}
@ -31,7 +37,13 @@ public class BlockificationPostprocessingService {
continue;
}
splitTextBlock(textBlock, outlineTitle, classificationPage);
if(blockTextContainsOutlineTitle) {
splitTextBlock(textBlock, outlineTitle, classificationPage);
}
if(outlineTitleContainsBlockText) {
// find other blocks, merge them into current, mark them for deletion after loop
}
}
}
@ -39,6 +51,24 @@ public class BlockificationPostprocessingService {
}
/**
 * Decides whether a text block starts close to the point an outline entry refers to.
 *
 * <p>The block's minimum x and y are compared against the outline point; both absolute
 * distances must be below {@link #BLOCK_TO_OUTLINE_DISTANCE_THRESHOLD}.
 *
 * @param textBlock     the block whose minimum x/y is compared
 * @param outlineObject the outline entry providing the reference point
 * @return {@code true} if the block lies within the threshold on both axes
 */
private boolean isBlockCloseToOutline(AbstractPageBlock textBlock, OutlineObject outlineObject) {

    // Draft idea, not enabled yet: derive the threshold from the average text height of the
    // block's sequences (for TextPageBlock instances) instead of using the fixed constant.
    float threshold = BLOCK_TO_OUTLINE_DISTANCE_THRESHOLD;

    // Use absolute distances: with a signed difference, any block above or to the left of the
    // outline point yields a negative value and would always pass the check.
    double verticalDistance = Math.abs(textBlock.getMinY() - outlineObject.getPoint().getY());
    double horizontalDistance = Math.abs(textBlock.getMinX() - outlineObject.getPoint().getX());

    return verticalDistance < threshold && horizontalDistance < threshold;
}
private void splitTextBlock(AbstractPageBlock textBlock, String title, ClassificationPage classificationPage) {
}

View File

@ -28,8 +28,8 @@ public class ViewerDocumentTest extends BuildDocumentTest {
public void testViewerDocument() {
//String fileName = "files/new/abschlussarbeiten-template-institut-fur-informatik-padagogische-hochschule-karlsruhe.pdf";
//String fileName = "files/new/$100m Offers.pdf";
//String fileName = "files/new/kaust-official-thesis-template.pdf";
//String fileName = "files/new/$100m Offers.pdf";
String fileName = "files/new/18-Curacron_ToxicidadeOcularInVitro.pdf";
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";