RED-7074: Design Subsection section tree structure algorithm
* first draft
This commit is contained in:
parent
7f675b41cf
commit
a32a43fc62
@ -13,7 +13,7 @@ public class OutlineObject {
|
||||
|
||||
private String title;
|
||||
private int pageNumber;
|
||||
//private Point2D point;
|
||||
private Point2D point;
|
||||
private int treeDepth;
|
||||
|
||||
@Override
|
||||
|
||||
@ -1,10 +1,24 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.pdfbox.cos.COSArray;
|
||||
import org.apache.pdfbox.cos.COSBase;
|
||||
import org.apache.pdfbox.cos.COSDictionary;
|
||||
import org.apache.pdfbox.cos.COSName;
|
||||
import org.apache.pdfbox.cos.COSNumber;
|
||||
import org.apache.pdfbox.cos.COSString;
|
||||
import org.apache.pdfbox.pdmodel.PDDestinationNameTreeNode;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDDocumentNameDictionary;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.interactive.action.PDAction;
|
||||
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDDestination;
|
||||
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline;
|
||||
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
|
||||
import org.springframework.stereotype.Service;
|
||||
@ -14,8 +28,10 @@ import com.knecon.fforesight.service.layoutparser.processor.model.outline.Outlin
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObjectTreeNode;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Service
|
||||
@Slf4j
|
||||
public class OutlineExtractorService {
|
||||
|
||||
@SneakyThrows
|
||||
@ -25,8 +41,8 @@ public class OutlineExtractorService {
|
||||
|
||||
List<OutlineObjectTreeNode> rootNodes = new ArrayList<>();
|
||||
for (PDOutlineItem child : documentOutline.children()) {
|
||||
OutlineObjectTreeNode outlineObject = createOutlineObjectWithChildren(child, document, 1);
|
||||
rootNodes.add(outlineObject);
|
||||
Optional<OutlineObjectTreeNode> outlineObject = createOutlineObjectWithChildren(child, document, 1);
|
||||
outlineObject.ifPresent(rootNodes::add);
|
||||
}
|
||||
|
||||
return new OutlineObjectTree(rootNodes);
|
||||
@ -34,37 +50,112 @@ public class OutlineExtractorService {
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private OutlineObjectTreeNode createOutlineObjectWithChildren(PDOutlineItem item, PDDocument document, int depth) {
|
||||
private Optional<OutlineObjectTreeNode> createOutlineObjectWithChildren(PDOutlineItem item, PDDocument document, int depth) {
|
||||
|
||||
OutlineObjectTreeNode outlineObject = createOutlineObject(item, document, depth);
|
||||
for (var child : item.children()) {
|
||||
outlineObject.addChild(createOutlineObjectWithChildren(child, document, depth + 1));
|
||||
Optional<OutlineObjectTreeNode> outlineObject = createOutlineObject(item, document, depth);
|
||||
if (outlineObject.isPresent()) {
|
||||
for (var child : item.children()) {
|
||||
Optional<OutlineObjectTreeNode> outlineObjectWithChildren = createOutlineObjectWithChildren(child, document, depth + 1);
|
||||
outlineObjectWithChildren.ifPresent(outlineObjectTreeNode -> outlineObject.get().addChild(outlineObjectTreeNode));
|
||||
}
|
||||
}
|
||||
|
||||
return outlineObject;
|
||||
}
|
||||
|
||||
|
||||
private Optional<OutlineObjectTreeNode> createOutlineObject(PDOutlineItem item, PDDocument document, int depth) {
|
||||
|
||||
try {
|
||||
String title = item.getTitle();
|
||||
|
||||
PDPage page = item.findDestinationPage(document);
|
||||
int pageNumber = document.getPages().indexOf(page);
|
||||
|
||||
|
||||
PDDocumentNameDictionary names = document.getDocumentCatalog().getNames();
|
||||
PDDestinationNameTreeNode destinations = null;
|
||||
if (names != null) {
|
||||
destinations = names.getDests();
|
||||
}
|
||||
|
||||
Optional<Point2D> outlinePosition = Optional.empty();
|
||||
|
||||
PDDestination destination = item.getDestination();
|
||||
if (destination != null) {
|
||||
outlinePosition = getLocationFromCOSBase(page, destinations, destination.getCOSObject());
|
||||
}
|
||||
|
||||
if (outlinePosition.isEmpty()) {
|
||||
|
||||
PDAction action = item.getAction();
|
||||
COSDictionary cosDictionary = null;
|
||||
if (action != null) {
|
||||
cosDictionary = action.getCOSObject();
|
||||
}
|
||||
|
||||
outlinePosition = extractOutlineLocationGoTo(page, cosDictionary, destinations);
|
||||
}
|
||||
return outlinePosition.map(position -> new OutlineObjectTreeNode(new OutlineObject(title, pageNumber, position, depth)));
|
||||
|
||||
} catch (Exception e) {
|
||||
|
||||
log.info("Could not find outline item in document with title: " + item.getTitle());
|
||||
return Optional.empty();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private OutlineObjectTreeNode createOutlineObject(PDOutlineItem item, PDDocument document, int depth) {
|
||||
private static Optional<Point2D> extractOutlineLocationGoTo(PDPage page, COSDictionary cosDictionary, PDDestinationNameTreeNode destinations) {
|
||||
|
||||
String title = item.getTitle();
|
||||
if (isGoToAction(cosDictionary)) {
|
||||
COSBase cosBase = cosDictionary.getItem(COSName.D);
|
||||
return getLocationFromCOSBase(page, destinations, cosBase);
|
||||
}
|
||||
|
||||
PDPage page = item.findDestinationPage(document);
|
||||
int pageNumber = document.getPages().indexOf(page);
|
||||
return Optional.empty();
|
||||
}
|
||||
|
||||
//float x = 0;
|
||||
//float y = 0;
|
||||
//COSDictionary cosObject = item.getAction().getCOSObject();
|
||||
// if (cosObject.getNameAsString("S").toLowerCase(Locale.ROOT).equals("goto")) {
|
||||
// COSArray cosArray = cosObject.getCOSArray(COSName.D);
|
||||
// x = ((COSInteger)cosArray.get(2)).floatValue();
|
||||
// y = ((COSInteger)cosArray.get(3)).floatValue();
|
||||
//
|
||||
// }
|
||||
//return new OutlineObject(title, pageNumber, new Point2D.Float(x, y));
|
||||
|
||||
return new OutlineObjectTreeNode(new OutlineObject(title, pageNumber, depth));
|
||||
private static Optional<Point2D> getLocationFromCOSBase(PDPage page, PDDestinationNameTreeNode destinations, COSBase cosBase) throws IOException {
|
||||
|
||||
if (cosBase != null) {
|
||||
if (cosBase instanceof COSArray cosArray) {
|
||||
return getLocationFromCosArrayWithXYZCoordinates(cosArray, page);
|
||||
}
|
||||
|
||||
if (cosBase instanceof COSString cosString) {
|
||||
String destinationName = cosString.getString();
|
||||
COSArray cosArray = destinations.getValue(destinationName).getCOSObject();
|
||||
return getLocationFromCosArrayWithXYZCoordinates(cosArray, page);
|
||||
}
|
||||
|
||||
}
|
||||
return Optional.empty();
|
||||
}
|
||||
|
||||
|
||||
private static Optional<Point2D> getLocationFromCosArrayWithXYZCoordinates(COSArray cosArray, PDPage page) {
|
||||
|
||||
if (isXYZDestination(cosArray)) {
|
||||
float x = ((COSNumber) cosArray.get(2)).floatValue();
|
||||
float y = Math.abs(((COSNumber) cosArray.get(3)).floatValue() - page.getMediaBox().getHeight());
|
||||
return Optional.of(new Point2D.Float(x, y));
|
||||
}
|
||||
return Optional.empty();
|
||||
}
|
||||
|
||||
|
||||
private static boolean isXYZDestination(COSArray cosArray) {
|
||||
|
||||
return cosArray != null && cosArray.getName(1).equals("XYZ");
|
||||
}
|
||||
|
||||
|
||||
private static boolean isGoToAction(COSDictionary cosDictionary) {
|
||||
|
||||
return cosDictionary.getNameAsString("S").toLowerCase(Locale.ROOT).equals("goto");
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -8,20 +8,26 @@ import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBl
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
|
||||
@Service
|
||||
public class BlockificationPostprocessingService {
|
||||
|
||||
public void sanitizeOutlineBlocks(ClassificationPage classificationPage, List<OutlineObject> outlineObjects) {
|
||||
public static final float BLOCK_TO_OUTLINE_DISTANCE_THRESHOLD = 8.0f;
|
||||
|
||||
|
||||
public void sanitizeOutlineBlocks(ClassificationPage classificationPage, List<OutlineObject> outlineObjects) {
|
||||
|
||||
for (AbstractPageBlock textBlock : classificationPage.getTextBlocks()) {
|
||||
for (OutlineObject outlineObject : outlineObjects) {
|
||||
|
||||
String blockText = textBlock.getText();
|
||||
String outlineTitle = outlineObject.getTitle();
|
||||
|
||||
if (!blockText.contains(outlineTitle)) {
|
||||
boolean blockTextContainsOutlineTitle = blockText.contains(outlineTitle);
|
||||
boolean outlineTitleContainsBlockText = outlineTitle.contains(blockText);
|
||||
if (!isBlockCloseToOutline(textBlock, outlineObject) || !blockTextContainsOutlineTitle && !outlineTitleContainsBlockText) {
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -31,7 +37,13 @@ public class BlockificationPostprocessingService {
|
||||
continue;
|
||||
}
|
||||
|
||||
splitTextBlock(textBlock, outlineTitle, classificationPage);
|
||||
if(blockTextContainsOutlineTitle) {
|
||||
splitTextBlock(textBlock, outlineTitle, classificationPage);
|
||||
}
|
||||
|
||||
if(outlineTitleContainsBlockText) {
|
||||
// find other blocks, merge them into current, mark them for deletion after loop
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
@ -39,6 +51,24 @@ public class BlockificationPostprocessingService {
|
||||
}
|
||||
|
||||
|
||||
private boolean isBlockCloseToOutline(AbstractPageBlock textBlock, OutlineObject outlineObject) {
|
||||
|
||||
float threshold = BLOCK_TO_OUTLINE_DISTANCE_THRESHOLD;
|
||||
//if (textBlock instanceof TextPageBlock) {
|
||||
// List<TextPositionSequence> sequences = ((TextPageBlock) textBlock).getSequences();
|
||||
// if (sequences != null) {
|
||||
// float textHeightSum = 0;
|
||||
// for (TextPositionSequence word : sequences) {
|
||||
// textHeightSum += word.getTextHeight();
|
||||
// }
|
||||
// threshold = textHeightSum / sequences.size();
|
||||
// }
|
||||
//}
|
||||
|
||||
return textBlock.getMinY() - outlineObject.getPoint().getY() < threshold && textBlock.getMinX() - outlineObject.getPoint().getX() < threshold;
|
||||
}
|
||||
|
||||
|
||||
/**
 * Placeholder with an empty body: calling it currently has no effect.
 * NOTE(review): presumably meant to split {@code textBlock} around {@code title} on the given
 * page, since it is called when the block text contains the outline title. Confirm intent.
 */
private void splitTextBlock(AbstractPageBlock textBlock, String title, ClassificationPage classificationPage) {
|
||||
|
||||
}
|
||||
|
||||
@ -28,8 +28,8 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
||||
public void testViewerDocument() {
|
||||
|
||||
//String fileName = "files/new/abschlussarbeiten-template-institut-fur-informatik-padagogische-hochschule-karlsruhe.pdf";
|
||||
//String fileName = "files/new/$100m Offers.pdf";
|
||||
//String fileName = "files/new/kaust-official-thesis-template.pdf";
|
||||
//String fileName = "files/new/$100m Offers.pdf";
|
||||
String fileName = "files/new/18-Curacron_ToxicidadeOcularInVitro.pdf";
|
||||
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user