diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineObject.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineObject.java index cddb81b..6cc6485 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineObject.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineObject.java @@ -13,7 +13,7 @@ public class OutlineObject { private String title; private int pageNumber; - //private Point2D point; + private Point2D point; private int treeDepth; @Override diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/OutlineExtractorService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/OutlineExtractorService.java index b33082e..0dbe471 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/OutlineExtractorService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/OutlineExtractorService.java @@ -1,10 +1,24 @@ package com.knecon.fforesight.service.layoutparser.processor.services; +import java.awt.geom.Point2D; +import java.io.IOException; import java.util.ArrayList; import java.util.List; +import java.util.Locale; +import java.util.Optional; +import org.apache.pdfbox.cos.COSArray; +import org.apache.pdfbox.cos.COSBase; +import org.apache.pdfbox.cos.COSDictionary; +import org.apache.pdfbox.cos.COSName; +import org.apache.pdfbox.cos.COSNumber; +import 
org.apache.pdfbox.cos.COSString;
+import org.apache.pdfbox.pdmodel.PDDestinationNameTreeNode;
 import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDDocumentNameDictionary;
 import org.apache.pdfbox.pdmodel.PDPage;
+import org.apache.pdfbox.pdmodel.interactive.action.PDAction;
+import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDDestination;
 import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline;
 import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
 import org.springframework.stereotype.Service;
@@ -14,8 +28,10 @@ import com.knecon.fforesight.service.layoutparser.processor.model.outline.Outlin
 import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObjectTreeNode;
 
 import lombok.SneakyThrows;
+import lombok.extern.slf4j.Slf4j;
 
 @Service
+@Slf4j
 public class OutlineExtractorService {
 
     @SneakyThrows
@@ -25,8 +41,8 @@ public class OutlineExtractorService {
 
         List<OutlineObjectTreeNode> rootNodes = new ArrayList<>();
         for (PDOutlineItem child : documentOutline.children()) {
-            OutlineObjectTreeNode outlineObject = createOutlineObjectWithChildren(child, document, 1);
-            rootNodes.add(outlineObject);
+            Optional<OutlineObjectTreeNode> outlineObject = createOutlineObjectWithChildren(child, document, 1);
+            outlineObject.ifPresent(rootNodes::add);
         }
 
         return new OutlineObjectTree(rootNodes);
@@ -34,37 +50,147 @@ public class OutlineExtractorService {
 
 
     @SneakyThrows
-    private OutlineObjectTreeNode createOutlineObjectWithChildren(PDOutlineItem item, PDDocument document, int depth) {
+    private Optional<OutlineObjectTreeNode> createOutlineObjectWithChildren(PDOutlineItem item, PDDocument document, int depth) {
 
-        OutlineObjectTreeNode outlineObject = createOutlineObject(item, document, depth);
-        for (var child : item.children()) {
-            outlineObject.addChild(createOutlineObjectWithChildren(child, document, depth + 1));
+        Optional<OutlineObjectTreeNode> outlineObject = createOutlineObject(item, document, depth);
+        // Descend only when the item itself could be positioned; otherwise its whole subtree is skipped.
+        if (outlineObject.isPresent()) {
+            for (var child : item.children()) {
+                Optional<OutlineObjectTreeNode> outlineObjectWithChildren = createOutlineObjectWithChildren(child, document, depth + 1);
+                outlineObjectWithChildren.ifPresent(outlineObjectTreeNode -> outlineObject.get().addChild(outlineObjectTreeNode));
+            }
         }
 
         return outlineObject;
     }
 
 
+    /**
+     * Builds the tree node for a single outline item.
+     *
+     * @return the node, or an empty Optional when no XYZ position can be resolved for the item or resolving it fails
+     */
+    private Optional<OutlineObjectTreeNode> createOutlineObject(PDOutlineItem item, PDDocument document, int depth) {
+
+        try {
+            String title = item.getTitle();
+
+            PDPage page = item.findDestinationPage(document);
+            int pageNumber = document.getPages().indexOf(page);
+
+            PDDocumentNameDictionary names = document.getDocumentCatalog().getNames();
+            PDDestinationNameTreeNode destinations = null;
+            if (names != null) {
+                destinations = names.getDests();
+            }
+
+            Optional<Point2D> outlinePosition = Optional.empty();
+
+            // First try the destination attached to the item itself ...
+            PDDestination destination = item.getDestination();
+            if (destination != null) {
+                outlinePosition = getLocationFromCOSBase(page, destinations, destination.getCOSObject());
+            }
+
+            // ... then fall back to the destination of the item's GoTo action.
+            if (outlinePosition.isEmpty()) {
+
+                PDAction action = item.getAction();
+                COSDictionary cosDictionary = null;
+                if (action != null) {
+                    cosDictionary = action.getCOSObject();
+                }
+
+                outlinePosition = extractOutlineLocationGoTo(page, cosDictionary, destinations);
+            }
+            return outlinePosition.map(position -> new OutlineObjectTreeNode(new OutlineObject(title, pageNumber, position, depth)));
+
+        } catch (Exception e) {
+
+            // Best effort: one unresolvable bookmark must not abort the extraction; keep the cause in the log.
+            log.info("Could not find outline item in document with title: {}", item.getTitle(), e);
+            return Optional.empty();
+        }
+    }
+
+
+    /**
+     * Resolves the position targeted by an action dictionary, provided it describes a GoTo action.
+     *
+     * @return the position, or an empty Optional when the dictionary is missing or is not a GoTo action
+     */
     @SneakyThrows
-    private OutlineObjectTreeNode createOutlineObject(PDOutlineItem item, PDDocument document, int depth) {
+    private static Optional<Point2D> extractOutlineLocationGoTo(PDPage page, COSDictionary cosDictionary, PDDestinationNameTreeNode destinations) {
 
-        String title = item.getTitle();
+        if (isGoToAction(cosDictionary)) {
+            // getDictionaryObject, unlike getItem, dereferences a destination stored as an indirect object.
+            COSBase cosBase = cosDictionary.getDictionaryObject(COSName.D);
+            return getLocationFromCOSBase(page, destinations, cosBase);
+        }
 
-        PDPage page = item.findDestinationPage(document);
-        int pageNumber = document.getPages().indexOf(page);
+        return Optional.empty();
+    }
 
-        //float x = 0;
-        //float y = 0;
-        //COSDictionary cosObject = item.getAction().getCOSObject();
-        //        if (cosObject.getNameAsString("S").toLowerCase(Locale.ROOT).equals("goto")) {
-        //            COSArray cosArray = cosObject.getCOSArray(COSName.D);
-        //            x = ((COSInteger)cosArray.get(2)).floatValue();
-        //            y = ((COSInteger)cosArray.get(3)).floatValue();
-        //
-        //        }
-        //return new OutlineObject(title, pageNumber, new Point2D.Float(x, y));
 
-        return new OutlineObjectTreeNode(new OutlineObject(title, pageNumber, depth));
+    /**
+     * Extracts a position from a destination that is either an explicit destination array or the name of an
+     * entry in the document's destination name tree.
+     *
+     * @return the position, or an empty Optional when the destination cannot be resolved to an XYZ destination
+     */
+    private static Optional<Point2D> getLocationFromCOSBase(PDPage page, PDDestinationNameTreeNode destinations, COSBase cosBase) throws IOException {
+
+        if (cosBase instanceof COSArray cosArray) {
+            return getLocationFromCosArrayWithXYZCoordinates(cosArray, page);
+        }
+
+        // Named destination: the document may have no name tree, and the tree may not know the name.
+        if (cosBase instanceof COSString cosString && destinations != null) {
+            var namedDestination = destinations.getValue(cosString.getString());
+            if (namedDestination != null) {
+                return getLocationFromCosArrayWithXYZCoordinates(namedDestination.getCOSObject(), page);
+            }
+        }
+        return Optional.empty();
+    }
+
+
+    /**
+     * Reads the left/top entries of an XYZ destination array and flips the top value against the height of the
+     * page's media box.
+     *
+     * @return the position, or an empty Optional when there is no page, the array is not a complete XYZ
+     *         destination, or left/top are not numbers
+     */
+    private static Optional<Point2D> getLocationFromCosArrayWithXYZCoordinates(COSArray cosArray, PDPage page) {
+
+        if (page == null || !isXYZDestination(cosArray) || cosArray.size() < 4) {
+            return Optional.empty();
+        }
+
+        // left/top may be null or indirect references, so dereference and type-check instead of casting blindly.
+        if (cosArray.getObject(2) instanceof COSNumber left && cosArray.getObject(3) instanceof COSNumber top) {
+            float x = left.floatValue();
+            float y = Math.abs(top.floatValue() - page.getMediaBox().getHeight());
+            return Optional.of(new Point2D.Float(x, y));
+        }
+        return Optional.empty();
+    }
+
+
+    /** Tells whether the second entry of the destination array is the name {@code XYZ}. */
+    private static boolean isXYZDestination(COSArray cosArray) {
+
+        // Constant-first equals: getName yields null when the entry is missing or not a name.
+        return cosArray != null && "XYZ".equals(cosArray.getName(1));
+    }
+
+
+    /** Tells whether the action dictionary is present and its {@code S} entry equals "goto", ignoring case. */
+    private static boolean isGoToAction(COSDictionary cosDictionary) {
+
+        String actionType = cosDictionary == null ? null : cosDictionary.getNameAsString("S");
+        return actionType != null && actionType.toLowerCase(Locale.ROOT).equals("goto");
     }
 
 }
diff --git
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/BlockificationPostprocessingService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/BlockificationPostprocessingService.java
index 27450f7..b3f288c 100644
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/BlockificationPostprocessingService.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/BlockificationPostprocessingService.java
@@ -8,20 +8,29 @@ import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBl
 import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
 import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
 import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
+import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
+import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
 
 
 @Service
 public class BlockificationPostprocessingService {
 
-    public void sanitizeOutlineBlocks(ClassificationPage classificationPage, List outlineObjects) {
+    /** Maximum per-axis distance between a block's minimum corner and an outline target point for both to count as close. */
+    public static final float BLOCK_TO_OUTLINE_DISTANCE_THRESHOLD = 8.0f;
+
+    public void sanitizeOutlineBlocks(ClassificationPage classificationPage, List<OutlineObject> outlineObjects) {
+
         for (AbstractPageBlock textBlock : classificationPage.getTextBlocks()) {
 
             for (OutlineObject outlineObject : outlineObjects) {
 
                 String blockText = textBlock.getText();
                 String outlineTitle = outlineObject.getTitle();
 
-                if (!blockText.contains(outlineTitle)) {
+                // A missing title never matches; otherwise require proximity plus containment in either direction.
+                boolean blockTextContainsOutlineTitle = outlineTitle != null && blockText.contains(outlineTitle);
+                boolean outlineTitleContainsBlockText = outlineTitle != null && outlineTitle.contains(blockText);
+                if (!isBlockCloseToOutline(textBlock, outlineObject) || (!blockTextContainsOutlineTitle && !outlineTitleContainsBlockText)) {
                     continue;
                 }
 
@@ -31,7 +40,13 @@ public class BlockificationPostprocessingService {
                     continue;
                 }
 
-                splitTextBlock(textBlock, outlineTitle, classificationPage);
+                if (blockTextContainsOutlineTitle) {
+                    splitTextBlock(textBlock, outlineTitle, classificationPage);
+                }
+
+                if (outlineTitleContainsBlockText) {
+                    // find other blocks, merge them into current, mark them for deletion after loop
+                }
             }
         }
 
@@ -39,6 +54,23 @@ public class BlockificationPostprocessingService {
     }
 
 
+    /**
+     * Checks whether the minimum x/y corner of a text block lies within
+     * {@link #BLOCK_TO_OUTLINE_DISTANCE_THRESHOLD} of an outline entry's target point on both axes.
+     */
+    private boolean isBlockCloseToOutline(AbstractPageBlock textBlock, OutlineObject outlineObject) {
+
+        // TODO: consider deriving the threshold from the average text height of the block's sequences
+        //  (TextPageBlock#getSequences / TextPositionSequence#getTextHeight) instead of a fixed constant.
+        float threshold = BLOCK_TO_OUTLINE_DISTANCE_THRESHOLD;
+
+        // Compare absolute distances: a signed difference would accept every block above or left of the point.
+        double verticalDistance = Math.abs(textBlock.getMinY() - outlineObject.getPoint().getY());
+        double horizontalDistance = Math.abs(textBlock.getMinX() - outlineObject.getPoint().getX());
+        return verticalDistance < threshold && horizontalDistance < threshold;
+    }
+
+
     private void splitTextBlock(AbstractPageBlock textBlock, String title, ClassificationPage classificationPage) {
 
     }
diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java
index d347062..9b7bed7 100644
--- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java
+++
b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java @@ -28,8 +28,8 @@ public class ViewerDocumentTest extends BuildDocumentTest { public void testViewerDocument() { //String fileName = "files/new/abschlussarbeiten-template-institut-fur-informatik-padagogische-hochschule-karlsruhe.pdf"; - //String fileName = "files/new/$100m Offers.pdf"; //String fileName = "files/new/kaust-official-thesis-template.pdf"; + //String fileName = "files/new/$100m Offers.pdf"; String fileName = "files/new/18-Curacron_ToxicidadeOcularInVitro.pdf"; String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";