RED-8670: fix PageContentExtractor (recreate PageGetter after reopening the document)

2025-01-14 14:27:25 +01:00 · 2025-01-14 14:27:25 +01:00 · b307c03f7d
commit b307c03f7d
parent ceb9532d65
2 changed files with 13 additions and 15 deletions
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java
@@ -116,24 +116,21 @@ public class LayoutParsingPipeline {
    @Value("${LAYOUT_PARSER_VERSION:}")
    private String layoutParserVersion;

+
    public LayoutParsingFinishedEvent parseLayoutAndSaveFilesToStorage(LayoutParsingRequest layoutParsingRequest) throws IOException {

        long start = System.currentTimeMillis();
        log.info("Starting layout parsing for {}", layoutParsingRequest.identifier());

        File originFile = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId());
-        File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId())
-                .orElse(originFile);
+        File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId()).orElse(originFile);

        VisualLayoutParsingResponse visualLayoutParsingResponse = layoutParsingRequest.visualLayoutParsingFileId()
-                .map(layoutParsingStorageService::getVisualLayoutParsingFile)
-                .orElse(new VisualLayoutParsingResponse());
+                .map(layoutParsingStorageService::getVisualLayoutParsingFile).orElse(new VisualLayoutParsingResponse());
        ImageServiceResponse imageServiceResponse = layoutParsingRequest.imagesFileStorageId()
-                .map(layoutParsingStorageService::getImagesFile)
-                .orElse(new ImageServiceResponse());
+                .map(layoutParsingStorageService::getImagesFile).orElse(new ImageServiceResponse());
        TableServiceResponse tableServiceResponse = layoutParsingRequest.tablesFileStorageId()
-                .map(layoutParsingStorageService::getTablesFile)
-                .orElse(new TableServiceResponse());
+                .map(layoutParsingStorageService::getTablesFile).orElse(new TableServiceResponse());
        IdpResult idpResult = layoutParsingRequest.idpResultStorageId()
                .map(layoutParsingStorageService::getIdpResultFile).orElse(IdpResult.empty());

@@ -158,10 +155,8 @@ public class LayoutParsingPipeline {
        log.info("Storing resulting files for {}", layoutParsingRequest.identifier());

        layoutParsingStorageService.storeDocumentData(layoutParsingRequest, DocumentDataMapper.toDocumentData(documentWithVisualization.document()));
-        if (layoutParsingRequest.documentMarkdownFileStorageId()
-                .isPresent()) {
-            layoutParsingStorageService.storeMarkdownFile(layoutParsingRequest.documentMarkdownFileStorageId()
-                                                                  .get(),
+        if (layoutParsingRequest.documentMarkdownFileStorageId().isPresent()) {
+            layoutParsingStorageService.storeMarkdownFile(layoutParsingRequest.documentMarkdownFileStorageId().get(),
                                                          new MarkdownMapper().toMarkdownContent(documentWithVisualization.document()));
        }
        layoutParsingStorageService.storeSimplifiedText(layoutParsingRequest, simplifiedSectionTextService.toSimplifiedText(documentWithVisualization.document()));
@@ -357,8 +352,7 @@ public class LayoutParsingPipeline {
                    .flatMap(Collection::stream)
                    .map(Character::getTextPosition)
                    .filter(pos -> pos.getDir().equals(dir))
-                    .mapToDouble(RedTextPosition::getExactDir).average()
-                    .orElse(0);
+                    .mapToDouble(RedTextPosition::getExactDir).average().orElse(0);

            if (averageRotation == 0) {
                continue;
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/PageContentExtractor.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/PageContentExtractor.java
@@ -3,6 +3,7 @@ package com.knecon.fforesight.service.layoutparser.processor.services;
 import java.awt.geom.Rectangle2D;
 import java.io.File;
 import java.io.IOException;
+import java.nio.file.Path;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Iterator;
@@ -16,6 +17,7 @@ import org.apache.pdfbox.Loader;
 import org.apache.pdfbox.pdmodel.PDDocument;
 import org.apache.pdfbox.pdmodel.PDPage;

+import com.google.common.io.Files;
 import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
 import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
 import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation;
@@ -116,6 +118,8 @@ public class PageContentExtractor {
                // As PDFBox caches all types of stuff, we need to close the document every once in a while to save on RAM
                doc.close();
                doc = openDocument(document);
+                pageGetter = new PageGetter(doc.getPages()
+                                                    .iterator(), pageCount);
            }

            extractPage(pageNumber, doc, pageGetter.getPage(pageNumber));
@@ -136,6 +140,7 @@ public class PageContentExtractor {
    @SneakyThrows
    public void extractPage(Integer pageNumber, PDDocument doc, PDPage pdPage) {

+        PageInformation pageInformation = PageInformation.fromPDPage(pageNumber, pdPage);
        PDFLinesTextStripper stripper = new PDFLinesTextStripper();
        stripper.setPageNumber(pageNumber);
        stripper.setStartPage(pageNumber);
@@ -143,7 +148,6 @@ public class PageContentExtractor {
        stripper.setPdpage(pdPage);
        stripper.getText(doc);

-        PageInformation pageInformation = PageInformation.fromPDPage(pageNumber, pdPage);
        List<Word> words = stripper.getWords();
        List<Ruling> rulings = stripper.getRulings();
        List<Box> graphicBBoxes = findGraphicBBoxes(pageInformation, pdPage, doc, words);