From b307c03f7d870ea3b872d015e1d0f25f4c4d27b3 Mon Sep 17 00:00:00 2001 From: Kilian Schuettler Date: Tue, 14 Jan 2025 14:27:25 +0100 Subject: [PATCH] RED-8670: fix PageContentExtractor --- .../processor/LayoutParsingPipeline.java | 22 +++++++------------ .../services/PageContentExtractor.java | 6 ++++- 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java index 1e83430..0710ddd 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java @@ -116,24 +116,21 @@ public class LayoutParsingPipeline { @Value("${LAYOUT_PARSER_VERSION:}") private String layoutParserVersion; + public LayoutParsingFinishedEvent parseLayoutAndSaveFilesToStorage(LayoutParsingRequest layoutParsingRequest) throws IOException { long start = System.currentTimeMillis(); log.info("Starting layout parsing for {}", layoutParsingRequest.identifier()); File originFile = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId()); - File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId()) - .orElse(originFile); + File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId()).orElse(originFile); VisualLayoutParsingResponse visualLayoutParsingResponse = layoutParsingRequest.visualLayoutParsingFileId() - .map(layoutParsingStorageService::getVisualLayoutParsingFile) - .orElse(new VisualLayoutParsingResponse()); + .map(layoutParsingStorageService::getVisualLayoutParsingFile).orElse(new VisualLayoutParsingResponse()); ImageServiceResponse imageServiceResponse = layoutParsingRequest.imagesFileStorageId() - .map(layoutParsingStorageService::getImagesFile) - .orElse(new ImageServiceResponse()); + .map(layoutParsingStorageService::getImagesFile).orElse(new ImageServiceResponse()); TableServiceResponse tableServiceResponse = layoutParsingRequest.tablesFileStorageId() - .map(layoutParsingStorageService::getTablesFile) - .orElse(new TableServiceResponse()); + .map(layoutParsingStorageService::getTablesFile).orElse(new TableServiceResponse()); IdpResult idpResult = layoutParsingRequest.idpResultStorageId() .map(layoutParsingStorageService::getIdpResultFile).orElse(IdpResult.empty()); @@ -158,10 +155,8 @@ public class LayoutParsingPipeline { log.info("Storing resulting files for {}", layoutParsingRequest.identifier()); layoutParsingStorageService.storeDocumentData(layoutParsingRequest, DocumentDataMapper.toDocumentData(documentWithVisualization.document())); - if (layoutParsingRequest.documentMarkdownFileStorageId() - .isPresent()) { - layoutParsingStorageService.storeMarkdownFile(layoutParsingRequest.documentMarkdownFileStorageId() - .get(), + if (layoutParsingRequest.documentMarkdownFileStorageId().isPresent()) { + layoutParsingStorageService.storeMarkdownFile(layoutParsingRequest.documentMarkdownFileStorageId().get(), new MarkdownMapper().toMarkdownContent(documentWithVisualization.document())); } layoutParsingStorageService.storeSimplifiedText(layoutParsingRequest, simplifiedSectionTextService.toSimplifiedText(documentWithVisualization.document())); @@ -357,8 +352,7 @@ public class LayoutParsingPipeline { .flatMap(Collection::stream) .map(Character::getTextPosition) .filter(pos -> pos.getDir().equals(dir)) - .mapToDouble(RedTextPosition::getExactDir).average() - .orElse(0); + .mapToDouble(RedTextPosition::getExactDir).average().orElse(0); if (averageRotation == 0) { continue; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/PageContentExtractor.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/PageContentExtractor.java index 79dbacd..fe5c462 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/PageContentExtractor.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/PageContentExtractor.java @@ -3,6 +3,7 @@ package com.knecon.fforesight.service.layoutparser.processor.services; import java.awt.geom.Rectangle2D; import java.io.File; import java.io.IOException; +import java.nio.file.Path; import java.util.ArrayList; import java.util.Arrays; import java.util.Iterator; @@ -16,6 +17,7 @@ import org.apache.pdfbox.Loader; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; +import com.google.common.io.Files; import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox; import com.knecon.fforesight.service.layoutparser.processor.model.PageContents; import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation; @@ -116,6 +118,8 @@ public class PageContentExtractor { // As PDFBox caches all types of stuff, we need to close the document every once in a while to save on RAM doc.close(); doc = openDocument(document); + pageGetter = new PageGetter(doc.getPages() + .iterator(), pageCount); } extractPage(pageNumber, doc, pageGetter.getPage(pageNumber)); @@ -136,6 +140,7 @@ public class PageContentExtractor { @SneakyThrows public void extractPage(Integer pageNumber, PDDocument doc, PDPage pdPage) { + PageInformation pageInformation = PageInformation.fromPDPage(pageNumber, pdPage); PDFLinesTextStripper stripper = new PDFLinesTextStripper(); stripper.setPageNumber(pageNumber); stripper.setStartPage(pageNumber); @@ -143,7 +148,6 @@ public class PageContentExtractor { stripper.setPdpage(pdPage); stripper.getText(doc); - PageInformation pageInformation = PageInformation.fromPDPage(pageNumber, pdPage); List words = stripper.getWords(); List rulings = stripper.getRulings(); List graphicBBoxes = findGraphicBBoxes(pageInformation, pdPage, doc, words);