RED-8670: fix PageContentExtractor

This commit is contained in:
Kilian Schuettler 2025-01-14 14:27:25 +01:00
parent ceb9532d65
commit b307c03f7d
2 changed files with 13 additions and 15 deletions

View File

@ -116,24 +116,21 @@ public class LayoutParsingPipeline {
@Value("${LAYOUT_PARSER_VERSION:}")
private String layoutParserVersion;
public LayoutParsingFinishedEvent parseLayoutAndSaveFilesToStorage(LayoutParsingRequest layoutParsingRequest) throws IOException {
long start = System.currentTimeMillis();
log.info("Starting layout parsing for {}", layoutParsingRequest.identifier());
File originFile = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId());
File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId())
.orElse(originFile);
File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId()).orElse(originFile);
VisualLayoutParsingResponse visualLayoutParsingResponse = layoutParsingRequest.visualLayoutParsingFileId()
.map(layoutParsingStorageService::getVisualLayoutParsingFile)
.orElse(new VisualLayoutParsingResponse());
.map(layoutParsingStorageService::getVisualLayoutParsingFile).orElse(new VisualLayoutParsingResponse());
ImageServiceResponse imageServiceResponse = layoutParsingRequest.imagesFileStorageId()
.map(layoutParsingStorageService::getImagesFile)
.orElse(new ImageServiceResponse());
.map(layoutParsingStorageService::getImagesFile).orElse(new ImageServiceResponse());
TableServiceResponse tableServiceResponse = layoutParsingRequest.tablesFileStorageId()
.map(layoutParsingStorageService::getTablesFile)
.orElse(new TableServiceResponse());
.map(layoutParsingStorageService::getTablesFile).orElse(new TableServiceResponse());
IdpResult idpResult = layoutParsingRequest.idpResultStorageId()
.map(layoutParsingStorageService::getIdpResultFile).orElse(IdpResult.empty());
@ -158,10 +155,8 @@ public class LayoutParsingPipeline {
log.info("Storing resulting files for {}", layoutParsingRequest.identifier());
layoutParsingStorageService.storeDocumentData(layoutParsingRequest, DocumentDataMapper.toDocumentData(documentWithVisualization.document()));
if (layoutParsingRequest.documentMarkdownFileStorageId()
.isPresent()) {
layoutParsingStorageService.storeMarkdownFile(layoutParsingRequest.documentMarkdownFileStorageId()
.get(),
if (layoutParsingRequest.documentMarkdownFileStorageId().isPresent()) {
layoutParsingStorageService.storeMarkdownFile(layoutParsingRequest.documentMarkdownFileStorageId().get(),
new MarkdownMapper().toMarkdownContent(documentWithVisualization.document()));
}
layoutParsingStorageService.storeSimplifiedText(layoutParsingRequest, simplifiedSectionTextService.toSimplifiedText(documentWithVisualization.document()));
@ -357,8 +352,7 @@ public class LayoutParsingPipeline {
.flatMap(Collection::stream)
.map(Character::getTextPosition)
.filter(pos -> pos.getDir().equals(dir))
.mapToDouble(RedTextPosition::getExactDir).average()
.orElse(0);
.mapToDouble(RedTextPosition::getExactDir).average().orElse(0);
if (averageRotation == 0) {
continue;

View File

@ -3,6 +3,7 @@ package com.knecon.fforesight.service.layoutparser.processor.services;
import java.awt.geom.Rectangle2D;
import java.io.File;
import java.io.IOException;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
@ -16,6 +17,7 @@ import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import com.google.common.io.Files;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation;
@ -116,6 +118,8 @@ public class PageContentExtractor {
// PDFBox caches a lot of data internally, so we periodically close and reopen the document to keep RAM usage down
doc.close();
doc = openDocument(document);
pageGetter = new PageGetter(doc.getPages()
.iterator(), pageCount);
}
extractPage(pageNumber, doc, pageGetter.getPage(pageNumber));
@ -136,6 +140,7 @@ public class PageContentExtractor {
@SneakyThrows
public void extractPage(Integer pageNumber, PDDocument doc, PDPage pdPage) {
PageInformation pageInformation = PageInformation.fromPDPage(pageNumber, pdPage);
PDFLinesTextStripper stripper = new PDFLinesTextStripper();
stripper.setPageNumber(pageNumber);
stripper.setStartPage(pageNumber);
@ -143,7 +148,6 @@ public class PageContentExtractor {
stripper.setPdpage(pdPage);
stripper.getText(doc);
PageInformation pageInformation = PageInformation.fromPDPage(pageNumber, pdPage);
List<Word> words = stripper.getWords();
List<Ruling> rulings = stripper.getRulings();
List<Box> graphicBBoxes = findGraphicBBoxes(pageInformation, pdPage, doc, words);