RED-8670: fix PageContentExtractor
This commit is contained in:
parent
ceb9532d65
commit
b307c03f7d
@ -116,24 +116,21 @@ public class LayoutParsingPipeline {
|
||||
@Value("${LAYOUT_PARSER_VERSION:}")
|
||||
private String layoutParserVersion;
|
||||
|
||||
|
||||
public LayoutParsingFinishedEvent parseLayoutAndSaveFilesToStorage(LayoutParsingRequest layoutParsingRequest) throws IOException {
|
||||
|
||||
long start = System.currentTimeMillis();
|
||||
log.info("Starting layout parsing for {}", layoutParsingRequest.identifier());
|
||||
|
||||
File originFile = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId());
|
||||
File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId())
|
||||
.orElse(originFile);
|
||||
File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId()).orElse(originFile);
|
||||
|
||||
VisualLayoutParsingResponse visualLayoutParsingResponse = layoutParsingRequest.visualLayoutParsingFileId()
|
||||
.map(layoutParsingStorageService::getVisualLayoutParsingFile)
|
||||
.orElse(new VisualLayoutParsingResponse());
|
||||
.map(layoutParsingStorageService::getVisualLayoutParsingFile).orElse(new VisualLayoutParsingResponse());
|
||||
ImageServiceResponse imageServiceResponse = layoutParsingRequest.imagesFileStorageId()
|
||||
.map(layoutParsingStorageService::getImagesFile)
|
||||
.orElse(new ImageServiceResponse());
|
||||
.map(layoutParsingStorageService::getImagesFile).orElse(new ImageServiceResponse());
|
||||
TableServiceResponse tableServiceResponse = layoutParsingRequest.tablesFileStorageId()
|
||||
.map(layoutParsingStorageService::getTablesFile)
|
||||
.orElse(new TableServiceResponse());
|
||||
.map(layoutParsingStorageService::getTablesFile).orElse(new TableServiceResponse());
|
||||
IdpResult idpResult = layoutParsingRequest.idpResultStorageId()
|
||||
.map(layoutParsingStorageService::getIdpResultFile).orElse(IdpResult.empty());
|
||||
|
||||
@ -158,10 +155,8 @@ public class LayoutParsingPipeline {
|
||||
log.info("Storing resulting files for {}", layoutParsingRequest.identifier());
|
||||
|
||||
layoutParsingStorageService.storeDocumentData(layoutParsingRequest, DocumentDataMapper.toDocumentData(documentWithVisualization.document()));
|
||||
if (layoutParsingRequest.documentMarkdownFileStorageId()
|
||||
.isPresent()) {
|
||||
layoutParsingStorageService.storeMarkdownFile(layoutParsingRequest.documentMarkdownFileStorageId()
|
||||
.get(),
|
||||
if (layoutParsingRequest.documentMarkdownFileStorageId().isPresent()) {
|
||||
layoutParsingStorageService.storeMarkdownFile(layoutParsingRequest.documentMarkdownFileStorageId().get(),
|
||||
new MarkdownMapper().toMarkdownContent(documentWithVisualization.document()));
|
||||
}
|
||||
layoutParsingStorageService.storeSimplifiedText(layoutParsingRequest, simplifiedSectionTextService.toSimplifiedText(documentWithVisualization.document()));
|
||||
@ -357,8 +352,7 @@ public class LayoutParsingPipeline {
|
||||
.flatMap(Collection::stream)
|
||||
.map(Character::getTextPosition)
|
||||
.filter(pos -> pos.getDir().equals(dir))
|
||||
.mapToDouble(RedTextPosition::getExactDir).average()
|
||||
.orElse(0);
|
||||
.mapToDouble(RedTextPosition::getExactDir).average().orElse(0);
|
||||
|
||||
if (averageRotation == 0) {
|
||||
continue;
|
||||
|
||||
@ -3,6 +3,7 @@ package com.knecon.fforesight.service.layoutparser.processor.services;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Path;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Iterator;
|
||||
@ -16,6 +17,7 @@ import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
|
||||
import com.google.common.io.Files;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation;
|
||||
@ -116,6 +118,8 @@ public class PageContentExtractor {
|
||||
// As PDFBox caches all types of stuff, we need to close the document every once in a while to save on RAM
|
||||
doc.close();
|
||||
doc = openDocument(document);
|
||||
pageGetter = new PageGetter(doc.getPages()
|
||||
.iterator(), pageCount);
|
||||
}
|
||||
|
||||
extractPage(pageNumber, doc, pageGetter.getPage(pageNumber));
|
||||
@ -136,6 +140,7 @@ public class PageContentExtractor {
|
||||
@SneakyThrows
|
||||
public void extractPage(Integer pageNumber, PDDocument doc, PDPage pdPage) {
|
||||
|
||||
PageInformation pageInformation = PageInformation.fromPDPage(pageNumber, pdPage);
|
||||
PDFLinesTextStripper stripper = new PDFLinesTextStripper();
|
||||
stripper.setPageNumber(pageNumber);
|
||||
stripper.setStartPage(pageNumber);
|
||||
@ -143,7 +148,6 @@ public class PageContentExtractor {
|
||||
stripper.setPdpage(pdPage);
|
||||
stripper.getText(doc);
|
||||
|
||||
PageInformation pageInformation = PageInformation.fromPDPage(pageNumber, pdPage);
|
||||
List<Word> words = stripper.getWords();
|
||||
List<Ruling> rulings = stripper.getRulings();
|
||||
List<Box> graphicBBoxes = findGraphicBBoxes(pageInformation, pdPage, doc, words);
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user