RED-7461: improve header/footer recognition

This commit is contained in:
Kilian Schuettler 2023-08-21 16:20:23 +02:00
parent 3722fff476
commit 2b15fd1d3c
11 changed files with 116 additions and 91 deletions

View File

@ -103,7 +103,7 @@ public class LayoutParsingPipeline {
layoutParsingStorageService.storeSimplifiedText(layoutParsingRequest, simplifiedSectionTextService.toSimplifiedText(documentGraph)); layoutParsingStorageService.storeSimplifiedText(layoutParsingRequest, simplifiedSectionTextService.toSimplifiedText(documentGraph));
try (var out = new ByteArrayOutputStream()) { try (var out = new ByteArrayOutputStream()) {
viewerDocumentService.createViewerDocument(originDocument, documentGraph, out); viewerDocumentService.createViewerDocument(originDocument, documentGraph, out, false);
layoutParsingStorageService.storeViewerDocument(layoutParsingRequest, out); layoutParsingStorageService.storeViewerDocument(layoutParsingRequest, out);
} }

View File

@ -1,5 +1,6 @@
package com.knecon.fforesight.service.layoutparser.processor.services; package com.knecon.fforesight.service.layoutparser.processor.services;
import java.util.Comparator;
import java.util.List; import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils; import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
@ -13,6 +14,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter; import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils; import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
@ -20,65 +22,61 @@ import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
@Service @Service
public class BodyTextFrameService { public class BodyTextFrameService {
private static final float RULING_HEIGHT_THRESHOLD = 0.15f; // Fraction of the page height. A header/footer ruling must lie within this margin, measured from the corresponding page edge.
private static final float RULING_WIDTH_THRESHOLD = 0.75f; // Fraction of the page width. A header/footer ruling must be wider than this share of the page.
public void setBodyTextFrames(ClassificationDocument classificationDocument, LayoutParsingType layoutParsingType) { public void setBodyTextFrames(ClassificationDocument classificationDocument, LayoutParsingType layoutParsingType) {
Rectangle bodyTextFrame = calculateBodyTextFrame(classificationDocument.getPages(), classificationDocument.getFontSizeCounter(), false, layoutParsingType); Rectangle bodyTextFrame = calculateBodyTextFrame(classificationDocument.getPages(), classificationDocument.getFontSizeCounter(), false, layoutParsingType);
Rectangle landscapeBodyTextFrame = calculateBodyTextFrame(classificationDocument.getPages(), classificationDocument.getFontSizeCounter(), true, layoutParsingType); Rectangle landscapeBodyTextFrame = calculateBodyTextFrame(classificationDocument.getPages(), classificationDocument.getFontSizeCounter(), true, layoutParsingType);
for (ClassificationPage page : classificationDocument.getPages()) { for (ClassificationPage page : classificationDocument.getPages()) {
setBodyTextFrameAdjustedToPage(page, bodyTextFrame, landscapeBodyTextFrame); var updatedBodyTextFrame = getBodyTextFrameFromRulings(page, bodyTextFrame, landscapeBodyTextFrame);
setBodyTextFrameAdjustedToPage(page, updatedBodyTextFrame, updatedBodyTextFrame);
} }
} }
/* private Rectangle getBodyTextFrameFromRulings(ClassificationPage page, Rectangle bodyTextFrame, Rectangle landscapeBodyTextFrame) {
private Rectangle calculateBodyTextFrameByRulings(List<ClassificationPage> pages) {
Map<ClassificationPage, List<Ruling>> potentialHeaderRulingsPerPage = new HashMap<>(); List<Ruling> potentialFooterRulings = getPotentialFooterRulings(page);
Map<ClassificationPage, List<Ruling>> potentialFooterRulingsPerPage = new HashMap<>(); List<Ruling> potentialHeaderRulings = getPotentialHeaderRulings(page);
var x = bodyTextFrame.getTopLeft().getX();
for (var page : pages) { var y = bodyTextFrame.getTopLeft().getY();
potentialHeaderRulingsPerPage.put(page, var w = bodyTextFrame.getWidth();
page.getCleanRulings() var h = bodyTextFrame.getHeight();
.getHorizontal() if (!potentialFooterRulings.isEmpty()) {
.stream() h = y + h - potentialFooterRulings.get(0).getTop();
.filter(ruling -> ruling.getY1() > page.getPageHeight() * 0.8) y = potentialFooterRulings.get(0).getTop();
.filter(ruling -> ruling.getWidth() > 0.6 * page.getPageWidth())
.toList());
potentialFooterRulingsPerPage.put(page,
page.getCleanRulings()
.getHorizontal()
.stream()
.filter(ruling -> ruling.getY1() < page.getPageHeight() * 0.2)
.filter(ruling -> ruling.getWidth() > 0.6 * page.getPageWidth())
.toList());
} }
if (!potentialHeaderRulings.isEmpty()) {
Optional<Ruling> headerRuling = potentialHeaderRulingsPerPage.values() h = potentialHeaderRulings.get(0).getBottom() - bodyTextFrame.getTopLeft().getY();
.stream() }
.flatMap(Collection::stream) return new Rectangle(new Point(x, y), w, h, page.getPageNumber());
.filter(ruling -> potentialHeaderRulingsPerPage.values() }
.stream()
.filter(rulingsPerPage -> rulingsPerPage.stream().anyMatch(ruling::almostMatches))
.count() > pages.size() * RULING_THRESHOLD_FACTOR) private List<Ruling> getPotentialFooterRulings(ClassificationPage page) {
.min(Comparator.comparingDouble(Ruling::getY1));
return page.getCleanRulings()
Optional<Ruling> footerRuling = potentialFooterRulingsPerPage.values() .getHorizontal()
.stream() .stream()
.flatMap(Collection::stream) .filter(ruling -> ruling.getY1() < page.getPageHeight() * RULING_HEIGHT_THRESHOLD)
.filter(ruling -> potentialHeaderRulingsPerPage.values() .filter(ruling -> ruling.getWidth() > RULING_WIDTH_THRESHOLD * page.getPageWidth())
.stream() .sorted(Comparator.comparingDouble(Ruling::getTop))
.filter(rulingsPerPage -> rulingsPerPage.stream().anyMatch(ruling::almostMatches)) .toList();
.count() > pages.size() * RULING_THRESHOLD_FACTOR) }
.max(Comparator.comparingDouble(Ruling::getY1));
double maxY = headerRuling.isPresent() ? headerRuling.get().y1 : pages.stream().mapToDouble(ClassificationPage::getPageHeight).max().orElse(Double.MAX_VALUE); private List<Ruling> getPotentialHeaderRulings(ClassificationPage page) {
double minY = footerRuling.map(ruling -> ruling.y1).orElse(0F);
double maxX = pages.stream().mapToDouble(ClassificationPage::getPageWidth).max().orElse(Double.MAX_VALUE); return page.getCleanRulings()
.getHorizontal()
return new Rectangle(new Point((float) maxX, (float) maxY), (float) 0, (float) minY, -1); .stream()
.filter(ruling -> ruling.getY1() > page.getPageHeight() * (1 - RULING_HEIGHT_THRESHOLD))
.filter(ruling -> ruling.getWidth() > RULING_WIDTH_THRESHOLD * page.getPageWidth())
.sorted(Comparator.comparingDouble(Ruling::getBottom).reversed())
.toList();
} }
*/
/** /**
@ -129,10 +127,10 @@ public class BodyTextFrameService {
* @param landscape Calculate for landscape or portrait * @param landscape Calculate for landscape or portrait
* @return Rectangle of the text frame * @return Rectangle of the text frame
*/ */
private Rectangle calculateBodyTextFrame(List<ClassificationPage> pages, protected Rectangle calculateBodyTextFrame(List<ClassificationPage> pages,
FloatFrequencyCounter documentFontSizeCounter, FloatFrequencyCounter documentFontSizeCounter,
boolean landscape, boolean landscape,
LayoutParsingType layoutParsingType) { LayoutParsingType layoutParsingType) {
float approximateHeaderLineCount; float approximateHeaderLineCount;
if (layoutParsingType.equals(LayoutParsingType.TAAS)) { if (layoutParsingType.equals(LayoutParsingType.TAAS)) {

View File

@ -39,8 +39,8 @@ import lombok.extern.slf4j.Slf4j;
@RequiredArgsConstructor @RequiredArgsConstructor
public class ViewerDocumentService { public class ViewerDocumentService {
private static final String layerName = "Layout grid";
private static final String LAYER_NAME = "Layout grid";
private static final int FONT_SIZE = 10; private static final int FONT_SIZE = 10;
public static final float LINE_WIDTH = 1f; public static final float LINE_WIDTH = 1f;
@ -48,14 +48,14 @@ public class ViewerDocumentService {
@SneakyThrows @SneakyThrows
public void createViewerDocument(PDDocument pdDocument, Document document, OutputStream outputStream) { public void createViewerDocument(PDDocument pdDocument, Document document, OutputStream outputStream, boolean layerVisibilityDefaultValue) {
log.info("Start Viewer Document Creation"); log.info("Start Viewer Document Creation");
LayoutGrid layoutGrid = layoutGridService.createLayoutGrid(document); LayoutGrid layoutGrid = layoutGridService.createLayoutGrid(document);
// PDDocument.save() is very slow, since it actually traverses the entire pdf and writes a new one. // PDDocument.save() is very slow, since it actually traverses the entire pdf and writes a new one.
// If we collect all COSDictionaries we changed and tell it explicitly to only add the changed ones by using saveIncremental it's very fast. // If we collect all COSDictionaries we changed and tell it explicitly to only add the changed ones by using saveIncremental it's very fast.
Set<COSDictionary> dictionariesToUpdate = new HashSet<>(); Set<COSDictionary> dictionariesToUpdate = new HashSet<>();
PDOptionalContentGroup layer = addLayerToDocument(pdDocument, dictionariesToUpdate); PDOptionalContentGroup layer = addLayerToDocument(pdDocument, dictionariesToUpdate, layerVisibilityDefaultValue);
PDFont font = PDType1Font.HELVETICA; PDFont font = PDType1Font.HELVETICA;
for (int pageNumber = 0; pageNumber < pdDocument.getNumberOfPages(); pageNumber++) { for (int pageNumber = 0; pageNumber < pdDocument.getNumberOfPages(); pageNumber++) {
@ -119,6 +119,7 @@ public class ViewerDocumentService {
dictionariesToUpdate.add(pdPage.getResources().getCOSObject()); dictionariesToUpdate.add(pdPage.getResources().getCOSObject());
} }
dictionariesToUpdate.add(pdDocument.getDocumentInformation().getCOSObject()); dictionariesToUpdate.add(pdDocument.getDocumentInformation().getCOSObject());
// dictionariesToUpdate.add(pdDocument.getDocument().getTrailer());
pdDocument.saveIncremental(outputStream, dictionariesToUpdate); pdDocument.saveIncremental(outputStream, dictionariesToUpdate);
log.info("Saved Viewer Document"); log.info("Saved Viewer Document");
} }
@ -145,7 +146,7 @@ public class ViewerDocumentService {
} }
private static PDOptionalContentGroup addLayerToDocument(PDDocument pdDocument, Set<COSDictionary> dictionariesToUpdate) { private static PDOptionalContentGroup addLayerToDocument(PDDocument pdDocument, Set<COSDictionary> dictionariesToUpdate, boolean layerVisibilityDefaultValue) {
PDDocumentCatalog catalog = pdDocument.getDocumentCatalog(); PDDocumentCatalog catalog = pdDocument.getDocumentCatalog();
PDOptionalContentProperties ocprops = catalog.getOCProperties(); PDOptionalContentProperties ocprops = catalog.getOCProperties();
@ -154,13 +155,13 @@ public class ViewerDocumentService {
catalog.setOCProperties(ocprops); catalog.setOCProperties(ocprops);
} }
PDOptionalContentGroup layer = null; PDOptionalContentGroup layer = null;
if (ocprops.hasGroup(layerName)) { if (ocprops.hasGroup(LAYER_NAME)) {
layer = ocprops.getGroup(layerName); layer = ocprops.getGroup(LAYER_NAME);
} else { } else {
layer = new PDOptionalContentGroup(layerName); layer = new PDOptionalContentGroup(LAYER_NAME);
ocprops.addGroup(layer); ocprops.addGroup(layer);
} }
ocprops.setGroupEnabled(layer, false); ocprops.setGroupEnabled(layer, layerVisibilityDefaultValue);
dictionariesToUpdate.add(catalog.getCOSObject()); dictionariesToUpdate.add(catalog.getCOSObject());
return layer; return layer;
} }

View File

@ -12,10 +12,11 @@ import org.springframework.core.io.ClassPathResource;
import com.iqser.red.commons.jackson.ObjectMapperFactory; import com.iqser.red.commons.jackson.ObjectMapperFactory;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData;
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper; import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper;
import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest;
import lombok.SneakyThrows; import lombok.SneakyThrows;
public class DocumentDataTests extends BuildDocumentGraphTest{ public class DocumentDataTests extends BuildDocumentTest {
@Test @Test
@SneakyThrows @SneakyThrows
public void createDocumentDataForAllFiles() { public void createDocumentDataForAllFiles() {

View File

@ -20,10 +20,11 @@ import com.knecon.fforesight.service.layoutparser.processor.python_api.model.ima
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory; import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper; import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper;
import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest;
import lombok.SneakyThrows; import lombok.SneakyThrows;
public class DocumentGraphJsonWritingTest extends BuildDocumentGraphTest { public class DocumentGraphJsonWritingTest extends BuildDocumentTest {
@Test @Test
@SneakyThrows @SneakyThrows

View File

@ -16,11 +16,12 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Ta
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper; import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper;
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentGraphMapper; import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentGraphMapper;
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.PropertiesMapper; import com.knecon.fforesight.service.layoutparser.processor.services.mapper.PropertiesMapper;
import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest;
import com.knecon.fforesight.tenantcommons.TenantContext; import com.knecon.fforesight.tenantcommons.TenantContext;
import lombok.SneakyThrows; import lombok.SneakyThrows;
public class DocumentGraphMappingTest extends BuildDocumentGraphTest { public class DocumentGraphMappingTest extends BuildDocumentTest {
@Test @Test
@SneakyThrows @SneakyThrows

View File

@ -13,13 +13,14 @@ import org.springframework.core.io.ClassPathResource;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock; import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest;
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw; import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
import lombok.SneakyThrows; import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
@Slf4j @Slf4j
public class DocumentGraphVisualizationTest extends BuildDocumentGraphTest { public class DocumentGraphVisualizationTest extends BuildDocumentTest {
@Test @Test
@SneakyThrows @SneakyThrows

View File

@ -12,10 +12,11 @@ import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsi
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService; import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService;
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.ViewerDocumentService; import com.knecon.fforesight.service.layoutparser.processor.services.visualization.ViewerDocumentService;
import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest;
import lombok.SneakyThrows; import lombok.SneakyThrows;
public class ViewerDocumentTest extends BuildDocumentGraphTest { public class ViewerDocumentTest extends BuildDocumentTest {
@Test @Test
@Disabled @Disabled
@ -28,7 +29,7 @@ public class ViewerDocumentTest extends BuildDocumentGraphTest {
Document document = buildGraph(fileName, LayoutParsingType.DOCUMINE); Document document = buildGraph(fileName, LayoutParsingType.DOCUMINE);
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf"; String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
try (var pdDocument = Loader.loadPDF(new ClassPathResource(fileName).getInputStream()); var out = new FileOutputStream(tmpFileName)) { try (var pdDocument = Loader.loadPDF(new ClassPathResource(fileName).getInputStream()); var out = new FileOutputStream(tmpFileName)) {
viewerDocumentService.createViewerDocument(pdDocument, document, out); viewerDocumentService.createViewerDocument(pdDocument, document, out, true);
} }
} }

View File

@ -0,0 +1,31 @@
package com.knecon.fforesight.service.layoutparser.server.services;
import java.nio.file.Path;
import java.util.List;
import org.junit.jupiter.api.Test;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest;
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
import lombok.SneakyThrows;
/**
 * Visual check for the body text frame calculation: parses a sample PDF and writes a copy with the body text
 * frame of every page drawn onto it. The test makes no assertions; the output file is meant for manual inspection.
 */
class BodyTextFrameServiceTest extends BuildDocumentTest {

    @Test
    @SneakyThrows
    public void testCalculateBodyTextFrame() {

        String sourcePdf = "files/211.pdf";
        String targetPdf = "/tmp/" + Path.of(sourcePdf).getFileName() + "_MAINBODY.pdf";

        ClassificationDocument parsedDocument = parseLayout(sourcePdf, LayoutParsingType.TAAS);

        // One single-element list per page, holding that page's body text frame.
        var framesPerPage = parsedDocument.getPages()
                .stream()
                .map(page -> List.of(RectangleTransformations.toRectangle2D(page.getBodyTextFrame())))
                .toList();

        PdfDraw.drawRectanglesPerPage(sourcePdf, framesPerPage, targetPdf);
    }

}

View File

@ -5,7 +5,6 @@ import java.util.Collections;
import java.util.LinkedList; import java.util.LinkedList;
import java.util.List; import java.util.List;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents; import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
@ -19,13 +18,13 @@ import lombok.SneakyThrows;
public class RulingCleaningServiceTest { public class RulingCleaningServiceTest {
@Test @Test
@Disabled // @Disabled
@SneakyThrows @SneakyThrows
public void textRulingExtraction() { public void textRulingExtraction() {
String fileName = "files/BASF/2013-1110704.pdf"; String fileName = "files/211.pdf";
String lineFileName = "/tmp/" + Path.of(fileName).getFileName().toString() + "_LINES.pdf"; String lineFileName = "/tmp/" + Path.of(fileName).getFileName().toString() + "_LINES.pdf";
List<PageContents> pageContents = PageContentExtractor.getSortedPageContents("files/BASF/2013-1110704.pdf"); List<PageContents> pageContents = PageContentExtractor.getSortedPageContents(fileName);
PdfDraw.drawLinesPerPage(fileName, pageContents.stream().map(PageContents::getRulings).toList(), lineFileName); PdfDraw.drawLinesPerPage(fileName, pageContents.stream().map(PageContents::getRulings).toList(), lineFileName);
RulingCleaningService rulingCleaningService = new RulingCleaningService(); RulingCleaningService rulingCleaningService = new RulingCleaningService();

View File

@ -1,39 +1,35 @@
package com.knecon.fforesight.service.layoutparser.server.graph; package com.knecon.fforesight.service.layoutparser.server.utils;
import static org.junit.jupiter.api.Assertions.assertEquals;
import java.io.InputStream; import java.io.InputStream;
import org.apache.pdfbox.Loader; import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDDocument;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.core.io.ClassPathResource; import org.springframework.core.io.ClassPathResource;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline; import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory; import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
import com.knecon.fforesight.service.layoutparser.server.utils.AbstractTest;
import lombok.SneakyThrows; import lombok.SneakyThrows;
public class BuildDocumentGraphTest extends AbstractTest { public abstract class BuildDocumentTest extends AbstractTest {
@Autowired @Autowired
protected LayoutParsingPipeline layoutParsingPipeline; protected LayoutParsingPipeline layoutParsingPipeline;
@Test @SneakyThrows
@Disabled protected ClassificationDocument parseLayout(String filename, LayoutParsingType layoutParsingType) {
public void buildMetolachlor() {
Document documentGraph = buildGraph("files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf"); ClassPathResource fileResource = new ClassPathResource(filename);
assertEquals(221, documentGraph.getPages().size()); prepareStorage(filename);
assertEquals(220, documentGraph.getPages().stream().filter(page -> page.getHeader().hasText()).count()); try (InputStream inputStream = fileResource.getInputStream(); PDDocument pdDocument = Loader.loadPDF(inputStream)) {
assertEquals(0, documentGraph.getPages().stream().filter(page -> page.getFooter().hasText()).count()); return layoutParsingPipeline.parseLayout(layoutParsingType, pdDocument, layoutParsingStorageService.getImagesFile(IMAGE_FILE_ID), new TableServiceResponse());
}
} }
@ -52,14 +48,9 @@ public class BuildDocumentGraphTest extends AbstractTest {
} else { } else {
prepareStorage(filename); prepareStorage(filename);
} }
ClassPathResource fileResource = new ClassPathResource(filename);
try (InputStream inputStream = fileResource.getInputStream(); PDDocument pdDocument = Loader.loadPDF(inputStream)) { return DocumentGraphFactory.buildDocumentGraph(parseLayout(filename, layoutParsingType));
return DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(layoutParsingType,
pdDocument,
layoutParsingStorageService.getImagesFile(IMAGE_FILE_ID),
new TableServiceResponse()));
}
} }
} }