diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java index 79d5cce..fa8d7d7 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java @@ -103,7 +103,7 @@ public class LayoutParsingPipeline { layoutParsingStorageService.storeSimplifiedText(layoutParsingRequest, simplifiedSectionTextService.toSimplifiedText(documentGraph)); try (var out = new ByteArrayOutputStream()) { - viewerDocumentService.createViewerDocument(originDocument, documentGraph, out); + viewerDocumentService.createViewerDocument(originDocument, documentGraph, out, false); layoutParsingStorageService.storeViewerDocument(layoutParsingRequest, out); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/BodyTextFrameService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/BodyTextFrameService.java index cb4db7b..b2fe519 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/BodyTextFrameService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/BodyTextFrameService.java @@ -1,5 +1,6 @@ package com.knecon.fforesight.service.layoutparser.processor.services; +import java.util.Comparator; import java.util.List; import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils; @@ -13,6 +14,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter; import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; +import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils; @@ -20,65 +22,61 @@ import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils; @Service public class BodyTextFrameService { + private static final float RULING_HEIGHT_THRESHOLD = 0.15f; // multiplied with page height. Header/Footer Rulings must be within that border of the page. + private static final float RULING_WIDTH_THRESHOLD = 0.75f; // multiplied with page width. Header/Footer Rulings must be at least that wide. public void setBodyTextFrames(ClassificationDocument classificationDocument, LayoutParsingType layoutParsingType) { Rectangle bodyTextFrame = calculateBodyTextFrame(classificationDocument.getPages(), classificationDocument.getFontSizeCounter(), false, layoutParsingType); Rectangle landscapeBodyTextFrame = calculateBodyTextFrame(classificationDocument.getPages(), classificationDocument.getFontSizeCounter(), true, layoutParsingType); for (ClassificationPage page : classificationDocument.getPages()) { - setBodyTextFrameAdjustedToPage(page, bodyTextFrame, landscapeBodyTextFrame); + var updatedBodyTextFrame = getBodyTextFrameFromRulings(page, bodyTextFrame, landscapeBodyTextFrame); + setBodyTextFrameAdjustedToPage(page, updatedBodyTextFrame, updatedBodyTextFrame); } } -/* - private Rectangle calculateBodyTextFrameByRulings(List pages) { + private Rectangle getBodyTextFrameFromRulings(ClassificationPage page, Rectangle bodyTextFrame, Rectangle landscapeBodyTextFrame) { - Map> potentialHeaderRulingsPerPage = new HashMap<>(); - Map> potentialFooterRulingsPerPage = new HashMap<>(); - - for (var page : pages) { - potentialHeaderRulingsPerPage.put(page, - page.getCleanRulings() - .getHorizontal() - .stream() - .filter(ruling -> ruling.getY1() > page.getPageHeight() * 0.8) - .filter(ruling -> ruling.getWidth() > 0.6 * page.getPageWidth()) - .toList()); - potentialFooterRulingsPerPage.put(page, - page.getCleanRulings() - .getHorizontal() - .stream() - .filter(ruling -> ruling.getY1() < page.getPageHeight() * 0.2) - .filter(ruling -> ruling.getWidth() > 0.6 * page.getPageWidth()) - .toList()); + List potentialFooterRulings = getPotentialFooterRulings(page); + List potentialHeaderRulings = getPotentialHeaderRulings(page); + var x = bodyTextFrame.getTopLeft().getX(); + var y = bodyTextFrame.getTopLeft().getY(); + var w = bodyTextFrame.getWidth(); + var h = bodyTextFrame.getHeight(); + if (!potentialFooterRulings.isEmpty()) { + h = y + h - potentialFooterRulings.get(0).getTop(); + y = potentialFooterRulings.get(0).getTop(); } - - Optional headerRuling = potentialHeaderRulingsPerPage.values() - .stream() - .flatMap(Collection::stream) - .filter(ruling -> potentialHeaderRulingsPerPage.values() - .stream() - .filter(rulingsPerPage -> rulingsPerPage.stream().anyMatch(ruling::almostMatches)) - .count() > pages.size() * RULING_THRESHOLD_FACTOR) - .min(Comparator.comparingDouble(Ruling::getY1)); - - Optional footerRuling = potentialFooterRulingsPerPage.values() - .stream() - .flatMap(Collection::stream) - .filter(ruling -> potentialHeaderRulingsPerPage.values() - .stream() - .filter(rulingsPerPage -> rulingsPerPage.stream().anyMatch(ruling::almostMatches)) - .count() > pages.size() * RULING_THRESHOLD_FACTOR) - .max(Comparator.comparingDouble(Ruling::getY1)); - - double maxY = headerRuling.isPresent() ? headerRuling.get().y1 : pages.stream().mapToDouble(ClassificationPage::getPageHeight).max().orElse(Double.MAX_VALUE); - double minY = footerRuling.map(ruling -> ruling.y1).orElse(0F); - double maxX = pages.stream().mapToDouble(ClassificationPage::getPageWidth).max().orElse(Double.MAX_VALUE); - - return new Rectangle(new Point((float) maxX, (float) maxY), (float) 0, (float) minY, -1); + if (!potentialHeaderRulings.isEmpty()) { + h = potentialHeaderRulings.get(0).getBottom() - bodyTextFrame.getTopLeft().getY(); + } + return new Rectangle(new Point(x, y), w, h, page.getPageNumber()); + } + + + private List getPotentialFooterRulings(ClassificationPage page) { + + return page.getCleanRulings() + .getHorizontal() + .stream() + .filter(ruling -> ruling.getY1() < page.getPageHeight() * RULING_HEIGHT_THRESHOLD) + .filter(ruling -> ruling.getWidth() > RULING_WIDTH_THRESHOLD * page.getPageWidth()) + .sorted(Comparator.comparingDouble(Ruling::getTop)) + .toList(); + } + + + private List getPotentialHeaderRulings(ClassificationPage page) { + + return page.getCleanRulings() + .getHorizontal() + .stream() + .filter(ruling -> ruling.getY1() > page.getPageHeight() * (1 - RULING_HEIGHT_THRESHOLD)) + .filter(ruling -> ruling.getWidth() > RULING_WIDTH_THRESHOLD * page.getPageWidth()) + .sorted(Comparator.comparingDouble(Ruling::getBottom).reversed()) + .toList(); } -*/ /** @@ -129,10 +127,10 @@ public class BodyTextFrameService { * @param landscape Calculate for landscape or portrait * @return Rectangle of the text frame */ - private Rectangle calculateBodyTextFrame(List pages, - FloatFrequencyCounter documentFontSizeCounter, - boolean landscape, - LayoutParsingType layoutParsingType) { + protected Rectangle calculateBodyTextFrame(List pages, + FloatFrequencyCounter documentFontSizeCounter, + boolean landscape, + LayoutParsingType layoutParsingType) { float approximateHeaderLineCount; if (layoutParsingType.equals(LayoutParsingType.TAAS)) { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/ViewerDocumentService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/ViewerDocumentService.java index a9801c9..752fafa 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/ViewerDocumentService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/ViewerDocumentService.java @@ -39,8 +39,8 @@ import lombok.extern.slf4j.Slf4j; @RequiredArgsConstructor public class ViewerDocumentService { - private static final String layerName = "Layout grid"; + private static final String LAYER_NAME = "Layout grid"; private static final int FONT_SIZE = 10; public static final float LINE_WIDTH = 1f; @@ -48,14 +48,14 @@ public class ViewerDocumentService { @SneakyThrows - public void createViewerDocument(PDDocument pdDocument, Document document, OutputStream outputStream) { + public void createViewerDocument(PDDocument pdDocument, Document document, OutputStream outputStream, boolean layerVisibilityDefaultValue) { log.info("Start Viewer Document Creation"); LayoutGrid layoutGrid = layoutGridService.createLayoutGrid(document); // PDDocument.save() is very slow, since it actually traverses the entire pdf and writes a new one. // If we collect all COSDictionaries we changed and tell it explicitly to only add the changed ones by using saveIncremental it's very fast. Set dictionariesToUpdate = new HashSet<>(); - PDOptionalContentGroup layer = addLayerToDocument(pdDocument, dictionariesToUpdate); + PDOptionalContentGroup layer = addLayerToDocument(pdDocument, dictionariesToUpdate, layerVisibilityDefaultValue); PDFont font = PDType1Font.HELVETICA; for (int pageNumber = 0; pageNumber < pdDocument.getNumberOfPages(); pageNumber++) { @@ -119,6 +119,7 @@ public class ViewerDocumentService { dictionariesToUpdate.add(pdPage.getResources().getCOSObject()); } dictionariesToUpdate.add(pdDocument.getDocumentInformation().getCOSObject()); +// dictionariesToUpdate.add(pdDocument.getDocument().getTrailer()); pdDocument.saveIncremental(outputStream, dictionariesToUpdate); log.info("Saved Viewer Document"); } @@ -145,7 +146,7 @@ public class ViewerDocumentService { } - private static PDOptionalContentGroup addLayerToDocument(PDDocument pdDocument, Set dictionariesToUpdate) { + private static PDOptionalContentGroup addLayerToDocument(PDDocument pdDocument, Set dictionariesToUpdate, boolean layerVisibilityDefaultValue) { PDDocumentCatalog catalog = pdDocument.getDocumentCatalog(); PDOptionalContentProperties ocprops = catalog.getOCProperties(); @@ -154,13 +155,13 @@ public class ViewerDocumentService { catalog.setOCProperties(ocprops); } PDOptionalContentGroup layer = null; - if (ocprops.hasGroup(layerName)) { - layer = ocprops.getGroup(layerName); + if (ocprops.hasGroup(LAYER_NAME)) { + layer = ocprops.getGroup(LAYER_NAME); } else { - layer = new PDOptionalContentGroup(layerName); + layer = new PDOptionalContentGroup(LAYER_NAME); ocprops.addGroup(layer); } - ocprops.setGroupEnabled(layer, false); + ocprops.setGroupEnabled(layer, layerVisibilityDefaultValue); dictionariesToUpdate.add(catalog.getCOSObject()); return layer; } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentDataTests.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentDataTests.java index dfc1d72..003b94b 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentDataTests.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentDataTests.java @@ -12,10 +12,11 @@ import org.springframework.core.io.ClassPathResource; import com.iqser.red.commons.jackson.ObjectMapperFactory; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData; import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper; +import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest; import lombok.SneakyThrows; -public class DocumentDataTests extends BuildDocumentGraphTest{ +public class DocumentDataTests extends BuildDocumentTest { @Test @SneakyThrows public void createDocumentDataForAllFiles() { diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphJsonWritingTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphJsonWritingTest.java index 95e8c48..122c02c 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphJsonWritingTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphJsonWritingTest.java @@ -20,10 +20,11 @@ import com.knecon.fforesight.service.layoutparser.processor.python_api.model.ima import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory; import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper; +import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest; import lombok.SneakyThrows; -public class DocumentGraphJsonWritingTest extends BuildDocumentGraphTest { +public class DocumentGraphJsonWritingTest extends BuildDocumentTest { @Test @SneakyThrows diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphMappingTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphMappingTest.java index 7c007c0..4b28541 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphMappingTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphMappingTest.java @@ -16,11 +16,12 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Ta import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper; import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentGraphMapper; import com.knecon.fforesight.service.layoutparser.processor.services.mapper.PropertiesMapper; +import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest; import com.knecon.fforesight.tenantcommons.TenantContext; import lombok.SneakyThrows; -public class DocumentGraphMappingTest extends BuildDocumentGraphTest { +public class DocumentGraphMappingTest extends BuildDocumentTest { @Test @SneakyThrows diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphVisualizationTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphVisualizationTest.java index 0994643..733292e 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphVisualizationTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphVisualizationTest.java @@ -13,13 +13,14 @@ import org.springframework.core.io.ClassPathResource; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock; +import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest; import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw; import lombok.SneakyThrows; import lombok.extern.slf4j.Slf4j; @Slf4j -public class DocumentGraphVisualizationTest extends BuildDocumentGraphTest { +public class DocumentGraphVisualizationTest extends BuildDocumentTest { @Test @SneakyThrows diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java index f150ae2..cbec346 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java @@ -12,10 +12,11 @@ import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsi import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService; import com.knecon.fforesight.service.layoutparser.processor.services.visualization.ViewerDocumentService; +import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest; import lombok.SneakyThrows; -public class ViewerDocumentTest extends BuildDocumentGraphTest { +public class ViewerDocumentTest extends BuildDocumentTest { @Test @Disabled @@ -28,7 +29,7 @@ public class ViewerDocumentTest extends BuildDocumentGraphTest { Document document = buildGraph(fileName, LayoutParsingType.DOCUMINE); String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf"; try (var pdDocument = Loader.loadPDF(new ClassPathResource(fileName).getInputStream()); var out = new FileOutputStream(tmpFileName)) { - viewerDocumentService.createViewerDocument(pdDocument, document, out); + viewerDocumentService.createViewerDocument(pdDocument, document, out, true); } } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/BodyTextFrameServiceTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/BodyTextFrameServiceTest.java new file mode 100644 index 0000000..cd74bbe --- /dev/null +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/BodyTextFrameServiceTest.java @@ -0,0 +1,31 @@ +package com.knecon.fforesight.service.layoutparser.server.services; + +import java.nio.file.Path; +import java.util.List; + +import org.junit.jupiter.api.Test; + +import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument; +import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; +import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest; +import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw; + +import lombok.SneakyThrows; + +class BodyTextFrameServiceTest extends BuildDocumentTest { + + @Test + @SneakyThrows + public void testCalculateBodyTextFrame() { + + String filename = "files/211.pdf"; + String outputFilename = "/tmp/" + Path.of(filename).getFileName() + "_MAINBODY.pdf"; + ClassificationDocument document = parseLayout(filename, LayoutParsingType.TAAS); + PdfDraw.drawRectanglesPerPage(filename, + document.getPages().stream().map(page -> List.of(RectangleTransformations.toRectangle2D(page.getBodyTextFrame()))).toList(), + outputFilename); + + } + +} \ No newline at end of file diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingCleaningServiceTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingCleaningServiceTest.java index 0fdcf29..cceec48 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingCleaningServiceTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingCleaningServiceTest.java @@ -5,7 +5,6 @@ import java.util.Collections; import java.util.LinkedList; import java.util.List; -import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; import com.knecon.fforesight.service.layoutparser.processor.model.PageContents; @@ -19,13 +18,13 @@ import lombok.SneakyThrows; public class RulingCleaningServiceTest { @Test - @Disabled +// @Disabled @SneakyThrows public void textRulingExtraction() { - String fileName = "files/BASF/2013-1110704.pdf"; + String fileName = "files/211.pdf"; String lineFileName = "/tmp/" + Path.of(fileName).getFileName().toString() + "_LINES.pdf"; - List pageContents = PageContentExtractor.getSortedPageContents("files/BASF/2013-1110704.pdf"); + List pageContents = PageContentExtractor.getSortedPageContents(fileName); PdfDraw.drawLinesPerPage(fileName, pageContents.stream().map(PageContents::getRulings).toList(), lineFileName); RulingCleaningService rulingCleaningService = new RulingCleaningService(); diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/BuildDocumentGraphTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/BuildDocumentTest.java similarity index 60% rename from layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/BuildDocumentGraphTest.java rename to layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/BuildDocumentTest.java index 092a530..8b5eea9 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/BuildDocumentGraphTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/BuildDocumentTest.java @@ -1,39 +1,35 @@ -package com.knecon.fforesight.service.layoutparser.server.graph; - -import static org.junit.jupiter.api.Assertions.assertEquals; +package com.knecon.fforesight.service.layoutparser.server.utils; import java.io.InputStream; import org.apache.pdfbox.Loader; import org.apache.pdfbox.pdmodel.PDDocument; -import org.junit.jupiter.api.Disabled; -import org.junit.jupiter.api.Test; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.core.io.ClassPathResource; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline; -import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse; +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; +import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory; -import com.knecon.fforesight.service.layoutparser.server.utils.AbstractTest; import lombok.SneakyThrows; -public class BuildDocumentGraphTest extends AbstractTest { +public abstract class BuildDocumentTest extends AbstractTest { @Autowired protected LayoutParsingPipeline layoutParsingPipeline; - @Test - @Disabled - public void buildMetolachlor() { + @SneakyThrows + protected ClassificationDocument parseLayout(String filename, LayoutParsingType layoutParsingType) { - Document documentGraph = buildGraph("files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf"); - assertEquals(221, documentGraph.getPages().size()); - assertEquals(220, documentGraph.getPages().stream().filter(page -> page.getHeader().hasText()).count()); - assertEquals(0, documentGraph.getPages().stream().filter(page -> page.getFooter().hasText()).count()); + ClassPathResource fileResource = new ClassPathResource(filename); + prepareStorage(filename); + try (InputStream inputStream = fileResource.getInputStream(); PDDocument pdDocument = Loader.loadPDF(inputStream)) { + return layoutParsingPipeline.parseLayout(layoutParsingType, pdDocument, layoutParsingStorageService.getImagesFile(IMAGE_FILE_ID), new TableServiceResponse()); + } } @@ -52,14 +48,9 @@ public class BuildDocumentGraphTest extends AbstractTest { } else { prepareStorage(filename); } - ClassPathResource fileResource = new ClassPathResource(filename); - try (InputStream inputStream = fileResource.getInputStream(); PDDocument pdDocument = Loader.loadPDF(inputStream)) { - return DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(layoutParsingType, - pdDocument, - layoutParsingStorageService.getImagesFile(IMAGE_FILE_ID), - new TableServiceResponse())); - } + return DocumentGraphFactory.buildDocumentGraph(parseLayout(filename, layoutParsingType)); } } +