diff --git a/layoutparser-service/layoutparser-service-processor/build.gradle.kts b/layoutparser-service/layoutparser-service-processor/build.gradle.kts index 4320936..cb71d20 100644 --- a/layoutparser-service/layoutparser-service-processor/build.gradle.kts +++ b/layoutparser-service/layoutparser-service-processor/build.gradle.kts @@ -6,7 +6,7 @@ plugins { description = "layoutparser-service-processor" val jacksonVersion = "2.15.2" -val pdfBoxVersion = "3.0.0-RC1" +val pdfBoxVersion = "3.0.0" dependencies { implementation(project(":layoutparser-service-internal-api")) diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java index 79d5cce..fa8d7d7 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java @@ -103,7 +103,7 @@ public class LayoutParsingPipeline { layoutParsingStorageService.storeSimplifiedText(layoutParsingRequest, simplifiedSectionTextService.toSimplifiedText(documentGraph)); try (var out = new ByteArrayOutputStream()) { - viewerDocumentService.createViewerDocument(originDocument, documentGraph, out); + viewerDocumentService.createViewerDocument(originDocument, documentGraph, out, false); layoutParsingStorageService.storeViewerDocument(layoutParsingRequest, out); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingStorageService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingStorageService.java index 727fe1e..9575d04 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingStorageService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingStorageService.java @@ -13,7 +13,6 @@ import java.nio.file.StandardOpenOption; import org.apache.commons.io.IOUtils; import org.apache.pdfbox.Loader; -import org.apache.pdfbox.io.MemoryUsageSetting; import org.apache.pdfbox.pdmodel.PDDocument; import org.springframework.stereotype.Service; @@ -49,7 +48,7 @@ public class LayoutParsingStorageService { IOUtils.copy(originDocumentInputStream, tempFileOutputStream); originDocumentInputStream.close(); } - return Loader.loadPDF(tempFile, MemoryUsageSetting.setupMixed(67108864L)); + return Loader.loadPDF(tempFile); } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/BodyTextFrameService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/BodyTextFrameService.java index cb4db7b..5942443 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/BodyTextFrameService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/BodyTextFrameService.java @@ -1,8 +1,8 @@ package com.knecon.fforesight.service.layoutparser.processor.services; +import java.util.Comparator; import java.util.List; -import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils; import org.springframework.stereotype.Service; import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point; @@ -13,72 +13,70 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter; import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; +import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils; import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils; @Service public class BodyTextFrameService { + private static final float RULING_HEIGHT_THRESHOLD = 0.15f; // multiplied with page height. Header/Footer Rulings must be within that border of the page. + private static final float RULING_WIDTH_THRESHOLD = 0.75f; // multiplied with page width. Header/Footer Rulings must be at least that wide. public void setBodyTextFrames(ClassificationDocument classificationDocument, LayoutParsingType layoutParsingType) { Rectangle bodyTextFrame = calculateBodyTextFrame(classificationDocument.getPages(), classificationDocument.getFontSizeCounter(), false, layoutParsingType); Rectangle landscapeBodyTextFrame = calculateBodyTextFrame(classificationDocument.getPages(), classificationDocument.getFontSizeCounter(), true, layoutParsingType); for (ClassificationPage page : classificationDocument.getPages()) { +// var updatedBodyTextFrame = getBodyTextFrameFromRulings(page, bodyTextFrame, landscapeBodyTextFrame); setBodyTextFrameAdjustedToPage(page, bodyTextFrame, landscapeBodyTextFrame); } } -/* - private Rectangle calculateBodyTextFrameByRulings(List pages) { + private Rectangle getBodyTextFrameFromRulings(ClassificationPage page, Rectangle bodyTextFrame, Rectangle landscapeBodyTextFrame) { - Map> potentialHeaderRulingsPerPage = new HashMap<>(); - Map> potentialFooterRulingsPerPage = new HashMap<>(); - - for (var page : pages) { - potentialHeaderRulingsPerPage.put(page, - page.getCleanRulings() - .getHorizontal() - .stream() - .filter(ruling -> ruling.getY1() > page.getPageHeight() * 0.8) - .filter(ruling -> ruling.getWidth() > 0.6 * page.getPageWidth()) - .toList()); - potentialFooterRulingsPerPage.put(page, - page.getCleanRulings() - .getHorizontal() - .stream() - .filter(ruling -> ruling.getY1() < page.getPageHeight() * 0.2) - .filter(ruling -> ruling.getWidth() > 0.6 * page.getPageWidth()) - .toList()); + List potentialFooterRulings = getPotentialFooterRulings(page); + List potentialHeaderRulings = getPotentialHeaderRulings(page); + var x = bodyTextFrame.getTopLeft().getX(); + var y = bodyTextFrame.getTopLeft().getY(); + var w = bodyTextFrame.getWidth(); + var h = bodyTextFrame.getHeight(); + if (!potentialFooterRulings.isEmpty()) { + h = y + h - potentialFooterRulings.get(0).getTop(); + y = potentialFooterRulings.get(0).getTop(); } - - Optional headerRuling = potentialHeaderRulingsPerPage.values() - .stream() - .flatMap(Collection::stream) - .filter(ruling -> potentialHeaderRulingsPerPage.values() - .stream() - .filter(rulingsPerPage -> rulingsPerPage.stream().anyMatch(ruling::almostMatches)) - .count() > pages.size() * RULING_THRESHOLD_FACTOR) - .min(Comparator.comparingDouble(Ruling::getY1)); - - Optional footerRuling = potentialFooterRulingsPerPage.values() - .stream() - .flatMap(Collection::stream) - .filter(ruling -> potentialHeaderRulingsPerPage.values() - .stream() - .filter(rulingsPerPage -> rulingsPerPage.stream().anyMatch(ruling::almostMatches)) - .count() > pages.size() * RULING_THRESHOLD_FACTOR) - .max(Comparator.comparingDouble(Ruling::getY1)); - - double maxY = headerRuling.isPresent() ? headerRuling.get().y1 : pages.stream().mapToDouble(ClassificationPage::getPageHeight).max().orElse(Double.MAX_VALUE); - double minY = footerRuling.map(ruling -> ruling.y1).orElse(0F); - double maxX = pages.stream().mapToDouble(ClassificationPage::getPageWidth).max().orElse(Double.MAX_VALUE); - - return new Rectangle(new Point((float) maxX, (float) maxY), (float) 0, (float) minY, -1); + if (!potentialHeaderRulings.isEmpty()) { + h = potentialHeaderRulings.get(0).getBottom() - bodyTextFrame.getTopLeft().getY(); + } + return new Rectangle(new Point(x, y), w, h, page.getPageNumber()); + } + + + private List getPotentialFooterRulings(ClassificationPage page) { + + return page.getCleanRulings() + .getHorizontal() + .stream() + .filter(ruling -> ruling.getY1() < page.getPageHeight() * RULING_HEIGHT_THRESHOLD) + .filter(ruling -> ruling.getWidth() > RULING_WIDTH_THRESHOLD * page.getPageWidth()) + .sorted(Comparator.comparingDouble(Ruling::getTop)) + .toList(); + } + + + private List getPotentialHeaderRulings(ClassificationPage page) { + + return page.getCleanRulings() + .getHorizontal() + .stream() + .filter(ruling -> ruling.getY1() > page.getPageHeight() * (1 - RULING_HEIGHT_THRESHOLD)) + .filter(ruling -> ruling.getWidth() > RULING_WIDTH_THRESHOLD * page.getPageWidth()) + .sorted(Comparator.comparingDouble(Ruling::getBottom).reversed()) + .toList(); } -*/ /** @@ -129,10 +127,10 @@ public class BodyTextFrameService { * @param landscape Calculate for landscape or portrait * @return Rectangle of the text frame */ - private Rectangle calculateBodyTextFrame(List pages, - FloatFrequencyCounter documentFontSizeCounter, - boolean landscape, - LayoutParsingType layoutParsingType) { + protected Rectangle calculateBodyTextFrame(List pages, + FloatFrequencyCounter documentFontSizeCounter, + boolean landscape, + LayoutParsingType layoutParsingType) { float approximateHeaderLineCount; if (layoutParsingType.equals(LayoutParsingType.TAAS)) { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/PageContentExtractor.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/PageContentExtractor.java index 82a1ad3..dde3b94 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/PageContentExtractor.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/PageContentExtractor.java @@ -1,7 +1,6 @@ package com.knecon.fforesight.service.layoutparser.processor.services; import java.io.IOException; -import java.io.InputStream; import java.util.Collection; import java.util.LinkedList; import java.util.List; @@ -26,35 +25,34 @@ public class PageContentExtractor { public List getSortedPageContents(String filename) throws IOException { List textPositionSequencesPerPage = new LinkedList<>(); - try (InputStream inputStream = new ClassPathResource(filename).getInputStream()) { + ClassPathResource pdfResource = new ClassPathResource(filename); - try (PDDocument pdDocument = Loader.loadPDF(inputStream)) { + try (PDDocument pdDocument = Loader.loadPDF(pdfResource.getFile())) { - for (int pageNumber = 1; pageNumber < pdDocument.getNumberOfPages() + 1; pageNumber++) { + for (int pageNumber = 1; pageNumber < pdDocument.getNumberOfPages() + 1; pageNumber++) { - PDFLinesTextStripper stripper = new PDFLinesTextStripper(); - PDPage pdPage = pdDocument.getPage(pageNumber - 1); - stripper.setPageNumber(pageNumber); - stripper.setSortByPosition(true); - stripper.setStartPage(pageNumber); - stripper.setEndPage(pageNumber); - stripper.setPdpage(pdPage); - stripper.getText(pdDocument); + PDFLinesTextStripper stripper = new PDFLinesTextStripper(); + PDPage pdPage = pdDocument.getPage(pageNumber - 1); + stripper.setPageNumber(pageNumber); + stripper.setSortByPosition(true); + stripper.setStartPage(pageNumber); + stripper.setEndPage(pageNumber); + stripper.setPdpage(pdPage); + stripper.getText(pdDocument); - Map> sortedTextPositionSequencesPerDir = stripper.getTextPositionSequences() - .stream() - .collect(Collectors.groupingBy(textPositionSequence -> textPositionSequence.getDir().getDegrees())); + Map> sortedTextPositionSequencesPerDir = stripper.getTextPositionSequences() + .stream() + .collect(Collectors.groupingBy(textPositionSequence -> textPositionSequence.getDir().getDegrees())); - var sortedTextPositionSequences = sortByDirAccordingToPageRotation(sortedTextPositionSequencesPerDir, pdPage.getRotation()); + var sortedTextPositionSequences = sortByDirAccordingToPageRotation(sortedTextPositionSequencesPerDir, pdPage.getRotation()); - textPositionSequencesPerPage.add(new PageContents(sortedTextPositionSequences, - RectangleTransformations.toRectangle2D(pdPage.getCropBox()), - RectangleTransformations.toRectangle2D(pdPage.getMediaBox()), - stripper.getRulings())); - } + textPositionSequencesPerPage.add(new PageContents(sortedTextPositionSequences, + RectangleTransformations.toRectangle2D(pdPage.getCropBox()), + RectangleTransformations.toRectangle2D(pdPage.getMediaBox()), + stripper.getRulings())); } } - + return textPositionSequencesPerPage; } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/LegacyPDFStreamEngine.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/LegacyPDFStreamEngine.java index 8d7b4f2..ff2e665 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/LegacyPDFStreamEngine.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/LegacyPDFStreamEngine.java @@ -92,28 +92,28 @@ public class LegacyPDFStreamEngine extends PDFStreamEngine { */ LegacyPDFStreamEngine() throws IOException { - addOperator(new BeginText()); - addOperator(new Concatenate()); - addOperator(new DrawObject()); // special text version - addOperator(new EndText()); - addOperator(new SetGraphicsStateParameters()); - addOperator(new Save()); - addOperator(new Restore()); - addOperator(new NextLine()); - addOperator(new SetCharSpacing()); - addOperator(new MoveText()); - addOperator(new MoveTextSetLeading()); - addOperator(new SetFontAndSize()); - addOperator(new ShowText()); - addOperator(new ShowTextAdjusted()); - addOperator(new SetTextLeading()); - addOperator(new SetMatrix()); - addOperator(new SetTextRenderingMode()); - addOperator(new SetTextRise()); - addOperator(new SetWordSpacing()); - addOperator(new SetTextHorizontalScaling()); - addOperator(new ShowTextLine()); - addOperator(new ShowTextLineAndSpace()); + addOperator(new BeginText(this)); + addOperator(new Concatenate(this)); + addOperator(new DrawObject(this)); // special text version + addOperator(new EndText(this)); + addOperator(new SetGraphicsStateParameters(this)); + addOperator(new Save(this)); + addOperator(new Restore(this)); + addOperator(new NextLine(this)); + addOperator(new SetCharSpacing(this)); + addOperator(new MoveText(this)); + addOperator(new MoveTextSetLeading(this)); + addOperator(new SetFontAndSize(this)); + addOperator(new ShowText(this)); + addOperator(new ShowTextAdjusted(this)); + addOperator(new SetTextLeading(this)); + addOperator(new SetMatrix(this)); + addOperator(new SetTextRenderingMode(this)); + addOperator(new SetTextRise(this)); + addOperator(new SetWordSpacing(this)); + addOperator(new SetTextHorizontalScaling(this)); + addOperator(new ShowTextLine(this)); + addOperator(new ShowTextLineAndSpace(this)); // load additional glyph list for Unicode mapping String path = "/org/apache/pdfbox/resources/glyphlist/additional.txt"; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFLinesTextStripper.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFLinesTextStripper.java index 6bdbba3..d3309bd 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFLinesTextStripper.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFLinesTextStripper.java @@ -52,32 +52,31 @@ public class PDFLinesTextStripper extends PDFTextStripper { public PDFLinesTextStripper() throws IOException { super(); - this.addOperator(new SetStrokingColorSpace()); - this.addOperator(new SetNonStrokingColorSpace()); - this.addOperator(new SetLineDashPattern()); - this.addOperator(new SetStrokingDeviceGrayColor()); - this.addOperator(new SetNonStrokingDeviceGrayColor()); - this.addOperator(new SetFlatness()); - this.addOperator(new SetLineJoinStyle()); - this.addOperator(new SetLineCapStyle()); - this.addOperator(new SetStrokingDeviceCMYKColor()); - this.addOperator(new SetNonStrokingDeviceCMYKColor()); - this.addOperator(new SetLineMiterLimit()); - this.addOperator(new SetStrokingDeviceRGBColor()); - this.addOperator(new SetNonStrokingDeviceRGBColor()); - this.addOperator(new SetRenderingIntent()); - this.addOperator(new SetStrokingColor()); - this.addOperator(new SetNonStrokingColor()); - this.addOperator(new SetStrokingColorN()); - this.addOperator(new SetNonStrokingColorN()); - this.addOperator(new SetFontAndSize()); - this.addOperator(new SetLineWidth()); + this.addOperator(new SetStrokingColorSpace(this)); + this.addOperator(new SetNonStrokingColorSpace(this)); + this.addOperator(new SetLineDashPattern(this)); + this.addOperator(new SetStrokingDeviceGrayColor(this)); + this.addOperator(new SetNonStrokingDeviceGrayColor(this)); + this.addOperator(new SetFlatness(this)); + this.addOperator(new SetLineJoinStyle(this)); + this.addOperator(new SetLineCapStyle(this)); + this.addOperator(new SetStrokingDeviceCMYKColor(this)); + this.addOperator(new SetNonStrokingDeviceCMYKColor(this)); + this.addOperator(new SetLineMiterLimit(this)); + this.addOperator(new SetStrokingDeviceRGBColor(this)); + this.addOperator(new SetNonStrokingDeviceRGBColor(this)); + this.addOperator(new SetRenderingIntent(this)); + this.addOperator(new SetStrokingColor(this)); + this.addOperator(new SetNonStrokingColor(this)); + this.addOperator(new SetStrokingColorN(this)); + this.addOperator(new SetNonStrokingColorN(this)); + this.addOperator(new SetFontAndSize(this)); + this.addOperator(new SetLineWidth(this)); - addOperator(new BeginMarkedContentSequenceWithProperties()); -// addOperator(new BeginMarkedContentSequence()); - addOperator(new EndMarkedContentSequence()); - + addOperator(new BeginMarkedContentSequenceWithProperties(this)); +// addOperator(new BeginMarkedContentSequence(this)); + addOperator(new EndMarkedContentSequence(this)); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFTextStripper.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFTextStripper.java index 46c0578..2f2d6ea 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFTextStripper.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFTextStripper.java @@ -43,7 +43,8 @@ import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlin import org.apache.pdfbox.pdmodel.interactive.pagenavigation.PDThreadBead; import org.apache.pdfbox.text.TextPosition; import org.apache.pdfbox.text.TextPositionComparator; -import org.apache.pdfbox.util.QuickSort; + +import com.knecon.fforesight.service.layoutparser.processor.utils.QuickSort; /** * This is just a copy except i only adjusted lines 594-607 cause this is a bug in Pdfbox. diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/ViewerDocumentService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/ViewerDocumentService.java index a9801c9..39bba26 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/ViewerDocumentService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/ViewerDocumentService.java @@ -16,6 +16,7 @@ import org.apache.pdfbox.pdmodel.PDPageContentStream; import org.apache.pdfbox.pdmodel.PDResources; import org.apache.pdfbox.pdmodel.font.PDFont; import org.apache.pdfbox.pdmodel.font.PDType1Font; +import org.apache.pdfbox.pdmodel.font.Standard14Fonts; import org.apache.pdfbox.pdmodel.graphics.optionalcontent.PDOptionalContentGroup; import org.apache.pdfbox.pdmodel.graphics.optionalcontent.PDOptionalContentProperties; import org.apache.pdfbox.pdmodel.graphics.state.PDExtendedGraphicsState; @@ -39,8 +40,8 @@ import lombok.extern.slf4j.Slf4j; @RequiredArgsConstructor public class ViewerDocumentService { - private static final String layerName = "Layout grid"; + private static final String LAYER_NAME = "Layout grid"; private static final int FONT_SIZE = 10; public static final float LINE_WIDTH = 1f; @@ -48,15 +49,15 @@ public class ViewerDocumentService { @SneakyThrows - public void createViewerDocument(PDDocument pdDocument, Document document, OutputStream outputStream) { + public void createViewerDocument(PDDocument pdDocument, Document document, OutputStream outputStream, boolean layerVisibilityDefaultValue) { log.info("Start Viewer Document Creation"); LayoutGrid layoutGrid = layoutGridService.createLayoutGrid(document); // PDDocument.save() is very slow, since it actually traverses the entire pdf and writes a new one. // If we collect all COSDictionaries we changed and tell it explicitly to only add the changed ones by using saveIncremental it's very fast. Set dictionariesToUpdate = new HashSet<>(); - PDOptionalContentGroup layer = addLayerToDocument(pdDocument, dictionariesToUpdate); - PDFont font = PDType1Font.HELVETICA; + PDOptionalContentGroup layer = addLayerToDocument(pdDocument, dictionariesToUpdate, layerVisibilityDefaultValue); + PDFont font = new PDType1Font(Standard14Fonts.FontName.HELVETICA); for (int pageNumber = 0; pageNumber < pdDocument.getNumberOfPages(); pageNumber++) { PDPage pdPage = pdDocument.getPage(pageNumber); @@ -119,6 +120,7 @@ public class ViewerDocumentService { dictionariesToUpdate.add(pdPage.getResources().getCOSObject()); } dictionariesToUpdate.add(pdDocument.getDocumentInformation().getCOSObject()); +// dictionariesToUpdate.add(pdDocument.getDocument().getTrailer()); pdDocument.saveIncremental(outputStream, dictionariesToUpdate); log.info("Saved Viewer Document"); } @@ -145,7 +147,7 @@ public class ViewerDocumentService { } - private static PDOptionalContentGroup addLayerToDocument(PDDocument pdDocument, Set dictionariesToUpdate) { + private static PDOptionalContentGroup addLayerToDocument(PDDocument pdDocument, Set dictionariesToUpdate, boolean layerVisibilityDefaultValue) { PDDocumentCatalog catalog = pdDocument.getDocumentCatalog(); PDOptionalContentProperties ocprops = catalog.getOCProperties(); @@ -154,13 +156,13 @@ public class ViewerDocumentService { catalog.setOCProperties(ocprops); } PDOptionalContentGroup layer = null; - if (ocprops.hasGroup(layerName)) { - layer = ocprops.getGroup(layerName); + if (ocprops.hasGroup(LAYER_NAME)) { + layer = ocprops.getGroup(LAYER_NAME); } else { - layer = new PDOptionalContentGroup(layerName); + layer = new PDOptionalContentGroup(LAYER_NAME); ocprops.addGroup(layer); } - ocprops.setGroupEnabled(layer, false); + ocprops.setGroupEnabled(layer, layerVisibilityDefaultValue); dictionariesToUpdate.add(catalog.getCOSObject()); return layer; } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/PdfVisualisationUtility.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/PdfVisualisationUtility.java index 095f55d..b950340 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/PdfVisualisationUtility.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/PdfVisualisationUtility.java @@ -12,6 +12,7 @@ import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.PDPageContentStream; import org.apache.pdfbox.pdmodel.font.PDType1Font; +import org.apache.pdfbox.pdmodel.font.Standard14Fonts; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType; import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree; @@ -71,7 +72,7 @@ public class PdfVisualisationUtility { contentStream.beginText(); contentStream.newLineAtOffset((float) location.getX(), (float) location.getY()); - contentStream.setFont(PDType1Font.HELVETICA, 10); + contentStream.setFont(new PDType1Font(Standard14Fonts.FontName.HELVETICA), 10); contentStream.showText(string); contentStream.endText(); contentStream.close(); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionOperations.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionOperations.java index 6e8dd5a..53e8c29 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionOperations.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionOperations.java @@ -3,7 +3,6 @@ package com.knecon.fforesight.service.layoutparser.processor.utils; import java.util.List; import java.util.stream.Collectors; -import org.apache.pdfbox.util.QuickSort; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; diff --git a/layoutparser-service/layoutparser-service-server/build.gradle.kts b/layoutparser-service/layoutparser-service-server/build.gradle.kts index a056b02..b14f8de 100644 --- a/layoutparser-service/layoutparser-service-server/build.gradle.kts +++ b/layoutparser-service/layoutparser-service-server/build.gradle.kts @@ -13,7 +13,7 @@ plugins { description = "layoutparser-service-server" val jacksonVersion = "2.15.2" -val pdfBoxVersion = "3.0.0-RC1" +val pdfBoxVersion = "3.0.0" dependencies { implementation(project(":layoutparser-service-processor")) diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/BdrJsonBuildTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/BdrJsonBuildTest.java index 965f89d..8e6255d 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/BdrJsonBuildTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/BdrJsonBuildTest.java @@ -4,10 +4,8 @@ import static org.assertj.core.api.AssertionsForClassTypes.assertThat; import java.awt.Color; import java.io.File; -import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; -import java.io.InputStream; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; @@ -46,15 +44,13 @@ public class BdrJsonBuildTest extends AbstractTest { @SneakyThrows - protected Document buildGraph(File filename) { + protected Document buildGraph(File file) { - try (InputStream inputStream = new FileInputStream(filename)) { - try (PDDocument pdDocument = Loader.loadPDF(inputStream)) { - return DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.TAAS, - pdDocument, - new ImageServiceResponse(), - new TableServiceResponse())); - } + try (PDDocument pdDocument = Loader.loadPDF(file)) { + return DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.TAAS, + pdDocument, + new ImageServiceResponse(), + new TableServiceResponse())); } } @@ -114,10 +110,7 @@ public class BdrJsonBuildTest extends AbstractTest { private static void visualizeSemanticNodes(File file, File resultingFileName, Document document, TextBlock textBlock) throws IOException { - try (var fileStream = new FileInputStream(file);// - PDDocument pdDocument = Loader.loadPDF(fileStream);// - var outputStream = new FileOutputStream(resultingFileName)// - ) { + try (PDDocument pdDocument = Loader.loadPDF(file); var outputStream = new FileOutputStream(resultingFileName)) { PdfDraw.drawDocumentGraph(pdDocument, document); PdfDraw.drawTextBlock(pdDocument, textBlock, PdfDraw.Options.builder().stroke(true).strokeWidth(0.1f).strokeColor(Color.YELLOW).build()); pdDocument.save(outputStream); diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/HeadlinesGoldStandardIntegrationTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/HeadlinesGoldStandardIntegrationTest.java index 8da377e..b2e35d6 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/HeadlinesGoldStandardIntegrationTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/HeadlinesGoldStandardIntegrationTest.java @@ -96,7 +96,7 @@ public class HeadlinesGoldStandardIntegrationTest { goldStandardLog.getRedactionLogEntry().forEach(e -> goldStandardHeadlines.add(new Headline(e.getPositions().get(0).getPage(), e.getValue()))); Document documentGraph = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER, - Loader.loadPDF(pdfFileResource.getInputStream()), + Loader.loadPDF(pdfFileResource.getFile()), new ImageServiceResponse(), new TableServiceResponse())); diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentDataTests.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentDataTests.java index dfc1d72..003b94b 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentDataTests.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentDataTests.java @@ -12,10 +12,11 @@ import org.springframework.core.io.ClassPathResource; import com.iqser.red.commons.jackson.ObjectMapperFactory; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData; import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper; +import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest; import lombok.SneakyThrows; -public class DocumentDataTests extends BuildDocumentGraphTest{ +public class DocumentDataTests extends BuildDocumentTest { @Test @SneakyThrows public void createDocumentDataForAllFiles() { diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphJsonWritingTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphJsonWritingTest.java index 95e8c48..973e0b9 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphJsonWritingTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphJsonWritingTest.java @@ -1,7 +1,6 @@ package com.knecon.fforesight.service.layoutparser.server.graph; import java.io.File; -import java.io.FileInputStream; import java.io.FileOutputStream; import java.nio.file.Files; import java.nio.file.Path; @@ -20,10 +19,11 @@ import com.knecon.fforesight.service.layoutparser.processor.python_api.model.ima import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory; import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper; +import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest; import lombok.SneakyThrows; -public class DocumentGraphJsonWritingTest extends BuildDocumentGraphTest { +public class DocumentGraphJsonWritingTest extends BuildDocumentTest { @Test @SneakyThrows @@ -56,7 +56,7 @@ public class DocumentGraphJsonWritingTest extends BuildDocumentGraphTest { private void writeJsons(Path filename) { Document documentGraph = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER, - Loader.loadPDF(new FileInputStream(filename.toFile())), + Loader.loadPDF(filename.toFile()), new ImageServiceResponse(), new TableServiceResponse())); diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphMappingTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphMappingTest.java index 7c007c0..4b28541 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphMappingTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphMappingTest.java @@ -16,11 +16,12 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Ta import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper; import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentGraphMapper; import com.knecon.fforesight.service.layoutparser.processor.services.mapper.PropertiesMapper; +import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest; import com.knecon.fforesight.tenantcommons.TenantContext; import lombok.SneakyThrows; -public class DocumentGraphMappingTest extends BuildDocumentGraphTest { +public class DocumentGraphMappingTest extends BuildDocumentTest { @Test @SneakyThrows diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphVisualizationTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphVisualizationTest.java index 0994643..2fd6aae 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphVisualizationTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphVisualizationTest.java @@ -13,13 +13,14 @@ import org.springframework.core.io.ClassPathResource; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock; +import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest; import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw; import lombok.SneakyThrows; import lombok.extern.slf4j.Slf4j; @Slf4j -public class DocumentGraphVisualizationTest extends BuildDocumentGraphTest { +public class DocumentGraphVisualizationTest extends BuildDocumentTest { @Test @SneakyThrows @@ -66,9 +67,7 @@ public class DocumentGraphVisualizationTest extends BuildDocumentGraphTest { File tmpFile = new File("/tmp/" + Path.of(filename).getFileName().toString() + "_SEMANTIC_NODES_BBOX.pdf"); ClassPathResource fileResource = new ClassPathResource(filename); - try (var fileStream = fileResource.getInputStream();// - PDDocument pdDocument = Loader.loadPDF(fileStream)// - ) { + try (PDDocument pdDocument = Loader.loadPDF(fileResource.getFile())) { log.info("drawing document"); PdfDraw.drawDocumentGraph(pdDocument, documentGraph); PdfDraw.drawTextBlock(pdDocument, textBlock, PdfDraw.Options.builder().stroke(true).strokeWidth(0.1f).strokeColor(Color.YELLOW).build()); diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java index f150ae2..78a4785 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java @@ -12,10 +12,11 @@ import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsi import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService; import com.knecon.fforesight.service.layoutparser.processor.services.visualization.ViewerDocumentService; +import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest; import lombok.SneakyThrows; -public class ViewerDocumentTest extends BuildDocumentGraphTest { +public class ViewerDocumentTest extends BuildDocumentTest { @Test @Disabled @@ -27,8 +28,8 @@ public class ViewerDocumentTest extends BuildDocumentGraphTest { String fileName = "files/new/VV-511309_OCR.pdf"; Document document = buildGraph(fileName, LayoutParsingType.DOCUMINE); String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf"; - try (var pdDocument = Loader.loadPDF(new ClassPathResource(fileName).getInputStream()); var out = new FileOutputStream(tmpFileName)) { - viewerDocumentService.createViewerDocument(pdDocument, document, out); + try (var pdDocument = Loader.loadPDF(new ClassPathResource(fileName).getFile()); var out = new FileOutputStream(tmpFileName)) { + viewerDocumentService.createViewerDocument(pdDocument, document, out, true); } } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java index 0eafbd4..be893a2 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java @@ -92,7 +92,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Spanning Cells.pdf"); - ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getInputStream())); + ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile())); assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty(); TablePageBlock table = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(0); assertThat(table.getColCount()).isEqualTo(6); @@ -106,7 +106,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Merge Table.pdf"); - ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getInputStream())); + ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile())); assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty(); TablePageBlock firstTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(0); assertThat(firstTable.getColCount()).isEqualTo(8); @@ -124,7 +124,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Merge Multi Page Table.pdf"); - ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getInputStream())); + ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile())); assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty(); TablePageBlock firstTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(0); assertThat(firstTable.getColCount()).isEqualTo(9); @@ -142,7 +142,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Rotated Table Headers.pdf"); - ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getInputStream())); + ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile())); assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty(); TablePageBlock firstTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(0); assertThat(firstTable.getColCount()).isEqualTo(8); @@ -160,7 +160,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/56 Fludioxonil_RAR_12_Volume_3CA_B-7_2018-02-21_Page170.pdf"); - ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getInputStream())); + ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile())); validateTableSize(document, 4); @@ -177,7 +177,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/VV-931175_Page1.pdf"); - ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getInputStream())); + ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile())); validateTableSize(document, 1); @@ -218,7 +218,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/27 A8637C - EU AIR3 - MCP Section 1 - Identity of the plant protection product_Page6.pdf"); - ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getInputStream())); + ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile())); validateTableSize(document, 4); @@ -235,7 +235,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/A20622A izRMS (CZ) fRR Part B9_Page185.pdf"); - ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getInputStream())); + ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile())); validateTableSize(document, 2); @@ -250,7 +250,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/A20622A izZRMS (CZ) fRR Part B7_Page123.pdf"); - ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getInputStream())); + ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile())); validateTableSize(document, 6); @@ -269,7 +269,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/77 Pirimicarb_RAR_08_Volume_3CA_B-6_2017-12-04_Page11.pdf"); - ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getInputStream())); + ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile())); validateTableSize(document, 3); @@ -285,7 +285,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/95 Trinexapac-ethyl_RAR_08_Volume_3CA_B-6_2018-01-10_Page532.pdf"); - ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getInputStream())); + ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile())); validateTableSize(document, 1); validateTable(document, 0, 9, 9, 0, 0); @@ -298,7 +298,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/52 Fludioxonil_RAR_07_Volume_3CA_B-5_2018-02-21_Page175.pdf"); - ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getInputStream())); + ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile())); validateTableSize(document, 1); @@ -312,7 +312,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/52 Fludioxonil_RAR_07_Volume_3CA_B-5_2018-02-21_Page174.pdf"); - ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getInputStream())); + ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile())); validateTableSize(document, 1); validateTable(document, 0, 9, 6, 7, 0); @@ -325,7 +325,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/19 Chlorothalonil RAR 08 Volume 3CA B 6b metabolites Oct 2017_Page35.pdf"); - ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getInputStream())); + ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile())); validateTableSize(document, 1); validateTable(document, 0, 10, 6, 0, 1); @@ -338,7 +338,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/19 Chlorothalonil RAR 08 Volume 3CA B 6b metabolites Oct 2017_Page161.pdf"); - ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getInputStream())); + ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile())); validateTableSize(document, 2); validateTable(document, 0, 2, 2, 0, 0); @@ -353,7 +353,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { ClassPathResource pdfFileResource = new ClassPathResource( "files/SinglePages/47 Cyprodinil - EU AIR3 - MCA Section 5 Supplement - Toxicological and metabolism studies on the active substance_Page30.pdf"); - ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getInputStream())); + ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile())); validateTableSize(document, 2); @@ -369,7 +369,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { ClassPathResource pdfFileResource = new ClassPathResource( "files/SinglePages/49 Cyprodinil - EU AIR3 - MCA Section 8 Supplement - Ecotoxicological studies on the active substance_Page61.pdf"); - ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getInputStream())); + ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile())); validateTableSize(document, 2); @@ -384,7 +384,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/81 Pirimicarb_RAR_20_Volume_3CP_A10788A (_Pirimor_)_B-9_2017-12-04_Page54.pdf"); - ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getInputStream())); + ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile())); validateTableSize(document, 2); @@ -399,7 +399,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/85 Pydiflumetofen_DAR_08_Volume_3CA_B-6_2017-07-26_Page134.pdf"); - ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getInputStream())); + ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile())); validateTableSize(document, 2); @@ -414,7 +414,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/Thiabendazole DAR Addendum for ED_April_2020_Page18.pdf"); - ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getInputStream())); + ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile())); validateTableSize(document, 4); @@ -431,7 +431,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/15 - Pretilachlor - Acute Oral Toxicity (Up and Down Procedure) - Rat_Page18.pdf"); - ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getInputStream())); + ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile())); validateTableSize(document, 1); @@ -446,7 +446,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { ClassPathResource pdfFileResource = new ClassPathResource( "files/SinglePages/28 A8637C - EU AIR3 - MCP Section 10 - Ecotoxicological studies on the plant protection product_Page23.pdf"); - ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getInputStream())); + ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile())); validateTableSize(document, 2); @@ -461,7 +461,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/24 - SYN549522 - Acute Oral Toxicity - Rats_Page17.pdf"); - ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getInputStream())); + ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile())); validateTableSize(document, 1); @@ -475,7 +475,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/30 - Dicamba - Acute Oral Toxicity - Rats_Page5.pdf"); - ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getInputStream())); + ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile())); validateTableSize(document, 1); diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/BodyTextFrameServiceTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/BodyTextFrameServiceTest.java new file mode 100644 index 0000000..cd74bbe --- /dev/null +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/BodyTextFrameServiceTest.java @@ -0,0 +1,31 @@ +package com.knecon.fforesight.service.layoutparser.server.services; + +import java.nio.file.Path; +import java.util.List; + +import org.junit.jupiter.api.Test; + +import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument; +import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; +import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest; +import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw; + +import lombok.SneakyThrows; + +class BodyTextFrameServiceTest extends BuildDocumentTest { + + @Test + @SneakyThrows + public void testCalculateBodyTextFrame() { + + String filename = "files/211.pdf"; + String outputFilename = "/tmp/" + Path.of(filename).getFileName() + "_MAINBODY.pdf"; + ClassificationDocument document = parseLayout(filename, LayoutParsingType.TAAS); + PdfDraw.drawRectanglesPerPage(filename, + document.getPages().stream().map(page -> List.of(RectangleTransformations.toRectangle2D(page.getBodyTextFrame()))).toList(), + outputFilename); + + } + +} \ No newline at end of file diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingCleaningServiceTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingCleaningServiceTest.java index 0fdcf29..cceec48 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingCleaningServiceTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingCleaningServiceTest.java @@ -5,7 +5,6 @@ import java.util.Collections; import java.util.LinkedList; import java.util.List; -import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; import com.knecon.fforesight.service.layoutparser.processor.model.PageContents; @@ -19,13 +18,13 @@ import lombok.SneakyThrows; public class RulingCleaningServiceTest { @Test - @Disabled +// @Disabled @SneakyThrows public void textRulingExtraction() { - String fileName = "files/BASF/2013-1110704.pdf"; + String fileName = "files/211.pdf"; String lineFileName = "/tmp/" + Path.of(fileName).getFileName().toString() + "_LINES.pdf"; - List pageContents = PageContentExtractor.getSortedPageContents("files/BASF/2013-1110704.pdf"); + List pageContents = PageContentExtractor.getSortedPageContents(fileName); PdfDraw.drawLinesPerPage(fileName, pageContents.stream().map(PageContents::getRulings).toList(), lineFileName); RulingCleaningService rulingCleaningService = new RulingCleaningService(); diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/BuildDocumentGraphTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/BuildDocumentTest.java similarity index 58% rename from layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/BuildDocumentGraphTest.java rename to layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/BuildDocumentTest.java index 092a530..dc9d0d6 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/BuildDocumentGraphTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/BuildDocumentTest.java @@ -1,39 +1,35 @@ -package com.knecon.fforesight.service.layoutparser.server.graph; - -import static org.junit.jupiter.api.Assertions.assertEquals; +package com.knecon.fforesight.service.layoutparser.server.utils; import java.io.InputStream; import org.apache.pdfbox.Loader; import org.apache.pdfbox.pdmodel.PDDocument; -import org.junit.jupiter.api.Disabled; -import org.junit.jupiter.api.Test; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.core.io.ClassPathResource; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline; -import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse; +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; +import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory; -import com.knecon.fforesight.service.layoutparser.server.utils.AbstractTest; import lombok.SneakyThrows; -public class BuildDocumentGraphTest extends AbstractTest { +public abstract class BuildDocumentTest extends AbstractTest { @Autowired protected LayoutParsingPipeline layoutParsingPipeline; - @Test - @Disabled - public void buildMetolachlor() { + @SneakyThrows + protected ClassificationDocument parseLayout(String filename, LayoutParsingType layoutParsingType) { - Document documentGraph = buildGraph("files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf"); - assertEquals(221, documentGraph.getPages().size()); - assertEquals(220, documentGraph.getPages().stream().filter(page -> page.getHeader().hasText()).count()); - assertEquals(0, documentGraph.getPages().stream().filter(page -> page.getFooter().hasText()).count()); + ClassPathResource fileResource = new ClassPathResource(filename); + prepareStorage(filename); + try (InputStream inputStream = fileResource.getInputStream(); PDDocument pdDocument = Loader.loadPDF(inputStream.readAllBytes())) { + return layoutParsingPipeline.parseLayout(layoutParsingType, pdDocument, layoutParsingStorageService.getImagesFile(IMAGE_FILE_ID), new TableServiceResponse()); + } } @@ -52,14 +48,9 @@ public class BuildDocumentGraphTest extends AbstractTest { } else { prepareStorage(filename); } - ClassPathResource fileResource = new ClassPathResource(filename); - try (InputStream inputStream = fileResource.getInputStream(); PDDocument pdDocument = Loader.loadPDF(inputStream)) { - return DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(layoutParsingType, - pdDocument, - layoutParsingStorageService.getImagesFile(IMAGE_FILE_ID), - new TableServiceResponse())); - } + return DocumentGraphFactory.buildDocumentGraph(parseLayout(filename, layoutParsingType)); } } + diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/visualizations/PdfDraw.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/visualizations/PdfDraw.java index 63d2eb8..5576017 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/visualizations/PdfDraw.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/visualizations/PdfDraw.java @@ -5,7 +5,6 @@ import java.awt.geom.Point2D; import java.awt.geom.Rectangle2D; import java.io.FileOutputStream; import java.io.IOException; -import java.io.InputStream; import java.util.List; import java.util.Map; @@ -14,6 +13,7 @@ import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.PDPageContentStream; import org.apache.pdfbox.pdmodel.font.PDType1Font; +import org.apache.pdfbox.pdmodel.font.Standard14Fonts; import org.apache.pdfbox.util.Matrix; import org.springframework.core.io.ClassPathResource; @@ -40,10 +40,8 @@ public class PdfDraw { public static void drawRectanglesPerPage(String filename, List> rectanglesPerPage, String tmpFileName) throws IOException { - try (InputStream inputStream = new ClassPathResource(filename).getInputStream();// - PDDocument pdDocument = Loader.loadPDF(inputStream);// - var out = new FileOutputStream(tmpFileName)// - ) { + ClassPathResource pdfResource = new ClassPathResource(filename); + try (PDDocument pdDocument = Loader.loadPDF(pdfResource.getFile()); var out = new FileOutputStream(tmpFileName)) { for (int pageNumber = 1; pageNumber < pdDocument.getNumberOfPages() + 1; pageNumber++) { PdfVisualisationUtility.drawRectangle2DList(pdDocument, pageNumber, @@ -58,11 +56,8 @@ public class PdfDraw { public static void drawRectanglesPerPageNumberedByLine(String filename, List>> rectanglesPerPage, String tmpFileName) throws IOException { - try (InputStream inputStream = new ClassPathResource(filename).getInputStream();// - PDDocument pdDocument = Loader.loadPDF(inputStream);// - var out = new FileOutputStream(tmpFileName)// - ) { - + ClassPathResource pdfResource = new ClassPathResource(filename); + try (PDDocument pdDocument = Loader.loadPDF(pdfResource.getFile()); var out = new FileOutputStream(tmpFileName)) { for (int pageNumber = 1; pageNumber < pdDocument.getNumberOfPages() + 1; pageNumber++) { var rectanglesOnPage = rectanglesPerPage.get(pageNumber - 1); for (int lineNumber = 0; lineNumber < rectanglesOnPage.size(); lineNumber++) { @@ -74,13 +69,9 @@ public class PdfDraw { new Point2D.Double(rectanglesInLine.get(0).getX() - (5 + (5 * countNumberOfDigits(lineNumber))), y + 2), pageNumber, PdfVisualisationUtility.Options.builder().stroke(true).build()); - } - } - pdDocument.save(out); - } } @@ -143,7 +134,7 @@ public class PdfDraw { } else { contentStream.newLineAtOffset((float) location.getX(), (float) location.getY()); } - contentStream.setFont(PDType1Font.HELVETICA, 10); + contentStream.setFont(new PDType1Font(Standard14Fonts.FontName.HELVETICA), 10); contentStream.showText(string); contentStream.endText(); contentStream.close(); @@ -184,11 +175,8 @@ public class PdfDraw { @SneakyThrows public static void drawRectanglesAndLinesPerPage(String filename, List> list, List> rectanglesPerPage, String tmpFileName) { - try (InputStream inputStream = new ClassPathResource(filename).getInputStream();// - PDDocument pdDocument = Loader.loadPDF(inputStream);// - var out = new FileOutputStream(tmpFileName)// - ) { - + ClassPathResource pdfResource = new ClassPathResource(filename); + try (PDDocument pdDocument = Loader.loadPDF(pdfResource.getFile()); var out = new FileOutputStream(tmpFileName)) { for (int pageNumber = 1; pageNumber < pdDocument.getNumberOfPages() + 1; pageNumber++) { // PdfVisualisationUtility.drawLine2DList(pdDocument, // pageNumber, @@ -201,7 +189,6 @@ public class PdfDraw { PdfVisualisationUtility.drawRectangle2DList(pdDocument, pageNumber, list.get(pageNumber - 1), PdfVisualisationUtility.Options.builder().stroke(true).build()); } pdDocument.save(out); - } } @@ -209,12 +196,13 @@ public class PdfDraw { @SneakyThrows public static void drawLinesPerPage(String filename, List> linesPerPage, String tmpFileName) { - try (InputStream inputStream = new ClassPathResource(filename).getInputStream();// - PDDocument pdDocument = Loader.loadPDF(inputStream);// - var out = new FileOutputStream(tmpFileName)// - ) { + ClassPathResource pdfResource = new ClassPathResource(filename); + try (PDDocument pdDocument = Loader.loadPDF(pdfResource.getFile()); var out = new FileOutputStream(tmpFileName)) { for (int pageNumber = 1; pageNumber < pdDocument.getNumberOfPages() + 1; pageNumber++) { - PdfVisualisationUtility.drawLine2DList(pdDocument, pageNumber, linesPerPage.get(pageNumber - 1), PdfVisualisationUtility.Options.builder().strokeColor(Color.RED).stroke(true).build()); + PdfVisualisationUtility.drawLine2DList(pdDocument, + pageNumber, + linesPerPage.get(pageNumber - 1), + PdfVisualisationUtility.Options.builder().strokeColor(Color.RED).stroke(true).build()); } pdDocument.save(out); }