diff --git a/layoutparser-service/layoutparser-service-processor/build.gradle.kts b/layoutparser-service/layoutparser-service-processor/build.gradle.kts index 69569bc..df746ba 100644 --- a/layoutparser-service/layoutparser-service-processor/build.gradle.kts +++ b/layoutparser-service/layoutparser-service-processor/build.gradle.kts @@ -10,6 +10,7 @@ val pdfBoxVersion = "3.0.0" dependencies { implementation(project(":layoutparser-service-internal-api")) + implementation(project(":viewer-doc-processor")) implementation("com.iqser.red.service:persistence-service-shared-api-v1:2.144.0") { exclude("org.springframework.boot", "spring-boot-starter-security") diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java index ba1b1c2..01b4cdf 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java @@ -51,7 +51,7 @@ import com.knecon.fforesight.service.layoutparser.processor.services.factory.Doc import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper; import com.knecon.fforesight.service.layoutparser.processor.services.mapper.TaasDocumentDataMapper; import com.knecon.fforesight.service.layoutparser.processor.services.parsing.PDFLinesTextStripper; -import com.knecon.fforesight.service.layoutparser.processor.services.visualization.ViewerDocumentService; +import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService; import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils; import 
io.micrometer.observation.Observation; @@ -84,7 +84,7 @@ public class LayoutParsingPipeline { TaasBlockificationService taasBlockificationService; DocuMineBlockificationService docuMineBlockificationService; RedactManagerBlockificationService redactManagerBlockificationService; - ViewerDocumentService viewerDocumentService; + LayoutGridService layoutGridService; ObservationRegistry observationRegistry; @@ -94,7 +94,7 @@ public class LayoutParsingPipeline { log.info("Starting layout parsing for {}", layoutParsingRequest.identifier()); File originFile = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId()); - File viewerDocumentFile = File.createTempFile("viewer_document", ".pdf"); + File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId()).orElse(originFile); ImageServiceResponse imageServiceResponse = new ImageServiceResponse(); if (layoutParsingRequest.imagesFileStorageId().isPresent()) { @@ -111,25 +111,31 @@ public class LayoutParsingPipeline { imageServiceResponse, tableServiceResponse, layoutParsingRequest.identifier().toString()); + log.info("Building document graph for {}", layoutParsingRequest.identifier()); Document documentGraph = observeBuildDocumentGraph(classificationDocument); + log.info("Creating viewer document for {}", layoutParsingRequest.identifier()); + + layoutGridService.addLayoutGrid(viewerDocumentFile, documentGraph, viewerDocumentFile, false); + log.info("Storing resulting files for {}", layoutParsingRequest.identifier()); + layoutParsingStorageService.storeDocumentData(layoutParsingRequest, DocumentDataMapper.toDocumentData(documentGraph)); layoutParsingStorageService.storeSimplifiedText(layoutParsingRequest, simplifiedSectionTextService.toSimplifiedText(documentGraph)); - - log.info("Creating viewer document for {}", layoutParsingRequest.identifier()); - viewerDocumentService.createViewerDocument(originFile, documentGraph, viewerDocumentFile, 
false); layoutParsingStorageService.storeViewerDocument(layoutParsingRequest, viewerDocumentFile); + if (layoutParsingRequest.layoutParsingType().equals(LayoutParsingType.TAAS)) { log.info("Building research document data for {}", layoutParsingRequest.identifier()); var researchDocumentData = TaasDocumentDataMapper.fromDocument(documentGraph); layoutParsingStorageService.storeResearchDocumentData(layoutParsingRequest, researchDocumentData); } + if (!viewerDocumentFile.equals(originFile)) { + viewerDocumentFile.delete(); + } originFile.delete(); - viewerDocumentFile.delete(); return LayoutParsingFinishedEvent.builder() .identifier(layoutParsingRequest.identifier()) diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingServiceProcessorConfiguration.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingServiceProcessorConfiguration.java index d4dbc32..5471311 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingServiceProcessorConfiguration.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingServiceProcessorConfiguration.java @@ -1,10 +1,24 @@ package com.knecon.fforesight.service.layoutparser.processor; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.context.annotation.Bean; import org.springframework.context.annotation.ComponentScan; import org.springframework.context.annotation.Configuration; +import com.knecon.fforesight.service.viewerdoc.service.ViewerDocumentService; + +import io.micrometer.observation.ObservationRegistry; + @Configuration @ComponentScan public class LayoutParsingServiceProcessorConfiguration { + + @Bean + @Autowired + public ViewerDocumentService 
viewerDocumentService(ObservationRegistry registry) { + + return new ViewerDocumentService(registry); + } + } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingStorageService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingStorageService.java index 3082d54..e74fc4e 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingStorageService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingStorageService.java @@ -8,6 +8,7 @@ import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; import java.nio.file.StandardOpenOption; +import java.util.Optional; import org.apache.pdfbox.Loader; import org.apache.pdfbox.pdmodel.PDDocument; @@ -36,13 +37,6 @@ public class LayoutParsingStorageService { private final StorageService storageService; private final ObjectMapper objectMapper; - - public PDDocument getOriginDocument(String storageId) throws IOException { - - return Loader.loadPDF(getOriginFile(storageId)); - } - - @Observed(name = "LayoutParsingStorageService", contextualName = "get-origin-file") public File getOriginFile(String storageId) throws IOException { @@ -52,6 +46,18 @@ public class LayoutParsingStorageService { } + @Observed(name = "LayoutParsingStorageService", contextualName = "get-viewer-doc-file") + public Optional getViewerDocFile(String storageId) throws IOException { + + if (!storageService.objectExists(TenantContext.getTenantId(), storageId)) { + return Optional.empty(); + } + File tempFile = createTempFile("viewerDocument", ".pdf"); + storageService.downloadTo(TenantContext.getTenantId(), storageId, tempFile); + return Optional.of(tempFile); + } + + public 
ImageServiceResponse getImagesFile(String storageId) throws IOException { try (InputStream inputStream = getObject(storageId)) { @@ -137,7 +143,6 @@ public class LayoutParsingStorageService { public void storeViewerDocument(LayoutParsingRequest layoutParsingRequest, File out) { try (var in = new FileInputStream(out)) { - storageService.storeObject(TenantContext.getTenantId(), layoutParsingRequest.viewerDocumentStorageId(), in); } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/visualization/ColoredLine.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/visualization/ColoredLine.java deleted file mode 100644 index 41896fe..0000000 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/visualization/ColoredLine.java +++ /dev/null @@ -1,8 +0,0 @@ -package com.knecon.fforesight.service.layoutparser.processor.model.visualization; - -import java.awt.Color; -import java.awt.geom.Line2D; - -public record ColoredLine(Line2D line, Color color) { - -} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/visualization/PlacedText.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/visualization/PlacedText.java deleted file mode 100644 index b959e3d..0000000 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/visualization/PlacedText.java +++ /dev/null @@ -1,7 +0,0 @@ -package com.knecon.fforesight.service.layoutparser.processor.model.visualization; - -import java.awt.geom.Point2D; - -public record PlacedText(String text, Point2D lineStart) { - -} diff --git 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/LayoutGridService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/LayoutGridService.java index 5ca5707..a2ebcc0 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/LayoutGridService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/LayoutGridService.java @@ -1,12 +1,11 @@ package com.knecon.fforesight.service.layoutparser.processor.services.visualization; -import static com.knecon.fforesight.service.layoutparser.processor.services.visualization.ViewerDocumentService.LINE_WIDTH; - import java.awt.Color; import java.awt.geom.Line2D; import java.awt.geom.Point2D; import java.awt.geom.Rectangle2D; import java.awt.geom.RectangularShape; +import java.io.File; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; @@ -24,25 +23,59 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Pa import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell; -import com.knecon.fforesight.service.layoutparser.processor.model.visualization.ColoredLine; -import com.knecon.fforesight.service.layoutparser.processor.model.visualization.ColoredRectangle; -import com.knecon.fforesight.service.layoutparser.processor.model.visualization.LayoutGrid; -import com.knecon.fforesight.service.layoutparser.processor.model.visualization.PlacedText; import 
com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; +import com.knecon.fforesight.service.viewerdoc.ContentStreams; +import com.knecon.fforesight.service.viewerdoc.model.ColoredLine; +import com.knecon.fforesight.service.viewerdoc.model.ColoredRectangle; +import com.knecon.fforesight.service.viewerdoc.model.LayoutGrid; +import com.knecon.fforesight.service.viewerdoc.model.PlacedText; +import com.knecon.fforesight.service.viewerdoc.model.Standard14EmbeddableFont; +import com.knecon.fforesight.service.viewerdoc.model.Visualizations; +import com.knecon.fforesight.service.viewerdoc.service.ViewerDocumentService; + +import io.micrometer.observation.annotation.Observed; +import lombok.AccessLevel; +import lombok.RequiredArgsConstructor; +import lombok.SneakyThrows; +import lombok.experimental.FieldDefaults; @Service +@RequiredArgsConstructor +@FieldDefaults(level = AccessLevel.PRIVATE, makeFinal = true) public class LayoutGridService { - private static final Color INNER_LINES_COLOR = new Color(255, 175, 175); - private static final Color PARAGRAPH_COLOR = new Color(70, 130, 180); - public static final Color TABLE_COLOR = new Color(102, 205, 170); - public static final Color SECTION_COLOR = new Color(50, 50, 50); - public static final Color HEADLINE_COLOR = new Color(162, 56, 56); - public static final Color HEADER_COLOR = new Color(171, 131, 6); - public static final Color IMAGE_COLOR = new Color(253, 63, 146); + ViewerDocumentService viewerDocumentService; + + static float FONT_SIZE = 10f; + static float LINE_WIDTH = 1f; + static Standard14EmbeddableFont FONT = Standard14EmbeddableFont.helvetica(); + + static Color INNER_LINES_COLOR = new Color(255, 175, 175); + static Color PARAGRAPH_COLOR = new Color(70, 130, 180); + static Color TABLE_COLOR = new Color(102, 205, 170); + static Color SECTION_COLOR = new Color(50, 50, 50); + static Color HEADLINE_COLOR = new Color(162, 56, 56); + static Color HEADER_COLOR = new Color(171, 131, 6); + 
static Color IMAGE_COLOR = new Color(253, 63, 146); - public LayoutGrid createLayoutGrid(Document document) { + @SneakyThrows + @Observed(name = "ViewerDocumentService", contextualName = "create-viewer-document") + public void addLayoutGrid(File originFile, Document document, File destinationFile, boolean layerVisibilityDefaultValue) { + + LayoutGrid layoutGrid = createLayoutGrid(document); + + viewerDocumentService.addVisualizationsOnPage(originFile, + destinationFile, + Visualizations.builder() + .layer(ContentStreams.KNECON_LAYOUT) + .visualizationsOnPages(layoutGrid.getVisualizationsPerPages()) + .layerVisibilityDefaultValue(layerVisibilityDefaultValue) + .build()); + } + + + private LayoutGrid createLayoutGrid(Document document) { LayoutGrid layoutGrid = new LayoutGrid(document.getNumberOfPages()); document.streamAllSubNodes().forEach(semanticNode -> { @@ -103,11 +136,11 @@ public class LayoutGridService { List coloredLines = layoutGrid.getVisualizationsPerPages().get(page.getNumber() - 1).getColoredLines(); xs.forEach(x -> { Line2D line = new Line2D.Double(new Point2D.Double(x, tableBBox.getMaxY()), new Point2D.Double(x, tableBBox.getMinY())); - coloredLines.add(new ColoredLine(line, INNER_LINES_COLOR)); + coloredLines.add(new ColoredLine(line, INNER_LINES_COLOR, LINE_WIDTH)); }); ys.forEach(y -> { Line2D line = new Line2D.Double(new Point2D.Double(tableBBox.getMinX(), y), new Point2D.Double(tableBBox.getMaxX(), y)); - coloredLines.add(new ColoredLine(line, INNER_LINES_COLOR)); + coloredLines.add(new ColoredLine(line, INNER_LINES_COLOR, LINE_WIDTH)); }); } } @@ -135,9 +168,9 @@ public class LayoutGridService { List lines = createLinesFromRectangle(r, firstPage.getRotation()); // add string to top line var firstLine = lines.remove(0); - coloredLines.add(new ColoredLine(firstLine, color)); + coloredLines.add(new ColoredLine(firstLine, color, LINE_WIDTH)); for (Line2D line : lines) { - coloredLines.add(new ColoredLine(line, color)); + coloredLines.add(new 
ColoredLine(line, color, LINE_WIDTH)); } return; } @@ -152,6 +185,7 @@ public class LayoutGridService { } + @SneakyThrows private void addPlacedText(Page page, Rectangle2D textBBox, String s, LayoutGrid layoutGrid) { Point2D.Float upperLeftCorner = switch (page.getRotation()) { @@ -161,7 +195,8 @@ public class LayoutGridService { default -> new Point2D.Float((float) (textBBox.getMinX()), (float) textBBox.getMaxY()); }; var placedTexts = layoutGrid.getVisualizationsPerPages().get(page.getNumber() - 1).getPlacedTexts(); - placedTexts.add(new PlacedText(s, upperLeftCorner)); + upperLeftCorner.setLocation(upperLeftCorner.getX() - ((FONT.getStringWidth(s) / 1000) * FONT_SIZE + (2 * LINE_WIDTH) + 4), upperLeftCorner.getY() - FONT_SIZE); + placedTexts.add(PlacedText.textFacingUp(s, upperLeftCorner, FONT_SIZE, Color.BLACK, FONT)); } @@ -176,9 +211,9 @@ public class LayoutGridService { midPageLines.remove(1); // add string to left line var leftLine = midPageLines.remove(1); - coloredLines.add(new ColoredLine(leftLine, color)); + coloredLines.add(new ColoredLine(leftLine, color, LINE_WIDTH)); for (Line2D line : midPageLines) { - coloredLines.add(new ColoredLine(line, color)); + coloredLines.add(new ColoredLine(line, color, LINE_WIDTH)); } } @@ -192,9 +227,9 @@ public class LayoutGridService { lastPageLines.remove(0); // add string to left line var leftLine = lastPageLines.remove(2); - coloredLines.add(new ColoredLine(leftLine, color)); + coloredLines.add(new ColoredLine(leftLine, color, LINE_WIDTH)); for (Line2D line : lastPageLines) { - coloredLines.add(new ColoredLine(line, color)); + coloredLines.add(new ColoredLine(line, color, LINE_WIDTH)); } } @@ -208,9 +243,9 @@ public class LayoutGridService { firstPageLines.remove(2); // add string to top line var firstLine = firstPageLines.remove(0); - coloredLines.add(new ColoredLine(firstLine, color)); + coloredLines.add(new ColoredLine(firstLine, color, LINE_WIDTH)); for (Line2D line : firstPageLines) { - coloredLines.add(new 
ColoredLine(line, color)); + coloredLines.add(new ColoredLine(line, color, LINE_WIDTH)); } } @@ -276,7 +311,10 @@ public class LayoutGridService { private void addAsRectangle(SemanticNode semanticNode, LayoutGrid layoutGrid, Color color) { semanticNode.getBBox() - .forEach((page, textBBox) -> layoutGrid.getVisualizationsPerPages().get(page.getNumber() - 1).getColoredRectangles().add(new ColoredRectangle(textBBox, color))); + .forEach((page, textBBox) -> layoutGrid.getVisualizationsPerPages() + .get(page.getNumber() - 1) + .getColoredRectangles() + .add(new ColoredRectangle(textBBox, color, LINE_WIDTH))); } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/ViewerDocumentService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/ViewerDocumentService.java deleted file mode 100644 index 2335517..0000000 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/ViewerDocumentService.java +++ /dev/null @@ -1,217 +0,0 @@ -package com.knecon.fforesight.service.layoutparser.processor.services.visualization; - -import java.awt.geom.AffineTransform; -import java.awt.geom.Rectangle2D; -import java.io.File; -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.StandardCopyOption; - -import org.apache.pdfbox.Loader; -import org.apache.pdfbox.cos.COSName; -import org.apache.pdfbox.pdmodel.PDDocument; -import org.apache.pdfbox.pdmodel.PDDocumentCatalog; -import org.apache.pdfbox.pdmodel.PDPage; -import org.apache.pdfbox.pdmodel.PDPageContentStream; -import org.apache.pdfbox.pdmodel.PDResources; -import org.apache.pdfbox.pdmodel.font.PDFont; -import org.apache.pdfbox.pdmodel.font.PDType1Font; -import 
org.apache.pdfbox.pdmodel.font.Standard14Fonts; -import org.apache.pdfbox.pdmodel.graphics.optionalcontent.PDOptionalContentGroup; -import org.apache.pdfbox.pdmodel.graphics.optionalcontent.PDOptionalContentProperties; -import org.apache.pdfbox.pdmodel.graphics.state.PDExtendedGraphicsState; -import org.apache.pdfbox.util.Matrix; -import org.springframework.stereotype.Service; - -import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; -import com.knecon.fforesight.service.layoutparser.processor.model.visualization.ColoredLine; -import com.knecon.fforesight.service.layoutparser.processor.model.visualization.ColoredRectangle; -import com.knecon.fforesight.service.layoutparser.processor.model.visualization.FilledRectangle; -import com.knecon.fforesight.service.layoutparser.processor.model.visualization.LayoutGrid; -import com.knecon.fforesight.service.layoutparser.processor.model.visualization.PlacedText; -import com.knecon.fforesight.service.layoutparser.processor.model.visualization.VisualizationsOnPage; - -import io.micrometer.observation.Observation; -import io.micrometer.observation.ObservationRegistry; -import io.micrometer.observation.annotation.Observed; -import lombok.RequiredArgsConstructor; -import lombok.SneakyThrows; -import lombok.extern.slf4j.Slf4j; - -@Slf4j -@Service -@RequiredArgsConstructor -public class ViewerDocumentService { - - private static final String LAYER_NAME = "Layout grid"; - private static final int FONT_SIZE = 10; - public static final float LINE_WIDTH = 1f; - - private final LayoutGridService layoutGridService; - private final ObservationRegistry observationRegistry; - - - @SneakyThrows - @Observed(name = "ViewerDocumentService", contextualName = "create-viewer-document") - public void createViewerDocument(File originFile, Document document, File destinationFile, boolean layerVisibilityDefaultValue) { - - Path tmpFile = Files.createTempFile("tmpViewerDocument", ".pdf"); - PDDocument pdDocument = 
openPDDocument(originFile); - LayoutGrid layoutGrid = layoutGridService.createLayoutGrid(document); - - PDOptionalContentGroup layer = addLayerToDocument(pdDocument, layerVisibilityDefaultValue); - PDFont font = new PDType1Font(Standard14Fonts.FontName.HELVETICA); - - for (int pageNumber = 0; pageNumber < pdDocument.getNumberOfPages(); pageNumber++) { - - PDPage pdPage = pdDocument.getPage(pageNumber); -// - AffineTransform textDeRotationMatrix = getTextDeRotationTransform(pdPage); - addLayerToPageResources(pdPage); - - // We need to save the graphics state before, such that our appended content cannot be affected by previous content streams with side effects, - // e.g. not escaped matrix transformations. - escapePreviousContents(pdDocument, pdPage); - - VisualizationsOnPage visualizationsOnPage = layoutGrid.getVisualizationsPerPages().get(pageNumber); - assert pageNumber == visualizationsOnPage.getPageNumber(); - // We need to append to the content stream, otherwise the content could be overlapped by following content. 
- try (var contentStream = new PDPageContentStream(pdDocument, pdPage, PDPageContentStream.AppendMode.APPEND, true)) { - - contentStream.beginMarkedContent(COSName.OC, layer); - contentStream.saveGraphicsState(); - - contentStream.setLineWidth(LINE_WIDTH); - for (ColoredLine coloredLine : visualizationsOnPage.getColoredLines()) { - contentStream.setStrokingColor(coloredLine.color()); - contentStream.moveTo((float) coloredLine.line().getX1(), (float) coloredLine.line().getY1()); - contentStream.lineTo((float) coloredLine.line().getX2(), (float) coloredLine.line().getY2()); - contentStream.stroke(); - } - for (ColoredRectangle coloredRectangle : visualizationsOnPage.getColoredRectangles()) { - contentStream.setStrokingColor(coloredRectangle.color()); - Rectangle2D r = coloredRectangle.rectangle2D(); - contentStream.addRect((float) r.getX(), (float) r.getY(), (float) r.getWidth(), (float) r.getHeight()); - contentStream.stroke(); - } - for (FilledRectangle filledRectangle : visualizationsOnPage.getFilledRectangles()) { - contentStream.setNonStrokingColor(filledRectangle.color()); - PDExtendedGraphicsState graphicsState = new PDExtendedGraphicsState(); - graphicsState.setNonStrokingAlphaConstant(filledRectangle.alpha()); - contentStream.setGraphicsStateParameters(graphicsState); - Rectangle2D r = filledRectangle.rectangle2D(); - contentStream.addRect((float) r.getX(), (float) r.getY(), (float) r.getWidth(), (float) r.getHeight()); - contentStream.fill(); - } - for (PlacedText placedText : visualizationsOnPage.getPlacedTexts()) { - contentStream.setFont(font, FONT_SIZE); - contentStream.beginText(); - Matrix textMatrix = new Matrix((float) textDeRotationMatrix.getScaleX(), - (float) textDeRotationMatrix.getShearX(), - (float) textDeRotationMatrix.getShearY(), - (float) textDeRotationMatrix.getScaleY(), - (float) placedText.lineStart().getX(), - (float) placedText.lineStart().getY()); - textMatrix.translate(-((font.getStringWidth(placedText.text()) / 1000) * FONT_SIZE + 
(2 * LINE_WIDTH) + 4), -FONT_SIZE); - contentStream.setTextMatrix(textMatrix); - contentStream.showText(placedText.text()); - contentStream.endText(); - } - contentStream.restoreGraphicsState(); - contentStream.endMarkedContent(); - } - - if (pageNumber % 500 == 0 && pageNumber != 0) { // re-open document every once in a while to save on RAM - log.info("Incremental save after {} pages", pageNumber); - observedIncrementalSave(pdDocument, destinationFile); - pdDocument.close(); - Files.copy(destinationFile.toPath(), tmpFile, StandardCopyOption.REPLACE_EXISTING); - pdDocument = openPDDocument(tmpFile.toFile()); - layer = addLayerToDocument(pdDocument, layerVisibilityDefaultValue); - } - - } - observedIncrementalSave(pdDocument, destinationFile); - - tmpFile.toFile().delete(); - pdDocument.close(); - } - - - private static PDDocument openPDDocument(File tmpFile) throws IOException { - - PDDocument pdDocument; - pdDocument = Loader.loadPDF(tmpFile); - pdDocument.setAllSecurityToBeRemoved(true); - return pdDocument; - } - - - @SneakyThrows - private void observedIncrementalSave(PDDocument pdDocument, File outputFile) { - - Observation.createNotStarted("ViewerDocumentService", observationRegistry).contextualName("incremental-save").observe(() -> { - try { - pdDocument.save(outputFile); - } catch (IOException e) { - throw new RuntimeException(e); - } - }); - - } - - - private static void addLayerToPageResources(PDPage pdPage) { - - PDResources resources = pdPage.getResources(); - if (resources == null) { - resources = new PDResources(); - pdPage.setResources(resources); - } - } - - - private static void escapePreviousContents(PDDocument pdDocument, PDPage pdPage) throws IOException { - - try (var contentStream = new PDPageContentStream(pdDocument, pdPage, PDPageContentStream.AppendMode.PREPEND, false)) { - contentStream.saveGraphicsState(); - } - try (var contentStream = new PDPageContentStream(pdDocument, pdPage, PDPageContentStream.AppendMode.APPEND, false)) { - 
contentStream.restoreGraphicsState(); - } - } - - - private static PDOptionalContentGroup addLayerToDocument(PDDocument pdDocument, boolean layerVisibilityDefaultValue) { - - PDDocumentCatalog catalog = pdDocument.getDocumentCatalog(); - PDOptionalContentProperties ocprops = catalog.getOCProperties(); - if (ocprops == null) { - ocprops = new PDOptionalContentProperties(); - catalog.setOCProperties(ocprops); - } - PDOptionalContentGroup layer = null; - if (ocprops.hasGroup(LAYER_NAME)) { - layer = ocprops.getGroup(LAYER_NAME); - } else { - layer = new PDOptionalContentGroup(LAYER_NAME); - ocprops.addGroup(layer); - } - ocprops.setGroupEnabled(layer, layerVisibilityDefaultValue); -// dictionariesToUpdate.add(catalog.getCOSObject()); - return layer; - } - - - private static AffineTransform getTextDeRotationTransform(PDPage page) { - - return AffineTransform.getQuadrantRotateInstance(switch (page.getRotation()) { - case 90 -> 3; - case 180 -> 2; - case 270 -> 1; - default -> 0; - }); - } - -} diff --git a/layoutparser-service/layoutparser-service-server/build.gradle.kts b/layoutparser-service/layoutparser-service-server/build.gradle.kts index fd2da35..cc20ea7 100644 --- a/layoutparser-service/layoutparser-service-server/build.gradle.kts +++ b/layoutparser-service/layoutparser-service-server/build.gradle.kts @@ -39,6 +39,7 @@ dependencies { implementation("net.logstash.logback:logstash-logback-encoder:7.4") // for integration testing only + testImplementation(project(":viewer-doc-processor")) testImplementation(project(":layoutparser-service-internal-api")) testImplementation("org.springframework.boot:spring-boot-starter-amqp:${springBootStarterVersion}") diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java index 
c8f5207..5c5eae9 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java @@ -3,14 +3,19 @@ package com.knecon.fforesight.service.layoutparser.server.graph; import java.io.File; import java.nio.file.Path; +import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; import org.springframework.core.io.ClassPathResource; +import com.iqser.red.commons.jackson.ObjectMapperFactory; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; +import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse; +import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse; +import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory; import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService; -import com.knecon.fforesight.service.layoutparser.processor.services.visualization.ViewerDocumentService; import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest; +import com.knecon.fforesight.service.viewerdoc.service.ViewerDocumentService; import lombok.SneakyThrows; @@ -22,11 +27,36 @@ public class ViewerDocumentTest extends BuildDocumentTest { String fileName = "files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf"; String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf"; - LayoutGridService layoutGridService = new LayoutGridService(); - ViewerDocumentService viewerDocumentService = new ViewerDocumentService(layoutGridService, null); - Document document = buildGraph(fileName, 
LayoutParsingType.REDACT_MANAGER); + var documentFile = new ClassPathResource(fileName).getFile(); - viewerDocumentService.createViewerDocument(documentFile, document, new File(tmpFileName), true); + ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null); + LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService); + + Document document = buildGraph(fileName, LayoutParsingType.REDACT_MANAGER); + long start = System.currentTimeMillis(); + layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true); + System.out.printf("Total time: %.2fs%n", ((float) (System.currentTimeMillis() - start)) / 1000); + } + + @Test + @Disabled + @SneakyThrows + public void testViewerDocumentWithTables() { + + String fileName = "files/cv_tables/brokenTablesOnOcr_ocred.pdf"; + String tableFileName = "files/cv_tables/brokenTablesOnOcr_ocred.TABLES.json"; + String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf"; + + var mapper = ObjectMapperFactory.create(); + var tableResponse = mapper.readValue(new ClassPathResource(tableFileName).getInputStream(), TableServiceResponse.class); + var documentFile = new ClassPathResource(fileName).getFile(); + + var classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE, documentFile, new ImageServiceResponse(), tableResponse, Path.of(fileName).getFileName().toFile().toString()); + ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null); + LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService); + Document document = DocumentGraphFactory.buildDocumentGraph(classificationDocument); + + layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true); } } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingCleaningServiceTest.java 
b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingCleaningServiceTest.java index d712814..03d8cae 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingCleaningServiceTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingCleaningServiceTest.java @@ -27,9 +27,9 @@ import com.knecon.fforesight.service.layoutparser.processor.services.factory.Doc import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper; import com.knecon.fforesight.service.layoutparser.processor.services.mapper.PropertiesMapper; import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService; -import com.knecon.fforesight.service.layoutparser.processor.services.visualization.ViewerDocumentService; import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest; import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw; +import com.knecon.fforesight.service.viewerdoc.service.ViewerDocumentService; import lombok.SneakyThrows; @@ -57,8 +57,8 @@ public class RulingCleaningServiceTest extends BuildDocumentTest { @SneakyThrows public void testTableExtraction() { - LayoutGridService layoutGridService = new LayoutGridService(); - ViewerDocumentService viewerDocumentService = new ViewerDocumentService(layoutGridService, null); + ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null); + LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService); ClassPathResource resource = new ClassPathResource("files"); List pdfFileNames = Files.walk(resource.getFile().toPath()) diff --git a/layoutparser-service/viewer-doc-processor/build.gradle b/layoutparser-service/viewer-doc-processor/build.gradle new file 
mode 100644 index 0000000..c6c5dbc --- /dev/null +++ b/layoutparser-service/viewer-doc-processor/build.gradle @@ -0,0 +1,19 @@ +plugins { + id("com.knecon.fforesight.java-conventions") + id("io.freefair.lombok") version "8.2.2" +} + +description = "Library for adding/removing layers in the viewer document" + +var pdfBoxVersion = "3.0.0" + +dependencies { + implementation("org.apache.pdfbox:pdfbox:${pdfBoxVersion}") + implementation("org.apache.pdfbox:pdfbox-tools:${pdfBoxVersion}") + implementation("org.slf4j:slf4j-api:1.7.25") + implementation("com.knecon.fforesight:tracing-commons:0.5.0") + + testImplementation("org.apache.logging.log4j:log4j-slf4j-impl:2.22.1") + testImplementation("org.junit.jupiter:junit-jupiter") + testImplementation platform('org.junit:junit-bom:5.10.0') +} diff --git a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/ContentStreams.java b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/ContentStreams.java new file mode 100644 index 0000000..882e72b --- /dev/null +++ b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/ContentStreams.java @@ -0,0 +1,33 @@ +package com.knecon.fforesight.service.viewerdoc; + +import java.util.List; + +import org.apache.pdfbox.cos.COSName; + +import lombok.AccessLevel; +import lombok.experimental.FieldDefaults; + +@FieldDefaults(makeFinal = true, level = AccessLevel.PUBLIC) +public class ContentStreams { + + public static Identifier KNECON_LAYOUT = new Identifier("Layout grid", COSName.getPDFName("KNECON_LAYOUT"), true); + + public static Identifier KNECON_OCR = new Identifier("OCR", COSName.getPDFName("KNECON_OCR"), false); + + public static Identifier KNECON_OCR_TEXT_DEBUG = new Identifier("OCR Text", COSName.getPDFName("KNECON_OCR_TEXT_DEBUG"), true); + + public static Identifier KNECON_OCR_BBOX_DEBUG = new Identifier("OCR Boxes", 
COSName.getPDFName("KNECON_OCR_BBOX_DEBUG"), true); + + public static Identifier OTHER = new Identifier("other", COSName.getPDFName("OTHER"), false); + + public static Identifier ESCAPE_START = new Identifier("escape start", COSName.getPDFName("ESCAPE_START"), false); + + public static Identifier ESCAPE_END = new Identifier("escape end", COSName.getPDFName("ESCAPE_END"), false); + + public static List allContentStreams = List.of(KNECON_LAYOUT, KNECON_OCR, KNECON_OCR_BBOX_DEBUG, KNECON_OCR_TEXT_DEBUG, OTHER, ESCAPE_START, ESCAPE_END); + + public record Identifier(String name, COSName cosName, boolean optionalContent) { + + } + +} diff --git a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/model/ColoredLine.java b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/model/ColoredLine.java new file mode 100644 index 0000000..1bb25f2 --- /dev/null +++ b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/model/ColoredLine.java @@ -0,0 +1,8 @@ +package com.knecon.fforesight.service.viewerdoc.model; + +import java.awt.Color; +import java.awt.geom.Line2D; + +public record ColoredLine(Line2D line, Color color, float lineWidth) { + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/visualization/ColoredRectangle.java b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/model/ColoredRectangle.java similarity index 58% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/visualization/ColoredRectangle.java rename to layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/model/ColoredRectangle.java index b251181..4151d11 100644 ---
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/visualization/ColoredRectangle.java +++ b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/model/ColoredRectangle.java @@ -1,8 +1,8 @@ -package com.knecon.fforesight.service.layoutparser.processor.model.visualization; +package com.knecon.fforesight.service.viewerdoc.model; import java.awt.Color; import java.awt.geom.Rectangle2D; -public record ColoredRectangle(Rectangle2D rectangle2D, Color color) { +public record ColoredRectangle(Rectangle2D rectangle2D, Color color, float lineWidth) { } diff --git a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/model/EmbeddableFont.java b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/model/EmbeddableFont.java new file mode 100644 index 0000000..f7e878f --- /dev/null +++ b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/model/EmbeddableFont.java @@ -0,0 +1,10 @@ +package com.knecon.fforesight.service.viewerdoc.model; + +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.font.PDFont; + +public interface EmbeddableFont { + + PDFont embed(PDDocument document); + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/visualization/FilledRectangle.java b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/model/FilledRectangle.java similarity index 63% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/visualization/FilledRectangle.java rename to layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/model/FilledRectangle.java index 
f043a31..58d5eb6 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/visualization/FilledRectangle.java +++ b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/model/FilledRectangle.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.processor.model.visualization; +package com.knecon.fforesight.service.viewerdoc.model; import java.awt.Color; import java.awt.geom.Rectangle2D; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/visualization/LayoutGrid.java b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/model/LayoutGrid.java similarity index 51% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/visualization/LayoutGrid.java rename to layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/model/LayoutGrid.java index 4d7cc3b..d8f47ca 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/visualization/LayoutGrid.java +++ b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/model/LayoutGrid.java @@ -1,7 +1,7 @@ -package com.knecon.fforesight.service.layoutparser.processor.model.visualization; +package com.knecon.fforesight.service.viewerdoc.model; -import java.util.ArrayList; -import java.util.List; +import java.util.HashMap; +import java.util.Map; import lombok.AccessLevel; import lombok.Getter; @@ -12,15 +12,15 @@ import lombok.experimental.FieldDefaults; public class LayoutGrid { int numberOfPages; - List visualizationsPerPages; + Map visualizationsPerPages; public LayoutGrid(int numberOfPages) { this.numberOfPages = numberOfPages; 
- this.visualizationsPerPages = new ArrayList<>(numberOfPages); + this.visualizationsPerPages = new HashMap<>(); for (int i = 0; i < numberOfPages; i++) { - this.visualizationsPerPages.add(VisualizationsOnPage.builder().pageNumber(i).build()); + this.visualizationsPerPages.put(i, VisualizationsOnPage.builder().build()); } } diff --git a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/model/OperatorWithArguments.java b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/model/OperatorWithArguments.java new file mode 100644 index 0000000..c9a1b9d --- /dev/null +++ b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/model/OperatorWithArguments.java @@ -0,0 +1,10 @@ +package com.knecon.fforesight.service.viewerdoc.model; + +import java.util.List; + +import org.apache.pdfbox.contentstream.operator.Operator; +import org.apache.pdfbox.cos.COSBase; + +public record OperatorWithArguments(Operator operator, List arguments) { + +} \ No newline at end of file diff --git a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/model/PlacedText.java b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/model/PlacedText.java new file mode 100644 index 0000000..ff9e449 --- /dev/null +++ b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/model/PlacedText.java @@ -0,0 +1,17 @@ +package com.knecon.fforesight.service.viewerdoc.model; + +import java.awt.Color; +import java.awt.geom.Point2D; +import java.util.Optional; + +import org.apache.pdfbox.pdmodel.graphics.state.RenderingMode; +import org.apache.pdfbox.util.Matrix; + +public record PlacedText(String text, Point2D lineStart, Color color, float fontSize, EmbeddableFont font, Optional textMatrix, Optional renderingMode) { + + public static PlacedText 
textFacingUp(String text, Point2D lineStart, float fontSize, Color color, EmbeddableFont font) { + + return new PlacedText(text, lineStart, color, fontSize, font, Optional.empty(), Optional.empty()); + } + +} diff --git a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/model/Standard14EmbeddableFont.java b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/model/Standard14EmbeddableFont.java new file mode 100644 index 0000000..5431602 --- /dev/null +++ b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/model/Standard14EmbeddableFont.java @@ -0,0 +1,37 @@ +package com.knecon.fforesight.service.viewerdoc.model; + +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.font.PDFont; +import org.apache.pdfbox.pdmodel.font.PDType1Font; +import org.apache.pdfbox.pdmodel.font.Standard14Fonts; + +import lombok.RequiredArgsConstructor; +import lombok.SneakyThrows; + +@RequiredArgsConstructor +public class Standard14EmbeddableFont implements EmbeddableFont { + + private final PDType1Font font; + + + public static Standard14EmbeddableFont helvetica() { + + return new Standard14EmbeddableFont(new PDType1Font(Standard14Fonts.FontName.HELVETICA)); + } + + + @SneakyThrows + public float getStringWidth(String text) { + + return font.getStringWidth(text); + } + + + @Override + public PDFont embed(PDDocument document) { + + // no need to embed anything + return font; + } + +} diff --git a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/model/Visualizations.java b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/model/Visualizations.java new file mode 100644 index 0000000..fb17113 --- /dev/null +++ b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/model/Visualizations.java @@ -0,0 
+1,23 @@ +package com.knecon.fforesight.service.viewerdoc.model; + +import java.util.Map; + +import com.knecon.fforesight.service.viewerdoc.ContentStreams; + +import lombok.AccessLevel; +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Getter; +import lombok.experimental.FieldDefaults; + +@Builder +@Getter +@AllArgsConstructor +@FieldDefaults(level = AccessLevel.PRIVATE) +public class Visualizations { + + ContentStreams.Identifier layer; + Map visualizationsOnPages; + boolean layerVisibilityDefaultValue; + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/visualization/VisualizationsOnPage.java b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/model/VisualizationsOnPage.java similarity index 85% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/visualization/VisualizationsOnPage.java rename to layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/model/VisualizationsOnPage.java index 2b2b4ea..a945708 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/visualization/VisualizationsOnPage.java +++ b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/model/VisualizationsOnPage.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.processor.model.visualization; +package com.knecon.fforesight.service.viewerdoc.model; import java.util.LinkedList; import java.util.List; @@ -13,7 +13,7 @@ import lombok.experimental.FieldDefaults; @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) public class VisualizationsOnPage { - int pageNumber; + boolean makePathsInvisible; @Builder.Default List placedTexts = new LinkedList<>(); 
@Builder.Default diff --git a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/pdf/ClassifiedContentStream.java b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/pdf/ClassifiedContentStream.java new file mode 100644 index 0000000..dba3e6b --- /dev/null +++ b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/pdf/ClassifiedContentStream.java @@ -0,0 +1,7 @@ +package com.knecon.fforesight.service.viewerdoc.pdf; + +import com.knecon.fforesight.service.viewerdoc.ContentStreams; + +public record ClassifiedContentStream(SinglePDContentStream contentStream, ContentStreams.Identifier classification) { + +} \ No newline at end of file diff --git a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/pdf/SinglePDContentStream.java b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/pdf/SinglePDContentStream.java new file mode 100644 index 0000000..f429639 --- /dev/null +++ b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/pdf/SinglePDContentStream.java @@ -0,0 +1,61 @@ +package com.knecon.fforesight.service.viewerdoc.pdf; + +import java.io.IOException; +import java.io.InputStream; + +import org.apache.pdfbox.contentstream.PDContentStream; +import org.apache.pdfbox.io.RandomAccessInputStream; +import org.apache.pdfbox.io.RandomAccessRead; +import org.apache.pdfbox.pdmodel.PDResources; +import org.apache.pdfbox.pdmodel.common.PDRectangle; +import org.apache.pdfbox.pdmodel.common.PDStream; +import org.apache.pdfbox.util.Matrix; + +import lombok.AccessLevel; +import lombok.Getter; +import lombok.RequiredArgsConstructor; +import lombok.experimental.FieldDefaults; + +@Getter +@RequiredArgsConstructor +@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) +public class SinglePDContentStream implements 
PDContentStream { + + PDStream pdStream; + + + @Override + public InputStream getContents() throws IOException { + + return new RandomAccessInputStream(getContentsForRandomAccess()); + } + + + @Override + public RandomAccessRead getContentsForRandomAccess() throws IOException { + + return pdStream.getCOSObject().createView(); + } + + + @Override + public PDResources getResources() { + + return null; + } + + + @Override + public PDRectangle getBBox() { + + return null; + } + + + @Override + public Matrix getMatrix() { + + return null; + } + +} diff --git a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/ContentStreamClassifier.java b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/ContentStreamClassifier.java new file mode 100644 index 0000000..e34148b --- /dev/null +++ b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/ContentStreamClassifier.java @@ -0,0 +1,121 @@ +package com.knecon.fforesight.service.viewerdoc.service; + +import java.util.LinkedList; +import java.util.List; +import java.util.Optional; + +import org.apache.pdfbox.contentstream.PDContentStream; +import org.apache.pdfbox.contentstream.operator.OperatorName; +import org.apache.pdfbox.cos.COSName; +import org.apache.pdfbox.pdmodel.PDPage; + +import com.knecon.fforesight.service.viewerdoc.ContentStreams; +import com.knecon.fforesight.service.viewerdoc.model.OperatorWithArguments; +import com.knecon.fforesight.service.viewerdoc.pdf.ClassifiedContentStream; +import com.knecon.fforesight.service.viewerdoc.pdf.SinglePDContentStream; + +import lombok.SneakyThrows; +import lombok.experimental.UtilityClass; + +@UtilityClass +public class ContentStreamClassifier { + + public List getClassifiedContentStreams(PDPage page) { + + List streams = new LinkedList<>(); + page.getContentStreams().forEachRemaining(stream -> streams.add(new 
SinglePDContentStream(stream))); + return ContentStreamClassifier.classifySingleContentStreams(page, streams); + } + + + public List classifySingleContentStreams(PDPage page, List streams) { + + return streams.stream().map(singlePDContentStream -> classifySingleContentStream(page, singlePDContentStream)).toList(); + } + + + private ClassifiedContentStream classifySingleContentStream(PDPage page, SinglePDContentStream singlePDContentStream) { + + ContentStreams.Identifier classification = classifyContentStream(singlePDContentStream, page); + return new ClassifiedContentStream(singlePDContentStream, classification); + } + + + /** + * We assume all of our layers are written escaped, so only unknown content streams need to be escaped. + * + * @param classifiers List of all content streams of a page with their classification + * @return false, if any content stream with classification other is not prefixed with an ESCAPE_START and suffixed with an ESCAPE_END + */ + public boolean areAllContentStreamsEscaped(List classifiers) { + + int escapeDepth = 0; + for (ClassifiedContentStream classifier : classifiers) { + if (classifier.classification().equals(ContentStreams.OTHER) && escapeDepth == 0) { + return false; + } + if (classifier.classification().equals(ContentStreams.ESCAPE_START)) { + escapeDepth++; + } + if (classifier.classification().equals(ContentStreams.ESCAPE_END)) { + escapeDepth--; + } + } + return escapeDepth == 0; + } + + + @SneakyThrows + public ContentStreams.Identifier classifyContentStream(PDContentStream contentStream, PDPage page) { + + List operatorsWithArguments = ContentStreamUtility.parseLeadingOperators(contentStream, 2); + if (operatorsWithArguments.isEmpty()) { + return ContentStreams.OTHER; + } + OperatorWithArguments firstOperator = operatorsWithArguments.get(0); + + // If we wrap the content streams we append and prepend a content stream with exactly one operator "q" or "Q". 
+ if (operatorsWithArguments.size() == 1) { + if (firstOperator.operator().getName().equals(OperatorName.SAVE)) { + return ContentStreams.ESCAPE_START; + } + if (firstOperator.operator().getName().equals(OperatorName.RESTORE)) { + return ContentStreams.ESCAPE_END; + } + } + + // In previous versions we did not set a marked content with an explicit name. Instead, we wrote an optional content group (OCG) with the name "Layout grid". + // This OCG is then assigned a COSName by PDFBox. Usually it's "oc1". + // Thus, in order to find this name we need to look in the page resources to find the COSName assigned to the OCG. + // This COSName can then be found as an argument for the first operator in the content stream. + if (firstOperator.operator().getName().equals(OperatorName.BEGIN_MARKED_CONTENT_SEQ)) { + Optional layoutGridOCGName = ContentStreamUtility.findLayoutGridOCGName(page); + if (layoutGridOCGName.isPresent()) { + if (arumentsContainLayoutGridOCG(firstOperator, layoutGridOCGName.get())) { + return ContentStreams.KNECON_LAYOUT; + } + } + } + + if (!firstOperator.operator().getName().equals(OperatorName.BEGIN_MARKED_CONTENT)) { + return ContentStreams.OTHER; + } + + Optional firstCOSNameFromArguments = firstOperator.arguments().stream().filter(c -> c instanceof COSName).map(c -> (COSName) c).findFirst(); + + if (firstCOSNameFromArguments.isEmpty()) { + return ContentStreams.OTHER; + } + + var cosName = firstCOSNameFromArguments.get(); + + return ContentStreams.allContentStreams.stream().filter(identifier -> identifier.cosName().equals(cosName)).findAny().orElse(ContentStreams.OTHER); + } + + + private static boolean arumentsContainLayoutGridOCG(OperatorWithArguments operator, COSName layoutGridOCGName) { + + return operator.arguments().stream().filter(c -> c instanceof COSName).map(c -> (COSName) c).anyMatch(cosName -> cosName.equals(layoutGridOCGName)); + } + +} diff --git
a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/ContentStreamUtility.java b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/ContentStreamUtility.java new file mode 100644 index 0000000..939c801 --- /dev/null +++ b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/ContentStreamUtility.java @@ -0,0 +1,78 @@ +package com.knecon.fforesight.service.viewerdoc.service; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.LinkedList; +import java.util.List; +import java.util.Optional; +import java.util.Set; + +import org.apache.pdfbox.contentstream.PDContentStream; +import org.apache.pdfbox.contentstream.operator.Operator; +import org.apache.pdfbox.cos.COSBase; +import org.apache.pdfbox.cos.COSName; +import org.apache.pdfbox.cos.COSString; +import org.apache.pdfbox.pdfparser.PDFStreamParser; +import org.apache.pdfbox.pdmodel.PDPage; +import org.apache.pdfbox.pdmodel.common.PDStream; + +import com.knecon.fforesight.service.viewerdoc.ContentStreams; +import com.knecon.fforesight.service.viewerdoc.model.OperatorWithArguments; +import com.knecon.fforesight.service.viewerdoc.pdf.ClassifiedContentStream; +import com.knecon.fforesight.service.viewerdoc.pdf.SinglePDContentStream; + +import lombok.experimental.UtilityClass; + +@UtilityClass +public class ContentStreamUtility { + + public static List parseLeadingOperators(PDContentStream contentStream, + int numberOfOperatorsToRead) throws IOException { + + List arguments = new ArrayList<>(); + PDFStreamParser parser = new PDFStreamParser(contentStream); + List operatorsWithArguments = new LinkedList<>(); + for (int i = 0; i < numberOfOperatorsToRead; ) { + Object token = parser.parseNextToken(); + if (token == null) { + break; + } + if (token instanceof Operator operator) { + operatorsWithArguments.add(new OperatorWithArguments(operator, 
arguments)); + arguments = new ArrayList<>(); + i++; + } else { + arguments.add((COSBase) token); + } + + } + return operatorsWithArguments; + } + + + public static Optional findLayoutGridOCGName(PDPage page) { + + Optional layoutGridOCGName = Optional.empty(); + var resourceIterator = page.getResources().getPropertiesNames(); + for (COSName cosName : resourceIterator) { + COSBase cosBase = page.getResources().getProperties(cosName).getCOSObject().getDictionaryObject(COSName.NAME); + if (cosBase instanceof COSString string) { + if (ContentStreams.KNECON_LAYOUT.name().equals(string.getString())) { + layoutGridOCGName = Optional.of(cosName); + } + } + } + return layoutGridOCGName; + } + + + public static List removeLayerFromContentStreams(Set layers, List classifiers) { + + return classifiers.stream() + .filter(classifiedContentStream -> !layers.contains(classifiedContentStream.classification())) + .map(ClassifiedContentStream::contentStream) + .map(SinglePDContentStream::getPdStream) + .toList(); + } + +} diff --git a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/ViewerDocumentService.java b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/ViewerDocumentService.java new file mode 100644 index 0000000..cc63845 --- /dev/null +++ b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/ViewerDocumentService.java @@ -0,0 +1,316 @@ +package com.knecon.fforesight.service.viewerdoc.service; + +import java.awt.geom.AffineTransform; +import java.awt.geom.Rectangle2D; +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.StandardCopyOption; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.Set; +import java.util.stream.Collectors; + +import 
org.apache.pdfbox.Loader; +import org.apache.pdfbox.cos.COSName; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.PDDocumentCatalog; +import org.apache.pdfbox.pdmodel.PDPage; +import org.apache.pdfbox.pdmodel.PDPageContentStream; +import org.apache.pdfbox.pdmodel.PDResources; +import org.apache.pdfbox.pdmodel.font.PDFont; +import org.apache.pdfbox.pdmodel.graphics.optionalcontent.PDOptionalContentGroup; +import org.apache.pdfbox.pdmodel.graphics.optionalcontent.PDOptionalContentProperties; +import org.apache.pdfbox.pdmodel.graphics.state.PDExtendedGraphicsState; +import org.apache.pdfbox.pdmodel.graphics.state.RenderingMode; +import org.apache.pdfbox.util.Matrix; + +import com.knecon.fforesight.service.viewerdoc.ContentStreams; +import com.knecon.fforesight.service.viewerdoc.model.ColoredLine; +import com.knecon.fforesight.service.viewerdoc.model.ColoredRectangle; +import com.knecon.fforesight.service.viewerdoc.model.FilledRectangle; +import com.knecon.fforesight.service.viewerdoc.model.PlacedText; +import com.knecon.fforesight.service.viewerdoc.model.Visualizations; +import com.knecon.fforesight.service.viewerdoc.model.VisualizationsOnPage; +import com.knecon.fforesight.service.viewerdoc.pdf.ClassifiedContentStream; + +import io.micrometer.observation.Observation; +import io.micrometer.observation.ObservationRegistry; +import io.micrometer.observation.annotation.Observed; +import lombok.RequiredArgsConstructor; +import lombok.SneakyThrows; +import lombok.extern.slf4j.Slf4j; + +@Slf4j +@RequiredArgsConstructor +public class ViewerDocumentService { + + private final ObservationRegistry registry; + + + public void addVisualizationsOnPage(File originFile, File destinationFile, Visualizations visualizations) { + + addVisualizationsOnPage(originFile, destinationFile, List.of(visualizations)); + } + + + @Observed(name = "ViewerDocumentService", contextualName = "add-visualizations") + @SneakyThrows + public void addVisualizationsOnPage(File 
originFile, File destinationFile, List visualizations) { + + // originFile and destinationFile might be the same, so we use a temp file. + // Otherwise, saving the document might corrupt the file + Path tmpFile = Files.createTempFile("tmpViewerDocument", ".pdf"); + Files.copy(originFile.toPath(), tmpFile, StandardCopyOption.REPLACE_EXISTING); + + PDDocument pdDocument = openPDDocument(tmpFile.toFile()); + + enrichObservation(pdDocument, visualizations.stream().map(Visualizations::getLayer).toList()); + + Set allLayers = visualizations.stream().map(Visualizations::getLayer).collect(Collectors.toUnmodifiableSet()); + + Map optionalContentGroupMap = addLayersToDocument(visualizations, pdDocument); + + for (int pageNumber = 0; pageNumber < pdDocument.getNumberOfPages(); pageNumber++) { + PDPage pdPage = pdDocument.getPage(pageNumber); + createPageResourcesIfNotPresent(pdPage); // needed for optionalContentGroups + + List classifiers = ContentStreamClassifier.getClassifiedContentStreams(pdPage); + + pdPage.setContents(ContentStreamUtility.removeLayerFromContentStreams(allLayers, classifiers)); + + AffineTransform textDeRotationMatrix = getTextDeRotationTransform(pdPage); + + if (!ContentStreamClassifier.areAllContentStreamsEscaped(classifiers)) { + // We need to save the graphics state before, such that our appended content cannot be affected by previous content streams with side effects, + // e.g. not escaped matrix transformations. + wrapContentStreams(pdDocument, pdPage); + } + + for (Visualizations visualization : visualizations) { + if (!visualization.getVisualizationsOnPages().containsKey(pageNumber)) { + continue; + } + // We need to append to the content stream, otherwise the content could be overlapped by following content. 
+ try (var contentStream = new PDPageContentStream(pdDocument, pdPage, PDPageContentStream.AppendMode.APPEND, true)) { + + contentStream.beginMarkedContent(visualization.getLayer().cosName()); + + if (optionalContentGroupMap.containsKey(visualization.getLayer())) { + contentStream.beginMarkedContent(COSName.OC, optionalContentGroupMap.get(visualization.getLayer())); + } + + contentStream.saveGraphicsState(); + + drawVisualizationsToContentStream(pdDocument, visualization.getVisualizationsOnPages().get(pageNumber), contentStream, textDeRotationMatrix); + + contentStream.restoreGraphicsState(); + + if (optionalContentGroupMap.containsKey(visualization.getLayer())) { + contentStream.endMarkedContent(); + } + + contentStream.endMarkedContent(); + } + + } + if (pageNumber % 500 == 0 && pageNumber != 0) { // re-open document every once in a while to save on RAM + log.info("Incremental save after {}/{} pages", pageNumber, pdDocument.getNumberOfPages()); + observedIncrementalSave(pdDocument, destinationFile); + pdDocument.close(); + Files.copy(destinationFile.toPath(), tmpFile, StandardCopyOption.REPLACE_EXISTING); + pdDocument = openPDDocument(tmpFile.toFile()); + } + } + observedIncrementalSave(pdDocument, destinationFile); + + pdDocument.close(); + assert tmpFile.toFile().delete(); + } + + + private static Map addLayersToDocument(List visualizations, PDDocument pdDocument) { + + Map optionalContentGroupMap = new HashMap<>(); + for (Visualizations visualization : visualizations) { + addLayerToDocument(visualization.getLayer(), pdDocument, visualization.isLayerVisibilityDefaultValue())// + .ifPresent(ocg -> optionalContentGroupMap.put(visualization.getLayer(), ocg)); + } + return optionalContentGroupMap; + } + + + private static void drawVisualizationsToContentStream(PDDocument pdDocument, + VisualizationsOnPage visualizationsOnPage, + PDPageContentStream contentStream, + AffineTransform textDeRotationMatrix) throws IOException { + + if 
(visualizationsOnPage.isMakePathsInvisible()) { + contentStream.addRect(0, 0, 1, 1); + contentStream.clip(); + } + + for (ColoredLine coloredLine : visualizationsOnPage.getColoredLines()) { + contentStream.setLineWidth(coloredLine.lineWidth()); + contentStream.setStrokingColor(coloredLine.color()); + contentStream.moveTo((float) coloredLine.line().getX1(), (float) coloredLine.line().getY1()); + contentStream.lineTo((float) coloredLine.line().getX2(), (float) coloredLine.line().getY2()); + contentStream.stroke(); + } + + for (ColoredRectangle coloredRectangle : visualizationsOnPage.getColoredRectangles()) { + contentStream.setLineWidth(coloredRectangle.lineWidth()); + contentStream.setStrokingColor(coloredRectangle.color()); + Rectangle2D r = coloredRectangle.rectangle2D(); + contentStream.addRect((float) r.getX(), (float) r.getY(), (float) r.getWidth(), (float) r.getHeight()); + contentStream.stroke(); + } + + for (FilledRectangle filledRectangle : visualizationsOnPage.getFilledRectangles()) { + contentStream.setNonStrokingColor(filledRectangle.color()); + PDExtendedGraphicsState graphicsState = new PDExtendedGraphicsState(); + graphicsState.setNonStrokingAlphaConstant(filledRectangle.alpha()); + contentStream.setGraphicsStateParameters(graphicsState); + Rectangle2D r = filledRectangle.rectangle2D(); + contentStream.addRect((float) r.getX(), (float) r.getY(), (float) r.getWidth(), (float) r.getHeight()); + contentStream.fill(); + } + + for (PlacedText placedText : visualizationsOnPage.getPlacedTexts()) { + PDFont font = placedText.font().embed(pdDocument); + contentStream.setFont(font, placedText.fontSize()); + contentStream.beginText(); + contentStream.setNonStrokingColor(placedText.color()); + if (placedText.renderingMode().isPresent()) { + contentStream.setRenderingMode(placedText.renderingMode().get()); + } else { + contentStream.setRenderingMode(RenderingMode.FILL); + } + Matrix textMatrix = getTextMatrix(placedText, textDeRotationMatrix); + 
contentStream.setTextMatrix(textMatrix); + contentStream.showText(placedText.text()); + contentStream.endText(); + } + } + + + private void enrichObservation(PDDocument pdDocument, List layers) { + + if (registry == null || registry.getCurrentObservation() == null || registry.isNoop()) { + return; + } + registry.getCurrentObservation().highCardinalityKeyValue("numberOfPages", String.valueOf(pdDocument.getNumberOfPages())); + for (int i = 0; i < layers.size(); i++) { + ContentStreams.Identifier layer = layers.get(i); + + registry.getCurrentObservation().highCardinalityKeyValue("layer_" + i, String.valueOf(layer.name())); + } + } + + + private static void wrapContentStreams(PDDocument pdDocument, PDPage pdPage) throws IOException { + + try (var contentStream = new PDPageContentStream(pdDocument, pdPage, PDPageContentStream.AppendMode.PREPEND, false)) { + contentStream.saveGraphicsState(); + } + try (var contentStream = new PDPageContentStream(pdDocument, pdPage, PDPageContentStream.AppendMode.APPEND, false)) { + contentStream.restoreGraphicsState(); + } + } + + + private static Matrix getTextMatrix(PlacedText placedText, AffineTransform textDeRotationMatrix) { + + Matrix textMatrix; + if (placedText.textMatrix().isEmpty()) { + textMatrix = new Matrix((float) textDeRotationMatrix.getScaleX(), + (float) textDeRotationMatrix.getShearX(), + (float) textDeRotationMatrix.getShearY(), + (float) textDeRotationMatrix.getScaleY(), + (float) placedText.lineStart().getX(), + (float) placedText.lineStart().getY()); + } else { + textMatrix = placedText.textMatrix().get(); + } + return textMatrix; + } + + + private static Optional addLayerToDocument(ContentStreams.Identifier layer, PDDocument pdDocument, boolean layerVisibilityDefaultValue) { + + if (layer.optionalContent()) { + return Optional.of(addLayerToDocument(pdDocument, layer.name(), layerVisibilityDefaultValue)); + + } + return Optional.empty(); + + } + + + private static PDOptionalContentGroup 
addLayerToDocument(PDDocument pdDocument, String layerName, boolean layerVisibilityDefaultValue) { + + PDDocumentCatalog catalog = pdDocument.getDocumentCatalog(); + PDOptionalContentProperties ocprops = catalog.getOCProperties(); + if (ocprops == null) { + ocprops = new PDOptionalContentProperties(); + catalog.setOCProperties(ocprops); + } + PDOptionalContentGroup layer = null; + if (ocprops.hasGroup(layerName)) { + layer = ocprops.getGroup(layerName); + } else { + layer = new PDOptionalContentGroup(layerName); + ocprops.addGroup(layer); + } + ocprops.setGroupEnabled(layer, layerVisibilityDefaultValue); + return layer; + } + + + private static PDDocument openPDDocument(File tmpFile) throws IOException { + + PDDocument pdDocument; + pdDocument = Loader.loadPDF(tmpFile); + pdDocument.setAllSecurityToBeRemoved(true); + return pdDocument; + } + + + @SneakyThrows + private void observedIncrementalSave(PDDocument pdDocument, File outputFile) { + + Observation.createNotStarted("ViewerDocumentService", registry).contextualName("incremental-save").observe(() -> { + try (var out = new FileOutputStream(outputFile)) { + pdDocument.save(out); + } catch (IOException e) { + throw new RuntimeException(e); + } + }); + } + + + private static void createPageResourcesIfNotPresent(PDPage pdPage) { + + PDResources resources = pdPage.getResources(); + if (resources == null) { + resources = new PDResources(); + pdPage.setResources(resources); + } + } + + + private static AffineTransform getTextDeRotationTransform(PDPage page) { + + return AffineTransform.getQuadrantRotateInstance(switch (page.getRotation()) { + case 90 -> 3; + case 180 -> 2; + case 270 -> 1; + default -> 0; + }); + } + +} diff --git a/layoutparser-service/viewer-doc-processor/src/test/java/com/knecon/fforesight/service/viewerdoc/service/ContentStreamClassifierTest.java b/layoutparser-service/viewer-doc-processor/src/test/java/com/knecon/fforesight/service/viewerdoc/service/ContentStreamClassifierTest.java new file mode 
100644 index 0000000..fd33307 --- /dev/null +++ b/layoutparser-service/viewer-doc-processor/src/test/java/com/knecon/fforesight/service/viewerdoc/service/ContentStreamClassifierTest.java @@ -0,0 +1,124 @@ +package com.knecon.fforesight.service.viewerdoc.service; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.io.File; +import java.nio.file.Files; +import java.util.List; +import java.util.Set; +import java.util.stream.Collectors; + +import org.apache.pdfbox.Loader; +import org.apache.pdfbox.cos.COSName; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.PDPage; +import org.junit.jupiter.api.Test; + +import com.knecon.fforesight.service.viewerdoc.ContentStreams; +import com.knecon.fforesight.service.viewerdoc.pdf.ClassifiedContentStream; + +import lombok.SneakyThrows; +import lombok.extern.slf4j.Slf4j; + +@Slf4j +public class ContentStreamClassifierTest { + + @Test + @SneakyThrows + public void testClassification() { + + File pdfFile = new File(Thread.currentThread().getContextClassLoader().getResource("viewerDocLayers.pdf").getFile()); + + try (PDDocument document = Loader.loadPDF(pdfFile)) { + + PDPage page = document.getPage(0); + + List classifieds = ContentStreamClassifier.getClassifiedContentStreams(page); + + logContentStreamClassifications(classifieds); + + assertEquals(11, classifieds.size()); + assertEquals(ContentStreams.ESCAPE_START, classifieds.get(0).classification()); + for (int i = 1; i < 9; i++) { + assertEquals(ContentStreams.OTHER, classifieds.get(i).classification()); + } + assertEquals(ContentStreams.ESCAPE_END, classifieds.get(9).classification()); + assertEquals(ContentStreams.KNECON_LAYOUT, classifieds.get(10).classification()); + assertTrue(ContentStreamClassifier.areAllContentStreamsEscaped(classifieds)); + } + } + + + @Test + @SneakyThrows + public void testRemoveLayoutLayer() { + + File pdfFile = new 
File(Thread.currentThread().getContextClassLoader().getResource("viewerDocLayers.pdf").getFile()); + File tmpFile = Files.createTempFile("removedLayout", ".pdf").toFile(); + + try (PDDocument document = Loader.loadPDF(pdfFile)) { + + PDPage page = document.getPage(0); + + List classifieds = ContentStreamClassifier.getClassifiedContentStreams(page); + page.setContents(ContentStreamUtility.removeLayerFromContentStreams(Set.of(ContentStreams.KNECON_LAYOUT), classifieds)); + + document.save(tmpFile); + } + try (PDDocument document2 = Loader.loadPDF(tmpFile)) { + + PDPage page2 = document2.getPage(0); + + List classifieds2 = ContentStreamClassifier.getClassifiedContentStreams(page2); + + logContentStreamClassifications(classifieds2); + + assertEquals(10, classifieds2.size()); + assertEquals(ContentStreams.ESCAPE_START, classifieds2.get(0).classification()); + for (int i = 1; i < 9; i++) { + assertEquals(ContentStreams.OTHER, classifieds2.get(i).classification()); + } + assertEquals(ContentStreams.ESCAPE_END, classifieds2.get(9).classification()); + assertTrue(ContentStreamClassifier.areAllContentStreamsEscaped(classifieds2)); + } + assert tmpFile.delete(); + } + + + @Test + @SneakyThrows + public void testClassificationForOldLayers() { + + File pdfFile = new File(Thread.currentThread().getContextClassLoader().getResource("oldViewerDocLayers.pdf").getFile()); + try (PDDocument document = Loader.loadPDF(pdfFile)) { + + PDPage page = document.getPage(0); + + List classifieds = ContentStreamClassifier.getClassifiedContentStreams(page); + + logContentStreamClassifications(classifieds); + + assertEquals(11, classifieds.size()); + assertEquals(ContentStreams.ESCAPE_START, classifieds.get(0).classification()); + for (int i = 1; i < 9; i++) { + assertEquals(ContentStreams.OTHER, classifieds.get(i).classification()); + } + assertEquals(ContentStreams.ESCAPE_END, classifieds.get(9).classification()); + assertEquals(ContentStreams.KNECON_LAYOUT, 
classifieds.get(10).classification()); + assertTrue(ContentStreamClassifier.areAllContentStreamsEscaped(classifieds)); + } + } + + + private static void logContentStreamClassifications(List classifieds) { + + log.info("number of content streams: {}", classifieds.size()); + log.info("Classifications: {}", classifieds.stream()// + .map(ClassifiedContentStream::classification)// + .map(ContentStreams.Identifier::cosName)// + .map(COSName::getName)// + .collect(Collectors.joining(", "))); + } + +} diff --git a/layoutparser-service/viewer-doc-processor/src/test/resources/log4j2-test.xml b/layoutparser-service/viewer-doc-processor/src/test/resources/log4j2-test.xml new file mode 100644 index 0000000..433988b --- /dev/null +++ b/layoutparser-service/viewer-doc-processor/src/test/resources/log4j2-test.xml @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + + + diff --git a/layoutparser-service/viewer-doc-processor/src/test/resources/oldViewerDocLayers.pdf b/layoutparser-service/viewer-doc-processor/src/test/resources/oldViewerDocLayers.pdf new file mode 100644 index 0000000..9b3f010 Binary files /dev/null and b/layoutparser-service/viewer-doc-processor/src/test/resources/oldViewerDocLayers.pdf differ diff --git a/layoutparser-service/viewer-doc-processor/src/test/resources/viewerDocLayers.pdf b/layoutparser-service/viewer-doc-processor/src/test/resources/viewerDocLayers.pdf new file mode 100644 index 0000000..8848184 Binary files /dev/null and b/layoutparser-service/viewer-doc-processor/src/test/resources/viewerDocLayers.pdf differ diff --git a/settings.gradle.kts b/settings.gradle.kts index 696c016..ca91555 100644 --- a/settings.gradle.kts +++ b/settings.gradle.kts @@ -1,13 +1,9 @@ -/* - * This file was generated by the Gradle 'init' task. - * - * This project uses @Incubating APIs which are subject to change. 
- */ - rootProject.name = "layoutparser" include(":layoutparser-service-server") include(":layoutparser-service-processor") include(":layoutparser-service-internal-api") +include("viewer-doc-processor") project(":layoutparser-service-server").projectDir = file("layoutparser-service/layoutparser-service-server") project(":layoutparser-service-processor").projectDir = file("layoutparser-service/layoutparser-service-processor") project(":layoutparser-service-internal-api").projectDir = file("layoutparser-service/layoutparser-service-internal-api") +project(":viewer-doc-processor").projectDir = file("layoutparser-service/viewer-doc-processor")