From 66d3433e04b78b02101b5038b8d5bef3f4f1807d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kilian=20Sch=C3=BCttler?= Date: Mon, 1 Jul 2024 11:13:26 +0200 Subject: [PATCH] RED-9353: use azure ocr service --- .../processor/LayoutParsingPipeline.java | 40 +-- ...tParsingServiceProcessorConfiguration.java | 13 +- .../processor/LayoutparserSettings.java | 1 + .../visualization/LayoutGridService.java | 4 +- .../PdfSegmentationServiceTest.java | 3 +- .../viewer-doc-processor/build.gradle | 1 + .../viewerdoc/model/EmbeddableFont.java | 5 + .../model/Standard14EmbeddableFont.java | 16 +- .../viewerdoc/model/VisualizationsOnPage.java | 6 + .../service/IViewerDocumentService.java | 27 ++ .../service/ViewerDocumentService.java | 24 +- .../service/pdftron/MarkedContentStack.java | 73 +++++ .../pdftron/PDFTronViewerDocumentService.java | 153 +++++++++++ .../service/pdftron/PageContentCleaner.java | 120 +++++++++ .../service/pdftron/PdftronLayerUtility.java | 96 +++++++ .../service/pdftron/VisualizationWriter.java | 249 ++++++++++++++++++ publish-custom-image.sh | 34 ++- 17 files changed, 821 insertions(+), 44 deletions(-) create mode 100644 layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/IViewerDocumentService.java create mode 100644 layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/pdftron/MarkedContentStack.java create mode 100644 layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/pdftron/PDFTronViewerDocumentService.java create mode 100644 layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/pdftron/PageContentCleaner.java create mode 100644 layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/pdftron/PdftronLayerUtility.java create mode 100644 
layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/pdftron/VisualizationWriter.java diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java index cdfe625..d981b5c 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java @@ -148,6 +148,8 @@ public class LayoutParsingPipeline { visualLayoutParsingResponse, layoutParsingRequest.identifier()); + log.info("Building document graph for {}", layoutParsingRequest.identifier()); + Document documentGraph = observeBuildDocumentGraph(settings.getLayoutParsingTypeOverride() == null // ? 
layoutParsingRequest.layoutParsingType() : settings.getLayoutParsingTypeOverride(), classificationDocument); @@ -166,7 +168,7 @@ public class LayoutParsingPipeline { layoutParsingStorageService.storeSimplifiedText(layoutParsingRequest, simplifiedSectionTextService.toSimplifiedText(documentGraph)); layoutParsingStorageService.storeViewerDocument(layoutParsingRequest, viewerDocumentFile); - if (layoutParsingRequest.layoutParsingType().equals(LayoutParsingType.CLARIFYND)) { + if (layoutParsingRequest.researchDocumentStorageId() != null) { log.info("Building research document data for {}", layoutParsingRequest.identifier()); var researchDocumentData = TaasDocumentDataMapper.fromDocument(documentGraph); layoutParsingStorageService.storeResearchDocumentData(layoutParsingRequest, researchDocumentData); @@ -257,7 +259,7 @@ public class LayoutParsingPipeline { OutlineObject lastProcessedOutlineObject = null; // parsing the structure elements could be useful as well - if(layoutParsingType != LayoutParsingType.REDACT_MANAGER_OLD && layoutParsingType != LayoutParsingType.DOCUMINE_OLD) { + if (layoutParsingType != LayoutParsingType.REDACT_MANAGER_OLD && layoutParsingType != LayoutParsingType.DOCUMINE_OLD) { classificationDocument.setOutlineObjectTree(outlineExtractorService.getOutlineObjectTree(originDocument)); } @@ -305,37 +307,23 @@ public class LayoutParsingPipeline { TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(words, cleanRulings); - List graphics = graphicExtractorService.extractPathElementGraphics(originDocument, - pdPage, - pageNumber, - cleanRulings, - stripper.getTextPositionSequences(), + List graphics = graphicExtractorService.extractPathElementGraphics(originDocument, pdPage, pageNumber, cleanRulings, stripper.getTextPositionSequences(), - false); + false); pdfImages.computeIfAbsent(pageNumber, x -> new ArrayList<>()) .addAll(graphics.stream() - .map(g -> new ClassifiedImage(new Rectangle2D.Double(g.x1, g.y1, g.width(), g.height()), - 
ImageType.GRAPHIC, - false, - stripper.getPageNumber(), - "")) + .map(g -> new ClassifiedImage(new Rectangle2D.Double(g.x1, g.y1, g.width(), g.height()), ImageType.GRAPHIC, false, stripper.getPageNumber(), "")) .toList()); ClassificationPage classificationPage = switch (layoutParsingType) { case REDACT_MANAGER_OLD -> redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings, classificationDocument.getVisualizations()); case DOCUMINE_OLD -> docuMineBlockificationService.blockify(words, cleanRulings); - case DOCUMINE, REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH -> docstrumBlockificationService.blockify(words, - cleanRulings, - true, - classificationDocument.getVisualizations(), - layoutParsingType); - case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> docstrumBlockificationService.blockify(words, - cleanRulings, - false, - classificationDocument.getVisualizations(), - layoutParsingType); + case DOCUMINE, REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH -> + docstrumBlockificationService.blockify(words, cleanRulings, true, classificationDocument.getVisualizations(), layoutParsingType); + case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> + docstrumBlockificationService.blockify(words, cleanRulings, false, classificationDocument.getVisualizations(), layoutParsingType); }; classificationPage.setCleanRulings(cleanRulings); @@ -345,7 +333,7 @@ public class LayoutParsingPipeline { classificationPage.setPageWidth(cropbox.getWidth()); classificationPage.setPageHeight(cropbox.getHeight()); - if(layoutParsingType != LayoutParsingType.REDACT_MANAGER_OLD && layoutParsingType != LayoutParsingType.DOCUMINE_OLD) { + if (layoutParsingType != LayoutParsingType.REDACT_MANAGER_OLD && layoutParsingType != LayoutParsingType.DOCUMINE_OLD) { List outlineObjects = classificationDocument.getOutlineObjectTree().getOutlineObjectsPerPage().getOrDefault(pageNumber - 1, new 
ArrayList<>()); OutlineObject notFoundOutlineObject = null; @@ -394,8 +382,8 @@ public class LayoutParsingPipeline { } log.info("Classify TextBlocks for {}", identifier); switch (layoutParsingType) { - case REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_OLD, CLARIFYND_PARAGRAPH_DEBUG, REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH -> redactManagerClassificationService.classifyDocument( - classificationDocument); + case REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_OLD, CLARIFYND_PARAGRAPH_DEBUG, REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH -> + redactManagerClassificationService.classifyDocument(classificationDocument); case DOCUMINE_OLD, DOCUMINE -> docuMineClassificationService.classifyDocument(classificationDocument); case CLARIFYND -> clarifyndClassificationService.classifyDocument(classificationDocument); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingServiceProcessorConfiguration.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingServiceProcessorConfiguration.java index 5471311..00e80f0 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingServiceProcessorConfiguration.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingServiceProcessorConfiguration.java @@ -5,6 +5,9 @@ import org.springframework.context.annotation.Bean; import org.springframework.context.annotation.ComponentScan; import org.springframework.context.annotation.Configuration; +import com.google.common.base.Strings; +import com.knecon.fforesight.service.viewerdoc.service.IViewerDocumentService; +import com.knecon.fforesight.service.viewerdoc.service.pdftron.PDFTronViewerDocumentService; import 
com.knecon.fforesight.service.viewerdoc.service.ViewerDocumentService; import io.micrometer.observation.ObservationRegistry; @@ -13,12 +16,16 @@ import io.micrometer.observation.ObservationRegistry; @ComponentScan public class LayoutParsingServiceProcessorConfiguration { - @Bean @Autowired - public ViewerDocumentService viewerDocumentService(ObservationRegistry registry) { + public IViewerDocumentService viewerDocumentService(ObservationRegistry registry, LayoutparserSettings settings) { + + if (!Strings.isNullOrEmpty(settings.getPdftronLicense())) { + return new PDFTronViewerDocumentService(registry, settings.getPdftronLicense()); + } else { + return new ViewerDocumentService(registry); + } - return new ViewerDocumentService(registry); } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutparserSettings.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutparserSettings.java index e64c8bd..7eefa33 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutparserSettings.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutparserSettings.java @@ -17,4 +17,5 @@ public class LayoutparserSettings { boolean debug; LayoutParsingType layoutParsingTypeOverride; + String pdftronLicense; } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/LayoutGridService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/LayoutGridService.java index 1a0cf8e..857bd0a 100644 --- 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/LayoutGridService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/LayoutGridService.java @@ -35,7 +35,7 @@ import com.knecon.fforesight.service.viewerdoc.model.LayoutGrid; import com.knecon.fforesight.service.viewerdoc.model.PlacedText; import com.knecon.fforesight.service.viewerdoc.model.Standard14EmbeddableFont; import com.knecon.fforesight.service.viewerdoc.model.Visualizations; -import com.knecon.fforesight.service.viewerdoc.service.ViewerDocumentService; +import com.knecon.fforesight.service.viewerdoc.service.IViewerDocumentService; import io.micrometer.observation.annotation.Observed; import lombok.AccessLevel; @@ -48,7 +48,7 @@ import lombok.experimental.FieldDefaults; @FieldDefaults(level = AccessLevel.PRIVATE, makeFinal = true) public class LayoutGridService { - ViewerDocumentService viewerDocumentService; + IViewerDocumentService viewerDocumentService; static float FONT_SIZE = 10f; static float LINE_WIDTH = 1f; diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java index 1995d69..b01dc27 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java @@ -190,7 +190,8 @@ public class PdfSegmentationServiceTest extends AbstractTest { imageMetadata.getGeometry().getHeight()), 
ImageType.valueOf(imageMetadata.getClassification().getLabel().toUpperCase(Locale.ROOT)), imageMetadata.isAlpha(), - imageMetadata.getPosition().getPageNumber(), ""))); + imageMetadata.getPosition().getPageNumber(), + ""))); System.out.println("object"); } diff --git a/layoutparser-service/viewer-doc-processor/build.gradle b/layoutparser-service/viewer-doc-processor/build.gradle index 4a2f2c4..cdfe7e4 100644 --- a/layoutparser-service/viewer-doc-processor/build.gradle +++ b/layoutparser-service/viewer-doc-processor/build.gradle @@ -12,6 +12,7 @@ dependencies { implementation("org.apache.pdfbox:pdfbox-tools:${pdfBoxVersion}") implementation("org.slf4j:slf4j-api:1.7.25") implementation("com.knecon.fforesight:tracing-commons:0.5.0") + implementation("com.pdftron:PDFNet:10.5.0") testImplementation("org.apache.logging.log4j:log4j-slf4j-impl:2.22.1") testImplementation("org.junit.jupiter:junit-jupiter") diff --git a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/model/EmbeddableFont.java b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/model/EmbeddableFont.java index f7e878f..ee5388a 100644 --- a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/model/EmbeddableFont.java +++ b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/model/EmbeddableFont.java @@ -3,8 +3,13 @@ package com.knecon.fforesight.service.viewerdoc.model; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.font.PDFont; +import com.pdftron.pdf.Font; +import com.pdftron.pdf.PDFDoc; + public interface EmbeddableFont { PDFont embed(PDDocument document); + Font embed(PDFDoc doc); + } diff --git a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/model/Standard14EmbeddableFont.java 
b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/model/Standard14EmbeddableFont.java index 5431602..a12dbd2 100644 --- a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/model/Standard14EmbeddableFont.java +++ b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/model/Standard14EmbeddableFont.java @@ -1,10 +1,15 @@ package com.knecon.fforesight.service.viewerdoc.model; +import java.util.Objects; + import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.font.PDFont; import org.apache.pdfbox.pdmodel.font.PDType1Font; import org.apache.pdfbox.pdmodel.font.Standard14Fonts; +import com.pdftron.pdf.Font; +import com.pdftron.pdf.PDFDoc; + import lombok.RequiredArgsConstructor; import lombok.SneakyThrows; @@ -13,10 +18,11 @@ public class Standard14EmbeddableFont implements EmbeddableFont { private final PDType1Font font; + private final int pdfTronIdentifier; public static Standard14EmbeddableFont helvetica() { - return new Standard14EmbeddableFont(new PDType1Font(Standard14Fonts.FontName.HELVETICA)); + return new Standard14EmbeddableFont(new PDType1Font(Standard14Fonts.FontName.HELVETICA), Font.e_helvetica); } @@ -34,4 +40,12 @@ public class Standard14EmbeddableFont implements EmbeddableFont { return font; } + + @Override + @SneakyThrows + public Font embed(PDFDoc document) { + + return Font.create(document, pdfTronIdentifier); + } + } diff --git a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/model/VisualizationsOnPage.java b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/model/VisualizationsOnPage.java index a945708..0001805 100644 --- a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/model/VisualizationsOnPage.java +++ 
b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/model/VisualizationsOnPage.java @@ -23,4 +23,10 @@ public class VisualizationsOnPage { @Builder.Default List filledRectangles = new LinkedList<>(); + + public boolean isEmpty() { + + return placedTexts.isEmpty() && coloredLines.isEmpty() && coloredRectangles.isEmpty() && filledRectangles.isEmpty(); + } + } diff --git a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/IViewerDocumentService.java b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/IViewerDocumentService.java new file mode 100644 index 0000000..76bfd37 --- /dev/null +++ b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/IViewerDocumentService.java @@ -0,0 +1,27 @@ +package com.knecon.fforesight.service.viewerdoc.service; + +import java.io.File; +import java.util.List; + +import com.knecon.fforesight.service.viewerdoc.ContentStreams; +import com.knecon.fforesight.service.viewerdoc.model.Visualizations; + +import io.micrometer.observation.ObservationRegistry; + +public interface IViewerDocumentService { + + void addVisualizationsOnPage(File originFile, File destinationFile, List visualizations); + + default void enrichObservation(ObservationRegistry registry, int numberOfPages, List layers) { + + if (registry == null || registry.getCurrentObservation() == null || registry.isNoop()) { + return; + } + registry.getCurrentObservation().highCardinalityKeyValue("numberOfPages", String.valueOf(numberOfPages)); + for (int i = 0; i < layers.size(); i++) { + ContentStreams.Identifier layer = layers.get(i); + + registry.getCurrentObservation().highCardinalityKeyValue("layer_" + i, String.valueOf(layer.name())); + } + } +} diff --git 
a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/ViewerDocumentService.java b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/ViewerDocumentService.java index 04233da..9fd713c 100644 --- a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/ViewerDocumentService.java +++ b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/ViewerDocumentService.java @@ -48,7 +48,7 @@ import lombok.extern.slf4j.Slf4j; @Slf4j @RequiredArgsConstructor -public class ViewerDocumentService { +public class ViewerDocumentService implements IViewerDocumentService { private final ObservationRegistry registry; @@ -64,7 +64,8 @@ public class ViewerDocumentService { PDDocument pdDocument = openPDDocument(tmpFile.toFile()); - enrichObservation(pdDocument, + enrichObservation(registry, + pdDocument.getNumberOfPages(), visualizations.stream() .map(Visualizations::getLayer) .toList()); @@ -106,7 +107,11 @@ public class ViewerDocumentService { contentStream.saveGraphicsState(); - drawVisualizationsToContentStream(pdDocument, visualization.getVisualizationsOnPages().get(pageNumber), contentStream, textDeRotationMatrix); + drawVisualizationsToContentStream(pdDocument, + visualization.getVisualizationsOnPages() + .get(pageNumber), + contentStream, + textDeRotationMatrix); contentStream.restoreGraphicsState(); @@ -185,8 +190,10 @@ public class ViewerDocumentService { contentStream.setFont(font, placedText.fontSize()); contentStream.beginText(); contentStream.setNonStrokingColor(placedText.color()); - if (placedText.renderingMode().isPresent()) { - contentStream.setRenderingMode(placedText.renderingMode().get()); + if (placedText.renderingMode() + .isPresent()) { + contentStream.setRenderingMode(placedText.renderingMode() + .get()); } else { 
contentStream.setRenderingMode(RenderingMode.FILL); } @@ -198,12 +205,12 @@ public class ViewerDocumentService { } - private void enrichObservation(PDDocument pdDocument, List layers) { + private void enrichObservation(int numberOfPages, List layers) { if (registry == null || registry.getCurrentObservation() == null || registry.isNoop()) { return; } - registry.getCurrentObservation().highCardinalityKeyValue("numberOfPages", String.valueOf(pdDocument.getNumberOfPages())); + registry.getCurrentObservation().highCardinalityKeyValue("numberOfPages", String.valueOf(numberOfPages)); for (int i = 0; i < layers.size(); i++) { ContentStreams.Identifier layer = layers.get(i); @@ -234,7 +241,8 @@ public class ViewerDocumentService { (float) placedText.lineStart().getX(), (float) placedText.lineStart().getY()); } else { - textMatrix = placedText.textMatrix().get(); + textMatrix = placedText.textMatrix() + .get(); } return textMatrix; } diff --git a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/pdftron/MarkedContentStack.java b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/pdftron/MarkedContentStack.java new file mode 100644 index 0000000..eedd1fe --- /dev/null +++ b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/pdftron/MarkedContentStack.java @@ -0,0 +1,73 @@ +package com.knecon.fforesight.service.viewerdoc.service.pdftron; + +import java.util.Deque; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.Set; + +public class MarkedContentStack { + + private final Deque stack = new LinkedList<>(); + + + public void enterMarkedContent(String name) { + + stack.push(new MarkedContent(name)); + } + + + public void leaveMarkedContent() { + + stack.pop(); + } + + + public String currentMarkedContent() { + + if (stack.isEmpty()) { + return ""; + } + return stack.peek().name(); + } + + + public 
boolean currentMarkedContentContains(String name) { + + Iterator markedContentIterator = stack.descendingIterator(); + while (markedContentIterator.hasNext()) { + var markedContent = markedContentIterator.next(); + if (markedContent.name().equals(name)) { + return true; + } + } + return false; + } + + + public boolean currentMarkedContentContainsAny(Set names) { + + if (stack.isEmpty()) { + return false; + } + Iterator markedContentIterator = stack.descendingIterator(); + while (markedContentIterator.hasNext()) { + var markedContent = markedContentIterator.next(); + if (names.contains(markedContent.name())) { + return true; + } + } + return false; + } + + + public void clear() { + + stack.clear(); + } + + + private record MarkedContent(String name) { + + } + +} \ No newline at end of file diff --git a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/pdftron/PDFTronViewerDocumentService.java b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/pdftron/PDFTronViewerDocumentService.java new file mode 100644 index 0000000..424b02c --- /dev/null +++ b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/pdftron/PDFTronViewerDocumentService.java @@ -0,0 +1,153 @@ +package com.knecon.fforesight.service.viewerdoc.service.pdftron; + +import java.io.File; +import java.io.FileInputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.StandardCopyOption; +import java.util.Collection; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.function.Function; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import org.apache.pdfbox.cos.COSName; + +import com.knecon.fforesight.service.viewerdoc.ContentStreams; +import com.knecon.fforesight.service.viewerdoc.model.EmbeddableFont; +import 
com.knecon.fforesight.service.viewerdoc.model.PlacedText; +import com.knecon.fforesight.service.viewerdoc.model.Visualizations; +import com.knecon.fforesight.service.viewerdoc.model.VisualizationsOnPage; +import com.knecon.fforesight.service.viewerdoc.service.IViewerDocumentService; +import com.pdftron.pdf.ElementBuilder; +import com.pdftron.pdf.ElementReader; +import com.pdftron.pdf.ElementWriter; +import com.pdftron.pdf.Font; +import com.pdftron.pdf.PDFDoc; +import com.pdftron.pdf.PDFNet; +import com.pdftron.pdf.Page; +import com.pdftron.pdf.PageIterator; +import com.pdftron.pdf.ocg.Group; +import com.pdftron.sdf.SDFDoc; + +import io.micrometer.observation.ObservationRegistry; +import io.micrometer.observation.annotation.Observed; +import lombok.RequiredArgsConstructor; +import lombok.SneakyThrows; +import lombok.extern.slf4j.Slf4j; + +@Slf4j +@RequiredArgsConstructor +public class PDFTronViewerDocumentService implements IViewerDocumentService { + + private final ObservationRegistry registry; + private final String pdftronLicense; + + + @Override + @Observed(name = "PDFTronViewerDocumentService", contextualName = "add-visualizations") + @SneakyThrows + public synchronized void addVisualizationsOnPage(File originFile, File destinationFile, List visualizations) { + + PDFNet.initialize(pdftronLicense); + + // originFile and destinationFile might be the same, so we use a temp file. 
+ // Otherwise, saving the document might corrupt the file + Path tmpFile = Files.createTempFile("tmpViewerDocument", ".pdf"); + Files.copy(originFile.toPath(), tmpFile, StandardCopyOption.REPLACE_EXISTING); + + try (PDFDoc pdfDoc = loadPdfDoc(tmpFile);// + ElementWriter pageWriter = new ElementWriter();// + ElementReader reader = new ElementReader();// + ElementBuilder builder = new ElementBuilder()// + ) { + enrichObservation(registry, + pdfDoc.getPageCount(), + visualizations.stream() + .map(Visualizations::getLayer) + .toList()); + + Map groupMap = PdftronLayerUtility.addLayersToDocument(visualizations, pdfDoc); + + Map fontMap = buildFontMap(visualizations, pdfDoc); + + Set markedContentToDraw = extractMarkedContentNames(visualizations.stream() + .map(Visualizations::getLayer)); + + Set kneconMarkedContents = extractMarkedContentNames(ContentStreams.allContentStreams.stream()); + + PageContentCleaner pageContentCleaner = PageContentCleaner.builder() + .writer(pageWriter) + .reader(reader) + .elementBuilder(builder) + .markedContentToDraw(markedContentToDraw) + .kneconMarkedContents(kneconMarkedContents) + .build(); + + VisualizationWriter visualizationWriter = VisualizationWriter.builder() + .writer(pageWriter) + .builder(builder) + .groupMap(groupMap) + .visualizations(visualizations) + .fontMap(fontMap) + .build(); + + int pageNumber = 0; + for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); pageNumber++) { + + Page page = iterator.next(); + + pageContentCleaner.cleanPage(page); + + visualizationWriter.drawVisualizationsOnPage(pageNumber, page); + + } + + saveDocument(pdfDoc, destinationFile); + } + + PDFNet.terminate(); + } + + + private static Set extractMarkedContentNames(Stream visualizations) { + + return visualizations.map(ContentStreams.Identifier::cosName) + .map(COSName::getName) + .collect(Collectors.toSet()); + } + + + private static Map buildFontMap(List visualizations, PDFDoc pdfDoc) { + + return visualizations.stream() + 
.map(Visualizations::getVisualizationsOnPages) + .map(Map::values) + .flatMap(Collection::stream) + .map(VisualizationsOnPage::getPlacedTexts) + .flatMap(Collection::stream) + .map(PlacedText::font) + .distinct() + .collect(Collectors.toMap(Function.identity(), font -> font.embed(pdfDoc))); + + } + + + @SneakyThrows + private void saveDocument(PDFDoc doc, File target) { + + doc.save(target.toString(), SDFDoc.SaveMode.REMOVE_UNUSED, null); + } + + + @SneakyThrows + private static PDFDoc loadPdfDoc(Path tmpFile) { + + try (var in = new FileInputStream(tmpFile.toFile())) { + return new PDFDoc(in); + } + } + +} diff --git a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/pdftron/PageContentCleaner.java b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/pdftron/PageContentCleaner.java new file mode 100644 index 0000000..d6615f4 --- /dev/null +++ b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/pdftron/PageContentCleaner.java @@ -0,0 +1,120 @@ +package com.knecon.fforesight.service.viewerdoc.service.pdftron; + +import java.util.Set; + +import com.pdftron.pdf.Element; +import com.pdftron.pdf.ElementBuilder; +import com.pdftron.pdf.ElementReader; +import com.pdftron.pdf.ElementWriter; +import com.pdftron.pdf.Page; + +import lombok.AccessLevel; +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.SneakyThrows; +import lombok.experimental.FieldDefaults; + +@Builder +@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) +public class PageContentCleaner { + + ElementWriter writer; + ElementReader reader; + ElementBuilder elementBuilder; + Set markedContentToDraw; + Set kneconMarkedContents; + + @Builder.Default + MarkedContentStack markedContentStack = new MarkedContentStack(); + + + @SneakyThrows + public void cleanPage(Page page) { + + begin(page); + boolean escaped = 
reader.next().getType() == Element.e_group_begin; + + if (!escaped) { + writer.writeElement(elementBuilder.createGroupBegin()); + } + + copyElementsUntilFirstKneconMarkedContent(); + + if (!escaped) { + writer.writeElement(elementBuilder.createGroupEnd()); + } + + copyElementsExceptMarkedContentToDraw(); + end(); + } + + + @SneakyThrows + private void begin(Page page) { + + writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict()); + reader.begin(page); + } + + + @SneakyThrows + private void end() { + + writer.end(); + reader.end(); + } + + + @SneakyThrows + private void copyElementsUntilFirstKneconMarkedContent() { + + for (Element element = reader.current(); element != null; element = reader.next()) { + + switch (element.getType()) { + case Element.e_marked_content_begin -> { + markedContentStack.enterMarkedContent(element.getMCTag().getName()); + if (markedContentStack.currentMarkedContentContainsAny(kneconMarkedContents)) { + break; + } + writer.writeElement(element); + } + case Element.e_marked_content_end -> { + markedContentStack.leaveMarkedContent(); + writer.writeElement(element); + } + default -> writer.writeElement(element); + } + } + } + + + @SneakyThrows + private void copyElementsExceptMarkedContentToDraw() { + + for (Element element = reader.current(); element != null; element = reader.next()) { + + switch (element.getType()) { + case Element.e_marked_content_begin -> { + markedContentStack.enterMarkedContent(element.getMCTag().getName()); + if (!markedContentStack.currentMarkedContentContainsAny(markedContentToDraw)) { + writer.writeElement(element); + } + } + case Element.e_marked_content_end -> { + if (!markedContentStack.currentMarkedContentContainsAny(markedContentToDraw)) { + writer.writeElement(element); + } + markedContentStack.leaveMarkedContent(); + } + default -> { + if (!markedContentStack.currentMarkedContentContainsAny(markedContentToDraw)) { + writer.writeElement(element); + } + } + } + + } + + } + +} diff 
--git a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/pdftron/PdftronLayerUtility.java b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/pdftron/PdftronLayerUtility.java
new file mode 100644
index 0000000..0359196
--- /dev/null
+++ b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/pdftron/PdftronLayerUtility.java
@@ -0,0 +1,106 @@
+package com.knecon.fforesight.service.viewerdoc.service.pdftron;
+
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+
+import com.knecon.fforesight.service.viewerdoc.ContentStreams;
+import com.knecon.fforesight.service.viewerdoc.model.Visualizations;
+import com.pdftron.common.PDFNetException;
+import com.pdftron.pdf.PDFDoc;
+import com.pdftron.pdf.ocg.Config;
+import com.pdftron.pdf.ocg.Group;
+import com.pdftron.sdf.Obj;
+
+import lombok.SneakyThrows;
+import lombok.experimental.UtilityClass;
+
+/**
+ * Registers the optional content groups (layers) that visualizations are drawn into on a PDFTron document.
+ */
+@UtilityClass
+public class PdftronLayerUtility {
+
+    /**
+     * Makes sure the document has a group for every visualization whose layer is optional content.
+     *
+     * @param visualizations visualizations whose layers are looked up or created
+     * @param pdfDoc         document the groups belong to
+     * @return the group of each optional-content layer, keyed by its layer identifier
+     */
+    public Map<ContentStreams.Identifier, Group> addLayersToDocument(List<Visualizations> visualizations, PDFDoc pdfDoc) {
+
+        Map<ContentStreams.Identifier, Group> optionalContentGroupMap = new HashMap<>();
+        for (Visualizations visualization : visualizations) {
+            addLayerToDocument(visualization.getLayer(), pdfDoc, visualization.isLayerVisibilityDefaultValue())//
+                    .ifPresent(ocg -> optionalContentGroupMap.put(visualization.getLayer(), ocg));
+        }
+        return optionalContentGroupMap;
+    }
+
+
+    /** Returns the group for {@code layer}, or empty when the layer is not optional content. */
+    private Optional<Group> addLayerToDocument(ContentStreams.Identifier layer, PDFDoc pdfDoc, boolean layerVisibilityDefaultValue) {
+
+        if (!layer.optionalContent()) {
+            return Optional.empty();
+        }
+        return Optional.of(addLayerToDocument(pdfDoc, layer.name(), layerVisibilityDefaultValue));
+    }
+
+
+    /** Reuses the group already named {@code layerName} when the document has one, otherwise creates it. */
+    @SneakyThrows
+    private Group addLayerToDocument(PDFDoc doc, String layerName, boolean layerVisibilityDefaultValue) {
+
+        Optional<Group> existingGroup = findGroupInDoc(doc, layerName);
+        if (existingGroup.isPresent()) {
+            return existingGroup.get();
+        }
+        return addNewLayer(doc, layerName, layerVisibilityDefaultValue);
+    }
+
+
+    /** Creates the group, sets its initial state and appends it to the configuration's order array. */
+    private Group addNewLayer(PDFDoc doc, String layerName, boolean layerVisibilityDefaultValue) throws PDFNetException {
+
+        Config cfg = doc.getOCGConfig();
+        if (cfg == null) {
+            cfg = Config.create(doc, true);
+            cfg.setName("Default");
+        }
+        Group grp = Group.create(doc, layerName);
+        grp.setInitialState(cfg, layerVisibilityDefaultValue);
+
+        // Add the new OCG to the list of layers that should appear in PDF viewer GUI.
+        Obj layerOrderArray = cfg.getOrder();
+        if (layerOrderArray == null) {
+            layerOrderArray = doc.createIndirectArray();
+            cfg.setOrder(layerOrderArray);
+        }
+        layerOrderArray.pushBack(grp.getSDFObj());
+
+        return grp;
+    }
+
+
+    /** Searches the document's OCG array for a group whose name equals {@code layerName}. */
+    @SneakyThrows
+    private Optional<Group> findGroupInDoc(PDFDoc doc, String layerName) {
+
+        Obj ocgs = doc.getOCGs();
+        if (ocgs == null) {
+            return Optional.empty();
+        }
+        int sz = (int) ocgs.size();
+        for (int i = 0; i < sz; ++i) {
+            Group ocg = new Group(ocgs.getAt(i));
+            if (ocg.getName().equals(layerName)) {
+                return Optional.of(ocg);
+            }
+        }
+        return Optional.empty();
+    }
+
+}
diff --git a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/pdftron/VisualizationWriter.java b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/pdftron/VisualizationWriter.java
new file mode 100644
index 0000000..cf68d61
--- /dev/null
+++ b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/pdftron/VisualizationWriter.java
@@ -0,0 +1,249 @@
+package com.knecon.fforesight.service.viewerdoc.service.pdftron;
+
+import java.awt.geom.AffineTransform;
+import java.awt.geom.Line2D;
+import java.awt.geom.Rectangle2D;
+import java.util.List;
+import java.util.Map;
+
+import com.knecon.fforesight.service.viewerdoc.ContentStreams;
+import
com.knecon.fforesight.service.viewerdoc.model.ColoredLine;
+import com.knecon.fforesight.service.viewerdoc.model.ColoredRectangle;
+import com.knecon.fforesight.service.viewerdoc.model.EmbeddableFont;
+import com.knecon.fforesight.service.viewerdoc.model.FilledRectangle;
+import com.knecon.fforesight.service.viewerdoc.model.PlacedText;
+import com.knecon.fforesight.service.viewerdoc.model.Visualizations;
+import com.knecon.fforesight.service.viewerdoc.model.VisualizationsOnPage;
+import com.pdftron.common.Matrix2D;
+import com.pdftron.common.PDFNetException;
+import com.pdftron.pdf.ColorPt;
+import com.pdftron.pdf.ColorSpace;
+import com.pdftron.pdf.Element;
+import com.pdftron.pdf.ElementBuilder;
+import com.pdftron.pdf.ElementWriter;
+import com.pdftron.pdf.Font;
+import com.pdftron.pdf.GState;
+import com.pdftron.pdf.Page;
+import com.pdftron.pdf.ocg.Group;
+
+import lombok.AccessLevel;
+import lombok.Builder;
+import lombok.SneakyThrows;
+import lombok.experimental.FieldDefaults;
+
+@Builder
+@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
+public class VisualizationWriter {
+    // Draws the visualizations of every layer onto a page through the configured ElementWriter and ElementBuilder.
+    ElementWriter writer;
+    ElementBuilder builder;
+    List<Visualizations> visualizations;
+    Map<ContentStreams.Identifier, Group> groupMap;
+    Map<EmbeddableFont, Font> fontMap;
+
+    /** Writes each layer's visualizations for {@code pageNumber} onto {@code page} as marked content; layers with nothing on that page are skipped. */
+    @SneakyThrows
+    public void drawVisualizationsOnPage(int pageNumber, Page page) {
+
+        begin(page);
+
+        AffineTransform textDeRotationMatrix = getTextDeRotationTransform(page);
+
+        for (Visualizations visualization : visualizations) {
+
+            VisualizationsOnPage visualizationsOnPage = visualization.getVisualizationsOnPages()
+                    .get(pageNumber);
+
+            if (visualizationsOnPage == null || visualizationsOnPage.isEmpty()) {
+                continue;
+            }
+
+            Element markedContentStart = builder.createMarkedContentBeginInlineProperties(visualization.getLayer().cosName().getName());
+            writer.writeElement(markedContentStart);
+
+            if (visualization.getLayer().optionalContent()) {
+                Element ocgStart = builder.createMarkedContentBegin("OC", groupMap.get(visualization.getLayer()).getSDFObj());
+                writer.writeElement(ocgStart);
+            }
+
+            writeVisualization(visualizationsOnPage, textDeRotationMatrix);
+
+            if (visualization.getLayer().optionalContent()) {
+                Element ocgEnd = builder.createMarkedContentEnd();
+                writer.writeElement(ocgEnd);
+            }
+
+            Element markedContentEnd = builder.createMarkedContentEnd();
+            writer.writeElement(markedContentEnd);
+
+        }
+
+        end();
+
+    }
+
+    /** Finishes the write that {@link #begin(Page)} started. */
+    private void end() throws PDFNetException {
+
+        writer.end();
+    }
+
+    /** Starts writing an overlay on {@code page}, passing the page's own resource dictionary to the writer. */
+    private void begin(Page page) throws PDFNetException {
+
+        writer.begin(page, ElementWriter.e_overlay, false, true, page.getResourceDict());
+    }
+
+    /** Writes one layer's lines, rectangles, filled rectangles and texts, in that order, optionally preceded by an empty clip rectangle. */
+    @SneakyThrows
+    private void writeVisualization(VisualizationsOnPage visualizationsOnPage, AffineTransform textDeRotationMatrix) {
+
+        if (visualizationsOnPage.isMakePathsInvisible()) {
+            Element rect = builder.createRect(0, 0, 0, 0);
+            rect.setPathClip(true);
+            writer.writeElement(rect);
+        }
+
+        for (ColoredLine coloredLine : visualizationsOnPage.getColoredLines()) {
+
+            drawColoredLine(coloredLine);
+        }
+
+        for (ColoredRectangle coloredRectangle : visualizationsOnPage.getColoredRectangles()) {
+
+            drawColoredRectangle(coloredRectangle);
+        }
+
+        for (FilledRectangle filledRectangle : visualizationsOnPage.getFilledRectangles()) {
+
+            drawFilledRectangle(filledRectangle);
+        }
+
+        for (PlacedText placedText : visualizationsOnPage.getPlacedTexts()) {
+
+            writePlacedText(textDeRotationMatrix, placedText);
+        }
+    }
+
+    /** Writes one text run with its font, size, text render mode (or RGB fill colour) and text matrix. */
+    private void writePlacedText(AffineTransform textDeRotationMatrix, PlacedText placedText) throws PDFNetException {
+
+        float[] rgbComponents = placedText.color().getRGBColorComponents(null);
+        Font font = fontMap.get(placedText.font());
+
+        Element text = builder.createTextRun(placedText.text(), font, placedText.fontSize());
+
+        // The render mode is a text state property; setRenderingIntent would set the colour rendering intent instead.
+        if (placedText.renderingMode().isPresent()) {
+            text.getGState().setTextRenderMode(placedText.renderingMode().get().intValue());
+        } else {
+            // Declare the colour space explicitly, as drawFilledRectangle does, so the three components are read as RGB.
+            text.getGState().setFillColorSpace(ColorSpace.createDeviceRGB());
+            try (ColorPt color = new ColorPt(rgbComponents[0], rgbComponents[1], rgbComponents[2])) {
+                text.getGState().setFillColor(color);
+            }
+            text.getGState().setTextRenderMode(GState.e_fill_text);
+        }
+
+        try (Matrix2D textMatrix = getTextMatrix(placedText, textDeRotationMatrix)) {
+            text.setTextMatrix(textMatrix);
+        }
+
+        writer.writeElement(text);
+    }
+
+    /** Writes an unstroked rectangle filled with the rectangle's RGB colour at its alpha opacity. */
+    private void drawFilledRectangle(FilledRectangle filledRectangle) throws PDFNetException {
+
+        float[] rgbComponents = filledRectangle.color().getRGBColorComponents(null);
+        Rectangle2D r = filledRectangle.rectangle2D();
+
+        Element rect = builder.createRect(r.getX(), r.getY(), r.getWidth(), r.getHeight());
+
+        rect.setPathFill(true);
+        rect.setPathStroke(false);
+        rect.getGState().setFillColorSpace(ColorSpace.createDeviceRGB());
+        rect.getGState().setFillOpacity(filledRectangle.alpha());
+
+        try (ColorPt color = new ColorPt(rgbComponents[0], rgbComponents[1], rgbComponents[2])) {
+            rect.getGState().setFillColor(color);
+        }
+
+        writer.writeElement(rect);
+    }
+
+    /** Writes an unfilled rectangle stroked with the rectangle's RGB colour and line width. */
+    private void drawColoredRectangle(ColoredRectangle coloredRectangle) throws PDFNetException {
+
+        float[] rgbComponents = coloredRectangle.color().getRGBColorComponents(null);
+        Rectangle2D r = coloredRectangle.rectangle2D();
+
+        Element rect = builder.createRect(r.getX(), r.getY(), r.getWidth(), r.getHeight());
+
+        rect.setPathStroke(true);
+        rect.setPathFill(false);
+        rect.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
+        rect.getGState().setLineWidth(coloredRectangle.lineWidth());
+
+        try (ColorPt color = new ColorPt(rgbComponents[0], rgbComponents[1], rgbComponents[2])) {
+            rect.getGState().setStrokeColor(color);
+        }
+
+        writer.writeElement(rect);
+    }
+
+    /** Writes a single stroked line segment with the line's RGB colour and line width. */
+    private void drawColoredLine(ColoredLine coloredLine) throws PDFNetException {
+
+        float[] rgbComponents = coloredLine.color().getRGBColorComponents(null);
+        Line2D l = coloredLine.line();
+
+        builder.pathBegin();
+        builder.moveTo(l.getX1(), l.getY1());
+        builder.lineTo(l.getX2(), l.getY2());
+        Element line = builder.pathEnd();
+
+        line.setPathStroke(true);
+        line.setPathFill(false);
+        line.getGState().setLineWidth(coloredLine.lineWidth());
+        line.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
+
+        try (ColorPt color = new ColorPt(rgbComponents[0], rgbComponents[1], rgbComponents[2])) {
+            line.getGState().setStrokeColor(color);
+        }
+        writer.writeElement(line);
+    }
+
+    /** Uses the placed text's own matrix when present; otherwise combines the de-rotation coefficients with the text's line start. */
+    @SneakyThrows
+    private static Matrix2D getTextMatrix(PlacedText placedText, AffineTransform textDeRotationMatrix) {
+        // NOTE(review): AffineTransform shearX is m01 and shearY is m10 - confirm this is the b/c order Matrix2D expects.
+        Matrix2D textMatrix;
+        if (placedText.textMatrix().isEmpty()) {
+            textMatrix = new Matrix2D(textDeRotationMatrix.getScaleX(),
+                    textDeRotationMatrix.getShearX(),
+                    textDeRotationMatrix.getShearY(),
+                    textDeRotationMatrix.getScaleY(),
+                    placedText.lineStart().getX(),
+                    placedText.lineStart().getY());
+        } else {
+            var matrix = placedText.textMatrix()
+                    .get();
+            textMatrix = new Matrix2D(matrix.getScaleX(), matrix.getShearX(), matrix.getShearY(), matrix.getScaleY(), matrix.getTranslateX(), matrix.getTranslateY());
+        }
+        return textMatrix;
+    }
+
+    /** Maps the page rotation to quadrant turns (90 -> 3, 180 -> 2, 270 -> 1); getRotation() yields Page.e_* constants, not degrees. */
+    @SneakyThrows
+    private static AffineTransform getTextDeRotationTransform(Page page) {
+
+        return AffineTransform.getQuadrantRotateInstance(switch (page.getRotation()) {
+            case Page.e_90 -> 3;
+            case Page.e_180 -> 2;
+            case Page.e_270 -> 1;
+            default -> 0;
+        });
+    }
+
+}
diff --git a/publish-custom-image.sh b/publish-custom-image.sh
index e2191d7..2411d1f 100755
--- a/publish-custom-image.sh
+++ b/publish-custom-image.sh
@@ -1,5 +1,7 @@
 #!/bin/bash
-dir=${PWD##*/}
+
+set -e
+
 gradle assemble
 
 # Get the current Git branch
@@ -11,5 +13,31 @@ commit_hash=$(git rev-parse --short=5 HEAD)
 # Combine branch and commit hash
 buildName="${USER}-${branch}-${commit_hash}"
 
-gradle bootBuildImage --publishImage -PbuildbootDockerHostNetwork=true -Pversion=$buildName --no-build-cache
-echo "nexus.knecon.com:5001/ff/layoutparser-service-server:$buildName"
+gradle bootBuildImage --publishImage -PbuildbootDockerHostNetwork=true -Pversion=${buildName}
+
+newImageName="nexus.knecon.com:5001/ff/layoutparser-service-server:${buildName}"
+
+echo "full image name:"
+echo "${newImageName}"
+echo ""
+# Without a namespace argument the script stops after building and publishing the image.
+if [ -z "$1" ]; then
+  exit 0
+fi
+
+namespace="${1}"
+deployment_name="layoutparser-service"
+
+echo "deploying to ${namespace}"
+# Read only the container that 'set image' below updates; containers[*] joins every image and would never compare equal.
+oldImageName=$(rancher kubectl -n "${namespace}" get deployment "${deployment_name}" -o=jsonpath="{.spec.template.spec.containers[?(@.name==\"${deployment_name}\")].image}")
+
+if [ "${newImageName}" = "${oldImageName}" ]; then
+  echo "Image tag did not change, redeploying..."
+  rancher kubectl rollout restart deployment "${deployment_name}" -n "${namespace}"
+else
+  echo "upgrading the image tag..."
+  rancher kubectl set image "deployment/${deployment_name}" "${deployment_name}=${newImageName}" -n "${namespace}"
+fi
+rancher kubectl rollout status deployment "${deployment_name}" -n "${namespace}"
+echo "Built ${deployment_name}:${buildName} and deployed to ${namespace}"