RED-9974: wip

2024-09-13 12:36:59 +02:00 · 2024-09-13 12:36:59 +02:00 · 1337c56591
commit 1337c56591
parent 31bf4ba8c8
8 changed files with 166 additions and 52 deletions
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java
@ -40,6 +40,8 @@ public class TextPageBlock extends AbstractPageBlock {

    private double mostPopularWordSpaceWidth;

+    private boolean underlined;
+
    private double highestFontSize;

    private PageBlockType classification;
@ -140,6 +142,9 @@ public class TextPageBlock extends AbstractPageBlock {
        setMostPopularWordHeight(lineHeightFrequencyCounter.getMostPopular());
        setMostPopularWordSpaceWidth(spaceFrequencyCounter.getMostPopular());
        setHighestFontSize(fontSizeFrequencyCounter.getHighest());
+
+        setUnderlined(sequences.stream()
+                              .allMatch(TextPositionSequence::isUnderline));
    }


@ -199,19 +204,7 @@ public class TextPageBlock extends AbstractPageBlock {
    @Override
    public String toString() {

-        StringBuilder builder = new StringBuilder();
-
-        for (int i = 0; i < sequences.size(); i++) {
-            String sequenceAsString = sequences.get(i).toString();
-            // Fix for missing Whitespace. This is recognized in getSequences method. See PDFTextStripper Line 1730.
-            if (i != 0 && sequences.get(i - 1).charAt(sequences.get(i - 1).length() - 1) != ' ' && sequenceAsString.charAt(0) != ' ') {
-                builder.append(' ');
-            }
-            builder.append(sequenceAsString);
-        }
-
-        return builder.toString();
-
+        return getText();
    }


--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/RedactManagerBlockificationService.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/RedactManagerBlockificationService.java
@ -161,7 +161,6 @@ public class RedactManagerBlockificationService {
        }
        if (!textPositions.isEmpty()) {
            visualizations.addTextBlockVisualizations(chunkBlockList.stream()
-                                                              .map(tb -> (TextPageBlock) tb)
                                                              .toList(), textPositions.get(0).getPage());
        }

--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java
@ -1,5 +1,6 @@
 package com.knecon.fforesight.service.layoutparser.processor.services.classification;

+import java.util.ArrayList;
 import java.util.List;
 import java.util.Locale;
 import java.util.regex.Matcher;
@ -24,9 +25,17 @@ import lombok.extern.slf4j.Slf4j;
@RequiredArgsConstructor
 public class DocuMineClassificationService {

-    private static final Pattern HEADLINE_WITH_IDENTIFER_PATTERN = Pattern.compile("^([1-9]\\d?\\.){1,3}\\d{1,2}\\.?\\s[0-9A-Za-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE);
-    private static final Pattern AT_LEAST_3_PATTERN = Pattern.compile("\\p{L}{3,}", Pattern.CASE_INSENSITIVE);
-    private static final Pattern HEADLINE_PATTTERN_WITH_SLASHES = Pattern.compile("^(\\d{1,1}\\.){1,3}\\d{1,2}\\.?\\s[a-z]{1,2}\\/[a-z]{1,2}.*");
+    private static final Pattern HEADLINE_WITH_2_IDENTIFER_PATTERN = Pattern.compile("^([1-9]\\d?\\.){1,3}\\d{1,2}\\.?\\s[a-z][0-9a-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE);
+    private static final Pattern HEADLINE_WITH_SINGLE_IDENTIFER_PATTERN = Pattern.compile("^([0-9]\\.)\\s[a-z][0-9a-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE);
+    private static final Pattern AT_LEAST_3_CHARS_PATTERN = Pattern.compile("\\p{L}{3,}", Pattern.CASE_INSENSITIVE);
+    private static final Pattern HEADLINE_PATTERN_WITH_SLASHES = Pattern.compile("^(\\d{1,1}\\.){1,3}\\d{1,2}\\.?\\s[a-z]{1,2}\\/[a-z]{1,2}.*");
+    private static final Pattern AMOUNT_PATTERN = Pattern.compile("^\\s*\\d+(?:\\.\\d+)?\\s*(?:ml|l|g|kg|mg|cm|mm|km|m|lb|oz|ppm|%|f)\\b", Pattern.CASE_INSENSITIVE);
+    private static final Pattern TABLE_OR_FIGURE_PATTER = Pattern.compile(
+            "^\\s*(?:table|continued\\s+table|appendix|figure)\\s+(?:[xvi]+|[a-z0-9]{1,3}(?:\\.[0-9]{1,3})*(?:-[0-9]{1,3})?)\\b",
+            Pattern.CASE_INSENSITIVE);
+
+    public static final int SEPARATION_THRESHOLD = 10; // if the min distance between a textblock and all its surrounding blocks exceeds this threshold, the regexes can be more lenient.
+    public static final int SURROUNDING_BLOCKS_RADIUS = 3; // number of surrounding blocks before and after the current textblock to be tested


    public void classifyDocument(ClassificationDocument document) {
@ -38,6 +47,7 @@ public class DocuMineClassificationService {
        HeadlineClassificationService headlineClassificationService = new HeadlineClassificationService();

        for (ClassificationPage page : document.getPages()) {
+            document.getLayoutDebugLayer().addTextBlockVisualizations(page.getTextBlocks(), page.getPageNumber());
            classifyPage(headlineClassificationService, page, document, headlineFontSizes);
        }
    }
@ -48,16 +58,35 @@ public class DocuMineClassificationService {
                              ClassificationDocument document,
                              List<Double> headlineFontSizes) {

-        for (AbstractPageBlock textBlock : page.getTextBlocks()) {
+        List<AbstractPageBlock> textBlocks = page.getTextBlocks();
+        for (int i = 0; i < textBlocks.size(); i++) {
+            AbstractPageBlock textBlock = textBlocks.get(i);
            if (textBlock instanceof TextPageBlock) {
-                classifyBlock(headlineClassificationService, (TextPageBlock) textBlock, page, document, headlineFontSizes);
+                List<AbstractPageBlock> surroundingBlocks = getSurroundingBlocks(i, textBlocks);
+                classifyBlock(headlineClassificationService, (TextPageBlock) textBlock, surroundingBlocks, page, document, headlineFontSizes);
            }
        }
    }


+    /**
+     * Collects the neighbours of the block at {@code originalIndex}: up to
+     * {@code SURROUNDING_BLOCKS_RADIUS} blocks before it and up to
+     * {@code SURROUNDING_BLOCKS_RADIUS} blocks after it, clamped to the list bounds.
+     * The block at {@code originalIndex} itself is never part of the result.
+     *
+     * @param originalIndex index of the block whose neighbours are requested
+     * @param textBlocks    all blocks of the page, in their list order
+     * @return the neighbouring blocks in list order; empty if the list holds no other block
+     */
+    private List<AbstractPageBlock> getSurroundingBlocks(int originalIndex, List<AbstractPageBlock> textBlocks) {
+
+        int start = Math.max(originalIndex - SURROUNDING_BLOCKS_RADIUS, 0);
+        // end is exclusive, so add 1 to really include RADIUS blocks after the current one
+        int end = Math.min(originalIndex + SURROUNDING_BLOCKS_RADIUS + 1, textBlocks.size());
+        List<AbstractPageBlock> surroundingBlocks = new ArrayList<>(2 * SURROUNDING_BLOCKS_RADIUS);
+        for (int i = start; i < end; i++) {
+            if (i == originalIndex) {
+                continue;
+            }
+            surroundingBlocks.add(textBlocks.get(i));
+        }
+        return surroundingBlocks;
+    }
+
+
    private void classifyBlock(HeadlineClassificationService headlineClassificationService,
                               TextPageBlock textBlock,
+                               List<AbstractPageBlock> surroundingBlocks,
                               ClassificationPage page,
                               ClassificationDocument document,
                               List<Double> headlineFontSizes) {
@ -65,9 +94,16 @@ public class DocuMineClassificationService {
        log.debug("headlineFontSizes: {}", headlineFontSizes);
        var bodyTextFrame = page.getBodyTextFrame();

-        Matcher headlineWithIdentifierMatcher = HEADLINE_WITH_IDENTIFER_PATTERN.matcher(textBlock.toString());
-        Matcher atLeast3Matcher = AT_LEAST_3_PATTERN.matcher(textBlock.toString());
-        Matcher headlineWithSlashesMatcher = HEADLINE_PATTTERN_WITH_SLASHES.matcher(textBlock.toString());
+        Matcher headlineWith2IdentifierMatcher = HEADLINE_WITH_2_IDENTIFER_PATTERN.matcher(textBlock.toString());
+        Matcher atLeast3Matcher = AT_LEAST_3_CHARS_PATTERN.matcher(textBlock.toString());
+        Matcher headlineWithSlashesMatcher = HEADLINE_PATTERN_WITH_SLASHES.matcher(textBlock.toString());
+        Matcher amountMatcher = AMOUNT_PATTERN.matcher(textBlock.toString());
+        Matcher tableOrFigureMatcher = TABLE_OR_FIGURE_PATTER.matcher(textBlock.toString());
+        Matcher headlineWithSingleIdentifierMatcher = HEADLINE_WITH_SINGLE_IDENTIFER_PATTERN.matcher(textBlock.toString());
+        boolean isAtLeast3Characters = atLeast3Matcher.reset().find();
+        boolean isTocItem = textBlock.getText().contains("..............");
+        boolean headlineWithSlashesMatches = headlineWithSlashesMatcher.reset().matches();
+        boolean isAmount = amountMatcher.reset().find();

        if (textBlock.getClassification() != null && textBlock.getClassification().isHeadline()) {
            headlineClassificationService.setLastHeadlineFromOutline(textBlock);
@ -103,54 +139,110 @@ public class DocuMineClassificationService {
                   && (textBlock.getMostPopularWordHeight() > document.getTextHeightCounter().getMostPopular()
                       || textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular())
                   && PositionUtils.getApproxLineCount(textBlock) < 5.9
-
-                   && (textBlock.getMostPopularWordStyle().contains("bold")
-                       && Character.isDigit(textBlock.toString().charAt(0))
-                       && atLeast3Matcher.reset().find()
+                   && ((textBlock.getMostPopularWordStyle().contains("bold") || textBlock.isUnderlined())//
+                       && Character.isDigit(textBlock.toString().charAt(0)) //
+                       && isAtLeast3Characters //
                       && !textBlock.toString().contains(":") //
-                       || textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT))
-                          && atLeast3Matcher.reset().find()
-                          && !textBlock.toString().contains(":")
-                          && !textBlock.toString().startsWith("(")//
                       || textBlock.toString().startsWith("APPENDIX") //
                       || textBlock.toString().startsWith("FIGURE") //
                       || textBlock.toString().startsWith("Continued TABLE") //
                       || textBlock.toString().startsWith("TABLE"))
                   && !textBlock.toString().endsWith(":")
-                   && atLeast3Matcher.reset().find()) {
+                   && isAtLeast3Characters
+                   && !isTocItem
+                   && !isAmount) {

-            PageBlockType headlineType = HeadlineClassificationService.headlineClassByFontSize(textBlock, headlineFontSizes);
-            headlineClassificationService.classifyHeadline(textBlock, headlineType);
-            document.setHeadlines(true);
-        } else if (headlineWithIdentifierMatcher.reset().find()
+            setAsHeadline(headlineClassificationService, textBlock, document, headlineFontSizes);
+        } else if (isAllCaps(textBlock) //
+                   && textBlock.getText().length() > 5 //
+                   && isAtLeast3Characters //
+                   && !isAmount//
+                   && !textBlock.toString().contains(":") //
+                   && !textBlock.toString().startsWith("(") //
+                   && PositionUtils.getApproxLineCount(textBlock) < 2.9) {
+
+            setAsHeadline(headlineClassificationService, textBlock, document, headlineFontSizes);
+        } else if (headlineWith2IdentifierMatcher.reset().find()
                   && PositionUtils.getApproxLineCount(textBlock) < 2.9
-                   && atLeast3Matcher.reset().find()
-                   && !headlineWithSlashesMatcher.reset().matches()) {
+                   && isAtLeast3Characters
+                   && !headlineWithSlashesMatches
+                   && !isAmount
+                   && !isTocItem) {

-            PageBlockType headlineType = HeadlineClassificationService.headlineClassByFontSize(textBlock, headlineFontSizes);
-            headlineClassificationService.classifyHeadline(textBlock, headlineType);
-            document.setHeadlines(true);
+            setAsHeadline(headlineClassificationService, textBlock, document, headlineFontSizes);
+        } else if (!isTocItem //
+                   && hasSeparation(textBlock, surroundingBlocks) //
+                   && (textBlock.getMostPopularWordHeight() >= page.getTextHeightCounter().getMostPopular() //
+                       || textBlock.getMostPopularWordFontSize() >= page.getFontSizeCounter().getMostPopular()) //
+                   && PositionUtils.getApproxLineCount(textBlock) < 2.9 //
+                   && (tableOrFigureMatcher.reset().find() || headlineWithSingleIdentifierMatcher.reset().find()) //
+                   && !isAmount //
+                   && !headlineWithSlashesMatches) {
+
+            setAsHeadline(headlineClassificationService, textBlock, document, headlineFontSizes);
        } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)
                   && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()
                   && textBlock.getMostPopularWordStyle().equals("bold")
                   && !document.getFontStyleCounter().getMostPopular().equals("bold")) {
+
            textBlock.setClassification(PageBlockType.PARAGRAPH_BOLD);
        } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)
                   && textBlock.getMostPopularWordFont().equals(document.getFontCounter().getMostPopular())
                   && textBlock.getMostPopularWordStyle().equals(document.getFontStyleCounter().getMostPopular())
                   && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()) {
+
            textBlock.setClassification(PageBlockType.PARAGRAPH);
        } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)
                   && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()
                   && textBlock.getMostPopularWordStyle().equals("italic")
                   && !document.getFontStyleCounter().getMostPopular().equals("italic")
                   && PositionUtils.getApproxLineCount(textBlock) < 2.9) {
+
            textBlock.setClassification(PageBlockType.PARAGRAPH_ITALIC);
-        } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)) {
-            textBlock.setClassification(PageBlockType.PARAGRAPH);
        } else {
            textBlock.setClassification(PageBlockType.PARAGRAPH);
        }
    }

-}
+
+    /**
+     * Tells whether the block's text is identical to its upper-cased form
+     * (using {@link Locale#ROOT}). Text without any lower-case letters,
+     * e.g. digits only, therefore also counts as "all caps".
+     */
+    private static boolean isAllCaps(TextPageBlock textBlock) {
+
+        String text = textBlock.toString();
+        String upperCased = text.toUpperCase(Locale.ROOT);
+        return text.equals(upperCased);
+    }
+
+
+    /**
+     * Checks that every surrounding block is further away from {@code textBlock}
+     * than {@code SEPARATION_THRESHOLD}. The comparison is done on squared values,
+     * because {@code calculateSeparation} returns a sum of squares.
+     *
+     * @return {@code true} if all surrounding blocks are beyond the threshold
+     *         (also when there are no surrounding blocks), {@code false} otherwise
+     */
+    private boolean hasSeparation(TextPageBlock textBlock, List<AbstractPageBlock> surroundingBlocks) {
+
+        double squaredThreshold = Math.pow(SEPARATION_THRESHOLD, 2);
+        for (AbstractPageBlock neighbour : surroundingBlocks) {
+            if (!(calculateSeparation(textBlock, neighbour) > squaredThreshold)) {
+                return false;
+            }
+        }
+        return true;
+    }
+
+
+    /**
+     * Returns the smallest value of {@code calculateSeparation} between
+     * {@code textBlock} and any of the surrounding blocks, i.e. a squared distance.
+     * Yields {@link Double#MAX_VALUE} when {@code surroundingBlocks} is empty.
+     */
+    // NOTE(review): no caller of this method is visible in this diff - confirm it is still needed.
+    private double calculateMinSeparation(TextPageBlock textBlock, List<AbstractPageBlock> surroundingBlocks) {
+
+        return surroundingBlocks.stream()
+                .mapToDouble(surroundingBlock -> calculateSeparation(textBlock, surroundingBlock))
+                .min().orElse(Double.MAX_VALUE);
+    }
+
+
+    /**
+     * Computes the sum of the squared horizontal and squared vertical distance
+     * between the two blocks. No square root is taken, so callers must compare
+     * the result against squared thresholds.
+     */
+    private static double calculateSeparation(TextPageBlock textBlock, AbstractPageBlock surroundingBlock) {
+
+        double horizontalSquared = Math.pow(surroundingBlock.horizontalDistance(textBlock), 2);
+        double verticalSquared = Math.pow(surroundingBlock.verticalDistance(textBlock), 2);
+        return horizontalSquared + verticalSquared;
+    }
+
+
+    /**
+     * Classifies {@code textBlock} as a headline. The headline type is derived
+     * from the block and {@code headlineFontSizes} via
+     * {@code HeadlineClassificationService.headlineClassByFontSize}, and the
+     * document is flagged as containing headlines.
+     */
+    private static void setAsHeadline(HeadlineClassificationService headlineClassificationService,
+                                      TextPageBlock textBlock,
+                                      ClassificationDocument document,
+                                      List<Double> headlineFontSizes) {
+
+        headlineClassificationService.classifyHeadline(textBlock, HeadlineClassificationService.headlineClassByFontSize(textBlock, headlineFontSizes));
+        document.setHeadlines(true);
+    }
+
+}
+
+
+
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutDebugLayer.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutDebugLayer.java
@ -18,10 +18,10 @@ import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlo
 import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
 import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Line;
 import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
+import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
 import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
 import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
 import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
-import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
 import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
 import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms;
 import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
@ -177,7 +177,7 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
    }


-    public void addTextBlockVisualizations(List<TextPageBlock> textPageBlocks, int page) {
+    public void addTextBlockVisualizations(List<AbstractPageBlock> textPageBlocks, int page) {

        if (!active) {
            return;
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutGrid.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutGrid.java
@ -88,6 +88,9 @@ public class LayoutGrid extends LayoutGridLayerConfig {
    public void addTreeId(SemanticNode semanticNode) {

        Page page = semanticNode.getFirstPage();
+        if (semanticNode.getBBox().get(page) == null) {
+            return;
+        }
        addPlacedText(page, semanticNode.getBBox().get(page), semanticNode.getBBox().get(page), buildTreeIdString(semanticNode), 1, treeIds, TREEID_COLOR);
    }

--- a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/layers/OcrDebugLayerConfig.java
+++ b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/layers/OcrDebugLayerConfig.java
@ -25,7 +25,7 @@ public class OcrDebugLayerConfig extends AbstractLayerGroup {

    protected final Visualizations debugText = Visualizations.builder().layer(LayerIdentifier.KNECON_OCR_TEXT_DEBUG).visibleByDefault(true).build();
    protected final Visualizations tableLines = Visualizations.builder().layer(LayerIdentifier.KNECON_OCR_LINE_DEBUG).visibleByDefault(true).build();
-    protected final Visualizations overlappedText = Visualizations.builder().layer(LayerIdentifier.KNECON_OCR_OVERLAPPED_TEXT).visibleByDefault(false).build();
+    protected final Visualizations overlappedText = Visualizations.builder().layer(LayerIdentifier.KNECON_OCR_OVERLAPPED_TEXT).visibleByDefault(true).build();
    protected final Visualizations debugBBox = Visualizations.builder().layer(LayerIdentifier.KNECON_OCR_BBOX_DEBUG).visibleByDefault(false).build();


@ -35,4 +35,11 @@ public class OcrDebugLayerConfig extends AbstractLayerGroup {
        return List.of(debugText, tableLines, debugBBox, overlappedText);
    }

+
+    // Reports this layer group as visible by default.
+    // NOTE(review): introduced in a wip commit together with switching overlappedText to
+    // visibleByDefault(true) - confirm debug layers are really meant to be shown by default.
+    @Override
+    public boolean isVisibleByDefault() {
+
+        return true;
+    }
+
 }
--- a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/OutlineUtility.java
+++ b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/OutlineUtility.java
@ -65,13 +65,13 @@ public class OutlineUtility {


    @SneakyThrows
-    private static void deleteExistingOutline(PDFDoc doc) {
+    public static void deleteExistingOutline(PDFDoc doc) {

        Bookmark firstBookmark = doc.getFirstBookmark();
-        while (firstBookmark != null && firstBookmark.isValid()) {
+//        while (firstBookmark != null && firstBookmark.isValid()) {
            firstBookmark.delete();
            firstBookmark = doc.getFirstBookmark();
-        }
+//        }

    }

--- a/layoutparser-service/viewer-doc-processor/src/test/java/com/knecon/fforesight/service/viewerdoc/service/PageContentCleanerTest.java
+++ b/layoutparser-service/viewer-doc-processor/src/test/java/com/knecon/fforesight/service/viewerdoc/service/PageContentCleanerTest.java
@ -4,6 +4,7 @@ import java.io.File;
 import java.io.FileInputStream;
 import java.io.FileOutputStream;
 import java.nio.file.Path;
+import java.util.List;
 import java.util.Set;

 import org.junit.jupiter.api.AfterAll;
@ -12,6 +13,8 @@ import org.junit.jupiter.api.Disabled;
 import org.junit.jupiter.api.Test;

 import com.knecon.fforesight.service.viewerdoc.LayerIdentifier;
+import com.knecon.fforesight.service.viewerdoc.layers.IdpLayerConfig;
+import com.knecon.fforesight.service.viewerdoc.layers.OcrDebugLayerConfig;
 import com.pdftron.pdf.ElementBuilder;
 import com.pdftron.pdf.ElementReader;
 import com.pdftron.pdf.ElementWriter;
@ -44,8 +47,8 @@ class PageContentCleanerTest {
    @SneakyThrows
    public void testContentCleaning() {

-        Path file = Path.of("/home/kschuettler/Downloads/ITEM 23_Absorção cutanea.pdf");
-        File tmpFile = new File("/tmp/ITEM 23_Absorção cutanea.pdf");
+        Path file = Path.of("/home/kschuettler/Downloads/pdf24_zusammengefügt.pdf");
+        File tmpFile = new File("/tmp/OCR_DEMO.pdf");
        try (var in = new FileInputStream(file.toFile());//
             var doc = new PDFDoc(in);//
             var out = new FileOutputStream(tmpFile);//
@ -58,7 +61,12 @@ class PageContentCleanerTest {
                    .writer(pageWriter)
                    .reader(reader)
                    .elementBuilder(builder)
-                    .markedContentToRemove(Set.of(LayerIdentifier.KNECON_LAYOUT.markedContentName()))
+                    .markedContentToRemove(Set.of(LayerIdentifier.KNECON_OCR.markedContentName(),
+                                                  LayerIdentifier.KNECON_AZURE_IDP.markedContentName(),
+                                                  LayerIdentifier.KNECON_OCR_DEBUG.markedContentName(),
+                                                  LayerIdentifier.IDP_TABLES.markedContentName(),
+                                                  LayerIdentifier.IDP_KV_PAIRS.markedContentName(),
+                                                  LayerIdentifier.IDP_SECTIONS.markedContentName()))
                    .build();

            try (PageIterator iterator = doc.getPageIterator()) {
@ -74,4 +82,16 @@ class PageContentCleanerTest {

    }

+    /**
+     * Manual helper: loads an already OCRed viewer document, sets the layer order
+     * array for the OCR debug and IDP layer groups and saves the result to /tmp.
+     * Disabled because the input file only exists on a developer machine.
+     */
+    @Test
+    @Disabled
+    @SneakyThrows
+    public void activateLayersByDefault() {
+
+        Path file = Path.of("/tmp/OCR_TEST/pdf24_zusammengefügt (1).pdf/viewerDocument.pdf");
+        try (var in = new FileInputStream(file.toFile()); PDFDoc doc = new PDFDoc(in); var out = new FileOutputStream("/tmp/OCR_DEMO_OCRED.pdf")) {
+            PdftronLayerUtility.setOrderArrayForPresentGroups(doc, List.of(OcrDebugLayerConfig.CONFIG_INSTANCE, IdpLayerConfig.CONFIG_INSTANCE));
+            doc.save(out, SDFDoc.SaveMode.REMOVE_UNUSED, null);
+        }
+
+    }
+
 }