Compare commits

14 Commits: main ... release/0.

| Author | SHA1 | Date |
|---|---|---|
| | b1a054267b | |
| | 43dec8744a | |
| | e2a5b85c4a | |
| | d5a4dd4d42 | |
| | acd6d7f164 | |
| | 71025f7f16 | |
| | ae6bad830e | |
| | e030ec9dd2 | |
| | 49139ee603 | |
| | 07da43f2d9 | |
| | df0bbc92c7 | |
| | 0497d764ec | |
| | 1362e4fbb2 | |
| | 665ad40b0b | |
```diff
@@ -16,6 +16,8 @@ deploy:
     reports:
       dotenv: version.env
   rules:
     - if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH
+    - if: $CI_COMMIT_BRANCH =~ /^release/
+    - if: $CI_COMMIT_TAG

 pmd:
   allow_failure: true
```
```diff
@@ -1,6 +1,6 @@
 plugins {
     id("com.knecon.fforesight.java-conventions")
-    id("io.freefair.lombok") version "8.2.2"
+    id("io.freefair.lombok") version "8.6"
 }

 description = "layoutparser-service-internal-api"
```
```diff
@@ -1,6 +1,6 @@
 plugins {
     id("com.knecon.fforesight.java-conventions")
-    id("io.freefair.lombok") version "8.2.2"
+    id("io.freefair.lombok") version "8.6"
 }

 description = "layoutparser-service-processor"
```
```diff
@@ -83,13 +83,17 @@ public class LayoutParsingPipeline {

         try (PDDocument originDocument = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId())) {
             ImageServiceResponse imageServiceResponse = new ImageServiceResponse();
-            if (layoutParsingRequest.imagesFileStorageId().isPresent()) {
-                imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId().get());
+            if (layoutParsingRequest.imagesFileStorageId()
+                    .isPresent()) {
+                imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId()
+                        .get());
             }

             TableServiceResponse tableServiceResponse = new TableServiceResponse();
-            if (layoutParsingRequest.tablesFileStorageId().isPresent()) {
-                tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId().get());
+            if (layoutParsingRequest.tablesFileStorageId()
+                    .isPresent()) {
+                tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId()
+                        .get());
             }

             ClassificationDocument classificationDocument = parseLayout(layoutParsingRequest.layoutParsingType(), originDocument, imageServiceResponse, tableServiceResponse);
```
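The re-wrapped calls keep the `isPresent()`/`get()` idiom on what look like `Optional`-returning request accessors. For comparison, the same lookup can be written with `map`/`orElseGet`; a minimal self-contained sketch, with stand-in types invented for illustration:

```java
import java.util.Optional;

public class OptionalLookupDemo {

    // Stand-ins for the real ImageServiceResponse and storage service from the diff.
    record ImageServiceResponse(String storageId) {}

    static ImageServiceResponse getImagesFile(String storageId) {
        return new ImageServiceResponse(storageId);
    }

    public static void main(String[] args) {
        Optional<String> imagesFileStorageId = Optional.of("abc123");

        // Equivalent to: if (id.isPresent()) { response = getImagesFile(id.get()); }
        ImageServiceResponse response = imagesFileStorageId
                .map(OptionalLookupDemo::getImagesFile)
                .orElseGet(() -> new ImageServiceResponse(null));

        System.out.println(response); // ImageServiceResponse[storageId=abc123]
    }
}
```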
```diff
@@ -115,25 +119,25 @@ public class LayoutParsingPipeline {
                 .numberOfPages(numberOfPages)
                 .duration(System.currentTimeMillis() - start)
                 .message(format("""
-                        Layout parsing has finished in %.02f s.
-                        identifiers: %s
-                        %s
-                        Files have been saved with Ids:
-                        Structure: %s
-                        Text: %s
-                        Positions: %s
-                        PageData: %s
-                        Simplified Text: %s
-                        Viewer Doc: %s""",
-                        ((float) (System.currentTimeMillis() - start)) / 1000,
-                        layoutParsingRequest.identifier(),
-                        buildSemanticNodeCountMessage(numberOfPages, documentGraph.buildSemanticNodeCounts()),
-                        layoutParsingRequest.structureFileStorageId(),
-                        layoutParsingRequest.textBlockFileStorageId(),
-                        layoutParsingRequest.positionBlockFileStorageId(),
-                        layoutParsingRequest.pageFileStorageId(),
-                        layoutParsingRequest.simplifiedTextStorageId(),
-                        layoutParsingRequest.viewerDocumentStorageId()))
+                        Layout parsing has finished in %.02f s.
+                        identifiers: %s
+                        %s
+                        Files have been saved with Ids:
+                        Structure: %s
+                        Text: %s
+                        Positions: %s
+                        PageData: %s
+                        Simplified Text: %s
+                        Viewer Doc: %s""",
+                        ((float) (System.currentTimeMillis() - start)) / 1000,
+                        layoutParsingRequest.identifier(),
+                        buildSemanticNodeCountMessage(numberOfPages, documentGraph.buildSemanticNodeCounts()),
+                        layoutParsingRequest.structureFileStorageId(),
+                        layoutParsingRequest.textBlockFileStorageId(),
+                        layoutParsingRequest.positionBlockFileStorageId(),
+                        layoutParsingRequest.pageFileStorageId(),
+                        layoutParsingRequest.simplifiedTextStorageId(),
+                        layoutParsingRequest.viewerDocumentStorageId()))
                 .build();
     }
 }
```
```diff
@@ -142,14 +146,14 @@ public class LayoutParsingPipeline {
     private String buildSemanticNodeCountMessage(int numberOfPages, Map<NodeType, Long> semanticNodeCounts) {

         return String.format("%d pages with %d sections, %d headlines, %d paragraphs, %d tables with %d cells, %d headers, and %d footers parsed",
-                numberOfPages,
-                semanticNodeCounts.get(NodeType.SECTION) == null ? 0 : semanticNodeCounts.get(NodeType.SECTION),
-                semanticNodeCounts.get(NodeType.HEADLINE) == null ? 0 : semanticNodeCounts.get(NodeType.HEADLINE),
-                semanticNodeCounts.get(NodeType.PARAGRAPH) == null ? 0 : semanticNodeCounts.get(NodeType.PARAGRAPH),
-                semanticNodeCounts.get(NodeType.TABLE) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE),
-                semanticNodeCounts.get(NodeType.TABLE_CELL) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE_CELL),
-                semanticNodeCounts.get(NodeType.HEADER) == null ? 0 : semanticNodeCounts.get(NodeType.HEADER),
-                semanticNodeCounts.get(NodeType.FOOTER) == null ? 0 : semanticNodeCounts.get(NodeType.FOOTER));
+                numberOfPages,
+                semanticNodeCounts.get(NodeType.SECTION) == null ? 0 : semanticNodeCounts.get(NodeType.SECTION),
+                semanticNodeCounts.get(NodeType.HEADLINE) == null ? 0 : semanticNodeCounts.get(NodeType.HEADLINE),
+                semanticNodeCounts.get(NodeType.PARAGRAPH) == null ? 0 : semanticNodeCounts.get(NodeType.PARAGRAPH),
+                semanticNodeCounts.get(NodeType.TABLE) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE),
+                semanticNodeCounts.get(NodeType.TABLE_CELL) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE_CELL),
+                semanticNodeCounts.get(NodeType.HEADER) == null ? 0 : semanticNodeCounts.get(NodeType.HEADER),
+                semanticNodeCounts.get(NodeType.FOOTER) == null ? 0 : semanticNodeCounts.get(NodeType.FOOTER));
     }
```
```diff
@@ -213,7 +217,7 @@ public class LayoutParsingPipeline {
             tableExtractionService.extractTables(cleanRulings, classificationPage);

             buildPageStatistics(classificationPage);
-            increaseDocumentStatistics(classificationPage, classificationDocument);
+            increaseDocumentStatistics(layoutParsingType, classificationPage, classificationDocument);

             classificationPages.add(classificationPage);
         }
```
```diff
@@ -242,11 +246,11 @@ public class LayoutParsingPipeline {
     }


-    private void increaseDocumentStatistics(ClassificationPage classificationPage, ClassificationDocument document) {
+    private void increaseDocumentStatistics(LayoutParsingType layoutParsingType, ClassificationPage classificationPage, ClassificationDocument document) {

-        if (!classificationPage.isLandscape()) {
-            document.getFontSizeCounter().addAll(classificationPage.getFontSizeCounter().getCountPerValue());
-        }
+        if (layoutParsingType.equals(LayoutParsingType.DOCUMINE) || !classificationPage.isLandscape()) {
+            document.getFontSizeCounter().addAll(classificationPage.getFontSizeCounter().getCountPerValue());
+        }
         document.getFontCounter().addAll(classificationPage.getFontCounter().getCountPerValue());
         document.getTextHeightCounter().addAll(classificationPage.getTextHeightCounter().getCountPerValue());
         document.getFontStyleCounter().addAll(classificationPage.getFontStyleCounter().getCountPerValue());
```
```diff
@@ -196,6 +196,12 @@ public class TextPositionSequence implements CharSequence {
     }


+    public float getTextHeightNoPadding() {
+
+        return textPositions.get(0).getHeightDir();
+    }
+
     @JsonIgnore
+    @JsonAttribute(ignore = true)
     public float getTextHeight() {
```
```diff
@@ -234,6 +240,7 @@ public class TextPositionSequence implements CharSequence {
     @JsonIgnore
+    @JsonAttribute(ignore = true)
     public String getFontStyle() {

         if (textPositions.get(0).getFontName() == null) {
             return "standard";
         }
```
```diff
@@ -61,8 +61,8 @@ public class DocuMineBlockificationService {
             boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj();
             boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, horizontalRulingLines, verticalRulingLines);
             boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir());
-            boolean splitByOtherFontAndOtherY = prev != null && prev.getMaxYDirAdj() != word.getMaxYDirAdj() && (word.getFontStyle().contains("bold") && !prev.getFontStyle()
-                    .contains("bold") || prev.getFontStyle().contains("bold") && !word.getFontStyle().contains("bold"));
+            boolean splitByOtherFontAndOtherY = prev != null && Math.abs(prev.getMaxYDirAdj() - word.getMaxYDirAdj()) > word.getTextHeight() * 0.2 && (word.getFontStyle()
+                    .contains("bold") && !prev.getFontStyle().contains("bold") || prev.getFontStyle().contains("bold") && !word.getFontStyle().contains("bold"));

             Matcher matcher = pattern.matcher(chunkWords.stream().collect(Collectors.joining(" ")).toString());
             boolean startsOnSameX = Math.abs(minX - word.getMinXDirAdj()) < 5 && matcher.matches();
```
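The rewritten `splitByOtherFontAndOtherY` replaces an exact baseline comparison (`prev.getMaxYDirAdj() != word.getMaxYDirAdj()`) with a tolerance of 20% of the word's text height, so floating-point jitter between words on the same visual line no longer counts as "other Y". A runnable sketch with invented coordinates:

```java
public class BaselineToleranceDemo {

    // The new check from the hunk above: baselines differ only if they are
    // more than 20% of the text height apart.
    static boolean differsInY(float prevMaxYDirAdj, float wordMaxYDirAdj, float textHeight) {
        return Math.abs(prevMaxYDirAdj - wordMaxYDirAdj) > textHeight * 0.2;
    }

    public static void main(String[] args) {
        float textHeight = 10f;

        // Rounding noise between two words on the same visual line:
        System.out.println(100.0f != 100.01f);                       // true  -- the old exact check would split the block
        System.out.println(differsInY(100.0f, 100.01f, textHeight)); // false -- the tolerance keeps them together

        // A genuinely different line, one text height further down:
        System.out.println(differsInY(100.0f, 112.0f, textHeight));  // true
    }
}
```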
```diff
@@ -5,7 +5,6 @@ import java.util.Locale;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;

-import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
 import org.springframework.stereotype.Service;

 import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
@@ -13,6 +12,8 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification
 import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
 import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
 import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
+import com.knecon.fforesight.service.layoutparser.processor.utils.HeaderFooterDetection;
+import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
 import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;

 import lombok.RequiredArgsConstructor;
@@ -23,7 +24,7 @@ import lombok.extern.slf4j.Slf4j;
 @RequiredArgsConstructor
 public class DocuMineClassificationService {

-    private static final Pattern pattern = Pattern.compile("^(\\d{1,2}\\.){1,3}\\d{1,2}\\.?\\s[0-9A-Za-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE);
+    private static final Pattern pattern = Pattern.compile("^([1-9]\\d?\\.){1,3}\\d{1,2}\\.?\\s[0-9A-Za-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE);
     private static final Pattern pattern2 = Pattern.compile("\\p{L}{3,}", Pattern.CASE_INSENSITIVE);
     private static final Pattern pattern3 = Pattern.compile("^(\\d{1,1}\\.){1,3}\\d{1,2}\\.?\\s[a-z]{1,2}\\/[a-z]{1,2}.*");
```
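The tightened headline pattern only changes the numbering segments: `[1-9]\d?` still allows one or two digits but rejects a leading zero, which tends to come from zero-padded artifacts rather than real section numbers. A small demo of the difference (sample strings invented):

```java
import java.util.regex.Pattern;

public class HeadlinePatternDemo {

    // Old: any 1-2 digit segment, including zero-padded ones like "01."
    private static final Pattern OLD = Pattern.compile(
            "^(\\d{1,2}\\.){1,3}\\d{1,2}\\.?\\s[0-9A-Za-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE);
    // New: each segment must start with 1-9.
    private static final Pattern NEW = Pattern.compile(
            "^([1-9]\\d?\\.){1,3}\\d{1,2}\\.?\\s[0-9A-Za-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE);

    public static void main(String[] args) {
        String realHeadline = "3.1.2 Safety requirements";
        String zeroPadded = "01.2 Safety requirements";

        System.out.println(OLD.matcher(realHeadline).find() + " " + NEW.matcher(realHeadline).find()); // true true
        System.out.println(OLD.matcher(zeroPadded).find() + " " + NEW.matcher(zeroPadded).find());     // true false
    }
}
```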
```diff
@@ -64,46 +65,64 @@ public class DocuMineClassificationService {
             return;
         }
         if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER)
-                || PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
-                .getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())
-        ) {
+                || (PositionUtils.isOverBodyTextFrame(bodyTextFrame,
+                textBlock,
+                page.getRotation()) && (document.getFontSizeCounter().getMostPopular()
+                == null
+                || textBlock.getHighestFontSize()
+                <= document.getFontSizeCounter()
+                .getMostPopular()))) {
             textBlock.setClassification(PageBlockType.HEADER);

         } else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER)
-                || PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
-                .getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())
-        ) {
+                || (PositionUtils.isUnderBodyTextFrame(bodyTextFrame,
+                textBlock,
+                page.getRotation())
+                && (document.getFontSizeCounter().getMostPopular()
+                == null
+                || textBlock.getHighestFontSize()
+                <= document.getFontSizeCounter()
+                .getMostPopular()))
+                || HeaderFooterDetection.isLikelyFooter(textBlock, document, page)) {
             textBlock.setClassification(PageBlockType.FOOTER);
-        } else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock,
-                document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks()
-                .size() == 1)) {
+        } else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter().getMostPopular()) > 2.5
+                && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks().size() == 1)) {
             if (!Pattern.matches("[0-9]+", textBlock.toString())) {
                 textBlock.setClassification(PageBlockType.TITLE);
             }
-        } else if (textBlock.getText().length() > 5 && (textBlock.getMostPopularWordHeight() > document.getTextHeightCounter()
-                .getMostPopular() || textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular()) && PositionUtils.getApproxLineCount(textBlock) < 5.9
-
-                && (textBlock.getMostPopularWordStyle().contains("bold") && Character.isDigit(textBlock.toString().charAt(0)) && !matcher2.matches() && !textBlock.toString()
-                .contains(":") || textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT)) && !matcher2.matches() && !textBlock.toString()
-                .contains(":") || textBlock.toString().startsWith("APPENDIX") || textBlock.toString().startsWith("FIGURE") || textBlock.toString()
-                .startsWith("TABLE")) && !textBlock.toString().endsWith(":") && matcher2.find()) {
+        } else if (textBlock.getText().length() > 5
+                && (textBlock.getMostPopularWordHeight() > document.getTextHeightCounter().getMostPopular()
+                || textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular())
+                && PositionUtils.getApproxLineCount(textBlock) < 5.9
+                && (textBlock.getMostPopularWordStyle().contains("bold") && Character.isDigit(textBlock.toString().charAt(0)) && matcher2.reset().find() && !textBlock.toString()
+                .contains(":")
+                || textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT)) && matcher2.reset().find() && !textBlock.toString().contains(":")
+                || textBlock.toString().startsWith("APPENDIX")
+                || textBlock.toString().startsWith("FIGURE")
+                || textBlock.toString().startsWith("TABLE"))
+                && !textBlock.toString().endsWith(":")
+                && matcher2.reset().find()) {
             textBlock.setClassification(PageBlockType.getHeadlineType(1));
             document.setHeadlines(true);

-        } else if (matcher.find() && PositionUtils.getApproxLineCount(textBlock) < 2.9 && matcher2.find() && !matcher3.matches()) {
+        } else if (matcher.reset().find() && PositionUtils.getApproxLineCount(textBlock) < 2.9 && matcher2.reset().find() && !matcher3.reset().matches()) {
             textBlock.setClassification(PageBlockType.getHeadlineType(2));
             document.setHeadlines(true);
-        } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
-                .getMostPopular() && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold")) {
+        } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)
+                && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()
+                && textBlock.getMostPopularWordStyle().equals("bold")
+                && !document.getFontStyleCounter().getMostPopular().equals("bold")) {
             textBlock.setClassification(PageBlockType.PARAGRAPH_BOLD);
-        } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFont()
-                .equals(document.getFontCounter().getMostPopular()) && textBlock.getMostPopularWordStyle()
-                .equals(document.getFontStyleCounter().getMostPopular()) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()) {
+        } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)
+                && textBlock.getMostPopularWordFont().equals(document.getFontCounter().getMostPopular())
+                && textBlock.getMostPopularWordStyle().equals(document.getFontStyleCounter().getMostPopular())
+                && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()) {
             textBlock.setClassification(PageBlockType.PARAGRAPH);
-        } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
-                .getMostPopular() && textBlock.getMostPopularWordStyle().equals("italic") && !document.getFontStyleCounter()
-                .getMostPopular()
-                .equals("italic") && PositionUtils.getApproxLineCount(textBlock) < 2.9) {
+        } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)
+                && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()
+                && textBlock.getMostPopularWordStyle().equals("italic")
+                && !document.getFontStyleCounter().getMostPopular().equals("italic")
+                && PositionUtils.getApproxLineCount(textBlock) < 2.9) {
             textBlock.setClassification(PageBlockType.PARAGRAPH_ITALIC);
         } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)) {
             textBlock.setClassification(PageBlockType.PARAGRAPH_UNKNOWN);
```
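Several conditions above switch from bare `matcher2.find()` to `matcher2.reset().find()`. A `Matcher` is stateful: each `find()` resumes after the previous match, so evaluating the same condition twice in one if/else chain can flip from true to false. A minimal demonstration:

```java
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class MatcherResetDemo {

    public static void main(String[] args) {
        // Same pattern as pattern2 in the class above: three or more letters.
        Matcher m = Pattern.compile("\\p{L}{3,}").matcher("Appendix overview");

        System.out.println(m.find()); // true  -- matches "Appendix"
        System.out.println(m.find()); // true  -- resumes and matches "overview"
        System.out.println(m.find()); // false -- exhausted: a repeated check now disagrees

        m.reset();
        System.out.println(m.find()); // true  -- reset() makes repeated checks deterministic
    }
}
```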
```diff
@@ -4,11 +4,9 @@ import java.awt.geom.AffineTransform;
 import java.awt.geom.Rectangle2D;
 import java.io.IOException;
 import java.io.OutputStream;
-import java.util.HashSet;
-import java.util.Set;

-import org.apache.pdfbox.cos.COSDictionary;
 import org.apache.pdfbox.cos.COSName;
+import org.apache.pdfbox.pdfwriter.compress.CompressParameters;
 import org.apache.pdfbox.pdmodel.PDDocument;
 import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
 import org.apache.pdfbox.pdmodel.PDPage;
@@ -40,7 +38,6 @@ import lombok.extern.slf4j.Slf4j;
 @RequiredArgsConstructor
 public class ViewerDocumentService {

-
     private static final String LAYER_NAME = "Layout grid";
     private static final int FONT_SIZE = 10;
     public static final float LINE_WIDTH = 1f;
@@ -54,8 +51,7 @@ public class ViewerDocumentService {
         LayoutGrid layoutGrid = layoutGridService.createLayoutGrid(document);
         // PDDocument.save() is very slow, since it actually traverses the entire pdf and writes a new one.
         // If we collect all COSDictionaries we changed and tell it explicitly to only add the changed ones by using saveIncremental it's very fast.
-        Set<COSDictionary> dictionariesToUpdate = new HashSet<>();
-        PDOptionalContentGroup layer = addLayerToDocument(pdDocument, dictionariesToUpdate, layerVisibilityDefaultValue);
+        PDOptionalContentGroup layer = addLayerToDocument(pdDocument, layerVisibilityDefaultValue);
         PDFont font = new PDType1Font(Standard14Fonts.FontName.HELVETICA);

         for (int pageNumber = 0; pageNumber < pdDocument.getNumberOfPages(); pageNumber++) {
@@ -68,7 +64,8 @@ public class ViewerDocumentService {
             // e.g. not escaped matrix transformations.
             escapePreviousContents(pdDocument, pdPage);

-            VisualizationsOnPage visualizationsOnPage = layoutGrid.getVisualizationsPerPages().get(pageNumber);
+            VisualizationsOnPage visualizationsOnPage = layoutGrid.getVisualizationsPerPages()
+                    .get(pageNumber);
             assert pageNumber == visualizationsOnPage.getPageNumber();
             // We need to append to the content stream, otherwise the content could be overlapped by following content.
             try (var contentStream = new PDPageContentStream(pdDocument, pdPage, PDPageContentStream.AppendMode.APPEND, true)) {
@@ -102,11 +99,11 @@ public class ViewerDocumentService {
                     contentStream.setFont(font, FONT_SIZE);
                     contentStream.beginText();
                     Matrix textMatrix = new Matrix((float) textDeRotationMatrix.getScaleX(),
-                            (float) textDeRotationMatrix.getShearX(),
-                            (float) textDeRotationMatrix.getShearY(),
-                            (float) textDeRotationMatrix.getScaleY(),
-                            (float) placedText.lineStart().getX(),
-                            (float) placedText.lineStart().getY());
+                            (float) textDeRotationMatrix.getShearX(),
+                            (float) textDeRotationMatrix.getShearY(),
+                            (float) textDeRotationMatrix.getScaleY(),
+                            (float) placedText.lineStart().getX(),
+                            (float) placedText.lineStart().getY());
                     textMatrix.translate(-((font.getStringWidth(placedText.text()) / 1000) * FONT_SIZE + (2 * LINE_WIDTH) + 4), -FONT_SIZE);
                     contentStream.setTextMatrix(textMatrix);
                     contentStream.showText(placedText.text());
@@ -115,12 +112,9 @@ public class ViewerDocumentService {
                     contentStream.restoreGraphicsState();
                     contentStream.endMarkedContent();
                 }
-                dictionariesToUpdate.add(pdPage.getCOSObject());
-                dictionariesToUpdate.add(pdPage.getResources().getCOSObject());
             }
-            dictionariesToUpdate.add(pdDocument.getDocumentInformation().getCOSObject());
-            // dictionariesToUpdate.add(pdDocument.getDocument().getTrailer());
-            pdDocument.saveIncremental(outputStream, dictionariesToUpdate);
+            pdDocument.save(outputStream, CompressParameters.NO_COMPRESSION);
        }
@@ -145,7 +139,7 @@ public class ViewerDocumentService {
     }


-    private static PDOptionalContentGroup addLayerToDocument(PDDocument pdDocument, Set<COSDictionary> dictionariesToUpdate, boolean layerVisibilityDefaultValue) {
+    private static PDOptionalContentGroup addLayerToDocument(PDDocument pdDocument, boolean layerVisibilityDefaultValue) {

         PDDocumentCatalog catalog = pdDocument.getDocumentCatalog();
         PDOptionalContentProperties ocprops = catalog.getOCProperties();
@@ -161,7 +155,6 @@ public class ViewerDocumentService {
             ocprops.addGroup(layer);
         }
         ocprops.setGroupEnabled(layer, layerVisibilityDefaultValue);
-        dictionariesToUpdate.add(catalog.getCOSObject());
         return layer;
     }
```
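The commit drops the incremental-save bookkeeping (the `dictionariesToUpdate` set threaded through `addLayerToDocument`) in favour of a full rewrite without object-stream compression. A minimal sketch of the two PDFBox 3.x calls being traded off (file names invented):

```java
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;

import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdfwriter.compress.CompressParameters;
import org.apache.pdfbox.pdmodel.PDDocument;

public class SaveStrategyDemo {

    public static void main(String[] args) throws IOException {
        try (PDDocument doc = Loader.loadPDF(new File("in.pdf"));
             OutputStream out = new FileOutputStream("out.pdf")) {

            // Full rewrite, skipping object-stream compression: what the commit now does.
            doc.save(out, CompressParameters.NO_COMPRESSION);

            // The previous approach appended only changed objects to the original file.
            // That is faster, but every modified dictionary has to be registered
            // explicitly -- exactly the bookkeeping the commit removes:
            // doc.saveIncremental(out, dictionariesToUpdate);
        }
    }
}
```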
@@ -0,0 +1,223 @@

```java
package com.knecon.fforesight.service.layoutparser.processor.utils;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;

import lombok.experimental.UtilityClass;

@UtilityClass
public class HeaderFooterDetection {

    private final Map<Integer, ClassificationPage> pagesCache = new HashMap<>();
    private static final double THRESHOLD = 0.5;
    // Weight will go from 1.0 to 0.5 because the first element is the most likely to be the header on the page.
    private static final double[] headerWeights = {1.0, 0.75, 0.5};
    // Weight will go from 0.5 to 1.0 because the last element is the most likely to be the footer on the page.
    private static final double[] footerWeights = {0.5, 0.75, 1.0};


    public boolean isLikelyFooter(TextPageBlock textPageBlock, ClassificationDocument document, ClassificationPage classificationPage) {

        int numberOfPages = document.getPages().size();
        if (numberOfPages < 3) {
            // If the document has 1 or 2 pages this may lead to more false positives than actual findings.
            return false;
        }

        int window = Math.min(numberOfPages, 8);

        List<ClassificationPage> nearestPages = findNearestPages(classificationPage, document.getPages(), window);
        List<List<AbstractPageBlock>> footerCandidates = getFooterCandidates(nearestPages);

        return detectHeadersOrFootersByPageAssociation(textPageBlock.getText(), footerCandidates, window, footerWeights);
    }


    public boolean isLikelyHeader(TextPageBlock textPageBlock, ClassificationDocument document, ClassificationPage classificationPage) {

        int numberOfPages = document.getPages().size();
        if (numberOfPages < 3) {
            // If the document has 1 or 2 pages this may lead to more false positives than actual findings.
            return false;
        }

        int window = Math.min(numberOfPages, 8);

        List<ClassificationPage> nearestPages = findNearestPages(classificationPage, document.getPages(), window);
        List<List<AbstractPageBlock>> headerCandidates = getHeaderCandidates(nearestPages);

        return detectHeadersOrFootersByPageAssociation(textPageBlock.getText(), headerCandidates, window, headerWeights);
    }


    private boolean detectHeadersOrFootersByPageAssociation(String testString, List<List<AbstractPageBlock>> candidates, int window, double[] weights) {

        double highestScore = 0.0;

        for (int i = 0; i < candidates.size(); i++) {
            List<List<String>> candidateStrings = new ArrayList<>();
            for (int k = Math.max(i - window, 0); k < Math.min(i + window, candidates.size()); k++) {
                candidateStrings.add(candidates.get(k)
                        .stream()
                        .map(AbstractPageBlock::getText)
                        .collect(Collectors.toList()));
            }

            int maxLen = candidateStrings.stream()
                    .mapToInt(List::size)
                    .max()
                    .orElse(0);
            for (List<String> sublist : candidateStrings) {
                while (sublist.size() < maxLen) {
                    sublist.add(0, "");
                }
            }

            // Compare the testString against each candidate in the window
            for (int j = 0; j < maxLen; j++) {
                double score = 0.0;
                int finalJ = j;
                List<String> paddedCandidateStrings = candidateStrings.stream()
                        .map(sublist -> sublist.size() > finalJ ? sublist.get(finalJ) : "")
                        .toList();
                for (String paddedString : paddedCandidateStrings) {
                    if ((testString.length() >= 5 && paddedString.length() >= 5) && (testString.length() > 2 * paddedString.length()
                            || paddedString.length() > 2 * testString.length())) {
                        // If both strings are at least 5 characters long and one string is more than twice as long as the other,
                        // skip the distance calculation as it's time-consuming, and we can assume they are not similar enough
                        continue;
                    }

                    int distance = calculateHammingDistanceWithPreprocessing(testString, paddedString);
                    double normalizedScore = 1 - (double) distance / Math.max(testString.length(), paddedString.length());
                    score += normalizedScore * (j < weights.length ? weights[j] : 1);
                }
                score /= paddedCandidateStrings.size();
                highestScore = Math.max(highestScore, score);
                // Early stop
                if (highestScore > THRESHOLD) {
                    return true;
                }
            }
        }

        return false;
    }


    /**
     * Find the nearest n pages for a given page.
     * For example: nearest 8 pages for page 4 are: 1, 2, 3, 5, 6, 7, 8, 9.
     *
     * @param currentPage  Current page to find the nearest ones.
     * @param allPages     All pages in the document.
     * @param numNeighbors Number of neighbouring pages to find.
     * @return The nearest pages.
     */
    private List<ClassificationPage> findNearestPages(ClassificationPage currentPage, List<ClassificationPage> allPages, int numNeighbors) {

        int totalPages = allPages.size();
        List<ClassificationPage> nearestPages = new ArrayList<>();

        int currentPageIndex = currentPage.getPageNumber() - 1;
        int halfWin = numNeighbors / 2;
        int start = Math.max(0, currentPageIndex - halfWin);
        int end = Math.min(totalPages - 1, currentPageIndex + halfWin);

        for (int i = start; i <= end; i++) {
            if (i != currentPageIndex) {
                nearestPages.add(pagesCache.computeIfAbsent(i, allPages::get));
            }
        }

        pagesCache.keySet().removeIf(key -> key < start || key > end);

        return nearestPages;
    }


    // Get the last 3 TextBlocks on the page as they are likely to be a footer
    private List<List<AbstractPageBlock>> getFooterCandidates(List<ClassificationPage> pages) {

        List<List<AbstractPageBlock>> footerCandidates = new ArrayList<>();
        for (ClassificationPage page : pages) {
            List<AbstractPageBlock> textBlocks = page.getTextBlocks();
            List<TextPageBlock> textPageBlocks = textBlocks.stream()
                    .filter(textBlock -> textBlock instanceof TextPageBlock)
                    .map(textBlock -> (TextPageBlock) textBlock)
                    .toList();
            int blockCount = textPageBlocks.size();
            if (blockCount > 0) {
                int start = Math.max(0, blockCount - 3);
                footerCandidates.add(new ArrayList<>(textPageBlocks.subList(start, blockCount)));
            }
        }
        return footerCandidates;
    }


    // Get the first 3 TextBlocks on the page as they are likely to be a header
    private List<List<AbstractPageBlock>> getHeaderCandidates(List<ClassificationPage> pages) {

        List<List<AbstractPageBlock>> headerCandidates = new ArrayList<>();
        for (ClassificationPage page : pages) {
            List<AbstractPageBlock> textBlocks = page.getTextBlocks();
            List<TextPageBlock> textPageBlocks = textBlocks.stream()
                    .filter(textBlock -> textBlock instanceof TextPageBlock)
                    .map(textBlock -> (TextPageBlock) textBlock)
                    .toList();
            int count = Math.min(3, textPageBlocks.size());
            headerCandidates.add(new ArrayList<>(textPageBlocks.subList(0, count)));
        }
        return headerCandidates;
    }


    /**
     * Calculate the Hamming distance between two strings after preprocessing to make them the same length
     * and replacing all digits with a special character '@' since they are a common occurrence in headers/footers.
     *
     * @param firstCandidate  First string
     * @param secondCandidate Second string
     * @return The Hamming distance between the two preprocessed strings.
     */
    private int calculateHammingDistanceWithPreprocessing(String firstCandidate, String secondCandidate) {

        int maxLength = Math.max(firstCandidate.length(), secondCandidate.length());

        String cleanFirstCandidate = padString(firstCandidate, maxLength, '\0').replaceAll("\\d", "@");
        String cleanSecondCandidate = padString(secondCandidate, maxLength, '\0').replaceAll("\\d", "@");

        int distance = 0;
        for (int i = 0; i < maxLength; i++) {
            if (cleanFirstCandidate.charAt(i) != cleanSecondCandidate.charAt(i)) {
                distance++;
            }
        }
        return distance;
    }


    private String padString(String input, int length, char padChar) {

        if (input.length() >= length) {
            return input;
        }

        StringBuilder sb = new StringBuilder(input);

        while (sb.length() < length) {
            sb.append(padChar);
        }
        return sb.toString();
    }

}
```
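A worked example of the preprocessing in `calculateHammingDistanceWithPreprocessing`: digits collapse to `@`, the shorter string is padded with `\0`, and the distance is normalized against the longer length, so footers that differ only in the page number score as near-identical (sample strings invented):

```java
public class HammingDemo {

    public static void main(String[] args) {
        // Replace digits with '@', as the utility above does.
        String a = "Confidential - Page 3".replaceAll("\\d", "@");   // "Confidential - Page @"
        String b = "Confidential - Page 17".replaceAll("\\d", "@");  // "Confidential - Page @@"

        // Pad the shorter string with '\0' so positions line up.
        int max = Math.max(a.length(), b.length());
        StringBuilder sa = new StringBuilder(a);
        while (sa.length() < max) sa.append('\0');
        StringBuilder sb = new StringBuilder(b);
        while (sb.length() < max) sb.append('\0');

        int distance = 0;
        for (int i = 0; i < max; i++) {
            if (sa.charAt(i) != sb.charAt(i)) {
                distance++;
            }
        }
        System.out.println(distance); // 1 -- only the padded final position differs

        // Normalized similarity as in detectHeadersOrFootersByPageAssociation:
        System.out.println(1 - (double) distance / max); // ~0.95, well above the 0.5 threshold
    }
}
```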
```diff
@@ -28,15 +28,13 @@ import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPosit
  *
  * @author Ben Litchfield
  */
-public class TextPositionSequenceComparator implements Comparator<TextPositionSequence>
-{
+public class TextPositionSequenceComparator implements Comparator<TextPositionSequence> {

     @Override
-    public int compare(TextPositionSequence pos1, TextPositionSequence pos2)
-    {
+    public int compare(TextPositionSequence pos1, TextPositionSequence pos2) {
         // only compare text that is in the same direction
         int cmp1 = Float.compare(pos1.getDir().getDegrees(), pos2.getDir().getDegrees());
-        if (cmp1 != 0)
-        {
+        if (cmp1 != 0) {
             return cmp1;
         }

@@ -48,25 +46,19 @@ public class TextPositionSequenceComparator implements Comparator<TextPositionSe
         float pos2YBottom = pos2.getMaxYDirAdj();

         // note that the coordinates have been adjusted so 0,0 is in upper left
-        float pos1YTop = pos1YBottom - pos1.getTextHeight();
-        float pos2YTop = pos2YBottom - pos2.getTextHeight();
+        float pos1YTop = pos1YBottom - pos1.getTextHeightNoPadding();
+        float pos2YTop = pos2YBottom - pos2.getTextHeightNoPadding();

         float yDifference = Math.abs(pos1YBottom - pos2YBottom);

         // we will do a simple tolerance comparison
-        if (yDifference < .1 ||
-            pos2YBottom >= pos1YTop && pos2YBottom <= pos1YBottom ||
-            pos1YBottom >= pos2YTop && pos1YBottom <= pos2YBottom)
-        {
+        if (yDifference < .1 || pos2YBottom >= pos1YTop && pos2YBottom <= pos1YBottom || pos1YBottom >= pos2YTop && pos1YBottom <= pos2YBottom) {
             return Float.compare(x1, x2);
-        }
-        else if (pos1YBottom < pos2YBottom)
-        {
+        } else if (pos1YBottom < pos2YBottom) {
             return -1;
-        }
-        else
-        {
+        } else {
             return 1;
         }
     }
 }
```
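The switch from `getTextHeight()` to `getTextHeightNoPadding()` matters because a padded height can make one line's vertical span reach into the next line, and the comparator then sorts the two by X as if they shared a baseline. A runnable sketch of the overlap test with invented coordinates:

```java
public class LineOverlapDemo {

    // The same-line test from the comparator above, parameterized on the height used.
    static boolean sameLine(float pos1YBottom, float pos2YBottom, float height) {
        float pos1YTop = pos1YBottom - height;
        float pos2YTop = pos2YBottom - height;
        float yDifference = Math.abs(pos1YBottom - pos2YBottom);
        return yDifference < .1
                || pos2YBottom >= pos1YTop && pos2YBottom <= pos1YBottom
                || pos1YBottom >= pos2YTop && pos1YBottom <= pos2YBottom;
    }

    public static void main(String[] args) {
        // Two baselines 12 units apart (0,0 is upper left, so the larger Y is lower on the page).
        float line1Bottom = 100f;
        float line2Bottom = 112f;

        System.out.println(sameLine(line1Bottom, line2Bottom, 14f)); // true:  padded height overlaps the next line
        System.out.println(sameLine(line1Bottom, line2Bottom, 9f));  // false: raw glyph height keeps the lines distinct
    }
}
```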
```diff
@@ -6,7 +6,7 @@ plugins {
     id("org.springframework.boot") version "3.1.3"
     id("io.spring.dependency-management") version "1.1.3"
     id("org.sonarqube") version "4.3.0.3225"
-    id("io.freefair.lombok") version "8.2.2"
+    id("io.freefair.lombok") version "8.6"
     // id("org.graalvm.buildtools.native") version "0.9.23"
 }
```
Binary file not shown.