From d5a4dd4d421b518078468705f2453ceb5e046041 Mon Sep 17 00:00:00 2001
From: Andrei Isvoran <andrei.isvoran.ext@knecon.com>
Date: Mon, 13 May 2024 14:57:51 +0200
Subject: [PATCH] RED-9149 - Header and footer detection by page-association

---
 .../DocuMineClassificationService.java        |  24 +-
 .../utils/HeaderFooterDetection.java          | 223 ++++++++++++++++++
 2 files changed, 241 insertions(+), 6 deletions(-)
 create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/HeaderFooterDetection.java

diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java
index 4c881c6..608e863 100644
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java
@@ -12,6 +12,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification
 import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
 import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
 import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
+import com.knecon.fforesight.service.layoutparser.processor.utils.HeaderFooterDetection;
 import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
 import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
 
@@ -64,15 +65,26 @@ public class DocuMineClassificationService {
             return;
         }
         if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER)
-            || PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null
-                                                                                                   || textBlock.getHighestFontSize() <= document.getFontSizeCounter()
-                .getMostPopular())) {
+            || (PositionUtils.isOverBodyTextFrame(bodyTextFrame,
+                                                  textBlock,
+                                                  page.getRotation()) && (document.getFontSizeCounter().getMostPopular()
+                                                                          == null
+                                                                          || textBlock.getHighestFontSize()
+                                                                             <= document.getFontSizeCounter()
+                                                                                     .getMostPopular()))
+            || HeaderFooterDetection.isLikelyHeader(textBlock, document, page)) {
             textBlock.setClassification(PageBlockType.HEADER);
 
         } else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER)
-                   || PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null
-                                                                                                           || textBlock.getHighestFontSize() <= document.getFontSizeCounter()
-                .getMostPopular())) {
+                   || (PositionUtils.isUnderBodyTextFrame(bodyTextFrame,
+                                                          textBlock,
+                                                          page.getRotation())
+                       && (document.getFontSizeCounter().getMostPopular()
+                           == null
+                           || textBlock.getHighestFontSize()
+                              <= document.getFontSizeCounter()
+                                      .getMostPopular()))
+                   || HeaderFooterDetection.isLikelyFooter(textBlock, document, page)) {
             textBlock.setClassification(PageBlockType.FOOTER);
         } else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter().getMostPopular()) > 2.5
                                                  && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks().size() == 1)) {
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/HeaderFooterDetection.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/HeaderFooterDetection.java
new file mode 100644
index 0000000..24ed41d
--- /dev/null
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/HeaderFooterDetection.java
@@ -0,0 +1,223 @@
+package com.knecon.fforesight.service.layoutparser.processor.utils;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+
+import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
+import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
+import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
+import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
+
+import lombok.experimental.UtilityClass;
+
+/** Header/footer detection by page association. Keep it stateless: {@code @UtilityClass} makes every field static. */
+@UtilityClass
+public class HeaderFooterDetection {
+
+    private static final double THRESHOLD = 0.5;
+    // Weight will go from 1.0 to 0.5 because the first element is the most likely to be the header on the page.
+    private static final double[] headerWeights = {1.0, 0.75, 0.5};
+    // Weight will go from 0.5 to 1.0 because the last element is the most likely to be the footer on the page.
+    private static final double[] footerWeights = {0.5, 0.75, 1.0};
+
+
+    /** Checks whether text similar to the given block occurs among the last text blocks of the neighbouring pages. */
+    public boolean isLikelyFooter(TextPageBlock textPageBlock, ClassificationDocument document, ClassificationPage classificationPage) {
+
+        int numberOfPages = document.getPages().size();
+        if (numberOfPages < 3) {
+            // If the document has 1 or 2 pages this may lead to more false positives than actual findings.
+            return false;
+        }
+
+        int window = Math.min(numberOfPages, 8);
+        List<ClassificationPage> nearestPages = findNearestPages(classificationPage, document.getPages(), window);
+        List<List<AbstractPageBlock>> footerCandidates = getFooterCandidates(nearestPages);
+
+        return detectHeadersOrFootersByPageAssociation(textPageBlock.getText(), footerCandidates, window, footerWeights);
+    }
+
+
+    /** Checks whether text similar to the given block occurs among the first text blocks of the neighbouring pages. */
+    public boolean isLikelyHeader(TextPageBlock textPageBlock, ClassificationDocument document, ClassificationPage classificationPage) {
+
+        int numberOfPages = document.getPages().size();
+        if (numberOfPages < 3) {
+            // If the document has 1 or 2 pages this may lead to more false positives than actual findings.
+            return false;
+        }
+
+        int window = Math.min(numberOfPages, 8);
+        List<ClassificationPage> nearestPages = findNearestPages(classificationPage, document.getPages(), window);
+        List<List<AbstractPageBlock>> headerCandidates = getHeaderCandidates(nearestPages);
+
+        return detectHeadersOrFootersByPageAssociation(textPageBlock.getText(), headerCandidates, window, headerWeights);
+    }
+
+
+    /**
+     * Scores {@code testString} against the candidates of all pages, position by position (position j is weighted with
+     * {@code weights[j]}, or 1 beyond the array); true as soon as the average similarity at one position exceeds THRESHOLD.
+     */
+    private boolean detectHeadersOrFootersByPageAssociation(String testString, List<List<AbstractPageBlock>> candidates, int window, double[] weights) {
+
+        if (testString == null || testString.isEmpty()) {
+            // Nothing to match; also avoids 0 / 0 = NaN when an empty text is compared to an empty padding string.
+            return false;
+        }
+
+        int previousFrom = -1;
+        int previousTo = -1;
+        for (int i = 0; i < candidates.size(); i++) {
+            int from = Math.max(i - window, 0);
+            int to = Math.min(i + window, candidates.size());
+            if (from == previousFrom && to == previousTo) {
+                // Same slice as in the previous iteration: it was already scored without passing the threshold.
+                continue;
+            }
+            previousFrom = from;
+            previousTo = to;
+            List<List<String>> candidateStrings = new ArrayList<>();
+            for (int k = from; k < to; k++) {
+                candidateStrings.add(candidates.get(k)
+                                             .stream()
+                                             .map(AbstractPageBlock::getText)
+                                             .collect(Collectors.toList()));
+            }
+
+            int maxLen = candidateStrings.stream()
+                    .mapToInt(List::size)
+                    .max()
+                    .orElse(0);
+            // Front-padding lines up the last blocks of all pages. NOTE(review): also applied to headers - confirm.
+            for (List<String> sublist : candidateStrings) {
+                while (sublist.size() < maxLen) {
+                    sublist.add(0, "");
+                }
+            }
+
+            // Compare the testString against the candidates of all pages at each position
+            for (int j = 0; j < maxLen; j++) {
+                double weight = j < weights.length ? weights[j] : 1;
+                double score = 0.0;
+                for (List<String> sublist : candidateStrings) {
+                    String paddedString = sublist.get(j);
+                    if ((testString.length() >= 5 && paddedString.length() >= 5) && (testString.length() > 2 * paddedString.length()
+                                                                                     || paddedString.length() > 2 * testString.length())) {
+                        // If both strings are at least 5 characters long and one string is more than twice as long as the other,
+                        // skip the distance calculation as it's time-consuming, and we can assume they are not similar enough
+                        continue;
+                    }
+                    int distance = calculateHammingDistanceWithPreprocessing(testString, paddedString);
+                    double normalizedScore = 1 - (double) distance / Math.max(testString.length(), paddedString.length());
+                    score += normalizedScore * weight;
+                }
+                score /= candidateStrings.size();
+                // Early stop
+                if (score > THRESHOLD) {
+                    return true;
+                }
+            }
+        }
+
+        return false;
+    }
+
+
+    /**
+     * Find the pages surrounding a given page (excluding the page itself): up to numNeighbors / 2 on each side, cut off
+     * at the document boundaries. For example: 8 neighbours of page 4 of 9 pages are: 1, 2, 3, 5, 6, 7, 8.
+     *
+     * @param currentPage  Current page to find the nearest ones.
+     * @param allPages     All pages in the document.
+     * @param numNeighbors Size of the window; half of it is applied to each side of the current page.
+     * @return The nearest pages.
+     */
+    private List<ClassificationPage> findNearestPages(ClassificationPage currentPage, List<ClassificationPage> allPages, int numNeighbors) {
+
+        int currentPageIndex = currentPage.getPageNumber() - 1;
+        int halfWin = numNeighbors / 2;
+        int start = Math.max(0, currentPageIndex - halfWin);
+        int end = Math.min(allPages.size() - 1, currentPageIndex + halfWin);
+
+        // Always read from allPages: a static cache keyed by page index would mix up pages of different documents.
+        List<ClassificationPage> nearestPages = new ArrayList<>();
+        for (int i = start; i <= end; i++) {
+            if (i != currentPageIndex) {
+                nearestPages.add(allPages.get(i));
+            }
+        }
+        return nearestPages;
+    }
+
+
+    // Get the last 3 TextBlocks of each page as they are likely to be a footer; pages without TextBlocks are skipped
+    private List<List<AbstractPageBlock>> getFooterCandidates(List<ClassificationPage> pages) {
+
+        List<List<AbstractPageBlock>> footerCandidates = new ArrayList<>();
+        for (ClassificationPage page : pages) {
+            List<AbstractPageBlock> textPageBlocks = page.getTextBlocks().stream()
+                    .filter(TextPageBlock.class::isInstance)
+                    .toList();
+            int blockCount = textPageBlocks.size();
+            if (blockCount > 0) {
+                int start = Math.max(0, blockCount - 3);
+                footerCandidates.add(new ArrayList<>(textPageBlocks.subList(start, blockCount)));
+            }
+        }
+        return footerCandidates;
+    }
+
+
+    // Get the first 3 TextBlocks of each page as they are likely to be a header
+    // NOTE(review): unlike getFooterCandidates, a page without TextBlocks adds an empty list here - confirm intent.
+    private List<List<AbstractPageBlock>> getHeaderCandidates(List<ClassificationPage> pages) {
+
+        List<List<AbstractPageBlock>> headerCandidates = new ArrayList<>();
+        for (ClassificationPage page : pages) {
+            List<AbstractPageBlock> textPageBlocks = page.getTextBlocks().stream()
+                    .filter(TextPageBlock.class::isInstance)
+                    .toList();
+            int count = Math.min(3, textPageBlocks.size());
+            headerCandidates.add(new ArrayList<>(textPageBlocks.subList(0, count)));
+        }
+        return headerCandidates;
+    }
+
+
+    /**
+     * Calculate the Hamming distance between two strings; the shorter one is treated as if padded with '\0' and all
+     * digits (0-9) are treated as the same character '@' since they are a common occurrence in headers/footers.
+     *
+     * @param firstCandidate  First string
+     * @param secondCandidate Second string
+     * @return The number of positions at which the two preprocessed strings differ.
+     */
+    private int calculateHammingDistanceWithPreprocessing(String firstCandidate, String secondCandidate) {
+
+        int maxLength = Math.max(firstCandidate.length(), secondCandidate.length());
+
+        int distance = 0;
+        for (int i = 0; i < maxLength; i++) {
+            if (normalizedCharAt(firstCandidate, i) != normalizedCharAt(secondCandidate, i)) {
+                distance++;
+            }
+        }
+        return distance;
+    }
+
+
+    // Character at the given index with ASCII digits mapped to '@'; '\0' if the index is past the end of the input.
+    private char normalizedCharAt(String input, int index) {
+
+        if (index >= input.length()) {
+            return '\0';
+        }
+        char c = input.charAt(index);
+        return c >= '0' && c <= '9' ? '@' : c;
+    }
+
+}