From fda25852d1990d372307674e19578318ec469a5b Mon Sep 17 00:00:00 2001
From: Andrei Isvoran <andrei.isvoran.ext@knecon.com>
Date: Fri, 10 May 2024 15:17:41 +0300
Subject: [PATCH 1/5] RED-9149 - Header and footer extraction by
 page-association

---
 .../DocuMineClassificationService.java        |  25 ++-
 .../utils/HeaderFooterDetection.java          | 180 ++++++++++++++++++
 2 files changed, 199 insertions(+), 6 deletions(-)
 create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/HeaderFooterDetection.java
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java
index f10ac3b..608e863 100644
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java
@@ -12,6 +12,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification
 import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
 import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
 import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
+import com.knecon.fforesight.service.layoutparser.processor.utils.HeaderFooterDetection;
 import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
 import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
 
@@ -49,6 +50,7 @@ public class DocuMineClassificationService {
         }
     }
 
+
     private void classifyBlock(TextPageBlock textBlock, ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes) {
 
         log.debug("headlineFontSizes: {}", headlineFontSizes);
@@ -63,15 +65,26 @@ public class DocuMineClassificationService {
             return;
         }
         if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER)
-            || PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null
-                                                                                                   || textBlock.getHighestFontSize() <= document.getFontSizeCounter()
-                .getMostPopular())) {
+            || (PositionUtils.isOverBodyTextFrame(bodyTextFrame,
+                                                  textBlock,
+                                                  page.getRotation()) && (document.getFontSizeCounter().getMostPopular()
+                                                                          == null
+                                                                          || textBlock.getHighestFontSize()
+                                                                             <= document.getFontSizeCounter()
+                                                                                     .getMostPopular()))
+            || HeaderFooterDetection.isLikelyHeader(textBlock, document, page)) {
             textBlock.setClassification(PageBlockType.HEADER);
 
         } else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER)
-                   || PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null
-                                                                                                           || textBlock.getHighestFontSize() <= document.getFontSizeCounter()
-                .getMostPopular())) {
+                   || (PositionUtils.isUnderBodyTextFrame(bodyTextFrame,
+                                                          textBlock,
+                                                          page.getRotation())
+                       && (document.getFontSizeCounter().getMostPopular()
+                           == null
+                           || textBlock.getHighestFontSize()
+                              <= document.getFontSizeCounter()
+                                      .getMostPopular()))
+                   || HeaderFooterDetection.isLikelyFooter(textBlock, document, page)) {
             textBlock.setClassification(PageBlockType.FOOTER);
         } else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter().getMostPopular()) > 2.5
                                                  && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks().size() == 1)) {
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/HeaderFooterDetection.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/HeaderFooterDetection.java
new file mode 100644
index 0000000..4668195
--- /dev/null
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/HeaderFooterDetection.java
@@ -0,0 +1,180 @@
+package com.knecon.fforesight.service.layoutparser.processor.utils;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+
+import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
+import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
+import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
+import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
+
+import lombok.experimental.UtilityClass;
+
+@UtilityClass
+public class HeaderFooterDetection {
+
+    private final Map<Integer, ClassificationPage> pagesCache = new HashMap<>();
+
+
+    public boolean isLikelyFooter(TextPageBlock textPageBlock, ClassificationDocument document, ClassificationPage classificationPage) {
+
+        int numberOfPages = document.getPages().size();
+        if (numberOfPages < 3) {
+            // If the document has 1 or 2 pages this may lead to more false positives than actual findings.
+            return false;
+        }
+
+        int window = Math.min(numberOfPages, 8);
+
+        List<ClassificationPage> nearestPages = findNearestPages(classificationPage, document.getPages(), window);
+        List<List<AbstractPageBlock>> footerCandidates = getFooterCandidates(nearestPages);
+
+        // Weight will go from 0.5 to 1.0 because the last element is the most likely to be the footer on the page.
+        double[] footerWeights = {0.5, 0.75, 1.0};
+        return detectHeadersOrFootersByPageAssociation(textPageBlock.getText(), footerCandidates, window, footerWeights);
+    }
+
+
+    public boolean isLikelyHeader(TextPageBlock textPageBlock, ClassificationDocument document, ClassificationPage classificationPage) {
+
+        int numberOfPages = document.getPages().size();
+        if (numberOfPages < 3) {
+            // If the document has 1 or 2 pages this may lead to more false positives than actual findings.
+            return false;
+        }
+
+        int window = Math.min(numberOfPages, 8);
+
+        List<ClassificationPage> nearestPages = findNearestPages(classificationPage, document.getPages(), window);
+        List<List<AbstractPageBlock>> headerCandidates = getHeaderCandidates(nearestPages);
+
+        // Weight will go from 1.0 to 0.5 because the first element is the most likely to be the header on the page.
+        double[] headerWeights = {1.0, 0.75, 0.5};
+        return detectHeadersOrFootersByPageAssociation(textPageBlock.getText(), headerCandidates, window, headerWeights);
+    }
+
+
+    private boolean detectHeadersOrFootersByPageAssociation(String testString, List<List<AbstractPageBlock>> candidates, int window, double[] weights) {
+        
+        double highestScore = 0.0;
+
+        for (int i = 0; i < candidates.size(); i++) {
+            List<List<String>> temp = new ArrayList<>();
+            for (int k = Math.max(i - window, 0); k < Math.min(i + window, candidates.size()); k++) {
+                temp.add(candidates.get(k)
+                                 .stream()
+                                 .map(AbstractPageBlock::getText)
+                                 .collect(Collectors.toList()));
+            }
+
+            int maxLen = temp.stream()
+                    .mapToInt(List::size)
+                    .max()
+                    .orElse(0);
+            for (List<String> sublist : temp) {
+                while (sublist.size() < maxLen) {
+                    sublist.add(0, "");
+                }
+            }
+
+            // Compare the testString against each candidates in the window
+            for (int j = 0; j < maxLen; j++) {
+                double score = 0.0;
+                try {
+                    int finalJ = j;
+                    List<String> cmp = temp.stream()
+                            .map(sublist -> sublist.size() > finalJ ? sublist.get(finalJ) : "")
+                            .toList();
+                    for (String cm : cmp) {
+                        score += compare(testString, cm) * (j < weights.length ? weights[j] : 1);
+                    }
+                    score /= cmp.size();
+                } catch (IndexOutOfBoundsException e) {
+                    continue;
+                }
+                highestScore = Math.max(highestScore, score);
+            }
+        }
+
+        return highestScore > 0.5;
+    }
+
+
+    private double compare(String a, String b) {
+
+        int count = 0;
+        a = a.replaceAll("\\d", "@");
+        b = b.replaceAll("\\d", "@");
+
+        for (int i = 0; i < Math.min(a.length(), b.length()); i++) {
+            if (a.charAt(i) == b.charAt(i)) {
+                count++;
+            }
+        }
+        return (double) count / Math.max(a.length(), b.length());
+    }
+
+
+    /**
+     * Find the nearest n pages for a given page.
+     * For example: nearest 8 pages for page 5 are: 1, 2, 3, 4, 6, 7, 8, 9.
+     *
+     * @param currentPage  Current page to find the nearest ones.
+     * @param allPages     All pages in the document.
+     * @param numNeighbors Number of neighbouring pages to find.
+     * @return The nearest pages.
+     */
+    private List<ClassificationPage> findNearestPages(ClassificationPage currentPage, List<ClassificationPage> allPages, int numNeighbors) {
+
+        int totalPages = allPages.size();
+        List<ClassificationPage> nearestPages = new ArrayList<>();
+
+        int currentPageIndex = currentPage.getPageNumber() - 1;
+        int halfWin = numNeighbors / 2;
+        int start = Math.max(0, currentPageIndex - halfWin);
+        int end = Math.min(totalPages - 1, currentPageIndex + halfWin);
+
+        for (int i = start; i <= end; i++) {
+            if (i != currentPageIndex) {
+                nearestPages.add(pagesCache.computeIfAbsent(i, idx -> allPages.get(idx)));
+            }
+        }
+
+        pagesCache.keySet().removeIf(key -> key < start || key > end);
+
+        return nearestPages;
+    }
+
+
+    // Get the last 3 TextBlocks on the page as they are likely to be a footer
+    private List<List<AbstractPageBlock>> getFooterCandidates(List<ClassificationPage> pages) {
+
+        List<List<AbstractPageBlock>> footerCandidates = new ArrayList<>();
+        for (ClassificationPage page : pages) {
+            List<AbstractPageBlock> textBlocks = page.getTextBlocks();
+            int blockCount = textBlocks.size();
+            if (blockCount > 0) {
+                int start = Math.max(0, blockCount - 3);
+                footerCandidates.add(new ArrayList<>(textBlocks.subList(start, blockCount)));
+            }
+        }
+        return footerCandidates;
+    }
+
+
+    // Get the first 3 TextBlocks on the page as they are likely to be a header
+    private List<List<AbstractPageBlock>> getHeaderCandidates(List<ClassificationPage> pages) {
+
+        List<List<AbstractPageBlock>> headerCandidates = new ArrayList<>();
+        for (ClassificationPage page : pages) {
+            List<AbstractPageBlock> textBlocks = page.getTextBlocks();
+            int count = Math.min(3, textBlocks.size());
+            headerCandidates.add(new ArrayList<>(textBlocks.subList(0, count)));
+        }
+        return headerCandidates;
+    }
+
+}

From f1dbcc24a26de9372239096034f1e2304fb5abf3 Mon Sep 17 00:00:00 2001
From: Andrei Isvoran <andrei.isvoran.ext@knecon.com>
Date: Fri, 10 May 2024 15:49:08 +0300
Subject: [PATCH 2/5] RED-9149 - Header and footer extraction by
 page-association

---
 .../processor/utils/HeaderFooterDetection.java | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/HeaderFooterDetection.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/HeaderFooterDetection.java
index 4668195..f11f250 100644
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/HeaderFooterDetection.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/HeaderFooterDetection.java
@@ -58,7 +58,7 @@ public class HeaderFooterDetection {
 
 
     private boolean detectHeadersOrFootersByPageAssociation(String testString, List<List<AbstractPageBlock>> candidates, int window, double[] weights) {
-        
+
         double highestScore = 0.0;
 
         for (int i = 0; i < candidates.size(); i++) {
@@ -80,7 +80,7 @@ public class HeaderFooterDetection {
                 }
             }
 
-            // Compare the testString against each candidates in the window
+            // Compare the testString against each candidate in the window
             for (int j = 0; j < maxLen; j++) {
                 double score = 0.0;
                 try {
@@ -103,18 +103,18 @@ public class HeaderFooterDetection {
     }
 
 
-    private double compare(String a, String b) {
+    private double compare(String candidate1, String candidate2) {
 
         int count = 0;
-        a = a.replaceAll("\\d", "@");
-        b = b.replaceAll("\\d", "@");
+        candidate1 = candidate1.replaceAll("\\d", "@");
+        candidate2 = candidate2.replaceAll("\\d", "@");
 
-        for (int i = 0; i < Math.min(a.length(), b.length()); i++) {
-            if (a.charAt(i) == b.charAt(i)) {
+        for (int i = 0; i < Math.min(candidate1.length(), candidate2.length()); i++) {
+            if (candidate1.charAt(i) == candidate2.charAt(i)) {
                 count++;
             }
         }
-        return (double) count / Math.max(a.length(), b.length());
+        return (double) count / Math.max(candidate1.length(), candidate2.length());
     }
 
 
@@ -139,7 +139,7 @@ public class HeaderFooterDetection {
 
         for (int i = start; i <= end; i++) {
             if (i != currentPageIndex) {
-                nearestPages.add(pagesCache.computeIfAbsent(i, idx -> allPages.get(idx)));
+                nearestPages.add(pagesCache.computeIfAbsent(i, allPages::get));
             }
         }
 

From aeaca2f2781d6069187286e21a6ef3e06cd12cbe Mon Sep 17 00:00:00 2001
From: Andrei Isvoran <andrei.isvoran.ext@knecon.com>
Date: Fri, 10 May 2024 16:04:06 +0300
Subject: [PATCH 3/5] RED-9149 - Header and footer extraction by
 page-association

---
 .../processor/utils/HeaderFooterDetection.java       | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/HeaderFooterDetection.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/HeaderFooterDetection.java
index f11f250..276a6ab 100644
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/HeaderFooterDetection.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/HeaderFooterDetection.java
@@ -103,18 +103,18 @@ public class HeaderFooterDetection {
     }
 
 
-    private double compare(String candidate1, String candidate2) {
+    private double compare(String firstCandidate, String secondCandidate) {
 
         int count = 0;
-        candidate1 = candidate1.replaceAll("\\d", "@");
-        candidate2 = candidate2.replaceAll("\\d", "@");
+        String cleanedFirstCandidate = firstCandidate.replaceAll("\\d", "@");
+        String cleanedSecondCandidate = secondCandidate.replaceAll("\\d", "@");
 
-        for (int i = 0; i < Math.min(candidate1.length(), candidate2.length()); i++) {
-            if (candidate1.charAt(i) == candidate2.charAt(i)) {
+        for (int i = 0; i < Math.min(cleanedFirstCandidate.length(), cleanedSecondCandidate.length()); i++) {
+            if (cleanedFirstCandidate.charAt(i) == cleanedSecondCandidate.charAt(i)) {
                 count++;
             }
         }
-        return (double) count / Math.max(candidate1.length(), candidate2.length());
+        return (double) count / Math.max(cleanedFirstCandidate.length(), cleanedSecondCandidate.length());
     }
 
 

From a76b2ace3fd93d423b48db4d99824fcc8dcbd49f Mon Sep 17 00:00:00 2001
From: Andrei Isvoran <andrei.isvoran.ext@knecon.com>
Date: Mon, 13 May 2024 13:18:33 +0300
Subject: [PATCH 4/5] RED-9149 - Address comments

---
 .../utils/HeaderFooterDetection.java          | 41 ++++++++++---------
 1 file changed, 22 insertions(+), 19 deletions(-)

diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/HeaderFooterDetection.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/HeaderFooterDetection.java
index 276a6ab..be46e96 100644
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/HeaderFooterDetection.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/HeaderFooterDetection.java
@@ -17,6 +17,11 @@ import lombok.experimental.UtilityClass;
 public class HeaderFooterDetection {
 
     private final Map<Integer, ClassificationPage> pagesCache = new HashMap<>();
+    private static final double THRESHOLD = 0.5;
+    // Weight will go from 1.0 to 0.5 because the first element is the most likely to be the header on the page.
+    private static final double[] headerWeights = {1.0, 0.75, 0.5};
+    // Weight will go from 0.5 to 1.0 because the last element is the most likely to be the footer on the page.
+    private static final double[] footerWeights = {0.5, 0.75, 1.0};
 
 
     public boolean isLikelyFooter(TextPageBlock textPageBlock, ClassificationDocument document, ClassificationPage classificationPage) {
@@ -32,8 +37,6 @@ public class HeaderFooterDetection {
         List<ClassificationPage> nearestPages = findNearestPages(classificationPage, document.getPages(), window);
         List<List<AbstractPageBlock>> footerCandidates = getFooterCandidates(nearestPages);
 
-        // Weight will go from 0.5 to 1.0 because the last element is the most likely to be the footer on the page.
-        double[] footerWeights = {0.5, 0.75, 1.0};
         return detectHeadersOrFootersByPageAssociation(textPageBlock.getText(), footerCandidates, window, footerWeights);
     }
 
@@ -51,8 +54,6 @@ public class HeaderFooterDetection {
         List<ClassificationPage> nearestPages = findNearestPages(classificationPage, document.getPages(), window);
         List<List<AbstractPageBlock>> headerCandidates = getHeaderCandidates(nearestPages);
 
-        // Weight will go from 1.0 to 0.5 because the first element is the most likely to be the header on the page.
-        double[] headerWeights = {1.0, 0.75, 0.5};
         return detectHeadersOrFootersByPageAssociation(textPageBlock.getText(), headerCandidates, window, headerWeights);
     }
 
@@ -62,19 +63,19 @@ public class HeaderFooterDetection {
         double highestScore = 0.0;
 
         for (int i = 0; i < candidates.size(); i++) {
-            List<List<String>> temp = new ArrayList<>();
+            List<List<String>> candidateStrings = new ArrayList<>();
             for (int k = Math.max(i - window, 0); k < Math.min(i + window, candidates.size()); k++) {
-                temp.add(candidates.get(k)
-                                 .stream()
-                                 .map(AbstractPageBlock::getText)
-                                 .collect(Collectors.toList()));
+                candidateStrings.add(candidates.get(k)
+                                             .stream()
+                                             .map(AbstractPageBlock::getText)
+                                             .collect(Collectors.toList()));
             }
 
-            int maxLen = temp.stream()
+            int maxLen = candidateStrings.stream()
                     .mapToInt(List::size)
                     .max()
                     .orElse(0);
-            for (List<String> sublist : temp) {
+            for (List<String> sublist : candidateStrings) {
                 while (sublist.size() < maxLen) {
                     sublist.add(0, "");
                 }
@@ -85,13 +86,13 @@ public class HeaderFooterDetection {
                 double score = 0.0;
                 try {
                     int finalJ = j;
-                    List<String> cmp = temp.stream()
+                    List<String> paddedCandidateStrings = candidateStrings.stream()
                             .map(sublist -> sublist.size() > finalJ ? sublist.get(finalJ) : "")
                             .toList();
-                    for (String cm : cmp) {
+                    for (String cm : paddedCandidateStrings) {
                         score += compare(testString, cm) * (j < weights.length ? weights[j] : 1);
                     }
-                    score /= cmp.size();
+                    score /= paddedCandidateStrings.size();
                 } catch (IndexOutOfBoundsException e) {
                     continue;
                 }
@@ -99,7 +100,7 @@ public class HeaderFooterDetection {
             }
         }
 
-        return highestScore > 0.5;
+        return highestScore > THRESHOLD;
     }
 
 
@@ -155,10 +156,11 @@ public class HeaderFooterDetection {
         List<List<AbstractPageBlock>> footerCandidates = new ArrayList<>();
         for (ClassificationPage page : pages) {
             List<AbstractPageBlock> textBlocks = page.getTextBlocks();
-            int blockCount = textBlocks.size();
+            List<TextPageBlock> textPageBlocks = textBlocks.stream().filter(textBlock -> textBlock instanceof TextPageBlock).map(textBlock -> (TextPageBlock) textBlock).toList();
+            int blockCount = textPageBlocks.size();
             if (blockCount > 0) {
                 int start = Math.max(0, blockCount - 3);
-                footerCandidates.add(new ArrayList<>(textBlocks.subList(start, blockCount)));
+                footerCandidates.add(new ArrayList<>(textPageBlocks.subList(start, blockCount)));
             }
         }
         return footerCandidates;
@@ -171,8 +173,9 @@ public class HeaderFooterDetection {
         List<List<AbstractPageBlock>> headerCandidates = new ArrayList<>();
         for (ClassificationPage page : pages) {
             List<AbstractPageBlock> textBlocks = page.getTextBlocks();
-            int count = Math.min(3, textBlocks.size());
-            headerCandidates.add(new ArrayList<>(textBlocks.subList(0, count)));
+            List<TextPageBlock> textPageBlocks = textBlocks.stream().filter(textBlock -> textBlock instanceof TextPageBlock).map(textBlock -> (TextPageBlock) textBlock).toList();
+            int count = Math.min(3, textPageBlocks.size());
+            headerCandidates.add(new ArrayList<>(textPageBlocks.subList(0, count)));
         }
         return headerCandidates;
     }

From 40465e8778f5b1d3d44e2a9a3b176d6927ba81e1 Mon Sep 17 00:00:00 2001
From: Andrei Isvoran <andrei.isvoran.ext@knecon.com>
Date: Mon, 13 May 2024 15:13:37 +0300
Subject: [PATCH 5/5] RED-9149 - Improvements

---
 .../utils/HeaderFooterDetection.java          | 96 +++++++++++++------
 1 file changed, 68 insertions(+), 28 deletions(-)

diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/HeaderFooterDetection.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/HeaderFooterDetection.java
index be46e96..24ed41d 100644
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/HeaderFooterDetection.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/HeaderFooterDetection.java
@@ -84,38 +84,32 @@ public class HeaderFooterDetection {
             // Compare the testString against each candidate in the window
             for (int j = 0; j < maxLen; j++) {
                 double score = 0.0;
-                try {
-                    int finalJ = j;
-                    List<String> paddedCandidateStrings = candidateStrings.stream()
-                            .map(sublist -> sublist.size() > finalJ ? sublist.get(finalJ) : "")
-                            .toList();
-                    for (String cm : paddedCandidateStrings) {
-                        score += compare(testString, cm) * (j < weights.length ? weights[j] : 1);
+                int finalJ = j;
+                List<String> paddedCandidateStrings = candidateStrings.stream()
+                        .map(sublist -> sublist.size() > finalJ ? sublist.get(finalJ) : "")
+                        .toList();
+                for (String paddedString : paddedCandidateStrings) {
+                    if ((testString.length() >= 5 && paddedString.length() >= 5) && (testString.length() > 2 * paddedString.length()
+                                                                                     || paddedString.length() > 2 * testString.length())) {
+                        // If both strings are at least 5 characters long and one string is more than twice as long as the other,
+                        // skip the distance calculation as it's time-consuming, and we can assume they are not similar enough
+                        continue;
                     }
-                    score /= paddedCandidateStrings.size();
-                } catch (IndexOutOfBoundsException e) {
-                    continue;
+                    // Both strings empty -> longest is 0: score the pair 0 instead of computing 0/0 (NaN).
+                    int longest = Math.max(testString.length(), paddedString.length());
+                    int distance = calculateHammingDistanceWithPreprocessing(testString, paddedString);
+                    score += (longest == 0 ? 0.0 : 1 - (double) distance / longest) * (j < weights.length ? weights[j] : 1);
                 }
+                score /= paddedCandidateStrings.size();
                 highestScore = Math.max(highestScore, score);
+                // Early stop
+                if (highestScore > THRESHOLD) {
+                    return true;
+                }
             }
         }
 
-        return highestScore > THRESHOLD;
-    }
-
-
-    private double compare(String firstCandidate, String secondCandidate) {
-
-        int count = 0;
-        String cleanedFirstCandidate = firstCandidate.replaceAll("\\d", "@");
-        String cleanedSecondCandidate = secondCandidate.replaceAll("\\d", "@");
-
-        for (int i = 0; i < Math.min(cleanedFirstCandidate.length(), cleanedSecondCandidate.length()); i++) {
-            if (cleanedFirstCandidate.charAt(i) == cleanedSecondCandidate.charAt(i)) {
-                count++;
-            }
-        }
-        return (double) count / Math.max(cleanedFirstCandidate.length(), cleanedSecondCandidate.length());
+        return false;
     }
 
 
@@ -156,7 +150,10 @@ public class HeaderFooterDetection {
         List<List<AbstractPageBlock>> footerCandidates = new ArrayList<>();
         for (ClassificationPage page : pages) {
             List<AbstractPageBlock> textBlocks = page.getTextBlocks();
-            List<TextPageBlock> textPageBlocks = textBlocks.stream().filter(textBlock -> textBlock instanceof TextPageBlock).map(textBlock -> (TextPageBlock) textBlock).toList();
+            List<TextPageBlock> textPageBlocks = textBlocks.stream()
+                    .filter(textBlock -> textBlock instanceof TextPageBlock)
+                    .map(textBlock -> (TextPageBlock) textBlock)
+                    .toList();
             int blockCount = textPageBlocks.size();
             if (blockCount > 0) {
                 int start = Math.max(0, blockCount - 3);
@@ -173,11 +170,54 @@ public class HeaderFooterDetection {
         List<List<AbstractPageBlock>> headerCandidates = new ArrayList<>();
         for (ClassificationPage page : pages) {
             List<AbstractPageBlock> textBlocks = page.getTextBlocks();
-            List<TextPageBlock> textPageBlocks = textBlocks.stream().filter(textBlock -> textBlock instanceof TextPageBlock).map(textBlock -> (TextPageBlock) textBlock).toList();
+            List<TextPageBlock> textPageBlocks = textBlocks.stream()
+                    .filter(textBlock -> textBlock instanceof TextPageBlock)
+                    .map(textBlock -> (TextPageBlock) textBlock)
+                    .toList();
             int count = Math.min(3, textPageBlocks.size());
             headerCandidates.add(new ArrayList<>(textPageBlocks.subList(0, count)));
         }
         return headerCandidates;
     }
 
+
+    /**
+     * Calculate the Hamming distance between two strings. Every ASCII digit counts as the same character
+     * ('@') since digits are a common occurrence in headers/footers, and the shorter string is treated as
+     * if it were padded with '\0' up to the length of the longer one.
+     *
+     * @param firstCandidate  First string
+     * @param secondCandidate Second string
+     * @return The number of positions at which the two preprocessed strings differ.
+     */
+    private int calculateHammingDistanceWithPreprocessing(String firstCandidate, String secondCandidate) {
+
+        int maxLength = Math.max(firstCandidate.length(), secondCandidate.length());
+
+        // Compare in place rather than on padded, regex-replaced copies: this runs once per block/candidate
+        // pair and String.replaceAll compiles its pattern again on every call.
+        int distance = 0;
+        for (int i = 0; i < maxLength; i++) {
+            if (preprocessedCharAt(firstCandidate, i) != preprocessedCharAt(secondCandidate, i)) {
+                distance++;
+            }
+        }
+        return distance;
+    }
+
+
+    /**
+     * Read one character the way the distance calculation sees it.
+     *
+     * @return '\0' past the end of the input, '@' for an ASCII digit, otherwise the character itself.
+     */
+    private char preprocessedCharAt(String input, int index) {
+
+        if (index >= input.length()) {
+            return '\0';
+        }
+        char current = input.charAt(index);
+        return current >= '0' && current <= '9' ? '@' : current;
+    }
+
 }