From fda25852d1990d372307674e19578318ec469a5b Mon Sep 17 00:00:00 2001 From: Andrei Isvoran Date: Fri, 10 May 2024 15:17:41 +0300 Subject: [PATCH 1/5] RED-9149 - Header and footer extraction by page-association --- .../DocuMineClassificationService.java | 25 ++- .../utils/HeaderFooterDetection.java | 180 ++++++++++++++++++ 2 files changed, 199 insertions(+), 6 deletions(-) create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/HeaderFooterDetection.java diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java index f10ac3b..608e863 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java @@ -12,6 +12,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.utils.HeaderFooterDetection; import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils; import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils; @@ -49,6 +50,7 @@ public class DocuMineClassificationService { } } + private void classifyBlock(TextPageBlock textBlock, ClassificationPage page, ClassificationDocument document, List headlineFontSizes) { log.debug("headlineFontSizes: {}", headlineFontSizes); @@ -63,15 +65,26 @@ public class DocuMineClassificationService { return; } if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) - || PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null - || textBlock.getHighestFontSize() <= document.getFontSizeCounter() - .getMostPopular())) { + || (PositionUtils.isOverBodyTextFrame(bodyTextFrame, + textBlock, + page.getRotation()) && (document.getFontSizeCounter().getMostPopular() + == null + || textBlock.getHighestFontSize() + <= document.getFontSizeCounter() + .getMostPopular())) + || HeaderFooterDetection.isLikelyHeader(textBlock, document, page)) { textBlock.setClassification(PageBlockType.HEADER); } else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER) - || PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null - || textBlock.getHighestFontSize() <= document.getFontSizeCounter() - .getMostPopular())) { + || (PositionUtils.isUnderBodyTextFrame(bodyTextFrame, + textBlock, + page.getRotation()) + && (document.getFontSizeCounter().getMostPopular() + == null + || textBlock.getHighestFontSize() + <= document.getFontSizeCounter() + .getMostPopular())) + || HeaderFooterDetection.isLikelyFooter(textBlock, document, page)) { textBlock.setClassification(PageBlockType.FOOTER); } else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks().size() == 1)) { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/HeaderFooterDetection.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/HeaderFooterDetection.java new file mode 100644 index 0000000..4668195 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/HeaderFooterDetection.java @@ -0,0 +1,180 @@ +package com.knecon.fforesight.service.layoutparser.processor.utils; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument; +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; + +import lombok.experimental.UtilityClass; + +@UtilityClass +public class HeaderFooterDetection { + + private final Map pagesCache = new HashMap<>(); + + + public boolean isLikelyFooter(TextPageBlock textPageBlock, ClassificationDocument document, ClassificationPage classificationPage) { + + int numberOfPages = document.getPages().size(); + if (numberOfPages < 3) { + // If the document has 1 or 2 pages this may lead to more false positives than actual findings. + return false; + } + + int window = Math.min(numberOfPages, 8); + + List nearestPages = findNearestPages(classificationPage, document.getPages(), window); + List> footerCandidates = getFooterCandidates(nearestPages); + + // Weight will go from 0.5 to 1.0 because the last element is the most likely to be the footer on the page. + double[] footerWeights = {0.5, 0.75, 1.0}; + return detectHeadersOrFootersByPageAssociation(textPageBlock.getText(), footerCandidates, window, footerWeights); + } + + + public boolean isLikelyHeader(TextPageBlock textPageBlock, ClassificationDocument document, ClassificationPage classificationPage) { + + int numberOfPages = document.getPages().size(); + if (numberOfPages < 3) { + // If the document has 1 or 2 pages this may lead to more false positives than actual findings. + return false; + } + + int window = Math.min(numberOfPages, 8); + + List nearestPages = findNearestPages(classificationPage, document.getPages(), window); + List> headerCandidates = getHeaderCandidates(nearestPages); + + // Weight will go from 1.0 to 0.5 because the first element is the most likely to be the header on the page. + double[] headerWeights = {1.0, 0.75, 0.5}; + return detectHeadersOrFootersByPageAssociation(textPageBlock.getText(), headerCandidates, window, headerWeights); + } + + + private boolean detectHeadersOrFootersByPageAssociation(String testString, List> candidates, int window, double[] weights) { + + double highestScore = 0.0; + + for (int i = 0; i < candidates.size(); i++) { + List> temp = new ArrayList<>(); + for (int k = Math.max(i - window, 0); k < Math.min(i + window, candidates.size()); k++) { + temp.add(candidates.get(k) + .stream() + .map(AbstractPageBlock::getText) + .collect(Collectors.toList())); + } + + int maxLen = temp.stream() + .mapToInt(List::size) + .max() + .orElse(0); + for (List sublist : temp) { + while (sublist.size() < maxLen) { + sublist.add(0, ""); + } + } + + // Compare the testString against each candidates in the window + for (int j = 0; j < maxLen; j++) { + double score = 0.0; + try { + int finalJ = j; + List cmp = temp.stream() + .map(sublist -> sublist.size() > finalJ ? sublist.get(finalJ) : "") + .toList(); + for (String cm : cmp) { + score += compare(testString, cm) * (j < weights.length ? weights[j] : 1); + } + score /= cmp.size(); + } catch (IndexOutOfBoundsException e) { + continue; + } + highestScore = Math.max(highestScore, score); + } + } + + return highestScore > 0.5; + } + + + private double compare(String a, String b) { + + int count = 0; + a = a.replaceAll("\\d", "@"); + b = b.replaceAll("\\d", "@"); + + for (int i = 0; i < Math.min(a.length(), b.length()); i++) { + if (a.charAt(i) == b.charAt(i)) { + count++; + } + } + return (double) count / Math.max(a.length(), b.length()); + } + + + /** + * Find the nearest n pages for a given page. + * For example: nearest 8 pages for page 4 are: 1, 2, 3, 5, 6, 7, 8, 9. + * + * @param currentPage Current page to find the nearest ones. + * @param allPages All pages in the document. + * @param numNeighbors Number of neighbouring pages to find. + * @return The nearest pages. + */ + private List findNearestPages(ClassificationPage currentPage, List allPages, int numNeighbors) { + + int totalPages = allPages.size(); + List nearestPages = new ArrayList<>(); + + int currentPageIndex = currentPage.getPageNumber() - 1; + int halfWin = numNeighbors / 2; + int start = Math.max(0, currentPageIndex - halfWin); + int end = Math.min(totalPages - 1, currentPageIndex + halfWin); + + for (int i = start; i <= end; i++) { + if (i != currentPageIndex) { + nearestPages.add(pagesCache.computeIfAbsent(i, idx -> allPages.get(idx))); + } + } + + pagesCache.keySet().removeIf(key -> key < start || key > end); + + return nearestPages; + } + + + // Get the last 3 TextBlocks on the page as they are likely to be a footer + private List> getFooterCandidates(List pages) { + + List> footerCandidates = new ArrayList<>(); + for (ClassificationPage page : pages) { + List textBlocks = page.getTextBlocks(); + int blockCount = textBlocks.size(); + if (blockCount > 0) { + int start = Math.max(0, blockCount - 3); + footerCandidates.add(new ArrayList<>(textBlocks.subList(start, blockCount))); + } + } + return footerCandidates; + } + + + // Get the first 3 TextBlocks on the page as they are likely to be a header + private List> getHeaderCandidates(List pages) { + + List> headerCandidates = new ArrayList<>(); + for (ClassificationPage page : pages) { + List textBlocks = page.getTextBlocks(); + int count = Math.min(3, textBlocks.size()); + headerCandidates.add(new ArrayList<>(textBlocks.subList(0, count))); + } + return headerCandidates; + } + +} From f1dbcc24a26de9372239096034f1e2304fb5abf3 Mon Sep 17 00:00:00 2001 From: Andrei Isvoran Date: Fri, 10 May 2024 15:49:08 +0300 Subject: [PATCH 2/5] RED-9149 - Header and footer extraction by page-association --- .../processor/utils/HeaderFooterDetection.java | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/HeaderFooterDetection.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/HeaderFooterDetection.java index 4668195..f11f250 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/HeaderFooterDetection.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/HeaderFooterDetection.java @@ -58,7 +58,7 @@ public class HeaderFooterDetection { private boolean detectHeadersOrFootersByPageAssociation(String testString, List> candidates, int window, double[] weights) { - + double highestScore = 0.0; for (int i = 0; i < candidates.size(); i++) { @@ -80,7 +80,7 @@ public class HeaderFooterDetection { } } - // Compare the testString against each candidates in the window + // Compare the testString against each candidate in the window for (int j = 0; j < maxLen; j++) { double score = 0.0; try { @@ -103,18 +103,18 @@ public class HeaderFooterDetection { } - private double compare(String a, String b) { + private double compare(String candidate1, String candidate2) { int count = 0; - a = a.replaceAll("\\d", "@"); - b = b.replaceAll("\\d", "@"); + candidate1 = candidate1.replaceAll("\\d", "@"); + candidate2 = candidate2.replaceAll("\\d", "@"); - for (int i = 0; i < Math.min(a.length(), b.length()); i++) { - if (a.charAt(i) == b.charAt(i)) { + for (int i = 0; i < Math.min(candidate1.length(), candidate2.length()); i++) { + if (candidate1.charAt(i) == candidate2.charAt(i)) { count++; } } - return (double) count / Math.max(a.length(), b.length()); + return (double) count / Math.max(candidate1.length(), candidate2.length()); } @@ -139,7 +139,7 @@ public class HeaderFooterDetection { for (int i = start; i <= end; i++) { if (i != currentPageIndex) { - nearestPages.add(pagesCache.computeIfAbsent(i, idx -> allPages.get(idx))); + nearestPages.add(pagesCache.computeIfAbsent(i, allPages::get)); } } From aeaca2f2781d6069187286e21a6ef3e06cd12cbe Mon Sep 17 00:00:00 2001 From: Andrei Isvoran Date: Fri, 10 May 2024 16:04:06 +0300 Subject: [PATCH 3/5] RED-9149 - Header and footer extraction by page-association --- .../processor/utils/HeaderFooterDetection.java | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/HeaderFooterDetection.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/HeaderFooterDetection.java index f11f250..276a6ab 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/HeaderFooterDetection.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/HeaderFooterDetection.java @@ -103,18 +103,18 @@ public class HeaderFooterDetection { } - private double compare(String candidate1, String candidate2) { + private double compare(String firstCandidate, String secondCandidate) { int count = 0; - candidate1 = candidate1.replaceAll("\\d", "@"); - candidate2 = candidate2.replaceAll("\\d", "@"); + String cleanedFirstCandidate = firstCandidate.replaceAll("\\d", "@"); + String cleanedSecondCandidate = secondCandidate.replaceAll("\\d", "@"); - for (int i = 0; i < Math.min(candidate1.length(), candidate2.length()); i++) { - if (candidate1.charAt(i) == candidate2.charAt(i)) { + for (int i = 0; i < Math.min(cleanedFirstCandidate.length(), cleanedSecondCandidate.length()); i++) { + if (cleanedFirstCandidate.charAt(i) == cleanedSecondCandidate.charAt(i)) { count++; } } - return (double) count / Math.max(candidate1.length(), candidate2.length()); + return (double) count / Math.max(cleanedFirstCandidate.length(), cleanedSecondCandidate.length()); } From a76b2ace3fd93d423b48db4d99824fcc8dcbd49f Mon Sep 17 00:00:00 2001 From: Andrei Isvoran Date: Mon, 13 May 2024 13:18:33 +0300 Subject: [PATCH 4/5] RED-9149 - Address comments --- .../utils/HeaderFooterDetection.java | 41 ++++++++++--------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/HeaderFooterDetection.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/HeaderFooterDetection.java index 276a6ab..be46e96 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/HeaderFooterDetection.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/HeaderFooterDetection.java @@ -17,6 +17,11 @@ import lombok.experimental.UtilityClass; public class HeaderFooterDetection { private final Map pagesCache = new HashMap<>(); + private static final double THRESHOLD = 0.5; + // Weight will go from 1.0 to 0.5 because the first element is the most likely to be the header on the page. + private static final double[] headerWeights = {1.0, 0.75, 0.5}; + // Weight will go from 0.5 to 1.0 because the last element is the most likely to be the footer on the page. + private static final double[] footerWeights = {0.5, 0.75, 1.0}; public boolean isLikelyFooter(TextPageBlock textPageBlock, ClassificationDocument document, ClassificationPage classificationPage) { @@ -32,8 +37,6 @@ public class HeaderFooterDetection { List nearestPages = findNearestPages(classificationPage, document.getPages(), window); List> footerCandidates = getFooterCandidates(nearestPages); - // Weight will go from 0.5 to 1.0 because the last element is the most likely to be the footer on the page. - double[] footerWeights = {0.5, 0.75, 1.0}; return detectHeadersOrFootersByPageAssociation(textPageBlock.getText(), footerCandidates, window, footerWeights); } @@ -51,8 +54,6 @@ public class HeaderFooterDetection { List nearestPages = findNearestPages(classificationPage, document.getPages(), window); List> headerCandidates = getHeaderCandidates(nearestPages); - // Weight will go from 1.0 to 0.5 because the first element is the most likely to be the header on the page. - double[] headerWeights = {1.0, 0.75, 0.5}; return detectHeadersOrFootersByPageAssociation(textPageBlock.getText(), headerCandidates, window, headerWeights); } @@ -62,19 +63,19 @@ public class HeaderFooterDetection { double highestScore = 0.0; for (int i = 0; i < candidates.size(); i++) { - List> temp = new ArrayList<>(); + List> candidateStrings = new ArrayList<>(); for (int k = Math.max(i - window, 0); k < Math.min(i + window, candidates.size()); k++) { - temp.add(candidates.get(k) - .stream() - .map(AbstractPageBlock::getText) - .collect(Collectors.toList())); + candidateStrings.add(candidates.get(k) + .stream() + .map(AbstractPageBlock::getText) + .collect(Collectors.toList())); } - int maxLen = temp.stream() + int maxLen = candidateStrings.stream() .mapToInt(List::size) .max() .orElse(0); - for (List sublist : temp) { + for (List sublist : candidateStrings) { while (sublist.size() < maxLen) { sublist.add(0, ""); } @@ -85,13 +86,13 @@ public class HeaderFooterDetection { double score = 0.0; try { int finalJ = j; - List cmp = temp.stream() + List paddedCandidateStrings = candidateStrings.stream() .map(sublist -> sublist.size() > finalJ ? sublist.get(finalJ) : "") .toList(); - for (String cm : cmp) { + for (String cm : paddedCandidateStrings) { score += compare(testString, cm) * (j < weights.length ? weights[j] : 1); } - score /= cmp.size(); + score /= paddedCandidateStrings.size(); } catch (IndexOutOfBoundsException e) { continue; } @@ -99,7 +100,7 @@ public class HeaderFooterDetection { } } - return highestScore > 0.5; + return highestScore > THRESHOLD; } @@ -155,10 +156,11 @@ public class HeaderFooterDetection { List> footerCandidates = new ArrayList<>(); for (ClassificationPage page : pages) { List textBlocks = page.getTextBlocks(); - int blockCount = textBlocks.size(); + List textPageBlocks = textBlocks.stream().filter(textBlock -> textBlock instanceof TextPageBlock).map(textBlock -> (TextPageBlock) textBlock).toList(); + int blockCount = textPageBlocks.size(); if (blockCount > 0) { int start = Math.max(0, blockCount - 3); - footerCandidates.add(new ArrayList<>(textBlocks.subList(start, blockCount))); + footerCandidates.add(new ArrayList<>(textPageBlocks.subList(start, blockCount))); } } return footerCandidates; @@ -171,8 +173,9 @@ public class HeaderFooterDetection { List> headerCandidates = new ArrayList<>(); for (ClassificationPage page : pages) { List textBlocks = page.getTextBlocks(); - int count = Math.min(3, textBlocks.size()); - headerCandidates.add(new ArrayList<>(textBlocks.subList(0, count))); + List textPageBlocks = textBlocks.stream().filter(textBlock -> textBlock instanceof TextPageBlock).map(textBlock -> (TextPageBlock) textBlock).toList(); + int count = Math.min(3, textPageBlocks.size()); + headerCandidates.add(new ArrayList<>(textPageBlocks.subList(0, count))); } return headerCandidates; } From 40465e8778f5b1d3d44e2a9a3b176d6927ba81e1 Mon Sep 17 00:00:00 2001 From: Andrei Isvoran Date: Mon, 13 May 2024 15:13:37 +0300 Subject: [PATCH 5/5] RED-9149 - Improvements --- .../utils/HeaderFooterDetection.java | 96 +++++++++++++------ 1 file changed, 68 insertions(+), 28 deletions(-) diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/HeaderFooterDetection.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/HeaderFooterDetection.java index be46e96..24ed41d 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/HeaderFooterDetection.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/HeaderFooterDetection.java @@ -84,38 +84,32 @@ public class HeaderFooterDetection { // Compare the testString against each candidate in the window for (int j = 0; j < maxLen; j++) { double score = 0.0; - try { - int finalJ = j; - List paddedCandidateStrings = candidateStrings.stream() - .map(sublist -> sublist.size() > finalJ ? sublist.get(finalJ) : "") - .toList(); - for (String cm : paddedCandidateStrings) { - score += compare(testString, cm) * (j < weights.length ? weights[j] : 1); + int finalJ = j; + List paddedCandidateStrings = candidateStrings.stream() + .map(sublist -> sublist.size() > finalJ ? sublist.get(finalJ) : "") + .toList(); + for (String paddedString : paddedCandidateStrings) { + if ((testString.length() >= 5 && paddedString.length() >= 5) && (testString.length() > 2 * paddedString.length() + || paddedString.length() > 2 * testString.length())) { + // If both strings are at least 5 characters long and one string is more than twice as long as the other, + // skip the distance calculation as it's time-consuming, and we can assume they are not similar enough + continue; } - score /= paddedCandidateStrings.size(); - } catch (IndexOutOfBoundsException e) { - continue; + + int distance = calculateHammingDistanceWithPreprocessing(testString, paddedString); + double normalizedScore = 1 - (double) distance / Math.max(testString.length(), paddedString.length()); + score += normalizedScore * (j < weights.length ? weights[j] : 1); } + score /= paddedCandidateStrings.size(); highestScore = Math.max(highestScore, score); + // Early stop + if (highestScore > THRESHOLD) { + return true; + } } } - return highestScore > THRESHOLD; - } - - - private double compare(String firstCandidate, String secondCandidate) { - - int count = 0; - String cleanedFirstCandidate = firstCandidate.replaceAll("\\d", "@"); - String cleanedSecondCandidate = secondCandidate.replaceAll("\\d", "@"); - - for (int i = 0; i < Math.min(cleanedFirstCandidate.length(), cleanedSecondCandidate.length()); i++) { - if (cleanedFirstCandidate.charAt(i) == cleanedSecondCandidate.charAt(i)) { - count++; - } - } - return (double) count / Math.max(cleanedFirstCandidate.length(), cleanedSecondCandidate.length()); + return false; } @@ -156,7 +150,10 @@ public class HeaderFooterDetection { List> footerCandidates = new ArrayList<>(); for (ClassificationPage page : pages) { List textBlocks = page.getTextBlocks(); - List textPageBlocks = textBlocks.stream().filter(textBlock -> textBlock instanceof TextPageBlock).map(textBlock -> (TextPageBlock) textBlock).toList(); + List textPageBlocks = textBlocks.stream() + .filter(textBlock -> textBlock instanceof TextPageBlock) + .map(textBlock -> (TextPageBlock) textBlock) + .toList(); int blockCount = textPageBlocks.size(); if (blockCount > 0) { int start = Math.max(0, blockCount - 3); @@ -173,11 +170,54 @@ public class HeaderFooterDetection { List> headerCandidates = new ArrayList<>(); for (ClassificationPage page : pages) { List textBlocks = page.getTextBlocks(); - List textPageBlocks = textBlocks.stream().filter(textBlock -> textBlock instanceof TextPageBlock).map(textBlock -> (TextPageBlock) textBlock).toList(); + List textPageBlocks = textBlocks.stream() + .filter(textBlock -> textBlock instanceof TextPageBlock) + .map(textBlock -> (TextPageBlock) textBlock) + .toList(); int count = Math.min(3, textPageBlocks.size()); headerCandidates.add(new ArrayList<>(textPageBlocks.subList(0, count))); } return headerCandidates; } + + /** + * Calculate the Hamming distance between two strings after preprocessing to make them the same length + * and replacing all digits with a special character '@' since they are a common occurrence in headers/footers. + * + * @param firstCandidate First string + * @param secondCandidate Second string + * @return The Hamming distance between the two preprocessed strings. + */ + private int calculateHammingDistanceWithPreprocessing(String firstCandidate, String secondCandidate) { + + int maxLength = Math.max(firstCandidate.length(), secondCandidate.length()); + + String cleanFirstCandidate = padString(firstCandidate, maxLength, '\0').replaceAll("\\d", "@"); + String cleanSecondCandidate = padString(secondCandidate, maxLength, '\0').replaceAll("\\d", "@"); + + int distance = 0; + for (int i = 0; i < maxLength; i++) { + if (cleanFirstCandidate.charAt(i) != cleanSecondCandidate.charAt(i)) { + distance++; + } + } + return distance; + } + + + private String padString(String input, int length, char padChar) { + + if (input.length() >= length) { + return input; + } + + StringBuilder sb = new StringBuilder(input); + + while (sb.length() < length) { + sb.append(padChar); + } + return sb.toString(); + } + }