From d5a4dd4d421b518078468705f2453ceb5e046041 Mon Sep 17 00:00:00 2001 From: Andrei Isvoran Date: Mon, 13 May 2024 14:57:51 +0200 Subject: [PATCH] RED-9149 - Header and footer detection by page-association --- .../DocuMineClassificationService.java | 24 +- .../utils/HeaderFooterDetection.java | 223 ++++++++++++++++++ 2 files changed, 241 insertions(+), 6 deletions(-) create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/HeaderFooterDetection.java diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java index 4c881c6..608e863 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java @@ -12,6 +12,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.utils.HeaderFooterDetection; import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils; import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils; @@ -64,15 +65,26 @@ public class DocuMineClassificationService { return; } if 
(MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) - || PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null - || textBlock.getHighestFontSize() <= document.getFontSizeCounter() - .getMostPopular())) { + || (PositionUtils.isOverBodyTextFrame(bodyTextFrame, + textBlock, + page.getRotation()) && (document.getFontSizeCounter().getMostPopular() + == null + || textBlock.getHighestFontSize() + <= document.getFontSizeCounter() + .getMostPopular())) + || HeaderFooterDetection.isLikelyHeader(textBlock, document, page)) { textBlock.setClassification(PageBlockType.HEADER); } else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER) - || PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null - || textBlock.getHighestFontSize() <= document.getFontSizeCounter() - .getMostPopular())) { + || (PositionUtils.isUnderBodyTextFrame(bodyTextFrame, + textBlock, + page.getRotation()) + && (document.getFontSizeCounter().getMostPopular() + == null + || textBlock.getHighestFontSize() + <= document.getFontSizeCounter() + .getMostPopular())) + || HeaderFooterDetection.isLikelyFooter(textBlock, document, page)) { textBlock.setClassification(PageBlockType.FOOTER); } else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks().size() == 1)) { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/HeaderFooterDetection.java 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/HeaderFooterDetection.java
new file mode 100644
index 0000000..24ed41d
--- /dev/null
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/HeaderFooterDetection.java
@@ -0,0 +1,255 @@
+package com.knecon.fforesight.service.layoutparser.processor.utils;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
+import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
+import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
+import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
+
+import lombok.experimental.UtilityClass;
+
+/**
+ * Detects repeating page headers and footers by page association.
+ * <p>
+ * A text block is compared with the first (header) or last (footer) text blocks of the neighbouring
+ * pages. It counts as header/footer when the weighted, averaged similarity to the blocks found at one
+ * position of those pages exceeds the threshold.
+ * <p>
+ * The class keeps no state between calls, so a result never depends on a previously processed document.
+ */
+@UtilityClass
+public class HeaderFooterDetection {
+
+    // A positional score above this value classifies the block as header/footer.
+    private static final double THRESHOLD = 0.5;
+    // If the document has 1 or 2 pages this may lead to more false positives than actual findings.
+    private static final int MIN_PAGES = 3;
+    // Upper bound for the size of the page window around the current page.
+    private static final int MAX_WINDOW = 8;
+    // Number of text blocks taken from the top/bottom of every neighbouring page.
+    private static final int CANDIDATES_PER_PAGE = 3;
+    // Weight will go from 1.0 to 0.5 because the first element is the most likely to be the header on the page.
+    private static final double[] headerWeights = {1.0, 0.75, 0.5};
+    // Weight will go from 0.5 to 1.0 because the last element is the most likely to be the footer on the page.
+    private static final double[] footerWeights = {0.5, 0.75, 1.0};
+
+
+    /**
+     * Checks whether a text block repeats at the bottom of the neighbouring pages.
+     *
+     * @param textPageBlock      Block to classify.
+     * @param document           Document providing all pages.
+     * @param classificationPage Page the block belongs to.
+     * @return true if the block is likely a footer; always false for documents with fewer than 3 pages.
+     */
+    public boolean isLikelyFooter(TextPageBlock textPageBlock, ClassificationDocument document, ClassificationPage classificationPage) {
+
+        List<ClassificationPage> pages = document.getPages();
+        if (pages.size() < MIN_PAGES) {
+            return false;
+        }
+
+        List<ClassificationPage> nearestPages = findNearestPages(classificationPage, pages, Math.min(pages.size(), MAX_WINDOW));
+        return detectHeadersOrFootersByPageAssociation(textPageBlock.getText(), getFooterCandidates(nearestPages), footerWeights, true);
+    }
+
+
+    /**
+     * Checks whether a text block repeats at the top of the neighbouring pages.
+     *
+     * @param textPageBlock      Block to classify.
+     * @param document           Document providing all pages.
+     * @param classificationPage Page the block belongs to.
+     * @return true if the block is likely a header; always false for documents with fewer than 3 pages.
+     */
+    public boolean isLikelyHeader(TextPageBlock textPageBlock, ClassificationDocument document, ClassificationPage classificationPage) {
+
+        List<ClassificationPage> pages = document.getPages();
+        if (pages.size() < MIN_PAGES) {
+            return false;
+        }
+
+        List<ClassificationPage> nearestPages = findNearestPages(classificationPage, pages, Math.min(pages.size(), MAX_WINDOW));
+        return detectHeadersOrFootersByPageAssociation(textPageBlock.getText(), getHeaderCandidates(nearestPages), headerWeights, false);
+    }
+
+
+    /**
+     * Scores the test string against the candidates of every neighbouring page, position by position.
+     * <p>
+     * Pages with fewer candidates than the longest list are padded with empty strings: at the end for
+     * headers so that the first blocks line up, at the front for footers so that the last blocks line up.
+     * The weights are aligned the same way.
+     *
+     * @param testString Text of the block to classify.
+     * @param candidates Candidate blocks per neighbouring page.
+     * @param weights    Positional weights; positions without a weight count with 1.
+     * @param alignToEnd true to line up the last candidates (footer), false to line up the first ones (header).
+     * @return true as soon as the score of one position exceeds the threshold.
+     */
+    private boolean detectHeadersOrFootersByPageAssociation(String testString, List<List<TextPageBlock>> candidates, double[] weights, boolean alignToEnd) {
+
+        // An empty test string can never match and would turn the similarity normalisation into 0/0 (NaN).
+        if (testString == null || testString.isEmpty() || candidates.isEmpty()) {
+            return false;
+        }
+
+        List<List<String>> candidateTexts = candidates.stream()
+                .map(blocks -> blocks.stream()
+                        .map(AbstractPageBlock::getText)
+                        .toList())
+                .toList();
+        int maxLen = candidateTexts.stream()
+                .mapToInt(List::size)
+                .max()
+                .orElse(0);
+
+        for (int position = 0; position < maxLen; position++) {
+            int weightIndex = alignToEnd ? position + weights.length - maxLen : position;
+            double weight = weightIndex >= 0 && weightIndex < weights.length ? weights[weightIndex] : 1;
+
+            double score = 0.0;
+            for (List<String> pageTexts : candidateTexts) {
+                int index = alignToEnd ? position - (maxLen - pageTexts.size()) : position;
+                String candidate = index >= 0 && index < pageTexts.size() ? pageTexts.get(index) : "";
+                score += calculateSimilarity(testString, candidate) * weight;
+            }
+            score /= candidateTexts.size();
+
+            // Early stop
+            if (score > THRESHOLD) {
+                return true;
+            }
+        }
+
+        return false;
+    }
+
+
+    /**
+     * Calculates the similarity of two strings as 1 - (Hamming distance / length of the longer string).
+     *
+     * @param testString Non-empty text of the block to classify.
+     * @param candidate  Text of a candidate block, may be empty.
+     * @return Similarity between 0.0 (nothing in common) and 1.0 (equal after preprocessing).
+     */
+    private double calculateSimilarity(String testString, String candidate) {
+
+        int testLength = testString.length();
+        int candidateLength = candidate.length();
+        if (testLength >= 5 && candidateLength >= 5 && (testLength > 2 * candidateLength || candidateLength > 2 * testLength)) {
+            // If both strings are at least 5 characters long and one string is more than twice as long as the other,
+            // skip the distance calculation as it's time-consuming, and we can assume they are not similar enough
+            return 0.0;
+        }
+
+        int distance = calculateHammingDistanceWithPreprocessing(testString, candidate);
+        return 1 - (double) distance / Math.max(testLength, candidateLength);
+    }
+
+
+    /**
+     * Find the pages surrounding a given page.
+     * The window reaches numNeighbors / 2 pages in both directions and is cut off at the document boundaries,
+     * so pages near the start or end of the document get fewer neighbours.
+     * For example: with 8 neighbours in a long document, page 6 gets 2, 3, 4, 5, 7, 8, 9, 10 and page 2 gets 1, 3, 4, 5, 6.
+     *
+     * @param currentPage  Current page; its 1-based page number is assumed to match its position in allPages.
+     * @param allPages     All pages in the document.
+     * @param numNeighbors Number of neighbouring pages to find.
+     * @return The nearest pages, without the current page.
+     */
+    private List<ClassificationPage> findNearestPages(ClassificationPage currentPage, List<ClassificationPage> allPages, int numNeighbors) {
+
+        int currentPageIndex = currentPage.getPageNumber() - 1;
+        int halfWin = numNeighbors / 2;
+        int start = Math.max(0, currentPageIndex - halfWin);
+        int end = Math.min(allPages.size() - 1, currentPageIndex + halfWin);
+
+        List<ClassificationPage> nearestPages = new ArrayList<>();
+        for (int i = start; i <= end; i++) {
+            if (i != currentPageIndex) {
+                nearestPages.add(allPages.get(i));
+            }
+        }
+        return nearestPages;
+    }
+
+
+    // Get the last 3 TextBlocks on the page as they are likely to be a footer
+    private List<List<TextPageBlock>> getFooterCandidates(List<ClassificationPage> pages) {
+
+        List<List<TextPageBlock>> footerCandidates = new ArrayList<>();
+        for (ClassificationPage page : pages) {
+            List<TextPageBlock> textPageBlocks = getTextPageBlocks(page);
+            int blockCount = textPageBlocks.size();
+            // Pages without text blocks are left out and therefore do not lower the averaged score.
+            if (blockCount > 0) {
+                int start = Math.max(0, blockCount - CANDIDATES_PER_PAGE);
+                footerCandidates.add(new ArrayList<>(textPageBlocks.subList(start, blockCount)));
+            }
+        }
+        return footerCandidates;
+    }
+
+
+    // Get the first 3 TextBlocks on the page as they are likely to be a header
+    private List<List<TextPageBlock>> getHeaderCandidates(List<ClassificationPage> pages) {
+
+        List<List<TextPageBlock>> headerCandidates = new ArrayList<>();
+        for (ClassificationPage page : pages) {
+            List<TextPageBlock> textPageBlocks = getTextPageBlocks(page);
+            int count = Math.min(CANDIDATES_PER_PAGE, textPageBlocks.size());
+            // NOTE(review): unlike the footer variant, pages without text blocks are kept as empty lists - confirm this is intended.
+            headerCandidates.add(new ArrayList<>(textPageBlocks.subList(0, count)));
+        }
+        return headerCandidates;
+    }
+
+
+    // Keeps only the TextPageBlocks of a page, in the order the page returns them.
+    private List<TextPageBlock> getTextPageBlocks(ClassificationPage page) {
+
+        return page.getTextBlocks()
+                .stream()
+                .filter(TextPageBlock.class::isInstance)
+                .map(TextPageBlock.class::cast)
+                .toList();
+    }
+
+
+    /**
+     * Calculate the Hamming distance between two strings after preprocessing to make them the same length
+     * and replacing all digits with a special character '@' since they are a common occurrence in headers/footers.
+     *
+     * @param firstCandidate  First string
+     * @param secondCandidate Second string
+     * @return The Hamming distance between the two preprocessed strings.
+     */
+    private int calculateHammingDistanceWithPreprocessing(String firstCandidate, String secondCandidate) {
+
+        int maxLength = Math.max(firstCandidate.length(), secondCandidate.length());
+
+        int distance = 0;
+        for (int i = 0; i < maxLength; i++) {
+            if (preprocessedCharAt(firstCandidate, i) != preprocessedCharAt(secondCandidate, i)) {
+                distance++;
+            }
+        }
+        return distance;
+    }
+
+
+    // Returns the character at the index with digits 0-9 mapped to '@', or '\0' as padding past the end of the string.
+    private char preprocessedCharAt(String input, int index) {
+
+        if (index >= input.length()) {
+            return '\0';
+        }
+        char character = input.charAt(index);
+        return character >= '0' && character <= '9' ? '@' : character;
+    }
+
+}