From 40465e8778f5b1d3d44e2a9a3b176d6927ba81e1 Mon Sep 17 00:00:00 2001 From: Andrei Isvoran Date: Mon, 13 May 2024 15:13:37 +0300 Subject: [PATCH] RED-9149 - Improvements --- .../utils/HeaderFooterDetection.java | 96 +++++++++++++------ 1 file changed, 68 insertions(+), 28 deletions(-) diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/HeaderFooterDetection.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/HeaderFooterDetection.java index be46e96..24ed41d 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/HeaderFooterDetection.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/HeaderFooterDetection.java @@ -84,38 +84,32 @@ public class HeaderFooterDetection { // Compare the testString against each candidate in the window for (int j = 0; j < maxLen; j++) { double score = 0.0; - try { - int finalJ = j; - List paddedCandidateStrings = candidateStrings.stream() - .map(sublist -> sublist.size() > finalJ ? sublist.get(finalJ) : "") - .toList(); - for (String cm : paddedCandidateStrings) { - score += compare(testString, cm) * (j < weights.length ? weights[j] : 1); + int finalJ = j; + List paddedCandidateStrings = candidateStrings.stream() + .map(sublist -> sublist.size() > finalJ ? sublist.get(finalJ) : "") + .toList(); + for (String paddedString : paddedCandidateStrings) { + if ((testString.length() >= 5 && paddedString.length() >= 5) && (testString.length() > 2 * paddedString.length() + || paddedString.length() > 2 * testString.length())) { + // If both strings are at least 5 characters long and one string is more than twice as long as the other, + // skip the distance calculation as it's time-consuming, and we can assume they are not similar enough + continue; } - score /= paddedCandidateStrings.size(); - } catch (IndexOutOfBoundsException e) { - continue; + + int distance = calculateHammingDistanceWithPreprocessing(testString, paddedString); + double normalizedScore = 1 - (double) distance / Math.max(testString.length(), paddedString.length()); + score += normalizedScore * (j < weights.length ? weights[j] : 1); } + score /= paddedCandidateStrings.size(); highestScore = Math.max(highestScore, score); + // Early stop + if (highestScore > THRESHOLD) { + return true; + } } } - return highestScore > THRESHOLD; - } - - - private double compare(String firstCandidate, String secondCandidate) { - - int count = 0; - String cleanedFirstCandidate = firstCandidate.replaceAll("\\d", "@"); - String cleanedSecondCandidate = secondCandidate.replaceAll("\\d", "@"); - - for (int i = 0; i < Math.min(cleanedFirstCandidate.length(), cleanedSecondCandidate.length()); i++) { - if (cleanedFirstCandidate.charAt(i) == cleanedSecondCandidate.charAt(i)) { - count++; - } - } - return (double) count / Math.max(cleanedFirstCandidate.length(), cleanedSecondCandidate.length()); + return false; } @@ -156,7 +150,10 @@ public class HeaderFooterDetection { List> footerCandidates = new ArrayList<>(); for (ClassificationPage page : pages) { List textBlocks = page.getTextBlocks(); - List textPageBlocks = textBlocks.stream().filter(textBlock -> textBlock instanceof TextPageBlock).map(textBlock -> (TextPageBlock) textBlock).toList(); + List textPageBlocks = textBlocks.stream() + .filter(textBlock -> textBlock instanceof TextPageBlock) + .map(textBlock -> (TextPageBlock) textBlock) + .toList(); int blockCount = textPageBlocks.size(); if (blockCount > 0) { int start = Math.max(0, blockCount - 3); @@ -173,11 +170,54 @@ public class HeaderFooterDetection { List> headerCandidates = new ArrayList<>(); for (ClassificationPage page : pages) { List textBlocks = page.getTextBlocks(); - List textPageBlocks = textBlocks.stream().filter(textBlock -> textBlock instanceof TextPageBlock).map(textBlock -> (TextPageBlock) textBlock).toList(); + List textPageBlocks = textBlocks.stream() + .filter(textBlock -> textBlock instanceof TextPageBlock) + .map(textBlock -> (TextPageBlock) textBlock) + .toList(); int count = Math.min(3, textPageBlocks.size()); headerCandidates.add(new ArrayList<>(textPageBlocks.subList(0, count))); } return headerCandidates; } + + /** + * Calculate the Hamming distance between two strings after preprocessing to make them the same length + * and replacing all digits with a special character '@' since they are a common occurrence in headers/footers. + * + * @param firstCandidate First string + * @param secondCandidate Second string + * @return The Hamming distance between the two preprocessed strings. + */ + private int calculateHammingDistanceWithPreprocessing(String firstCandidate, String secondCandidate) { + + int maxLength = Math.max(firstCandidate.length(), secondCandidate.length()); + + String cleanFirstCandidate = padString(firstCandidate, maxLength, '\0').replaceAll("\\d", "@"); + String cleanSecondCandidate = padString(secondCandidate, maxLength, '\0').replaceAll("\\d", "@"); + + int distance = 0; + for (int i = 0; i < maxLength; i++) { + if (cleanFirstCandidate.charAt(i) != cleanSecondCandidate.charAt(i)) { + distance++; + } + } + return distance; + } + + + private String padString(String input, int length, char padChar) { + + if (input.length() >= length) { + return input; + } + + StringBuilder sb = new StringBuilder(input); + + while (sb.length() < length) { + sb.append(padChar); + } + return sb.toString(); + } + }