RED-9149 - Improvements
This commit is contained in:
parent
a76b2ace3f
commit
40465e8778
@ -84,38 +84,32 @@ public class HeaderFooterDetection {
|
||||
// Compare the testString against each candidate in the window
|
||||
for (int j = 0; j < maxLen; j++) {
|
||||
double score = 0.0;
|
||||
try {
|
||||
int finalJ = j;
|
||||
List<String> paddedCandidateStrings = candidateStrings.stream()
|
||||
.map(sublist -> sublist.size() > finalJ ? sublist.get(finalJ) : "")
|
||||
.toList();
|
||||
for (String cm : paddedCandidateStrings) {
|
||||
score += compare(testString, cm) * (j < weights.length ? weights[j] : 1);
|
||||
int finalJ = j;
|
||||
List<String> paddedCandidateStrings = candidateStrings.stream()
|
||||
.map(sublist -> sublist.size() > finalJ ? sublist.get(finalJ) : "")
|
||||
.toList();
|
||||
for (String paddedString : paddedCandidateStrings) {
|
||||
if ((testString.length() >= 5 && paddedString.length() >= 5) && (testString.length() > 2 * paddedString.length()
|
||||
|| paddedString.length() > 2 * testString.length())) {
|
||||
// If both strings are at least 5 characters long and one string is more than twice as long as the other,
|
||||
// skip the distance calculation as it's time-consuming, and we can assume they are not similar enough
|
||||
continue;
|
||||
}
|
||||
score /= paddedCandidateStrings.size();
|
||||
} catch (IndexOutOfBoundsException e) {
|
||||
continue;
|
||||
|
||||
int distance = calculateHammingDistanceWithPreprocessing(testString, paddedString);
|
||||
double normalizedScore = 1 - (double) distance / Math.max(testString.length(), paddedString.length());
|
||||
score += normalizedScore * (j < weights.length ? weights[j] : 1);
|
||||
}
|
||||
score /= paddedCandidateStrings.size();
|
||||
highestScore = Math.max(highestScore, score);
|
||||
// Early stop
|
||||
if (highestScore > THRESHOLD) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return highestScore > THRESHOLD;
|
||||
}
|
||||
|
||||
|
||||
private double compare(String firstCandidate, String secondCandidate) {
|
||||
|
||||
int count = 0;
|
||||
String cleanedFirstCandidate = firstCandidate.replaceAll("\\d", "@");
|
||||
String cleanedSecondCandidate = secondCandidate.replaceAll("\\d", "@");
|
||||
|
||||
for (int i = 0; i < Math.min(cleanedFirstCandidate.length(), cleanedSecondCandidate.length()); i++) {
|
||||
if (cleanedFirstCandidate.charAt(i) == cleanedSecondCandidate.charAt(i)) {
|
||||
count++;
|
||||
}
|
||||
}
|
||||
return (double) count / Math.max(cleanedFirstCandidate.length(), cleanedSecondCandidate.length());
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
@ -156,7 +150,10 @@ public class HeaderFooterDetection {
|
||||
List<List<AbstractPageBlock>> footerCandidates = new ArrayList<>();
|
||||
for (ClassificationPage page : pages) {
|
||||
List<AbstractPageBlock> textBlocks = page.getTextBlocks();
|
||||
List<TextPageBlock> textPageBlocks = textBlocks.stream().filter(textBlock -> textBlock instanceof TextPageBlock).map(textBlock -> (TextPageBlock) textBlock).toList();
|
||||
List<TextPageBlock> textPageBlocks = textBlocks.stream()
|
||||
.filter(textBlock -> textBlock instanceof TextPageBlock)
|
||||
.map(textBlock -> (TextPageBlock) textBlock)
|
||||
.toList();
|
||||
int blockCount = textPageBlocks.size();
|
||||
if (blockCount > 0) {
|
||||
int start = Math.max(0, blockCount - 3);
|
||||
@ -173,11 +170,54 @@ public class HeaderFooterDetection {
|
||||
List<List<AbstractPageBlock>> headerCandidates = new ArrayList<>();
|
||||
for (ClassificationPage page : pages) {
|
||||
List<AbstractPageBlock> textBlocks = page.getTextBlocks();
|
||||
List<TextPageBlock> textPageBlocks = textBlocks.stream().filter(textBlock -> textBlock instanceof TextPageBlock).map(textBlock -> (TextPageBlock) textBlock).toList();
|
||||
List<TextPageBlock> textPageBlocks = textBlocks.stream()
|
||||
.filter(textBlock -> textBlock instanceof TextPageBlock)
|
||||
.map(textBlock -> (TextPageBlock) textBlock)
|
||||
.toList();
|
||||
int count = Math.min(3, textPageBlocks.size());
|
||||
headerCandidates.add(new ArrayList<>(textPageBlocks.subList(0, count)));
|
||||
}
|
||||
return headerCandidates;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Calculate the Hamming distance between two strings after preprocessing to make them the same length
|
||||
* and replacing all digits with a special character '@' since they are a common occurrence in headers/footers.
|
||||
*
|
||||
* @param firstCandidate First string
|
||||
* @param secondCandidate Second string
|
||||
* @return The Hamming distance between the two preprocessed strings.
|
||||
*/
|
||||
private int calculateHammingDistanceWithPreprocessing(String firstCandidate, String secondCandidate) {
|
||||
|
||||
int maxLength = Math.max(firstCandidate.length(), secondCandidate.length());
|
||||
|
||||
String cleanFirstCandidate = padString(firstCandidate, maxLength, '\0').replaceAll("\\d", "@");
|
||||
String cleanSecondCandidate = padString(secondCandidate, maxLength, '\0').replaceAll("\\d", "@");
|
||||
|
||||
int distance = 0;
|
||||
for (int i = 0; i < maxLength; i++) {
|
||||
if (cleanFirstCandidate.charAt(i) != cleanSecondCandidate.charAt(i)) {
|
||||
distance++;
|
||||
}
|
||||
}
|
||||
return distance;
|
||||
}
|
||||
|
||||
|
||||
private String padString(String input, int length, char padChar) {
|
||||
|
||||
if (input.length() >= length) {
|
||||
return input;
|
||||
}
|
||||
|
||||
StringBuilder sb = new StringBuilder(input);
|
||||
|
||||
while (sb.length() < length) {
|
||||
sb.append(padChar);
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user