RED-9149 - Address comments

This commit is contained in:
Andrei Isvoran 2024-05-13 13:18:33 +03:00
parent aeaca2f278
commit a76b2ace3f

View File

@ -17,6 +17,11 @@ import lombok.experimental.UtilityClass;
public class HeaderFooterDetection {
private final Map<Integer, ClassificationPage> pagesCache = new HashMap<>();
private static final double THRESHOLD = 0.5;
// Weight will go from 1.0 to 0.5 because the first element is the most likely to be the header on the page.
private static final double[] headerWeights = {1.0, 0.75, 0.5};
// Weight will go from 0.5 to 1.0 because the last element is the most likely to be the footer on the page.
private static final double[] footerWeights = {0.5, 0.75, 1.0};
public boolean isLikelyFooter(TextPageBlock textPageBlock, ClassificationDocument document, ClassificationPage classificationPage) {
@ -32,8 +37,6 @@ public class HeaderFooterDetection {
List<ClassificationPage> nearestPages = findNearestPages(classificationPage, document.getPages(), window);
List<List<AbstractPageBlock>> footerCandidates = getFooterCandidates(nearestPages);
// Weight will go from 0.5 to 1.0 because the last element is the most likely to be the footer on the page.
double[] footerWeights = {0.5, 0.75, 1.0};
return detectHeadersOrFootersByPageAssociation(textPageBlock.getText(), footerCandidates, window, footerWeights);
}
@ -51,8 +54,6 @@ public class HeaderFooterDetection {
List<ClassificationPage> nearestPages = findNearestPages(classificationPage, document.getPages(), window);
List<List<AbstractPageBlock>> headerCandidates = getHeaderCandidates(nearestPages);
// Weight will go from 1.0 to 0.5 because the first element is the most likely to be the header on the page.
double[] headerWeights = {1.0, 0.75, 0.5};
return detectHeadersOrFootersByPageAssociation(textPageBlock.getText(), headerCandidates, window, headerWeights);
}
@ -62,19 +63,19 @@ public class HeaderFooterDetection {
double highestScore = 0.0;
for (int i = 0; i < candidates.size(); i++) {
List<List<String>> temp = new ArrayList<>();
List<List<String>> candidateStrings = new ArrayList<>();
for (int k = Math.max(i - window, 0); k < Math.min(i + window, candidates.size()); k++) {
temp.add(candidates.get(k)
.stream()
.map(AbstractPageBlock::getText)
.collect(Collectors.toList()));
candidateStrings.add(candidates.get(k)
.stream()
.map(AbstractPageBlock::getText)
.collect(Collectors.toList()));
}
int maxLen = temp.stream()
int maxLen = candidateStrings.stream()
.mapToInt(List::size)
.max()
.orElse(0);
for (List<String> sublist : temp) {
for (List<String> sublist : candidateStrings) {
while (sublist.size() < maxLen) {
sublist.add(0, "");
}
@ -85,13 +86,13 @@ public class HeaderFooterDetection {
double score = 0.0;
try {
int finalJ = j;
List<String> cmp = temp.stream()
List<String> paddedCandidateStrings = candidateStrings.stream()
.map(sublist -> sublist.size() > finalJ ? sublist.get(finalJ) : "")
.toList();
for (String cm : cmp) {
for (String cm : paddedCandidateStrings) {
score += compare(testString, cm) * (j < weights.length ? weights[j] : 1);
}
score /= cmp.size();
score /= paddedCandidateStrings.size();
} catch (IndexOutOfBoundsException e) {
continue;
}
@ -99,7 +100,7 @@ public class HeaderFooterDetection {
}
}
return highestScore > 0.5;
return highestScore > THRESHOLD;
}
@ -155,10 +156,11 @@ public class HeaderFooterDetection {
List<List<AbstractPageBlock>> footerCandidates = new ArrayList<>();
for (ClassificationPage page : pages) {
List<AbstractPageBlock> textBlocks = page.getTextBlocks();
int blockCount = textBlocks.size();
List<TextPageBlock> textPageBlocks = textBlocks.stream().filter(textBlock -> textBlock instanceof TextPageBlock).map(textBlock -> (TextPageBlock) textBlock).toList();
int blockCount = textPageBlocks.size();
if (blockCount > 0) {
int start = Math.max(0, blockCount - 3);
footerCandidates.add(new ArrayList<>(textBlocks.subList(start, blockCount)));
footerCandidates.add(new ArrayList<>(textPageBlocks.subList(start, blockCount)));
}
}
return footerCandidates;
@ -171,8 +173,9 @@ public class HeaderFooterDetection {
List<List<AbstractPageBlock>> headerCandidates = new ArrayList<>();
for (ClassificationPage page : pages) {
List<AbstractPageBlock> textBlocks = page.getTextBlocks();
int count = Math.min(3, textBlocks.size());
headerCandidates.add(new ArrayList<>(textBlocks.subList(0, count)));
List<TextPageBlock> textPageBlocks = textBlocks.stream().filter(textBlock -> textBlock instanceof TextPageBlock).map(textBlock -> (TextPageBlock) textBlock).toList();
int count = Math.min(3, textPageBlocks.size());
headerCandidates.add(new ArrayList<>(textPageBlocks.subList(0, count)));
}
return headerCandidates;
}