RED-9149 - Address comments

This commit is contained in:
Andrei Isvoran 2024-05-13 13:18:33 +03:00
parent aeaca2f278
commit a76b2ace3f

View File

@ -17,6 +17,11 @@ import lombok.experimental.UtilityClass;
public class HeaderFooterDetection { public class HeaderFooterDetection {
private final Map<Integer, ClassificationPage> pagesCache = new HashMap<>(); private final Map<Integer, ClassificationPage> pagesCache = new HashMap<>();
// Exclusive lower bound: detection only returns true when the highest score is strictly greater than this value.
private static final double THRESHOLD = 0.5;
// Weight will go from 1.0 to 0.5 because the first element is the most likely to be the header on the page.
// Three entries, matching the at most three leading blocks collected per page as header candidates.
private static final double[] headerWeights = {1.0, 0.75, 0.5};
// Weight will go from 0.5 to 1.0 because the last element is the most likely to be the footer on the page.
// Three entries, matching the at most three trailing blocks collected per page as footer candidates.
private static final double[] footerWeights = {0.5, 0.75, 1.0};
public boolean isLikelyFooter(TextPageBlock textPageBlock, ClassificationDocument document, ClassificationPage classificationPage) { public boolean isLikelyFooter(TextPageBlock textPageBlock, ClassificationDocument document, ClassificationPage classificationPage) {
@ -32,8 +37,6 @@ public class HeaderFooterDetection {
List<ClassificationPage> nearestPages = findNearestPages(classificationPage, document.getPages(), window); List<ClassificationPage> nearestPages = findNearestPages(classificationPage, document.getPages(), window);
List<List<AbstractPageBlock>> footerCandidates = getFooterCandidates(nearestPages); List<List<AbstractPageBlock>> footerCandidates = getFooterCandidates(nearestPages);
// Weight will go from 0.5 to 1.0 because the last element is the most likely to be the footer on the page.
double[] footerWeights = {0.5, 0.75, 1.0};
return detectHeadersOrFootersByPageAssociation(textPageBlock.getText(), footerCandidates, window, footerWeights); return detectHeadersOrFootersByPageAssociation(textPageBlock.getText(), footerCandidates, window, footerWeights);
} }
@ -51,8 +54,6 @@ public class HeaderFooterDetection {
List<ClassificationPage> nearestPages = findNearestPages(classificationPage, document.getPages(), window); List<ClassificationPage> nearestPages = findNearestPages(classificationPage, document.getPages(), window);
List<List<AbstractPageBlock>> headerCandidates = getHeaderCandidates(nearestPages); List<List<AbstractPageBlock>> headerCandidates = getHeaderCandidates(nearestPages);
// Weight will go from 1.0 to 0.5 because the first element is the most likely to be the header on the page.
double[] headerWeights = {1.0, 0.75, 0.5};
return detectHeadersOrFootersByPageAssociation(textPageBlock.getText(), headerCandidates, window, headerWeights); return detectHeadersOrFootersByPageAssociation(textPageBlock.getText(), headerCandidates, window, headerWeights);
} }
@ -62,19 +63,19 @@ public class HeaderFooterDetection {
double highestScore = 0.0; double highestScore = 0.0;
for (int i = 0; i < candidates.size(); i++) { for (int i = 0; i < candidates.size(); i++) {
List<List<String>> temp = new ArrayList<>(); List<List<String>> candidateStrings = new ArrayList<>();
for (int k = Math.max(i - window, 0); k < Math.min(i + window, candidates.size()); k++) { for (int k = Math.max(i - window, 0); k < Math.min(i + window, candidates.size()); k++) {
temp.add(candidates.get(k) candidateStrings.add(candidates.get(k)
.stream() .stream()
.map(AbstractPageBlock::getText) .map(AbstractPageBlock::getText)
.collect(Collectors.toList())); .collect(Collectors.toList()));
} }
int maxLen = temp.stream() int maxLen = candidateStrings.stream()
.mapToInt(List::size) .mapToInt(List::size)
.max() .max()
.orElse(0); .orElse(0);
for (List<String> sublist : temp) { for (List<String> sublist : candidateStrings) {
while (sublist.size() < maxLen) { while (sublist.size() < maxLen) {
sublist.add(0, ""); sublist.add(0, "");
} }
@ -85,13 +86,13 @@ public class HeaderFooterDetection {
double score = 0.0; double score = 0.0;
try { try {
int finalJ = j; int finalJ = j;
List<String> cmp = temp.stream() List<String> paddedCandidateStrings = candidateStrings.stream()
.map(sublist -> sublist.size() > finalJ ? sublist.get(finalJ) : "") .map(sublist -> sublist.size() > finalJ ? sublist.get(finalJ) : "")
.toList(); .toList();
for (String cm : cmp) { for (String cm : paddedCandidateStrings) {
score += compare(testString, cm) * (j < weights.length ? weights[j] : 1); score += compare(testString, cm) * (j < weights.length ? weights[j] : 1);
} }
score /= cmp.size(); score /= paddedCandidateStrings.size();
} catch (IndexOutOfBoundsException e) { } catch (IndexOutOfBoundsException e) {
continue; continue;
} }
@ -99,7 +100,7 @@ public class HeaderFooterDetection {
} }
} }
return highestScore > 0.5; return highestScore > THRESHOLD;
} }
@ -155,10 +156,11 @@ public class HeaderFooterDetection {
List<List<AbstractPageBlock>> footerCandidates = new ArrayList<>(); List<List<AbstractPageBlock>> footerCandidates = new ArrayList<>();
for (ClassificationPage page : pages) { for (ClassificationPage page : pages) {
List<AbstractPageBlock> textBlocks = page.getTextBlocks(); List<AbstractPageBlock> textBlocks = page.getTextBlocks();
int blockCount = textBlocks.size(); List<TextPageBlock> textPageBlocks = textBlocks.stream().filter(textBlock -> textBlock instanceof TextPageBlock).map(textBlock -> (TextPageBlock) textBlock).toList();
int blockCount = textPageBlocks.size();
if (blockCount > 0) { if (blockCount > 0) {
int start = Math.max(0, blockCount - 3); int start = Math.max(0, blockCount - 3);
footerCandidates.add(new ArrayList<>(textBlocks.subList(start, blockCount))); footerCandidates.add(new ArrayList<>(textPageBlocks.subList(start, blockCount)));
} }
} }
return footerCandidates; return footerCandidates;
@ -171,8 +173,9 @@ public class HeaderFooterDetection {
List<List<AbstractPageBlock>> headerCandidates = new ArrayList<>(); List<List<AbstractPageBlock>> headerCandidates = new ArrayList<>();
for (ClassificationPage page : pages) { for (ClassificationPage page : pages) {
List<AbstractPageBlock> textBlocks = page.getTextBlocks(); List<AbstractPageBlock> textBlocks = page.getTextBlocks();
int count = Math.min(3, textBlocks.size()); List<TextPageBlock> textPageBlocks = textBlocks.stream().filter(textBlock -> textBlock instanceof TextPageBlock).map(textBlock -> (TextPageBlock) textBlock).toList();
headerCandidates.add(new ArrayList<>(textBlocks.subList(0, count))); int count = Math.min(3, textPageBlocks.size());
headerCandidates.add(new ArrayList<>(textPageBlocks.subList(0, count)));
} }
return headerCandidates; return headerCandidates;
} }