RED-9149 - Address comments
This commit is contained in:
parent
aeaca2f278
commit
a76b2ace3f
@ -17,6 +17,11 @@ import lombok.experimental.UtilityClass;
|
||||
public class HeaderFooterDetection {
|
||||
|
||||
private final Map<Integer, ClassificationPage> pagesCache = new HashMap<>();
|
||||
private static final double THRESHOLD = 0.5;
|
||||
// Weights decrease from 1.0 to 0.5 because the first element on the page is the most likely to be the header.
|
||||
private static final double[] headerWeights = {1.0, 0.75, 0.5};
|
||||
// Weights increase from 0.5 to 1.0 because the last element on the page is the most likely to be the footer.
|
||||
private static final double[] footerWeights = {0.5, 0.75, 1.0};
|
||||
|
||||
|
||||
public boolean isLikelyFooter(TextPageBlock textPageBlock, ClassificationDocument document, ClassificationPage classificationPage) {
|
||||
@ -32,8 +37,6 @@ public class HeaderFooterDetection {
|
||||
List<ClassificationPage> nearestPages = findNearestPages(classificationPage, document.getPages(), window);
|
||||
List<List<AbstractPageBlock>> footerCandidates = getFooterCandidates(nearestPages);
|
||||
|
||||
// Weights increase from 0.5 to 1.0 because the last element on the page is the most likely to be the footer.
|
||||
double[] footerWeights = {0.5, 0.75, 1.0};
|
||||
return detectHeadersOrFootersByPageAssociation(textPageBlock.getText(), footerCandidates, window, footerWeights);
|
||||
}
|
||||
|
||||
@ -51,8 +54,6 @@ public class HeaderFooterDetection {
|
||||
List<ClassificationPage> nearestPages = findNearestPages(classificationPage, document.getPages(), window);
|
||||
List<List<AbstractPageBlock>> headerCandidates = getHeaderCandidates(nearestPages);
|
||||
|
||||
// Weights decrease from 1.0 to 0.5 because the first element on the page is the most likely to be the header.
|
||||
double[] headerWeights = {1.0, 0.75, 0.5};
|
||||
return detectHeadersOrFootersByPageAssociation(textPageBlock.getText(), headerCandidates, window, headerWeights);
|
||||
}
|
||||
|
||||
@ -62,19 +63,19 @@ public class HeaderFooterDetection {
|
||||
double highestScore = 0.0;
|
||||
|
||||
for (int i = 0; i < candidates.size(); i++) {
|
||||
List<List<String>> temp = new ArrayList<>();
|
||||
List<List<String>> candidateStrings = new ArrayList<>();
|
||||
for (int k = Math.max(i - window, 0); k < Math.min(i + window, candidates.size()); k++) {
|
||||
temp.add(candidates.get(k)
|
||||
.stream()
|
||||
.map(AbstractPageBlock::getText)
|
||||
.collect(Collectors.toList()));
|
||||
candidateStrings.add(candidates.get(k)
|
||||
.stream()
|
||||
.map(AbstractPageBlock::getText)
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
|
||||
int maxLen = temp.stream()
|
||||
int maxLen = candidateStrings.stream()
|
||||
.mapToInt(List::size)
|
||||
.max()
|
||||
.orElse(0);
|
||||
for (List<String> sublist : temp) {
|
||||
for (List<String> sublist : candidateStrings) {
|
||||
while (sublist.size() < maxLen) {
|
||||
sublist.add(0, "");
|
||||
}
|
||||
@ -85,13 +86,13 @@ public class HeaderFooterDetection {
|
||||
double score = 0.0;
|
||||
try {
|
||||
int finalJ = j;
|
||||
List<String> cmp = temp.stream()
|
||||
List<String> paddedCandidateStrings = candidateStrings.stream()
|
||||
.map(sublist -> sublist.size() > finalJ ? sublist.get(finalJ) : "")
|
||||
.toList();
|
||||
for (String cm : cmp) {
|
||||
for (String cm : paddedCandidateStrings) {
|
||||
score += compare(testString, cm) * (j < weights.length ? weights[j] : 1);
|
||||
}
|
||||
score /= cmp.size();
|
||||
score /= paddedCandidateStrings.size();
|
||||
} catch (IndexOutOfBoundsException e) {
|
||||
continue;
|
||||
}
|
||||
@ -99,7 +100,7 @@ public class HeaderFooterDetection {
|
||||
}
|
||||
}
|
||||
|
||||
return highestScore > 0.5;
|
||||
return highestScore > THRESHOLD;
|
||||
}
|
||||
|
||||
|
||||
@ -155,10 +156,11 @@ public class HeaderFooterDetection {
|
||||
List<List<AbstractPageBlock>> footerCandidates = new ArrayList<>();
|
||||
for (ClassificationPage page : pages) {
|
||||
List<AbstractPageBlock> textBlocks = page.getTextBlocks();
|
||||
int blockCount = textBlocks.size();
|
||||
List<TextPageBlock> textPageBlocks = textBlocks.stream().filter(textBlock -> textBlock instanceof TextPageBlock).map(textBlock -> (TextPageBlock) textBlock).toList();
|
||||
int blockCount = textPageBlocks.size();
|
||||
if (blockCount > 0) {
|
||||
int start = Math.max(0, blockCount - 3);
|
||||
footerCandidates.add(new ArrayList<>(textBlocks.subList(start, blockCount)));
|
||||
footerCandidates.add(new ArrayList<>(textPageBlocks.subList(start, blockCount)));
|
||||
}
|
||||
}
|
||||
return footerCandidates;
|
||||
@ -171,8 +173,9 @@ public class HeaderFooterDetection {
|
||||
List<List<AbstractPageBlock>> headerCandidates = new ArrayList<>();
|
||||
for (ClassificationPage page : pages) {
|
||||
List<AbstractPageBlock> textBlocks = page.getTextBlocks();
|
||||
int count = Math.min(3, textBlocks.size());
|
||||
headerCandidates.add(new ArrayList<>(textBlocks.subList(0, count)));
|
||||
List<TextPageBlock> textPageBlocks = textBlocks.stream().filter(textBlock -> textBlock instanceof TextPageBlock).map(textBlock -> (TextPageBlock) textBlock).toList();
|
||||
int count = Math.min(3, textPageBlocks.size());
|
||||
headerCandidates.add(new ArrayList<>(textPageBlocks.subList(0, count)));
|
||||
}
|
||||
return headerCandidates;
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user