diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/HeaderFooterDetection.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/HeaderFooterDetection.java index 276a6ab..be46e96 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/HeaderFooterDetection.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/HeaderFooterDetection.java @@ -17,6 +17,11 @@ import lombok.experimental.UtilityClass; public class HeaderFooterDetection { private final Map pagesCache = new HashMap<>(); + private static final double THRESHOLD = 0.5; + // Weight will go from 1.0 to 0.5 because the first element is the most likely to be the header on the page. + private static final double[] headerWeights = {1.0, 0.75, 0.5}; + // Weight will go from 0.5 to 1.0 because the last element is the most likely to be the footer on the page. + private static final double[] footerWeights = {0.5, 0.75, 1.0}; public boolean isLikelyFooter(TextPageBlock textPageBlock, ClassificationDocument document, ClassificationPage classificationPage) { @@ -32,8 +37,6 @@ public class HeaderFooterDetection { List nearestPages = findNearestPages(classificationPage, document.getPages(), window); List> footerCandidates = getFooterCandidates(nearestPages); - // Weight will go from 0.5 to 1.0 because the last element is the most likely to be the footer on the page. - double[] footerWeights = {0.5, 0.75, 1.0}; return detectHeadersOrFootersByPageAssociation(textPageBlock.getText(), footerCandidates, window, footerWeights); } @@ -51,8 +54,6 @@ public class HeaderFooterDetection { List nearestPages = findNearestPages(classificationPage, document.getPages(), window); List> headerCandidates = getHeaderCandidates(nearestPages); - // Weight will go from 1.0 to 0.5 because the first element is the most likely to be the header on the page. - double[] headerWeights = {1.0, 0.75, 0.5}; return detectHeadersOrFootersByPageAssociation(textPageBlock.getText(), headerCandidates, window, headerWeights); } @@ -62,19 +63,19 @@ public class HeaderFooterDetection { double highestScore = 0.0; for (int i = 0; i < candidates.size(); i++) { - List> temp = new ArrayList<>(); + List> candidateStrings = new ArrayList<>(); for (int k = Math.max(i - window, 0); k < Math.min(i + window, candidates.size()); k++) { - temp.add(candidates.get(k) - .stream() - .map(AbstractPageBlock::getText) - .collect(Collectors.toList())); + candidateStrings.add(candidates.get(k) + .stream() + .map(AbstractPageBlock::getText) + .collect(Collectors.toList())); } - int maxLen = temp.stream() + int maxLen = candidateStrings.stream() .mapToInt(List::size) .max() .orElse(0); - for (List sublist : temp) { + for (List sublist : candidateStrings) { while (sublist.size() < maxLen) { sublist.add(0, ""); } @@ -85,13 +86,13 @@ public class HeaderFooterDetection { double score = 0.0; try { int finalJ = j; - List cmp = temp.stream() + List paddedCandidateStrings = candidateStrings.stream() .map(sublist -> sublist.size() > finalJ ? sublist.get(finalJ) : "") .toList(); - for (String cm : cmp) { + for (String cm : paddedCandidateStrings) { score += compare(testString, cm) * (j < weights.length ? weights[j] : 1); } - score /= cmp.size(); + score /= paddedCandidateStrings.size(); } catch (IndexOutOfBoundsException e) { continue; } @@ -99,7 +100,7 @@ public class HeaderFooterDetection { } } - return highestScore > 0.5; + return highestScore > THRESHOLD; } @@ -155,10 +156,11 @@ public class HeaderFooterDetection { List> footerCandidates = new ArrayList<>(); for (ClassificationPage page : pages) { List textBlocks = page.getTextBlocks(); - int blockCount = textBlocks.size(); + List textPageBlocks = textBlocks.stream().filter(textBlock -> textBlock instanceof TextPageBlock).map(textBlock -> (TextPageBlock) textBlock).toList(); + int blockCount = textPageBlocks.size(); if (blockCount > 0) { int start = Math.max(0, blockCount - 3); - footerCandidates.add(new ArrayList<>(textBlocks.subList(start, blockCount))); + footerCandidates.add(new ArrayList<>(textPageBlocks.subList(start, blockCount))); } } return footerCandidates; @@ -171,8 +173,9 @@ public class HeaderFooterDetection { List> headerCandidates = new ArrayList<>(); for (ClassificationPage page : pages) { List textBlocks = page.getTextBlocks(); - int count = Math.min(3, textBlocks.size()); - headerCandidates.add(new ArrayList<>(textBlocks.subList(0, count))); + List textPageBlocks = textBlocks.stream().filter(textBlock -> textBlock instanceof TextPageBlock).map(textBlock -> (TextPageBlock) textBlock).toList(); + int count = Math.min(3, textPageBlocks.size()); + headerCandidates.add(new ArrayList<>(textPageBlocks.subList(0, count))); } return headerCandidates; }