From 9d1ffdd779ccbb2c3d6f98729d6a1ace50b45ddf Mon Sep 17 00:00:00 2001
From: Maverick Studer
Date: Tue, 8 Oct 2024 14:27:44 +0200
Subject: [PATCH] RM-187: Footers are recognized in the middle of the page

---
Notes (review):
    What the patch does
    * isLikelyFooter and isLikelyHeader keep their signatures and now both
      delegate to a new private isLikelyHeaderFooter(..., PageBlockType type).
    * New gate: the block must be among the first three (HEADER) or the last
      three (FOOTER) TextPageBlock entries of classificationPage.getTextBlocks();
      otherwise the method returns false before any neighbouring-page
      comparison is made. A page without TextPageBlock entries returns false.
    * Unchanged rules: documents with fewer than 3 pages return false, and the
      page window is min(numberOfPages, 8).

    Points to confirm before merging
    * The gate relies on the order of getTextBlocks(); presumably this is
      top-to-bottom reading order - verify against the producer of that list.
    * Membership is tested with List.contains, i.e. TextPageBlock.equals;
      confirm that equality cannot match a different block with equal content.
    * The literal 3 appears twice (header and footer branch) and is not named.

    About this copy of the patch
    * Line breaks had been collapsed; they are restored here so that every
      hunk matches its header counts and the diffstat (44 insertions,
      15 deletions). Indentation was lost and is rebuilt with 4 spaces.
    * Generic type arguments (e.g. "List nearestPages", "List> candidates")
      and the author e-mail look stripped. They are left exactly as found;
      compare with the repository before applying.

 .../utils/HeaderFooterDetection.java          | 59 ++++++++++++++-----
 1 file changed, 44 insertions(+), 15 deletions(-)

diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/HeaderFooterDetection.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/HeaderFooterDetection.java
index f010a98..43095d5 100644
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/HeaderFooterDetection.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/HeaderFooterDetection.java
@@ -1,5 +1,8 @@
 package com.knecon.fforesight.service.layoutparser.processor.utils;
 
+import static com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType.FOOTER;
+import static com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType.HEADER;
+
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
@@ -9,6 +12,7 @@ import java.util.stream.Collectors;
 import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
 import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
 import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
+import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
 import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
 
 import lombok.experimental.UtilityClass;
@@ -26,35 +30,60 @@ public class HeaderFooterDetection {
 
     public boolean isLikelyFooter(TextPageBlock textPageBlock, ClassificationDocument document, ClassificationPage classificationPage) {
 
-        int numberOfPages = document.getPages().size();
-        if (numberOfPages < 3) {
-            // If the document has 1 or 2 pages this may lead to more false positives than actual findings.
-            return false;
-        }
-
-        int window = Math.min(numberOfPages, 8);
-
-        List nearestPages = findNearestPages(classificationPage, document.getPages(), window);
-        List> footerCandidates = getFooterCandidates(nearestPages);
-
-        return detectHeadersOrFootersByPageAssociation(textPageBlock.getText(), footerCandidates, window, footerWeights);
+        return isLikelyHeaderFooter(textPageBlock, document, classificationPage, FOOTER);
     }
 
 
     public boolean isLikelyHeader(TextPageBlock textPageBlock, ClassificationDocument document, ClassificationPage classificationPage) {
 
+        return isLikelyHeaderFooter(textPageBlock, document, classificationPage, HEADER);
+    }
+
+
+    private boolean isLikelyHeaderFooter(TextPageBlock textPageBlock, ClassificationDocument document, ClassificationPage classificationPage, PageBlockType type) {
+
         int numberOfPages = document.getPages().size();
         if (numberOfPages < 3) {
             // If the document has 1 or 2 pages this may lead to more false positives than actual findings.
             return false;
         }
 
+        List textPageBlocks = classificationPage.getTextBlocks()
+                .stream()
+                .filter(TextPageBlock.class::isInstance)
+                .map(TextPageBlock.class::cast)
+                .collect(Collectors.toList());
+
+        if (textPageBlocks.isEmpty()) {
+            return false;
+        }
+
+        List selectedBlocks;
+        if (type == HEADER) {
+            selectedBlocks = textPageBlocks.subList(0, Math.min(3, textPageBlocks.size()));
+        } else { //FOOTER
+            selectedBlocks = textPageBlocks.subList(Math.max(0, textPageBlocks.size() - 3), textPageBlocks.size());
+        }
+
+        if (!selectedBlocks.contains(textPageBlock)) {
+            // The textPageBlock is not among the selected blocks on its page
+            return false;
+        }
+
         int window = Math.min(numberOfPages, 8);
-
         List nearestPages = findNearestPages(classificationPage, document.getPages(), window);
-        List> headerCandidates = getHeaderCandidates(nearestPages);
 
-        return detectHeadersOrFootersByPageAssociation(textPageBlock.getText(), headerCandidates, window, headerWeights);
+        List> candidates;
+        double[] weights;
+        if (type == HEADER) {
+            candidates = getHeaderCandidates(nearestPages);
+            weights = headerWeights;
+        } else { //FOOTER
+            candidates = getFooterCandidates(nearestPages);
+            weights = footerWeights;
+        }
+
+        return detectHeadersOrFootersByPageAssociation(textPageBlock.getText(), candidates, window, weights);
     }
 
 