Merge branch 'RED-10126' into 'main'

RM-187: Footers are recognized in the middle of the page

See merge request fforesight/layout-parser!233
2024-10-08 14:27:45 +02:00 · 2024-10-08 14:27:45 +02:00 · 23e23328ee
commit 23e23328ee
parent 3109a30ae1 9d1ffdd779
1 changed file with 44 additions and 15 deletions
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/HeaderFooterDetection.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/HeaderFooterDetection.java
@@ -1,5 +1,8 @@
 package com.knecon.fforesight.service.layoutparser.processor.utils;

+import static com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType.FOOTER;
+import static com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType.HEADER;
+
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
@@ -9,6 +12,7 @@ import java.util.stream.Collectors;
 import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
 import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
 import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
+import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
 import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;

 import lombok.experimental.UtilityClass;
@@ -26,35 +30,60 @@ public class HeaderFooterDetection {

    public boolean isLikelyFooter(TextPageBlock textPageBlock, ClassificationDocument document, ClassificationPage classificationPage) {

-        int numberOfPages = document.getPages().size();
-        if (numberOfPages < 3) {
-            // If the document has 1 or 2 pages this may lead to more false positives than actual findings.
-            return false;
-        }
-
-        int window = Math.min(numberOfPages, 8);
-
-        List<ClassificationPage> nearestPages = findNearestPages(classificationPage, document.getPages(), window);
-        List<List<AbstractPageBlock>> footerCandidates = getFooterCandidates(nearestPages);
-
-        return detectHeadersOrFootersByPageAssociation(textPageBlock.getText(), footerCandidates, window, footerWeights);
+        return isLikelyHeaderFooter(textPageBlock, document, classificationPage, FOOTER);
    }


    public boolean isLikelyHeader(TextPageBlock textPageBlock, ClassificationDocument document, ClassificationPage classificationPage) {

+        return isLikelyHeaderFooter(textPageBlock, document, classificationPage, HEADER);
+    }
+
+
+    private boolean isLikelyHeaderFooter(TextPageBlock textPageBlock, ClassificationDocument document, ClassificationPage classificationPage, PageBlockType type) {
+
        int numberOfPages = document.getPages().size();
        if (numberOfPages < 3) {
            // If the document has 1 or 2 pages this may lead to more false positives than actual findings.
            return false;
        }

+        List<TextPageBlock> textPageBlocks = classificationPage.getTextBlocks()
+                .stream()
+                .filter(TextPageBlock.class::isInstance)
+                .map(TextPageBlock.class::cast)
+                .collect(Collectors.toList());
+
+        if (textPageBlocks.isEmpty()) {
+            return false;
+        }
+
+        List<TextPageBlock> selectedBlocks;
+        if (type == HEADER) {
+            selectedBlocks = textPageBlocks.subList(0, Math.min(3, textPageBlocks.size()));
+        } else { //FOOTER
+            selectedBlocks = textPageBlocks.subList(Math.max(0, textPageBlocks.size() - 3), textPageBlocks.size());
+        }
+
+        if (!selectedBlocks.contains(textPageBlock)) {
+            // The textPageBlock is not among the selected blocks on its page
+            return false;
+        }
+
        int window = Math.min(numberOfPages, 8);
-
        List<ClassificationPage> nearestPages = findNearestPages(classificationPage, document.getPages(), window);
-        List<List<AbstractPageBlock>> headerCandidates = getHeaderCandidates(nearestPages);

-        return detectHeadersOrFootersByPageAssociation(textPageBlock.getText(), headerCandidates, window, headerWeights);
+        List<List<AbstractPageBlock>> candidates;
+        double[] weights;
+        if (type == HEADER) {
+            candidates = getHeaderCandidates(nearestPages);
+            weights = headerWeights;
+        } else { //FOOTER
+            candidates = getFooterCandidates(nearestPages);
+            weights = footerWeights;
+        }
+
+        return detectHeadersOrFootersByPageAssociation(textPageBlock.getText(), candidates, window, weights);
    }