RED-9149 - Header and footer extraction by page-association

2024-05-10 15:17:41 +03:00 · 2024-05-10 15:17:41 +03:00 · fda25852d1
commit fda25852d1
parent 471fadbcca
2 changed files with 199 additions and 6 deletions
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java
@ -12,6 +12,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification
 import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
 import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
 import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
+import com.knecon.fforesight.service.layoutparser.processor.utils.HeaderFooterDetection;
 import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
 import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;

@ -49,6 +50,7 @@ public class DocuMineClassificationService {
        }
    }

+
    private void classifyBlock(TextPageBlock textBlock, ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes) {

        log.debug("headlineFontSizes: {}", headlineFontSizes);
@ -63,15 +65,26 @@ public class DocuMineClassificationService {
            return;
        }
        if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER)
-            || PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null
-                                                                                                   || textBlock.getHighestFontSize() <= document.getFontSizeCounter()
-                .getMostPopular())) {
+            || (PositionUtils.isOverBodyTextFrame(bodyTextFrame,
+                                                  textBlock,
+                                                  page.getRotation()) && (document.getFontSizeCounter().getMostPopular()
+                                                                          == null
+                                                                          || textBlock.getHighestFontSize()
+                                                                             <= document.getFontSizeCounter()
+                                                                                     .getMostPopular()))
+            || HeaderFooterDetection.isLikelyHeader(textBlock, document, page)) {
            textBlock.setClassification(PageBlockType.HEADER);

        } else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER)
-                   || PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null
-                                                                                                           || textBlock.getHighestFontSize() <= document.getFontSizeCounter()
-                .getMostPopular())) {
+                   || (PositionUtils.isUnderBodyTextFrame(bodyTextFrame,
+                                                          textBlock,
+                                                          page.getRotation())
+                       && (document.getFontSizeCounter().getMostPopular()
+                           == null
+                           || textBlock.getHighestFontSize()
+                              <= document.getFontSizeCounter()
+                                      .getMostPopular()))
+                   || HeaderFooterDetection.isLikelyFooter(textBlock, document, page)) {
            textBlock.setClassification(PageBlockType.FOOTER);
        } else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter().getMostPopular()) > 2.5
                                                 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks().size() == 1)) {
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/HeaderFooterDetection.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/HeaderFooterDetection.java
@ -0,0 +1,180 @@
+package com.knecon.fforesight.service.layoutparser.processor.utils;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+
+import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
+import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
+import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
+import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
+
+import lombok.experimental.UtilityClass;
+
+@UtilityClass
+public class HeaderFooterDetection {
+
+    private final Map<Integer, ClassificationPage> pagesCache = new HashMap<>();
+
+
+    public boolean isLikelyFooter(TextPageBlock textPageBlock, ClassificationDocument document, ClassificationPage classificationPage) {
+
+        int numberOfPages = document.getPages().size();
+        if (numberOfPages < 3) {
+            // If the document has 1 or 2 pages this may lead to more false positives than actual findings.
+            return false;
+        }
+
+        int window = Math.min(numberOfPages, 8);
+
+        List<ClassificationPage> nearestPages = findNearestPages(classificationPage, document.getPages(), window);
+        List<List<AbstractPageBlock>> footerCandidates = getFooterCandidates(nearestPages);
+
+        // Weight will go from 0.5 to 1.0 because the last element is the most likely to be the footer on the page.
+        double[] footerWeights = {0.5, 0.75, 1.0};
+        return detectHeadersOrFootersByPageAssociation(textPageBlock.getText(), footerCandidates, window, footerWeights);
+    }
+
+
+    public boolean isLikelyHeader(TextPageBlock textPageBlock, ClassificationDocument document, ClassificationPage classificationPage) {
+
+        int numberOfPages = document.getPages().size();
+        if (numberOfPages < 3) {
+            // If the document has 1 or 2 pages this may lead to more false positives than actual findings.
+            return false;
+        }
+
+        int window = Math.min(numberOfPages, 8);
+
+        List<ClassificationPage> nearestPages = findNearestPages(classificationPage, document.getPages(), window);
+        List<List<AbstractPageBlock>> headerCandidates = getHeaderCandidates(nearestPages);
+
+        // Weight will go from 1.0 to 0.5 because the first element is the most likely to be the header on the page.
+        double[] headerWeights = {1.0, 0.75, 0.5};
+        return detectHeadersOrFootersByPageAssociation(textPageBlock.getText(), headerCandidates, window, headerWeights);
+    }
+
+
+    private boolean detectHeadersOrFootersByPageAssociation(String testString, List<List<AbstractPageBlock>> candidates, int window, double[] weights) {
+        
+        double highestScore = 0.0;
+
+        for (int i = 0; i < candidates.size(); i++) {
+            List<List<String>> temp = new ArrayList<>();
+            for (int k = Math.max(i - window, 0); k < Math.min(i + window, candidates.size()); k++) {
+                temp.add(candidates.get(k)
+                                 .stream()
+                                 .map(AbstractPageBlock::getText)
+                                 .collect(Collectors.toList()));
+            }
+
+            int maxLen = temp.stream()
+                    .mapToInt(List::size)
+                    .max()
+                    .orElse(0);
+            for (List<String> sublist : temp) {
+                while (sublist.size() < maxLen) {
+                    sublist.add(0, "");
+                }
+            }
+
+            // Compare the testString against each candidates in the window
+            for (int j = 0; j < maxLen; j++) {
+                double score = 0.0;
+                try {
+                    int finalJ = j;
+                    List<String> cmp = temp.stream()
+                            .map(sublist -> sublist.size() > finalJ ? sublist.get(finalJ) : "")
+                            .toList();
+                    for (String cm : cmp) {
+                        score += compare(testString, cm) * (j < weights.length ? weights[j] : 1);
+                    }
+                    score /= cmp.size();
+                } catch (IndexOutOfBoundsException e) {
+                    continue;
+                }
+                highestScore = Math.max(highestScore, score);
+            }
+        }
+
+        return highestScore > 0.5;
+    }
+
+
+    private double compare(String a, String b) {
+
+        int count = 0;
+        a = a.replaceAll("\\d", "@");
+        b = b.replaceAll("\\d", "@");
+
+        for (int i = 0; i < Math.min(a.length(), b.length()); i++) {
+            if (a.charAt(i) == b.charAt(i)) {
+                count++;
+            }
+        }
+        return (double) count / Math.max(a.length(), b.length());
+    }
+
+
+    /**
+     * Find the nearest n pages for a given page.
+     * For example: nearest 8 pages for page 4 are: 1, 2, 3, 5, 6, 7, 8, 9.
+     *
+     * @param currentPage  Current page to find the nearest ones.
+     * @param allPages     All pages in the document.
+     * @param numNeighbors Number of neighbouring pages to find.
+     * @return The nearest pages.
+     */
+    private List<ClassificationPage> findNearestPages(ClassificationPage currentPage, List<ClassificationPage> allPages, int numNeighbors) {
+
+        int totalPages = allPages.size();
+        List<ClassificationPage> nearestPages = new ArrayList<>();
+
+        int currentPageIndex = currentPage.getPageNumber() - 1;
+        int halfWin = numNeighbors / 2;
+        int start = Math.max(0, currentPageIndex - halfWin);
+        int end = Math.min(totalPages - 1, currentPageIndex + halfWin);
+
+        for (int i = start; i <= end; i++) {
+            if (i != currentPageIndex) {
+                nearestPages.add(pagesCache.computeIfAbsent(i, idx -> allPages.get(idx)));
+            }
+        }
+
+        pagesCache.keySet().removeIf(key -> key < start || key > end);
+
+        return nearestPages;
+    }
+
+
+    // Get the last 3 TextBlocks on the page as they are likely to be a footer
+    private List<List<AbstractPageBlock>> getFooterCandidates(List<ClassificationPage> pages) {
+
+        List<List<AbstractPageBlock>> footerCandidates = new ArrayList<>();
+        for (ClassificationPage page : pages) {
+            List<AbstractPageBlock> textBlocks = page.getTextBlocks();
+            int blockCount = textBlocks.size();
+            if (blockCount > 0) {
+                int start = Math.max(0, blockCount - 3);
+                footerCandidates.add(new ArrayList<>(textBlocks.subList(start, blockCount)));
+            }
+        }
+        return footerCandidates;
+    }
+
+
+    // Get the first 3 TextBlocks on the page as they are likely to be a header
+    private List<List<AbstractPageBlock>> getHeaderCandidates(List<ClassificationPage> pages) {
+
+        List<List<AbstractPageBlock>> headerCandidates = new ArrayList<>();
+        for (ClassificationPage page : pages) {
+            List<AbstractPageBlock> textBlocks = page.getTextBlocks();
+            int count = Math.min(3, textBlocks.size());
+            headerCandidates.add(new ArrayList<>(textBlocks.subList(0, count)));
+        }
+        return headerCandidates;
+    }
+
+}