From fda25852d1990d372307674e19578318ec469a5b Mon Sep 17 00:00:00 2001 From: Andrei Isvoran Date: Fri, 10 May 2024 15:17:41 +0300 Subject: [PATCH] RED-9149 - Header and footer extraction by page-association --- .../DocuMineClassificationService.java | 25 ++- .../utils/HeaderFooterDetection.java | 180 ++++++++++++++++++ 2 files changed, 199 insertions(+), 6 deletions(-) create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/HeaderFooterDetection.java diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java index f10ac3b..608e863 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java @@ -12,6 +12,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.utils.HeaderFooterDetection; import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils; import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils; @@ -49,6 +50,7 @@ public class DocuMineClassificationService { } } + private 
void classifyBlock(TextPageBlock textBlock, ClassificationPage page, ClassificationDocument document, List headlineFontSizes) { log.debug("headlineFontSizes: {}", headlineFontSizes); @@ -63,15 +65,26 @@ public class DocuMineClassificationService { return; } if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) - || PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null - || textBlock.getHighestFontSize() <= document.getFontSizeCounter() - .getMostPopular())) { + || (PositionUtils.isOverBodyTextFrame(bodyTextFrame, + textBlock, + page.getRotation()) && (document.getFontSizeCounter().getMostPopular() + == null + || textBlock.getHighestFontSize() + <= document.getFontSizeCounter() + .getMostPopular())) + || HeaderFooterDetection.isLikelyHeader(textBlock, document, page)) { textBlock.setClassification(PageBlockType.HEADER); } else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER) - || PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null - || textBlock.getHighestFontSize() <= document.getFontSizeCounter() - .getMostPopular())) { + || (PositionUtils.isUnderBodyTextFrame(bodyTextFrame, + textBlock, + page.getRotation()) + && (document.getFontSizeCounter().getMostPopular() + == null + || textBlock.getHighestFontSize() + <= document.getFontSizeCounter() + .getMostPopular())) + || HeaderFooterDetection.isLikelyFooter(textBlock, document, page)) { textBlock.setClassification(PageBlockType.FOOTER); } else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks().size() == 1)) { diff --git 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/HeaderFooterDetection.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/HeaderFooterDetection.java
new file mode 100644
index 0000000..4668195
--- /dev/null
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/HeaderFooterDetection.java
@@ -0,0 +1,209 @@
+package com.knecon.fforesight.service.layoutparser.processor.utils;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
+import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
+import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
+import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
+
+import lombok.experimental.UtilityClass;
+
+/**
+ * Detects running headers and footers by page association: a text block counts as a header or
+ * footer when near-identical text (digits ignored) sits in the same slot on the neighbouring pages.
+ * The class keeps no mutable state between calls.
+ */
+@UtilityClass
+public class HeaderFooterDetection {
+
+    /** With fewer pages the heuristic may produce more false positives than actual findings. */
+    private static final int MIN_PAGES = 3;
+    /** Upper bound for the page window that is searched for repetitions. */
+    private static final int MAX_WINDOW = 8;
+    /** Number of leading (header) or trailing (footer) text blocks taken from each page. */
+    private static final int CANDIDATES_PER_PAGE = 3;
+    /** A block is reported when its best weighted average similarity exceeds this value. */
+    private static final double SCORE_THRESHOLD = 0.5;
+    /** The first block of a page is the most likely header, so weights fall from 1.0 to 0.5. */
+    private static final double[] HEADER_WEIGHTS = {1.0, 0.75, 0.5};
+    /** The last block of a page is the most likely footer, so weights rise from 0.5 to 1.0. */
+    private static final double[] FOOTER_WEIGHTS = {0.5, 0.75, 1.0};
+
+
+    /**
+     * Checks whether a block repeats among the last text blocks of the neighbouring pages.
+     *
+     * @param textPageBlock      Block to test.
+     * @param document           Document that supplies the page list.
+     * @param classificationPage Page the block belongs to.
+     * @return true if the block is likely a footer; always false for fewer than 3 pages.
+     */
+    public boolean isLikelyFooter(TextPageBlock textPageBlock, ClassificationDocument document, ClassificationPage classificationPage) {
+
+        return repeatsOnNearbyPages(textPageBlock, document, classificationPage, false);
+    }
+
+
+    /**
+     * Checks whether a block repeats among the first text blocks of the neighbouring pages.
+     *
+     * @param textPageBlock      Block to test.
+     * @param document           Document that supplies the page list.
+     * @param classificationPage Page the block belongs to.
+     * @return true if the block is likely a header; always false for fewer than 3 pages.
+     */
+    public boolean isLikelyHeader(TextPageBlock textPageBlock, ClassificationDocument document, ClassificationPage classificationPage) {
+
+        return repeatsOnNearbyPages(textPageBlock, document, classificationPage, true);
+    }
+
+
+    // Shared implementation of both public checks; header selects candidates, weights and alignment.
+    private boolean repeatsOnNearbyPages(TextPageBlock block, ClassificationDocument document, ClassificationPage page, boolean header) {
+
+        List<ClassificationPage> pages = document.getPages();
+        if (pages.size() < MIN_PAGES) {
+            return false;
+        }
+        int window = Math.min(pages.size(), MAX_WINDOW);
+        List<ClassificationPage> nearestPages = findNearestPages(page, pages, window);
+        List<List<AbstractPageBlock>> candidates = header ? getHeaderCandidates(nearestPages) : getFooterCandidates(nearestPages);
+        double[] weights = header ? HEADER_WEIGHTS : FOOTER_WEIGHTS;
+        return detectHeadersOrFootersByPageAssociation(block.getText(), candidates, window, weights, !header);
+    }
+
+
+    // Returns true if the best slot score over all candidate windows exceeds the threshold.
+    private boolean detectHeadersOrFootersByPageAssociation(String testString, List<List<AbstractPageBlock>> candidates, int window, double[] weights, boolean alignToEnd) {
+
+        List<List<String>> texts = candidates.stream()
+                .map(blocks -> blocks.stream().map(AbstractPageBlock::getText).toList())
+                .toList();
+
+        double highestScore = 0.0;
+        int previousFrom = -1;
+        int previousTo = -1;
+        for (int i = 0; i < texts.size(); i++) {
+            int from = Math.max(i - window, 0);
+            int to = Math.min(i + window, texts.size());
+            // Consecutive indices usually share one window; score every distinct window only once.
+            if (from != previousFrom || to != previousTo) {
+                highestScore = Math.max(highestScore, bestSlotScore(testString, texts.subList(from, to), weights, alignToEnd));
+                previousFrom = from;
+                previousTo = to;
+            }
+        }
+        return highestScore > SCORE_THRESHOLD;
+    }
+
+
+    // Best weighted average similarity between testString and the texts sharing a slot on the given pages.
+    private double bestSlotScore(String testString, List<List<String>> pageTexts, double[] weights, boolean alignToEnd) {
+
+        int slots = pageTexts.stream().mapToInt(List::size).max().orElse(0);
+        double best = 0.0;
+        for (int slot = 0; slot < slots; slot++) {
+            // Footers count slots and weights from the last block, headers from the first one, so a
+            // page with fewer blocks still lines up; a missing block is compared as an empty string.
+            int weightIndex = alignToEnd ? slot + weights.length - slots : slot;
+            double weight = weightIndex >= 0 && weightIndex < weights.length ? weights[weightIndex] : 1;
+            double sum = 0.0;
+            for (List<String> texts : pageTexts) {
+                int index = alignToEnd ? slot - (slots - texts.size()) : slot;
+                String other = index >= 0 && index < texts.size() ? texts.get(index) : "";
+                sum += compare(testString, other) * weight;
+            }
+            best = Math.max(best, sum / pageTexts.size());
+        }
+        return best;
+    }
+
+
+    // Share of positions with equal characters, relative to the longer text; all digits are treated alike.
+    private double compare(String a, String b) {
+
+        String left = a == null ? "" : a;
+        String right = b == null ? "" : b;
+        int longest = Math.max(left.length(), right.length());
+        if (longest == 0) {
+            // Two empty texts carry no evidence; this also avoids 0/0 = NaN poisoning the maximum.
+            return 0.0;
+        }
+        int count = 0;
+        for (int i = 0; i < Math.min(left.length(), right.length()); i++) {
+            if (maskDigit(left.charAt(i)) == maskDigit(right.charAt(i))) {
+                count++;
+            }
+        }
+        return (double) count / longest;
+    }
+
+
+    // Maps every ASCII digit to '@' so that changing page numbers do not lower the similarity.
+    private char maskDigit(char c) {
+
+        return c >= '0' && c <= '9' ? '@' : c;
+    }
+
+
+    /**
+     * Finds the pages within numNeighbors / 2 positions before and after a given page.
+     * For example: with 8 neighbours the nearest pages for page 6 are 2, 3, 4, 5, 7, 8, 9, 10.
+     * Fewer pages are returned when the given page is close to the start or end of the document.
+     *
+     * @param currentPage  Current page to find the nearest ones; its page number is taken as 1-based.
+     * @param allPages     All pages in the document.
+     * @param numNeighbors Number of neighbouring pages to find.
+     * @return The nearest pages in document order, without the current page.
+     */
+    private List<ClassificationPage> findNearestPages(ClassificationPage currentPage, List<ClassificationPage> allPages, int numNeighbors) {
+
+        int currentPageIndex = currentPage.getPageNumber() - 1;
+        int halfWin = numNeighbors / 2;
+        int start = Math.max(0, currentPageIndex - halfWin);
+        int end = Math.min(allPages.size() - 1, currentPageIndex + halfWin);
+
+        // Pages are read straight from the list; an index-keyed cache would mix up different documents.
+        List<ClassificationPage> nearestPages = new ArrayList<>();
+        for (int i = start; i <= end; i++) {
+            if (i != currentPageIndex) {
+                nearestPages.add(allPages.get(i));
+            }
+        }
+        return nearestPages;
+    }
+
+
+    // Get the last 3 TextBlocks on the page as they are likely to be a footer.
+    // NOTE(review): pages without text blocks are skipped here but kept as empty lists by
+    // getHeaderCandidates, where they lower the average score - confirm this asymmetry is wanted.
+    private List<List<AbstractPageBlock>> getFooterCandidates(List<ClassificationPage> pages) {
+
+        List<List<AbstractPageBlock>> footerCandidates = new ArrayList<>();
+        for (ClassificationPage page : pages) {
+            List<AbstractPageBlock> textBlocks = page.getTextBlocks();
+            int blockCount = textBlocks.size();
+            if (blockCount > 0) {
+                int start = Math.max(0, blockCount - CANDIDATES_PER_PAGE);
+                footerCandidates.add(new ArrayList<>(textBlocks.subList(start, blockCount)));
+            }
+        }
+        return footerCandidates;
+    }
+
+
+    // Get the first 3 TextBlocks on the page as they are likely to be a header.
+    private List<List<AbstractPageBlock>> getHeaderCandidates(List<ClassificationPage> pages) {
+
+        List<List<AbstractPageBlock>> headerCandidates = new ArrayList<>();
+        for (ClassificationPage page : pages) {
+            List<AbstractPageBlock> textBlocks = page.getTextBlocks();
+            int count = Math.min(CANDIDATES_PER_PAGE, textBlocks.size());
+            headerCandidates.add(new ArrayList<>(textBlocks.subList(0, count)));
+        }
+        return headerCandidates;
+    }
+
+}