Merge branch 'RED-9148-backport' into 'release/0.78.x'

RED-9149 - Header and footer detection by page-association

See merge request fforesight/layout-parser!151
This commit is contained in:
Kilian Schüttler 2024-05-13 14:57:51 +02:00
commit e2a5b85c4a
2 changed files with 241 additions and 6 deletions

View File

@ -12,6 +12,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.utils.HeaderFooterDetection;
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
@ -64,15 +65,26 @@ public class DocuMineClassificationService {
return;
}
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER)
|| PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null
|| textBlock.getHighestFontSize() <= document.getFontSizeCounter()
.getMostPopular())) {
|| (PositionUtils.isOverBodyTextFrame(bodyTextFrame,
textBlock,
page.getRotation()) && (document.getFontSizeCounter().getMostPopular()
== null
|| textBlock.getHighestFontSize()
<= document.getFontSizeCounter()
.getMostPopular()))
|| HeaderFooterDetection.isLikelyHeader(textBlock, document, page)) {
textBlock.setClassification(PageBlockType.HEADER);
} else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER)
|| PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null
|| textBlock.getHighestFontSize() <= document.getFontSizeCounter()
.getMostPopular())) {
|| (PositionUtils.isUnderBodyTextFrame(bodyTextFrame,
textBlock,
page.getRotation())
&& (document.getFontSizeCounter().getMostPopular()
== null
|| textBlock.getHighestFontSize()
<= document.getFontSizeCounter()
.getMostPopular()))
|| HeaderFooterDetection.isLikelyFooter(textBlock, document, page)) {
textBlock.setClassification(PageBlockType.FOOTER);
} else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter().getMostPopular()) > 2.5
&& textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks().size() == 1)) {

View File

@ -0,0 +1,223 @@
package com.knecon.fforesight.service.layoutparser.processor.utils;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import lombok.experimental.UtilityClass;
/**
 * Detects running headers and footers by page association: a text block is considered a header
 * (or footer) when its text is sufficiently similar to the first (or last) text blocks of the
 * neighbouring pages.
 *
 * <p>The class keeps no mutable state; every call works only on the arguments passed in.
 */
@UtilityClass
public class HeaderFooterDetection {

    /** Weighted average similarity that must be exceeded for a block to count as header/footer. */
    private static final double THRESHOLD = 0.5;

    /** Documents with fewer pages are never analysed. */
    private static final int MIN_PAGES = 3;

    /** Upper bound for the neighbourhood window passed to {@code findNearestPages}. */
    private static final int MAX_WINDOW = 8;

    /** Number of leading (header) or trailing (footer) text blocks inspected per page. */
    private static final int CANDIDATES_PER_PAGE = 3;

    /** Strings shorter than this are always compared, regardless of their length ratio. */
    private static final int MIN_LENGTH_FOR_RATIO_CHECK = 5;

    // Weight will go from 1.0 to 0.5 because the first element is the most likely to be the header on the page.
    private static final double[] HEADER_WEIGHTS = {1.0, 0.75, 0.5};

    // Weight will go from 0.5 to 1.0 because the last element is the most likely to be the footer on the page.
    private static final double[] FOOTER_WEIGHTS = {0.5, 0.75, 1.0};


    /**
     * Checks whether the text of the given block resembles the trailing text blocks of the neighbouring pages.
     *
     * @param textPageBlock      Block whose text is tested.
     * @param document           Document providing the list of all pages.
     * @param classificationPage Page the block belongs to; its page number selects the neighbourhood.
     * @return {@code true} if the weighted similarity exceeds the threshold; always {@code false}
     * for documents with fewer than three pages.
     */
    public boolean isLikelyFooter(TextPageBlock textPageBlock, ClassificationDocument document, ClassificationPage classificationPage) {

        return isLikelyHeaderOrFooter(textPageBlock, document, classificationPage, false);
    }


    /**
     * Checks whether the text of the given block resembles the leading text blocks of the neighbouring pages.
     *
     * @param textPageBlock      Block whose text is tested.
     * @param document           Document providing the list of all pages.
     * @param classificationPage Page the block belongs to; its page number selects the neighbourhood.
     * @return {@code true} if the weighted similarity exceeds the threshold; always {@code false}
     * for documents with fewer than three pages.
     */
    public boolean isLikelyHeader(TextPageBlock textPageBlock, ClassificationDocument document, ClassificationPage classificationPage) {

        return isLikelyHeaderOrFooter(textPageBlock, document, classificationPage, true);
    }


    // Shared implementation of the two public entry points; 'header' selects candidates, weights and padding side.
    private boolean isLikelyHeaderOrFooter(TextPageBlock textPageBlock, ClassificationDocument document, ClassificationPage classificationPage, boolean header) {

        List<ClassificationPage> allPages = document.getPages();
        int numberOfPages = allPages.size();
        if (numberOfPages < MIN_PAGES) {
            // If the document has 1 or 2 pages this may lead to more false positives than actual findings.
            return false;
        }

        int window = Math.min(numberOfPages, MAX_WINDOW);
        List<ClassificationPage> nearestPages = findNearestPages(classificationPage, allPages, window);
        List<List<AbstractPageBlock>> candidates = header ? getHeaderCandidates(nearestPages) : getFooterCandidates(nearestPages);
        double[] weights = header ? HEADER_WEIGHTS : FOOTER_WEIGHTS;

        // Footers are aligned on their last block (pad in front), headers on their first block (pad at the end).
        return detectHeadersOrFootersByPageAssociation(textPageBlock.getText(), candidates, window, weights, !header);
    }


    /**
     * Slides a window over the per-page candidate lists and reports whether, at any candidate
     * position, the weighted average similarity to {@code testString} exceeds {@link #THRESHOLD}.
     *
     * @param testString Text to test; {@code null} or empty text never matches.
     * @param candidates One list of candidate blocks per neighbouring page.
     * @param window     Number of candidate lists taken on either side of the current index.
     * @param weights    Weight per candidate position; positions beyond the array use weight 1.
     * @param padAtFront {@code true} to left-pad shorter candidate lists with empty strings
     *                   (aligns the last blocks), {@code false} to right-pad (aligns the first blocks).
     * @return {@code true} as soon as one position scores above the threshold.
     */
    private boolean detectHeadersOrFootersByPageAssociation(String testString,
                                                            List<List<AbstractPageBlock>> candidates,
                                                            int window,
                                                            double[] weights,
                                                            boolean padAtFront) {

        if (testString == null || testString.isEmpty()) {
            // An empty text can never score above the threshold; comparing it with an empty
            // candidate would additionally produce 0/0 = NaN and poison the score.
            return false;
        }

        int previousFrom = -1;
        int previousTo = -1;
        for (int i = 0; i < candidates.size(); i++) {
            int from = Math.max(i - window, 0);
            int to = Math.min(i + window, candidates.size());
            if (from >= to || (from == previousFrom && to == previousTo)) {
                // Same slice as in the previous iteration: the outcome would be identical.
                continue;
            }
            previousFrom = from;
            previousTo = to;

            List<List<AbstractPageBlock>> windowCandidates = candidates.subList(from, to);
            int maxLen = windowCandidates.stream()
                    .mapToInt(List::size)
                    .max()
                    .orElse(0);

            List<List<String>> alignedTexts = new ArrayList<>(windowCandidates.size());
            for (List<AbstractPageBlock> pageCandidates : windowCandidates) {
                alignedTexts.add(toAlignedTexts(pageCandidates, maxLen, padAtFront));
            }

            if (hasPositionAboveThreshold(testString, alignedTexts, maxLen, weights)) {
                return true;
            }
        }
        return false;
    }


    // Extracts the block texts into a mutable list and pads it with empty strings up to the requested length.
    private List<String> toAlignedTexts(List<AbstractPageBlock> blocks, int length, boolean padAtFront) {

        // toCollection(ArrayList::new) guarantees a mutable list; Collectors.toList() does not.
        List<String> texts = blocks.stream()
                .map(AbstractPageBlock::getText)
                .collect(Collectors.toCollection(ArrayList::new));
        while (texts.size() < length) {
            if (padAtFront) {
                texts.add(0, "");
            } else {
                texts.add("");
            }
        }
        return texts;
    }


    // Compares the test string against every page at each candidate position and checks the averaged, weighted score.
    private boolean hasPositionAboveThreshold(String testString, List<List<String>> alignedTexts, int positions, double[] weights) {

        for (int position = 0; position < positions; position++) {
            // NOTE(review): weights are indexed from the front even when fewer than weights.length
            // positions exist, so footer windows with fewer than three positions never reach weight 1.0.
            double weight = position < weights.length ? weights[position] : 1;
            double score = 0.0;
            for (List<String> pageTexts : alignedTexts) {
                String candidate = pageTexts.get(position);
                if (differsTooMuchInLength(testString, candidate)) {
                    // Skipped candidates still count in the denominator below, i.e. they contribute a similarity of 0.
                    continue;
                }
                int distance = calculateHammingDistanceWithPreprocessing(testString, candidate);
                // testString is non-empty here, so the divisor is always positive.
                double normalizedScore = 1 - (double) distance / Math.max(testString.length(), candidate.length());
                score += normalizedScore * weight;
            }
            score /= alignedTexts.size();
            if (score > THRESHOLD) {
                return true;
            }
        }
        return false;
    }


    // If both strings are at least 5 characters long and one string is more than twice as long as the other,
    // the distance calculation is skipped, and we assume they are not similar enough.
    private boolean differsTooMuchInLength(String first, String second) {

        int firstLength = first.length();
        int secondLength = second.length();
        return firstLength >= MIN_LENGTH_FOR_RATIO_CHECK
                && secondLength >= MIN_LENGTH_FOR_RATIO_CHECK
                && (firstLength > 2 * secondLength || secondLength > 2 * firstLength);
    }


    /**
     * Collects the pages surrounding a given page: up to {@code numNeighbors / 2} pages before and
     * up to {@code numNeighbors / 2} pages after it. The window is not extended when it is clipped
     * at the start or end of the document, so fewer pages may be returned.
     * For example: with 8 neighbours, the nearest pages for page 4 are: 1, 2, 3, 5, 6, 7, 8.
     *
     * <p>The position of the current page is derived as {@code getPageNumber() - 1}, i.e. page
     * numbers are assumed to be 1-based and to match the position in {@code allPages}.
     *
     * @param currentPage  Current page to find the nearest ones.
     * @param allPages     All pages in the document.
     * @param numNeighbors Size of the neighbourhood; half of it is taken on either side.
     * @return The nearest pages in document order, without the current page.
     */
    private List<ClassificationPage> findNearestPages(ClassificationPage currentPage, List<ClassificationPage> allPages, int numNeighbors) {

        int currentPageIndex = currentPage.getPageNumber() - 1;
        int halfWin = numNeighbors / 2;
        int start = Math.max(0, currentPageIndex - halfWin);
        int end = Math.min(allPages.size() - 1, currentPageIndex + halfWin);

        // Pages are read straight from the list. A shared index-keyed cache would hand out pages
        // of a previously processed document and is unsafe under concurrent use.
        List<ClassificationPage> nearestPages = new ArrayList<>();
        for (int i = start; i <= end; i++) {
            if (i != currentPageIndex) {
                nearestPages.add(allPages.get(i));
            }
        }
        return nearestPages;
    }


    // Get the last 3 TextBlocks on the page as they are likely to be a footer.
    // Pages without any TextPageBlock are skipped and therefore do not count in the score average.
    private List<List<AbstractPageBlock>> getFooterCandidates(List<ClassificationPage> pages) {

        List<List<AbstractPageBlock>> footerCandidates = new ArrayList<>();
        for (ClassificationPage page : pages) {
            List<TextPageBlock> textPageBlocks = textPageBlocksOf(page);
            int blockCount = textPageBlocks.size();
            if (blockCount > 0) {
                int start = Math.max(0, blockCount - CANDIDATES_PER_PAGE);
                footerCandidates.add(new ArrayList<>(textPageBlocks.subList(start, blockCount)));
            }
        }
        return footerCandidates;
    }


    // Get the first 3 TextBlocks on the page as they are likely to be a header.
    // NOTE(review): unlike the footer variant, pages without any TextPageBlock contribute an empty
    // list here and thus lower the averaged score - confirm whether this difference is intended.
    private List<List<AbstractPageBlock>> getHeaderCandidates(List<ClassificationPage> pages) {

        List<List<AbstractPageBlock>> headerCandidates = new ArrayList<>();
        for (ClassificationPage page : pages) {
            List<TextPageBlock> textPageBlocks = textPageBlocksOf(page);
            int count = Math.min(CANDIDATES_PER_PAGE, textPageBlocks.size());
            headerCandidates.add(new ArrayList<>(textPageBlocks.subList(0, count)));
        }
        return headerCandidates;
    }


    // Returns the page's text blocks that are TextPageBlocks, in page order.
    private List<TextPageBlock> textPageBlocksOf(ClassificationPage page) {

        return page.getTextBlocks()
                .stream()
                .filter(TextPageBlock.class::isInstance)
                .map(TextPageBlock.class::cast)
                .toList();
    }


    /**
     * Calculate the Hamming distance between two strings after preprocessing: the shorter string is
     * treated as if it were padded with '\0' to the length of the longer one, and all ASCII digits
     * count as the special character '@' since they are a common occurrence in headers/footers.
     *
     * @param firstCandidate  First string
     * @param secondCandidate Second string
     * @return The number of positions at which the two preprocessed strings differ.
     */
    private int calculateHammingDistanceWithPreprocessing(String firstCandidate, String secondCandidate) {

        int maxLength = Math.max(firstCandidate.length(), secondCandidate.length());
        int distance = 0;
        for (int i = 0; i < maxLength; i++) {
            if (preprocessedCharAt(firstCandidate, i) != preprocessedCharAt(secondCandidate, i)) {
                distance++;
            }
        }
        return distance;
    }


    // Character at the given index after preprocessing: '\0' beyond the end, '@' for the digits 0-9.
    private char preprocessedCharAt(String text, int index) {

        if (index >= text.length()) {
            return '\0';
        }
        char character = text.charAt(index);
        return character >= '0' && character <= '9' ? '@' : character;
    }

}