Merge branch 'RED-10126' into 'main'

RM-187: Footers are recognized in the middle of the page

See merge request fforesight/layout-parser!233
This commit is contained in:
Maverick Studer 2024-10-08 14:27:45 +02:00
commit 23e23328ee

View File

@ -1,5 +1,8 @@
package com.knecon.fforesight.service.layoutparser.processor.utils;
import static com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType.FOOTER;
import static com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType.HEADER;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
@ -9,6 +12,7 @@ import java.util.stream.Collectors;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import lombok.experimental.UtilityClass;
@ -26,35 +30,60 @@ public class HeaderFooterDetection {
/**
 * Checks whether the given text block is likely a footer of its page.
 *
 * <p>This is a thin wrapper: the page-count guard, the position-on-page check and the
 * comparison against neighbouring pages are all performed by {@code isLikelyHeaderFooter},
 * which is invoked here with {@code FOOTER} as the block type.
 *
 * @param textPageBlock      the text block to classify
 * @param document           the document the block belongs to
 * @param classificationPage the page on which the block is located
 * @return the result of {@code isLikelyHeaderFooter} for block type {@code FOOTER}
 */
public boolean isLikelyFooter(TextPageBlock textPageBlock, ClassificationDocument document, ClassificationPage classificationPage) {

    // The former inline implementation (page-count guard, window, footer candidates) is intentionally
    // gone: keeping it in front of this call would return early and make the delegation unreachable.
    return isLikelyHeaderFooter(textPageBlock, document, classificationPage, FOOTER);
}
/**
 * Checks whether the given text block is likely a header of its page.
 *
 * <p>Delegates to {@code isLikelyHeaderFooter} with {@code HEADER} as the block type.
 *
 * @param textPageBlock      the text block to classify
 * @param document           the document the block belongs to
 * @param classificationPage the page on which the block is located
 * @return the result of {@code isLikelyHeaderFooter} for block type {@code HEADER}
 */
public boolean isLikelyHeader(TextPageBlock textPageBlock, ClassificationDocument document, ClassificationPage classificationPage) {

    final PageBlockType blockType = HEADER;
    return isLikelyHeaderFooter(textPageBlock, document, classificationPage, blockType);
}
/**
 * Shared implementation behind {@code isLikelyHeader} and {@code isLikelyFooter}.
 *
 * <p>The block is rejected early when
 * <ul>
 *   <li>the document has fewer than 3 pages,</li>
 *   <li>the page contains no {@code TextPageBlock}s, or</li>
 *   <li>the block is not among the first three (HEADER) or last three (any other type, i.e. FOOTER)
 *       text blocks of its page, in the order returned by {@code getTextBlocks()}.</li>
 * </ul>
 * Otherwise the block's text is handed to {@code detectHeadersOrFootersByPageAssociation} together
 * with the header or footer candidates of the pages returned by {@code findNearestPages}.
 *
 * @param textPageBlock      the text block to classify
 * @param document           the document the block belongs to
 * @param classificationPage the page on which the block is located
 * @param type               {@code HEADER} to test for a header; every other value is handled as {@code FOOTER}
 * @return {@code false} if one of the early-exit conditions applies, otherwise the result of
 *         {@code detectHeadersOrFootersByPageAssociation}
 */
private boolean isLikelyHeaderFooter(TextPageBlock textPageBlock, ClassificationDocument document, ClassificationPage classificationPage, PageBlockType type) {

    // Number of text blocks at the start/end of a page that may be a header/footer at all.
    final int edgeBlockCount = 3;
    // Upper bound for the number of pages passed to findNearestPages.
    final int maxWindow = 8;

    int numberOfPages = document.getPages().size();
    if (numberOfPages < 3) {
        // If the document has 1 or 2 pages this may lead to more false positives than actual findings.
        return false;
    }

    List<TextPageBlock> textPageBlocks = classificationPage.getTextBlocks()
            .stream()
            .filter(TextPageBlock.class::isInstance)
            .map(TextPageBlock.class::cast)
            .collect(Collectors.toList());
    if (textPageBlocks.isEmpty()) {
        return false;
    }

    // Decide once which side of the page is inspected; anything that is not HEADER is handled as FOOTER.
    boolean isHeader = type == HEADER;
    int blockCount = textPageBlocks.size();

    List<TextPageBlock> selectedBlocks;
    if (isHeader) {
        selectedBlocks = textPageBlocks.subList(0, Math.min(edgeBlockCount, blockCount));
    } else {
        selectedBlocks = textPageBlocks.subList(Math.max(0, blockCount - edgeBlockCount), blockCount);
    }
    if (!selectedBlocks.contains(textPageBlock)) {
        // The textPageBlock is not among the selected blocks on its page, so a block from the
        // middle of the page is never reported as header or footer.
        return false;
    }

    int window = Math.min(numberOfPages, maxWindow);
    List<ClassificationPage> nearestPages = findNearestPages(classificationPage, document.getPages(), window);

    // Candidates and weights must match the requested type; no return may precede this selection.
    List<List<AbstractPageBlock>> candidates;
    double[] weights;
    if (isHeader) {
        candidates = getHeaderCandidates(nearestPages);
        weights = headerWeights;
    } else {
        candidates = getFooterCandidates(nearestPages);
        weights = footerWeights;
    }

    return detectHeadersOrFootersByPageAssociation(textPageBlock.getText(), candidates, window, weights);
}