Merge branch 'RED-10126-bp' into 'release/0.159.x'
RM-187: Footers are recognized in the middle of the page. See merge request fforesight/layout-parser!234.
This commit is contained in:
commit
072ad3bf23
@ -1,5 +1,8 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.utils;
|
||||
|
||||
import static com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType.FOOTER;
|
||||
import static com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType.HEADER;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
@ -9,6 +12,7 @@ import java.util.stream.Collectors;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
@ -26,35 +30,60 @@ public class HeaderFooterDetection {
|
||||
|
||||
public boolean isLikelyFooter(TextPageBlock textPageBlock, ClassificationDocument document, ClassificationPage classificationPage) {
|
||||
|
||||
int numberOfPages = document.getPages().size();
|
||||
if (numberOfPages < 3) {
|
||||
// If the document has 1 or 2 pages this may lead to more false positives than actual findings.
|
||||
return false;
|
||||
}
|
||||
|
||||
int window = Math.min(numberOfPages, 8);
|
||||
|
||||
List<ClassificationPage> nearestPages = findNearestPages(classificationPage, document.getPages(), window);
|
||||
List<List<AbstractPageBlock>> footerCandidates = getFooterCandidates(nearestPages);
|
||||
|
||||
return detectHeadersOrFootersByPageAssociation(textPageBlock.getText(), footerCandidates, window, footerWeights);
|
||||
return isLikelyHeaderFooter(textPageBlock, document, classificationPage, FOOTER);
|
||||
}
|
||||
|
||||
|
||||
public boolean isLikelyHeader(TextPageBlock textPageBlock, ClassificationDocument document, ClassificationPage classificationPage) {
|
||||
|
||||
return isLikelyHeaderFooter(textPageBlock, document, classificationPage, HEADER);
|
||||
}
|
||||
|
||||
|
||||
private boolean isLikelyHeaderFooter(TextPageBlock textPageBlock, ClassificationDocument document, ClassificationPage classificationPage, PageBlockType type) {
|
||||
|
||||
int numberOfPages = document.getPages().size();
|
||||
if (numberOfPages < 3) {
|
||||
// If the document has 1 or 2 pages this may lead to more false positives than actual findings.
|
||||
return false;
|
||||
}
|
||||
|
||||
List<TextPageBlock> textPageBlocks = classificationPage.getTextBlocks()
|
||||
.stream()
|
||||
.filter(TextPageBlock.class::isInstance)
|
||||
.map(TextPageBlock.class::cast)
|
||||
.collect(Collectors.toList());
|
||||
|
||||
if (textPageBlocks.isEmpty()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
List<TextPageBlock> selectedBlocks;
|
||||
if (type == HEADER) {
|
||||
selectedBlocks = textPageBlocks.subList(0, Math.min(3, textPageBlocks.size()));
|
||||
} else { //FOOTER
|
||||
selectedBlocks = textPageBlocks.subList(Math.max(0, textPageBlocks.size() - 3), textPageBlocks.size());
|
||||
}
|
||||
|
||||
if (!selectedBlocks.contains(textPageBlock)) {
|
||||
// The textPageBlock is not among the selected blocks on its page
|
||||
return false;
|
||||
}
|
||||
|
||||
int window = Math.min(numberOfPages, 8);
|
||||
|
||||
List<ClassificationPage> nearestPages = findNearestPages(classificationPage, document.getPages(), window);
|
||||
List<List<AbstractPageBlock>> headerCandidates = getHeaderCandidates(nearestPages);
|
||||
|
||||
return detectHeadersOrFootersByPageAssociation(textPageBlock.getText(), headerCandidates, window, headerWeights);
|
||||
List<List<AbstractPageBlock>> candidates;
|
||||
double[] weights;
|
||||
if (type == HEADER) {
|
||||
candidates = getHeaderCandidates(nearestPages);
|
||||
weights = headerWeights;
|
||||
} else { //FOOTER
|
||||
candidates = getFooterCandidates(nearestPages);
|
||||
weights = footerWeights;
|
||||
}
|
||||
|
||||
return detectHeadersOrFootersByPageAssociation(textPageBlock.getText(), candidates, window, weights);
|
||||
}
|
||||
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user