Merge branch 'RED-10126' into 'main'
RM-187: Footers are recognized in the middle of the page. See merge request fforesight/layout-parser!233
This commit is contained in:
commit
23e23328ee
@ -1,5 +1,8 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.utils;
|
package com.knecon.fforesight.service.layoutparser.processor.utils;
|
||||||
|
|
||||||
|
import static com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType.FOOTER;
|
||||||
|
import static com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType.HEADER;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
@ -9,6 +12,7 @@ import java.util.stream.Collectors;
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||||
|
|
||||||
import lombok.experimental.UtilityClass;
|
import lombok.experimental.UtilityClass;
|
||||||
@ -26,35 +30,60 @@ public class HeaderFooterDetection {
|
|||||||
|
|
||||||
public boolean isLikelyFooter(TextPageBlock textPageBlock, ClassificationDocument document, ClassificationPage classificationPage) {
|
public boolean isLikelyFooter(TextPageBlock textPageBlock, ClassificationDocument document, ClassificationPage classificationPage) {
|
||||||
|
|
||||||
int numberOfPages = document.getPages().size();
|
return isLikelyHeaderFooter(textPageBlock, document, classificationPage, FOOTER);
|
||||||
if (numberOfPages < 3) {
|
|
||||||
// If the document has 1 or 2 pages this may lead to more false positives than actual findings.
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
int window = Math.min(numberOfPages, 8);
|
|
||||||
|
|
||||||
List<ClassificationPage> nearestPages = findNearestPages(classificationPage, document.getPages(), window);
|
|
||||||
List<List<AbstractPageBlock>> footerCandidates = getFooterCandidates(nearestPages);
|
|
||||||
|
|
||||||
return detectHeadersOrFootersByPageAssociation(textPageBlock.getText(), footerCandidates, window, footerWeights);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public boolean isLikelyHeader(TextPageBlock textPageBlock, ClassificationDocument document, ClassificationPage classificationPage) {
|
public boolean isLikelyHeader(TextPageBlock textPageBlock, ClassificationDocument document, ClassificationPage classificationPage) {
|
||||||
|
|
||||||
|
return isLikelyHeaderFooter(textPageBlock, document, classificationPage, HEADER);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private boolean isLikelyHeaderFooter(TextPageBlock textPageBlock, ClassificationDocument document, ClassificationPage classificationPage, PageBlockType type) {
|
||||||
|
|
||||||
int numberOfPages = document.getPages().size();
|
int numberOfPages = document.getPages().size();
|
||||||
if (numberOfPages < 3) {
|
if (numberOfPages < 3) {
|
||||||
// If the document has 1 or 2 pages this may lead to more false positives than actual findings.
|
// If the document has 1 or 2 pages this may lead to more false positives than actual findings.
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
List<TextPageBlock> textPageBlocks = classificationPage.getTextBlocks()
|
||||||
|
.stream()
|
||||||
|
.filter(TextPageBlock.class::isInstance)
|
||||||
|
.map(TextPageBlock.class::cast)
|
||||||
|
.collect(Collectors.toList());
|
||||||
|
|
||||||
|
if (textPageBlocks.isEmpty()) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
List<TextPageBlock> selectedBlocks;
|
||||||
|
if (type == HEADER) {
|
||||||
|
selectedBlocks = textPageBlocks.subList(0, Math.min(3, textPageBlocks.size()));
|
||||||
|
} else { //FOOTER
|
||||||
|
selectedBlocks = textPageBlocks.subList(Math.max(0, textPageBlocks.size() - 3), textPageBlocks.size());
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!selectedBlocks.contains(textPageBlock)) {
|
||||||
|
// The textPageBlock is not among the selected blocks on its page
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
int window = Math.min(numberOfPages, 8);
|
int window = Math.min(numberOfPages, 8);
|
||||||
|
|
||||||
List<ClassificationPage> nearestPages = findNearestPages(classificationPage, document.getPages(), window);
|
List<ClassificationPage> nearestPages = findNearestPages(classificationPage, document.getPages(), window);
|
||||||
List<List<AbstractPageBlock>> headerCandidates = getHeaderCandidates(nearestPages);
|
|
||||||
|
|
||||||
return detectHeadersOrFootersByPageAssociation(textPageBlock.getText(), headerCandidates, window, headerWeights);
|
List<List<AbstractPageBlock>> candidates;
|
||||||
|
double[] weights;
|
||||||
|
if (type == HEADER) {
|
||||||
|
candidates = getHeaderCandidates(nearestPages);
|
||||||
|
weights = headerWeights;
|
||||||
|
} else { //FOOTER
|
||||||
|
candidates = getFooterCandidates(nearestPages);
|
||||||
|
weights = footerWeights;
|
||||||
|
}
|
||||||
|
|
||||||
|
return detectHeadersOrFootersByPageAssociation(textPageBlock.getText(), candidates, window, weights);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user