RED-9149 - Header and footer extraction by page-association
This commit is contained in:
parent
471fadbcca
commit
fda25852d1
@ -12,6 +12,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.utils.HeaderFooterDetection;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
|
||||||
|
|
||||||
@ -49,6 +50,7 @@ public class DocuMineClassificationService {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private void classifyBlock(TextPageBlock textBlock, ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes) {
|
private void classifyBlock(TextPageBlock textBlock, ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes) {
|
||||||
|
|
||||||
log.debug("headlineFontSizes: {}", headlineFontSizes);
|
log.debug("headlineFontSizes: {}", headlineFontSizes);
|
||||||
@ -63,15 +65,26 @@ public class DocuMineClassificationService {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER)
|
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER)
|
||||||
|| PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null
|
|| (PositionUtils.isOverBodyTextFrame(bodyTextFrame,
|
||||||
|| textBlock.getHighestFontSize() <= document.getFontSizeCounter()
|
textBlock,
|
||||||
.getMostPopular())) {
|
page.getRotation()) && (document.getFontSizeCounter().getMostPopular()
|
||||||
|
== null
|
||||||
|
|| textBlock.getHighestFontSize()
|
||||||
|
<= document.getFontSizeCounter()
|
||||||
|
.getMostPopular()))
|
||||||
|
|| HeaderFooterDetection.isLikelyHeader(textBlock, document, page)) {
|
||||||
textBlock.setClassification(PageBlockType.HEADER);
|
textBlock.setClassification(PageBlockType.HEADER);
|
||||||
|
|
||||||
} else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER)
|
} else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER)
|
||||||
|| PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null
|
|| (PositionUtils.isUnderBodyTextFrame(bodyTextFrame,
|
||||||
|| textBlock.getHighestFontSize() <= document.getFontSizeCounter()
|
textBlock,
|
||||||
.getMostPopular())) {
|
page.getRotation())
|
||||||
|
&& (document.getFontSizeCounter().getMostPopular()
|
||||||
|
== null
|
||||||
|
|| textBlock.getHighestFontSize()
|
||||||
|
<= document.getFontSizeCounter()
|
||||||
|
.getMostPopular()))
|
||||||
|
|| HeaderFooterDetection.isLikelyFooter(textBlock, document, page)) {
|
||||||
textBlock.setClassification(PageBlockType.FOOTER);
|
textBlock.setClassification(PageBlockType.FOOTER);
|
||||||
} else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter().getMostPopular()) > 2.5
|
} else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter().getMostPopular()) > 2.5
|
||||||
&& textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks().size() == 1)) {
|
&& textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks().size() == 1)) {
|
||||||
|
|||||||
@ -0,0 +1,180 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.utils;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||||
|
|
||||||
|
import lombok.experimental.UtilityClass;
|
||||||
|
|
||||||
|
@UtilityClass
|
||||||
|
public class HeaderFooterDetection {
|
||||||
|
|
||||||
|
private final Map<Integer, ClassificationPage> pagesCache = new HashMap<>();
|
||||||
|
|
||||||
|
|
||||||
|
public boolean isLikelyFooter(TextPageBlock textPageBlock, ClassificationDocument document, ClassificationPage classificationPage) {
|
||||||
|
|
||||||
|
int numberOfPages = document.getPages().size();
|
||||||
|
if (numberOfPages < 3) {
|
||||||
|
// If the document has 1 or 2 pages this may lead to more false positives than actual findings.
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
int window = Math.min(numberOfPages, 8);
|
||||||
|
|
||||||
|
List<ClassificationPage> nearestPages = findNearestPages(classificationPage, document.getPages(), window);
|
||||||
|
List<List<AbstractPageBlock>> footerCandidates = getFooterCandidates(nearestPages);
|
||||||
|
|
||||||
|
// Weight will go from 0.5 to 1.0 because the last element is the most likely to be the footer on the page.
|
||||||
|
double[] footerWeights = {0.5, 0.75, 1.0};
|
||||||
|
return detectHeadersOrFootersByPageAssociation(textPageBlock.getText(), footerCandidates, window, footerWeights);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean isLikelyHeader(TextPageBlock textPageBlock, ClassificationDocument document, ClassificationPage classificationPage) {
|
||||||
|
|
||||||
|
int numberOfPages = document.getPages().size();
|
||||||
|
if (numberOfPages < 3) {
|
||||||
|
// If the document has 1 or 2 pages this may lead to more false positives than actual findings.
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
int window = Math.min(numberOfPages, 8);
|
||||||
|
|
||||||
|
List<ClassificationPage> nearestPages = findNearestPages(classificationPage, document.getPages(), window);
|
||||||
|
List<List<AbstractPageBlock>> headerCandidates = getHeaderCandidates(nearestPages);
|
||||||
|
|
||||||
|
// Weight will go from 1.0 to 0.5 because the first element is the most likely to be the header on the page.
|
||||||
|
double[] headerWeights = {1.0, 0.75, 0.5};
|
||||||
|
return detectHeadersOrFootersByPageAssociation(textPageBlock.getText(), headerCandidates, window, headerWeights);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private boolean detectHeadersOrFootersByPageAssociation(String testString, List<List<AbstractPageBlock>> candidates, int window, double[] weights) {
|
||||||
|
|
||||||
|
double highestScore = 0.0;
|
||||||
|
|
||||||
|
for (int i = 0; i < candidates.size(); i++) {
|
||||||
|
List<List<String>> temp = new ArrayList<>();
|
||||||
|
for (int k = Math.max(i - window, 0); k < Math.min(i + window, candidates.size()); k++) {
|
||||||
|
temp.add(candidates.get(k)
|
||||||
|
.stream()
|
||||||
|
.map(AbstractPageBlock::getText)
|
||||||
|
.collect(Collectors.toList()));
|
||||||
|
}
|
||||||
|
|
||||||
|
int maxLen = temp.stream()
|
||||||
|
.mapToInt(List::size)
|
||||||
|
.max()
|
||||||
|
.orElse(0);
|
||||||
|
for (List<String> sublist : temp) {
|
||||||
|
while (sublist.size() < maxLen) {
|
||||||
|
sublist.add(0, "");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Compare the testString against each candidates in the window
|
||||||
|
for (int j = 0; j < maxLen; j++) {
|
||||||
|
double score = 0.0;
|
||||||
|
try {
|
||||||
|
int finalJ = j;
|
||||||
|
List<String> cmp = temp.stream()
|
||||||
|
.map(sublist -> sublist.size() > finalJ ? sublist.get(finalJ) : "")
|
||||||
|
.toList();
|
||||||
|
for (String cm : cmp) {
|
||||||
|
score += compare(testString, cm) * (j < weights.length ? weights[j] : 1);
|
||||||
|
}
|
||||||
|
score /= cmp.size();
|
||||||
|
} catch (IndexOutOfBoundsException e) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
highestScore = Math.max(highestScore, score);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return highestScore > 0.5;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private double compare(String a, String b) {
|
||||||
|
|
||||||
|
int count = 0;
|
||||||
|
a = a.replaceAll("\\d", "@");
|
||||||
|
b = b.replaceAll("\\d", "@");
|
||||||
|
|
||||||
|
for (int i = 0; i < Math.min(a.length(), b.length()); i++) {
|
||||||
|
if (a.charAt(i) == b.charAt(i)) {
|
||||||
|
count++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return (double) count / Math.max(a.length(), b.length());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Find the nearest n pages for a given page.
|
||||||
|
* For example: nearest 8 pages for page 4 are: 1, 2, 3, 5, 6, 7, 8, 9.
|
||||||
|
*
|
||||||
|
* @param currentPage Current page to find the nearest ones.
|
||||||
|
* @param allPages All pages in the document.
|
||||||
|
* @param numNeighbors Number of neighbouring pages to find.
|
||||||
|
* @return The nearest pages.
|
||||||
|
*/
|
||||||
|
private List<ClassificationPage> findNearestPages(ClassificationPage currentPage, List<ClassificationPage> allPages, int numNeighbors) {
|
||||||
|
|
||||||
|
int totalPages = allPages.size();
|
||||||
|
List<ClassificationPage> nearestPages = new ArrayList<>();
|
||||||
|
|
||||||
|
int currentPageIndex = currentPage.getPageNumber() - 1;
|
||||||
|
int halfWin = numNeighbors / 2;
|
||||||
|
int start = Math.max(0, currentPageIndex - halfWin);
|
||||||
|
int end = Math.min(totalPages - 1, currentPageIndex + halfWin);
|
||||||
|
|
||||||
|
for (int i = start; i <= end; i++) {
|
||||||
|
if (i != currentPageIndex) {
|
||||||
|
nearestPages.add(pagesCache.computeIfAbsent(i, idx -> allPages.get(idx)));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pagesCache.keySet().removeIf(key -> key < start || key > end);
|
||||||
|
|
||||||
|
return nearestPages;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// Get the last 3 TextBlocks on the page as they are likely to be a footer
|
||||||
|
private List<List<AbstractPageBlock>> getFooterCandidates(List<ClassificationPage> pages) {
|
||||||
|
|
||||||
|
List<List<AbstractPageBlock>> footerCandidates = new ArrayList<>();
|
||||||
|
for (ClassificationPage page : pages) {
|
||||||
|
List<AbstractPageBlock> textBlocks = page.getTextBlocks();
|
||||||
|
int blockCount = textBlocks.size();
|
||||||
|
if (blockCount > 0) {
|
||||||
|
int start = Math.max(0, blockCount - 3);
|
||||||
|
footerCandidates.add(new ArrayList<>(textBlocks.subList(start, blockCount)));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return footerCandidates;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// Get the first 3 TextBlocks on the page as they are likely to be a header
|
||||||
|
private List<List<AbstractPageBlock>> getHeaderCandidates(List<ClassificationPage> pages) {
|
||||||
|
|
||||||
|
List<List<AbstractPageBlock>> headerCandidates = new ArrayList<>();
|
||||||
|
for (ClassificationPage page : pages) {
|
||||||
|
List<AbstractPageBlock> textBlocks = page.getTextBlocks();
|
||||||
|
int count = Math.min(3, textBlocks.size());
|
||||||
|
headerCandidates.add(new ArrayList<>(textBlocks.subList(0, count)));
|
||||||
|
}
|
||||||
|
return headerCandidates;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
Loading…
x
Reference in New Issue
Block a user