RED-9139: more robust TOC detection

* detect numbers in words, and not just whole words that are numbers
This commit is contained in:
Kilian Schuettler 2024-11-08 12:15:56 +01:00
parent e6cd889444
commit ce41014d4b
3 changed files with 80 additions and 43 deletions

View File

@ -5,12 +5,12 @@ import java.util.HashMap;
import com.knecon.fforesight.service.layoutparser.processor.services.classification.NumberWord;
public class TextPositionSequenceComparator implements Comparator<NumberWord> {
public class TocNumberComparator implements Comparator<NumberWord> {
private HashMap<NumberWord, TextBlockOnPage> lookup;
public TextPositionSequenceComparator(HashMap<NumberWord, TextBlockOnPage> lookup) {
public TocNumberComparator(HashMap<NumberWord, TextBlockOnPage> lookup) {
this.lookup = lookup;
}

View File

@ -28,7 +28,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.SectionIdentif
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextBlockOnPage;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequenceComparator;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TocNumberComparator;
import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities;
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutDebugLayer;
@ -175,12 +175,16 @@ public class TableOfContentsClassificationService {
for (int i = 0; i < words.size(); i++) {
Word word = words.get(i);
if (!wordIsEndOfLine(i, words)) {
continue;
}
if (AMOUNT_PATTERN.matcher(getSurroundingString(i, words)).matches()) {
continue;
}
if (SectionIdentifier.fromSearchText(word.toString()).level() > 1) {
Matcher matcher = SectionIdentifier.numericalIdentifierPattern.matcher(word.toString());
if (matcher.find() && matcher.group(2) != null) {
continue;
}
@ -205,6 +209,17 @@ public class TableOfContentsClassificationService {
}
/**
 * Decides whether the word at position {@code i} closes its line.
 *
 * <p>The final element of the list always counts as an end of line. Any other word counts as
 * an end of line exactly when its successor in the list does not report being
 * {@code rightOf} it.
 *
 * @param i     index of the word to inspect
 * @param words the words being scanned
 * @return {@code true} if the word at {@code i} is the last list element or is not followed by
 *         a word to its right
 */
private static boolean wordIsEndOfLine(int i, List<Word> words) {
    int lastIndex = words.size() - 1;
    if (i != lastIndex) {
        Word current = words.get(i);
        Word successor = words.get(i + 1);
        // A successor that is not to the right of the current word means the line ended here.
        return !successor.rightOf(current);
    }
    return true;
}
private static CharSequence getSurroundingString(int i, List<Word> sequences) {
int end = Math.min(i + 5, sequences.size());
@ -298,62 +313,89 @@ public class TableOfContentsClassificationService {
.stream()
.filter(cluster -> cluster.size() > MINIMUM_MATCHES)
.map(cluster -> cluster.stream()
.sorted(new TextPositionSequenceComparator(lookup))
.sorted(new TocNumberComparator(lookup))
.toList())
.map(this::removeOutliers)
// .map(this::filterByMinimumDensity)
.map(this::removeOnNonConsecutivePages)
.map(this::filterByWordNearTopOfPage)
.filter(cluster -> cluster.size() > MINIMUM_MATCHES)
.max(Comparator.comparingDouble(cluster -> cluster.get(0).word().getBBox().getMaxX())).orElse(Collections.emptyList());
}
// Does not seem to be doing much. Ideally, instead of using the height of the blocks, I would like to use the height measured from the top of the MainBody,
// but as the MainBody is often wrong, this results in some numbers being discarded even though they are correct.
// private List<TextPositionSequence> filterByMinimumDensity(List<TextPositionSequence> numbers) {
//
// Map<ClassificationPage, List<TextPositionSequence>> clustersPerPage = numbers.stream()
// .collect(Collectors.groupingBy(number -> lookup.get(number).page()));
//
// List<TextPositionSequence> result = new ArrayList<>(numbers.size());
// clustersPerPage.keySet()
// .stream()
// .sorted(Comparator.comparingInt(ClassificationPage::getPageNumber))
// .forEach(page -> {
// var numbersOnPage = clustersPerPage.get(page);
//
// double height = numbersOnPage.stream()
// .map(BoundingBox::getBBox)
// .collect(RectangleTransformations.collectBBox()).getHeight();
//
// double count = numbersOnPage.size();
//
// if ((count / height) >= (DENSITY_THRESHOLD_COUNT / page.getPageHeight())) {
// result.addAll(numbers);
// }
// });
// return result;
// }
public List<NumberWord> removeOutliers(List<NumberWord> numbers) {
/**
 * Keeps the leading run of numbers whose pages follow each other without a gap.
 *
 * <p>The first number is always kept. Scanning stops at the first entry whose page number
 * differs from that of its predecessor by more than one; that entry and everything after it
 * are discarded.
 *
 * @param numbers numbers to filter; element 0 is read unconditionally, so the list must not be empty
 * @return a new list holding the retained prefix of {@code numbers}
 */
private List<NumberWord> removeOnNonConsecutivePages(List<NumberWord> numbers) {
    List<NumberWord> kept = new ArrayList<>();
    kept.add(numbers.get(0));
    for (int index = 1; index < numbers.size(); index++) {
        int pageGap = Math.abs(getPageNumber(numbers, index - 1) - getPageNumber(numbers, index));
        if (pageGap > 1) {
            // A jump of more than one page ends the run; nothing after it is considered.
            return kept;
        }
        kept.add(numbers.get(index));
    }
    return kept;
}
/**
 * Resolves the page number of the entry at index {@code i} through the {@code lookup} map.
 *
 * @param numbers list containing the entry
 * @param i       index of the entry within {@code numbers}
 * @return the page number of the page that {@code lookup} associates with that entry
 */
private int getPageNumber(List<NumberWord> numbers, int i) {
    NumberWord number = numbers.get(i);
    return lookup.get(number).page().getPageNumber();
}
/**
 * Drops numbers that start a new page without satisfying the vertical-position check.
 *
 * <p>The first number is always kept. Every later number is kept when it is on the same page
 * object as its predecessor in {@code numbers}, or, on a page change, when the minimum Y of its
 * bounding box is below 33% of the page height.
 *
 * <p>NOTE(review): the predecessor is the previous entry of the input, not the last retained
 * entry, so after a dropped page-opening number the following numbers on that page are still
 * kept. Confirm this is intended.
 *
 * @param numbers numbers to filter; element 0 is read unconditionally, so the list must not be empty
 * @return a new list with the retained numbers in their original order
 */
private List<NumberWord> filterByWordNearTopOfPage(List<NumberWord> numbers) {
    List<NumberWord> kept = new ArrayList<>();
    kept.add(numbers.get(0));
    for (int index = 1; index < numbers.size(); index++) {
        NumberWord candidate = numbers.get(index);
        ClassificationPage previousPage = lookup.get(numbers.get(index - 1)).page();
        ClassificationPage candidatePage = lookup.get(candidate).page();
        // Identity comparison: both entries must map to the very same page object.
        boolean samePage = previousPage == candidatePage;
        if (samePage || candidate.word().getBBox().getMinY() < candidatePage.getPageHeight() * 0.33) {
            kept.add(candidate);
        }
    }
    return kept;
}
/**
 * Filters a sequence of numbers by comparing each entry's integer value with its neighbours.
 *
 * NOTE(review): this span contains statements for two accumulators ({@code result} and
 * {@code confirmedClusterNumbers}) and two return statements, while only
 * {@code confirmedClusterNumbers} is declared here. Confirm which lines are current before
 * relying on the comments below.
 *
 * @param numbers candidate numbers; element 0 is read unconditionally, so the list must not be empty
 * @return the retained numbers in their original order
 */
public List<NumberWord> removeOutliers(List<NumberWord> numbers) {
List<NumberWord> confirmedClusterNumbers = new ArrayList<>();
// The first number is always accepted.
confirmedClusterNumbers.add(numbers.get(0));
// Interior entries only: index 0 is handled above, the last index is handled after the loop.
for (int i = 1; i < numbers.size() - 1; i++) {
int prev = getNumberAsInt(numbers, i - 1);
int curr = getNumberAsInt(numbers, i);
int next = getNumberAsInt(numbers, i + 1);
// Keep the entry when it lies strictly between its neighbours (prev < curr < next),
// or when isBetterWithout(numbers, i) returns false.
if (!(curr <= prev || curr >= next) || !isBetterWithout(numbers, i)) {
result.add(numbers.get(i));
confirmedClusterNumbers.add(numbers.get(i));
}
}
// Last entry, checked against its direct predecessor in the input; Math.max(0, ...) keeps the
// index valid for a single-element list.
if (getNumberAsInt(numbers, numbers.size() - 1) >= getNumberAsInt(numbers, Math.max(0, numbers.size() - 2))) {
result.add(numbers.get(numbers.size() - 1));
// Last entry, checked against the most recently confirmed number instead of the raw predecessor.
if (getNumberAsInt(numbers, numbers.size() - 1) >= getLatestNumber(confirmedClusterNumbers)) {
confirmedClusterNumbers.add(numbers.get(numbers.size() - 1));
}
return result;
return confirmedClusterNumbers;
}
/**
 * Returns the numeric value of the last entry in the given list.
 *
 * @param confirmedClusterNumbers list to read from; must contain at least one entry
 * @return {@code number()} of the final element
 */
private static int getLatestNumber(List<NumberWord> confirmedClusterNumbers) {
    int lastIndex = confirmedClusterNumbers.size() - 1;
    NumberWord latest = confirmedClusterNumbers.get(lastIndex);
    return latest.number();
}

View File

@ -308,11 +308,6 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
.map(BoundingBox::getBBoxPdf)
.map(line -> new ColoredRectangle(line, LINES_COLOR, LINE_WIDTH))
.toList());
visualizationsOnPage.getColoredRectangles()
.add(new ColoredRectangle(numbers.stream()
.map(NumberWord::word)
.map(BoundingBox::getBBoxPdf)
.collect(RectangleTransformations.collectBBox()), LINES_COLOR, LINE_WIDTH));
}