RED-9139: more robust TOC detection
* detect numbers in words, and not just whole words that are numbers
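To illustrate the change described above (a minimal, self-contained sketch; the class, pattern, and helper names are illustrative assumptions, not the code touched by this commit): whole-word matching only accepts tokens that consist entirely of digits, while the more robust detection also picks up a digit run embedded inside a token, e.g. a page number glued to leader dots or punctuation.

import java.util.Optional;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

// Hypothetical helper, for illustration only.
final class EmbeddedNumberSketch {

    // A token that is nothing but digits (the stricter, whole-word notion of a number).
    private static final Pattern WHOLE_NUMBER = Pattern.compile("\\d+");

    // A run of digits anywhere inside a token (the more robust notion).
    private static final Pattern EMBEDDED_NUMBER = Pattern.compile("(\\d+)");

    // Returns the first digit run contained in the token, if any.
    static Optional<Integer> extractNumber(String token) {
        Matcher matcher = EMBEDDED_NUMBER.matcher(token);
        return matcher.find() ? Optional.of(Integer.parseInt(matcher.group(1))) : Optional.empty();
    }

    public static void main(String[] args) {
        System.out.println(WHOLE_NUMBER.matcher("12").matches());      // true  - found by whole-word matching
        System.out.println(WHOLE_NUMBER.matcher("....12").matches());  // false - missed by whole-word matching
        System.out.println(extractNumber("....12"));                   // Optional[12] - digit run found inside the word
        System.out.println(extractNumber("Introduction"));             // Optional.empty - no digits at all
    }
}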
parent e6cd889444
commit ce41014d4b
@@ -5,12 +5,12 @@ import java.util.HashMap;

import com.knecon.fforesight.service.layoutparser.processor.services.classification.NumberWord;

public class TextPositionSequenceComparator implements Comparator<NumberWord> {
public class TocNumberComparator implements Comparator<NumberWord> {

    private HashMap<NumberWord, TextBlockOnPage> lookup;

    public TextPositionSequenceComparator(HashMap<NumberWord, TextBlockOnPage> lookup) {
    public TocNumberComparator(HashMap<NumberWord, TextBlockOnPage> lookup) {

        this.lookup = lookup;
    }
@@ -28,7 +28,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.SectionIdentif
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextBlockOnPage;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequenceComparator;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TocNumberComparator;
import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities;
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutDebugLayer;
@@ -175,12 +175,16 @@ public class TableOfContentsClassificationService {
        for (int i = 0; i < words.size(); i++) {

            Word word = words.get(i);
            if (!wordIsEndOfLine(i, words)) {
                continue;
            }

            if (AMOUNT_PATTERN.matcher(getSurroundingString(i, words)).matches()) {
                continue;
            }

            if (SectionIdentifier.fromSearchText(word.toString()).level() > 1) {
                Matcher matcher = SectionIdentifier.numericalIdentifierPattern.matcher(word.toString());
                if (matcher.find() && matcher.group(2) != null) {
                    continue;
                }
@@ -205,6 +209,17 @@ public class TableOfContentsClassificationService {
    }


    private static boolean wordIsEndOfLine(int i, List<Word> words) {

        if (i == words.size() - 1) {
            return true;
        }
        Word word = words.get(i);
        Word nextWord = words.get(i + 1);
        return !nextWord.rightOf(word);
    }


    private static CharSequence getSurroundingString(int i, List<Word> sequences) {

        int end = Math.min(i + 5, sequences.size());
@@ -298,62 +313,89 @@ public class TableOfContentsClassificationService {
                .stream()
                .filter(cluster -> cluster.size() > MINIMUM_MATCHES)
                .map(cluster -> cluster.stream()
                        .sorted(new TextPositionSequenceComparator(lookup))
                        .sorted(new TocNumberComparator(lookup))
                        .toList())
                .map(this::removeOutliers)
                // .map(this::filterByMinimumDensity)
                .map(this::removeOnNonConsecutivePages)
                .map(this::filterByWordNearTopOfPage)
                .filter(cluster -> cluster.size() > MINIMUM_MATCHES)
                .max(Comparator.comparingDouble(cluster -> cluster.get(0).word().getBBox().getMaxX())).orElse(Collections.emptyList());
    }

    // This does not seem to be doing much. Ideally, instead of using the height of the blocks, I would like to use the height measured from the MainBody top,
    // but as the MainBody is often wrong, this results in some numbers being discarded even though they are correct.
    // private List<TextPositionSequence> filterByMinimumDensity(List<TextPositionSequence> numbers) {
    //
    //     Map<ClassificationPage, List<TextPositionSequence>> clustersPerPage = numbers.stream()
    //             .collect(Collectors.groupingBy(number -> lookup.get(number).page()));
    //
    //     List<TextPositionSequence> result = new ArrayList<>(numbers.size());
    //     clustersPerPage.keySet()
    //             .stream()
    //             .sorted(Comparator.comparingInt(ClassificationPage::getPageNumber))
    //             .forEach(page -> {
    //                 var numbersOnPage = clustersPerPage.get(page);
    //
    //                 double height = numbersOnPage.stream()
    //                         .map(BoundingBox::getBBox)
    //                         .collect(RectangleTransformations.collectBBox()).getHeight();
    //
    //                 double count = numbersOnPage.size();
    //
    //                 if ((count / height) >= (DENSITY_THRESHOLD_COUNT / page.getPageHeight())) {
    //                     result.addAll(numbers);
    //                 }
    //             });
    //     return result;
    // }

    public List<NumberWord> removeOutliers(List<NumberWord> numbers) {
    private List<NumberWord> removeOnNonConsecutivePages(List<NumberWord> numbers) {

        List<NumberWord> result = new ArrayList<>();

        result.add(numbers.get(0));

        for (int i = 1; i < numbers.size(); i++) {
            int prev = getPageNumber(numbers, i - 1);
            int curr = getPageNumber(numbers, i);

            if (Math.abs(prev - curr) > 1) {
                break;
            } else {
                result.add(numbers.get(i));
            }
        }
        return result;
    }


    private int getPageNumber(List<NumberWord> numbers, int i) {

        return lookup.get(numbers.get(i)).page().getPageNumber();
    }


    private List<NumberWord> filterByWordNearTopOfPage(List<NumberWord> numbers) {

        List<NumberWord> result = new ArrayList<>();

        result.add(numbers.get(0));

        for (int i = 1; i < numbers.size(); i++) {
            NumberWord prev = numbers.get(i - 1);
            NumberWord curr = numbers.get(i);
            ClassificationPage prevPage = lookup.get(prev).page();
            ClassificationPage currPage = lookup.get(curr).page();
            if (prevPage == currPage) {
                result.add(curr);
            } else if (curr.word().getBBox().getMinY() < currPage.getPageHeight() * 0.33) {
                result.add(curr);
            }
        }
        return result;
    }


    public List<NumberWord> removeOutliers(List<NumberWord> numbers) {

        List<NumberWord> confirmedClusterNumbers = new ArrayList<>();

        confirmedClusterNumbers.add(numbers.get(0));

        for (int i = 1; i < numbers.size() - 1; i++) {
            int prev = getNumberAsInt(numbers, i - 1);
            int curr = getNumberAsInt(numbers, i);
            int next = getNumberAsInt(numbers, i + 1);

            if (!(curr <= prev || curr >= next) || !isBetterWithout(numbers, i)) {
                result.add(numbers.get(i));
                confirmedClusterNumbers.add(numbers.get(i));
            }
        }
        if (getNumberAsInt(numbers, numbers.size() - 1) >= getNumberAsInt(numbers, Math.max(0, numbers.size() - 2))) {
            result.add(numbers.get(numbers.size() - 1));
        if (getNumberAsInt(numbers, numbers.size() - 1) >= getLatestNumber(confirmedClusterNumbers)) {
            confirmedClusterNumbers.add(numbers.get(numbers.size() - 1));
        }

        return result;
        return confirmedClusterNumbers;
    }


    private static int getLatestNumber(List<NumberWord> confirmedClusterNumbers) {

        return confirmedClusterNumbers.get(confirmedClusterNumbers.size() - 1).number();
    }
@@ -308,11 +308,6 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
                .map(BoundingBox::getBBoxPdf)
                .map(line -> new ColoredRectangle(line, LINES_COLOR, LINE_WIDTH))
                .toList());
        visualizationsOnPage.getColoredRectangles()
                .add(new ColoredRectangle(numbers.stream()
                        .map(NumberWord::word)
                        .map(BoundingBox::getBBoxPdf)
                        .collect(RectangleTransformations.collectBBox()), LINES_COLOR, LINE_WIDTH));
    }