RED-9139: more robust TOC detection
* detect numbers in words, and not just whole words that are numbers
This commit is contained in:
parent
e6cd889444
commit
ce41014d4b
@ -5,12 +5,12 @@ import java.util.HashMap;
|
|||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.classification.NumberWord;
|
import com.knecon.fforesight.service.layoutparser.processor.services.classification.NumberWord;
|
||||||
|
|
||||||
public class TextPositionSequenceComparator implements Comparator<NumberWord> {
|
public class TocNumberComparator implements Comparator<NumberWord> {
|
||||||
|
|
||||||
private HashMap<NumberWord, TextBlockOnPage> lookup;
|
private HashMap<NumberWord, TextBlockOnPage> lookup;
|
||||||
|
|
||||||
|
|
||||||
public TextPositionSequenceComparator(HashMap<NumberWord, TextBlockOnPage> lookup) {
|
public TocNumberComparator(HashMap<NumberWord, TextBlockOnPage> lookup) {
|
||||||
|
|
||||||
this.lookup = lookup;
|
this.lookup = lookup;
|
||||||
}
|
}
|
||||||
@ -28,7 +28,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.SectionIdentif
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextBlockOnPage;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextBlockOnPage;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequenceComparator;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TocNumberComparator;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutDebugLayer;
|
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutDebugLayer;
|
||||||
|
|
||||||
@ -175,12 +175,16 @@ public class TableOfContentsClassificationService {
|
|||||||
for (int i = 0; i < words.size(); i++) {
|
for (int i = 0; i < words.size(); i++) {
|
||||||
|
|
||||||
Word word = words.get(i);
|
Word word = words.get(i);
|
||||||
|
if (!wordIsEndOfLine(i, words)) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
if (AMOUNT_PATTERN.matcher(getSurroundingString(i, words)).matches()) {
|
if (AMOUNT_PATTERN.matcher(getSurroundingString(i, words)).matches()) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (SectionIdentifier.fromSearchText(word.toString()).level() > 1) {
|
Matcher matcher = SectionIdentifier.numericalIdentifierPattern.matcher(word.toString());
|
||||||
|
if (matcher.find() && matcher.group(2) != null) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -205,6 +209,17 @@ public class TableOfContentsClassificationService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Decides whether the word at index {@code i} ends a line.
 * <p>
 * The last element of {@code words} always qualifies. Any other word qualifies when the
 * word that follows it in the list is not {@code rightOf} it.
 *
 * @param i     index of the word to test; must be a valid index into {@code words}
 * @param words the words to inspect, in the order the caller iterates them
 * @return {@code true} if the word is the last list element or its successor is not to its right
 */
private static boolean wordIsEndOfLine(int i, List<Word> words) {
|
||||||
|
|
||||||
|
// The final word has no successor, so it is treated as a line end without further checks.
if (i == words.size() - 1) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
Word word = words.get(i);
|
||||||
|
Word nextWord = words.get(i + 1);
|
||||||
|
// NOTE(review): rightOf is defined outside this view; presumably a successor that is not
// to the right of the current word starts a new line — confirm against Word.rightOf.
return !nextWord.rightOf(word);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
private static CharSequence getSurroundingString(int i, List<Word> sequences) {
|
private static CharSequence getSurroundingString(int i, List<Word> sequences) {
|
||||||
|
|
||||||
int end = Math.min(i + 5, sequences.size());
|
int end = Math.min(i + 5, sequences.size());
|
||||||
@ -298,62 +313,89 @@ public class TableOfContentsClassificationService {
|
|||||||
.stream()
|
.stream()
|
||||||
.filter(cluster -> cluster.size() > MINIMUM_MATCHES)
|
.filter(cluster -> cluster.size() > MINIMUM_MATCHES)
|
||||||
.map(cluster -> cluster.stream()
|
.map(cluster -> cluster.stream()
|
||||||
.sorted(new TextPositionSequenceComparator(lookup))
|
.sorted(new TocNumberComparator(lookup))
|
||||||
.toList())
|
.toList())
|
||||||
.map(this::removeOutliers)
|
.map(this::removeOutliers)
|
||||||
// .map(this::filterByMinimumDensity)
|
.map(this::removeOnNonConsecutivePages)
|
||||||
|
.map(this::filterByWordNearTopOfPage)
|
||||||
.filter(cluster -> cluster.size() > MINIMUM_MATCHES)
|
.filter(cluster -> cluster.size() > MINIMUM_MATCHES)
|
||||||
.max(Comparator.comparingDouble(cluster -> cluster.get(0).word().getBBox().getMaxX())).orElse(Collections.emptyList());
|
.max(Comparator.comparingDouble(cluster -> cluster.get(0).word().getBBox().getMaxX())).orElse(Collections.emptyList());
|
||||||
}
|
}
|
||||||
|
|
||||||
// does not seem to be doing much, ideally instead of using the height of the blocks i would like to use the height, beginning from the MainBody top,
|
|
||||||
// but as the MainBody is often wrong, this results in some numbers being discarded even though they are correct.
|
|
||||||
// private List<TextPositionSequence> filterByMinimumDensity(List<TextPositionSequence> numbers) {
|
|
||||||
//
|
|
||||||
// Map<ClassificationPage, List<TextPositionSequence>> clustersPerPage = numbers.stream()
|
|
||||||
// .collect(Collectors.groupingBy(number -> lookup.get(number).page()));
|
|
||||||
//
|
|
||||||
// List<TextPositionSequence> result = new ArrayList<>(numbers.size());
|
|
||||||
// clustersPerPage.keySet()
|
|
||||||
// .stream()
|
|
||||||
// .sorted(Comparator.comparingInt(ClassificationPage::getPageNumber))
|
|
||||||
// .forEach(page -> {
|
|
||||||
// var numbersOnPage = clustersPerPage.get(page);
|
|
||||||
//
|
|
||||||
// double height = numbersOnPage.stream()
|
|
||||||
// .map(BoundingBox::getBBox)
|
|
||||||
// .collect(RectangleTransformations.collectBBox()).getHeight();
|
|
||||||
//
|
|
||||||
// double count = numbersOnPage.size();
|
|
||||||
//
|
|
||||||
// if ((count / height) >= (DENSITY_THRESHOLD_COUNT / page.getPageHeight())) {
|
|
||||||
// result.addAll(numbers);
|
|
||||||
// }
|
|
||||||
// });
|
|
||||||
// return result;
|
|
||||||
// }
|
|
||||||
|
|
||||||
|
/**
 * Keeps the leading run of numbers whose pages follow each other without a gap.
 * <p>
 * The first number is always kept. The list is then walked in order; as soon as the page
 * numbers of two neighbouring entries differ by more than one, that entry and everything
 * after it are discarded.
 *
 * @param numbers the candidate numbers; the first element is read unconditionally, so the
 *                list is assumed to be non-empty
 * @return a new list containing the entries up to, but excluding, the first page gap
 */
private List<NumberWord> removeOnNonConsecutivePages(List<NumberWord> numbers) {
|
||||||
public List<NumberWord> removeOutliers(List<NumberWord> numbers) {
|
|
||||||
|
|
||||||
List<NumberWord> result = new ArrayList<>();
|
List<NumberWord> result = new ArrayList<>();
|
||||||
|
|
||||||
result.add(numbers.get(0));
|
result.add(numbers.get(0));
|
||||||
|
|
||||||
|
for (int i = 1; i < numbers.size(); i++) {
|
||||||
|
int prev = getPageNumber(numbers, i - 1);
|
||||||
|
int curr = getPageNumber(numbers, i);
|
||||||
|
|
||||||
|
// A jump of more than one page in either direction ends the run; nothing after it is kept.
if (Math.abs(prev - curr) > 1) {
|
||||||
|
break;
|
||||||
|
} else {
|
||||||
|
result.add(numbers.get(i));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Returns the page number of the page that {@code lookup} associates with the number at
 * index {@code i}.
 *
 * @param numbers the list of numbers
 * @param i       index into {@code numbers}
 * @return the page number reported by the looked-up page
 */
private int getPageNumber(List<NumberWord> numbers, int i) {
|
||||||
|
|
||||||
|
// NOTE(review): the lookup result is dereferenced without a null check — assumes every
// number passed in is present in lookup; confirm.
return lookup.get(numbers.get(i)).page().getPageNumber();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Filters out numbers that begin a new page but are not positioned near its top.
 * <p>
 * The first number is always kept. Each later number is kept when it lies on the same page
 * object as its predecessor in {@code numbers}, or, when the page changes, when the minimum
 * Y of its bounding box is below 0.33 of the page height.
 *
 * @param numbers the candidate numbers; the first element is read unconditionally, so the
 *                list is assumed to be non-empty
 * @return a new list with the numbers that passed the check, in their original order
 */
private List<NumberWord> filterByWordNearTopOfPage(List<NumberWord> numbers) {
|
||||||
|
|
||||||
|
List<NumberWord> result = new ArrayList<>();
|
||||||
|
|
||||||
|
result.add(numbers.get(0));
|
||||||
|
|
||||||
|
for (int i = 1; i < numbers.size(); i++) {
|
||||||
|
// NOTE(review): the predecessor is taken from the input list, not from result, so after a
// number on a new page is dropped, the following numbers on that page are still kept — confirm
// this is intended.
NumberWord prev = numbers.get(i - 1);
|
||||||
|
NumberWord curr = numbers.get(i);
|
||||||
|
ClassificationPage prevPage = lookup.get(prev).page();
|
||||||
|
ClassificationPage currPage = lookup.get(curr).page();
|
||||||
|
// Pages are compared by reference identity.
if (prevPage == currPage) {
|
||||||
|
result.add(curr);
|
||||||
|
// NOTE(review): presumably a small minY means "near the top" in this coordinate system — confirm.
} else if (curr.word().getBBox().getMinY() < currPage.getPageHeight() * 0.33) {
|
||||||
|
result.add(curr);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Removes numbers that break the ascending order of a cluster.
 * <p>
 * The first number is always kept. An interior number is kept when it is strictly greater
 * than its predecessor and strictly smaller than its successor in {@code numbers}, or when
 * {@code isBetterWithout} returns {@code false} for it. The last number is kept when it is
 * not smaller than the number of the most recently confirmed entry.
 * <p>
 * NOTE(review): for a single-element list the final check revisits index 0; if
 * {@code getNumberAsInt} agrees with {@code NumberWord.number()}, that element is added a
 * second time. The visible caller filters clusters by {@code MINIMUM_MATCHES} first —
 * confirm that this rules out such input.
 *
 * @param numbers the cluster to clean; the first element is read unconditionally, so the
 *                list is assumed to be non-empty
 * @return a new list with the confirmed numbers, in their original order
 */
public List<NumberWord> removeOutliers(List<NumberWord> numbers) {
|
||||||
|
|
||||||
|
List<NumberWord> confirmedClusterNumbers = new ArrayList<>();
|
||||||
|
|
||||||
|
confirmedClusterNumbers.add(numbers.get(0));
|
||||||
|
|
||||||
for (int i = 1; i < numbers.size() - 1; i++) {
|
for (int i = 1; i < numbers.size() - 1; i++) {
|
||||||
int prev = getNumberAsInt(numbers, i - 1);
|
int prev = getNumberAsInt(numbers, i - 1);
|
||||||
int curr = getNumberAsInt(numbers, i);
|
int curr = getNumberAsInt(numbers, i);
|
||||||
int next = getNumberAsInt(numbers, i + 1);
|
int next = getNumberAsInt(numbers, i + 1);
|
||||||
|
|
||||||
if (!(curr <= prev || curr >= next) || !isBetterWithout(numbers, i)) {
|
if (!(curr <= prev || curr >= next) || !isBetterWithout(numbers, i)) {
|
||||||
result.add(numbers.get(i));
|
confirmedClusterNumbers.add(numbers.get(i));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (getNumberAsInt(numbers, numbers.size() - 1) >= getNumberAsInt(numbers, Math.max(0, numbers.size() - 2))) {
|
// The last number is compared with the most recently confirmed entry rather than with its
// direct predecessor in the input.
if (getNumberAsInt(numbers, numbers.size() - 1) >= getLatestNumber(confirmedClusterNumbers)) {
|
||||||
result.add(numbers.get(numbers.size() - 1));
|
confirmedClusterNumbers.add(numbers.get(numbers.size() - 1));
|
||||||
}
|
}
|
||||||
|
|
||||||
return result;
|
return confirmedClusterNumbers;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Returns the numeric value of the last entry in the given list.
 *
 * @param confirmedClusterNumbers the numbers confirmed so far; the last element is read
 *                                unconditionally, so the list is assumed to be non-empty
 * @return the result of {@code number()} on the last element
 */
private static int getLatestNumber(List<NumberWord> confirmedClusterNumbers) {
|
||||||
|
|
||||||
|
return confirmedClusterNumbers.get(confirmedClusterNumbers.size() - 1).number();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -308,11 +308,6 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
|
|||||||
.map(BoundingBox::getBBoxPdf)
|
.map(BoundingBox::getBBoxPdf)
|
||||||
.map(line -> new ColoredRectangle(line, LINES_COLOR, LINE_WIDTH))
|
.map(line -> new ColoredRectangle(line, LINES_COLOR, LINE_WIDTH))
|
||||||
.toList());
|
.toList());
|
||||||
visualizationsOnPage.getColoredRectangles()
|
|
||||||
.add(new ColoredRectangle(numbers.stream()
|
|
||||||
.map(NumberWord::word)
|
|
||||||
.map(BoundingBox::getBBoxPdf)
|
|
||||||
.collect(RectangleTransformations.collectBBox()), LINES_COLOR, LINE_WIDTH));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user