RED-9139: more robust TOC detection
This commit is contained in:
parent
f9b25c8157
commit
7ee1f9e360
@ -16,7 +16,7 @@ import lombok.experimental.FieldDefaults;
|
|||||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||||
public class SectionIdentifier {
|
public class SectionIdentifier {
|
||||||
|
|
||||||
public static Pattern numericalIdentifierPattern = Pattern.compile("^[\\s]?(\\d{1,2})[\\s.,;]?(\\d{1,2})?[\\s.,;]?(\\d{1,2})?[\\s.,;]?(\\d{1,2})?");
|
public static Pattern numericalIdentifierPattern = Pattern.compile("^[\\s]?(\\d{1,2})(?:[\\s.,;](\\d{1,2}))?(?:[\\s.,;](\\d{1,2}))?(?:[\\s.,;](\\d{1,2}))?");
|
||||||
public static Pattern alphanumericIdentifierPattern = Pattern.compile("^[\\s]?[A-Za-z][\\s.,;]?(\\d{1,2})[\\s.,;]?(\\d{1,2})?[\\s.,;]?(\\d{1,2})?[\\s.,;]?(\\d{1,2})?[\\s.,;]?");
|
public static Pattern alphanumericIdentifierPattern = Pattern.compile("^[\\s]?[A-Za-z][\\s.,;]?(\\d{1,2})[\\s.,;]?(\\d{1,2})?[\\s.,;]?(\\d{1,2})?[\\s.,;]?(\\d{1,2})?[\\s.,;]?");
|
||||||
|
|
||||||
public enum Format {
|
public enum Format {
|
||||||
|
|||||||
@ -1,34 +0,0 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.model.text;
|
|
||||||
|
|
||||||
import java.util.Comparator;
|
|
||||||
import java.util.HashMap;
|
|
||||||
|
|
||||||
public class TextPositionSequenceComparator implements Comparator<Word> {
|
|
||||||
|
|
||||||
private HashMap<Word, TextBlockOnPage> lookup;
|
|
||||||
|
|
||||||
|
|
||||||
public TextPositionSequenceComparator(HashMap<Word, TextBlockOnPage> lookup) {
|
|
||||||
|
|
||||||
this.lookup = lookup;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public int compare(Word number1, Word number2) {
|
|
||||||
|
|
||||||
int page1 = lookup.get(number1).page().getPageNumber();
|
|
||||||
int page2 = lookup.get(number2).page().getPageNumber();
|
|
||||||
|
|
||||||
if (page1 != page2) {
|
|
||||||
return Integer.compare(page1, page2);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (number1.getY() != number2.getY()) {
|
|
||||||
return Double.compare(number1.getY(), number2.getY());
|
|
||||||
}
|
|
||||||
|
|
||||||
return Integer.compare(Integer.parseInt(number1.toString()), Integer.parseInt(number2.toString()));
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
@ -0,0 +1,36 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.model.text;
|
||||||
|
|
||||||
|
import java.util.Comparator;
|
||||||
|
import java.util.HashMap;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.classification.NumberWord;
|
||||||
|
|
||||||
|
public class TocNumberComparator implements Comparator<NumberWord> {
|
||||||
|
|
||||||
|
private HashMap<NumberWord, TextBlockOnPage> lookup;
|
||||||
|
|
||||||
|
|
||||||
|
public TocNumberComparator(HashMap<NumberWord, TextBlockOnPage> lookup) {
|
||||||
|
|
||||||
|
this.lookup = lookup;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int compare(NumberWord number1, NumberWord number2) {
|
||||||
|
|
||||||
|
int page1 = lookup.get(number1).page().getPageNumber();
|
||||||
|
int page2 = lookup.get(number2).page().getPageNumber();
|
||||||
|
|
||||||
|
if (page1 != page2) {
|
||||||
|
return Integer.compare(page1, page2);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (number1.word().getY() != number2.word().getY()) {
|
||||||
|
return Double.compare(number1.word().getY(), number2.word().getY());
|
||||||
|
}
|
||||||
|
|
||||||
|
return Integer.compare(number1.number(), number2.number());
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,7 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.services.classification;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
|
||||||
|
|
||||||
|
public record NumberWord(Word word, int number) {
|
||||||
|
|
||||||
|
}
|
||||||
@ -14,6 +14,7 @@ import java.util.List;
|
|||||||
import java.util.Locale;
|
import java.util.Locale;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
import java.util.regex.Matcher;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import org.springframework.stereotype.Service;
|
import org.springframework.stereotype.Service;
|
||||||
@ -23,10 +24,11 @@ import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBl
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.SectionIdentifier;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextBlockOnPage;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextBlockOnPage;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequenceComparator;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TocNumberComparator;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutDebugLayer;
|
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutDebugLayer;
|
||||||
|
|
||||||
@ -71,9 +73,9 @@ public class TableOfContentsClassificationService {
|
|||||||
|
|
||||||
ClassificationPage startPage = textBlocks.get(start).page();
|
ClassificationPage startPage = textBlocks.get(start).page();
|
||||||
List<TextBlockOnPage> initialLookAhead = textBlocks.subList(start, Math.min(start + SURROUNDING_BLOCKS_RADIUS, textBlocks.size()));
|
List<TextBlockOnPage> initialLookAhead = textBlocks.subList(start, Math.min(start + SURROUNDING_BLOCKS_RADIUS, textBlocks.size()));
|
||||||
HashMap<Word, TextBlockOnPage> lookup = new HashMap<>();
|
HashMap<NumberWord, TextBlockOnPage> numberToBlockLookup = new HashMap<>();
|
||||||
List<Word> numbers = extractNumbers(initialLookAhead, lookup, document.getPages().size());
|
List<NumberWord> numbers = extractNumbers(initialLookAhead, numberToBlockLookup, document.getPages().size());
|
||||||
TocNumberFinder tocNumberFinder = new TocNumberFinder(numbers, lookup);
|
TocNumberFinder tocNumberFinder = new TocNumberFinder(numbers, numberToBlockLookup);
|
||||||
|
|
||||||
int lastCandidate = start;
|
int lastCandidate = start;
|
||||||
for (int i = start; i < Math.min(lastCandidate + SURROUNDING_BLOCKS_RADIUS, textBlocks.size()); i++) {
|
for (int i = start; i < Math.min(lastCandidate + SURROUNDING_BLOCKS_RADIUS, textBlocks.size()); i++) {
|
||||||
@ -93,28 +95,28 @@ public class TableOfContentsClassificationService {
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
List<Word> numbersFromBlock = extractNumbers(textBlockOnPage, lookup, document.getPages().size());
|
List<NumberWord> numbersFromBlock = extractNumbers(textBlockOnPage, numberToBlockLookup, document.getPages().size());
|
||||||
|
|
||||||
List<Word> currentRightmostCluster = tocNumberFinder.getCurrentRightmostCluster();
|
List<NumberWord> currentRightmostCluster = tocNumberFinder.getCurrentRightmostCluster();
|
||||||
|
|
||||||
if (currentRightmostCluster.size() < MINIMUM_MATCHES) {
|
if (currentRightmostCluster.size() < MINIMUM_MATCHES) {
|
||||||
log.debug("No numbers indicating a table of contents here.");
|
log.debug("No numbers indicating a table of contents here.");
|
||||||
return start;
|
return start;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (anyIntersection(currentRightmostCluster, numbersFromBlock, lookup)) {
|
if (anyIntersection(currentRightmostCluster, numbersFromBlock, numberToBlockLookup)) {
|
||||||
lastCandidate = i;
|
lastCandidate = i;
|
||||||
numbersFromBlock.forEach(tocNumberFinder::add);
|
numbersFromBlock.forEach(tocNumberFinder::add);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
addVisualization(document.getLayoutDebugLayer(), tocNumberFinder, lookup);
|
|
||||||
|
|
||||||
Set<TextBlockOnPage> blocksWithNumberInCluster = tocNumberFinder.getCurrentRightmostCluster()
|
Set<TextBlockOnPage> blocksWithNumberInCluster = tocNumberFinder.getCurrentRightmostCluster()
|
||||||
.stream()
|
.stream()
|
||||||
.map(lookup::get)
|
.map(numberToBlockLookup::get)
|
||||||
.collect(Collectors.toSet());
|
.collect(Collectors.toSet());
|
||||||
|
|
||||||
|
addVisualization(document.getLayoutDebugLayer(), tocNumberFinder, numberToBlockLookup, blocksWithNumberInCluster, textBlocks.get(start - 1));
|
||||||
|
|
||||||
int lastConfirmed = start;
|
int lastConfirmed = start;
|
||||||
for (int i = start; i < lastCandidate + 1; i++) {
|
for (int i = start; i < lastCandidate + 1; i++) {
|
||||||
TextBlockOnPage textBlockOnPage = textBlocks.get(i);
|
TextBlockOnPage textBlockOnPage = textBlocks.get(i);
|
||||||
@ -132,18 +134,22 @@ public class TableOfContentsClassificationService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private static void addVisualization(LayoutDebugLayer layoutDebugLayer, TocNumberFinder tocNumberFinder, Map<Word, TextBlockOnPage> lookup) {
|
private static void addVisualization(LayoutDebugLayer layoutDebugLayer,
|
||||||
|
TocNumberFinder tocNumberFinder,
|
||||||
|
Map<NumberWord, TextBlockOnPage> lookup,
|
||||||
|
Set<TextBlockOnPage> blocksWithNumberInCluster,
|
||||||
|
TextBlockOnPage startingHeadline) {
|
||||||
|
|
||||||
tocNumberFinder.getCurrentRightmostCluster()
|
tocNumberFinder.getCurrentRightmostCluster()
|
||||||
.stream()
|
.stream()
|
||||||
.collect(Collectors.groupingBy(key -> lookup.get(key).page().getPageNumber()))
|
.collect(Collectors.groupingBy(key -> lookup.get(key).page().getPageNumber()))
|
||||||
.forEach((pageNumber, number) -> layoutDebugLayer.addTocPages(number, pageNumber));
|
.forEach((pageNumber, number) -> layoutDebugLayer.addTocPages(number, pageNumber));
|
||||||
|
layoutDebugLayer.addTocBlocks(blocksWithNumberInCluster);
|
||||||
|
layoutDebugLayer.addTocBlocks(Set.of(startingHeadline));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private static boolean anyIntersection(Collection<Word> numbers1,
|
private static boolean anyIntersection(Collection<NumberWord> numbers1, Collection<NumberWord> numbers2, Map<NumberWord, TextBlockOnPage> lookup) {
|
||||||
Collection<Word> numbers2,
|
|
||||||
Map<Word, TextBlockOnPage> lookup) {
|
|
||||||
|
|
||||||
return numbers1.stream()
|
return numbers1.stream()
|
||||||
.anyMatch(numberFromCluster -> numbers2.stream()
|
.anyMatch(numberFromCluster -> numbers2.stream()
|
||||||
@ -151,9 +157,9 @@ public class TableOfContentsClassificationService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private static List<Word> extractNumbers(List<TextBlockOnPage> textBlocks, Map<Word, TextBlockOnPage> lookup, int numberOfPages) {
|
private static List<NumberWord> extractNumbers(List<TextBlockOnPage> textBlocks, Map<NumberWord, TextBlockOnPage> lookup, int numberOfPages) {
|
||||||
|
|
||||||
List<Word> blocks = new LinkedList<>();
|
List<NumberWord> blocks = new LinkedList<>();
|
||||||
for (TextBlockOnPage textBlock : textBlocks) {
|
for (TextBlockOnPage textBlock : textBlocks) {
|
||||||
blocks.addAll(extractNumbers(textBlock, lookup, numberOfPages));
|
blocks.addAll(extractNumbers(textBlock, lookup, numberOfPages));
|
||||||
}
|
}
|
||||||
@ -161,30 +167,40 @@ public class TableOfContentsClassificationService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private static List<Word> extractNumbers(TextBlockOnPage textBlock, Map<Word, TextBlockOnPage> lookup, int numberOfPages) {
|
private static List<NumberWord> extractNumbers(TextBlockOnPage textBlock, Map<NumberWord, TextBlockOnPage> lookup, int numberOfPages) {
|
||||||
|
|
||||||
List<Word> blocks = new LinkedList<>();
|
List<NumberWord> blocks = new LinkedList<>();
|
||||||
TextPageBlock block = textBlock.textBlock();
|
TextPageBlock block = textBlock.textBlock();
|
||||||
List<Word> sequences = block.getWords();
|
List<Word> words = block.getWords();
|
||||||
for (int i = 0; i < sequences.size(); i++) {
|
for (int i = 0; i < words.size(); i++) {
|
||||||
|
|
||||||
Word word = sequences.get(i);
|
Word word = words.get(i);
|
||||||
|
if (!wordIsEndOfLine(i, words)) {
|
||||||
if (!NUMERIC.matcher(word).matches() || word.length() > 5) {
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (AMOUNT_PATTERN.matcher(getSurroundingString(i, sequences)).matches()) {
|
if (AMOUNT_PATTERN.matcher(getSurroundingString(i, words)).matches()) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
Matcher matcher = SectionIdentifier.numericalIdentifierPattern.matcher(word.toString());
|
||||||
|
if (matcher.find() && matcher.group(2) != null) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
Matcher numberFinder = NUMERIC.matcher(word);
|
||||||
|
if (!numberFinder.find() || word.length() > 5) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
int pageNumber = Integer.parseInt(word.toString());
|
int pageNumber = Integer.parseInt(numberFinder.group());
|
||||||
if (0 >= pageNumber || pageNumber > numberOfPages) {
|
if (0 >= pageNumber || pageNumber > numberOfPages) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
lookup.put(word, textBlock);
|
NumberWord numberWord = new NumberWord(word, pageNumber);
|
||||||
blocks.add(word);
|
lookup.put(numberWord, textBlock);
|
||||||
|
blocks.add(numberWord);
|
||||||
} catch (NumberFormatException e) {
|
} catch (NumberFormatException e) {
|
||||||
log.debug("That wasn't a number! Should not happen, due to numeric check beforehand.");
|
log.debug("That wasn't a number! Should not happen, due to numeric check beforehand.");
|
||||||
}
|
}
|
||||||
@ -193,6 +209,17 @@ public class TableOfContentsClassificationService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static boolean wordIsEndOfLine(int i, List<Word> words) {
|
||||||
|
|
||||||
|
if (i == words.size() - 1) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
Word word = words.get(i);
|
||||||
|
Word nextWord = words.get(i + 1);
|
||||||
|
return !nextWord.rightOf(word);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
private static CharSequence getSurroundingString(int i, List<Word> sequences) {
|
private static CharSequence getSurroundingString(int i, List<Word> sequences) {
|
||||||
|
|
||||||
int end = Math.min(i + 5, sequences.size());
|
int end = Math.min(i + 5, sequences.size());
|
||||||
@ -203,13 +230,13 @@ public class TableOfContentsClassificationService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private static boolean matches(Word number1, Word number2, Map<Word, TextBlockOnPage> lookup) {
|
private static boolean matches(NumberWord number1, NumberWord number2, Map<NumberWord, TextBlockOnPage> lookup) {
|
||||||
|
|
||||||
if (number1.getDir() != number2.getDir()) {
|
if (number1.word().getDir() != number2.word().getDir()) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
return number1.intersectsXDirAdj(number2, INTERSECTION_TOLERANCE);
|
return number1.word().intersectsXDirAdj(number2.word(), INTERSECTION_TOLERANCE);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -247,11 +274,11 @@ public class TableOfContentsClassificationService {
|
|||||||
|
|
||||||
private static class TocNumberFinder {
|
private static class TocNumberFinder {
|
||||||
|
|
||||||
final UnionFind<Word> numberClusters;
|
final UnionFind<NumberWord> numberClusters;
|
||||||
final HashMap<Word, TextBlockOnPage> lookup;
|
final HashMap<NumberWord, TextBlockOnPage> lookup;
|
||||||
|
|
||||||
|
|
||||||
TocNumberFinder(List<Word> blocks, HashMap<Word, TextBlockOnPage> lookup) {
|
TocNumberFinder(List<NumberWord> blocks, HashMap<NumberWord, TextBlockOnPage> lookup) {
|
||||||
|
|
||||||
this.numberClusters = new UnionFind<>(new HashSet<>(blocks));
|
this.numberClusters = new UnionFind<>(new HashSet<>(blocks));
|
||||||
for (int i = 0; i < blocks.size(); i++) {
|
for (int i = 0; i < blocks.size(); i++) {
|
||||||
@ -265,14 +292,14 @@ public class TableOfContentsClassificationService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public void add(Word number) {
|
public void add(NumberWord number) {
|
||||||
|
|
||||||
if (numberClusters.getElements().contains(number)) {
|
if (numberClusters.getElements().contains(number)) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
numberClusters.addElement(number);
|
numberClusters.addElement(number);
|
||||||
for (Word element : numberClusters.getElements()) {
|
for (NumberWord element : numberClusters.getElements()) {
|
||||||
if (matches(number, element, lookup)) {
|
if (matches(number, element, lookup)) {
|
||||||
numberClusters.union(element, number);
|
numberClusters.union(element, number);
|
||||||
}
|
}
|
||||||
@ -280,73 +307,100 @@ public class TableOfContentsClassificationService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public List<Word> getCurrentRightmostCluster() {
|
public List<NumberWord> getCurrentRightmostCluster() {
|
||||||
|
|
||||||
return numberClusters.getGroups()
|
return numberClusters.getGroups()
|
||||||
.stream()
|
.stream()
|
||||||
.filter(cluster -> cluster.size() > MINIMUM_MATCHES)
|
.filter(cluster -> cluster.size() > MINIMUM_MATCHES)
|
||||||
.map(cluster -> cluster.stream()
|
.map(cluster -> cluster.stream()
|
||||||
.sorted(new TextPositionSequenceComparator(lookup))
|
.sorted(new TocNumberComparator(lookup))
|
||||||
.toList())
|
.toList())
|
||||||
.map(this::removeOutliers)
|
.map(this::removeOutliers)
|
||||||
// .map(this::filterByMinimumDensity)
|
.map(this::removeOnNonConsecutivePages)
|
||||||
|
.map(this::filterByWordNearTopOfPage)
|
||||||
.filter(cluster -> cluster.size() > MINIMUM_MATCHES)
|
.filter(cluster -> cluster.size() > MINIMUM_MATCHES)
|
||||||
.max(Comparator.comparingDouble(cluster -> cluster.get(0).getBBox().getMaxX())).orElse(Collections.emptyList());
|
.max(Comparator.comparingDouble(cluster -> cluster.get(0).word().getBBox().getMaxX())).orElse(Collections.emptyList());
|
||||||
}
|
}
|
||||||
|
|
||||||
// does not seem to be doing much, ideally instead of using the height of the blocks i would like to use the height, beginning from the MainBody top,
|
|
||||||
// but as the MainBody is often wrong, this results in some numbers being discarded even though they are correct.
|
|
||||||
// private List<TextPositionSequence> filterByMinimumDensity(List<TextPositionSequence> numbers) {
|
|
||||||
//
|
|
||||||
// Map<ClassificationPage, List<TextPositionSequence>> clustersPerPage = numbers.stream()
|
|
||||||
// .collect(Collectors.groupingBy(number -> lookup.get(number).page()));
|
|
||||||
//
|
|
||||||
// List<TextPositionSequence> result = new ArrayList<>(numbers.size());
|
|
||||||
// clustersPerPage.keySet()
|
|
||||||
// .stream()
|
|
||||||
// .sorted(Comparator.comparingInt(ClassificationPage::getPageNumber))
|
|
||||||
// .forEach(page -> {
|
|
||||||
// var numbersOnPage = clustersPerPage.get(page);
|
|
||||||
//
|
|
||||||
// double height = numbersOnPage.stream()
|
|
||||||
// .map(BoundingBox::getBBox)
|
|
||||||
// .collect(RectangleTransformations.collectBBox()).getHeight();
|
|
||||||
//
|
|
||||||
// double count = numbersOnPage.size();
|
|
||||||
//
|
|
||||||
// if ((count / height) >= (DENSITY_THRESHOLD_COUNT / page.getPageHeight())) {
|
|
||||||
// result.addAll(numbers);
|
|
||||||
// }
|
|
||||||
// });
|
|
||||||
// return result;
|
|
||||||
// }
|
|
||||||
|
|
||||||
|
private List<NumberWord> removeOnNonConsecutivePages(List<NumberWord> numbers) {
|
||||||
|
|
||||||
public List<Word> removeOutliers(List<Word> numbers) {
|
List<NumberWord> result = new ArrayList<>();
|
||||||
|
|
||||||
List<Word> result = new ArrayList<>();
|
|
||||||
|
|
||||||
result.add(numbers.get(0));
|
result.add(numbers.get(0));
|
||||||
|
|
||||||
|
for (int i = 1; i < numbers.size(); i++) {
|
||||||
|
int prev = getPageNumber(numbers, i - 1);
|
||||||
|
int curr = getPageNumber(numbers, i);
|
||||||
|
|
||||||
|
if (Math.abs(prev - curr) > 1) {
|
||||||
|
break;
|
||||||
|
} else {
|
||||||
|
result.add(numbers.get(i));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private int getPageNumber(List<NumberWord> numbers, int i) {
|
||||||
|
|
||||||
|
return lookup.get(numbers.get(i)).page().getPageNumber();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private List<NumberWord> filterByWordNearTopOfPage(List<NumberWord> numbers) {
|
||||||
|
|
||||||
|
List<NumberWord> result = new ArrayList<>();
|
||||||
|
|
||||||
|
result.add(numbers.get(0));
|
||||||
|
|
||||||
|
for (int i = 1; i < numbers.size(); i++) {
|
||||||
|
NumberWord prev = numbers.get(i - 1);
|
||||||
|
NumberWord curr = numbers.get(i);
|
||||||
|
ClassificationPage prevPage = lookup.get(prev).page();
|
||||||
|
ClassificationPage currPage = lookup.get(curr).page();
|
||||||
|
if (prevPage.equals(currPage)) {
|
||||||
|
result.add(curr);
|
||||||
|
} else if (curr.word().getBBox().getMinY() < currPage.getPageHeight() * 0.33) {
|
||||||
|
result.add(curr);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public List<NumberWord> removeOutliers(List<NumberWord> numbers) {
|
||||||
|
|
||||||
|
List<NumberWord> confirmedClusterNumbers = new ArrayList<>();
|
||||||
|
|
||||||
|
confirmedClusterNumbers.add(numbers.get(0));
|
||||||
|
|
||||||
for (int i = 1; i < numbers.size() - 1; i++) {
|
for (int i = 1; i < numbers.size() - 1; i++) {
|
||||||
int prev = getNumberAsInt(numbers, i - 1);
|
int prev = getNumberAsInt(numbers, i - 1);
|
||||||
int curr = getNumberAsInt(numbers, i);
|
int curr = getNumberAsInt(numbers, i);
|
||||||
int next = getNumberAsInt(numbers, i + 1);
|
int next = getNumberAsInt(numbers, i + 1);
|
||||||
|
|
||||||
if (!(curr <= prev || curr >= next) || !isBetterWithout(numbers, i)) {
|
if (!(curr <= prev || curr >= next) || !isBetterWithout(numbers, i)) {
|
||||||
result.add(numbers.get(i));
|
confirmedClusterNumbers.add(numbers.get(i));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (getNumberAsInt(numbers, numbers.size() - 1) >= getNumberAsInt(numbers, Math.max(0, numbers.size() - 2))) {
|
if (getNumberAsInt(numbers, numbers.size() - 1) >= getLatestNumber(confirmedClusterNumbers)) {
|
||||||
result.add(numbers.get(numbers.size() - 1));
|
confirmedClusterNumbers.add(numbers.get(numbers.size() - 1));
|
||||||
}
|
}
|
||||||
|
|
||||||
return result;
|
return confirmedClusterNumbers;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static int getLatestNumber(List<NumberWord> confirmedClusterNumbers) {
|
||||||
|
|
||||||
|
return confirmedClusterNumbers.get(confirmedClusterNumbers.size() - 1).number();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// Helper method to check if removing the current number results in a better order
|
// Helper method to check if removing the current number results in a better order
|
||||||
public static boolean isBetterWithout(List<Word> numbers, int i) {
|
public static boolean isBetterWithout(List<NumberWord> numbers, int i) {
|
||||||
|
|
||||||
if (i == 0 || i == numbers.size() - 1) {
|
if (i == 0 || i == numbers.size() - 1) {
|
||||||
return false;
|
return false;
|
||||||
@ -362,9 +416,9 @@ public class TableOfContentsClassificationService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private static int getNumberAsInt(List<Word> numbers, int i) {
|
private static int getNumberAsInt(List<NumberWord> numbers, int i) {
|
||||||
|
|
||||||
return Integer.parseInt(numbers.get(i).toString());
|
return numbers.get(i).number();
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -16,7 +16,7 @@ import java.util.Optional;
|
|||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeTypeProto;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||||
@ -82,9 +82,9 @@ public class DocumentGraphFactory {
|
|||||||
|
|
||||||
documentGraph.streamAllSubNodes()
|
documentGraph.streamAllSubNodes()
|
||||||
.filter(SemanticNode::isLeaf)
|
.filter(SemanticNode::isLeaf)
|
||||||
.filter(node -> !node.getType().equals(NodeType.HEADER))
|
.filter(node -> !node.getType().equals(NodeTypeProto.NodeType.HEADER))
|
||||||
.filter(node -> !node.getType().equals(NodeType.FOOTER))
|
.filter(node -> !node.getType().equals(NodeTypeProto.NodeType.FOOTER))
|
||||||
.filter(node -> !node.getType().equals(NodeType.IMAGE))
|
.filter(node -> !node.getType().equals(NodeTypeProto.NodeType.IMAGE))
|
||||||
.map(SemanticNode::getTextBlock)
|
.map(SemanticNode::getTextBlock)
|
||||||
.map(TextBlock::getAtomicTextBlocks)
|
.map(TextBlock::getAtomicTextBlocks)
|
||||||
.flatMap(Collection::stream)
|
.flatMap(Collection::stream)
|
||||||
|
|||||||
@ -27,7 +27,9 @@ import com.knecon.fforesight.service.layoutparser.processor.model.outline.Outlin
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.ListIdentifier;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.ListIdentifier;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextBlockOnPage;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.classification.NumberWord;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
|
||||||
@ -293,7 +295,7 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public void addTocPages(List<Word> numbers, int page) {
|
public void addTocPages(List<NumberWord> numbers, int page) {
|
||||||
|
|
||||||
if (!active) {
|
if (!active) {
|
||||||
return;
|
return;
|
||||||
@ -302,13 +304,10 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
|
|||||||
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(page, this.tocPages);
|
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(page, this.tocPages);
|
||||||
visualizationsOnPage.getColoredRectangles()
|
visualizationsOnPage.getColoredRectangles()
|
||||||
.addAll(numbers.stream()
|
.addAll(numbers.stream()
|
||||||
|
.map(NumberWord::word)
|
||||||
.map(BoundingBox::getBBoxPdf)
|
.map(BoundingBox::getBBoxPdf)
|
||||||
.map(line -> new ColoredRectangle(line, LINES_COLOR, LINE_WIDTH))
|
.map(line -> new ColoredRectangle(line, LINES_COLOR, LINE_WIDTH))
|
||||||
.toList());
|
.toList());
|
||||||
visualizationsOnPage.getColoredRectangles()
|
|
||||||
.add(new ColoredRectangle(numbers.stream()
|
|
||||||
.map(BoundingBox::getBBoxPdf)
|
|
||||||
.collect(RectangleTransformations.collectBBox()), LINES_COLOR, LINE_WIDTH));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -332,8 +331,10 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
|
|||||||
|
|
||||||
private void addOutlineObject(OutlineObject outlineObject, PageInformation pageInformation) {
|
private void addOutlineObject(OutlineObject outlineObject, PageInformation pageInformation) {
|
||||||
|
|
||||||
|
if (!active) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
int rectSize = 5;
|
int rectSize = 5;
|
||||||
|
|
||||||
Point2D point2D;
|
Point2D point2D;
|
||||||
if (outlineObject.getPoint().isPresent()) {
|
if (outlineObject.getPoint().isPresent()) {
|
||||||
point2D = outlineObject.getPoint().get();
|
point2D = outlineObject.getPoint().get();
|
||||||
@ -357,10 +358,25 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
|
|||||||
|
|
||||||
public void addListIdentifiers(List<ListIdentifier> listIdentifiers) {
|
public void addListIdentifiers(List<ListIdentifier> listIdentifiers) {
|
||||||
|
|
||||||
|
if (!active) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
for (ListIdentifier listIdentifier : listIdentifiers) {
|
for (ListIdentifier listIdentifier : listIdentifiers) {
|
||||||
getOrCreateVisualizationsOnPage(listIdentifier.getPage(), this.listIdentifiers).getColoredRectangles()
|
getOrCreateVisualizationsOnPage(listIdentifier.getPage(), this.listIdentifiers).getColoredRectangles()
|
||||||
.add(new ColoredRectangle(listIdentifier.getWord().getBBoxPdf(), WORDS_COLOR, LINE_WIDTH));
|
.add(new ColoredRectangle(listIdentifier.getWord().getBBoxPdf(), WORDS_COLOR, LINE_WIDTH));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void addTocBlocks(Set<TextBlockOnPage> blocksWithNumberInCluster) {
|
||||||
|
|
||||||
|
if (!active) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
for (TextBlockOnPage textBlockOnPage : blocksWithNumberInCluster) {
|
||||||
|
getOrCreateVisualizationsOnPage(textBlockOnPage.page().getPageNumber(), this.tocBlocks).getColoredRectangles()
|
||||||
|
.add(new ColoredRectangle(textBlockOnPage.textBlock().getBBoxPdf(), TOC_COLOR, LINE_WIDTH));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -83,4 +83,11 @@ class SectionIdentifierTest {
|
|||||||
assertEquals(List.of(4, 1, 2, 4), identifier.getIdentifiers());
|
assertEquals(List.of(4, 1, 2, 4), identifier.getIdentifiers());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testFalsePositive111() {
|
||||||
|
SectionIdentifier identifier = SectionIdentifier.fromSearchText("111: Headline");
|
||||||
|
assertEquals(SectionIdentifier.Format.NUMERICAL, identifier.getFormat());
|
||||||
|
assertEquals(1, identifier.level());
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
@ -57,6 +57,7 @@ public record LayerIdentifier(String name, String markedContentName) {
|
|||||||
public static final LayerIdentifier OUTLINE_OBJECTS = new LayerIdentifier("Outline Positions", "OUTLINE_OBJECTS");
|
public static final LayerIdentifier OUTLINE_OBJECTS = new LayerIdentifier("Outline Positions", "OUTLINE_OBJECTS");
|
||||||
public static final LayerIdentifier SENTENCES = new LayerIdentifier("Sentences", "SENTENCES");
|
public static final LayerIdentifier SENTENCES = new LayerIdentifier("Sentences", "SENTENCES");
|
||||||
public static final LayerIdentifier TOC_PAGES = new LayerIdentifier("TOC pages", "TOC_PAGES");
|
public static final LayerIdentifier TOC_PAGES = new LayerIdentifier("TOC pages", "TOC_PAGES");
|
||||||
|
public static final LayerIdentifier TOC_BLOCKS = new LayerIdentifier("TOC blocks", "TOC_BLOCKS");
|
||||||
public static final LayerIdentifier LIST_IDENTIFIERS = new LayerIdentifier("List identifiers", "LIST_IDENTIFIERS");
|
public static final LayerIdentifier LIST_IDENTIFIERS = new LayerIdentifier("List identifiers", "LIST_IDENTIFIERS");
|
||||||
|
|
||||||
// Visual layout parser
|
// Visual layout parser
|
||||||
|
|||||||
@ -22,6 +22,7 @@ public class LayoutDebugLayerConfig extends AbstractLayerGroup {
|
|||||||
|
|
||||||
protected static final Color WORDS_COLOR = new Color(68, 84, 147);
|
protected static final Color WORDS_COLOR = new Color(68, 84, 147);
|
||||||
protected static final Color LINES_COLOR = new Color(152, 45, 179);
|
protected static final Color LINES_COLOR = new Color(152, 45, 179);
|
||||||
|
protected static final Color TOC_COLOR = new Color(33, 159, 144);
|
||||||
protected static final Color ZONES_COLOR = new Color(131, 38, 38);
|
protected static final Color ZONES_COLOR = new Color(131, 38, 38);
|
||||||
|
|
||||||
protected static final Color RULINGS_COLOR = new Color(21, 221, 174);
|
protected static final Color RULINGS_COLOR = new Color(21, 221, 174);
|
||||||
@ -59,6 +60,7 @@ public class LayoutDebugLayerConfig extends AbstractLayerGroup {
|
|||||||
protected final Visualizations outlineObjects = Visualizations.builder().layer(LayerIdentifier.OUTLINE_OBJECTS).build();
|
protected final Visualizations outlineObjects = Visualizations.builder().layer(LayerIdentifier.OUTLINE_OBJECTS).build();
|
||||||
protected final Visualizations sentences = Visualizations.builder().layer(LayerIdentifier.SENTENCES).build();
|
protected final Visualizations sentences = Visualizations.builder().layer(LayerIdentifier.SENTENCES).build();
|
||||||
protected final Visualizations tocPages = Visualizations.builder().layer(LayerIdentifier.TOC_PAGES).build();
|
protected final Visualizations tocPages = Visualizations.builder().layer(LayerIdentifier.TOC_PAGES).build();
|
||||||
|
protected final Visualizations tocBlocks = Visualizations.builder().layer(LayerIdentifier.TOC_BLOCKS).build();
|
||||||
protected final Visualizations listIdentifiers = Visualizations.builder().layer(LayerIdentifier.LIST_IDENTIFIERS).build();
|
protected final Visualizations listIdentifiers = Visualizations.builder().layer(LayerIdentifier.LIST_IDENTIFIERS).build();
|
||||||
|
|
||||||
|
|
||||||
@ -77,9 +79,9 @@ public class LayoutDebugLayerConfig extends AbstractLayerGroup {
|
|||||||
markedContent, //
|
markedContent, //
|
||||||
outlineObjects, //
|
outlineObjects, //
|
||||||
tocPages, //
|
tocPages, //
|
||||||
|
tocBlocks, //
|
||||||
listIdentifiers //
|
listIdentifiers //
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user