RED-9139: more robust TOC detection
This commit is contained in:
parent
f9b25c8157
commit
7ee1f9e360
@ -16,7 +16,7 @@ import lombok.experimental.FieldDefaults;
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class SectionIdentifier {
|
||||
|
||||
public static Pattern numericalIdentifierPattern = Pattern.compile("^[\\s]?(\\d{1,2})[\\s.,;]?(\\d{1,2})?[\\s.,;]?(\\d{1,2})?[\\s.,;]?(\\d{1,2})?");
|
||||
public static Pattern numericalIdentifierPattern = Pattern.compile("^[\\s]?(\\d{1,2})(?:[\\s.,;](\\d{1,2}))?(?:[\\s.,;](\\d{1,2}))?(?:[\\s.,;](\\d{1,2}))?");
|
||||
public static Pattern alphanumericIdentifierPattern = Pattern.compile("^[\\s]?[A-Za-z][\\s.,;]?(\\d{1,2})[\\s.,;]?(\\d{1,2})?[\\s.,;]?(\\d{1,2})?[\\s.,;]?(\\d{1,2})?[\\s.,;]?");
|
||||
|
||||
public enum Format {
|
||||
|
||||
@ -1,34 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.text;
|
||||
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
|
||||
public class TextPositionSequenceComparator implements Comparator<Word> {
|
||||
|
||||
private HashMap<Word, TextBlockOnPage> lookup;
|
||||
|
||||
|
||||
public TextPositionSequenceComparator(HashMap<Word, TextBlockOnPage> lookup) {
|
||||
|
||||
this.lookup = lookup;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public int compare(Word number1, Word number2) {
|
||||
|
||||
int page1 = lookup.get(number1).page().getPageNumber();
|
||||
int page2 = lookup.get(number2).page().getPageNumber();
|
||||
|
||||
if (page1 != page2) {
|
||||
return Integer.compare(page1, page2);
|
||||
}
|
||||
|
||||
if (number1.getY() != number2.getY()) {
|
||||
return Double.compare(number1.getY(), number2.getY());
|
||||
}
|
||||
|
||||
return Integer.compare(Integer.parseInt(number1.toString()), Integer.parseInt(number2.toString()));
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,36 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.text;
|
||||
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.classification.NumberWord;
|
||||
|
||||
public class TocNumberComparator implements Comparator<NumberWord> {
|
||||
|
||||
private HashMap<NumberWord, TextBlockOnPage> lookup;
|
||||
|
||||
|
||||
public TocNumberComparator(HashMap<NumberWord, TextBlockOnPage> lookup) {
|
||||
|
||||
this.lookup = lookup;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public int compare(NumberWord number1, NumberWord number2) {
|
||||
|
||||
int page1 = lookup.get(number1).page().getPageNumber();
|
||||
int page2 = lookup.get(number2).page().getPageNumber();
|
||||
|
||||
if (page1 != page2) {
|
||||
return Integer.compare(page1, page2);
|
||||
}
|
||||
|
||||
if (number1.word().getY() != number2.word().getY()) {
|
||||
return Double.compare(number1.word().getY(), number2.word().getY());
|
||||
}
|
||||
|
||||
return Integer.compare(number1.number(), number2.number());
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,7 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.classification;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
|
||||
|
||||
public record NumberWord(Word word, int number) {
|
||||
|
||||
}
|
||||
@ -14,6 +14,7 @@ import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
@ -23,10 +24,11 @@ import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBl
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.SectionIdentifier;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextBlockOnPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequenceComparator;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TocNumberComparator;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutDebugLayer;
|
||||
|
||||
@ -71,9 +73,9 @@ public class TableOfContentsClassificationService {
|
||||
|
||||
ClassificationPage startPage = textBlocks.get(start).page();
|
||||
List<TextBlockOnPage> initialLookAhead = textBlocks.subList(start, Math.min(start + SURROUNDING_BLOCKS_RADIUS, textBlocks.size()));
|
||||
HashMap<Word, TextBlockOnPage> lookup = new HashMap<>();
|
||||
List<Word> numbers = extractNumbers(initialLookAhead, lookup, document.getPages().size());
|
||||
TocNumberFinder tocNumberFinder = new TocNumberFinder(numbers, lookup);
|
||||
HashMap<NumberWord, TextBlockOnPage> numberToBlockLookup = new HashMap<>();
|
||||
List<NumberWord> numbers = extractNumbers(initialLookAhead, numberToBlockLookup, document.getPages().size());
|
||||
TocNumberFinder tocNumberFinder = new TocNumberFinder(numbers, numberToBlockLookup);
|
||||
|
||||
int lastCandidate = start;
|
||||
for (int i = start; i < Math.min(lastCandidate + SURROUNDING_BLOCKS_RADIUS, textBlocks.size()); i++) {
|
||||
@ -93,28 +95,28 @@ public class TableOfContentsClassificationService {
|
||||
break;
|
||||
}
|
||||
|
||||
List<Word> numbersFromBlock = extractNumbers(textBlockOnPage, lookup, document.getPages().size());
|
||||
List<NumberWord> numbersFromBlock = extractNumbers(textBlockOnPage, numberToBlockLookup, document.getPages().size());
|
||||
|
||||
List<Word> currentRightmostCluster = tocNumberFinder.getCurrentRightmostCluster();
|
||||
List<NumberWord> currentRightmostCluster = tocNumberFinder.getCurrentRightmostCluster();
|
||||
|
||||
if (currentRightmostCluster.size() < MINIMUM_MATCHES) {
|
||||
log.debug("No numbers indicating a table of contents here.");
|
||||
return start;
|
||||
}
|
||||
|
||||
if (anyIntersection(currentRightmostCluster, numbersFromBlock, lookup)) {
|
||||
if (anyIntersection(currentRightmostCluster, numbersFromBlock, numberToBlockLookup)) {
|
||||
lastCandidate = i;
|
||||
numbersFromBlock.forEach(tocNumberFinder::add);
|
||||
}
|
||||
}
|
||||
|
||||
addVisualization(document.getLayoutDebugLayer(), tocNumberFinder, lookup);
|
||||
|
||||
Set<TextBlockOnPage> blocksWithNumberInCluster = tocNumberFinder.getCurrentRightmostCluster()
|
||||
.stream()
|
||||
.map(lookup::get)
|
||||
.map(numberToBlockLookup::get)
|
||||
.collect(Collectors.toSet());
|
||||
|
||||
addVisualization(document.getLayoutDebugLayer(), tocNumberFinder, numberToBlockLookup, blocksWithNumberInCluster, textBlocks.get(start - 1));
|
||||
|
||||
int lastConfirmed = start;
|
||||
for (int i = start; i < lastCandidate + 1; i++) {
|
||||
TextBlockOnPage textBlockOnPage = textBlocks.get(i);
|
||||
@ -132,18 +134,22 @@ public class TableOfContentsClassificationService {
|
||||
}
|
||||
|
||||
|
||||
private static void addVisualization(LayoutDebugLayer layoutDebugLayer, TocNumberFinder tocNumberFinder, Map<Word, TextBlockOnPage> lookup) {
|
||||
private static void addVisualization(LayoutDebugLayer layoutDebugLayer,
|
||||
TocNumberFinder tocNumberFinder,
|
||||
Map<NumberWord, TextBlockOnPage> lookup,
|
||||
Set<TextBlockOnPage> blocksWithNumberInCluster,
|
||||
TextBlockOnPage startingHeadline) {
|
||||
|
||||
tocNumberFinder.getCurrentRightmostCluster()
|
||||
.stream()
|
||||
.collect(Collectors.groupingBy(key -> lookup.get(key).page().getPageNumber()))
|
||||
.forEach((pageNumber, number) -> layoutDebugLayer.addTocPages(number, pageNumber));
|
||||
layoutDebugLayer.addTocBlocks(blocksWithNumberInCluster);
|
||||
layoutDebugLayer.addTocBlocks(Set.of(startingHeadline));
|
||||
}
|
||||
|
||||
|
||||
private static boolean anyIntersection(Collection<Word> numbers1,
|
||||
Collection<Word> numbers2,
|
||||
Map<Word, TextBlockOnPage> lookup) {
|
||||
private static boolean anyIntersection(Collection<NumberWord> numbers1, Collection<NumberWord> numbers2, Map<NumberWord, TextBlockOnPage> lookup) {
|
||||
|
||||
return numbers1.stream()
|
||||
.anyMatch(numberFromCluster -> numbers2.stream()
|
||||
@ -151,9 +157,9 @@ public class TableOfContentsClassificationService {
|
||||
}
|
||||
|
||||
|
||||
private static List<Word> extractNumbers(List<TextBlockOnPage> textBlocks, Map<Word, TextBlockOnPage> lookup, int numberOfPages) {
|
||||
private static List<NumberWord> extractNumbers(List<TextBlockOnPage> textBlocks, Map<NumberWord, TextBlockOnPage> lookup, int numberOfPages) {
|
||||
|
||||
List<Word> blocks = new LinkedList<>();
|
||||
List<NumberWord> blocks = new LinkedList<>();
|
||||
for (TextBlockOnPage textBlock : textBlocks) {
|
||||
blocks.addAll(extractNumbers(textBlock, lookup, numberOfPages));
|
||||
}
|
||||
@ -161,30 +167,40 @@ public class TableOfContentsClassificationService {
|
||||
}
|
||||
|
||||
|
||||
private static List<Word> extractNumbers(TextBlockOnPage textBlock, Map<Word, TextBlockOnPage> lookup, int numberOfPages) {
|
||||
private static List<NumberWord> extractNumbers(TextBlockOnPage textBlock, Map<NumberWord, TextBlockOnPage> lookup, int numberOfPages) {
|
||||
|
||||
List<Word> blocks = new LinkedList<>();
|
||||
List<NumberWord> blocks = new LinkedList<>();
|
||||
TextPageBlock block = textBlock.textBlock();
|
||||
List<Word> sequences = block.getWords();
|
||||
for (int i = 0; i < sequences.size(); i++) {
|
||||
List<Word> words = block.getWords();
|
||||
for (int i = 0; i < words.size(); i++) {
|
||||
|
||||
Word word = sequences.get(i);
|
||||
|
||||
if (!NUMERIC.matcher(word).matches() || word.length() > 5) {
|
||||
Word word = words.get(i);
|
||||
if (!wordIsEndOfLine(i, words)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (AMOUNT_PATTERN.matcher(getSurroundingString(i, sequences)).matches()) {
|
||||
if (AMOUNT_PATTERN.matcher(getSurroundingString(i, words)).matches()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
Matcher matcher = SectionIdentifier.numericalIdentifierPattern.matcher(word.toString());
|
||||
if (matcher.find() && matcher.group(2) != null) {
|
||||
continue;
|
||||
}
|
||||
|
||||
Matcher numberFinder = NUMERIC.matcher(word);
|
||||
if (!numberFinder.find() || word.length() > 5) {
|
||||
continue;
|
||||
}
|
||||
|
||||
try {
|
||||
int pageNumber = Integer.parseInt(word.toString());
|
||||
int pageNumber = Integer.parseInt(numberFinder.group());
|
||||
if (0 >= pageNumber || pageNumber > numberOfPages) {
|
||||
continue;
|
||||
}
|
||||
lookup.put(word, textBlock);
|
||||
blocks.add(word);
|
||||
NumberWord numberWord = new NumberWord(word, pageNumber);
|
||||
lookup.put(numberWord, textBlock);
|
||||
blocks.add(numberWord);
|
||||
} catch (NumberFormatException e) {
|
||||
log.debug("That wasn't a number! Should not happen, due to numeric check beforehand.");
|
||||
}
|
||||
@ -193,6 +209,17 @@ public class TableOfContentsClassificationService {
|
||||
}
|
||||
|
||||
|
||||
private static boolean wordIsEndOfLine(int i, List<Word> words) {
|
||||
|
||||
if (i == words.size() - 1) {
|
||||
return true;
|
||||
}
|
||||
Word word = words.get(i);
|
||||
Word nextWord = words.get(i + 1);
|
||||
return !nextWord.rightOf(word);
|
||||
}
|
||||
|
||||
|
||||
private static CharSequence getSurroundingString(int i, List<Word> sequences) {
|
||||
|
||||
int end = Math.min(i + 5, sequences.size());
|
||||
@ -203,13 +230,13 @@ public class TableOfContentsClassificationService {
|
||||
}
|
||||
|
||||
|
||||
private static boolean matches(Word number1, Word number2, Map<Word, TextBlockOnPage> lookup) {
|
||||
private static boolean matches(NumberWord number1, NumberWord number2, Map<NumberWord, TextBlockOnPage> lookup) {
|
||||
|
||||
if (number1.getDir() != number2.getDir()) {
|
||||
if (number1.word().getDir() != number2.word().getDir()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return number1.intersectsXDirAdj(number2, INTERSECTION_TOLERANCE);
|
||||
return number1.word().intersectsXDirAdj(number2.word(), INTERSECTION_TOLERANCE);
|
||||
}
|
||||
|
||||
|
||||
@ -247,11 +274,11 @@ public class TableOfContentsClassificationService {
|
||||
|
||||
private static class TocNumberFinder {
|
||||
|
||||
final UnionFind<Word> numberClusters;
|
||||
final HashMap<Word, TextBlockOnPage> lookup;
|
||||
final UnionFind<NumberWord> numberClusters;
|
||||
final HashMap<NumberWord, TextBlockOnPage> lookup;
|
||||
|
||||
|
||||
TocNumberFinder(List<Word> blocks, HashMap<Word, TextBlockOnPage> lookup) {
|
||||
TocNumberFinder(List<NumberWord> blocks, HashMap<NumberWord, TextBlockOnPage> lookup) {
|
||||
|
||||
this.numberClusters = new UnionFind<>(new HashSet<>(blocks));
|
||||
for (int i = 0; i < blocks.size(); i++) {
|
||||
@ -265,14 +292,14 @@ public class TableOfContentsClassificationService {
|
||||
}
|
||||
|
||||
|
||||
public void add(Word number) {
|
||||
public void add(NumberWord number) {
|
||||
|
||||
if (numberClusters.getElements().contains(number)) {
|
||||
return;
|
||||
}
|
||||
|
||||
numberClusters.addElement(number);
|
||||
for (Word element : numberClusters.getElements()) {
|
||||
for (NumberWord element : numberClusters.getElements()) {
|
||||
if (matches(number, element, lookup)) {
|
||||
numberClusters.union(element, number);
|
||||
}
|
||||
@ -280,73 +307,100 @@ public class TableOfContentsClassificationService {
|
||||
}
|
||||
|
||||
|
||||
public List<Word> getCurrentRightmostCluster() {
|
||||
public List<NumberWord> getCurrentRightmostCluster() {
|
||||
|
||||
return numberClusters.getGroups()
|
||||
.stream()
|
||||
.filter(cluster -> cluster.size() > MINIMUM_MATCHES)
|
||||
.map(cluster -> cluster.stream()
|
||||
.sorted(new TextPositionSequenceComparator(lookup))
|
||||
.sorted(new TocNumberComparator(lookup))
|
||||
.toList())
|
||||
.map(this::removeOutliers)
|
||||
// .map(this::filterByMinimumDensity)
|
||||
.map(this::removeOnNonConsecutivePages)
|
||||
.map(this::filterByWordNearTopOfPage)
|
||||
.filter(cluster -> cluster.size() > MINIMUM_MATCHES)
|
||||
.max(Comparator.comparingDouble(cluster -> cluster.get(0).getBBox().getMaxX())).orElse(Collections.emptyList());
|
||||
.max(Comparator.comparingDouble(cluster -> cluster.get(0).word().getBBox().getMaxX())).orElse(Collections.emptyList());
|
||||
}
|
||||
|
||||
// does not seem to be doing much, ideally instead of using the height of the blocks i would like to use the height, beginning from the MainBody top,
|
||||
// but as the MainBody is often wrong, this results in some numbers being discarded even though they are correct.
|
||||
// private List<TextPositionSequence> filterByMinimumDensity(List<TextPositionSequence> numbers) {
|
||||
//
|
||||
// Map<ClassificationPage, List<TextPositionSequence>> clustersPerPage = numbers.stream()
|
||||
// .collect(Collectors.groupingBy(number -> lookup.get(number).page()));
|
||||
//
|
||||
// List<TextPositionSequence> result = new ArrayList<>(numbers.size());
|
||||
// clustersPerPage.keySet()
|
||||
// .stream()
|
||||
// .sorted(Comparator.comparingInt(ClassificationPage::getPageNumber))
|
||||
// .forEach(page -> {
|
||||
// var numbersOnPage = clustersPerPage.get(page);
|
||||
//
|
||||
// double height = numbersOnPage.stream()
|
||||
// .map(BoundingBox::getBBox)
|
||||
// .collect(RectangleTransformations.collectBBox()).getHeight();
|
||||
//
|
||||
// double count = numbersOnPage.size();
|
||||
//
|
||||
// if ((count / height) >= (DENSITY_THRESHOLD_COUNT / page.getPageHeight())) {
|
||||
// result.addAll(numbers);
|
||||
// }
|
||||
// });
|
||||
// return result;
|
||||
// }
|
||||
|
||||
private List<NumberWord> removeOnNonConsecutivePages(List<NumberWord> numbers) {
|
||||
|
||||
public List<Word> removeOutliers(List<Word> numbers) {
|
||||
|
||||
List<Word> result = new ArrayList<>();
|
||||
List<NumberWord> result = new ArrayList<>();
|
||||
|
||||
result.add(numbers.get(0));
|
||||
|
||||
for (int i = 1; i < numbers.size(); i++) {
|
||||
int prev = getPageNumber(numbers, i - 1);
|
||||
int curr = getPageNumber(numbers, i);
|
||||
|
||||
if (Math.abs(prev - curr) > 1) {
|
||||
break;
|
||||
} else {
|
||||
result.add(numbers.get(i));
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
private int getPageNumber(List<NumberWord> numbers, int i) {
|
||||
|
||||
return lookup.get(numbers.get(i)).page().getPageNumber();
|
||||
}
|
||||
|
||||
|
||||
private List<NumberWord> filterByWordNearTopOfPage(List<NumberWord> numbers) {
|
||||
|
||||
List<NumberWord> result = new ArrayList<>();
|
||||
|
||||
result.add(numbers.get(0));
|
||||
|
||||
for (int i = 1; i < numbers.size(); i++) {
|
||||
NumberWord prev = numbers.get(i - 1);
|
||||
NumberWord curr = numbers.get(i);
|
||||
ClassificationPage prevPage = lookup.get(prev).page();
|
||||
ClassificationPage currPage = lookup.get(curr).page();
|
||||
if (prevPage.equals(currPage)) {
|
||||
result.add(curr);
|
||||
} else if (curr.word().getBBox().getMinY() < currPage.getPageHeight() * 0.33) {
|
||||
result.add(curr);
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
public List<NumberWord> removeOutliers(List<NumberWord> numbers) {
|
||||
|
||||
List<NumberWord> confirmedClusterNumbers = new ArrayList<>();
|
||||
|
||||
confirmedClusterNumbers.add(numbers.get(0));
|
||||
|
||||
for (int i = 1; i < numbers.size() - 1; i++) {
|
||||
int prev = getNumberAsInt(numbers, i - 1);
|
||||
int curr = getNumberAsInt(numbers, i);
|
||||
int next = getNumberAsInt(numbers, i + 1);
|
||||
|
||||
if (!(curr <= prev || curr >= next) || !isBetterWithout(numbers, i)) {
|
||||
result.add(numbers.get(i));
|
||||
confirmedClusterNumbers.add(numbers.get(i));
|
||||
}
|
||||
}
|
||||
if (getNumberAsInt(numbers, numbers.size() - 1) >= getNumberAsInt(numbers, Math.max(0, numbers.size() - 2))) {
|
||||
result.add(numbers.get(numbers.size() - 1));
|
||||
if (getNumberAsInt(numbers, numbers.size() - 1) >= getLatestNumber(confirmedClusterNumbers)) {
|
||||
confirmedClusterNumbers.add(numbers.get(numbers.size() - 1));
|
||||
}
|
||||
|
||||
return result;
|
||||
return confirmedClusterNumbers;
|
||||
}
|
||||
|
||||
|
||||
private static int getLatestNumber(List<NumberWord> confirmedClusterNumbers) {
|
||||
|
||||
return confirmedClusterNumbers.get(confirmedClusterNumbers.size() - 1).number();
|
||||
}
|
||||
|
||||
|
||||
// Helper method to check if removing the current number results in a better order
|
||||
public static boolean isBetterWithout(List<Word> numbers, int i) {
|
||||
public static boolean isBetterWithout(List<NumberWord> numbers, int i) {
|
||||
|
||||
if (i == 0 || i == numbers.size() - 1) {
|
||||
return false;
|
||||
@ -362,9 +416,9 @@ public class TableOfContentsClassificationService {
|
||||
}
|
||||
|
||||
|
||||
private static int getNumberAsInt(List<Word> numbers, int i) {
|
||||
private static int getNumberAsInt(List<NumberWord> numbers, int i) {
|
||||
|
||||
return Integer.parseInt(numbers.get(i).toString());
|
||||
return numbers.get(i).number();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -16,7 +16,7 @@ import java.util.Optional;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeTypeProto;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||
@ -82,9 +82,9 @@ public class DocumentGraphFactory {
|
||||
|
||||
documentGraph.streamAllSubNodes()
|
||||
.filter(SemanticNode::isLeaf)
|
||||
.filter(node -> !node.getType().equals(NodeType.HEADER))
|
||||
.filter(node -> !node.getType().equals(NodeType.FOOTER))
|
||||
.filter(node -> !node.getType().equals(NodeType.IMAGE))
|
||||
.filter(node -> !node.getType().equals(NodeTypeProto.NodeType.HEADER))
|
||||
.filter(node -> !node.getType().equals(NodeTypeProto.NodeType.FOOTER))
|
||||
.filter(node -> !node.getType().equals(NodeTypeProto.NodeType.IMAGE))
|
||||
.map(SemanticNode::getTextBlock)
|
||||
.map(TextBlock::getAtomicTextBlocks)
|
||||
.flatMap(Collection::stream)
|
||||
|
||||
@ -27,7 +27,9 @@ import com.knecon.fforesight.service.layoutparser.processor.model.outline.Outlin
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.ListIdentifier;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextBlockOnPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.classification.NumberWord;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
|
||||
@ -293,7 +295,7 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
|
||||
}
|
||||
|
||||
|
||||
public void addTocPages(List<Word> numbers, int page) {
|
||||
public void addTocPages(List<NumberWord> numbers, int page) {
|
||||
|
||||
if (!active) {
|
||||
return;
|
||||
@ -302,13 +304,10 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
|
||||
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(page, this.tocPages);
|
||||
visualizationsOnPage.getColoredRectangles()
|
||||
.addAll(numbers.stream()
|
||||
.map(NumberWord::word)
|
||||
.map(BoundingBox::getBBoxPdf)
|
||||
.map(line -> new ColoredRectangle(line, LINES_COLOR, LINE_WIDTH))
|
||||
.toList());
|
||||
visualizationsOnPage.getColoredRectangles()
|
||||
.add(new ColoredRectangle(numbers.stream()
|
||||
.map(BoundingBox::getBBoxPdf)
|
||||
.collect(RectangleTransformations.collectBBox()), LINES_COLOR, LINE_WIDTH));
|
||||
}
|
||||
|
||||
|
||||
@ -332,8 +331,10 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
|
||||
|
||||
private void addOutlineObject(OutlineObject outlineObject, PageInformation pageInformation) {
|
||||
|
||||
if (!active) {
|
||||
return;
|
||||
}
|
||||
int rectSize = 5;
|
||||
|
||||
Point2D point2D;
|
||||
if (outlineObject.getPoint().isPresent()) {
|
||||
point2D = outlineObject.getPoint().get();
|
||||
@ -357,10 +358,25 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
|
||||
|
||||
public void addListIdentifiers(List<ListIdentifier> listIdentifiers) {
|
||||
|
||||
if (!active) {
|
||||
return;
|
||||
}
|
||||
for (ListIdentifier listIdentifier : listIdentifiers) {
|
||||
getOrCreateVisualizationsOnPage(listIdentifier.getPage(), this.listIdentifiers).getColoredRectangles()
|
||||
.add(new ColoredRectangle(listIdentifier.getWord().getBBoxPdf(), WORDS_COLOR, LINE_WIDTH));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public void addTocBlocks(Set<TextBlockOnPage> blocksWithNumberInCluster) {
|
||||
|
||||
if (!active) {
|
||||
return;
|
||||
}
|
||||
for (TextBlockOnPage textBlockOnPage : blocksWithNumberInCluster) {
|
||||
getOrCreateVisualizationsOnPage(textBlockOnPage.page().getPageNumber(), this.tocBlocks).getColoredRectangles()
|
||||
.add(new ColoredRectangle(textBlockOnPage.textBlock().getBBoxPdf(), TOC_COLOR, LINE_WIDTH));
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -83,4 +83,11 @@ class SectionIdentifierTest {
|
||||
assertEquals(List.of(4, 1, 2, 4), identifier.getIdentifiers());
|
||||
}
|
||||
|
||||
@Test
|
||||
void testFalsePositive111() {
|
||||
SectionIdentifier identifier = SectionIdentifier.fromSearchText("111: Headline");
|
||||
assertEquals(SectionIdentifier.Format.NUMERICAL, identifier.getFormat());
|
||||
assertEquals(1, identifier.level());
|
||||
}
|
||||
|
||||
}
|
||||
@ -57,6 +57,7 @@ public record LayerIdentifier(String name, String markedContentName) {
|
||||
public static final LayerIdentifier OUTLINE_OBJECTS = new LayerIdentifier("Outline Positions", "OUTLINE_OBJECTS");
|
||||
public static final LayerIdentifier SENTENCES = new LayerIdentifier("Sentences", "SENTENCES");
|
||||
public static final LayerIdentifier TOC_PAGES = new LayerIdentifier("TOC pages", "TOC_PAGES");
|
||||
public static final LayerIdentifier TOC_BLOCKS = new LayerIdentifier("TOC blocks", "TOC_BLOCKS");
|
||||
public static final LayerIdentifier LIST_IDENTIFIERS = new LayerIdentifier("List identifiers", "LIST_IDENTIFIERS");
|
||||
|
||||
// Visual layout parser
|
||||
|
||||
@ -22,6 +22,7 @@ public class LayoutDebugLayerConfig extends AbstractLayerGroup {
|
||||
|
||||
protected static final Color WORDS_COLOR = new Color(68, 84, 147);
|
||||
protected static final Color LINES_COLOR = new Color(152, 45, 179);
|
||||
protected static final Color TOC_COLOR = new Color(33, 159, 144);
|
||||
protected static final Color ZONES_COLOR = new Color(131, 38, 38);
|
||||
|
||||
protected static final Color RULINGS_COLOR = new Color(21, 221, 174);
|
||||
@ -59,6 +60,7 @@ public class LayoutDebugLayerConfig extends AbstractLayerGroup {
|
||||
protected final Visualizations outlineObjects = Visualizations.builder().layer(LayerIdentifier.OUTLINE_OBJECTS).build();
|
||||
protected final Visualizations sentences = Visualizations.builder().layer(LayerIdentifier.SENTENCES).build();
|
||||
protected final Visualizations tocPages = Visualizations.builder().layer(LayerIdentifier.TOC_PAGES).build();
|
||||
protected final Visualizations tocBlocks = Visualizations.builder().layer(LayerIdentifier.TOC_BLOCKS).build();
|
||||
protected final Visualizations listIdentifiers = Visualizations.builder().layer(LayerIdentifier.LIST_IDENTIFIERS).build();
|
||||
|
||||
|
||||
@ -77,9 +79,9 @@ public class LayoutDebugLayerConfig extends AbstractLayerGroup {
|
||||
markedContent, //
|
||||
outlineObjects, //
|
||||
tocPages, //
|
||||
tocBlocks, //
|
||||
listIdentifiers //
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user