RED-10127: add list classification

This commit is contained in:
Kilian Schüttler 2024-10-10 10:50:10 +02:00
parent 4b0c041d84
commit 7b073eb4f3
22 changed files with 482 additions and 128 deletions

View File

@ -374,14 +374,7 @@ public class LayoutParsingPipeline {
classificationService.classify(classificationDocument, layoutParsingType, identifier);
List<TextPageBlock> headlines = classificationDocument.getPages()
.stream()
.flatMap(classificationPage -> classificationPage.getTextBlocks()
.stream()
.filter(tb -> tb instanceof TextPageBlock && tb.getClassification() != null && tb.getClassification().isHeadline())
.map(tb -> (TextPageBlock) tb))
.toList();
TableOfContents tableOfContents = outlineValidationService.createToC(headlines);
TableOfContents tableOfContents = outlineValidationService.createToC(classificationDocument);
classificationDocument.setTableOfContents(tableOfContents);
log.info("Building Sections for {}", identifier);

View File

@ -165,4 +165,16 @@ public abstract class TextBoundingBox extends BoundingBox {
return this.getYDirAdj() - threshold <= other.getMaxYDirAdj() && this.getMaxYDirAdj() + threshold >= other.getYDirAdj();
}
/**
 * Checks whether this box lies above {@code other}, using direction-adjusted coordinates.
 * <p>
 * Defined as the mirror image of {@link #isBelowDirAdj(TextBoundingBox)}: this box is above
 * {@code other} exactly when {@code other} is below this box. Delegating to the DirAdj variant
 * keeps both checks in the same coordinate system.
 *
 * @param other the box to compare against
 * @return {@code true} if {@code other} is below this box in direction-adjusted coordinates
 */
public boolean isAboveDirAdj(TextBoundingBox other) {
    return other.isBelowDirAdj(this);
}
/**
 * Checks whether this box lies below {@code other}, using direction-adjusted coordinates.
 * <p>
 * Both conditions must hold: the boxes intersect on the direction-adjusted x axis, and this
 * box's {@code getYDirAdj()} is not smaller than the other box's {@code getMaxYDirAdj()}.
 *
 * @param other the box to compare against
 * @return {@code true} if both conditions are met
 */
public boolean isBelowDirAdj(TextBoundingBox other) {
    if (!this.intersectsXDirAdj(other)) {
        return false;
    }
    return other.getMaxYDirAdj() <= this.getYDirAdj();
}
}

View File

@ -9,13 +9,13 @@ public enum PageBlockType {
H6,
HEADER,
FOOTER,
TITLE,
PARAGRAPH,
PARAGRAPH_BOLD,
PARAGRAPH_ITALIC,
PARAGRAPH_UNKNOWN,
OTHER,
TABLE_OF_CONTENTS_ITEM,
LIST_ITEM,
TABLE;

View File

@ -10,6 +10,7 @@ import java.util.TreeSet;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import io.micrometer.observation.annotation.Observed;
@ -20,7 +21,9 @@ import lombok.extern.slf4j.Slf4j;
public class OutlineValidationService {
@Observed(name = "OutlineValidationService", contextualName = "create-toc")
public TableOfContents createToC(List<TextPageBlock> headlines) {
public TableOfContents createToC(ClassificationDocument classificationDocument) {
List<TextPageBlock> headlines = extractHeadlines(classificationDocument);
List<TableOfContentItem> mainSections = new ArrayList<>();
Map<Integer, TableOfContentItem> lastItemsPerDepth = new HashMap<>();
@ -60,4 +63,16 @@ public class OutlineValidationService {
return new TableOfContents(mainSections);
}
/**
 * Collects every text block of the document that is classified as a headline.
 * <p>
 * Pages are visited in the order returned by {@code getPages()} and blocks in the order
 * returned by {@code getTextBlocks()}. Blocks that are not {@link TextPageBlock}s or that
 * have no classification are ignored.
 *
 * @param classificationDocument the document whose pages are scanned
 * @return an unmodifiable list of the headline blocks in encounter order
 */
private static List<TextPageBlock> extractHeadlines(ClassificationDocument classificationDocument) {
    List<TextPageBlock> headlines = new ArrayList<>();
    for (var page : classificationDocument.getPages()) {
        for (var candidate : page.getTextBlocks()) {
            if (!(candidate instanceof TextPageBlock textBlock)) {
                continue;
            }
            var classification = textBlock.getClassification();
            if (classification != null && classification.isHeadline()) {
                headlines.add(textBlock);
            }
        }
    }
    return List.copyOf(headlines);
}
}

View File

@ -0,0 +1,8 @@
package com.knecon.fforesight.service.layoutparser.processor.model.text;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
/**
 * Pairs an {@link AbstractPageBlock} with a {@link ClassificationPage}, so that a flat list of
 * blocks spanning several pages can still tell which page each block is associated with.
 *
 * @param block the page block
 * @param page  the page paired with the block
 */
public record AbstractBlockOnPage(AbstractPageBlock block, ClassificationPage page) {
}

View File

@ -0,0 +1,85 @@
package com.knecon.fforesight.service.layoutparser.processor.model.text;
import java.util.List;
import java.util.Optional;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.experimental.FieldDefaults;
/**
 * A numeric list marker (for example {@code "3."}) found at the start of a run of words,
 * together with the word it was found at and the page number supplied by the caller.
 */
@AllArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class ListIdentifier {

    /**
     * Matches a leading list number followed by a dot and whitespace, e.g. {@code "1. "} or {@code "10. "}.
     * Group 1 holds the number. The number must not start with 0 and is limited to nine digits so
     * that it always fits into an {@code int}.
     */
    public static final Pattern STARTING_NUMBERS = Pattern.compile("^\\s*([1-9][0-9]{0,8})\\.\\s+");

    /** Number of leading sequences of a text block that are inspected by {@link #parse(TextPageBlock, int)}. */
    private static final int MAX_SEQUENCES_TO_INSPECT = 5;

    enum Format {
        NUMBERS
    }

    Format format;
    @Getter
    TextPositionSequence word;
    @Getter
    int page;
    int representation;


    /**
     * Tries to parse a list identifier from the first few sequences of a text block.
     *
     * @param textPageBlock the block whose leading sequences are inspected
     * @param page          the page value stored in the resulting identifier
     * @return the identifier, or empty if the block does not start with one
     */
    public static Optional<ListIdentifier> parse(TextPageBlock textPageBlock, int page) {

        List<TextPositionSequence> sequences = textPageBlock.getSequences();
        return parse(sequences.subList(0, Math.min(MAX_SEQUENCES_TO_INSPECT, sequences.size())), page);
    }


    /**
     * Tries to parse a list identifier from the given sequences. The sequences are joined with single
     * spaces and the result is matched against {@link #STARTING_NUMBERS}.
     *
     * @param sequences the words to inspect; the first one becomes {@link #getWord()} on a match
     * @param page      the page value stored in the resulting identifier
     * @return the identifier, or empty if {@code sequences} is {@code null}, empty, or has no leading list number
     */
    public static Optional<ListIdentifier> parse(List<TextPositionSequence> sequences, int page) {

        // Nothing to join and no first word to anchor the identifier to.
        if (sequences == null || sequences.isEmpty()) {
            return Optional.empty();
        }

        String text = sequences.stream()
                .map(TextPositionSequence::toString)
                .collect(Collectors.joining(" "));

        Matcher numberMatcher = STARTING_NUMBERS.matcher(text);
        if (!numberMatcher.find()) {
            return Optional.empty();
        }
        // Group 1 is at most nine digits, so parseInt cannot overflow.
        int number = Integer.parseInt(numberMatcher.group(1));
        return Optional.of(new ListIdentifier(Format.NUMBERS, sequences.get(0), page, number));
    }


    /**
     * Checks whether the identifiers form a plausible list when read in the given order.
     * <p>
     * Lists with at most one element are always in order. Otherwise every neighbouring pair must
     * satisfy all conditions checked by {@link #follows(ListIdentifier, ListIdentifier)}.
     *
     * @param listIdentifiers the identifiers in reading order
     * @return {@code true} if every identifier follows its predecessor
     */
    public static boolean isInOrder(List<ListIdentifier> listIdentifiers) {

        for (int i = 1; i < listIdentifiers.size(); i++) {
            if (!follows(listIdentifiers.get(i - 1), listIdentifiers.get(i))) {
                return false;
            }
        }
        return true;
    }


    /**
     * Checks whether {@code current} is a valid successor of {@code previous}: same format, strictly
     * larger number, x-intersecting words (with a tolerance argument of 2), not on an earlier page,
     * and, if both are on the same page, located below the previous word.
     */
    private static boolean follows(ListIdentifier previous, ListIdentifier current) {

        if (current.format != previous.format) {
            return false;
        }
        if (current.representation <= previous.representation) {
            return false;
        }
        if (!current.word.intersectsXDirAdj(previous.word, 2)) {
            return false;
        }
        if (current.page < previous.page) {
            return false;
        }
        // The vertical position is only comparable for words on the same page.
        return current.page != previous.page || current.word.isBelowDirAdj(previous.word);
    }

}

View File

@ -2,6 +2,6 @@ package com.knecon.fforesight.service.layoutparser.processor.model.text;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
public record TextBlockOnPage(ClassificationPage page, TextPageBlock textBlock) {
public record TextBlockOnPage(TextPageBlock textBlock, ClassificationPage page) {
}

View File

@ -33,8 +33,6 @@ public class TextPageBlock extends AbstractPageBlock {
private boolean underlined;
private double highestFontSize;
private PageBlockType classification;
private boolean toDuplicate;
@ -262,6 +260,12 @@ public class TextPageBlock extends AbstractPageBlock {
}
/**
 * Returns the highest value reported by the font size frequency counter of this block.
 *
 * @return the result of {@code getHighest()} on the font size frequency counter
 */
public double getHighestFontSize() {

    double highest = frequencyCounters.getFontSizeFrequencyCounter().getHighest();
    return highest;
}
@Override
public boolean isEmpty() {

View File

@ -18,6 +18,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.SectionIdentif
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities;
import lombok.Data;
@ -467,7 +468,9 @@ public class BlockificationPostprocessingService {
private static String sanitizeString(String text) {
return StringUtils.deleteWhitespace(text).toLowerCase(Locale.ROOT);
return TextNormalizationUtilities.removeAllWhitespaces(text)//
.trim() // sometimes there are trailing empty bytes at the end of the string trim() seems to remove them
.toLowerCase(Locale.ENGLISH);
}

View File

@ -10,7 +10,6 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
import lombok.RequiredArgsConstructor;
@ -73,7 +72,9 @@ public class ClarifyndClassificationService {
&& (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter().getMostPopular()) > 2.5
&& textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks().size() == 1)) {
if (!Pattern.matches("[0-9]+", textBlock.toString())) {
textBlock.setClassification(PageBlockType.TITLE);
PageBlockType headlineType = HeadlineClassificationService.headlineClassByFontSize(textBlock, headlineFontSizes);
headlineClassificationService.classifyHeadline(textBlock, headlineType);
document.setHeadlines(true);
}
} else if (textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular()
&& PositionUtils.getApproxLineCount(textBlock) < 4.9

View File

@ -0,0 +1,29 @@
package com.knecon.fforesight.service.layoutparser.processor.services.classification;
import java.util.regex.Pattern;
/**
 * Shared, precompiled regular expressions used by the classification services.
 * <p>
 * This is a constants holder only; it cannot be instantiated or extended.
 */
public final class ClassificationPatterns {

    /** Headline starting with a multi-level numeric identifier such as {@code "1.2 Title"} or {@code "10.3.1. Title"}. */
    public static final Pattern HEADLINE_WITH_2_IDENTIFER_PATTERN = Pattern.compile("^([1-9]\\d?\\.){1,3}\\d{1,2}\\.?\\s[a-z][0-9a-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE);

    /** Headline starting with a single digit followed by a dot, such as {@code "1. Title"}. */
    public static final Pattern HEADLINE_WITH_SINGLE_IDENTIFER_PATTERN = Pattern.compile("^([0-9]\\.)\\s[a-z][0-9a-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE);

    /** At least three consecutive letters anywhere in the text. */
    public static final Pattern AT_LEAST_3_CHARS_PATTERN = Pattern.compile("\\p{L}{3,}", Pattern.CASE_INSENSITIVE);

    /** Numbered text whose first word is a short lower-case "x/y" token, such as {@code "1.2 a/b ..."}; case-sensitive. */
    public static final Pattern HEADLINE_PATTERN_WITH_SLASHES = Pattern.compile("^(\\d{1,1}\\.){1,3}\\d{1,2}\\.?\\s[a-z]{1,2}\\/[a-z]{1,2}.*");

    /** Text starting with a number followed by a unit, such as {@code "5 mg"} or {@code "2.5 weeks"}. */
    public static final Pattern AMOUNT_PATTERN = Pattern.compile(
            "^\\s*\\d+(?:\\.\\d+)?\\s*(?:ml|l|g|kg|mg|cm|cm2|cm3|mm|mm2|mm3|km|km2|m|m2|m3|lb|oz|ppm|dpm|days|weeks|months|%|f)\\b",
            Pattern.CASE_INSENSITIVE);

    /** Text starting with a table, appendix or figure caption label, such as {@code "Table 1.2"} or {@code "Figure iv"}. */
    public static final Pattern TABLE_OR_FIGURE_PATTERN = Pattern.compile(
            "^\\s*(?:table|continued\\s+table|appendix|figure)\\s+(?:[xvi]+|[a-z0-9]{1,3}(?:\\.[0-9]{1,3})*(?:-[0-9]{1,3})?)\\b",
            Pattern.CASE_INSENSITIVE);

    /** A single ASCII letter or digit. */
    public static final Pattern ALPHANUMERIC = Pattern.compile("[a-zA-Z0-9]");

    /** One or more ASCII digits. */
    public static final Pattern NUMERIC = Pattern.compile("[0-9]+");


    private ClassificationPatterns() {
        // constants holder, not meant to be instantiated
    }

}

View File

@ -1,9 +1,19 @@
package com.knecon.fforesight.service.layoutparser.processor.services.classification;
import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.ALPHANUMERIC;
import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.AMOUNT_PATTERN;
import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.AT_LEAST_3_CHARS_PATTERN;
import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.HEADLINE_PATTERN_WITH_SLASHES;
import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.HEADLINE_WITH_2_IDENTIFER_PATTERN;
import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.HEADLINE_WITH_SINGLE_IDENTIFER_PATTERN;
import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.TABLE_OR_FIGURE_PATTERN;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@ -13,90 +23,60 @@ import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBl
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.model.text.AbstractBlockOnPage;
import com.knecon.fforesight.service.layoutparser.processor.model.text.ListIdentifier;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.utils.HeaderFooterDetection;
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
import lombok.AccessLevel;
import lombok.RequiredArgsConstructor;
import lombok.experimental.FieldDefaults;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@Service
@RequiredArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class DocuMineClassificationService {
private static final Pattern HEADLINE_WITH_2_IDENTIFER_PATTERN = Pattern.compile("^([1-9]\\d?\\.){1,3}\\d{1,2}\\.?\\s[a-z][0-9a-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE);
private static final Pattern HEADLINE_WITH_SINGLE_IDENTIFER_PATTERN = Pattern.compile("^([0-9]\\.)\\s[a-z][0-9a-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE);
private static final Pattern AT_LEAST_3_CHARS_PATTERN = Pattern.compile("\\p{L}{3,}", Pattern.CASE_INSENSITIVE);
private static final Pattern HEADLINE_PATTERN_WITH_SLASHES = Pattern.compile("^(\\d{1,1}\\.){1,3}\\d{1,2}\\.?\\s[a-z]{1,2}\\/[a-z]{1,2}.*");
public static final Pattern AMOUNT_PATTERN = Pattern.compile("^\\s*\\d+(?:\\.\\d+)?\\s*(?:ml|l|g|kg|mg|cm|cm2|cm3|mm|mm2|mm3|km|km2|m|m2|m3|lb|oz|ppm|dpm|days|weeks|months|%|f)\\b",
Pattern.CASE_INSENSITIVE);
private static final Pattern TABLE_OR_FIGURE_PATTERN = Pattern.compile(
"^\\s*(?:table|continued\\s+table|appendix|figure)\\s+(?:[xvi]+|[a-z0-9]{1,3}(?:\\.[0-9]{1,3})*(?:-[0-9]{1,3})?)\\b",
Pattern.CASE_INSENSITIVE);
private static final Pattern ALPHANUMERIC = Pattern.compile("[a-zA-Z0-9]");
public static final int SEPARATION_THRESHOLD = 10; // if the min distance between a textblock and all its surrounding blocks, the regexes can be more lenient.
public static final int SURROUNDING_BLOCKS_RADIUS = 3; // number of surrounding blocks before and after the current textblock to be tested
ListItemClassificationService listItemClassificationService;
public void classifyDocument(ClassificationDocument document) {
List<Double> headlineFontSizes = document.getFontSizeCounter().getValuesInReverseOrder();
log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());
List<Double> headlineFontSizes = buildHeadlineFontSizes(document);
List<AbstractBlockOnPage> blocks = buildBlocksPerPage(document);
log.debug("Headline FontSizes are: {}", headlineFontSizes);
HeadlineClassificationService headlineClassificationService = new HeadlineClassificationService();
for (ClassificationPage page : document.getPages()) {
document.getLayoutDebugLayer().addTextBlockVisualizations(page.getTextBlocks(), page.getPageNumber());
classifyPage(headlineClassificationService, page, document, headlineFontSizes);
for (int i = 0; i < blocks.size(); i++) {
AbstractBlockOnPage block = blocks.get(i);
document.getLayoutDebugLayer().addTextBlockVisualizations(block.page().getTextBlocks(), block.page().getPageNumber());
classifyBlock(headlineClassificationService, i, blocks, document, headlineFontSizes);
}
}
private void classifyPage(HeadlineClassificationService headlineClassificationService,
ClassificationPage page,
ClassificationDocument document,
List<Double> headlineFontSizes) {
List<AbstractPageBlock> textBlocks = page.getTextBlocks();
for (int i = 0; i < textBlocks.size(); i++) {
AbstractPageBlock textBlock = textBlocks.get(i);
if (textBlock instanceof TextPageBlock) {
List<AbstractPageBlock> surroundingBlocks = getSurroundingBlocks(i, textBlocks);
classifyBlock(headlineClassificationService, (TextPageBlock) textBlock, surroundingBlocks, page, document, headlineFontSizes);
}
}
}
private List<AbstractPageBlock> getSurroundingBlocks(int originalIndex, List<AbstractPageBlock> textBlocks) {
int start = Math.max(originalIndex - SURROUNDING_BLOCKS_RADIUS, 0);
int end = Math.min(originalIndex + SURROUNDING_BLOCKS_RADIUS, textBlocks.size());
List<AbstractPageBlock> surroundingBlocks = new ArrayList<>(2 * SURROUNDING_BLOCKS_RADIUS);
for (int i = start; i < end; i++) {
if (i == originalIndex) {
continue;
}
if (textBlocks.get(i).getText().length() <= 1) {
continue;
}
surroundingBlocks.add(textBlocks.get(i));
}
return surroundingBlocks;
}
private void classifyBlock(HeadlineClassificationService headlineClassificationService,
TextPageBlock textBlock,
List<AbstractPageBlock> surroundingBlocks,
ClassificationPage page,
int currentIndex,
List<AbstractBlockOnPage> allBlocks,
ClassificationDocument document,
List<Double> headlineFontSizes) {
TextPageBlock textBlock;
if (allBlocks.get(currentIndex).block() instanceof TextPageBlock block) {
textBlock = block;
} else {
return;
}
ClassificationPage page = allBlocks.get(currentIndex).page();
List<AbstractPageBlock> surroundingBlocks = getSurroundingBlocksOnPage(currentIndex, allBlocks);
log.debug("headlineFontSizes: {}", headlineFontSizes);
var bodyTextFrame = page.getBodyTextFrame();
@ -113,6 +93,9 @@ public class DocuMineClassificationService {
boolean enoughChars = charCount > textBlock.getText().length() * 0.5;
List<ListIdentifier> listIdentifiers = listItemClassificationService.findConfirmedListIdentifiers(currentIndex, allBlocks);
document.getLayoutDebugLayer().addListIdentifiers(listIdentifiers);
if (textBlock.getClassification() != null && textBlock.getClassification().isHeadline()) {
headlineClassificationService.setLastHeadlineFromOutline(textBlock);
return;
@ -126,26 +109,21 @@ public class DocuMineClassificationService {
textBlock.setClassification(PageBlockType.PARAGRAPH);
return;
}
if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter().getMostPopular()) > 2.5
&& textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks().size() == 1)) {
if (!Pattern.matches("[0-9]+", textBlock.toString())) {
textBlock.setClassification(PageBlockType.TITLE);
}
} else if (textBlock.getText().length() > 5
&& greaterOrEqualFontThanDocumentAverage(textBlock, document)
&& PositionUtils.getApproxLineCount(textBlock) < 5.9
&& ((textBlock.getMostPopularWordStyle().contains("bold") || textBlock.isUnderlined())//
&& Character.isDigit(textBlock.toString().charAt(0)) //
&& isAtLeast3Characters //
&& !textBlock.toString().contains(":") //
|| textBlock.toString().startsWith("APPENDIX") //
|| textBlock.toString().startsWith("FIGURE") //
|| textBlock.toString().startsWith("Continued TABLE") //
|| textBlock.toString().startsWith("TABLE"))
&& !textBlock.toString().endsWith(":")
&& isAtLeast3Characters
&& !isAmount
&& enoughChars) {
if (textBlock.getText().length() > 5
&& greaterOrEqualFontThanDocumentAverage(textBlock, document)
&& PositionUtils.getApproxLineCount(textBlock) < 5.9
&& ((textBlock.getMostPopularWordStyle().contains("bold") || textBlock.isUnderlined())//
&& Character.isDigit(textBlock.toString().charAt(0)) //
&& isAtLeast3Characters //
&& !textBlock.toString().contains(":") //
|| textBlock.toString().startsWith("APPENDIX") //
|| textBlock.toString().startsWith("FIGURE") //
|| textBlock.toString().startsWith("Continued TABLE") //
|| textBlock.toString().startsWith("TABLE"))
&& !textBlock.toString().endsWith(":")
&& isAtLeast3Characters
&& !isAmount
&& enoughChars) {
setAsHeadline(headlineClassificationService, textBlock, document, headlineFontSizes);
} else if (isAllCaps(textBlock)
@ -170,11 +148,14 @@ public class DocuMineClassificationService {
} else if (hasSeparation(textBlock, surroundingBlocks)//
&& greaterOrEqualFontThanPageAverage(textBlock, page)//
&& PositionUtils.getApproxLineCount(textBlock) < 2.9//
&& (tableOrFigureMatcher.reset().find() || headlineWithSingleIdentifierMatcher.reset().find())//
&& (tableOrFigureMatcher.reset().find() || (headlineWithSingleIdentifierMatcher.reset().find() && listIdentifiers.isEmpty())) //
&& !isAmount//
&& !headlineWithSlashesMatches) {
setAsHeadline(headlineClassificationService, textBlock, document, headlineFontSizes);
} else if (!listIdentifiers.isEmpty()) {
textBlock.setClassification(PageBlockType.LIST_ITEM);
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)
&& textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()
&& textBlock.getMostPopularWordStyle().equals("bold")
@ -264,6 +245,92 @@ public class DocuMineClassificationService {
document.setHeadlines(true);
}
/**
 * Flattens the text blocks of all pages into one list, keeping a reference to the page of each block.
 * <p>
 * Only {@link TextPageBlock}s are kept; blocks already classified as {@code HEADER} or
 * {@code FOOTER} are left out. Blocks without a classification are kept.
 *
 * @param document the document whose pages are flattened
 * @return a new mutable list of block/page pairs in page order, then block order
 */
private List<AbstractBlockOnPage> buildBlocksPerPage(ClassificationDocument document) {

    List<AbstractBlockOnPage> collected = new ArrayList<>();
    for (ClassificationPage currentPage : document.getPages()) {
        for (AbstractPageBlock candidate : currentPage.getTextBlocks()) {
            if (!(candidate instanceof TextPageBlock textBlock)) {
                continue;
            }
            PageBlockType type = textBlock.getClassification();
            boolean headerOrFooter = type == PageBlockType.HEADER || type == PageBlockType.FOOTER;
            if (!headerOrFooter) {
                collected.add(new AbstractBlockOnPage(textBlock, currentPage));
            }
        }
    }
    return collected;
}
/**
 * Derives at most six representative headline font sizes from the document's font size counter.
 * <p>
 * If the counter holds six or fewer distinct sizes, its values are returned in reverse order
 * unchanged. Otherwise the sizes are sorted ascending and the smallest ones are dropped while
 * their cumulative count does not exceed 30% of the total count. If fewer than six sizes remain,
 * the unfiltered reverse-ordered values are returned. Otherwise the remaining sizes are split into
 * six consecutive clusters (the last one takes the remainder) and the mean of each cluster is
 * returned, largest first.
 *
 * @param document the document providing the font size counter
 * @return font sizes in descending order
 */
private static List<Double> buildHeadlineFontSizes(ClassificationDocument document) {

    Map<Double, Integer> countsBySize = document.getFontSizeCounter().getCountPerValue();
    if (countsBySize.size() <= 6) {
        return document.getFontSizeCounter().getValuesInReverseOrder();
    }

    List<Map.Entry<Double, Integer>> ascending = new ArrayList<>(countsBySize.entrySet());
    ascending.sort(Map.Entry.comparingByKey());

    int total = 0;
    for (Map.Entry<Double, Integer> entry : ascending) {
        total += entry.getValue();
    }

    // Index of the first entry whose cumulative count exceeds 30% of the total; everything before it is dropped.
    int firstKept = 0;
    int running = 0;
    while (firstKept < ascending.size()) {
        running += ascending.get(firstKept).getValue();
        if (running > total * 0.3) {
            break;
        }
        firstKept++;
    }
    List<Map.Entry<Double, Integer>> kept = ascending.subList(firstKept, ascending.size());

    if (kept.size() < 6) {
        return document.getFontSizeCounter().getValuesInReverseOrder();
    }

    int clusterSize = Math.max(1, kept.size() / 6);
    List<Double> clusterMeans = new ArrayList<>(6);
    for (int cluster = 0; cluster < 6; cluster++) {
        int from = cluster * clusterSize;
        // The last cluster absorbs whatever does not divide evenly.
        int to = cluster == 5 ? kept.size() : from + clusterSize;
        clusterMeans.add(kept.subList(from, to)
                                 .stream()
                                 .mapToDouble(Map.Entry::getKey).average()
                                 .orElseThrow());
    }

    return clusterMeans.stream()
            .sorted(Comparator.reverseOrder())
            .toList();
}
/**
 * Collects the neighbours of the block at {@code originalIndex} that are on the same page.
 * <p>
 * Up to {@code SURROUNDING_BLOCKS_RADIUS} blocks before and the same number after the block are
 * considered. The block itself, blocks whose text has at most one character, and blocks on a
 * different page are left out.
 *
 * @param originalIndex index of the block whose neighbours are requested
 * @param textBlocks    all blocks of the document in reading order
 * @return the neighbouring blocks in list order, possibly empty
 */
private List<AbstractPageBlock> getSurroundingBlocksOnPage(int originalIndex, List<AbstractBlockOnPage> textBlocks) {

    int start = Math.max(originalIndex - SURROUNDING_BLOCKS_RADIUS, 0);
    // 'end' is exclusive, so +1 is required to look at RADIUS blocks after the current one as well.
    int end = Math.min(originalIndex + SURROUNDING_BLOCKS_RADIUS + 1, textBlocks.size());
    ClassificationPage currentPage = textBlocks.get(originalIndex).page();

    List<AbstractPageBlock> surroundingBlocks = new ArrayList<>(2 * SURROUNDING_BLOCKS_RADIUS);
    for (int i = start; i < end; i++) {
        if (i == originalIndex) {
            continue;
        }
        AbstractBlockOnPage candidate = textBlocks.get(i);
        if (candidate.block().getText().length() <= 1) {
            continue;
        }
        if (!candidate.page().equals(currentPage)) {
            continue;
        }
        surroundingBlocks.add(candidate.block());
    }
    return surroundingBlocks;
}
}

View File

@ -73,13 +73,18 @@ public class HeadlineClassificationService {
public static PageBlockType headlineClassByFontSize(TextPageBlock textBlock, List<Double> fontSizeGroups) {
PageBlockType headlineType = PageBlockType.H1;
for (int i = 1; i <= fontSizeGroups.size(); i++) {
if (textBlock.getMostPopularWordFontSize() == fontSizeGroups.get(i - 1)) {
headlineType = PageBlockType.getHeadlineType(i);
List<Double> distances = fontSizeGroups.stream()
.map(fontSize -> Math.abs(fontSize - textBlock.getMostPopularWordFontSize()))
.toList();
double min = Double.MAX_VALUE;
int argMin = -1;
for (int i = 0; i < distances.size(); i++) {
if (distances.get(i) < min) {
min = distances.get(i);
argMin = i;
}
}
return headlineType;
return PageBlockType.getHeadlineType(argMin);
}
}

View File

@ -0,0 +1,99 @@
package com.knecon.fforesight.service.layoutparser.processor.services.classification;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.model.text.AbstractBlockOnPage;
import com.knecon.fforesight.service.layoutparser.processor.model.text.ListIdentifier;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
/**
 * Finds numeric list identifiers in text blocks and confirms them against neighbouring blocks.
 */
@Service
public class ListItemClassificationService {

    /** Number of blocks before and after the current block that are searched for a matching list identifier. */
    public static final int LIST_IDENTIFIER_SEARCH_RADIUS = 3;

    /** Number of words at the start of a line that are handed to the identifier parser. */
    private static final int WORDS_PER_LINE_START = 3;


    /**
     * Returns the list identifiers of the block at {@code currentIndex} if they can be confirmed as
     * part of an ordered list.
     * <p>
     * Identifiers are confirmed when the block itself contains more than one identifier and they are
     * in order, or when they are in order together with the closest identifier found in the
     * preceding blocks, or with the closest identifier found in the following blocks. In the latter
     * two cases the neighbouring identifier is part of the returned list.
     *
     * @param currentIndex index of the block to inspect
     * @param allBlocks    all blocks of the document in reading order
     * @return the confirmed identifiers, or an empty list if nothing could be confirmed
     */
    public List<ListIdentifier> findConfirmedListIdentifiers(int currentIndex, List<AbstractBlockOnPage> allBlocks) {

        List<ListIdentifier> listIdentifiers = extractListIdentifiers(allBlocks.get(currentIndex));
        if (listIdentifiers.isEmpty()) {
            return Collections.emptyList();
        }
        if (listIdentifiers.size() > 1 && ListIdentifier.isInOrder(listIdentifiers)) {
            return listIdentifiers;
        }

        int start = Math.max(0, currentIndex - LIST_IDENTIFIER_SEARCH_RADIUS);
        // 'end' is exclusive, so +1 is required to search RADIUS blocks ahead, matching the blocks behind.
        int end = Math.min(allBlocks.size(), currentIndex + LIST_IDENTIFIER_SEARCH_RADIUS + 1);

        List<ListIdentifier> identifiersBehind = extractListIdentifiers(allBlocks.subList(start, currentIndex));
        if (!identifiersBehind.isEmpty()) {
            // Try the closest preceding identifier as predecessor.
            listIdentifiers.add(0, identifiersBehind.get(identifiersBehind.size() - 1));
            if (ListIdentifier.isInOrder(listIdentifiers)) {
                return listIdentifiers;
            }
            listIdentifiers.remove(0);
        }

        List<ListIdentifier> identifiersAhead = extractListIdentifiers(allBlocks.subList(currentIndex + 1, end));
        if (!identifiersAhead.isEmpty()) {
            // Try the closest following identifier as successor.
            listIdentifiers.add(identifiersAhead.get(0));
            if (ListIdentifier.isInOrder(listIdentifiers)) {
                return listIdentifiers;
            }
            listIdentifiers.remove(listIdentifiers.size() - 1);
        }

        return Collections.emptyList();
    }


    /** Extracts the list identifiers of all given blocks, in block order. */
    private List<ListIdentifier> extractListIdentifiers(List<AbstractBlockOnPage> blocks) {

        List<ListIdentifier> result = new ArrayList<>();
        for (AbstractBlockOnPage block : blocks) {
            result.addAll(extractListIdentifiers(block));
        }
        return result;
    }


    /**
     * Extracts a list identifier for every line of the block that starts with one.
     * <p>
     * A word is treated as the start of a line if it is the first word of the block or if it does
     * not start to the right of its predecessor. For each line start, the first few words that
     * intersect it vertically are handed to {@link ListIdentifier#parse(List, int)}.
     *
     * @param block the block to inspect; blocks that are not {@link TextPageBlock}s yield no identifiers
     * @return a new mutable list of the identifiers found, in reading order
     */
    private List<ListIdentifier> extractListIdentifiers(AbstractBlockOnPage block) {

        List<ListIdentifier> result = new LinkedList<>();
        if (!(block.block() instanceof TextPageBlock textBlock)) {
            return result;
        }

        List<TextPositionSequence> sequences = textBlock.getSequences();
        for (int i = 0; i < sequences.size(); i++) {
            if (i != 0 && sequences.get(i - 1).getXDirAdj() < sequences.get(i).getXDirAdj()) {
                // is not the start of a line, continue
                continue;
            }

            TextPositionSequence sequence = sequences.get(i);
            List<TextPositionSequence> wordsAtStartOfLine = new ArrayList<>(WORDS_PER_LINE_START);
            int end = Math.min(sequences.size(), i + WORDS_PER_LINE_START);
            for (int j = i; j < end; j++) {
                if (!sequences.get(j).intersectsYDirAdj(sequence, 2)) {
                    // the next word is already on another line
                    break;
                }
                wordsAtStartOfLine.add(sequences.get(j));
            }

            ListIdentifier.parse(wordsAtStartOfLine, block.page().getPageNumber()).ifPresent(result::add);
        }
        return result;
    }

}

View File

@ -86,7 +86,9 @@ public class RedactManagerClassificationService {
&& textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks().size() == 1)) {
if (!Pattern.matches("[0-9]+", textBlock.toString())) {
textBlock.setClassification(PageBlockType.TITLE);
PageBlockType headlineType = HeadlineClassificationService.headlineClassByFontSize(textBlock, headlineFontSizes);
headlineClassificationService.classifyHeadline(textBlock, headlineType);
document.setHeadlines(true);
}
} else if (textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular()
&& PositionUtils.getApproxLineCount(textBlock) < 4.9

View File

@ -1,6 +1,7 @@
package com.knecon.fforesight.service.layoutparser.processor.services.classification;
import static com.knecon.fforesight.service.layoutparser.processor.services.classification.DocuMineClassificationService.AMOUNT_PATTERN;
import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.AMOUNT_PATTERN;
import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.NUMERIC;
import java.util.ArrayList;
import java.util.Collection;
@ -13,7 +14,6 @@ import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import org.springframework.stereotype.Service;
@ -42,8 +42,6 @@ public class TableOfContentsClassificationService {
public static final int INTERSECTION_TOLERANCE = 2; // equality threshold for x intersection
public static final int DENSITY_THRESHOLD_COUNT = 10; // describes the minimum density, at least this many entries per page height are required
private static final Pattern NUMERIC = Pattern.compile("[0-9]+");
@SuppressWarnings("checkstyle:ModifiedControlVariable")
public void classifyTableOfContents(ClassificationDocument document) {
@ -57,11 +55,13 @@ public class TableOfContentsClassificationService {
continue;
}
int offset = identifyTOCItems(i + 1, textBlocks, document);
int end = identifyTOCItems(i + 1, textBlocks, document);
if (offset > 1) {
textBlock.textBlock().setClassification(PageBlockType.H1);
i += offset;
if (end > i + 1) {
if (textBlock.textBlock().getClassification() == null) {
textBlock.textBlock().setClassification(PageBlockType.H1);
}
i = end;
}
}
}
@ -237,7 +237,7 @@ public class TableOfContentsClassificationService {
|| textBlock.getClassification().equals(PageBlockType.FOOTER))) {
continue;
}
blocks.add(new TextBlockOnPage(page, textBlock));
blocks.add(new TextBlockOnPage(textBlock, page));
}
}
}
@ -352,7 +352,7 @@ public class TableOfContentsClassificationService {
return false;
}
int prev = getNumberAsInt(numbers, i);
int prev = getNumberAsInt(numbers, i - 1);
int curr = getNumberAsInt(numbers, i);
int next = getNumberAsInt(numbers, i + 1);

View File

@ -5,6 +5,7 @@ import java.awt.geom.Point2D;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import org.apache.pdfbox.contentstream.operator.Operator;
import org.apache.pdfbox.contentstream.operator.OperatorName;
@ -50,6 +51,7 @@ import lombok.extern.slf4j.Slf4j;
@Slf4j
public class PDFLinesTextStripper extends PDFTextStripper {
private final static Set<String> DOT_LIKE_CHARACTERS = Set.of(".", "·", "", "", "", "", "", "", "", "", "", "", "", "");
private final List<TextPositionSequence> textPositionSequences = new ArrayList<>();
private final List<Ruling> rulings = new ArrayList<>();
private final List<Ruling> graphicsPath = new ArrayList<>();
@ -336,20 +338,32 @@ public class PDFLinesTextStripper extends PDFTextStripper {
private boolean isWordFollowedByDottedLine(List<TextPosition> textPositions, int i, int startIndex) {
return i - startIndex >= 4 //
&& textPositions.get(i).getUnicode().equals(".") //
&& textPositions.get(i - 1).getUnicode().equals(".") //
&& textPositions.get(i - 2).getUnicode().equals(".") //
&& !textPositions.get(i - 3).getUnicode().equals(".");
&& isDot(textPositions, i) //
&& isDot(textPositions, i - 1) //
&& isDot(textPositions, i - 2) //
&& alphanumeric(textPositions, i - 3);
}
private static boolean isDottedLineFollowedByWord(List<TextPosition> textPositions, int i, int startIndex) {
return i - startIndex >= 4 //
&& !textPositions.get(i).getUnicode().equals(".") //
&& textPositions.get(i - 1).getUnicode().equals(".") //
&& textPositions.get(i - 2).getUnicode().equals(".") //
&& textPositions.get(i - 3).getUnicode().equals(".");
&& alphanumeric(textPositions, i) //
&& isDot(textPositions, i - 1) //
&& isDot(textPositions, i - 2) //
&& isDot(textPositions, i - 3);
}
/**
 * Checks whether the text position at index {@code i} is one of the {@code DOT_LIKE_CHARACTERS}.
 *
 * @param textPositions the text positions to look into
 * @param i             index of the position to test
 * @return {@code true} if the unicode string of the position is contained in {@code DOT_LIKE_CHARACTERS}
 */
private static boolean isDot(List<TextPosition> textPositions, int i) {

    String unicode = textPositions.get(i).getUnicode();
    return DOT_LIKE_CHARACTERS.contains(unicode);
}
/**
 * Checks whether the text position at index {@code i} starts with a letter or a digit.
 * <p>
 * Only the first code point of the position's unicode string is inspected. A missing or empty
 * unicode string is treated as not alphanumeric instead of causing an exception.
 *
 * @param textPositions the text positions to look into
 * @param i             index of the position to test
 * @return {@code true} if the first code point is alphabetic or a digit
 */
private static boolean alphanumeric(List<TextPosition> textPositions, int i) {

    String unicode = textPositions.get(i).getUnicode();
    if (unicode == null || unicode.isEmpty()) {
        return false;
    }
    // Use the code point rather than the first char so characters outside the BMP are classified correctly.
    int firstCodePoint = unicode.codePointAt(0);
    return Character.isAlphabetic(firstCodePoint) || Character.isDigit(firstCodePoint);
}

View File

@ -26,6 +26,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.textbloc
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.model.text.ListIdentifier;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms;
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
@ -80,7 +81,7 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
visualizationsOnPage.getColoredLines()
.addAll(cleanRulings.buildAll()
.stream()
.map(ruling -> new ColoredLine(ruling, decideOnRulingColor(ruling), 0.5f))
.map(ruling -> new ColoredLine(ruling, decideOnRulingColor(ruling), LINE_WIDTH))
.toList());
}
@ -93,7 +94,7 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(pageNumber, this.rulings);
visualizationsOnPage.getColoredLines()
.addAll(rulings.stream()
.map(ruling -> new ColoredLine(ruling, decideOnRulingColor(ruling), 0.5f))
.map(ruling -> new ColoredLine(ruling, decideOnRulingColor(ruling), LINE_WIDTH))
.toList());
}
@ -182,7 +183,7 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
visualizationsOnPage.getColoredRectangles()
.addAll(lines.stream()
.map(BoundingBox::getBBoxPdf)
.map(line -> new ColoredRectangle(line, LINES_COLOR, 0.5f))
.map(line -> new ColoredRectangle(line, LINES_COLOR, LINE_WIDTH))
.toList());
}
@ -198,7 +199,7 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
.map(line -> line.stream()
.map(BoundingBox::getBBoxPdf)
.collect(RectangleTransformations.collectBBox()))
.map(line -> new ColoredRectangle(line, LINES_COLOR, 0.5f))
.map(line -> new ColoredRectangle(line, LINES_COLOR, LINE_WIDTH))
.toList());
}
@ -300,12 +301,12 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
visualizationsOnPage.getColoredRectangles()
.addAll(numbers.stream()
.map(BoundingBox::getBBoxPdf)
.map(line -> new ColoredRectangle(line, LINES_COLOR, 0.5f))
.map(line -> new ColoredRectangle(line, LINES_COLOR, LINE_WIDTH))
.toList());
visualizationsOnPage.getColoredRectangles()
.add(new ColoredRectangle(numbers.stream()
.map(BoundingBox::getBBoxPdf)
.collect(RectangleTransformations.collectBBox()), LINES_COLOR, 0.5f));
.collect(RectangleTransformations.collectBBox()), LINES_COLOR, LINE_WIDTH));
}
@ -351,4 +352,13 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
visualizationsOnPage.getPlacedTexts().add(PlacedText.textFacingUp(outlineObject.getTitle(), textPoint, 10, outlineObject.isFound() ? Color.BLACK : Color.RED, FONT));
}
/**
 * Adds one rectangle per list identifier to the list-identifier visualizations.
 * The rectangle is the PDF bounding box of the identifier's word and is placed on the page
 * the identifier reports, drawn in WORDS_COLOR with LINE_WIDTH.
 *
 * @param listIdentifiers the list identifiers to visualize
 */
public void addListIdentifiers(List<ListIdentifier> listIdentifiers) {
    for (ListIdentifier identifier : listIdentifiers) {
        VisualizationsOnPage pageVisualizations = getOrCreateVisualizationsOnPage(identifier.getPage(), this.listIdentifiers);
        pageVisualizations.getColoredRectangles()
                .add(new ColoredRectangle(identifier.getWord().getBBoxPdf(), WORDS_COLOR, LINE_WIDTH));
    }
}
}

View File

@ -37,7 +37,7 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
@Disabled
public void testLayoutParserEndToEnd() {
String filePath = "/home/kschuettler/Dokumente/TestFiles/NER Dataset/Syngenta prod/77c680315c31d403d2e023be023b2087.PREVIEW.pdf";
String filePath = "/home/kschuettler/Dokumente/LayoutparsingEvaluation/VV-340050.pdf";
runForFile(filePath);
}

View File

@ -79,7 +79,7 @@ public class OutlineDetectionTest extends AbstractTest {
var documentFile = new ClassPathResource(fileName).getFile();
long start = System.currentTimeMillis();
ClassificationDocument classificationDocument = parseLayout(fileName, LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH);
ClassificationDocument classificationDocument = parseLayout(fileName, LayoutParsingType.DOCUMINE_OLD);
Document document = buildGraph(fileName, classificationDocument);
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true);
OutlineObjectTree outlineObjectTree = classificationDocument.getOutlineObjectTree();
@ -102,7 +102,7 @@ public class OutlineDetectionTest extends AbstractTest {
TableOfContents tableOfContents = classificationDocument.getTableOfContents();
assertEquals(tableOfContents.getMainSections().size(), 10);
assertEquals(tableOfContents.getMainSections().size(), 9);
assertEquals(tableOfContents.getMainSections().subList(1, 9)
.stream()
.map(tableOfContentItem -> sanitizeString(tableOfContentItem.getHeadline().toString()))
@ -135,7 +135,7 @@ public class OutlineDetectionTest extends AbstractTest {
List<SemanticNode> childrenOfTypeSectionOrSuperSection = document.getChildrenOfTypeSectionOrSuperSection();
assertEquals(childrenOfTypeSectionOrSuperSection.size(), 10);
assertEquals(childrenOfTypeSectionOrSuperSection.size(), 9);
assertEquals(childrenOfTypeSectionOrSuperSection.subList(1, 9)
.stream()
.map(section -> sanitizeString(section.getHeadline().getLeafTextBlock().toString()))

View File

@ -57,6 +57,7 @@ public record LayerIdentifier(String name, String markedContentName) {
public static final LayerIdentifier OUTLINE_OBJECTS = new LayerIdentifier("Outline Positions", "OUTLINE_OBJECTS");
public static final LayerIdentifier SENTENCES = new LayerIdentifier("Sentences", "SENTENCES");
public static final LayerIdentifier TOC_PAGES = new LayerIdentifier("TOC pages", "TOC_PAGES");
public static final LayerIdentifier LIST_IDENTIFIERS = new LayerIdentifier("List identifiers", "LIST_IDENTIFIERS");
// Visual layout parser
public static final LayerIdentifier KNECON_VISUAL_PARSING = new LayerIdentifier("Visual Layout Parser", "VISUAL_PARSING");

View File

@ -18,6 +18,8 @@ public class LayoutDebugLayerConfig extends AbstractLayerGroup {
protected static final Standard14EmbeddableFont FONT = Standard14EmbeddableFont.helvetica();
public static final float LINE_WIDTH = 0.5f;
protected static final Color WORDS_COLOR = new Color(68, 84, 147);
protected static final Color LINES_COLOR = new Color(152, 45, 179);
protected static final Color ZONES_COLOR = new Color(131, 38, 38);
@ -57,6 +59,7 @@ public class LayoutDebugLayerConfig extends AbstractLayerGroup {
protected final Visualizations outlineObjects = Visualizations.builder().layer(LayerIdentifier.OUTLINE_OBJECTS).build();
protected final Visualizations sentences = Visualizations.builder().layer(LayerIdentifier.SENTENCES).build();
protected final Visualizations tocPages = Visualizations.builder().layer(LayerIdentifier.TOC_PAGES).build();
protected final Visualizations listIdentifiers = Visualizations.builder().layer(LayerIdentifier.LIST_IDENTIFIERS).build();
public List<Visualizations> getVisualizations() {
@ -73,7 +76,10 @@ public class LayoutDebugLayerConfig extends AbstractLayerGroup {
mainBody, //
markedContent, //
outlineObjects, //
tocPages);
tocPages, //
listIdentifiers //
);
}
}