Merge branch 'feature/RED-10127-bp' into 'release/0.159.x'
RED-10127: add list classification. See merge request fforesight/layout-parser!238
This commit is contained in:
commit
d3c4413ece
@ -374,14 +374,7 @@ public class LayoutParsingPipeline {
|
|||||||
|
|
||||||
classificationService.classify(classificationDocument, layoutParsingType, identifier);
|
classificationService.classify(classificationDocument, layoutParsingType, identifier);
|
||||||
|
|
||||||
List<TextPageBlock> headlines = classificationDocument.getPages()
|
TableOfContents tableOfContents = outlineValidationService.createToC(classificationDocument);
|
||||||
.stream()
|
|
||||||
.flatMap(classificationPage -> classificationPage.getTextBlocks()
|
|
||||||
.stream()
|
|
||||||
.filter(tb -> tb instanceof TextPageBlock && tb.getClassification() != null && tb.getClassification().isHeadline())
|
|
||||||
.map(tb -> (TextPageBlock) tb))
|
|
||||||
.toList();
|
|
||||||
TableOfContents tableOfContents = outlineValidationService.createToC(headlines);
|
|
||||||
classificationDocument.setTableOfContents(tableOfContents);
|
classificationDocument.setTableOfContents(tableOfContents);
|
||||||
|
|
||||||
log.info("Building Sections for {}", identifier);
|
log.info("Building Sections for {}", identifier);
|
||||||
|
|||||||
@ -165,4 +165,16 @@ public abstract class TextBoundingBox extends BoundingBox {
|
|||||||
return this.getYDirAdj() - threshold <= other.getMaxYDirAdj() && this.getMaxYDirAdj() + threshold >= other.getYDirAdj();
|
return this.getYDirAdj() - threshold <= other.getMaxYDirAdj() && this.getMaxYDirAdj() + threshold >= other.getYDirAdj();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean isAboveDirAdj(TextBoundingBox other) {
|
||||||
|
|
||||||
|
return other.isBelow(this);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean isBelowDirAdj(TextBoundingBox other) {
|
||||||
|
|
||||||
|
return this.intersectsXDirAdj(other) && this.getYDirAdj() >= other.getMaxYDirAdj();
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -9,13 +9,13 @@ public enum PageBlockType {
|
|||||||
H6,
|
H6,
|
||||||
HEADER,
|
HEADER,
|
||||||
FOOTER,
|
FOOTER,
|
||||||
TITLE,
|
|
||||||
PARAGRAPH,
|
PARAGRAPH,
|
||||||
PARAGRAPH_BOLD,
|
PARAGRAPH_BOLD,
|
||||||
PARAGRAPH_ITALIC,
|
PARAGRAPH_ITALIC,
|
||||||
PARAGRAPH_UNKNOWN,
|
PARAGRAPH_UNKNOWN,
|
||||||
OTHER,
|
OTHER,
|
||||||
TABLE_OF_CONTENTS_ITEM,
|
TABLE_OF_CONTENTS_ITEM,
|
||||||
|
LIST_ITEM,
|
||||||
TABLE;
|
TABLE;
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -10,6 +10,7 @@ import java.util.TreeSet;
|
|||||||
|
|
||||||
import org.springframework.stereotype.Service;
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||||
|
|
||||||
import io.micrometer.observation.annotation.Observed;
|
import io.micrometer.observation.annotation.Observed;
|
||||||
@ -20,7 +21,9 @@ import lombok.extern.slf4j.Slf4j;
|
|||||||
public class OutlineValidationService {
|
public class OutlineValidationService {
|
||||||
|
|
||||||
@Observed(name = "OutlineValidationService", contextualName = "create-toc")
|
@Observed(name = "OutlineValidationService", contextualName = "create-toc")
|
||||||
public TableOfContents createToC(List<TextPageBlock> headlines) {
|
public TableOfContents createToC(ClassificationDocument classificationDocument) {
|
||||||
|
|
||||||
|
List<TextPageBlock> headlines = extractHeadlines(classificationDocument);
|
||||||
|
|
||||||
List<TableOfContentItem> mainSections = new ArrayList<>();
|
List<TableOfContentItem> mainSections = new ArrayList<>();
|
||||||
Map<Integer, TableOfContentItem> lastItemsPerDepth = new HashMap<>();
|
Map<Integer, TableOfContentItem> lastItemsPerDepth = new HashMap<>();
|
||||||
@ -60,4 +63,16 @@ public class OutlineValidationService {
|
|||||||
return new TableOfContents(mainSections);
|
return new TableOfContents(mainSections);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static List<TextPageBlock> extractHeadlines(ClassificationDocument classificationDocument) {
|
||||||
|
|
||||||
|
return classificationDocument.getPages()
|
||||||
|
.stream()
|
||||||
|
.flatMap(classificationPage -> classificationPage.getTextBlocks()
|
||||||
|
.stream()
|
||||||
|
.filter(tb -> tb instanceof TextPageBlock && tb.getClassification() != null && tb.getClassification().isHeadline())
|
||||||
|
.map(tb -> (TextPageBlock) tb))
|
||||||
|
.toList();
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -0,0 +1,8 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.model.text;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||||
|
|
||||||
|
public record AbstractBlockOnPage(AbstractPageBlock block, ClassificationPage page) {
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,85 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.model.text;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Optional;
|
||||||
|
import java.util.regex.Matcher;
|
||||||
|
import java.util.regex.Pattern;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import lombok.AccessLevel;
|
||||||
|
import lombok.AllArgsConstructor;
|
||||||
|
import lombok.Getter;
|
||||||
|
import lombok.experimental.FieldDefaults;
|
||||||
|
|
||||||
|
@AllArgsConstructor
|
||||||
|
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||||
|
public class ListIdentifier {
|
||||||
|
|
||||||
|
public static final Pattern STARTING_NUMBERS = Pattern.compile("^\\s*([1-9]+)\\.\\s+");
|
||||||
|
|
||||||
|
enum Format {
|
||||||
|
NUMBERS
|
||||||
|
}
|
||||||
|
|
||||||
|
Format format;
|
||||||
|
@Getter
|
||||||
|
TextPositionSequence word;
|
||||||
|
@Getter
|
||||||
|
int page;
|
||||||
|
int representation;
|
||||||
|
|
||||||
|
|
||||||
|
public static Optional<ListIdentifier> parse(TextPageBlock textPageBlock, int page) {
|
||||||
|
|
||||||
|
return parse(textPageBlock.getSequences().subList(0, Math.min(5, textPageBlock.getSequences().size())), page);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public static Optional<ListIdentifier> parse(List<TextPositionSequence> sequences, int page) {
|
||||||
|
|
||||||
|
StringBuilder sb = new StringBuilder();
|
||||||
|
for (TextPositionSequence sequence : sequences) {
|
||||||
|
sb.append(sequence.toString());
|
||||||
|
sb.append(" ");
|
||||||
|
}
|
||||||
|
sb.replace(sb.length() - 1, sb.length(), "");
|
||||||
|
String text = sb.toString();
|
||||||
|
|
||||||
|
Matcher numberMatcher = STARTING_NUMBERS.matcher(text);
|
||||||
|
|
||||||
|
if (numberMatcher.find()) {
|
||||||
|
return Optional.of(new ListIdentifier(Format.NUMBERS, sequences.get(0), page, Integer.parseInt(numberMatcher.group(1))));
|
||||||
|
}
|
||||||
|
return Optional.empty();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public static boolean isInOrder(List<ListIdentifier> listIdentifiers) {
|
||||||
|
|
||||||
|
if (listIdentifiers.size() <= 1) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int i = 1; i < listIdentifiers.size(); i++) {
|
||||||
|
ListIdentifier current = listIdentifiers.get(i);
|
||||||
|
ListIdentifier previous = listIdentifiers.get(i - 1);
|
||||||
|
if (current.format != previous.format) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if (current.representation <= previous.representation) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if (!current.word.intersectsXDirAdj(previous.word, 2)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if (current.page == previous.page && !current.word.isBelowDirAdj(previous.word)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if (current.page < previous.page) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -2,6 +2,6 @@ package com.knecon.fforesight.service.layoutparser.processor.model.text;
|
|||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||||
|
|
||||||
public record TextBlockOnPage(ClassificationPage page, TextPageBlock textBlock) {
|
public record TextBlockOnPage(TextPageBlock textBlock, ClassificationPage page) {
|
||||||
|
|
||||||
}
|
}
|
||||||
@ -33,8 +33,6 @@ public class TextPageBlock extends AbstractPageBlock {
|
|||||||
|
|
||||||
private boolean underlined;
|
private boolean underlined;
|
||||||
|
|
||||||
private double highestFontSize;
|
|
||||||
|
|
||||||
private PageBlockType classification;
|
private PageBlockType classification;
|
||||||
|
|
||||||
private boolean toDuplicate;
|
private boolean toDuplicate;
|
||||||
@ -262,6 +260,12 @@ public class TextPageBlock extends AbstractPageBlock {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public double getHighestFontSize() {
|
||||||
|
|
||||||
|
return frequencyCounters.getFontSizeFrequencyCounter().getHighest();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public boolean isEmpty() {
|
public boolean isEmpty() {
|
||||||
|
|
||||||
|
|||||||
@ -18,6 +18,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.SectionIdentif
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
|
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities;
|
||||||
|
|
||||||
import lombok.Data;
|
import lombok.Data;
|
||||||
|
|
||||||
@ -467,7 +468,9 @@ public class BlockificationPostprocessingService {
|
|||||||
|
|
||||||
private static String sanitizeString(String text) {
|
private static String sanitizeString(String text) {
|
||||||
|
|
||||||
return StringUtils.deleteWhitespace(text).toLowerCase(Locale.ROOT);
|
return TextNormalizationUtilities.removeAllWhitespaces(text)//
|
||||||
|
.trim() // sometimes there are trailing empty bytes at the end of the string trim() seems to remove them
|
||||||
|
.toLowerCase(Locale.ENGLISH);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -10,7 +10,6 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
|
||||||
|
|
||||||
import lombok.RequiredArgsConstructor;
|
import lombok.RequiredArgsConstructor;
|
||||||
@ -73,7 +72,9 @@ public class ClarifyndClassificationService {
|
|||||||
&& (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter().getMostPopular()) > 2.5
|
&& (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter().getMostPopular()) > 2.5
|
||||||
&& textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks().size() == 1)) {
|
&& textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks().size() == 1)) {
|
||||||
if (!Pattern.matches("[0-9]+", textBlock.toString())) {
|
if (!Pattern.matches("[0-9]+", textBlock.toString())) {
|
||||||
textBlock.setClassification(PageBlockType.TITLE);
|
PageBlockType headlineType = HeadlineClassificationService.headlineClassByFontSize(textBlock, headlineFontSizes);
|
||||||
|
headlineClassificationService.classifyHeadline(textBlock, headlineType);
|
||||||
|
document.setHeadlines(true);
|
||||||
}
|
}
|
||||||
} else if (textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular()
|
} else if (textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular()
|
||||||
&& PositionUtils.getApproxLineCount(textBlock) < 4.9
|
&& PositionUtils.getApproxLineCount(textBlock) < 4.9
|
||||||
|
|||||||
@ -0,0 +1,29 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.services.classification;
|
||||||
|
|
||||||
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
|
/**
 * Pre-compiled regular expressions shared by the classification services.
 * <p>
 * This is a pure constants holder: it is final and cannot be instantiated.
 */
public final class ClassificationPatterns {

    /**
     * Headline starting with a multi-part number such as {@code "1.2 Title"}: one to three dot-terminated
     * groups (first digit 1-9, at most two digits each), a one- or two-digit number, an optional dot, one
     * whitespace, a letter, and then 2-50 letters, digits, spaces or square brackets. Case-insensitive.
     */
    public static final Pattern HEADLINE_WITH_2_IDENTIFER_PATTERN = Pattern.compile("^([1-9]\\d?\\.){1,3}\\d{1,2}\\.?\\s[a-z][0-9a-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE);

    /**
     * Headline starting with a single digit and a dot such as {@code "1. Title"}, followed by one whitespace,
     * a letter, and then 2-50 letters, digits, spaces or square brackets. Case-insensitive.
     */
    public static final Pattern HEADLINE_WITH_SINGLE_IDENTIFER_PATTERN = Pattern.compile("^([0-9]\\.)\\s[a-z][0-9a-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE);

    /** A run of at least three consecutive letters (any script). */
    public static final Pattern AT_LEAST_3_CHARS_PATTERN = Pattern.compile("\\p{L}{3,}", Pattern.CASE_INSENSITIVE);

    /**
     * Numbered text whose first word is a short slash-separated pair such as {@code "1.2 a/b ..."}.
     * NOTE(review): unlike the other headline patterns this one is compiled without CASE_INSENSITIVE, so it
     * only matches lowercase letters - confirm that this is intended.
     */
    public static final Pattern HEADLINE_PATTERN_WITH_SLASHES = Pattern.compile("^(\\d{1,1}\\.){1,3}\\d{1,2}\\.?\\s[a-z]{1,2}\\/[a-z]{1,2}.*");

    /** Text starting with a (possibly decimal) number followed by one of the listed units. Case-insensitive. */
    public static final Pattern AMOUNT_PATTERN = Pattern.compile(
            "^\\s*\\d+(?:\\.\\d+)?\\s*(?:ml|l|g|kg|mg|cm|cm2|cm3|mm|mm2|mm3|km|km2|m|m2|m3|lb|oz|ppm|dpm|days|weeks|months|%|f)\\b",
            Pattern.CASE_INSENSITIVE);

    /**
     * Text starting with "table", "continued table", "appendix" or "figure", followed by a roman numeral
     * (x, v, i only) or a short alphanumeric identifier with optional dotted and dashed parts.
     * Case-insensitive.
     */
    public static final Pattern TABLE_OR_FIGURE_PATTERN = Pattern.compile(
            "^\\s*(?:table|continued\\s+table|appendix|figure)\\s+(?:[xvi]+|[a-z0-9]{1,3}(?:\\.[0-9]{1,3})*(?:-[0-9]{1,3})?)\\b",
            Pattern.CASE_INSENSITIVE);

    /** A single ASCII letter or digit. */
    public static final Pattern ALPHANUMERIC = Pattern.compile("[a-zA-Z0-9]");

    /** One or more ASCII digits. */
    public static final Pattern NUMERIC = Pattern.compile("[0-9]+");


    private ClassificationPatterns() {
        // constants holder - not meant to be instantiated
    }

}
|
||||||
@ -1,9 +1,19 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.services.classification;
|
package com.knecon.fforesight.service.layoutparser.processor.services.classification;
|
||||||
|
|
||||||
|
import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.ALPHANUMERIC;
|
||||||
|
import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.AMOUNT_PATTERN;
|
||||||
|
import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.AT_LEAST_3_CHARS_PATTERN;
|
||||||
|
import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.HEADLINE_PATTERN_WITH_SLASHES;
|
||||||
|
import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.HEADLINE_WITH_2_IDENTIFER_PATTERN;
|
||||||
|
import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.HEADLINE_WITH_SINGLE_IDENTIFER_PATTERN;
|
||||||
|
import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.TABLE_OR_FIGURE_PATTERN;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
|
import java.util.Comparator;
|
||||||
|
import java.util.Iterator;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Locale;
|
import java.util.Locale;
|
||||||
import java.util.Set;
|
import java.util.Map;
|
||||||
import java.util.regex.Matcher;
|
import java.util.regex.Matcher;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
@ -13,90 +23,60 @@ import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBl
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.AbstractBlockOnPage;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.ListIdentifier;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.HeaderFooterDetection;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
|
||||||
|
|
||||||
|
import lombok.AccessLevel;
|
||||||
import lombok.RequiredArgsConstructor;
|
import lombok.RequiredArgsConstructor;
|
||||||
|
import lombok.experimental.FieldDefaults;
|
||||||
import lombok.extern.slf4j.Slf4j;
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
|
||||||
@Slf4j
|
@Slf4j
|
||||||
@Service
|
@Service
|
||||||
@RequiredArgsConstructor
|
@RequiredArgsConstructor
|
||||||
|
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||||
public class DocuMineClassificationService {
|
public class DocuMineClassificationService {
|
||||||
|
|
||||||
private static final Pattern HEADLINE_WITH_2_IDENTIFER_PATTERN = Pattern.compile("^([1-9]\\d?\\.){1,3}\\d{1,2}\\.?\\s[a-z][0-9a-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE);
|
|
||||||
private static final Pattern HEADLINE_WITH_SINGLE_IDENTIFER_PATTERN = Pattern.compile("^([0-9]\\.)\\s[a-z][0-9a-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE);
|
|
||||||
private static final Pattern AT_LEAST_3_CHARS_PATTERN = Pattern.compile("\\p{L}{3,}", Pattern.CASE_INSENSITIVE);
|
|
||||||
private static final Pattern HEADLINE_PATTERN_WITH_SLASHES = Pattern.compile("^(\\d{1,1}\\.){1,3}\\d{1,2}\\.?\\s[a-z]{1,2}\\/[a-z]{1,2}.*");
|
|
||||||
public static final Pattern AMOUNT_PATTERN = Pattern.compile("^\\s*\\d+(?:\\.\\d+)?\\s*(?:ml|l|g|kg|mg|cm|cm2|cm3|mm|mm2|mm3|km|km2|m|m2|m3|lb|oz|ppm|dpm|days|weeks|months|%|f)\\b",
|
|
||||||
Pattern.CASE_INSENSITIVE);
|
|
||||||
private static final Pattern TABLE_OR_FIGURE_PATTERN = Pattern.compile(
|
|
||||||
"^\\s*(?:table|continued\\s+table|appendix|figure)\\s+(?:[xvi]+|[a-z0-9]{1,3}(?:\\.[0-9]{1,3})*(?:-[0-9]{1,3})?)\\b",
|
|
||||||
Pattern.CASE_INSENSITIVE);
|
|
||||||
private static final Pattern ALPHANUMERIC = Pattern.compile("[a-zA-Z0-9]");
|
|
||||||
|
|
||||||
public static final int SEPARATION_THRESHOLD = 10; // if the min distance between a textblock and all its surrounding blocks, the regexes can be more lenient.
|
public static final int SEPARATION_THRESHOLD = 10; // if the min distance between a textblock and all its surrounding blocks, the regexes can be more lenient.
|
||||||
public static final int SURROUNDING_BLOCKS_RADIUS = 3; // number of surrounding blocks before and after the current textblock to be tested
|
public static final int SURROUNDING_BLOCKS_RADIUS = 3; // number of surrounding blocks before and after the current textblock to be tested
|
||||||
|
|
||||||
|
ListItemClassificationService listItemClassificationService;
|
||||||
|
|
||||||
|
|
||||||
public void classifyDocument(ClassificationDocument document) {
|
public void classifyDocument(ClassificationDocument document) {
|
||||||
|
|
||||||
List<Double> headlineFontSizes = document.getFontSizeCounter().getValuesInReverseOrder();
|
List<Double> headlineFontSizes = buildHeadlineFontSizes(document);
|
||||||
|
List<AbstractBlockOnPage> blocks = buildBlocksPerPage(document);
|
||||||
log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());
|
log.debug("Headline FontSizes are: {}", headlineFontSizes);
|
||||||
|
|
||||||
HeadlineClassificationService headlineClassificationService = new HeadlineClassificationService();
|
HeadlineClassificationService headlineClassificationService = new HeadlineClassificationService();
|
||||||
|
|
||||||
for (ClassificationPage page : document.getPages()) {
|
for (int i = 0; i < blocks.size(); i++) {
|
||||||
document.getLayoutDebugLayer().addTextBlockVisualizations(page.getTextBlocks(), page.getPageNumber());
|
AbstractBlockOnPage block = blocks.get(i);
|
||||||
classifyPage(headlineClassificationService, page, document, headlineFontSizes);
|
document.getLayoutDebugLayer().addTextBlockVisualizations(block.page().getTextBlocks(), block.page().getPageNumber());
|
||||||
|
classifyBlock(headlineClassificationService, i, blocks, document, headlineFontSizes);
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private void classifyPage(HeadlineClassificationService headlineClassificationService,
|
|
||||||
ClassificationPage page,
|
|
||||||
ClassificationDocument document,
|
|
||||||
List<Double> headlineFontSizes) {
|
|
||||||
|
|
||||||
List<AbstractPageBlock> textBlocks = page.getTextBlocks();
|
|
||||||
for (int i = 0; i < textBlocks.size(); i++) {
|
|
||||||
AbstractPageBlock textBlock = textBlocks.get(i);
|
|
||||||
if (textBlock instanceof TextPageBlock) {
|
|
||||||
List<AbstractPageBlock> surroundingBlocks = getSurroundingBlocks(i, textBlocks);
|
|
||||||
classifyBlock(headlineClassificationService, (TextPageBlock) textBlock, surroundingBlocks, page, document, headlineFontSizes);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private List<AbstractPageBlock> getSurroundingBlocks(int originalIndex, List<AbstractPageBlock> textBlocks) {
|
|
||||||
|
|
||||||
int start = Math.max(originalIndex - SURROUNDING_BLOCKS_RADIUS, 0);
|
|
||||||
int end = Math.min(originalIndex + SURROUNDING_BLOCKS_RADIUS, textBlocks.size());
|
|
||||||
List<AbstractPageBlock> surroundingBlocks = new ArrayList<>(2 * SURROUNDING_BLOCKS_RADIUS);
|
|
||||||
for (int i = start; i < end; i++) {
|
|
||||||
if (i == originalIndex) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if (textBlocks.get(i).getText().length() <= 1) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
surroundingBlocks.add(textBlocks.get(i));
|
|
||||||
}
|
|
||||||
return surroundingBlocks;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private void classifyBlock(HeadlineClassificationService headlineClassificationService,
|
private void classifyBlock(HeadlineClassificationService headlineClassificationService,
|
||||||
TextPageBlock textBlock,
|
int currentIndex,
|
||||||
List<AbstractPageBlock> surroundingBlocks,
|
List<AbstractBlockOnPage> allBlocks,
|
||||||
ClassificationPage page,
|
|
||||||
ClassificationDocument document,
|
ClassificationDocument document,
|
||||||
List<Double> headlineFontSizes) {
|
List<Double> headlineFontSizes) {
|
||||||
|
|
||||||
|
TextPageBlock textBlock;
|
||||||
|
if (allBlocks.get(currentIndex).block() instanceof TextPageBlock block) {
|
||||||
|
textBlock = block;
|
||||||
|
} else {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
ClassificationPage page = allBlocks.get(currentIndex).page();
|
||||||
|
List<AbstractPageBlock> surroundingBlocks = getSurroundingBlocksOnPage(currentIndex, allBlocks);
|
||||||
|
|
||||||
log.debug("headlineFontSizes: {}", headlineFontSizes);
|
log.debug("headlineFontSizes: {}", headlineFontSizes);
|
||||||
var bodyTextFrame = page.getBodyTextFrame();
|
var bodyTextFrame = page.getBodyTextFrame();
|
||||||
|
|
||||||
@ -113,6 +93,9 @@ public class DocuMineClassificationService {
|
|||||||
|
|
||||||
boolean enoughChars = charCount > textBlock.getText().length() * 0.5;
|
boolean enoughChars = charCount > textBlock.getText().length() * 0.5;
|
||||||
|
|
||||||
|
List<ListIdentifier> listIdentifiers = listItemClassificationService.findConfirmedListIdentifiers(currentIndex, allBlocks);
|
||||||
|
document.getLayoutDebugLayer().addListIdentifiers(listIdentifiers);
|
||||||
|
|
||||||
if (textBlock.getClassification() != null && textBlock.getClassification().isHeadline()) {
|
if (textBlock.getClassification() != null && textBlock.getClassification().isHeadline()) {
|
||||||
headlineClassificationService.setLastHeadlineFromOutline(textBlock);
|
headlineClassificationService.setLastHeadlineFromOutline(textBlock);
|
||||||
return;
|
return;
|
||||||
@ -126,26 +109,21 @@ public class DocuMineClassificationService {
|
|||||||
textBlock.setClassification(PageBlockType.PARAGRAPH);
|
textBlock.setClassification(PageBlockType.PARAGRAPH);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter().getMostPopular()) > 2.5
|
if (textBlock.getText().length() > 5
|
||||||
&& textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks().size() == 1)) {
|
&& greaterOrEqualFontThanDocumentAverage(textBlock, document)
|
||||||
if (!Pattern.matches("[0-9]+", textBlock.toString())) {
|
&& PositionUtils.getApproxLineCount(textBlock) < 5.9
|
||||||
textBlock.setClassification(PageBlockType.TITLE);
|
&& ((textBlock.getMostPopularWordStyle().contains("bold") || textBlock.isUnderlined())//
|
||||||
}
|
&& Character.isDigit(textBlock.toString().charAt(0)) //
|
||||||
} else if (textBlock.getText().length() > 5
|
&& isAtLeast3Characters //
|
||||||
&& greaterOrEqualFontThanDocumentAverage(textBlock, document)
|
&& !textBlock.toString().contains(":") //
|
||||||
&& PositionUtils.getApproxLineCount(textBlock) < 5.9
|
|| textBlock.toString().startsWith("APPENDIX") //
|
||||||
&& ((textBlock.getMostPopularWordStyle().contains("bold") || textBlock.isUnderlined())//
|
|| textBlock.toString().startsWith("FIGURE") //
|
||||||
&& Character.isDigit(textBlock.toString().charAt(0)) //
|
|| textBlock.toString().startsWith("Continued TABLE") //
|
||||||
&& isAtLeast3Characters //
|
|| textBlock.toString().startsWith("TABLE"))
|
||||||
&& !textBlock.toString().contains(":") //
|
&& !textBlock.toString().endsWith(":")
|
||||||
|| textBlock.toString().startsWith("APPENDIX") //
|
&& isAtLeast3Characters
|
||||||
|| textBlock.toString().startsWith("FIGURE") //
|
&& !isAmount
|
||||||
|| textBlock.toString().startsWith("Continued TABLE") //
|
&& enoughChars) {
|
||||||
|| textBlock.toString().startsWith("TABLE"))
|
|
||||||
&& !textBlock.toString().endsWith(":")
|
|
||||||
&& isAtLeast3Characters
|
|
||||||
&& !isAmount
|
|
||||||
&& enoughChars) {
|
|
||||||
|
|
||||||
setAsHeadline(headlineClassificationService, textBlock, document, headlineFontSizes);
|
setAsHeadline(headlineClassificationService, textBlock, document, headlineFontSizes);
|
||||||
} else if (isAllCaps(textBlock)
|
} else if (isAllCaps(textBlock)
|
||||||
@ -170,11 +148,14 @@ public class DocuMineClassificationService {
|
|||||||
} else if (hasSeparation(textBlock, surroundingBlocks)//
|
} else if (hasSeparation(textBlock, surroundingBlocks)//
|
||||||
&& greaterOrEqualFontThanPageAverage(textBlock, page)//
|
&& greaterOrEqualFontThanPageAverage(textBlock, page)//
|
||||||
&& PositionUtils.getApproxLineCount(textBlock) < 2.9//
|
&& PositionUtils.getApproxLineCount(textBlock) < 2.9//
|
||||||
&& (tableOrFigureMatcher.reset().find() || headlineWithSingleIdentifierMatcher.reset().find())//
|
&& (tableOrFigureMatcher.reset().find() || (headlineWithSingleIdentifierMatcher.reset().find() && listIdentifiers.isEmpty())) //
|
||||||
&& !isAmount//
|
&& !isAmount//
|
||||||
&& !headlineWithSlashesMatches) {
|
&& !headlineWithSlashesMatches) {
|
||||||
|
|
||||||
setAsHeadline(headlineClassificationService, textBlock, document, headlineFontSizes);
|
setAsHeadline(headlineClassificationService, textBlock, document, headlineFontSizes);
|
||||||
|
} else if (!listIdentifiers.isEmpty()) {
|
||||||
|
|
||||||
|
textBlock.setClassification(PageBlockType.LIST_ITEM);
|
||||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)
|
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)
|
||||||
&& textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()
|
&& textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()
|
||||||
&& textBlock.getMostPopularWordStyle().equals("bold")
|
&& textBlock.getMostPopularWordStyle().equals("bold")
|
||||||
@ -264,6 +245,92 @@ public class DocuMineClassificationService {
|
|||||||
document.setHeadlines(true);
|
document.setHeadlines(true);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private List<AbstractBlockOnPage> buildBlocksPerPage(ClassificationDocument document) {
|
||||||
|
|
||||||
|
List<AbstractBlockOnPage> blocks = new ArrayList<>();
|
||||||
|
for (ClassificationPage page : document.getPages()) {
|
||||||
|
for (AbstractPageBlock abstractPageBlock : page.getTextBlocks()) {
|
||||||
|
if (abstractPageBlock instanceof TextPageBlock textBlock) {
|
||||||
|
if (textBlock.getClassification() != null && (textBlock.getClassification().equals(PageBlockType.HEADER) //
|
||||||
|
|| textBlock.getClassification().equals(PageBlockType.FOOTER))) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
blocks.add(new AbstractBlockOnPage(textBlock, page));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return blocks;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static List<Double> buildHeadlineFontSizes(ClassificationDocument document) {
|
||||||
|
|
||||||
|
if (document.getFontSizeCounter().getCountPerValue().size() <= 6) {
|
||||||
|
return document.getFontSizeCounter().getValuesInReverseOrder();
|
||||||
|
}
|
||||||
|
|
||||||
|
List<Map.Entry<Double, Integer>> sortedEntries = new ArrayList<>(document.getFontSizeCounter().getCountPerValue().entrySet());
|
||||||
|
sortedEntries.sort(Map.Entry.comparingByKey());
|
||||||
|
|
||||||
|
int totalCount = sortedEntries.stream()
|
||||||
|
.mapToInt(Map.Entry::getValue).sum();
|
||||||
|
|
||||||
|
int cumulativeCount = 0;
|
||||||
|
Iterator<Map.Entry<Double, Integer>> iterator = sortedEntries.iterator();
|
||||||
|
while (iterator.hasNext()) {
|
||||||
|
Map.Entry<Double, Integer> entry = iterator.next();
|
||||||
|
cumulativeCount += entry.getValue();
|
||||||
|
if (cumulativeCount > totalCount * 0.3) {
|
||||||
|
break; // We've filtered the bottom 30%, so stop.
|
||||||
|
}
|
||||||
|
iterator.remove();
|
||||||
|
}
|
||||||
|
|
||||||
|
if (sortedEntries.size() < 6) {
|
||||||
|
return document.getFontSizeCounter().getValuesInReverseOrder();
|
||||||
|
}
|
||||||
|
int clusterSize = Math.max(1, sortedEntries.size() / 6);
|
||||||
|
|
||||||
|
List<List<Double>> clusters = new ArrayList<>();
|
||||||
|
for (int i = 0; i < 6; i++) {
|
||||||
|
clusters.add(new ArrayList<>());
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int i = 0; i < sortedEntries.size(); i++) {
|
||||||
|
int clusterIndex = Math.min(i / clusterSize, 5);
|
||||||
|
clusters.get(clusterIndex).add(sortedEntries.get(i).getKey());
|
||||||
|
}
|
||||||
|
|
||||||
|
return clusters.stream()
|
||||||
|
.map(cluster -> cluster.stream()
|
||||||
|
.mapToDouble(d -> d).average()
|
||||||
|
.orElseThrow())
|
||||||
|
.sorted(Comparator.reverseOrder())
|
||||||
|
.toList();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private List<AbstractPageBlock> getSurroundingBlocksOnPage(int originalIndex, List<AbstractBlockOnPage> textBlocks) {
|
||||||
|
|
||||||
|
int start = Math.max(originalIndex - SURROUNDING_BLOCKS_RADIUS, 0);
|
||||||
|
int end = Math.min(originalIndex + SURROUNDING_BLOCKS_RADIUS, textBlocks.size());
|
||||||
|
List<AbstractPageBlock> surroundingBlocks = new ArrayList<>(2 * SURROUNDING_BLOCKS_RADIUS);
|
||||||
|
for (int i = start; i < end; i++) {
|
||||||
|
if (i == originalIndex) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (textBlocks.get(i).block().getText().length() <= 1) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (!textBlocks.get(i).page().equals(textBlocks.get(originalIndex).page())) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
surroundingBlocks.add(textBlocks.get(i).block());
|
||||||
|
}
|
||||||
|
return surroundingBlocks;
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -73,13 +73,18 @@ public class HeadlineClassificationService {
|
|||||||
|
|
||||||
public static PageBlockType headlineClassByFontSize(TextPageBlock textBlock, List<Double> fontSizeGroups) {
|
public static PageBlockType headlineClassByFontSize(TextPageBlock textBlock, List<Double> fontSizeGroups) {
|
||||||
|
|
||||||
PageBlockType headlineType = PageBlockType.H1;
|
List<Double> distances = fontSizeGroups.stream()
|
||||||
for (int i = 1; i <= fontSizeGroups.size(); i++) {
|
.map(fontSize -> Math.abs(fontSize - textBlock.getMostPopularWordFontSize()))
|
||||||
if (textBlock.getMostPopularWordFontSize() == fontSizeGroups.get(i - 1)) {
|
.toList();
|
||||||
headlineType = PageBlockType.getHeadlineType(i);
|
double min = Double.MAX_VALUE;
|
||||||
|
int argMin = -1;
|
||||||
|
for (int i = 0; i < distances.size(); i++) {
|
||||||
|
if (distances.get(i) < min) {
|
||||||
|
min = distances.get(i);
|
||||||
|
argMin = i;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return headlineType;
|
return PageBlockType.getHeadlineType(argMin);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
@ -0,0 +1,99 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.services.classification;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Collection;
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.LinkedList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.AbstractBlockOnPage;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.ListIdentifier;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||||
|
|
||||||
|
@Service
|
||||||
|
public class ListItemClassificationService {
|
||||||
|
|
||||||
|
public static final int LIST_IDENTIFIER_SEARCH_RADIUS = 3;
|
||||||
|
|
||||||
|
|
||||||
|
public List<ListIdentifier> findConfirmedListIdentifiers(int currentIndex, List<AbstractBlockOnPage> allBlocks) {
|
||||||
|
|
||||||
|
List<ListIdentifier> listIdentifiers = extractListIdentifiers(allBlocks.get(currentIndex));
|
||||||
|
if (listIdentifiers.isEmpty()) {
|
||||||
|
return Collections.emptyList();
|
||||||
|
}
|
||||||
|
if (listIdentifiers.size() > 1 && ListIdentifier.isInOrder(listIdentifiers)) {
|
||||||
|
return listIdentifiers;
|
||||||
|
}
|
||||||
|
|
||||||
|
int start = Math.max(0, currentIndex - LIST_IDENTIFIER_SEARCH_RADIUS);
|
||||||
|
int end = Math.min(allBlocks.size(), currentIndex + LIST_IDENTIFIER_SEARCH_RADIUS);
|
||||||
|
|
||||||
|
List<ListIdentifier> identifiersBehind = new ArrayList<>();
|
||||||
|
if (start < currentIndex) {
|
||||||
|
identifiersBehind.addAll(allBlocks.subList(start, currentIndex)
|
||||||
|
.stream()
|
||||||
|
.map(this::extractListIdentifiers)
|
||||||
|
.flatMap(Collection::stream)
|
||||||
|
.toList());
|
||||||
|
}
|
||||||
|
if (!identifiersBehind.isEmpty()) {
|
||||||
|
listIdentifiers.add(0, identifiersBehind.get(identifiersBehind.size() - 1));
|
||||||
|
if (ListIdentifier.isInOrder(listIdentifiers)) {
|
||||||
|
return listIdentifiers;
|
||||||
|
}
|
||||||
|
listIdentifiers.remove(0);
|
||||||
|
}
|
||||||
|
List<ListIdentifier> identifiersAhead = new ArrayList<>();
|
||||||
|
if (currentIndex + 1 < end) {
|
||||||
|
identifiersAhead.addAll(allBlocks.subList(currentIndex + 1, end)
|
||||||
|
.stream()
|
||||||
|
.map(this::extractListIdentifiers)
|
||||||
|
.flatMap(Collection::stream)
|
||||||
|
.toList());
|
||||||
|
}
|
||||||
|
if (!identifiersAhead.isEmpty()) {
|
||||||
|
listIdentifiers.add(identifiersAhead.get(0));
|
||||||
|
if (ListIdentifier.isInOrder(listIdentifiers)) {
|
||||||
|
return listIdentifiers;
|
||||||
|
}
|
||||||
|
listIdentifiers.remove(listIdentifiers.size() - 1);
|
||||||
|
}
|
||||||
|
return Collections.emptyList();
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private List<ListIdentifier> extractListIdentifiers(AbstractBlockOnPage block) {
|
||||||
|
|
||||||
|
List<ListIdentifier> result = new LinkedList<>();
|
||||||
|
if (block.block() instanceof TextPageBlock textBlock) {
|
||||||
|
List<TextPositionSequence> sequences = textBlock.getSequences();
|
||||||
|
for (int i = 0; i < sequences.size(); i++) {
|
||||||
|
|
||||||
|
if (i != 0 && sequences.get(i - 1).getXDirAdj() < sequences.get(i).getXDirAdj()) {
|
||||||
|
// is not the start of a line, continue
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
TextPositionSequence sequence = sequences.get(i);
|
||||||
|
List<TextPositionSequence> wordsAtStartOfLine = new ArrayList<>(3);
|
||||||
|
int end = Math.min(sequences.size(), i + 3);
|
||||||
|
for (int j = i; j < end; j++) {
|
||||||
|
if (sequences.get(j).intersectsYDirAdj(sequence, 2)) {
|
||||||
|
wordsAtStartOfLine.add(sequences.get(j));
|
||||||
|
} else {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
ListIdentifier.parse(wordsAtStartOfLine, block.page().getPageNumber()).ifPresent(result::add);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -86,7 +86,9 @@ public class RedactManagerClassificationService {
|
|||||||
&& textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks().size() == 1)) {
|
&& textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks().size() == 1)) {
|
||||||
|
|
||||||
if (!Pattern.matches("[0-9]+", textBlock.toString())) {
|
if (!Pattern.matches("[0-9]+", textBlock.toString())) {
|
||||||
textBlock.setClassification(PageBlockType.TITLE);
|
PageBlockType headlineType = HeadlineClassificationService.headlineClassByFontSize(textBlock, headlineFontSizes);
|
||||||
|
headlineClassificationService.classifyHeadline(textBlock, headlineType);
|
||||||
|
document.setHeadlines(true);
|
||||||
}
|
}
|
||||||
} else if (textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular()
|
} else if (textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular()
|
||||||
&& PositionUtils.getApproxLineCount(textBlock) < 4.9
|
&& PositionUtils.getApproxLineCount(textBlock) < 4.9
|
||||||
|
|||||||
@ -1,6 +1,7 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.services.classification;
|
package com.knecon.fforesight.service.layoutparser.processor.services.classification;
|
||||||
|
|
||||||
import static com.knecon.fforesight.service.layoutparser.processor.services.classification.DocuMineClassificationService.AMOUNT_PATTERN;
|
import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.AMOUNT_PATTERN;
|
||||||
|
import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.NUMERIC;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Collection;
|
import java.util.Collection;
|
||||||
@ -13,7 +14,6 @@ import java.util.List;
|
|||||||
import java.util.Locale;
|
import java.util.Locale;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.regex.Pattern;
|
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import org.springframework.stereotype.Service;
|
import org.springframework.stereotype.Service;
|
||||||
@ -42,8 +42,6 @@ public class TableOfContentsClassificationService {
|
|||||||
public static final int INTERSECTION_TOLERANCE = 2; // equality threshold for x intersection
|
public static final int INTERSECTION_TOLERANCE = 2; // equality threshold for x intersection
|
||||||
public static final int DENSITY_THRESHOLD_COUNT = 10; // describes the minimum density, at least this many entries per page height are required
|
public static final int DENSITY_THRESHOLD_COUNT = 10; // describes the minimum density, at least this many entries per page height are required
|
||||||
|
|
||||||
private static final Pattern NUMERIC = Pattern.compile("[0-9]+");
|
|
||||||
|
|
||||||
|
|
||||||
@SuppressWarnings("checkstyle:ModifiedControlVariable")
|
@SuppressWarnings("checkstyle:ModifiedControlVariable")
|
||||||
public void classifyTableOfContents(ClassificationDocument document) {
|
public void classifyTableOfContents(ClassificationDocument document) {
|
||||||
@ -57,11 +55,13 @@ public class TableOfContentsClassificationService {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
int offset = identifyTOCItems(i + 1, textBlocks, document);
|
int end = identifyTOCItems(i + 1, textBlocks, document);
|
||||||
|
|
||||||
if (offset > 1) {
|
if (end > i + 1) {
|
||||||
textBlock.textBlock().setClassification(PageBlockType.H1);
|
if (textBlock.textBlock().getClassification() == null) {
|
||||||
i += offset;
|
textBlock.textBlock().setClassification(PageBlockType.H1);
|
||||||
|
}
|
||||||
|
i = end;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -237,7 +237,7 @@ public class TableOfContentsClassificationService {
|
|||||||
|| textBlock.getClassification().equals(PageBlockType.FOOTER))) {
|
|| textBlock.getClassification().equals(PageBlockType.FOOTER))) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
blocks.add(new TextBlockOnPage(page, textBlock));
|
blocks.add(new TextBlockOnPage(textBlock, page));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -352,7 +352,7 @@ public class TableOfContentsClassificationService {
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
int prev = getNumberAsInt(numbers, i);
|
int prev = getNumberAsInt(numbers, i - 1);
|
||||||
int curr = getNumberAsInt(numbers, i);
|
int curr = getNumberAsInt(numbers, i);
|
||||||
int next = getNumberAsInt(numbers, i + 1);
|
int next = getNumberAsInt(numbers, i + 1);
|
||||||
|
|
||||||
|
|||||||
@ -5,6 +5,7 @@ import java.awt.geom.Point2D;
|
|||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
import org.apache.pdfbox.contentstream.operator.Operator;
|
import org.apache.pdfbox.contentstream.operator.Operator;
|
||||||
import org.apache.pdfbox.contentstream.operator.OperatorName;
|
import org.apache.pdfbox.contentstream.operator.OperatorName;
|
||||||
@ -50,6 +51,7 @@ import lombok.extern.slf4j.Slf4j;
|
|||||||
@Slf4j
|
@Slf4j
|
||||||
public class PDFLinesTextStripper extends PDFTextStripper {
|
public class PDFLinesTextStripper extends PDFTextStripper {
|
||||||
|
|
||||||
|
// Characters treated as dots of a dotted leader line. Set.of rejects duplicate elements with an
// IllegalArgumentException, so every entry must be a distinct code point. The two escaped entries
// are written as escapes because their glyphs are easily folded onto "." and the katakana middle dot
// by width normalization. NOTE(review): presumably U+FF0E (fullwidth full stop) and U+FF65
// (halfwidth katakana middle dot) were intended here; confirm against the original commit.
private static final Set<String> DOT_LIKE_CHARACTERS = Set.of(".", "·", "•", "․", "‧", "∙", "⋅", "・", "\uFF0E", "\uFF65", "…", "⸱", "﹒", "ꞏ");
|
||||||
private final List<TextPositionSequence> textPositionSequences = new ArrayList<>();
|
private final List<TextPositionSequence> textPositionSequences = new ArrayList<>();
|
||||||
private final List<Ruling> rulings = new ArrayList<>();
|
private final List<Ruling> rulings = new ArrayList<>();
|
||||||
private final List<Ruling> graphicsPath = new ArrayList<>();
|
private final List<Ruling> graphicsPath = new ArrayList<>();
|
||||||
@ -336,20 +338,32 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
|||||||
private boolean isWordFollowedByDottedLine(List<TextPosition> textPositions, int i, int startIndex) {
|
private boolean isWordFollowedByDottedLine(List<TextPosition> textPositions, int i, int startIndex) {
|
||||||
|
|
||||||
return i - startIndex >= 4 //
|
return i - startIndex >= 4 //
|
||||||
&& textPositions.get(i).getUnicode().equals(".") //
|
&& isDot(textPositions, i) //
|
||||||
&& textPositions.get(i - 1).getUnicode().equals(".") //
|
&& isDot(textPositions, i - 1) //
|
||||||
&& textPositions.get(i - 2).getUnicode().equals(".") //
|
&& isDot(textPositions, i - 2) //
|
||||||
&& !textPositions.get(i - 3).getUnicode().equals(".");
|
&& alphanumeric(textPositions, i - 3);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private static boolean isDottedLineFollowedByWord(List<TextPosition> textPositions, int i, int startIndex) {
|
private static boolean isDottedLineFollowedByWord(List<TextPosition> textPositions, int i, int startIndex) {
|
||||||
|
|
||||||
return i - startIndex >= 4 //
|
return i - startIndex >= 4 //
|
||||||
&& !textPositions.get(i).getUnicode().equals(".") //
|
&& alphanumeric(textPositions, i) //
|
||||||
&& textPositions.get(i - 1).getUnicode().equals(".") //
|
&& isDot(textPositions, i - 1) //
|
||||||
&& textPositions.get(i - 2).getUnicode().equals(".") //
|
&& isDot(textPositions, i - 2) //
|
||||||
&& textPositions.get(i - 3).getUnicode().equals(".");
|
&& isDot(textPositions, i - 3);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static boolean isDot(List<TextPosition> textPositions, int i) {
|
||||||
|
|
||||||
|
return DOT_LIKE_CHARACTERS.contains(textPositions.get(i).getUnicode());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static boolean alphanumeric(List<TextPosition> textPositions, int i) {
|
||||||
|
|
||||||
|
return Character.isAlphabetic(textPositions.get(i).getUnicode().charAt(0)) || Character.isDigit(textPositions.get(i).getUnicode().charAt(0));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -26,6 +26,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.textbloc
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
|
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.ListIdentifier;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
||||||
@ -80,7 +81,7 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
|
|||||||
visualizationsOnPage.getColoredLines()
|
visualizationsOnPage.getColoredLines()
|
||||||
.addAll(cleanRulings.buildAll()
|
.addAll(cleanRulings.buildAll()
|
||||||
.stream()
|
.stream()
|
||||||
.map(ruling -> new ColoredLine(ruling, decideOnRulingColor(ruling), 0.5f))
|
.map(ruling -> new ColoredLine(ruling, decideOnRulingColor(ruling), LINE_WIDTH))
|
||||||
.toList());
|
.toList());
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -93,7 +94,7 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
|
|||||||
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(pageNumber, this.rulings);
|
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(pageNumber, this.rulings);
|
||||||
visualizationsOnPage.getColoredLines()
|
visualizationsOnPage.getColoredLines()
|
||||||
.addAll(rulings.stream()
|
.addAll(rulings.stream()
|
||||||
.map(ruling -> new ColoredLine(ruling, decideOnRulingColor(ruling), 0.5f))
|
.map(ruling -> new ColoredLine(ruling, decideOnRulingColor(ruling), LINE_WIDTH))
|
||||||
.toList());
|
.toList());
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -182,7 +183,7 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
|
|||||||
visualizationsOnPage.getColoredRectangles()
|
visualizationsOnPage.getColoredRectangles()
|
||||||
.addAll(lines.stream()
|
.addAll(lines.stream()
|
||||||
.map(BoundingBox::getBBoxPdf)
|
.map(BoundingBox::getBBoxPdf)
|
||||||
.map(line -> new ColoredRectangle(line, LINES_COLOR, 0.5f))
|
.map(line -> new ColoredRectangle(line, LINES_COLOR, LINE_WIDTH))
|
||||||
.toList());
|
.toList());
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -198,7 +199,7 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
|
|||||||
.map(line -> line.stream()
|
.map(line -> line.stream()
|
||||||
.map(BoundingBox::getBBoxPdf)
|
.map(BoundingBox::getBBoxPdf)
|
||||||
.collect(RectangleTransformations.collectBBox()))
|
.collect(RectangleTransformations.collectBBox()))
|
||||||
.map(line -> new ColoredRectangle(line, LINES_COLOR, 0.5f))
|
.map(line -> new ColoredRectangle(line, LINES_COLOR, LINE_WIDTH))
|
||||||
.toList());
|
.toList());
|
||||||
|
|
||||||
}
|
}
|
||||||
@ -300,12 +301,12 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
|
|||||||
visualizationsOnPage.getColoredRectangles()
|
visualizationsOnPage.getColoredRectangles()
|
||||||
.addAll(numbers.stream()
|
.addAll(numbers.stream()
|
||||||
.map(BoundingBox::getBBoxPdf)
|
.map(BoundingBox::getBBoxPdf)
|
||||||
.map(line -> new ColoredRectangle(line, LINES_COLOR, 0.5f))
|
.map(line -> new ColoredRectangle(line, LINES_COLOR, LINE_WIDTH))
|
||||||
.toList());
|
.toList());
|
||||||
visualizationsOnPage.getColoredRectangles()
|
visualizationsOnPage.getColoredRectangles()
|
||||||
.add(new ColoredRectangle(numbers.stream()
|
.add(new ColoredRectangle(numbers.stream()
|
||||||
.map(BoundingBox::getBBoxPdf)
|
.map(BoundingBox::getBBoxPdf)
|
||||||
.collect(RectangleTransformations.collectBBox()), LINES_COLOR, 0.5f));
|
.collect(RectangleTransformations.collectBBox()), LINES_COLOR, LINE_WIDTH));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -351,4 +352,13 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
|
|||||||
visualizationsOnPage.getPlacedTexts().add(PlacedText.textFacingUp(outlineObject.getTitle(), textPoint, 10, outlineObject.isFound() ? Color.BLACK : Color.RED, FONT));
|
visualizationsOnPage.getPlacedTexts().add(PlacedText.textFacingUp(outlineObject.getTitle(), textPoint, 10, outlineObject.isFound() ? Color.BLACK : Color.RED, FONT));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void addListIdentifiers(List<ListIdentifier> listIdentifiers) {
|
||||||
|
|
||||||
|
for (ListIdentifier listIdentifier : listIdentifiers) {
|
||||||
|
getOrCreateVisualizationsOnPage(listIdentifier.getPage(), this.listIdentifiers).getColoredRectangles()
|
||||||
|
.add(new ColoredRectangle(listIdentifier.getWord().getBBoxPdf(), WORDS_COLOR, LINE_WIDTH));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -37,7 +37,7 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
|
|||||||
@Disabled
|
@Disabled
|
||||||
public void testLayoutParserEndToEnd() {
|
public void testLayoutParserEndToEnd() {
|
||||||
|
|
||||||
String filePath = "/home/kschuettler/Dokumente/TestFiles/NER Dataset/Syngenta prod/77c680315c31d403d2e023be023b2087.PREVIEW.pdf";
|
String filePath = "/home/kschuettler/Dokumente/LayoutparsingEvaluation/VV-340050.pdf";
|
||||||
|
|
||||||
runForFile(filePath);
|
runForFile(filePath);
|
||||||
}
|
}
|
||||||
|
|||||||
@ -79,7 +79,7 @@ public class OutlineDetectionTest extends AbstractTest {
|
|||||||
var documentFile = new ClassPathResource(fileName).getFile();
|
var documentFile = new ClassPathResource(fileName).getFile();
|
||||||
|
|
||||||
long start = System.currentTimeMillis();
|
long start = System.currentTimeMillis();
|
||||||
ClassificationDocument classificationDocument = parseLayout(fileName, LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH);
|
ClassificationDocument classificationDocument = parseLayout(fileName, LayoutParsingType.DOCUMINE_OLD);
|
||||||
Document document = buildGraph(fileName, classificationDocument);
|
Document document = buildGraph(fileName, classificationDocument);
|
||||||
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true);
|
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true);
|
||||||
OutlineObjectTree outlineObjectTree = classificationDocument.getOutlineObjectTree();
|
OutlineObjectTree outlineObjectTree = classificationDocument.getOutlineObjectTree();
|
||||||
@ -102,7 +102,7 @@ public class OutlineDetectionTest extends AbstractTest {
|
|||||||
|
|
||||||
TableOfContents tableOfContents = classificationDocument.getTableOfContents();
|
TableOfContents tableOfContents = classificationDocument.getTableOfContents();
|
||||||
|
|
||||||
assertEquals(tableOfContents.getMainSections().size(), 10);
|
assertEquals(tableOfContents.getMainSections().size(), 9);
|
||||||
assertEquals(tableOfContents.getMainSections().subList(1, 9)
|
assertEquals(tableOfContents.getMainSections().subList(1, 9)
|
||||||
.stream()
|
.stream()
|
||||||
.map(tableOfContentItem -> sanitizeString(tableOfContentItem.getHeadline().toString()))
|
.map(tableOfContentItem -> sanitizeString(tableOfContentItem.getHeadline().toString()))
|
||||||
@ -135,7 +135,7 @@ public class OutlineDetectionTest extends AbstractTest {
|
|||||||
|
|
||||||
List<SemanticNode> childrenOfTypeSectionOrSuperSection = document.getChildrenOfTypeSectionOrSuperSection();
|
List<SemanticNode> childrenOfTypeSectionOrSuperSection = document.getChildrenOfTypeSectionOrSuperSection();
|
||||||
|
|
||||||
assertEquals(childrenOfTypeSectionOrSuperSection.size(), 10);
|
assertEquals(childrenOfTypeSectionOrSuperSection.size(), 9);
|
||||||
assertEquals(childrenOfTypeSectionOrSuperSection.subList(1, 9)
|
assertEquals(childrenOfTypeSectionOrSuperSection.subList(1, 9)
|
||||||
.stream()
|
.stream()
|
||||||
.map(section -> sanitizeString(section.getHeadline().getLeafTextBlock().toString()))
|
.map(section -> sanitizeString(section.getHeadline().getLeafTextBlock().toString()))
|
||||||
|
|||||||
@ -57,6 +57,7 @@ public record LayerIdentifier(String name, String markedContentName) {
|
|||||||
public static final LayerIdentifier OUTLINE_OBJECTS = new LayerIdentifier("Outline Positions", "OUTLINE_OBJECTS");
|
public static final LayerIdentifier OUTLINE_OBJECTS = new LayerIdentifier("Outline Positions", "OUTLINE_OBJECTS");
|
||||||
public static final LayerIdentifier SENTENCES = new LayerIdentifier("Sentences", "SENTENCES");
|
public static final LayerIdentifier SENTENCES = new LayerIdentifier("Sentences", "SENTENCES");
|
||||||
public static final LayerIdentifier TOC_PAGES = new LayerIdentifier("TOC pages", "TOC_PAGES");
|
public static final LayerIdentifier TOC_PAGES = new LayerIdentifier("TOC pages", "TOC_PAGES");
|
||||||
|
public static final LayerIdentifier LIST_IDENTIFIERS = new LayerIdentifier("List identifiers", "LIST_IDENTIFIERS");
|
||||||
|
|
||||||
// Visual layout parser
|
// Visual layout parser
|
||||||
public static final LayerIdentifier KNECON_VISUAL_PARSING = new LayerIdentifier("Visual Layout Parser", "VISUAL_PARSING");
|
public static final LayerIdentifier KNECON_VISUAL_PARSING = new LayerIdentifier("Visual Layout Parser", "VISUAL_PARSING");
|
||||||
|
|||||||
@ -18,6 +18,8 @@ public class LayoutDebugLayerConfig extends AbstractLayerGroup {
|
|||||||
|
|
||||||
protected static final Standard14EmbeddableFont FONT = Standard14EmbeddableFont.helvetica();
|
protected static final Standard14EmbeddableFont FONT = Standard14EmbeddableFont.helvetica();
|
||||||
|
|
||||||
|
public static final float LINE_WIDTH = 0.5f;
|
||||||
|
|
||||||
protected static final Color WORDS_COLOR = new Color(68, 84, 147);
|
protected static final Color WORDS_COLOR = new Color(68, 84, 147);
|
||||||
protected static final Color LINES_COLOR = new Color(152, 45, 179);
|
protected static final Color LINES_COLOR = new Color(152, 45, 179);
|
||||||
protected static final Color ZONES_COLOR = new Color(131, 38, 38);
|
protected static final Color ZONES_COLOR = new Color(131, 38, 38);
|
||||||
@ -57,6 +59,7 @@ public class LayoutDebugLayerConfig extends AbstractLayerGroup {
|
|||||||
protected final Visualizations outlineObjects = Visualizations.builder().layer(LayerIdentifier.OUTLINE_OBJECTS).build();
|
protected final Visualizations outlineObjects = Visualizations.builder().layer(LayerIdentifier.OUTLINE_OBJECTS).build();
|
||||||
protected final Visualizations sentences = Visualizations.builder().layer(LayerIdentifier.SENTENCES).build();
|
protected final Visualizations sentences = Visualizations.builder().layer(LayerIdentifier.SENTENCES).build();
|
||||||
protected final Visualizations tocPages = Visualizations.builder().layer(LayerIdentifier.TOC_PAGES).build();
|
protected final Visualizations tocPages = Visualizations.builder().layer(LayerIdentifier.TOC_PAGES).build();
|
||||||
|
protected final Visualizations listIdentifiers = Visualizations.builder().layer(LayerIdentifier.LIST_IDENTIFIERS).build();
|
||||||
|
|
||||||
|
|
||||||
public List<Visualizations> getVisualizations() {
|
public List<Visualizations> getVisualizations() {
|
||||||
@ -73,7 +76,10 @@ public class LayoutDebugLayerConfig extends AbstractLayerGroup {
|
|||||||
mainBody, //
|
mainBody, //
|
||||||
markedContent, //
|
markedContent, //
|
||||||
outlineObjects, //
|
outlineObjects, //
|
||||||
tocPages);
|
tocPages, //
|
||||||
|
listIdentifiers //
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user