RED-10127: add list classification
This commit is contained in:
parent
4b0c041d84
commit
7b073eb4f3
@ -374,14 +374,7 @@ public class LayoutParsingPipeline {
|
||||
|
||||
classificationService.classify(classificationDocument, layoutParsingType, identifier);
|
||||
|
||||
List<TextPageBlock> headlines = classificationDocument.getPages()
|
||||
.stream()
|
||||
.flatMap(classificationPage -> classificationPage.getTextBlocks()
|
||||
.stream()
|
||||
.filter(tb -> tb instanceof TextPageBlock && tb.getClassification() != null && tb.getClassification().isHeadline())
|
||||
.map(tb -> (TextPageBlock) tb))
|
||||
.toList();
|
||||
TableOfContents tableOfContents = outlineValidationService.createToC(headlines);
|
||||
TableOfContents tableOfContents = outlineValidationService.createToC(classificationDocument);
|
||||
classificationDocument.setTableOfContents(tableOfContents);
|
||||
|
||||
log.info("Building Sections for {}", identifier);
|
||||
|
||||
@ -165,4 +165,16 @@ public abstract class TextBoundingBox extends BoundingBox {
|
||||
return this.getYDirAdj() - threshold <= other.getMaxYDirAdj() && this.getMaxYDirAdj() + threshold >= other.getYDirAdj();
|
||||
}
|
||||
|
||||
|
||||
public boolean isAboveDirAdj(TextBoundingBox other) {
|
||||
|
||||
return other.isBelow(this);
|
||||
}
|
||||
|
||||
|
||||
public boolean isBelowDirAdj(TextBoundingBox other) {
|
||||
|
||||
return this.intersectsXDirAdj(other) && this.getYDirAdj() >= other.getMaxYDirAdj();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -9,13 +9,13 @@ public enum PageBlockType {
|
||||
H6,
|
||||
HEADER,
|
||||
FOOTER,
|
||||
TITLE,
|
||||
PARAGRAPH,
|
||||
PARAGRAPH_BOLD,
|
||||
PARAGRAPH_ITALIC,
|
||||
PARAGRAPH_UNKNOWN,
|
||||
OTHER,
|
||||
TABLE_OF_CONTENTS_ITEM,
|
||||
LIST_ITEM,
|
||||
TABLE;
|
||||
|
||||
|
||||
|
||||
@ -10,6 +10,7 @@ import java.util.TreeSet;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
|
||||
import io.micrometer.observation.annotation.Observed;
|
||||
@ -20,7 +21,9 @@ import lombok.extern.slf4j.Slf4j;
|
||||
public class OutlineValidationService {
|
||||
|
||||
@Observed(name = "OutlineValidationService", contextualName = "create-toc")
|
||||
public TableOfContents createToC(List<TextPageBlock> headlines) {
|
||||
public TableOfContents createToC(ClassificationDocument classificationDocument) {
|
||||
|
||||
List<TextPageBlock> headlines = extractHeadlines(classificationDocument);
|
||||
|
||||
List<TableOfContentItem> mainSections = new ArrayList<>();
|
||||
Map<Integer, TableOfContentItem> lastItemsPerDepth = new HashMap<>();
|
||||
@ -60,4 +63,16 @@ public class OutlineValidationService {
|
||||
return new TableOfContents(mainSections);
|
||||
}
|
||||
|
||||
|
||||
private static List<TextPageBlock> extractHeadlines(ClassificationDocument classificationDocument) {
|
||||
|
||||
return classificationDocument.getPages()
|
||||
.stream()
|
||||
.flatMap(classificationPage -> classificationPage.getTextBlocks()
|
||||
.stream()
|
||||
.filter(tb -> tb instanceof TextPageBlock && tb.getClassification() != null && tb.getClassification().isHeadline())
|
||||
.map(tb -> (TextPageBlock) tb))
|
||||
.toList();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -0,0 +1,8 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.text;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
|
||||
public record AbstractBlockOnPage(AbstractPageBlock block, ClassificationPage page) {
|
||||
|
||||
}
|
||||
@ -0,0 +1,85 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.text;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Getter;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class ListIdentifier {
|
||||
|
||||
public static final Pattern STARTING_NUMBERS = Pattern.compile("^\\s*([1-9]+)\\.\\s+");
|
||||
|
||||
enum Format {
|
||||
NUMBERS
|
||||
}
|
||||
|
||||
Format format;
|
||||
@Getter
|
||||
TextPositionSequence word;
|
||||
@Getter
|
||||
int page;
|
||||
int representation;
|
||||
|
||||
|
||||
public static Optional<ListIdentifier> parse(TextPageBlock textPageBlock, int page) {
|
||||
|
||||
return parse(textPageBlock.getSequences().subList(0, Math.min(5, textPageBlock.getSequences().size())), page);
|
||||
}
|
||||
|
||||
|
||||
public static Optional<ListIdentifier> parse(List<TextPositionSequence> sequences, int page) {
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
for (TextPositionSequence sequence : sequences) {
|
||||
sb.append(sequence.toString());
|
||||
sb.append(" ");
|
||||
}
|
||||
sb.replace(sb.length() - 1, sb.length(), "");
|
||||
String text = sb.toString();
|
||||
|
||||
Matcher numberMatcher = STARTING_NUMBERS.matcher(text);
|
||||
|
||||
if (numberMatcher.find()) {
|
||||
return Optional.of(new ListIdentifier(Format.NUMBERS, sequences.get(0), page, Integer.parseInt(numberMatcher.group(1))));
|
||||
}
|
||||
return Optional.empty();
|
||||
}
|
||||
|
||||
|
||||
public static boolean isInOrder(List<ListIdentifier> listIdentifiers) {
|
||||
|
||||
if (listIdentifiers.size() <= 1) {
|
||||
return true;
|
||||
}
|
||||
|
||||
for (int i = 1; i < listIdentifiers.size(); i++) {
|
||||
ListIdentifier current = listIdentifiers.get(i);
|
||||
ListIdentifier previous = listIdentifiers.get(i - 1);
|
||||
if (current.format != previous.format) {
|
||||
return false;
|
||||
}
|
||||
if (current.representation <= previous.representation) {
|
||||
return false;
|
||||
}
|
||||
if (!current.word.intersectsXDirAdj(previous.word, 2)) {
|
||||
return false;
|
||||
}
|
||||
if (current.page == previous.page && !current.word.isBelowDirAdj(previous.word)) {
|
||||
return false;
|
||||
}
|
||||
if (current.page < previous.page) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
}
|
||||
@ -2,6 +2,6 @@ package com.knecon.fforesight.service.layoutparser.processor.model.text;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
|
||||
public record TextBlockOnPage(ClassificationPage page, TextPageBlock textBlock) {
|
||||
public record TextBlockOnPage(TextPageBlock textBlock, ClassificationPage page) {
|
||||
|
||||
}
|
||||
@ -33,8 +33,6 @@ public class TextPageBlock extends AbstractPageBlock {
|
||||
|
||||
private boolean underlined;
|
||||
|
||||
private double highestFontSize;
|
||||
|
||||
private PageBlockType classification;
|
||||
|
||||
private boolean toDuplicate;
|
||||
@ -262,6 +260,12 @@ public class TextPageBlock extends AbstractPageBlock {
|
||||
}
|
||||
|
||||
|
||||
public double getHighestFontSize() {
|
||||
|
||||
return frequencyCounters.getFontSizeFrequencyCounter().getHighest();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public boolean isEmpty() {
|
||||
|
||||
|
||||
@ -18,6 +18,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.SectionIdentif
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@ -467,7 +468,9 @@ public class BlockificationPostprocessingService {
|
||||
|
||||
private static String sanitizeString(String text) {
|
||||
|
||||
return StringUtils.deleteWhitespace(text).toLowerCase(Locale.ROOT);
|
||||
return TextNormalizationUtilities.removeAllWhitespaces(text)//
|
||||
.trim() // sometimes there are trailing empty bytes at the end of the string trim() seems to remove them
|
||||
.toLowerCase(Locale.ENGLISH);
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -10,7 +10,6 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
@ -73,7 +72,9 @@ public class ClarifyndClassificationService {
|
||||
&& (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter().getMostPopular()) > 2.5
|
||||
&& textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks().size() == 1)) {
|
||||
if (!Pattern.matches("[0-9]+", textBlock.toString())) {
|
||||
textBlock.setClassification(PageBlockType.TITLE);
|
||||
PageBlockType headlineType = HeadlineClassificationService.headlineClassByFontSize(textBlock, headlineFontSizes);
|
||||
headlineClassificationService.classifyHeadline(textBlock, headlineType);
|
||||
document.setHeadlines(true);
|
||||
}
|
||||
} else if (textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular()
|
||||
&& PositionUtils.getApproxLineCount(textBlock) < 4.9
|
||||
|
||||
@ -0,0 +1,29 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.classification;
|
||||
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
/**
 * Precompiled regular expressions shared by the classification services.
 * <p>
 * This is a constants holder and is not meant to be instantiated.
 */
public final class ClassificationPatterns {

    /** Numbered headline with two to four number groups, e.g. {@code "1.2 Methods"} or {@code "1.2.3. Results"}. */
    public static final Pattern HEADLINE_WITH_2_IDENTIFER_PATTERN = Pattern.compile("^([1-9]\\d?\\.){1,3}\\d{1,2}\\.?\\s[a-z][0-9a-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE);

    /** Headline with a single one-digit number followed by a dot, e.g. {@code "1. Introduction"}. */
    public static final Pattern HEADLINE_WITH_SINGLE_IDENTIFER_PATTERN = Pattern.compile("^([0-9]\\.)\\s[a-z][0-9a-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE);

    /** At least three consecutive letters anywhere in the text. */
    public static final Pattern AT_LEAST_3_CHARS_PATTERN = Pattern.compile("\\p{L}{3,}", Pattern.CASE_INSENSITIVE);

    /** Numbered text whose first word is a short slash-separated token, e.g. {@code "1.2 a/b ..."} (lower case only). */
    public static final Pattern HEADLINE_PATTERN_WITH_SLASHES = Pattern.compile("^(\\d{1,1}\\.){1,3}\\d{1,2}\\.?\\s[a-z]{1,2}\\/[a-z]{1,2}.*");

    /**
     * A number (optionally with decimals) followed by a unit or a percent sign at the start of the text.
     * <p>
     * Word-like units must end at a word boundary. The percent sign is matched without a trailing
     * {@code \b}: {@code %} is not a word character, so a boundary after it would only exist when a
     * letter or digit follows directly, and {@code "5 %"} or {@code "5% solution"} would never match.
     */
    public static final Pattern AMOUNT_PATTERN = Pattern.compile(
            "^\\s*\\d+(?:\\.\\d+)?\\s*(?:(?:ml|l|g|kg|mg|cm|cm2|cm3|mm|mm2|mm3|km|km2|m|m2|m3|lb|oz|ppm|dpm|days|weeks|months|f)\\b|%)",
            Pattern.CASE_INSENSITIVE);

    /** Caption-like start such as {@code "Table 3.1"}, {@code "Continued Table 2"}, {@code "Appendix B"} or {@code "Figure iv"}. */
    public static final Pattern TABLE_OR_FIGURE_PATTERN = Pattern.compile(
            "^\\s*(?:table|continued\\s+table|appendix|figure)\\s+(?:[xvi]+|[a-z0-9]{1,3}(?:\\.[0-9]{1,3})*(?:-[0-9]{1,3})?)\\b",
            Pattern.CASE_INSENSITIVE);

    /** A single ASCII letter or digit. */
    public static final Pattern ALPHANUMERIC = Pattern.compile("[a-zA-Z0-9]");

    /** One or more ASCII digits. */
    public static final Pattern NUMERIC = Pattern.compile("[0-9]+");


    private ClassificationPatterns() {
        // constants holder, no instances
    }

}
|
||||
@ -1,9 +1,19 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.classification;
|
||||
|
||||
import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.ALPHANUMERIC;
|
||||
import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.AMOUNT_PATTERN;
|
||||
import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.AT_LEAST_3_CHARS_PATTERN;
|
||||
import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.HEADLINE_PATTERN_WITH_SLASHES;
|
||||
import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.HEADLINE_WITH_2_IDENTIFER_PATTERN;
|
||||
import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.HEADLINE_WITH_SINGLE_IDENTIFER_PATTERN;
|
||||
import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.TABLE_OR_FIGURE_PATTERN;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Set;
|
||||
import java.util.Map;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
@ -13,90 +23,60 @@ import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBl
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.AbstractBlockOnPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.ListIdentifier;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.HeaderFooterDetection;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class DocuMineClassificationService {
|
||||
|
||||
private static final Pattern HEADLINE_WITH_2_IDENTIFER_PATTERN = Pattern.compile("^([1-9]\\d?\\.){1,3}\\d{1,2}\\.?\\s[a-z][0-9a-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE);
|
||||
private static final Pattern HEADLINE_WITH_SINGLE_IDENTIFER_PATTERN = Pattern.compile("^([0-9]\\.)\\s[a-z][0-9a-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE);
|
||||
private static final Pattern AT_LEAST_3_CHARS_PATTERN = Pattern.compile("\\p{L}{3,}", Pattern.CASE_INSENSITIVE);
|
||||
private static final Pattern HEADLINE_PATTERN_WITH_SLASHES = Pattern.compile("^(\\d{1,1}\\.){1,3}\\d{1,2}\\.?\\s[a-z]{1,2}\\/[a-z]{1,2}.*");
|
||||
public static final Pattern AMOUNT_PATTERN = Pattern.compile("^\\s*\\d+(?:\\.\\d+)?\\s*(?:ml|l|g|kg|mg|cm|cm2|cm3|mm|mm2|mm3|km|km2|m|m2|m3|lb|oz|ppm|dpm|days|weeks|months|%|f)\\b",
|
||||
Pattern.CASE_INSENSITIVE);
|
||||
private static final Pattern TABLE_OR_FIGURE_PATTERN = Pattern.compile(
|
||||
"^\\s*(?:table|continued\\s+table|appendix|figure)\\s+(?:[xvi]+|[a-z0-9]{1,3}(?:\\.[0-9]{1,3})*(?:-[0-9]{1,3})?)\\b",
|
||||
Pattern.CASE_INSENSITIVE);
|
||||
private static final Pattern ALPHANUMERIC = Pattern.compile("[a-zA-Z0-9]");
|
||||
|
||||
public static final int SEPARATION_THRESHOLD = 10; // if the min distance between a textblock and all its surrounding blocks, the regexes can be more lenient.
|
||||
public static final int SURROUNDING_BLOCKS_RADIUS = 3; // number of surrounding blocks before and after the current textblock to be tested
|
||||
|
||||
ListItemClassificationService listItemClassificationService;
|
||||
|
||||
|
||||
public void classifyDocument(ClassificationDocument document) {
|
||||
|
||||
List<Double> headlineFontSizes = document.getFontSizeCounter().getValuesInReverseOrder();
|
||||
|
||||
log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());
|
||||
List<Double> headlineFontSizes = buildHeadlineFontSizes(document);
|
||||
List<AbstractBlockOnPage> blocks = buildBlocksPerPage(document);
|
||||
log.debug("Headline FontSizes are: {}", headlineFontSizes);
|
||||
|
||||
HeadlineClassificationService headlineClassificationService = new HeadlineClassificationService();
|
||||
|
||||
for (ClassificationPage page : document.getPages()) {
|
||||
document.getLayoutDebugLayer().addTextBlockVisualizations(page.getTextBlocks(), page.getPageNumber());
|
||||
classifyPage(headlineClassificationService, page, document, headlineFontSizes);
|
||||
for (int i = 0; i < blocks.size(); i++) {
|
||||
AbstractBlockOnPage block = blocks.get(i);
|
||||
document.getLayoutDebugLayer().addTextBlockVisualizations(block.page().getTextBlocks(), block.page().getPageNumber());
|
||||
classifyBlock(headlineClassificationService, i, blocks, document, headlineFontSizes);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void classifyPage(HeadlineClassificationService headlineClassificationService,
|
||||
ClassificationPage page,
|
||||
ClassificationDocument document,
|
||||
List<Double> headlineFontSizes) {
|
||||
|
||||
List<AbstractPageBlock> textBlocks = page.getTextBlocks();
|
||||
for (int i = 0; i < textBlocks.size(); i++) {
|
||||
AbstractPageBlock textBlock = textBlocks.get(i);
|
||||
if (textBlock instanceof TextPageBlock) {
|
||||
List<AbstractPageBlock> surroundingBlocks = getSurroundingBlocks(i, textBlocks);
|
||||
classifyBlock(headlineClassificationService, (TextPageBlock) textBlock, surroundingBlocks, page, document, headlineFontSizes);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private List<AbstractPageBlock> getSurroundingBlocks(int originalIndex, List<AbstractPageBlock> textBlocks) {
|
||||
|
||||
int start = Math.max(originalIndex - SURROUNDING_BLOCKS_RADIUS, 0);
|
||||
int end = Math.min(originalIndex + SURROUNDING_BLOCKS_RADIUS, textBlocks.size());
|
||||
List<AbstractPageBlock> surroundingBlocks = new ArrayList<>(2 * SURROUNDING_BLOCKS_RADIUS);
|
||||
for (int i = start; i < end; i++) {
|
||||
if (i == originalIndex) {
|
||||
continue;
|
||||
}
|
||||
if (textBlocks.get(i).getText().length() <= 1) {
|
||||
continue;
|
||||
}
|
||||
surroundingBlocks.add(textBlocks.get(i));
|
||||
}
|
||||
return surroundingBlocks;
|
||||
}
|
||||
|
||||
|
||||
private void classifyBlock(HeadlineClassificationService headlineClassificationService,
|
||||
TextPageBlock textBlock,
|
||||
List<AbstractPageBlock> surroundingBlocks,
|
||||
ClassificationPage page,
|
||||
int currentIndex,
|
||||
List<AbstractBlockOnPage> allBlocks,
|
||||
ClassificationDocument document,
|
||||
List<Double> headlineFontSizes) {
|
||||
|
||||
TextPageBlock textBlock;
|
||||
if (allBlocks.get(currentIndex).block() instanceof TextPageBlock block) {
|
||||
textBlock = block;
|
||||
} else {
|
||||
return;
|
||||
}
|
||||
ClassificationPage page = allBlocks.get(currentIndex).page();
|
||||
List<AbstractPageBlock> surroundingBlocks = getSurroundingBlocksOnPage(currentIndex, allBlocks);
|
||||
|
||||
log.debug("headlineFontSizes: {}", headlineFontSizes);
|
||||
var bodyTextFrame = page.getBodyTextFrame();
|
||||
|
||||
@ -113,6 +93,9 @@ public class DocuMineClassificationService {
|
||||
|
||||
boolean enoughChars = charCount > textBlock.getText().length() * 0.5;
|
||||
|
||||
List<ListIdentifier> listIdentifiers = listItemClassificationService.findConfirmedListIdentifiers(currentIndex, allBlocks);
|
||||
document.getLayoutDebugLayer().addListIdentifiers(listIdentifiers);
|
||||
|
||||
if (textBlock.getClassification() != null && textBlock.getClassification().isHeadline()) {
|
||||
headlineClassificationService.setLastHeadlineFromOutline(textBlock);
|
||||
return;
|
||||
@ -126,26 +109,21 @@ public class DocuMineClassificationService {
|
||||
textBlock.setClassification(PageBlockType.PARAGRAPH);
|
||||
return;
|
||||
}
|
||||
if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter().getMostPopular()) > 2.5
|
||||
&& textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks().size() == 1)) {
|
||||
if (!Pattern.matches("[0-9]+", textBlock.toString())) {
|
||||
textBlock.setClassification(PageBlockType.TITLE);
|
||||
}
|
||||
} else if (textBlock.getText().length() > 5
|
||||
&& greaterOrEqualFontThanDocumentAverage(textBlock, document)
|
||||
&& PositionUtils.getApproxLineCount(textBlock) < 5.9
|
||||
&& ((textBlock.getMostPopularWordStyle().contains("bold") || textBlock.isUnderlined())//
|
||||
&& Character.isDigit(textBlock.toString().charAt(0)) //
|
||||
&& isAtLeast3Characters //
|
||||
&& !textBlock.toString().contains(":") //
|
||||
|| textBlock.toString().startsWith("APPENDIX") //
|
||||
|| textBlock.toString().startsWith("FIGURE") //
|
||||
|| textBlock.toString().startsWith("Continued TABLE") //
|
||||
|| textBlock.toString().startsWith("TABLE"))
|
||||
&& !textBlock.toString().endsWith(":")
|
||||
&& isAtLeast3Characters
|
||||
&& !isAmount
|
||||
&& enoughChars) {
|
||||
if (textBlock.getText().length() > 5
|
||||
&& greaterOrEqualFontThanDocumentAverage(textBlock, document)
|
||||
&& PositionUtils.getApproxLineCount(textBlock) < 5.9
|
||||
&& ((textBlock.getMostPopularWordStyle().contains("bold") || textBlock.isUnderlined())//
|
||||
&& Character.isDigit(textBlock.toString().charAt(0)) //
|
||||
&& isAtLeast3Characters //
|
||||
&& !textBlock.toString().contains(":") //
|
||||
|| textBlock.toString().startsWith("APPENDIX") //
|
||||
|| textBlock.toString().startsWith("FIGURE") //
|
||||
|| textBlock.toString().startsWith("Continued TABLE") //
|
||||
|| textBlock.toString().startsWith("TABLE"))
|
||||
&& !textBlock.toString().endsWith(":")
|
||||
&& isAtLeast3Characters
|
||||
&& !isAmount
|
||||
&& enoughChars) {
|
||||
|
||||
setAsHeadline(headlineClassificationService, textBlock, document, headlineFontSizes);
|
||||
} else if (isAllCaps(textBlock)
|
||||
@ -170,11 +148,14 @@ public class DocuMineClassificationService {
|
||||
} else if (hasSeparation(textBlock, surroundingBlocks)//
|
||||
&& greaterOrEqualFontThanPageAverage(textBlock, page)//
|
||||
&& PositionUtils.getApproxLineCount(textBlock) < 2.9//
|
||||
&& (tableOrFigureMatcher.reset().find() || headlineWithSingleIdentifierMatcher.reset().find())//
|
||||
&& (tableOrFigureMatcher.reset().find() || (headlineWithSingleIdentifierMatcher.reset().find() && listIdentifiers.isEmpty())) //
|
||||
&& !isAmount//
|
||||
&& !headlineWithSlashesMatches) {
|
||||
|
||||
setAsHeadline(headlineClassificationService, textBlock, document, headlineFontSizes);
|
||||
} else if (!listIdentifiers.isEmpty()) {
|
||||
|
||||
textBlock.setClassification(PageBlockType.LIST_ITEM);
|
||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)
|
||||
&& textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()
|
||||
&& textBlock.getMostPopularWordStyle().equals("bold")
|
||||
@ -264,6 +245,92 @@ public class DocuMineClassificationService {
|
||||
document.setHeadlines(true);
|
||||
}
|
||||
|
||||
|
||||
private List<AbstractBlockOnPage> buildBlocksPerPage(ClassificationDocument document) {
|
||||
|
||||
List<AbstractBlockOnPage> blocks = new ArrayList<>();
|
||||
for (ClassificationPage page : document.getPages()) {
|
||||
for (AbstractPageBlock abstractPageBlock : page.getTextBlocks()) {
|
||||
if (abstractPageBlock instanceof TextPageBlock textBlock) {
|
||||
if (textBlock.getClassification() != null && (textBlock.getClassification().equals(PageBlockType.HEADER) //
|
||||
|| textBlock.getClassification().equals(PageBlockType.FOOTER))) {
|
||||
continue;
|
||||
}
|
||||
blocks.add(new AbstractBlockOnPage(textBlock, page));
|
||||
}
|
||||
}
|
||||
}
|
||||
return blocks;
|
||||
}
|
||||
|
||||
|
||||
private static List<Double> buildHeadlineFontSizes(ClassificationDocument document) {
|
||||
|
||||
if (document.getFontSizeCounter().getCountPerValue().size() <= 6) {
|
||||
return document.getFontSizeCounter().getValuesInReverseOrder();
|
||||
}
|
||||
|
||||
List<Map.Entry<Double, Integer>> sortedEntries = new ArrayList<>(document.getFontSizeCounter().getCountPerValue().entrySet());
|
||||
sortedEntries.sort(Map.Entry.comparingByKey());
|
||||
|
||||
int totalCount = sortedEntries.stream()
|
||||
.mapToInt(Map.Entry::getValue).sum();
|
||||
|
||||
int cumulativeCount = 0;
|
||||
Iterator<Map.Entry<Double, Integer>> iterator = sortedEntries.iterator();
|
||||
while (iterator.hasNext()) {
|
||||
Map.Entry<Double, Integer> entry = iterator.next();
|
||||
cumulativeCount += entry.getValue();
|
||||
if (cumulativeCount > totalCount * 0.3) {
|
||||
break; // We've filtered the bottom 30%, so stop.
|
||||
}
|
||||
iterator.remove();
|
||||
}
|
||||
|
||||
if (sortedEntries.size() < 6) {
|
||||
return document.getFontSizeCounter().getValuesInReverseOrder();
|
||||
}
|
||||
int clusterSize = Math.max(1, sortedEntries.size() / 6);
|
||||
|
||||
List<List<Double>> clusters = new ArrayList<>();
|
||||
for (int i = 0; i < 6; i++) {
|
||||
clusters.add(new ArrayList<>());
|
||||
}
|
||||
|
||||
for (int i = 0; i < sortedEntries.size(); i++) {
|
||||
int clusterIndex = Math.min(i / clusterSize, 5);
|
||||
clusters.get(clusterIndex).add(sortedEntries.get(i).getKey());
|
||||
}
|
||||
|
||||
return clusters.stream()
|
||||
.map(cluster -> cluster.stream()
|
||||
.mapToDouble(d -> d).average()
|
||||
.orElseThrow())
|
||||
.sorted(Comparator.reverseOrder())
|
||||
.toList();
|
||||
}
|
||||
|
||||
|
||||
private List<AbstractPageBlock> getSurroundingBlocksOnPage(int originalIndex, List<AbstractBlockOnPage> textBlocks) {
|
||||
|
||||
int start = Math.max(originalIndex - SURROUNDING_BLOCKS_RADIUS, 0);
|
||||
int end = Math.min(originalIndex + SURROUNDING_BLOCKS_RADIUS, textBlocks.size());
|
||||
List<AbstractPageBlock> surroundingBlocks = new ArrayList<>(2 * SURROUNDING_BLOCKS_RADIUS);
|
||||
for (int i = start; i < end; i++) {
|
||||
if (i == originalIndex) {
|
||||
continue;
|
||||
}
|
||||
if (textBlocks.get(i).block().getText().length() <= 1) {
|
||||
continue;
|
||||
}
|
||||
if (!textBlocks.get(i).page().equals(textBlocks.get(originalIndex).page())) {
|
||||
continue;
|
||||
}
|
||||
surroundingBlocks.add(textBlocks.get(i).block());
|
||||
}
|
||||
return surroundingBlocks;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -73,13 +73,18 @@ public class HeadlineClassificationService {
|
||||
|
||||
public static PageBlockType headlineClassByFontSize(TextPageBlock textBlock, List<Double> fontSizeGroups) {
|
||||
|
||||
PageBlockType headlineType = PageBlockType.H1;
|
||||
for (int i = 1; i <= fontSizeGroups.size(); i++) {
|
||||
if (textBlock.getMostPopularWordFontSize() == fontSizeGroups.get(i - 1)) {
|
||||
headlineType = PageBlockType.getHeadlineType(i);
|
||||
List<Double> distances = fontSizeGroups.stream()
|
||||
.map(fontSize -> Math.abs(fontSize - textBlock.getMostPopularWordFontSize()))
|
||||
.toList();
|
||||
double min = Double.MAX_VALUE;
|
||||
int argMin = -1;
|
||||
for (int i = 0; i < distances.size(); i++) {
|
||||
if (distances.get(i) < min) {
|
||||
min = distances.get(i);
|
||||
argMin = i;
|
||||
}
|
||||
}
|
||||
return headlineType;
|
||||
return PageBlockType.getHeadlineType(argMin);
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,99 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.classification;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.AbstractBlockOnPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.ListIdentifier;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
|
||||
@Service
|
||||
public class ListItemClassificationService {
|
||||
|
||||
public static final int LIST_IDENTIFIER_SEARCH_RADIUS = 3;
|
||||
|
||||
|
||||
public List<ListIdentifier> findConfirmedListIdentifiers(int currentIndex, List<AbstractBlockOnPage> allBlocks) {
|
||||
|
||||
List<ListIdentifier> listIdentifiers = extractListIdentifiers(allBlocks.get(currentIndex));
|
||||
if (listIdentifiers.isEmpty()) {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
if (listIdentifiers.size() > 1 && ListIdentifier.isInOrder(listIdentifiers)) {
|
||||
return listIdentifiers;
|
||||
}
|
||||
|
||||
int start = Math.max(0, currentIndex - LIST_IDENTIFIER_SEARCH_RADIUS);
|
||||
int end = Math.min(allBlocks.size(), currentIndex + LIST_IDENTIFIER_SEARCH_RADIUS);
|
||||
|
||||
List<ListIdentifier> identifiersBehind = new ArrayList<>();
|
||||
if (start < currentIndex) {
|
||||
identifiersBehind.addAll(allBlocks.subList(start, currentIndex)
|
||||
.stream()
|
||||
.map(this::extractListIdentifiers)
|
||||
.flatMap(Collection::stream)
|
||||
.toList());
|
||||
}
|
||||
if (!identifiersBehind.isEmpty()) {
|
||||
listIdentifiers.add(0, identifiersBehind.get(identifiersBehind.size() - 1));
|
||||
if (ListIdentifier.isInOrder(listIdentifiers)) {
|
||||
return listIdentifiers;
|
||||
}
|
||||
listIdentifiers.remove(0);
|
||||
}
|
||||
List<ListIdentifier> identifiersAhead = new ArrayList<>();
|
||||
if (currentIndex + 1 < end) {
|
||||
identifiersAhead.addAll(allBlocks.subList(currentIndex + 1, end)
|
||||
.stream()
|
||||
.map(this::extractListIdentifiers)
|
||||
.flatMap(Collection::stream)
|
||||
.toList());
|
||||
}
|
||||
if (!identifiersAhead.isEmpty()) {
|
||||
listIdentifiers.add(identifiersAhead.get(0));
|
||||
if (ListIdentifier.isInOrder(listIdentifiers)) {
|
||||
return listIdentifiers;
|
||||
}
|
||||
listIdentifiers.remove(listIdentifiers.size() - 1);
|
||||
}
|
||||
return Collections.emptyList();
|
||||
|
||||
}
|
||||
|
||||
|
||||
private List<ListIdentifier> extractListIdentifiers(AbstractBlockOnPage block) {
|
||||
|
||||
List<ListIdentifier> result = new LinkedList<>();
|
||||
if (block.block() instanceof TextPageBlock textBlock) {
|
||||
List<TextPositionSequence> sequences = textBlock.getSequences();
|
||||
for (int i = 0; i < sequences.size(); i++) {
|
||||
|
||||
if (i != 0 && sequences.get(i - 1).getXDirAdj() < sequences.get(i).getXDirAdj()) {
|
||||
// is not the start of a line, continue
|
||||
continue;
|
||||
}
|
||||
|
||||
TextPositionSequence sequence = sequences.get(i);
|
||||
List<TextPositionSequence> wordsAtStartOfLine = new ArrayList<>(3);
|
||||
int end = Math.min(sequences.size(), i + 3);
|
||||
for (int j = i; j < end; j++) {
|
||||
if (sequences.get(j).intersectsYDirAdj(sequence, 2)) {
|
||||
wordsAtStartOfLine.add(sequences.get(j));
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
ListIdentifier.parse(wordsAtStartOfLine, block.page().getPageNumber()).ifPresent(result::add);
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
}
|
||||
@ -86,7 +86,9 @@ public class RedactManagerClassificationService {
|
||||
&& textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks().size() == 1)) {
|
||||
|
||||
if (!Pattern.matches("[0-9]+", textBlock.toString())) {
|
||||
textBlock.setClassification(PageBlockType.TITLE);
|
||||
PageBlockType headlineType = HeadlineClassificationService.headlineClassByFontSize(textBlock, headlineFontSizes);
|
||||
headlineClassificationService.classifyHeadline(textBlock, headlineType);
|
||||
document.setHeadlines(true);
|
||||
}
|
||||
} else if (textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular()
|
||||
&& PositionUtils.getApproxLineCount(textBlock) < 4.9
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.classification;
|
||||
|
||||
import static com.knecon.fforesight.service.layoutparser.processor.services.classification.DocuMineClassificationService.AMOUNT_PATTERN;
|
||||
import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.AMOUNT_PATTERN;
|
||||
import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.NUMERIC;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
@ -13,7 +14,6 @@ import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
@ -42,8 +42,6 @@ public class TableOfContentsClassificationService {
|
||||
public static final int INTERSECTION_TOLERANCE = 2; // equality threshold for x intersection
|
||||
public static final int DENSITY_THRESHOLD_COUNT = 10; // describes the minimum density, at least this many entries per page height are required
|
||||
|
||||
private static final Pattern NUMERIC = Pattern.compile("[0-9]+");
|
||||
|
||||
|
||||
@SuppressWarnings("checkstyle:ModifiedControlVariable")
|
||||
public void classifyTableOfContents(ClassificationDocument document) {
|
||||
@ -57,11 +55,13 @@ public class TableOfContentsClassificationService {
|
||||
continue;
|
||||
}
|
||||
|
||||
int offset = identifyTOCItems(i + 1, textBlocks, document);
|
||||
int end = identifyTOCItems(i + 1, textBlocks, document);
|
||||
|
||||
if (offset > 1) {
|
||||
textBlock.textBlock().setClassification(PageBlockType.H1);
|
||||
i += offset;
|
||||
if (end > i + 1) {
|
||||
if (textBlock.textBlock().getClassification() == null) {
|
||||
textBlock.textBlock().setClassification(PageBlockType.H1);
|
||||
}
|
||||
i = end;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -237,7 +237,7 @@ public class TableOfContentsClassificationService {
|
||||
|| textBlock.getClassification().equals(PageBlockType.FOOTER))) {
|
||||
continue;
|
||||
}
|
||||
blocks.add(new TextBlockOnPage(page, textBlock));
|
||||
blocks.add(new TextBlockOnPage(textBlock, page));
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -352,7 +352,7 @@ public class TableOfContentsClassificationService {
|
||||
return false;
|
||||
}
|
||||
|
||||
int prev = getNumberAsInt(numbers, i);
|
||||
int prev = getNumberAsInt(numbers, i - 1);
|
||||
int curr = getNumberAsInt(numbers, i);
|
||||
int next = getNumberAsInt(numbers, i + 1);
|
||||
|
||||
|
||||
@ -5,6 +5,7 @@ import java.awt.geom.Point2D;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.pdfbox.contentstream.operator.Operator;
|
||||
import org.apache.pdfbox.contentstream.operator.OperatorName;
|
||||
@ -50,6 +51,7 @@ import lombok.extern.slf4j.Slf4j;
|
||||
@Slf4j
|
||||
public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
|
||||
// Characters accepted as a "dot" when looking for dotted leader lines.
// Set.of rejects duplicate elements with an IllegalArgumentException, so look-alike code points are spelled as
// unicode escapes to keep them distinct from "." and the katakana middle dot regardless of file encoding.
// NOTE(review): the two escaped entries are presumably the fullwidth full stop (U+FF0E) and the halfwidth
// katakana middle dot (U+FF65); confirm against the intended character list.
private static final Set<String> DOT_LIKE_CHARACTERS = Set.of(".", "·", "•", "․", "‧", "∙", "⋅", "・", "\uFF0E", "\uFF65", "…", "⸱", "﹒", "ꞏ");
|
||||
private final List<TextPositionSequence> textPositionSequences = new ArrayList<>();
|
||||
private final List<Ruling> rulings = new ArrayList<>();
|
||||
private final List<Ruling> graphicsPath = new ArrayList<>();
|
||||
@ -336,20 +338,32 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
private boolean isWordFollowedByDottedLine(List<TextPosition> textPositions, int i, int startIndex) {
|
||||
|
||||
return i - startIndex >= 4 //
|
||||
&& textPositions.get(i).getUnicode().equals(".") //
|
||||
&& textPositions.get(i - 1).getUnicode().equals(".") //
|
||||
&& textPositions.get(i - 2).getUnicode().equals(".") //
|
||||
&& !textPositions.get(i - 3).getUnicode().equals(".");
|
||||
&& isDot(textPositions, i) //
|
||||
&& isDot(textPositions, i - 1) //
|
||||
&& isDot(textPositions, i - 2) //
|
||||
&& alphanumeric(textPositions, i - 3);
|
||||
}
|
||||
|
||||
|
||||
private static boolean isDottedLineFollowedByWord(List<TextPosition> textPositions, int i, int startIndex) {
|
||||
|
||||
return i - startIndex >= 4 //
|
||||
&& !textPositions.get(i).getUnicode().equals(".") //
|
||||
&& textPositions.get(i - 1).getUnicode().equals(".") //
|
||||
&& textPositions.get(i - 2).getUnicode().equals(".") //
|
||||
&& textPositions.get(i - 3).getUnicode().equals(".");
|
||||
&& alphanumeric(textPositions, i) //
|
||||
&& isDot(textPositions, i - 1) //
|
||||
&& isDot(textPositions, i - 2) //
|
||||
&& isDot(textPositions, i - 3);
|
||||
}
|
||||
|
||||
|
||||
private static boolean isDot(List<TextPosition> textPositions, int i) {
|
||||
|
||||
return DOT_LIKE_CHARACTERS.contains(textPositions.get(i).getUnicode());
|
||||
}
|
||||
|
||||
|
||||
private static boolean alphanumeric(List<TextPosition> textPositions, int i) {
|
||||
|
||||
return Character.isAlphabetic(textPositions.get(i).getUnicode().charAt(0)) || Character.isDigit(textPositions.get(i).getUnicode().charAt(0));
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -26,6 +26,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.textbloc
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.ListIdentifier;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
||||
@ -80,7 +81,7 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
|
||||
visualizationsOnPage.getColoredLines()
|
||||
.addAll(cleanRulings.buildAll()
|
||||
.stream()
|
||||
.map(ruling -> new ColoredLine(ruling, decideOnRulingColor(ruling), 0.5f))
|
||||
.map(ruling -> new ColoredLine(ruling, decideOnRulingColor(ruling), LINE_WIDTH))
|
||||
.toList());
|
||||
}
|
||||
|
||||
@ -93,7 +94,7 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
|
||||
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(pageNumber, this.rulings);
|
||||
visualizationsOnPage.getColoredLines()
|
||||
.addAll(rulings.stream()
|
||||
.map(ruling -> new ColoredLine(ruling, decideOnRulingColor(ruling), 0.5f))
|
||||
.map(ruling -> new ColoredLine(ruling, decideOnRulingColor(ruling), LINE_WIDTH))
|
||||
.toList());
|
||||
}
|
||||
|
||||
@ -182,7 +183,7 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
|
||||
visualizationsOnPage.getColoredRectangles()
|
||||
.addAll(lines.stream()
|
||||
.map(BoundingBox::getBBoxPdf)
|
||||
.map(line -> new ColoredRectangle(line, LINES_COLOR, 0.5f))
|
||||
.map(line -> new ColoredRectangle(line, LINES_COLOR, LINE_WIDTH))
|
||||
.toList());
|
||||
}
|
||||
|
||||
@ -198,7 +199,7 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
|
||||
.map(line -> line.stream()
|
||||
.map(BoundingBox::getBBoxPdf)
|
||||
.collect(RectangleTransformations.collectBBox()))
|
||||
.map(line -> new ColoredRectangle(line, LINES_COLOR, 0.5f))
|
||||
.map(line -> new ColoredRectangle(line, LINES_COLOR, LINE_WIDTH))
|
||||
.toList());
|
||||
|
||||
}
|
||||
@ -300,12 +301,12 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
|
||||
visualizationsOnPage.getColoredRectangles()
|
||||
.addAll(numbers.stream()
|
||||
.map(BoundingBox::getBBoxPdf)
|
||||
.map(line -> new ColoredRectangle(line, LINES_COLOR, 0.5f))
|
||||
.map(line -> new ColoredRectangle(line, LINES_COLOR, LINE_WIDTH))
|
||||
.toList());
|
||||
visualizationsOnPage.getColoredRectangles()
|
||||
.add(new ColoredRectangle(numbers.stream()
|
||||
.map(BoundingBox::getBBoxPdf)
|
||||
.collect(RectangleTransformations.collectBBox()), LINES_COLOR, 0.5f));
|
||||
.collect(RectangleTransformations.collectBBox()), LINES_COLOR, LINE_WIDTH));
|
||||
}
|
||||
|
||||
|
||||
@ -351,4 +352,13 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
|
||||
visualizationsOnPage.getPlacedTexts().add(PlacedText.textFacingUp(outlineObject.getTitle(), textPoint, 10, outlineObject.isFound() ? Color.BLACK : Color.RED, FONT));
|
||||
}
|
||||
|
||||
|
||||
public void addListIdentifiers(List<ListIdentifier> listIdentifiers) {
|
||||
|
||||
for (ListIdentifier listIdentifier : listIdentifiers) {
|
||||
getOrCreateVisualizationsOnPage(listIdentifier.getPage(), this.listIdentifiers).getColoredRectangles()
|
||||
.add(new ColoredRectangle(listIdentifier.getWord().getBBoxPdf(), WORDS_COLOR, LINE_WIDTH));
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -37,7 +37,7 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
|
||||
@Disabled
|
||||
public void testLayoutParserEndToEnd() {
|
||||
|
||||
String filePath = "/home/kschuettler/Dokumente/TestFiles/NER Dataset/Syngenta prod/77c680315c31d403d2e023be023b2087.PREVIEW.pdf";
|
||||
String filePath = "/home/kschuettler/Dokumente/LayoutparsingEvaluation/VV-340050.pdf";
|
||||
|
||||
runForFile(filePath);
|
||||
}
|
||||
|
||||
@ -79,7 +79,7 @@ public class OutlineDetectionTest extends AbstractTest {
|
||||
var documentFile = new ClassPathResource(fileName).getFile();
|
||||
|
||||
long start = System.currentTimeMillis();
|
||||
ClassificationDocument classificationDocument = parseLayout(fileName, LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH);
|
||||
ClassificationDocument classificationDocument = parseLayout(fileName, LayoutParsingType.DOCUMINE_OLD);
|
||||
Document document = buildGraph(fileName, classificationDocument);
|
||||
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true);
|
||||
OutlineObjectTree outlineObjectTree = classificationDocument.getOutlineObjectTree();
|
||||
@ -102,7 +102,7 @@ public class OutlineDetectionTest extends AbstractTest {
|
||||
|
||||
TableOfContents tableOfContents = classificationDocument.getTableOfContents();
|
||||
|
||||
assertEquals(tableOfContents.getMainSections().size(), 10);
|
||||
assertEquals(tableOfContents.getMainSections().size(), 9);
|
||||
assertEquals(tableOfContents.getMainSections().subList(1, 9)
|
||||
.stream()
|
||||
.map(tableOfContentItem -> sanitizeString(tableOfContentItem.getHeadline().toString()))
|
||||
@ -135,7 +135,7 @@ public class OutlineDetectionTest extends AbstractTest {
|
||||
|
||||
List<SemanticNode> childrenOfTypeSectionOrSuperSection = document.getChildrenOfTypeSectionOrSuperSection();
|
||||
|
||||
assertEquals(childrenOfTypeSectionOrSuperSection.size(), 10);
|
||||
assertEquals(childrenOfTypeSectionOrSuperSection.size(), 9);
|
||||
assertEquals(childrenOfTypeSectionOrSuperSection.subList(1, 9)
|
||||
.stream()
|
||||
.map(section -> sanitizeString(section.getHeadline().getLeafTextBlock().toString()))
|
||||
|
||||
@ -57,6 +57,7 @@ public record LayerIdentifier(String name, String markedContentName) {
|
||||
public static final LayerIdentifier OUTLINE_OBJECTS = new LayerIdentifier("Outline Positions", "OUTLINE_OBJECTS");
|
||||
public static final LayerIdentifier SENTENCES = new LayerIdentifier("Sentences", "SENTENCES");
|
||||
public static final LayerIdentifier TOC_PAGES = new LayerIdentifier("TOC pages", "TOC_PAGES");
|
||||
public static final LayerIdentifier LIST_IDENTIFIERS = new LayerIdentifier("List identifiers", "LIST_IDENTIFIERS");
|
||||
|
||||
// Visual layout parser
|
||||
public static final LayerIdentifier KNECON_VISUAL_PARSING = new LayerIdentifier("Visual Layout Parser", "VISUAL_PARSING");
|
||||
|
||||
@ -18,6 +18,8 @@ public class LayoutDebugLayerConfig extends AbstractLayerGroup {
|
||||
|
||||
protected static final Standard14EmbeddableFont FONT = Standard14EmbeddableFont.helvetica();
|
||||
|
||||
public static final float LINE_WIDTH = 0.5f;
|
||||
|
||||
protected static final Color WORDS_COLOR = new Color(68, 84, 147);
|
||||
protected static final Color LINES_COLOR = new Color(152, 45, 179);
|
||||
protected static final Color ZONES_COLOR = new Color(131, 38, 38);
|
||||
@ -57,6 +59,7 @@ public class LayoutDebugLayerConfig extends AbstractLayerGroup {
|
||||
protected final Visualizations outlineObjects = Visualizations.builder().layer(LayerIdentifier.OUTLINE_OBJECTS).build();
|
||||
protected final Visualizations sentences = Visualizations.builder().layer(LayerIdentifier.SENTENCES).build();
|
||||
protected final Visualizations tocPages = Visualizations.builder().layer(LayerIdentifier.TOC_PAGES).build();
|
||||
protected final Visualizations listIdentifiers = Visualizations.builder().layer(LayerIdentifier.LIST_IDENTIFIERS).build();
|
||||
|
||||
|
||||
public List<Visualizations> getVisualizations() {
|
||||
@ -73,7 +76,10 @@ public class LayoutDebugLayerConfig extends AbstractLayerGroup {
|
||||
mainBody, //
|
||||
markedContent, //
|
||||
outlineObjects, //
|
||||
tocPages);
|
||||
tocPages, //
|
||||
listIdentifiers //
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user