diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java index 57a0a4a..906b71e 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java @@ -374,14 +374,7 @@ public class LayoutParsingPipeline { classificationService.classify(classificationDocument, layoutParsingType, identifier); - List headlines = classificationDocument.getPages() - .stream() - .flatMap(classificationPage -> classificationPage.getTextBlocks() - .stream() - .filter(tb -> tb instanceof TextPageBlock && tb.getClassification() != null && tb.getClassification().isHeadline()) - .map(tb -> (TextPageBlock) tb)) - .toList(); - TableOfContents tableOfContents = outlineValidationService.createToC(headlines); + TableOfContents tableOfContents = outlineValidationService.createToC(classificationDocument); classificationDocument.setTableOfContents(tableOfContents); log.info("Building Sections for {}", identifier); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/TextBoundingBox.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/TextBoundingBox.java index f66bdbb..842cc7b 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/TextBoundingBox.java +++ 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/TextBoundingBox.java @@ -165,4 +165,16 @@ public abstract class TextBoundingBox extends BoundingBox { return this.getYDirAdj() - threshold <= other.getMaxYDirAdj() && this.getMaxYDirAdj() + threshold >= other.getYDirAdj(); } + + public boolean isAboveDirAdj(TextBoundingBox other) { + + return other.isBelowDirAdj(this); // mirror of isBelowDirAdj; previously delegated to isBelow(...) + } + + + public boolean isBelowDirAdj(TextBoundingBox other) { + + return this.intersectsXDirAdj(other) && this.getYDirAdj() >= other.getMaxYDirAdj(); + } + } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/PageBlockType.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/PageBlockType.java index 8a5619f..d893ca2 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/PageBlockType.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/PageBlockType.java @@ -9,13 +9,13 @@ public enum PageBlockType { H6, HEADER, FOOTER, - TITLE, PARAGRAPH, PARAGRAPH_BOLD, PARAGRAPH_ITALIC, PARAGRAPH_UNKNOWN, OTHER, TABLE_OF_CONTENTS_ITEM, + LIST_ITEM, TABLE; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineValidationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineValidationService.java index 1c8c2bf..e40ad72 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineValidationService.java +++ 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineValidationService.java @@ -10,6 +10,7 @@ import java.util.TreeSet; import org.springframework.stereotype.Service; +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import io.micrometer.observation.annotation.Observed; @@ -20,7 +21,9 @@ import lombok.extern.slf4j.Slf4j; public class OutlineValidationService { @Observed(name = "OutlineValidationService", contextualName = "create-toc") - public TableOfContents createToC(List headlines) { + public TableOfContents createToC(ClassificationDocument classificationDocument) { + + List headlines = extractHeadlines(classificationDocument); List mainSections = new ArrayList<>(); Map lastItemsPerDepth = new HashMap<>(); @@ -60,4 +63,16 @@ public class OutlineValidationService { return new TableOfContents(mainSections); } + + private static List extractHeadlines(ClassificationDocument classificationDocument) { + + return classificationDocument.getPages() + .stream() + .flatMap(classificationPage -> classificationPage.getTextBlocks() + .stream() + .filter(tb -> tb instanceof TextPageBlock && tb.getClassification() != null && tb.getClassification().isHeadline()) + .map(tb -> (TextPageBlock) tb)) + .toList(); + } + } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/AbstractBlockOnPage.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/AbstractBlockOnPage.java new file mode 100644 index 0000000..8461db5 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/AbstractBlockOnPage.java @@ -0,0 +1,8 @@ 
+package com.knecon.fforesight.service.layoutparser.processor.model.text; + +import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; + +public record AbstractBlockOnPage(AbstractPageBlock block, ClassificationPage page) { + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/ListIdentifier.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/ListIdentifier.java new file mode 100644 index 0000000..0cf496b --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/ListIdentifier.java @@ -0,0 +1,85 @@ +package com.knecon.fforesight.service.layoutparser.processor.model.text; + +import java.util.List; +import java.util.Optional; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import java.util.stream.Collectors; + +import lombok.AccessLevel; +import lombok.AllArgsConstructor; +import lombok.Getter; +import lombok.experimental.FieldDefaults; + +@AllArgsConstructor +@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) +public class ListIdentifier { + + public static final Pattern STARTING_NUMBERS = Pattern.compile("^\\s*([1-9][0-9]{0,8})\\.\\s+"); // accepts 10, 20, 100, ...; at most 9 digits so the captured number always fits an int + + enum Format { + NUMBERS + } + + Format format; + @Getter + TextPositionSequence word; + @Getter + int page; + int representation; + + + public static Optional parse(TextPageBlock textPageBlock, int page) { + + return parse(textPageBlock.getSequences().subList(0, Math.min(5, textPageBlock.getSequences().size())), page); + } + + + public static Optional parse(List sequences, int page) { + + StringBuilder sb = new StringBuilder(); + for (TextPositionSequence sequence : sequences) { + sb.append(sequence.toString()); + sb.append(" "); 
+ } + sb.replace(sb.length() - 1, sb.length(), ""); + String text = sb.toString(); + + Matcher numberMatcher = STARTING_NUMBERS.matcher(text); + + if (numberMatcher.find()) { + return Optional.of(new ListIdentifier(Format.NUMBERS, sequences.get(0), page, Integer.parseInt(numberMatcher.group(1)))); + } + return Optional.empty(); + } + + + public static boolean isInOrder(List listIdentifiers) { + + if (listIdentifiers.size() <= 1) { + return true; + } + + for (int i = 1; i < listIdentifiers.size(); i++) { + ListIdentifier current = listIdentifiers.get(i); + ListIdentifier previous = listIdentifiers.get(i - 1); + if (current.format != previous.format) { + return false; + } + if (current.representation <= previous.representation) { + return false; + } + if (!current.word.intersectsXDirAdj(previous.word, 2)) { + return false; + } + if (current.page == previous.page && !current.word.isBelowDirAdj(previous.word)) { + return false; + } + if (current.page < previous.page) { + return false; + } + } + return true; + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextBlockOnPage.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextBlockOnPage.java index f79127b..177e178 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextBlockOnPage.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextBlockOnPage.java @@ -2,6 +2,6 @@ package com.knecon.fforesight.service.layoutparser.processor.model.text; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; -public record TextBlockOnPage(ClassificationPage page, TextPageBlock textBlock) { +public record TextBlockOnPage(TextPageBlock 
textBlock, ClassificationPage page) { } \ No newline at end of file diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java index 7ddeb1e..9b7940b 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java @@ -33,8 +33,6 @@ public class TextPageBlock extends AbstractPageBlock { private boolean underlined; - private double highestFontSize; - private PageBlockType classification; private boolean toDuplicate; @@ -262,6 +260,12 @@ public class TextPageBlock extends AbstractPageBlock { } + public double getHighestFontSize() { + + return frequencyCounters.getFontSizeFrequencyCounter().getHighest(); + } + + @Override public boolean isEmpty() { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/BlockificationPostprocessingService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/BlockificationPostprocessingService.java index 2d4e700..ee1538a 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/BlockificationPostprocessingService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/BlockificationPostprocessingService.java @@ -18,6 +18,7 @@ import 
com.knecon.fforesight.service.layoutparser.processor.model.SectionIdentif import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities; import lombok.Data; @@ -467,7 +468,9 @@ public class BlockificationPostprocessingService { private static String sanitizeString(String text) { - return StringUtils.deleteWhitespace(text).toLowerCase(Locale.ROOT); + return TextNormalizationUtilities.removeAllWhitespaces(text)// + .trim() // sometimes there are trailing empty bytes at the end of the string trim() seems to remove them + .toLowerCase(Locale.ENGLISH); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/ClarifyndClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/ClarifyndClassificationService.java index b90fdf1..20ced9f 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/ClarifyndClassificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/ClarifyndClassificationService.java @@ -10,7 +10,6 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; -import 
com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils; import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils; import lombok.RequiredArgsConstructor; @@ -73,7 +72,9 @@ public class ClarifyndClassificationService { && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks().size() == 1)) { if (!Pattern.matches("[0-9]+", textBlock.toString())) { - textBlock.setClassification(PageBlockType.TITLE); + PageBlockType headlineType = HeadlineClassificationService.headlineClassByFontSize(textBlock, headlineFontSizes); + headlineClassificationService.classifyHeadline(textBlock, headlineType); + document.setHeadlines(true); } } else if (textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular() && PositionUtils.getApproxLineCount(textBlock) < 4.9 diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/ClassificationPatterns.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/ClassificationPatterns.java new file mode 100644 index 0000000..737096e --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/ClassificationPatterns.java @@ -0,0 +1,29 @@ +package com.knecon.fforesight.service.layoutparser.processor.services.classification; + +import java.util.regex.Pattern; + +public class ClassificationPatterns { + + public static final Pattern HEADLINE_WITH_2_IDENTIFER_PATTERN = Pattern.compile("^([1-9]\\d?\\.){1,3}\\d{1,2}\\.?\\s[a-z][0-9a-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE); + + public static final Pattern 
HEADLINE_WITH_SINGLE_IDENTIFER_PATTERN = Pattern.compile("^([0-9]\\.)\\s[a-z][0-9a-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE); + + public static final Pattern AT_LEAST_3_CHARS_PATTERN = Pattern.compile("\\p{L}{3,}", Pattern.CASE_INSENSITIVE); + + public static final Pattern HEADLINE_PATTERN_WITH_SLASHES = Pattern.compile("^(\\d{1,1}\\.){1,3}\\d{1,2}\\.?\\s[a-z]{1,2}\\/[a-z]{1,2}.*"); + + public static final Pattern AMOUNT_PATTERN = Pattern.compile( + "^\\s*\\d+(?:\\.\\d+)?\\s*(?:ml|l|g|kg|mg|cm|cm2|cm3|mm|mm2|mm3|km|km2|m|m2|m3|lb|oz|ppm|dpm|days|weeks|months|%|f)\\b", + Pattern.CASE_INSENSITIVE); + + + + public static final Pattern TABLE_OR_FIGURE_PATTERN = Pattern.compile( + "^\\s*(?:table|continued\\s+table|appendix|figure)\\s+(?:[xvi]+|[a-z0-9]{1,3}(?:\\.[0-9]{1,3})*(?:-[0-9]{1,3})?)\\b", + Pattern.CASE_INSENSITIVE); + + public static final Pattern ALPHANUMERIC = Pattern.compile("[a-zA-Z0-9]"); + + public static final Pattern NUMERIC = Pattern.compile("[0-9]+"); + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java index c11b96c..a072748 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java @@ -1,9 +1,19 @@ package com.knecon.fforesight.service.layoutparser.processor.services.classification; +import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.ALPHANUMERIC; +import 
static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.AMOUNT_PATTERN; +import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.AT_LEAST_3_CHARS_PATTERN; +import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.HEADLINE_PATTERN_WITH_SLASHES; +import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.HEADLINE_WITH_2_IDENTIFER_PATTERN; +import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.HEADLINE_WITH_SINGLE_IDENTIFER_PATTERN; +import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.TABLE_OR_FIGURE_PATTERN; + import java.util.ArrayList; +import java.util.Comparator; +import java.util.Iterator; import java.util.List; import java.util.Locale; -import java.util.Set; +import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -13,90 +23,60 @@ import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBl import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType; +import com.knecon.fforesight.service.layoutparser.processor.model.text.AbstractBlockOnPage; +import com.knecon.fforesight.service.layoutparser.processor.model.text.ListIdentifier; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; -import com.knecon.fforesight.service.layoutparser.processor.utils.HeaderFooterDetection; -import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils; import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils; +import 
lombok.AccessLevel; import lombok.RequiredArgsConstructor; +import lombok.experimental.FieldDefaults; import lombok.extern.slf4j.Slf4j; @Slf4j @Service @RequiredArgsConstructor +@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) public class DocuMineClassificationService { - private static final Pattern HEADLINE_WITH_2_IDENTIFER_PATTERN = Pattern.compile("^([1-9]\\d?\\.){1,3}\\d{1,2}\\.?\\s[a-z][0-9a-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE); - private static final Pattern HEADLINE_WITH_SINGLE_IDENTIFER_PATTERN = Pattern.compile("^([0-9]\\.)\\s[a-z][0-9a-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE); - private static final Pattern AT_LEAST_3_CHARS_PATTERN = Pattern.compile("\\p{L}{3,}", Pattern.CASE_INSENSITIVE); - private static final Pattern HEADLINE_PATTERN_WITH_SLASHES = Pattern.compile("^(\\d{1,1}\\.){1,3}\\d{1,2}\\.?\\s[a-z]{1,2}\\/[a-z]{1,2}.*"); - public static final Pattern AMOUNT_PATTERN = Pattern.compile("^\\s*\\d+(?:\\.\\d+)?\\s*(?:ml|l|g|kg|mg|cm|cm2|cm3|mm|mm2|mm3|km|km2|m|m2|m3|lb|oz|ppm|dpm|days|weeks|months|%|f)\\b", - Pattern.CASE_INSENSITIVE); - private static final Pattern TABLE_OR_FIGURE_PATTERN = Pattern.compile( - "^\\s*(?:table|continued\\s+table|appendix|figure)\\s+(?:[xvi]+|[a-z0-9]{1,3}(?:\\.[0-9]{1,3})*(?:-[0-9]{1,3})?)\\b", - Pattern.CASE_INSENSITIVE); - private static final Pattern ALPHANUMERIC = Pattern.compile("[a-zA-Z0-9]"); - public static final int SEPARATION_THRESHOLD = 10; // if the min distance between a textblock and all its surrounding blocks, the regexes can be more lenient. 
public static final int SURROUNDING_BLOCKS_RADIUS = 3; // number of surrounding blocks before and after the current textblock to be tested + ListItemClassificationService listItemClassificationService; + public void classifyDocument(ClassificationDocument document) { - List headlineFontSizes = document.getFontSizeCounter().getValuesInReverseOrder(); - - log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue()); + List headlineFontSizes = buildHeadlineFontSizes(document); + List blocks = buildBlocksPerPage(document); + log.debug("Headline FontSizes are: {}", headlineFontSizes); HeadlineClassificationService headlineClassificationService = new HeadlineClassificationService(); - for (ClassificationPage page : document.getPages()) { - document.getLayoutDebugLayer().addTextBlockVisualizations(page.getTextBlocks(), page.getPageNumber()); - classifyPage(headlineClassificationService, page, document, headlineFontSizes); + for (int i = 0; i < blocks.size(); i++) { + AbstractBlockOnPage block = blocks.get(i); + document.getLayoutDebugLayer().addTextBlockVisualizations(block.page().getTextBlocks(), block.page().getPageNumber()); + classifyBlock(headlineClassificationService, i, blocks, document, headlineFontSizes); } - } - - private void classifyPage(HeadlineClassificationService headlineClassificationService, - ClassificationPage page, - ClassificationDocument document, - List headlineFontSizes) { - - List textBlocks = page.getTextBlocks(); - for (int i = 0; i < textBlocks.size(); i++) { - AbstractPageBlock textBlock = textBlocks.get(i); - if (textBlock instanceof TextPageBlock) { - List surroundingBlocks = getSurroundingBlocks(i, textBlocks); - classifyBlock(headlineClassificationService, (TextPageBlock) textBlock, surroundingBlocks, page, document, headlineFontSizes); - } - } - } - - - private List getSurroundingBlocks(int originalIndex, List textBlocks) { - - int start = Math.max(originalIndex - SURROUNDING_BLOCKS_RADIUS, 0); - int 
end = Math.min(originalIndex + SURROUNDING_BLOCKS_RADIUS, textBlocks.size()); - List surroundingBlocks = new ArrayList<>(2 * SURROUNDING_BLOCKS_RADIUS); - for (int i = start; i < end; i++) { - if (i == originalIndex) { - continue; - } - if (textBlocks.get(i).getText().length() <= 1) { - continue; - } - surroundingBlocks.add(textBlocks.get(i)); - } - return surroundingBlocks; } private void classifyBlock(HeadlineClassificationService headlineClassificationService, - TextPageBlock textBlock, - List surroundingBlocks, - ClassificationPage page, + int currentIndex, + List allBlocks, ClassificationDocument document, List headlineFontSizes) { + TextPageBlock textBlock; + if (allBlocks.get(currentIndex).block() instanceof TextPageBlock block) { + textBlock = block; + } else { + return; + } + ClassificationPage page = allBlocks.get(currentIndex).page(); + List surroundingBlocks = getSurroundingBlocksOnPage(currentIndex, allBlocks); + log.debug("headlineFontSizes: {}", headlineFontSizes); var bodyTextFrame = page.getBodyTextFrame(); @@ -113,6 +93,9 @@ public class DocuMineClassificationService { boolean enoughChars = charCount > textBlock.getText().length() * 0.5; + List listIdentifiers = listItemClassificationService.findConfirmedListIdentifiers(currentIndex, allBlocks); + document.getLayoutDebugLayer().addListIdentifiers(listIdentifiers); + if (textBlock.getClassification() != null && textBlock.getClassification().isHeadline()) { headlineClassificationService.setLastHeadlineFromOutline(textBlock); return; @@ -126,26 +109,21 @@ public class DocuMineClassificationService { textBlock.setClassification(PageBlockType.PARAGRAPH); return; } - if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter().getMostPopular()) > 2.5 - && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks().size() == 1)) { - if (!Pattern.matches("[0-9]+", 
textBlock.toString())) { - textBlock.setClassification(PageBlockType.TITLE); - } - } else if (textBlock.getText().length() > 5 - && greaterOrEqualFontThanDocumentAverage(textBlock, document) - && PositionUtils.getApproxLineCount(textBlock) < 5.9 - && ((textBlock.getMostPopularWordStyle().contains("bold") || textBlock.isUnderlined())// - && Character.isDigit(textBlock.toString().charAt(0)) // - && isAtLeast3Characters // - && !textBlock.toString().contains(":") // - || textBlock.toString().startsWith("APPENDIX") // - || textBlock.toString().startsWith("FIGURE") // - || textBlock.toString().startsWith("Continued TABLE") // - || textBlock.toString().startsWith("TABLE")) - && !textBlock.toString().endsWith(":") - && isAtLeast3Characters - && !isAmount - && enoughChars) { + if (textBlock.getText().length() > 5 + && greaterOrEqualFontThanDocumentAverage(textBlock, document) + && PositionUtils.getApproxLineCount(textBlock) < 5.9 + && ((textBlock.getMostPopularWordStyle().contains("bold") || textBlock.isUnderlined())// + && Character.isDigit(textBlock.toString().charAt(0)) // + && isAtLeast3Characters // + && !textBlock.toString().contains(":") // + || textBlock.toString().startsWith("APPENDIX") // + || textBlock.toString().startsWith("FIGURE") // + || textBlock.toString().startsWith("Continued TABLE") // + || textBlock.toString().startsWith("TABLE")) + && !textBlock.toString().endsWith(":") + && isAtLeast3Characters + && !isAmount + && enoughChars) { setAsHeadline(headlineClassificationService, textBlock, document, headlineFontSizes); } else if (isAllCaps(textBlock) @@ -170,11 +148,14 @@ public class DocuMineClassificationService { } else if (hasSeparation(textBlock, surroundingBlocks)// && greaterOrEqualFontThanPageAverage(textBlock, page)// && PositionUtils.getApproxLineCount(textBlock) < 2.9// - && (tableOrFigureMatcher.reset().find() || headlineWithSingleIdentifierMatcher.reset().find())// + && (tableOrFigureMatcher.reset().find() || 
(headlineWithSingleIdentifierMatcher.reset().find() && listIdentifiers.isEmpty())) // && !isAmount// && !headlineWithSlashesMatches) { setAsHeadline(headlineClassificationService, textBlock, document, headlineFontSizes); + } else if (!listIdentifiers.isEmpty()) { + + textBlock.setClassification(PageBlockType.LIST_ITEM); } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular() && textBlock.getMostPopularWordStyle().equals("bold") @@ -264,6 +245,92 @@ public class DocuMineClassificationService { document.setHeadlines(true); } + + private List buildBlocksPerPage(ClassificationDocument document) { + + List blocks = new ArrayList<>(); + for (ClassificationPage page : document.getPages()) { + for (AbstractPageBlock abstractPageBlock : page.getTextBlocks()) { + if (abstractPageBlock instanceof TextPageBlock textBlock) { + if (textBlock.getClassification() != null && (textBlock.getClassification().equals(PageBlockType.HEADER) // + || textBlock.getClassification().equals(PageBlockType.FOOTER))) { + continue; + } + blocks.add(new AbstractBlockOnPage(textBlock, page)); + } + } + } + return blocks; + } + + + private static List buildHeadlineFontSizes(ClassificationDocument document) { + + if (document.getFontSizeCounter().getCountPerValue().size() <= 6) { + return document.getFontSizeCounter().getValuesInReverseOrder(); + } + + List> sortedEntries = new ArrayList<>(document.getFontSizeCounter().getCountPerValue().entrySet()); + sortedEntries.sort(Map.Entry.comparingByKey()); + + int totalCount = sortedEntries.stream() + .mapToInt(Map.Entry::getValue).sum(); + + int cumulativeCount = 0; + Iterator> iterator = sortedEntries.iterator(); + while (iterator.hasNext()) { + Map.Entry entry = iterator.next(); + cumulativeCount += entry.getValue(); + if (cumulativeCount > totalCount * 0.3) { + break; // We've filtered the bottom 30%, so stop. 
+ } + iterator.remove(); + } + + if (sortedEntries.size() < 6) { + return document.getFontSizeCounter().getValuesInReverseOrder(); + } + int clusterSize = Math.max(1, sortedEntries.size() / 6); + + List> clusters = new ArrayList<>(); + for (int i = 0; i < 6; i++) { + clusters.add(new ArrayList<>()); + } + + for (int i = 0; i < sortedEntries.size(); i++) { + int clusterIndex = Math.min(i / clusterSize, 5); + clusters.get(clusterIndex).add(sortedEntries.get(i).getKey()); + } + + return clusters.stream() + .map(cluster -> cluster.stream() + .mapToDouble(d -> d).average() + .orElseThrow()) + .sorted(Comparator.reverseOrder()) + .toList(); + } + + + private List getSurroundingBlocksOnPage(int originalIndex, List textBlocks) { + + int start = Math.max(originalIndex - SURROUNDING_BLOCKS_RADIUS, 0); + int end = Math.min(originalIndex + SURROUNDING_BLOCKS_RADIUS, textBlocks.size()); + List surroundingBlocks = new ArrayList<>(2 * SURROUNDING_BLOCKS_RADIUS); + for (int i = start; i < end; i++) { + if (i == originalIndex) { + continue; + } + if (textBlocks.get(i).block().getText().length() <= 1) { + continue; + } + if (!textBlocks.get(i).page().equals(textBlocks.get(originalIndex).page())) { + continue; + } + surroundingBlocks.add(textBlocks.get(i).block()); + } + return surroundingBlocks; + } + } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/HeadlineClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/HeadlineClassificationService.java index be9aaaf..38e0f92 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/HeadlineClassificationService.java +++ 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/HeadlineClassificationService.java @@ -73,13 +73,18 @@ public class HeadlineClassificationService { public static PageBlockType headlineClassByFontSize(TextPageBlock textBlock, List fontSizeGroups) { - PageBlockType headlineType = PageBlockType.H1; - for (int i = 1; i <= fontSizeGroups.size(); i++) { - if (textBlock.getMostPopularWordFontSize() == fontSizeGroups.get(i - 1)) { - headlineType = PageBlockType.getHeadlineType(i); + List distances = fontSizeGroups.stream() + .map(fontSize -> Math.abs(fontSize - textBlock.getMostPopularWordFontSize())) + .toList(); + double min = Double.MAX_VALUE; + int argMin = -1; + for (int i = 0; i < distances.size(); i++) { + if (distances.get(i) < min) { + min = distances.get(i); + argMin = i; } } - return headlineType; + return argMin < 0 ? PageBlockType.H1 : PageBlockType.getHeadlineType(argMin + 1); // headline levels are 1-based; H1 remains the fallback when no font-size group is available } } \ No newline at end of file diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/ListItemClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/ListItemClassificationService.java new file mode 100644 index 0000000..b2fc088 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/ListItemClassificationService.java @@ -0,0 +1,99 @@ +package com.knecon.fforesight.service.layoutparser.processor.services.classification; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.LinkedList; +import java.util.List; + +import org.springframework.stereotype.Service; + +import com.knecon.fforesight.service.layoutparser.processor.model.text.AbstractBlockOnPage; 
+import com.knecon.fforesight.service.layoutparser.processor.model.text.ListIdentifier; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; + +@Service +public class ListItemClassificationService { + + public static final int LIST_IDENTIFIER_SEARCH_RADIUS = 3; + + + public List findConfirmedListIdentifiers(int currentIndex, List allBlocks) { + + List listIdentifiers = extractListIdentifiers(allBlocks.get(currentIndex)); + if (listIdentifiers.isEmpty()) { + return Collections.emptyList(); + } + if (listIdentifiers.size() > 1 && ListIdentifier.isInOrder(listIdentifiers)) { + return listIdentifiers; + } + + int start = Math.max(0, currentIndex - LIST_IDENTIFIER_SEARCH_RADIUS); + int end = Math.min(allBlocks.size(), currentIndex + LIST_IDENTIFIER_SEARCH_RADIUS); + + List identifiersBehind = new ArrayList<>(); + if (start < currentIndex) { + identifiersBehind.addAll(allBlocks.subList(start, currentIndex) + .stream() + .map(this::extractListIdentifiers) + .flatMap(Collection::stream) + .toList()); + } + if (!identifiersBehind.isEmpty()) { + listIdentifiers.add(0, identifiersBehind.get(identifiersBehind.size() - 1)); + if (ListIdentifier.isInOrder(listIdentifiers)) { + return listIdentifiers; + } + listIdentifiers.remove(0); + } + List identifiersAhead = new ArrayList<>(); + if (currentIndex + 1 < end) { + identifiersAhead.addAll(allBlocks.subList(currentIndex + 1, end) + .stream() + .map(this::extractListIdentifiers) + .flatMap(Collection::stream) + .toList()); + } + if (!identifiersAhead.isEmpty()) { + listIdentifiers.add(identifiersAhead.get(0)); + if (ListIdentifier.isInOrder(listIdentifiers)) { + return listIdentifiers; + } + listIdentifiers.remove(listIdentifiers.size() - 1); + } + return Collections.emptyList(); + + } + + + private List extractListIdentifiers(AbstractBlockOnPage block) { + + List result = new LinkedList<>(); + if 
(block.block() instanceof TextPageBlock textBlock) { + List sequences = textBlock.getSequences(); + for (int i = 0; i < sequences.size(); i++) { + + if (i != 0 && sequences.get(i - 1).getXDirAdj() < sequences.get(i).getXDirAdj()) { + // is not the start of a line, continue + continue; + } + + TextPositionSequence sequence = sequences.get(i); + List wordsAtStartOfLine = new ArrayList<>(3); + int end = Math.min(sequences.size(), i + 3); + for (int j = i; j < end; j++) { + if (sequences.get(j).intersectsYDirAdj(sequence, 2)) { + wordsAtStartOfLine.add(sequences.get(j)); + } else { + break; + } + } + + ListIdentifier.parse(wordsAtStartOfLine, block.page().getPageNumber()).ifPresent(result::add); + } + } + return result; + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/RedactManagerClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/RedactManagerClassificationService.java index 3e066d5..63d3beb 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/RedactManagerClassificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/RedactManagerClassificationService.java @@ -86,7 +86,9 @@ public class RedactManagerClassificationService { && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks().size() == 1)) { if (!Pattern.matches("[0-9]+", textBlock.toString())) { - textBlock.setClassification(PageBlockType.TITLE); + PageBlockType headlineType = HeadlineClassificationService.headlineClassByFontSize(textBlock, headlineFontSizes); + headlineClassificationService.classifyHeadline(textBlock, 
headlineType); + document.setHeadlines(true); } } else if (textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular() && PositionUtils.getApproxLineCount(textBlock) < 4.9 diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/TableOfContentsClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/TableOfContentsClassificationService.java index 854d087..0e955e7 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/TableOfContentsClassificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/TableOfContentsClassificationService.java @@ -1,6 +1,7 @@ package com.knecon.fforesight.service.layoutparser.processor.services.classification; -import static com.knecon.fforesight.service.layoutparser.processor.services.classification.DocuMineClassificationService.AMOUNT_PATTERN; +import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.AMOUNT_PATTERN; +import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.NUMERIC; import java.util.ArrayList; import java.util.Collection; @@ -13,7 +14,6 @@ import java.util.List; import java.util.Locale; import java.util.Map; import java.util.Set; -import java.util.regex.Pattern; import java.util.stream.Collectors; import org.springframework.stereotype.Service; @@ -42,8 +42,6 @@ public class TableOfContentsClassificationService { public static final int INTERSECTION_TOLERANCE = 2; // equality threshold for x intersection public static final int DENSITY_THRESHOLD_COUNT = 10; // 
describes the minimum density, at least this many entries per page height are required - private static final Pattern NUMERIC = Pattern.compile("[0-9]+"); - @SuppressWarnings("checkstyle:ModifiedControlVariable") public void classifyTableOfContents(ClassificationDocument document) { @@ -57,11 +55,13 @@ public class TableOfContentsClassificationService { continue; } - int offset = identifyTOCItems(i + 1, textBlocks, document); + int end = identifyTOCItems(i + 1, textBlocks, document); - if (offset > 1) { - textBlock.textBlock().setClassification(PageBlockType.H1); - i += offset; + if (end > i + 1) { + if (textBlock.textBlock().getClassification() == null) { + textBlock.textBlock().setClassification(PageBlockType.H1); + } + i = end; } } } @@ -237,7 +237,7 @@ public class TableOfContentsClassificationService { || textBlock.getClassification().equals(PageBlockType.FOOTER))) { continue; } - blocks.add(new TextBlockOnPage(page, textBlock)); + blocks.add(new TextBlockOnPage(textBlock, page)); } } } @@ -352,7 +352,7 @@ public class TableOfContentsClassificationService { return false; } - int prev = getNumberAsInt(numbers, i); + int prev = getNumberAsInt(numbers, i - 1); int curr = getNumberAsInt(numbers, i); int next = getNumberAsInt(numbers, i + 1); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFLinesTextStripper.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFLinesTextStripper.java index e39b666..c125acd 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFLinesTextStripper.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFLinesTextStripper.java @@ -5,6 +5,7 @@ import 
java.awt.geom.Point2D; import java.io.IOException; import java.util.ArrayList; import java.util.List; +import java.util.Set; import org.apache.pdfbox.contentstream.operator.Operator; import org.apache.pdfbox.contentstream.operator.OperatorName; @@ -50,6 +51,7 @@ import lombok.extern.slf4j.Slf4j; @Slf4j public class PDFLinesTextStripper extends PDFTextStripper { + private final static Set DOT_LIKE_CHARACTERS = Set.of(".", "·", "•", "․", "‧", "∙", "⋅", "・", ".", "・", "…", "⸱", "﹒", "ꞏ"); private final List textPositionSequences = new ArrayList<>(); private final List rulings = new ArrayList<>(); private final List graphicsPath = new ArrayList<>(); @@ -336,20 +338,32 @@ public class PDFLinesTextStripper extends PDFTextStripper { private boolean isWordFollowedByDottedLine(List textPositions, int i, int startIndex) { return i - startIndex >= 4 // - && textPositions.get(i).getUnicode().equals(".") // - && textPositions.get(i - 1).getUnicode().equals(".") // - && textPositions.get(i - 2).getUnicode().equals(".") // - && !textPositions.get(i - 3).getUnicode().equals("."); + && isDot(textPositions, i) // + && isDot(textPositions, i - 1) // + && isDot(textPositions, i - 2) // + && alphanumeric(textPositions, i - 3); } private static boolean isDottedLineFollowedByWord(List textPositions, int i, int startIndex) { return i - startIndex >= 4 // - && !textPositions.get(i).getUnicode().equals(".") // - && textPositions.get(i - 1).getUnicode().equals(".") // - && textPositions.get(i - 2).getUnicode().equals(".") // - && textPositions.get(i - 3).getUnicode().equals("."); + && alphanumeric(textPositions, i) // + && isDot(textPositions, i - 1) // + && isDot(textPositions, i - 2) // + && isDot(textPositions, i - 3); + } + + + private static boolean isDot(List textPositions, int i) { + + return DOT_LIKE_CHARACTERS.contains(textPositions.get(i).getUnicode()); + } + + + private static boolean alphanumeric(List textPositions, int i) { + + return 
Character.isAlphabetic(textPositions.get(i).getUnicode().charAt(0)) || Character.isDigit(textPositions.get(i).getUnicode().charAt(0)); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutDebugLayer.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutDebugLayer.java index 9428e2d..abce8af 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutDebugLayer.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutDebugLayer.java @@ -26,6 +26,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.textbloc import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject; import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; +import com.knecon.fforesight.service.layoutparser.processor.model.text.ListIdentifier; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms; import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils; @@ -80,7 +81,7 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig { visualizationsOnPage.getColoredLines() .addAll(cleanRulings.buildAll() .stream() - .map(ruling -> new ColoredLine(ruling, decideOnRulingColor(ruling), 0.5f)) + .map(ruling -> new ColoredLine(ruling, decideOnRulingColor(ruling), LINE_WIDTH)) .toList()); } @@ -93,7 +94,7 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig { VisualizationsOnPage visualizationsOnPage = 
getOrCreateVisualizationsOnPage(pageNumber, this.rulings); visualizationsOnPage.getColoredLines() .addAll(rulings.stream() - .map(ruling -> new ColoredLine(ruling, decideOnRulingColor(ruling), 0.5f)) + .map(ruling -> new ColoredLine(ruling, decideOnRulingColor(ruling), LINE_WIDTH)) .toList()); } @@ -182,7 +183,7 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig { visualizationsOnPage.getColoredRectangles() .addAll(lines.stream() .map(BoundingBox::getBBoxPdf) - .map(line -> new ColoredRectangle(line, LINES_COLOR, 0.5f)) + .map(line -> new ColoredRectangle(line, LINES_COLOR, LINE_WIDTH)) .toList()); } @@ -198,7 +199,7 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig { .map(line -> line.stream() .map(BoundingBox::getBBoxPdf) .collect(RectangleTransformations.collectBBox())) - .map(line -> new ColoredRectangle(line, LINES_COLOR, 0.5f)) + .map(line -> new ColoredRectangle(line, LINES_COLOR, LINE_WIDTH)) .toList()); } @@ -300,12 +301,12 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig { visualizationsOnPage.getColoredRectangles() .addAll(numbers.stream() .map(BoundingBox::getBBoxPdf) - .map(line -> new ColoredRectangle(line, LINES_COLOR, 0.5f)) + .map(line -> new ColoredRectangle(line, LINES_COLOR, LINE_WIDTH)) .toList()); visualizationsOnPage.getColoredRectangles() .add(new ColoredRectangle(numbers.stream() .map(BoundingBox::getBBoxPdf) - .collect(RectangleTransformations.collectBBox()), LINES_COLOR, 0.5f)); + .collect(RectangleTransformations.collectBBox()), LINES_COLOR, LINE_WIDTH)); } @@ -351,4 +352,13 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig { visualizationsOnPage.getPlacedTexts().add(PlacedText.textFacingUp(outlineObject.getTitle(), textPoint, 10, outlineObject.isFound() ? 
Color.BLACK : Color.RED, FONT)); } + + public void addListIdentifiers(List listIdentifiers) { + + for (ListIdentifier listIdentifier : listIdentifiers) { + getOrCreateVisualizationsOnPage(listIdentifier.getPage(), this.listIdentifiers).getColoredRectangles() + .add(new ColoredRectangle(listIdentifier.getWord().getBBoxPdf(), WORDS_COLOR, LINE_WIDTH)); + } + } + } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java index 4d9dd72..5364a09 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java @@ -37,7 +37,7 @@ public class LayoutparserEnd2EndTest extends AbstractTest { @Disabled public void testLayoutParserEndToEnd() { - String filePath = "/home/kschuettler/Dokumente/TestFiles/NER Dataset/Syngenta prod/77c680315c31d403d2e023be023b2087.PREVIEW.pdf"; + String filePath = "/home/kschuettler/Dokumente/LayoutparsingEvaluation/VV-340050.pdf"; runForFile(filePath); } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/OutlineDetectionTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/OutlineDetectionTest.java index c548a81..374e8f5 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/OutlineDetectionTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/OutlineDetectionTest.java @@ -79,7 +79,7 @@ 
public class OutlineDetectionTest extends AbstractTest { var documentFile = new ClassPathResource(fileName).getFile(); long start = System.currentTimeMillis(); - ClassificationDocument classificationDocument = parseLayout(fileName, LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH); + ClassificationDocument classificationDocument = parseLayout(fileName, LayoutParsingType.DOCUMINE_OLD); Document document = buildGraph(fileName, classificationDocument); layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true); OutlineObjectTree outlineObjectTree = classificationDocument.getOutlineObjectTree(); @@ -102,7 +102,7 @@ public class OutlineDetectionTest extends AbstractTest { TableOfContents tableOfContents = classificationDocument.getTableOfContents(); - assertEquals(tableOfContents.getMainSections().size(), 10); + assertEquals(tableOfContents.getMainSections().size(), 9); assertEquals(tableOfContents.getMainSections().subList(1, 9) .stream() .map(tableOfContentItem -> sanitizeString(tableOfContentItem.getHeadline().toString())) @@ -135,7 +135,7 @@ public class OutlineDetectionTest extends AbstractTest { List childrenOfTypeSectionOrSuperSection = document.getChildrenOfTypeSectionOrSuperSection(); - assertEquals(childrenOfTypeSectionOrSuperSection.size(), 10); + assertEquals(childrenOfTypeSectionOrSuperSection.size(), 9); assertEquals(childrenOfTypeSectionOrSuperSection.subList(1, 9) .stream() .map(section -> sanitizeString(section.getHeadline().getLeafTextBlock().toString())) diff --git a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/LayerIdentifier.java b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/LayerIdentifier.java index 33d52c2..331542d 100644 --- a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/LayerIdentifier.java +++ 
b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/LayerIdentifier.java @@ -57,6 +57,7 @@ public record LayerIdentifier(String name, String markedContentName) { public static final LayerIdentifier OUTLINE_OBJECTS = new LayerIdentifier("Outline Positions", "OUTLINE_OBJECTS"); public static final LayerIdentifier SENTENCES = new LayerIdentifier("Sentences", "SENTENCES"); public static final LayerIdentifier TOC_PAGES = new LayerIdentifier("TOC pages", "TOC_PAGES"); + public static final LayerIdentifier LIST_IDENTIFIERS = new LayerIdentifier("List identifiers", "LIST_IDENTIFIERS"); // Visual layout parser public static final LayerIdentifier KNECON_VISUAL_PARSING = new LayerIdentifier("Visual Layout Parser", "VISUAL_PARSING"); diff --git a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/layers/LayoutDebugLayerConfig.java b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/layers/LayoutDebugLayerConfig.java index 1fb35be..946f2ab 100644 --- a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/layers/LayoutDebugLayerConfig.java +++ b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/layers/LayoutDebugLayerConfig.java @@ -18,6 +18,8 @@ public class LayoutDebugLayerConfig extends AbstractLayerGroup { protected static final Standard14EmbeddableFont FONT = Standard14EmbeddableFont.helvetica(); + public static final float LINE_WIDTH = 0.5f; + protected static final Color WORDS_COLOR = new Color(68, 84, 147); protected static final Color LINES_COLOR = new Color(152, 45, 179); protected static final Color ZONES_COLOR = new Color(131, 38, 38); @@ -57,6 +59,7 @@ public class LayoutDebugLayerConfig extends AbstractLayerGroup { protected final Visualizations outlineObjects = Visualizations.builder().layer(LayerIdentifier.OUTLINE_OBJECTS).build(); protected 
final Visualizations sentences = Visualizations.builder().layer(LayerIdentifier.SENTENCES).build(); protected final Visualizations tocPages = Visualizations.builder().layer(LayerIdentifier.TOC_PAGES).build(); + protected final Visualizations listIdentifiers = Visualizations.builder().layer(LayerIdentifier.LIST_IDENTIFIERS).build(); public List getVisualizations() { @@ -73,7 +76,10 @@ public class LayoutDebugLayerConfig extends AbstractLayerGroup { mainBody, // markedContent, // outlineObjects, // - tocPages); + tocPages, // + listIdentifiers // + ); } + }