From 5e091402c7c6f1850e95547876e5f20f6a1ed615 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kilian=20Sch=C3=BCttler?= Date: Thu, 24 Oct 2024 10:04:59 +0200 Subject: [PATCH] RED-10204: backport of NPE hotfix and rename TextPositionSequence to Word --- .../processor/LayoutParsingPipeline.java | 23 +++--- .../docstrum/DocstrumSegmentationService.java | 8 +- .../processor/docstrum/model/Line.java | 18 ++--- .../model/FloatFrequencyCounter.java | 2 +- .../processor/model/LineInformation.java | 6 +- .../processor/model/PageContents.java | 4 +- .../processor/model/table/Cell.java | 6 +- .../processor/model/text/ListIdentifier.java | 8 +- .../processor/model/text/SearchableText.java | 14 ++-- .../processor/model/text/TextPageBlock.java | 73 ++++++++++--------- .../text/TextPositionSequenceComparator.java | 8 +- .../{TextPositionSequence.java => Word.java} | 18 ++--- .../DividingColumnDetectionService.java | 4 +- .../services/GapDetectionService.java | 30 ++++---- .../InvisibleTableDetectionService.java | 8 +- .../services/LineDetectionService.java | 60 +++++++-------- .../services/PageContentExtractor.java | 6 +- .../services/PageInformationService.java | 4 +- .../services/TextRulingsClassifier.java | 10 +-- .../BlockificationPostprocessingService.java | 48 ++++++------ .../DocstrumBlockificationService.java | 22 +++--- .../DocuMineBlockificationService.java | 32 ++++---- .../RedactManagerBlockificationService.java | 10 +-- .../ClarifyndClassificationService.java | 4 +- .../DocuMineClassificationService.java | 11 ++- .../ListItemClassificationService.java | 8 +- .../RedactManagerClassificationService.java | 5 +- .../TableOfContentsClassificationService.java | 56 +++++++------- .../factory/DocumentGraphFactory.java | 18 ++--- .../SearchTextWithTextPositionFactory.java | 8 +- .../services/factory/TableNodeFactory.java | 6 +- .../services/factory/TextBlockFactory.java | 6 +- .../graphics/GraphicExtractorService.java | 10 +-- .../parsing/PDFLinesTextStripper.java | 30 ++++---- .../processor/utils/MarkedContentUtils.java | 6 +- .../utils/TextPositionOperations.java | 44 +++++------ .../visualization/LayoutDebugLayer.java | 10 +-- .../HeadlinesGoldStandardIntegrationTest.java | 34 +++++---- .../graph/DocumentReadingOrderTest.java | 4 +- .../PdfSegmentationServiceTest.java | 8 +- .../GapAcrossLinesDetectionServiceTest.java | 2 +- .../InvisibleTableDetectionServiceTest.java | 10 +-- .../services/PageContentExtractorTest.java | 6 +- .../services/RulingsClassifierTest.java | 14 ++-- .../server/utils/BuildDocumentTest.java | 18 ----- .../layers/LayoutDebugLayerConfig.java | 4 +- .../viewerdoc/layers/OcrDebugLayerConfig.java | 9 +-- 47 files changed, 374 insertions(+), 379 deletions(-) rename layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/{TextPositionSequence.java => Word.java} (90%) diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java index 872884b..5377398 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java @@ -29,7 +29,6 @@ import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsi import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection; import com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationService; -import com.knecon.fforesight.service.layoutparser.processor.services.classification.TableOfContentsClassificationService; import com.knecon.fforesight.service.layoutparser.processor.services.mapper.MarkdownMapper; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument; @@ -45,7 +44,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableO import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; -import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.model.text.Word; import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.CvTableParsingAdapter; import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.ImageServiceResponseAdapter; import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.VisualLayoutParsingAdapter; @@ -53,7 +52,6 @@ import com.knecon.fforesight.service.layoutparser.processor.python_api.model.ima import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableCells; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse; -import com.knecon.fforesight.service.layoutparser.processor.services.BodyTextFrameService; import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService; import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService; import com.knecon.fforesight.service.layoutparser.processor.services.SimplifiedSectionTextService; @@ -63,9 +61,6 @@ import com.knecon.fforesight.service.layoutparser.processor.services.blockificat import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocstrumBlockificationService; import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocuMineBlockificationService; import com.knecon.fforesight.service.layoutparser.processor.services.blockification.RedactManagerBlockificationService; -import com.knecon.fforesight.service.layoutparser.processor.services.classification.ClarifyndClassificationService; -import com.knecon.fforesight.service.layoutparser.processor.services.classification.DocuMineClassificationService; -import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService; import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory; import com.knecon.fforesight.service.layoutparser.processor.services.graphics.Box; import com.knecon.fforesight.service.layoutparser.processor.services.graphics.GraphicExtractorService; @@ -275,7 +270,7 @@ public class LayoutParsingPipeline { stripper.setEndPage(pageNumber); stripper.setPdpage(pdPage); stripper.getText(originDocument); - List words = stripper.getTextPositionSequences(); + List words = stripper.getWords(); // rotateDirAdjExactly(words, pdPage); // works really well for many highly rotated documents (e.g. VV-331340.pdf), but it decreases the headline performance by 1.3%, so I am leaving it out for now @@ -301,7 +296,7 @@ public class LayoutParsingPipeline { TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(words, cleanRulings); - List graphics = graphicExtractorService.extractPathElementGraphics(originDocument, pdPage, pageNumber, cleanRulings, stripper.getTextPositionSequences(), false); + List graphics = graphicExtractorService.extractPathElementGraphics(originDocument, pdPage, pageNumber, cleanRulings, stripper.getWords(), false); pdfImages.computeIfAbsent(pageNumber, x -> new ArrayList<>()) .addAll(graphics.stream() @@ -314,7 +309,7 @@ public class LayoutParsingPipeline { ClassificationPage classificationPage = switch (layoutParsingType) { case REDACT_MANAGER_OLD -> - redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings, classificationDocument.getLayoutDebugLayer()); + redactManagerBlockificationService.blockify(stripper.getWords(), cleanRulings, classificationDocument.getLayoutDebugLayer()); case DOCUMINE_OLD -> docuMineBlockificationService.blockify(words, cleanRulings); case DOCUMINE, REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH -> docstrumBlockificationService.blockify(words, cleanRulings, true, classificationDocument.getLayoutDebugLayer(), layoutParsingType); @@ -388,12 +383,12 @@ public class LayoutParsingPipeline { } - private static void rotateDirAdjExactly(List words, PDPage pdPage) { + private static void rotateDirAdjExactly(List words, PDPage pdPage) { for (TextDirection dir : TextDirection.values()) { double averageRotation = words.stream() - .map(TextPositionSequence::getTextPositions) + .map(Word::getTextPositions) .flatMap(Collection::stream) .filter(pos -> pos.getDir().equals(dir)) .mapToDouble(RedTextPosition::getExactDir).average().orElse(0); @@ -404,7 +399,7 @@ public class LayoutParsingPipeline { AffineTransform rotateInstance = AffineTransform.getRotateInstance(averageRotation, pdPage.getMediaBox().getWidth() / 2, pdPage.getMediaBox().getHeight() / 2); - for (TextPositionSequence word : words) { + for (Word word : words) { if (!dir.equals(word.getDir())) { continue; } @@ -455,10 +450,10 @@ public class LayoutParsingPipeline { // Collect all statistics for the classificationPage, except from blocks inside tables, as tables will always be added to BodyTextFrame. for (AbstractPageBlock textBlock : classificationPage.getTextBlocks()) { if (textBlock instanceof TextPageBlock) { - if (((TextPageBlock) textBlock).getSequences() == null) { + if (((TextPageBlock) textBlock).getWords() == null) { continue; } - for (TextPositionSequence word : ((TextPageBlock) textBlock).getSequences()) { + for (Word word : ((TextPageBlock) textBlock).getWords()) { classificationPage.getTextHeightCounter().add(word.getTextHeight()); classificationPage.getFontCounter().add(word.getFont()); classificationPage.getFontSizeCounter().add(word.getFontSize()); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/DocstrumSegmentationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/DocstrumSegmentationService.java index 6f6024c..8f3c9f6 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/DocstrumSegmentationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/DocstrumSegmentationService.java @@ -18,7 +18,7 @@ import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.Zon import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection; -import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.model.text.Word; import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutDebugLayer; import lombok.RequiredArgsConstructor; @@ -35,7 +35,7 @@ public class DocstrumSegmentationService { private final ReadingOrderService readingOrderService; - public List segmentPage(List textPositions, boolean xyOrder, CleanRulings usedRulings, LayoutDebugLayer visualizations) { + public List segmentPage(List textPositions, boolean xyOrder, CleanRulings usedRulings, LayoutDebugLayer visualizations) { EnumMap directionCounts = new EnumMap<>(TextDirection.class); @@ -78,11 +78,11 @@ public class DocstrumSegmentationService { } - private List computeZones(List textPositions, CleanRulings rulings, LayoutDebugLayer visualizations, TextDirection direction) { + private List computeZones(List textPositions, CleanRulings rulings, LayoutDebugLayer visualizations, TextDirection direction) { List positions = textPositions.stream() .filter(t -> t.getDir() == direction) - .map(TextPositionSequence::getTextPositions) + .map(Word::getTextPositions) .flatMap(List::stream) .toList(); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/Line.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/Line.java index d9d779b..eb999c7 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/Line.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/Line.java @@ -1,9 +1,9 @@ package com.knecon.fforesight.service.layoutparser.processor.docstrum.model; -import static com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence.BOLD; -import static com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence.BOLD_ITALIC; -import static com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence.ITALIC; -import static com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence.STANDARD; +import static com.knecon.fforesight.service.layoutparser.processor.model.text.Word.BOLD; +import static com.knecon.fforesight.service.layoutparser.processor.model.text.Word.BOLD_ITALIC; +import static com.knecon.fforesight.service.layoutparser.processor.model.text.Word.ITALIC; +import static com.knecon.fforesight.service.layoutparser.processor.model.text.Word.STANDARD; import java.util.ArrayList; import java.util.Arrays; @@ -14,7 +14,7 @@ import java.util.Map; import java.util.concurrent.atomic.AtomicInteger; import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.FastAtan2; -import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.model.text.Word; import com.knecon.fforesight.service.layoutparser.processor.model.text.FontStyle; import lombok.Data; @@ -41,7 +41,7 @@ public class Line extends TextBoundingBox { private FontStyle fontStyle; private final List characters; - private final List words = new ArrayList<>(); + private final List words = new ArrayList<>(); public Line(List characters, double wordSpacing) { @@ -89,7 +89,7 @@ public class Line extends TextBoundingBox { for (FontStyle fontStyle : FontStyle.values()) { fontStyleCounter.put(fontStyle, new AtomicInteger(0)); } - for (TextPositionSequence word : words) { + for (Word word : words) { switch (word.getFontStyle()) { case STANDARD -> fontStyleCounter.get(FontStyle.REGULAR).getAndIncrement(); case BOLD -> fontStyleCounter.get(FontStyle.BOLD).getAndIncrement(); @@ -159,14 +159,14 @@ public class Line extends TextBoundingBox { private void computeWords(double wordSpacing) { - TextPositionSequence word = new TextPositionSequence(); + Word word = new Word(); Character previous = null; for (Character current : characters) { if (previous != null) { double dist = current.getTextPosition().getXDirAdj() - previous.getTextPosition().getXDirAdj() - previous.getTextPosition().getWidthDirAdj(); if (dist > wordSpacing) { words.add(word); - word = new TextPositionSequence(); + word = new Word(); } } word.getTextPositions().add(current.getTextPosition()); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/FloatFrequencyCounter.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/FloatFrequencyCounter.java index 8dd491a..498ee1d 100755 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/FloatFrequencyCounter.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/FloatFrequencyCounter.java @@ -51,7 +51,7 @@ public class FloatFrequencyCounter { mostPopular = entry; } } - mostPopularCache = mostPopular != null ? mostPopular.getKey() : null; + mostPopularCache = mostPopular != null ? mostPopular.getKey() : 0; changed = false; } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/LineInformation.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/LineInformation.java index b09df49..600db39 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/LineInformation.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/LineInformation.java @@ -3,7 +3,7 @@ package com.knecon.fforesight.service.layoutparser.processor.model; import java.awt.geom.Rectangle2D; import java.util.List; -import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.model.text.Word; import lombok.AccessLevel; import lombok.AllArgsConstructor; @@ -16,8 +16,8 @@ import lombok.experimental.FieldDefaults; public class LineInformation { List lineBBox; - List> sequencesByLines; + List> sequencesByLines; List> bBoxWithGapsByLines; - List>> sequencesWithGapsByLines; + List>> sequencesWithGapsByLines; } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/PageContents.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/PageContents.java index f9d00ef..d4617ce 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/PageContents.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/PageContents.java @@ -4,7 +4,7 @@ import java.awt.geom.Rectangle2D; import java.util.List; import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; -import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.model.text.Word; import lombok.AllArgsConstructor; import lombok.Builder; @@ -15,7 +15,7 @@ import lombok.Getter; @AllArgsConstructor public class PageContents { - List sortedTextPositionSequences; + List sortedWords; Rectangle2D cropBox; Rectangle2D mediaBox; List rulings; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/Cell.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/Cell.java index fd1e4de..9faee3b 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/Cell.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/Cell.java @@ -9,7 +9,7 @@ import java.util.List; import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; -import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.model.text.Word; import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities; import lombok.Data; @@ -68,12 +68,12 @@ public class Cell extends BoundingBox { StringBuilder sb = new StringBuilder(); Iterator itty = textBlocks.iterator(); - TextPositionSequence previous = null; + Word previous = null; while (itty.hasNext()) { TextPageBlock textBlock = itty.next(); - for (TextPositionSequence word : textBlock.getSequences()) { + for (Word word : textBlock.getWords()) { if (previous != null) { if (Math.abs(previous.getMaxYDirAdj() - word.getMaxYDirAdj()) > word.getTextHeight()) { sb.append('\n'); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/ListIdentifier.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/ListIdentifier.java index 0d55e5a..e64a39e 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/ListIdentifier.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/ListIdentifier.java @@ -23,7 +23,7 @@ public class ListIdentifier { Format format; @Getter - TextPositionSequence word; + Word word; @Getter int page; int representation; @@ -31,14 +31,14 @@ public class ListIdentifier { public static Optional parse(TextPageBlock textPageBlock, int page) { - return parse(textPageBlock.getSequences().subList(0, Math.min(5, textPageBlock.getSequences().size())), page); + return parse(textPageBlock.getWords().subList(0, Math.min(5, textPageBlock.getWords().size())), page); } - public static Optional parse(List sequences, int page) { + public static Optional parse(List sequences, int page) { StringBuilder sb = new StringBuilder(); - for (TextPositionSequence sequence : sequences) { + for (Word sequence : sequences) { sb.append(sequence.toString()); sb.append(" "); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/SearchableText.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/SearchableText.java index e8f6a39..0f99829 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/SearchableText.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/SearchableText.java @@ -10,18 +10,18 @@ import lombok.Getter; @Getter public class SearchableText { - private final List sequences = new ArrayList<>(); + private final List sequences = new ArrayList<>(); - public void add(TextPositionSequence textPositionSequence) { + public void add(Word word) { - sequences.add(textPositionSequence); + sequences.add(word); } - public void addAll(List textPositionSequences) { + public void addAll(List words) { - sequences.addAll(textPositionSequences); + sequences.addAll(words); } @@ -32,10 +32,10 @@ public class SearchableText { } - public static String buildString(List sequences) { + public static String buildString(List sequences) { StringBuilder sb = new StringBuilder(); - for (TextPositionSequence word : sequences) { + for (Word word : sequences) { sb.append(word); sb.append(' '); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java index 9b7940b..8de8fea 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java @@ -25,7 +25,7 @@ import lombok.NoArgsConstructor; public class TextPageBlock extends AbstractPageBlock { @Builder.Default - private List sequences = new ArrayList<>(); + private List words = new ArrayList<>(); @Builder.Default private FrequencyCounters frequencyCounters = new FrequencyCounters(); @@ -41,43 +41,43 @@ public class TextPageBlock extends AbstractPageBlock { private boolean changed; - public TextPageBlock(List sequences) { + public TextPageBlock(List words) { - this.sequences = new ArrayList<>(sequences); + this.words = new ArrayList<>(words); this.frequencyCounters = new FrequencyCounters(); - if (!sequences.isEmpty()) { - addToFrequencyCounters(sequences); + if (!words.isEmpty()) { + addToFrequencyCounters(words); } calculateBBox(); } - public List getSequences() { + public List getWords() { - return Collections.unmodifiableList(sequences); + return Collections.unmodifiableList(words); } public TextDirection getDir() { - return sequences.get(0).getDir(); + return words.get(0).getDir(); } private void calculateBBox() { - if (sequences == null) { + if (words == null) { this.bBox = new Rectangle2D.Double(); this.bBoxPdf = new Rectangle2D.Double(); this.bBoxDirAdj = new Rectangle2D.Double(); return; } - this.bBoxDirAdj = sequences.stream() - .map(TextPositionSequence::getBBoxDirAdj) + this.bBoxDirAdj = words.stream() + .map(Word::getBBoxDirAdj) .collect(RectangleTransformations.collectBBox()); - setToBBoxOfComponents(sequences); + setToBBoxOfComponents(words); } @@ -99,8 +99,8 @@ public class TextPageBlock extends AbstractPageBlock { throw new IllegalArgumentException("Cannot merge textBlocks on different pages."); } - List sequences = textBlocksToMerge.stream() - .map(TextPageBlock::getSequences) + List sequences = textBlocksToMerge.stream() + .map(TextPageBlock::getWords) .flatMap(java.util.Collection::stream) .toList(); sequences = new ArrayList<>(sequences); @@ -109,9 +109,9 @@ public class TextPageBlock extends AbstractPageBlock { } - private void addToFrequencyCounters(List sequences) { + private void addToFrequencyCounters(List sequences) { - for (TextPositionSequence wordBlock : sequences) { + for (Word wordBlock : sequences) { frequencyCounters.getLineHeightFrequencyCounter().add(wordBlock.getTextHeight()); frequencyCounters.getFontSizeFrequencyCounter().add(wordBlock.getFontSize()); @@ -120,12 +120,12 @@ public class TextPageBlock extends AbstractPageBlock { frequencyCounters.getStyleFrequencyCounter().add(wordBlock.getFontStyle()); } - setUnderlined(this.sequences.stream() - .allMatch(TextPositionSequence::isUnderline)); + setUnderlined(this.words.stream() + .allMatch(Word::isUnderline)); } - public TextPageBlock union(TextPositionSequence r) { + public TextPageBlock union(Word r) { TextPageBlock union = this.copy(); union.add(r); @@ -138,8 +138,8 @@ public class TextPageBlock extends AbstractPageBlock { public TextPageBlock union(TextPageBlock r) { TextPageBlock union = this.copy(); - union.addAll(r.getSequences()); - addToFrequencyCounters(r.getSequences()); + union.addAll(r.getWords()); + addToFrequencyCounters(r.getWords()); calculateBBox(); return union; } @@ -148,33 +148,33 @@ public class TextPageBlock extends AbstractPageBlock { public void add(TextPageBlock textPageBlock) { changed = true; - sequences.addAll(textPageBlock.getSequences()); - addToFrequencyCounters(textPageBlock.getSequences()); + words.addAll(textPageBlock.getWords()); + addToFrequencyCounters(textPageBlock.getWords()); calculateBBox(); } - public void add(TextPositionSequence textPositionSequence) { + public void add(Word word) { changed = true; - sequences.add(textPositionSequence); - addToFrequencyCounters(List.of(textPositionSequence)); + words.add(word); + addToFrequencyCounters(List.of(word)); calculateBBox(); } - public void addAll(List textPositionSequences) { + public void addAll(List words) { changed = true; - sequences.addAll(textPositionSequences); - addToFrequencyCounters(textPositionSequences); + this.words.addAll(words); + addToFrequencyCounters(words); calculateBBox(); } public TextPageBlock copy() { - return new TextPageBlock(new ArrayList<>(sequences)); + return new TextPageBlock(new ArrayList<>(words)); } @@ -193,8 +193,8 @@ public class TextPageBlock extends AbstractPageBlock { StringBuilder sb = new StringBuilder(); - TextPositionSequence previous = null; - for (TextPositionSequence word : sequences) { + Word previous = null; + for (Word word : words) { if (previous != null) { if (Math.abs(previous.getMaxYDirAdj() - word.getMaxYDirAdj()) > word.getTextHeight()) { sb.append('\n'); @@ -217,8 +217,8 @@ public class TextPageBlock extends AbstractPageBlock { public int getNumberOfLines() { int numberOfLines = 1; - TextPositionSequence previous = null; - for (TextPositionSequence word : sequences) { + Word previous = null; + for (Word word : words) { if (previous != null) { if (word.getMaxYDirAdj() - previous.getMaxYDirAdj() > word.getTextHeight()) { numberOfLines++; @@ -262,14 +262,15 @@ public class TextPageBlock extends AbstractPageBlock { public double getHighestFontSize() { - return frequencyCounters.getFontSizeFrequencyCounter().getHighest(); + Double highest = frequencyCounters.getFontSizeFrequencyCounter().getHighest(); + return highest == null ? 0 : highest; } @Override public boolean isEmpty() { - return sequences.isEmpty(); + return words.isEmpty(); } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPositionSequenceComparator.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPositionSequenceComparator.java index 0565923..b5d328a 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPositionSequenceComparator.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPositionSequenceComparator.java @@ -3,19 +3,19 @@ package com.knecon.fforesight.service.layoutparser.processor.model.text; import java.util.Comparator; import java.util.HashMap; -public class TextPositionSequenceComparator implements Comparator { +public class TextPositionSequenceComparator implements Comparator { - private HashMap lookup; + private HashMap lookup; - public TextPositionSequenceComparator(HashMap lookup) { + public TextPositionSequenceComparator(HashMap lookup) { this.lookup = lookup; } @Override - public int compare(TextPositionSequence number1, TextPositionSequence number2) { + public int compare(Word number1, Word number2) { int page1 = lookup.get(number1).page().getPageNumber(); int page2 = lookup.get(number2).page().getPageNumber(); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPositionSequence.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/Word.java similarity index 90% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPositionSequence.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/Word.java index 9f60edf..3434d22 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPositionSequence.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/Word.java @@ -27,7 +27,7 @@ import lombok.extern.slf4j.Slf4j; @NoArgsConstructor @AllArgsConstructor @SuppressWarnings("pmd") -public class TextPositionSequence extends TextBoundingBox implements CharSequence { +public class Word extends TextBoundingBox implements CharSequence { public static final String STANDARD = "standard"; public static final String BOLD_ITALIC = "bold, italic"; @@ -47,7 +47,7 @@ public class TextPositionSequence extends TextBoundingBox implements CharSequenc private Integer hashcodeCache; - public TextPositionSequence(List textPositions, int pageNumber, boolean isParagraphStart) { + public Word(List textPositions, int pageNumber, boolean isParagraphStart) { this.textPositions = textPositions.stream() .map(RedTextPosition::fromTextPosition) @@ -65,7 +65,7 @@ public class TextPositionSequence extends TextBoundingBox implements CharSequenc } - public TextPositionSequence(List textPositions, int page) { + public Word(List textPositions, int page) { this.textPositions = textPositions; this.page = page; @@ -98,9 +98,9 @@ public class TextPositionSequence extends TextBoundingBox implements CharSequenc @Override - public TextPositionSequence subSequence(int start, int end) { + public Word subSequence(int start, int end) { - var textPositionSequence = new TextPositionSequence(); + var textPositionSequence = new Word(); textPositionSequence.textPositions = textPositions.subList(start, end); textPositionSequence.page = page; textPositionSequence.dir = dir; @@ -126,10 +126,10 @@ public class TextPositionSequence extends TextBoundingBox implements CharSequenc } - public void add(TextPositionSequence textPositionSequence, RedTextPosition textPosition) { + public void add(Word word, RedTextPosition textPosition) { this.textPositions.add(textPosition); - this.page = textPositionSequence.getPage(); + this.page = word.getPage(); calculateBBoxAndHashcode(); } @@ -199,7 +199,7 @@ public class TextPositionSequence extends TextBoundingBox implements CharSequenc if (o == this) { return true; } - if (!(o instanceof TextPositionSequence other)) { + if (!(o instanceof Word other)) { return false; } if (!other.canEqual((Object) this)) { @@ -220,7 +220,7 @@ public class TextPositionSequence extends TextBoundingBox implements CharSequenc } - protected boolean canEqual(final Object other) {return other instanceof TextPositionSequence;} + protected boolean canEqual(final Object other) {return other instanceof Word;} public int hashCode() { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/DividingColumnDetectionService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/DividingColumnDetectionService.java index 7c062bc..f1eeeb1 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/DividingColumnDetectionService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/DividingColumnDetectionService.java @@ -23,11 +23,11 @@ public class DividingColumnDetectionService { public List detectColumns(PageContents pageContents) { - if (pageContents.getSortedTextPositionSequences().size() < 2) { + if (pageContents.getSortedWords().size() < 2) { return List.of(pageContents.getCropBox()); } - GapInformation linesWithGapInformation = GapDetectionService.findGapsInLines(pageContents.getSortedTextPositionSequences(), pageContents.getCropBox()); + GapInformation linesWithGapInformation = GapDetectionService.findGapsInLines(pageContents.getSortedWords(), pageContents.getCropBox()); return detectColumnsFromLines(linesWithGapInformation.getXGaps(), pageContents.getCropBox()); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/GapDetectionService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/GapDetectionService.java index 54fd973..5d8cab8 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/GapDetectionService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/GapDetectionService.java @@ -5,7 +5,7 @@ import java.util.LinkedList; import java.util.List; import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation; -import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.model.text.Word; import lombok.AllArgsConstructor; import lombok.experimental.UtilityClass; @@ -18,23 +18,23 @@ public class GapDetectionService { private static final double NEW_LINE_FACTOR = 0.2; - public static GapInformation findGapsInLines(List sortedTextPositionSequences, Rectangle2D mainBodyTextFrame) { + public static GapInformation findGapsInLines(List sortedWords, Rectangle2D mainBodyTextFrame) { - if (sortedTextPositionSequences.isEmpty()) { + if (sortedWords.isEmpty()) { return new GapInformation(); } - final double avgTextPositionHeight = getAvgTextPositionHeight(sortedTextPositionSequences); + final double avgTextPositionHeight = getAvgTextPositionHeight(sortedWords); XGapsContext xGapContext = XGapsContext.init(mainBodyTextFrame); YGapsContext yGapContext = YGapsContext.init(mainBodyTextFrame); - var previousTextPosition = sortedTextPositionSequences.get(0); + var previousTextPosition = sortedWords.get(0); Rectangle2D rectangle = toRectangle2D(previousTextPosition); xGapContext.addGapFromLeftEdgeOfMainBody(rectangle); - for (TextPositionSequence currentTextPosition : sortedTextPositionSequences.subList(1, sortedTextPositionSequences.size())) { + for (Word currentTextPosition : sortedWords.subList(1, sortedWords.size())) { double yDifference = Math.abs(currentTextPosition.getMaxYDirAdj() - previousTextPosition.getMaxYDirAdj()); double xGap = Math.abs(previousTextPosition.getMaxXDirAdj() - currentTextPosition.getXDirAdj()); @@ -59,14 +59,14 @@ public class GapDetectionService { } previousTextPosition = currentTextPosition; } - xGapContext.addGapToRightEdgeOfMainBody(toRectangle2D(sortedTextPositionSequences.get(sortedTextPositionSequences.size() - 1))); + xGapContext.addGapToRightEdgeOfMainBody(toRectangle2D(sortedWords.get(sortedWords.size() - 1))); xGapContext.gapsPerLine.add(xGapContext.gapsInCurrentLine); return new GapInformation(xGapContext.gapsPerLine, yGapContext.gapsPerLine); } - private static Rectangle2D toRectangle2D(TextPositionSequence textPosition) { + private static Rectangle2D toRectangle2D(Word textPosition) { return mirrorY(textPosition.getBBox()); } @@ -87,18 +87,18 @@ public class GapDetectionService { } - private static void assertAllTextPositionsHaveSameDir(List textPositionSequences) { + private static void assertAllTextPositionsHaveSameDir(List words) { - assert textPositionSequences.stream() - .map(TextPositionSequence::getDir) - .allMatch(a -> a.equals(textPositionSequences.get(0).getDir())); + assert words.stream() + .map(Word::getDir) + .allMatch(a -> a.equals(words.get(0).getDir())); } - private static double getAvgTextPositionHeight(List textPositionSequences) { + private static double getAvgTextPositionHeight(List words) { - return textPositionSequences.stream() - .mapToDouble(TextPositionSequence::getHeight).average().orElseThrow(); + return words.stream() + .mapToDouble(Word::getHeight).average().orElseThrow(); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/InvisibleTableDetectionService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/InvisibleTableDetectionService.java index 809fd80..b9ea1c4 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/InvisibleTableDetectionService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/InvisibleTableDetectionService.java @@ -7,17 +7,17 @@ import java.util.List; import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation; import com.knecon.fforesight.service.layoutparser.processor.model.LineInformation; -import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.model.text.Word; import lombok.experimental.UtilityClass; @UtilityClass public class InvisibleTableDetectionService { - public List> detectTable(List textPositionSequences, Rectangle2D tableBBox) { + public List> detectTable(List words, Rectangle2D tableBBox) { - LineInformation lineInformation = LineDetectionService.calculateLineInformation(textPositionSequences); - GapInformation gaps = GapDetectionService.findGapsInLines(textPositionSequences, tableBBox); + LineInformation lineInformation = LineDetectionService.calculateLineInformation(words); + GapInformation gaps = GapDetectionService.findGapsInLines(words, tableBBox); List gapsAcrossLines = GapsAcrossLinesService.detectXGapsAcrossLines(gaps, tableBBox); List columnXCoords = gapsAcrossLines.stream().map(RectangularShape::getCenterX).toList(); int colCount = gapsAcrossLines.size(); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/LineDetectionService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/LineDetectionService.java index 6ad8d9f..62a4a6b 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/LineDetectionService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/LineDetectionService.java @@ -7,7 +7,7 @@ import java.util.List; import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation; import com.knecon.fforesight.service.layoutparser.processor.model.LineInformation; import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; -import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.model.text.Word; import lombok.AllArgsConstructor; import lombok.Getter; @@ -19,37 +19,37 @@ public class LineDetectionService { private static final double X_GAP_FACTOR = 1; // multiplied with average text height, determines the minimum distance of gaps in lines - public LineInformation calculateLineInformation(List sortedTextPositionSequences) { + public LineInformation calculateLineInformation(List sortedWords) { - if (sortedTextPositionSequences.isEmpty()) { + if (sortedWords.isEmpty()) { return LineFactory.init().build(); } - return buildLineInformation(sortedTextPositionSequences); + return buildLineInformation(sortedWords); } - public List> findLinesWithGaps(List sortedTextPositionSequences, Rectangle2D mainBodyTextFrame) { + public List> findLinesWithGaps(List sortedWords, Rectangle2D mainBodyTextFrame) { - return calculateLineInformation(sortedTextPositionSequences).getBBoxWithGapsByLines(); + return calculateLineInformation(sortedWords).getBBoxWithGapsByLines(); } - public List> orderByLines(List sortedTextPositionSequences, Rectangle2D mainBodyTextFrame) { + public List> orderByLines(List sortedWords, Rectangle2D mainBodyTextFrame) { - return calculateLineInformation(sortedTextPositionSequences).getSequencesByLines(); + return calculateLineInformation(sortedWords).getSequencesByLines(); } - private static LineInformation buildLineInformation(List sortedTextPositionSequences) { + private static LineInformation buildLineInformation(List sortedWords) { - final double avgTextPositionHeight = getAvgTextPositionHeight(sortedTextPositionSequences); + final double avgTextPositionHeight = getAvgTextPositionHeight(sortedWords); LineFactory lineFactory = LineFactory.init(); - var previousTextPosition = sortedTextPositionSequences.get(0); + var previousTextPosition = sortedWords.get(0); lineFactory.addToCurrentLine(previousTextPosition); - for (TextPositionSequence currentTextPosition : sortedTextPositionSequences.subList(1, sortedTextPositionSequences.size())) { + for (Word currentTextPosition : sortedWords.subList(1, sortedWords.size())) { if (isNewLine(currentTextPosition, previousTextPosition, avgTextPositionHeight) || isSplitByOrientation(currentTextPosition, previousTextPosition)) { lineFactory.startNewLine(); } else if (isXGap(currentTextPosition, previousTextPosition, avgTextPositionHeight)) { @@ -63,25 +63,25 @@ public class LineDetectionService { } - private static double getAvgTextPositionHeight(List textPositionSequences) { + private static double getAvgTextPositionHeight(List words) { - return textPositionSequences.stream().mapToDouble(TextPositionSequence::getHeight).average().orElseThrow(); + return words.stream().mapToDouble(Word::getHeight).average().orElseThrow(); } - private static boolean isXGap(TextPositionSequence currentTextPosition, TextPositionSequence previousTextPosition, double avgTextPositionHeight) { + private static boolean isXGap(Word currentTextPosition, Word previousTextPosition, double avgTextPositionHeight) { return Math.abs(previousTextPosition.getMaxXDirAdj() - currentTextPosition.getXDirAdj()) > (avgTextPositionHeight * X_GAP_FACTOR); } - private static boolean isSplitByOrientation(TextPositionSequence currentTextPosition, TextPositionSequence previousTextPosition) { + private static boolean isSplitByOrientation(Word currentTextPosition, Word previousTextPosition) { return !previousTextPosition.getDir().equals(currentTextPosition.getDir()); } - private static boolean isNewLine(TextPositionSequence currentTextPosition, TextPositionSequence previousTextPosition, double avgTextPositionHeight) { + private static boolean isNewLine(Word currentTextPosition, Word previousTextPosition, double avgTextPositionHeight) { return Math.abs(previousTextPosition.getYDirAdj() - currentTextPosition.getYDirAdj()) > avgTextPositionHeight; } @@ -96,13 +96,13 @@ public class LineDetectionService { List> bBoxWithGapsByLines; List bBoxWithGapsInCurrentLine; - List>> sequencesWithGapsByLines; - List> sequencesWithGapsInCurrentLine; + List>> sequencesWithGapsByLines; + List> sequencesWithGapsInCurrentLine; - List currentSequencesWithoutGaps; + List currentSequencesWithoutGaps; - List> sequencesByLines; - List sequencesInCurrentLine; + List> sequencesByLines; + List sequencesInCurrentLine; List> xGaps; List> yGaps; @@ -116,14 +116,14 @@ public class LineDetectionService { List bBoxWithGapsInCurrentLine = new LinkedList<>(); bBoxWithGapsByLines.add(bBoxWithGapsInCurrentLine); - List>> sequencesWithGapsByLines = new LinkedList<>(); - List> sequencesWithGapsInCurrentLine = new LinkedList<>(); + List>> sequencesWithGapsByLines = new LinkedList<>(); + List> sequencesWithGapsInCurrentLine = new LinkedList<>(); sequencesWithGapsByLines.add(sequencesWithGapsInCurrentLine); - List currentSequencesWithoutGaps = new LinkedList<>(); + List currentSequencesWithoutGaps = new LinkedList<>(); sequencesWithGapsInCurrentLine.add(currentSequencesWithoutGaps); - List> sequencesByLines = new LinkedList<>(); - List sequencesInCurrentLine = new LinkedList<>(); + List> sequencesByLines = new LinkedList<>(); + List sequencesInCurrentLine = new LinkedList<>(); sequencesByLines.add(sequencesInCurrentLine); return new LineFactory(lineBBox, @@ -178,13 +178,13 @@ public class LineDetectionService { } - private Rectangle2D textPositionBBox(List textPositionSequences) { + private Rectangle2D textPositionBBox(List words) { - return RectangleTransformations.rectangle2DBBox(textPositionSequences.stream().map(TextPositionSequence::getBBox).toList()); + return RectangleTransformations.rectangle2DBBox(words.stream().map(Word::getBBox).toList()); } - public void addToCurrentLine(TextPositionSequence current) { + public void addToCurrentLine(Word current) { sequencesInCurrentLine.add(current); currentSequencesWithoutGaps.add(current); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/PageContentExtractor.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/PageContentExtractor.java index dde3b94..6bd802e 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/PageContentExtractor.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/PageContentExtractor.java @@ -13,7 +13,7 @@ import org.apache.pdfbox.pdmodel.PDPage; import org.springframework.core.io.ClassPathResource; import com.knecon.fforesight.service.layoutparser.processor.model.PageContents; -import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.model.text.Word; import com.knecon.fforesight.service.layoutparser.processor.services.parsing.PDFLinesTextStripper; import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; @@ -40,7 +40,7 @@ public class PageContentExtractor { stripper.setPdpage(pdPage); stripper.getText(pdDocument); - Map> sortedTextPositionSequencesPerDir = stripper.getTextPositionSequences() + Map> sortedTextPositionSequencesPerDir = stripper.getWords() .stream() .collect(Collectors.groupingBy(textPositionSequence -> textPositionSequence.getDir().getDegrees())); @@ -57,7 +57,7 @@ public class PageContentExtractor { } - public List sortByDirAccordingToPageRotation(Map> sortedTextPositionSequencesPerDir, int rotation) { + public List sortByDirAccordingToPageRotation(Map> sortedTextPositionSequencesPerDir, int rotation) { LinkedList sortedKeys = new LinkedList<>(sortedTextPositionSequencesPerDir.keySet().stream().sorted().toList()); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/PageInformationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/PageInformationService.java index 03353a1..dc98f4b 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/PageInformationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/PageInformationService.java @@ -14,9 +14,9 @@ public class PageInformationService { public PageInformation build(PageContents pageContents) { - LineInformation lineInformation = LineDetectionService.calculateLineInformation(pageContents.getSortedTextPositionSequences()); + LineInformation lineInformation = LineDetectionService.calculateLineInformation(pageContents.getSortedWords()); Rectangle2D mainBodyTextFrame = MainBodyTextFrameExtractionService.calculateMainBodyTextFrame(lineInformation); - GapInformation gapInformation = GapDetectionService.findGapsInLines(pageContents.getSortedTextPositionSequences(), mainBodyTextFrame); + GapInformation gapInformation = GapDetectionService.findGapsInLines(pageContents.getSortedWords(), mainBodyTextFrame); return new PageInformation(pageContents, lineInformation, mainBodyTextFrame, gapInformation); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TextRulingsClassifier.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TextRulingsClassifier.java index f809833..3f203e6 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TextRulingsClassifier.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TextRulingsClassifier.java @@ -5,7 +5,7 @@ import java.util.List; import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection; -import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.model.text.Word; import lombok.experimental.UtilityClass; @@ -17,9 +17,9 @@ public class TextRulingsClassifier { private final static double TEXT_BBOX_THRESHOLD_FACTOR = 0.15; // multiplied with text width then subtracted from word width. If ruling covers this width, it is considered as strikethrough/underline. - public static void classifyUnderlinedAndStrikethroughText(List words, CleanRulings cleanRulings) { + public static void classifyUnderlinedAndStrikethroughText(List words, CleanRulings cleanRulings) { - for (TextPositionSequence word : words) { + for (Word word : words) { if (word.getDir().equals(TextDirection.ZERO) || word.getDir().equals(TextDirection.HALF_CIRCLE)) { handleHorizontalText(cleanRulings, word); } else { @@ -29,7 +29,7 @@ public class TextRulingsClassifier { } - private static void handleVerticalText(CleanRulings cleanRulings, TextPositionSequence word) { + private static void handleVerticalText(CleanRulings cleanRulings, Word word) { float lowerY = (float) (word.getBBoxPdf().getMinY() + TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth()); float upperY = (float) (word.getBBoxPdf().getMaxY() - TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth()); @@ -63,7 +63,7 @@ public class TextRulingsClassifier { } - private static void handleHorizontalText(CleanRulings cleanRulings, TextPositionSequence word) { + private static void handleHorizontalText(CleanRulings cleanRulings, Word word) { float leftX = (float) (word.getBBoxPdf().getMinX() + TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth()); float rightX = (float) (word.getBBoxPdf().getMaxX() - TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth()); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/BlockificationPostprocessingService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/BlockificationPostprocessingService.java index 99ce865..4c6f5d9 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/BlockificationPostprocessingService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/BlockificationPostprocessingService.java @@ -16,7 +16,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType; import com.knecon.fforesight.service.layoutparser.processor.model.SectionIdentifier; import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; -import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.model.text.Word; import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities; import lombok.Data; @@ -222,14 +222,14 @@ public class BlockificationPostprocessingService { headline = sectionIdentifier + headline; } - WordSequenceResult wordSequenceResult = findWordSequence(blockToSplit.getSequences(), headline); + WordSequenceResult wordSequenceResult = findWordSequence(blockToSplit.getWords(), headline); if (wordSequenceResult.inSequence.isEmpty() && !headline.equals(title)) { - wordSequenceResult = findWordSequence(blockToSplit.getSequences(), title); + wordSequenceResult = findWordSequence(blockToSplit.getWords(), title); } boolean modifiedBlockToSplit = false; if (!wordSequenceResult.inSequence.isEmpty()) { - blockToSplit.setSequences(wordSequenceResult.inSequence); + blockToSplit.setWords(wordSequenceResult.inSequence); blockToSplit.recalculateBBox(); modifiedBlockToSplit = true; } @@ -250,19 +250,19 @@ public class BlockificationPostprocessingService { } - private static WordSequenceResult findWordSequence(List textPositionSequences, String text) { + private static WordSequenceResult findWordSequence(List words, String text) { String target = sanitizeString(text); - List inSequence = new ArrayList<>(); - List preSequence = new ArrayList<>(); - List postSequence = new ArrayList<>(); + List inSequence = new ArrayList<>(); + List preSequence = new ArrayList<>(); + List postSequence = new ArrayList<>(); StringBuilder currentSequence = new StringBuilder(); if (target.isBlank()) { return new WordSequenceResult(); } - for (TextPositionSequence sequence : textPositionSequences) { + for (Word sequence : words) { currentSequence.append(sanitizeString(sequence.toString())); inSequence.add(sequence); @@ -274,10 +274,10 @@ public class BlockificationPostprocessingService { int index = 0; String toRemove = currentSequence.substring(0, currentSequence.length() - target.length()); - TextPositionSequence next = inSequence.get(index); + Word next = inSequence.get(index); while (currentSequence.length() - next.length() >= target.length()) { - TextPositionSequence removed = inSequence.remove(index); + Word removed = inSequence.remove(index); currentSequence.delete(0, removed.toString().length()); preSequence.add(removed); @@ -306,7 +306,7 @@ public class BlockificationPostprocessingService { } if (currentSequence.toString().equals(target)) { - postSequence.addAll(textPositionSequences.subList(textPositionSequences.indexOf(sequence) + 1, textPositionSequences.size())); + postSequence.addAll(words.subList(words.indexOf(sequence) + 1, words.size())); return new WordSequenceResult(inSequence, preSequence, postSequence); } } @@ -316,10 +316,10 @@ public class BlockificationPostprocessingService { } - private static SplitSequenceResult splitSequence(TextPositionSequence sequence, String toRemove) { + private static SplitSequenceResult splitSequence(Word sequence, String toRemove) { - TextPositionSequence in = null; - TextPositionSequence out; + Word in = null; + Word out; String currentSequence = sequence.toString().toLowerCase(Locale.ROOT); int index = currentSequence.indexOf(toRemove); @@ -337,9 +337,9 @@ public class BlockificationPostprocessingService { } - private static TextPositionSequence createSubSequence(TextPositionSequence sequence, int start, int end) { + private static Word createSubSequence(Word sequence, int start, int end) { - TextPositionSequence newSeq = new TextPositionSequence(new ArrayList<>(sequence.getTextPositions().subList(start, end)), sequence.getPage()); + Word newSeq = new Word(new ArrayList<>(sequence.getTextPositions().subList(start, end)), sequence.getPage()); newSeq.setParagraphStart(sequence.isParagraphStart()); return newSeq; } @@ -354,10 +354,10 @@ public class BlockificationPostprocessingService { List mergedBlocks = new ArrayList<>(); for (TextPageBlock textPageBlock : blocksToMerge.subList(1, blocksToMerge.size())) { - if (firstBlock != null && !firstBlock.getSequences().isEmpty()) { + if (firstBlock != null && !firstBlock.getWords().isEmpty()) { if (textPageBlock.getDir() == firstBlock.getDir()) { - firstBlock.addAll(textPageBlock.getSequences()); + firstBlock.addAll(textPageBlock.getWords()); mergedBlocks.add(textPageBlock); } } @@ -496,12 +496,12 @@ public class BlockificationPostprocessingService { public static class WordSequenceResult { - public List inSequence; - public List preSequence; - public List postSequence; + public List inSequence; + public List preSequence; + public List postSequence; - public WordSequenceResult(List inSequence, List preSequence, List postSequence) { + public WordSequenceResult(List inSequence, List preSequence, List postSequence) { this.inSequence = inSequence; this.preSequence = preSequence; @@ -522,7 +522,7 @@ public class BlockificationPostprocessingService { } - public record SplitSequenceResult(TextPositionSequence in, TextPositionSequence out) { + public record SplitSequenceResult(Word in, Word out) { } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java index 3d9c26c..edaee05 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java @@ -14,7 +14,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; -import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.model.text.Word; import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutDebugLayer; import lombok.RequiredArgsConstructor; @@ -30,7 +30,7 @@ public class DocstrumBlockificationService { static final float THRESHOLD = 1f; - public ClassificationPage blockify(List textPositions, + public ClassificationPage blockify(List textPositions, CleanRulings rulings, boolean xyOrder, LayoutDebugLayer visualizations, @@ -72,16 +72,16 @@ public class DocstrumBlockificationService { List abstractPageBlocks = new ArrayList<>(); zones.forEach(zone -> { - List textPositionSequences = new ArrayList<>(); + List words = new ArrayList<>(); zone.getLines() .forEach(line -> { line.getWords() .forEach(word -> { - textPositionSequences.add(new TextPositionSequence(word.getTextPositions(), word.getPage())); + words.add(new Word(word.getTextPositions(), word.getPage())); }); }); - abstractPageBlocks.add(buildTextBlock(textPositionSequences, 0)); + abstractPageBlocks.add(buildTextBlock(words, 0)); }); return abstractPageBlocks; @@ -102,7 +102,7 @@ public class DocstrumBlockificationService { } TextPageBlock current = (TextPageBlock) block; - if (previous != null && !previous.getSequences().isEmpty()) { + if (previous != null && !previous.getWords().isEmpty()) { if (current.getDir() != previous.getDir() || usedRulings.lineBetween(current, previous)) { previous = current; @@ -182,8 +182,8 @@ public class DocstrumBlockificationService { private TextPageBlock combineBlocksAndResetIterator(TextPageBlock previous, TextPageBlock current, ListIterator itty, boolean toDuplicate) { - previous.addAll(current.getSequences()); - previous = buildTextBlock(previous.getSequences(), 0); + previous.addAll(current.getWords()); + previous = buildTextBlock(previous.getWords(), 0); previous.setToDuplicate(toDuplicate); if (current.getClassification() != null && previous.getClassification() == null) { previous.setClassification(current.getClassification()); @@ -283,8 +283,8 @@ public class DocstrumBlockificationService { if (current.getDir() == inner.getDir() && current.intersects(inner, yThreshold, xThreshold)) { boolean toDuplicate = current.isToDuplicate() || inner.isToDuplicate(); - current.addAll(inner.getSequences()); - current = buildTextBlock(current.getSequences(), 0); + current.addAll(inner.getWords()); + current = buildTextBlock(current.getWords(), 0); current.setToDuplicate(toDuplicate); blocks.set(i, null); @@ -301,7 +301,7 @@ public class DocstrumBlockificationService { } - public static TextPageBlock buildTextBlock(List wordBlockList, int indexOnPage) { + public static TextPageBlock buildTextBlock(List wordBlockList, int indexOnPage) { return new TextPageBlock(wordBlockList); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocuMineBlockificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocuMineBlockificationService.java index 8949ce1..ce94487 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocuMineBlockificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocuMineBlockificationService.java @@ -16,13 +16,14 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Orientation; import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; -import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.model.text.Word; @SuppressWarnings("all") @Service public class DocuMineBlockificationService { static final float THRESHOLD = 1f; + public static final double FONT_SIZE_CHANGE_RATIO = 0.15; Pattern pattern = Pattern.compile("^(\\d{1,2}\\.){1,3}\\d{1,2}\\.?\\s[0-9A-Za-z ()-]{2,50}", Pattern.CASE_INSENSITIVE); @@ -36,9 +37,9 @@ public class DocuMineBlockificationService { * @param cleanRulings All rulings on a page * @return Page object that contains the Textblock and text statistics. */ - public ClassificationPage blockify(List textPositions, CleanRulings cleanRulings) { + public ClassificationPage blockify(List textPositions, CleanRulings cleanRulings) { - List chunkWords = new ArrayList<>(); + List chunkWords = new ArrayList<>(); List textPageBlocks = new ArrayList<>(); CleanRulings usedRulings = cleanRulings.withoutTextRulings(); @@ -47,11 +48,11 @@ public class DocuMineBlockificationService { double maxX = 0; double minY = 1000; double maxY = 0; - TextPositionSequence prev = null; + Word prev = null; boolean wasSplitted = false; Double splitX1 = null; - for (TextPositionSequence word : textPositions) { + for (Word word : textPositions) { boolean lineSeparation = prev != null && word.getYDirAdj() - prev.getMaxYDirAdj() > Math.min(word.getHeight(), prev.getHeight()) * 1.1; boolean startFromTop = prev != null && word.getYDirAdj() < prev.getYDirAdj() - prev.getTextHeight(); @@ -60,11 +61,7 @@ public class DocuMineBlockificationService { boolean newLineAfterSplit = prev != null && word.getYDirAdj() != prev.getYDirAdj() && wasSplitted && splitX1 != word.getXDirAdj(); boolean isSplitByRuling = prev != null && usedRulings.lineBetween(prev, word); boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir()); - boolean splitByOtherFontAndOtherY = prev != null && Math.abs(prev.getMaxYDirAdj() - word.getMaxYDirAdj()) > word.getTextHeight() * 0.2 // - && (word.getFontStyle().contains("bold") && !prev.getFontStyle().contains("bold") - || prev.getFontStyle().contains("bold") && !word.getFontStyle().contains("bold") - || Math.abs(prev.getFontSize() - word.getFontSize()) >= 1 - || Math.abs(word.getTextHeight() - prev.getTextHeight()) > 0.8); + boolean splitByOtherFontAndOtherY = prev != null && Math.abs(prev.getMaxYDirAdj() - word.getMaxYDirAdj()) > word.getTextHeight() * 0.2 && isFontChange(word, prev); Matcher matcher = pattern.matcher(chunkWords.stream() .collect(Collectors.joining(" ")).toString()); @@ -127,6 +124,15 @@ public class DocuMineBlockificationService { } + private static boolean isFontChange(Word word, Word prev) { + + return word.getFontStyle().contains("bold") && !prev.getFontStyle().contains("bold") + || prev.getFontStyle().contains("bold") && !word.getFontStyle().contains("bold") + || Math.abs(prev.getFontSize() - word.getFontSize()) >= FONT_SIZE_CHANGE_RATIO * Math.min(prev.getFontSize(), word.getFontSize()) + || Math.abs(word.getTextHeight() - prev.getTextHeight()) >= FONT_SIZE_CHANGE_RATIO * Math.min(prev.getTextHeight(), word.getTextHeight()); + } + + public void mergeblocks(ClassificationPage page, CleanRulings usedRulings, float xThreshold, float yThreshold) { var blocks = page.getTextBlocks(); @@ -169,8 +175,8 @@ public class DocuMineBlockificationService { .equals(inner.getClassification()))) { boolean toDuplicate = current.isToDuplicate() || inner.isToDuplicate(); - current.addAll(inner.getSequences()); - current = buildTextBlock(current.getSequences(), 0); + current.addAll(inner.getWords()); + current = buildTextBlock(current.getWords(), 0); current.setClassification(inner.getClassification()); current.setToDuplicate(toDuplicate); blocks.set(i, null); @@ -193,7 +199,7 @@ public class DocuMineBlockificationService { } - public static TextPageBlock buildTextBlock(List wordBlockList, int indexOnPage) { + public static TextPageBlock buildTextBlock(List wordBlockList, int indexOnPage) { return new TextPageBlock(wordBlockList); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/RedactManagerBlockificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/RedactManagerBlockificationService.java index 6ed553b..68c9c97 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/RedactManagerBlockificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/RedactManagerBlockificationService.java @@ -11,7 +11,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification import com.knecon.fforesight.service.layoutparser.processor.model.Orientation; import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; -import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.model.text.Word; import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutDebugLayer; @SuppressWarnings("all") @@ -30,20 +30,20 @@ public class RedactManagerBlockificationService { * @param visualizations * @return Page object that contains the Textblock and text statistics. */ - public ClassificationPage blockify(List textPositions, CleanRulings cleanRulings, LayoutDebugLayer visualizations) { + public ClassificationPage blockify(List textPositions, CleanRulings cleanRulings, LayoutDebugLayer visualizations) { CleanRulings usedRulings = cleanRulings.withoutTextRulings(); int indexOnPage = 0; - List chunkWords = new ArrayList<>(); + List chunkWords = new ArrayList<>(); List chunkBlockList = new ArrayList<>(); double minX = 1000, maxX = 0, minY = 1000, maxY = 0; - TextPositionSequence prev = null; + Word prev = null; boolean wasSplitted = false; Double splitX1 = null; - for (TextPositionSequence word : textPositions) { + for (Word word : textPositions) { boolean lineSeparation = word.getYDirAdj() - maxY > word.getHeight() * 1.25; boolean startFromTop = prev != null && word.getYDirAdj() < prev.getYDirAdj() - prev.getTextHeight(); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/ClarifyndClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/ClarifyndClassificationService.java index 20ced9f..c47ac9a 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/ClarifyndClassificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/ClarifyndClassificationService.java @@ -81,7 +81,7 @@ public class ClarifyndClassificationService { && (textBlock.getMostPopularWordStyle().equals("bold") || !document.getFontStyleCounter().getCountPerValue().containsKey("bold") && textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular() + 1) - && textBlock.getSequences().get(0).getTextPositions().get(0).getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) { + && textBlock.getWords().get(0).getTextPositions().get(0).getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) { PageBlockType headlineType = HeadlineClassificationService.headlineClassByFontSize(textBlock, headlineFontSizes); headlineClassificationService.classifyHeadline(textBlock, headlineType); @@ -91,7 +91,7 @@ public class ClarifyndClassificationService { && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold") && PositionUtils.getApproxLineCount(textBlock) < 2.9 - && textBlock.getSequences().get(0).getTextPositions().get(0).getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) { + && textBlock.getWords().get(0).getTextPositions().get(0).getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) { PageBlockType headlineType = HeadlineClassificationService.headlineClassByFontSize(textBlock, headlineFontSizes); headlineClassificationService.classifyHeadline(textBlock, headlineType); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java index a072748..38b1097 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java @@ -15,7 +15,6 @@ import java.util.List; import java.util.Locale; import java.util.Map; import java.util.regex.Matcher; -import java.util.regex.Pattern; import org.springframework.stereotype.Service; @@ -153,6 +152,16 @@ public class DocuMineClassificationService { && !headlineWithSlashesMatches) { setAsHeadline(headlineClassificationService, textBlock, document, headlineFontSizes); +// } else if (textBlock.getMostPopularWordFont().contains("bold") +// && greaterOrEqualFontThanPageAverage(textBlock, page) +// && textBlock.getWords().size() <= 6 +// && PositionUtils.getApproxLineCount(textBlock) < 2.9 +// && isAtLeast3Characters +// && charCount > textBlock.getText().length() * 0.75 +// && !textBlock.getText().contains(":") +// && textBlock.getWidth() < page.getBodyTextFrame().getWidth() * 0.7) { +// +// setAsHeadline(headlineClassificationService, textBlock, document, headlineFontSizes); } else if (!listIdentifiers.isEmpty()) { textBlock.setClassification(PageBlockType.LIST_ITEM); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/ListItemClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/ListItemClassificationService.java index b2fc088..9a450ca 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/ListItemClassificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/ListItemClassificationService.java @@ -11,7 +11,7 @@ import org.springframework.stereotype.Service; import com.knecon.fforesight.service.layoutparser.processor.model.text.AbstractBlockOnPage; import com.knecon.fforesight.service.layoutparser.processor.model.text.ListIdentifier; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; -import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.model.text.Word; @Service public class ListItemClassificationService { @@ -71,7 +71,7 @@ public class ListItemClassificationService { List result = new LinkedList<>(); if (block.block() instanceof TextPageBlock textBlock) { - List sequences = textBlock.getSequences(); + List sequences = textBlock.getWords(); for (int i = 0; i < sequences.size(); i++) { if (i != 0 && sequences.get(i - 1).getXDirAdj() < sequences.get(i).getXDirAdj()) { @@ -79,8 +79,8 @@ public class ListItemClassificationService { continue; } - TextPositionSequence sequence = sequences.get(i); - List wordsAtStartOfLine = new ArrayList<>(3); + Word sequence = sequences.get(i); + List wordsAtStartOfLine = new ArrayList<>(3); int end = Math.min(sequences.size(), i + 3); for (int j = i; j < end; j++) { if (sequences.get(j).intersectsYDirAdj(sequence, 2)) { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/RedactManagerClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/RedactManagerClassificationService.java index 63d3beb..2683f61 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/RedactManagerClassificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/RedactManagerClassificationService.java @@ -11,7 +11,6 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; -import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils; import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils; import lombok.RequiredArgsConstructor; @@ -95,7 +94,7 @@ public class RedactManagerClassificationService { && (textBlock.getMostPopularWordStyle().equals("bold") || !document.getFontStyleCounter().getCountPerValue().containsKey("bold") && textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular() + 1) - && textBlock.getSequences().get(0).getTextPositions().get(0).getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) { + && textBlock.getWords().get(0).getTextPositions().get(0).getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) { PageBlockType headlineType = HeadlineClassificationService.headlineClassByFontSize(textBlock, headlineFontSizes); headlineClassificationService.classifyHeadline(textBlock, headlineType); @@ -105,7 +104,7 @@ public class RedactManagerClassificationService { && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold") && PositionUtils.getApproxLineCount(textBlock) < 2.9 - && textBlock.getSequences().get(0).getTextPositions().get(0).getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) { + && textBlock.getWords().get(0).getTextPositions().get(0).getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) { PageBlockType headlineType = HeadlineClassificationService.headlineClassByFontSize(textBlock, headlineFontSizes); headlineClassificationService.classifyHeadline(textBlock, headlineType); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/TableOfContentsClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/TableOfContentsClassificationService.java index 0e955e7..2465632 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/TableOfContentsClassificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/TableOfContentsClassificationService.java @@ -25,7 +25,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextBlockOnPage; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; -import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.model.text.Word; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequenceComparator; import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities; import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutDebugLayer; @@ -71,8 +71,8 @@ public class TableOfContentsClassificationService { ClassificationPage startPage = textBlocks.get(start).page(); List initialLookAhead = textBlocks.subList(start, Math.min(start + SURROUNDING_BLOCKS_RADIUS, textBlocks.size())); - HashMap lookup = new HashMap<>(); - List numbers = extractNumbers(initialLookAhead, lookup, document.getPages().size()); + HashMap lookup = new HashMap<>(); + List numbers = extractNumbers(initialLookAhead, lookup, document.getPages().size()); TocNumberFinder tocNumberFinder = new TocNumberFinder(numbers, lookup); int lastCandidate = start; @@ -93,9 +93,9 @@ public class TableOfContentsClassificationService { break; } - List numbersFromBlock = extractNumbers(textBlockOnPage, lookup, document.getPages().size()); + List numbersFromBlock = extractNumbers(textBlockOnPage, lookup, document.getPages().size()); - List currentRightmostCluster = tocNumberFinder.getCurrentRightmostCluster(); + List currentRightmostCluster = tocNumberFinder.getCurrentRightmostCluster(); if (currentRightmostCluster.size() < MINIMUM_MATCHES) { log.debug("No numbers indicating a table of contents here."); @@ -132,7 +132,7 @@ public class TableOfContentsClassificationService { } - private static void addVisualization(LayoutDebugLayer layoutDebugLayer, TocNumberFinder tocNumberFinder, Map lookup) { + private static void addVisualization(LayoutDebugLayer layoutDebugLayer, TocNumberFinder tocNumberFinder, Map lookup) { tocNumberFinder.getCurrentRightmostCluster() .stream() @@ -141,9 +141,9 @@ public class TableOfContentsClassificationService { } - private static boolean anyIntersection(Collection numbers1, - Collection numbers2, - Map lookup) { + private static boolean anyIntersection(Collection numbers1, + Collection numbers2, + Map lookup) { return numbers1.stream() .anyMatch(numberFromCluster -> numbers2.stream() @@ -151,9 +151,9 @@ public class TableOfContentsClassificationService { } - private static List extractNumbers(List textBlocks, Map lookup, int numberOfPages) { + private static List extractNumbers(List textBlocks, Map lookup, int numberOfPages) { - List blocks = new LinkedList<>(); + List blocks = new LinkedList<>(); for (TextBlockOnPage textBlock : textBlocks) { blocks.addAll(extractNumbers(textBlock, lookup, numberOfPages)); } @@ -161,14 +161,14 @@ public class TableOfContentsClassificationService { } - private static List extractNumbers(TextBlockOnPage textBlock, Map lookup, int numberOfPages) { + private static List extractNumbers(TextBlockOnPage textBlock, Map lookup, int numberOfPages) { - List blocks = new LinkedList<>(); + List blocks = new LinkedList<>(); TextPageBlock block = textBlock.textBlock(); - List sequences = block.getSequences(); + List sequences = block.getWords(); for (int i = 0; i < sequences.size(); i++) { - TextPositionSequence word = sequences.get(i); + Word word = sequences.get(i); if (!NUMERIC.matcher(word).matches()) { continue; @@ -193,17 +193,17 @@ public class TableOfContentsClassificationService { } - private static CharSequence getSurroundingString(int i, List sequences) { + private static CharSequence getSurroundingString(int i, List sequences) { int end = Math.min(i + 5, sequences.size()); return sequences.subList(i, end) .stream() - .map(TextPositionSequence::toString) + .map(Word::toString) .collect(Collectors.joining(" ")); } - private static boolean matches(TextPositionSequence number1, TextPositionSequence number2, Map lookup) { + private static boolean matches(Word number1, Word number2, Map lookup) { if (number1.getDir() != number2.getDir()) { return false; @@ -247,11 +247,11 @@ public class TableOfContentsClassificationService { private static class TocNumberFinder { - final UnionFind numberClusters; - final HashMap lookup; + final UnionFind numberClusters; + final HashMap lookup; - TocNumberFinder(List blocks, HashMap lookup) { + TocNumberFinder(List blocks, HashMap lookup) { this.numberClusters = new UnionFind<>(new HashSet<>(blocks)); for (int i = 0; i < blocks.size(); i++) { @@ -265,14 +265,14 @@ public class TableOfContentsClassificationService { } - public void add(TextPositionSequence number) { + public void add(Word number) { if (numberClusters.getElements().contains(number)) { return; } numberClusters.addElement(number); - for (TextPositionSequence element : numberClusters.getElements()) { + for (Word element : numberClusters.getElements()) { if (matches(number, element, lookup)) { numberClusters.union(element, number); } @@ -280,7 +280,7 @@ public class TableOfContentsClassificationService { } - public List getCurrentRightmostCluster() { + public List getCurrentRightmostCluster() { return numberClusters.getGroups() .stream() @@ -322,9 +322,9 @@ public class TableOfContentsClassificationService { // } - public List removeOutliers(List numbers) { + public List removeOutliers(List numbers) { - List result = new ArrayList<>(); + List result = new ArrayList<>(); result.add(numbers.get(0)); @@ -346,7 +346,7 @@ public class TableOfContentsClassificationService { // Helper method to check if removing the current number results in a better order - public static boolean isBetterWithout(List numbers, int i) { + public static boolean isBetterWithout(List numbers, int i) { if (i == 0 || i == numbers.size() - 1) { return false; @@ -362,7 +362,7 @@ public class TableOfContentsClassificationService { } - private static int getNumberAsInt(List numbers, int i) { + private static int getNumberAsInt(List numbers, int i) { return Integer.parseInt(numbers.get(i).toString()); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java index 69ba4b9..7ecfd6c 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java @@ -97,12 +97,12 @@ public class DocumentGraphFactory { for (TableOfContentItem tocItem : classificationDocument.getTableOfContents()) { GenericSemanticNode parent = tocItem.getParent() == null ? null : tocItem.getParent().getSection(); Optional section = SectionNodeFactory.addSection(layoutParsingType, - parent, - tocItem.getChildren().isEmpty(), - tocItem.getNonEmptySectionBlocks(), - tocItem.getImages(), - context, - document); + parent, + tocItem.getChildren().isEmpty(), + tocItem.getNonEmptySectionBlocks(), + tocItem.getImages(), + context, + document); tocItem.setSection(section.orElse(null)); } } @@ -133,9 +133,9 @@ public class DocumentGraphFactory { if (node instanceof DuplicatedParagraph duplicatedParagraph) { AtomicTextBlock unsortedTextBlock = context.textBlockFactory.buildAtomicTextBlock(textBlocks.stream() - .flatMap(tb -> tb.getSequences() - .stream()) - .collect(Collectors.toList()), node, context, page); + .flatMap(tb -> tb.getWords() + .stream()) + .collect(Collectors.toList()), node, context, page); duplicatedParagraph.setUnsortedLeafTextBlock(unsortedTextBlock); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SearchTextWithTextPositionFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SearchTextWithTextPositionFactory.java index 3a76c39..856cba4 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SearchTextWithTextPositionFactory.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SearchTextWithTextPositionFactory.java @@ -12,7 +12,7 @@ import java.util.Objects; import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange; import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition; -import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.model.text.Word; import lombok.experimental.UtilityClass; @@ -28,7 +28,7 @@ public class SearchTextWithTextPositionFactory { public static final double LINEBREAK_DELTA_TOLERANCE = 1.5; - public SearchTextWithTextPositionDto buildSearchTextToTextPositionDto(List sequences) { + public SearchTextWithTextPositionDto buildSearchTextToTextPositionDto(List sequences) { if (sequences.isEmpty() || sequences.stream() .allMatch(sequence -> sequence.getTextPositions().isEmpty())) { @@ -40,7 +40,7 @@ public class SearchTextWithTextPositionFactory { RedTextPosition currentTextPosition = sequences.get(0).getTextPositions().get(0); RedTextPosition previousTextPosition = RedTextPosition.builder().unicode(" ").bBoxDirAdj(currentTextPosition.getBBoxDirAdj()).build(); - for (TextPositionSequence word : sequences) { + for (Word word : sequences) { for (int i = 0; i < word.getTextPositions().size(); ++i) { currentTextPosition = word.getTextPositions().get(i); @@ -66,7 +66,7 @@ public class SearchTextWithTextPositionFactory { } List positions = sequences.stream() - .map(TextPositionSequence::getTextPositions) + .map(Word::getTextPositions) .flatMap(Collection::stream) .map(RedTextPosition::getBBoxPdf) .toList(); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/TableNodeFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/TableNodeFactory.java index 313da70..b8d7d10 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/TableNodeFactory.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/TableNodeFactory.java @@ -16,7 +16,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Ta import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock; import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock; -import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.model.text.Word; import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionOperations; import lombok.experimental.UtilityClass; @@ -115,7 +115,7 @@ public class TableNodeFactory { if (cell.getTextBlocks().isEmpty()) { tableCell.setLeafTextBlock(context.getTextBlockFactory().emptyTextBlock(tableNode, context, page)); } else if (cell.getTextBlocks().size() == 1) { - textBlock = context.getTextBlockFactory().buildAtomicTextBlock(cell.getTextBlocks().get(0).getSequences(), tableCell, context, page); + textBlock = context.getTextBlockFactory().buildAtomicTextBlock(cell.getTextBlocks().get(0).getWords(), tableCell, context, page); tableCell.setLeafTextBlock(textBlock); } else if (firstTextBlockIsHeadline(cell)) { SectionNodeFactory.addSection(layoutParsingType, @@ -129,7 +129,7 @@ public class TableNodeFactory { context, document); } else if (cellAreaIsSmallerThanPageAreaTimesThreshold(cell, page)) { - List sequences = TextPositionOperations.mergeAndSort(cell.getTextBlocks()); + List sequences = TextPositionOperations.mergeAndSort(cell.getTextBlocks()); textBlock = context.getTextBlockFactory().buildAtomicTextBlock(sequences, tableCell, context, page); tableCell.setLeafTextBlock(textBlock); } else { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/TextBlockFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/TextBlockFactory.java index d53c5a5..4ece312 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/TextBlockFactory.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/TextBlockFactory.java @@ -2,7 +2,7 @@ package com.knecon.fforesight.service.layoutparser.processor.services.factory; import java.util.List; -import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.model.text.Word; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode; import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock; @@ -17,14 +17,14 @@ public class TextBlockFactory { long textBlockIdx; - public AtomicTextBlock buildAtomicTextBlock(List sequences, SemanticNode parent, DocumentGraphFactory.Context context, Page page) { + public AtomicTextBlock buildAtomicTextBlock(List sequences, SemanticNode parent, DocumentGraphFactory.Context context, Page page) { Integer numberOnPage = context.getAndIncrementTextBlockNumberOnPage(page); return buildAtomicTextBlock(sequences, parent, numberOnPage, page); } - public AtomicTextBlock buildAtomicTextBlock(List sequences, SemanticNode parent, Integer numberOnPage, Page page) { + public AtomicTextBlock buildAtomicTextBlock(List sequences, SemanticNode parent, Integer numberOnPage, Page page) { SearchTextWithTextPositionDto searchTextWithTextPositionDto = SearchTextWithTextPositionFactory.buildSearchTextToTextPositionDto(sequences); int offset = stringOffset; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/graphics/GraphicExtractorService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/graphics/GraphicExtractorService.java index 42c483a..127c6e8 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/graphics/GraphicExtractorService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/graphics/GraphicExtractorService.java @@ -11,7 +11,7 @@ import org.springframework.stereotype.Service; import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox; import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; -import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.model.text.Word; import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation; import lombok.RequiredArgsConstructor; @@ -33,10 +33,10 @@ public class GraphicExtractorService { PDPage pdPage, int pageNumber, CleanRulings cleanRulings, - List textPositionSequences, + List words, boolean graphicsRaster) { - List characterBBoxes = getCharacterBBoxes(textPositionSequences); + List characterBBoxes = getCharacterBBoxes(words); List classifiedRulingsBoxes = getLineBBoxesOfAllClassifiedRulings(cleanRulings); GraphicBBDetector graphicBBDetector = new GraphicBBDetector(pdPage, true); @@ -63,9 +63,9 @@ public class GraphicExtractorService { } - private List getCharacterBBoxes(List textPositionSequences) { + private List getCharacterBBoxes(List words) { - return textPositionSequences.stream() + return words.stream() .map(BoundingBox::getBBoxPdf) .map(Box::new) .collect(Collectors.toList()); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFLinesTextStripper.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFLinesTextStripper.java index c125acd..c4985bd 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFLinesTextStripper.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFLinesTextStripper.java @@ -40,7 +40,7 @@ import org.apache.pdfbox.text.TextPosition; import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition; -import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.model.text.Word; import lombok.Getter; import lombok.Setter; @@ -52,7 +52,7 @@ import lombok.extern.slf4j.Slf4j; public class PDFLinesTextStripper extends PDFTextStripper { private final static Set DOT_LIKE_CHARACTERS = Set.of(".", "·", "•", "․", "‧", "∙", "⋅", "・", ".", "・", "…", "⸱", "﹒", "ꞏ"); - private final List textPositionSequences = new ArrayList<>(); + private final List words = new ArrayList<>(); private final List rulings = new ArrayList<>(); private final List graphicsPath = new ArrayList<>(); @Setter @@ -246,10 +246,10 @@ public class PDFLinesTextStripper extends PDFTextStripper { direction = textPositions.get(i).getDir(); } - if (!textPositionSequences.isEmpty()) { - previous = textPositionSequences.get(textPositionSequences.size() - 1) + if (!words.isEmpty()) { + previous = words.get(words.size() - 1) .getTextPositions() - .get(textPositionSequences.get(textPositionSequences.size() - 1).getTextPositions().size() - 1); + .get(words.get(words.size() - 1).getTextPositions().size() - 1); } if (i == 0 && (textPositions.get(i).getUnicode().equals(" ") || textPositions.get(i).getUnicode().equals("\u00A0") || textPositions.get(i).getUnicode().equals("\t"))) { @@ -259,7 +259,7 @@ public class PDFLinesTextStripper extends PDFTextStripper { if (textPositions.get(i).getDir() != direction && startIndex != i) { List sublist = textPositions.subList(startIndex, i); - textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart)); + words.add(new Word(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart)); startIndex = i; direction = textPositions.get(i).getDir(); } @@ -268,7 +268,7 @@ public class PDFLinesTextStripper extends PDFTextStripper { if (checkIfCurrentPositionIsToTheRightOfPreviousPosition(i, textPositions)) { List sublist = textPositions.subList(startIndex, i); if (checkIfSequenceContainsOnlyWhitespaces(sublist)) { - textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart)); + words.add(new Word(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart)); } startIndex = i; } @@ -276,7 +276,7 @@ public class PDFLinesTextStripper extends PDFTextStripper { if (textPositions.get(i).getRotation() == 0 && i > 0 && textPositions.get(i).getX() > textPositions.get(i - 1).getEndX() + 1) { List sublist = textPositions.subList(startIndex, i); if (checkIfSequenceContainsOnlyWhitespaces(sublist)) { - textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart)); + words.add(new Word(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart)); } startIndex = i; } @@ -290,22 +290,22 @@ public class PDFLinesTextStripper extends PDFTextStripper { // Remove false sequence ends (whitespaces) if (checkIfGapSizeBetweenCharactersSmallerThanMaximum(previous, sublist, 0.01f)) { for (TextPosition t : sublist) { - textPositionSequences.get(textPositionSequences.size() - 1).add(t); + words.get(words.size() - 1).add(t); } } else { - textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart)); + words.add(new Word(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart)); } } startIndex = i + 1; } if (isDottedLineFollowedByWord(textPositions, i, startIndex)) { List sublist = textPositions.subList(startIndex, i); - textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart)); + words.add(new Word(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart)); startIndex = i; } if (isWordFollowedByDottedLine(textPositions, i, startIndex)) { List sublist = textPositions.subList(startIndex, i - 2); - textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart)); + words.add(new Word(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart)); startIndex = i - 2; } } @@ -324,10 +324,10 @@ public class PDFLinesTextStripper extends PDFTextStripper { && sublist.get(0).getYDirAdj() == previous.getYDirAdj() && sublist.get(0).getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()) < 0.01) { for (TextPosition t : sublist) { - textPositionSequences.get(textPositionSequences.size() - 1).add(t); + words.get(words.size() - 1).add(t); } } else { - textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, isParagraphStart)); + words.add(new Word(sublist, pageNumber, isParagraphStart)); } } @@ -392,7 +392,7 @@ public class PDFLinesTextStripper extends PDFTextStripper { @Override public String getText(PDDocument doc) throws IOException { - textPositionSequences.clear(); + words.clear(); rulings.clear(); graphicsPath.clear(); path_x = 0.0f; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/MarkedContentUtils.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/MarkedContentUtils.java index 015f5bb..13de446 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/MarkedContentUtils.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/MarkedContentUtils.java @@ -13,7 +13,7 @@ import org.apache.pdfbox.text.TextPosition; import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; -import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.model.text.Word; import lombok.experimental.UtilityClass; @@ -48,7 +48,7 @@ public class MarkedContentUtils { return markedContentByYPosition.values() .stream() - .map(textPositions -> new TextPositionSequence(textPositions, 0, true).getBBoxPdf()) + .map(textPositions -> new Word(textPositions, 0, true).getBBoxPdf()) .map(t -> new Rectangle2D.Double(t.getX(), t.getY() - Math.abs(t.getHeight()), t.getWidth(), Math.abs(t.getHeight()))) .collect(Collectors.toList()); } @@ -89,7 +89,7 @@ public class MarkedContentUtils { .filter(content -> content instanceof TextPosition) .map(content -> (TextPosition) content) .filter(content -> !content.getUnicode().equals(" ")) - .map(textPositions -> new TextPositionSequence(List.of(textPositions), 0, true)) + .map(textPositions -> new Word(List.of(textPositions), 0, true)) .map(BoundingBox::getBBoxPdf) .collect(Collectors.toList()); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionOperations.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionOperations.java index 3972243..442bf8a 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionOperations.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionOperations.java @@ -16,7 +16,7 @@ import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Union import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.DoubleUtils; import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.FastAtan2; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; -import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.model.text.Word; import lombok.experimental.UtilityClass; @@ -36,33 +36,33 @@ public class TextPositionOperations { .thenComparing(TextBoundingBox::getXDirAdj, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, X_THRESHOLD)); - public List mergeAndSort(List textBlocks) { + public List mergeAndSort(List textBlocks) { var sequences = textBlocks.stream() - .flatMap(tb -> tb.getSequences() + .flatMap(tb -> tb.getWords() .stream()) .collect(Collectors.toSet()); return sortUsingLineDetection(sequences); } - public List sort(List sequences) { + public List sort(List sequences) { return sortUsingLineDetection(new HashSet<>(sequences)); } - private List sortUsingLineDetection(Set sequences) { + private List sortUsingLineDetection(Set sequences) { return sortLines(groupByLine(sequences)); } - public List sortLines(Collection> lines) { + public List sortLines(Collection> lines) { - List> lineBlocks = new ArrayList<>(); - for (Set line : lines) { - List sortedLine = sortByXDirAdj(line); + List> lineBlocks = new ArrayList<>(); + for (Set line : lines) { + List sortedLine = sortByXDirAdj(line); if (!sortedLine.isEmpty()) { lineBlocks.add(sortedLine); } @@ -70,35 +70,35 @@ public class TextPositionOperations { // need to use old sorting, since COMPARATOR_DIR_ADJ is not transitive QuickSort.sort(lineBlocks, Comparator.comparing(line -> line.get(0), COMPARATOR_DIR_ADJ)); - List list = new ArrayList<>(); - for (List textPositionSequences : lineBlocks) { - list.addAll(textPositionSequences); + List list = new ArrayList<>(); + for (List words : lineBlocks) { + list.addAll(words); } return list; } - private List sortByXDirAdj(Set line) { + private List sortByXDirAdj(Set line) { return line.stream() - .sorted(Comparator.comparing(TextPositionSequence::getXDirAdj)) + .sorted(Comparator.comparing(Word::getXDirAdj)) .toList(); } - public Collection> groupByLine(Set sequences) { + public Collection> groupByLine(Set sequences) { double maxLineDistance = sequences.stream() - .map(TextPositionSequence::getBBoxDirAdj) + .map(Word::getBBoxDirAdj) .mapToDouble(RectangularShape::getHeight).average().orElse(10) * MAX_LINE_HEIGHT_FACTOR; double maxXGap = sequences.stream() - .map(TextPositionSequence::getBBoxDirAdj) + .map(Word::getBBoxDirAdj) .mapToDouble(RectangularShape::getWidth).average().orElse(75) * MAX_WORD_DISTANCE_FACTOR; - UnionFind unionFind = new UnionFind<>(sequences); + UnionFind unionFind = new UnionFind<>(sequences); - for (TextPositionSequence sequence : sequences) { - for (TextPositionSequence sequence2 : sequences) { + for (Word sequence : sequences) { + for (Word sequence2 : sequences) { if (sequence.equals(sequence2)) { // || unionFind.inSameSet(sequence, sequence2)) doing this is actually slower than not doing it continue; @@ -144,10 +144,10 @@ public class TextPositionOperations { } - public List merge(List textBlocks) { + public List merge(List textBlocks) { return textBlocks.stream() - .map(TextPageBlock::getSequences) + .map(TextPageBlock::getWords) .flatMap(Collection::stream) .collect(Collectors.toList()); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutDebugLayer.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutDebugLayer.java index abce8af..7fb781c 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutDebugLayer.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutDebugLayer.java @@ -27,7 +27,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.outline.Outlin import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; import com.knecon.fforesight.service.layoutparser.processor.model.text.ListIdentifier; -import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.model.text.Word; import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms; import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils; import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation; @@ -58,14 +58,14 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig { Map outlineObjectsWithoutPointsPerPage = new HashMap<>(); - public void addTextVisualizations(List textPositionSequences, int pageNumber) { + public void addTextVisualizations(List words, int pageNumber) { if (!active) { return; } VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(pageNumber, this.words); visualizationsOnPage.getColoredRectangles() - .addAll(textPositionSequences.stream() + .addAll(words.stream() .map(BoundingBox::getBBoxPdf) .map(rect -> new ColoredRectangle(rect, WORDS_COLOR, 1)) .toList()); @@ -188,7 +188,7 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig { } - public void addLineVisualizationsFromNestedTextPosition(Collection> lines, int pageNumber) { + public void addLineVisualizationsFromNestedTextPosition(Collection> lines, int pageNumber) { if (!active) { return; @@ -291,7 +291,7 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig { } - public void addTocPages(List numbers, int page) { + public void addTocPages(List numbers, int page) { if (!active) { return; diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/HeadlinesGoldStandardIntegrationTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/HeadlinesGoldStandardIntegrationTest.java index c51c7ae..33b2262 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/HeadlinesGoldStandardIntegrationTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/HeadlinesGoldStandardIntegrationTest.java @@ -70,13 +70,16 @@ public class HeadlinesGoldStandardIntegrationTest { List metrics = new ArrayList<>(); metrics.add(getMetrics("files/syngenta/CustomerFiles/01 - CGA100251 - Acute Oral Toxicity (Up and Down Procedure) - Rat (1).pdf", - "files/headlineTest/01 - CGA100251 - Acute Oral Toxicity (Up and Down Procedure) - Rat (1)_REDACTION_LOG.json")); + "files/headlineTest/01 - CGA100251 - Acute Oral Toxicity (Up and Down Procedure) - Rat (1)_REDACTION_LOG.json")); metrics.add(getMetrics("files/syngenta/CustomerFiles/91 Trinexapac-ethyl_RAR_01_Volume_1_2018-02-23.pdf", - "files/headlineTest/91 Trinexapac-ethyl_RAR_01_Volume_1_2018-02-23_REDACTION_LOG.json")); - metrics.add(getMetrics("files/syngenta/CustomerFiles/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf", "files/headlineTest/S-Metolachlor_RAR_01_Volume_1_2018-09-06_REDACTION_LOG.json")); + "files/headlineTest/91 Trinexapac-ethyl_RAR_01_Volume_1_2018-02-23_REDACTION_LOG.json")); + metrics.add(getMetrics("files/syngenta/CustomerFiles/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf", + "files/headlineTest/S-Metolachlor_RAR_01_Volume_1_2018-09-06_REDACTION_LOG.json")); - double precision = metrics.stream().mapToDouble(Metrics::getPrecision).average().orElse(1.0); - double recall = metrics.stream().mapToDouble(Metrics::getRecall).average().orElse(1.0); + double precision = metrics.stream() + .mapToDouble(Metrics::getPrecision).average().orElse(1.0); + double recall = metrics.stream() + .mapToDouble(Metrics::getRecall).average().orElse(1.0); System.out.println("Precision is: " + precision + " recall is: " + recall); @@ -94,20 +97,23 @@ public class HeadlinesGoldStandardIntegrationTest { Set goldStandardHeadlines = new HashSet<>(); var goldStandardLog = objectMapper.readValue(redactionLogResource.getInputStream(), RedactionLog.class); goldStandardLog.getRedactionLogEntry().removeIf(r -> !r.isRedacted() || r.getChanges().get(r.getChanges().size() - 1).getType().equals(ChangeType.REMOVED)); - goldStandardLog.getRedactionLogEntry().forEach(e -> goldStandardHeadlines.add(new Headline(e.getPositions().get(0).getPage(), e.getValue()))); + goldStandardLog.getRedactionLogEntry() + .forEach(e -> goldStandardHeadlines.add(new Headline(e.getPositions().get(0).getPage(), e.getValue()))); Document documentGraph = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.DOCUMINE, layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE, - pdfFileResource.getFile(), - new ImageServiceResponse(), - new TableServiceResponse(), - new VisualLayoutParsingResponse(), - Map.of("file",filePath))); + pdfFileResource.getFile(), + new ImageServiceResponse(), + new TableServiceResponse(), + new VisualLayoutParsingResponse(), + Map.of("file", filePath))); var foundHeadlines = documentGraph.streamAllSubNodes() .map(SemanticNode::getHeadline) .distinct() - .map(headlineNode -> new Headline(headlineNode.getPages().stream().findFirst().get().getNumber(), headlineNode.getTextBlock().getSearchText().stripTrailing())) + .map(headlineNode -> new Headline(headlineNode.getPages() + .stream() + .findFirst().get().getNumber(), headlineNode.getTextBlock().getSearchText().stripTrailing())) .toList(); Set correct = new HashSet<>(); @@ -121,7 +127,9 @@ public class HeadlinesGoldStandardIntegrationTest { } } - missing = goldStandardHeadlines.stream().filter(h -> !correct.contains(h)).collect(Collectors.toSet()); + missing = goldStandardHeadlines.stream() + .filter(h -> !correct.contains(h)) + .collect(Collectors.toSet()); float precision = (float) correct.size() / (float) foundHeadlines.size(); float recall = (float) correct.size() / ((float) correct.size() + (float) missing.size()); diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentReadingOrderTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentReadingOrderTest.java index 9e04dae..eb9b5f3 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentReadingOrderTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentReadingOrderTest.java @@ -26,7 +26,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; -import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.model.text.Word; import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory; import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionOperations; import com.knecon.fforesight.service.layoutparser.server.PDFNetInitializer; @@ -387,7 +387,7 @@ public class DocumentReadingOrderTest extends BuildDocumentTest { for (AbstractPageBlock abstractBlock : classificationDocumentPage.getTextBlocks()) { if (abstractBlock instanceof TextPageBlock textBlock) { - for (TextPositionSequence sequence : TextPositionOperations.mergeAndSort(List.of(textBlock))) { + for (Word sequence : TextPositionOperations.mergeAndSort(List.of(textBlock))) { float stringWidth; try { diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java index b01dc27..a43edcb 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java @@ -32,7 +32,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.textbloc import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage; import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock; -import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.model.text.Word; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse; @@ -105,9 +105,9 @@ public class PdfSegmentationServiceTest extends AbstractTest { List textPositionPerPage = PageContentExtractor.getSortedPageContents(fileName); var textPositions = textPositionPerPage.stream() - .flatMap(t -> t.getSortedTextPositionSequences() + .flatMap(t -> t.getSortedWords() .stream() - .map(TextPositionSequence::toString)) + .map(Word::toString)) .collect(Collectors.joining(" ")); assertThat(textPositions.contains(textToSearch)).isFalse(); @@ -117,7 +117,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { .get(0).getTextBlocks().size()).isEqualTo(3); assertThat(classificationDocument.getHeaders() .get(0).getTextBlocks() - .get(0).getSequences().size()).isEqualTo(8); + .get(0).getWords().size()).isEqualTo(8); assertThat(classificationDocument.getHeaders() .get(0).getTextBlocks() .get(0).toString()).contains(textToSearch); diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/GapAcrossLinesDetectionServiceTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/GapAcrossLinesDetectionServiceTest.java index 2bbc7ee..63beced 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/GapAcrossLinesDetectionServiceTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/GapAcrossLinesDetectionServiceTest.java @@ -36,7 +36,7 @@ class GapAcrossLinesDetectionServiceTest { System.out.println("start column detection"); start = System.currentTimeMillis(); for (PageInformation pageInformation : pageInformations) { - GapInformation gapInformation = GapDetectionService.findGapsInLines(pageInformation.getPageContents().getSortedTextPositionSequences(), pageInformation.getMainBodyTextFrame()); + GapInformation gapInformation = GapDetectionService.findGapsInLines(pageInformation.getPageContents().getSortedWords(), pageInformation.getMainBodyTextFrame()); columnsPerPage.add(GapsAcrossLinesService.detectXGapsAcrossLines(gapInformation, pageInformation.getMainBodyTextFrame())); } System.out.printf("Finished column detection in %d ms%n", System.currentTimeMillis() - start); diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/InvisibleTableDetectionServiceTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/InvisibleTableDetectionServiceTest.java index 3374b89..f08858f 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/InvisibleTableDetectionServiceTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/InvisibleTableDetectionServiceTest.java @@ -10,7 +10,7 @@ import java.util.stream.Collectors; import org.junit.jupiter.api.Test; import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation; -import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.model.text.Word; import com.knecon.fforesight.service.layoutparser.processor.services.InvisibleTableDetectionService; import com.knecon.fforesight.service.layoutparser.processor.services.PageInformationService; import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor; @@ -34,18 +34,18 @@ class InvisibleTableDetectionServiceTest { .collect(Collectors.toList()); int pageNumber = 1; - Rectangle2D tableBBox = pageContents.get(0).getPageContents().getSortedTextPositionSequences().subList(45, 152) + Rectangle2D tableBBox = pageContents.get(0).getPageContents().getSortedWords().subList(45, 152) .stream() - .map(TextPositionSequence::getBBox) + .map(Word::getBBox) .map(this::mirrorY) .collect(RectangleTransformations.collectBBox()); - List textPositionSequences = pageContents.get(0).getPageContents().getSortedTextPositionSequences() + List words = pageContents.get(0).getPageContents().getSortedWords() .stream() .filter(textPositionSequence -> tableBBox.contains(mirrorY(textPositionSequence.getBBox()))) .toList(); - var table = InvisibleTableDetectionService.detectTable(textPositionSequences, tableBBox); + var table = InvisibleTableDetectionService.detectTable(words, tableBBox); PdfDraw.drawRectanglesPerPage(fileName, List.of(table.stream() diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/PageContentExtractorTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/PageContentExtractorTest.java index dbb838a..14c4c64 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/PageContentExtractorTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/PageContentExtractorTest.java @@ -6,7 +6,7 @@ import java.util.List; import org.junit.jupiter.api.Test; import com.knecon.fforesight.service.layoutparser.processor.model.PageContents; -import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.model.text.Word; import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor; import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw; @@ -25,9 +25,9 @@ class PageContentExtractorTest { PdfDraw.drawRectanglesPerPageNumberedByLine(fileName, textPositionPerPage.stream() - .map(t -> t.getSortedTextPositionSequences() + .map(t -> t.getSortedWords() .stream() - .map(TextPositionSequence::getBBoxPdf) + .map(Word::getBBoxPdf) .map(List::of) .toList()) .toList(), tmpFileName); diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingsClassifierTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingsClassifierTest.java index f3fd281..1576918 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingsClassifierTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingsClassifierTest.java @@ -11,7 +11,7 @@ import org.junit.jupiter.api.Test; import com.knecon.fforesight.service.layoutparser.processor.model.PageContents; import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; -import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.model.text.Word; import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor; import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService; import com.knecon.fforesight.service.layoutparser.processor.services.TextRulingsClassifier; @@ -32,16 +32,16 @@ public class RulingsClassifierTest { for (PageContents pageContent : pageContents) { CleanRulings cleanRulings = rulingCleaningService.deduplicateAndStraightenRulings(Collections.emptyList(), pageContent.getRulings()); RectangularIntersectionFinder.find(cleanRulings.getHorizontals(), cleanRulings.getVerticals()); - TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(pageContent.getSortedTextPositionSequences(), cleanRulings); + TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(pageContent.getSortedWords(), cleanRulings); - assertTrue(pageContent.getSortedTextPositionSequences() + assertTrue(pageContent.getSortedWords() .stream() .filter(word -> word.toString().equals("Underlined")) - .allMatch(TextPositionSequence::isUnderline)); - assertTrue(pageContent.getSortedTextPositionSequences() + .allMatch(Word::isUnderline)); + assertTrue(pageContent.getSortedWords() .stream() .filter(word -> word.toString().equals("Striketrough")) - .allMatch(TextPositionSequence::isStrikethrough)); + .allMatch(Word::isStrikethrough)); assertEquals(4, cleanRulings.buildAll() @@ -70,7 +70,7 @@ public class RulingsClassifierTest { for (PageContents pageContent : pageContents) { CleanRulings cleanRulings = rulingCleaningService.deduplicateAndStraightenRulings(Collections.emptyList(), pageContent.getRulings()); RectangularIntersectionFinder.find(cleanRulings.getHorizontals(), cleanRulings.getVerticals()); - TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(pageContent.getSortedTextPositionSequences(), cleanRulings); + TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(pageContent.getSortedWords(), cleanRulings); assertEquals(30, cleanRulings.getHorizontals().size()); assertEquals(30, cleanRulings.getTableLines().getHorizontals().size()); diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/BuildDocumentTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/BuildDocumentTest.java index 658bcff..d06e894 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/BuildDocumentTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/BuildDocumentTest.java @@ -1,8 +1,6 @@ package com.knecon.fforesight.service.layoutparser.server.utils; -import java.awt.geom.Rectangle2D; import java.io.File; -import java.io.FileOutputStream; import java.nio.file.Path; import java.util.Map; @@ -12,27 +10,11 @@ import org.springframework.core.io.ClassPathResource; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline; -import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument; -import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; -import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; -import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse; import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory; -import com.knecon.fforesight.service.viewerdoc.model.Standard14EmbeddableFont; -import com.pdftron.common.Matrix2D; -import com.pdftron.pdf.ColorPt; -import com.pdftron.pdf.ColorSpace; -import com.pdftron.pdf.Element; -import com.pdftron.pdf.ElementBuilder; -import com.pdftron.pdf.ElementWriter; -import com.pdftron.pdf.Font; -import com.pdftron.pdf.GState; -import com.pdftron.pdf.PDFDoc; -import com.pdftron.pdf.Page; -import com.pdftron.sdf.SDFDoc; import lombok.SneakyThrows; diff --git a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/layers/LayoutDebugLayerConfig.java b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/layers/LayoutDebugLayerConfig.java index 946f2ab..da50e01 100644 --- a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/layers/LayoutDebugLayerConfig.java +++ b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/layers/LayoutDebugLayerConfig.java @@ -46,9 +46,9 @@ public class LayoutDebugLayerConfig extends AbstractLayerGroup { new Color(0, 188, 212), new Color(121, 85, 72)); - protected final Visualizations words = Visualizations.builder().layer(LayerIdentifier.WORDS).visibleByDefault(true).build(); + protected final Visualizations words = Visualizations.builder().layer(LayerIdentifier.WORDS).build(); protected final Visualizations lines = Visualizations.builder().layer(LayerIdentifier.LINES).build(); - protected final Visualizations zones = Visualizations.builder().layer(LayerIdentifier.ZONES).build(); + protected final Visualizations zones = Visualizations.builder().layer(LayerIdentifier.ZONES).visibleByDefault(true).build(); protected final Visualizations mainBody = Visualizations.builder().layer(LayerIdentifier.MAIN_BODY).build(); protected final Visualizations clean_rulings = Visualizations.builder().layer(LayerIdentifier.CLEAN_RULINGS).build(); protected final Visualizations rulings = Visualizations.builder().layer(LayerIdentifier.RULINGS).build(); diff --git a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/layers/OcrDebugLayerConfig.java b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/layers/OcrDebugLayerConfig.java index c3f7fca..5133580 100644 --- a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/layers/OcrDebugLayerConfig.java +++ b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/layers/OcrDebugLayerConfig.java @@ -25,10 +25,11 @@ public class OcrDebugLayerConfig extends AbstractLayerGroup { protected final Visualizations debugText = Visualizations.builder().layer(LayerIdentifier.KNECON_OCR_TEXT_DEBUG).visibleByDefault(true).build(); protected final Visualizations tableLines = Visualizations.builder().layer(LayerIdentifier.KNECON_OCR_LINE_DEBUG).visibleByDefault(true).build(); - protected final Visualizations overlappedText = Visualizations.builder().layer(LayerIdentifier.KNECON_OCR_OVERLAPPED_TEXT).visibleByDefault(true).build(); + protected final Visualizations overlappedText = Visualizations.builder().layer(LayerIdentifier.KNECON_OCR_OVERLAPPED_TEXT).visibleByDefault(false).build(); protected final Visualizations debugBBox = Visualizations.builder().layer(LayerIdentifier.KNECON_OCR_BBOX_DEBUG).visibleByDefault(false).build(); + @Override public List getVisualizations() { @@ -36,10 +37,4 @@ public class OcrDebugLayerConfig extends AbstractLayerGroup { } - @Override - public boolean isVisibleByDefault() { - - return true; - } - }