From 6c7442ac6dca327e86117efa4706ef2940e77f2e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kilian=20Sch=C3=BCttler?= Date: Wed, 9 Oct 2024 08:48:48 +0200 Subject: [PATCH] RED-10127: improve headline detection --- .../processor/LayoutParsingPipeline.java | 59 +-- .../processor/docstrum/model/BoundingBox.java | 38 +- .../docstrum/model/TextBoundingBox.java | 66 ++++ .../processor/docstrum/model/UnionFind.java | 6 + .../model/FloatFrequencyCounter.java | 2 +- .../processor/model/PageBlockType.java | 1 + .../processor/model/graph/TextRange.java | 8 +- .../model/text/FrequencyCounters.java | 21 + .../processor/model/text/RedTextPosition.java | 2 + .../model/text/StringFrequencyCounter.java | 21 +- .../processor/model/text/TextBlockOnPage.java | 7 + .../processor/model/text/TextPageBlock.java | 85 ++-- .../model/text/TextPositionSequence.java | 17 +- .../text/TextPositionSequenceComparator.java | 34 ++ .../ClarifyndClassificationService.java | 22 +- .../classification/ClassificationService.java | 62 +++ .../DocuMineClassificationService.java | 73 ++-- .../HeaderFooterClassificationService.java | 55 +++ .../RedactManagerClassificationService.java | 26 +- .../TableOfContentsClassificationService.java | 370 ++++++++++++++++++ .../parsing/PDFLinesTextStripper.java | 55 ++- .../visualization/LayoutGridService.java | 3 + .../utils/TextNormalizationUtilities.java | 8 + .../utils/TextPositionOperations.java | 25 +- .../visualization/LayoutDebugLayer.java | 54 ++- .../docstrum/model/BoundingBoxTest.java | 70 ++++ .../docstrum/model/ConcreteBoundingBox.java | 12 + .../server/SimplifiedTextServiceTest.java | 4 +- .../graph/DocumentReadingOrderTest.java | 4 +- .../server/graph/TextRangeTest.java | 5 +- .../service/viewerdoc/LayerIdentifier.java | 3 + .../layers/LayoutDebugLayerConfig.java | 7 +- .../viewerdoc/service/OutlineUtility.java | 4 +- 33 files changed, 1042 insertions(+), 187 deletions(-) create mode 100644 
layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/FrequencyCounters.java create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextBlockOnPage.java create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPositionSequenceComparator.java create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/ClassificationService.java create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/HeaderFooterClassificationService.java create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/TableOfContentsClassificationService.java create mode 100644 layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/BoundingBoxTest.java create mode 100644 layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/ConcreteBoundingBox.java diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java index 13c52fb..57a0a4a 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java +++ 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java @@ -2,11 +2,13 @@ package com.knecon.fforesight.service.layoutparser.processor; import static java.lang.String.format; +import java.awt.geom.AffineTransform; import java.awt.geom.Rectangle2D; import java.io.File; import java.io.IOException; import java.nio.file.Files; import java.util.ArrayList; +import java.util.Collection; import java.util.HashMap; import java.util.HashSet; import java.util.List; @@ -24,6 +26,10 @@ import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.No import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; +import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection; +import com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationService; +import com.knecon.fforesight.service.layoutparser.processor.services.classification.TableOfContentsClassificationService; import com.knecon.fforesight.service.layoutparser.processor.services.mapper.MarkdownMapper; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument; @@ -91,10 +97,7 @@ public class LayoutParsingPipeline { CvTableParsingAdapter cvTableParsingAdapter; LayoutParsingStorageService layoutParsingStorageService; SectionsBuilderService sectionsBuilderService; - RedactManagerClassificationService redactManagerClassificationService; - DocuMineClassificationService docuMineClassificationService; SimplifiedSectionTextService simplifiedSectionTextService; 
- BodyTextFrameService bodyTextFrameService; RulingCleaningService rulingCleaningService; TableExtractionService tableExtractionService; DocuMineBlockificationService docuMineBlockificationService; @@ -104,12 +107,12 @@ public class LayoutParsingPipeline { LayoutGridService layoutGridService; ObservationRegistry observationRegistry; VisualLayoutParsingAdapter visualLayoutParsingAdapter; - ClarifyndClassificationService clarifyndClassificationService; GraphicExtractorService graphicExtractorService; OutlineExtractorService outlineExtractorService; OutlineValidationService outlineValidationService; TOCEnrichmentService tocEnrichmentService; LayoutparserSettings settings; + ClassificationService classificationService; public LayoutParsingFinishedEvent parseLayoutAndSaveFilesToStorage(LayoutParsingRequest layoutParsingRequest) throws IOException { @@ -273,6 +276,9 @@ public class LayoutParsingPipeline { stripper.setPdpage(pdPage); stripper.getText(originDocument); List words = stripper.getTextPositionSequences(); + +// rotateDirAdjExactly(words, pdPage); // works really well for many highly rotated documents (e.g. 
VV-331340.pdf), but it decreases the headline performance by 1.3%, so I am leaving it out for now + if (layoutParsingType.equals(LayoutParsingType.DOCUMINE_OLD)) { var lines = TextPositionOperations.groupByLine(new HashSet<>(words)); classificationDocument.getLayoutDebugLayer().addLineVisualizationsFromNestedTextPosition(lines, pageNumber); @@ -366,24 +372,7 @@ public class LayoutParsingPipeline { originDocument.close(); - log.info("Calculating BodyTextFrame for {}", identifier); - bodyTextFrameService.setBodyTextFrames(classificationDocument, layoutParsingType); - for (ClassificationPage page : classificationDocument.getPages()) { - classificationDocument.getLayoutDebugLayer().addCleanRulingVisualization(page.getCleanRulings(), page.getPageNumber()); - } - log.info("Classify TextBlocks for {}", identifier); - switch (layoutParsingType) { - case REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_OLD, CLARIFYND_PARAGRAPH_DEBUG, REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH -> - redactManagerClassificationService.classifyDocument(classificationDocument); - case DOCUMINE_OLD, DOCUMINE -> docuMineClassificationService.classifyDocument(classificationDocument); - case CLARIFYND -> clarifyndClassificationService.classifyDocument(classificationDocument); - } - - if (layoutParsingType.equals(LayoutParsingType.DOCUMINE_OLD)) { - for (ClassificationPage page : classificationDocument.getPages()) { - docuMineBlockificationService.mergeblocks(page, page.getCleanRulings().withoutTextRulings(), 0, 10); - } - } + classificationService.classify(classificationDocument, layoutParsingType, identifier); List headlines = classificationDocument.getPages() .stream() @@ -406,6 +395,32 @@ public class LayoutParsingPipeline { } + private static void rotateDirAdjExactly(List words, PDPage pdPage) { + + for (TextDirection dir : TextDirection.values()) { + + double averageRotation = words.stream() + .map(TextPositionSequence::getTextPositions) + .flatMap(Collection::stream) + 
.filter(pos -> pos.getDir().equals(dir)) + .mapToDouble(RedTextPosition::getExactDir).average().orElse(0); + + if (averageRotation == 0) { + continue; + } + + AffineTransform rotateInstance = AffineTransform.getRotateInstance(averageRotation, pdPage.getMediaBox().getWidth() / 2, pdPage.getMediaBox().getHeight() / 2); + + for (TextPositionSequence word : words) { + if (!dir.equals(word.getDir())) { + continue; + } + word.transform(rotateInstance); + } + } + } + + private void addNumberOfPagesToTrace(int numberOfPages, long size) { if (observationRegistry.getCurrentObservation() != null) { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/BoundingBox.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/BoundingBox.java index be36feb..f282671 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/BoundingBox.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/BoundingBox.java @@ -225,33 +225,31 @@ public abstract class BoundingBox { public double horizontalDistance(BoundingBox other) { - Rectangle2D left; - Rectangle2D right; - if (this.leftOf(other)) { - left = this.getBBox(); - right = other.getBBox(); - } else { - left = other.getBBox(); - right = this.getBBox(); - } + double rect1Right = getMaxX(); + double rect1Left = getMinX(); + double rect2Right = other.getMaxX(); + double rect2Left = other.getMinX(); - return Math.max(0, right.getMinX() - left.getMaxX()); + if (rect1Left > rect2Right || rect2Left > rect1Right) { + return Math.max(rect2Left - rect1Right, rect1Left - rect2Right); + } else { + return 0; + } } public double verticalDistance(BoundingBox other) { - Rectangle2D bottom; - Rectangle2D 
top; - if (this.isAbove(other)) { - top = this.getBBox(); - bottom = other.getBBox(); - } else { - bottom = this.getBBox(); - top = other.getBBox(); - } + double rect1Top = getMaxY(); + double rect1Bottom = getMinY(); + double rect2Top = other.getMaxY(); + double rect2Bottom = other.getMinY(); - return Math.max(0, bottom.getMinY() - top.getMaxY()); + if (rect1Bottom > rect2Top || rect2Bottom > rect1Top) { + return Math.max(rect2Bottom - rect1Top, rect1Bottom - rect2Top); + } else { + return 0; + } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/TextBoundingBox.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/TextBoundingBox.java index 02aa578..f66bdbb 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/TextBoundingBox.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/TextBoundingBox.java @@ -99,4 +99,70 @@ public abstract class TextBoundingBox extends BoundingBox { return this.bBoxDirAdj.getCenterX(); } + + public double horizontalDistanceDirAdj(TextBoundingBox other) { + + double rect1Right = getMaxXDirAdj(); + double rect1Left = getXDirAdj(); + double rect2Right = other.getMaxXDirAdj(); + double rect2Left = other.getXDirAdj(); + + if (rect1Left > rect2Right || rect2Left > rect1Right) { + return Math.max(rect2Left - rect1Right, rect1Left - rect2Right); + } else { + return 0; + } + } + + + public double verticalDistanceDirAdj(TextBoundingBox other) { + + double rect1Top = getMaxYDirAdj(); + double rect1Bottom = getYDirAdj(); + double rect2Top = other.getMaxYDirAdj(); + double rect2Bottom = other.getYDirAdj(); + + if (rect1Bottom > rect2Top || rect2Bottom > rect1Top) { + return 
Math.max(rect2Bottom - rect1Top, rect1Bottom - rect2Top); + } else { + return 0; + } + } + + + public boolean intersectsDirAdj(TextBoundingBox other) { + + return this.intersectsXDirAdj(other) && this.intersectsYDirAdj(other); + } + + + public boolean intersectsDirAdj(TextBoundingBox other, float yThreshold, float xThreshold) { + + return this.intersectsXDirAdj(other, xThreshold) && this.intersectsYDirAdj(other, yThreshold); + } + + + public boolean intersectsXDirAdj(TextBoundingBox other, float threshold) { + + return this.getXDirAdj() - threshold <= other.getMaxXDirAdj() && this.getMaxXDirAdj() + threshold >= other.getXDirAdj(); + } + + + public boolean intersectsXDirAdj(TextBoundingBox other) { + + return this.getXDirAdj() <= other.getMaxXDirAdj() && this.getMaxXDirAdj() >= other.getXDirAdj(); + } + + + public boolean intersectsYDirAdj(TextBoundingBox other) { + + return this.getYDirAdj() <= other.getMaxYDirAdj() && this.getMaxYDirAdj() >= other.getYDirAdj(); + } + + + public boolean intersectsYDirAdj(TextBoundingBox other, float threshold) { + + return this.getYDirAdj() - threshold <= other.getMaxYDirAdj() && this.getMaxYDirAdj() + threshold >= other.getYDirAdj(); + } + } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/UnionFind.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/UnionFind.java index 11932f4..78e8142 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/UnionFind.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/UnionFind.java @@ -28,4 +28,10 @@ public class UnionFind extends org.jgrapht.alg.util.UnionFind { return setRep.values(); } + + public Collection getElements() { + + 
return getParentMap().keySet(); + } + } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/FloatFrequencyCounter.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/FloatFrequencyCounter.java index 9b69b37..8dd491a 100755 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/FloatFrequencyCounter.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/FloatFrequencyCounter.java @@ -44,7 +44,7 @@ public class FloatFrequencyCounter { public Double getMostPopular() { - if (changed) { + if (changed || mostPopularCache == null) { Map.Entry mostPopular = null; for (Map.Entry entry : countPerValue.entrySet()) { if (mostPopular == null || entry.getValue() >= mostPopular.getValue()) { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/PageBlockType.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/PageBlockType.java index f67127a..8a5619f 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/PageBlockType.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/PageBlockType.java @@ -15,6 +15,7 @@ public enum PageBlockType { PARAGRAPH_ITALIC, PARAGRAPH_UNKNOWN, OTHER, + TABLE_OF_CONTENTS_ITEM, TABLE; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/TextRange.java 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/TextRange.java index 2da6c55..eda21dc 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/TextRange.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/TextRange.java @@ -98,10 +98,10 @@ public class TextRange implements Comparable { public List split(List splitIndices) { if (splitIndices.stream() - .anyMatch(idx -> !this.containsExclusive(idx))) { + .anyMatch(idx -> !this.contains(idx))) { throw new IndexOutOfBoundsException(format("%s splitting indices are out of range for %s", splitIndices.stream() - .filter(idx -> !this.containsExclusive(idx)) + .filter(idx -> !this.contains(idx)) .toList(), this)); } @@ -116,7 +116,9 @@ public class TextRange implements Comparable { splitBoundaries.add(new TextRange(previousIndex, splitIndex)); previousIndex = splitIndex; } - splitBoundaries.add(new TextRange(previousIndex, end)); + if (previousIndex != end) { + splitBoundaries.add(new TextRange(previousIndex, end)); + } return splitBoundaries; } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/FrequencyCounters.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/FrequencyCounters.java new file mode 100644 index 0000000..421d33f --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/FrequencyCounters.java @@ -0,0 +1,21 @@ +package com.knecon.fforesight.service.layoutparser.processor.model.text; + +import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter; + +import 
lombok.AccessLevel; +import lombok.Getter; +import lombok.NoArgsConstructor; +import lombok.experimental.FieldDefaults; + +@Getter +@NoArgsConstructor +@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) +public class FrequencyCounters { + + FloatFrequencyCounter lineHeightFrequencyCounter = new FloatFrequencyCounter(); + FloatFrequencyCounter fontSizeFrequencyCounter = new FloatFrequencyCounter(); + FloatFrequencyCounter spaceFrequencyCounter = new FloatFrequencyCounter(); + StringFrequencyCounter fontFrequencyCounter = new StringFrequencyCounter(); + StringFrequencyCounter styleFrequencyCounter = new StringFrequencyCounter(); + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/RedTextPosition.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/RedTextPosition.java index bdc581f..9a34c3e 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/RedTextPosition.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/RedTextPosition.java @@ -7,6 +7,8 @@ import org.apache.pdfbox.text.TextPosition; import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.TextBoundingBox; import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.FastAtan2; +import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms; +import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation; import lombok.AccessLevel; import lombok.AllArgsConstructor; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/StringFrequencyCounter.java 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/StringFrequencyCounter.java index 934b1b3..3ae9270 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/StringFrequencyCounter.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/StringFrequencyCounter.java @@ -9,10 +9,14 @@ public class StringFrequencyCounter { @Getter private final Map countPerValue = new HashMap<>(); + private boolean changed; + private String mostPopularCache; public void add(String value) { + changed = true; + if (!countPerValue.containsKey(value)) { countPerValue.put(value, 1); } else { @@ -23,6 +27,8 @@ public class StringFrequencyCounter { public void addAll(Map otherCounter) { + changed = true; + for (Map.Entry entry : otherCounter.entrySet()) { if (countPerValue.containsKey(entry.getKey())) { countPerValue.put(entry.getKey(), countPerValue.get(entry.getKey()) + entry.getValue()); @@ -35,13 +41,18 @@ public class StringFrequencyCounter { public String getMostPopular() { - Map.Entry mostPopular = null; - for (Map.Entry entry : countPerValue.entrySet()) { - if (mostPopular == null || entry.getValue() > mostPopular.getValue()) { - mostPopular = entry; + if (changed || mostPopularCache == null) { + Map.Entry mostPopular = null; + for (Map.Entry entry : countPerValue.entrySet()) { + if (mostPopular == null || entry.getValue() >= mostPopular.getValue()) { + mostPopular = entry; + } + } + mostPopularCache = mostPopular != null ? mostPopular.getKey() : null; + changed = false; } - return mostPopular != null ?
mostPopular.getKey() : null; + + return mostPopularCache; } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextBlockOnPage.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextBlockOnPage.java new file mode 100644 index 0000000..f79127b --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextBlockOnPage.java @@ -0,0 +1,7 @@ +package com.knecon.fforesight.service.layoutparser.processor.model.text; + +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; + +public record TextBlockOnPage(ClassificationPage page, TextPageBlock textBlock) { + +} \ No newline at end of file diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java index 06514d3..7ddeb1e 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java @@ -7,7 +7,6 @@ import java.util.List; import com.fasterxml.jackson.annotation.JsonIgnore; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; -import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter; import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType; import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; import 
com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities; @@ -27,19 +26,11 @@ public class TextPageBlock extends AbstractPageBlock { @Builder.Default private List sequences = new ArrayList<>(); + @Builder.Default + private FrequencyCounters frequencyCounters = new FrequencyCounters(); private Rectangle2D bBoxDirAdj; - private String mostPopularWordFont; - - private String mostPopularWordStyle; - - private double mostPopularWordFontSize; - - private double mostPopularWordHeight; - - private double mostPopularWordSpaceWidth; - private boolean underlined; private double highestFontSize; @@ -55,8 +46,10 @@ public class TextPageBlock extends AbstractPageBlock { public TextPageBlock(List sequences) { this.sequences = new ArrayList<>(sequences); + this.frequencyCounters = new FrequencyCounters(); + if (!sequences.isEmpty()) { - calculateFrequencyCounters(); + addToFrequencyCounters(sequences); } calculateBBox(); } @@ -118,32 +111,18 @@ public class TextPageBlock extends AbstractPageBlock { } - private void calculateFrequencyCounters() { - - FloatFrequencyCounter lineHeightFrequencyCounter = new FloatFrequencyCounter(); - FloatFrequencyCounter fontSizeFrequencyCounter = new FloatFrequencyCounter(); - FloatFrequencyCounter spaceFrequencyCounter = new FloatFrequencyCounter(); - StringFrequencyCounter fontFrequencyCounter = new StringFrequencyCounter(); - StringFrequencyCounter styleFrequencyCounter = new StringFrequencyCounter(); + private void addToFrequencyCounters(List sequences) { for (TextPositionSequence wordBlock : sequences) { - lineHeightFrequencyCounter.add(wordBlock.getTextHeight()); - fontSizeFrequencyCounter.add(wordBlock.getFontSize()); - spaceFrequencyCounter.add(wordBlock.getSpaceWidth()); - fontFrequencyCounter.add(wordBlock.getFont()); - styleFrequencyCounter.add(wordBlock.getFontStyle()); - + frequencyCounters.getLineHeightFrequencyCounter().add(wordBlock.getTextHeight()); + 
frequencyCounters.getFontSizeFrequencyCounter().add(wordBlock.getFontSize()); + frequencyCounters.getSpaceFrequencyCounter().add(wordBlock.getSpaceWidth()); + frequencyCounters.getFontFrequencyCounter().add(wordBlock.getFont()); + frequencyCounters.getStyleFrequencyCounter().add(wordBlock.getFontStyle()); } - setMostPopularWordFont(fontFrequencyCounter.getMostPopular()); - setMostPopularWordStyle(styleFrequencyCounter.getMostPopular()); - setMostPopularWordFontSize(fontSizeFrequencyCounter.getMostPopular()); - setMostPopularWordHeight(lineHeightFrequencyCounter.getMostPopular()); - setMostPopularWordSpaceWidth(spaceFrequencyCounter.getMostPopular()); - setHighestFontSize(fontSizeFrequencyCounter.getHighest()); - - setUnderlined(sequences.stream() + setUnderlined(this.sequences.stream() .allMatch(TextPositionSequence::isUnderline)); } @@ -152,7 +131,7 @@ public class TextPageBlock extends AbstractPageBlock { TextPageBlock union = this.copy(); union.add(r); - calculateFrequencyCounters(); + /* r was already counted on the copy by union.add(r); counting it here would add it to this block's counters although it is not in this block's sequences */ calculateBBox(); return union; } @@ -162,7 +141,7 @@ public class TextPageBlock extends AbstractPageBlock { TextPageBlock union = this.copy(); union.addAll(r.getSequences()); - calculateFrequencyCounters(); + /* r's sequences were already counted on the copy by union.addAll(...); this block's own counters must stay untouched */ calculateBBox(); return union; } @@ -172,7 +151,7 @@ public class TextPageBlock extends AbstractPageBlock { changed = true; sequences.addAll(textPageBlock.getSequences()); - calculateFrequencyCounters(); + addToFrequencyCounters(textPageBlock.getSequences()); calculateBBox(); } @@ -181,7 +160,7 @@ public class TextPageBlock extends AbstractPageBlock { changed = true; sequences.add(textPositionSequence); - calculateFrequencyCounters(); + addToFrequencyCounters(List.of(textPositionSequence)); calculateBBox(); } @@ -190,7 +169,7 @@ public class TextPageBlock extends AbstractPageBlock { changed = true; sequences.addAll(textPositionSequences); - calculateFrequencyCounters(); +
addToFrequencyCounters(textPositionSequences); calculateBBox(); } @@ -253,6 +232,36 @@ public class TextPageBlock extends AbstractPageBlock { } + public String getMostPopularWordFont() { + + return frequencyCounters.getFontFrequencyCounter().getMostPopular(); + } + + + public String getMostPopularWordStyle() { + + return frequencyCounters.getStyleFrequencyCounter().getMostPopular(); + } + + + public double getMostPopularWordFontSize() { + + return frequencyCounters.getFontSizeFrequencyCounter().getMostPopular(); + } + + + public double getMostPopularWordHeight() { + + return frequencyCounters.getLineHeightFrequencyCounter().getMostPopular(); + } + + + public double getMostPopularWordSpaceWidth() { + + return frequencyCounters.getSpaceFrequencyCounter().getMostPopular(); + } + + @Override public boolean isEmpty() { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPositionSequence.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPositionSequence.java index a8af625..9f60edf 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPositionSequence.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPositionSequence.java @@ -2,10 +2,13 @@ package com.knecon.fforesight.service.layoutparser.processor.model.text; import static com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition.HEIGHT_PADDING; +import java.awt.geom.AffineTransform; +import java.awt.geom.Rectangle2D; import java.util.ArrayList; import java.util.List; import java.util.Locale; import java.util.Objects; +import java.util.regex.Pattern; import java.util.stream.Collectors; import org.apache.pdfbox.text.TextPosition; @@ 
-30,6 +33,7 @@ public class TextPositionSequence extends TextBoundingBox implements CharSequenc public static final String BOLD_ITALIC = "bold, italic"; public static final String BOLD = "bold"; public static final String ITALIC = "italic"; + public static final Pattern FONT_CLEANER = Pattern.compile(",bold|,italic"); private int page; @@ -154,7 +158,8 @@ public class TextPositionSequence extends TextBoundingBox implements CharSequenc if (textPositions.get(0).getFontName() == null) { return "none"; } - return textPositions.get(0).getFontName().toLowerCase(Locale.ROOT).replaceAll(",bold", "").replaceAll(",italic", ""); + + return FONT_CLEANER.matcher(textPositions.get(0).getFontName().toLowerCase(Locale.ROOT)).replaceAll(""); } @@ -238,5 +243,15 @@ public class TextPositionSequence extends TextBoundingBox implements CharSequenc return result; } + + public void transform(AffineTransform rotateInstance) { + + for (RedTextPosition textPosition : getTextPositions()) { + Rectangle2D exactDirAdjCoordinates = rotateInstance.createTransformedShape(textPosition.getBBoxDirAdj()).getBounds2D(); + textPosition.setBBoxDirAdj(exactDirAdjCoordinates); + } + calculateBBoxAndHashcode(); + } + } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPositionSequenceComparator.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPositionSequenceComparator.java new file mode 100644 index 0000000..0565923 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPositionSequenceComparator.java @@ -0,0 +1,34 @@ +package com.knecon.fforesight.service.layoutparser.processor.model.text; + +import java.util.Comparator; +import java.util.HashMap; + +public class TextPositionSequenceComparator implements Comparator { + + 
private HashMap lookup; + + + public TextPositionSequenceComparator(HashMap lookup) { + + this.lookup = lookup; + } + + + @Override + public int compare(TextPositionSequence number1, TextPositionSequence number2) { + + int page1 = lookup.get(number1).page().getPageNumber(); + int page2 = lookup.get(number2).page().getPageNumber(); + + if (page1 != page2) { + return Integer.compare(page1, page2); + } + + if (number1.getY() != number2.getY()) { + return Double.compare(number1.getY(), number2.getY()); + } + + return Integer.compare(Integer.parseInt(number1.toString()), Integer.parseInt(number2.toString())); + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/ClarifyndClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/ClarifyndClassificationService.java index 98308cb..b90fdf1 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/ClarifyndClassificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/ClarifyndClassificationService.java @@ -60,24 +60,18 @@ public class ClarifyndClassificationService { headlineClassificationService.setLastHeadlineFromOutline(textBlock); return; } + if (textBlock.getClassification() != null && (textBlock.getClassification().equals(PageBlockType.HEADER) + || textBlock.getClassification().equals(PageBlockType.FOOTER) + || textBlock.getClassification().equals(PageBlockType.TABLE_OF_CONTENTS_ITEM))) { + return; + } if (document.getFontSizeCounter().getMostPopular() == null) { textBlock.setClassification(PageBlockType.PARAGRAPH); return; } - if (MarkedContentUtils.intersects(textBlock, 
page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) - || PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null - || textBlock.getHighestFontSize() <= document.getFontSizeCounter() - .getMostPopular())) { - textBlock.setClassification(PageBlockType.PARAGRAPH); - - } else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER) - || PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null - || textBlock.getHighestFontSize() <= document.getFontSizeCounter() - .getMostPopular())) { - textBlock.setClassification(PageBlockType.PARAGRAPH); - } else if (page.getPageNumber() == 1 // - && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter().getMostPopular()) > 2.5 - && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks().size() == 1)) { + if (page.getPageNumber() == 1 // + && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter().getMostPopular()) > 2.5 + && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks().size() == 1)) { if (!Pattern.matches("[0-9]+", textBlock.toString())) { textBlock.setClassification(PageBlockType.TITLE); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/ClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/ClassificationService.java new file mode 100644 index 0000000..e9bdff5 --- /dev/null +++ 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/ClassificationService.java @@ -0,0 +1,62 @@ +package com.knecon.fforesight.service.layoutparser.processor.services.classification; + +import java.util.Map; + +import org.springframework.stereotype.Service; + +import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument; +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; +import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.services.BodyTextFrameService; +import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocuMineBlockificationService; + +import lombok.AccessLevel; +import lombok.RequiredArgsConstructor; +import lombok.experimental.FieldDefaults; +import lombok.extern.slf4j.Slf4j; + +@Slf4j +@Service +@RequiredArgsConstructor +@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) +public class ClassificationService { + + DocuMineBlockificationService docuMineBlockificationService; + BodyTextFrameService bodyTextFrameService; + TableOfContentsClassificationService tableOfContentsClassificationService; + RedactManagerClassificationService redactManagerClassificationService; + ClarifyndClassificationService clarifyndClassificationService; + DocuMineClassificationService docuMineClassificationService; + HeaderFooterClassificationService headerFooterClassificationService; + + + public void classify(ClassificationDocument document, LayoutParsingType layoutParsingType, Map identifier) { + + log.info("Calculating BodyTextFrame for {}", identifier); + bodyTextFrameService.setBodyTextFrames(document, layoutParsingType); 
+ for (ClassificationPage page : document.getPages()) { + document.getLayoutDebugLayer().addCleanRulingVisualization(page.getCleanRulings(), page.getPageNumber()); + } + log.info("Classify TextBlocks for {}", identifier); + + headerFooterClassificationService.classifyHeadersAndFooters(document); + + tableOfContentsClassificationService.classifyTableOfContents(document); + + switch (layoutParsingType) { + case REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_OLD, CLARIFYND_PARAGRAPH_DEBUG, REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH -> + redactManagerClassificationService.classifyDocument(document); + case DOCUMINE_OLD, DOCUMINE -> docuMineClassificationService.classifyDocument(document); + case CLARIFYND -> clarifyndClassificationService.classifyDocument(document); + } + + if (layoutParsingType.equals(LayoutParsingType.DOCUMINE_OLD)) { + for (ClassificationPage page : document.getPages()) { + docuMineBlockificationService.mergeblocks(page, page.getCleanRulings().withoutTextRulings(), 0, 10); + } + } + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java index 4d18626..c11b96c 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java @@ -3,6 +3,7 @@ package com.knecon.fforesight.service.layoutparser.processor.services.classifica import java.util.ArrayList; import java.util.List; import java.util.Locale; +import 
java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -29,10 +30,12 @@ public class DocuMineClassificationService { private static final Pattern HEADLINE_WITH_SINGLE_IDENTIFER_PATTERN = Pattern.compile("^([0-9]\\.)\\s[a-z][0-9a-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE); private static final Pattern AT_LEAST_3_CHARS_PATTERN = Pattern.compile("\\p{L}{3,}", Pattern.CASE_INSENSITIVE); private static final Pattern HEADLINE_PATTERN_WITH_SLASHES = Pattern.compile("^(\\d{1,1}\\.){1,3}\\d{1,2}\\.?\\s[a-z]{1,2}\\/[a-z]{1,2}.*"); - private static final Pattern AMOUNT_PATTERN = Pattern.compile("^\\s*\\d+(?:\\.\\d+)?\\s*(?:ml|l|g|kg|mg|cm|mm|km|m|lb|oz|ppm|%|f)\\b", Pattern.CASE_INSENSITIVE); - private static final Pattern TABLE_OR_FIGURE_PATTER = Pattern.compile( + public static final Pattern AMOUNT_PATTERN = Pattern.compile("^\\s*\\d+(?:\\.\\d+)?\\s*(?:ml|l|g|kg|mg|cm|cm2|cm3|mm|mm2|mm3|km|km2|m|m2|m3|lb|oz|ppm|dpm|days|weeks|months|%|f)\\b", + Pattern.CASE_INSENSITIVE); + private static final Pattern TABLE_OR_FIGURE_PATTERN = Pattern.compile( "^\\s*(?:table|continued\\s+table|appendix|figure)\\s+(?:[xvi]+|[a-z0-9]{1,3}(?:\\.[0-9]{1,3})*(?:-[0-9]{1,3})?)\\b", Pattern.CASE_INSENSITIVE); + private static final Pattern ALPHANUMERIC = Pattern.compile("[a-zA-Z0-9]"); public static final int SEPARATION_THRESHOLD = 10; // if the min distance between a textblock and all its surrounding blocks, the regexes can be more lenient. 
public static final int SURROUNDING_BLOCKS_RADIUS = 3; // number of surrounding blocks before and after the current textblock to be tested @@ -78,6 +81,9 @@ public class DocuMineClassificationService { if (i == originalIndex) { continue; } + if (textBlocks.get(i).getText().length() <= 1) { + continue; + } surroundingBlocks.add(textBlocks.get(i)); } return surroundingBlocks; @@ -98,10 +104,9 @@ public class DocuMineClassificationService { Matcher atLeast3Matcher = AT_LEAST_3_CHARS_PATTERN.matcher(textBlock.toString()); Matcher headlineWithSlashesMatcher = HEADLINE_PATTERN_WITH_SLASHES.matcher(textBlock.toString()); Matcher amountMatcher = AMOUNT_PATTERN.matcher(textBlock.toString()); - Matcher tableOrFigureMatcher = TABLE_OR_FIGURE_PATTER.matcher(textBlock.toString()); + Matcher tableOrFigureMatcher = TABLE_OR_FIGURE_PATTERN.matcher(textBlock.toString()); Matcher headlineWithSingleIdentifierMatcher = HEADLINE_WITH_SINGLE_IDENTIFER_PATTERN.matcher(textBlock.toString()); boolean isAtLeast3Characters = atLeast3Matcher.reset().find(); - boolean isTocItem = textBlock.getText().contains(".............."); boolean headlineWithSlashesMatches = headlineWithSlashesMatcher.reset().matches(); boolean isAmount = amountMatcher.reset().find(); int charCount = countChars(textBlock); @@ -112,35 +117,22 @@ public class DocuMineClassificationService { headlineClassificationService.setLastHeadlineFromOutline(textBlock); return; } + if (textBlock.getClassification() != null && (textBlock.getClassification().equals(PageBlockType.HEADER) + || textBlock.getClassification().equals(PageBlockType.FOOTER) + || textBlock.getClassification().equals(PageBlockType.TABLE_OF_CONTENTS_ITEM))) { + return; + } if (document.getFontSizeCounter().getMostPopular() == null) { textBlock.setClassification(PageBlockType.PARAGRAPH); return; } - if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) // - || (PositionUtils.isOverBodyTextFrame(bodyTextFrame, 
textBlock, page.getRotation()) // - && (document.getFontSizeCounter().getMostPopular() == null // - || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular()))) { - textBlock.setClassification(PageBlockType.HEADER); - - } else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER) - || (PositionUtils.isUnderBodyTextFrame(bodyTextFrame, - textBlock, - page.getRotation()) - && (document.getFontSizeCounter().getMostPopular() - == null - || textBlock.getHighestFontSize() - <= document.getFontSizeCounter() - .getMostPopular())) - || HeaderFooterDetection.isLikelyFooter(textBlock, document, page)) { - textBlock.setClassification(PageBlockType.FOOTER); - } else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter().getMostPopular()) > 2.5 - && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks().size() == 1)) { + if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter().getMostPopular()) > 2.5 + && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks().size() == 1)) { if (!Pattern.matches("[0-9]+", textBlock.toString())) { textBlock.setClassification(PageBlockType.TITLE); } } else if (textBlock.getText().length() > 5 - && (textBlock.getMostPopularWordHeight() > document.getTextHeightCounter().getMostPopular() - || textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular()) + && greaterOrEqualFontThanDocumentAverage(textBlock, document) && PositionUtils.getApproxLineCount(textBlock) < 5.9 && ((textBlock.getMostPopularWordStyle().contains("bold") || textBlock.isUnderlined())// && Character.isDigit(textBlock.toString().charAt(0)) // @@ -152,18 +144,19 @@ public class DocuMineClassificationService { || 
textBlock.toString().startsWith("TABLE")) && !textBlock.toString().endsWith(":") && isAtLeast3Characters - && !isTocItem && !isAmount && enoughChars) { setAsHeadline(headlineClassificationService, textBlock, document, headlineFontSizes); } else if (isAllCaps(textBlock) + && ALPHANUMERIC.matcher(Character.toString(textBlock.getText().charAt(0))).matches() + && hasSeparation(textBlock, surroundingBlocks) && textBlock.getText().length() > 5 && isAtLeast3Characters && !isAmount && enoughChars && !textBlock.toString().contains(":") - && !textBlock.toString().startsWith("(") + && !textBlock.toString().endsWith(".") && PositionUtils.getApproxLineCount(textBlock) < 2.9) { setAsHeadline(headlineClassificationService, textBlock, document, headlineFontSizes); @@ -171,16 +164,14 @@ public class DocuMineClassificationService { && PositionUtils.getApproxLineCount(textBlock) < 2.9 && isAtLeast3Characters && !headlineWithSlashesMatches - && !isAmount - && !isTocItem) { + && !isAmount) { setAsHeadline(headlineClassificationService, textBlock, document, headlineFontSizes); - } else if (!isTocItem - && hasSeparation(textBlock, surroundingBlocks) - && greaterOrEqualThanFontPageAverage(textBlock, page) - && PositionUtils.getApproxLineCount(textBlock) < 2.9 - && (tableOrFigureMatcher.reset().find() || headlineWithSingleIdentifierMatcher.reset().find()) - && !isAmount + } else if (hasSeparation(textBlock, surroundingBlocks)// + && greaterOrEqualFontThanPageAverage(textBlock, page)// + && PositionUtils.getApproxLineCount(textBlock) < 2.9// + && (tableOrFigureMatcher.reset().find() || headlineWithSingleIdentifierMatcher.reset().find())// + && !isAmount// && !headlineWithSlashesMatches) { setAsHeadline(headlineClassificationService, textBlock, document, headlineFontSizes); @@ -222,13 +213,20 @@ public class DocuMineClassificationService { } - private static boolean greaterOrEqualThanFontPageAverage(TextPageBlock textBlock, ClassificationPage page) { + private static boolean 
greaterOrEqualFontThanPageAverage(TextPageBlock textBlock, ClassificationPage page) { return textBlock.getMostPopularWordHeight() >= page.getTextHeightCounter().getMostPopular() // || textBlock.getMostPopularWordFontSize() >= page.getFontSizeCounter().getMostPopular(); } + private static boolean greaterOrEqualFontThanDocumentAverage(TextPageBlock textBlock, ClassificationDocument document) { + + return textBlock.getMostPopularWordHeight() > document.getTextHeightCounter().getMostPopular() // + || textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular(); + } + + private static boolean isAllCaps(TextPageBlock textBlock) { return textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT)); @@ -246,8 +244,7 @@ public class DocuMineClassificationService { return surroundingBlocks.stream() .mapToDouble(surroundingBlock -> calculateSeparation(textBlock, surroundingBlock)) - .min() - .orElse(Double.MAX_VALUE); + .min().orElse(Double.MAX_VALUE); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/HeaderFooterClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/HeaderFooterClassificationService.java new file mode 100644 index 0000000..d47a6b1 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/HeaderFooterClassificationService.java @@ -0,0 +1,55 @@ +package com.knecon.fforesight.service.layoutparser.processor.services.classification; + +import org.springframework.stereotype.Service; + +import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument; +import 
com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; +import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils; +import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils; + +import lombok.AccessLevel; +import lombok.RequiredArgsConstructor; +import lombok.experimental.FieldDefaults; + +@Service +@RequiredArgsConstructor +@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) +public class HeaderFooterClassificationService { + + public void classifyHeadersAndFooters(ClassificationDocument document) { + + for (ClassificationPage page : document.getPages()) { + for (AbstractPageBlock pageBlock : page.getTextBlocks()) { + if (pageBlock instanceof TextPageBlock textBlock) { + classifyBlock(document, page, textBlock); + } + } + } + + } + + + private static void classifyBlock(ClassificationDocument document, ClassificationPage page, TextPageBlock textBlock) { + + if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) + || PositionUtils.isOverBodyTextFrame(page.getBodyTextFrame(), textBlock, page.getRotation()) && smallerFontThanDocAverage(document, textBlock)) { + + textBlock.setClassification(PageBlockType.HEADER); + + } else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER) + || PositionUtils.isUnderBodyTextFrame(page.getBodyTextFrame(), textBlock, page.getRotation()) && smallerFontThanDocAverage(document, textBlock)) { + + textBlock.setClassification(PageBlockType.FOOTER); + } + } + + + private static boolean smallerFontThanDocAverage(ClassificationDocument document, TextPageBlock textBlock) { + + return document.getFontSizeCounter().getMostPopular() == null || textBlock.getHighestFontSize() <= 
document.getFontSizeCounter().getMostPopular(); + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/RedactManagerClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/RedactManagerClassificationService.java index 2e0da1e..3e066d5 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/RedactManagerClassificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/RedactManagerClassificationService.java @@ -61,6 +61,15 @@ public class RedactManagerClassificationService { headlineClassificationService.setLastHeadlineFromOutline(textBlock); return; } + if (textBlock.getClassification() != null && (textBlock.getClassification().equals(PageBlockType.HEADER) + || textBlock.getClassification().equals(PageBlockType.FOOTER) + || textBlock.getClassification().equals(PageBlockType.TABLE_OF_CONTENTS_ITEM))) { + return; + } + if (document.getFontSizeCounter().getMostPopular() == null) { + textBlock.setClassification(PageBlockType.PARAGRAPH); + return; + } if (document.getFontSizeCounter().getMostPopular() == null) { textBlock.setClassification(PageBlockType.PARAGRAPH); return; @@ -73,21 +82,8 @@ public class RedactManagerClassificationService { textBlock.setClassification(PageBlockType.PARAGRAPH); return; } - if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) - || PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null - || textBlock.getHighestFontSize() <= document.getFontSizeCounter() - .getMostPopular())) 
{ - - textBlock.setClassification(PageBlockType.HEADER); - - } else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER) - || PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null - || textBlock.getHighestFontSize() <= document.getFontSizeCounter() - .getMostPopular())) { - - textBlock.setClassification(PageBlockType.FOOTER); - } else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter().getMostPopular()) > 2.5 - && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks().size() == 1)) { + if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter().getMostPopular()) > 2.5 + && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks().size() == 1)) { if (!Pattern.matches("[0-9]+", textBlock.toString())) { textBlock.setClassification(PageBlockType.TITLE); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/TableOfContentsClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/TableOfContentsClassificationService.java new file mode 100644 index 0000000..854d087 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/TableOfContentsClassificationService.java @@ -0,0 +1,370 @@ +package com.knecon.fforesight.service.layoutparser.processor.services.classification; + +import static 
com.knecon.fforesight.service.layoutparser.processor.services.classification.DocuMineClassificationService.AMOUNT_PATTERN; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedList; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.Set; +import java.util.regex.Pattern; +import java.util.stream.Collectors; + +import org.springframework.stereotype.Service; + +import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.UnionFind; +import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument; +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; +import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextBlockOnPage; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequenceComparator; +import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities; +import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutDebugLayer; + +import lombok.extern.slf4j.Slf4j; + +@Slf4j +@Service +public class TableOfContentsClassificationService { + + private static final int MAX_PAGE_COUNT = 10; // maximum length of a toc to avoid runaway classification + private static final int SURROUNDING_BLOCKS_RADIUS = 10; // number of blocks to look ahead + private static final int MINIMUM_MATCHES = 2; // minimum cluster size + public static final int INTERSECTION_TOLERANCE = 2; // equality 
threshold for x intersection + public static final int DENSITY_THRESHOLD_COUNT = 10; // describes the minimum density, at least this many entries per page height are required + + private static final Pattern NUMERIC = Pattern.compile("[0-9]+"); + + + @SuppressWarnings("checkstyle:ModifiedControlVariable") + public void classifyTableOfContents(ClassificationDocument document) { + + List textBlocks = buildBlocksPerPage(document); + + for (int i = 0; i < textBlocks.size(); i++) { + TextBlockOnPage textBlock = textBlocks.get(i); + + if (!isTOCHeadline(textBlock)) { + continue; + } + + int offset = identifyTOCItems(i + 1, textBlocks, document); + + if (offset > 1) { + textBlock.textBlock().setClassification(PageBlockType.H1); + i += offset; + } + } + } + + + private int identifyTOCItems(int start, List textBlocks, ClassificationDocument document) { + + ClassificationPage startPage = textBlocks.get(start).page(); + List initialLookAhead = textBlocks.subList(start, Math.min(start + SURROUNDING_BLOCKS_RADIUS, textBlocks.size())); + HashMap lookup = new HashMap<>(); + List numbers = extractNumbers(initialLookAhead, lookup, document.getPages().size()); + TocNumberFinder tocNumberFinder = new TocNumberFinder(numbers, lookup); + + int lastCandidate = start; + for (int i = start; i < Math.min(lastCandidate + SURROUNDING_BLOCKS_RADIUS, textBlocks.size()); i++) { + + TextBlockOnPage textBlockOnPage = textBlocks.get(i); + if (textBlockOnPage.page().getPageNumber() - MAX_PAGE_COUNT > startPage.getPageNumber()) { + break; + } + + if (textBlockOnPage.textBlock().getClassification() != null // + && textBlockOnPage.textBlock().getClassification().isHeadline() // + && !(textBlockOnPage.textBlock().getText().startsWith("TABLES") // + || textBlockOnPage.textBlock().getText().startsWith("APPENDICES") // + || textBlockOnPage.textBlock().getText().startsWith("FIGURES"))) { + log.debug("hit an outline headline, stop immediately."); + lastCandidate = i - 1; + break; + } + + List 
numbersFromBlock = extractNumbers(textBlockOnPage, lookup, document.getPages().size()); + + List currentRightmostCluster = tocNumberFinder.getCurrentRightmostCluster(); + + if (currentRightmostCluster.size() < MINIMUM_MATCHES) { + log.debug("No numbers indicating a table of contents here."); + return start; + } + + if (anyIntersection(currentRightmostCluster, numbersFromBlock, lookup)) { + lastCandidate = i; + numbersFromBlock.forEach(tocNumberFinder::add); + } + } + + addVisualization(document.getLayoutDebugLayer(), tocNumberFinder, lookup); + + Set blocksWithNumberInCluster = tocNumberFinder.getCurrentRightmostCluster() + .stream() + .map(lookup::get) + .collect(Collectors.toSet()); + + int lastConfirmed = start; + for (int i = start; i < lastCandidate + 1; i++) { + TextBlockOnPage textBlockOnPage = textBlocks.get(i); + if (blocksWithNumberInCluster.contains(textBlockOnPage)) { + lastConfirmed = i; + } + } + + textBlocks.subList(start, lastConfirmed + 1) + .stream() + .filter(block -> (block.textBlock().getClassification() == null || !block.textBlock().getClassification().isHeadline())) + .forEach(textBlockOnPage -> textBlockOnPage.textBlock().setClassification(PageBlockType.TABLE_OF_CONTENTS_ITEM)); + + return lastCandidate; + } + + + private static void addVisualization(LayoutDebugLayer layoutDebugLayer, TocNumberFinder tocNumberFinder, Map lookup) { + + tocNumberFinder.getCurrentRightmostCluster() + .stream() + .collect(Collectors.groupingBy(key -> lookup.get(key).page().getPageNumber())) + .forEach((pageNumber, number) -> layoutDebugLayer.addTocPages(number, pageNumber)); + } + + + private static boolean anyIntersection(Collection numbers1, + Collection numbers2, + Map lookup) { + + return numbers1.stream() + .anyMatch(numberFromCluster -> numbers2.stream() + .anyMatch(numberFromBlock -> matches(numberFromBlock, numberFromCluster, lookup))); + } + + + private static List extractNumbers(List textBlocks, Map lookup, int numberOfPages) { + + List blocks = new 
LinkedList<>(); + for (TextBlockOnPage textBlock : textBlocks) { + blocks.addAll(extractNumbers(textBlock, lookup, numberOfPages)); + } + return blocks; + } + + + private static List extractNumbers(TextBlockOnPage textBlock, Map lookup, int numberOfPages) { + + List blocks = new LinkedList<>(); + TextPageBlock block = textBlock.textBlock(); + List sequences = block.getSequences(); + for (int i = 0; i < sequences.size(); i++) { + + TextPositionSequence word = sequences.get(i); + + if (!NUMERIC.matcher(word).matches()) { + continue; + } + + if (AMOUNT_PATTERN.matcher(getSurroundingString(i, sequences)).matches()) { + continue; + } + + try { + int pageNumber = Integer.parseInt(word.toString()); + if (0 >= pageNumber || pageNumber > numberOfPages) { + continue; + } + lookup.put(word, textBlock); + blocks.add(word); + } catch (NumberFormatException e) { + log.debug("That wasn't a number! Should not happen, due to numeric check beforehand."); + } + } + return blocks; + } + + + private static CharSequence getSurroundingString(int i, List sequences) { + + int end = Math.min(i + 5, sequences.size()); + return sequences.subList(i, end) + .stream() + .map(TextPositionSequence::toString) + .collect(Collectors.joining(" ")); + } + + + private static boolean matches(TextPositionSequence number1, TextPositionSequence number2, Map lookup) { + + if (number1.getDir() != number2.getDir()) { + return false; + } + + return number1.intersectsXDirAdj(number2, INTERSECTION_TOLERANCE); + } + + + private boolean isTOCHeadline(TextBlockOnPage textBlock) { + + if (textBlock.textBlock().getText().length() > 50) { + return false; + } + String text = TextNormalizationUtilities.removeAllWhitespaces(textBlock.textBlock().getText().toLowerCase(Locale.ENGLISH)); + return (text.contains("content") && text.length() < "content".length() + 6) // + || (text.contains("tableofcontent") && text.length() < "tableofcontent".length() + DENSITY_THRESHOLD_COUNT)// + || text.equals("tables")// + || 
text.equals("appendices")// + || text.equals("figures"); + } + + + private List buildBlocksPerPage(ClassificationDocument document) { + + List blocks = new ArrayList<>(); + for (ClassificationPage page : document.getPages()) { + for (AbstractPageBlock abstractPageBlock : page.getTextBlocks()) { + if (abstractPageBlock instanceof TextPageBlock textBlock) { + if (textBlock.getClassification() != null && (textBlock.getClassification().equals(PageBlockType.HEADER) // + || textBlock.getClassification().equals(PageBlockType.FOOTER))) { + continue; + } + blocks.add(new TextBlockOnPage(page, textBlock)); + } + } + } + return blocks; + } + + + private static class TocNumberFinder { + + final UnionFind numberClusters; + final HashMap lookup; + + + TocNumberFinder(List blocks, HashMap lookup) { + + this.numberClusters = new UnionFind<>(new HashSet<>(blocks)); + for (int i = 0; i < blocks.size(); i++) { + for (int j = i + 1; j < blocks.size(); j++) { + if (matches(blocks.get(i), blocks.get(j), lookup)) { + numberClusters.union(blocks.get(i), blocks.get(j)); + } + } + } + this.lookup = lookup; + } + + + public void add(TextPositionSequence number) { + + if (numberClusters.getElements().contains(number)) { + return; + } + + numberClusters.addElement(number); + for (TextPositionSequence element : numberClusters.getElements()) { + if (matches(number, element, lookup)) { + numberClusters.union(element, number); + } + } + } + + + public List getCurrentRightmostCluster() { + + return numberClusters.getGroups() + .stream() + .filter(cluster -> cluster.size() > MINIMUM_MATCHES) + .map(cluster -> cluster.stream() + .sorted(new TextPositionSequenceComparator(lookup)) + .toList()) + .map(this::removeOutliers) +// .map(this::filterByMinimumDensity) + .filter(cluster -> cluster.size() > MINIMUM_MATCHES) + .max(Comparator.comparingDouble(cluster -> cluster.get(0).getBBox().getMaxX())).orElse(Collections.emptyList()); + } + +// does not seem to be doing much, ideally instead of using the 
height of the blocks i would like to use the height, beginning from the MainBody top, +// but as the MainBody is often wrong, this results in some numbers being discarded even though they are correct. +// private List filterByMinimumDensity(List numbers) { +// +// Map> clustersPerPage = numbers.stream() +// .collect(Collectors.groupingBy(number -> lookup.get(number).page())); +// +// List result = new ArrayList<>(numbers.size()); +// clustersPerPage.keySet() +// .stream() +// .sorted(Comparator.comparingInt(ClassificationPage::getPageNumber)) +// .forEach(page -> { +// var numbersOnPage = clustersPerPage.get(page); +// +// double height = numbersOnPage.stream() +// .map(BoundingBox::getBBox) +// .collect(RectangleTransformations.collectBBox()).getHeight(); +// +// double count = numbersOnPage.size(); +// +// if ((count / height) >= (DENSITY_THRESHOLD_COUNT / page.getPageHeight())) { +// result.addAll(numbers); +// } +// }); +// return result; +// } + + + public List removeOutliers(List numbers) { + + List result = new ArrayList<>(); + + result.add(numbers.get(0)); + + for (int i = 1; i < numbers.size() - 1; i++) { + int prev = getNumberAsInt(numbers, i - 1); + int curr = getNumberAsInt(numbers, i); + int next = getNumberAsInt(numbers, i + 1); + + if (!(curr <= prev || curr >= next) || !isBetterWithout(numbers, i)) { + result.add(numbers.get(i)); + } + } + if (numbers.size() > 1 && getNumberAsInt(numbers, numbers.size() - 1) >= getNumberAsInt(numbers, numbers.size() - 2)) { + result.add(numbers.get(numbers.size() - 1)); + } + + return result; + } + + + // Helper method to check if removing the current number results in a better order: the neighbours are ordered (prev <= next) and the detour via curr is strictly longer than going from prev to next directly + public static boolean isBetterWithout(List numbers, int i) { + + if (i == 0 || i == numbers.size() - 1) { + return false; + } + + int prev = getNumberAsInt(numbers, i - 1); + int curr = getNumberAsInt(numbers, i); + int next = getNumberAsInt(numbers, i + 1); + + return (prev <= next) && (Math.abs(prev - next) < Math.abs(prev - curr) +
Math.abs(curr - next)); + } + + } + + + private static int getNumberAsInt(List numbers, int i) { + + return Integer.parseInt(numbers.get(i).toString()); + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFLinesTextStripper.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFLinesTextStripper.java index 18e5a5a..e39b666 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFLinesTextStripper.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFLinesTextStripper.java @@ -201,7 +201,7 @@ public class PDFLinesTextStripper extends PDFTextStripper { try { if (stroke && !getGraphicsState().getStrokingColor().isPattern() && isBlack(getGraphicsState().getStrokingColor()) || // - !stroke && !getGraphicsState().getNonStrokingColor().isPattern() && isBlack(getGraphicsState().getNonStrokingColor())) { + !stroke && !getGraphicsState().getNonStrokingColor().isPattern() && isBlack(getGraphicsState().getNonStrokingColor())) { rulings.addAll(path); } } catch (UnsupportedOperationException e) { @@ -279,9 +279,9 @@ public class PDFLinesTextStripper extends PDFTextStripper { startIndex = i; } - if (i > 0 && (textPositions.get(i).getUnicode().equals(" ") || textPositions.get(i).getUnicode().equals("\u00A0") || textPositions.get(i) - .getUnicode() - .equals("\t")) && i <= textPositions.size() - 2) { + if (i > 0 + && (textPositions.get(i).getUnicode().equals(" ") || textPositions.get(i).getUnicode().equals("\u00A0") || textPositions.get(i).getUnicode().equals("\t")) + && i <= textPositions.size() - 2) { List sublist = textPositions.subList(startIndex, i); if 
(checkIfSequenceContainsOnlyWhitespaces(sublist)) { @@ -296,20 +296,31 @@ public class PDFLinesTextStripper extends PDFTextStripper { } startIndex = i + 1; } + if (isDottedLineFollowedByWord(textPositions, i, startIndex)) { + List sublist = textPositions.subList(startIndex, i); + textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart)); + startIndex = i; + } + if (isWordFollowedByDottedLine(textPositions, i, startIndex)) { + List sublist = textPositions.subList(startIndex, i - 2); + textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart)); + startIndex = i - 2; + } } List sublist = textPositions.subList(startIndex, textPositions.size()); - if (!sublist.isEmpty() && (sublist.get(sublist.size() - 1).getUnicode().equals(" ") || sublist.get(sublist.size() - 1) - .getUnicode() - .equals("\u00A0") || sublist.get(sublist.size() - 1).getUnicode().equals("\t"))) { + if (!sublist.isEmpty() && (sublist.get(sublist.size() - 1).getUnicode().equals(" ") + || sublist.get(sublist.size() - 1).getUnicode().equals("\u00A0") + || sublist.get(sublist.size() - 1).getUnicode().equals("\t"))) { sublist = sublist.subList(0, sublist.size() - 1); } if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0) .getUnicode() .equals("\t")))) { - if (previous != null && sublist.get(0).getYDirAdj() == previous.getYDirAdj() && sublist.get(0) - .getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()) < 0.01) { + if (previous != null + && sublist.get(0).getYDirAdj() == previous.getYDirAdj() + && sublist.get(0).getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()) < 0.01) { for (TextPosition t : sublist) { textPositionSequences.get(textPositionSequences.size() - 1).add(t); } @@ -317,10 +328,31 @@ public class PDFLinesTextStripper extends 
PDFTextStripper { textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, isParagraphStart)); } } + super.writeString(text); } + private boolean isWordFollowedByDottedLine(List textPositions, int i, int startIndex) { + + return i - startIndex >= 4 // + && textPositions.get(i).getUnicode().equals(".") // + && textPositions.get(i - 1).getUnicode().equals(".") // + && textPositions.get(i - 2).getUnicode().equals(".") // + && !textPositions.get(i - 3).getUnicode().equals("."); + } + + + private static boolean isDottedLineFollowedByWord(List textPositions, int i, int startIndex) { + + return i - startIndex >= 4 // + && !textPositions.get(i).getUnicode().equals(".") // + && textPositions.get(i - 1).getUnicode().equals(".") // + && textPositions.get(i - 2).getUnicode().equals(".") // + && textPositions.get(i - 3).getUnicode().equals("."); + } + + public boolean checkIfCurrentPositionIsToTheRightOfPreviousPosition(int i, List textPositions) { return i > 0 && textPositions.get(i).getXDirAdj() < textPositions.get(i - 1).getXDirAdj(); @@ -337,8 +369,9 @@ public class PDFLinesTextStripper extends PDFTextStripper { public boolean checkIfGapSizeBetweenCharactersSmallerThanMaximum(RedTextPosition previous, List sublist, float maximumGapSize) { - return previous != null && sublist.get(0).getYDirAdj() == previous.getYDirAdj() && sublist.get(0) - .getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()) < maximumGapSize; + return previous != null + && sublist.get(0).getYDirAdj() == previous.getYDirAdj() + && sublist.get(0).getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()) < maximumGapSize; } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/LayoutGridService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/LayoutGridService.java index 
750a7e8..acdb8ea 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/LayoutGridService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/LayoutGridService.java @@ -36,6 +36,9 @@ public class LayoutGridService { LayoutGrid layoutGrid = createLayoutGrid(document); Outline outline = OutlineMapper.createOutline(document); layoutGrid.setVisibleByDefault(layerVisibilityDefaultValue); + + document.getLayoutDebugLayer().addSentenceVisualization(document.getTextBlock()); + if (document.getLayoutDebugLayer().isActive()) { viewerDocumentService.addLayerGroups(originFile, destinationFile, List.of(layoutGrid, document.getLayoutDebugLayer()), outline); } else { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextNormalizationUtilities.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextNormalizationUtilities.java index ec93725..65329bd 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextNormalizationUtilities.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextNormalizationUtilities.java @@ -10,6 +10,7 @@ public final class TextNormalizationUtilities { public static final Pattern hyphenLineBreaks = Pattern.compile("[-~‐‒⁻−﹣゠⁓‑\\u00AD][\\r\\n]+"); public static final Pattern linebreaks = Pattern.compile("[\\r\\n]+"); public static final Pattern doubleWhitespaces = Pattern.compile("\\s{2,}"); + public static final Pattern WHITESPACE_REMOVAL = Pattern.compile("\\s+"); public String cleanString(String value) { @@ -36,4 +37,11 @@ public 
final class TextNormalizationUtilities { return linebreaks.matcher(value).replaceAll(" "); } + + + public String removeAllWhitespaces(String value) { + + return WHITESPACE_REMOVAL.matcher(value).replaceAll(""); + } + } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionOperations.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionOperations.java index 7d27ee5..3972243 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionOperations.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionOperations.java @@ -2,6 +2,7 @@ package com.knecon.fforesight.service.layoutparser.processor.utils; import java.awt.geom.Rectangle2D; import java.awt.geom.RectangularShape; +import java.util.ArrayList; import java.util.Collection; import java.util.Comparator; import java.util.HashSet; @@ -54,18 +55,26 @@ public class TextPositionOperations { private List sortUsingLineDetection(Set sequences) { return sortLines(groupByLine(sequences)); - } public List sortLines(Collection> lines) { - return lines.stream() - .map(TextPositionOperations::sortByXDirAdj) - .filter(line -> !line.isEmpty()) - .sorted(Comparator.comparing(line -> line.get(0), COMPARATOR_DIR_ADJ)) - .flatMap(Collection::stream) - .toList(); + List> lineBlocks = new ArrayList<>(); + for (Set line : lines) { + List sortedLine = sortByXDirAdj(line); + if (!sortedLine.isEmpty()) { + lineBlocks.add(sortedLine); + } + } + // need to use old sorting, since COMPARATOR_DIR_ADJ is not transitive + QuickSort.sort(lineBlocks, Comparator.comparing(line -> line.get(0), COMPARATOR_DIR_ADJ)); + + List list = new ArrayList<>(); + for (List textPositionSequences : 
lineBlocks) { + list.addAll(textPositionSequences); + } + return list; } @@ -91,7 +100,7 @@ public class TextPositionOperations { for (TextPositionSequence sequence : sequences) { for (TextPositionSequence sequence2 : sequences) { - if (sequence.equals(sequence2) || unionFind.inSameSet(sequence, sequence2)) { + if (sequence.equals(sequence2)) { // || unionFind.inSameSet(sequence, sequence2)) doing this is actually slower than not doing it continue; } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutDebugLayer.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutDebugLayer.java index ba3223c..9428e2d 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutDebugLayer.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutDebugLayer.java @@ -5,9 +5,11 @@ import java.awt.geom.AffineTransform; import java.awt.geom.Line2D; import java.awt.geom.Point2D; import java.awt.geom.Rectangle2D; +import java.text.BreakIterator; import java.util.Collection; import java.util.HashMap; import java.util.List; +import java.util.Locale; import java.util.Map; import java.util.Set; import java.util.concurrent.atomic.AtomicInteger; @@ -19,6 +21,8 @@ import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Bound import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Line; import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange; +import 
com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock; import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject; import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; @@ -94,6 +98,29 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig { } + public void addSentenceVisualization(TextBlock textBlock) { + + if (!active) { + return; + } + AtomicInteger rotatingColorIdx = new AtomicInteger(0); + String text = textBlock.getSearchText(); + BreakIterator sentenceIterator = BreakIterator.getSentenceInstance(Locale.ENGLISH); + sentenceIterator.setText(text); + int lastIdx = 0; + while (sentenceIterator.next() != BreakIterator.DONE) { + TextRange sentenceRange = new TextRange(lastIdx + textBlock.getTextRange().start(), sentenceIterator.current() + textBlock.getTextRange().start()); + lastIdx = sentenceIterator.current(); + Color color = getRotatingColor(rotatingColorIdx); + textBlock.getPositionsPerPage(sentenceRange) + .forEach((page, bboxes) -> getOrCreateVisualizationsOnPage(page.getNumber(), this.sentences).getColoredRectangles() + .addAll(bboxes.stream() + .map(bbox -> new ColoredRectangle(bbox, color, 1)) + .toList())); + } + } + + private Color decideOnRulingColor(Ruling ruling) { return switch (ruling.getClassification()) { @@ -248,7 +275,7 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig { .map(Line::getCharacters) .flatMap(Collection::stream) .forEach(character -> { - Color color = ROTATING_CHARACTER_COLOR.get(index.getAndIncrement() % ROTATING_CHARACTER_COLOR.size()); + Color color = getRotatingColor(index); Rectangle2D charBBox = character.getTextPosition().getBBoxPdf(); characterVisualizations.getColoredRectangles().add(new ColoredRectangle(charBBox, color, 1)); character.getNeighbors() @@ -263,6 +290,31 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig 
{ } + public void addTocPages(List numbers, int page) { + + if (!active) { + return; + } + + VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(page, this.tocPages); + visualizationsOnPage.getColoredRectangles() + .addAll(numbers.stream() + .map(BoundingBox::getBBoxPdf) + .map(line -> new ColoredRectangle(line, LINES_COLOR, 0.5f)) + .toList()); + visualizationsOnPage.getColoredRectangles() + .add(new ColoredRectangle(numbers.stream() + .map(BoundingBox::getBBoxPdf) + .collect(RectangleTransformations.collectBBox()), LINES_COLOR, 0.5f)); + } + + + private static Color getRotatingColor(AtomicInteger index) { + + return ROTATING_CHARACTER_COLOR.get(index.getAndIncrement() % ROTATING_CHARACTER_COLOR.size()); + } + + public void addOutlineObjects(List outlineObjects, PageInformation pageInformation) { if (!active) { diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/BoundingBoxTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/BoundingBoxTest.java new file mode 100644 index 0000000..67f606c --- /dev/null +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/BoundingBoxTest.java @@ -0,0 +1,70 @@ +package com.knecon.fforesight.service.layoutparser.processor.docstrum.model; + +import static org.junit.jupiter.api.Assertions.*; + +import java.awt.geom.Rectangle2D; + +import org.junit.jupiter.api.Test; + +class BoundingBoxTest { + + @Test + void testHorizontalDistance_NoOverlap() { + + ConcreteBoundingBox box1 = new ConcreteBoundingBox(0, 0, 10, 10); + ConcreteBoundingBox box2 = new ConcreteBoundingBox(20, 0, 10, 10); + + assertEquals(10, box1.horizontalDistance(box2)); + } + + + @Test + void testHorizontalDistance_Overlap() { + + ConcreteBoundingBox box1 = new ConcreteBoundingBox(0, 0, 
10, 10); + ConcreteBoundingBox box2 = new ConcreteBoundingBox(5, 0, 10, 10); + + assertEquals(0, box1.horizontalDistance(box2)); + } + + + @Test + void testVerticalDistance_NoOverlap() { + + ConcreteBoundingBox box1 = new ConcreteBoundingBox(0, 0, 10, 10); + ConcreteBoundingBox box2 = new ConcreteBoundingBox(0, 20, 10, 10); + + assertEquals(10, box1.verticalDistance(box2)); + } + + + @Test + void testVerticalDistance_Overlap() { + + ConcreteBoundingBox box1 = new ConcreteBoundingBox(0, 0, 10, 10); + ConcreteBoundingBox box2 = new ConcreteBoundingBox(0, 5, 10, 10); + + assertEquals(0, box1.verticalDistance(box2)); + } + + + @Test + void testVerticalDistance_PartialOverlap() { + + ConcreteBoundingBox box1 = new ConcreteBoundingBox(0, 0, 10, 10); + ConcreteBoundingBox box2 = new ConcreteBoundingBox(0, 8, 10, 10); + + assertEquals(0, box1.verticalDistance(box2)); + } + + + @Test + void testHorizontalDistance_PartialOverlap() { + + ConcreteBoundingBox box1 = new ConcreteBoundingBox(0, 0, 10, 10); + ConcreteBoundingBox box2 = new ConcreteBoundingBox(8, 0, 10, 10); + + assertEquals(0, box1.horizontalDistance(box2)); + } + +} \ No newline at end of file diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/ConcreteBoundingBox.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/ConcreteBoundingBox.java new file mode 100644 index 0000000..e77d5ea --- /dev/null +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/ConcreteBoundingBox.java @@ -0,0 +1,12 @@ +package com.knecon.fforesight.service.layoutparser.processor.docstrum.model; + +import java.awt.geom.Rectangle2D; + +class ConcreteBoundingBox extends BoundingBox { + + ConcreteBoundingBox(double x, double y, double width, double height) { + + this.bBox = new 
Rectangle2D.Double(x, y, width, height); + } + +} diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/SimplifiedTextServiceTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/SimplifiedTextServiceTest.java index a818b31..01989c5 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/SimplifiedTextServiceTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/SimplifiedTextServiceTest.java @@ -46,8 +46,8 @@ public class SimplifiedTextServiceTest Document document = buildGraph(file); SimplifiedText simplifiedText = simplifiedSectionTextService.toSimplifiedText(document); List sectionTexts = simplifiedText.getSectionTexts(); - assertThat(sectionTexts.stream().filter(section -> section.getText().equals(footerExample)).collect(Collectors.toList()).size()).isGreaterThan(0); - assertThat(sectionTexts.stream().filter(section -> section.getText().equals(headerExample)).collect(Collectors.toList()).size()).isGreaterThan(0); + assertThat(sectionTexts.stream().filter(section -> section.getText().contains(footerExample)).toList().size()).isGreaterThan(0); + assertThat(sectionTexts.stream().filter(section -> section.getText().contains(headerExample)).toList().size()).isGreaterThan(0); diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentReadingOrderTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentReadingOrderTest.java index 1da8457..9e04dae 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentReadingOrderTest.java +++ 
b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentReadingOrderTest.java @@ -51,7 +51,7 @@ import lombok.extern.slf4j.Slf4j; @Slf4j public class DocumentReadingOrderTest extends BuildDocumentTest { - private static final boolean DRAW_DIR_ADJ_COORDS = true; + private static final boolean DRAW_DIR_ADJ_COORDS = false; public static final List LAYOUT_PARSING_TYPES = List.of(LayoutParsingType.DOCUMINE, LayoutParsingType.DOCUMINE_OLD, LayoutParsingType.REDACT_MANAGER, @@ -82,7 +82,7 @@ public class DocumentReadingOrderTest extends BuildDocumentTest { @Disabled public void drawDirAdjForFile() { - String pdfFile = "/home/kschuettler/Dokumente/Ticket Related/RED-9974/026dc94b019bc2348a4c54f0c6c4516f.ORIGIN.pdf"; + String pdfFile = "/home/kschuettler/Dokumente/TestFiles/OCR/VV-331340/VV-331340_OCRED_first15.pdf"; ClassificationDocument classificationDocument = parseLayout(pdfFile, LayoutParsingType.DOCUMINE_OLD); diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/TextRangeTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/TextRangeTest.java index fb296a5..741b908 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/TextRangeTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/TextRangeTest.java @@ -76,9 +76,10 @@ class TextRangeTest { assertEquals(List.of(new TextRange(10, 40), new TextRange(40, 100)), startTextRange.split(List.of(40))); assertEquals(1, startTextRange.split(Collections.emptyList()).size()); assertEquals(1, startTextRange.split(List.of(startTextRange.start())).size()); + assertEquals(1, startTextRange.split(List.of(100)).size()); 
assertThrows(IndexOutOfBoundsException.class, () -> startTextRange.split(Collections.singletonList(0))); - assertThrows(IndexOutOfBoundsException.class, () -> startTextRange.split(Collections.singletonList(100))); - assertThrows(IndexOutOfBoundsException.class, () -> startTextRange.split(List.of(12, 40, 100))); + assertThrows(IndexOutOfBoundsException.class, () -> startTextRange.split(Collections.singletonList(101))); + assertThrows(IndexOutOfBoundsException.class, () -> startTextRange.split(List.of(12, 40, 101))); } } \ No newline at end of file diff --git a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/LayerIdentifier.java b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/LayerIdentifier.java index 2cca26c..33d52c2 100644 --- a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/LayerIdentifier.java +++ b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/LayerIdentifier.java @@ -55,7 +55,10 @@ public record LayerIdentifier(String name, String markedContentName) { public static final LayerIdentifier NEIGHBOURS = new LayerIdentifier("Neighbours", "NEIGHBOURS"); public static final LayerIdentifier CHARACTERS = new LayerIdentifier("Characters", "CHARACTERS"); public static final LayerIdentifier OUTLINE_OBJECTS = new LayerIdentifier("Outline Positions", "OUTLINE_OBJECTS"); + public static final LayerIdentifier SENTENCES = new LayerIdentifier("Sentences", "SENTENCES"); + public static final LayerIdentifier TOC_PAGES = new LayerIdentifier("TOC pages", "TOC_PAGES"); + // Visual layout parser public static final LayerIdentifier KNECON_VISUAL_PARSING = new LayerIdentifier("Visual Layout Parser", "VISUAL_PARSING"); //ocr diff --git a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/layers/LayoutDebugLayerConfig.java 
b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/layers/LayoutDebugLayerConfig.java index bb01cf0..1fb35be 100644 --- a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/layers/LayoutDebugLayerConfig.java +++ b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/layers/LayoutDebugLayerConfig.java @@ -55,6 +55,8 @@ public class LayoutDebugLayerConfig extends AbstractLayerGroup { protected final Visualizations neighbours = Visualizations.builder().layer(LayerIdentifier.NEIGHBOURS).build(); protected final Visualizations characters = Visualizations.builder().layer(LayerIdentifier.CHARACTERS).build(); protected final Visualizations outlineObjects = Visualizations.builder().layer(LayerIdentifier.OUTLINE_OBJECTS).build(); + protected final Visualizations sentences = Visualizations.builder().layer(LayerIdentifier.SENTENCES).build(); + protected final Visualizations tocPages = Visualizations.builder().layer(LayerIdentifier.TOC_PAGES).build(); public List getVisualizations() { @@ -63,14 +65,15 @@ public class LayoutDebugLayerConfig extends AbstractLayerGroup { neighbours,// words, // lines, // + sentences, // zones, // rulings, // clean_rulings, // cells, // mainBody, // markedContent, // - outlineObjects // - ); + outlineObjects, // + tocPages); } } diff --git a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/OutlineUtility.java b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/OutlineUtility.java index 141e0c4..499e52e 100644 --- a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/OutlineUtility.java +++ b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/OutlineUtility.java @@ -68,10 +68,10 @@ public class 
OutlineUtility { public static void deleteExistingOutline(PDFDoc doc) { Bookmark firstBookmark = doc.getFirstBookmark(); -// while (firstBookmark != null && firstBookmark.isValid()) { + while (firstBookmark != null && firstBookmark.isValid()) { firstBookmark.delete(); firstBookmark = doc.getFirstBookmark(); -// } + } }