Merge branch 'feature/RED-10127' into 'main'
RED-10127: improve headline detection See merge request fforesight/layout-parser!235
This commit is contained in:
commit
4b0c041d84
@ -2,11 +2,13 @@ package com.knecon.fforesight.service.layoutparser.processor;
|
||||
|
||||
import static java.lang.String.format;
|
||||
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
@ -24,6 +26,10 @@ import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.No
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.classification.TableOfContentsClassificationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.MarkdownMapper;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||
@ -91,10 +97,7 @@ public class LayoutParsingPipeline {
|
||||
CvTableParsingAdapter cvTableParsingAdapter;
|
||||
LayoutParsingStorageService layoutParsingStorageService;
|
||||
SectionsBuilderService sectionsBuilderService;
|
||||
RedactManagerClassificationService redactManagerClassificationService;
|
||||
DocuMineClassificationService docuMineClassificationService;
|
||||
SimplifiedSectionTextService simplifiedSectionTextService;
|
||||
BodyTextFrameService bodyTextFrameService;
|
||||
RulingCleaningService rulingCleaningService;
|
||||
TableExtractionService tableExtractionService;
|
||||
DocuMineBlockificationService docuMineBlockificationService;
|
||||
@ -104,12 +107,12 @@ public class LayoutParsingPipeline {
|
||||
LayoutGridService layoutGridService;
|
||||
ObservationRegistry observationRegistry;
|
||||
VisualLayoutParsingAdapter visualLayoutParsingAdapter;
|
||||
ClarifyndClassificationService clarifyndClassificationService;
|
||||
GraphicExtractorService graphicExtractorService;
|
||||
OutlineExtractorService outlineExtractorService;
|
||||
OutlineValidationService outlineValidationService;
|
||||
TOCEnrichmentService tocEnrichmentService;
|
||||
LayoutparserSettings settings;
|
||||
ClassificationService classificationService;
|
||||
|
||||
|
||||
public LayoutParsingFinishedEvent parseLayoutAndSaveFilesToStorage(LayoutParsingRequest layoutParsingRequest) throws IOException {
|
||||
@ -273,6 +276,9 @@ public class LayoutParsingPipeline {
|
||||
stripper.setPdpage(pdPage);
|
||||
stripper.getText(originDocument);
|
||||
List<TextPositionSequence> words = stripper.getTextPositionSequences();
|
||||
|
||||
// rotateDirAdjExactly(words, pdPage); // works really well for many highly rotated documents (e.g. VV-331340.pdf), but it decreases the headline performance by 1.3%, so I am leaving it out for now
|
||||
|
||||
if (layoutParsingType.equals(LayoutParsingType.DOCUMINE_OLD)) {
|
||||
var lines = TextPositionOperations.groupByLine(new HashSet<>(words));
|
||||
classificationDocument.getLayoutDebugLayer().addLineVisualizationsFromNestedTextPosition(lines, pageNumber);
|
||||
@ -366,24 +372,7 @@ public class LayoutParsingPipeline {
|
||||
|
||||
originDocument.close();
|
||||
|
||||
log.info("Calculating BodyTextFrame for {}", identifier);
|
||||
bodyTextFrameService.setBodyTextFrames(classificationDocument, layoutParsingType);
|
||||
for (ClassificationPage page : classificationDocument.getPages()) {
|
||||
classificationDocument.getLayoutDebugLayer().addCleanRulingVisualization(page.getCleanRulings(), page.getPageNumber());
|
||||
}
|
||||
log.info("Classify TextBlocks for {}", identifier);
|
||||
switch (layoutParsingType) {
|
||||
case REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_OLD, CLARIFYND_PARAGRAPH_DEBUG, REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH ->
|
||||
redactManagerClassificationService.classifyDocument(classificationDocument);
|
||||
case DOCUMINE_OLD, DOCUMINE -> docuMineClassificationService.classifyDocument(classificationDocument);
|
||||
case CLARIFYND -> clarifyndClassificationService.classifyDocument(classificationDocument);
|
||||
}
|
||||
|
||||
if (layoutParsingType.equals(LayoutParsingType.DOCUMINE_OLD)) {
|
||||
for (ClassificationPage page : classificationDocument.getPages()) {
|
||||
docuMineBlockificationService.mergeblocks(page, page.getCleanRulings().withoutTextRulings(), 0, 10);
|
||||
}
|
||||
}
|
||||
classificationService.classify(classificationDocument, layoutParsingType, identifier);
|
||||
|
||||
List<TextPageBlock> headlines = classificationDocument.getPages()
|
||||
.stream()
|
||||
@ -406,6 +395,32 @@ public class LayoutParsingPipeline {
|
||||
}
|
||||
|
||||
|
||||
private static void rotateDirAdjExactly(List<TextPositionSequence> words, PDPage pdPage) {
|
||||
|
||||
for (TextDirection dir : TextDirection.values()) {
|
||||
|
||||
double averageRotation = words.stream()
|
||||
.map(TextPositionSequence::getTextPositions)
|
||||
.flatMap(Collection::stream)
|
||||
.filter(pos -> pos.getDir().equals(dir))
|
||||
.mapToDouble(RedTextPosition::getExactDir).average().orElse(0);
|
||||
|
||||
if (averageRotation == 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
AffineTransform rotateInstance = AffineTransform.getRotateInstance(averageRotation, pdPage.getMediaBox().getWidth() / 2, pdPage.getMediaBox().getHeight() / 2);
|
||||
|
||||
for (TextPositionSequence word : words) {
|
||||
if (!dir.equals(word.getDir())) {
|
||||
continue;
|
||||
}
|
||||
word.transform(rotateInstance);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void addNumberOfPagesToTrace(int numberOfPages, long size) {
|
||||
|
||||
if (observationRegistry.getCurrentObservation() != null) {
|
||||
|
||||
@ -225,33 +225,31 @@ public abstract class BoundingBox {
|
||||
|
||||
public double horizontalDistance(BoundingBox other) {
|
||||
|
||||
Rectangle2D left;
|
||||
Rectangle2D right;
|
||||
if (this.leftOf(other)) {
|
||||
left = this.getBBox();
|
||||
right = other.getBBox();
|
||||
} else {
|
||||
left = other.getBBox();
|
||||
right = this.getBBox();
|
||||
}
|
||||
double rect1Right = getMaxX();
|
||||
double rect1Left = getMinX();
|
||||
double rect2Right = other.getMaxX();
|
||||
double rect2Left = other.getMinX();
|
||||
|
||||
return Math.max(0, right.getMinX() - left.getMaxX());
|
||||
if (rect1Left > rect2Right || rect2Left > rect1Right) {
|
||||
return Math.max(rect2Left - rect1Right, rect1Left - rect2Right);
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public double verticalDistance(BoundingBox other) {
|
||||
|
||||
Rectangle2D bottom;
|
||||
Rectangle2D top;
|
||||
if (this.isAbove(other)) {
|
||||
top = this.getBBox();
|
||||
bottom = other.getBBox();
|
||||
} else {
|
||||
bottom = this.getBBox();
|
||||
top = other.getBBox();
|
||||
}
|
||||
double rect1Top = getMaxY();
|
||||
double rect1Bottom = getMinY();
|
||||
double rect2Top = other.getMaxY();
|
||||
double rect2Bottom = other.getMinY();
|
||||
|
||||
return Math.max(0, bottom.getMinY() - top.getMaxY());
|
||||
if (rect1Bottom > rect2Top || rect2Bottom > rect1Top) {
|
||||
return Math.max(rect2Bottom - rect1Top, rect1Bottom - rect2Top);
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -99,4 +99,70 @@ public abstract class TextBoundingBox extends BoundingBox {
|
||||
return this.bBoxDirAdj.getCenterX();
|
||||
}
|
||||
|
||||
|
||||
public double horizontalDistanceDirAdj(TextBoundingBox other) {
|
||||
|
||||
double rect1Right = getMaxXDirAdj();
|
||||
double rect1Left = getXDirAdj();
|
||||
double rect2Right = other.getMaxXDirAdj();
|
||||
double rect2Left = other.getXDirAdj();
|
||||
|
||||
if (rect1Left > rect2Right || rect2Left > rect1Right) {
|
||||
return Math.max(rect2Left - rect1Right, rect1Left - rect2Right);
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public double verticalDistanceDirAdj(TextBoundingBox other) {
|
||||
|
||||
double rect1Top = getMaxYDirAdj();
|
||||
double rect1Bottom = getYDirAdj();
|
||||
double rect2Top = other.getMaxYDirAdj();
|
||||
double rect2Bottom = other.getYDirAdj();
|
||||
|
||||
if (rect1Bottom > rect2Top || rect2Bottom > rect1Top) {
|
||||
return Math.max(rect2Bottom - rect1Top, rect1Bottom - rect2Top);
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public boolean intersectsDirAdj(TextBoundingBox other) {
|
||||
|
||||
return this.intersectsXDirAdj(other) && this.intersectsYDirAdj(other);
|
||||
}
|
||||
|
||||
|
||||
public boolean intersectsDirAdj(TextBoundingBox other, float yThreshold, float xThreshold) {
|
||||
|
||||
return this.intersectsXDirAdj(other, xThreshold) && this.intersectsYDirAdj(other, yThreshold);
|
||||
}
|
||||
|
||||
|
||||
public boolean intersectsXDirAdj(TextBoundingBox other, float threshold) {
|
||||
|
||||
return this.getXDirAdj() - threshold <= other.getMaxXDirAdj() && this.getMaxXDirAdj() + threshold >= other.getXDirAdj();
|
||||
}
|
||||
|
||||
|
||||
public boolean intersectsXDirAdj(TextBoundingBox other) {
|
||||
|
||||
return this.getXDirAdj() <= other.getMaxXDirAdj() && this.getMaxXDirAdj() >= other.getXDirAdj();
|
||||
}
|
||||
|
||||
|
||||
public boolean intersectsYDirAdj(TextBoundingBox other) {
|
||||
|
||||
return this.getYDirAdj() <= other.getMaxYDirAdj() && this.getMaxYDirAdj() >= other.getYDirAdj();
|
||||
}
|
||||
|
||||
|
||||
public boolean intersectsYDirAdj(TextBoundingBox other, float threshold) {
|
||||
|
||||
return this.getYDirAdj() - threshold <= other.getMaxYDirAdj() && this.getMaxYDirAdj() + threshold >= other.getYDirAdj();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -28,4 +28,10 @@ public class UnionFind<T> extends org.jgrapht.alg.util.UnionFind<T> {
|
||||
return setRep.values();
|
||||
}
|
||||
|
||||
|
||||
public Collection<T> getElements() {
|
||||
|
||||
return getParentMap().keySet();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -44,7 +44,7 @@ public class FloatFrequencyCounter {
|
||||
|
||||
public Double getMostPopular() {
|
||||
|
||||
if (changed) {
|
||||
if (changed || mostPopularCache == null) {
|
||||
Map.Entry<Double, Integer> mostPopular = null;
|
||||
for (Map.Entry<Double, Integer> entry : countPerValue.entrySet()) {
|
||||
if (mostPopular == null || entry.getValue() >= mostPopular.getValue()) {
|
||||
|
||||
@ -15,6 +15,7 @@ public enum PageBlockType {
|
||||
PARAGRAPH_ITALIC,
|
||||
PARAGRAPH_UNKNOWN,
|
||||
OTHER,
|
||||
TABLE_OF_CONTENTS_ITEM,
|
||||
TABLE;
|
||||
|
||||
|
||||
|
||||
@ -98,10 +98,10 @@ public class TextRange implements Comparable<TextRange> {
|
||||
public List<TextRange> split(List<Integer> splitIndices) {
|
||||
|
||||
if (splitIndices.stream()
|
||||
.anyMatch(idx -> !this.containsExclusive(idx))) {
|
||||
.anyMatch(idx -> !this.contains(idx))) {
|
||||
throw new IndexOutOfBoundsException(format("%s splitting indices are out of range for %s",
|
||||
splitIndices.stream()
|
||||
.filter(idx -> !this.containsExclusive(idx))
|
||||
.filter(idx -> !this.contains(idx))
|
||||
.toList(),
|
||||
this));
|
||||
}
|
||||
@ -116,7 +116,9 @@ public class TextRange implements Comparable<TextRange> {
|
||||
splitBoundaries.add(new TextRange(previousIndex, splitIndex));
|
||||
previousIndex = splitIndex;
|
||||
}
|
||||
splitBoundaries.add(new TextRange(previousIndex, end));
|
||||
if (previousIndex != end) {
|
||||
splitBoundaries.add(new TextRange(previousIndex, end));
|
||||
}
|
||||
return splitBoundaries;
|
||||
}
|
||||
|
||||
|
||||
@ -0,0 +1,21 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.text;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.Getter;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Getter
|
||||
@NoArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class FrequencyCounters {
|
||||
|
||||
FloatFrequencyCounter lineHeightFrequencyCounter = new FloatFrequencyCounter();
|
||||
FloatFrequencyCounter fontSizeFrequencyCounter = new FloatFrequencyCounter();
|
||||
FloatFrequencyCounter spaceFrequencyCounter = new FloatFrequencyCounter();
|
||||
StringFrequencyCounter fontFrequencyCounter = new StringFrequencyCounter();
|
||||
StringFrequencyCounter styleFrequencyCounter = new StringFrequencyCounter();
|
||||
|
||||
}
|
||||
@ -7,6 +7,8 @@ import org.apache.pdfbox.text.TextPosition;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.TextBoundingBox;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.FastAtan2;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
|
||||
@ -9,10 +9,14 @@ public class StringFrequencyCounter {
|
||||
|
||||
@Getter
|
||||
private final Map<String, Integer> countPerValue = new HashMap<>();
|
||||
boolean changed;
|
||||
String mostPopularCache;
|
||||
|
||||
|
||||
public void add(String value) {
|
||||
|
||||
changed = true;
|
||||
|
||||
if (!countPerValue.containsKey(value)) {
|
||||
countPerValue.put(value, 1);
|
||||
} else {
|
||||
@ -23,6 +27,8 @@ public class StringFrequencyCounter {
|
||||
|
||||
public void addAll(Map<String, Integer> otherCounter) {
|
||||
|
||||
changed = true;
|
||||
|
||||
for (Map.Entry<String, Integer> entry : otherCounter.entrySet()) {
|
||||
if (countPerValue.containsKey(entry.getKey())) {
|
||||
countPerValue.put(entry.getKey(), countPerValue.get(entry.getKey()) + entry.getValue());
|
||||
@ -35,13 +41,18 @@ public class StringFrequencyCounter {
|
||||
|
||||
public String getMostPopular() {
|
||||
|
||||
Map.Entry<String, Integer> mostPopular = null;
|
||||
for (Map.Entry<String, Integer> entry : countPerValue.entrySet()) {
|
||||
if (mostPopular == null || entry.getValue() > mostPopular.getValue()) {
|
||||
mostPopular = entry;
|
||||
if (changed || mostPopularCache == null) {
|
||||
Map.Entry<String, Integer> mostPopular = null;
|
||||
for (Map.Entry<String, Integer> entry : countPerValue.entrySet()) {
|
||||
if (mostPopular == null || entry.getValue() >= mostPopular.getValue()) {
|
||||
mostPopular = entry;
|
||||
}
|
||||
}
|
||||
mostPopularCache = mostPopular != null ? mostPopular.getKey() : null;
|
||||
changed = false;
|
||||
}
|
||||
return mostPopular != null ? mostPopular.getKey() : null;
|
||||
|
||||
return mostPopularCache;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -0,0 +1,7 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.text;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
|
||||
public record TextBlockOnPage(ClassificationPage page, TextPageBlock textBlock) {
|
||||
|
||||
}
|
||||
@ -7,7 +7,6 @@ import java.util.List;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities;
|
||||
@ -27,19 +26,11 @@ public class TextPageBlock extends AbstractPageBlock {
|
||||
|
||||
@Builder.Default
|
||||
private List<TextPositionSequence> sequences = new ArrayList<>();
|
||||
@Builder.Default
|
||||
private FrequencyCounters frequencyCounters = new FrequencyCounters();
|
||||
|
||||
private Rectangle2D bBoxDirAdj;
|
||||
|
||||
private String mostPopularWordFont;
|
||||
|
||||
private String mostPopularWordStyle;
|
||||
|
||||
private double mostPopularWordFontSize;
|
||||
|
||||
private double mostPopularWordHeight;
|
||||
|
||||
private double mostPopularWordSpaceWidth;
|
||||
|
||||
private boolean underlined;
|
||||
|
||||
private double highestFontSize;
|
||||
@ -55,8 +46,10 @@ public class TextPageBlock extends AbstractPageBlock {
|
||||
public TextPageBlock(List<TextPositionSequence> sequences) {
|
||||
|
||||
this.sequences = new ArrayList<>(sequences);
|
||||
this.frequencyCounters = new FrequencyCounters();
|
||||
|
||||
if (!sequences.isEmpty()) {
|
||||
calculateFrequencyCounters();
|
||||
addToFrequencyCounters(sequences);
|
||||
}
|
||||
calculateBBox();
|
||||
}
|
||||
@ -118,32 +111,18 @@ public class TextPageBlock extends AbstractPageBlock {
|
||||
}
|
||||
|
||||
|
||||
private void calculateFrequencyCounters() {
|
||||
|
||||
FloatFrequencyCounter lineHeightFrequencyCounter = new FloatFrequencyCounter();
|
||||
FloatFrequencyCounter fontSizeFrequencyCounter = new FloatFrequencyCounter();
|
||||
FloatFrequencyCounter spaceFrequencyCounter = new FloatFrequencyCounter();
|
||||
StringFrequencyCounter fontFrequencyCounter = new StringFrequencyCounter();
|
||||
StringFrequencyCounter styleFrequencyCounter = new StringFrequencyCounter();
|
||||
private void addToFrequencyCounters(List<TextPositionSequence> sequences) {
|
||||
|
||||
for (TextPositionSequence wordBlock : sequences) {
|
||||
|
||||
lineHeightFrequencyCounter.add(wordBlock.getTextHeight());
|
||||
fontSizeFrequencyCounter.add(wordBlock.getFontSize());
|
||||
spaceFrequencyCounter.add(wordBlock.getSpaceWidth());
|
||||
fontFrequencyCounter.add(wordBlock.getFont());
|
||||
styleFrequencyCounter.add(wordBlock.getFontStyle());
|
||||
|
||||
frequencyCounters.getLineHeightFrequencyCounter().add(wordBlock.getTextHeight());
|
||||
frequencyCounters.getFontSizeFrequencyCounter().add(wordBlock.getFontSize());
|
||||
frequencyCounters.getSpaceFrequencyCounter().add(wordBlock.getSpaceWidth());
|
||||
frequencyCounters.getFontFrequencyCounter().add(wordBlock.getFont());
|
||||
frequencyCounters.getStyleFrequencyCounter().add(wordBlock.getFontStyle());
|
||||
}
|
||||
|
||||
setMostPopularWordFont(fontFrequencyCounter.getMostPopular());
|
||||
setMostPopularWordStyle(styleFrequencyCounter.getMostPopular());
|
||||
setMostPopularWordFontSize(fontSizeFrequencyCounter.getMostPopular());
|
||||
setMostPopularWordHeight(lineHeightFrequencyCounter.getMostPopular());
|
||||
setMostPopularWordSpaceWidth(spaceFrequencyCounter.getMostPopular());
|
||||
setHighestFontSize(fontSizeFrequencyCounter.getHighest());
|
||||
|
||||
setUnderlined(sequences.stream()
|
||||
setUnderlined(this.sequences.stream()
|
||||
.allMatch(TextPositionSequence::isUnderline));
|
||||
}
|
||||
|
||||
@ -152,7 +131,7 @@ public class TextPageBlock extends AbstractPageBlock {
|
||||
|
||||
TextPageBlock union = this.copy();
|
||||
union.add(r);
|
||||
calculateFrequencyCounters();
|
||||
addToFrequencyCounters(List.of(r));
|
||||
calculateBBox();
|
||||
return union;
|
||||
}
|
||||
@ -162,7 +141,7 @@ public class TextPageBlock extends AbstractPageBlock {
|
||||
|
||||
TextPageBlock union = this.copy();
|
||||
union.addAll(r.getSequences());
|
||||
calculateFrequencyCounters();
|
||||
addToFrequencyCounters(r.getSequences());
|
||||
calculateBBox();
|
||||
return union;
|
||||
}
|
||||
@ -172,7 +151,7 @@ public class TextPageBlock extends AbstractPageBlock {
|
||||
|
||||
changed = true;
|
||||
sequences.addAll(textPageBlock.getSequences());
|
||||
calculateFrequencyCounters();
|
||||
addToFrequencyCounters(textPageBlock.getSequences());
|
||||
calculateBBox();
|
||||
}
|
||||
|
||||
@ -181,7 +160,7 @@ public class TextPageBlock extends AbstractPageBlock {
|
||||
|
||||
changed = true;
|
||||
sequences.add(textPositionSequence);
|
||||
calculateFrequencyCounters();
|
||||
addToFrequencyCounters(List.of(textPositionSequence));
|
||||
calculateBBox();
|
||||
}
|
||||
|
||||
@ -190,7 +169,7 @@ public class TextPageBlock extends AbstractPageBlock {
|
||||
|
||||
changed = true;
|
||||
sequences.addAll(textPositionSequences);
|
||||
calculateFrequencyCounters();
|
||||
addToFrequencyCounters(textPositionSequences);
|
||||
calculateBBox();
|
||||
}
|
||||
|
||||
@ -253,6 +232,36 @@ public class TextPageBlock extends AbstractPageBlock {
|
||||
}
|
||||
|
||||
|
||||
public String getMostPopularWordFont() {
|
||||
|
||||
return frequencyCounters.getFontFrequencyCounter().getMostPopular();
|
||||
}
|
||||
|
||||
|
||||
public String getMostPopularWordStyle() {
|
||||
|
||||
return frequencyCounters.getStyleFrequencyCounter().getMostPopular();
|
||||
}
|
||||
|
||||
|
||||
public double getMostPopularWordFontSize() {
|
||||
|
||||
return frequencyCounters.getFontSizeFrequencyCounter().getMostPopular();
|
||||
}
|
||||
|
||||
|
||||
public double getMostPopularWordHeight() {
|
||||
|
||||
return frequencyCounters.getLineHeightFrequencyCounter().getMostPopular();
|
||||
}
|
||||
|
||||
|
||||
public double getMostPopularWordSpaceWidth() {
|
||||
|
||||
return frequencyCounters.getSpaceFrequencyCounter().getMostPopular();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public boolean isEmpty() {
|
||||
|
||||
|
||||
@ -2,10 +2,13 @@ package com.knecon.fforesight.service.layoutparser.processor.model.text;
|
||||
|
||||
import static com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition.HEIGHT_PADDING;
|
||||
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Objects;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.pdfbox.text.TextPosition;
|
||||
@ -30,6 +33,7 @@ public class TextPositionSequence extends TextBoundingBox implements CharSequenc
|
||||
public static final String BOLD_ITALIC = "bold, italic";
|
||||
public static final String BOLD = "bold";
|
||||
public static final String ITALIC = "italic";
|
||||
public static final Pattern FONT_CLEANER = Pattern.compile(",bold|,italic");
|
||||
|
||||
private int page;
|
||||
|
||||
@ -154,7 +158,8 @@ public class TextPositionSequence extends TextBoundingBox implements CharSequenc
|
||||
if (textPositions.get(0).getFontName() == null) {
|
||||
return "none";
|
||||
}
|
||||
return textPositions.get(0).getFontName().toLowerCase(Locale.ROOT).replaceAll(",bold", "").replaceAll(",italic", "");
|
||||
|
||||
return FONT_CLEANER.matcher(textPositions.get(0).getFontName().toLowerCase(Locale.ROOT)).replaceAll("");
|
||||
}
|
||||
|
||||
|
||||
@ -238,5 +243,15 @@ public class TextPositionSequence extends TextBoundingBox implements CharSequenc
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
public void transform(AffineTransform rotateInstance) {
|
||||
|
||||
for (RedTextPosition textPosition : getTextPositions()) {
|
||||
Rectangle2D exactDirAdjCoordinates = rotateInstance.createTransformedShape(textPosition.getBBoxDirAdj()).getBounds2D();
|
||||
textPosition.setBBoxDirAdj(exactDirAdjCoordinates);
|
||||
}
|
||||
calculateBBoxAndHashcode();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
@ -0,0 +1,34 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.text;
|
||||
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
|
||||
public class TextPositionSequenceComparator implements Comparator<TextPositionSequence> {
|
||||
|
||||
private HashMap<TextPositionSequence, TextBlockOnPage> lookup;
|
||||
|
||||
|
||||
public TextPositionSequenceComparator(HashMap<TextPositionSequence, TextBlockOnPage> lookup) {
|
||||
|
||||
this.lookup = lookup;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public int compare(TextPositionSequence number1, TextPositionSequence number2) {
|
||||
|
||||
int page1 = lookup.get(number1).page().getPageNumber();
|
||||
int page2 = lookup.get(number2).page().getPageNumber();
|
||||
|
||||
if (page1 != page2) {
|
||||
return Integer.compare(page1, page2);
|
||||
}
|
||||
|
||||
if (number1.getY() != number2.getY()) {
|
||||
return Double.compare(number1.getY(), number2.getY());
|
||||
}
|
||||
|
||||
return Integer.compare(Integer.parseInt(number1.toString()), Integer.parseInt(number2.toString()));
|
||||
}
|
||||
|
||||
}
|
||||
@ -60,24 +60,18 @@ public class ClarifyndClassificationService {
|
||||
headlineClassificationService.setLastHeadlineFromOutline(textBlock);
|
||||
return;
|
||||
}
|
||||
if (textBlock.getClassification() != null && (textBlock.getClassification().equals(PageBlockType.HEADER)
|
||||
|| textBlock.getClassification().equals(PageBlockType.FOOTER)
|
||||
|| textBlock.getClassification().equals(PageBlockType.TABLE_OF_CONTENTS_ITEM))) {
|
||||
return;
|
||||
}
|
||||
if (document.getFontSizeCounter().getMostPopular() == null) {
|
||||
textBlock.setClassification(PageBlockType.PARAGRAPH);
|
||||
return;
|
||||
}
|
||||
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER)
|
||||
|| PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null
|
||||
|| textBlock.getHighestFontSize() <= document.getFontSizeCounter()
|
||||
.getMostPopular())) {
|
||||
textBlock.setClassification(PageBlockType.PARAGRAPH);
|
||||
|
||||
} else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER)
|
||||
|| PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null
|
||||
|| textBlock.getHighestFontSize() <= document.getFontSizeCounter()
|
||||
.getMostPopular())) {
|
||||
textBlock.setClassification(PageBlockType.PARAGRAPH);
|
||||
} else if (page.getPageNumber() == 1 //
|
||||
&& (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter().getMostPopular()) > 2.5
|
||||
&& textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks().size() == 1)) {
|
||||
if (page.getPageNumber() == 1 //
|
||||
&& (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter().getMostPopular()) > 2.5
|
||||
&& textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks().size() == 1)) {
|
||||
if (!Pattern.matches("[0-9]+", textBlock.toString())) {
|
||||
textBlock.setClassification(PageBlockType.TITLE);
|
||||
}
|
||||
|
||||
@ -0,0 +1,62 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.classification;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.BodyTextFrameService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocuMineBlockificationService;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class ClassificationService {
|
||||
|
||||
DocuMineBlockificationService docuMineBlockificationService;
|
||||
BodyTextFrameService bodyTextFrameService;
|
||||
TableOfContentsClassificationService tableOfContentsClassificationService;
|
||||
RedactManagerClassificationService redactManagerClassificationService;
|
||||
ClarifyndClassificationService clarifyndClassificationService;
|
||||
DocuMineClassificationService docuMineClassificationService;
|
||||
HeaderFooterClassificationService headerFooterClassificationService;
|
||||
|
||||
|
||||
public void classify(ClassificationDocument document, LayoutParsingType layoutParsingType, Map<String, String> identifier) {
|
||||
|
||||
log.info("Calculating BodyTextFrame for {}", identifier);
|
||||
bodyTextFrameService.setBodyTextFrames(document, layoutParsingType);
|
||||
for (ClassificationPage page : document.getPages()) {
|
||||
document.getLayoutDebugLayer().addCleanRulingVisualization(page.getCleanRulings(), page.getPageNumber());
|
||||
}
|
||||
log.info("Classify TextBlocks for {}", identifier);
|
||||
|
||||
headerFooterClassificationService.classifyHeadersAndFooters(document);
|
||||
|
||||
tableOfContentsClassificationService.classifyTableOfContents(document);
|
||||
|
||||
switch (layoutParsingType) {
|
||||
case REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_OLD, CLARIFYND_PARAGRAPH_DEBUG, REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH ->
|
||||
redactManagerClassificationService.classifyDocument(document);
|
||||
case DOCUMINE_OLD, DOCUMINE -> docuMineClassificationService.classifyDocument(document);
|
||||
case CLARIFYND -> clarifyndClassificationService.classifyDocument(document);
|
||||
}
|
||||
|
||||
if (layoutParsingType.equals(LayoutParsingType.DOCUMINE_OLD)) {
|
||||
for (ClassificationPage page : document.getPages()) {
|
||||
docuMineBlockificationService.mergeblocks(page, page.getCleanRulings().withoutTextRulings(), 0, 10);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@ -3,6 +3,7 @@ package com.knecon.fforesight.service.layoutparser.processor.services.classifica
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Set;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
@ -29,10 +30,12 @@ public class DocuMineClassificationService {
|
||||
private static final Pattern HEADLINE_WITH_SINGLE_IDENTIFER_PATTERN = Pattern.compile("^([0-9]\\.)\\s[a-z][0-9a-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE);
|
||||
private static final Pattern AT_LEAST_3_CHARS_PATTERN = Pattern.compile("\\p{L}{3,}", Pattern.CASE_INSENSITIVE);
|
||||
private static final Pattern HEADLINE_PATTERN_WITH_SLASHES = Pattern.compile("^(\\d{1,1}\\.){1,3}\\d{1,2}\\.?\\s[a-z]{1,2}\\/[a-z]{1,2}.*");
|
||||
private static final Pattern AMOUNT_PATTERN = Pattern.compile("^\\s*\\d+(?:\\.\\d+)?\\s*(?:ml|l|g|kg|mg|cm|mm|km|m|lb|oz|ppm|%|f)\\b", Pattern.CASE_INSENSITIVE);
|
||||
private static final Pattern TABLE_OR_FIGURE_PATTER = Pattern.compile(
|
||||
public static final Pattern AMOUNT_PATTERN = Pattern.compile("^\\s*\\d+(?:\\.\\d+)?\\s*(?:ml|l|g|kg|mg|cm|cm2|cm3|mm|mm2|mm3|km|km2|m|m2|m3|lb|oz|ppm|dpm|days|weeks|months|%|f)\\b",
|
||||
Pattern.CASE_INSENSITIVE);
|
||||
private static final Pattern TABLE_OR_FIGURE_PATTERN = Pattern.compile(
|
||||
"^\\s*(?:table|continued\\s+table|appendix|figure)\\s+(?:[xvi]+|[a-z0-9]{1,3}(?:\\.[0-9]{1,3})*(?:-[0-9]{1,3})?)\\b",
|
||||
Pattern.CASE_INSENSITIVE);
|
||||
private static final Pattern ALPHANUMERIC = Pattern.compile("[a-zA-Z0-9]");
|
||||
|
||||
public static final int SEPARATION_THRESHOLD = 10; // if the min distance between a textblock and all its surrounding blocks, the regexes can be more lenient.
|
||||
public static final int SURROUNDING_BLOCKS_RADIUS = 3; // number of surrounding blocks before and after the current textblock to be tested
|
||||
@ -78,6 +81,9 @@ public class DocuMineClassificationService {
|
||||
if (i == originalIndex) {
|
||||
continue;
|
||||
}
|
||||
if (textBlocks.get(i).getText().length() <= 1) {
|
||||
continue;
|
||||
}
|
||||
surroundingBlocks.add(textBlocks.get(i));
|
||||
}
|
||||
return surroundingBlocks;
|
||||
@ -98,10 +104,9 @@ public class DocuMineClassificationService {
|
||||
Matcher atLeast3Matcher = AT_LEAST_3_CHARS_PATTERN.matcher(textBlock.toString());
|
||||
Matcher headlineWithSlashesMatcher = HEADLINE_PATTERN_WITH_SLASHES.matcher(textBlock.toString());
|
||||
Matcher amountMatcher = AMOUNT_PATTERN.matcher(textBlock.toString());
|
||||
Matcher tableOrFigureMatcher = TABLE_OR_FIGURE_PATTER.matcher(textBlock.toString());
|
||||
Matcher tableOrFigureMatcher = TABLE_OR_FIGURE_PATTERN.matcher(textBlock.toString());
|
||||
Matcher headlineWithSingleIdentifierMatcher = HEADLINE_WITH_SINGLE_IDENTIFER_PATTERN.matcher(textBlock.toString());
|
||||
boolean isAtLeast3Characters = atLeast3Matcher.reset().find();
|
||||
boolean isTocItem = textBlock.getText().contains("..............");
|
||||
boolean headlineWithSlashesMatches = headlineWithSlashesMatcher.reset().matches();
|
||||
boolean isAmount = amountMatcher.reset().find();
|
||||
int charCount = countChars(textBlock);
|
||||
@ -112,35 +117,22 @@ public class DocuMineClassificationService {
|
||||
headlineClassificationService.setLastHeadlineFromOutline(textBlock);
|
||||
return;
|
||||
}
|
||||
if (textBlock.getClassification() != null && (textBlock.getClassification().equals(PageBlockType.HEADER)
|
||||
|| textBlock.getClassification().equals(PageBlockType.FOOTER)
|
||||
|| textBlock.getClassification().equals(PageBlockType.TABLE_OF_CONTENTS_ITEM))) {
|
||||
return;
|
||||
}
|
||||
if (document.getFontSizeCounter().getMostPopular() == null) {
|
||||
textBlock.setClassification(PageBlockType.PARAGRAPH);
|
||||
return;
|
||||
}
|
||||
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) //
|
||||
|| (PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) //
|
||||
&& (document.getFontSizeCounter().getMostPopular() == null //
|
||||
|| textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular()))) {
|
||||
textBlock.setClassification(PageBlockType.HEADER);
|
||||
|
||||
} else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER)
|
||||
|| (PositionUtils.isUnderBodyTextFrame(bodyTextFrame,
|
||||
textBlock,
|
||||
page.getRotation())
|
||||
&& (document.getFontSizeCounter().getMostPopular()
|
||||
== null
|
||||
|| textBlock.getHighestFontSize()
|
||||
<= document.getFontSizeCounter()
|
||||
.getMostPopular()))
|
||||
|| HeaderFooterDetection.isLikelyFooter(textBlock, document, page)) {
|
||||
textBlock.setClassification(PageBlockType.FOOTER);
|
||||
} else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter().getMostPopular()) > 2.5
|
||||
&& textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks().size() == 1)) {
|
||||
if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter().getMostPopular()) > 2.5
|
||||
&& textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks().size() == 1)) {
|
||||
if (!Pattern.matches("[0-9]+", textBlock.toString())) {
|
||||
textBlock.setClassification(PageBlockType.TITLE);
|
||||
}
|
||||
} else if (textBlock.getText().length() > 5
|
||||
&& (textBlock.getMostPopularWordHeight() > document.getTextHeightCounter().getMostPopular()
|
||||
|| textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular())
|
||||
&& greaterOrEqualFontThanDocumentAverage(textBlock, document)
|
||||
&& PositionUtils.getApproxLineCount(textBlock) < 5.9
|
||||
&& ((textBlock.getMostPopularWordStyle().contains("bold") || textBlock.isUnderlined())//
|
||||
&& Character.isDigit(textBlock.toString().charAt(0)) //
|
||||
@ -152,18 +144,19 @@ public class DocuMineClassificationService {
|
||||
|| textBlock.toString().startsWith("TABLE"))
|
||||
&& !textBlock.toString().endsWith(":")
|
||||
&& isAtLeast3Characters
|
||||
&& !isTocItem
|
||||
&& !isAmount
|
||||
&& enoughChars) {
|
||||
|
||||
setAsHeadline(headlineClassificationService, textBlock, document, headlineFontSizes);
|
||||
} else if (isAllCaps(textBlock)
|
||||
&& ALPHANUMERIC.matcher(Character.toString(textBlock.getText().charAt(0))).matches()
|
||||
&& hasSeparation(textBlock, surroundingBlocks)
|
||||
&& textBlock.getText().length() > 5
|
||||
&& isAtLeast3Characters
|
||||
&& !isAmount
|
||||
&& enoughChars
|
||||
&& !textBlock.toString().contains(":")
|
||||
&& !textBlock.toString().startsWith("(")
|
||||
&& !textBlock.toString().endsWith(".")
|
||||
&& PositionUtils.getApproxLineCount(textBlock) < 2.9) {
|
||||
|
||||
setAsHeadline(headlineClassificationService, textBlock, document, headlineFontSizes);
|
||||
@ -171,16 +164,14 @@ public class DocuMineClassificationService {
|
||||
&& PositionUtils.getApproxLineCount(textBlock) < 2.9
|
||||
&& isAtLeast3Characters
|
||||
&& !headlineWithSlashesMatches
|
||||
&& !isAmount
|
||||
&& !isTocItem) {
|
||||
&& !isAmount) {
|
||||
|
||||
setAsHeadline(headlineClassificationService, textBlock, document, headlineFontSizes);
|
||||
} else if (!isTocItem
|
||||
&& hasSeparation(textBlock, surroundingBlocks)
|
||||
&& greaterOrEqualThanFontPageAverage(textBlock, page)
|
||||
&& PositionUtils.getApproxLineCount(textBlock) < 2.9
|
||||
&& (tableOrFigureMatcher.reset().find() || headlineWithSingleIdentifierMatcher.reset().find())
|
||||
&& !isAmount
|
||||
} else if (hasSeparation(textBlock, surroundingBlocks)//
|
||||
&& greaterOrEqualFontThanPageAverage(textBlock, page)//
|
||||
&& PositionUtils.getApproxLineCount(textBlock) < 2.9//
|
||||
&& (tableOrFigureMatcher.reset().find() || headlineWithSingleIdentifierMatcher.reset().find())//
|
||||
&& !isAmount//
|
||||
&& !headlineWithSlashesMatches) {
|
||||
|
||||
setAsHeadline(headlineClassificationService, textBlock, document, headlineFontSizes);
|
||||
@ -222,13 +213,20 @@ public class DocuMineClassificationService {
|
||||
}
|
||||
|
||||
|
||||
private static boolean greaterOrEqualThanFontPageAverage(TextPageBlock textBlock, ClassificationPage page) {
|
||||
private static boolean greaterOrEqualFontThanPageAverage(TextPageBlock textBlock, ClassificationPage page) {
|
||||
|
||||
return textBlock.getMostPopularWordHeight() >= page.getTextHeightCounter().getMostPopular() //
|
||||
|| textBlock.getMostPopularWordFontSize() >= page.getFontSizeCounter().getMostPopular();
|
||||
}
|
||||
|
||||
|
||||
private static boolean greaterOrEqualFontThanDocumentAverage(TextPageBlock textBlock, ClassificationDocument document) {
|
||||
|
||||
return textBlock.getMostPopularWordHeight() > document.getTextHeightCounter().getMostPopular() //
|
||||
|| textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular();
|
||||
}
|
||||
|
||||
|
||||
private static boolean isAllCaps(TextPageBlock textBlock) {
|
||||
|
||||
return textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT));
|
||||
@ -246,8 +244,7 @@ public class DocuMineClassificationService {
|
||||
|
||||
return surroundingBlocks.stream()
|
||||
.mapToDouble(surroundingBlock -> calculateSeparation(textBlock, surroundingBlock))
|
||||
.min()
|
||||
.orElse(Double.MAX_VALUE);
|
||||
.min().orElse(Double.MAX_VALUE);
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -0,0 +1,55 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.classification;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class HeaderFooterClassificationService {
|
||||
|
||||
public void classifyHeadersAndFooters(ClassificationDocument document) {
|
||||
|
||||
for (ClassificationPage page : document.getPages()) {
|
||||
for (AbstractPageBlock pageBlock : page.getTextBlocks()) {
|
||||
if (pageBlock instanceof TextPageBlock textBlock) {
|
||||
classifyBlock(document, page, textBlock);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
private static void classifyBlock(ClassificationDocument document, ClassificationPage page, TextPageBlock textBlock) {
|
||||
|
||||
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER)
|
||||
|| PositionUtils.isOverBodyTextFrame(page.getBodyTextFrame(), textBlock, page.getRotation()) && smallerFontThanDocAverage(document, textBlock)) {
|
||||
|
||||
textBlock.setClassification(PageBlockType.HEADER);
|
||||
|
||||
} else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER)
|
||||
|| PositionUtils.isUnderBodyTextFrame(page.getBodyTextFrame(), textBlock, page.getRotation()) && smallerFontThanDocAverage(document, textBlock)) {
|
||||
|
||||
textBlock.setClassification(PageBlockType.FOOTER);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private static boolean smallerFontThanDocAverage(ClassificationDocument document, TextPageBlock textBlock) {
|
||||
|
||||
return document.getFontSizeCounter().getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular();
|
||||
}
|
||||
|
||||
}
|
||||
@ -61,6 +61,15 @@ public class RedactManagerClassificationService {
|
||||
headlineClassificationService.setLastHeadlineFromOutline(textBlock);
|
||||
return;
|
||||
}
|
||||
if (textBlock.getClassification() != null && (textBlock.getClassification().equals(PageBlockType.HEADER)
|
||||
|| textBlock.getClassification().equals(PageBlockType.FOOTER)
|
||||
|| textBlock.getClassification().equals(PageBlockType.TABLE_OF_CONTENTS_ITEM))) {
|
||||
return;
|
||||
}
|
||||
if (document.getFontSizeCounter().getMostPopular() == null) {
|
||||
textBlock.setClassification(PageBlockType.PARAGRAPH);
|
||||
return;
|
||||
}
|
||||
if (document.getFontSizeCounter().getMostPopular() == null) {
|
||||
textBlock.setClassification(PageBlockType.PARAGRAPH);
|
||||
return;
|
||||
@ -73,21 +82,8 @@ public class RedactManagerClassificationService {
|
||||
textBlock.setClassification(PageBlockType.PARAGRAPH);
|
||||
return;
|
||||
}
|
||||
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER)
|
||||
|| PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null
|
||||
|| textBlock.getHighestFontSize() <= document.getFontSizeCounter()
|
||||
.getMostPopular())) {
|
||||
|
||||
textBlock.setClassification(PageBlockType.HEADER);
|
||||
|
||||
} else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER)
|
||||
|| PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null
|
||||
|| textBlock.getHighestFontSize() <= document.getFontSizeCounter()
|
||||
.getMostPopular())) {
|
||||
|
||||
textBlock.setClassification(PageBlockType.FOOTER);
|
||||
} else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter().getMostPopular()) > 2.5
|
||||
&& textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks().size() == 1)) {
|
||||
if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter().getMostPopular()) > 2.5
|
||||
&& textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks().size() == 1)) {
|
||||
|
||||
if (!Pattern.matches("[0-9]+", textBlock.toString())) {
|
||||
textBlock.setClassification(PageBlockType.TITLE);
|
||||
|
||||
@ -0,0 +1,370 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.classification;
|
||||
|
||||
import static com.knecon.fforesight.service.layoutparser.processor.services.classification.DocuMineClassificationService.AMOUNT_PATTERN;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.UnionFind;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextBlockOnPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequenceComparator;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutDebugLayer;
|
||||
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
public class TableOfContentsClassificationService {
|
||||
|
||||
private static final int MAX_PAGE_COUNT = 10; // maximum length of a toc to avoid runaway classification
|
||||
private static final int SURROUNDING_BLOCKS_RADIUS = 10; // number of blocks to look ahead
|
||||
private static final int MINIMUM_MATCHES = 2; // minimum cluster size
|
||||
public static final int INTERSECTION_TOLERANCE = 2; // equality threshold for x intersection
|
||||
public static final int DENSITY_THRESHOLD_COUNT = 10; // describes the minimum density, at least this many entries per page height are required
|
||||
|
||||
private static final Pattern NUMERIC = Pattern.compile("[0-9]+");
|
||||
|
||||
|
||||
@SuppressWarnings("checkstyle:ModifiedControlVariable")
|
||||
public void classifyTableOfContents(ClassificationDocument document) {
|
||||
|
||||
List<TextBlockOnPage> textBlocks = buildBlocksPerPage(document);
|
||||
|
||||
for (int i = 0; i < textBlocks.size(); i++) {
|
||||
TextBlockOnPage textBlock = textBlocks.get(i);
|
||||
|
||||
if (!isTOCHeadline(textBlock)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
int offset = identifyTOCItems(i + 1, textBlocks, document);
|
||||
|
||||
if (offset > 1) {
|
||||
textBlock.textBlock().setClassification(PageBlockType.H1);
|
||||
i += offset;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private int identifyTOCItems(int start, List<TextBlockOnPage> textBlocks, ClassificationDocument document) {
|
||||
|
||||
ClassificationPage startPage = textBlocks.get(start).page();
|
||||
List<TextBlockOnPage> initialLookAhead = textBlocks.subList(start, Math.min(start + SURROUNDING_BLOCKS_RADIUS, textBlocks.size()));
|
||||
HashMap<TextPositionSequence, TextBlockOnPage> lookup = new HashMap<>();
|
||||
List<TextPositionSequence> numbers = extractNumbers(initialLookAhead, lookup, document.getPages().size());
|
||||
TocNumberFinder tocNumberFinder = new TocNumberFinder(numbers, lookup);
|
||||
|
||||
int lastCandidate = start;
|
||||
for (int i = start; i < Math.min(lastCandidate + SURROUNDING_BLOCKS_RADIUS, textBlocks.size()); i++) {
|
||||
|
||||
TextBlockOnPage textBlockOnPage = textBlocks.get(i);
|
||||
if (textBlockOnPage.page().getPageNumber() - MAX_PAGE_COUNT > startPage.getPageNumber()) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (textBlockOnPage.textBlock().getClassification() != null //
|
||||
&& textBlockOnPage.textBlock().getClassification().isHeadline() //
|
||||
&& !(textBlockOnPage.textBlock().getText().startsWith("TABLES") //
|
||||
|| textBlockOnPage.textBlock().getText().startsWith("APPENDICES") //
|
||||
|| textBlockOnPage.textBlock().getText().startsWith("FIGURES"))) {
|
||||
log.debug("hit an outline headline, stop immediately.");
|
||||
lastCandidate = i - 1;
|
||||
break;
|
||||
}
|
||||
|
||||
List<TextPositionSequence> numbersFromBlock = extractNumbers(textBlockOnPage, lookup, document.getPages().size());
|
||||
|
||||
List<TextPositionSequence> currentRightmostCluster = tocNumberFinder.getCurrentRightmostCluster();
|
||||
|
||||
if (currentRightmostCluster.size() < MINIMUM_MATCHES) {
|
||||
log.debug("No numbers indicating a table of contents here.");
|
||||
return start;
|
||||
}
|
||||
|
||||
if (anyIntersection(currentRightmostCluster, numbersFromBlock, lookup)) {
|
||||
lastCandidate = i;
|
||||
numbersFromBlock.forEach(tocNumberFinder::add);
|
||||
}
|
||||
}
|
||||
|
||||
addVisualization(document.getLayoutDebugLayer(), tocNumberFinder, lookup);
|
||||
|
||||
Set<TextBlockOnPage> blocksWithNumberInCluster = tocNumberFinder.getCurrentRightmostCluster()
|
||||
.stream()
|
||||
.map(lookup::get)
|
||||
.collect(Collectors.toSet());
|
||||
|
||||
int lastConfirmed = start;
|
||||
for (int i = start; i < lastCandidate + 1; i++) {
|
||||
TextBlockOnPage textBlockOnPage = textBlocks.get(i);
|
||||
if (blocksWithNumberInCluster.contains(textBlockOnPage)) {
|
||||
lastConfirmed = i;
|
||||
}
|
||||
}
|
||||
|
||||
textBlocks.subList(start, lastConfirmed + 1)
|
||||
.stream()
|
||||
.filter(block -> (block.textBlock().getClassification() == null || !block.textBlock().getClassification().isHeadline()))
|
||||
.forEach(textBlockOnPage -> textBlockOnPage.textBlock().setClassification(PageBlockType.TABLE_OF_CONTENTS_ITEM));
|
||||
|
||||
return lastCandidate;
|
||||
}
|
||||
|
||||
|
||||
private static void addVisualization(LayoutDebugLayer layoutDebugLayer, TocNumberFinder tocNumberFinder, Map<TextPositionSequence, TextBlockOnPage> lookup) {
|
||||
|
||||
tocNumberFinder.getCurrentRightmostCluster()
|
||||
.stream()
|
||||
.collect(Collectors.groupingBy(key -> lookup.get(key).page().getPageNumber()))
|
||||
.forEach((pageNumber, number) -> layoutDebugLayer.addTocPages(number, pageNumber));
|
||||
}
|
||||
|
||||
|
||||
private static boolean anyIntersection(Collection<TextPositionSequence> numbers1,
|
||||
Collection<TextPositionSequence> numbers2,
|
||||
Map<TextPositionSequence, TextBlockOnPage> lookup) {
|
||||
|
||||
return numbers1.stream()
|
||||
.anyMatch(numberFromCluster -> numbers2.stream()
|
||||
.anyMatch(numberFromBlock -> matches(numberFromBlock, numberFromCluster, lookup)));
|
||||
}
|
||||
|
||||
|
||||
private static List<TextPositionSequence> extractNumbers(List<TextBlockOnPage> textBlocks, Map<TextPositionSequence, TextBlockOnPage> lookup, int numberOfPages) {
|
||||
|
||||
List<TextPositionSequence> blocks = new LinkedList<>();
|
||||
for (TextBlockOnPage textBlock : textBlocks) {
|
||||
blocks.addAll(extractNumbers(textBlock, lookup, numberOfPages));
|
||||
}
|
||||
return blocks;
|
||||
}
|
||||
|
||||
|
||||
private static List<TextPositionSequence> extractNumbers(TextBlockOnPage textBlock, Map<TextPositionSequence, TextBlockOnPage> lookup, int numberOfPages) {
|
||||
|
||||
List<TextPositionSequence> blocks = new LinkedList<>();
|
||||
TextPageBlock block = textBlock.textBlock();
|
||||
List<TextPositionSequence> sequences = block.getSequences();
|
||||
for (int i = 0; i < sequences.size(); i++) {
|
||||
|
||||
TextPositionSequence word = sequences.get(i);
|
||||
|
||||
if (!NUMERIC.matcher(word).matches()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (AMOUNT_PATTERN.matcher(getSurroundingString(i, sequences)).matches()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
try {
|
||||
int pageNumber = Integer.parseInt(word.toString());
|
||||
if (0 >= pageNumber || pageNumber > numberOfPages) {
|
||||
continue;
|
||||
}
|
||||
lookup.put(word, textBlock);
|
||||
blocks.add(word);
|
||||
} catch (NumberFormatException e) {
|
||||
log.debug("That wasn't a number! Should not happen, due to numeric check beforehand.");
|
||||
}
|
||||
}
|
||||
return blocks;
|
||||
}
|
||||
|
||||
|
||||
private static CharSequence getSurroundingString(int i, List<TextPositionSequence> sequences) {
|
||||
|
||||
int end = Math.min(i + 5, sequences.size());
|
||||
return sequences.subList(i, end)
|
||||
.stream()
|
||||
.map(TextPositionSequence::toString)
|
||||
.collect(Collectors.joining(" "));
|
||||
}
|
||||
|
||||
|
||||
private static boolean matches(TextPositionSequence number1, TextPositionSequence number2, Map<TextPositionSequence, TextBlockOnPage> lookup) {
|
||||
|
||||
if (number1.getDir() != number2.getDir()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return number1.intersectsXDirAdj(number2, INTERSECTION_TOLERANCE);
|
||||
}
|
||||
|
||||
|
||||
private boolean isTOCHeadline(TextBlockOnPage textBlock) {
|
||||
|
||||
if (textBlock.textBlock().getText().length() > 50) {
|
||||
return false;
|
||||
}
|
||||
String text = TextNormalizationUtilities.removeAllWhitespaces(textBlock.textBlock().getText().toLowerCase(Locale.ENGLISH));
|
||||
return (text.contains("content") && text.length() < "content".length() + 6) //
|
||||
|| (text.contains("tableofcontent") && text.length() < "tableofcontent".length() + DENSITY_THRESHOLD_COUNT)//
|
||||
|| text.equals("tables")//
|
||||
|| text.equals("appendices")//
|
||||
|| text.equals("figures");
|
||||
}
|
||||
|
||||
|
||||
private List<TextBlockOnPage> buildBlocksPerPage(ClassificationDocument document) {
|
||||
|
||||
List<TextBlockOnPage> blocks = new ArrayList<>();
|
||||
for (ClassificationPage page : document.getPages()) {
|
||||
for (AbstractPageBlock abstractPageBlock : page.getTextBlocks()) {
|
||||
if (abstractPageBlock instanceof TextPageBlock textBlock) {
|
||||
if (textBlock.getClassification() != null && (textBlock.getClassification().equals(PageBlockType.HEADER) //
|
||||
|| textBlock.getClassification().equals(PageBlockType.FOOTER))) {
|
||||
continue;
|
||||
}
|
||||
blocks.add(new TextBlockOnPage(page, textBlock));
|
||||
}
|
||||
}
|
||||
}
|
||||
return blocks;
|
||||
}
|
||||
|
||||
|
||||
private static class TocNumberFinder {
|
||||
|
||||
final UnionFind<TextPositionSequence> numberClusters;
|
||||
final HashMap<TextPositionSequence, TextBlockOnPage> lookup;
|
||||
|
||||
|
||||
TocNumberFinder(List<TextPositionSequence> blocks, HashMap<TextPositionSequence, TextBlockOnPage> lookup) {
|
||||
|
||||
this.numberClusters = new UnionFind<>(new HashSet<>(blocks));
|
||||
for (int i = 0; i < blocks.size(); i++) {
|
||||
for (int j = i + 1; j < blocks.size(); j++) {
|
||||
if (matches(blocks.get(i), blocks.get(j), lookup)) {
|
||||
numberClusters.union(blocks.get(i), blocks.get(j));
|
||||
}
|
||||
}
|
||||
}
|
||||
this.lookup = lookup;
|
||||
}
|
||||
|
||||
|
||||
public void add(TextPositionSequence number) {
|
||||
|
||||
if (numberClusters.getElements().contains(number)) {
|
||||
return;
|
||||
}
|
||||
|
||||
numberClusters.addElement(number);
|
||||
for (TextPositionSequence element : numberClusters.getElements()) {
|
||||
if (matches(number, element, lookup)) {
|
||||
numberClusters.union(element, number);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public List<TextPositionSequence> getCurrentRightmostCluster() {
|
||||
|
||||
return numberClusters.getGroups()
|
||||
.stream()
|
||||
.filter(cluster -> cluster.size() > MINIMUM_MATCHES)
|
||||
.map(cluster -> cluster.stream()
|
||||
.sorted(new TextPositionSequenceComparator(lookup))
|
||||
.toList())
|
||||
.map(this::removeOutliers)
|
||||
// .map(this::filterByMinimumDensity)
|
||||
.filter(cluster -> cluster.size() > MINIMUM_MATCHES)
|
||||
.max(Comparator.comparingDouble(cluster -> cluster.get(0).getBBox().getMaxX())).orElse(Collections.emptyList());
|
||||
}
|
||||
|
||||
// does not seem to be doing much, ideally instead of using the height of the blocks i would like to use the height, beginning from the MainBody top,
|
||||
// but as the MainBody is often wrong, this results in some numbers being discarded even though they are correct.
|
||||
// private List<TextPositionSequence> filterByMinimumDensity(List<TextPositionSequence> numbers) {
|
||||
//
|
||||
// Map<ClassificationPage, List<TextPositionSequence>> clustersPerPage = numbers.stream()
|
||||
// .collect(Collectors.groupingBy(number -> lookup.get(number).page()));
|
||||
//
|
||||
// List<TextPositionSequence> result = new ArrayList<>(numbers.size());
|
||||
// clustersPerPage.keySet()
|
||||
// .stream()
|
||||
// .sorted(Comparator.comparingInt(ClassificationPage::getPageNumber))
|
||||
// .forEach(page -> {
|
||||
// var numbersOnPage = clustersPerPage.get(page);
|
||||
//
|
||||
// double height = numbersOnPage.stream()
|
||||
// .map(BoundingBox::getBBox)
|
||||
// .collect(RectangleTransformations.collectBBox()).getHeight();
|
||||
//
|
||||
// double count = numbersOnPage.size();
|
||||
//
|
||||
// if ((count / height) >= (DENSITY_THRESHOLD_COUNT / page.getPageHeight())) {
|
||||
// result.addAll(numbers);
|
||||
// }
|
||||
// });
|
||||
// return result;
|
||||
// }
|
||||
|
||||
|
||||
public List<TextPositionSequence> removeOutliers(List<TextPositionSequence> numbers) {
|
||||
|
||||
List<TextPositionSequence> result = new ArrayList<>();
|
||||
|
||||
result.add(numbers.get(0));
|
||||
|
||||
for (int i = 1; i < numbers.size() - 1; i++) {
|
||||
int prev = getNumberAsInt(numbers, i - 1);
|
||||
int curr = getNumberAsInt(numbers, i);
|
||||
int next = getNumberAsInt(numbers, i + 1);
|
||||
|
||||
if (!(curr <= prev || curr >= next) || !isBetterWithout(numbers, i)) {
|
||||
result.add(numbers.get(i));
|
||||
}
|
||||
}
|
||||
if (getNumberAsInt(numbers, numbers.size() - 1) >= getNumberAsInt(numbers, Math.max(0, numbers.size() - 2))) {
|
||||
result.add(numbers.get(numbers.size() - 1));
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
// Helper method to check if removing the current number results in a better order
|
||||
public static boolean isBetterWithout(List<TextPositionSequence> numbers, int i) {
|
||||
|
||||
if (i == 0 || i == numbers.size() - 1) {
|
||||
return false;
|
||||
}
|
||||
|
||||
int prev = getNumberAsInt(numbers, i);
|
||||
int curr = getNumberAsInt(numbers, i);
|
||||
int next = getNumberAsInt(numbers, i + 1);
|
||||
|
||||
return (prev <= next) && (Math.abs(prev - next) < Math.abs(prev - curr) + Math.abs(curr - next));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
private static int getNumberAsInt(List<TextPositionSequence> numbers, int i) {
|
||||
|
||||
return Integer.parseInt(numbers.get(i).toString());
|
||||
}
|
||||
|
||||
}
|
||||
@ -201,7 +201,7 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
|
||||
try {
|
||||
if (stroke && !getGraphicsState().getStrokingColor().isPattern() && isBlack(getGraphicsState().getStrokingColor()) || //
|
||||
!stroke && !getGraphicsState().getNonStrokingColor().isPattern() && isBlack(getGraphicsState().getNonStrokingColor())) {
|
||||
!stroke && !getGraphicsState().getNonStrokingColor().isPattern() && isBlack(getGraphicsState().getNonStrokingColor())) {
|
||||
rulings.addAll(path);
|
||||
}
|
||||
} catch (UnsupportedOperationException e) {
|
||||
@ -279,9 +279,9 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
startIndex = i;
|
||||
}
|
||||
|
||||
if (i > 0 && (textPositions.get(i).getUnicode().equals(" ") || textPositions.get(i).getUnicode().equals("\u00A0") || textPositions.get(i)
|
||||
.getUnicode()
|
||||
.equals("\t")) && i <= textPositions.size() - 2) {
|
||||
if (i > 0
|
||||
&& (textPositions.get(i).getUnicode().equals(" ") || textPositions.get(i).getUnicode().equals("\u00A0") || textPositions.get(i).getUnicode().equals("\t"))
|
||||
&& i <= textPositions.size() - 2) {
|
||||
List<TextPosition> sublist = textPositions.subList(startIndex, i);
|
||||
if (checkIfSequenceContainsOnlyWhitespaces(sublist)) {
|
||||
|
||||
@ -296,20 +296,31 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
}
|
||||
startIndex = i + 1;
|
||||
}
|
||||
if (isDottedLineFollowedByWord(textPositions, i, startIndex)) {
|
||||
List<TextPosition> sublist = textPositions.subList(startIndex, i);
|
||||
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart));
|
||||
startIndex = i;
|
||||
}
|
||||
if (isWordFollowedByDottedLine(textPositions, i, startIndex)) {
|
||||
List<TextPosition> sublist = textPositions.subList(startIndex, i - 2);
|
||||
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart));
|
||||
startIndex = i - 2;
|
||||
}
|
||||
}
|
||||
|
||||
List<TextPosition> sublist = textPositions.subList(startIndex, textPositions.size());
|
||||
if (!sublist.isEmpty() && (sublist.get(sublist.size() - 1).getUnicode().equals(" ") || sublist.get(sublist.size() - 1)
|
||||
.getUnicode()
|
||||
.equals("\u00A0") || sublist.get(sublist.size() - 1).getUnicode().equals("\t"))) {
|
||||
if (!sublist.isEmpty() && (sublist.get(sublist.size() - 1).getUnicode().equals(" ")
|
||||
|| sublist.get(sublist.size() - 1).getUnicode().equals("\u00A0")
|
||||
|| sublist.get(sublist.size() - 1).getUnicode().equals("\t"))) {
|
||||
sublist = sublist.subList(0, sublist.size() - 1);
|
||||
}
|
||||
|
||||
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0)
|
||||
.getUnicode()
|
||||
.equals("\t")))) {
|
||||
if (previous != null && sublist.get(0).getYDirAdj() == previous.getYDirAdj() && sublist.get(0)
|
||||
.getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()) < 0.01) {
|
||||
if (previous != null
|
||||
&& sublist.get(0).getYDirAdj() == previous.getYDirAdj()
|
||||
&& sublist.get(0).getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()) < 0.01) {
|
||||
for (TextPosition t : sublist) {
|
||||
textPositionSequences.get(textPositionSequences.size() - 1).add(t);
|
||||
}
|
||||
@ -317,10 +328,31 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, isParagraphStart));
|
||||
}
|
||||
}
|
||||
|
||||
super.writeString(text);
|
||||
}
|
||||
|
||||
|
||||
private boolean isWordFollowedByDottedLine(List<TextPosition> textPositions, int i, int startIndex) {
|
||||
|
||||
return i - startIndex >= 4 //
|
||||
&& textPositions.get(i).getUnicode().equals(".") //
|
||||
&& textPositions.get(i - 1).getUnicode().equals(".") //
|
||||
&& textPositions.get(i - 2).getUnicode().equals(".") //
|
||||
&& !textPositions.get(i - 3).getUnicode().equals(".");
|
||||
}
|
||||
|
||||
|
||||
private static boolean isDottedLineFollowedByWord(List<TextPosition> textPositions, int i, int startIndex) {
|
||||
|
||||
return i - startIndex >= 4 //
|
||||
&& !textPositions.get(i).getUnicode().equals(".") //
|
||||
&& textPositions.get(i - 1).getUnicode().equals(".") //
|
||||
&& textPositions.get(i - 2).getUnicode().equals(".") //
|
||||
&& textPositions.get(i - 3).getUnicode().equals(".");
|
||||
}
|
||||
|
||||
|
||||
public boolean checkIfCurrentPositionIsToTheRightOfPreviousPosition(int i, List<TextPosition> textPositions) {
|
||||
|
||||
return i > 0 && textPositions.get(i).getXDirAdj() < textPositions.get(i - 1).getXDirAdj();
|
||||
@ -337,8 +369,9 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
|
||||
public boolean checkIfGapSizeBetweenCharactersSmallerThanMaximum(RedTextPosition previous, List<TextPosition> sublist, float maximumGapSize) {
|
||||
|
||||
return previous != null && sublist.get(0).getYDirAdj() == previous.getYDirAdj() && sublist.get(0)
|
||||
.getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()) < maximumGapSize;
|
||||
return previous != null
|
||||
&& sublist.get(0).getYDirAdj() == previous.getYDirAdj()
|
||||
&& sublist.get(0).getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()) < maximumGapSize;
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -36,6 +36,9 @@ public class LayoutGridService {
|
||||
LayoutGrid layoutGrid = createLayoutGrid(document);
|
||||
Outline outline = OutlineMapper.createOutline(document);
|
||||
layoutGrid.setVisibleByDefault(layerVisibilityDefaultValue);
|
||||
|
||||
document.getLayoutDebugLayer().addSentenceVisualization(document.getTextBlock());
|
||||
|
||||
if (document.getLayoutDebugLayer().isActive()) {
|
||||
viewerDocumentService.addLayerGroups(originFile, destinationFile, List.of(layoutGrid, document.getLayoutDebugLayer()), outline);
|
||||
} else {
|
||||
|
||||
@ -10,6 +10,7 @@ public final class TextNormalizationUtilities {
|
||||
public static final Pattern hyphenLineBreaks = Pattern.compile("[-~‐‒⁻−﹣゠⁓‑\\u00AD][\\r\\n]+");
|
||||
public static final Pattern linebreaks = Pattern.compile("[\\r\\n]+");
|
||||
public static final Pattern doubleWhitespaces = Pattern.compile("\\s{2,}");
|
||||
public static final Pattern WHITESPACE_REMOVAL = Pattern.compile("\\s+");
|
||||
|
||||
|
||||
public String cleanString(String value) {
|
||||
@ -36,4 +37,11 @@ public final class TextNormalizationUtilities {
|
||||
|
||||
return linebreaks.matcher(value).replaceAll(" ");
|
||||
}
|
||||
|
||||
|
||||
public String removeAllWhitespaces(String value) {
|
||||
|
||||
return WHITESPACE_REMOVAL.matcher(value).replaceAll("");
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -2,6 +2,7 @@ package com.knecon.fforesight.service.layoutparser.processor.utils;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.awt.geom.RectangularShape;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashSet;
|
||||
@ -54,18 +55,26 @@ public class TextPositionOperations {
|
||||
private List<TextPositionSequence> sortUsingLineDetection(Set<TextPositionSequence> sequences) {
|
||||
|
||||
return sortLines(groupByLine(sequences));
|
||||
|
||||
}
|
||||
|
||||
|
||||
public List<TextPositionSequence> sortLines(Collection<Set<TextPositionSequence>> lines) {
|
||||
|
||||
return lines.stream()
|
||||
.map(TextPositionOperations::sortByXDirAdj)
|
||||
.filter(line -> !line.isEmpty())
|
||||
.sorted(Comparator.comparing(line -> line.get(0), COMPARATOR_DIR_ADJ))
|
||||
.flatMap(Collection::stream)
|
||||
.toList();
|
||||
List<List<TextPositionSequence>> lineBlocks = new ArrayList<>();
|
||||
for (Set<TextPositionSequence> line : lines) {
|
||||
List<TextPositionSequence> sortedLine = sortByXDirAdj(line);
|
||||
if (!sortedLine.isEmpty()) {
|
||||
lineBlocks.add(sortedLine);
|
||||
}
|
||||
}
|
||||
// need to use old sorting, since COMPARATOR_DIR_ADJ is not transitive
|
||||
QuickSort.sort(lineBlocks, Comparator.comparing(line -> line.get(0), COMPARATOR_DIR_ADJ));
|
||||
|
||||
List<TextPositionSequence> list = new ArrayList<>();
|
||||
for (List<TextPositionSequence> textPositionSequences : lineBlocks) {
|
||||
list.addAll(textPositionSequences);
|
||||
}
|
||||
return list;
|
||||
}
|
||||
|
||||
|
||||
@ -91,7 +100,7 @@ public class TextPositionOperations {
|
||||
for (TextPositionSequence sequence : sequences) {
|
||||
for (TextPositionSequence sequence2 : sequences) {
|
||||
|
||||
if (sequence.equals(sequence2) || unionFind.inSameSet(sequence, sequence2)) {
|
||||
if (sequence.equals(sequence2)) { // || unionFind.inSameSet(sequence, sequence2)) doing this is actually slower than not doing it
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
@ -5,9 +5,11 @@ import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.Line2D;
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.text.BreakIterator;
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
@ -19,6 +21,8 @@ import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Bound
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Line;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
@ -94,6 +98,29 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
|
||||
}
|
||||
|
||||
|
||||
public void addSentenceVisualization(TextBlock textBlock) {
|
||||
|
||||
if (!active) {
|
||||
return;
|
||||
}
|
||||
AtomicInteger rotatingColorIdx = new AtomicInteger(0);
|
||||
String text = textBlock.getSearchText();
|
||||
BreakIterator sentenceIterator = BreakIterator.getSentenceInstance(Locale.ENGLISH);
|
||||
sentenceIterator.setText(text);
|
||||
int lastIdx = 0;
|
||||
while (sentenceIterator.next() != BreakIterator.DONE) {
|
||||
TextRange sentenceRange = new TextRange(lastIdx + textBlock.getTextRange().start(), sentenceIterator.current() + textBlock.getTextRange().start());
|
||||
lastIdx = sentenceIterator.current();
|
||||
Color color = getRotatingColor(rotatingColorIdx);
|
||||
textBlock.getPositionsPerPage(sentenceRange)
|
||||
.forEach((page, bboxes) -> getOrCreateVisualizationsOnPage(page.getNumber(), this.sentences).getColoredRectangles()
|
||||
.addAll(bboxes.stream()
|
||||
.map(bbox -> new ColoredRectangle(bbox, color, 1))
|
||||
.toList()));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private Color decideOnRulingColor(Ruling ruling) {
|
||||
|
||||
return switch (ruling.getClassification()) {
|
||||
@ -248,7 +275,7 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
|
||||
.map(Line::getCharacters)
|
||||
.flatMap(Collection::stream)
|
||||
.forEach(character -> {
|
||||
Color color = ROTATING_CHARACTER_COLOR.get(index.getAndIncrement() % ROTATING_CHARACTER_COLOR.size());
|
||||
Color color = getRotatingColor(index);
|
||||
Rectangle2D charBBox = character.getTextPosition().getBBoxPdf();
|
||||
characterVisualizations.getColoredRectangles().add(new ColoredRectangle(charBBox, color, 1));
|
||||
character.getNeighbors()
|
||||
@ -263,6 +290,31 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
|
||||
}
|
||||
|
||||
|
||||
public void addTocPages(List<TextPositionSequence> numbers, int page) {
|
||||
|
||||
if (!active) {
|
||||
return;
|
||||
}
|
||||
|
||||
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(page, this.tocPages);
|
||||
visualizationsOnPage.getColoredRectangles()
|
||||
.addAll(numbers.stream()
|
||||
.map(BoundingBox::getBBoxPdf)
|
||||
.map(line -> new ColoredRectangle(line, LINES_COLOR, 0.5f))
|
||||
.toList());
|
||||
visualizationsOnPage.getColoredRectangles()
|
||||
.add(new ColoredRectangle(numbers.stream()
|
||||
.map(BoundingBox::getBBoxPdf)
|
||||
.collect(RectangleTransformations.collectBBox()), LINES_COLOR, 0.5f));
|
||||
}
|
||||
|
||||
|
||||
private static Color getRotatingColor(AtomicInteger index) {
|
||||
|
||||
return ROTATING_CHARACTER_COLOR.get(index.getAndIncrement() % ROTATING_CHARACTER_COLOR.size());
|
||||
}
|
||||
|
||||
|
||||
public void addOutlineObjects(List<OutlineObject> outlineObjects, PageInformation pageInformation) {
|
||||
|
||||
if (!active) {
|
||||
|
||||
@ -0,0 +1,70 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.*;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
class BoundingBoxTest {
|
||||
|
||||
@Test
|
||||
void testHorizontalDistance_NoOverlap() {
|
||||
|
||||
ConcreteBoundingBox box1 = new ConcreteBoundingBox(0, 0, 10, 10);
|
||||
ConcreteBoundingBox box2 = new ConcreteBoundingBox(20, 0, 10, 10);
|
||||
|
||||
assertEquals(10, box1.horizontalDistance(box2));
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
void testHorizontalDistance_Overlap() {
|
||||
|
||||
ConcreteBoundingBox box1 = new ConcreteBoundingBox(0, 0, 10, 10);
|
||||
ConcreteBoundingBox box2 = new ConcreteBoundingBox(5, 0, 10, 10);
|
||||
|
||||
assertEquals(0, box1.horizontalDistance(box2));
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
void testVerticalDistance_NoOverlap() {
|
||||
|
||||
ConcreteBoundingBox box1 = new ConcreteBoundingBox(0, 0, 10, 10);
|
||||
ConcreteBoundingBox box2 = new ConcreteBoundingBox(0, 20, 10, 10);
|
||||
|
||||
assertEquals(10, box1.verticalDistance(box2));
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
void testVerticalDistance_Overlap() {
|
||||
|
||||
ConcreteBoundingBox box1 = new ConcreteBoundingBox(0, 0, 10, 10);
|
||||
ConcreteBoundingBox box2 = new ConcreteBoundingBox(0, 5, 10, 10);
|
||||
|
||||
assertEquals(0, box1.verticalDistance(box2));
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
void testVerticalDistance_PartialOverlap() {
|
||||
|
||||
ConcreteBoundingBox box1 = new ConcreteBoundingBox(0, 0, 10, 10);
|
||||
ConcreteBoundingBox box2 = new ConcreteBoundingBox(0, 8, 10, 10);
|
||||
|
||||
assertEquals(0, box1.verticalDistance(box2));
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
void testHorizontalDistance_PartialOverlap() {
|
||||
|
||||
ConcreteBoundingBox box1 = new ConcreteBoundingBox(0, 0, 10, 10);
|
||||
ConcreteBoundingBox box2 = new ConcreteBoundingBox(8, 0, 10, 10);
|
||||
|
||||
assertEquals(0, box1.horizontalDistance(box2));
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,12 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
|
||||
class ConcreteBoundingBox extends BoundingBox {
|
||||
|
||||
ConcreteBoundingBox(double x, double y, double width, double height) {
|
||||
|
||||
this.bBox = new Rectangle2D.Double(x, y, width, height);
|
||||
}
|
||||
|
||||
}
|
||||
@ -46,8 +46,8 @@ public class SimplifiedTextServiceTest
|
||||
Document document = buildGraph(file);
|
||||
SimplifiedText simplifiedText = simplifiedSectionTextService.toSimplifiedText(document);
|
||||
List<SimplifiedSectionText> sectionTexts = simplifiedText.getSectionTexts();
|
||||
assertThat(sectionTexts.stream().filter(section -> section.getText().equals(footerExample)).collect(Collectors.toList()).size()).isGreaterThan(0);
|
||||
assertThat(sectionTexts.stream().filter(section -> section.getText().equals(headerExample)).collect(Collectors.toList()).size()).isGreaterThan(0);
|
||||
assertThat(sectionTexts.stream().filter(section -> section.getText().contains(footerExample)).toList().size()).isGreaterThan(0);
|
||||
assertThat(sectionTexts.stream().filter(section -> section.getText().contains(headerExample)).toList().size()).isGreaterThan(0);
|
||||
|
||||
|
||||
|
||||
|
||||
@ -51,7 +51,7 @@ import lombok.extern.slf4j.Slf4j;
|
||||
@Slf4j
|
||||
public class DocumentReadingOrderTest extends BuildDocumentTest {
|
||||
|
||||
private static final boolean DRAW_DIR_ADJ_COORDS = true;
|
||||
private static final boolean DRAW_DIR_ADJ_COORDS = false;
|
||||
public static final List<LayoutParsingType> LAYOUT_PARSING_TYPES = List.of(LayoutParsingType.DOCUMINE,
|
||||
LayoutParsingType.DOCUMINE_OLD,
|
||||
LayoutParsingType.REDACT_MANAGER,
|
||||
@ -82,7 +82,7 @@ public class DocumentReadingOrderTest extends BuildDocumentTest {
|
||||
@Disabled
|
||||
public void drawDirAdjForFile() {
|
||||
|
||||
String pdfFile = "/home/kschuettler/Dokumente/Ticket Related/RED-9974/026dc94b019bc2348a4c54f0c6c4516f.ORIGIN.pdf";
|
||||
String pdfFile = "/home/kschuettler/Dokumente/TestFiles/OCR/VV-331340/VV-331340_OCRED_first15.pdf";
|
||||
|
||||
ClassificationDocument classificationDocument = parseLayout(pdfFile, LayoutParsingType.DOCUMINE_OLD);
|
||||
|
||||
|
||||
@ -76,9 +76,10 @@ class TextRangeTest {
|
||||
assertEquals(List.of(new TextRange(10, 40), new TextRange(40, 100)), startTextRange.split(List.of(40)));
|
||||
assertEquals(1, startTextRange.split(Collections.emptyList()).size());
|
||||
assertEquals(1, startTextRange.split(List.of(startTextRange.start())).size());
|
||||
assertEquals(1, startTextRange.split(List.of(100)).size());
|
||||
assertThrows(IndexOutOfBoundsException.class, () -> startTextRange.split(Collections.singletonList(0)));
|
||||
assertThrows(IndexOutOfBoundsException.class, () -> startTextRange.split(Collections.singletonList(100)));
|
||||
assertThrows(IndexOutOfBoundsException.class, () -> startTextRange.split(List.of(12, 40, 100)));
|
||||
assertThrows(IndexOutOfBoundsException.class, () -> startTextRange.split(Collections.singletonList(101)));
|
||||
assertThrows(IndexOutOfBoundsException.class, () -> startTextRange.split(List.of(12, 40, 101)));
|
||||
}
|
||||
|
||||
}
|
||||
@ -55,7 +55,10 @@ public record LayerIdentifier(String name, String markedContentName) {
|
||||
public static final LayerIdentifier NEIGHBOURS = new LayerIdentifier("Neighbours", "NEIGHBOURS");
|
||||
public static final LayerIdentifier CHARACTERS = new LayerIdentifier("Characters", "CHARACTERS");
|
||||
public static final LayerIdentifier OUTLINE_OBJECTS = new LayerIdentifier("Outline Positions", "OUTLINE_OBJECTS");
|
||||
public static final LayerIdentifier SENTENCES = new LayerIdentifier("Sentences", "SENTENCES");
|
||||
public static final LayerIdentifier TOC_PAGES = new LayerIdentifier("TOC pages", "TOC_PAGES");
|
||||
|
||||
// Visual layout parser
|
||||
public static final LayerIdentifier KNECON_VISUAL_PARSING = new LayerIdentifier("Visual Layout Parser", "VISUAL_PARSING");
|
||||
|
||||
//ocr
|
||||
|
||||
@ -55,6 +55,8 @@ public class LayoutDebugLayerConfig extends AbstractLayerGroup {
|
||||
protected final Visualizations neighbours = Visualizations.builder().layer(LayerIdentifier.NEIGHBOURS).build();
|
||||
protected final Visualizations characters = Visualizations.builder().layer(LayerIdentifier.CHARACTERS).build();
|
||||
protected final Visualizations outlineObjects = Visualizations.builder().layer(LayerIdentifier.OUTLINE_OBJECTS).build();
|
||||
protected final Visualizations sentences = Visualizations.builder().layer(LayerIdentifier.SENTENCES).build();
|
||||
protected final Visualizations tocPages = Visualizations.builder().layer(LayerIdentifier.TOC_PAGES).build();
|
||||
|
||||
|
||||
public List<Visualizations> getVisualizations() {
|
||||
@ -63,14 +65,15 @@ public class LayoutDebugLayerConfig extends AbstractLayerGroup {
|
||||
neighbours,//
|
||||
words, //
|
||||
lines, //
|
||||
sentences, //
|
||||
zones, //
|
||||
rulings, //
|
||||
clean_rulings, //
|
||||
cells, //
|
||||
mainBody, //
|
||||
markedContent, //
|
||||
outlineObjects //
|
||||
);
|
||||
outlineObjects, //
|
||||
tocPages);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -68,10 +68,10 @@ public class OutlineUtility {
|
||||
public static void deleteExistingOutline(PDFDoc doc) {
|
||||
|
||||
Bookmark firstBookmark = doc.getFirstBookmark();
|
||||
// while (firstBookmark != null && firstBookmark.isValid()) {
|
||||
while (firstBookmark != null && firstBookmark.isValid()) {
|
||||
firstBookmark.delete();
|
||||
firstBookmark = doc.getFirstBookmark();
|
||||
// }
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user