Merge branch 'feature/RED-10127-bp' into 'release/0.159.x'
RED-10127: improve headline detection See merge request fforesight/layout-parser!236
This commit is contained in:
commit
63953ecf2d
@ -2,11 +2,13 @@ package com.knecon.fforesight.service.layoutparser.processor;
|
|||||||
|
|
||||||
import static java.lang.String.format;
|
import static java.lang.String.format;
|
||||||
|
|
||||||
|
import java.awt.geom.AffineTransform;
|
||||||
import java.awt.geom.Rectangle2D;
|
import java.awt.geom.Rectangle2D;
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
|
import java.util.Collection;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
@ -24,6 +26,10 @@ import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.No
|
|||||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent;
|
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
|
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationService;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.classification.TableOfContentsClassificationService;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.MarkdownMapper;
|
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.MarkdownMapper;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||||
@ -91,10 +97,7 @@ public class LayoutParsingPipeline {
|
|||||||
CvTableParsingAdapter cvTableParsingAdapter;
|
CvTableParsingAdapter cvTableParsingAdapter;
|
||||||
LayoutParsingStorageService layoutParsingStorageService;
|
LayoutParsingStorageService layoutParsingStorageService;
|
||||||
SectionsBuilderService sectionsBuilderService;
|
SectionsBuilderService sectionsBuilderService;
|
||||||
RedactManagerClassificationService redactManagerClassificationService;
|
|
||||||
DocuMineClassificationService docuMineClassificationService;
|
|
||||||
SimplifiedSectionTextService simplifiedSectionTextService;
|
SimplifiedSectionTextService simplifiedSectionTextService;
|
||||||
BodyTextFrameService bodyTextFrameService;
|
|
||||||
RulingCleaningService rulingCleaningService;
|
RulingCleaningService rulingCleaningService;
|
||||||
TableExtractionService tableExtractionService;
|
TableExtractionService tableExtractionService;
|
||||||
DocuMineBlockificationService docuMineBlockificationService;
|
DocuMineBlockificationService docuMineBlockificationService;
|
||||||
@ -104,12 +107,12 @@ public class LayoutParsingPipeline {
|
|||||||
LayoutGridService layoutGridService;
|
LayoutGridService layoutGridService;
|
||||||
ObservationRegistry observationRegistry;
|
ObservationRegistry observationRegistry;
|
||||||
VisualLayoutParsingAdapter visualLayoutParsingAdapter;
|
VisualLayoutParsingAdapter visualLayoutParsingAdapter;
|
||||||
ClarifyndClassificationService clarifyndClassificationService;
|
|
||||||
GraphicExtractorService graphicExtractorService;
|
GraphicExtractorService graphicExtractorService;
|
||||||
OutlineExtractorService outlineExtractorService;
|
OutlineExtractorService outlineExtractorService;
|
||||||
OutlineValidationService outlineValidationService;
|
OutlineValidationService outlineValidationService;
|
||||||
TOCEnrichmentService tocEnrichmentService;
|
TOCEnrichmentService tocEnrichmentService;
|
||||||
LayoutparserSettings settings;
|
LayoutparserSettings settings;
|
||||||
|
ClassificationService classificationService;
|
||||||
|
|
||||||
|
|
||||||
public LayoutParsingFinishedEvent parseLayoutAndSaveFilesToStorage(LayoutParsingRequest layoutParsingRequest) throws IOException {
|
public LayoutParsingFinishedEvent parseLayoutAndSaveFilesToStorage(LayoutParsingRequest layoutParsingRequest) throws IOException {
|
||||||
@ -273,6 +276,9 @@ public class LayoutParsingPipeline {
|
|||||||
stripper.setPdpage(pdPage);
|
stripper.setPdpage(pdPage);
|
||||||
stripper.getText(originDocument);
|
stripper.getText(originDocument);
|
||||||
List<TextPositionSequence> words = stripper.getTextPositionSequences();
|
List<TextPositionSequence> words = stripper.getTextPositionSequences();
|
||||||
|
|
||||||
|
// rotateDirAdjExactly(words, pdPage); // works really well for many highly rotated documents (e.g. VV-331340.pdf), but it decreases the headline performance by 1.3%, so I am leaving it out for now
|
||||||
|
|
||||||
if (layoutParsingType.equals(LayoutParsingType.DOCUMINE_OLD)) {
|
if (layoutParsingType.equals(LayoutParsingType.DOCUMINE_OLD)) {
|
||||||
var lines = TextPositionOperations.groupByLine(new HashSet<>(words));
|
var lines = TextPositionOperations.groupByLine(new HashSet<>(words));
|
||||||
classificationDocument.getLayoutDebugLayer().addLineVisualizationsFromNestedTextPosition(lines, pageNumber);
|
classificationDocument.getLayoutDebugLayer().addLineVisualizationsFromNestedTextPosition(lines, pageNumber);
|
||||||
@ -366,24 +372,7 @@ public class LayoutParsingPipeline {
|
|||||||
|
|
||||||
originDocument.close();
|
originDocument.close();
|
||||||
|
|
||||||
log.info("Calculating BodyTextFrame for {}", identifier);
|
classificationService.classify(classificationDocument, layoutParsingType, identifier);
|
||||||
bodyTextFrameService.setBodyTextFrames(classificationDocument, layoutParsingType);
|
|
||||||
for (ClassificationPage page : classificationDocument.getPages()) {
|
|
||||||
classificationDocument.getLayoutDebugLayer().addCleanRulingVisualization(page.getCleanRulings(), page.getPageNumber());
|
|
||||||
}
|
|
||||||
log.info("Classify TextBlocks for {}", identifier);
|
|
||||||
switch (layoutParsingType) {
|
|
||||||
case REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_OLD, CLARIFYND_PARAGRAPH_DEBUG, REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH ->
|
|
||||||
redactManagerClassificationService.classifyDocument(classificationDocument);
|
|
||||||
case DOCUMINE_OLD, DOCUMINE -> docuMineClassificationService.classifyDocument(classificationDocument);
|
|
||||||
case CLARIFYND -> clarifyndClassificationService.classifyDocument(classificationDocument);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (layoutParsingType.equals(LayoutParsingType.DOCUMINE_OLD)) {
|
|
||||||
for (ClassificationPage page : classificationDocument.getPages()) {
|
|
||||||
docuMineBlockificationService.mergeblocks(page, page.getCleanRulings().withoutTextRulings(), 0, 10);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
List<TextPageBlock> headlines = classificationDocument.getPages()
|
List<TextPageBlock> headlines = classificationDocument.getPages()
|
||||||
.stream()
|
.stream()
|
||||||
@ -406,6 +395,32 @@ public class LayoutParsingPipeline {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static void rotateDirAdjExactly(List<TextPositionSequence> words, PDPage pdPage) {
|
||||||
|
|
||||||
|
for (TextDirection dir : TextDirection.values()) {
|
||||||
|
|
||||||
|
double averageRotation = words.stream()
|
||||||
|
.map(TextPositionSequence::getTextPositions)
|
||||||
|
.flatMap(Collection::stream)
|
||||||
|
.filter(pos -> pos.getDir().equals(dir))
|
||||||
|
.mapToDouble(RedTextPosition::getExactDir).average().orElse(0);
|
||||||
|
|
||||||
|
if (averageRotation == 0) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
AffineTransform rotateInstance = AffineTransform.getRotateInstance(averageRotation, pdPage.getMediaBox().getWidth() / 2, pdPage.getMediaBox().getHeight() / 2);
|
||||||
|
|
||||||
|
for (TextPositionSequence word : words) {
|
||||||
|
if (!dir.equals(word.getDir())) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
word.transform(rotateInstance);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
private void addNumberOfPagesToTrace(int numberOfPages, long size) {
|
private void addNumberOfPagesToTrace(int numberOfPages, long size) {
|
||||||
|
|
||||||
if (observationRegistry.getCurrentObservation() != null) {
|
if (observationRegistry.getCurrentObservation() != null) {
|
||||||
|
|||||||
@ -225,33 +225,31 @@ public abstract class BoundingBox {
|
|||||||
|
|
||||||
public double horizontalDistance(BoundingBox other) {
|
public double horizontalDistance(BoundingBox other) {
|
||||||
|
|
||||||
Rectangle2D left;
|
double rect1Right = getMaxX();
|
||||||
Rectangle2D right;
|
double rect1Left = getMinX();
|
||||||
if (this.leftOf(other)) {
|
double rect2Right = other.getMaxX();
|
||||||
left = this.getBBox();
|
double rect2Left = other.getMinX();
|
||||||
right = other.getBBox();
|
|
||||||
} else {
|
|
||||||
left = other.getBBox();
|
|
||||||
right = this.getBBox();
|
|
||||||
}
|
|
||||||
|
|
||||||
return Math.max(0, right.getMinX() - left.getMaxX());
|
if (rect1Left > rect2Right || rect2Left > rect1Right) {
|
||||||
|
return Math.max(rect2Left - rect1Right, rect1Left - rect2Right);
|
||||||
|
} else {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public double verticalDistance(BoundingBox other) {
|
public double verticalDistance(BoundingBox other) {
|
||||||
|
|
||||||
Rectangle2D bottom;
|
double rect1Top = getMaxY();
|
||||||
Rectangle2D top;
|
double rect1Bottom = getMinY();
|
||||||
if (this.isAbove(other)) {
|
double rect2Top = other.getMaxY();
|
||||||
top = this.getBBox();
|
double rect2Bottom = other.getMinY();
|
||||||
bottom = other.getBBox();
|
|
||||||
} else {
|
|
||||||
bottom = this.getBBox();
|
|
||||||
top = other.getBBox();
|
|
||||||
}
|
|
||||||
|
|
||||||
return Math.max(0, bottom.getMinY() - top.getMaxY());
|
if (rect1Bottom > rect2Top || rect2Bottom > rect1Top) {
|
||||||
|
return Math.max(rect2Bottom - rect1Top, rect1Bottom - rect2Top);
|
||||||
|
} else {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -99,4 +99,70 @@ public abstract class TextBoundingBox extends BoundingBox {
|
|||||||
return this.bBoxDirAdj.getCenterX();
|
return this.bBoxDirAdj.getCenterX();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public double horizontalDistanceDirAdj(TextBoundingBox other) {
|
||||||
|
|
||||||
|
double rect1Right = getMaxXDirAdj();
|
||||||
|
double rect1Left = getXDirAdj();
|
||||||
|
double rect2Right = other.getMaxXDirAdj();
|
||||||
|
double rect2Left = other.getXDirAdj();
|
||||||
|
|
||||||
|
if (rect1Left > rect2Right || rect2Left > rect1Right) {
|
||||||
|
return Math.max(rect2Left - rect1Right, rect1Left - rect2Right);
|
||||||
|
} else {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public double verticalDistanceDirAdj(TextBoundingBox other) {
|
||||||
|
|
||||||
|
double rect1Top = getMaxYDirAdj();
|
||||||
|
double rect1Bottom = getYDirAdj();
|
||||||
|
double rect2Top = other.getMaxYDirAdj();
|
||||||
|
double rect2Bottom = other.getYDirAdj();
|
||||||
|
|
||||||
|
if (rect1Bottom > rect2Top || rect2Bottom > rect1Top) {
|
||||||
|
return Math.max(rect2Bottom - rect1Top, rect1Bottom - rect2Top);
|
||||||
|
} else {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean intersectsDirAdj(TextBoundingBox other) {
|
||||||
|
|
||||||
|
return this.intersectsXDirAdj(other) && this.intersectsYDirAdj(other);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean intersectsDirAdj(TextBoundingBox other, float yThreshold, float xThreshold) {
|
||||||
|
|
||||||
|
return this.intersectsXDirAdj(other, xThreshold) && this.intersectsYDirAdj(other, yThreshold);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean intersectsXDirAdj(TextBoundingBox other, float threshold) {
|
||||||
|
|
||||||
|
return this.getXDirAdj() - threshold <= other.getMaxXDirAdj() && this.getMaxXDirAdj() + threshold >= other.getXDirAdj();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean intersectsXDirAdj(TextBoundingBox other) {
|
||||||
|
|
||||||
|
return this.getXDirAdj() <= other.getMaxXDirAdj() && this.getMaxXDirAdj() >= other.getXDirAdj();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean intersectsYDirAdj(TextBoundingBox other) {
|
||||||
|
|
||||||
|
return this.getYDirAdj() <= other.getMaxYDirAdj() && this.getMaxYDirAdj() >= other.getYDirAdj();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean intersectsYDirAdj(TextBoundingBox other, float threshold) {
|
||||||
|
|
||||||
|
return this.getYDirAdj() - threshold <= other.getMaxYDirAdj() && this.getMaxYDirAdj() + threshold >= other.getYDirAdj();
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -28,4 +28,10 @@ public class UnionFind<T> extends org.jgrapht.alg.util.UnionFind<T> {
|
|||||||
return setRep.values();
|
return setRep.values();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public Collection<T> getElements() {
|
||||||
|
|
||||||
|
return getParentMap().keySet();
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -44,7 +44,7 @@ public class FloatFrequencyCounter {
|
|||||||
|
|
||||||
public Double getMostPopular() {
|
public Double getMostPopular() {
|
||||||
|
|
||||||
if (changed) {
|
if (changed || mostPopularCache == null) {
|
||||||
Map.Entry<Double, Integer> mostPopular = null;
|
Map.Entry<Double, Integer> mostPopular = null;
|
||||||
for (Map.Entry<Double, Integer> entry : countPerValue.entrySet()) {
|
for (Map.Entry<Double, Integer> entry : countPerValue.entrySet()) {
|
||||||
if (mostPopular == null || entry.getValue() >= mostPopular.getValue()) {
|
if (mostPopular == null || entry.getValue() >= mostPopular.getValue()) {
|
||||||
|
|||||||
@ -15,6 +15,7 @@ public enum PageBlockType {
|
|||||||
PARAGRAPH_ITALIC,
|
PARAGRAPH_ITALIC,
|
||||||
PARAGRAPH_UNKNOWN,
|
PARAGRAPH_UNKNOWN,
|
||||||
OTHER,
|
OTHER,
|
||||||
|
TABLE_OF_CONTENTS_ITEM,
|
||||||
TABLE;
|
TABLE;
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -98,10 +98,10 @@ public class TextRange implements Comparable<TextRange> {
|
|||||||
public List<TextRange> split(List<Integer> splitIndices) {
|
public List<TextRange> split(List<Integer> splitIndices) {
|
||||||
|
|
||||||
if (splitIndices.stream()
|
if (splitIndices.stream()
|
||||||
.anyMatch(idx -> !this.containsExclusive(idx))) {
|
.anyMatch(idx -> !this.contains(idx))) {
|
||||||
throw new IndexOutOfBoundsException(format("%s splitting indices are out of range for %s",
|
throw new IndexOutOfBoundsException(format("%s splitting indices are out of range for %s",
|
||||||
splitIndices.stream()
|
splitIndices.stream()
|
||||||
.filter(idx -> !this.containsExclusive(idx))
|
.filter(idx -> !this.contains(idx))
|
||||||
.toList(),
|
.toList(),
|
||||||
this));
|
this));
|
||||||
}
|
}
|
||||||
@ -116,7 +116,9 @@ public class TextRange implements Comparable<TextRange> {
|
|||||||
splitBoundaries.add(new TextRange(previousIndex, splitIndex));
|
splitBoundaries.add(new TextRange(previousIndex, splitIndex));
|
||||||
previousIndex = splitIndex;
|
previousIndex = splitIndex;
|
||||||
}
|
}
|
||||||
splitBoundaries.add(new TextRange(previousIndex, end));
|
if (previousIndex != end) {
|
||||||
|
splitBoundaries.add(new TextRange(previousIndex, end));
|
||||||
|
}
|
||||||
return splitBoundaries;
|
return splitBoundaries;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -0,0 +1,21 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.model.text;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
|
||||||
|
|
||||||
|
import lombok.AccessLevel;
|
||||||
|
import lombok.Getter;
|
||||||
|
import lombok.NoArgsConstructor;
|
||||||
|
import lombok.experimental.FieldDefaults;
|
||||||
|
|
||||||
|
@Getter
|
||||||
|
@NoArgsConstructor
|
||||||
|
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||||
|
public class FrequencyCounters {
|
||||||
|
|
||||||
|
FloatFrequencyCounter lineHeightFrequencyCounter = new FloatFrequencyCounter();
|
||||||
|
FloatFrequencyCounter fontSizeFrequencyCounter = new FloatFrequencyCounter();
|
||||||
|
FloatFrequencyCounter spaceFrequencyCounter = new FloatFrequencyCounter();
|
||||||
|
StringFrequencyCounter fontFrequencyCounter = new StringFrequencyCounter();
|
||||||
|
StringFrequencyCounter styleFrequencyCounter = new StringFrequencyCounter();
|
||||||
|
|
||||||
|
}
|
||||||
@ -7,6 +7,8 @@ import org.apache.pdfbox.text.TextPosition;
|
|||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.TextBoundingBox;
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.TextBoundingBox;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.FastAtan2;
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.FastAtan2;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
|
||||||
|
|
||||||
import lombok.AccessLevel;
|
import lombok.AccessLevel;
|
||||||
import lombok.AllArgsConstructor;
|
import lombok.AllArgsConstructor;
|
||||||
|
|||||||
@ -9,10 +9,14 @@ public class StringFrequencyCounter {
|
|||||||
|
|
||||||
@Getter
|
@Getter
|
||||||
private final Map<String, Integer> countPerValue = new HashMap<>();
|
private final Map<String, Integer> countPerValue = new HashMap<>();
|
||||||
|
boolean changed;
|
||||||
|
String mostPopularCache;
|
||||||
|
|
||||||
|
|
||||||
public void add(String value) {
|
public void add(String value) {
|
||||||
|
|
||||||
|
changed = true;
|
||||||
|
|
||||||
if (!countPerValue.containsKey(value)) {
|
if (!countPerValue.containsKey(value)) {
|
||||||
countPerValue.put(value, 1);
|
countPerValue.put(value, 1);
|
||||||
} else {
|
} else {
|
||||||
@ -23,6 +27,8 @@ public class StringFrequencyCounter {
|
|||||||
|
|
||||||
public void addAll(Map<String, Integer> otherCounter) {
|
public void addAll(Map<String, Integer> otherCounter) {
|
||||||
|
|
||||||
|
changed = true;
|
||||||
|
|
||||||
for (Map.Entry<String, Integer> entry : otherCounter.entrySet()) {
|
for (Map.Entry<String, Integer> entry : otherCounter.entrySet()) {
|
||||||
if (countPerValue.containsKey(entry.getKey())) {
|
if (countPerValue.containsKey(entry.getKey())) {
|
||||||
countPerValue.put(entry.getKey(), countPerValue.get(entry.getKey()) + entry.getValue());
|
countPerValue.put(entry.getKey(), countPerValue.get(entry.getKey()) + entry.getValue());
|
||||||
@ -35,13 +41,18 @@ public class StringFrequencyCounter {
|
|||||||
|
|
||||||
public String getMostPopular() {
|
public String getMostPopular() {
|
||||||
|
|
||||||
Map.Entry<String, Integer> mostPopular = null;
|
if (changed || mostPopularCache == null) {
|
||||||
for (Map.Entry<String, Integer> entry : countPerValue.entrySet()) {
|
Map.Entry<String, Integer> mostPopular = null;
|
||||||
if (mostPopular == null || entry.getValue() > mostPopular.getValue()) {
|
for (Map.Entry<String, Integer> entry : countPerValue.entrySet()) {
|
||||||
mostPopular = entry;
|
if (mostPopular == null || entry.getValue() >= mostPopular.getValue()) {
|
||||||
|
mostPopular = entry;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
mostPopularCache = mostPopular != null ? mostPopular.getKey() : null;
|
||||||
|
changed = false;
|
||||||
}
|
}
|
||||||
return mostPopular != null ? mostPopular.getKey() : null;
|
|
||||||
|
return mostPopularCache;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -0,0 +1,7 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.model.text;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||||
|
|
||||||
|
public record TextBlockOnPage(ClassificationPage page, TextPageBlock textBlock) {
|
||||||
|
|
||||||
|
}
|
||||||
@ -7,7 +7,6 @@ import java.util.List;
|
|||||||
|
|
||||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities;
|
||||||
@ -27,19 +26,11 @@ public class TextPageBlock extends AbstractPageBlock {
|
|||||||
|
|
||||||
@Builder.Default
|
@Builder.Default
|
||||||
private List<TextPositionSequence> sequences = new ArrayList<>();
|
private List<TextPositionSequence> sequences = new ArrayList<>();
|
||||||
|
@Builder.Default
|
||||||
|
private FrequencyCounters frequencyCounters = new FrequencyCounters();
|
||||||
|
|
||||||
private Rectangle2D bBoxDirAdj;
|
private Rectangle2D bBoxDirAdj;
|
||||||
|
|
||||||
private String mostPopularWordFont;
|
|
||||||
|
|
||||||
private String mostPopularWordStyle;
|
|
||||||
|
|
||||||
private double mostPopularWordFontSize;
|
|
||||||
|
|
||||||
private double mostPopularWordHeight;
|
|
||||||
|
|
||||||
private double mostPopularWordSpaceWidth;
|
|
||||||
|
|
||||||
private boolean underlined;
|
private boolean underlined;
|
||||||
|
|
||||||
private double highestFontSize;
|
private double highestFontSize;
|
||||||
@ -55,8 +46,10 @@ public class TextPageBlock extends AbstractPageBlock {
|
|||||||
public TextPageBlock(List<TextPositionSequence> sequences) {
|
public TextPageBlock(List<TextPositionSequence> sequences) {
|
||||||
|
|
||||||
this.sequences = new ArrayList<>(sequences);
|
this.sequences = new ArrayList<>(sequences);
|
||||||
|
this.frequencyCounters = new FrequencyCounters();
|
||||||
|
|
||||||
if (!sequences.isEmpty()) {
|
if (!sequences.isEmpty()) {
|
||||||
calculateFrequencyCounters();
|
addToFrequencyCounters(sequences);
|
||||||
}
|
}
|
||||||
calculateBBox();
|
calculateBBox();
|
||||||
}
|
}
|
||||||
@ -118,32 +111,18 @@ public class TextPageBlock extends AbstractPageBlock {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private void calculateFrequencyCounters() {
|
private void addToFrequencyCounters(List<TextPositionSequence> sequences) {
|
||||||
|
|
||||||
FloatFrequencyCounter lineHeightFrequencyCounter = new FloatFrequencyCounter();
|
|
||||||
FloatFrequencyCounter fontSizeFrequencyCounter = new FloatFrequencyCounter();
|
|
||||||
FloatFrequencyCounter spaceFrequencyCounter = new FloatFrequencyCounter();
|
|
||||||
StringFrequencyCounter fontFrequencyCounter = new StringFrequencyCounter();
|
|
||||||
StringFrequencyCounter styleFrequencyCounter = new StringFrequencyCounter();
|
|
||||||
|
|
||||||
for (TextPositionSequence wordBlock : sequences) {
|
for (TextPositionSequence wordBlock : sequences) {
|
||||||
|
|
||||||
lineHeightFrequencyCounter.add(wordBlock.getTextHeight());
|
frequencyCounters.getLineHeightFrequencyCounter().add(wordBlock.getTextHeight());
|
||||||
fontSizeFrequencyCounter.add(wordBlock.getFontSize());
|
frequencyCounters.getFontSizeFrequencyCounter().add(wordBlock.getFontSize());
|
||||||
spaceFrequencyCounter.add(wordBlock.getSpaceWidth());
|
frequencyCounters.getSpaceFrequencyCounter().add(wordBlock.getSpaceWidth());
|
||||||
fontFrequencyCounter.add(wordBlock.getFont());
|
frequencyCounters.getFontFrequencyCounter().add(wordBlock.getFont());
|
||||||
styleFrequencyCounter.add(wordBlock.getFontStyle());
|
frequencyCounters.getStyleFrequencyCounter().add(wordBlock.getFontStyle());
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
setMostPopularWordFont(fontFrequencyCounter.getMostPopular());
|
setUnderlined(this.sequences.stream()
|
||||||
setMostPopularWordStyle(styleFrequencyCounter.getMostPopular());
|
|
||||||
setMostPopularWordFontSize(fontSizeFrequencyCounter.getMostPopular());
|
|
||||||
setMostPopularWordHeight(lineHeightFrequencyCounter.getMostPopular());
|
|
||||||
setMostPopularWordSpaceWidth(spaceFrequencyCounter.getMostPopular());
|
|
||||||
setHighestFontSize(fontSizeFrequencyCounter.getHighest());
|
|
||||||
|
|
||||||
setUnderlined(sequences.stream()
|
|
||||||
.allMatch(TextPositionSequence::isUnderline));
|
.allMatch(TextPositionSequence::isUnderline));
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -152,7 +131,7 @@ public class TextPageBlock extends AbstractPageBlock {
|
|||||||
|
|
||||||
TextPageBlock union = this.copy();
|
TextPageBlock union = this.copy();
|
||||||
union.add(r);
|
union.add(r);
|
||||||
calculateFrequencyCounters();
|
addToFrequencyCounters(List.of(r));
|
||||||
calculateBBox();
|
calculateBBox();
|
||||||
return union;
|
return union;
|
||||||
}
|
}
|
||||||
@ -162,7 +141,7 @@ public class TextPageBlock extends AbstractPageBlock {
|
|||||||
|
|
||||||
TextPageBlock union = this.copy();
|
TextPageBlock union = this.copy();
|
||||||
union.addAll(r.getSequences());
|
union.addAll(r.getSequences());
|
||||||
calculateFrequencyCounters();
|
addToFrequencyCounters(r.getSequences());
|
||||||
calculateBBox();
|
calculateBBox();
|
||||||
return union;
|
return union;
|
||||||
}
|
}
|
||||||
@ -172,7 +151,7 @@ public class TextPageBlock extends AbstractPageBlock {
|
|||||||
|
|
||||||
changed = true;
|
changed = true;
|
||||||
sequences.addAll(textPageBlock.getSequences());
|
sequences.addAll(textPageBlock.getSequences());
|
||||||
calculateFrequencyCounters();
|
addToFrequencyCounters(textPageBlock.getSequences());
|
||||||
calculateBBox();
|
calculateBBox();
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -181,7 +160,7 @@ public class TextPageBlock extends AbstractPageBlock {
|
|||||||
|
|
||||||
changed = true;
|
changed = true;
|
||||||
sequences.add(textPositionSequence);
|
sequences.add(textPositionSequence);
|
||||||
calculateFrequencyCounters();
|
addToFrequencyCounters(List.of(textPositionSequence));
|
||||||
calculateBBox();
|
calculateBBox();
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -190,7 +169,7 @@ public class TextPageBlock extends AbstractPageBlock {
|
|||||||
|
|
||||||
changed = true;
|
changed = true;
|
||||||
sequences.addAll(textPositionSequences);
|
sequences.addAll(textPositionSequences);
|
||||||
calculateFrequencyCounters();
|
addToFrequencyCounters(textPositionSequences);
|
||||||
calculateBBox();
|
calculateBBox();
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -253,6 +232,36 @@ public class TextPageBlock extends AbstractPageBlock {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public String getMostPopularWordFont() {
|
||||||
|
|
||||||
|
return frequencyCounters.getFontFrequencyCounter().getMostPopular();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public String getMostPopularWordStyle() {
|
||||||
|
|
||||||
|
return frequencyCounters.getStyleFrequencyCounter().getMostPopular();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public double getMostPopularWordFontSize() {
|
||||||
|
|
||||||
|
return frequencyCounters.getFontSizeFrequencyCounter().getMostPopular();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public double getMostPopularWordHeight() {
|
||||||
|
|
||||||
|
return frequencyCounters.getLineHeightFrequencyCounter().getMostPopular();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public double getMostPopularWordSpaceWidth() {
|
||||||
|
|
||||||
|
return frequencyCounters.getSpaceFrequencyCounter().getMostPopular();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public boolean isEmpty() {
|
public boolean isEmpty() {
|
||||||
|
|
||||||
|
|||||||
@ -2,10 +2,13 @@ package com.knecon.fforesight.service.layoutparser.processor.model.text;
|
|||||||
|
|
||||||
import static com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition.HEIGHT_PADDING;
|
import static com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition.HEIGHT_PADDING;
|
||||||
|
|
||||||
|
import java.awt.geom.AffineTransform;
|
||||||
|
import java.awt.geom.Rectangle2D;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Locale;
|
import java.util.Locale;
|
||||||
import java.util.Objects;
|
import java.util.Objects;
|
||||||
|
import java.util.regex.Pattern;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import org.apache.pdfbox.text.TextPosition;
|
import org.apache.pdfbox.text.TextPosition;
|
||||||
@ -30,6 +33,7 @@ public class TextPositionSequence extends TextBoundingBox implements CharSequenc
|
|||||||
public static final String BOLD_ITALIC = "bold, italic";
|
public static final String BOLD_ITALIC = "bold, italic";
|
||||||
public static final String BOLD = "bold";
|
public static final String BOLD = "bold";
|
||||||
public static final String ITALIC = "italic";
|
public static final String ITALIC = "italic";
|
||||||
|
public static final Pattern FONT_CLEANER = Pattern.compile(",bold|,italic");
|
||||||
|
|
||||||
private int page;
|
private int page;
|
||||||
|
|
||||||
@ -154,7 +158,8 @@ public class TextPositionSequence extends TextBoundingBox implements CharSequenc
|
|||||||
if (textPositions.get(0).getFontName() == null) {
|
if (textPositions.get(0).getFontName() == null) {
|
||||||
return "none";
|
return "none";
|
||||||
}
|
}
|
||||||
return textPositions.get(0).getFontName().toLowerCase(Locale.ROOT).replaceAll(",bold", "").replaceAll(",italic", "");
|
|
||||||
|
return FONT_CLEANER.matcher(textPositions.get(0).getFontName().toLowerCase(Locale.ROOT)).replaceAll("");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -238,5 +243,15 @@ public class TextPositionSequence extends TextBoundingBox implements CharSequenc
|
|||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void transform(AffineTransform rotateInstance) {
|
||||||
|
|
||||||
|
for (RedTextPosition textPosition : getTextPositions()) {
|
||||||
|
Rectangle2D exactDirAdjCoordinates = rotateInstance.createTransformedShape(textPosition.getBBoxDirAdj()).getBounds2D();
|
||||||
|
textPosition.setBBoxDirAdj(exactDirAdjCoordinates);
|
||||||
|
}
|
||||||
|
calculateBBoxAndHashcode();
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -0,0 +1,34 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.model.text;
|
||||||
|
|
||||||
|
import java.util.Comparator;
|
||||||
|
import java.util.HashMap;
|
||||||
|
|
||||||
|
public class TextPositionSequenceComparator implements Comparator<TextPositionSequence> {
|
||||||
|
|
||||||
|
private HashMap<TextPositionSequence, TextBlockOnPage> lookup;
|
||||||
|
|
||||||
|
|
||||||
|
public TextPositionSequenceComparator(HashMap<TextPositionSequence, TextBlockOnPage> lookup) {
|
||||||
|
|
||||||
|
this.lookup = lookup;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int compare(TextPositionSequence number1, TextPositionSequence number2) {
|
||||||
|
|
||||||
|
int page1 = lookup.get(number1).page().getPageNumber();
|
||||||
|
int page2 = lookup.get(number2).page().getPageNumber();
|
||||||
|
|
||||||
|
if (page1 != page2) {
|
||||||
|
return Integer.compare(page1, page2);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (number1.getY() != number2.getY()) {
|
||||||
|
return Double.compare(number1.getY(), number2.getY());
|
||||||
|
}
|
||||||
|
|
||||||
|
return Integer.compare(Integer.parseInt(number1.toString()), Integer.parseInt(number2.toString()));
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -60,24 +60,18 @@ public class ClarifyndClassificationService {
|
|||||||
headlineClassificationService.setLastHeadlineFromOutline(textBlock);
|
headlineClassificationService.setLastHeadlineFromOutline(textBlock);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
if (textBlock.getClassification() != null && (textBlock.getClassification().equals(PageBlockType.HEADER)
|
||||||
|
|| textBlock.getClassification().equals(PageBlockType.FOOTER)
|
||||||
|
|| textBlock.getClassification().equals(PageBlockType.TABLE_OF_CONTENTS_ITEM))) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
if (document.getFontSizeCounter().getMostPopular() == null) {
|
if (document.getFontSizeCounter().getMostPopular() == null) {
|
||||||
textBlock.setClassification(PageBlockType.PARAGRAPH);
|
textBlock.setClassification(PageBlockType.PARAGRAPH);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER)
|
if (page.getPageNumber() == 1 //
|
||||||
|| PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null
|
&& (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter().getMostPopular()) > 2.5
|
||||||
|| textBlock.getHighestFontSize() <= document.getFontSizeCounter()
|
&& textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks().size() == 1)) {
|
||||||
.getMostPopular())) {
|
|
||||||
textBlock.setClassification(PageBlockType.PARAGRAPH);
|
|
||||||
|
|
||||||
} else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER)
|
|
||||||
|| PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null
|
|
||||||
|| textBlock.getHighestFontSize() <= document.getFontSizeCounter()
|
|
||||||
.getMostPopular())) {
|
|
||||||
textBlock.setClassification(PageBlockType.PARAGRAPH);
|
|
||||||
} else if (page.getPageNumber() == 1 //
|
|
||||||
&& (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter().getMostPopular()) > 2.5
|
|
||||||
&& textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks().size() == 1)) {
|
|
||||||
if (!Pattern.matches("[0-9]+", textBlock.toString())) {
|
if (!Pattern.matches("[0-9]+", textBlock.toString())) {
|
||||||
textBlock.setClassification(PageBlockType.TITLE);
|
textBlock.setClassification(PageBlockType.TITLE);
|
||||||
}
|
}
|
||||||
|
|||||||
@ -0,0 +1,62 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.services.classification;
|
||||||
|
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.BodyTextFrameService;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocuMineBlockificationService;
|
||||||
|
|
||||||
|
import lombok.AccessLevel;
|
||||||
|
import lombok.RequiredArgsConstructor;
|
||||||
|
import lombok.experimental.FieldDefaults;
|
||||||
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
|
||||||
|
@Slf4j
|
||||||
|
@Service
|
||||||
|
@RequiredArgsConstructor
|
||||||
|
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||||
|
public class ClassificationService {
|
||||||
|
|
||||||
|
DocuMineBlockificationService docuMineBlockificationService;
|
||||||
|
BodyTextFrameService bodyTextFrameService;
|
||||||
|
TableOfContentsClassificationService tableOfContentsClassificationService;
|
||||||
|
RedactManagerClassificationService redactManagerClassificationService;
|
||||||
|
ClarifyndClassificationService clarifyndClassificationService;
|
||||||
|
DocuMineClassificationService docuMineClassificationService;
|
||||||
|
HeaderFooterClassificationService headerFooterClassificationService;
|
||||||
|
|
||||||
|
|
||||||
|
public void classify(ClassificationDocument document, LayoutParsingType layoutParsingType, Map<String, String> identifier) {
|
||||||
|
|
||||||
|
log.info("Calculating BodyTextFrame for {}", identifier);
|
||||||
|
bodyTextFrameService.setBodyTextFrames(document, layoutParsingType);
|
||||||
|
for (ClassificationPage page : document.getPages()) {
|
||||||
|
document.getLayoutDebugLayer().addCleanRulingVisualization(page.getCleanRulings(), page.getPageNumber());
|
||||||
|
}
|
||||||
|
log.info("Classify TextBlocks for {}", identifier);
|
||||||
|
|
||||||
|
headerFooterClassificationService.classifyHeadersAndFooters(document);
|
||||||
|
|
||||||
|
tableOfContentsClassificationService.classifyTableOfContents(document);
|
||||||
|
|
||||||
|
switch (layoutParsingType) {
|
||||||
|
case REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_OLD, CLARIFYND_PARAGRAPH_DEBUG, REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH ->
|
||||||
|
redactManagerClassificationService.classifyDocument(document);
|
||||||
|
case DOCUMINE_OLD, DOCUMINE -> docuMineClassificationService.classifyDocument(document);
|
||||||
|
case CLARIFYND -> clarifyndClassificationService.classifyDocument(document);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (layoutParsingType.equals(LayoutParsingType.DOCUMINE_OLD)) {
|
||||||
|
for (ClassificationPage page : document.getPages()) {
|
||||||
|
docuMineBlockificationService.mergeblocks(page, page.getCleanRulings().withoutTextRulings(), 0, 10);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -3,6 +3,7 @@ package com.knecon.fforesight.service.layoutparser.processor.services.classifica
|
|||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Locale;
|
import java.util.Locale;
|
||||||
|
import java.util.Set;
|
||||||
import java.util.regex.Matcher;
|
import java.util.regex.Matcher;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
@ -29,10 +30,12 @@ public class DocuMineClassificationService {
|
|||||||
private static final Pattern HEADLINE_WITH_SINGLE_IDENTIFER_PATTERN = Pattern.compile("^([0-9]\\.)\\s[a-z][0-9a-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE);
|
private static final Pattern HEADLINE_WITH_SINGLE_IDENTIFER_PATTERN = Pattern.compile("^([0-9]\\.)\\s[a-z][0-9a-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE);
|
||||||
private static final Pattern AT_LEAST_3_CHARS_PATTERN = Pattern.compile("\\p{L}{3,}", Pattern.CASE_INSENSITIVE);
|
private static final Pattern AT_LEAST_3_CHARS_PATTERN = Pattern.compile("\\p{L}{3,}", Pattern.CASE_INSENSITIVE);
|
||||||
private static final Pattern HEADLINE_PATTERN_WITH_SLASHES = Pattern.compile("^(\\d{1,1}\\.){1,3}\\d{1,2}\\.?\\s[a-z]{1,2}\\/[a-z]{1,2}.*");
|
private static final Pattern HEADLINE_PATTERN_WITH_SLASHES = Pattern.compile("^(\\d{1,1}\\.){1,3}\\d{1,2}\\.?\\s[a-z]{1,2}\\/[a-z]{1,2}.*");
|
||||||
private static final Pattern AMOUNT_PATTERN = Pattern.compile("^\\s*\\d+(?:\\.\\d+)?\\s*(?:ml|l|g|kg|mg|cm|mm|km|m|lb|oz|ppm|%|f)\\b", Pattern.CASE_INSENSITIVE);
|
public static final Pattern AMOUNT_PATTERN = Pattern.compile("^\\s*\\d+(?:\\.\\d+)?\\s*(?:ml|l|g|kg|mg|cm|cm2|cm3|mm|mm2|mm3|km|km2|m|m2|m3|lb|oz|ppm|dpm|days|weeks|months|%|f)\\b",
|
||||||
private static final Pattern TABLE_OR_FIGURE_PATTER = Pattern.compile(
|
Pattern.CASE_INSENSITIVE);
|
||||||
|
private static final Pattern TABLE_OR_FIGURE_PATTERN = Pattern.compile(
|
||||||
"^\\s*(?:table|continued\\s+table|appendix|figure)\\s+(?:[xvi]+|[a-z0-9]{1,3}(?:\\.[0-9]{1,3})*(?:-[0-9]{1,3})?)\\b",
|
"^\\s*(?:table|continued\\s+table|appendix|figure)\\s+(?:[xvi]+|[a-z0-9]{1,3}(?:\\.[0-9]{1,3})*(?:-[0-9]{1,3})?)\\b",
|
||||||
Pattern.CASE_INSENSITIVE);
|
Pattern.CASE_INSENSITIVE);
|
||||||
|
private static final Pattern ALPHANUMERIC = Pattern.compile("[a-zA-Z0-9]");
|
||||||
|
|
||||||
public static final int SEPARATION_THRESHOLD = 10; // if the min distance between a textblock and all its surrounding blocks, the regexes can be more lenient.
|
public static final int SEPARATION_THRESHOLD = 10; // if the min distance between a textblock and all its surrounding blocks, the regexes can be more lenient.
|
||||||
public static final int SURROUNDING_BLOCKS_RADIUS = 3; // number of surrounding blocks before and after the current textblock to be tested
|
public static final int SURROUNDING_BLOCKS_RADIUS = 3; // number of surrounding blocks before and after the current textblock to be tested
|
||||||
@ -78,6 +81,9 @@ public class DocuMineClassificationService {
|
|||||||
if (i == originalIndex) {
|
if (i == originalIndex) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
if (textBlocks.get(i).getText().length() <= 1) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
surroundingBlocks.add(textBlocks.get(i));
|
surroundingBlocks.add(textBlocks.get(i));
|
||||||
}
|
}
|
||||||
return surroundingBlocks;
|
return surroundingBlocks;
|
||||||
@ -98,10 +104,9 @@ public class DocuMineClassificationService {
|
|||||||
Matcher atLeast3Matcher = AT_LEAST_3_CHARS_PATTERN.matcher(textBlock.toString());
|
Matcher atLeast3Matcher = AT_LEAST_3_CHARS_PATTERN.matcher(textBlock.toString());
|
||||||
Matcher headlineWithSlashesMatcher = HEADLINE_PATTERN_WITH_SLASHES.matcher(textBlock.toString());
|
Matcher headlineWithSlashesMatcher = HEADLINE_PATTERN_WITH_SLASHES.matcher(textBlock.toString());
|
||||||
Matcher amountMatcher = AMOUNT_PATTERN.matcher(textBlock.toString());
|
Matcher amountMatcher = AMOUNT_PATTERN.matcher(textBlock.toString());
|
||||||
Matcher tableOrFigureMatcher = TABLE_OR_FIGURE_PATTER.matcher(textBlock.toString());
|
Matcher tableOrFigureMatcher = TABLE_OR_FIGURE_PATTERN.matcher(textBlock.toString());
|
||||||
Matcher headlineWithSingleIdentifierMatcher = HEADLINE_WITH_SINGLE_IDENTIFER_PATTERN.matcher(textBlock.toString());
|
Matcher headlineWithSingleIdentifierMatcher = HEADLINE_WITH_SINGLE_IDENTIFER_PATTERN.matcher(textBlock.toString());
|
||||||
boolean isAtLeast3Characters = atLeast3Matcher.reset().find();
|
boolean isAtLeast3Characters = atLeast3Matcher.reset().find();
|
||||||
boolean isTocItem = textBlock.getText().contains("..............");
|
|
||||||
boolean headlineWithSlashesMatches = headlineWithSlashesMatcher.reset().matches();
|
boolean headlineWithSlashesMatches = headlineWithSlashesMatcher.reset().matches();
|
||||||
boolean isAmount = amountMatcher.reset().find();
|
boolean isAmount = amountMatcher.reset().find();
|
||||||
int charCount = countChars(textBlock);
|
int charCount = countChars(textBlock);
|
||||||
@ -112,35 +117,22 @@ public class DocuMineClassificationService {
|
|||||||
headlineClassificationService.setLastHeadlineFromOutline(textBlock);
|
headlineClassificationService.setLastHeadlineFromOutline(textBlock);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
if (textBlock.getClassification() != null && (textBlock.getClassification().equals(PageBlockType.HEADER)
|
||||||
|
|| textBlock.getClassification().equals(PageBlockType.FOOTER)
|
||||||
|
|| textBlock.getClassification().equals(PageBlockType.TABLE_OF_CONTENTS_ITEM))) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
if (document.getFontSizeCounter().getMostPopular() == null) {
|
if (document.getFontSizeCounter().getMostPopular() == null) {
|
||||||
textBlock.setClassification(PageBlockType.PARAGRAPH);
|
textBlock.setClassification(PageBlockType.PARAGRAPH);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) //
|
if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter().getMostPopular()) > 2.5
|
||||||
|| (PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) //
|
&& textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks().size() == 1)) {
|
||||||
&& (document.getFontSizeCounter().getMostPopular() == null //
|
|
||||||
|| textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular()))) {
|
|
||||||
textBlock.setClassification(PageBlockType.HEADER);
|
|
||||||
|
|
||||||
} else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER)
|
|
||||||
|| (PositionUtils.isUnderBodyTextFrame(bodyTextFrame,
|
|
||||||
textBlock,
|
|
||||||
page.getRotation())
|
|
||||||
&& (document.getFontSizeCounter().getMostPopular()
|
|
||||||
== null
|
|
||||||
|| textBlock.getHighestFontSize()
|
|
||||||
<= document.getFontSizeCounter()
|
|
||||||
.getMostPopular()))
|
|
||||||
|| HeaderFooterDetection.isLikelyFooter(textBlock, document, page)) {
|
|
||||||
textBlock.setClassification(PageBlockType.FOOTER);
|
|
||||||
} else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter().getMostPopular()) > 2.5
|
|
||||||
&& textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks().size() == 1)) {
|
|
||||||
if (!Pattern.matches("[0-9]+", textBlock.toString())) {
|
if (!Pattern.matches("[0-9]+", textBlock.toString())) {
|
||||||
textBlock.setClassification(PageBlockType.TITLE);
|
textBlock.setClassification(PageBlockType.TITLE);
|
||||||
}
|
}
|
||||||
} else if (textBlock.getText().length() > 5
|
} else if (textBlock.getText().length() > 5
|
||||||
&& (textBlock.getMostPopularWordHeight() > document.getTextHeightCounter().getMostPopular()
|
&& greaterOrEqualFontThanDocumentAverage(textBlock, document)
|
||||||
|| textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular())
|
|
||||||
&& PositionUtils.getApproxLineCount(textBlock) < 5.9
|
&& PositionUtils.getApproxLineCount(textBlock) < 5.9
|
||||||
&& ((textBlock.getMostPopularWordStyle().contains("bold") || textBlock.isUnderlined())//
|
&& ((textBlock.getMostPopularWordStyle().contains("bold") || textBlock.isUnderlined())//
|
||||||
&& Character.isDigit(textBlock.toString().charAt(0)) //
|
&& Character.isDigit(textBlock.toString().charAt(0)) //
|
||||||
@ -152,18 +144,19 @@ public class DocuMineClassificationService {
|
|||||||
|| textBlock.toString().startsWith("TABLE"))
|
|| textBlock.toString().startsWith("TABLE"))
|
||||||
&& !textBlock.toString().endsWith(":")
|
&& !textBlock.toString().endsWith(":")
|
||||||
&& isAtLeast3Characters
|
&& isAtLeast3Characters
|
||||||
&& !isTocItem
|
|
||||||
&& !isAmount
|
&& !isAmount
|
||||||
&& enoughChars) {
|
&& enoughChars) {
|
||||||
|
|
||||||
setAsHeadline(headlineClassificationService, textBlock, document, headlineFontSizes);
|
setAsHeadline(headlineClassificationService, textBlock, document, headlineFontSizes);
|
||||||
} else if (isAllCaps(textBlock)
|
} else if (isAllCaps(textBlock)
|
||||||
|
&& ALPHANUMERIC.matcher(Character.toString(textBlock.getText().charAt(0))).matches()
|
||||||
|
&& hasSeparation(textBlock, surroundingBlocks)
|
||||||
&& textBlock.getText().length() > 5
|
&& textBlock.getText().length() > 5
|
||||||
&& isAtLeast3Characters
|
&& isAtLeast3Characters
|
||||||
&& !isAmount
|
&& !isAmount
|
||||||
&& enoughChars
|
&& enoughChars
|
||||||
&& !textBlock.toString().contains(":")
|
&& !textBlock.toString().contains(":")
|
||||||
&& !textBlock.toString().startsWith("(")
|
&& !textBlock.toString().endsWith(".")
|
||||||
&& PositionUtils.getApproxLineCount(textBlock) < 2.9) {
|
&& PositionUtils.getApproxLineCount(textBlock) < 2.9) {
|
||||||
|
|
||||||
setAsHeadline(headlineClassificationService, textBlock, document, headlineFontSizes);
|
setAsHeadline(headlineClassificationService, textBlock, document, headlineFontSizes);
|
||||||
@ -171,16 +164,14 @@ public class DocuMineClassificationService {
|
|||||||
&& PositionUtils.getApproxLineCount(textBlock) < 2.9
|
&& PositionUtils.getApproxLineCount(textBlock) < 2.9
|
||||||
&& isAtLeast3Characters
|
&& isAtLeast3Characters
|
||||||
&& !headlineWithSlashesMatches
|
&& !headlineWithSlashesMatches
|
||||||
&& !isAmount
|
&& !isAmount) {
|
||||||
&& !isTocItem) {
|
|
||||||
|
|
||||||
setAsHeadline(headlineClassificationService, textBlock, document, headlineFontSizes);
|
setAsHeadline(headlineClassificationService, textBlock, document, headlineFontSizes);
|
||||||
} else if (!isTocItem
|
} else if (hasSeparation(textBlock, surroundingBlocks)//
|
||||||
&& hasSeparation(textBlock, surroundingBlocks)
|
&& greaterOrEqualFontThanPageAverage(textBlock, page)//
|
||||||
&& greaterOrEqualThanFontPageAverage(textBlock, page)
|
&& PositionUtils.getApproxLineCount(textBlock) < 2.9//
|
||||||
&& PositionUtils.getApproxLineCount(textBlock) < 2.9
|
&& (tableOrFigureMatcher.reset().find() || headlineWithSingleIdentifierMatcher.reset().find())//
|
||||||
&& (tableOrFigureMatcher.reset().find() || headlineWithSingleIdentifierMatcher.reset().find())
|
&& !isAmount//
|
||||||
&& !isAmount
|
|
||||||
&& !headlineWithSlashesMatches) {
|
&& !headlineWithSlashesMatches) {
|
||||||
|
|
||||||
setAsHeadline(headlineClassificationService, textBlock, document, headlineFontSizes);
|
setAsHeadline(headlineClassificationService, textBlock, document, headlineFontSizes);
|
||||||
@ -222,13 +213,20 @@ public class DocuMineClassificationService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private static boolean greaterOrEqualThanFontPageAverage(TextPageBlock textBlock, ClassificationPage page) {
|
private static boolean greaterOrEqualFontThanPageAverage(TextPageBlock textBlock, ClassificationPage page) {
|
||||||
|
|
||||||
return textBlock.getMostPopularWordHeight() >= page.getTextHeightCounter().getMostPopular() //
|
return textBlock.getMostPopularWordHeight() >= page.getTextHeightCounter().getMostPopular() //
|
||||||
|| textBlock.getMostPopularWordFontSize() >= page.getFontSizeCounter().getMostPopular();
|
|| textBlock.getMostPopularWordFontSize() >= page.getFontSizeCounter().getMostPopular();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static boolean greaterOrEqualFontThanDocumentAverage(TextPageBlock textBlock, ClassificationDocument document) {
|
||||||
|
|
||||||
|
return textBlock.getMostPopularWordHeight() > document.getTextHeightCounter().getMostPopular() //
|
||||||
|
|| textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
private static boolean isAllCaps(TextPageBlock textBlock) {
|
private static boolean isAllCaps(TextPageBlock textBlock) {
|
||||||
|
|
||||||
return textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT));
|
return textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT));
|
||||||
@ -246,8 +244,7 @@ public class DocuMineClassificationService {
|
|||||||
|
|
||||||
return surroundingBlocks.stream()
|
return surroundingBlocks.stream()
|
||||||
.mapToDouble(surroundingBlock -> calculateSeparation(textBlock, surroundingBlock))
|
.mapToDouble(surroundingBlock -> calculateSeparation(textBlock, surroundingBlock))
|
||||||
.min()
|
.min().orElse(Double.MAX_VALUE);
|
||||||
.orElse(Double.MAX_VALUE);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -0,0 +1,55 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.services.classification;
|
||||||
|
|
||||||
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
|
||||||
|
|
||||||
|
import lombok.AccessLevel;
|
||||||
|
import lombok.RequiredArgsConstructor;
|
||||||
|
import lombok.experimental.FieldDefaults;
|
||||||
|
|
||||||
|
@Service
|
||||||
|
@RequiredArgsConstructor
|
||||||
|
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||||
|
public class HeaderFooterClassificationService {
|
||||||
|
|
||||||
|
public void classifyHeadersAndFooters(ClassificationDocument document) {
|
||||||
|
|
||||||
|
for (ClassificationPage page : document.getPages()) {
|
||||||
|
for (AbstractPageBlock pageBlock : page.getTextBlocks()) {
|
||||||
|
if (pageBlock instanceof TextPageBlock textBlock) {
|
||||||
|
classifyBlock(document, page, textBlock);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static void classifyBlock(ClassificationDocument document, ClassificationPage page, TextPageBlock textBlock) {
|
||||||
|
|
||||||
|
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER)
|
||||||
|
|| PositionUtils.isOverBodyTextFrame(page.getBodyTextFrame(), textBlock, page.getRotation()) && smallerFontThanDocAverage(document, textBlock)) {
|
||||||
|
|
||||||
|
textBlock.setClassification(PageBlockType.HEADER);
|
||||||
|
|
||||||
|
} else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER)
|
||||||
|
|| PositionUtils.isUnderBodyTextFrame(page.getBodyTextFrame(), textBlock, page.getRotation()) && smallerFontThanDocAverage(document, textBlock)) {
|
||||||
|
|
||||||
|
textBlock.setClassification(PageBlockType.FOOTER);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static boolean smallerFontThanDocAverage(ClassificationDocument document, TextPageBlock textBlock) {
|
||||||
|
|
||||||
|
return document.getFontSizeCounter().getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular();
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -61,6 +61,15 @@ public class RedactManagerClassificationService {
|
|||||||
headlineClassificationService.setLastHeadlineFromOutline(textBlock);
|
headlineClassificationService.setLastHeadlineFromOutline(textBlock);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
if (textBlock.getClassification() != null && (textBlock.getClassification().equals(PageBlockType.HEADER)
|
||||||
|
|| textBlock.getClassification().equals(PageBlockType.FOOTER)
|
||||||
|
|| textBlock.getClassification().equals(PageBlockType.TABLE_OF_CONTENTS_ITEM))) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if (document.getFontSizeCounter().getMostPopular() == null) {
|
||||||
|
textBlock.setClassification(PageBlockType.PARAGRAPH);
|
||||||
|
return;
|
||||||
|
}
|
||||||
if (document.getFontSizeCounter().getMostPopular() == null) {
|
if (document.getFontSizeCounter().getMostPopular() == null) {
|
||||||
textBlock.setClassification(PageBlockType.PARAGRAPH);
|
textBlock.setClassification(PageBlockType.PARAGRAPH);
|
||||||
return;
|
return;
|
||||||
@ -73,21 +82,8 @@ public class RedactManagerClassificationService {
|
|||||||
textBlock.setClassification(PageBlockType.PARAGRAPH);
|
textBlock.setClassification(PageBlockType.PARAGRAPH);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER)
|
if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter().getMostPopular()) > 2.5
|
||||||
|| PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null
|
&& textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks().size() == 1)) {
|
||||||
|| textBlock.getHighestFontSize() <= document.getFontSizeCounter()
|
|
||||||
.getMostPopular())) {
|
|
||||||
|
|
||||||
textBlock.setClassification(PageBlockType.HEADER);
|
|
||||||
|
|
||||||
} else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER)
|
|
||||||
|| PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null
|
|
||||||
|| textBlock.getHighestFontSize() <= document.getFontSizeCounter()
|
|
||||||
.getMostPopular())) {
|
|
||||||
|
|
||||||
textBlock.setClassification(PageBlockType.FOOTER);
|
|
||||||
} else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter().getMostPopular()) > 2.5
|
|
||||||
&& textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks().size() == 1)) {
|
|
||||||
|
|
||||||
if (!Pattern.matches("[0-9]+", textBlock.toString())) {
|
if (!Pattern.matches("[0-9]+", textBlock.toString())) {
|
||||||
textBlock.setClassification(PageBlockType.TITLE);
|
textBlock.setClassification(PageBlockType.TITLE);
|
||||||
|
|||||||
@ -0,0 +1,370 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.services.classification;
|
||||||
|
|
||||||
|
import static com.knecon.fforesight.service.layoutparser.processor.services.classification.DocuMineClassificationService.AMOUNT_PATTERN;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Collection;
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.Comparator;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.LinkedList;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Locale;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.regex.Pattern;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.UnionFind;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextBlockOnPage;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequenceComparator;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutDebugLayer;
|
||||||
|
|
||||||
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
|
||||||
|
@Slf4j
|
||||||
|
@Service
|
||||||
|
public class TableOfContentsClassificationService {
|
||||||
|
|
||||||
|
private static final int MAX_PAGE_COUNT = 10; // maximum length of a toc to avoid runaway classification
|
||||||
|
private static final int SURROUNDING_BLOCKS_RADIUS = 10; // number of blocks to look ahead
|
||||||
|
private static final int MINIMUM_MATCHES = 2; // minimum cluster size
|
||||||
|
public static final int INTERSECTION_TOLERANCE = 2; // equality threshold for x intersection
|
||||||
|
public static final int DENSITY_THRESHOLD_COUNT = 10; // describes the minimum density, at least this many entries per page height are required
|
||||||
|
|
||||||
|
private static final Pattern NUMERIC = Pattern.compile("[0-9]+");
|
||||||
|
|
||||||
|
|
||||||
|
@SuppressWarnings("checkstyle:ModifiedControlVariable")
|
||||||
|
public void classifyTableOfContents(ClassificationDocument document) {
|
||||||
|
|
||||||
|
List<TextBlockOnPage> textBlocks = buildBlocksPerPage(document);
|
||||||
|
|
||||||
|
for (int i = 0; i < textBlocks.size(); i++) {
|
||||||
|
TextBlockOnPage textBlock = textBlocks.get(i);
|
||||||
|
|
||||||
|
if (!isTOCHeadline(textBlock)) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
int offset = identifyTOCItems(i + 1, textBlocks, document);
|
||||||
|
|
||||||
|
if (offset > 1) {
|
||||||
|
textBlock.textBlock().setClassification(PageBlockType.H1);
|
||||||
|
i += offset;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private int identifyTOCItems(int start, List<TextBlockOnPage> textBlocks, ClassificationDocument document) {
|
||||||
|
|
||||||
|
ClassificationPage startPage = textBlocks.get(start).page();
|
||||||
|
List<TextBlockOnPage> initialLookAhead = textBlocks.subList(start, Math.min(start + SURROUNDING_BLOCKS_RADIUS, textBlocks.size()));
|
||||||
|
HashMap<TextPositionSequence, TextBlockOnPage> lookup = new HashMap<>();
|
||||||
|
List<TextPositionSequence> numbers = extractNumbers(initialLookAhead, lookup, document.getPages().size());
|
||||||
|
TocNumberFinder tocNumberFinder = new TocNumberFinder(numbers, lookup);
|
||||||
|
|
||||||
|
int lastCandidate = start;
|
||||||
|
for (int i = start; i < Math.min(lastCandidate + SURROUNDING_BLOCKS_RADIUS, textBlocks.size()); i++) {
|
||||||
|
|
||||||
|
TextBlockOnPage textBlockOnPage = textBlocks.get(i);
|
||||||
|
if (textBlockOnPage.page().getPageNumber() - MAX_PAGE_COUNT > startPage.getPageNumber()) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (textBlockOnPage.textBlock().getClassification() != null //
|
||||||
|
&& textBlockOnPage.textBlock().getClassification().isHeadline() //
|
||||||
|
&& !(textBlockOnPage.textBlock().getText().startsWith("TABLES") //
|
||||||
|
|| textBlockOnPage.textBlock().getText().startsWith("APPENDICES") //
|
||||||
|
|| textBlockOnPage.textBlock().getText().startsWith("FIGURES"))) {
|
||||||
|
log.debug("hit an outline headline, stop immediately.");
|
||||||
|
lastCandidate = i - 1;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
List<TextPositionSequence> numbersFromBlock = extractNumbers(textBlockOnPage, lookup, document.getPages().size());
|
||||||
|
|
||||||
|
List<TextPositionSequence> currentRightmostCluster = tocNumberFinder.getCurrentRightmostCluster();
|
||||||
|
|
||||||
|
if (currentRightmostCluster.size() < MINIMUM_MATCHES) {
|
||||||
|
log.debug("No numbers indicating a table of contents here.");
|
||||||
|
return start;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (anyIntersection(currentRightmostCluster, numbersFromBlock, lookup)) {
|
||||||
|
lastCandidate = i;
|
||||||
|
numbersFromBlock.forEach(tocNumberFinder::add);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
addVisualization(document.getLayoutDebugLayer(), tocNumberFinder, lookup);
|
||||||
|
|
||||||
|
Set<TextBlockOnPage> blocksWithNumberInCluster = tocNumberFinder.getCurrentRightmostCluster()
|
||||||
|
.stream()
|
||||||
|
.map(lookup::get)
|
||||||
|
.collect(Collectors.toSet());
|
||||||
|
|
||||||
|
int lastConfirmed = start;
|
||||||
|
for (int i = start; i < lastCandidate + 1; i++) {
|
||||||
|
TextBlockOnPage textBlockOnPage = textBlocks.get(i);
|
||||||
|
if (blocksWithNumberInCluster.contains(textBlockOnPage)) {
|
||||||
|
lastConfirmed = i;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
textBlocks.subList(start, lastConfirmed + 1)
|
||||||
|
.stream()
|
||||||
|
.filter(block -> (block.textBlock().getClassification() == null || !block.textBlock().getClassification().isHeadline()))
|
||||||
|
.forEach(textBlockOnPage -> textBlockOnPage.textBlock().setClassification(PageBlockType.TABLE_OF_CONTENTS_ITEM));
|
||||||
|
|
||||||
|
return lastCandidate;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static void addVisualization(LayoutDebugLayer layoutDebugLayer, TocNumberFinder tocNumberFinder, Map<TextPositionSequence, TextBlockOnPage> lookup) {
|
||||||
|
|
||||||
|
tocNumberFinder.getCurrentRightmostCluster()
|
||||||
|
.stream()
|
||||||
|
.collect(Collectors.groupingBy(key -> lookup.get(key).page().getPageNumber()))
|
||||||
|
.forEach((pageNumber, number) -> layoutDebugLayer.addTocPages(number, pageNumber));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static boolean anyIntersection(Collection<TextPositionSequence> numbers1,
|
||||||
|
Collection<TextPositionSequence> numbers2,
|
||||||
|
Map<TextPositionSequence, TextBlockOnPage> lookup) {
|
||||||
|
|
||||||
|
return numbers1.stream()
|
||||||
|
.anyMatch(numberFromCluster -> numbers2.stream()
|
||||||
|
.anyMatch(numberFromBlock -> matches(numberFromBlock, numberFromCluster, lookup)));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static List<TextPositionSequence> extractNumbers(List<TextBlockOnPage> textBlocks, Map<TextPositionSequence, TextBlockOnPage> lookup, int numberOfPages) {
|
||||||
|
|
||||||
|
List<TextPositionSequence> blocks = new LinkedList<>();
|
||||||
|
for (TextBlockOnPage textBlock : textBlocks) {
|
||||||
|
blocks.addAll(extractNumbers(textBlock, lookup, numberOfPages));
|
||||||
|
}
|
||||||
|
return blocks;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static List<TextPositionSequence> extractNumbers(TextBlockOnPage textBlock, Map<TextPositionSequence, TextBlockOnPage> lookup, int numberOfPages) {
|
||||||
|
|
||||||
|
List<TextPositionSequence> blocks = new LinkedList<>();
|
||||||
|
TextPageBlock block = textBlock.textBlock();
|
||||||
|
List<TextPositionSequence> sequences = block.getSequences();
|
||||||
|
for (int i = 0; i < sequences.size(); i++) {
|
||||||
|
|
||||||
|
TextPositionSequence word = sequences.get(i);
|
||||||
|
|
||||||
|
if (!NUMERIC.matcher(word).matches()) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (AMOUNT_PATTERN.matcher(getSurroundingString(i, sequences)).matches()) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
int pageNumber = Integer.parseInt(word.toString());
|
||||||
|
if (0 >= pageNumber || pageNumber > numberOfPages) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
lookup.put(word, textBlock);
|
||||||
|
blocks.add(word);
|
||||||
|
} catch (NumberFormatException e) {
|
||||||
|
log.debug("That wasn't a number! Should not happen, due to numeric check beforehand.");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return blocks;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static CharSequence getSurroundingString(int i, List<TextPositionSequence> sequences) {
|
||||||
|
|
||||||
|
int end = Math.min(i + 5, sequences.size());
|
||||||
|
return sequences.subList(i, end)
|
||||||
|
.stream()
|
||||||
|
.map(TextPositionSequence::toString)
|
||||||
|
.collect(Collectors.joining(" "));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static boolean matches(TextPositionSequence number1, TextPositionSequence number2, Map<TextPositionSequence, TextBlockOnPage> lookup) {
|
||||||
|
|
||||||
|
if (number1.getDir() != number2.getDir()) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
return number1.intersectsXDirAdj(number2, INTERSECTION_TOLERANCE);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private boolean isTOCHeadline(TextBlockOnPage textBlock) {
|
||||||
|
|
||||||
|
if (textBlock.textBlock().getText().length() > 50) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
String text = TextNormalizationUtilities.removeAllWhitespaces(textBlock.textBlock().getText().toLowerCase(Locale.ENGLISH));
|
||||||
|
return (text.contains("content") && text.length() < "content".length() + 6) //
|
||||||
|
|| (text.contains("tableofcontent") && text.length() < "tableofcontent".length() + DENSITY_THRESHOLD_COUNT)//
|
||||||
|
|| text.equals("tables")//
|
||||||
|
|| text.equals("appendices")//
|
||||||
|
|| text.equals("figures");
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private List<TextBlockOnPage> buildBlocksPerPage(ClassificationDocument document) {
|
||||||
|
|
||||||
|
List<TextBlockOnPage> blocks = new ArrayList<>();
|
||||||
|
for (ClassificationPage page : document.getPages()) {
|
||||||
|
for (AbstractPageBlock abstractPageBlock : page.getTextBlocks()) {
|
||||||
|
if (abstractPageBlock instanceof TextPageBlock textBlock) {
|
||||||
|
if (textBlock.getClassification() != null && (textBlock.getClassification().equals(PageBlockType.HEADER) //
|
||||||
|
|| textBlock.getClassification().equals(PageBlockType.FOOTER))) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
blocks.add(new TextBlockOnPage(page, textBlock));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return blocks;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static class TocNumberFinder {
|
||||||
|
|
||||||
|
final UnionFind<TextPositionSequence> numberClusters;
|
||||||
|
final HashMap<TextPositionSequence, TextBlockOnPage> lookup;
|
||||||
|
|
||||||
|
|
||||||
|
TocNumberFinder(List<TextPositionSequence> blocks, HashMap<TextPositionSequence, TextBlockOnPage> lookup) {
|
||||||
|
|
||||||
|
this.numberClusters = new UnionFind<>(new HashSet<>(blocks));
|
||||||
|
for (int i = 0; i < blocks.size(); i++) {
|
||||||
|
for (int j = i + 1; j < blocks.size(); j++) {
|
||||||
|
if (matches(blocks.get(i), blocks.get(j), lookup)) {
|
||||||
|
numberClusters.union(blocks.get(i), blocks.get(j));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
this.lookup = lookup;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void add(TextPositionSequence number) {
|
||||||
|
|
||||||
|
if (numberClusters.getElements().contains(number)) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
numberClusters.addElement(number);
|
||||||
|
for (TextPositionSequence element : numberClusters.getElements()) {
|
||||||
|
if (matches(number, element, lookup)) {
|
||||||
|
numberClusters.union(element, number);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public List<TextPositionSequence> getCurrentRightmostCluster() {
|
||||||
|
|
||||||
|
return numberClusters.getGroups()
|
||||||
|
.stream()
|
||||||
|
.filter(cluster -> cluster.size() > MINIMUM_MATCHES)
|
||||||
|
.map(cluster -> cluster.stream()
|
||||||
|
.sorted(new TextPositionSequenceComparator(lookup))
|
||||||
|
.toList())
|
||||||
|
.map(this::removeOutliers)
|
||||||
|
// .map(this::filterByMinimumDensity)
|
||||||
|
.filter(cluster -> cluster.size() > MINIMUM_MATCHES)
|
||||||
|
.max(Comparator.comparingDouble(cluster -> cluster.get(0).getBBox().getMaxX())).orElse(Collections.emptyList());
|
||||||
|
}
|
||||||
|
|
||||||
|
// does not seem to be doing much, ideally instead of using the height of the blocks i would like to use the height, beginning from the MainBody top,
|
||||||
|
// but as the MainBody is often wrong, this results in some numbers being discarded even though they are correct.
|
||||||
|
// private List<TextPositionSequence> filterByMinimumDensity(List<TextPositionSequence> numbers) {
|
||||||
|
//
|
||||||
|
// Map<ClassificationPage, List<TextPositionSequence>> clustersPerPage = numbers.stream()
|
||||||
|
// .collect(Collectors.groupingBy(number -> lookup.get(number).page()));
|
||||||
|
//
|
||||||
|
// List<TextPositionSequence> result = new ArrayList<>(numbers.size());
|
||||||
|
// clustersPerPage.keySet()
|
||||||
|
// .stream()
|
||||||
|
// .sorted(Comparator.comparingInt(ClassificationPage::getPageNumber))
|
||||||
|
// .forEach(page -> {
|
||||||
|
// var numbersOnPage = clustersPerPage.get(page);
|
||||||
|
//
|
||||||
|
// double height = numbersOnPage.stream()
|
||||||
|
// .map(BoundingBox::getBBox)
|
||||||
|
// .collect(RectangleTransformations.collectBBox()).getHeight();
|
||||||
|
//
|
||||||
|
// double count = numbersOnPage.size();
|
||||||
|
//
|
||||||
|
// if ((count / height) >= (DENSITY_THRESHOLD_COUNT / page.getPageHeight())) {
|
||||||
|
// result.addAll(numbers);
|
||||||
|
// }
|
||||||
|
// });
|
||||||
|
// return result;
|
||||||
|
// }
|
||||||
|
|
||||||
|
|
||||||
|
public List<TextPositionSequence> removeOutliers(List<TextPositionSequence> numbers) {
|
||||||
|
|
||||||
|
List<TextPositionSequence> result = new ArrayList<>();
|
||||||
|
|
||||||
|
result.add(numbers.get(0));
|
||||||
|
|
||||||
|
for (int i = 1; i < numbers.size() - 1; i++) {
|
||||||
|
int prev = getNumberAsInt(numbers, i - 1);
|
||||||
|
int curr = getNumberAsInt(numbers, i);
|
||||||
|
int next = getNumberAsInt(numbers, i + 1);
|
||||||
|
|
||||||
|
if (!(curr <= prev || curr >= next) || !isBetterWithout(numbers, i)) {
|
||||||
|
result.add(numbers.get(i));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (getNumberAsInt(numbers, numbers.size() - 1) >= getNumberAsInt(numbers, Math.max(0, numbers.size() - 2))) {
|
||||||
|
result.add(numbers.get(numbers.size() - 1));
|
||||||
|
}
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// Helper method to check if removing the current number results in a better order
|
||||||
|
public static boolean isBetterWithout(List<TextPositionSequence> numbers, int i) {
|
||||||
|
|
||||||
|
if (i == 0 || i == numbers.size() - 1) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
int prev = getNumberAsInt(numbers, i);
|
||||||
|
int curr = getNumberAsInt(numbers, i);
|
||||||
|
int next = getNumberAsInt(numbers, i + 1);
|
||||||
|
|
||||||
|
return (prev <= next) && (Math.abs(prev - next) < Math.abs(prev - curr) + Math.abs(curr - next));
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static int getNumberAsInt(List<TextPositionSequence> numbers, int i) {
|
||||||
|
|
||||||
|
return Integer.parseInt(numbers.get(i).toString());
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -201,7 +201,7 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
|||||||
|
|
||||||
try {
|
try {
|
||||||
if (stroke && !getGraphicsState().getStrokingColor().isPattern() && isBlack(getGraphicsState().getStrokingColor()) || //
|
if (stroke && !getGraphicsState().getStrokingColor().isPattern() && isBlack(getGraphicsState().getStrokingColor()) || //
|
||||||
!stroke && !getGraphicsState().getNonStrokingColor().isPattern() && isBlack(getGraphicsState().getNonStrokingColor())) {
|
!stroke && !getGraphicsState().getNonStrokingColor().isPattern() && isBlack(getGraphicsState().getNonStrokingColor())) {
|
||||||
rulings.addAll(path);
|
rulings.addAll(path);
|
||||||
}
|
}
|
||||||
} catch (UnsupportedOperationException e) {
|
} catch (UnsupportedOperationException e) {
|
||||||
@ -279,9 +279,9 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
|||||||
startIndex = i;
|
startIndex = i;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (i > 0 && (textPositions.get(i).getUnicode().equals(" ") || textPositions.get(i).getUnicode().equals("\u00A0") || textPositions.get(i)
|
if (i > 0
|
||||||
.getUnicode()
|
&& (textPositions.get(i).getUnicode().equals(" ") || textPositions.get(i).getUnicode().equals("\u00A0") || textPositions.get(i).getUnicode().equals("\t"))
|
||||||
.equals("\t")) && i <= textPositions.size() - 2) {
|
&& i <= textPositions.size() - 2) {
|
||||||
List<TextPosition> sublist = textPositions.subList(startIndex, i);
|
List<TextPosition> sublist = textPositions.subList(startIndex, i);
|
||||||
if (checkIfSequenceContainsOnlyWhitespaces(sublist)) {
|
if (checkIfSequenceContainsOnlyWhitespaces(sublist)) {
|
||||||
|
|
||||||
@ -296,20 +296,31 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
|||||||
}
|
}
|
||||||
startIndex = i + 1;
|
startIndex = i + 1;
|
||||||
}
|
}
|
||||||
|
if (isDottedLineFollowedByWord(textPositions, i, startIndex)) {
|
||||||
|
List<TextPosition> sublist = textPositions.subList(startIndex, i);
|
||||||
|
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart));
|
||||||
|
startIndex = i;
|
||||||
|
}
|
||||||
|
if (isWordFollowedByDottedLine(textPositions, i, startIndex)) {
|
||||||
|
List<TextPosition> sublist = textPositions.subList(startIndex, i - 2);
|
||||||
|
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart));
|
||||||
|
startIndex = i - 2;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
List<TextPosition> sublist = textPositions.subList(startIndex, textPositions.size());
|
List<TextPosition> sublist = textPositions.subList(startIndex, textPositions.size());
|
||||||
if (!sublist.isEmpty() && (sublist.get(sublist.size() - 1).getUnicode().equals(" ") || sublist.get(sublist.size() - 1)
|
if (!sublist.isEmpty() && (sublist.get(sublist.size() - 1).getUnicode().equals(" ")
|
||||||
.getUnicode()
|
|| sublist.get(sublist.size() - 1).getUnicode().equals("\u00A0")
|
||||||
.equals("\u00A0") || sublist.get(sublist.size() - 1).getUnicode().equals("\t"))) {
|
|| sublist.get(sublist.size() - 1).getUnicode().equals("\t"))) {
|
||||||
sublist = sublist.subList(0, sublist.size() - 1);
|
sublist = sublist.subList(0, sublist.size() - 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0)
|
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0)
|
||||||
.getUnicode()
|
.getUnicode()
|
||||||
.equals("\t")))) {
|
.equals("\t")))) {
|
||||||
if (previous != null && sublist.get(0).getYDirAdj() == previous.getYDirAdj() && sublist.get(0)
|
if (previous != null
|
||||||
.getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()) < 0.01) {
|
&& sublist.get(0).getYDirAdj() == previous.getYDirAdj()
|
||||||
|
&& sublist.get(0).getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()) < 0.01) {
|
||||||
for (TextPosition t : sublist) {
|
for (TextPosition t : sublist) {
|
||||||
textPositionSequences.get(textPositionSequences.size() - 1).add(t);
|
textPositionSequences.get(textPositionSequences.size() - 1).add(t);
|
||||||
}
|
}
|
||||||
@ -317,10 +328,31 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
|||||||
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, isParagraphStart));
|
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, isParagraphStart));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
super.writeString(text);
|
super.writeString(text);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private boolean isWordFollowedByDottedLine(List<TextPosition> textPositions, int i, int startIndex) {
|
||||||
|
|
||||||
|
return i - startIndex >= 4 //
|
||||||
|
&& textPositions.get(i).getUnicode().equals(".") //
|
||||||
|
&& textPositions.get(i - 1).getUnicode().equals(".") //
|
||||||
|
&& textPositions.get(i - 2).getUnicode().equals(".") //
|
||||||
|
&& !textPositions.get(i - 3).getUnicode().equals(".");
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static boolean isDottedLineFollowedByWord(List<TextPosition> textPositions, int i, int startIndex) {
|
||||||
|
|
||||||
|
return i - startIndex >= 4 //
|
||||||
|
&& !textPositions.get(i).getUnicode().equals(".") //
|
||||||
|
&& textPositions.get(i - 1).getUnicode().equals(".") //
|
||||||
|
&& textPositions.get(i - 2).getUnicode().equals(".") //
|
||||||
|
&& textPositions.get(i - 3).getUnicode().equals(".");
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
public boolean checkIfCurrentPositionIsToTheRightOfPreviousPosition(int i, List<TextPosition> textPositions) {
|
public boolean checkIfCurrentPositionIsToTheRightOfPreviousPosition(int i, List<TextPosition> textPositions) {
|
||||||
|
|
||||||
return i > 0 && textPositions.get(i).getXDirAdj() < textPositions.get(i - 1).getXDirAdj();
|
return i > 0 && textPositions.get(i).getXDirAdj() < textPositions.get(i - 1).getXDirAdj();
|
||||||
@ -337,8 +369,9 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
|||||||
|
|
||||||
public boolean checkIfGapSizeBetweenCharactersSmallerThanMaximum(RedTextPosition previous, List<TextPosition> sublist, float maximumGapSize) {
|
public boolean checkIfGapSizeBetweenCharactersSmallerThanMaximum(RedTextPosition previous, List<TextPosition> sublist, float maximumGapSize) {
|
||||||
|
|
||||||
return previous != null && sublist.get(0).getYDirAdj() == previous.getYDirAdj() && sublist.get(0)
|
return previous != null
|
||||||
.getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()) < maximumGapSize;
|
&& sublist.get(0).getYDirAdj() == previous.getYDirAdj()
|
||||||
|
&& sublist.get(0).getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()) < maximumGapSize;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -36,6 +36,9 @@ public class LayoutGridService {
|
|||||||
LayoutGrid layoutGrid = createLayoutGrid(document);
|
LayoutGrid layoutGrid = createLayoutGrid(document);
|
||||||
Outline outline = OutlineMapper.createOutline(document);
|
Outline outline = OutlineMapper.createOutline(document);
|
||||||
layoutGrid.setVisibleByDefault(layerVisibilityDefaultValue);
|
layoutGrid.setVisibleByDefault(layerVisibilityDefaultValue);
|
||||||
|
|
||||||
|
document.getLayoutDebugLayer().addSentenceVisualization(document.getTextBlock());
|
||||||
|
|
||||||
if (document.getLayoutDebugLayer().isActive()) {
|
if (document.getLayoutDebugLayer().isActive()) {
|
||||||
viewerDocumentService.addLayerGroups(originFile, destinationFile, List.of(layoutGrid, document.getLayoutDebugLayer()), outline);
|
viewerDocumentService.addLayerGroups(originFile, destinationFile, List.of(layoutGrid, document.getLayoutDebugLayer()), outline);
|
||||||
} else {
|
} else {
|
||||||
|
|||||||
@ -10,6 +10,7 @@ public final class TextNormalizationUtilities {
|
|||||||
public static final Pattern hyphenLineBreaks = Pattern.compile("[-~‐‒⁻−﹣゠⁓‑\\u00AD][\\r\\n]+");
|
public static final Pattern hyphenLineBreaks = Pattern.compile("[-~‐‒⁻−﹣゠⁓‑\\u00AD][\\r\\n]+");
|
||||||
public static final Pattern linebreaks = Pattern.compile("[\\r\\n]+");
|
public static final Pattern linebreaks = Pattern.compile("[\\r\\n]+");
|
||||||
public static final Pattern doubleWhitespaces = Pattern.compile("\\s{2,}");
|
public static final Pattern doubleWhitespaces = Pattern.compile("\\s{2,}");
|
||||||
|
public static final Pattern WHITESPACE_REMOVAL = Pattern.compile("\\s+");
|
||||||
|
|
||||||
|
|
||||||
public String cleanString(String value) {
|
public String cleanString(String value) {
|
||||||
@ -36,4 +37,11 @@ public final class TextNormalizationUtilities {
|
|||||||
|
|
||||||
return linebreaks.matcher(value).replaceAll(" ");
|
return linebreaks.matcher(value).replaceAll(" ");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public String removeAllWhitespaces(String value) {
|
||||||
|
|
||||||
|
return WHITESPACE_REMOVAL.matcher(value).replaceAll("");
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -2,6 +2,7 @@ package com.knecon.fforesight.service.layoutparser.processor.utils;
|
|||||||
|
|
||||||
import java.awt.geom.Rectangle2D;
|
import java.awt.geom.Rectangle2D;
|
||||||
import java.awt.geom.RectangularShape;
|
import java.awt.geom.RectangularShape;
|
||||||
|
import java.util.ArrayList;
|
||||||
import java.util.Collection;
|
import java.util.Collection;
|
||||||
import java.util.Comparator;
|
import java.util.Comparator;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
@ -54,18 +55,26 @@ public class TextPositionOperations {
|
|||||||
private List<TextPositionSequence> sortUsingLineDetection(Set<TextPositionSequence> sequences) {
|
private List<TextPositionSequence> sortUsingLineDetection(Set<TextPositionSequence> sequences) {
|
||||||
|
|
||||||
return sortLines(groupByLine(sequences));
|
return sortLines(groupByLine(sequences));
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public List<TextPositionSequence> sortLines(Collection<Set<TextPositionSequence>> lines) {
|
public List<TextPositionSequence> sortLines(Collection<Set<TextPositionSequence>> lines) {
|
||||||
|
|
||||||
return lines.stream()
|
List<List<TextPositionSequence>> lineBlocks = new ArrayList<>();
|
||||||
.map(TextPositionOperations::sortByXDirAdj)
|
for (Set<TextPositionSequence> line : lines) {
|
||||||
.filter(line -> !line.isEmpty())
|
List<TextPositionSequence> sortedLine = sortByXDirAdj(line);
|
||||||
.sorted(Comparator.comparing(line -> line.get(0), COMPARATOR_DIR_ADJ))
|
if (!sortedLine.isEmpty()) {
|
||||||
.flatMap(Collection::stream)
|
lineBlocks.add(sortedLine);
|
||||||
.toList();
|
}
|
||||||
|
}
|
||||||
|
// need to use old sorting, since COMPARATOR_DIR_ADJ is not transitive
|
||||||
|
QuickSort.sort(lineBlocks, Comparator.comparing(line -> line.get(0), COMPARATOR_DIR_ADJ));
|
||||||
|
|
||||||
|
List<TextPositionSequence> list = new ArrayList<>();
|
||||||
|
for (List<TextPositionSequence> textPositionSequences : lineBlocks) {
|
||||||
|
list.addAll(textPositionSequences);
|
||||||
|
}
|
||||||
|
return list;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -91,7 +100,7 @@ public class TextPositionOperations {
|
|||||||
for (TextPositionSequence sequence : sequences) {
|
for (TextPositionSequence sequence : sequences) {
|
||||||
for (TextPositionSequence sequence2 : sequences) {
|
for (TextPositionSequence sequence2 : sequences) {
|
||||||
|
|
||||||
if (sequence.equals(sequence2) || unionFind.inSameSet(sequence, sequence2)) {
|
if (sequence.equals(sequence2)) { // || unionFind.inSameSet(sequence, sequence2)) doing this is actually slower than not doing it
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -5,9 +5,11 @@ import java.awt.geom.AffineTransform;
|
|||||||
import java.awt.geom.Line2D;
|
import java.awt.geom.Line2D;
|
||||||
import java.awt.geom.Point2D;
|
import java.awt.geom.Point2D;
|
||||||
import java.awt.geom.Rectangle2D;
|
import java.awt.geom.Rectangle2D;
|
||||||
|
import java.text.BreakIterator;
|
||||||
import java.util.Collection;
|
import java.util.Collection;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.Locale;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.concurrent.atomic.AtomicInteger;
|
import java.util.concurrent.atomic.AtomicInteger;
|
||||||
@ -19,6 +21,8 @@ import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Bound
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Line;
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Line;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
|
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||||
@ -94,6 +98,29 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void addSentenceVisualization(TextBlock textBlock) {
|
||||||
|
|
||||||
|
if (!active) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
AtomicInteger rotatingColorIdx = new AtomicInteger(0);
|
||||||
|
String text = textBlock.getSearchText();
|
||||||
|
BreakIterator sentenceIterator = BreakIterator.getSentenceInstance(Locale.ENGLISH);
|
||||||
|
sentenceIterator.setText(text);
|
||||||
|
int lastIdx = 0;
|
||||||
|
while (sentenceIterator.next() != BreakIterator.DONE) {
|
||||||
|
TextRange sentenceRange = new TextRange(lastIdx + textBlock.getTextRange().start(), sentenceIterator.current() + textBlock.getTextRange().start());
|
||||||
|
lastIdx = sentenceIterator.current();
|
||||||
|
Color color = getRotatingColor(rotatingColorIdx);
|
||||||
|
textBlock.getPositionsPerPage(sentenceRange)
|
||||||
|
.forEach((page, bboxes) -> getOrCreateVisualizationsOnPage(page.getNumber(), this.sentences).getColoredRectangles()
|
||||||
|
.addAll(bboxes.stream()
|
||||||
|
.map(bbox -> new ColoredRectangle(bbox, color, 1))
|
||||||
|
.toList()));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
private Color decideOnRulingColor(Ruling ruling) {
|
private Color decideOnRulingColor(Ruling ruling) {
|
||||||
|
|
||||||
return switch (ruling.getClassification()) {
|
return switch (ruling.getClassification()) {
|
||||||
@ -248,7 +275,7 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
|
|||||||
.map(Line::getCharacters)
|
.map(Line::getCharacters)
|
||||||
.flatMap(Collection::stream)
|
.flatMap(Collection::stream)
|
||||||
.forEach(character -> {
|
.forEach(character -> {
|
||||||
Color color = ROTATING_CHARACTER_COLOR.get(index.getAndIncrement() % ROTATING_CHARACTER_COLOR.size());
|
Color color = getRotatingColor(index);
|
||||||
Rectangle2D charBBox = character.getTextPosition().getBBoxPdf();
|
Rectangle2D charBBox = character.getTextPosition().getBBoxPdf();
|
||||||
characterVisualizations.getColoredRectangles().add(new ColoredRectangle(charBBox, color, 1));
|
characterVisualizations.getColoredRectangles().add(new ColoredRectangle(charBBox, color, 1));
|
||||||
character.getNeighbors()
|
character.getNeighbors()
|
||||||
@ -263,6 +290,31 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void addTocPages(List<TextPositionSequence> numbers, int page) {
|
||||||
|
|
||||||
|
if (!active) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(page, this.tocPages);
|
||||||
|
visualizationsOnPage.getColoredRectangles()
|
||||||
|
.addAll(numbers.stream()
|
||||||
|
.map(BoundingBox::getBBoxPdf)
|
||||||
|
.map(line -> new ColoredRectangle(line, LINES_COLOR, 0.5f))
|
||||||
|
.toList());
|
||||||
|
visualizationsOnPage.getColoredRectangles()
|
||||||
|
.add(new ColoredRectangle(numbers.stream()
|
||||||
|
.map(BoundingBox::getBBoxPdf)
|
||||||
|
.collect(RectangleTransformations.collectBBox()), LINES_COLOR, 0.5f));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static Color getRotatingColor(AtomicInteger index) {
|
||||||
|
|
||||||
|
return ROTATING_CHARACTER_COLOR.get(index.getAndIncrement() % ROTATING_CHARACTER_COLOR.size());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
public void addOutlineObjects(List<OutlineObject> outlineObjects, PageInformation pageInformation) {
|
public void addOutlineObjects(List<OutlineObject> outlineObjects, PageInformation pageInformation) {
|
||||||
|
|
||||||
if (!active) {
|
if (!active) {
|
||||||
|
|||||||
@ -0,0 +1,70 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;
|
||||||
|
|
||||||
|
import static org.junit.jupiter.api.Assertions.*;
|
||||||
|
|
||||||
|
import java.awt.geom.Rectangle2D;
|
||||||
|
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
|
class BoundingBoxTest {
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testHorizontalDistance_NoOverlap() {
|
||||||
|
|
||||||
|
ConcreteBoundingBox box1 = new ConcreteBoundingBox(0, 0, 10, 10);
|
||||||
|
ConcreteBoundingBox box2 = new ConcreteBoundingBox(20, 0, 10, 10);
|
||||||
|
|
||||||
|
assertEquals(10, box1.horizontalDistance(box2));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testHorizontalDistance_Overlap() {
|
||||||
|
|
||||||
|
ConcreteBoundingBox box1 = new ConcreteBoundingBox(0, 0, 10, 10);
|
||||||
|
ConcreteBoundingBox box2 = new ConcreteBoundingBox(5, 0, 10, 10);
|
||||||
|
|
||||||
|
assertEquals(0, box1.horizontalDistance(box2));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testVerticalDistance_NoOverlap() {
|
||||||
|
|
||||||
|
ConcreteBoundingBox box1 = new ConcreteBoundingBox(0, 0, 10, 10);
|
||||||
|
ConcreteBoundingBox box2 = new ConcreteBoundingBox(0, 20, 10, 10);
|
||||||
|
|
||||||
|
assertEquals(10, box1.verticalDistance(box2));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testVerticalDistance_Overlap() {
|
||||||
|
|
||||||
|
ConcreteBoundingBox box1 = new ConcreteBoundingBox(0, 0, 10, 10);
|
||||||
|
ConcreteBoundingBox box2 = new ConcreteBoundingBox(0, 5, 10, 10);
|
||||||
|
|
||||||
|
assertEquals(0, box1.verticalDistance(box2));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testVerticalDistance_PartialOverlap() {
|
||||||
|
|
||||||
|
ConcreteBoundingBox box1 = new ConcreteBoundingBox(0, 0, 10, 10);
|
||||||
|
ConcreteBoundingBox box2 = new ConcreteBoundingBox(0, 8, 10, 10);
|
||||||
|
|
||||||
|
assertEquals(0, box1.verticalDistance(box2));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testHorizontalDistance_PartialOverlap() {
|
||||||
|
|
||||||
|
ConcreteBoundingBox box1 = new ConcreteBoundingBox(0, 0, 10, 10);
|
||||||
|
ConcreteBoundingBox box2 = new ConcreteBoundingBox(8, 0, 10, 10);
|
||||||
|
|
||||||
|
assertEquals(0, box1.horizontalDistance(box2));
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,12 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;
|
||||||
|
|
||||||
|
import java.awt.geom.Rectangle2D;
|
||||||
|
|
||||||
|
class ConcreteBoundingBox extends BoundingBox {
|
||||||
|
|
||||||
|
ConcreteBoundingBox(double x, double y, double width, double height) {
|
||||||
|
|
||||||
|
this.bBox = new Rectangle2D.Double(x, y, width, height);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -46,8 +46,8 @@ public class SimplifiedTextServiceTest
|
|||||||
Document document = buildGraph(file);
|
Document document = buildGraph(file);
|
||||||
SimplifiedText simplifiedText = simplifiedSectionTextService.toSimplifiedText(document);
|
SimplifiedText simplifiedText = simplifiedSectionTextService.toSimplifiedText(document);
|
||||||
List<SimplifiedSectionText> sectionTexts = simplifiedText.getSectionTexts();
|
List<SimplifiedSectionText> sectionTexts = simplifiedText.getSectionTexts();
|
||||||
assertThat(sectionTexts.stream().filter(section -> section.getText().equals(footerExample)).collect(Collectors.toList()).size()).isGreaterThan(0);
|
assertThat(sectionTexts.stream().filter(section -> section.getText().contains(footerExample)).toList().size()).isGreaterThan(0);
|
||||||
assertThat(sectionTexts.stream().filter(section -> section.getText().equals(headerExample)).collect(Collectors.toList()).size()).isGreaterThan(0);
|
assertThat(sectionTexts.stream().filter(section -> section.getText().contains(headerExample)).toList().size()).isGreaterThan(0);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -51,7 +51,7 @@ import lombok.extern.slf4j.Slf4j;
|
|||||||
@Slf4j
|
@Slf4j
|
||||||
public class DocumentReadingOrderTest extends BuildDocumentTest {
|
public class DocumentReadingOrderTest extends BuildDocumentTest {
|
||||||
|
|
||||||
private static final boolean DRAW_DIR_ADJ_COORDS = true;
|
private static final boolean DRAW_DIR_ADJ_COORDS = false;
|
||||||
public static final List<LayoutParsingType> LAYOUT_PARSING_TYPES = List.of(LayoutParsingType.DOCUMINE,
|
public static final List<LayoutParsingType> LAYOUT_PARSING_TYPES = List.of(LayoutParsingType.DOCUMINE,
|
||||||
LayoutParsingType.DOCUMINE_OLD,
|
LayoutParsingType.DOCUMINE_OLD,
|
||||||
LayoutParsingType.REDACT_MANAGER,
|
LayoutParsingType.REDACT_MANAGER,
|
||||||
@ -82,7 +82,7 @@ public class DocumentReadingOrderTest extends BuildDocumentTest {
|
|||||||
@Disabled
|
@Disabled
|
||||||
public void drawDirAdjForFile() {
|
public void drawDirAdjForFile() {
|
||||||
|
|
||||||
String pdfFile = "/home/kschuettler/Dokumente/Ticket Related/RED-9974/026dc94b019bc2348a4c54f0c6c4516f.ORIGIN.pdf";
|
String pdfFile = "/home/kschuettler/Dokumente/TestFiles/OCR/VV-331340/VV-331340_OCRED_first15.pdf";
|
||||||
|
|
||||||
ClassificationDocument classificationDocument = parseLayout(pdfFile, LayoutParsingType.DOCUMINE_OLD);
|
ClassificationDocument classificationDocument = parseLayout(pdfFile, LayoutParsingType.DOCUMINE_OLD);
|
||||||
|
|
||||||
|
|||||||
@ -76,9 +76,10 @@ class TextRangeTest {
|
|||||||
assertEquals(List.of(new TextRange(10, 40), new TextRange(40, 100)), startTextRange.split(List.of(40)));
|
assertEquals(List.of(new TextRange(10, 40), new TextRange(40, 100)), startTextRange.split(List.of(40)));
|
||||||
assertEquals(1, startTextRange.split(Collections.emptyList()).size());
|
assertEquals(1, startTextRange.split(Collections.emptyList()).size());
|
||||||
assertEquals(1, startTextRange.split(List.of(startTextRange.start())).size());
|
assertEquals(1, startTextRange.split(List.of(startTextRange.start())).size());
|
||||||
|
assertEquals(1, startTextRange.split(List.of(100)).size());
|
||||||
assertThrows(IndexOutOfBoundsException.class, () -> startTextRange.split(Collections.singletonList(0)));
|
assertThrows(IndexOutOfBoundsException.class, () -> startTextRange.split(Collections.singletonList(0)));
|
||||||
assertThrows(IndexOutOfBoundsException.class, () -> startTextRange.split(Collections.singletonList(100)));
|
assertThrows(IndexOutOfBoundsException.class, () -> startTextRange.split(Collections.singletonList(101)));
|
||||||
assertThrows(IndexOutOfBoundsException.class, () -> startTextRange.split(List.of(12, 40, 100)));
|
assertThrows(IndexOutOfBoundsException.class, () -> startTextRange.split(List.of(12, 40, 101)));
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
@ -55,7 +55,10 @@ public record LayerIdentifier(String name, String markedContentName) {
|
|||||||
public static final LayerIdentifier NEIGHBOURS = new LayerIdentifier("Neighbours", "NEIGHBOURS");
|
public static final LayerIdentifier NEIGHBOURS = new LayerIdentifier("Neighbours", "NEIGHBOURS");
|
||||||
public static final LayerIdentifier CHARACTERS = new LayerIdentifier("Characters", "CHARACTERS");
|
public static final LayerIdentifier CHARACTERS = new LayerIdentifier("Characters", "CHARACTERS");
|
||||||
public static final LayerIdentifier OUTLINE_OBJECTS = new LayerIdentifier("Outline Positions", "OUTLINE_OBJECTS");
|
public static final LayerIdentifier OUTLINE_OBJECTS = new LayerIdentifier("Outline Positions", "OUTLINE_OBJECTS");
|
||||||
|
public static final LayerIdentifier SENTENCES = new LayerIdentifier("Sentences", "SENTENCES");
|
||||||
|
public static final LayerIdentifier TOC_PAGES = new LayerIdentifier("TOC pages", "TOC_PAGES");
|
||||||
|
|
||||||
|
// Visual layout parser
|
||||||
public static final LayerIdentifier KNECON_VISUAL_PARSING = new LayerIdentifier("Visual Layout Parser", "VISUAL_PARSING");
|
public static final LayerIdentifier KNECON_VISUAL_PARSING = new LayerIdentifier("Visual Layout Parser", "VISUAL_PARSING");
|
||||||
|
|
||||||
//ocr
|
//ocr
|
||||||
|
|||||||
@ -55,6 +55,8 @@ public class LayoutDebugLayerConfig extends AbstractLayerGroup {
|
|||||||
protected final Visualizations neighbours = Visualizations.builder().layer(LayerIdentifier.NEIGHBOURS).build();
|
protected final Visualizations neighbours = Visualizations.builder().layer(LayerIdentifier.NEIGHBOURS).build();
|
||||||
protected final Visualizations characters = Visualizations.builder().layer(LayerIdentifier.CHARACTERS).build();
|
protected final Visualizations characters = Visualizations.builder().layer(LayerIdentifier.CHARACTERS).build();
|
||||||
protected final Visualizations outlineObjects = Visualizations.builder().layer(LayerIdentifier.OUTLINE_OBJECTS).build();
|
protected final Visualizations outlineObjects = Visualizations.builder().layer(LayerIdentifier.OUTLINE_OBJECTS).build();
|
||||||
|
protected final Visualizations sentences = Visualizations.builder().layer(LayerIdentifier.SENTENCES).build();
|
||||||
|
protected final Visualizations tocPages = Visualizations.builder().layer(LayerIdentifier.TOC_PAGES).build();
|
||||||
|
|
||||||
|
|
||||||
public List<Visualizations> getVisualizations() {
|
public List<Visualizations> getVisualizations() {
|
||||||
@ -63,14 +65,15 @@ public class LayoutDebugLayerConfig extends AbstractLayerGroup {
|
|||||||
neighbours,//
|
neighbours,//
|
||||||
words, //
|
words, //
|
||||||
lines, //
|
lines, //
|
||||||
|
sentences, //
|
||||||
zones, //
|
zones, //
|
||||||
rulings, //
|
rulings, //
|
||||||
clean_rulings, //
|
clean_rulings, //
|
||||||
cells, //
|
cells, //
|
||||||
mainBody, //
|
mainBody, //
|
||||||
markedContent, //
|
markedContent, //
|
||||||
outlineObjects //
|
outlineObjects, //
|
||||||
);
|
tocPages);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -68,10 +68,10 @@ public class OutlineUtility {
|
|||||||
public static void deleteExistingOutline(PDFDoc doc) {
|
public static void deleteExistingOutline(PDFDoc doc) {
|
||||||
|
|
||||||
Bookmark firstBookmark = doc.getFirstBookmark();
|
Bookmark firstBookmark = doc.getFirstBookmark();
|
||||||
// while (firstBookmark != null && firstBookmark.isValid()) {
|
while (firstBookmark != null && firstBookmark.isValid()) {
|
||||||
firstBookmark.delete();
|
firstBookmark.delete();
|
||||||
firstBookmark = doc.getFirstBookmark();
|
firstBookmark = doc.getFirstBookmark();
|
||||||
// }
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user