Merge branch 'feature/RED-10127' into 'main'

RED-10127: improve headline detection

See merge request fforesight/layout-parser!235
This commit is contained in:
Dominique Eifländer 2024-10-09 08:48:48 +02:00
commit 4b0c041d84
33 changed files with 1042 additions and 187 deletions

View File

@ -2,11 +2,13 @@ package com.knecon.fforesight.service.layoutparser.processor;
import static java.lang.String.format;
import java.awt.geom.AffineTransform;
import java.awt.geom.Rectangle2D;
import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
@ -24,6 +26,10 @@ import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.No
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
import com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.classification.TableOfContentsClassificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.MarkdownMapper;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
@ -91,10 +97,7 @@ public class LayoutParsingPipeline {
CvTableParsingAdapter cvTableParsingAdapter;
LayoutParsingStorageService layoutParsingStorageService;
SectionsBuilderService sectionsBuilderService;
RedactManagerClassificationService redactManagerClassificationService;
DocuMineClassificationService docuMineClassificationService;
SimplifiedSectionTextService simplifiedSectionTextService;
BodyTextFrameService bodyTextFrameService;
RulingCleaningService rulingCleaningService;
TableExtractionService tableExtractionService;
DocuMineBlockificationService docuMineBlockificationService;
@ -104,12 +107,12 @@ public class LayoutParsingPipeline {
LayoutGridService layoutGridService;
ObservationRegistry observationRegistry;
VisualLayoutParsingAdapter visualLayoutParsingAdapter;
ClarifyndClassificationService clarifyndClassificationService;
GraphicExtractorService graphicExtractorService;
OutlineExtractorService outlineExtractorService;
OutlineValidationService outlineValidationService;
TOCEnrichmentService tocEnrichmentService;
LayoutparserSettings settings;
ClassificationService classificationService;
public LayoutParsingFinishedEvent parseLayoutAndSaveFilesToStorage(LayoutParsingRequest layoutParsingRequest) throws IOException {
@ -273,6 +276,9 @@ public class LayoutParsingPipeline {
stripper.setPdpage(pdPage);
stripper.getText(originDocument);
List<TextPositionSequence> words = stripper.getTextPositionSequences();
// rotateDirAdjExactly(words, pdPage); // works really well for many highly rotated documents (e.g. VV-331340.pdf), but it decreases the headline performance by 1.3%, so I am leaving it out for now
if (layoutParsingType.equals(LayoutParsingType.DOCUMINE_OLD)) {
var lines = TextPositionOperations.groupByLine(new HashSet<>(words));
classificationDocument.getLayoutDebugLayer().addLineVisualizationsFromNestedTextPosition(lines, pageNumber);
@ -366,24 +372,7 @@ public class LayoutParsingPipeline {
originDocument.close();
log.info("Calculating BodyTextFrame for {}", identifier);
bodyTextFrameService.setBodyTextFrames(classificationDocument, layoutParsingType);
for (ClassificationPage page : classificationDocument.getPages()) {
classificationDocument.getLayoutDebugLayer().addCleanRulingVisualization(page.getCleanRulings(), page.getPageNumber());
}
log.info("Classify TextBlocks for {}", identifier);
switch (layoutParsingType) {
case REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_OLD, CLARIFYND_PARAGRAPH_DEBUG, REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH ->
redactManagerClassificationService.classifyDocument(classificationDocument);
case DOCUMINE_OLD, DOCUMINE -> docuMineClassificationService.classifyDocument(classificationDocument);
case CLARIFYND -> clarifyndClassificationService.classifyDocument(classificationDocument);
}
if (layoutParsingType.equals(LayoutParsingType.DOCUMINE_OLD)) {
for (ClassificationPage page : classificationDocument.getPages()) {
docuMineBlockificationService.mergeblocks(page, page.getCleanRulings().withoutTextRulings(), 0, 10);
}
}
classificationService.classify(classificationDocument, layoutParsingType, identifier);
List<TextPageBlock> headlines = classificationDocument.getPages()
.stream()
@ -406,6 +395,32 @@ public class LayoutParsingPipeline {
}
/**
 * For every {@link TextDirection}, averages the exact direction of all text positions that carry that direction and
 * transforms each word of that direction with a rotation about the centre of the page's media box.
 * Directions whose average is exactly 0 (including directions without any text position) are left untouched.
 * NOTE(review): the average is passed straight to {@link AffineTransform#getRotateInstance(double, double, double)},
 * which expects radians - confirm that getExactDir() uses that unit.
 *
 * @param words  the words of one page; they are modified in place
 * @param pdPage the page whose media box supplies the rotation anchor
 */
private static void rotateDirAdjExactly(List<TextPositionSequence> words, PDPage pdPage) {

    for (TextDirection direction : TextDirection.values()) {
        double meanExactDir = words.stream()
                .flatMap(word -> word.getTextPositions().stream())
                .filter(position -> position.getDir().equals(direction))
                .mapToDouble(RedTextPosition::getExactDir)
                .average()
                .orElse(0);

        if (meanExactDir != 0) {
            AffineTransform rotation = AffineTransform.getRotateInstance(meanExactDir, pdPage.getMediaBox().getWidth() / 2, pdPage.getMediaBox().getHeight() / 2);
            words.stream()
                    .filter(word -> direction.equals(word.getDir()))
                    .forEach(word -> word.transform(rotation));
        }
    }
}
private void addNumberOfPagesToTrace(int numberOfPages, long size) {
if (observationRegistry.getCurrentObservation() != null) {

View File

@ -225,33 +225,31 @@ public abstract class BoundingBox {
public double horizontalDistance(BoundingBox other) {
Rectangle2D left;
Rectangle2D right;
if (this.leftOf(other)) {
left = this.getBBox();
right = other.getBBox();
} else {
left = other.getBBox();
right = this.getBBox();
}
double rect1Right = getMaxX();
double rect1Left = getMinX();
double rect2Right = other.getMaxX();
double rect2Left = other.getMinX();
return Math.max(0, right.getMinX() - left.getMaxX());
if (rect1Left > rect2Right || rect2Left > rect1Right) {
return Math.max(rect2Left - rect1Right, rect1Left - rect2Right);
} else {
return 0;
}
}
public double verticalDistance(BoundingBox other) {
Rectangle2D bottom;
Rectangle2D top;
if (this.isAbove(other)) {
top = this.getBBox();
bottom = other.getBBox();
} else {
bottom = this.getBBox();
top = other.getBBox();
}
double rect1Top = getMaxY();
double rect1Bottom = getMinY();
double rect2Top = other.getMaxY();
double rect2Bottom = other.getMinY();
return Math.max(0, bottom.getMinY() - top.getMaxY());
if (rect1Bottom > rect2Top || rect2Bottom > rect1Top) {
return Math.max(rect2Bottom - rect1Top, rect1Bottom - rect2Top);
} else {
return 0;
}
}

View File

@ -99,4 +99,70 @@ public abstract class TextBoundingBox extends BoundingBox {
return this.bBoxDirAdj.getCenterX();
}
/**
 * Horizontal gap between this box and {@code other}, measured on the direction-adjusted coordinates.
 *
 * @param other the box to measure against
 * @return the positive gap between the two x-extents when they do not overlap, otherwise 0
 */
public double horizontalDistanceDirAdj(TextBoundingBox other) {

    double thisMaxX = getMaxXDirAdj();
    double thisMinX = getXDirAdj();
    double otherMaxX = other.getMaxXDirAdj();
    double otherMinX = other.getXDirAdj();

    boolean disjoint = thisMinX > otherMaxX || otherMinX > thisMaxX;
    if (!disjoint) {
        return 0;
    }
    // when the extents are disjoint at least one difference is positive; the larger one is the gap
    return Math.max(otherMinX - thisMaxX, thisMinX - otherMaxX);
}
/**
 * Vertical gap between this box and {@code other}, measured on the direction-adjusted coordinates.
 *
 * @param other the box to measure against
 * @return the positive gap between the two y-extents when they do not overlap, otherwise 0
 */
public double verticalDistanceDirAdj(TextBoundingBox other) {

    double thisMaxY = getMaxYDirAdj();
    double thisMinY = getYDirAdj();
    double otherMaxY = other.getMaxYDirAdj();
    double otherMinY = other.getYDirAdj();

    boolean disjoint = thisMinY > otherMaxY || otherMinY > thisMaxY;
    if (!disjoint) {
        return 0;
    }
    // when the extents are disjoint at least one difference is positive; the larger one is the gap
    return Math.max(otherMinY - thisMaxY, thisMinY - otherMaxY);
}
/**
 * Tests whether the direction-adjusted extents of this box and {@code other} overlap on both axes.
 *
 * @param other the box to test against
 * @return {@code true} only if both the x-extents and the y-extents intersect
 */
public boolean intersectsDirAdj(TextBoundingBox other) {

    if (!this.intersectsXDirAdj(other)) {
        return false;
    }
    return this.intersectsYDirAdj(other);
}
/**
 * Tolerant variant of the two-axis intersection test on direction-adjusted coordinates.
 * Note the parameter order: the y tolerance comes before the x tolerance.
 *
 * @param other      the box to test against
 * @param yThreshold tolerance forwarded to the y-axis test
 * @param xThreshold tolerance forwarded to the x-axis test
 * @return {@code true} only if both axis tests succeed with their respective tolerance
 */
public boolean intersectsDirAdj(TextBoundingBox other, float yThreshold, float xThreshold) {

    if (!this.intersectsXDirAdj(other, xThreshold)) {
        return false;
    }
    return this.intersectsYDirAdj(other, yThreshold);
}
/**
 * Tests whether the direction-adjusted x-extent of this box, widened by {@code threshold} on both sides,
 * touches or overlaps the x-extent of {@code other}.
 *
 * @param other     the box to test against
 * @param threshold amount by which this box's x-extent is widened on each side
 * @return {@code true} if the widened x-extent reaches the other box's x-extent
 */
public boolean intersectsXDirAdj(TextBoundingBox other, float threshold) {

    boolean startsBeforeOtherEnds = this.getXDirAdj() - threshold <= other.getMaxXDirAdj();
    if (!startsBeforeOtherEnds) {
        return false;
    }
    return this.getMaxXDirAdj() + threshold >= other.getXDirAdj();
}
/**
 * Tests whether the direction-adjusted x-extents of this box and {@code other} touch or overlap.
 *
 * @param other the box to test against
 * @return {@code true} if neither box lies strictly to the left of the other
 */
public boolean intersectsXDirAdj(TextBoundingBox other) {

    boolean startsBeforeOtherEnds = this.getXDirAdj() <= other.getMaxXDirAdj();
    if (!startsBeforeOtherEnds) {
        return false;
    }
    return this.getMaxXDirAdj() >= other.getXDirAdj();
}
/**
 * Tests whether the direction-adjusted y-extents of this box and {@code other} touch or overlap.
 *
 * @param other the box to test against
 * @return {@code true} if neither y-extent lies strictly beyond the other
 */
public boolean intersectsYDirAdj(TextBoundingBox other) {

    boolean startsBeforeOtherEnds = this.getYDirAdj() <= other.getMaxYDirAdj();
    if (!startsBeforeOtherEnds) {
        return false;
    }
    return this.getMaxYDirAdj() >= other.getYDirAdj();
}
/**
 * Tests whether the direction-adjusted y-extent of this box, widened by {@code threshold} on both sides,
 * touches or overlaps the y-extent of {@code other}.
 *
 * @param other     the box to test against
 * @param threshold amount by which this box's y-extent is widened on each side
 * @return {@code true} if the widened y-extent reaches the other box's y-extent
 */
public boolean intersectsYDirAdj(TextBoundingBox other, float threshold) {

    boolean startsBeforeOtherEnds = this.getYDirAdj() - threshold <= other.getMaxYDirAdj();
    if (!startsBeforeOtherEnds) {
        return false;
    }
    return this.getMaxYDirAdj() + threshold >= other.getYDirAdj();
}
}

View File

@ -28,4 +28,10 @@ public class UnionFind<T> extends org.jgrapht.alg.util.UnionFind<T> {
return setRep.values();
}
/**
 * Returns the keys of the parent map inherited from the jgrapht {@code UnionFind} base class.
 * NOTE(review): the key set is returned as-is without copying; presumably it is a live view of the
 * internal map and callers must not modify it - confirm against the base class.
 *
 * @return the key set of the parent map
 */
public Collection<T> getElements() {
return getParentMap().keySet();
}
}

View File

@ -44,7 +44,7 @@ public class FloatFrequencyCounter {
public Double getMostPopular() {
if (changed) {
if (changed || mostPopularCache == null) {
Map.Entry<Double, Integer> mostPopular = null;
for (Map.Entry<Double, Integer> entry : countPerValue.entrySet()) {
if (mostPopular == null || entry.getValue() >= mostPopular.getValue()) {

View File

@ -15,6 +15,7 @@ public enum PageBlockType {
PARAGRAPH_ITALIC,
PARAGRAPH_UNKNOWN,
OTHER,
TABLE_OF_CONTENTS_ITEM,
TABLE;

View File

@ -98,10 +98,10 @@ public class TextRange implements Comparable<TextRange> {
public List<TextRange> split(List<Integer> splitIndices) {
if (splitIndices.stream()
.anyMatch(idx -> !this.containsExclusive(idx))) {
.anyMatch(idx -> !this.contains(idx))) {
throw new IndexOutOfBoundsException(format("%s splitting indices are out of range for %s",
splitIndices.stream()
.filter(idx -> !this.containsExclusive(idx))
.filter(idx -> !this.contains(idx))
.toList(),
this));
}
@ -116,7 +116,9 @@ public class TextRange implements Comparable<TextRange> {
splitBoundaries.add(new TextRange(previousIndex, splitIndex));
previousIndex = splitIndex;
}
splitBoundaries.add(new TextRange(previousIndex, end));
if (previousIndex != end) {
splitBoundaries.add(new TextRange(previousIndex, end));
}
return splitBoundaries;
}

View File

@ -0,0 +1,21 @@
package com.knecon.fforesight.service.layoutparser.processor.model.text;
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
import lombok.AccessLevel;
import lombok.Getter;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
/**
 * Holder for the five frequency counters kept per text block: word text height, font size, space width,
 * font name and font style. Each counter is created empty; Lombok generates a getter per field and the
 * fields are made private and final through {@code @FieldDefaults}.
 */
@Getter
@NoArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class FrequencyCounters {
// numeric counters
FloatFrequencyCounter lineHeightFrequencyCounter = new FloatFrequencyCounter();
FloatFrequencyCounter fontSizeFrequencyCounter = new FloatFrequencyCounter();
FloatFrequencyCounter spaceFrequencyCounter = new FloatFrequencyCounter();
// string-valued counters
StringFrequencyCounter fontFrequencyCounter = new StringFrequencyCounter();
StringFrequencyCounter styleFrequencyCounter = new StringFrequencyCounter();
}

View File

@ -7,6 +7,8 @@ import org.apache.pdfbox.text.TextPosition;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.TextBoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.FastAtan2;
import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms;
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;

View File

@ -9,10 +9,14 @@ public class StringFrequencyCounter {
@Getter
private final Map<String, Integer> countPerValue = new HashMap<>();
boolean changed;
String mostPopularCache;
public void add(String value) {
changed = true;
if (!countPerValue.containsKey(value)) {
countPerValue.put(value, 1);
} else {
@ -23,6 +27,8 @@ public class StringFrequencyCounter {
public void addAll(Map<String, Integer> otherCounter) {
changed = true;
for (Map.Entry<String, Integer> entry : otherCounter.entrySet()) {
if (countPerValue.containsKey(entry.getKey())) {
countPerValue.put(entry.getKey(), countPerValue.get(entry.getKey()) + entry.getValue());
@ -35,13 +41,18 @@ public class StringFrequencyCounter {
public String getMostPopular() {
Map.Entry<String, Integer> mostPopular = null;
for (Map.Entry<String, Integer> entry : countPerValue.entrySet()) {
if (mostPopular == null || entry.getValue() > mostPopular.getValue()) {
mostPopular = entry;
if (changed || mostPopularCache == null) {
Map.Entry<String, Integer> mostPopular = null;
for (Map.Entry<String, Integer> entry : countPerValue.entrySet()) {
if (mostPopular == null || entry.getValue() >= mostPopular.getValue()) {
mostPopular = entry;
}
}
mostPopularCache = mostPopular != null ? mostPopular.getKey() : null;
changed = false;
}
return mostPopular != null ? mostPopular.getKey() : null;
return mostPopularCache;
}
}

View File

@ -0,0 +1,7 @@
package com.knecon.fforesight.service.layoutparser.processor.model.text;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
/**
 * Pairs a text block with a classification page.
 *
 * @param page      the page of the pair
 * @param textBlock the text block of the pair
 */
public record TextBlockOnPage(ClassificationPage page, TextPageBlock textBlock) {
}

View File

@ -7,7 +7,6 @@ import java.util.List;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities;
@ -27,19 +26,11 @@ public class TextPageBlock extends AbstractPageBlock {
@Builder.Default
private List<TextPositionSequence> sequences = new ArrayList<>();
@Builder.Default
private FrequencyCounters frequencyCounters = new FrequencyCounters();
private Rectangle2D bBoxDirAdj;
private String mostPopularWordFont;
private String mostPopularWordStyle;
private double mostPopularWordFontSize;
private double mostPopularWordHeight;
private double mostPopularWordSpaceWidth;
private boolean underlined;
private double highestFontSize;
@ -55,8 +46,10 @@ public class TextPageBlock extends AbstractPageBlock {
public TextPageBlock(List<TextPositionSequence> sequences) {
this.sequences = new ArrayList<>(sequences);
this.frequencyCounters = new FrequencyCounters();
if (!sequences.isEmpty()) {
calculateFrequencyCounters();
addToFrequencyCounters(sequences);
}
calculateBBox();
}
@ -118,32 +111,18 @@ public class TextPageBlock extends AbstractPageBlock {
}
private void calculateFrequencyCounters() {
FloatFrequencyCounter lineHeightFrequencyCounter = new FloatFrequencyCounter();
FloatFrequencyCounter fontSizeFrequencyCounter = new FloatFrequencyCounter();
FloatFrequencyCounter spaceFrequencyCounter = new FloatFrequencyCounter();
StringFrequencyCounter fontFrequencyCounter = new StringFrequencyCounter();
StringFrequencyCounter styleFrequencyCounter = new StringFrequencyCounter();
private void addToFrequencyCounters(List<TextPositionSequence> sequences) {
for (TextPositionSequence wordBlock : sequences) {
lineHeightFrequencyCounter.add(wordBlock.getTextHeight());
fontSizeFrequencyCounter.add(wordBlock.getFontSize());
spaceFrequencyCounter.add(wordBlock.getSpaceWidth());
fontFrequencyCounter.add(wordBlock.getFont());
styleFrequencyCounter.add(wordBlock.getFontStyle());
frequencyCounters.getLineHeightFrequencyCounter().add(wordBlock.getTextHeight());
frequencyCounters.getFontSizeFrequencyCounter().add(wordBlock.getFontSize());
frequencyCounters.getSpaceFrequencyCounter().add(wordBlock.getSpaceWidth());
frequencyCounters.getFontFrequencyCounter().add(wordBlock.getFont());
frequencyCounters.getStyleFrequencyCounter().add(wordBlock.getFontStyle());
}
setMostPopularWordFont(fontFrequencyCounter.getMostPopular());
setMostPopularWordStyle(styleFrequencyCounter.getMostPopular());
setMostPopularWordFontSize(fontSizeFrequencyCounter.getMostPopular());
setMostPopularWordHeight(lineHeightFrequencyCounter.getMostPopular());
setMostPopularWordSpaceWidth(spaceFrequencyCounter.getMostPopular());
setHighestFontSize(fontSizeFrequencyCounter.getHighest());
setUnderlined(sequences.stream()
setUnderlined(this.sequences.stream()
.allMatch(TextPositionSequence::isUnderline));
}
@ -152,7 +131,7 @@ public class TextPageBlock extends AbstractPageBlock {
TextPageBlock union = this.copy();
union.add(r);
calculateFrequencyCounters();
addToFrequencyCounters(List.of(r));
calculateBBox();
return union;
}
@ -162,7 +141,7 @@ public class TextPageBlock extends AbstractPageBlock {
TextPageBlock union = this.copy();
union.addAll(r.getSequences());
calculateFrequencyCounters();
addToFrequencyCounters(r.getSequences());
calculateBBox();
return union;
}
@ -172,7 +151,7 @@ public class TextPageBlock extends AbstractPageBlock {
changed = true;
sequences.addAll(textPageBlock.getSequences());
calculateFrequencyCounters();
addToFrequencyCounters(textPageBlock.getSequences());
calculateBBox();
}
@ -181,7 +160,7 @@ public class TextPageBlock extends AbstractPageBlock {
changed = true;
sequences.add(textPositionSequence);
calculateFrequencyCounters();
addToFrequencyCounters(List.of(textPositionSequence));
calculateBBox();
}
@ -190,7 +169,7 @@ public class TextPageBlock extends AbstractPageBlock {
changed = true;
sequences.addAll(textPositionSequences);
calculateFrequencyCounters();
addToFrequencyCounters(textPositionSequences);
calculateBBox();
}
@ -253,6 +232,36 @@ public class TextPageBlock extends AbstractPageBlock {
}
/**
 * Looks up the most frequent font name in this block's font frequency counter.
 *
 * @return whatever the font counter reports as most popular
 */
public String getMostPopularWordFont() {

    StringFrequencyCounter fontCounter = frequencyCounters.getFontFrequencyCounter();
    return fontCounter.getMostPopular();
}
/**
 * Looks up the most frequent font style in this block's style frequency counter.
 *
 * @return whatever the style counter reports as most popular
 */
public String getMostPopularWordStyle() {

    StringFrequencyCounter styleCounter = frequencyCounters.getStyleFrequencyCounter();
    return styleCounter.getMostPopular();
}
/**
 * Looks up the most frequent font size in this block's font size frequency counter.
 * The counter returns a boxed {@code Double}; a {@code null} result is mapped to 0 instead of being
 * auto-unboxed, so the getter does not throw a NullPointerException in that case.
 *
 * @return the most popular font size, or 0 when the counter reports none
 */
public double getMostPopularWordFontSize() {

    Double mostPopular = frequencyCounters.getFontSizeFrequencyCounter().getMostPopular();
    if (mostPopular == null) {
        return 0;
    }
    return mostPopular;
}
/**
 * Looks up the most frequent word text height in this block's line height frequency counter.
 * The counter returns a boxed {@code Double}; a {@code null} result is mapped to 0 instead of being
 * auto-unboxed, so the getter does not throw a NullPointerException in that case.
 *
 * @return the most popular word height, or 0 when the counter reports none
 */
public double getMostPopularWordHeight() {

    Double mostPopular = frequencyCounters.getLineHeightFrequencyCounter().getMostPopular();
    if (mostPopular == null) {
        return 0;
    }
    return mostPopular;
}
/**
 * Looks up the most frequent space width in this block's space frequency counter.
 * The counter returns a boxed {@code Double}; a {@code null} result is mapped to 0 instead of being
 * auto-unboxed, so the getter does not throw a NullPointerException in that case.
 *
 * @return the most popular space width, or 0 when the counter reports none
 */
public double getMostPopularWordSpaceWidth() {

    Double mostPopular = frequencyCounters.getSpaceFrequencyCounter().getMostPopular();
    if (mostPopular == null) {
        return 0;
    }
    return mostPopular;
}
@Override
public boolean isEmpty() {

View File

@ -2,10 +2,13 @@ package com.knecon.fforesight.service.layoutparser.processor.model.text;
import static com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition.HEIGHT_PADDING;
import java.awt.geom.AffineTransform;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.Objects;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import org.apache.pdfbox.text.TextPosition;
@ -30,6 +33,7 @@ public class TextPositionSequence extends TextBoundingBox implements CharSequenc
public static final String BOLD_ITALIC = "bold, italic";
public static final String BOLD = "bold";
public static final String ITALIC = "italic";
public static final Pattern FONT_CLEANER = Pattern.compile(",bold|,italic");
private int page;
@ -154,7 +158,8 @@ public class TextPositionSequence extends TextBoundingBox implements CharSequenc
if (textPositions.get(0).getFontName() == null) {
return "none";
}
return textPositions.get(0).getFontName().toLowerCase(Locale.ROOT).replaceAll(",bold", "").replaceAll(",italic", "");
return FONT_CLEANER.matcher(textPositions.get(0).getFontName().toLowerCase(Locale.ROOT)).replaceAll("");
}
@ -238,5 +243,15 @@ public class TextPositionSequence extends TextBoundingBox implements CharSequenc
return result;
}
/**
 * Applies {@code rotateInstance} to the direction-adjusted bounding box of every text position of this word,
 * stores the bounds of each transformed shape back on the position, and then calls
 * {@code calculateBBoxAndHashcode()}.
 *
 * @param rotateInstance the transform to apply to each text position's direction-adjusted box
 */
public void transform(AffineTransform rotateInstance) {

    getTextPositions().forEach(position -> {
        Rectangle2D transformedBounds = rotateInstance.createTransformedShape(position.getBBoxDirAdj()).getBounds2D();
        position.setBBoxDirAdj(transformedBounds);
    });
    calculateBBoxAndHashcode();
}
}

View File

@ -0,0 +1,34 @@
package com.knecon.fforesight.service.layoutparser.processor.model.text;
import java.util.Comparator;
import java.util.HashMap;
/**
 * Orders text position sequences by the page number found through the lookup map, then by their y
 * coordinate, and finally by the integer value of their text.
 * The last step parses {@code toString()} with {@link Integer#parseInt(String)}, so it throws a
 * {@link NumberFormatException} for non-numeric text; every compared sequence must also be a key of the
 * lookup map.
 */
public class TextPositionSequenceComparator implements Comparator<TextPositionSequence> {

    private HashMap<TextPositionSequence, TextBlockOnPage> lookup;


    public TextPositionSequenceComparator(HashMap<TextPositionSequence, TextBlockOnPage> lookup) {

        this.lookup = lookup;
    }


    @Override
    public int compare(TextPositionSequence number1, TextPositionSequence number2) {

        int firstPage = lookup.get(number1).page().getPageNumber();
        int secondPage = lookup.get(number2).page().getPageNumber();
        int byPage = Integer.compare(firstPage, secondPage);
        if (byPage != 0) {
            return byPage;
        }

        if (number1.getY() != number2.getY()) {
            return Double.compare(number1.getY(), number2.getY());
        }

        int firstValue = Integer.parseInt(number1.toString());
        int secondValue = Integer.parseInt(number2.toString());
        return Integer.compare(firstValue, secondValue);
    }

}

View File

@ -60,24 +60,18 @@ public class ClarifyndClassificationService {
headlineClassificationService.setLastHeadlineFromOutline(textBlock);
return;
}
if (textBlock.getClassification() != null && (textBlock.getClassification().equals(PageBlockType.HEADER)
|| textBlock.getClassification().equals(PageBlockType.FOOTER)
|| textBlock.getClassification().equals(PageBlockType.TABLE_OF_CONTENTS_ITEM))) {
return;
}
if (document.getFontSizeCounter().getMostPopular() == null) {
textBlock.setClassification(PageBlockType.PARAGRAPH);
return;
}
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER)
|| PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null
|| textBlock.getHighestFontSize() <= document.getFontSizeCounter()
.getMostPopular())) {
textBlock.setClassification(PageBlockType.PARAGRAPH);
} else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER)
|| PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null
|| textBlock.getHighestFontSize() <= document.getFontSizeCounter()
.getMostPopular())) {
textBlock.setClassification(PageBlockType.PARAGRAPH);
} else if (page.getPageNumber() == 1 //
&& (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter().getMostPopular()) > 2.5
&& textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks().size() == 1)) {
if (page.getPageNumber() == 1 //
&& (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter().getMostPopular()) > 2.5
&& textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks().size() == 1)) {
if (!Pattern.matches("[0-9]+", textBlock.toString())) {
textBlock.setClassification(PageBlockType.TITLE);
}

View File

@ -0,0 +1,62 @@
package com.knecon.fforesight.service.layoutparser.processor.services.classification;
import java.util.Map;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.services.BodyTextFrameService;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocuMineBlockificationService;
import lombok.AccessLevel;
import lombok.RequiredArgsConstructor;
import lombok.experimental.FieldDefaults;
import lombok.extern.slf4j.Slf4j;
/**
 * Runs the classification steps for a document: body text frames, clean-ruling debug visualizations,
 * header/footer and table-of-contents classification, the classifier selected by the layout parsing type,
 * and a block merge pass for {@code DOCUMINE_OLD}.
 */
@Slf4j
@Service
@RequiredArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class ClassificationService {

    DocuMineBlockificationService docuMineBlockificationService;
    BodyTextFrameService bodyTextFrameService;
    TableOfContentsClassificationService tableOfContentsClassificationService;
    RedactManagerClassificationService redactManagerClassificationService;
    ClarifyndClassificationService clarifyndClassificationService;
    DocuMineClassificationService docuMineClassificationService;
    HeaderFooterClassificationService headerFooterClassificationService;


    /**
     * Classifies the text blocks of {@code document} in place.
     *
     * @param document          the document to classify
     * @param layoutParsingType selects the classifier and whether the DOCUMINE_OLD merge pass runs
     * @param identifier        only used in the log messages
     */
    public void classify(ClassificationDocument document, LayoutParsingType layoutParsingType, Map<String, String> identifier) {

        log.info("Calculating BodyTextFrame for {}", identifier);
        bodyTextFrameService.setBodyTextFrames(document, layoutParsingType);
        visualizeCleanRulings(document);

        log.info("Classify TextBlocks for {}", identifier);
        headerFooterClassificationService.classifyHeadersAndFooters(document);
        tableOfContentsClassificationService.classifyTableOfContents(document);
        classifyByParsingType(document, layoutParsingType);

        if (layoutParsingType.equals(LayoutParsingType.DOCUMINE_OLD)) {
            mergeDocuMineOldBlocks(document);
        }
    }


    // Adds every page's clean rulings to the layout debug layer.
    private void visualizeCleanRulings(ClassificationDocument document) {

        document.getPages()
                .forEach(page -> document.getLayoutDebugLayer().addCleanRulingVisualization(page.getCleanRulings(), page.getPageNumber()));
    }


    // Dispatches to the classifier that belongs to the layout parsing type; unlisted types are not classified here.
    private void classifyByParsingType(ClassificationDocument document, LayoutParsingType layoutParsingType) {

        switch (layoutParsingType) {
            case REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_OLD, CLARIFYND_PARAGRAPH_DEBUG, REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH ->
                    redactManagerClassificationService.classifyDocument(document);
            case DOCUMINE_OLD, DOCUMINE -> docuMineClassificationService.classifyDocument(document);
            case CLARIFYND -> clarifyndClassificationService.classifyDocument(document);
        }
    }


    // Runs the DocuMine block merge on every page, using the clean rulings without text rulings.
    private void mergeDocuMineOldBlocks(ClassificationDocument document) {

        document.getPages()
                .forEach(page -> docuMineBlockificationService.mergeblocks(page, page.getCleanRulings().withoutTextRulings(), 0, 10));
    }

}

View File

@ -3,6 +3,7 @@ package com.knecon.fforesight.service.layoutparser.processor.services.classifica
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@ -29,10 +30,12 @@ public class DocuMineClassificationService {
private static final Pattern HEADLINE_WITH_SINGLE_IDENTIFER_PATTERN = Pattern.compile("^([0-9]\\.)\\s[a-z][0-9a-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE);
private static final Pattern AT_LEAST_3_CHARS_PATTERN = Pattern.compile("\\p{L}{3,}", Pattern.CASE_INSENSITIVE);
private static final Pattern HEADLINE_PATTERN_WITH_SLASHES = Pattern.compile("^(\\d{1,1}\\.){1,3}\\d{1,2}\\.?\\s[a-z]{1,2}\\/[a-z]{1,2}.*");
private static final Pattern AMOUNT_PATTERN = Pattern.compile("^\\s*\\d+(?:\\.\\d+)?\\s*(?:ml|l|g|kg|mg|cm|mm|km|m|lb|oz|ppm|%|f)\\b", Pattern.CASE_INSENSITIVE);
private static final Pattern TABLE_OR_FIGURE_PATTER = Pattern.compile(
public static final Pattern AMOUNT_PATTERN = Pattern.compile("^\\s*\\d+(?:\\.\\d+)?\\s*(?:ml|l|g|kg|mg|cm|cm2|cm3|mm|mm2|mm3|km|km2|m|m2|m3|lb|oz|ppm|dpm|days|weeks|months|%|f)\\b",
Pattern.CASE_INSENSITIVE);
private static final Pattern TABLE_OR_FIGURE_PATTERN = Pattern.compile(
"^\\s*(?:table|continued\\s+table|appendix|figure)\\s+(?:[xvi]+|[a-z0-9]{1,3}(?:\\.[0-9]{1,3})*(?:-[0-9]{1,3})?)\\b",
Pattern.CASE_INSENSITIVE);
private static final Pattern ALPHANUMERIC = Pattern.compile("[a-zA-Z0-9]");
public static final int SEPARATION_THRESHOLD = 10; // if the min distance between a textblock and all its surrounding blocks, the regexes can be more lenient.
public static final int SURROUNDING_BLOCKS_RADIUS = 3; // number of surrounding blocks before and after the current textblock to be tested
@ -78,6 +81,9 @@ public class DocuMineClassificationService {
if (i == originalIndex) {
continue;
}
if (textBlocks.get(i).getText().length() <= 1) {
continue;
}
surroundingBlocks.add(textBlocks.get(i));
}
return surroundingBlocks;
@ -98,10 +104,9 @@ public class DocuMineClassificationService {
Matcher atLeast3Matcher = AT_LEAST_3_CHARS_PATTERN.matcher(textBlock.toString());
Matcher headlineWithSlashesMatcher = HEADLINE_PATTERN_WITH_SLASHES.matcher(textBlock.toString());
Matcher amountMatcher = AMOUNT_PATTERN.matcher(textBlock.toString());
Matcher tableOrFigureMatcher = TABLE_OR_FIGURE_PATTER.matcher(textBlock.toString());
Matcher tableOrFigureMatcher = TABLE_OR_FIGURE_PATTERN.matcher(textBlock.toString());
Matcher headlineWithSingleIdentifierMatcher = HEADLINE_WITH_SINGLE_IDENTIFER_PATTERN.matcher(textBlock.toString());
boolean isAtLeast3Characters = atLeast3Matcher.reset().find();
boolean isTocItem = textBlock.getText().contains("..............");
boolean headlineWithSlashesMatches = headlineWithSlashesMatcher.reset().matches();
boolean isAmount = amountMatcher.reset().find();
int charCount = countChars(textBlock);
@ -112,35 +117,22 @@ public class DocuMineClassificationService {
headlineClassificationService.setLastHeadlineFromOutline(textBlock);
return;
}
if (textBlock.getClassification() != null && (textBlock.getClassification().equals(PageBlockType.HEADER)
|| textBlock.getClassification().equals(PageBlockType.FOOTER)
|| textBlock.getClassification().equals(PageBlockType.TABLE_OF_CONTENTS_ITEM))) {
return;
}
if (document.getFontSizeCounter().getMostPopular() == null) {
textBlock.setClassification(PageBlockType.PARAGRAPH);
return;
}
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) //
|| (PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) //
&& (document.getFontSizeCounter().getMostPopular() == null //
|| textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular()))) {
textBlock.setClassification(PageBlockType.HEADER);
} else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER)
|| (PositionUtils.isUnderBodyTextFrame(bodyTextFrame,
textBlock,
page.getRotation())
&& (document.getFontSizeCounter().getMostPopular()
== null
|| textBlock.getHighestFontSize()
<= document.getFontSizeCounter()
.getMostPopular()))
|| HeaderFooterDetection.isLikelyFooter(textBlock, document, page)) {
textBlock.setClassification(PageBlockType.FOOTER);
} else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter().getMostPopular()) > 2.5
&& textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks().size() == 1)) {
if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter().getMostPopular()) > 2.5
&& textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks().size() == 1)) {
if (!Pattern.matches("[0-9]+", textBlock.toString())) {
textBlock.setClassification(PageBlockType.TITLE);
}
} else if (textBlock.getText().length() > 5
&& (textBlock.getMostPopularWordHeight() > document.getTextHeightCounter().getMostPopular()
|| textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular())
&& greaterOrEqualFontThanDocumentAverage(textBlock, document)
&& PositionUtils.getApproxLineCount(textBlock) < 5.9
&& ((textBlock.getMostPopularWordStyle().contains("bold") || textBlock.isUnderlined())//
&& Character.isDigit(textBlock.toString().charAt(0)) //
@ -152,18 +144,19 @@ public class DocuMineClassificationService {
|| textBlock.toString().startsWith("TABLE"))
&& !textBlock.toString().endsWith(":")
&& isAtLeast3Characters
&& !isTocItem
&& !isAmount
&& enoughChars) {
setAsHeadline(headlineClassificationService, textBlock, document, headlineFontSizes);
} else if (isAllCaps(textBlock)
&& ALPHANUMERIC.matcher(Character.toString(textBlock.getText().charAt(0))).matches()
&& hasSeparation(textBlock, surroundingBlocks)
&& textBlock.getText().length() > 5
&& isAtLeast3Characters
&& !isAmount
&& enoughChars
&& !textBlock.toString().contains(":")
&& !textBlock.toString().startsWith("(")
&& !textBlock.toString().endsWith(".")
&& PositionUtils.getApproxLineCount(textBlock) < 2.9) {
setAsHeadline(headlineClassificationService, textBlock, document, headlineFontSizes);
@ -171,16 +164,14 @@ public class DocuMineClassificationService {
&& PositionUtils.getApproxLineCount(textBlock) < 2.9
&& isAtLeast3Characters
&& !headlineWithSlashesMatches
&& !isAmount
&& !isTocItem) {
&& !isAmount) {
setAsHeadline(headlineClassificationService, textBlock, document, headlineFontSizes);
} else if (!isTocItem
&& hasSeparation(textBlock, surroundingBlocks)
&& greaterOrEqualThanFontPageAverage(textBlock, page)
&& PositionUtils.getApproxLineCount(textBlock) < 2.9
&& (tableOrFigureMatcher.reset().find() || headlineWithSingleIdentifierMatcher.reset().find())
&& !isAmount
} else if (hasSeparation(textBlock, surroundingBlocks)//
&& greaterOrEqualFontThanPageAverage(textBlock, page)//
&& PositionUtils.getApproxLineCount(textBlock) < 2.9//
&& (tableOrFigureMatcher.reset().find() || headlineWithSingleIdentifierMatcher.reset().find())//
&& !isAmount//
&& !headlineWithSlashesMatches) {
setAsHeadline(headlineClassificationService, textBlock, document, headlineFontSizes);
@ -222,13 +213,20 @@ public class DocuMineClassificationService {
}
private static boolean greaterOrEqualThanFontPageAverage(TextPageBlock textBlock, ClassificationPage page) {
private static boolean greaterOrEqualFontThanPageAverage(TextPageBlock textBlock, ClassificationPage page) {
return textBlock.getMostPopularWordHeight() >= page.getTextHeightCounter().getMostPopular() //
|| textBlock.getMostPopularWordFontSize() >= page.getFontSizeCounter().getMostPopular();
}
/**
 * Checks whether the block's most popular word height or most popular word font size exceeds the
 * document-wide most popular value.
 * <p>
 * Note: despite the "greaterOrEqual" in the name, both comparisons are strict ({@code >}).
 *
 * @param textBlock the block to inspect
 * @param document  the document providing the text-height and font-size counters
 * @return {@code true} if either the word height or the word font size is strictly larger than the document's most popular value
 */
private static boolean greaterOrEqualFontThanDocumentAverage(TextPageBlock textBlock, ClassificationDocument document) {
    if (textBlock.getMostPopularWordHeight() > document.getTextHeightCounter().getMostPopular()) {
        return true;
    }
    return textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular();
}
private static boolean isAllCaps(TextPageBlock textBlock) {
return textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT));
@ -246,8 +244,7 @@ public class DocuMineClassificationService {
return surroundingBlocks.stream()
.mapToDouble(surroundingBlock -> calculateSeparation(textBlock, surroundingBlock))
.min()
.orElse(Double.MAX_VALUE);
.min().orElse(Double.MAX_VALUE);
}

View File

@ -0,0 +1,55 @@
package com.knecon.fforesight.service.layoutparser.processor.services.classification;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
import lombok.AccessLevel;
import lombok.RequiredArgsConstructor;
import lombok.experimental.FieldDefaults;
/**
 * Classifies text blocks as {@link PageBlockType#HEADER} or {@link PageBlockType#FOOTER}.
 * A block qualifies either through marked content of the matching type or through its position
 * relative to the page's body text frame combined with a font size not larger than the document's
 * most popular font size.
 */
@Service
@RequiredArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class HeaderFooterClassificationService {

    /**
     * Visits every {@link TextPageBlock} on every page of the document and marks headers and footers.
     * Blocks that are neither keep their current classification.
     *
     * @param document the document whose pages are classified in place
     */
    public void classifyHeadersAndFooters(ClassificationDocument document) {

        for (ClassificationPage page : document.getPages()) {
            page.getTextBlocks()
                    .stream()
                    .filter(TextPageBlock.class::isInstance)
                    .map(TextPageBlock.class::cast)
                    .forEach(textBlock -> classifyBlock(document, page, textBlock));
        }
    }


    /**
     * Sets HEADER or FOOTER on the block when the respective rule matches; the header rule wins if both would match.
     */
    private static void classifyBlock(ClassificationDocument document, ClassificationPage page, TextPageBlock textBlock) {

        if (isHeader(document, page, textBlock)) {
            textBlock.setClassification(PageBlockType.HEADER);
            return;
        }
        if (isFooter(document, page, textBlock)) {
            textBlock.setClassification(PageBlockType.FOOTER);
        }
    }


    /**
     * A block is a header if it intersects header marked content, or lies over the body text frame with a small font.
     */
    private static boolean isHeader(ClassificationDocument document, ClassificationPage page, TextPageBlock textBlock) {

        if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER)) {
            return true;
        }
        return PositionUtils.isOverBodyTextFrame(page.getBodyTextFrame(), textBlock, page.getRotation()) && smallerFontThanDocAverage(document, textBlock);
    }


    /**
     * A block is a footer if it intersects footer marked content, or lies under the body text frame with a small font.
     */
    private static boolean isFooter(ClassificationDocument document, ClassificationPage page, TextPageBlock textBlock) {

        if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER)) {
            return true;
        }
        return PositionUtils.isUnderBodyTextFrame(page.getBodyTextFrame(), textBlock, page.getRotation()) && smallerFontThanDocAverage(document, textBlock);
    }


    /**
     * Returns {@code true} when the document has no most popular font size, or when the block's highest
     * font size does not exceed it.
     */
    private static boolean smallerFontThanDocAverage(ClassificationDocument document, TextPageBlock textBlock) {

        if (document.getFontSizeCounter().getMostPopular() == null) {
            return true;
        }
        return textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular();
    }

}

View File

@ -61,6 +61,15 @@ public class RedactManagerClassificationService {
headlineClassificationService.setLastHeadlineFromOutline(textBlock);
return;
}
if (textBlock.getClassification() != null && (textBlock.getClassification().equals(PageBlockType.HEADER)
|| textBlock.getClassification().equals(PageBlockType.FOOTER)
|| textBlock.getClassification().equals(PageBlockType.TABLE_OF_CONTENTS_ITEM))) {
return;
}
if (document.getFontSizeCounter().getMostPopular() == null) {
textBlock.setClassification(PageBlockType.PARAGRAPH);
return;
}
if (document.getFontSizeCounter().getMostPopular() == null) {
textBlock.setClassification(PageBlockType.PARAGRAPH);
return;
@ -73,21 +82,8 @@ public class RedactManagerClassificationService {
textBlock.setClassification(PageBlockType.PARAGRAPH);
return;
}
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER)
|| PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null
|| textBlock.getHighestFontSize() <= document.getFontSizeCounter()
.getMostPopular())) {
textBlock.setClassification(PageBlockType.HEADER);
} else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER)
|| PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null
|| textBlock.getHighestFontSize() <= document.getFontSizeCounter()
.getMostPopular())) {
textBlock.setClassification(PageBlockType.FOOTER);
} else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter().getMostPopular()) > 2.5
&& textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks().size() == 1)) {
if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter().getMostPopular()) > 2.5
&& textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks().size() == 1)) {
if (!Pattern.matches("[0-9]+", textBlock.toString())) {
textBlock.setClassification(PageBlockType.TITLE);

View File

@ -0,0 +1,370 @@
package com.knecon.fforesight.service.layoutparser.processor.services.classification;
import static com.knecon.fforesight.service.layoutparser.processor.services.classification.DocuMineClassificationService.AMOUNT_PATTERN;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.UnionFind;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextBlockOnPage;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequenceComparator;
import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities;
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutDebugLayer;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@Service
public class TableOfContentsClassificationService {
private static final int MAX_PAGE_COUNT = 10; // maximum length of a toc to avoid runaway classification
private static final int SURROUNDING_BLOCKS_RADIUS = 10; // number of blocks to look ahead
private static final int MINIMUM_MATCHES = 2; // minimum cluster size
public static final int INTERSECTION_TOLERANCE = 2; // equality threshold for x intersection
public static final int DENSITY_THRESHOLD_COUNT = 10; // describes the minimum density, at least this many entries per page height are required
private static final Pattern NUMERIC = Pattern.compile("[0-9]+");
/**
 * Scans the document for table-of-contents headlines and classifies the entries following them.
 * <p>
 * For every block recognised by {@link #isTOCHeadline}, the following blocks are handed to
 * {@code identifyTOCItems}. If more than one following block belongs to the table of contents, the
 * headline is classified as {@link PageBlockType#H1} and the scan resumes after the last block of
 * that table of contents.
 *
 * @param document the document whose blocks are classified in place
 */
@SuppressWarnings("checkstyle:ModifiedControlVariable")
public void classifyTableOfContents(ClassificationDocument document) {

    List<TextBlockOnPage> textBlocks = buildBlocksPerPage(document);
    // A headline in the very last position has no following blocks, so the scan stops one block early.
    for (int i = 0; i < textBlocks.size() - 1; i++) {
        TextBlockOnPage textBlock = textBlocks.get(i);
        if (!isTOCHeadline(textBlock)) {
            continue;
        }
        // identifyTOCItems returns an absolute index into textBlocks (i + 1 if nothing was found),
        // so the number of blocks covered has to be computed relative to the headline position.
        int lastTocIndex = identifyTOCItems(i + 1, textBlocks, document);
        int coveredBlocks = lastTocIndex - i;
        if (coveredBlocks > 1) {
            textBlock.textBlock().setClassification(PageBlockType.H1);
            // continue right behind the table of contents instead of adding the absolute index to i
            i = lastTocIndex;
        }
    }
}
/**
 * Tries to identify table-of-contents entries beginning at {@code start}.
 * <p>
 * Numbers that could be page numbers are extracted from the blocks and clustered by horizontal overlap.
 * The scan keeps extending as long as a block within {@code SURROUNDING_BLOCKS_RADIUS} of the last
 * candidate contains a number intersecting the current rightmost cluster. All non-headline blocks from
 * {@code start} up to the last block owning a number of the final cluster are classified as
 * {@link PageBlockType#TABLE_OF_CONTENTS_ITEM}.
 *
 * @param start      absolute index of the first block after the table-of-contents headline
 * @param textBlocks all candidate blocks of the document in order
 * @param document   the document, used for its page count and debug layer
 * @return the absolute index of the last candidate block, or {@code start} if {@code start} is out of range
 *         or no sufficiently large number cluster exists
 */
private int identifyTOCItems(int start, List<TextBlockOnPage> textBlocks, ClassificationDocument document) {

    // Nothing follows the headline: report "nothing found" instead of failing on textBlocks.get(start).
    if (start >= textBlocks.size()) {
        return start;
    }

    ClassificationPage startPage = textBlocks.get(start).page();
    List<TextBlockOnPage> initialLookAhead = textBlocks.subList(start, Math.min(start + SURROUNDING_BLOCKS_RADIUS, textBlocks.size()));
    HashMap<TextPositionSequence, TextBlockOnPage> lookup = new HashMap<>();
    List<TextPositionSequence> numbers = extractNumbers(initialLookAhead, lookup, document.getPages().size());
    TocNumberFinder tocNumberFinder = new TocNumberFinder(numbers, lookup);

    int lastCandidate = start;
    // The upper bound moves with lastCandidate, so the window slides forward while matches keep coming.
    for (int i = start; i < Math.min(lastCandidate + SURROUNDING_BLOCKS_RADIUS, textBlocks.size()); i++) {
        TextBlockOnPage textBlockOnPage = textBlocks.get(i);

        // stop once the scan runs more than MAX_PAGE_COUNT pages past the start page
        if (textBlockOnPage.page().getPageNumber() - MAX_PAGE_COUNT > startPage.getPageNumber()) {
            break;
        }

        // an already classified headline ends the table of contents, unless it introduces a list of tables, appendices or figures
        if (textBlockOnPage.textBlock().getClassification() != null //
                && textBlockOnPage.textBlock().getClassification().isHeadline() //
                && !(textBlockOnPage.textBlock().getText().startsWith("TABLES") //
                || textBlockOnPage.textBlock().getText().startsWith("APPENDICES") //
                || textBlockOnPage.textBlock().getText().startsWith("FIGURES"))) {
            log.debug("hit an outline headline, stop immediately.");
            lastCandidate = i - 1;
            break;
        }

        List<TextPositionSequence> numbersFromBlock = extractNumbers(textBlockOnPage, lookup, document.getPages().size());
        List<TextPositionSequence> currentRightmostCluster = tocNumberFinder.getCurrentRightmostCluster();

        if (currentRightmostCluster.size() < MINIMUM_MATCHES) {
            log.debug("No numbers indicating a table of contents here.");
            return start;
        }

        if (anyIntersection(currentRightmostCluster, numbersFromBlock, lookup)) {
            lastCandidate = i;
            numbersFromBlock.forEach(tocNumberFinder::add);
        }
    }

    addVisualization(document.getLayoutDebugLayer(), tocNumberFinder, lookup);

    Set<TextBlockOnPage> blocksWithNumberInCluster = tocNumberFinder.getCurrentRightmostCluster()
            .stream()
            .map(lookup::get)
            .collect(Collectors.toSet());

    // the last block that actually owns a number of the final cluster closes the table of contents
    int lastConfirmed = start;
    for (int i = start; i < lastCandidate + 1; i++) {
        TextBlockOnPage textBlockOnPage = textBlocks.get(i);
        if (blocksWithNumberInCluster.contains(textBlockOnPage)) {
            lastConfirmed = i;
        }
    }

    textBlocks.subList(start, lastConfirmed + 1)
            .stream()
            .filter(block -> (block.textBlock().getClassification() == null || !block.textBlock().getClassification().isHeadline()))
            .forEach(textBlockOnPage -> textBlockOnPage.textBlock().setClassification(PageBlockType.TABLE_OF_CONTENTS_ITEM));

    return lastCandidate;
}
/**
 * Groups the numbers of the current rightmost cluster by the page number of their owning block and
 * passes each group to the debug layer.
 */
private static void addVisualization(LayoutDebugLayer layoutDebugLayer, TocNumberFinder tocNumberFinder, Map<TextPositionSequence, TextBlockOnPage> lookup) {

    Map<Integer, List<TextPositionSequence>> numbersPerPage = new HashMap<>();
    for (TextPositionSequence number : tocNumberFinder.getCurrentRightmostCluster()) {
        int pageNumber = lookup.get(number).page().getPageNumber();
        numbersPerPage.computeIfAbsent(pageNumber, ignored -> new ArrayList<>()).add(number);
    }
    numbersPerPage.forEach((pageNumber, numbersOnPage) -> layoutDebugLayer.addTocPages(numbersOnPage, pageNumber));
}
/**
 * Returns {@code true} as soon as any number of the first collection matches any number of the second one.
 */
private static boolean anyIntersection(Collection<TextPositionSequence> numbers1,
                                       Collection<TextPositionSequence> numbers2,
                                       Map<TextPositionSequence, TextBlockOnPage> lookup) {

    for (TextPositionSequence numberFromCluster : numbers1) {
        for (TextPositionSequence numberFromBlock : numbers2) {
            if (matches(numberFromBlock, numberFromCluster, lookup)) {
                return true;
            }
        }
    }
    return false;
}
/**
 * Collects the page-number candidates of all given blocks, in block order, registering each one in {@code lookup}.
 */
private static List<TextPositionSequence> extractNumbers(List<TextBlockOnPage> textBlocks, Map<TextPositionSequence, TextBlockOnPage> lookup, int numberOfPages) {

    return textBlocks.stream()
            .flatMap(textBlock -> extractNumbers(textBlock, lookup, numberOfPages).stream())
            .collect(Collectors.toCollection(LinkedList::new));
}
/**
 * Extracts the words of a block that could be page numbers.
 * <p>
 * A word qualifies if it consists of digits only, the text starting at the word does not match
 * {@code AMOUNT_PATTERN}, and its value lies between 1 and {@code numberOfPages} (inclusive).
 * Every accepted word is also stored in {@code lookup}, mapped to the given block.
 *
 * @param textBlock     the block whose words are inspected
 * @param lookup        receives an entry for every accepted word
 * @param numberOfPages upper bound for a plausible page number
 * @return the accepted words in their original order
 */
private static List<TextPositionSequence> extractNumbers(TextBlockOnPage textBlock, Map<TextPositionSequence, TextBlockOnPage> lookup, int numberOfPages) {

    List<TextPositionSequence> pageNumberCandidates = new LinkedList<>();
    List<TextPositionSequence> words = textBlock.textBlock().getSequences();

    for (int index = 0; index < words.size(); index++) {
        TextPositionSequence candidate = words.get(index);

        boolean digitsOnly = NUMERIC.matcher(candidate).matches();
        if (!digitsOnly || AMOUNT_PATTERN.matcher(getSurroundingString(index, words)).matches()) {
            continue;
        }

        int pageNumber;
        try {
            pageNumber = Integer.parseInt(candidate.toString());
        } catch (NumberFormatException e) {
            log.debug("That wasn't a number! Should not happen, due to numeric check beforehand.");
            continue;
        }

        if (pageNumber >= 1 && pageNumber <= numberOfPages) {
            lookup.put(candidate, textBlock);
            pageNumberCandidates.add(candidate);
        }
    }
    return pageNumberCandidates;
}
/**
 * Joins the word at index {@code i} and up to four following words with single spaces.
 */
private static CharSequence getSurroundingString(int i, List<TextPositionSequence> sequences) {

    int end = Math.min(i + 5, sequences.size());
    StringBuilder surrounding = new StringBuilder();
    for (int index = i; index < end; index++) {
        if (index > i) {
            surrounding.append(" ");
        }
        surrounding.append(sequences.get(index).toString());
    }
    return surrounding.toString();
}
/**
 * Two numbers match if they share the same text direction and intersect horizontally (direction adjusted)
 * within {@code INTERSECTION_TOLERANCE}. The {@code lookup} parameter is not used.
 */
private static boolean matches(TextPositionSequence number1, TextPositionSequence number2, Map<TextPositionSequence, TextBlockOnPage> lookup) {

    return number1.getDir() == number2.getDir() && number1.intersectsXDirAdj(number2, INTERSECTION_TOLERANCE);
}
/**
 * Decides whether a block looks like the headline of a table of contents or of a list of tables,
 * appendices or figures.
 * <p>
 * Texts longer than 50 characters are rejected. The remaining text is lower-cased and stripped of all
 * whitespace before it is compared.
 */
private boolean isTOCHeadline(TextBlockOnPage textBlock) {

    String rawText = textBlock.textBlock().getText();
    if (rawText.length() > 50) {
        return false;
    }

    String text = TextNormalizationUtilities.removeAllWhitespaces(rawText.toLowerCase(Locale.ENGLISH));

    // "content(s)" with at most a few extra characters
    if (text.contains("content") && text.length() < "content".length() + 6) {
        return true;
    }
    // "table of content(s)"; DENSITY_THRESHOLD_COUNT is reused here as the allowed number of extra characters
    if (text.contains("tableofcontent") && text.length() < "tableofcontent".length() + DENSITY_THRESHOLD_COUNT) {
        return true;
    }
    return switch (text) {
        case "tables", "appendices", "figures" -> true;
        default -> false;
    };
}
/**
 * Flattens the document into one list of text blocks paired with their page, in page order.
 * Blocks already classified as HEADER or FOOTER and blocks that are not {@link TextPageBlock}s are left out.
 */
private List<TextBlockOnPage> buildBlocksPerPage(ClassificationDocument document) {

    List<TextBlockOnPage> blocks = new ArrayList<>();
    for (ClassificationPage page : document.getPages()) {
        page.getTextBlocks()
                .stream()
                .filter(TextPageBlock.class::isInstance)
                .map(TextPageBlock.class::cast)
                .filter(textBlock -> !PageBlockType.HEADER.equals(textBlock.getClassification()) //
                        && !PageBlockType.FOOTER.equals(textBlock.getClassification()))
                .map(textBlock -> new TextBlockOnPage(page, textBlock))
                .forEach(blocks::add);
    }
    return blocks;
}
private static class TocNumberFinder {
final UnionFind<TextPositionSequence> numberClusters;
final HashMap<TextPositionSequence, TextBlockOnPage> lookup;
TocNumberFinder(List<TextPositionSequence> blocks, HashMap<TextPositionSequence, TextBlockOnPage> lookup) {
this.numberClusters = new UnionFind<>(new HashSet<>(blocks));
for (int i = 0; i < blocks.size(); i++) {
for (int j = i + 1; j < blocks.size(); j++) {
if (matches(blocks.get(i), blocks.get(j), lookup)) {
numberClusters.union(blocks.get(i), blocks.get(j));
}
}
}
this.lookup = lookup;
}
/**
 * Adds a number to the clustering and unions it with every element it matches.
 * Numbers that are already known are ignored.
 */
public void add(TextPositionSequence number) {

    boolean alreadyKnown = numberClusters.getElements().contains(number);
    if (!alreadyKnown) {
        numberClusters.addElement(number);
        for (TextPositionSequence existing : numberClusters.getElements()) {
            if (matches(number, existing, lookup)) {
                numberClusters.union(existing, number);
            }
        }
    }
}
/**
 * Returns the cluster whose first number reaches furthest to the right.
 * <p>
 * Only clusters with more than {@code MINIMUM_MATCHES} numbers are considered, both before and after
 * sorting them and removing outliers. If no cluster qualifies, an empty list is returned.
 */
public List<TextPositionSequence> getCurrentRightmostCluster() {

    Comparator<List<TextPositionSequence>> byRightEdgeOfFirstNumber = Comparator.comparingDouble(cluster -> cluster.get(0).getBBox().getMaxX());

    // the density filter (filterByMinimumDensity) is intentionally not part of this pipeline
    return numberClusters.getGroups()
            .stream()
            .filter(group -> group.size() > MINIMUM_MATCHES)
            .map(group -> removeOutliers(group.stream()
                                                 .sorted(new TextPositionSequenceComparator(lookup))
                                                 .toList()))
            .filter(cleaned -> cleaned.size() > MINIMUM_MATCHES)
            .max(byRightEdgeOfFirstNumber)
            .orElse(Collections.emptyList());
}
// does not seem to be doing much, ideally instead of using the height of the blocks i would like to use the height, beginning from the MainBody top,
// but as the MainBody is often wrong, this results in some numbers being discarded even though they are correct.
// private List<TextPositionSequence> filterByMinimumDensity(List<TextPositionSequence> numbers) {
//
// Map<ClassificationPage, List<TextPositionSequence>> clustersPerPage = numbers.stream()
// .collect(Collectors.groupingBy(number -> lookup.get(number).page()));
//
// List<TextPositionSequence> result = new ArrayList<>(numbers.size());
// clustersPerPage.keySet()
// .stream()
// .sorted(Comparator.comparingInt(ClassificationPage::getPageNumber))
// .forEach(page -> {
// var numbersOnPage = clustersPerPage.get(page);
//
// double height = numbersOnPage.stream()
// .map(BoundingBox::getBBox)
// .collect(RectangleTransformations.collectBBox()).getHeight();
//
// double count = numbersOnPage.size();
//
// if ((count / height) >= (DENSITY_THRESHOLD_COUNT / page.getPageHeight())) {
// result.addAll(numbers);
// }
// });
// return result;
// }
/**
 * Removes numbers that break the ascending order of a sorted list of page-number candidates.
 * <p>
 * The first number is always kept. An inner number is dropped only if it is not strictly between its
 * neighbours and {@link #isBetterWithout} agrees that the sequence improves without it. The last number
 * is kept if it is not smaller than its predecessor.
 *
 * @param numbers the candidates in reading order
 * @return a new list containing the retained numbers; lists with fewer than two elements are copied unchanged
 */
public List<TextPositionSequence> removeOutliers(List<TextPositionSequence> numbers) {

    // With fewer than two elements there is nothing to compare; without this guard an empty list
    // fails on get(0) and a single element is added twice (once as first, once as last).
    if (numbers.size() < 2) {
        return new ArrayList<>(numbers);
    }

    List<TextPositionSequence> result = new ArrayList<>();
    result.add(numbers.get(0));

    for (int i = 1; i < numbers.size() - 1; i++) {
        int prev = getNumberAsInt(numbers, i - 1);
        int curr = getNumberAsInt(numbers, i);
        int next = getNumberAsInt(numbers, i + 1);

        if (!(curr <= prev || curr >= next) || !isBetterWithout(numbers, i)) {
            result.add(numbers.get(i));
        }
    }

    int lastIndex = numbers.size() - 1;
    if (getNumberAsInt(numbers, lastIndex) >= getNumberAsInt(numbers, lastIndex - 1)) {
        result.add(numbers.get(lastIndex));
    }

    return result;
}
/**
 * Helper method to check if removing the number at index {@code i} results in a better order.
 * <p>
 * Removal is considered better when the neighbours themselves are in ascending order and the direct
 * step from the previous to the next number is shorter than the detour via the current number, i.e.
 * the current number lies outside the range spanned by its neighbours.
 *
 * @param numbers the candidates in reading order
 * @param i       index of the number to evaluate
 * @return {@code true} if the sequence is better without the number; always {@code false} for the first and last index
 */
public static boolean isBetterWithout(List<TextPositionSequence> numbers, int i) {

    if (i == 0 || i == numbers.size() - 1) {
        return false;
    }
    // prev must be the predecessor (i - 1); reading index i here made prev == curr and the check always false
    int prev = getNumberAsInt(numbers, i - 1);
    int curr = getNumberAsInt(numbers, i);
    int next = getNumberAsInt(numbers, i + 1);

    return (prev <= next) && (Math.abs(prev - next) < Math.abs(prev - curr) + Math.abs(curr - next));
}
}
/**
 * Parses the text of the element at index {@code i} as an int.
 *
 * @throws NumberFormatException if the text is not a parsable int
 */
private static int getNumberAsInt(List<TextPositionSequence> numbers, int i) {

    TextPositionSequence number = numbers.get(i);
    String digits = number.toString();
    return Integer.parseInt(digits);
}
}

View File

@ -201,7 +201,7 @@ public class PDFLinesTextStripper extends PDFTextStripper {
try {
if (stroke && !getGraphicsState().getStrokingColor().isPattern() && isBlack(getGraphicsState().getStrokingColor()) || //
!stroke && !getGraphicsState().getNonStrokingColor().isPattern() && isBlack(getGraphicsState().getNonStrokingColor())) {
!stroke && !getGraphicsState().getNonStrokingColor().isPattern() && isBlack(getGraphicsState().getNonStrokingColor())) {
rulings.addAll(path);
}
} catch (UnsupportedOperationException e) {
@ -279,9 +279,9 @@ public class PDFLinesTextStripper extends PDFTextStripper {
startIndex = i;
}
if (i > 0 && (textPositions.get(i).getUnicode().equals(" ") || textPositions.get(i).getUnicode().equals("\u00A0") || textPositions.get(i)
.getUnicode()
.equals("\t")) && i <= textPositions.size() - 2) {
if (i > 0
&& (textPositions.get(i).getUnicode().equals(" ") || textPositions.get(i).getUnicode().equals("\u00A0") || textPositions.get(i).getUnicode().equals("\t"))
&& i <= textPositions.size() - 2) {
List<TextPosition> sublist = textPositions.subList(startIndex, i);
if (checkIfSequenceContainsOnlyWhitespaces(sublist)) {
@ -296,20 +296,31 @@ public class PDFLinesTextStripper extends PDFTextStripper {
}
startIndex = i + 1;
}
if (isDottedLineFollowedByWord(textPositions, i, startIndex)) {
List<TextPosition> sublist = textPositions.subList(startIndex, i);
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart));
startIndex = i;
}
if (isWordFollowedByDottedLine(textPositions, i, startIndex)) {
List<TextPosition> sublist = textPositions.subList(startIndex, i - 2);
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart));
startIndex = i - 2;
}
}
List<TextPosition> sublist = textPositions.subList(startIndex, textPositions.size());
if (!sublist.isEmpty() && (sublist.get(sublist.size() - 1).getUnicode().equals(" ") || sublist.get(sublist.size() - 1)
.getUnicode()
.equals("\u00A0") || sublist.get(sublist.size() - 1).getUnicode().equals("\t"))) {
if (!sublist.isEmpty() && (sublist.get(sublist.size() - 1).getUnicode().equals(" ")
|| sublist.get(sublist.size() - 1).getUnicode().equals("\u00A0")
|| sublist.get(sublist.size() - 1).getUnicode().equals("\t"))) {
sublist = sublist.subList(0, sublist.size() - 1);
}
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0)
.getUnicode()
.equals("\t")))) {
if (previous != null && sublist.get(0).getYDirAdj() == previous.getYDirAdj() && sublist.get(0)
.getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()) < 0.01) {
if (previous != null
&& sublist.get(0).getYDirAdj() == previous.getYDirAdj()
&& sublist.get(0).getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()) < 0.01) {
for (TextPosition t : sublist) {
textPositionSequences.get(textPositionSequences.size() - 1).add(t);
}
@ -317,10 +328,31 @@ public class PDFLinesTextStripper extends PDFTextStripper {
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, isParagraphStart));
}
}
super.writeString(text);
}
/**
 * Detects the start of a dotted line directly after other text: the positions {@code i-2}, {@code i-1}
 * and {@code i} are all "." while position {@code i-3} is not, and {@code i} lies at least four
 * positions behind {@code startIndex}.
 */
private boolean isWordFollowedByDottedLine(List<TextPosition> textPositions, int i, int startIndex) {

    if (i - startIndex < 4) {
        return false;
    }
    // positions i, i-1 and i-2 have to be dots
    for (int back = 0; back <= 2; back++) {
        if (!textPositions.get(i - back).getUnicode().equals(".")) {
            return false;
        }
    }
    // the character before the three dots must not be a dot itself
    return !textPositions.get(i - 3).getUnicode().equals(".");
}
/**
 * Detects the end of a dotted line: position {@code i} is not "." while the three positions before it
 * are, and {@code i} lies at least four positions behind {@code startIndex}.
 */
private static boolean isDottedLineFollowedByWord(List<TextPosition> textPositions, int i, int startIndex) {

    if (i - startIndex < 4 || textPositions.get(i).getUnicode().equals(".")) {
        return false;
    }
    // positions i-1, i-2 and i-3 have to be dots
    for (int back = 1; back <= 3; back++) {
        if (!textPositions.get(i - back).getUnicode().equals(".")) {
            return false;
        }
    }
    return true;
}
public boolean checkIfCurrentPositionIsToTheRightOfPreviousPosition(int i, List<TextPosition> textPositions) {
return i > 0 && textPositions.get(i).getXDirAdj() < textPositions.get(i - 1).getXDirAdj();
@ -337,8 +369,9 @@ public class PDFLinesTextStripper extends PDFTextStripper {
public boolean checkIfGapSizeBetweenCharactersSmallerThanMaximum(RedTextPosition previous, List<TextPosition> sublist, float maximumGapSize) {
return previous != null && sublist.get(0).getYDirAdj() == previous.getYDirAdj() && sublist.get(0)
.getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()) < maximumGapSize;
return previous != null
&& sublist.get(0).getYDirAdj() == previous.getYDirAdj()
&& sublist.get(0).getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()) < maximumGapSize;
}

View File

@ -36,6 +36,9 @@ public class LayoutGridService {
LayoutGrid layoutGrid = createLayoutGrid(document);
Outline outline = OutlineMapper.createOutline(document);
layoutGrid.setVisibleByDefault(layerVisibilityDefaultValue);
document.getLayoutDebugLayer().addSentenceVisualization(document.getTextBlock());
if (document.getLayoutDebugLayer().isActive()) {
viewerDocumentService.addLayerGroups(originFile, destinationFile, List.of(layoutGrid, document.getLayoutDebugLayer()), outline);
} else {

View File

@ -10,6 +10,7 @@ public final class TextNormalizationUtilities {
public static final Pattern hyphenLineBreaks = Pattern.compile("[-~‐‒⁻−﹣゠⁓‑\\u00AD][\\r\\n]+");
public static final Pattern linebreaks = Pattern.compile("[\\r\\n]+");
public static final Pattern doubleWhitespaces = Pattern.compile("\\s{2,}");
public static final Pattern WHITESPACE_REMOVAL = Pattern.compile("\\s+");
public String cleanString(String value) {
@ -36,4 +37,11 @@ public final class TextNormalizationUtilities {
return linebreaks.matcher(value).replaceAll(" ");
}
/**
 * Deletes every run of whitespace matched by {@code WHITESPACE_REMOVAL} from the given string.
 *
 * @param value the text to strip
 * @return the text without any whitespace
 */
public String removeAllWhitespaces(String value) {

    final String nothing = "";
    return WHITESPACE_REMOVAL.matcher(value).replaceAll(nothing);
}
}

View File

@ -2,6 +2,7 @@ package com.knecon.fforesight.service.layoutparser.processor.utils;
import java.awt.geom.Rectangle2D;
import java.awt.geom.RectangularShape;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Comparator;
import java.util.HashSet;
@ -54,18 +55,26 @@ public class TextPositionOperations {
private List<TextPositionSequence> sortUsingLineDetection(Set<TextPositionSequence> sequences) {
return sortLines(groupByLine(sequences));
}
public List<TextPositionSequence> sortLines(Collection<Set<TextPositionSequence>> lines) {
return lines.stream()
.map(TextPositionOperations::sortByXDirAdj)
.filter(line -> !line.isEmpty())
.sorted(Comparator.comparing(line -> line.get(0), COMPARATOR_DIR_ADJ))
.flatMap(Collection::stream)
.toList();
List<List<TextPositionSequence>> lineBlocks = new ArrayList<>();
for (Set<TextPositionSequence> line : lines) {
List<TextPositionSequence> sortedLine = sortByXDirAdj(line);
if (!sortedLine.isEmpty()) {
lineBlocks.add(sortedLine);
}
}
// need to use old sorting, since COMPARATOR_DIR_ADJ is not transitive
QuickSort.sort(lineBlocks, Comparator.comparing(line -> line.get(0), COMPARATOR_DIR_ADJ));
List<TextPositionSequence> list = new ArrayList<>();
for (List<TextPositionSequence> textPositionSequences : lineBlocks) {
list.addAll(textPositionSequences);
}
return list;
}
@ -91,7 +100,7 @@ public class TextPositionOperations {
for (TextPositionSequence sequence : sequences) {
for (TextPositionSequence sequence2 : sequences) {
if (sequence.equals(sequence2) || unionFind.inSameSet(sequence, sequence2)) {
if (sequence.equals(sequence2)) { // || unionFind.inSameSet(sequence, sequence2)) doing this is actually slower than not doing it
continue;
}

View File

@ -5,9 +5,11 @@ import java.awt.geom.AffineTransform;
import java.awt.geom.Line2D;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.text.BreakIterator;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;
@ -19,6 +21,8 @@ import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Bound
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Line;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
@ -94,6 +98,29 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
}
/**
 * Adds one colored rectangle per bounding box of every sentence of the text block to the sentence
 * visualizations, cycling through the rotating colors sentence by sentence.
 * Sentence boundaries come from an English {@link BreakIterator} applied to the block's search text.
 * Does nothing while the layer is inactive.
 *
 * @param textBlock the text block whose sentences are visualized
 */
public void addSentenceVisualization(TextBlock textBlock) {

    if (!active) {
        return;
    }

    AtomicInteger colorIndex = new AtomicInteger(0);
    BreakIterator boundaries = BreakIterator.getSentenceInstance(Locale.ENGLISH);
    boundaries.setText(textBlock.getSearchText());

    int sentenceStart = 0;
    for (int sentenceEnd = boundaries.next(); sentenceEnd != BreakIterator.DONE; sentenceEnd = boundaries.next()) {
        // boundaries are relative to the search text, so they are shifted by the start of the block's text range
        TextRange sentenceRange = new TextRange(sentenceStart + textBlock.getTextRange().start(), sentenceEnd + textBlock.getTextRange().start());
        sentenceStart = sentenceEnd;

        Color color = getRotatingColor(colorIndex);
        textBlock.getPositionsPerPage(sentenceRange)
                .forEach((page, bboxes) -> {
                    var rectangles = getOrCreateVisualizationsOnPage(page.getNumber(), this.sentences).getColoredRectangles();
                    bboxes.stream()
                            .map(bbox -> new ColoredRectangle(bbox, color, 1))
                            .forEach(rectangles::add);
                });
    }
}
private Color decideOnRulingColor(Ruling ruling) {
return switch (ruling.getClassification()) {
@ -248,7 +275,7 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
.map(Line::getCharacters)
.flatMap(Collection::stream)
.forEach(character -> {
Color color = ROTATING_CHARACTER_COLOR.get(index.getAndIncrement() % ROTATING_CHARACTER_COLOR.size());
Color color = getRotatingColor(index);
Rectangle2D charBBox = character.getTextPosition().getBBoxPdf();
characterVisualizations.getColoredRectangles().add(new ColoredRectangle(charBBox, color, 1));
character.getNeighbors()
@ -263,6 +290,31 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
}
/**
 * Visualizes table-of-contents page numbers on a page: one rectangle per number, followed by one
 * rectangle enclosing all of them. Does nothing while the layer is inactive.
 *
 * @param numbers the page-number words found on the page
 * @param page    the page number the visualizations belong to
 */
public void addTocPages(List<TextPositionSequence> numbers, int page) {

    if (!active) {
        return;
    }

    VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(page, this.tocPages);

    // individual boxes first
    for (TextPositionSequence number : numbers) {
        visualizationsOnPage.getColoredRectangles().add(new ColoredRectangle(number.getBBoxPdf(), LINES_COLOR, 0.5f));
    }

    // then the box spanning all numbers
    var enclosingBox = numbers.stream()
            .map(BoundingBox::getBBoxPdf)
            .collect(RectangleTransformations.collectBBox());
    visualizationsOnPage.getColoredRectangles().add(new ColoredRectangle(enclosingBox, LINES_COLOR, 0.5f));
}
/**
 * Returns the next color of {@code ROTATING_CHARACTER_COLOR} and advances the given index by one.
 */
private static Color getRotatingColor(AtomicInteger index) {

    int position = index.getAndIncrement() % ROTATING_CHARACTER_COLOR.size();
    return ROTATING_CHARACTER_COLOR.get(position);
}
public void addOutlineObjects(List<OutlineObject> outlineObjects, PageInformation pageInformation) {
if (!active) {

View File

@ -0,0 +1,70 @@
package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;
import static org.junit.jupiter.api.Assertions.*;
import java.awt.geom.Rectangle2D;
import org.junit.jupiter.api.Test;
/**
 * Tests the horizontal and vertical distance calculation between bounding boxes,
 * using 10 x 10 boxes placed at different offsets.
 */
class BoundingBoxTest {

    private static final double SIDE = 10;


    /** Creates a 10 x 10 box with its origin at the given coordinates. */
    private static ConcreteBoundingBox squareAt(double x, double y) {

        return new ConcreteBoundingBox(x, y, SIDE, SIDE);
    }


    @Test
    void testHorizontalDistance_NoOverlap() {

        ConcreteBoundingBox left = squareAt(0, 0);
        ConcreteBoundingBox right = squareAt(20, 0);

        assertEquals(10, left.horizontalDistance(right));
    }


    @Test
    void testHorizontalDistance_Overlap() {

        ConcreteBoundingBox left = squareAt(0, 0);
        ConcreteBoundingBox right = squareAt(5, 0);

        assertEquals(0, left.horizontalDistance(right));
    }


    @Test
    void testVerticalDistance_NoOverlap() {

        ConcreteBoundingBox first = squareAt(0, 0);
        ConcreteBoundingBox second = squareAt(0, 20);

        assertEquals(10, first.verticalDistance(second));
    }


    @Test
    void testVerticalDistance_Overlap() {

        ConcreteBoundingBox first = squareAt(0, 0);
        ConcreteBoundingBox second = squareAt(0, 5);

        assertEquals(0, first.verticalDistance(second));
    }


    @Test
    void testVerticalDistance_PartialOverlap() {

        ConcreteBoundingBox first = squareAt(0, 0);
        ConcreteBoundingBox second = squareAt(0, 8);

        assertEquals(0, first.verticalDistance(second));
    }


    @Test
    void testHorizontalDistance_PartialOverlap() {

        ConcreteBoundingBox left = squareAt(0, 0);
        ConcreteBoundingBox right = squareAt(8, 0);

        assertEquals(0, left.horizontalDistance(right));
    }

}

View File

@ -0,0 +1,12 @@
package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;
import java.awt.geom.Rectangle2D;
/**
 * Minimal subclass of {@link BoundingBox} that only sets the inherited {@code bBox} field
 * to a rectangle built from the constructor arguments.
 */
class ConcreteBoundingBox extends BoundingBox {

    /**
     * Creates a box backed by a {@link Rectangle2D.Double}.
     *
     * @param x      x value passed to the rectangle
     * @param y      y value passed to the rectangle
     * @param width  width passed to the rectangle
     * @param height height passed to the rectangle
     */
    ConcreteBoundingBox(double x, double y, double width, double height) {

        bBox = new Rectangle2D.Double(x, y, width, height);
    }
}

View File

@ -46,8 +46,8 @@ public class SimplifiedTextServiceTest
Document document = buildGraph(file);
SimplifiedText simplifiedText = simplifiedSectionTextService.toSimplifiedText(document);
List<SimplifiedSectionText> sectionTexts = simplifiedText.getSectionTexts();
assertThat(sectionTexts.stream().filter(section -> section.getText().equals(footerExample)).collect(Collectors.toList()).size()).isGreaterThan(0);
assertThat(sectionTexts.stream().filter(section -> section.getText().equals(headerExample)).collect(Collectors.toList()).size()).isGreaterThan(0);
assertThat(sectionTexts.stream().filter(section -> section.getText().contains(footerExample)).toList().size()).isGreaterThan(0);
assertThat(sectionTexts.stream().filter(section -> section.getText().contains(headerExample)).toList().size()).isGreaterThan(0);

View File

@ -51,7 +51,7 @@ import lombok.extern.slf4j.Slf4j;
@Slf4j
public class DocumentReadingOrderTest extends BuildDocumentTest {
private static final boolean DRAW_DIR_ADJ_COORDS = true;
private static final boolean DRAW_DIR_ADJ_COORDS = false;
public static final List<LayoutParsingType> LAYOUT_PARSING_TYPES = List.of(LayoutParsingType.DOCUMINE,
LayoutParsingType.DOCUMINE_OLD,
LayoutParsingType.REDACT_MANAGER,
@ -82,7 +82,7 @@ public class DocumentReadingOrderTest extends BuildDocumentTest {
@Disabled
public void drawDirAdjForFile() {
String pdfFile = "/home/kschuettler/Dokumente/Ticket Related/RED-9974/026dc94b019bc2348a4c54f0c6c4516f.ORIGIN.pdf";
String pdfFile = "/home/kschuettler/Dokumente/TestFiles/OCR/VV-331340/VV-331340_OCRED_first15.pdf";
ClassificationDocument classificationDocument = parseLayout(pdfFile, LayoutParsingType.DOCUMINE_OLD);

View File

@ -76,9 +76,10 @@ class TextRangeTest {
assertEquals(List.of(new TextRange(10, 40), new TextRange(40, 100)), startTextRange.split(List.of(40)));
assertEquals(1, startTextRange.split(Collections.emptyList()).size());
assertEquals(1, startTextRange.split(List.of(startTextRange.start())).size());
assertEquals(1, startTextRange.split(List.of(100)).size());
assertThrows(IndexOutOfBoundsException.class, () -> startTextRange.split(Collections.singletonList(0)));
assertThrows(IndexOutOfBoundsException.class, () -> startTextRange.split(Collections.singletonList(100)));
assertThrows(IndexOutOfBoundsException.class, () -> startTextRange.split(List.of(12, 40, 100)));
assertThrows(IndexOutOfBoundsException.class, () -> startTextRange.split(Collections.singletonList(101)));
assertThrows(IndexOutOfBoundsException.class, () -> startTextRange.split(List.of(12, 40, 101)));
}
}

View File

@ -55,7 +55,10 @@ public record LayerIdentifier(String name, String markedContentName) {
public static final LayerIdentifier NEIGHBOURS = new LayerIdentifier("Neighbours", "NEIGHBOURS");
public static final LayerIdentifier CHARACTERS = new LayerIdentifier("Characters", "CHARACTERS");
public static final LayerIdentifier OUTLINE_OBJECTS = new LayerIdentifier("Outline Positions", "OUTLINE_OBJECTS");
public static final LayerIdentifier SENTENCES = new LayerIdentifier("Sentences", "SENTENCES");
public static final LayerIdentifier TOC_PAGES = new LayerIdentifier("TOC pages", "TOC_PAGES");
// Visual layout parser
public static final LayerIdentifier KNECON_VISUAL_PARSING = new LayerIdentifier("Visual Layout Parser", "VISUAL_PARSING");
//ocr

View File

@ -55,6 +55,8 @@ public class LayoutDebugLayerConfig extends AbstractLayerGroup {
protected final Visualizations neighbours = Visualizations.builder().layer(LayerIdentifier.NEIGHBOURS).build();
protected final Visualizations characters = Visualizations.builder().layer(LayerIdentifier.CHARACTERS).build();
protected final Visualizations outlineObjects = Visualizations.builder().layer(LayerIdentifier.OUTLINE_OBJECTS).build();
protected final Visualizations sentences = Visualizations.builder().layer(LayerIdentifier.SENTENCES).build();
protected final Visualizations tocPages = Visualizations.builder().layer(LayerIdentifier.TOC_PAGES).build();
public List<Visualizations> getVisualizations() {
@ -63,14 +65,15 @@ public class LayoutDebugLayerConfig extends AbstractLayerGroup {
neighbours,//
words, //
lines, //
sentences, //
zones, //
rulings, //
clean_rulings, //
cells, //
mainBody, //
markedContent, //
outlineObjects //
);
outlineObjects, //
tocPages);
}
}

View File

@ -68,10 +68,10 @@ public class OutlineUtility {
public static void deleteExistingOutline(PDFDoc doc) {

    // Repeatedly fetch the document's first bookmark and delete it; stop as soon as the
    // document no longer reports a first bookmark that is non-null and valid.
    for (Bookmark current = doc.getFirstBookmark(); current != null && current.isValid(); current = doc.getFirstBookmark()) {
        current.delete();
    }
}