Merge branch 'RED-10127' into 'main'

RED-10127: rename TextPositionSequence to Word

See merge request fforesight/layout-parser!244
This commit is contained in:
Kilian Schüttler 2024-10-18 12:20:15 +02:00
commit 2219519a2b
44 changed files with 348 additions and 357 deletions

View File

@ -29,7 +29,6 @@ import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsi
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition; import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
import com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationService; import com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.classification.TableOfContentsClassificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.MarkdownMapper; import com.knecon.fforesight.service.layoutparser.processor.services.mapper.MarkdownMapper;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
@ -45,7 +44,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableO
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.CvTableParsingAdapter; import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.CvTableParsingAdapter;
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.ImageServiceResponseAdapter; import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.ImageServiceResponseAdapter;
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.VisualLayoutParsingAdapter; import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.VisualLayoutParsingAdapter;
@ -53,7 +52,6 @@ import com.knecon.fforesight.service.layoutparser.processor.python_api.model.ima
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableCells; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableCells;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse;
import com.knecon.fforesight.service.layoutparser.processor.services.BodyTextFrameService;
import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService; import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService;
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService; import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
import com.knecon.fforesight.service.layoutparser.processor.services.SimplifiedSectionTextService; import com.knecon.fforesight.service.layoutparser.processor.services.SimplifiedSectionTextService;
@ -63,9 +61,6 @@ import com.knecon.fforesight.service.layoutparser.processor.services.blockificat
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocstrumBlockificationService; import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocstrumBlockificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocuMineBlockificationService; import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocuMineBlockificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.RedactManagerBlockificationService; import com.knecon.fforesight.service.layoutparser.processor.services.blockification.RedactManagerBlockificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.classification.ClarifyndClassificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.classification.DocuMineClassificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory; import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
import com.knecon.fforesight.service.layoutparser.processor.services.graphics.Box; import com.knecon.fforesight.service.layoutparser.processor.services.graphics.Box;
import com.knecon.fforesight.service.layoutparser.processor.services.graphics.GraphicExtractorService; import com.knecon.fforesight.service.layoutparser.processor.services.graphics.GraphicExtractorService;
@ -275,7 +270,7 @@ public class LayoutParsingPipeline {
stripper.setEndPage(pageNumber); stripper.setEndPage(pageNumber);
stripper.setPdpage(pdPage); stripper.setPdpage(pdPage);
stripper.getText(originDocument); stripper.getText(originDocument);
List<TextPositionSequence> words = stripper.getTextPositionSequences(); List<Word> words = stripper.getWords();
// rotateDirAdjExactly(words, pdPage); // works really well for many highly rotated documents (e.g. VV-331340.pdf), but it decreases the headline performance by 1.3%, so I am leaving it out for now // rotateDirAdjExactly(words, pdPage); // works really well for many highly rotated documents (e.g. VV-331340.pdf), but it decreases the headline performance by 1.3%, so I am leaving it out for now
@ -301,7 +296,7 @@ public class LayoutParsingPipeline {
TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(words, cleanRulings); TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(words, cleanRulings);
List<Box> graphics = graphicExtractorService.extractPathElementGraphics(originDocument, pdPage, pageNumber, cleanRulings, stripper.getTextPositionSequences(), false); List<Box> graphics = graphicExtractorService.extractPathElementGraphics(originDocument, pdPage, pageNumber, cleanRulings, stripper.getWords(), false);
pdfImages.computeIfAbsent(pageNumber, x -> new ArrayList<>()) pdfImages.computeIfAbsent(pageNumber, x -> new ArrayList<>())
.addAll(graphics.stream() .addAll(graphics.stream()
@ -314,7 +309,7 @@ public class LayoutParsingPipeline {
ClassificationPage classificationPage = switch (layoutParsingType) { ClassificationPage classificationPage = switch (layoutParsingType) {
case REDACT_MANAGER_OLD -> case REDACT_MANAGER_OLD ->
redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings, classificationDocument.getLayoutDebugLayer()); redactManagerBlockificationService.blockify(stripper.getWords(), cleanRulings, classificationDocument.getLayoutDebugLayer());
case DOCUMINE_OLD -> docuMineBlockificationService.blockify(words, cleanRulings); case DOCUMINE_OLD -> docuMineBlockificationService.blockify(words, cleanRulings);
case DOCUMINE, REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH -> case DOCUMINE, REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH ->
docstrumBlockificationService.blockify(words, cleanRulings, true, classificationDocument.getLayoutDebugLayer(), layoutParsingType); docstrumBlockificationService.blockify(words, cleanRulings, true, classificationDocument.getLayoutDebugLayer(), layoutParsingType);
@ -388,12 +383,12 @@ public class LayoutParsingPipeline {
} }
private static void rotateDirAdjExactly(List<TextPositionSequence> words, PDPage pdPage) { private static void rotateDirAdjExactly(List<Word> words, PDPage pdPage) {
for (TextDirection dir : TextDirection.values()) { for (TextDirection dir : TextDirection.values()) {
double averageRotation = words.stream() double averageRotation = words.stream()
.map(TextPositionSequence::getTextPositions) .map(Word::getTextPositions)
.flatMap(Collection::stream) .flatMap(Collection::stream)
.filter(pos -> pos.getDir().equals(dir)) .filter(pos -> pos.getDir().equals(dir))
.mapToDouble(RedTextPosition::getExactDir).average().orElse(0); .mapToDouble(RedTextPosition::getExactDir).average().orElse(0);
@ -404,7 +399,7 @@ public class LayoutParsingPipeline {
AffineTransform rotateInstance = AffineTransform.getRotateInstance(averageRotation, pdPage.getMediaBox().getWidth() / 2, pdPage.getMediaBox().getHeight() / 2); AffineTransform rotateInstance = AffineTransform.getRotateInstance(averageRotation, pdPage.getMediaBox().getWidth() / 2, pdPage.getMediaBox().getHeight() / 2);
for (TextPositionSequence word : words) { for (Word word : words) {
if (!dir.equals(word.getDir())) { if (!dir.equals(word.getDir())) {
continue; continue;
} }
@ -455,10 +450,10 @@ public class LayoutParsingPipeline {
// Collect all statistics for the classificationPage, except from blocks inside tables, as tables will always be added to BodyTextFrame. // Collect all statistics for the classificationPage, except from blocks inside tables, as tables will always be added to BodyTextFrame.
for (AbstractPageBlock textBlock : classificationPage.getTextBlocks()) { for (AbstractPageBlock textBlock : classificationPage.getTextBlocks()) {
if (textBlock instanceof TextPageBlock) { if (textBlock instanceof TextPageBlock) {
if (((TextPageBlock) textBlock).getSequences() == null) { if (((TextPageBlock) textBlock).getWords() == null) {
continue; continue;
} }
for (TextPositionSequence word : ((TextPageBlock) textBlock).getSequences()) { for (Word word : ((TextPageBlock) textBlock).getWords()) {
classificationPage.getTextHeightCounter().add(word.getTextHeight()); classificationPage.getTextHeightCounter().add(word.getTextHeight());
classificationPage.getFontCounter().add(word.getFont()); classificationPage.getFontCounter().add(word.getFont());
classificationPage.getFontSizeCounter().add(word.getFontSize()); classificationPage.getFontSizeCounter().add(word.getFontSize());

View File

@ -18,7 +18,7 @@ import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.Zon
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition; import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutDebugLayer; import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutDebugLayer;
import lombok.RequiredArgsConstructor; import lombok.RequiredArgsConstructor;
@ -35,7 +35,7 @@ public class DocstrumSegmentationService {
private final ReadingOrderService readingOrderService; private final ReadingOrderService readingOrderService;
public List<Zone> segmentPage(List<TextPositionSequence> textPositions, boolean xyOrder, CleanRulings usedRulings, LayoutDebugLayer visualizations) { public List<Zone> segmentPage(List<Word> textPositions, boolean xyOrder, CleanRulings usedRulings, LayoutDebugLayer visualizations) {
EnumMap<TextDirection, Integer> directionCounts = new EnumMap<>(TextDirection.class); EnumMap<TextDirection, Integer> directionCounts = new EnumMap<>(TextDirection.class);
@ -78,11 +78,11 @@ public class DocstrumSegmentationService {
} }
private List<Zone> computeZones(List<TextPositionSequence> textPositions, CleanRulings rulings, LayoutDebugLayer visualizations, TextDirection direction) { private List<Zone> computeZones(List<Word> textPositions, CleanRulings rulings, LayoutDebugLayer visualizations, TextDirection direction) {
List<RedTextPosition> positions = textPositions.stream() List<RedTextPosition> positions = textPositions.stream()
.filter(t -> t.getDir() == direction) .filter(t -> t.getDir() == direction)
.map(TextPositionSequence::getTextPositions) .map(Word::getTextPositions)
.flatMap(List::stream) .flatMap(List::stream)
.toList(); .toList();

View File

@ -1,9 +1,9 @@
package com.knecon.fforesight.service.layoutparser.processor.docstrum.model; package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;
import static com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence.BOLD; import static com.knecon.fforesight.service.layoutparser.processor.model.text.Word.BOLD;
import static com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence.BOLD_ITALIC; import static com.knecon.fforesight.service.layoutparser.processor.model.text.Word.BOLD_ITALIC;
import static com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence.ITALIC; import static com.knecon.fforesight.service.layoutparser.processor.model.text.Word.ITALIC;
import static com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence.STANDARD; import static com.knecon.fforesight.service.layoutparser.processor.model.text.Word.STANDARD;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
@ -14,7 +14,7 @@ import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicInteger;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.FastAtan2; import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.FastAtan2;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
import com.knecon.fforesight.service.layoutparser.processor.model.text.FontStyle; import com.knecon.fforesight.service.layoutparser.processor.model.text.FontStyle;
import lombok.Data; import lombok.Data;
@ -41,7 +41,7 @@ public class Line extends TextBoundingBox {
private FontStyle fontStyle; private FontStyle fontStyle;
private final List<Character> characters; private final List<Character> characters;
private final List<TextPositionSequence> words = new ArrayList<>(); private final List<Word> words = new ArrayList<>();
public Line(List<Character> characters, double wordSpacing) { public Line(List<Character> characters, double wordSpacing) {
@ -89,7 +89,7 @@ public class Line extends TextBoundingBox {
for (FontStyle fontStyle : FontStyle.values()) { for (FontStyle fontStyle : FontStyle.values()) {
fontStyleCounter.put(fontStyle, new AtomicInteger(0)); fontStyleCounter.put(fontStyle, new AtomicInteger(0));
} }
for (TextPositionSequence word : words) { for (Word word : words) {
switch (word.getFontStyle()) { switch (word.getFontStyle()) {
case STANDARD -> fontStyleCounter.get(FontStyle.REGULAR).getAndIncrement(); case STANDARD -> fontStyleCounter.get(FontStyle.REGULAR).getAndIncrement();
case BOLD -> fontStyleCounter.get(FontStyle.BOLD).getAndIncrement(); case BOLD -> fontStyleCounter.get(FontStyle.BOLD).getAndIncrement();
@ -159,14 +159,14 @@ public class Line extends TextBoundingBox {
private void computeWords(double wordSpacing) { private void computeWords(double wordSpacing) {
TextPositionSequence word = new TextPositionSequence(); Word word = new Word();
Character previous = null; Character previous = null;
for (Character current : characters) { for (Character current : characters) {
if (previous != null) { if (previous != null) {
double dist = current.getTextPosition().getXDirAdj() - previous.getTextPosition().getXDirAdj() - previous.getTextPosition().getWidthDirAdj(); double dist = current.getTextPosition().getXDirAdj() - previous.getTextPosition().getXDirAdj() - previous.getTextPosition().getWidthDirAdj();
if (dist > wordSpacing) { if (dist > wordSpacing) {
words.add(word); words.add(word);
word = new TextPositionSequence(); word = new Word();
} }
} }
word.getTextPositions().add(current.getTextPosition()); word.getTextPositions().add(current.getTextPosition());

View File

@ -3,7 +3,7 @@ package com.knecon.fforesight.service.layoutparser.processor.model;
import java.awt.geom.Rectangle2D; import java.awt.geom.Rectangle2D;
import java.util.List; import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
import lombok.AccessLevel; import lombok.AccessLevel;
import lombok.AllArgsConstructor; import lombok.AllArgsConstructor;
@ -16,8 +16,8 @@ import lombok.experimental.FieldDefaults;
public class LineInformation { public class LineInformation {
List<Rectangle2D> lineBBox; List<Rectangle2D> lineBBox;
List<List<TextPositionSequence>> sequencesByLines; List<List<Word>> sequencesByLines;
List<List<Rectangle2D>> bBoxWithGapsByLines; List<List<Rectangle2D>> bBoxWithGapsByLines;
List<List<List<TextPositionSequence>>> sequencesWithGapsByLines; List<List<List<Word>>> sequencesWithGapsByLines;
} }

View File

@ -4,7 +4,7 @@ import java.awt.geom.Rectangle2D;
import java.util.List; import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
import lombok.AllArgsConstructor; import lombok.AllArgsConstructor;
import lombok.Builder; import lombok.Builder;
@ -15,7 +15,7 @@ import lombok.Getter;
@AllArgsConstructor @AllArgsConstructor
public class PageContents { public class PageContents {
List<TextPositionSequence> sortedTextPositionSequences; List<Word> sortedWords;
Rectangle2D cropBox; Rectangle2D cropBox;
Rectangle2D mediaBox; Rectangle2D mediaBox;
List<Ruling> rulings; List<Ruling> rulings;

View File

@ -9,7 +9,7 @@ import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox; import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities; import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities;
import lombok.Data; import lombok.Data;
@ -68,12 +68,12 @@ public class Cell extends BoundingBox {
StringBuilder sb = new StringBuilder(); StringBuilder sb = new StringBuilder();
Iterator<TextPageBlock> itty = textBlocks.iterator(); Iterator<TextPageBlock> itty = textBlocks.iterator();
TextPositionSequence previous = null; Word previous = null;
while (itty.hasNext()) { while (itty.hasNext()) {
TextPageBlock textBlock = itty.next(); TextPageBlock textBlock = itty.next();
for (TextPositionSequence word : textBlock.getSequences()) { for (Word word : textBlock.getWords()) {
if (previous != null) { if (previous != null) {
if (Math.abs(previous.getMaxYDirAdj() - word.getMaxYDirAdj()) > word.getTextHeight()) { if (Math.abs(previous.getMaxYDirAdj() - word.getMaxYDirAdj()) > word.getTextHeight()) {
sb.append('\n'); sb.append('\n');

View File

@ -23,7 +23,7 @@ public class ListIdentifier {
Format format; Format format;
@Getter @Getter
TextPositionSequence word; Word word;
@Getter @Getter
int page; int page;
int representation; int representation;
@ -31,14 +31,14 @@ public class ListIdentifier {
public static Optional<ListIdentifier> parse(TextPageBlock textPageBlock, int page) { public static Optional<ListIdentifier> parse(TextPageBlock textPageBlock, int page) {
return parse(textPageBlock.getSequences().subList(0, Math.min(5, textPageBlock.getSequences().size())), page); return parse(textPageBlock.getWords().subList(0, Math.min(5, textPageBlock.getWords().size())), page);
} }
public static Optional<ListIdentifier> parse(List<TextPositionSequence> sequences, int page) { public static Optional<ListIdentifier> parse(List<Word> sequences, int page) {
StringBuilder sb = new StringBuilder(); StringBuilder sb = new StringBuilder();
for (TextPositionSequence sequence : sequences) { for (Word sequence : sequences) {
sb.append(sequence.toString()); sb.append(sequence.toString());
sb.append(" "); sb.append(" ");
} }

View File

@ -10,18 +10,18 @@ import lombok.Getter;
@Getter @Getter
public class SearchableText { public class SearchableText {
private final List<TextPositionSequence> sequences = new ArrayList<>(); private final List<Word> sequences = new ArrayList<>();
public void add(TextPositionSequence textPositionSequence) { public void add(Word word) {
sequences.add(textPositionSequence); sequences.add(word);
} }
public void addAll(List<TextPositionSequence> textPositionSequences) { public void addAll(List<Word> words) {
sequences.addAll(textPositionSequences); sequences.addAll(words);
} }
@ -32,10 +32,10 @@ public class SearchableText {
} }
public static String buildString(List<TextPositionSequence> sequences) { public static String buildString(List<Word> sequences) {
StringBuilder sb = new StringBuilder(); StringBuilder sb = new StringBuilder();
for (TextPositionSequence word : sequences) { for (Word word : sequences) {
sb.append(word); sb.append(word);
sb.append(' '); sb.append(' ');
} }

View File

@ -25,7 +25,7 @@ import lombok.NoArgsConstructor;
public class TextPageBlock extends AbstractPageBlock { public class TextPageBlock extends AbstractPageBlock {
@Builder.Default @Builder.Default
private List<TextPositionSequence> sequences = new ArrayList<>(); private List<Word> words = new ArrayList<>();
@Builder.Default @Builder.Default
private FrequencyCounters frequencyCounters = new FrequencyCounters(); private FrequencyCounters frequencyCounters = new FrequencyCounters();
@ -41,43 +41,43 @@ public class TextPageBlock extends AbstractPageBlock {
private boolean changed; private boolean changed;
public TextPageBlock(List<TextPositionSequence> sequences) { public TextPageBlock(List<Word> words) {
this.sequences = new ArrayList<>(sequences); this.words = new ArrayList<>(words);
this.frequencyCounters = new FrequencyCounters(); this.frequencyCounters = new FrequencyCounters();
if (!sequences.isEmpty()) { if (!words.isEmpty()) {
addToFrequencyCounters(sequences); addToFrequencyCounters(words);
} }
calculateBBox(); calculateBBox();
} }
public List<TextPositionSequence> getSequences() { public List<Word> getWords() {
return Collections.unmodifiableList(sequences); return Collections.unmodifiableList(words);
} }
public TextDirection getDir() { public TextDirection getDir() {
return sequences.get(0).getDir(); return words.get(0).getDir();
} }
private void calculateBBox() { private void calculateBBox() {
if (sequences == null) { if (words == null) {
this.bBox = new Rectangle2D.Double(); this.bBox = new Rectangle2D.Double();
this.bBoxPdf = new Rectangle2D.Double(); this.bBoxPdf = new Rectangle2D.Double();
this.bBoxDirAdj = new Rectangle2D.Double(); this.bBoxDirAdj = new Rectangle2D.Double();
return; return;
} }
this.bBoxDirAdj = sequences.stream() this.bBoxDirAdj = words.stream()
.map(TextPositionSequence::getBBoxDirAdj) .map(Word::getBBoxDirAdj)
.collect(RectangleTransformations.collectBBox()); .collect(RectangleTransformations.collectBBox());
setToBBoxOfComponents(sequences); setToBBoxOfComponents(words);
} }
@ -99,8 +99,8 @@ public class TextPageBlock extends AbstractPageBlock {
throw new IllegalArgumentException("Cannot merge textBlocks on different pages."); throw new IllegalArgumentException("Cannot merge textBlocks on different pages.");
} }
List<TextPositionSequence> sequences = textBlocksToMerge.stream() List<Word> sequences = textBlocksToMerge.stream()
.map(TextPageBlock::getSequences) .map(TextPageBlock::getWords)
.flatMap(java.util.Collection::stream) .flatMap(java.util.Collection::stream)
.toList(); .toList();
sequences = new ArrayList<>(sequences); sequences = new ArrayList<>(sequences);
@ -109,9 +109,9 @@ public class TextPageBlock extends AbstractPageBlock {
} }
private void addToFrequencyCounters(List<TextPositionSequence> sequences) { private void addToFrequencyCounters(List<Word> sequences) {
for (TextPositionSequence wordBlock : sequences) { for (Word wordBlock : sequences) {
frequencyCounters.getLineHeightFrequencyCounter().add(wordBlock.getTextHeight()); frequencyCounters.getLineHeightFrequencyCounter().add(wordBlock.getTextHeight());
frequencyCounters.getFontSizeFrequencyCounter().add(wordBlock.getFontSize()); frequencyCounters.getFontSizeFrequencyCounter().add(wordBlock.getFontSize());
@ -120,12 +120,12 @@ public class TextPageBlock extends AbstractPageBlock {
frequencyCounters.getStyleFrequencyCounter().add(wordBlock.getFontStyle()); frequencyCounters.getStyleFrequencyCounter().add(wordBlock.getFontStyle());
} }
setUnderlined(this.sequences.stream() setUnderlined(this.words.stream()
.allMatch(TextPositionSequence::isUnderline)); .allMatch(Word::isUnderline));
} }
public TextPageBlock union(TextPositionSequence r) { public TextPageBlock union(Word r) {
TextPageBlock union = this.copy(); TextPageBlock union = this.copy();
union.add(r); union.add(r);
@ -138,8 +138,8 @@ public class TextPageBlock extends AbstractPageBlock {
public TextPageBlock union(TextPageBlock r) { public TextPageBlock union(TextPageBlock r) {
TextPageBlock union = this.copy(); TextPageBlock union = this.copy();
union.addAll(r.getSequences()); union.addAll(r.getWords());
addToFrequencyCounters(r.getSequences()); addToFrequencyCounters(r.getWords());
calculateBBox(); calculateBBox();
return union; return union;
} }
@ -148,33 +148,33 @@ public class TextPageBlock extends AbstractPageBlock {
public void add(TextPageBlock textPageBlock) { public void add(TextPageBlock textPageBlock) {
changed = true; changed = true;
sequences.addAll(textPageBlock.getSequences()); words.addAll(textPageBlock.getWords());
addToFrequencyCounters(textPageBlock.getSequences()); addToFrequencyCounters(textPageBlock.getWords());
calculateBBox(); calculateBBox();
} }
public void add(TextPositionSequence textPositionSequence) { public void add(Word word) {
changed = true; changed = true;
sequences.add(textPositionSequence); words.add(word);
addToFrequencyCounters(List.of(textPositionSequence)); addToFrequencyCounters(List.of(word));
calculateBBox(); calculateBBox();
} }
public void addAll(List<TextPositionSequence> textPositionSequences) { public void addAll(List<Word> words) {
changed = true; changed = true;
sequences.addAll(textPositionSequences); this.words.addAll(words);
addToFrequencyCounters(textPositionSequences); addToFrequencyCounters(words);
calculateBBox(); calculateBBox();
} }
public TextPageBlock copy() { public TextPageBlock copy() {
return new TextPageBlock(new ArrayList<>(sequences)); return new TextPageBlock(new ArrayList<>(words));
} }
@ -193,8 +193,8 @@ public class TextPageBlock extends AbstractPageBlock {
StringBuilder sb = new StringBuilder(); StringBuilder sb = new StringBuilder();
TextPositionSequence previous = null; Word previous = null;
for (TextPositionSequence word : sequences) { for (Word word : words) {
if (previous != null) { if (previous != null) {
if (Math.abs(previous.getMaxYDirAdj() - word.getMaxYDirAdj()) > word.getTextHeight()) { if (Math.abs(previous.getMaxYDirAdj() - word.getMaxYDirAdj()) > word.getTextHeight()) {
sb.append('\n'); sb.append('\n');
@ -217,8 +217,8 @@ public class TextPageBlock extends AbstractPageBlock {
public int getNumberOfLines() { public int getNumberOfLines() {
int numberOfLines = 1; int numberOfLines = 1;
TextPositionSequence previous = null; Word previous = null;
for (TextPositionSequence word : sequences) { for (Word word : words) {
if (previous != null) { if (previous != null) {
if (word.getMaxYDirAdj() - previous.getMaxYDirAdj() > word.getTextHeight()) { if (word.getMaxYDirAdj() - previous.getMaxYDirAdj() > word.getTextHeight()) {
numberOfLines++; numberOfLines++;
@ -270,7 +270,7 @@ public class TextPageBlock extends AbstractPageBlock {
@Override @Override
public boolean isEmpty() { public boolean isEmpty() {
return sequences.isEmpty(); return words.isEmpty();
} }
} }

View File

@ -3,19 +3,19 @@ package com.knecon.fforesight.service.layoutparser.processor.model.text;
import java.util.Comparator; import java.util.Comparator;
import java.util.HashMap; import java.util.HashMap;
public class TextPositionSequenceComparator implements Comparator<TextPositionSequence> { public class TextPositionSequenceComparator implements Comparator<Word> {
private HashMap<TextPositionSequence, TextBlockOnPage> lookup; private HashMap<Word, TextBlockOnPage> lookup;
public TextPositionSequenceComparator(HashMap<TextPositionSequence, TextBlockOnPage> lookup) { public TextPositionSequenceComparator(HashMap<Word, TextBlockOnPage> lookup) {
this.lookup = lookup; this.lookup = lookup;
} }
@Override @Override
public int compare(TextPositionSequence number1, TextPositionSequence number2) { public int compare(Word number1, Word number2) {
int page1 = lookup.get(number1).page().getPageNumber(); int page1 = lookup.get(number1).page().getPageNumber();
int page2 = lookup.get(number2).page().getPageNumber(); int page2 = lookup.get(number2).page().getPageNumber();

View File

@ -27,7 +27,7 @@ import lombok.extern.slf4j.Slf4j;
@NoArgsConstructor @NoArgsConstructor
@AllArgsConstructor @AllArgsConstructor
@SuppressWarnings("pmd") @SuppressWarnings("pmd")
public class TextPositionSequence extends TextBoundingBox implements CharSequence { public class Word extends TextBoundingBox implements CharSequence {
public static final String STANDARD = "standard"; public static final String STANDARD = "standard";
public static final String BOLD_ITALIC = "bold, italic"; public static final String BOLD_ITALIC = "bold, italic";
@ -47,7 +47,7 @@ public class TextPositionSequence extends TextBoundingBox implements CharSequenc
private Integer hashcodeCache; private Integer hashcodeCache;
public TextPositionSequence(List<TextPosition> textPositions, int pageNumber, boolean isParagraphStart) { public Word(List<TextPosition> textPositions, int pageNumber, boolean isParagraphStart) {
this.textPositions = textPositions.stream() this.textPositions = textPositions.stream()
.map(RedTextPosition::fromTextPosition) .map(RedTextPosition::fromTextPosition)
@ -65,7 +65,7 @@ public class TextPositionSequence extends TextBoundingBox implements CharSequenc
} }
public TextPositionSequence(List<RedTextPosition> textPositions, int page) { public Word(List<RedTextPosition> textPositions, int page) {
this.textPositions = textPositions; this.textPositions = textPositions;
this.page = page; this.page = page;
@ -98,9 +98,9 @@ public class TextPositionSequence extends TextBoundingBox implements CharSequenc
@Override @Override
public TextPositionSequence subSequence(int start, int end) { public Word subSequence(int start, int end) {
var textPositionSequence = new TextPositionSequence(); var textPositionSequence = new Word();
textPositionSequence.textPositions = textPositions.subList(start, end); textPositionSequence.textPositions = textPositions.subList(start, end);
textPositionSequence.page = page; textPositionSequence.page = page;
textPositionSequence.dir = dir; textPositionSequence.dir = dir;
@ -126,10 +126,10 @@ public class TextPositionSequence extends TextBoundingBox implements CharSequenc
} }
public void add(TextPositionSequence textPositionSequence, RedTextPosition textPosition) { public void add(Word word, RedTextPosition textPosition) {
this.textPositions.add(textPosition); this.textPositions.add(textPosition);
this.page = textPositionSequence.getPage(); this.page = word.getPage();
calculateBBoxAndHashcode(); calculateBBoxAndHashcode();
} }
@ -199,7 +199,7 @@ public class TextPositionSequence extends TextBoundingBox implements CharSequenc
if (o == this) { if (o == this) {
return true; return true;
} }
if (!(o instanceof TextPositionSequence other)) { if (!(o instanceof Word other)) {
return false; return false;
} }
if (!other.canEqual((Object) this)) { if (!other.canEqual((Object) this)) {
@ -220,7 +220,7 @@ public class TextPositionSequence extends TextBoundingBox implements CharSequenc
} }
protected boolean canEqual(final Object other) {return other instanceof TextPositionSequence;} protected boolean canEqual(final Object other) {return other instanceof Word;}
public int hashCode() { public int hashCode() {

View File

@ -23,11 +23,11 @@ public class DividingColumnDetectionService {
public List<Rectangle2D> detectColumns(PageContents pageContents) { public List<Rectangle2D> detectColumns(PageContents pageContents) {
if (pageContents.getSortedTextPositionSequences().size() < 2) { if (pageContents.getSortedWords().size() < 2) {
return List.of(pageContents.getCropBox()); return List.of(pageContents.getCropBox());
} }
GapInformation linesWithGapInformation = GapDetectionService.findGapsInLines(pageContents.getSortedTextPositionSequences(), pageContents.getCropBox()); GapInformation linesWithGapInformation = GapDetectionService.findGapsInLines(pageContents.getSortedWords(), pageContents.getCropBox());
return detectColumnsFromLines(linesWithGapInformation.getXGaps(), pageContents.getCropBox()); return detectColumnsFromLines(linesWithGapInformation.getXGaps(), pageContents.getCropBox());
} }

View File

@ -5,7 +5,7 @@ import java.util.LinkedList;
import java.util.List; import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation; import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
import lombok.AllArgsConstructor; import lombok.AllArgsConstructor;
import lombok.experimental.UtilityClass; import lombok.experimental.UtilityClass;
@ -18,23 +18,23 @@ public class GapDetectionService {
private static final double NEW_LINE_FACTOR = 0.2; private static final double NEW_LINE_FACTOR = 0.2;
public static GapInformation findGapsInLines(List<TextPositionSequence> sortedTextPositionSequences, Rectangle2D mainBodyTextFrame) { public static GapInformation findGapsInLines(List<Word> sortedWords, Rectangle2D mainBodyTextFrame) {
if (sortedTextPositionSequences.isEmpty()) { if (sortedWords.isEmpty()) {
return new GapInformation(); return new GapInformation();
} }
final double avgTextPositionHeight = getAvgTextPositionHeight(sortedTextPositionSequences); final double avgTextPositionHeight = getAvgTextPositionHeight(sortedWords);
XGapsContext xGapContext = XGapsContext.init(mainBodyTextFrame); XGapsContext xGapContext = XGapsContext.init(mainBodyTextFrame);
YGapsContext yGapContext = YGapsContext.init(mainBodyTextFrame); YGapsContext yGapContext = YGapsContext.init(mainBodyTextFrame);
var previousTextPosition = sortedTextPositionSequences.get(0); var previousTextPosition = sortedWords.get(0);
Rectangle2D rectangle = toRectangle2D(previousTextPosition); Rectangle2D rectangle = toRectangle2D(previousTextPosition);
xGapContext.addGapFromLeftEdgeOfMainBody(rectangle); xGapContext.addGapFromLeftEdgeOfMainBody(rectangle);
for (TextPositionSequence currentTextPosition : sortedTextPositionSequences.subList(1, sortedTextPositionSequences.size())) { for (Word currentTextPosition : sortedWords.subList(1, sortedWords.size())) {
double yDifference = Math.abs(currentTextPosition.getMaxYDirAdj() - previousTextPosition.getMaxYDirAdj()); double yDifference = Math.abs(currentTextPosition.getMaxYDirAdj() - previousTextPosition.getMaxYDirAdj());
double xGap = Math.abs(previousTextPosition.getMaxXDirAdj() - currentTextPosition.getXDirAdj()); double xGap = Math.abs(previousTextPosition.getMaxXDirAdj() - currentTextPosition.getXDirAdj());
@ -59,14 +59,14 @@ public class GapDetectionService {
} }
previousTextPosition = currentTextPosition; previousTextPosition = currentTextPosition;
} }
xGapContext.addGapToRightEdgeOfMainBody(toRectangle2D(sortedTextPositionSequences.get(sortedTextPositionSequences.size() - 1))); xGapContext.addGapToRightEdgeOfMainBody(toRectangle2D(sortedWords.get(sortedWords.size() - 1)));
xGapContext.gapsPerLine.add(xGapContext.gapsInCurrentLine); xGapContext.gapsPerLine.add(xGapContext.gapsInCurrentLine);
return new GapInformation(xGapContext.gapsPerLine, yGapContext.gapsPerLine); return new GapInformation(xGapContext.gapsPerLine, yGapContext.gapsPerLine);
} }
private static Rectangle2D toRectangle2D(TextPositionSequence textPosition) { private static Rectangle2D toRectangle2D(Word textPosition) {
return mirrorY(textPosition.getBBox()); return mirrorY(textPosition.getBBox());
} }
@ -87,18 +87,18 @@ public class GapDetectionService {
} }
private static void assertAllTextPositionsHaveSameDir(List<TextPositionSequence> textPositionSequences) { private static void assertAllTextPositionsHaveSameDir(List<Word> words) {
assert textPositionSequences.stream() assert words.stream()
.map(TextPositionSequence::getDir) .map(Word::getDir)
.allMatch(a -> a.equals(textPositionSequences.get(0).getDir())); .allMatch(a -> a.equals(words.get(0).getDir()));
} }
private static double getAvgTextPositionHeight(List<TextPositionSequence> textPositionSequences) { private static double getAvgTextPositionHeight(List<Word> words) {
return textPositionSequences.stream() return words.stream()
.mapToDouble(TextPositionSequence::getHeight).average().orElseThrow(); .mapToDouble(Word::getHeight).average().orElseThrow();
} }

View File

@ -7,17 +7,17 @@ import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation; import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation;
import com.knecon.fforesight.service.layoutparser.processor.model.LineInformation; import com.knecon.fforesight.service.layoutparser.processor.model.LineInformation;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
import lombok.experimental.UtilityClass; import lombok.experimental.UtilityClass;
@UtilityClass @UtilityClass
public class InvisibleTableDetectionService { public class InvisibleTableDetectionService {
public List<List<Rectangle2D>> detectTable(List<TextPositionSequence> textPositionSequences, Rectangle2D tableBBox) { public List<List<Rectangle2D>> detectTable(List<Word> words, Rectangle2D tableBBox) {
LineInformation lineInformation = LineDetectionService.calculateLineInformation(textPositionSequences); LineInformation lineInformation = LineDetectionService.calculateLineInformation(words);
GapInformation gaps = GapDetectionService.findGapsInLines(textPositionSequences, tableBBox); GapInformation gaps = GapDetectionService.findGapsInLines(words, tableBBox);
List<Rectangle2D> gapsAcrossLines = GapsAcrossLinesService.detectXGapsAcrossLines(gaps, tableBBox); List<Rectangle2D> gapsAcrossLines = GapsAcrossLinesService.detectXGapsAcrossLines(gaps, tableBBox);
List<Double> columnXCoords = gapsAcrossLines.stream().map(RectangularShape::getCenterX).toList(); List<Double> columnXCoords = gapsAcrossLines.stream().map(RectangularShape::getCenterX).toList();
int colCount = gapsAcrossLines.size(); int colCount = gapsAcrossLines.size();

View File

@ -7,7 +7,7 @@ import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation; import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation;
import com.knecon.fforesight.service.layoutparser.processor.model.LineInformation; import com.knecon.fforesight.service.layoutparser.processor.model.LineInformation;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
import lombok.AllArgsConstructor; import lombok.AllArgsConstructor;
import lombok.Getter; import lombok.Getter;
@ -19,37 +19,37 @@ public class LineDetectionService {
private static final double X_GAP_FACTOR = 1; // multiplied with average text height, determines the minimum distance of gaps in lines private static final double X_GAP_FACTOR = 1; // multiplied with average text height, determines the minimum distance of gaps in lines
public LineInformation calculateLineInformation(List<TextPositionSequence> sortedTextPositionSequences) { public LineInformation calculateLineInformation(List<Word> sortedWords) {
if (sortedTextPositionSequences.isEmpty()) { if (sortedWords.isEmpty()) {
return LineFactory.init().build(); return LineFactory.init().build();
} }
return buildLineInformation(sortedTextPositionSequences); return buildLineInformation(sortedWords);
} }
public List<List<Rectangle2D>> findLinesWithGaps(List<TextPositionSequence> sortedTextPositionSequences, Rectangle2D mainBodyTextFrame) { public List<List<Rectangle2D>> findLinesWithGaps(List<Word> sortedWords, Rectangle2D mainBodyTextFrame) {
return calculateLineInformation(sortedTextPositionSequences).getBBoxWithGapsByLines(); return calculateLineInformation(sortedWords).getBBoxWithGapsByLines();
} }
public List<List<TextPositionSequence>> orderByLines(List<TextPositionSequence> sortedTextPositionSequences, Rectangle2D mainBodyTextFrame) { public List<List<Word>> orderByLines(List<Word> sortedWords, Rectangle2D mainBodyTextFrame) {
return calculateLineInformation(sortedTextPositionSequences).getSequencesByLines(); return calculateLineInformation(sortedWords).getSequencesByLines();
} }
private static LineInformation buildLineInformation(List<TextPositionSequence> sortedTextPositionSequences) { private static LineInformation buildLineInformation(List<Word> sortedWords) {
final double avgTextPositionHeight = getAvgTextPositionHeight(sortedTextPositionSequences); final double avgTextPositionHeight = getAvgTextPositionHeight(sortedWords);
LineFactory lineFactory = LineFactory.init(); LineFactory lineFactory = LineFactory.init();
var previousTextPosition = sortedTextPositionSequences.get(0); var previousTextPosition = sortedWords.get(0);
lineFactory.addToCurrentLine(previousTextPosition); lineFactory.addToCurrentLine(previousTextPosition);
for (TextPositionSequence currentTextPosition : sortedTextPositionSequences.subList(1, sortedTextPositionSequences.size())) { for (Word currentTextPosition : sortedWords.subList(1, sortedWords.size())) {
if (isNewLine(currentTextPosition, previousTextPosition, avgTextPositionHeight) || isSplitByOrientation(currentTextPosition, previousTextPosition)) { if (isNewLine(currentTextPosition, previousTextPosition, avgTextPositionHeight) || isSplitByOrientation(currentTextPosition, previousTextPosition)) {
lineFactory.startNewLine(); lineFactory.startNewLine();
} else if (isXGap(currentTextPosition, previousTextPosition, avgTextPositionHeight)) { } else if (isXGap(currentTextPosition, previousTextPosition, avgTextPositionHeight)) {
@ -63,25 +63,25 @@ public class LineDetectionService {
} }
private static double getAvgTextPositionHeight(List<TextPositionSequence> textPositionSequences) { private static double getAvgTextPositionHeight(List<Word> words) {
return textPositionSequences.stream().mapToDouble(TextPositionSequence::getHeight).average().orElseThrow(); return words.stream().mapToDouble(Word::getHeight).average().orElseThrow();
} }
private static boolean isXGap(TextPositionSequence currentTextPosition, TextPositionSequence previousTextPosition, double avgTextPositionHeight) { private static boolean isXGap(Word currentTextPosition, Word previousTextPosition, double avgTextPositionHeight) {
return Math.abs(previousTextPosition.getMaxXDirAdj() - currentTextPosition.getXDirAdj()) > (avgTextPositionHeight * X_GAP_FACTOR); return Math.abs(previousTextPosition.getMaxXDirAdj() - currentTextPosition.getXDirAdj()) > (avgTextPositionHeight * X_GAP_FACTOR);
} }
private static boolean isSplitByOrientation(TextPositionSequence currentTextPosition, TextPositionSequence previousTextPosition) { private static boolean isSplitByOrientation(Word currentTextPosition, Word previousTextPosition) {
return !previousTextPosition.getDir().equals(currentTextPosition.getDir()); return !previousTextPosition.getDir().equals(currentTextPosition.getDir());
} }
private static boolean isNewLine(TextPositionSequence currentTextPosition, TextPositionSequence previousTextPosition, double avgTextPositionHeight) { private static boolean isNewLine(Word currentTextPosition, Word previousTextPosition, double avgTextPositionHeight) {
return Math.abs(previousTextPosition.getYDirAdj() - currentTextPosition.getYDirAdj()) > avgTextPositionHeight; return Math.abs(previousTextPosition.getYDirAdj() - currentTextPosition.getYDirAdj()) > avgTextPositionHeight;
} }
@ -96,13 +96,13 @@ public class LineDetectionService {
List<List<Rectangle2D>> bBoxWithGapsByLines; List<List<Rectangle2D>> bBoxWithGapsByLines;
List<Rectangle2D> bBoxWithGapsInCurrentLine; List<Rectangle2D> bBoxWithGapsInCurrentLine;
List<List<List<TextPositionSequence>>> sequencesWithGapsByLines; List<List<List<Word>>> sequencesWithGapsByLines;
List<List<TextPositionSequence>> sequencesWithGapsInCurrentLine; List<List<Word>> sequencesWithGapsInCurrentLine;
List<TextPositionSequence> currentSequencesWithoutGaps; List<Word> currentSequencesWithoutGaps;
List<List<TextPositionSequence>> sequencesByLines; List<List<Word>> sequencesByLines;
List<TextPositionSequence> sequencesInCurrentLine; List<Word> sequencesInCurrentLine;
List<List<Rectangle2D>> xGaps; List<List<Rectangle2D>> xGaps;
List<List<Rectangle2D>> yGaps; List<List<Rectangle2D>> yGaps;
@ -116,14 +116,14 @@ public class LineDetectionService {
List<Rectangle2D> bBoxWithGapsInCurrentLine = new LinkedList<>(); List<Rectangle2D> bBoxWithGapsInCurrentLine = new LinkedList<>();
bBoxWithGapsByLines.add(bBoxWithGapsInCurrentLine); bBoxWithGapsByLines.add(bBoxWithGapsInCurrentLine);
List<List<List<TextPositionSequence>>> sequencesWithGapsByLines = new LinkedList<>(); List<List<List<Word>>> sequencesWithGapsByLines = new LinkedList<>();
List<List<TextPositionSequence>> sequencesWithGapsInCurrentLine = new LinkedList<>(); List<List<Word>> sequencesWithGapsInCurrentLine = new LinkedList<>();
sequencesWithGapsByLines.add(sequencesWithGapsInCurrentLine); sequencesWithGapsByLines.add(sequencesWithGapsInCurrentLine);
List<TextPositionSequence> currentSequencesWithoutGaps = new LinkedList<>(); List<Word> currentSequencesWithoutGaps = new LinkedList<>();
sequencesWithGapsInCurrentLine.add(currentSequencesWithoutGaps); sequencesWithGapsInCurrentLine.add(currentSequencesWithoutGaps);
List<List<TextPositionSequence>> sequencesByLines = new LinkedList<>(); List<List<Word>> sequencesByLines = new LinkedList<>();
List<TextPositionSequence> sequencesInCurrentLine = new LinkedList<>(); List<Word> sequencesInCurrentLine = new LinkedList<>();
sequencesByLines.add(sequencesInCurrentLine); sequencesByLines.add(sequencesInCurrentLine);
return new LineFactory(lineBBox, return new LineFactory(lineBBox,
@ -178,13 +178,13 @@ public class LineDetectionService {
} }
private Rectangle2D textPositionBBox(List<TextPositionSequence> textPositionSequences) { private Rectangle2D textPositionBBox(List<Word> words) {
return RectangleTransformations.rectangle2DBBox(textPositionSequences.stream().map(TextPositionSequence::getBBox).toList()); return RectangleTransformations.rectangle2DBBox(words.stream().map(Word::getBBox).toList());
} }
public void addToCurrentLine(TextPositionSequence current) { public void addToCurrentLine(Word current) {
sequencesInCurrentLine.add(current); sequencesInCurrentLine.add(current);
currentSequencesWithoutGaps.add(current); currentSequencesWithoutGaps.add(current);

View File

@ -13,7 +13,7 @@ import org.apache.pdfbox.pdmodel.PDPage;
import org.springframework.core.io.ClassPathResource; import org.springframework.core.io.ClassPathResource;
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents; import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
import com.knecon.fforesight.service.layoutparser.processor.services.parsing.PDFLinesTextStripper; import com.knecon.fforesight.service.layoutparser.processor.services.parsing.PDFLinesTextStripper;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
@ -40,7 +40,7 @@ public class PageContentExtractor {
stripper.setPdpage(pdPage); stripper.setPdpage(pdPage);
stripper.getText(pdDocument); stripper.getText(pdDocument);
Map<Float, List<TextPositionSequence>> sortedTextPositionSequencesPerDir = stripper.getTextPositionSequences() Map<Float, List<Word>> sortedTextPositionSequencesPerDir = stripper.getWords()
.stream() .stream()
.collect(Collectors.groupingBy(textPositionSequence -> textPositionSequence.getDir().getDegrees())); .collect(Collectors.groupingBy(textPositionSequence -> textPositionSequence.getDir().getDegrees()));
@ -57,7 +57,7 @@ public class PageContentExtractor {
} }
public List<TextPositionSequence> sortByDirAccordingToPageRotation(Map<Float, List<TextPositionSequence>> sortedTextPositionSequencesPerDir, int rotation) { public List<Word> sortByDirAccordingToPageRotation(Map<Float, List<Word>> sortedTextPositionSequencesPerDir, int rotation) {
LinkedList<Float> sortedKeys = new LinkedList<>(sortedTextPositionSequencesPerDir.keySet().stream().sorted().toList()); LinkedList<Float> sortedKeys = new LinkedList<>(sortedTextPositionSequencesPerDir.keySet().stream().sorted().toList());

View File

@ -14,9 +14,9 @@ public class PageInformationService {
public PageInformation build(PageContents pageContents) { public PageInformation build(PageContents pageContents) {
LineInformation lineInformation = LineDetectionService.calculateLineInformation(pageContents.getSortedTextPositionSequences()); LineInformation lineInformation = LineDetectionService.calculateLineInformation(pageContents.getSortedWords());
Rectangle2D mainBodyTextFrame = MainBodyTextFrameExtractionService.calculateMainBodyTextFrame(lineInformation); Rectangle2D mainBodyTextFrame = MainBodyTextFrameExtractionService.calculateMainBodyTextFrame(lineInformation);
GapInformation gapInformation = GapDetectionService.findGapsInLines(pageContents.getSortedTextPositionSequences(), mainBodyTextFrame); GapInformation gapInformation = GapDetectionService.findGapsInLines(pageContents.getSortedWords(), mainBodyTextFrame);
return new PageInformation(pageContents, lineInformation, mainBodyTextFrame, gapInformation); return new PageInformation(pageContents, lineInformation, mainBodyTextFrame, gapInformation);
} }

View File

@ -5,7 +5,7 @@ import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
import lombok.experimental.UtilityClass; import lombok.experimental.UtilityClass;
@ -17,9 +17,9 @@ public class TextRulingsClassifier {
private final static double TEXT_BBOX_THRESHOLD_FACTOR = 0.15; // multiplied with text width then subtracted from word width. If ruling covers this width, it is considered as strikethrough/underline. private final static double TEXT_BBOX_THRESHOLD_FACTOR = 0.15; // multiplied with text width then subtracted from word width. If ruling covers this width, it is considered as strikethrough/underline.
public static void classifyUnderlinedAndStrikethroughText(List<TextPositionSequence> words, CleanRulings cleanRulings) { public static void classifyUnderlinedAndStrikethroughText(List<Word> words, CleanRulings cleanRulings) {
for (TextPositionSequence word : words) { for (Word word : words) {
if (word.getDir().equals(TextDirection.ZERO) || word.getDir().equals(TextDirection.HALF_CIRCLE)) { if (word.getDir().equals(TextDirection.ZERO) || word.getDir().equals(TextDirection.HALF_CIRCLE)) {
handleHorizontalText(cleanRulings, word); handleHorizontalText(cleanRulings, word);
} else { } else {
@ -29,7 +29,7 @@ public class TextRulingsClassifier {
} }
private static void handleVerticalText(CleanRulings cleanRulings, TextPositionSequence word) { private static void handleVerticalText(CleanRulings cleanRulings, Word word) {
float lowerY = (float) (word.getBBoxPdf().getMinY() + TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth()); float lowerY = (float) (word.getBBoxPdf().getMinY() + TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth());
float upperY = (float) (word.getBBoxPdf().getMaxY() - TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth()); float upperY = (float) (word.getBBoxPdf().getMaxY() - TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth());
@ -63,7 +63,7 @@ public class TextRulingsClassifier {
} }
private static void handleHorizontalText(CleanRulings cleanRulings, TextPositionSequence word) { private static void handleHorizontalText(CleanRulings cleanRulings, Word word) {
float leftX = (float) (word.getBBoxPdf().getMinX() + TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth()); float leftX = (float) (word.getBBoxPdf().getMinX() + TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth());
float rightX = (float) (word.getBBoxPdf().getMaxX() - TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth()); float rightX = (float) (word.getBBoxPdf().getMaxX() - TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth());

View File

@ -16,7 +16,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.model.SectionIdentifier; import com.knecon.fforesight.service.layoutparser.processor.model.SectionIdentifier;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject; import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities; import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities;
import lombok.Data; import lombok.Data;
@ -222,14 +222,14 @@ public class BlockificationPostprocessingService {
headline = sectionIdentifier + headline; headline = sectionIdentifier + headline;
} }
WordSequenceResult wordSequenceResult = findWordSequence(blockToSplit.getSequences(), headline); WordSequenceResult wordSequenceResult = findWordSequence(blockToSplit.getWords(), headline);
if (wordSequenceResult.inSequence.isEmpty() && !headline.equals(title)) { if (wordSequenceResult.inSequence.isEmpty() && !headline.equals(title)) {
wordSequenceResult = findWordSequence(blockToSplit.getSequences(), title); wordSequenceResult = findWordSequence(blockToSplit.getWords(), title);
} }
boolean modifiedBlockToSplit = false; boolean modifiedBlockToSplit = false;
if (!wordSequenceResult.inSequence.isEmpty()) { if (!wordSequenceResult.inSequence.isEmpty()) {
blockToSplit.setSequences(wordSequenceResult.inSequence); blockToSplit.setWords(wordSequenceResult.inSequence);
blockToSplit.recalculateBBox(); blockToSplit.recalculateBBox();
modifiedBlockToSplit = true; modifiedBlockToSplit = true;
} }
@ -250,19 +250,19 @@ public class BlockificationPostprocessingService {
} }
private static WordSequenceResult findWordSequence(List<TextPositionSequence> textPositionSequences, String text) { private static WordSequenceResult findWordSequence(List<Word> words, String text) {
String target = sanitizeString(text); String target = sanitizeString(text);
List<TextPositionSequence> inSequence = new ArrayList<>(); List<Word> inSequence = new ArrayList<>();
List<TextPositionSequence> preSequence = new ArrayList<>(); List<Word> preSequence = new ArrayList<>();
List<TextPositionSequence> postSequence = new ArrayList<>(); List<Word> postSequence = new ArrayList<>();
StringBuilder currentSequence = new StringBuilder(); StringBuilder currentSequence = new StringBuilder();
if (target.isBlank()) { if (target.isBlank()) {
return new WordSequenceResult(); return new WordSequenceResult();
} }
for (TextPositionSequence sequence : textPositionSequences) { for (Word sequence : words) {
currentSequence.append(sanitizeString(sequence.toString())); currentSequence.append(sanitizeString(sequence.toString()));
inSequence.add(sequence); inSequence.add(sequence);
@ -274,10 +274,10 @@ public class BlockificationPostprocessingService {
int index = 0; int index = 0;
String toRemove = currentSequence.substring(0, currentSequence.length() - target.length()); String toRemove = currentSequence.substring(0, currentSequence.length() - target.length());
TextPositionSequence next = inSequence.get(index); Word next = inSequence.get(index);
while (currentSequence.length() - next.length() >= target.length()) { while (currentSequence.length() - next.length() >= target.length()) {
TextPositionSequence removed = inSequence.remove(index); Word removed = inSequence.remove(index);
currentSequence.delete(0, removed.toString().length()); currentSequence.delete(0, removed.toString().length());
preSequence.add(removed); preSequence.add(removed);
@ -306,7 +306,7 @@ public class BlockificationPostprocessingService {
} }
if (currentSequence.toString().equals(target)) { if (currentSequence.toString().equals(target)) {
postSequence.addAll(textPositionSequences.subList(textPositionSequences.indexOf(sequence) + 1, textPositionSequences.size())); postSequence.addAll(words.subList(words.indexOf(sequence) + 1, words.size()));
return new WordSequenceResult(inSequence, preSequence, postSequence); return new WordSequenceResult(inSequence, preSequence, postSequence);
} }
} }
@ -316,10 +316,10 @@ public class BlockificationPostprocessingService {
} }
private static SplitSequenceResult splitSequence(TextPositionSequence sequence, String toRemove) { private static SplitSequenceResult splitSequence(Word sequence, String toRemove) {
TextPositionSequence in = null; Word in = null;
TextPositionSequence out; Word out;
String currentSequence = sequence.toString().toLowerCase(Locale.ROOT); String currentSequence = sequence.toString().toLowerCase(Locale.ROOT);
int index = currentSequence.indexOf(toRemove); int index = currentSequence.indexOf(toRemove);
@ -337,9 +337,9 @@ public class BlockificationPostprocessingService {
} }
private static TextPositionSequence createSubSequence(TextPositionSequence sequence, int start, int end) { private static Word createSubSequence(Word sequence, int start, int end) {
TextPositionSequence newSeq = new TextPositionSequence(new ArrayList<>(sequence.getTextPositions().subList(start, end)), sequence.getPage()); Word newSeq = new Word(new ArrayList<>(sequence.getTextPositions().subList(start, end)), sequence.getPage());
newSeq.setParagraphStart(sequence.isParagraphStart()); newSeq.setParagraphStart(sequence.isParagraphStart());
return newSeq; return newSeq;
} }
@ -354,10 +354,10 @@ public class BlockificationPostprocessingService {
List<TextPageBlock> mergedBlocks = new ArrayList<>(); List<TextPageBlock> mergedBlocks = new ArrayList<>();
for (TextPageBlock textPageBlock : blocksToMerge.subList(1, blocksToMerge.size())) { for (TextPageBlock textPageBlock : blocksToMerge.subList(1, blocksToMerge.size())) {
if (firstBlock != null && !firstBlock.getSequences().isEmpty()) { if (firstBlock != null && !firstBlock.getWords().isEmpty()) {
if (textPageBlock.getDir() == firstBlock.getDir()) { if (textPageBlock.getDir() == firstBlock.getDir()) {
firstBlock.addAll(textPageBlock.getSequences()); firstBlock.addAll(textPageBlock.getWords());
mergedBlocks.add(textPageBlock); mergedBlocks.add(textPageBlock);
} }
} }
@ -496,12 +496,12 @@ public class BlockificationPostprocessingService {
public static class WordSequenceResult { public static class WordSequenceResult {
public List<TextPositionSequence> inSequence; public List<Word> inSequence;
public List<TextPositionSequence> preSequence; public List<Word> preSequence;
public List<TextPositionSequence> postSequence; public List<Word> postSequence;
public WordSequenceResult(List<TextPositionSequence> inSequence, List<TextPositionSequence> preSequence, List<TextPositionSequence> postSequence) { public WordSequenceResult(List<Word> inSequence, List<Word> preSequence, List<Word> postSequence) {
this.inSequence = inSequence; this.inSequence = inSequence;
this.preSequence = preSequence; this.preSequence = preSequence;
@ -522,7 +522,7 @@ public class BlockificationPostprocessingService {
} }
public record SplitSequenceResult(TextPositionSequence in, TextPositionSequence out) { public record SplitSequenceResult(Word in, Word out) {
} }

View File

@ -14,7 +14,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutDebugLayer; import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutDebugLayer;
import lombok.RequiredArgsConstructor; import lombok.RequiredArgsConstructor;
@ -30,7 +30,7 @@ public class DocstrumBlockificationService {
static final float THRESHOLD = 1f; static final float THRESHOLD = 1f;
public ClassificationPage blockify(List<TextPositionSequence> textPositions, public ClassificationPage blockify(List<Word> textPositions,
CleanRulings rulings, CleanRulings rulings,
boolean xyOrder, boolean xyOrder,
LayoutDebugLayer visualizations, LayoutDebugLayer visualizations,
@ -72,16 +72,16 @@ public class DocstrumBlockificationService {
List<AbstractPageBlock> abstractPageBlocks = new ArrayList<>(); List<AbstractPageBlock> abstractPageBlocks = new ArrayList<>();
zones.forEach(zone -> { zones.forEach(zone -> {
List<TextPositionSequence> textPositionSequences = new ArrayList<>(); List<Word> words = new ArrayList<>();
zone.getLines() zone.getLines()
.forEach(line -> { .forEach(line -> {
line.getWords() line.getWords()
.forEach(word -> { .forEach(word -> {
textPositionSequences.add(new TextPositionSequence(word.getTextPositions(), word.getPage())); words.add(new Word(word.getTextPositions(), word.getPage()));
}); });
}); });
abstractPageBlocks.add(buildTextBlock(textPositionSequences, 0)); abstractPageBlocks.add(buildTextBlock(words, 0));
}); });
return abstractPageBlocks; return abstractPageBlocks;
@ -102,7 +102,7 @@ public class DocstrumBlockificationService {
} }
TextPageBlock current = (TextPageBlock) block; TextPageBlock current = (TextPageBlock) block;
if (previous != null && !previous.getSequences().isEmpty()) { if (previous != null && !previous.getWords().isEmpty()) {
if (current.getDir() != previous.getDir() || usedRulings.lineBetween(current, previous)) { if (current.getDir() != previous.getDir() || usedRulings.lineBetween(current, previous)) {
previous = current; previous = current;
@ -182,8 +182,8 @@ public class DocstrumBlockificationService {
private TextPageBlock combineBlocksAndResetIterator(TextPageBlock previous, TextPageBlock current, ListIterator<AbstractPageBlock> itty, boolean toDuplicate) { private TextPageBlock combineBlocksAndResetIterator(TextPageBlock previous, TextPageBlock current, ListIterator<AbstractPageBlock> itty, boolean toDuplicate) {
previous.addAll(current.getSequences()); previous.addAll(current.getWords());
previous = buildTextBlock(previous.getSequences(), 0); previous = buildTextBlock(previous.getWords(), 0);
previous.setToDuplicate(toDuplicate); previous.setToDuplicate(toDuplicate);
if (current.getClassification() != null && previous.getClassification() == null) { if (current.getClassification() != null && previous.getClassification() == null) {
previous.setClassification(current.getClassification()); previous.setClassification(current.getClassification());
@ -283,8 +283,8 @@ public class DocstrumBlockificationService {
if (current.getDir() == inner.getDir() && current.intersects(inner, yThreshold, xThreshold)) { if (current.getDir() == inner.getDir() && current.intersects(inner, yThreshold, xThreshold)) {
boolean toDuplicate = current.isToDuplicate() || inner.isToDuplicate(); boolean toDuplicate = current.isToDuplicate() || inner.isToDuplicate();
current.addAll(inner.getSequences()); current.addAll(inner.getWords());
current = buildTextBlock(current.getSequences(), 0); current = buildTextBlock(current.getWords(), 0);
current.setToDuplicate(toDuplicate); current.setToDuplicate(toDuplicate);
blocks.set(i, null); blocks.set(i, null);
@ -301,7 +301,7 @@ public class DocstrumBlockificationService {
} }
public static TextPageBlock buildTextBlock(List<TextPositionSequence> wordBlockList, int indexOnPage) { public static TextPageBlock buildTextBlock(List<Word> wordBlockList, int indexOnPage) {
return new TextPageBlock(wordBlockList); return new TextPageBlock(wordBlockList);
} }

View File

@ -16,13 +16,14 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Orientation;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
@SuppressWarnings("all") @SuppressWarnings("all")
@Service @Service
public class DocuMineBlockificationService { public class DocuMineBlockificationService {
static final float THRESHOLD = 1f; static final float THRESHOLD = 1f;
public static final double FONT_SIZE_CHANGE_RATIO = 0.15;
Pattern pattern = Pattern.compile("^(\\d{1,2}\\.){1,3}\\d{1,2}\\.?\\s[0-9A-Za-z ()-]{2,50}", Pattern.CASE_INSENSITIVE); Pattern pattern = Pattern.compile("^(\\d{1,2}\\.){1,3}\\d{1,2}\\.?\\s[0-9A-Za-z ()-]{2,50}", Pattern.CASE_INSENSITIVE);
@ -36,9 +37,9 @@ public class DocuMineBlockificationService {
* @param cleanRulings All rulings on a page * @param cleanRulings All rulings on a page
* @return Page object that contains the Textblock and text statistics. * @return Page object that contains the Textblock and text statistics.
*/ */
public ClassificationPage blockify(List<TextPositionSequence> textPositions, CleanRulings cleanRulings) { public ClassificationPage blockify(List<Word> textPositions, CleanRulings cleanRulings) {
List<TextPositionSequence> chunkWords = new ArrayList<>(); List<Word> chunkWords = new ArrayList<>();
List<AbstractPageBlock> textPageBlocks = new ArrayList<>(); List<AbstractPageBlock> textPageBlocks = new ArrayList<>();
CleanRulings usedRulings = cleanRulings.withoutTextRulings(); CleanRulings usedRulings = cleanRulings.withoutTextRulings();
@ -47,11 +48,11 @@ public class DocuMineBlockificationService {
double maxX = 0; double maxX = 0;
double minY = 1000; double minY = 1000;
double maxY = 0; double maxY = 0;
TextPositionSequence prev = null; Word prev = null;
boolean wasSplitted = false; boolean wasSplitted = false;
Double splitX1 = null; Double splitX1 = null;
for (TextPositionSequence word : textPositions) { for (Word word : textPositions) {
boolean lineSeparation = prev != null && word.getYDirAdj() - prev.getMaxYDirAdj() > Math.min(word.getHeight(), prev.getHeight()) * 1.1; boolean lineSeparation = prev != null && word.getYDirAdj() - prev.getMaxYDirAdj() > Math.min(word.getHeight(), prev.getHeight()) * 1.1;
boolean startFromTop = prev != null && word.getYDirAdj() < prev.getYDirAdj() - prev.getTextHeight(); boolean startFromTop = prev != null && word.getYDirAdj() < prev.getYDirAdj() - prev.getTextHeight();
@ -60,11 +61,7 @@ public class DocuMineBlockificationService {
boolean newLineAfterSplit = prev != null && word.getYDirAdj() != prev.getYDirAdj() && wasSplitted && splitX1 != word.getXDirAdj(); boolean newLineAfterSplit = prev != null && word.getYDirAdj() != prev.getYDirAdj() && wasSplitted && splitX1 != word.getXDirAdj();
boolean isSplitByRuling = prev != null && usedRulings.lineBetween(prev, word); boolean isSplitByRuling = prev != null && usedRulings.lineBetween(prev, word);
boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir()); boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir());
boolean splitByOtherFontAndOtherY = prev != null && Math.abs(prev.getMaxYDirAdj() - word.getMaxYDirAdj()) > word.getTextHeight() * 0.2 // boolean splitByOtherFontAndOtherY = prev != null && Math.abs(prev.getMaxYDirAdj() - word.getMaxYDirAdj()) > word.getTextHeight() * 0.2 && isFontChange(word, prev);
&& (word.getFontStyle().contains("bold") && !prev.getFontStyle().contains("bold")
|| prev.getFontStyle().contains("bold") && !word.getFontStyle().contains("bold")
|| Math.abs(prev.getFontSize() - word.getFontSize()) >= 1
|| Math.abs(word.getTextHeight() - prev.getTextHeight()) > 0.8);
Matcher matcher = pattern.matcher(chunkWords.stream() Matcher matcher = pattern.matcher(chunkWords.stream()
.collect(Collectors.joining(" ")).toString()); .collect(Collectors.joining(" ")).toString());
@ -127,6 +124,15 @@ public class DocuMineBlockificationService {
} }
/**
 * Checks whether two words differ in font strongly enough to count as a font change.
 * <p>
 * A change is reported when any of the following holds:
 * <ul>
 *   <li>exactly one of the two words has a font style containing {@code "bold"},</li>
 *   <li>the font sizes differ by at least {@link #FONT_SIZE_CHANGE_RATIO} times the smaller font size,</li>
 *   <li>the text heights differ by at least {@link #FONT_SIZE_CHANGE_RATIO} times the smaller text height.</li>
 * </ul>
 *
 * @param word the word currently being inspected
 * @param prev the word it is compared against
 * @return {@code true} if at least one of the criteria above is met
 */
private static boolean isFontChange(Word word, Word prev) {

    boolean wordIsBold = word.getFontStyle().contains("bold");
    boolean prevIsBold = prev.getFontStyle().contains("bold");
    // Bold on one side only is a change regardless of the sizes.
    if (wordIsBold != prevIsBold) {
        return true;
    }

    // Thresholds are relative to the smaller of the two values being compared.
    if (Math.abs(prev.getFontSize() - word.getFontSize()) >= FONT_SIZE_CHANGE_RATIO * Math.min(prev.getFontSize(), word.getFontSize())) {
        return true;
    }

    return Math.abs(word.getTextHeight() - prev.getTextHeight()) >= FONT_SIZE_CHANGE_RATIO * Math.min(prev.getTextHeight(), word.getTextHeight());
}
public void mergeblocks(ClassificationPage page, CleanRulings usedRulings, float xThreshold, float yThreshold) { public void mergeblocks(ClassificationPage page, CleanRulings usedRulings, float xThreshold, float yThreshold) {
var blocks = page.getTextBlocks(); var blocks = page.getTextBlocks();
@ -169,8 +175,8 @@ public class DocuMineBlockificationService {
.equals(inner.getClassification()))) { .equals(inner.getClassification()))) {
boolean toDuplicate = current.isToDuplicate() || inner.isToDuplicate(); boolean toDuplicate = current.isToDuplicate() || inner.isToDuplicate();
current.addAll(inner.getSequences()); current.addAll(inner.getWords());
current = buildTextBlock(current.getSequences(), 0); current = buildTextBlock(current.getWords(), 0);
current.setClassification(inner.getClassification()); current.setClassification(inner.getClassification());
current.setToDuplicate(toDuplicate); current.setToDuplicate(toDuplicate);
blocks.set(i, null); blocks.set(i, null);
@ -193,7 +199,7 @@ public class DocuMineBlockificationService {
} }
public static TextPageBlock buildTextBlock(List<TextPositionSequence> wordBlockList, int indexOnPage) { public static TextPageBlock buildTextBlock(List<Word> wordBlockList, int indexOnPage) {
return new TextPageBlock(wordBlockList); return new TextPageBlock(wordBlockList);
} }

View File

@ -11,7 +11,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification
import com.knecon.fforesight.service.layoutparser.processor.model.Orientation; import com.knecon.fforesight.service.layoutparser.processor.model.Orientation;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutDebugLayer; import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutDebugLayer;
@SuppressWarnings("all") @SuppressWarnings("all")
@ -30,20 +30,20 @@ public class RedactManagerBlockificationService {
* @param visualizations * @param visualizations
* @return Page object that contains the Textblock and text statistics. * @return Page object that contains the Textblock and text statistics.
*/ */
public ClassificationPage blockify(List<TextPositionSequence> textPositions, CleanRulings cleanRulings, LayoutDebugLayer visualizations) { public ClassificationPage blockify(List<Word> textPositions, CleanRulings cleanRulings, LayoutDebugLayer visualizations) {
CleanRulings usedRulings = cleanRulings.withoutTextRulings(); CleanRulings usedRulings = cleanRulings.withoutTextRulings();
int indexOnPage = 0; int indexOnPage = 0;
List<TextPositionSequence> chunkWords = new ArrayList<>(); List<Word> chunkWords = new ArrayList<>();
List<AbstractPageBlock> chunkBlockList = new ArrayList<>(); List<AbstractPageBlock> chunkBlockList = new ArrayList<>();
double minX = 1000, maxX = 0, minY = 1000, maxY = 0; double minX = 1000, maxX = 0, minY = 1000, maxY = 0;
TextPositionSequence prev = null; Word prev = null;
boolean wasSplitted = false; boolean wasSplitted = false;
Double splitX1 = null; Double splitX1 = null;
for (TextPositionSequence word : textPositions) { for (Word word : textPositions) {
boolean lineSeparation = word.getYDirAdj() - maxY > word.getHeight() * 1.25; boolean lineSeparation = word.getYDirAdj() - maxY > word.getHeight() * 1.25;
boolean startFromTop = prev != null && word.getYDirAdj() < prev.getYDirAdj() - prev.getTextHeight(); boolean startFromTop = prev != null && word.getYDirAdj() < prev.getYDirAdj() - prev.getTextHeight();

View File

@ -81,7 +81,7 @@ public class ClarifyndClassificationService {
&& (textBlock.getMostPopularWordStyle().equals("bold") && (textBlock.getMostPopularWordStyle().equals("bold")
|| !document.getFontStyleCounter().getCountPerValue().containsKey("bold") || !document.getFontStyleCounter().getCountPerValue().containsKey("bold")
&& textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular() + 1) && textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular() + 1)
&& textBlock.getSequences().get(0).getTextPositions().get(0).getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) { && textBlock.getWords().get(0).getTextPositions().get(0).getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) {
PageBlockType headlineType = HeadlineClassificationService.headlineClassByFontSize(textBlock, headlineFontSizes); PageBlockType headlineType = HeadlineClassificationService.headlineClassByFontSize(textBlock, headlineFontSizes);
headlineClassificationService.classifyHeadline(textBlock, headlineType); headlineClassificationService.classifyHeadline(textBlock, headlineType);
@ -91,7 +91,7 @@ public class ClarifyndClassificationService {
&& textBlock.getMostPopularWordStyle().equals("bold") && textBlock.getMostPopularWordStyle().equals("bold")
&& !document.getFontStyleCounter().getMostPopular().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold")
&& PositionUtils.getApproxLineCount(textBlock) < 2.9 && PositionUtils.getApproxLineCount(textBlock) < 2.9
&& textBlock.getSequences().get(0).getTextPositions().get(0).getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) { && textBlock.getWords().get(0).getTextPositions().get(0).getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) {
PageBlockType headlineType = HeadlineClassificationService.headlineClassByFontSize(textBlock, headlineFontSizes); PageBlockType headlineType = HeadlineClassificationService.headlineClassByFontSize(textBlock, headlineFontSizes);
headlineClassificationService.classifyHeadline(textBlock, headlineType); headlineClassificationService.classifyHeadline(textBlock, headlineType);

View File

@ -15,7 +15,6 @@ import java.util.List;
import java.util.Locale; import java.util.Locale;
import java.util.Map; import java.util.Map;
import java.util.regex.Matcher; import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.springframework.stereotype.Service; import org.springframework.stereotype.Service;
@ -153,6 +152,16 @@ public class DocuMineClassificationService {
&& !headlineWithSlashesMatches) { && !headlineWithSlashesMatches) {
setAsHeadline(headlineClassificationService, textBlock, document, headlineFontSizes); setAsHeadline(headlineClassificationService, textBlock, document, headlineFontSizes);
// } else if (textBlock.getMostPopularWordFont().contains("bold")
// && greaterOrEqualFontThanPageAverage(textBlock, page)
// && textBlock.getWords().size() <= 6
// && PositionUtils.getApproxLineCount(textBlock) < 2.9
// && isAtLeast3Characters
// && charCount > textBlock.getText().length() * 0.75
// && !textBlock.getText().contains(":")
// && textBlock.getWidth() < page.getBodyTextFrame().getWidth() * 0.7) {
//
// setAsHeadline(headlineClassificationService, textBlock, document, headlineFontSizes);
} else if (!listIdentifiers.isEmpty()) { } else if (!listIdentifiers.isEmpty()) {
textBlock.setClassification(PageBlockType.LIST_ITEM); textBlock.setClassification(PageBlockType.LIST_ITEM);

View File

@ -11,7 +11,7 @@ import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.model.text.AbstractBlockOnPage; import com.knecon.fforesight.service.layoutparser.processor.model.text.AbstractBlockOnPage;
import com.knecon.fforesight.service.layoutparser.processor.model.text.ListIdentifier; import com.knecon.fforesight.service.layoutparser.processor.model.text.ListIdentifier;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
@Service @Service
public class ListItemClassificationService { public class ListItemClassificationService {
@ -71,7 +71,7 @@ public class ListItemClassificationService {
List<ListIdentifier> result = new LinkedList<>(); List<ListIdentifier> result = new LinkedList<>();
if (block.block() instanceof TextPageBlock textBlock) { if (block.block() instanceof TextPageBlock textBlock) {
List<TextPositionSequence> sequences = textBlock.getSequences(); List<Word> sequences = textBlock.getWords();
for (int i = 0; i < sequences.size(); i++) { for (int i = 0; i < sequences.size(); i++) {
if (i != 0 && sequences.get(i - 1).getXDirAdj() < sequences.get(i).getXDirAdj()) { if (i != 0 && sequences.get(i - 1).getXDirAdj() < sequences.get(i).getXDirAdj()) {
@ -79,8 +79,8 @@ public class ListItemClassificationService {
continue; continue;
} }
TextPositionSequence sequence = sequences.get(i); Word sequence = sequences.get(i);
List<TextPositionSequence> wordsAtStartOfLine = new ArrayList<>(3); List<Word> wordsAtStartOfLine = new ArrayList<>(3);
int end = Math.min(sequences.size(), i + 3); int end = Math.min(sequences.size(), i + 3);
for (int j = i; j < end; j++) { for (int j = i; j < end; j++) {
if (sequences.get(j).intersectsYDirAdj(sequence, 2)) { if (sequences.get(j).intersectsYDirAdj(sequence, 2)) {

View File

@ -11,7 +11,6 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType; import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils; import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
import lombok.RequiredArgsConstructor; import lombok.RequiredArgsConstructor;
@ -95,7 +94,7 @@ public class RedactManagerClassificationService {
&& (textBlock.getMostPopularWordStyle().equals("bold") && (textBlock.getMostPopularWordStyle().equals("bold")
|| !document.getFontStyleCounter().getCountPerValue().containsKey("bold") || !document.getFontStyleCounter().getCountPerValue().containsKey("bold")
&& textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular() + 1) && textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular() + 1)
&& textBlock.getSequences().get(0).getTextPositions().get(0).getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) { && textBlock.getWords().get(0).getTextPositions().get(0).getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) {
PageBlockType headlineType = HeadlineClassificationService.headlineClassByFontSize(textBlock, headlineFontSizes); PageBlockType headlineType = HeadlineClassificationService.headlineClassByFontSize(textBlock, headlineFontSizes);
headlineClassificationService.classifyHeadline(textBlock, headlineType); headlineClassificationService.classifyHeadline(textBlock, headlineType);
@ -105,7 +104,7 @@ public class RedactManagerClassificationService {
&& textBlock.getMostPopularWordStyle().equals("bold") && textBlock.getMostPopularWordStyle().equals("bold")
&& !document.getFontStyleCounter().getMostPopular().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold")
&& PositionUtils.getApproxLineCount(textBlock) < 2.9 && PositionUtils.getApproxLineCount(textBlock) < 2.9
&& textBlock.getSequences().get(0).getTextPositions().get(0).getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) { && textBlock.getWords().get(0).getTextPositions().get(0).getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) {
PageBlockType headlineType = HeadlineClassificationService.headlineClassByFontSize(textBlock, headlineFontSizes); PageBlockType headlineType = HeadlineClassificationService.headlineClassByFontSize(textBlock, headlineFontSizes);
headlineClassificationService.classifyHeadline(textBlock, headlineType); headlineClassificationService.classifyHeadline(textBlock, headlineType);

View File

@ -25,7 +25,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType; import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextBlockOnPage; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextBlockOnPage;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequenceComparator; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequenceComparator;
import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities; import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities;
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutDebugLayer; import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutDebugLayer;
@ -71,8 +71,8 @@ public class TableOfContentsClassificationService {
ClassificationPage startPage = textBlocks.get(start).page(); ClassificationPage startPage = textBlocks.get(start).page();
List<TextBlockOnPage> initialLookAhead = textBlocks.subList(start, Math.min(start + SURROUNDING_BLOCKS_RADIUS, textBlocks.size())); List<TextBlockOnPage> initialLookAhead = textBlocks.subList(start, Math.min(start + SURROUNDING_BLOCKS_RADIUS, textBlocks.size()));
HashMap<TextPositionSequence, TextBlockOnPage> lookup = new HashMap<>(); HashMap<Word, TextBlockOnPage> lookup = new HashMap<>();
List<TextPositionSequence> numbers = extractNumbers(initialLookAhead, lookup, document.getPages().size()); List<Word> numbers = extractNumbers(initialLookAhead, lookup, document.getPages().size());
TocNumberFinder tocNumberFinder = new TocNumberFinder(numbers, lookup); TocNumberFinder tocNumberFinder = new TocNumberFinder(numbers, lookup);
int lastCandidate = start; int lastCandidate = start;
@ -93,9 +93,9 @@ public class TableOfContentsClassificationService {
break; break;
} }
List<TextPositionSequence> numbersFromBlock = extractNumbers(textBlockOnPage, lookup, document.getPages().size()); List<Word> numbersFromBlock = extractNumbers(textBlockOnPage, lookup, document.getPages().size());
List<TextPositionSequence> currentRightmostCluster = tocNumberFinder.getCurrentRightmostCluster(); List<Word> currentRightmostCluster = tocNumberFinder.getCurrentRightmostCluster();
if (currentRightmostCluster.size() < MINIMUM_MATCHES) { if (currentRightmostCluster.size() < MINIMUM_MATCHES) {
log.debug("No numbers indicating a table of contents here."); log.debug("No numbers indicating a table of contents here.");
@ -132,7 +132,7 @@ public class TableOfContentsClassificationService {
} }
private static void addVisualization(LayoutDebugLayer layoutDebugLayer, TocNumberFinder tocNumberFinder, Map<TextPositionSequence, TextBlockOnPage> lookup) { private static void addVisualization(LayoutDebugLayer layoutDebugLayer, TocNumberFinder tocNumberFinder, Map<Word, TextBlockOnPage> lookup) {
tocNumberFinder.getCurrentRightmostCluster() tocNumberFinder.getCurrentRightmostCluster()
.stream() .stream()
@ -141,9 +141,9 @@ public class TableOfContentsClassificationService {
} }
private static boolean anyIntersection(Collection<TextPositionSequence> numbers1, private static boolean anyIntersection(Collection<Word> numbers1,
Collection<TextPositionSequence> numbers2, Collection<Word> numbers2,
Map<TextPositionSequence, TextBlockOnPage> lookup) { Map<Word, TextBlockOnPage> lookup) {
return numbers1.stream() return numbers1.stream()
.anyMatch(numberFromCluster -> numbers2.stream() .anyMatch(numberFromCluster -> numbers2.stream()
@ -151,9 +151,9 @@ public class TableOfContentsClassificationService {
} }
private static List<TextPositionSequence> extractNumbers(List<TextBlockOnPage> textBlocks, Map<TextPositionSequence, TextBlockOnPage> lookup, int numberOfPages) { private static List<Word> extractNumbers(List<TextBlockOnPage> textBlocks, Map<Word, TextBlockOnPage> lookup, int numberOfPages) {
List<TextPositionSequence> blocks = new LinkedList<>(); List<Word> blocks = new LinkedList<>();
for (TextBlockOnPage textBlock : textBlocks) { for (TextBlockOnPage textBlock : textBlocks) {
blocks.addAll(extractNumbers(textBlock, lookup, numberOfPages)); blocks.addAll(extractNumbers(textBlock, lookup, numberOfPages));
} }
@ -161,14 +161,14 @@ public class TableOfContentsClassificationService {
} }
private static List<TextPositionSequence> extractNumbers(TextBlockOnPage textBlock, Map<TextPositionSequence, TextBlockOnPage> lookup, int numberOfPages) { private static List<Word> extractNumbers(TextBlockOnPage textBlock, Map<Word, TextBlockOnPage> lookup, int numberOfPages) {
List<TextPositionSequence> blocks = new LinkedList<>(); List<Word> blocks = new LinkedList<>();
TextPageBlock block = textBlock.textBlock(); TextPageBlock block = textBlock.textBlock();
List<TextPositionSequence> sequences = block.getSequences(); List<Word> sequences = block.getWords();
for (int i = 0; i < sequences.size(); i++) { for (int i = 0; i < sequences.size(); i++) {
TextPositionSequence word = sequences.get(i); Word word = sequences.get(i);
if (!NUMERIC.matcher(word).matches()) { if (!NUMERIC.matcher(word).matches()) {
continue; continue;
@ -193,17 +193,17 @@ public class TableOfContentsClassificationService {
} }
private static CharSequence getSurroundingString(int i, List<TextPositionSequence> sequences) { private static CharSequence getSurroundingString(int i, List<Word> sequences) {
int end = Math.min(i + 5, sequences.size()); int end = Math.min(i + 5, sequences.size());
return sequences.subList(i, end) return sequences.subList(i, end)
.stream() .stream()
.map(TextPositionSequence::toString) .map(Word::toString)
.collect(Collectors.joining(" ")); .collect(Collectors.joining(" "));
} }
private static boolean matches(TextPositionSequence number1, TextPositionSequence number2, Map<TextPositionSequence, TextBlockOnPage> lookup) { private static boolean matches(Word number1, Word number2, Map<Word, TextBlockOnPage> lookup) {
if (number1.getDir() != number2.getDir()) { if (number1.getDir() != number2.getDir()) {
return false; return false;
@ -247,11 +247,11 @@ public class TableOfContentsClassificationService {
private static class TocNumberFinder { private static class TocNumberFinder {
final UnionFind<TextPositionSequence> numberClusters; final UnionFind<Word> numberClusters;
final HashMap<TextPositionSequence, TextBlockOnPage> lookup; final HashMap<Word, TextBlockOnPage> lookup;
TocNumberFinder(List<TextPositionSequence> blocks, HashMap<TextPositionSequence, TextBlockOnPage> lookup) { TocNumberFinder(List<Word> blocks, HashMap<Word, TextBlockOnPage> lookup) {
this.numberClusters = new UnionFind<>(new HashSet<>(blocks)); this.numberClusters = new UnionFind<>(new HashSet<>(blocks));
for (int i = 0; i < blocks.size(); i++) { for (int i = 0; i < blocks.size(); i++) {
@ -265,14 +265,14 @@ public class TableOfContentsClassificationService {
} }
public void add(TextPositionSequence number) { public void add(Word number) {
if (numberClusters.getElements().contains(number)) { if (numberClusters.getElements().contains(number)) {
return; return;
} }
numberClusters.addElement(number); numberClusters.addElement(number);
for (TextPositionSequence element : numberClusters.getElements()) { for (Word element : numberClusters.getElements()) {
if (matches(number, element, lookup)) { if (matches(number, element, lookup)) {
numberClusters.union(element, number); numberClusters.union(element, number);
} }
@ -280,7 +280,7 @@ public class TableOfContentsClassificationService {
} }
public List<TextPositionSequence> getCurrentRightmostCluster() { public List<Word> getCurrentRightmostCluster() {
return numberClusters.getGroups() return numberClusters.getGroups()
.stream() .stream()
@ -322,9 +322,9 @@ public class TableOfContentsClassificationService {
// } // }
public List<TextPositionSequence> removeOutliers(List<TextPositionSequence> numbers) { public List<Word> removeOutliers(List<Word> numbers) {
List<TextPositionSequence> result = new ArrayList<>(); List<Word> result = new ArrayList<>();
result.add(numbers.get(0)); result.add(numbers.get(0));
@ -346,7 +346,7 @@ public class TableOfContentsClassificationService {
// Helper method to check if removing the current number results in a better order // Helper method to check if removing the current number results in a better order
public static boolean isBetterWithout(List<TextPositionSequence> numbers, int i) { public static boolean isBetterWithout(List<Word> numbers, int i) {
if (i == 0 || i == numbers.size() - 1) { if (i == 0 || i == numbers.size() - 1) {
return false; return false;
@ -362,7 +362,7 @@ public class TableOfContentsClassificationService {
} }
private static int getNumberAsInt(List<TextPositionSequence> numbers, int i) { private static int getNumberAsInt(List<Word> numbers, int i) {
return Integer.parseInt(numbers.get(i).toString()); return Integer.parseInt(numbers.get(i).toString());
} }

View File

@ -97,12 +97,12 @@ public class DocumentGraphFactory {
for (TableOfContentItem tocItem : classificationDocument.getTableOfContents()) { for (TableOfContentItem tocItem : classificationDocument.getTableOfContents()) {
GenericSemanticNode parent = tocItem.getParent() == null ? null : tocItem.getParent().getSection(); GenericSemanticNode parent = tocItem.getParent() == null ? null : tocItem.getParent().getSection();
Optional<GenericSemanticNode> section = SectionNodeFactory.addSection(layoutParsingType, Optional<GenericSemanticNode> section = SectionNodeFactory.addSection(layoutParsingType,
parent, parent,
tocItem.getChildren().isEmpty(), tocItem.getChildren().isEmpty(),
tocItem.getNonEmptySectionBlocks(), tocItem.getNonEmptySectionBlocks(),
tocItem.getImages(), tocItem.getImages(),
context, context,
document); document);
tocItem.setSection(section.orElse(null)); tocItem.setSection(section.orElse(null));
} }
} }
@ -133,9 +133,9 @@ public class DocumentGraphFactory {
if (node instanceof DuplicatedParagraph duplicatedParagraph) { if (node instanceof DuplicatedParagraph duplicatedParagraph) {
AtomicTextBlock unsortedTextBlock = context.textBlockFactory.buildAtomicTextBlock(textBlocks.stream() AtomicTextBlock unsortedTextBlock = context.textBlockFactory.buildAtomicTextBlock(textBlocks.stream()
.flatMap(tb -> tb.getSequences() .flatMap(tb -> tb.getWords()
.stream()) .stream())
.collect(Collectors.toList()), node, context, page); .collect(Collectors.toList()), node, context, page);
duplicatedParagraph.setUnsortedLeafTextBlock(unsortedTextBlock); duplicatedParagraph.setUnsortedLeafTextBlock(unsortedTextBlock);
} }

View File

@ -12,7 +12,7 @@ import java.util.Objects;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange; import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange;
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition; import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
import lombok.experimental.UtilityClass; import lombok.experimental.UtilityClass;
@ -28,7 +28,7 @@ public class SearchTextWithTextPositionFactory {
public static final double LINEBREAK_DELTA_TOLERANCE = 1.5; public static final double LINEBREAK_DELTA_TOLERANCE = 1.5;
public SearchTextWithTextPositionDto buildSearchTextToTextPositionDto(List<TextPositionSequence> sequences) { public SearchTextWithTextPositionDto buildSearchTextToTextPositionDto(List<Word> sequences) {
if (sequences.isEmpty() || sequences.stream() if (sequences.isEmpty() || sequences.stream()
.allMatch(sequence -> sequence.getTextPositions().isEmpty())) { .allMatch(sequence -> sequence.getTextPositions().isEmpty())) {
@ -40,7 +40,7 @@ public class SearchTextWithTextPositionFactory {
RedTextPosition currentTextPosition = sequences.get(0).getTextPositions().get(0); RedTextPosition currentTextPosition = sequences.get(0).getTextPositions().get(0);
RedTextPosition previousTextPosition = RedTextPosition.builder().unicode(" ").bBoxDirAdj(currentTextPosition.getBBoxDirAdj()).build(); RedTextPosition previousTextPosition = RedTextPosition.builder().unicode(" ").bBoxDirAdj(currentTextPosition.getBBoxDirAdj()).build();
for (TextPositionSequence word : sequences) { for (Word word : sequences) {
for (int i = 0; i < word.getTextPositions().size(); ++i) { for (int i = 0; i < word.getTextPositions().size(); ++i) {
currentTextPosition = word.getTextPositions().get(i); currentTextPosition = word.getTextPositions().get(i);
@ -66,7 +66,7 @@ public class SearchTextWithTextPositionFactory {
} }
List<Rectangle2D> positions = sequences.stream() List<Rectangle2D> positions = sequences.stream()
.map(TextPositionSequence::getTextPositions) .map(Word::getTextPositions)
.flatMap(Collection::stream) .flatMap(Collection::stream)
.map(RedTextPosition::getBBoxPdf) .map(RedTextPosition::getBBoxPdf)
.toList(); .toList();

View File

@ -16,7 +16,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Ta
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock; import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionOperations; import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionOperations;
import lombok.experimental.UtilityClass; import lombok.experimental.UtilityClass;
@ -115,7 +115,7 @@ public class TableNodeFactory {
if (cell.getTextBlocks().isEmpty()) { if (cell.getTextBlocks().isEmpty()) {
tableCell.setLeafTextBlock(context.getTextBlockFactory().emptyTextBlock(tableNode, context, page)); tableCell.setLeafTextBlock(context.getTextBlockFactory().emptyTextBlock(tableNode, context, page));
} else if (cell.getTextBlocks().size() == 1) { } else if (cell.getTextBlocks().size() == 1) {
textBlock = context.getTextBlockFactory().buildAtomicTextBlock(cell.getTextBlocks().get(0).getSequences(), tableCell, context, page); textBlock = context.getTextBlockFactory().buildAtomicTextBlock(cell.getTextBlocks().get(0).getWords(), tableCell, context, page);
tableCell.setLeafTextBlock(textBlock); tableCell.setLeafTextBlock(textBlock);
} else if (firstTextBlockIsHeadline(cell)) { } else if (firstTextBlockIsHeadline(cell)) {
SectionNodeFactory.addSection(layoutParsingType, SectionNodeFactory.addSection(layoutParsingType,
@ -129,7 +129,7 @@ public class TableNodeFactory {
context, context,
document); document);
} else if (cellAreaIsSmallerThanPageAreaTimesThreshold(cell, page)) { } else if (cellAreaIsSmallerThanPageAreaTimesThreshold(cell, page)) {
List<TextPositionSequence> sequences = TextPositionOperations.mergeAndSort(cell.getTextBlocks()); List<Word> sequences = TextPositionOperations.mergeAndSort(cell.getTextBlocks());
textBlock = context.getTextBlockFactory().buildAtomicTextBlock(sequences, tableCell, context, page); textBlock = context.getTextBlockFactory().buildAtomicTextBlock(sequences, tableCell, context, page);
tableCell.setLeafTextBlock(textBlock); tableCell.setLeafTextBlock(textBlock);
} else { } else {

View File

@ -2,7 +2,7 @@ package com.knecon.fforesight.service.layoutparser.processor.services.factory;
import java.util.List; import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock; import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
@ -17,14 +17,14 @@ public class TextBlockFactory {
long textBlockIdx; long textBlockIdx;
public AtomicTextBlock buildAtomicTextBlock(List<TextPositionSequence> sequences, SemanticNode parent, DocumentGraphFactory.Context context, Page page) { public AtomicTextBlock buildAtomicTextBlock(List<Word> sequences, SemanticNode parent, DocumentGraphFactory.Context context, Page page) {
Integer numberOnPage = context.getAndIncrementTextBlockNumberOnPage(page); Integer numberOnPage = context.getAndIncrementTextBlockNumberOnPage(page);
return buildAtomicTextBlock(sequences, parent, numberOnPage, page); return buildAtomicTextBlock(sequences, parent, numberOnPage, page);
} }
public AtomicTextBlock buildAtomicTextBlock(List<TextPositionSequence> sequences, SemanticNode parent, Integer numberOnPage, Page page) { public AtomicTextBlock buildAtomicTextBlock(List<Word> sequences, SemanticNode parent, Integer numberOnPage, Page page) {
SearchTextWithTextPositionDto searchTextWithTextPositionDto = SearchTextWithTextPositionFactory.buildSearchTextToTextPositionDto(sequences); SearchTextWithTextPositionDto searchTextWithTextPositionDto = SearchTextWithTextPositionFactory.buildSearchTextToTextPositionDto(sequences);
int offset = stringOffset; int offset = stringOffset;

View File

@ -11,7 +11,7 @@ import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox; import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation; import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
import lombok.RequiredArgsConstructor; import lombok.RequiredArgsConstructor;
@ -33,10 +33,10 @@ public class GraphicExtractorService {
PDPage pdPage, PDPage pdPage,
int pageNumber, int pageNumber,
CleanRulings cleanRulings, CleanRulings cleanRulings,
List<TextPositionSequence> textPositionSequences, List<Word> words,
boolean graphicsRaster) { boolean graphicsRaster) {
List<Box> characterBBoxes = getCharacterBBoxes(textPositionSequences); List<Box> characterBBoxes = getCharacterBBoxes(words);
List<Box> classifiedRulingsBoxes = getLineBBoxesOfAllClassifiedRulings(cleanRulings); List<Box> classifiedRulingsBoxes = getLineBBoxesOfAllClassifiedRulings(cleanRulings);
GraphicBBDetector graphicBBDetector = new GraphicBBDetector(pdPage, true); GraphicBBDetector graphicBBDetector = new GraphicBBDetector(pdPage, true);
@ -63,9 +63,9 @@ public class GraphicExtractorService {
} }
private List<Box> getCharacterBBoxes(List<TextPositionSequence> textPositionSequences) { private List<Box> getCharacterBBoxes(List<Word> words) {
return textPositionSequences.stream() return words.stream()
.map(BoundingBox::getBBoxPdf) .map(BoundingBox::getBBoxPdf)
.map(Box::new) .map(Box::new)
.collect(Collectors.toList()); .collect(Collectors.toList());

View File

@ -40,7 +40,7 @@ import org.apache.pdfbox.text.TextPosition;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition; import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
import lombok.Getter; import lombok.Getter;
import lombok.Setter; import lombok.Setter;
@ -52,7 +52,7 @@ import lombok.extern.slf4j.Slf4j;
public class PDFLinesTextStripper extends PDFTextStripper { public class PDFLinesTextStripper extends PDFTextStripper {
private final static Set<String> DOT_LIKE_CHARACTERS = Set.of(".", "·", "", "", "", "", "", "", "", "", "", "", "", ""); private final static Set<String> DOT_LIKE_CHARACTERS = Set.of(".", "·", "", "", "", "", "", "", "", "", "", "", "", "");
private final List<TextPositionSequence> textPositionSequences = new ArrayList<>(); private final List<Word> words = new ArrayList<>();
private final List<Ruling> rulings = new ArrayList<>(); private final List<Ruling> rulings = new ArrayList<>();
private final List<Ruling> graphicsPath = new ArrayList<>(); private final List<Ruling> graphicsPath = new ArrayList<>();
@Setter @Setter
@ -246,10 +246,10 @@ public class PDFLinesTextStripper extends PDFTextStripper {
direction = textPositions.get(i).getDir(); direction = textPositions.get(i).getDir();
} }
if (!textPositionSequences.isEmpty()) { if (!words.isEmpty()) {
previous = textPositionSequences.get(textPositionSequences.size() - 1) previous = words.get(words.size() - 1)
.getTextPositions() .getTextPositions()
.get(textPositionSequences.get(textPositionSequences.size() - 1).getTextPositions().size() - 1); .get(words.get(words.size() - 1).getTextPositions().size() - 1);
} }
if (i == 0 && (textPositions.get(i).getUnicode().equals(" ") || textPositions.get(i).getUnicode().equals("\u00A0") || textPositions.get(i).getUnicode().equals("\t"))) { if (i == 0 && (textPositions.get(i).getUnicode().equals(" ") || textPositions.get(i).getUnicode().equals("\u00A0") || textPositions.get(i).getUnicode().equals("\t"))) {
@ -259,7 +259,7 @@ public class PDFLinesTextStripper extends PDFTextStripper {
if (textPositions.get(i).getDir() != direction && startIndex != i) { if (textPositions.get(i).getDir() != direction && startIndex != i) {
List<TextPosition> sublist = textPositions.subList(startIndex, i); List<TextPosition> sublist = textPositions.subList(startIndex, i);
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart)); words.add(new Word(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart));
startIndex = i; startIndex = i;
direction = textPositions.get(i).getDir(); direction = textPositions.get(i).getDir();
} }
@ -268,7 +268,7 @@ public class PDFLinesTextStripper extends PDFTextStripper {
if (checkIfCurrentPositionIsToTheRightOfPreviousPosition(i, textPositions)) { if (checkIfCurrentPositionIsToTheRightOfPreviousPosition(i, textPositions)) {
List<TextPosition> sublist = textPositions.subList(startIndex, i); List<TextPosition> sublist = textPositions.subList(startIndex, i);
if (checkIfSequenceContainsOnlyWhitespaces(sublist)) { if (checkIfSequenceContainsOnlyWhitespaces(sublist)) {
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart)); words.add(new Word(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart));
} }
startIndex = i; startIndex = i;
} }
@ -276,7 +276,7 @@ public class PDFLinesTextStripper extends PDFTextStripper {
if (textPositions.get(i).getRotation() == 0 && i > 0 && textPositions.get(i).getX() > textPositions.get(i - 1).getEndX() + 1) { if (textPositions.get(i).getRotation() == 0 && i > 0 && textPositions.get(i).getX() > textPositions.get(i - 1).getEndX() + 1) {
List<TextPosition> sublist = textPositions.subList(startIndex, i); List<TextPosition> sublist = textPositions.subList(startIndex, i);
if (checkIfSequenceContainsOnlyWhitespaces(sublist)) { if (checkIfSequenceContainsOnlyWhitespaces(sublist)) {
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart)); words.add(new Word(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart));
} }
startIndex = i; startIndex = i;
} }
@ -290,22 +290,22 @@ public class PDFLinesTextStripper extends PDFTextStripper {
// Remove false sequence ends (whitespaces) // Remove false sequence ends (whitespaces)
if (checkIfGapSizeBetweenCharactersSmallerThanMaximum(previous, sublist, 0.01f)) { if (checkIfGapSizeBetweenCharactersSmallerThanMaximum(previous, sublist, 0.01f)) {
for (TextPosition t : sublist) { for (TextPosition t : sublist) {
textPositionSequences.get(textPositionSequences.size() - 1).add(t); words.get(words.size() - 1).add(t);
} }
} else { } else {
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart)); words.add(new Word(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart));
} }
} }
startIndex = i + 1; startIndex = i + 1;
} }
if (isDottedLineFollowedByWord(textPositions, i, startIndex)) { if (isDottedLineFollowedByWord(textPositions, i, startIndex)) {
List<TextPosition> sublist = textPositions.subList(startIndex, i); List<TextPosition> sublist = textPositions.subList(startIndex, i);
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart)); words.add(new Word(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart));
startIndex = i; startIndex = i;
} }
if (isWordFollowedByDottedLine(textPositions, i, startIndex)) { if (isWordFollowedByDottedLine(textPositions, i, startIndex)) {
List<TextPosition> sublist = textPositions.subList(startIndex, i - 2); List<TextPosition> sublist = textPositions.subList(startIndex, i - 2);
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart)); words.add(new Word(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart));
startIndex = i - 2; startIndex = i - 2;
} }
} }
@ -324,10 +324,10 @@ public class PDFLinesTextStripper extends PDFTextStripper {
&& sublist.get(0).getYDirAdj() == previous.getYDirAdj() && sublist.get(0).getYDirAdj() == previous.getYDirAdj()
&& sublist.get(0).getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()) < 0.01) { && sublist.get(0).getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()) < 0.01) {
for (TextPosition t : sublist) { for (TextPosition t : sublist) {
textPositionSequences.get(textPositionSequences.size() - 1).add(t); words.get(words.size() - 1).add(t);
} }
} else { } else {
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, isParagraphStart)); words.add(new Word(sublist, pageNumber, isParagraphStart));
} }
} }
@ -392,7 +392,7 @@ public class PDFLinesTextStripper extends PDFTextStripper {
@Override @Override
public String getText(PDDocument doc) throws IOException { public String getText(PDDocument doc) throws IOException {
textPositionSequences.clear(); words.clear();
rulings.clear(); rulings.clear();
graphicsPath.clear(); graphicsPath.clear();
path_x = 0.0f; path_x = 0.0f;

View File

@ -13,7 +13,7 @@ import org.apache.pdfbox.text.TextPosition;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox; import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
import lombok.experimental.UtilityClass; import lombok.experimental.UtilityClass;
@ -48,7 +48,7 @@ public class MarkedContentUtils {
return markedContentByYPosition.values() return markedContentByYPosition.values()
.stream() .stream()
.map(textPositions -> new TextPositionSequence(textPositions, 0, true).getBBoxPdf()) .map(textPositions -> new Word(textPositions, 0, true).getBBoxPdf())
.map(t -> new Rectangle2D.Double(t.getX(), t.getY() - Math.abs(t.getHeight()), t.getWidth(), Math.abs(t.getHeight()))) .map(t -> new Rectangle2D.Double(t.getX(), t.getY() - Math.abs(t.getHeight()), t.getWidth(), Math.abs(t.getHeight())))
.collect(Collectors.toList()); .collect(Collectors.toList());
} }
@ -89,7 +89,7 @@ public class MarkedContentUtils {
.filter(content -> content instanceof TextPosition) .filter(content -> content instanceof TextPosition)
.map(content -> (TextPosition) content) .map(content -> (TextPosition) content)
.filter(content -> !content.getUnicode().equals(" ")) .filter(content -> !content.getUnicode().equals(" "))
.map(textPositions -> new TextPositionSequence(List.of(textPositions), 0, true)) .map(textPositions -> new Word(List.of(textPositions), 0, true))
.map(BoundingBox::getBBoxPdf) .map(BoundingBox::getBBoxPdf)
.collect(Collectors.toList()); .collect(Collectors.toList());
} }

View File

@ -16,7 +16,7 @@ import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Union
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.DoubleUtils; import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.DoubleUtils;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.FastAtan2; import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.FastAtan2;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
import lombok.experimental.UtilityClass; import lombok.experimental.UtilityClass;
@ -36,33 +36,33 @@ public class TextPositionOperations {
.thenComparing(TextBoundingBox::getXDirAdj, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, X_THRESHOLD)); .thenComparing(TextBoundingBox::getXDirAdj, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, X_THRESHOLD));
public List<TextPositionSequence> mergeAndSort(List<TextPageBlock> textBlocks) { public List<Word> mergeAndSort(List<TextPageBlock> textBlocks) {
var sequences = textBlocks.stream() var sequences = textBlocks.stream()
.flatMap(tb -> tb.getSequences() .flatMap(tb -> tb.getWords()
.stream()) .stream())
.collect(Collectors.toSet()); .collect(Collectors.toSet());
return sortUsingLineDetection(sequences); return sortUsingLineDetection(sequences);
} }
public List<TextPositionSequence> sort(List<TextPositionSequence> sequences) { public List<Word> sort(List<Word> sequences) {
return sortUsingLineDetection(new HashSet<>(sequences)); return sortUsingLineDetection(new HashSet<>(sequences));
} }
private List<TextPositionSequence> sortUsingLineDetection(Set<TextPositionSequence> sequences) { private List<Word> sortUsingLineDetection(Set<Word> sequences) {
return sortLines(groupByLine(sequences)); return sortLines(groupByLine(sequences));
} }
public List<TextPositionSequence> sortLines(Collection<Set<TextPositionSequence>> lines) { public List<Word> sortLines(Collection<Set<Word>> lines) {
List<List<TextPositionSequence>> lineBlocks = new ArrayList<>(); List<List<Word>> lineBlocks = new ArrayList<>();
for (Set<TextPositionSequence> line : lines) { for (Set<Word> line : lines) {
List<TextPositionSequence> sortedLine = sortByXDirAdj(line); List<Word> sortedLine = sortByXDirAdj(line);
if (!sortedLine.isEmpty()) { if (!sortedLine.isEmpty()) {
lineBlocks.add(sortedLine); lineBlocks.add(sortedLine);
} }
@ -70,35 +70,35 @@ public class TextPositionOperations {
// need to use old sorting, since COMPARATOR_DIR_ADJ is not transitive // need to use old sorting, since COMPARATOR_DIR_ADJ is not transitive
QuickSort.sort(lineBlocks, Comparator.comparing(line -> line.get(0), COMPARATOR_DIR_ADJ)); QuickSort.sort(lineBlocks, Comparator.comparing(line -> line.get(0), COMPARATOR_DIR_ADJ));
List<TextPositionSequence> list = new ArrayList<>(); List<Word> list = new ArrayList<>();
for (List<TextPositionSequence> textPositionSequences : lineBlocks) { for (List<Word> words : lineBlocks) {
list.addAll(textPositionSequences); list.addAll(words);
} }
return list; return list;
} }
private List<TextPositionSequence> sortByXDirAdj(Set<TextPositionSequence> line) { private List<Word> sortByXDirAdj(Set<Word> line) {
return line.stream() return line.stream()
.sorted(Comparator.comparing(TextPositionSequence::getXDirAdj)) .sorted(Comparator.comparing(Word::getXDirAdj))
.toList(); .toList();
} }
public Collection<Set<TextPositionSequence>> groupByLine(Set<TextPositionSequence> sequences) { public Collection<Set<Word>> groupByLine(Set<Word> sequences) {
double maxLineDistance = sequences.stream() double maxLineDistance = sequences.stream()
.map(TextPositionSequence::getBBoxDirAdj) .map(Word::getBBoxDirAdj)
.mapToDouble(RectangularShape::getHeight).average().orElse(10) * MAX_LINE_HEIGHT_FACTOR; .mapToDouble(RectangularShape::getHeight).average().orElse(10) * MAX_LINE_HEIGHT_FACTOR;
double maxXGap = sequences.stream() double maxXGap = sequences.stream()
.map(TextPositionSequence::getBBoxDirAdj) .map(Word::getBBoxDirAdj)
.mapToDouble(RectangularShape::getWidth).average().orElse(75) * MAX_WORD_DISTANCE_FACTOR; .mapToDouble(RectangularShape::getWidth).average().orElse(75) * MAX_WORD_DISTANCE_FACTOR;
UnionFind<TextPositionSequence> unionFind = new UnionFind<>(sequences); UnionFind<Word> unionFind = new UnionFind<>(sequences);
for (TextPositionSequence sequence : sequences) { for (Word sequence : sequences) {
for (TextPositionSequence sequence2 : sequences) { for (Word sequence2 : sequences) {
if (sequence.equals(sequence2)) { // || unionFind.inSameSet(sequence, sequence2)) doing this is actually slower than not doing it if (sequence.equals(sequence2)) { // || unionFind.inSameSet(sequence, sequence2)) doing this is actually slower than not doing it
continue; continue;
@ -144,10 +144,10 @@ public class TextPositionOperations {
} }
public List<TextPositionSequence> merge(List<TextPageBlock> textBlocks) { public List<Word> merge(List<TextPageBlock> textBlocks) {
return textBlocks.stream() return textBlocks.stream()
.map(TextPageBlock::getSequences) .map(TextPageBlock::getWords)
.flatMap(Collection::stream) .flatMap(Collection::stream)
.collect(Collectors.toList()); .collect(Collectors.toList());
} }

View File

@ -27,7 +27,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.outline.Outlin
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.model.text.ListIdentifier; import com.knecon.fforesight.service.layoutparser.processor.model.text.ListIdentifier;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms; import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms;
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils; import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation; import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
@ -58,14 +58,14 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
Map<Integer, AtomicInteger> outlineObjectsWithoutPointsPerPage = new HashMap<>(); Map<Integer, AtomicInteger> outlineObjectsWithoutPointsPerPage = new HashMap<>();
public void addTextVisualizations(List<TextPositionSequence> textPositionSequences, int pageNumber) { public void addTextVisualizations(List<Word> words, int pageNumber) {
if (!active) { if (!active) {
return; return;
} }
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(pageNumber, this.words); VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(pageNumber, this.words);
visualizationsOnPage.getColoredRectangles() visualizationsOnPage.getColoredRectangles()
.addAll(textPositionSequences.stream() .addAll(words.stream()
.map(BoundingBox::getBBoxPdf) .map(BoundingBox::getBBoxPdf)
.map(rect -> new ColoredRectangle(rect, WORDS_COLOR, 1)) .map(rect -> new ColoredRectangle(rect, WORDS_COLOR, 1))
.toList()); .toList());
@ -188,7 +188,7 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
} }
public void addLineVisualizationsFromNestedTextPosition(Collection<Set<TextPositionSequence>> lines, int pageNumber) { public void addLineVisualizationsFromNestedTextPosition(Collection<Set<Word>> lines, int pageNumber) {
if (!active) { if (!active) {
return; return;
@ -291,7 +291,7 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
} }
public void addTocPages(List<TextPositionSequence> numbers, int page) { public void addTocPages(List<Word> numbers, int page) {
if (!active) { if (!active) {
return; return;

View File

@ -26,7 +26,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock; import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory; import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionOperations; import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionOperations;
import com.knecon.fforesight.service.layoutparser.server.PDFNetInitializer; import com.knecon.fforesight.service.layoutparser.server.PDFNetInitializer;
@ -387,7 +387,7 @@ public class DocumentReadingOrderTest extends BuildDocumentTest {
for (AbstractPageBlock abstractBlock : classificationDocumentPage.getTextBlocks()) { for (AbstractPageBlock abstractBlock : classificationDocumentPage.getTextBlocks()) {
if (abstractBlock instanceof TextPageBlock textBlock) { if (abstractBlock instanceof TextPageBlock textBlock) {
for (TextPositionSequence sequence : TextPositionOperations.mergeAndSort(List.of(textBlock))) { for (Word sequence : TextPositionOperations.mergeAndSort(List.of(textBlock))) {
float stringWidth; float stringWidth;
try { try {

View File

@ -32,7 +32,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.textbloc
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage; import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse;
@ -105,9 +105,9 @@ public class PdfSegmentationServiceTest extends AbstractTest {
List<PageContents> textPositionPerPage = PageContentExtractor.getSortedPageContents(fileName); List<PageContents> textPositionPerPage = PageContentExtractor.getSortedPageContents(fileName);
var textPositions = textPositionPerPage.stream() var textPositions = textPositionPerPage.stream()
.flatMap(t -> t.getSortedTextPositionSequences() .flatMap(t -> t.getSortedWords()
.stream() .stream()
.map(TextPositionSequence::toString)) .map(Word::toString))
.collect(Collectors.joining(" ")); .collect(Collectors.joining(" "));
assertThat(textPositions.contains(textToSearch)).isFalse(); assertThat(textPositions.contains(textToSearch)).isFalse();
@ -117,7 +117,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
.get(0).getTextBlocks().size()).isEqualTo(3); .get(0).getTextBlocks().size()).isEqualTo(3);
assertThat(classificationDocument.getHeaders() assertThat(classificationDocument.getHeaders()
.get(0).getTextBlocks() .get(0).getTextBlocks()
.get(0).getSequences().size()).isEqualTo(8); .get(0).getWords().size()).isEqualTo(8);
assertThat(classificationDocument.getHeaders() assertThat(classificationDocument.getHeaders()
.get(0).getTextBlocks() .get(0).getTextBlocks()
.get(0).toString()).contains(textToSearch); .get(0).toString()).contains(textToSearch);

View File

@ -36,7 +36,7 @@ class GapAcrossLinesDetectionServiceTest {
System.out.println("start column detection"); System.out.println("start column detection");
start = System.currentTimeMillis(); start = System.currentTimeMillis();
for (PageInformation pageInformation : pageInformations) { for (PageInformation pageInformation : pageInformations) {
GapInformation gapInformation = GapDetectionService.findGapsInLines(pageInformation.getPageContents().getSortedTextPositionSequences(), pageInformation.getMainBodyTextFrame()); GapInformation gapInformation = GapDetectionService.findGapsInLines(pageInformation.getPageContents().getSortedWords(), pageInformation.getMainBodyTextFrame());
columnsPerPage.add(GapsAcrossLinesService.detectXGapsAcrossLines(gapInformation, pageInformation.getMainBodyTextFrame())); columnsPerPage.add(GapsAcrossLinesService.detectXGapsAcrossLines(gapInformation, pageInformation.getMainBodyTextFrame()));
} }
System.out.printf("Finished column detection in %d ms%n", System.currentTimeMillis() - start); System.out.printf("Finished column detection in %d ms%n", System.currentTimeMillis() - start);

View File

@ -10,7 +10,7 @@ import java.util.stream.Collectors;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation; import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
import com.knecon.fforesight.service.layoutparser.processor.services.InvisibleTableDetectionService; import com.knecon.fforesight.service.layoutparser.processor.services.InvisibleTableDetectionService;
import com.knecon.fforesight.service.layoutparser.processor.services.PageInformationService; import com.knecon.fforesight.service.layoutparser.processor.services.PageInformationService;
import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor; import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
@ -34,18 +34,18 @@ class InvisibleTableDetectionServiceTest {
.collect(Collectors.toList()); .collect(Collectors.toList());
int pageNumber = 1; int pageNumber = 1;
Rectangle2D tableBBox = pageContents.get(0).getPageContents().getSortedTextPositionSequences().subList(45, 152) Rectangle2D tableBBox = pageContents.get(0).getPageContents().getSortedWords().subList(45, 152)
.stream() .stream()
.map(TextPositionSequence::getBBox) .map(Word::getBBox)
.map(this::mirrorY) .map(this::mirrorY)
.collect(RectangleTransformations.collectBBox()); .collect(RectangleTransformations.collectBBox());
List<TextPositionSequence> textPositionSequences = pageContents.get(0).getPageContents().getSortedTextPositionSequences() List<Word> words = pageContents.get(0).getPageContents().getSortedWords()
.stream() .stream()
.filter(textPositionSequence -> tableBBox.contains(mirrorY(textPositionSequence.getBBox()))) .filter(textPositionSequence -> tableBBox.contains(mirrorY(textPositionSequence.getBBox())))
.toList(); .toList();
var table = InvisibleTableDetectionService.detectTable(textPositionSequences, tableBBox); var table = InvisibleTableDetectionService.detectTable(words, tableBBox);
PdfDraw.drawRectanglesPerPage(fileName, PdfDraw.drawRectanglesPerPage(fileName,
List.of(table.stream() List.of(table.stream()

View File

@ -6,7 +6,7 @@ import java.util.List;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents; import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor; import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw; import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
@ -25,9 +25,9 @@ class PageContentExtractorTest {
PdfDraw.drawRectanglesPerPageNumberedByLine(fileName, PdfDraw.drawRectanglesPerPageNumberedByLine(fileName,
textPositionPerPage.stream() textPositionPerPage.stream()
.map(t -> t.getSortedTextPositionSequences() .map(t -> t.getSortedWords()
.stream() .stream()
.map(TextPositionSequence::getBBoxPdf) .map(Word::getBBoxPdf)
.map(List::of) .map(List::of)
.toList()) .toList())
.toList(), tmpFileName); .toList(), tmpFileName);

View File

@ -11,7 +11,7 @@ import org.junit.jupiter.api.Test;
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents; import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor; import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService; import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService;
import com.knecon.fforesight.service.layoutparser.processor.services.TextRulingsClassifier; import com.knecon.fforesight.service.layoutparser.processor.services.TextRulingsClassifier;
@ -32,16 +32,16 @@ public class RulingsClassifierTest {
for (PageContents pageContent : pageContents) { for (PageContents pageContent : pageContents) {
CleanRulings cleanRulings = rulingCleaningService.deduplicateAndStraightenRulings(Collections.emptyList(), pageContent.getRulings()); CleanRulings cleanRulings = rulingCleaningService.deduplicateAndStraightenRulings(Collections.emptyList(), pageContent.getRulings());
RectangularIntersectionFinder.find(cleanRulings.getHorizontals(), cleanRulings.getVerticals()); RectangularIntersectionFinder.find(cleanRulings.getHorizontals(), cleanRulings.getVerticals());
TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(pageContent.getSortedTextPositionSequences(), cleanRulings); TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(pageContent.getSortedWords(), cleanRulings);
assertTrue(pageContent.getSortedTextPositionSequences() assertTrue(pageContent.getSortedWords()
.stream() .stream()
.filter(word -> word.toString().equals("Underlined")) .filter(word -> word.toString().equals("Underlined"))
.allMatch(TextPositionSequence::isUnderline)); .allMatch(Word::isUnderline));
assertTrue(pageContent.getSortedTextPositionSequences() assertTrue(pageContent.getSortedWords()
.stream() .stream()
.filter(word -> word.toString().equals("Striketrough")) .filter(word -> word.toString().equals("Striketrough"))
.allMatch(TextPositionSequence::isStrikethrough)); .allMatch(Word::isStrikethrough));
assertEquals(4, assertEquals(4,
cleanRulings.buildAll() cleanRulings.buildAll()
@ -70,7 +70,7 @@ public class RulingsClassifierTest {
for (PageContents pageContent : pageContents) { for (PageContents pageContent : pageContents) {
CleanRulings cleanRulings = rulingCleaningService.deduplicateAndStraightenRulings(Collections.emptyList(), pageContent.getRulings()); CleanRulings cleanRulings = rulingCleaningService.deduplicateAndStraightenRulings(Collections.emptyList(), pageContent.getRulings());
RectangularIntersectionFinder.find(cleanRulings.getHorizontals(), cleanRulings.getVerticals()); RectangularIntersectionFinder.find(cleanRulings.getHorizontals(), cleanRulings.getVerticals());
TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(pageContent.getSortedTextPositionSequences(), cleanRulings); TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(pageContent.getSortedWords(), cleanRulings);
assertEquals(30, cleanRulings.getHorizontals().size()); assertEquals(30, cleanRulings.getHorizontals().size());
assertEquals(30, cleanRulings.getTableLines().getHorizontals().size()); assertEquals(30, cleanRulings.getTableLines().getHorizontals().size());

View File

@ -1,8 +1,6 @@
package com.knecon.fforesight.service.layoutparser.server.utils; package com.knecon.fforesight.service.layoutparser.server.utils;
import java.awt.geom.Rectangle2D;
import java.io.File; import java.io.File;
import java.io.FileOutputStream;
import java.nio.file.Path; import java.nio.file.Path;
import java.util.Map; import java.util.Map;
@ -12,27 +10,11 @@ import org.springframework.core.io.ClassPathResource;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline; import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse;
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory; import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
import com.knecon.fforesight.service.viewerdoc.model.Standard14EmbeddableFont;
import com.pdftron.common.Matrix2D;
import com.pdftron.pdf.ColorPt;
import com.pdftron.pdf.ColorSpace;
import com.pdftron.pdf.Element;
import com.pdftron.pdf.ElementBuilder;
import com.pdftron.pdf.ElementWriter;
import com.pdftron.pdf.Font;
import com.pdftron.pdf.GState;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.Page;
import com.pdftron.sdf.SDFDoc;
import lombok.SneakyThrows; import lombok.SneakyThrows;

View File

@ -46,9 +46,9 @@ public class LayoutDebugLayerConfig extends AbstractLayerGroup {
new Color(0, 188, 212), new Color(0, 188, 212),
new Color(121, 85, 72)); new Color(121, 85, 72));
protected final Visualizations words = Visualizations.builder().layer(LayerIdentifier.WORDS).visibleByDefault(true).build(); protected final Visualizations words = Visualizations.builder().layer(LayerIdentifier.WORDS).build();
protected final Visualizations lines = Visualizations.builder().layer(LayerIdentifier.LINES).build(); protected final Visualizations lines = Visualizations.builder().layer(LayerIdentifier.LINES).build();
protected final Visualizations zones = Visualizations.builder().layer(LayerIdentifier.ZONES).build(); protected final Visualizations zones = Visualizations.builder().layer(LayerIdentifier.ZONES).visibleByDefault(true).build();
protected final Visualizations mainBody = Visualizations.builder().layer(LayerIdentifier.MAIN_BODY).build(); protected final Visualizations mainBody = Visualizations.builder().layer(LayerIdentifier.MAIN_BODY).build();
protected final Visualizations clean_rulings = Visualizations.builder().layer(LayerIdentifier.CLEAN_RULINGS).build(); protected final Visualizations clean_rulings = Visualizations.builder().layer(LayerIdentifier.CLEAN_RULINGS).build();
protected final Visualizations rulings = Visualizations.builder().layer(LayerIdentifier.RULINGS).build(); protected final Visualizations rulings = Visualizations.builder().layer(LayerIdentifier.RULINGS).build();