Merge branch 'RED-10249' into 'main'

RED-10249: regex found incorrectly due to wrong text sorting

See merge request fforesight/layout-parser!252
This commit is contained in:
Kilian Schüttler 2024-11-04 12:51:38 +01:00
commit f9b25c8157
14 changed files with 159 additions and 170 deletions

View File

@ -26,6 +26,8 @@ import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.No
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Character;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition; import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
import com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationService; import com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationService;
@ -158,9 +160,9 @@ public class LayoutParsingPipeline {
} }
if (!viewerDocumentFile.equals(originFile)) { if (!viewerDocumentFile.equals(originFile)) {
viewerDocumentFile.delete(); assert !viewerDocumentFile.exists() || viewerDocumentFile.delete();
} }
originFile.delete(); assert !originFile.exists() || originFile.delete();
return LayoutParsingFinishedEvent.builder() return LayoutParsingFinishedEvent.builder()
.identifier(layoutParsingRequest.identifier()) .identifier(layoutParsingRequest.identifier())
@ -240,12 +242,8 @@ public class LayoutParsingPipeline {
} }
List<ClassificationPage> classificationPages = new ArrayList<>(); List<ClassificationPage> classificationPages = new ArrayList<>();
OutlineObject lastProcessedOutlineObject = null;
// parsing the structure elements could be useful as well classificationDocument.setOutlineObjectTree(outlineExtractorService.getOutlineObjectTree(originDocument));
if (layoutParsingType != LayoutParsingType.REDACT_MANAGER_OLD) {
classificationDocument.setOutlineObjectTree(outlineExtractorService.getOutlineObjectTree(originDocument));
}
long pageCount = originDocument.getNumberOfPages(); long pageCount = originDocument.getNumberOfPages();
@ -277,18 +275,15 @@ public class LayoutParsingPipeline {
if (layoutParsingType.equals(LayoutParsingType.DOCUMINE_OLD)) { if (layoutParsingType.equals(LayoutParsingType.DOCUMINE_OLD)) {
var lines = TextPositionOperations.groupByLine(new HashSet<>(words)); var lines = TextPositionOperations.groupByLine(new HashSet<>(words));
classificationDocument.getLayoutDebugLayer().addLineVisualizationsFromNestedTextPosition(lines, pageNumber); classificationDocument.getLayoutDebugLayer().addLineVisualizationsFromNestedTextPosition(lines, pageNumber);
words = TextPositionOperations.sortLines(lines); words = TextPositionOperations.sortWords(lines);
} }
classificationDocument.getLayoutDebugLayer().addTextVisualizations(words, pageNumber); classificationDocument.getLayoutDebugLayer().addTextVisualizations(words, pageNumber);
PDRectangle pdr = pdPage.getMediaBox(); PDRectangle pdr = pdPage.getMediaBox();
int rotation = pdPage.getRotation(); List<Ruling> rulings = stripper.getRulings();
boolean isLandscape = pdr.getWidth() > pdr.getHeight() && (rotation == 0 || rotation == 180) || pdr.getHeight() > pdr.getWidth() && (rotation == 90 || rotation == 270); classificationDocument.getLayoutDebugLayer().addRulingVisualization(rulings, pageNumber);
CleanRulings cleanRulings = rulingCleaningService.deduplicateAndStraightenRulings(pdfTableCells.get(pageNumber), rulings);
PDRectangle cropbox = pdPage.getCropBox();
classificationDocument.getLayoutDebugLayer().addRulingVisualization(stripper.getRulings(), pageNumber);
CleanRulings cleanRulings = rulingCleaningService.deduplicateAndStraightenRulings(pdfTableCells.get(pageNumber), stripper.getRulings());
PageInformation pageInformation = PageInformation.fromPDPage(pageNumber, pdPage); PageInformation pageInformation = PageInformation.fromPDPage(pageNumber, pdPage);
List<Cell> emptyTableCells = TableExtractionService.findCells(cleanRulings.getHorizontals(), cleanRulings.getVerticals(), pageInformation); List<Cell> emptyTableCells = TableExtractionService.findCells(cleanRulings.getHorizontals(), cleanRulings.getVerticals(), pageInformation);
@ -308,8 +303,7 @@ public class LayoutParsingPipeline {
.toList()); .toList());
ClassificationPage classificationPage = switch (layoutParsingType) { ClassificationPage classificationPage = switch (layoutParsingType) {
case REDACT_MANAGER_OLD -> case REDACT_MANAGER_OLD -> redactManagerBlockificationService.blockify(stripper.getWords(), cleanRulings, classificationDocument.getLayoutDebugLayer());
redactManagerBlockificationService.blockify(stripper.getWords(), cleanRulings, classificationDocument.getLayoutDebugLayer());
case DOCUMINE_OLD -> docuMineBlockificationService.blockify(words, cleanRulings); case DOCUMINE_OLD -> docuMineBlockificationService.blockify(words, cleanRulings);
case DOCUMINE, REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH -> case DOCUMINE, REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH ->
docstrumBlockificationService.blockify(words, cleanRulings, true, classificationDocument.getLayoutDebugLayer(), layoutParsingType); docstrumBlockificationService.blockify(words, cleanRulings, true, classificationDocument.getLayoutDebugLayer(), layoutParsingType);
@ -317,27 +311,9 @@ public class LayoutParsingPipeline {
docstrumBlockificationService.blockify(words, cleanRulings, false, classificationDocument.getLayoutDebugLayer(), layoutParsingType); docstrumBlockificationService.blockify(words, cleanRulings, false, classificationDocument.getLayoutDebugLayer(), layoutParsingType);
}; };
classificationPage.setCleanRulings(cleanRulings); updateClassificationPage(pdPage, pdr, classificationPage, cleanRulings, pageNumber, pageInformation);
classificationPage.setRotation(rotation);
classificationPage.setLandscape(isLandscape);
classificationPage.setPageNumber(pageNumber);
classificationPage.setPageWidth(cropbox.getWidth());
classificationPage.setPageHeight(cropbox.getHeight());
if (layoutParsingType != LayoutParsingType.REDACT_MANAGER_OLD) { blockificationPostprocessingService.findHeadlinesFromOutline(classificationDocument, pageNumber, classificationPage, pageInformation);
List<OutlineObject> outlineObjects = classificationDocument.getOutlineObjectTree().getOutlineObjectsPerPage().getOrDefault(pageNumber, new ArrayList<>());
OutlineObject notFoundOutlineObject = null;
if (lastProcessedOutlineObject != null && !lastProcessedOutlineObject.isFound()) {
lastProcessedOutlineObject.resetPoint();
notFoundOutlineObject = lastProcessedOutlineObject;
}
if (!outlineObjects.isEmpty()) {
classificationPage.setOutlineObjects(outlineObjects);
lastProcessedOutlineObject = blockificationPostprocessingService.sanitizeOutlineBlocks(classificationPage, notFoundOutlineObject);
}
classificationDocument.getLayoutDebugLayer().addOutlineObjects(outlineObjects, pageInformation);
}
classificationDocument.getLayoutDebugLayer().addMarkedContentVisualizations(stripper.getMarkedContents(), pageNumber); classificationDocument.getLayoutDebugLayer().addMarkedContentVisualizations(stripper.getMarkedContents(), pageNumber);
// MarkedContent needs to be converted at this point, otherwise it leads to GC Problems in Pdfbox. // MarkedContent needs to be converted at this point, otherwise it leads to GC Problems in Pdfbox.
@ -383,13 +359,31 @@ public class LayoutParsingPipeline {
} }
/**
 * Transfers page-level metadata onto a blockified {@link ClassificationPage}.
 *
 * <p>Sets the cleaned rulings, the page rotation, the landscape flag, the page number and the
 * page dimensions. The dimensions come from {@code pageInformation}; the landscape flag is
 * derived from {@code pdr} together with the rotation reported by {@code pdPage}.
 *
 * @param pdPage             page whose rotation is read
 * @param pdr                rectangle whose width/height decide the orientation (the caller in
 *                           this class passes the page's media box)
 * @param classificationPage page object that receives the values
 * @param cleanRulings       rulings to attach to the page
 * @param pageNumber         page number to store on the page
 * @param pageInformation    source of the page width and height (narrowed to {@code float})
 */
private static void updateClassificationPage(PDPage pdPage,
                                             PDRectangle pdr,
                                             ClassificationPage classificationPage,
                                             CleanRulings cleanRulings,
                                             int pageNumber,
                                             PageInformation pageInformation) {

    final int rotation = pdPage.getRotation();
    final float rectWidth = pdr.getWidth();
    final float rectHeight = pdr.getHeight();

    // A page counts as landscape when it is wider than tall after applying the rotation.
    // Any rotation other than 0/90/180/270 never yields landscape.
    final boolean landscape;
    if (rotation == 0 || rotation == 180) {
        landscape = rectWidth > rectHeight;
    } else if (rotation == 90 || rotation == 270) {
        landscape = rectHeight > rectWidth;
    } else {
        landscape = false;
    }

    classificationPage.setCleanRulings(cleanRulings);
    classificationPage.setRotation(rotation);
    classificationPage.setLandscape(landscape);
    classificationPage.setPageNumber(pageNumber);
    classificationPage.setPageWidth((float) pageInformation.width());
    classificationPage.setPageHeight((float) pageInformation.height());
}
private static void rotateDirAdjExactly(List<Word> words, PDPage pdPage) { private static void rotateDirAdjExactly(List<Word> words, PDPage pdPage) {
for (TextDirection dir : TextDirection.values()) { for (TextDirection dir : TextDirection.values()) {
double averageRotation = words.stream() double averageRotation = words.stream()
.map(Word::getTextPositions) .map(Word::getCharacters)
.flatMap(Collection::stream) .flatMap(Collection::stream)
.map(Character::getTextPosition)
.filter(pos -> pos.getDir().equals(dir)) .filter(pos -> pos.getDir().equals(dir))
.mapToDouble(RedTextPosition::getExactDir).average().orElse(0); .mapToDouble(RedTextPosition::getExactDir).average().orElse(0);

View File

@ -80,16 +80,12 @@ public class DocstrumSegmentationService {
private List<Zone> computeZones(List<Word> textPositions, CleanRulings rulings, LayoutDebugLayer visualizations, TextDirection direction) { private List<Zone> computeZones(List<Word> textPositions, CleanRulings rulings, LayoutDebugLayer visualizations, TextDirection direction) {
List<RedTextPosition> positions = textPositions.stream() List<Character> characters = textPositions.stream()
.filter(t -> t.getDir() == direction) .filter(t -> t.getDir() == direction)
.map(Word::getTextPositions) .map(Word::getCharacters)
.flatMap(List::stream) .flatMap(List::stream)
.toList(); .toList();
List<Character> characters = positions.stream()
.map(Character::new)
.collect(Collectors.toList());
nearestNeighbourService.findNearestNeighbors(characters); nearestNeighbourService.findNearestNeighbors(characters);
double characterSpacing = spacingService.computeCharacterSpacing(characters); double characterSpacing = spacingService.computeCharacterSpacing(characters);

View File

@ -36,18 +36,13 @@ public class Line extends TextBoundingBox {
@EqualsAndHashCode.Include @EqualsAndHashCode.Include
private final double y1; private final double y1;
private final double height;
private FontStyle fontStyle; private FontStyle fontStyle;
private final List<Character> characters; private final List<Word> words;
private final List<Word> words = new ArrayList<>();
public Line(List<Character> characters, double wordSpacing) { public Line(List<Character> characters, double wordSpacing) {
this.characters = characters;
if (characters.size() >= 2) { if (characters.size() >= 2) {
// linear regression // linear regression
double sx = 0.0; double sx = 0.0;
@ -76,13 +71,25 @@ public class Line extends TextBoundingBox {
this.y0 = character.getY() - dy; this.y0 = character.getY() - dy;
this.y1 = character.getY() + dy; this.y1 = character.getY() + dy;
} }
height = computeHeight(); this.words = new ArrayList<>();
computeWords(wordSpacing * WORD_DISTANCE_MULTIPLIER); computeWords(characters, wordSpacing * WORD_DISTANCE_MULTIPLIER);
buildBBox(); buildBBox();
computeFontStyle(); computeFontStyle();
} }
/**
 * Creates a line from words that have already been grouped together.
 *
 * <p>Unlike the character-based constructor, no linear regression is performed here: the
 * line's end points are taken directly from the extremes of the bounding box.
 *
 * @param words the words forming this line; the list is stored as-is (not copied)
 */
public Line(List<Word> words) {
this.words = words;
// The bounding box must be built first: the coordinates below are read from it.
buildBBox();
// NOTE(review): getMinX()/getMinY()/getMaxX()/getMaxY() are presumably inherited from
// TextBoundingBox and reflect the box set by buildBBox() - confirm.
x0 = getMinX();
y0 = getMinY();
x1 = getMaxX();
y1 = getMaxY();
computeFontStyle();
}
private void computeFontStyle() { private void computeFontStyle() {
EnumMap<FontStyle, AtomicInteger> fontStyleCounter = new EnumMap<>(FontStyle.class); EnumMap<FontStyle, AtomicInteger> fontStyleCounter = new EnumMap<>(FontStyle.class);
@ -100,8 +107,7 @@ public class Line extends TextBoundingBox {
fontStyle = fontStyleCounter.entrySet() fontStyle = fontStyleCounter.entrySet()
.stream() .stream()
.max(Comparator.comparing(entry -> entry.getValue().get())) .max(Comparator.comparing(entry -> entry.getValue().get()))
.map(Map.Entry::getKey) .map(Map.Entry::getKey).orElse(FontStyle.REGULAR);
.orElse(FontStyle.REGULAR);
} }
@ -117,14 +123,6 @@ public class Line extends TextBoundingBox {
} }
private double computeHeight() {
return characters.stream()
.map(Character::getHeight)
.reduce(0d, Double::sum) / characters.size();
}
public double angularDifference(Line j) { public double angularDifference(Line j) {
double diff = Math.abs(getAngle() - j.getAngle()); double diff = Math.abs(getAngle() - j.getAngle());
@ -157,7 +155,7 @@ public class Line extends TextBoundingBox {
} }
private void computeWords(double wordSpacing) { private void computeWords(List<Character> characters, double wordSpacing) {
Word word = new Word(); Word word = new Word();
Character previous = null; Character previous = null;
@ -169,7 +167,7 @@ public class Line extends TextBoundingBox {
word = new Word(); word = new Word();
} }
} }
word.getTextPositions().add(current.getTextPosition()); word.add(current);
previous = current; previous = current;
} }
words.add(word); words.add(word);
@ -178,9 +176,7 @@ public class Line extends TextBoundingBox {
private void buildBBox() { private void buildBBox() {
this.setToBBoxOfComponents(characters.stream() this.setToBBoxOfComponents(words);
.map(Character::getTextPosition)
.toList());
} }

View File

@ -18,7 +18,6 @@ public class Zone extends TextBoundingBox {
@SuppressWarnings("PMD.ConstructorCallsOverridableMethod") @SuppressWarnings("PMD.ConstructorCallsOverridableMethod")
public Zone(List<Line> lines) { public Zone(List<Line> lines) {
lines.sort(Comparator.comparingDouble(Line::getY0));
this.lines = lines; this.lines = lines;
setToBBoxOfComponents(lines); setToBBoxOfComponents(lines);
} }

View File

@ -1,9 +1,7 @@
package com.knecon.fforesight.service.layoutparser.processor.docstrum.service; package com.knecon.fforesight.service.layoutparser.processor.docstrum.service;
import static com.knecon.fforesight.service.layoutparser.processor.model.SectionIdentifier.numericalIdentifierPattern;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Comparator; import java.util.Collection;
import java.util.HashSet; import java.util.HashSet;
import java.util.List; import java.util.List;
import java.util.Set; import java.util.Set;
@ -11,11 +9,12 @@ import java.util.stream.Collectors;
import org.springframework.stereotype.Service; import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Character;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Line; import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Line;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.UnionFind; import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.UnionFind;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone; import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionOperations;
@Service @Service
public class ZoneBuilderService { public class ZoneBuilderService {
@ -31,7 +30,7 @@ public class ZoneBuilderService {
private static final double MAX_LINE_SIZE_SCALE = 2.5; private static final double MAX_LINE_SIZE_SCALE = 2.5;
private static final double ANGLE_TOLERANCE = Math.PI / 6; private static final double ANGLE_TOLERANCE = Math.toRadians(5);
private static final double MAX_VERTICAL_MERGE_DISTANCE = 0.5; private static final double MAX_VERTICAL_MERGE_DISTANCE = 0.5;
@ -114,64 +113,14 @@ public class ZoneBuilderService {
private Zone mergeLinesInZone(List<Line> lines, double characterSpacing, double lineSpacing) { private Zone mergeLinesInZone(List<Line> lines, double characterSpacing, double lineSpacing) {
double maxHorizontalDistance = 0; Set<Word> words = lines.stream()
double minVerticalDistance = 0; .map(Line::getWords)
double maxVerticalDistance = lineSpacing * MAX_VERTICAL_MERGE_DISTANCE; .flatMap(Collection::stream)
.collect(Collectors.toSet());
Collection<Set<Word>> groupedLines = TextPositionOperations.groupByLine(words);
UnionFind<Line> unionFind = new UnionFind<>(new HashSet<>(lines)); List<Line> sortedLines = TextPositionOperations.sortLines(groupedLines);
return new Zone(sortedLines);
lines.forEach(outer -> {
lines.forEach(inner -> {
if (inner == outer) {
return;
}
double horizontalDistance = outer.horizontalDistance(inner);
double verticalDistance = outer.verticalDistance(inner);
if (horizontalDistance <= maxHorizontalDistance && minVerticalDistance <= verticalDistance && verticalDistance <= maxVerticalDistance) {
unionFind.union(outer, inner);
} else if (minVerticalDistance <= verticalDistance
&& verticalDistance <= maxVerticalDistance
&& Math.abs(horizontalDistance - Math.min(outer.getLength(), inner.getLength())) < 0.1) {
boolean characterOverlap = false;
int overlappingCount = 0;
for (Character outerCharacter : outer.getCharacters()) {
for (Character innerCharacter : inner.getCharacters()) {
double characterOverlapDistance = outerCharacter.overlappingDistance(innerCharacter);
if (characterOverlapDistance > 2) {
characterOverlap = true;
}
if (characterOverlapDistance > 0) {
overlappingCount++;
}
}
}
if (!characterOverlap && overlappingCount <= 2) {
unionFind.union(outer, inner);
}
}
});
});
List<Line> outputZone = new ArrayList<>();
for (Set<Line> group : unionFind.getGroups()) {
List<Character> characters = new ArrayList<>();
for (Line line : group) {
characters.addAll(line.getCharacters());
}
characters.sort(Comparator.comparingDouble(Character::getX));
outputZone.add(new Line(characters, characterSpacing));
}
return new Zone(outputZone.stream()
.sorted(Comparator.comparing(Line::getY0))
.collect(Collectors.toList()));
} }
} }

View File

@ -13,6 +13,7 @@ import java.util.stream.Collectors;
import org.apache.pdfbox.text.TextPosition; import org.apache.pdfbox.text.TextPosition;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Character;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.TextBoundingBox; import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.TextBoundingBox;
import lombok.AllArgsConstructor; import lombok.AllArgsConstructor;
@ -38,8 +39,7 @@ public class Word extends TextBoundingBox implements CharSequence {
private int page; private int page;
@Builder.Default @Builder.Default
private List<RedTextPosition> textPositions = new ArrayList<>(); private List<Character> characters = new ArrayList<>();
private boolean isParagraphStart; private boolean isParagraphStart;
private boolean strikethrough; private boolean strikethrough;
private boolean underline; private boolean underline;
@ -49,8 +49,9 @@ public class Word extends TextBoundingBox implements CharSequence {
public Word(List<TextPosition> textPositions, int pageNumber, boolean isParagraphStart) { public Word(List<TextPosition> textPositions, int pageNumber, boolean isParagraphStart) {
this.textPositions = textPositions.stream() this.characters = textPositions.stream()
.map(RedTextPosition::fromTextPosition) .map(RedTextPosition::fromTextPosition)
.map(Character::new)
.collect(Collectors.toList()); .collect(Collectors.toList());
this.page = pageNumber; this.page = pageNumber;
this.isParagraphStart = isParagraphStart; this.isParagraphStart = isParagraphStart;
@ -65,9 +66,9 @@ public class Word extends TextBoundingBox implements CharSequence {
} }
public Word(List<RedTextPosition> textPositions, int page) { public Word(List<Character> textPositions, int page) {
this.textPositions = textPositions; this.characters = new ArrayList<>(textPositions);
this.page = page; this.page = page;
calculateBBoxAndHashcode(); calculateBBoxAndHashcode();
} }
@ -76,7 +77,7 @@ public class Word extends TextBoundingBox implements CharSequence {
@Override @Override
public int length() { public int length() {
return textPositions.size(); return characters.size();
} }
@ -101,7 +102,7 @@ public class Word extends TextBoundingBox implements CharSequence {
public Word subSequence(int start, int end) { public Word subSequence(int start, int end) {
var textPositionSequence = new Word(); var textPositionSequence = new Word();
textPositionSequence.textPositions = textPositions.subList(start, end); textPositionSequence.characters = characters.subList(start, end);
textPositionSequence.page = page; textPositionSequence.page = page;
textPositionSequence.dir = dir; textPositionSequence.dir = dir;
textPositionSequence.setToBBoxOfComponents(getTextPositions()); textPositionSequence.setToBBoxOfComponents(getTextPositions());
@ -122,53 +123,59 @@ public class Word extends TextBoundingBox implements CharSequence {
public RedTextPosition textPositionAt(int index) { public RedTextPosition textPositionAt(int index) {
return textPositions.get(index); return characters.get(index).getTextPosition();
} }
public void add(Word word, RedTextPosition textPosition) { public void add(Word word, RedTextPosition textPosition) {
this.textPositions.add(textPosition); this.characters.add(new Character(textPosition));
this.page = word.getPage(); this.page = word.getPage();
calculateBBoxAndHashcode(); calculateBBoxAndHashcode();
} }
/**
 * Appends a single character to this word and refreshes the derived bounding box and hash code.
 *
 * @param current the character to append ({@code Character} here is the docstrum model class
 *                imported by this file, not {@code java.lang.Character})
 */
public void add(Character current) {

    this.characters.add(current);
    this.calculateBBoxAndHashcode();
}
public void add(TextPosition textPosition) { public void add(TextPosition textPosition) {
this.textPositions.add(RedTextPosition.fromTextPosition(textPosition)); add(new Character(RedTextPosition.fromTextPosition(textPosition)));
calculateBBoxAndHashcode();
} }
public double getTextHeightNoPadding() { public double getTextHeightNoPadding() {
return textPositions.get(0).getHeightDirAdj(); return characters.get(0).getTextPosition().getHeightDirAdj();
} }
public double getTextHeight() { public double getTextHeight() {
return textPositions.get(0).getHeightDirAdj() + HEIGHT_PADDING; return characters.get(0).getTextPosition().getHeightDirAdj() + HEIGHT_PADDING;
} }
public String getFont() { public String getFont() {
if (textPositions.get(0).getFontName() == null) { if (characters.get(0).getTextPosition().getFontName() == null) {
return "none"; return "none";
} }
return FONT_CLEANER.matcher(textPositions.get(0).getFontName().toLowerCase(Locale.ROOT)).replaceAll(""); return FONT_CLEANER.matcher(characters.get(0).getTextPosition().getFontName().toLowerCase(Locale.ROOT)).replaceAll("");
} }
public String getFontStyle() { public String getFontStyle() {
if (textPositions.get(0).getFontName() == null) { if (characters.get(0).getTextPosition().getFontName() == null) {
return STANDARD; return STANDARD;
} }
String lowercaseFontName = textPositions.get(0).getFontName().toLowerCase(Locale.ROOT); String lowercaseFontName = characters.get(0).getTextPosition().getFontName().toLowerCase(Locale.ROOT);
if (lowercaseFontName.contains(BOLD) && lowercaseFontName.contains(ITALIC)) { if (lowercaseFontName.contains(BOLD) && lowercaseFontName.contains(ITALIC)) {
return BOLD_ITALIC; return BOLD_ITALIC;
@ -184,13 +191,13 @@ public class Word extends TextBoundingBox implements CharSequence {
public float getFontSize() { public float getFontSize() {
return textPositions.get(0).getFontSizeInPt(); return characters.get(0).getTextPosition().getFontSizeInPt();
} }
public float getSpaceWidth() { public float getSpaceWidth() {
return textPositions.get(0).getWidthOfSpace(); return characters.get(0).getTextPosition().getWidthOfSpace();
} }
@ -244,6 +251,14 @@ public class Word extends TextBoundingBox implements CharSequence {
} }
/**
 * Builds a list of the text positions wrapped by this word's characters, in character order.
 *
 * @return a new unmodifiable list with one {@link RedTextPosition} per character
 */
private List<RedTextPosition> getTextPositions() {

    List<RedTextPosition> positions = this.characters.stream()
            .map(character -> character.getTextPosition())
            .toList();
    return positions;
}
public void transform(AffineTransform rotateInstance) { public void transform(AffineTransform rotateInstance) {
for (RedTextPosition textPosition : getTextPositions()) { for (RedTextPosition textPosition : getTextPositions()) {

View File

@ -11,12 +11,14 @@ import org.apache.commons.text.similarity.LevenshteinDistance;
import org.springframework.stereotype.Service; import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngineProto.LayoutEngine; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngineProto.LayoutEngine;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType; import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.model.SectionIdentifier; import com.knecon.fforesight.service.layoutparser.processor.model.SectionIdentifier;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject; import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word; import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities; import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities;
import lombok.Data; import lombok.Data;
@ -27,6 +29,24 @@ public class BlockificationPostprocessingService {
private static final float STRING_SIMILARITY_THRESHOLD = 0.1f; private static final float STRING_SIMILARITY_THRESHOLD = 0.1f;
/**
 * Matches the outline (bookmark) entries of a single page against the page's text blocks and
 * records the outline objects on the debug layer.
 *
 * <p>This overload is stateless: it cannot carry over an outline entry that was not found on
 * the previous page. Callers that iterate over the pages of a document should use
 * {@link #findHeadlinesFromOutline(ClassificationDocument, int, ClassificationPage, PageInformation, OutlineObject)}
 * and feed its return value into the call for the next page.
 *
 * @param classificationDocument document providing the outline tree and the debug layer
 * @param pageNumber             number of the page being processed
 * @param classificationPage     page whose blocks are matched against the outline
 * @param pageInformation        page geometry used for the debug visualization
 */
public void findHeadlinesFromOutline(ClassificationDocument classificationDocument, int pageNumber, ClassificationPage classificationPage, PageInformation pageInformation) {

    findHeadlinesFromOutline(classificationDocument, pageNumber, classificationPage, pageInformation, null);
}


/**
 * Matches the outline entries of a single page against the page's text blocks, additionally
 * retrying the outline entry that was processed last on an earlier page if it has not been
 * found yet.
 *
 * @param classificationDocument     document providing the outline tree and the debug layer
 * @param pageNumber                 number of the page being processed
 * @param classificationPage         page whose blocks are matched against the outline
 * @param pageInformation            page geometry used for the debug visualization
 * @param lastProcessedOutlineObject value returned by the call for the previous page, or
 *                                   {@code null} for the first page
 * @return the outline object to pass to the call for the next page: the result of
 *         {@link #sanitizeOutlineBlocks} if this page has outline objects, otherwise
 *         {@code lastProcessedOutlineObject} unchanged
 */
public OutlineObject findHeadlinesFromOutline(ClassificationDocument classificationDocument,
                                              int pageNumber,
                                              ClassificationPage classificationPage,
                                              PageInformation pageInformation,
                                              OutlineObject lastProcessedOutlineObject) {

    List<OutlineObject> outlineObjects = classificationDocument.getOutlineObjectTree().getOutlineObjectsPerPage().getOrDefault(pageNumber, new ArrayList<>());

    // An entry that was not located on an earlier page gets its point reset and is handed to
    // the sanitizing step again, so it can still be matched on this page.
    OutlineObject notFoundOutlineObject = null;
    if (lastProcessedOutlineObject != null && !lastProcessedOutlineObject.isFound()) {
        lastProcessedOutlineObject.resetPoint();
        notFoundOutlineObject = lastProcessedOutlineObject;
    }

    // Pages without outline entries leave the carried-over object untouched.
    OutlineObject nextLastProcessed = lastProcessedOutlineObject;
    if (!outlineObjects.isEmpty()) {
        classificationPage.setOutlineObjects(outlineObjects);
        nextLastProcessed = sanitizeOutlineBlocks(classificationPage, notFoundOutlineObject);
    }

    classificationDocument.getLayoutDebugLayer().addOutlineObjects(outlineObjects, pageInformation);
    return nextLastProcessed;
}
public OutlineObject sanitizeOutlineBlocks(ClassificationPage classificationPage, OutlineObject notFoundOutlineObject) { public OutlineObject sanitizeOutlineBlocks(ClassificationPage classificationPage, OutlineObject notFoundOutlineObject) {
List<OutlineObject> outlineObjects = classificationPage.getOutlineObjects(); List<OutlineObject> outlineObjects = classificationPage.getOutlineObjects();
@ -329,8 +349,8 @@ public class BlockificationPostprocessingService {
if (index > 0) { if (index > 0) {
in = createSubSequence(sequence, 0, index); in = createSubSequence(sequence, 0, index);
} else if (endIndex < sequence.getTextPositions().size()) { } else if (endIndex < sequence.length()) {
in = createSubSequence(sequence, endIndex, sequence.getTextPositions().size()); in = createSubSequence(sequence, endIndex, sequence.length());
} }
return new SplitSequenceResult(in, out); return new SplitSequenceResult(in, out);
@ -339,7 +359,7 @@ public class BlockificationPostprocessingService {
private static Word createSubSequence(Word sequence, int start, int end) { private static Word createSubSequence(Word sequence, int start, int end) {
Word newSeq = new Word(new ArrayList<>(sequence.getTextPositions().subList(start, end)), sequence.getPage()); Word newSeq = new Word(new ArrayList<>(sequence.getCharacters().subList(start, end)), sequence.getPage());
newSeq.setParagraphStart(sequence.isParagraphStart()); newSeq.setParagraphStart(sequence.isParagraphStart());
return newSeq; return newSeq;
} }

View File

@ -77,7 +77,7 @@ public class DocstrumBlockificationService {
.forEach(line -> { .forEach(line -> {
line.getWords() line.getWords()
.forEach(word -> { .forEach(word -> {
words.add(new Word(word.getTextPositions(), word.getPage())); words.add(new Word(word.getCharacters(), word.getPage()));
}); });
}); });

View File

@ -81,7 +81,7 @@ public class ClarifyndClassificationService {
&& (textBlock.getMostPopularWordStyle().equals("bold") && (textBlock.getMostPopularWordStyle().equals("bold")
|| !document.getFontStyleCounter().getCountPerValue().containsKey("bold") || !document.getFontStyleCounter().getCountPerValue().containsKey("bold")
&& textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular() + 1) && textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular() + 1)
&& textBlock.getWords().get(0).getTextPositions().get(0).getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) { && textBlock.getWords().get(0).getFontSize()>= textBlock.getMostPopularWordFontSize()) {
PageBlockType headlineType = HeadlineClassificationService.headlineClassByFontSize(textBlock, headlineFontSizes); PageBlockType headlineType = HeadlineClassificationService.headlineClassByFontSize(textBlock, headlineFontSizes);
headlineClassificationService.classifyHeadline(textBlock, headlineType); headlineClassificationService.classifyHeadline(textBlock, headlineType);
@ -91,7 +91,7 @@ public class ClarifyndClassificationService {
&& textBlock.getMostPopularWordStyle().equals("bold") && textBlock.getMostPopularWordStyle().equals("bold")
&& !document.getFontStyleCounter().getMostPopular().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold")
&& PositionUtils.getApproxLineCount(textBlock) < 2.9 && PositionUtils.getApproxLineCount(textBlock) < 2.9
&& textBlock.getWords().get(0).getTextPositions().get(0).getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) { && textBlock.getWords().get(0).getFontSize() >= textBlock.getMostPopularWordFontSize()) {
PageBlockType headlineType = HeadlineClassificationService.headlineClassByFontSize(textBlock, headlineFontSizes); PageBlockType headlineType = HeadlineClassificationService.headlineClassByFontSize(textBlock, headlineFontSizes);
headlineClassificationService.classifyHeadline(textBlock, headlineType); headlineClassificationService.classifyHeadline(textBlock, headlineType);

View File

@ -94,7 +94,7 @@ public class RedactManagerClassificationService {
&& (textBlock.getMostPopularWordStyle().equals("bold") && (textBlock.getMostPopularWordStyle().equals("bold")
|| !document.getFontStyleCounter().getCountPerValue().containsKey("bold") || !document.getFontStyleCounter().getCountPerValue().containsKey("bold")
&& textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular() + 1) && textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular() + 1)
&& textBlock.getWords().get(0).getTextPositions().get(0).getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) { && textBlock.getWords().get(0).getFontSize() >= textBlock.getMostPopularWordFontSize()) {
PageBlockType headlineType = HeadlineClassificationService.headlineClassByFontSize(textBlock, headlineFontSizes); PageBlockType headlineType = HeadlineClassificationService.headlineClassByFontSize(textBlock, headlineFontSizes);
headlineClassificationService.classifyHeadline(textBlock, headlineType); headlineClassificationService.classifyHeadline(textBlock, headlineType);
@ -104,7 +104,7 @@ public class RedactManagerClassificationService {
&& textBlock.getMostPopularWordStyle().equals("bold") && textBlock.getMostPopularWordStyle().equals("bold")
&& !document.getFontStyleCounter().getMostPopular().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold")
&& PositionUtils.getApproxLineCount(textBlock) < 2.9 && PositionUtils.getApproxLineCount(textBlock) < 2.9
&& textBlock.getWords().get(0).getTextPositions().get(0).getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) { && textBlock.getWords().get(0).getFontSize() >= textBlock.getMostPopularWordFontSize()) {
PageBlockType headlineType = HeadlineClassificationService.headlineClassByFontSize(textBlock, headlineFontSizes); PageBlockType headlineType = HeadlineClassificationService.headlineClassByFontSize(textBlock, headlineFontSizes);
headlineClassificationService.classifyHeadline(textBlock, headlineType); headlineClassificationService.classifyHeadline(textBlock, headlineType);

View File

@ -10,6 +10,7 @@ import java.util.List;
import java.util.Locale; import java.util.Locale;
import java.util.Objects; import java.util.Objects;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Character;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange; import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange;
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition; import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word; import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
@ -31,19 +32,19 @@ public class SearchTextWithTextPositionFactory {
public SearchTextWithTextPositionDto buildSearchTextToTextPositionDto(List<Word> sequences) { public SearchTextWithTextPositionDto buildSearchTextToTextPositionDto(List<Word> sequences) {
if (sequences.isEmpty() || sequences.stream() if (sequences.isEmpty() || sequences.stream()
.allMatch(sequence -> sequence.getTextPositions().isEmpty())) { .allMatch(sequence -> sequence.getCharacters().isEmpty())) {
return SearchTextWithTextPositionDto.empty(); return SearchTextWithTextPositionDto.empty();
} }
Context context = new Context(); Context context = new Context();
RedTextPosition currentTextPosition = sequences.get(0).getTextPositions().get(0); RedTextPosition currentTextPosition = sequences.get(0).getCharacters().get(0).getTextPosition();
RedTextPosition previousTextPosition = RedTextPosition.builder().unicode(" ").bBoxDirAdj(currentTextPosition.getBBoxDirAdj()).build(); RedTextPosition previousTextPosition = RedTextPosition.builder().unicode(" ").bBoxDirAdj(currentTextPosition.getBBoxDirAdj()).build();
for (Word word : sequences) { for (Word word : sequences) {
for (int i = 0; i < word.getTextPositions().size(); ++i) { for (int i = 0; i < word.getCharacters().size(); ++i) {
currentTextPosition = word.getTextPositions().get(i); currentTextPosition = word.getCharacters().get(i).getTextPosition();
if (isLineBreak(currentTextPosition, previousTextPosition)) { if (isLineBreak(currentTextPosition, previousTextPosition)) {
removeHyphenLinebreaks(context); removeHyphenLinebreaks(context);
context.lineBreaksStringIdx.add(context.stringIdx); context.lineBreaksStringIdx.add(context.stringIdx);
@ -66,8 +67,9 @@ public class SearchTextWithTextPositionFactory {
} }
List<Rectangle2D> positions = sequences.stream() List<Rectangle2D> positions = sequences.stream()
.map(Word::getTextPositions) .map(Word::getCharacters)
.flatMap(Collection::stream) .flatMap(Collection::stream)
.map(Character::getTextPosition)
.map(RedTextPosition::getBBoxPdf) .map(RedTextPosition::getBBoxPdf)
.toList(); .toList();

View File

@ -248,8 +248,8 @@ public class PDFLinesTextStripper extends PDFTextStripper {
if (!words.isEmpty()) { if (!words.isEmpty()) {
previous = words.get(words.size() - 1) previous = words.get(words.size() - 1)
.getTextPositions() .getCharacters()
.get(words.get(words.size() - 1).getTextPositions().size() - 1); .get(words.get(words.size() - 1).getCharacters().size() - 1).getTextPosition();
} }
if (i == 0 && (textPositions.get(i).getUnicode().equals(" ") || textPositions.get(i).getUnicode().equals("\u00A0") || textPositions.get(i).getUnicode().equals("\t"))) { if (i == 0 && (textPositions.get(i).getUnicode().equals(" ") || textPositions.get(i).getUnicode().equals("\u00A0") || textPositions.get(i).getUnicode().equals("\t"))) {

View File

@ -11,6 +11,7 @@ import java.util.Set;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.AngleFilter; import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.AngleFilter;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Line;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.TextBoundingBox; import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.TextBoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.UnionFind; import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.UnionFind;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.DoubleUtils; import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.DoubleUtils;
@ -54,11 +55,20 @@ public class TextPositionOperations {
private List<Word> sortUsingLineDetection(Set<Word> sequences) { private List<Word> sortUsingLineDetection(Set<Word> sequences) {
return sortLines(groupByLine(sequences)); return sortWords(groupByLine(sequences));
} }
public List<Word> sortLines(Collection<Set<Word>> lines) { public List<Word> sortWords(Collection<Set<Word>> lines) {
return sortLines(lines).stream()
.map(Line::getWords)
.flatMap(Collection::stream)
.collect(Collectors.toList());
}
public List<Line> sortLines(Collection<Set<Word>> lines) {
List<List<Word>> lineBlocks = new ArrayList<>(); List<List<Word>> lineBlocks = new ArrayList<>();
for (Set<Word> line : lines) { for (Set<Word> line : lines) {
@ -70,9 +80,9 @@ public class TextPositionOperations {
// need to use old sorting, since COMPARATOR_DIR_ADJ is not transitive // need to use old sorting, since COMPARATOR_DIR_ADJ is not transitive
QuickSort.sort(lineBlocks, Comparator.comparing(line -> line.get(0), COMPARATOR_DIR_ADJ)); QuickSort.sort(lineBlocks, Comparator.comparing(line -> line.get(0), COMPARATOR_DIR_ADJ));
List<Word> list = new ArrayList<>(); List<Line> list = new ArrayList<>();
for (List<Word> words : lineBlocks) { for (List<Word> lineBlock : lineBlocks) {
list.addAll(words); list.add(new Line(lineBlock));
} }
return list; return list;
} }
@ -95,6 +105,12 @@ public class TextPositionOperations {
.map(Word::getBBoxDirAdj) .map(Word::getBBoxDirAdj)
.mapToDouble(RectangularShape::getWidth).average().orElse(75) * MAX_WORD_DISTANCE_FACTOR; .mapToDouble(RectangularShape::getWidth).average().orElse(75) * MAX_WORD_DISTANCE_FACTOR;
return groupByLine(sequences, maxLineDistance, maxXGap);
}
public Collection<Set<Word>> groupByLine(Set<Word> sequences, double maxLineDistance, double maxXGap) {
UnionFind<Word> unionFind = new UnionFind<>(sequences); UnionFind<Word> unionFind = new UnionFind<>(sequences);
for (Word sequence : sequences) { for (Word sequence : sequences) {

View File

@ -273,7 +273,9 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
AtomicInteger index = new AtomicInteger(0); AtomicInteger index = new AtomicInteger(0);
zones.forEach(zone -> zone.getLines() zones.forEach(zone -> zone.getLines()
.stream() .stream()
.map(Line::getCharacters) .map(Line::getWords)
.flatMap(Collection::stream)
.map(Word::getCharacters)
.flatMap(Collection::stream) .flatMap(Collection::stream)
.forEach(character -> { .forEach(character -> {
Color color = getRotatingColor(index); Color color = getRotatingColor(index);