Compare commits
4 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
8e4cfa2047 | ||
|
|
124afb3623 | ||
|
|
ca2f3512d2 | ||
|
|
7338e06fb0 |
@ -26,6 +26,8 @@ import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.No
|
|||||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent;
|
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
|
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Character;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationService;
|
||||||
@ -158,9 +160,9 @@ public class LayoutParsingPipeline {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (!viewerDocumentFile.equals(originFile)) {
|
if (!viewerDocumentFile.equals(originFile)) {
|
||||||
viewerDocumentFile.delete();
|
assert !viewerDocumentFile.exists() || viewerDocumentFile.delete();
|
||||||
}
|
}
|
||||||
originFile.delete();
|
assert !originFile.exists() || originFile.delete();
|
||||||
|
|
||||||
return LayoutParsingFinishedEvent.builder()
|
return LayoutParsingFinishedEvent.builder()
|
||||||
.identifier(layoutParsingRequest.identifier())
|
.identifier(layoutParsingRequest.identifier())
|
||||||
@ -240,12 +242,8 @@ public class LayoutParsingPipeline {
|
|||||||
}
|
}
|
||||||
|
|
||||||
List<ClassificationPage> classificationPages = new ArrayList<>();
|
List<ClassificationPage> classificationPages = new ArrayList<>();
|
||||||
OutlineObject lastProcessedOutlineObject = null;
|
|
||||||
|
|
||||||
// parsing the structure elements could be useful as well
|
classificationDocument.setOutlineObjectTree(outlineExtractorService.getOutlineObjectTree(originDocument));
|
||||||
if (layoutParsingType != LayoutParsingType.REDACT_MANAGER_OLD) {
|
|
||||||
classificationDocument.setOutlineObjectTree(outlineExtractorService.getOutlineObjectTree(originDocument));
|
|
||||||
}
|
|
||||||
|
|
||||||
long pageCount = originDocument.getNumberOfPages();
|
long pageCount = originDocument.getNumberOfPages();
|
||||||
|
|
||||||
@ -277,18 +275,15 @@ public class LayoutParsingPipeline {
|
|||||||
if (layoutParsingType.equals(LayoutParsingType.DOCUMINE_OLD)) {
|
if (layoutParsingType.equals(LayoutParsingType.DOCUMINE_OLD)) {
|
||||||
var lines = TextPositionOperations.groupByLine(new HashSet<>(words));
|
var lines = TextPositionOperations.groupByLine(new HashSet<>(words));
|
||||||
classificationDocument.getLayoutDebugLayer().addLineVisualizationsFromNestedTextPosition(lines, pageNumber);
|
classificationDocument.getLayoutDebugLayer().addLineVisualizationsFromNestedTextPosition(lines, pageNumber);
|
||||||
words = TextPositionOperations.sortLines(lines);
|
words = TextPositionOperations.sortWords(lines);
|
||||||
}
|
}
|
||||||
classificationDocument.getLayoutDebugLayer().addTextVisualizations(words, pageNumber);
|
classificationDocument.getLayoutDebugLayer().addTextVisualizations(words, pageNumber);
|
||||||
|
|
||||||
PDRectangle pdr = pdPage.getMediaBox();
|
PDRectangle pdr = pdPage.getMediaBox();
|
||||||
|
|
||||||
int rotation = pdPage.getRotation();
|
List<Ruling> rulings = stripper.getRulings();
|
||||||
boolean isLandscape = pdr.getWidth() > pdr.getHeight() && (rotation == 0 || rotation == 180) || pdr.getHeight() > pdr.getWidth() && (rotation == 90 || rotation == 270);
|
classificationDocument.getLayoutDebugLayer().addRulingVisualization(rulings, pageNumber);
|
||||||
|
CleanRulings cleanRulings = rulingCleaningService.deduplicateAndStraightenRulings(pdfTableCells.get(pageNumber), rulings);
|
||||||
PDRectangle cropbox = pdPage.getCropBox();
|
|
||||||
classificationDocument.getLayoutDebugLayer().addRulingVisualization(stripper.getRulings(), pageNumber);
|
|
||||||
CleanRulings cleanRulings = rulingCleaningService.deduplicateAndStraightenRulings(pdfTableCells.get(pageNumber), stripper.getRulings());
|
|
||||||
|
|
||||||
PageInformation pageInformation = PageInformation.fromPDPage(pageNumber, pdPage);
|
PageInformation pageInformation = PageInformation.fromPDPage(pageNumber, pdPage);
|
||||||
List<Cell> emptyTableCells = TableExtractionService.findCells(cleanRulings.getHorizontals(), cleanRulings.getVerticals(), pageInformation);
|
List<Cell> emptyTableCells = TableExtractionService.findCells(cleanRulings.getHorizontals(), cleanRulings.getVerticals(), pageInformation);
|
||||||
@ -308,8 +303,7 @@ public class LayoutParsingPipeline {
|
|||||||
.toList());
|
.toList());
|
||||||
|
|
||||||
ClassificationPage classificationPage = switch (layoutParsingType) {
|
ClassificationPage classificationPage = switch (layoutParsingType) {
|
||||||
case REDACT_MANAGER_OLD ->
|
case REDACT_MANAGER_OLD -> redactManagerBlockificationService.blockify(stripper.getWords(), cleanRulings, classificationDocument.getLayoutDebugLayer());
|
||||||
redactManagerBlockificationService.blockify(stripper.getWords(), cleanRulings, classificationDocument.getLayoutDebugLayer());
|
|
||||||
case DOCUMINE_OLD -> docuMineBlockificationService.blockify(words, cleanRulings);
|
case DOCUMINE_OLD -> docuMineBlockificationService.blockify(words, cleanRulings);
|
||||||
case DOCUMINE, REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH ->
|
case DOCUMINE, REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH ->
|
||||||
docstrumBlockificationService.blockify(words, cleanRulings, true, classificationDocument.getLayoutDebugLayer(), layoutParsingType);
|
docstrumBlockificationService.blockify(words, cleanRulings, true, classificationDocument.getLayoutDebugLayer(), layoutParsingType);
|
||||||
@ -317,27 +311,9 @@ public class LayoutParsingPipeline {
|
|||||||
docstrumBlockificationService.blockify(words, cleanRulings, false, classificationDocument.getLayoutDebugLayer(), layoutParsingType);
|
docstrumBlockificationService.blockify(words, cleanRulings, false, classificationDocument.getLayoutDebugLayer(), layoutParsingType);
|
||||||
};
|
};
|
||||||
|
|
||||||
classificationPage.setCleanRulings(cleanRulings);
|
updateClassificationPage(pdPage, pdr, classificationPage, cleanRulings, pageNumber, pageInformation);
|
||||||
classificationPage.setRotation(rotation);
|
|
||||||
classificationPage.setLandscape(isLandscape);
|
|
||||||
classificationPage.setPageNumber(pageNumber);
|
|
||||||
classificationPage.setPageWidth(cropbox.getWidth());
|
|
||||||
classificationPage.setPageHeight(cropbox.getHeight());
|
|
||||||
|
|
||||||
if (layoutParsingType != LayoutParsingType.REDACT_MANAGER_OLD) {
|
blockificationPostprocessingService.findHeadlinesFromOutline(classificationDocument, pageNumber, classificationPage, pageInformation);
|
||||||
List<OutlineObject> outlineObjects = classificationDocument.getOutlineObjectTree().getOutlineObjectsPerPage().getOrDefault(pageNumber, new ArrayList<>());
|
|
||||||
|
|
||||||
OutlineObject notFoundOutlineObject = null;
|
|
||||||
if (lastProcessedOutlineObject != null && !lastProcessedOutlineObject.isFound()) {
|
|
||||||
lastProcessedOutlineObject.resetPoint();
|
|
||||||
notFoundOutlineObject = lastProcessedOutlineObject;
|
|
||||||
}
|
|
||||||
if (!outlineObjects.isEmpty()) {
|
|
||||||
classificationPage.setOutlineObjects(outlineObjects);
|
|
||||||
lastProcessedOutlineObject = blockificationPostprocessingService.sanitizeOutlineBlocks(classificationPage, notFoundOutlineObject);
|
|
||||||
}
|
|
||||||
classificationDocument.getLayoutDebugLayer().addOutlineObjects(outlineObjects, pageInformation);
|
|
||||||
}
|
|
||||||
|
|
||||||
classificationDocument.getLayoutDebugLayer().addMarkedContentVisualizations(stripper.getMarkedContents(), pageNumber);
|
classificationDocument.getLayoutDebugLayer().addMarkedContentVisualizations(stripper.getMarkedContents(), pageNumber);
|
||||||
// MarkedContent needs to be converted at this point, otherwise it leads to GC Problems in Pdfbox.
|
// MarkedContent needs to be converted at this point, otherwise it leads to GC Problems in Pdfbox.
|
||||||
@ -383,13 +359,31 @@ public class LayoutParsingPipeline {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static void updateClassificationPage(PDPage pdPage,
|
||||||
|
PDRectangle pdr,
|
||||||
|
ClassificationPage classificationPage,
|
||||||
|
CleanRulings cleanRulings,
|
||||||
|
int pageNumber,
|
||||||
|
PageInformation pageInformation) {
|
||||||
|
|
||||||
|
int rotation = pdPage.getRotation();
|
||||||
|
boolean isLandscape = pdr.getWidth() > pdr.getHeight() && (rotation == 0 || rotation == 180) || pdr.getHeight() > pdr.getWidth() && (rotation == 90 || rotation == 270);
|
||||||
|
classificationPage.setCleanRulings(cleanRulings);
|
||||||
|
classificationPage.setRotation(rotation);
|
||||||
|
classificationPage.setLandscape(isLandscape);
|
||||||
|
classificationPage.setPageNumber(pageNumber);
|
||||||
|
classificationPage.setPageWidth((float) pageInformation.width());
|
||||||
|
classificationPage.setPageHeight((float) pageInformation.height());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
private static void rotateDirAdjExactly(List<Word> words, PDPage pdPage) {
|
private static void rotateDirAdjExactly(List<Word> words, PDPage pdPage) {
|
||||||
|
|
||||||
for (TextDirection dir : TextDirection.values()) {
|
for (TextDirection dir : TextDirection.values()) {
|
||||||
|
|
||||||
double averageRotation = words.stream()
|
double averageRotation = words.stream()
|
||||||
.map(Word::getTextPositions)
|
.map(Word::getCharacters)
|
||||||
.flatMap(Collection::stream)
|
.flatMap(Collection::stream)
|
||||||
|
.map(Character::getTextPosition)
|
||||||
.filter(pos -> pos.getDir().equals(dir))
|
.filter(pos -> pos.getDir().equals(dir))
|
||||||
.mapToDouble(RedTextPosition::getExactDir).average().orElse(0);
|
.mapToDouble(RedTextPosition::getExactDir).average().orElse(0);
|
||||||
|
|
||||||
|
|||||||
@ -80,16 +80,12 @@ public class DocstrumSegmentationService {
|
|||||||
|
|
||||||
private List<Zone> computeZones(List<Word> textPositions, CleanRulings rulings, LayoutDebugLayer visualizations, TextDirection direction) {
|
private List<Zone> computeZones(List<Word> textPositions, CleanRulings rulings, LayoutDebugLayer visualizations, TextDirection direction) {
|
||||||
|
|
||||||
List<RedTextPosition> positions = textPositions.stream()
|
List<Character> characters = textPositions.stream()
|
||||||
.filter(t -> t.getDir() == direction)
|
.filter(t -> t.getDir() == direction)
|
||||||
.map(Word::getTextPositions)
|
.map(Word::getCharacters)
|
||||||
.flatMap(List::stream)
|
.flatMap(List::stream)
|
||||||
.toList();
|
.toList();
|
||||||
|
|
||||||
List<Character> characters = positions.stream()
|
|
||||||
.map(Character::new)
|
|
||||||
.collect(Collectors.toList());
|
|
||||||
|
|
||||||
nearestNeighbourService.findNearestNeighbors(characters);
|
nearestNeighbourService.findNearestNeighbors(characters);
|
||||||
|
|
||||||
double characterSpacing = spacingService.computeCharacterSpacing(characters);
|
double characterSpacing = spacingService.computeCharacterSpacing(characters);
|
||||||
|
|||||||
@ -36,18 +36,13 @@ public class Line extends TextBoundingBox {
|
|||||||
@EqualsAndHashCode.Include
|
@EqualsAndHashCode.Include
|
||||||
private final double y1;
|
private final double y1;
|
||||||
|
|
||||||
private final double height;
|
|
||||||
|
|
||||||
private FontStyle fontStyle;
|
private FontStyle fontStyle;
|
||||||
|
|
||||||
private final List<Character> characters;
|
private final List<Word> words;
|
||||||
private final List<Word> words = new ArrayList<>();
|
|
||||||
|
|
||||||
|
|
||||||
public Line(List<Character> characters, double wordSpacing) {
|
public Line(List<Character> characters, double wordSpacing) {
|
||||||
|
|
||||||
this.characters = characters;
|
|
||||||
|
|
||||||
if (characters.size() >= 2) {
|
if (characters.size() >= 2) {
|
||||||
// linear regression
|
// linear regression
|
||||||
double sx = 0.0;
|
double sx = 0.0;
|
||||||
@ -76,13 +71,25 @@ public class Line extends TextBoundingBox {
|
|||||||
this.y0 = character.getY() - dy;
|
this.y0 = character.getY() - dy;
|
||||||
this.y1 = character.getY() + dy;
|
this.y1 = character.getY() + dy;
|
||||||
}
|
}
|
||||||
height = computeHeight();
|
this.words = new ArrayList<>();
|
||||||
computeWords(wordSpacing * WORD_DISTANCE_MULTIPLIER);
|
computeWords(characters, wordSpacing * WORD_DISTANCE_MULTIPLIER);
|
||||||
buildBBox();
|
buildBBox();
|
||||||
computeFontStyle();
|
computeFontStyle();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public Line(List<Word> words) {
|
||||||
|
|
||||||
|
this.words = words;
|
||||||
|
buildBBox();
|
||||||
|
x0 = getMinX();
|
||||||
|
y0 = getMinY();
|
||||||
|
x1 = getMaxX();
|
||||||
|
y1 = getMaxY();
|
||||||
|
computeFontStyle();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
private void computeFontStyle() {
|
private void computeFontStyle() {
|
||||||
|
|
||||||
EnumMap<FontStyle, AtomicInteger> fontStyleCounter = new EnumMap<>(FontStyle.class);
|
EnumMap<FontStyle, AtomicInteger> fontStyleCounter = new EnumMap<>(FontStyle.class);
|
||||||
@ -100,8 +107,7 @@ public class Line extends TextBoundingBox {
|
|||||||
fontStyle = fontStyleCounter.entrySet()
|
fontStyle = fontStyleCounter.entrySet()
|
||||||
.stream()
|
.stream()
|
||||||
.max(Comparator.comparing(entry -> entry.getValue().get()))
|
.max(Comparator.comparing(entry -> entry.getValue().get()))
|
||||||
.map(Map.Entry::getKey)
|
.map(Map.Entry::getKey).orElse(FontStyle.REGULAR);
|
||||||
.orElse(FontStyle.REGULAR);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -117,14 +123,6 @@ public class Line extends TextBoundingBox {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private double computeHeight() {
|
|
||||||
|
|
||||||
return characters.stream()
|
|
||||||
.map(Character::getHeight)
|
|
||||||
.reduce(0d, Double::sum) / characters.size();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public double angularDifference(Line j) {
|
public double angularDifference(Line j) {
|
||||||
|
|
||||||
double diff = Math.abs(getAngle() - j.getAngle());
|
double diff = Math.abs(getAngle() - j.getAngle());
|
||||||
@ -157,7 +155,7 @@ public class Line extends TextBoundingBox {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private void computeWords(double wordSpacing) {
|
private void computeWords(List<Character> characters, double wordSpacing) {
|
||||||
|
|
||||||
Word word = new Word();
|
Word word = new Word();
|
||||||
Character previous = null;
|
Character previous = null;
|
||||||
@ -169,7 +167,7 @@ public class Line extends TextBoundingBox {
|
|||||||
word = new Word();
|
word = new Word();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
word.getTextPositions().add(current.getTextPosition());
|
word.add(current);
|
||||||
previous = current;
|
previous = current;
|
||||||
}
|
}
|
||||||
words.add(word);
|
words.add(word);
|
||||||
@ -178,9 +176,7 @@ public class Line extends TextBoundingBox {
|
|||||||
|
|
||||||
private void buildBBox() {
|
private void buildBBox() {
|
||||||
|
|
||||||
this.setToBBoxOfComponents(characters.stream()
|
this.setToBBoxOfComponents(words);
|
||||||
.map(Character::getTextPosition)
|
|
||||||
.toList());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -18,7 +18,6 @@ public class Zone extends TextBoundingBox {
|
|||||||
@SuppressWarnings("PMD.ConstructorCallsOverridableMethod")
|
@SuppressWarnings("PMD.ConstructorCallsOverridableMethod")
|
||||||
public Zone(List<Line> lines) {
|
public Zone(List<Line> lines) {
|
||||||
|
|
||||||
lines.sort(Comparator.comparingDouble(Line::getY0));
|
|
||||||
this.lines = lines;
|
this.lines = lines;
|
||||||
setToBBoxOfComponents(lines);
|
setToBBoxOfComponents(lines);
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,9 +1,7 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.docstrum.service;
|
package com.knecon.fforesight.service.layoutparser.processor.docstrum.service;
|
||||||
|
|
||||||
import static com.knecon.fforesight.service.layoutparser.processor.model.SectionIdentifier.numericalIdentifierPattern;
|
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Comparator;
|
import java.util.Collection;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
@ -11,11 +9,12 @@ import java.util.stream.Collectors;
|
|||||||
|
|
||||||
import org.springframework.stereotype.Service;
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Character;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Line;
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Line;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.UnionFind;
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.UnionFind;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionOperations;
|
||||||
|
|
||||||
@Service
|
@Service
|
||||||
public class ZoneBuilderService {
|
public class ZoneBuilderService {
|
||||||
@ -31,7 +30,7 @@ public class ZoneBuilderService {
|
|||||||
|
|
||||||
private static final double MAX_LINE_SIZE_SCALE = 2.5;
|
private static final double MAX_LINE_SIZE_SCALE = 2.5;
|
||||||
|
|
||||||
private static final double ANGLE_TOLERANCE = Math.PI / 6;
|
private static final double ANGLE_TOLERANCE = Math.toRadians(5);
|
||||||
|
|
||||||
private static final double MAX_VERTICAL_MERGE_DISTANCE = 0.5;
|
private static final double MAX_VERTICAL_MERGE_DISTANCE = 0.5;
|
||||||
|
|
||||||
@ -114,64 +113,14 @@ public class ZoneBuilderService {
|
|||||||
|
|
||||||
private Zone mergeLinesInZone(List<Line> lines, double characterSpacing, double lineSpacing) {
|
private Zone mergeLinesInZone(List<Line> lines, double characterSpacing, double lineSpacing) {
|
||||||
|
|
||||||
double maxHorizontalDistance = 0;
|
Set<Word> words = lines.stream()
|
||||||
double minVerticalDistance = 0;
|
.map(Line::getWords)
|
||||||
double maxVerticalDistance = lineSpacing * MAX_VERTICAL_MERGE_DISTANCE;
|
.flatMap(Collection::stream)
|
||||||
|
.collect(Collectors.toSet());
|
||||||
|
Collection<Set<Word>> groupedLines = TextPositionOperations.groupByLine(words);
|
||||||
|
|
||||||
UnionFind<Line> unionFind = new UnionFind<>(new HashSet<>(lines));
|
List<Line> sortedLines = TextPositionOperations.sortLines(groupedLines);
|
||||||
|
return new Zone(sortedLines);
|
||||||
lines.forEach(outer -> {
|
|
||||||
lines.forEach(inner -> {
|
|
||||||
if (inner == outer) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
double horizontalDistance = outer.horizontalDistance(inner);
|
|
||||||
double verticalDistance = outer.verticalDistance(inner);
|
|
||||||
|
|
||||||
if (horizontalDistance <= maxHorizontalDistance && minVerticalDistance <= verticalDistance && verticalDistance <= maxVerticalDistance) {
|
|
||||||
|
|
||||||
unionFind.union(outer, inner);
|
|
||||||
|
|
||||||
} else if (minVerticalDistance <= verticalDistance
|
|
||||||
&& verticalDistance <= maxVerticalDistance
|
|
||||||
&& Math.abs(horizontalDistance - Math.min(outer.getLength(), inner.getLength())) < 0.1) {
|
|
||||||
|
|
||||||
boolean characterOverlap = false;
|
|
||||||
int overlappingCount = 0;
|
|
||||||
for (Character outerCharacter : outer.getCharacters()) {
|
|
||||||
for (Character innerCharacter : inner.getCharacters()) {
|
|
||||||
double characterOverlapDistance = outerCharacter.overlappingDistance(innerCharacter);
|
|
||||||
if (characterOverlapDistance > 2) {
|
|
||||||
characterOverlap = true;
|
|
||||||
}
|
|
||||||
if (characterOverlapDistance > 0) {
|
|
||||||
overlappingCount++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (!characterOverlap && overlappingCount <= 2) {
|
|
||||||
unionFind.union(outer, inner);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
});
|
|
||||||
});
|
|
||||||
|
|
||||||
List<Line> outputZone = new ArrayList<>();
|
|
||||||
for (Set<Line> group : unionFind.getGroups()) {
|
|
||||||
List<Character> characters = new ArrayList<>();
|
|
||||||
for (Line line : group) {
|
|
||||||
characters.addAll(line.getCharacters());
|
|
||||||
}
|
|
||||||
characters.sort(Comparator.comparingDouble(Character::getX));
|
|
||||||
|
|
||||||
outputZone.add(new Line(characters, characterSpacing));
|
|
||||||
}
|
|
||||||
|
|
||||||
return new Zone(outputZone.stream()
|
|
||||||
.sorted(Comparator.comparing(Line::getY0))
|
|
||||||
.collect(Collectors.toList()));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -14,6 +14,7 @@ public enum PageBlockType {
|
|||||||
PARAGRAPH_ITALIC,
|
PARAGRAPH_ITALIC,
|
||||||
PARAGRAPH_UNKNOWN,
|
PARAGRAPH_UNKNOWN,
|
||||||
OTHER,
|
OTHER,
|
||||||
|
TABLE_OF_CONTENTS_HEADLINE,
|
||||||
TABLE_OF_CONTENTS_ITEM,
|
TABLE_OF_CONTENTS_ITEM,
|
||||||
LIST_ITEM,
|
LIST_ITEM,
|
||||||
TABLE;
|
TABLE;
|
||||||
@ -35,7 +36,7 @@ public enum PageBlockType {
|
|||||||
public static int getHeadlineNumber(PageBlockType pageBlockType) {
|
public static int getHeadlineNumber(PageBlockType pageBlockType) {
|
||||||
|
|
||||||
return switch (pageBlockType) {
|
return switch (pageBlockType) {
|
||||||
case H1 -> 1;
|
case H1, TABLE_OF_CONTENTS_HEADLINE -> 1;
|
||||||
case H2 -> 2;
|
case H2 -> 2;
|
||||||
case H3 -> 3;
|
case H3 -> 3;
|
||||||
case H4 -> 4;
|
case H4 -> 4;
|
||||||
@ -47,6 +48,6 @@ public enum PageBlockType {
|
|||||||
|
|
||||||
public boolean isHeadline() {
|
public boolean isHeadline() {
|
||||||
|
|
||||||
return this.equals(H1) || this.equals(H2) || this.equals(H3) || this.equals(H4) || this.equals(H5) || this.equals(H6);
|
return this.equals(H1) || this.equals(H2) || this.equals(H3) || this.equals(H4) || this.equals(H5) || this.equals(H6) || this.equals(TABLE_OF_CONTENTS_HEADLINE);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -11,6 +11,8 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Se
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SuperSection;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SuperSection;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableOfContents;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableOfContentsItem;
|
||||||
|
|
||||||
public abstract class AbstractNodeVisitor implements NodeVisitor {
|
public abstract class AbstractNodeVisitor implements NodeVisitor {
|
||||||
|
|
||||||
@ -83,6 +85,18 @@ public abstract class AbstractNodeVisitor implements NodeVisitor {
|
|||||||
visitChildren(tableCell);
|
visitChildren(tableCell);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void visit(TableOfContents toc) {
|
||||||
|
|
||||||
|
visitChildren(toc);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void visit(TableOfContentsItem toci) {
|
||||||
|
|
||||||
|
visitChildren(toci);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
protected void visitChildren(SemanticNode semanticNode) {
|
protected void visitChildren(SemanticNode semanticNode) {
|
||||||
|
|
||||||
|
|||||||
@ -10,6 +10,10 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Se
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SuperSection;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SuperSection;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableOfContents;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableOfContentsItem;
|
||||||
|
|
||||||
|
import software.amazon.awssdk.utils.builder.ToCopyableBuilder;
|
||||||
|
|
||||||
public interface NodeVisitor {
|
public interface NodeVisitor {
|
||||||
|
|
||||||
@ -42,4 +46,10 @@ public interface NodeVisitor {
|
|||||||
|
|
||||||
void visit(TableCell tableCell);
|
void visit(TableCell tableCell);
|
||||||
|
|
||||||
|
|
||||||
|
void visit(TableOfContents tableOfContents);
|
||||||
|
|
||||||
|
|
||||||
|
void visit(TableOfContentsItem tableOfContentsItem);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -0,0 +1,41 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeTypeProto;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.NodeVisitor;
|
||||||
|
|
||||||
|
import lombok.AccessLevel;
|
||||||
|
import lombok.AllArgsConstructor;
|
||||||
|
import lombok.Data;
|
||||||
|
import lombok.EqualsAndHashCode;
|
||||||
|
import lombok.experimental.FieldDefaults;
|
||||||
|
import lombok.experimental.SuperBuilder;
|
||||||
|
|
||||||
|
@Data
|
||||||
|
@SuperBuilder
|
||||||
|
@AllArgsConstructor
|
||||||
|
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||||
|
@EqualsAndHashCode(callSuper = true)
|
||||||
|
public class TableOfContents extends AbstractSemanticNode {
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public NodeTypeProto.NodeType getType() {
|
||||||
|
|
||||||
|
return NodeTypeProto.NodeType.TABLE_OF_CONTENTS;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public Headline getHeadline() {
|
||||||
|
|
||||||
|
return streamChildrenOfType(NodeTypeProto.NodeType.HEADLINE).map(node -> (Headline) node)
|
||||||
|
.findFirst()
|
||||||
|
.orElseGet(() -> getParent().getHeadline());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void accept(NodeVisitor visitor) {
|
||||||
|
|
||||||
|
visitor.visit(this);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,51 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeTypeProto;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.NodeVisitor;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||||
|
|
||||||
|
import lombok.AccessLevel;
|
||||||
|
import lombok.AllArgsConstructor;
|
||||||
|
import lombok.Data;
|
||||||
|
import lombok.EqualsAndHashCode;
|
||||||
|
import lombok.experimental.FieldDefaults;
|
||||||
|
import lombok.experimental.SuperBuilder;
|
||||||
|
|
||||||
|
@Data
|
||||||
|
@SuperBuilder
|
||||||
|
@AllArgsConstructor
|
||||||
|
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||||
|
@EqualsAndHashCode(callSuper = true)
|
||||||
|
public class TableOfContentsItem extends AbstractSemanticNode {
|
||||||
|
|
||||||
|
TextBlock leafTextBlock;
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public NodeTypeProto.NodeType getType() {
|
||||||
|
|
||||||
|
return NodeTypeProto.NodeType.TABLE_OF_CONTENTS_ITEM;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean isLeaf() {
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void accept(NodeVisitor visitor) {
|
||||||
|
|
||||||
|
visitor.visit(this);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public TextBlock getTextBlock() {
|
||||||
|
|
||||||
|
return leafTextBlock;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -13,6 +13,7 @@ import java.util.stream.Collectors;
|
|||||||
|
|
||||||
import org.apache.pdfbox.text.TextPosition;
|
import org.apache.pdfbox.text.TextPosition;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Character;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.TextBoundingBox;
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.TextBoundingBox;
|
||||||
|
|
||||||
import lombok.AllArgsConstructor;
|
import lombok.AllArgsConstructor;
|
||||||
@ -38,8 +39,7 @@ public class Word extends TextBoundingBox implements CharSequence {
|
|||||||
private int page;
|
private int page;
|
||||||
|
|
||||||
@Builder.Default
|
@Builder.Default
|
||||||
private List<RedTextPosition> textPositions = new ArrayList<>();
|
private List<Character> characters = new ArrayList<>();
|
||||||
|
|
||||||
private boolean isParagraphStart;
|
private boolean isParagraphStart;
|
||||||
private boolean strikethrough;
|
private boolean strikethrough;
|
||||||
private boolean underline;
|
private boolean underline;
|
||||||
@ -49,8 +49,9 @@ public class Word extends TextBoundingBox implements CharSequence {
|
|||||||
|
|
||||||
public Word(List<TextPosition> textPositions, int pageNumber, boolean isParagraphStart) {
|
public Word(List<TextPosition> textPositions, int pageNumber, boolean isParagraphStart) {
|
||||||
|
|
||||||
this.textPositions = textPositions.stream()
|
this.characters = textPositions.stream()
|
||||||
.map(RedTextPosition::fromTextPosition)
|
.map(RedTextPosition::fromTextPosition)
|
||||||
|
.map(Character::new)
|
||||||
.collect(Collectors.toList());
|
.collect(Collectors.toList());
|
||||||
this.page = pageNumber;
|
this.page = pageNumber;
|
||||||
this.isParagraphStart = isParagraphStart;
|
this.isParagraphStart = isParagraphStart;
|
||||||
@ -65,9 +66,9 @@ public class Word extends TextBoundingBox implements CharSequence {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public Word(List<RedTextPosition> textPositions, int page) {
|
public Word(List<Character> textPositions, int page) {
|
||||||
|
|
||||||
this.textPositions = textPositions;
|
this.characters = new ArrayList<>(textPositions);
|
||||||
this.page = page;
|
this.page = page;
|
||||||
calculateBBoxAndHashcode();
|
calculateBBoxAndHashcode();
|
||||||
}
|
}
|
||||||
@ -76,7 +77,7 @@ public class Word extends TextBoundingBox implements CharSequence {
|
|||||||
@Override
|
@Override
|
||||||
public int length() {
|
public int length() {
|
||||||
|
|
||||||
return textPositions.size();
|
return characters.size();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -101,7 +102,7 @@ public class Word extends TextBoundingBox implements CharSequence {
|
|||||||
public Word subSequence(int start, int end) {
|
public Word subSequence(int start, int end) {
|
||||||
|
|
||||||
var textPositionSequence = new Word();
|
var textPositionSequence = new Word();
|
||||||
textPositionSequence.textPositions = textPositions.subList(start, end);
|
textPositionSequence.characters = characters.subList(start, end);
|
||||||
textPositionSequence.page = page;
|
textPositionSequence.page = page;
|
||||||
textPositionSequence.dir = dir;
|
textPositionSequence.dir = dir;
|
||||||
textPositionSequence.setToBBoxOfComponents(getTextPositions());
|
textPositionSequence.setToBBoxOfComponents(getTextPositions());
|
||||||
@ -122,53 +123,59 @@ public class Word extends TextBoundingBox implements CharSequence {
|
|||||||
|
|
||||||
public RedTextPosition textPositionAt(int index) {
|
public RedTextPosition textPositionAt(int index) {
|
||||||
|
|
||||||
return textPositions.get(index);
|
return characters.get(index).getTextPosition();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public void add(Word word, RedTextPosition textPosition) {
|
public void add(Word word, RedTextPosition textPosition) {
|
||||||
|
|
||||||
this.textPositions.add(textPosition);
|
this.characters.add(new Character(textPosition));
|
||||||
this.page = word.getPage();
|
this.page = word.getPage();
|
||||||
calculateBBoxAndHashcode();
|
calculateBBoxAndHashcode();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void add(Character current) {
|
||||||
|
|
||||||
|
characters.add(current);
|
||||||
|
calculateBBoxAndHashcode();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
public void add(TextPosition textPosition) {
|
public void add(TextPosition textPosition) {
|
||||||
|
|
||||||
this.textPositions.add(RedTextPosition.fromTextPosition(textPosition));
|
add(new Character(RedTextPosition.fromTextPosition(textPosition)));
|
||||||
calculateBBoxAndHashcode();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public double getTextHeightNoPadding() {
|
public double getTextHeightNoPadding() {
|
||||||
|
|
||||||
return textPositions.get(0).getHeightDirAdj();
|
return characters.get(0).getTextPosition().getHeightDirAdj();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public double getTextHeight() {
|
public double getTextHeight() {
|
||||||
|
|
||||||
return textPositions.get(0).getHeightDirAdj() + HEIGHT_PADDING;
|
return characters.get(0).getTextPosition().getHeightDirAdj() + HEIGHT_PADDING;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public String getFont() {
|
public String getFont() {
|
||||||
|
|
||||||
if (textPositions.get(0).getFontName() == null) {
|
if (characters.get(0).getTextPosition().getFontName() == null) {
|
||||||
return "none";
|
return "none";
|
||||||
}
|
}
|
||||||
|
|
||||||
return FONT_CLEANER.matcher(textPositions.get(0).getFontName().toLowerCase(Locale.ROOT)).replaceAll("");
|
return FONT_CLEANER.matcher(characters.get(0).getTextPosition().getFontName().toLowerCase(Locale.ROOT)).replaceAll("");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public String getFontStyle() {
|
public String getFontStyle() {
|
||||||
|
|
||||||
if (textPositions.get(0).getFontName() == null) {
|
if (characters.get(0).getTextPosition().getFontName() == null) {
|
||||||
return STANDARD;
|
return STANDARD;
|
||||||
}
|
}
|
||||||
String lowercaseFontName = textPositions.get(0).getFontName().toLowerCase(Locale.ROOT);
|
String lowercaseFontName = characters.get(0).getTextPosition().getFontName().toLowerCase(Locale.ROOT);
|
||||||
|
|
||||||
if (lowercaseFontName.contains(BOLD) && lowercaseFontName.contains(ITALIC)) {
|
if (lowercaseFontName.contains(BOLD) && lowercaseFontName.contains(ITALIC)) {
|
||||||
return BOLD_ITALIC;
|
return BOLD_ITALIC;
|
||||||
@ -184,13 +191,13 @@ public class Word extends TextBoundingBox implements CharSequence {
|
|||||||
|
|
||||||
public float getFontSize() {
|
public float getFontSize() {
|
||||||
|
|
||||||
return textPositions.get(0).getFontSizeInPt();
|
return characters.get(0).getTextPosition().getFontSizeInPt();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public float getSpaceWidth() {
|
public float getSpaceWidth() {
|
||||||
|
|
||||||
return textPositions.get(0).getWidthOfSpace();
|
return characters.get(0).getTextPosition().getWidthOfSpace();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -244,6 +251,14 @@ public class Word extends TextBoundingBox implements CharSequence {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private List<RedTextPosition> getTextPositions() {
|
||||||
|
|
||||||
|
return characters.stream()
|
||||||
|
.map(Character::getTextPosition)
|
||||||
|
.toList();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
public void transform(AffineTransform rotateInstance) {
|
public void transform(AffineTransform rotateInstance) {
|
||||||
|
|
||||||
for (RedTextPosition textPosition : getTextPositions()) {
|
for (RedTextPosition textPosition : getTextPositions()) {
|
||||||
|
|||||||
@ -11,12 +11,14 @@ import org.apache.commons.text.similarity.LevenshteinDistance;
|
|||||||
import org.springframework.stereotype.Service;
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngineProto.LayoutEngine;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngineProto.LayoutEngine;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.SectionIdentifier;
|
import com.knecon.fforesight.service.layoutparser.processor.model.SectionIdentifier;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
|
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities;
|
||||||
|
|
||||||
import lombok.Data;
|
import lombok.Data;
|
||||||
@ -27,6 +29,24 @@ public class BlockificationPostprocessingService {
|
|||||||
private static final float STRING_SIMILARITY_THRESHOLD = 0.1f;
|
private static final float STRING_SIMILARITY_THRESHOLD = 0.1f;
|
||||||
|
|
||||||
|
|
||||||
|
public void findHeadlinesFromOutline(ClassificationDocument classificationDocument, int pageNumber, ClassificationPage classificationPage, PageInformation pageInformation) {
|
||||||
|
|
||||||
|
OutlineObject lastProcessedOutlineObject = null;
|
||||||
|
List<OutlineObject> outlineObjects = classificationDocument.getOutlineObjectTree().getOutlineObjectsPerPage().getOrDefault(pageNumber, new ArrayList<>());
|
||||||
|
|
||||||
|
OutlineObject notFoundOutlineObject = null;
|
||||||
|
if (lastProcessedOutlineObject != null && !lastProcessedOutlineObject.isFound()) {
|
||||||
|
lastProcessedOutlineObject.resetPoint();
|
||||||
|
notFoundOutlineObject = lastProcessedOutlineObject;
|
||||||
|
}
|
||||||
|
if (!outlineObjects.isEmpty()) {
|
||||||
|
classificationPage.setOutlineObjects(outlineObjects);
|
||||||
|
lastProcessedOutlineObject = sanitizeOutlineBlocks(classificationPage, notFoundOutlineObject);
|
||||||
|
}
|
||||||
|
classificationDocument.getLayoutDebugLayer().addOutlineObjects(outlineObjects, pageInformation);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
public OutlineObject sanitizeOutlineBlocks(ClassificationPage classificationPage, OutlineObject notFoundOutlineObject) {
|
public OutlineObject sanitizeOutlineBlocks(ClassificationPage classificationPage, OutlineObject notFoundOutlineObject) {
|
||||||
|
|
||||||
List<OutlineObject> outlineObjects = classificationPage.getOutlineObjects();
|
List<OutlineObject> outlineObjects = classificationPage.getOutlineObjects();
|
||||||
@ -329,8 +349,8 @@ public class BlockificationPostprocessingService {
|
|||||||
|
|
||||||
if (index > 0) {
|
if (index > 0) {
|
||||||
in = createSubSequence(sequence, 0, index);
|
in = createSubSequence(sequence, 0, index);
|
||||||
} else if (endIndex < sequence.getTextPositions().size()) {
|
} else if (endIndex < sequence.length()) {
|
||||||
in = createSubSequence(sequence, endIndex, sequence.getTextPositions().size());
|
in = createSubSequence(sequence, endIndex, sequence.length());
|
||||||
}
|
}
|
||||||
|
|
||||||
return new SplitSequenceResult(in, out);
|
return new SplitSequenceResult(in, out);
|
||||||
@ -339,7 +359,7 @@ public class BlockificationPostprocessingService {
|
|||||||
|
|
||||||
private static Word createSubSequence(Word sequence, int start, int end) {
|
private static Word createSubSequence(Word sequence, int start, int end) {
|
||||||
|
|
||||||
Word newSeq = new Word(new ArrayList<>(sequence.getTextPositions().subList(start, end)), sequence.getPage());
|
Word newSeq = new Word(new ArrayList<>(sequence.getCharacters().subList(start, end)), sequence.getPage());
|
||||||
newSeq.setParagraphStart(sequence.isParagraphStart());
|
newSeq.setParagraphStart(sequence.isParagraphStart());
|
||||||
return newSeq;
|
return newSeq;
|
||||||
}
|
}
|
||||||
|
|||||||
@ -77,7 +77,7 @@ public class DocstrumBlockificationService {
|
|||||||
.forEach(line -> {
|
.forEach(line -> {
|
||||||
line.getWords()
|
line.getWords()
|
||||||
.forEach(word -> {
|
.forEach(word -> {
|
||||||
words.add(new Word(word.getTextPositions(), word.getPage()));
|
words.add(new Word(word.getCharacters(), word.getPage()));
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|||||||
@ -81,7 +81,7 @@ public class ClarifyndClassificationService {
|
|||||||
&& (textBlock.getMostPopularWordStyle().equals("bold")
|
&& (textBlock.getMostPopularWordStyle().equals("bold")
|
||||||
|| !document.getFontStyleCounter().getCountPerValue().containsKey("bold")
|
|| !document.getFontStyleCounter().getCountPerValue().containsKey("bold")
|
||||||
&& textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular() + 1)
|
&& textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular() + 1)
|
||||||
&& textBlock.getWords().get(0).getTextPositions().get(0).getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) {
|
&& textBlock.getWords().get(0).getFontSize()>= textBlock.getMostPopularWordFontSize()) {
|
||||||
|
|
||||||
PageBlockType headlineType = HeadlineClassificationService.headlineClassByFontSize(textBlock, headlineFontSizes);
|
PageBlockType headlineType = HeadlineClassificationService.headlineClassByFontSize(textBlock, headlineFontSizes);
|
||||||
headlineClassificationService.classifyHeadline(textBlock, headlineType);
|
headlineClassificationService.classifyHeadline(textBlock, headlineType);
|
||||||
@ -91,7 +91,7 @@ public class ClarifyndClassificationService {
|
|||||||
&& textBlock.getMostPopularWordStyle().equals("bold")
|
&& textBlock.getMostPopularWordStyle().equals("bold")
|
||||||
&& !document.getFontStyleCounter().getMostPopular().equals("bold")
|
&& !document.getFontStyleCounter().getMostPopular().equals("bold")
|
||||||
&& PositionUtils.getApproxLineCount(textBlock) < 2.9
|
&& PositionUtils.getApproxLineCount(textBlock) < 2.9
|
||||||
&& textBlock.getWords().get(0).getTextPositions().get(0).getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) {
|
&& textBlock.getWords().get(0).getFontSize() >= textBlock.getMostPopularWordFontSize()) {
|
||||||
|
|
||||||
PageBlockType headlineType = HeadlineClassificationService.headlineClassByFontSize(textBlock, headlineFontSizes);
|
PageBlockType headlineType = HeadlineClassificationService.headlineClassByFontSize(textBlock, headlineFontSizes);
|
||||||
headlineClassificationService.classifyHeadline(textBlock, headlineType);
|
headlineClassificationService.classifyHeadline(textBlock, headlineType);
|
||||||
|
|||||||
@ -94,7 +94,7 @@ public class RedactManagerClassificationService {
|
|||||||
&& (textBlock.getMostPopularWordStyle().equals("bold")
|
&& (textBlock.getMostPopularWordStyle().equals("bold")
|
||||||
|| !document.getFontStyleCounter().getCountPerValue().containsKey("bold")
|
|| !document.getFontStyleCounter().getCountPerValue().containsKey("bold")
|
||||||
&& textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular() + 1)
|
&& textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular() + 1)
|
||||||
&& textBlock.getWords().get(0).getTextPositions().get(0).getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) {
|
&& textBlock.getWords().get(0).getFontSize() >= textBlock.getMostPopularWordFontSize()) {
|
||||||
|
|
||||||
PageBlockType headlineType = HeadlineClassificationService.headlineClassByFontSize(textBlock, headlineFontSizes);
|
PageBlockType headlineType = HeadlineClassificationService.headlineClassByFontSize(textBlock, headlineFontSizes);
|
||||||
headlineClassificationService.classifyHeadline(textBlock, headlineType);
|
headlineClassificationService.classifyHeadline(textBlock, headlineType);
|
||||||
@ -104,7 +104,7 @@ public class RedactManagerClassificationService {
|
|||||||
&& textBlock.getMostPopularWordStyle().equals("bold")
|
&& textBlock.getMostPopularWordStyle().equals("bold")
|
||||||
&& !document.getFontStyleCounter().getMostPopular().equals("bold")
|
&& !document.getFontStyleCounter().getMostPopular().equals("bold")
|
||||||
&& PositionUtils.getApproxLineCount(textBlock) < 2.9
|
&& PositionUtils.getApproxLineCount(textBlock) < 2.9
|
||||||
&& textBlock.getWords().get(0).getTextPositions().get(0).getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) {
|
&& textBlock.getWords().get(0).getFontSize() >= textBlock.getMostPopularWordFontSize()) {
|
||||||
|
|
||||||
PageBlockType headlineType = HeadlineClassificationService.headlineClassByFontSize(textBlock, headlineFontSizes);
|
PageBlockType headlineType = HeadlineClassificationService.headlineClassByFontSize(textBlock, headlineFontSizes);
|
||||||
headlineClassificationService.classifyHeadline(textBlock, headlineType);
|
headlineClassificationService.classifyHeadline(textBlock, headlineType);
|
||||||
|
|||||||
@ -59,7 +59,7 @@ public class TableOfContentsClassificationService {
|
|||||||
|
|
||||||
if (end > i + 1) {
|
if (end > i + 1) {
|
||||||
if (textBlock.textBlock().getClassification() == null) {
|
if (textBlock.textBlock().getClassification() == null) {
|
||||||
textBlock.textBlock().setClassification(PageBlockType.H1);
|
textBlock.textBlock().setClassification(PageBlockType.TABLE_OF_CONTENTS_HEADLINE);
|
||||||
}
|
}
|
||||||
i = end;
|
i = end;
|
||||||
}
|
}
|
||||||
@ -71,9 +71,9 @@ public class TableOfContentsClassificationService {
|
|||||||
|
|
||||||
ClassificationPage startPage = textBlocks.get(start).page();
|
ClassificationPage startPage = textBlocks.get(start).page();
|
||||||
List<TextBlockOnPage> initialLookAhead = textBlocks.subList(start, Math.min(start + SURROUNDING_BLOCKS_RADIUS, textBlocks.size()));
|
List<TextBlockOnPage> initialLookAhead = textBlocks.subList(start, Math.min(start + SURROUNDING_BLOCKS_RADIUS, textBlocks.size()));
|
||||||
HashMap<Word, TextBlockOnPage> lookup = new HashMap<>();
|
HashMap<Word, TextBlockOnPage> numberToBlockLookup = new HashMap<>();
|
||||||
List<Word> numbers = extractNumbers(initialLookAhead, lookup, document.getPages().size());
|
List<Word> numbers = extractNumbers(initialLookAhead, numberToBlockLookup, document.getPages().size());
|
||||||
TocNumberFinder tocNumberFinder = new TocNumberFinder(numbers, lookup);
|
TocNumberFinder tocNumberFinder = new TocNumberFinder(numbers, numberToBlockLookup);
|
||||||
|
|
||||||
int lastCandidate = start;
|
int lastCandidate = start;
|
||||||
for (int i = start; i < Math.min(lastCandidate + SURROUNDING_BLOCKS_RADIUS, textBlocks.size()); i++) {
|
for (int i = start; i < Math.min(lastCandidate + SURROUNDING_BLOCKS_RADIUS, textBlocks.size()); i++) {
|
||||||
@ -93,7 +93,7 @@ public class TableOfContentsClassificationService {
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
List<Word> numbersFromBlock = extractNumbers(textBlockOnPage, lookup, document.getPages().size());
|
List<Word> numbersFromBlock = extractNumbers(textBlockOnPage, numberToBlockLookup, document.getPages().size());
|
||||||
|
|
||||||
List<Word> currentRightmostCluster = tocNumberFinder.getCurrentRightmostCluster();
|
List<Word> currentRightmostCluster = tocNumberFinder.getCurrentRightmostCluster();
|
||||||
|
|
||||||
@ -102,19 +102,19 @@ public class TableOfContentsClassificationService {
|
|||||||
return start;
|
return start;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (anyIntersection(currentRightmostCluster, numbersFromBlock, lookup)) {
|
if (anyIntersection(currentRightmostCluster, numbersFromBlock, numberToBlockLookup)) {
|
||||||
lastCandidate = i;
|
lastCandidate = i;
|
||||||
numbersFromBlock.forEach(tocNumberFinder::add);
|
numbersFromBlock.forEach(tocNumberFinder::add);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
addVisualization(document.getLayoutDebugLayer(), tocNumberFinder, lookup);
|
|
||||||
|
|
||||||
Set<TextBlockOnPage> blocksWithNumberInCluster = tocNumberFinder.getCurrentRightmostCluster()
|
Set<TextBlockOnPage> blocksWithNumberInCluster = tocNumberFinder.getCurrentRightmostCluster()
|
||||||
.stream()
|
.stream()
|
||||||
.map(lookup::get)
|
.map(numberToBlockLookup::get)
|
||||||
.collect(Collectors.toSet());
|
.collect(Collectors.toSet());
|
||||||
|
|
||||||
|
addVisualization(document.getLayoutDebugLayer(), tocNumberFinder, numberToBlockLookup, blocksWithNumberInCluster, textBlocks.get(start - 1));
|
||||||
|
|
||||||
int lastConfirmed = start;
|
int lastConfirmed = start;
|
||||||
for (int i = start; i < lastCandidate + 1; i++) {
|
for (int i = start; i < lastCandidate + 1; i++) {
|
||||||
TextBlockOnPage textBlockOnPage = textBlocks.get(i);
|
TextBlockOnPage textBlockOnPage = textBlocks.get(i);
|
||||||
@ -132,18 +132,22 @@ public class TableOfContentsClassificationService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private static void addVisualization(LayoutDebugLayer layoutDebugLayer, TocNumberFinder tocNumberFinder, Map<Word, TextBlockOnPage> lookup) {
|
private static void addVisualization(LayoutDebugLayer layoutDebugLayer,
|
||||||
|
TocNumberFinder tocNumberFinder,
|
||||||
|
Map<Word, TextBlockOnPage> lookup,
|
||||||
|
Set<TextBlockOnPage> blocksWithNumberInCluster,
|
||||||
|
TextBlockOnPage startingHeadline) {
|
||||||
|
|
||||||
tocNumberFinder.getCurrentRightmostCluster()
|
tocNumberFinder.getCurrentRightmostCluster()
|
||||||
.stream()
|
.stream()
|
||||||
.collect(Collectors.groupingBy(key -> lookup.get(key).page().getPageNumber()))
|
.collect(Collectors.groupingBy(key -> lookup.get(key).page().getPageNumber()))
|
||||||
.forEach((pageNumber, number) -> layoutDebugLayer.addTocPages(number, pageNumber));
|
.forEach((pageNumber, number) -> layoutDebugLayer.addTocPages(number, pageNumber));
|
||||||
|
layoutDebugLayer.addTocBlocks(blocksWithNumberInCluster);
|
||||||
|
layoutDebugLayer.addTocBlocks(Set.of(startingHeadline));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private static boolean anyIntersection(Collection<Word> numbers1,
|
private static boolean anyIntersection(Collection<Word> numbers1, Collection<Word> numbers2, Map<Word, TextBlockOnPage> lookup) {
|
||||||
Collection<Word> numbers2,
|
|
||||||
Map<Word, TextBlockOnPage> lookup) {
|
|
||||||
|
|
||||||
return numbers1.stream()
|
return numbers1.stream()
|
||||||
.anyMatch(numberFromCluster -> numbers2.stream()
|
.anyMatch(numberFromCluster -> numbers2.stream()
|
||||||
|
|||||||
@ -23,6 +23,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationFooter;
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationFooter;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationHeader;
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationHeader;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.AbstractSemanticNode;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.AbstractSemanticNode;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||||
@ -35,6 +36,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Im
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Paragraph;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Paragraph;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableOfContentsItem;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||||
@ -96,6 +98,7 @@ public class DocumentGraphFactory {
|
|||||||
|
|
||||||
for (TableOfContentItem tocItem : classificationDocument.getTableOfContents()) {
|
for (TableOfContentItem tocItem : classificationDocument.getTableOfContents()) {
|
||||||
GenericSemanticNode parent = tocItem.getParent() == null ? null : tocItem.getParent().getSection();
|
GenericSemanticNode parent = tocItem.getParent() == null ? null : tocItem.getParent().getSection();
|
||||||
|
|
||||||
Optional<GenericSemanticNode> section = SectionNodeFactory.addSection(layoutParsingType,
|
Optional<GenericSemanticNode> section = SectionNodeFactory.addSection(layoutParsingType,
|
||||||
parent,
|
parent,
|
||||||
tocItem.getChildren().isEmpty(),
|
tocItem.getChildren().isEmpty(),
|
||||||
@ -121,6 +124,8 @@ public class DocumentGraphFactory {
|
|||||||
node = Headline.builder().documentTree(context.getDocumentTree()).build();
|
node = Headline.builder().documentTree(context.getDocumentTree()).build();
|
||||||
} else if (originalTextBlock.isToDuplicate() && layoutParsingType.equals(LayoutParsingType.REDACT_MANAGER)) {
|
} else if (originalTextBlock.isToDuplicate() && layoutParsingType.equals(LayoutParsingType.REDACT_MANAGER)) {
|
||||||
node = DuplicatedParagraph.builder().documentTree(context.getDocumentTree()).build();
|
node = DuplicatedParagraph.builder().documentTree(context.getDocumentTree()).build();
|
||||||
|
} else if (originalTextBlock.getClassification().equals(PageBlockType.TABLE_OF_CONTENTS_ITEM)) {
|
||||||
|
node = TableOfContentsItem.builder().documentTree(context.getDocumentTree()).build();
|
||||||
} else {
|
} else {
|
||||||
node = Paragraph.builder().documentTree(context.getDocumentTree()).build();
|
node = Paragraph.builder().documentTree(context.getDocumentTree()).build();
|
||||||
}
|
}
|
||||||
@ -142,7 +147,9 @@ public class DocumentGraphFactory {
|
|||||||
List<Integer> treeId = context.documentTree.createNewChildEntryAndReturnId(parentNode, node);
|
List<Integer> treeId = context.documentTree.createNewChildEntryAndReturnId(parentNode, node);
|
||||||
node.setLeafTextBlock(textBlock);
|
node.setLeafTextBlock(textBlock);
|
||||||
node.setTreeId(treeId);
|
node.setTreeId(treeId);
|
||||||
node.getEngines().addAll(originalTextBlock.getEngines());
|
node.getEngines().
|
||||||
|
|
||||||
|
addAll(originalTextBlock.getEngines());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -10,6 +10,7 @@ import java.util.List;
|
|||||||
import java.util.Locale;
|
import java.util.Locale;
|
||||||
import java.util.Objects;
|
import java.util.Objects;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Character;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
|
||||||
@ -31,19 +32,19 @@ public class SearchTextWithTextPositionFactory {
|
|||||||
public SearchTextWithTextPositionDto buildSearchTextToTextPositionDto(List<Word> sequences) {
|
public SearchTextWithTextPositionDto buildSearchTextToTextPositionDto(List<Word> sequences) {
|
||||||
|
|
||||||
if (sequences.isEmpty() || sequences.stream()
|
if (sequences.isEmpty() || sequences.stream()
|
||||||
.allMatch(sequence -> sequence.getTextPositions().isEmpty())) {
|
.allMatch(sequence -> sequence.getCharacters().isEmpty())) {
|
||||||
return SearchTextWithTextPositionDto.empty();
|
return SearchTextWithTextPositionDto.empty();
|
||||||
}
|
}
|
||||||
|
|
||||||
Context context = new Context();
|
Context context = new Context();
|
||||||
|
|
||||||
RedTextPosition currentTextPosition = sequences.get(0).getTextPositions().get(0);
|
RedTextPosition currentTextPosition = sequences.get(0).getCharacters().get(0).getTextPosition();
|
||||||
RedTextPosition previousTextPosition = RedTextPosition.builder().unicode(" ").bBoxDirAdj(currentTextPosition.getBBoxDirAdj()).build();
|
RedTextPosition previousTextPosition = RedTextPosition.builder().unicode(" ").bBoxDirAdj(currentTextPosition.getBBoxDirAdj()).build();
|
||||||
|
|
||||||
for (Word word : sequences) {
|
for (Word word : sequences) {
|
||||||
for (int i = 0; i < word.getTextPositions().size(); ++i) {
|
for (int i = 0; i < word.getCharacters().size(); ++i) {
|
||||||
|
|
||||||
currentTextPosition = word.getTextPositions().get(i);
|
currentTextPosition = word.getCharacters().get(i).getTextPosition();
|
||||||
if (isLineBreak(currentTextPosition, previousTextPosition)) {
|
if (isLineBreak(currentTextPosition, previousTextPosition)) {
|
||||||
removeHyphenLinebreaks(context);
|
removeHyphenLinebreaks(context);
|
||||||
context.lineBreaksStringIdx.add(context.stringIdx);
|
context.lineBreaksStringIdx.add(context.stringIdx);
|
||||||
@ -66,8 +67,9 @@ public class SearchTextWithTextPositionFactory {
|
|||||||
}
|
}
|
||||||
|
|
||||||
List<Rectangle2D> positions = sequences.stream()
|
List<Rectangle2D> positions = sequences.stream()
|
||||||
.map(Word::getTextPositions)
|
.map(Word::getCharacters)
|
||||||
.flatMap(Collection::stream)
|
.flatMap(Collection::stream)
|
||||||
|
.map(Character::getTextPosition)
|
||||||
.map(RedTextPosition::getBBoxPdf)
|
.map(RedTextPosition::getBBoxPdf)
|
||||||
.toList();
|
.toList();
|
||||||
|
|
||||||
|
|||||||
@ -12,11 +12,13 @@ import java.util.Set;
|
|||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.AbstractSemanticNode;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.AbstractSemanticNode;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.GenericSemanticNode;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.GenericSemanticNode;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SuperSection;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SuperSection;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableOfContents;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||||
@ -27,6 +29,20 @@ import lombok.experimental.UtilityClass;
|
|||||||
@UtilityClass
|
@UtilityClass
|
||||||
public class SectionNodeFactory {
|
public class SectionNodeFactory {
|
||||||
|
|
||||||
|
public GenericSemanticNode addTocSection(LayoutParsingType layoutParsingType, List<AbstractPageBlock> pageBlocks, DocumentGraphFactory.Context context, Document document) {
|
||||||
|
|
||||||
|
AbstractSemanticNode section = TableOfContents.builder().documentTree(context.getDocumentTree()).build();
|
||||||
|
context.getSections().add(section);
|
||||||
|
section.setTreeId(getTreeId(null, context, section));
|
||||||
|
for (AbstractPageBlock pageBlock : pageBlocks) {
|
||||||
|
if (pageBlock instanceof TextPageBlock textPageBlock) {
|
||||||
|
DocumentGraphFactory.addParagraphOrHeadline(section, textPageBlock, context, new ArrayList<>(), layoutParsingType);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return section;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
public Optional<GenericSemanticNode> addSection(LayoutParsingType layoutParsingType,
|
public Optional<GenericSemanticNode> addSection(LayoutParsingType layoutParsingType,
|
||||||
GenericSemanticNode parentNode,
|
GenericSemanticNode parentNode,
|
||||||
boolean isLeaf,
|
boolean isLeaf,
|
||||||
|
|||||||
@ -248,8 +248,8 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
|||||||
|
|
||||||
if (!words.isEmpty()) {
|
if (!words.isEmpty()) {
|
||||||
previous = words.get(words.size() - 1)
|
previous = words.get(words.size() - 1)
|
||||||
.getTextPositions()
|
.getCharacters()
|
||||||
.get(words.get(words.size() - 1).getTextPositions().size() - 1);
|
.get(words.get(words.size() - 1).getCharacters().size() - 1).getTextPosition();
|
||||||
}
|
}
|
||||||
|
|
||||||
if (i == 0 && (textPositions.get(i).getUnicode().equals(" ") || textPositions.get(i).getUnicode().equals("\u00A0") || textPositions.get(i).getUnicode().equals("\t"))) {
|
if (i == 0 && (textPositions.get(i).getUnicode().equals(" ") || textPositions.get(i).getUnicode().equals("\u00A0") || textPositions.get(i).getUnicode().equals("\t"))) {
|
||||||
|
|||||||
@ -11,6 +11,7 @@ import java.util.Set;
|
|||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.AngleFilter;
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.AngleFilter;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Line;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.TextBoundingBox;
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.TextBoundingBox;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.UnionFind;
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.UnionFind;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.DoubleUtils;
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.DoubleUtils;
|
||||||
@ -54,11 +55,20 @@ public class TextPositionOperations {
|
|||||||
|
|
||||||
private List<Word> sortUsingLineDetection(Set<Word> sequences) {
|
private List<Word> sortUsingLineDetection(Set<Word> sequences) {
|
||||||
|
|
||||||
return sortLines(groupByLine(sequences));
|
return sortWords(groupByLine(sequences));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public List<Word> sortLines(Collection<Set<Word>> lines) {
|
public List<Word> sortWords(Collection<Set<Word>> lines) {
|
||||||
|
|
||||||
|
return sortLines(lines).stream()
|
||||||
|
.map(Line::getWords)
|
||||||
|
.flatMap(Collection::stream)
|
||||||
|
.collect(Collectors.toList());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public List<Line> sortLines(Collection<Set<Word>> lines) {
|
||||||
|
|
||||||
List<List<Word>> lineBlocks = new ArrayList<>();
|
List<List<Word>> lineBlocks = new ArrayList<>();
|
||||||
for (Set<Word> line : lines) {
|
for (Set<Word> line : lines) {
|
||||||
@ -70,9 +80,9 @@ public class TextPositionOperations {
|
|||||||
// need to use old sorting, since COMPARATOR_DIR_ADJ is not transitive
|
// need to use old sorting, since COMPARATOR_DIR_ADJ is not transitive
|
||||||
QuickSort.sort(lineBlocks, Comparator.comparing(line -> line.get(0), COMPARATOR_DIR_ADJ));
|
QuickSort.sort(lineBlocks, Comparator.comparing(line -> line.get(0), COMPARATOR_DIR_ADJ));
|
||||||
|
|
||||||
List<Word> list = new ArrayList<>();
|
List<Line> list = new ArrayList<>();
|
||||||
for (List<Word> words : lineBlocks) {
|
for (List<Word> lineBlock : lineBlocks) {
|
||||||
list.addAll(words);
|
list.add(new Line(lineBlock));
|
||||||
}
|
}
|
||||||
return list;
|
return list;
|
||||||
}
|
}
|
||||||
@ -95,6 +105,12 @@ public class TextPositionOperations {
|
|||||||
.map(Word::getBBoxDirAdj)
|
.map(Word::getBBoxDirAdj)
|
||||||
.mapToDouble(RectangularShape::getWidth).average().orElse(75) * MAX_WORD_DISTANCE_FACTOR;
|
.mapToDouble(RectangularShape::getWidth).average().orElse(75) * MAX_WORD_DISTANCE_FACTOR;
|
||||||
|
|
||||||
|
return groupByLine(sequences, maxLineDistance, maxXGap);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public Collection<Set<Word>> groupByLine(Set<Word> sequences, double maxLineDistance, double maxXGap) {
|
||||||
|
|
||||||
UnionFind<Word> unionFind = new UnionFind<>(sequences);
|
UnionFind<Word> unionFind = new UnionFind<>(sequences);
|
||||||
|
|
||||||
for (Word sequence : sequences) {
|
for (Word sequence : sequences) {
|
||||||
|
|||||||
@ -27,6 +27,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.outline.Outlin
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.ListIdentifier;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.ListIdentifier;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextBlockOnPage;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
||||||
@ -273,7 +274,9 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
|
|||||||
AtomicInteger index = new AtomicInteger(0);
|
AtomicInteger index = new AtomicInteger(0);
|
||||||
zones.forEach(zone -> zone.getLines()
|
zones.forEach(zone -> zone.getLines()
|
||||||
.stream()
|
.stream()
|
||||||
.map(Line::getCharacters)
|
.map(Line::getWords)
|
||||||
|
.flatMap(Collection::stream)
|
||||||
|
.map(Word::getCharacters)
|
||||||
.flatMap(Collection::stream)
|
.flatMap(Collection::stream)
|
||||||
.forEach(character -> {
|
.forEach(character -> {
|
||||||
Color color = getRotatingColor(index);
|
Color color = getRotatingColor(index);
|
||||||
@ -330,8 +333,10 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
|
|||||||
|
|
||||||
private void addOutlineObject(OutlineObject outlineObject, PageInformation pageInformation) {
|
private void addOutlineObject(OutlineObject outlineObject, PageInformation pageInformation) {
|
||||||
|
|
||||||
|
if (!active) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
int rectSize = 5;
|
int rectSize = 5;
|
||||||
|
|
||||||
Point2D point2D;
|
Point2D point2D;
|
||||||
if (outlineObject.getPoint().isPresent()) {
|
if (outlineObject.getPoint().isPresent()) {
|
||||||
point2D = outlineObject.getPoint().get();
|
point2D = outlineObject.getPoint().get();
|
||||||
@ -355,10 +360,25 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
|
|||||||
|
|
||||||
public void addListIdentifiers(List<ListIdentifier> listIdentifiers) {
|
public void addListIdentifiers(List<ListIdentifier> listIdentifiers) {
|
||||||
|
|
||||||
|
if (!active) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
for (ListIdentifier listIdentifier : listIdentifiers) {
|
for (ListIdentifier listIdentifier : listIdentifiers) {
|
||||||
getOrCreateVisualizationsOnPage(listIdentifier.getPage(), this.listIdentifiers).getColoredRectangles()
|
getOrCreateVisualizationsOnPage(listIdentifier.getPage(), this.listIdentifiers).getColoredRectangles()
|
||||||
.add(new ColoredRectangle(listIdentifier.getWord().getBBoxPdf(), WORDS_COLOR, LINE_WIDTH));
|
.add(new ColoredRectangle(listIdentifier.getWord().getBBoxPdf(), WORDS_COLOR, LINE_WIDTH));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void addTocBlocks(Set<TextBlockOnPage> blocksWithNumberInCluster) {
|
||||||
|
|
||||||
|
if (!active) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
for (TextBlockOnPage textBlockOnPage : blocksWithNumberInCluster) {
|
||||||
|
getOrCreateVisualizationsOnPage(textBlockOnPage.page().getPageNumber(), this.tocBlocks).getColoredRectangles()
|
||||||
|
.add(new ColoredRectangle(textBlockOnPage.textBlock().getBBoxPdf(), TOC_COLOR, LINE_WIDTH));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -57,6 +57,7 @@ public record LayerIdentifier(String name, String markedContentName) {
|
|||||||
public static final LayerIdentifier OUTLINE_OBJECTS = new LayerIdentifier("Outline Positions", "OUTLINE_OBJECTS");
|
public static final LayerIdentifier OUTLINE_OBJECTS = new LayerIdentifier("Outline Positions", "OUTLINE_OBJECTS");
|
||||||
public static final LayerIdentifier SENTENCES = new LayerIdentifier("Sentences", "SENTENCES");
|
public static final LayerIdentifier SENTENCES = new LayerIdentifier("Sentences", "SENTENCES");
|
||||||
public static final LayerIdentifier TOC_PAGES = new LayerIdentifier("TOC pages", "TOC_PAGES");
|
public static final LayerIdentifier TOC_PAGES = new LayerIdentifier("TOC pages", "TOC_PAGES");
|
||||||
|
public static final LayerIdentifier TOC_BLOCKS = new LayerIdentifier("TOC blocks", "TOC_BLOCKS");
|
||||||
public static final LayerIdentifier LIST_IDENTIFIERS = new LayerIdentifier("List identifiers", "LIST_IDENTIFIERS");
|
public static final LayerIdentifier LIST_IDENTIFIERS = new LayerIdentifier("List identifiers", "LIST_IDENTIFIERS");
|
||||||
|
|
||||||
// Visual layout parser
|
// Visual layout parser
|
||||||
|
|||||||
@ -22,6 +22,7 @@ public class LayoutDebugLayerConfig extends AbstractLayerGroup {
|
|||||||
|
|
||||||
protected static final Color WORDS_COLOR = new Color(68, 84, 147);
|
protected static final Color WORDS_COLOR = new Color(68, 84, 147);
|
||||||
protected static final Color LINES_COLOR = new Color(152, 45, 179);
|
protected static final Color LINES_COLOR = new Color(152, 45, 179);
|
||||||
|
protected static final Color TOC_COLOR = new Color(33, 159, 144);
|
||||||
protected static final Color ZONES_COLOR = new Color(131, 38, 38);
|
protected static final Color ZONES_COLOR = new Color(131, 38, 38);
|
||||||
|
|
||||||
protected static final Color RULINGS_COLOR = new Color(21, 221, 174);
|
protected static final Color RULINGS_COLOR = new Color(21, 221, 174);
|
||||||
@ -59,6 +60,7 @@ public class LayoutDebugLayerConfig extends AbstractLayerGroup {
|
|||||||
protected final Visualizations outlineObjects = Visualizations.builder().layer(LayerIdentifier.OUTLINE_OBJECTS).build();
|
protected final Visualizations outlineObjects = Visualizations.builder().layer(LayerIdentifier.OUTLINE_OBJECTS).build();
|
||||||
protected final Visualizations sentences = Visualizations.builder().layer(LayerIdentifier.SENTENCES).build();
|
protected final Visualizations sentences = Visualizations.builder().layer(LayerIdentifier.SENTENCES).build();
|
||||||
protected final Visualizations tocPages = Visualizations.builder().layer(LayerIdentifier.TOC_PAGES).build();
|
protected final Visualizations tocPages = Visualizations.builder().layer(LayerIdentifier.TOC_PAGES).build();
|
||||||
|
protected final Visualizations tocBlocks = Visualizations.builder().layer(LayerIdentifier.TOC_BLOCKS).build();
|
||||||
protected final Visualizations listIdentifiers = Visualizations.builder().layer(LayerIdentifier.LIST_IDENTIFIERS).build();
|
protected final Visualizations listIdentifiers = Visualizations.builder().layer(LayerIdentifier.LIST_IDENTIFIERS).build();
|
||||||
|
|
||||||
|
|
||||||
@ -77,6 +79,7 @@ public class LayoutDebugLayerConfig extends AbstractLayerGroup {
|
|||||||
markedContent, //
|
markedContent, //
|
||||||
outlineObjects, //
|
outlineObjects, //
|
||||||
tocPages, //
|
tocPages, //
|
||||||
|
tocBlocks, //
|
||||||
listIdentifiers //
|
listIdentifiers //
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user