diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java index 4d5d4d2..eb93d8e 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java @@ -26,6 +26,8 @@ import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.No import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; +import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Character; +import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection; import com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationService; @@ -158,9 +160,9 @@ public class LayoutParsingPipeline { } if (!viewerDocumentFile.equals(originFile)) { - viewerDocumentFile.delete(); + assert !viewerDocumentFile.exists() || viewerDocumentFile.delete(); } - originFile.delete(); + assert !originFile.exists() || originFile.delete(); return LayoutParsingFinishedEvent.builder() .identifier(layoutParsingRequest.identifier()) @@ -240,12 +242,8 @@ public class LayoutParsingPipeline { } List classificationPages = new ArrayList<>(); - OutlineObject lastProcessedOutlineObject = null; - // parsing the structure elements could be useful as well - if (layoutParsingType != LayoutParsingType.REDACT_MANAGER_OLD) { - classificationDocument.setOutlineObjectTree(outlineExtractorService.getOutlineObjectTree(originDocument)); - } + classificationDocument.setOutlineObjectTree(outlineExtractorService.getOutlineObjectTree(originDocument)); long pageCount = originDocument.getNumberOfPages(); @@ -277,18 +275,15 @@ public class LayoutParsingPipeline { if (layoutParsingType.equals(LayoutParsingType.DOCUMINE_OLD)) { var lines = TextPositionOperations.groupByLine(new HashSet<>(words)); classificationDocument.getLayoutDebugLayer().addLineVisualizationsFromNestedTextPosition(lines, pageNumber); - words = TextPositionOperations.sortLines(lines); + words = TextPositionOperations.sortWords(lines); } classificationDocument.getLayoutDebugLayer().addTextVisualizations(words, pageNumber); PDRectangle pdr = pdPage.getMediaBox(); - int rotation = pdPage.getRotation(); - boolean isLandscape = pdr.getWidth() > pdr.getHeight() && (rotation == 0 || rotation == 180) || pdr.getHeight() > pdr.getWidth() && (rotation == 90 || rotation == 270); - - PDRectangle cropbox = pdPage.getCropBox(); - classificationDocument.getLayoutDebugLayer().addRulingVisualization(stripper.getRulings(), pageNumber); - CleanRulings cleanRulings = rulingCleaningService.deduplicateAndStraightenRulings(pdfTableCells.get(pageNumber), stripper.getRulings()); + List rulings = stripper.getRulings(); + classificationDocument.getLayoutDebugLayer().addRulingVisualization(rulings, pageNumber); + CleanRulings cleanRulings = rulingCleaningService.deduplicateAndStraightenRulings(pdfTableCells.get(pageNumber), rulings); PageInformation pageInformation = PageInformation.fromPDPage(pageNumber, pdPage); List emptyTableCells = TableExtractionService.findCells(cleanRulings.getHorizontals(), cleanRulings.getVerticals(), pageInformation); @@ -308,8 +303,7 @@ public class LayoutParsingPipeline { .toList()); ClassificationPage classificationPage = switch (layoutParsingType) { - case REDACT_MANAGER_OLD -> - redactManagerBlockificationService.blockify(stripper.getWords(), cleanRulings, classificationDocument.getLayoutDebugLayer()); + case REDACT_MANAGER_OLD -> redactManagerBlockificationService.blockify(stripper.getWords(), cleanRulings, classificationDocument.getLayoutDebugLayer()); case DOCUMINE_OLD -> docuMineBlockificationService.blockify(words, cleanRulings); case DOCUMINE, REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH -> docstrumBlockificationService.blockify(words, cleanRulings, true, classificationDocument.getLayoutDebugLayer(), layoutParsingType); @@ -317,27 +311,9 @@ public class LayoutParsingPipeline { docstrumBlockificationService.blockify(words, cleanRulings, false, classificationDocument.getLayoutDebugLayer(), layoutParsingType); }; - classificationPage.setCleanRulings(cleanRulings); - classificationPage.setRotation(rotation); - classificationPage.setLandscape(isLandscape); - classificationPage.setPageNumber(pageNumber); - classificationPage.setPageWidth(cropbox.getWidth()); - classificationPage.setPageHeight(cropbox.getHeight()); + updateClassificationPage(pdPage, pdr, classificationPage, cleanRulings, pageNumber, pageInformation); - if (layoutParsingType != LayoutParsingType.REDACT_MANAGER_OLD) { - List outlineObjects = classificationDocument.getOutlineObjectTree().getOutlineObjectsPerPage().getOrDefault(pageNumber, new ArrayList<>()); - - OutlineObject notFoundOutlineObject = null; - if (lastProcessedOutlineObject != null && !lastProcessedOutlineObject.isFound()) { - lastProcessedOutlineObject.resetPoint(); - notFoundOutlineObject = lastProcessedOutlineObject; - } - if (!outlineObjects.isEmpty()) { - classificationPage.setOutlineObjects(outlineObjects); - lastProcessedOutlineObject = blockificationPostprocessingService.sanitizeOutlineBlocks(classificationPage, notFoundOutlineObject); - } - classificationDocument.getLayoutDebugLayer().addOutlineObjects(outlineObjects, pageInformation); - } + blockificationPostprocessingService.findHeadlinesFromOutline(classificationDocument, pageNumber, classificationPage, pageInformation); classificationDocument.getLayoutDebugLayer().addMarkedContentVisualizations(stripper.getMarkedContents(), pageNumber); // MarkedContent needs to be converted at this point, otherwise it leads to GC Problems in Pdfbox. @@ -383,13 +359,31 @@ public class LayoutParsingPipeline { } + private static void updateClassificationPage(PDPage pdPage, + PDRectangle pdr, + ClassificationPage classificationPage, + CleanRulings cleanRulings, + int pageNumber, + PageInformation pageInformation) { + + int rotation = pdPage.getRotation(); + boolean isLandscape = pdr.getWidth() > pdr.getHeight() && (rotation == 0 || rotation == 180) || pdr.getHeight() > pdr.getWidth() && (rotation == 90 || rotation == 270); + classificationPage.setCleanRulings(cleanRulings); + classificationPage.setRotation(rotation); + classificationPage.setLandscape(isLandscape); + classificationPage.setPageNumber(pageNumber); + classificationPage.setPageWidth((float) pageInformation.width()); + classificationPage.setPageHeight((float) pageInformation.height()); + } + + private static void rotateDirAdjExactly(List words, PDPage pdPage) { for (TextDirection dir : TextDirection.values()) { - double averageRotation = words.stream() - .map(Word::getTextPositions) + .map(Word::getCharacters) .flatMap(Collection::stream) + .map(Character::getTextPosition) .filter(pos -> pos.getDir().equals(dir)) .mapToDouble(RedTextPosition::getExactDir).average().orElse(0); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/DocstrumSegmentationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/DocstrumSegmentationService.java index 8f3c9f6..476d4c3 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/DocstrumSegmentationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/DocstrumSegmentationService.java @@ -80,16 +80,12 @@ public class DocstrumSegmentationService { private List computeZones(List textPositions, CleanRulings rulings, LayoutDebugLayer visualizations, TextDirection direction) { - List positions = textPositions.stream() + List characters = textPositions.stream() .filter(t -> t.getDir() == direction) - .map(Word::getTextPositions) + .map(Word::getCharacters) .flatMap(List::stream) .toList(); - List characters = positions.stream() - .map(Character::new) - .collect(Collectors.toList()); - nearestNeighbourService.findNearestNeighbors(characters); double characterSpacing = spacingService.computeCharacterSpacing(characters); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/Line.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/Line.java index eb999c7..4439bba 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/Line.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/Line.java @@ -36,18 +36,13 @@ public class Line extends TextBoundingBox { @EqualsAndHashCode.Include private final double y1; - private final double height; - private FontStyle fontStyle; - private final List characters; - private final List words = new ArrayList<>(); + private final List words; public Line(List characters, double wordSpacing) { - this.characters = characters; - if (characters.size() >= 2) { // linear regression double sx = 0.0; @@ -76,13 +71,25 @@ public class Line extends TextBoundingBox { this.y0 = character.getY() - dy; this.y1 = character.getY() + dy; } - height = computeHeight(); - computeWords(wordSpacing * WORD_DISTANCE_MULTIPLIER); + this.words = new ArrayList<>(); + computeWords(characters, wordSpacing * WORD_DISTANCE_MULTIPLIER); buildBBox(); computeFontStyle(); } + public Line(List words) { + + this.words = words; + buildBBox(); + x0 = getMinX(); + y0 = getMinY(); + x1 = getMaxX(); + y1 = getMaxY(); + computeFontStyle(); + } + + private void computeFontStyle() { EnumMap fontStyleCounter = new EnumMap<>(FontStyle.class); @@ -100,8 +107,7 @@ public class Line extends TextBoundingBox { fontStyle = fontStyleCounter.entrySet() .stream() .max(Comparator.comparing(entry -> entry.getValue().get())) - .map(Map.Entry::getKey) - .orElse(FontStyle.REGULAR); + .map(Map.Entry::getKey).orElse(FontStyle.REGULAR); } @@ -117,14 +123,6 @@ public class Line extends TextBoundingBox { } - private double computeHeight() { - - return characters.stream() - .map(Character::getHeight) - .reduce(0d, Double::sum) / characters.size(); - } - - public double angularDifference(Line j) { double diff = Math.abs(getAngle() - j.getAngle()); @@ -157,7 +155,7 @@ public class Line extends TextBoundingBox { } - private void computeWords(double wordSpacing) { + private void computeWords(List characters, double wordSpacing) { Word word = new Word(); Character previous = null; @@ -169,7 +167,7 @@ public class Line extends TextBoundingBox { word = new Word(); } } - word.getTextPositions().add(current.getTextPosition()); + word.add(current); previous = current; } words.add(word); @@ -178,9 +176,7 @@ public class Line extends TextBoundingBox { private void buildBBox() { - this.setToBBoxOfComponents(characters.stream() - .map(Character::getTextPosition) - .toList()); + this.setToBBoxOfComponents(words); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/Zone.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/Zone.java index cc02fd8..f803b33 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/Zone.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/Zone.java @@ -18,7 +18,6 @@ public class Zone extends TextBoundingBox { @SuppressWarnings("PMD.ConstructorCallsOverridableMethod") public Zone(List lines) { - lines.sort(Comparator.comparingDouble(Line::getY0)); this.lines = lines; setToBBoxOfComponents(lines); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/service/ZoneBuilderService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/service/ZoneBuilderService.java index e222e23..bfdcd5c 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/service/ZoneBuilderService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/service/ZoneBuilderService.java @@ -1,9 +1,7 @@ package com.knecon.fforesight.service.layoutparser.processor.docstrum.service; -import static com.knecon.fforesight.service.layoutparser.processor.model.SectionIdentifier.numericalIdentifierPattern; - import java.util.ArrayList; -import java.util.Comparator; +import java.util.Collection; import java.util.HashSet; import java.util.List; import java.util.Set; @@ -11,11 +9,12 @@ import java.util.stream.Collectors; import org.springframework.stereotype.Service; -import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Character; import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Line; import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.UnionFind; import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone; import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; +import com.knecon.fforesight.service.layoutparser.processor.model.text.Word; +import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionOperations; @Service public class ZoneBuilderService { @@ -31,7 +30,7 @@ public class ZoneBuilderService { private static final double MAX_LINE_SIZE_SCALE = 2.5; - private static final double ANGLE_TOLERANCE = Math.PI / 6; + private static final double ANGLE_TOLERANCE = Math.toRadians(5); private static final double MAX_VERTICAL_MERGE_DISTANCE = 0.5; @@ -114,64 +113,14 @@ public class ZoneBuilderService { private Zone mergeLinesInZone(List lines, double characterSpacing, double lineSpacing) { - double maxHorizontalDistance = 0; - double minVerticalDistance = 0; - double maxVerticalDistance = lineSpacing * MAX_VERTICAL_MERGE_DISTANCE; + Set words = lines.stream() + .map(Line::getWords) + .flatMap(Collection::stream) + .collect(Collectors.toSet()); + Collection> groupedLines = TextPositionOperations.groupByLine(words); - UnionFind unionFind = new UnionFind<>(new HashSet<>(lines)); - - lines.forEach(outer -> { - lines.forEach(inner -> { - if (inner == outer) { - return; - } - - double horizontalDistance = outer.horizontalDistance(inner); - double verticalDistance = outer.verticalDistance(inner); - - if (horizontalDistance <= maxHorizontalDistance && minVerticalDistance <= verticalDistance && verticalDistance <= maxVerticalDistance) { - - unionFind.union(outer, inner); - - } else if (minVerticalDistance <= verticalDistance - && verticalDistance <= maxVerticalDistance - && Math.abs(horizontalDistance - Math.min(outer.getLength(), inner.getLength())) < 0.1) { - - boolean characterOverlap = false; - int overlappingCount = 0; - for (Character outerCharacter : outer.getCharacters()) { - for (Character innerCharacter : inner.getCharacters()) { - double characterOverlapDistance = outerCharacter.overlappingDistance(innerCharacter); - if (characterOverlapDistance > 2) { - characterOverlap = true; - } - if (characterOverlapDistance > 0) { - overlappingCount++; - } - } - } - if (!characterOverlap && overlappingCount <= 2) { - unionFind.union(outer, inner); - } - } - - }); - }); - - List outputZone = new ArrayList<>(); - for (Set group : unionFind.getGroups()) { - List characters = new ArrayList<>(); - for (Line line : group) { - characters.addAll(line.getCharacters()); - } - characters.sort(Comparator.comparingDouble(Character::getX)); - - outputZone.add(new Line(characters, characterSpacing)); - } - - return new Zone(outputZone.stream() - .sorted(Comparator.comparing(Line::getY0)) - .collect(Collectors.toList())); + List sortedLines = TextPositionOperations.sortLines(groupedLines); + return new Zone(sortedLines); } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/Word.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/Word.java index 3434d22..0e5a647 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/Word.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/Word.java @@ -13,6 +13,7 @@ import java.util.stream.Collectors; import org.apache.pdfbox.text.TextPosition; +import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Character; import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.TextBoundingBox; import lombok.AllArgsConstructor; @@ -38,8 +39,7 @@ public class Word extends TextBoundingBox implements CharSequence { private int page; @Builder.Default - private List textPositions = new ArrayList<>(); - + private List characters = new ArrayList<>(); private boolean isParagraphStart; private boolean strikethrough; private boolean underline; @@ -49,8 +49,9 @@ public class Word extends TextBoundingBox implements CharSequence { public Word(List textPositions, int pageNumber, boolean isParagraphStart) { - this.textPositions = textPositions.stream() + this.characters = textPositions.stream() .map(RedTextPosition::fromTextPosition) + .map(Character::new) .collect(Collectors.toList()); this.page = pageNumber; this.isParagraphStart = isParagraphStart; @@ -65,9 +66,9 @@ public class Word extends TextBoundingBox implements CharSequence { } - public Word(List textPositions, int page) { + public Word(List textPositions, int page) { - this.textPositions = textPositions; + this.characters = new ArrayList<>(textPositions); this.page = page; calculateBBoxAndHashcode(); } @@ -76,7 +77,7 @@ public class Word extends TextBoundingBox implements CharSequence { @Override public int length() { - return textPositions.size(); + return characters.size(); } @@ -101,7 +102,7 @@ public class Word extends TextBoundingBox implements CharSequence { public Word subSequence(int start, int end) { var textPositionSequence = new Word(); - textPositionSequence.textPositions = textPositions.subList(start, end); + textPositionSequence.characters = characters.subList(start, end); textPositionSequence.page = page; textPositionSequence.dir = dir; textPositionSequence.setToBBoxOfComponents(getTextPositions()); @@ -122,53 +123,59 @@ public class Word extends TextBoundingBox implements CharSequence { public RedTextPosition textPositionAt(int index) { - return textPositions.get(index); + return characters.get(index).getTextPosition(); } public void add(Word word, RedTextPosition textPosition) { - this.textPositions.add(textPosition); + this.characters.add(new Character(textPosition)); this.page = word.getPage(); calculateBBoxAndHashcode(); } + public void add(Character current) { + + characters.add(current); + calculateBBoxAndHashcode(); + } + + public void add(TextPosition textPosition) { - this.textPositions.add(RedTextPosition.fromTextPosition(textPosition)); - calculateBBoxAndHashcode(); + add(new Character(RedTextPosition.fromTextPosition(textPosition))); } public double getTextHeightNoPadding() { - return textPositions.get(0).getHeightDirAdj(); + return characters.get(0).getTextPosition().getHeightDirAdj(); } public double getTextHeight() { - return textPositions.get(0).getHeightDirAdj() + HEIGHT_PADDING; + return characters.get(0).getTextPosition().getHeightDirAdj() + HEIGHT_PADDING; } public String getFont() { - if (textPositions.get(0).getFontName() == null) { + if (characters.get(0).getTextPosition().getFontName() == null) { return "none"; } - return FONT_CLEANER.matcher(textPositions.get(0).getFontName().toLowerCase(Locale.ROOT)).replaceAll(""); + return FONT_CLEANER.matcher(characters.get(0).getTextPosition().getFontName().toLowerCase(Locale.ROOT)).replaceAll(""); } public String getFontStyle() { - if (textPositions.get(0).getFontName() == null) { + if (characters.get(0).getTextPosition().getFontName() == null) { return STANDARD; } - String lowercaseFontName = textPositions.get(0).getFontName().toLowerCase(Locale.ROOT); + String lowercaseFontName = characters.get(0).getTextPosition().getFontName().toLowerCase(Locale.ROOT); if (lowercaseFontName.contains(BOLD) && lowercaseFontName.contains(ITALIC)) { return BOLD_ITALIC; @@ -184,13 +191,13 @@ public class Word extends TextBoundingBox implements CharSequence { public float getFontSize() { - return textPositions.get(0).getFontSizeInPt(); + return characters.get(0).getTextPosition().getFontSizeInPt(); } public float getSpaceWidth() { - return textPositions.get(0).getWidthOfSpace(); + return characters.get(0).getTextPosition().getWidthOfSpace(); } @@ -244,6 +251,14 @@ public class Word extends TextBoundingBox implements CharSequence { } + private List getTextPositions() { + + return characters.stream() + .map(Character::getTextPosition) + .toList(); + } + + public void transform(AffineTransform rotateInstance) { for (RedTextPosition textPosition : getTextPositions()) { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/BlockificationPostprocessingService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/BlockificationPostprocessingService.java index 3bb00f7..b8d9e21 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/BlockificationPostprocessingService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/BlockificationPostprocessingService.java @@ -11,12 +11,14 @@ import org.apache.commons.text.similarity.LevenshteinDistance; import org.springframework.stereotype.Service; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngineProto.LayoutEngine; +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType; import com.knecon.fforesight.service.layoutparser.processor.model.SectionIdentifier; import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.Word; +import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation; import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities; import lombok.Data; @@ -27,6 +29,24 @@ public class BlockificationPostprocessingService { private static final float STRING_SIMILARITY_THRESHOLD = 0.1f; + public void findHeadlinesFromOutline(ClassificationDocument classificationDocument, int pageNumber, ClassificationPage classificationPage, PageInformation pageInformation) { + + OutlineObject lastProcessedOutlineObject = null; + List outlineObjects = classificationDocument.getOutlineObjectTree().getOutlineObjectsPerPage().getOrDefault(pageNumber, new ArrayList<>()); + + OutlineObject notFoundOutlineObject = null; + if (lastProcessedOutlineObject != null && !lastProcessedOutlineObject.isFound()) { + lastProcessedOutlineObject.resetPoint(); + notFoundOutlineObject = lastProcessedOutlineObject; + } + if (!outlineObjects.isEmpty()) { + classificationPage.setOutlineObjects(outlineObjects); + lastProcessedOutlineObject = sanitizeOutlineBlocks(classificationPage, notFoundOutlineObject); + } + classificationDocument.getLayoutDebugLayer().addOutlineObjects(outlineObjects, pageInformation); + } + + public OutlineObject sanitizeOutlineBlocks(ClassificationPage classificationPage, OutlineObject notFoundOutlineObject) { List outlineObjects = classificationPage.getOutlineObjects(); @@ -329,8 +349,8 @@ public class BlockificationPostprocessingService { if (index > 0) { in = createSubSequence(sequence, 0, index); - } else if (endIndex < sequence.getTextPositions().size()) { - in = createSubSequence(sequence, endIndex, sequence.getTextPositions().size()); + } else if (endIndex < sequence.length()) { + in = createSubSequence(sequence, endIndex, sequence.length()); } return new SplitSequenceResult(in, out); @@ -339,7 +359,7 @@ public class BlockificationPostprocessingService { private static Word createSubSequence(Word sequence, int start, int end) { - Word newSeq = new Word(new ArrayList<>(sequence.getTextPositions().subList(start, end)), sequence.getPage()); + Word newSeq = new Word(new ArrayList<>(sequence.getCharacters().subList(start, end)), sequence.getPage()); newSeq.setParagraphStart(sequence.isParagraphStart()); return newSeq; } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java index edaee05..48fb851 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java @@ -77,7 +77,7 @@ public class DocstrumBlockificationService { .forEach(line -> { line.getWords() .forEach(word -> { - words.add(new Word(word.getTextPositions(), word.getPage())); + words.add(new Word(word.getCharacters(), word.getPage())); }); }); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/ClarifyndClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/ClarifyndClassificationService.java index c47ac9a..e797dff 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/ClarifyndClassificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/ClarifyndClassificationService.java @@ -81,7 +81,7 @@ public class ClarifyndClassificationService { && (textBlock.getMostPopularWordStyle().equals("bold") || !document.getFontStyleCounter().getCountPerValue().containsKey("bold") && textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular() + 1) - && textBlock.getWords().get(0).getTextPositions().get(0).getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) { + && textBlock.getWords().get(0).getFontSize()>= textBlock.getMostPopularWordFontSize()) { PageBlockType headlineType = HeadlineClassificationService.headlineClassByFontSize(textBlock, headlineFontSizes); headlineClassificationService.classifyHeadline(textBlock, headlineType); @@ -91,7 +91,7 @@ public class ClarifyndClassificationService { && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold") && PositionUtils.getApproxLineCount(textBlock) < 2.9 - && textBlock.getWords().get(0).getTextPositions().get(0).getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) { + && textBlock.getWords().get(0).getFontSize() >= textBlock.getMostPopularWordFontSize()) { PageBlockType headlineType = HeadlineClassificationService.headlineClassByFontSize(textBlock, headlineFontSizes); headlineClassificationService.classifyHeadline(textBlock, headlineType); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/RedactManagerClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/RedactManagerClassificationService.java index 2683f61..01971ea 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/RedactManagerClassificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/RedactManagerClassificationService.java @@ -94,7 +94,7 @@ public class RedactManagerClassificationService { && (textBlock.getMostPopularWordStyle().equals("bold") || !document.getFontStyleCounter().getCountPerValue().containsKey("bold") && textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular() + 1) - && textBlock.getWords().get(0).getTextPositions().get(0).getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) { + && textBlock.getWords().get(0).getFontSize() >= textBlock.getMostPopularWordFontSize()) { PageBlockType headlineType = HeadlineClassificationService.headlineClassByFontSize(textBlock, headlineFontSizes); headlineClassificationService.classifyHeadline(textBlock, headlineType); @@ -104,7 +104,7 @@ public class RedactManagerClassificationService { && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold") && PositionUtils.getApproxLineCount(textBlock) < 2.9 - && textBlock.getWords().get(0).getTextPositions().get(0).getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) { + && textBlock.getWords().get(0).getFontSize() >= textBlock.getMostPopularWordFontSize()) { PageBlockType headlineType = HeadlineClassificationService.headlineClassByFontSize(textBlock, headlineFontSizes); headlineClassificationService.classifyHeadline(textBlock, headlineType); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SearchTextWithTextPositionFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SearchTextWithTextPositionFactory.java index 856cba4..cf89ce4 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SearchTextWithTextPositionFactory.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SearchTextWithTextPositionFactory.java @@ -10,6 +10,7 @@ import java.util.List; import java.util.Locale; import java.util.Objects; +import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Character; import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange; import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition; import com.knecon.fforesight.service.layoutparser.processor.model.text.Word; @@ -31,19 +32,19 @@ public class SearchTextWithTextPositionFactory { public SearchTextWithTextPositionDto buildSearchTextToTextPositionDto(List sequences) { if (sequences.isEmpty() || sequences.stream() - .allMatch(sequence -> sequence.getTextPositions().isEmpty())) { + .allMatch(sequence -> sequence.getCharacters().isEmpty())) { return SearchTextWithTextPositionDto.empty(); } Context context = new Context(); - RedTextPosition currentTextPosition = sequences.get(0).getTextPositions().get(0); + RedTextPosition currentTextPosition = sequences.get(0).getCharacters().get(0).getTextPosition(); RedTextPosition previousTextPosition = RedTextPosition.builder().unicode(" ").bBoxDirAdj(currentTextPosition.getBBoxDirAdj()).build(); for (Word word : sequences) { - for (int i = 0; i < word.getTextPositions().size(); ++i) { + for (int i = 0; i < word.getCharacters().size(); ++i) { - currentTextPosition = word.getTextPositions().get(i); + currentTextPosition = word.getCharacters().get(i).getTextPosition(); if (isLineBreak(currentTextPosition, previousTextPosition)) { removeHyphenLinebreaks(context); context.lineBreaksStringIdx.add(context.stringIdx); @@ -66,8 +67,9 @@ public class SearchTextWithTextPositionFactory { } List positions = sequences.stream() - .map(Word::getTextPositions) + .map(Word::getCharacters) .flatMap(Collection::stream) + .map(Character::getTextPosition) .map(RedTextPosition::getBBoxPdf) .toList(); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFLinesTextStripper.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFLinesTextStripper.java index c4985bd..d05dd66 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFLinesTextStripper.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFLinesTextStripper.java @@ -248,8 +248,8 @@ public class PDFLinesTextStripper extends PDFTextStripper { if (!words.isEmpty()) { previous = words.get(words.size() - 1) - .getTextPositions() - .get(words.get(words.size() - 1).getTextPositions().size() - 1); + .getCharacters() + .get(words.get(words.size() - 1).getCharacters().size() - 1).getTextPosition(); } if (i == 0 && (textPositions.get(i).getUnicode().equals(" ") || textPositions.get(i).getUnicode().equals("\u00A0") || textPositions.get(i).getUnicode().equals("\t"))) { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionOperations.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionOperations.java index 442bf8a..0830a42 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionOperations.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionOperations.java @@ -11,6 +11,7 @@ import java.util.Set; import java.util.stream.Collectors; import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.AngleFilter; +import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Line; import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.TextBoundingBox; import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.UnionFind; import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.DoubleUtils; @@ -54,11 +55,20 @@ public class TextPositionOperations { private List sortUsingLineDetection(Set sequences) { - return sortLines(groupByLine(sequences)); + return sortWords(groupByLine(sequences)); } - public List sortLines(Collection> lines) { + public List sortWords(Collection> lines) { + + return sortLines(lines).stream() + .map(Line::getWords) + .flatMap(Collection::stream) + .collect(Collectors.toList()); + } + + + public List sortLines(Collection> lines) { List> lineBlocks = new ArrayList<>(); for (Set line : lines) { @@ -70,9 +80,9 @@ public class TextPositionOperations { // need to use old sorting, since COMPARATOR_DIR_ADJ is not transitive QuickSort.sort(lineBlocks, Comparator.comparing(line -> line.get(0), COMPARATOR_DIR_ADJ)); - List list = new ArrayList<>(); - for (List words : lineBlocks) { - list.addAll(words); + List list = new ArrayList<>(); + for (List lineBlock : lineBlocks) { + list.add(new Line(lineBlock)); } return list; } @@ -95,6 +105,12 @@ public class TextPositionOperations { .map(Word::getBBoxDirAdj) .mapToDouble(RectangularShape::getWidth).average().orElse(75) * MAX_WORD_DISTANCE_FACTOR; + return groupByLine(sequences, maxLineDistance, maxXGap); + } + + + public Collection> groupByLine(Set sequences, double maxLineDistance, double maxXGap) { + UnionFind unionFind = new UnionFind<>(sequences); for (Word sequence : sequences) { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutDebugLayer.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutDebugLayer.java index 7fb781c..3a20603 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutDebugLayer.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutDebugLayer.java @@ -273,7 +273,9 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig { AtomicInteger index = new AtomicInteger(0); zones.forEach(zone -> zone.getLines() .stream() - .map(Line::getCharacters) + .map(Line::getWords) + .flatMap(Collection::stream) + .map(Word::getCharacters) .flatMap(Collection::stream) .forEach(character -> { Color color = getRotatingColor(index);