Compare commits
2 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
15b7c149b5 | ||
|
|
86997c880b |
@ -9,6 +9,7 @@ import java.io.IOException;
|
|||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
|
import java.util.HashSet;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.concurrent.atomic.AtomicReference;
|
import java.util.concurrent.atomic.AtomicReference;
|
||||||
@ -118,18 +119,14 @@ public class LayoutParsingPipeline {
|
|||||||
log.info("Starting layout parsing for {}", layoutParsingRequest.identifier());
|
log.info("Starting layout parsing for {}", layoutParsingRequest.identifier());
|
||||||
|
|
||||||
File originFile = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId());
|
File originFile = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId());
|
||||||
File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId())
|
File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId()).orElse(originFile);
|
||||||
.orElse(originFile);
|
|
||||||
|
|
||||||
VisualLayoutParsingResponse visualLayoutParsingResponse = layoutParsingRequest.visualLayoutParsingFileId()
|
VisualLayoutParsingResponse visualLayoutParsingResponse = layoutParsingRequest.visualLayoutParsingFileId()
|
||||||
.map(layoutParsingStorageService::getVisualLayoutParsingFile)
|
.map(layoutParsingStorageService::getVisualLayoutParsingFile).orElse(new VisualLayoutParsingResponse());
|
||||||
.orElse(new VisualLayoutParsingResponse());
|
|
||||||
ImageServiceResponse imageServiceResponse = layoutParsingRequest.imagesFileStorageId()
|
ImageServiceResponse imageServiceResponse = layoutParsingRequest.imagesFileStorageId()
|
||||||
.map(layoutParsingStorageService::getImagesFile)
|
.map(layoutParsingStorageService::getImagesFile).orElse(new ImageServiceResponse());
|
||||||
.orElse(new ImageServiceResponse());
|
|
||||||
TableServiceResponse tableServiceResponse = layoutParsingRequest.tablesFileStorageId()
|
TableServiceResponse tableServiceResponse = layoutParsingRequest.tablesFileStorageId()
|
||||||
.map(layoutParsingStorageService::getTablesFile)
|
.map(layoutParsingStorageService::getTablesFile).orElse(new TableServiceResponse());
|
||||||
.orElse(new TableServiceResponse());
|
|
||||||
|
|
||||||
ClassificationDocument classificationDocument = parseLayout(settings.getLayoutParsingTypeOverride() == null //
|
ClassificationDocument classificationDocument = parseLayout(settings.getLayoutParsingTypeOverride() == null //
|
||||||
? layoutParsingRequest.layoutParsingType() : settings.getLayoutParsingTypeOverride(),
|
? layoutParsingRequest.layoutParsingType() : settings.getLayoutParsingTypeOverride(),
|
||||||
@ -278,7 +275,9 @@ public class LayoutParsingPipeline {
|
|||||||
stripper.getText(originDocument);
|
stripper.getText(originDocument);
|
||||||
List<TextPositionSequence> words = stripper.getTextPositionSequences();
|
List<TextPositionSequence> words = stripper.getTextPositionSequences();
|
||||||
if (layoutParsingType.equals(LayoutParsingType.DOCUMINE_OLD)) {
|
if (layoutParsingType.equals(LayoutParsingType.DOCUMINE_OLD)) {
|
||||||
words = TextPositionOperations.sort(words);
|
var lines = TextPositionOperations.groupByLine(new HashSet<>(words));
|
||||||
|
classificationDocument.getLayoutDebugLayer().addLineVisualizationsFromNestedTextPosition(lines, pageNumber);
|
||||||
|
words = TextPositionOperations.sortLines(lines);
|
||||||
}
|
}
|
||||||
classificationDocument.getLayoutDebugLayer().addTextVisualizations(words, pageNumber);
|
classificationDocument.getLayoutDebugLayer().addTextVisualizations(words, pageNumber);
|
||||||
|
|
||||||
|
|||||||
@ -7,9 +7,12 @@ public class DoubleUtils {
|
|||||||
if (Double.isNaN(d1) || Double.isNaN(d2)) {
|
if (Double.isNaN(d1) || Double.isNaN(d2)) {
|
||||||
return Double.compare(d1, d2);
|
return Double.compare(d1, d2);
|
||||||
}
|
}
|
||||||
long i1 = Math.round(d1 / (precision == 0 ? 1 : precision));
|
|
||||||
long i2 = Math.round(d2 / (precision == 0 ? 1 : precision));
|
if (Math.abs(d1 - d2) < precision) {
|
||||||
return Long.compare(i1, i2);
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
return Double.compare(d1, d2);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -27,7 +27,7 @@ public class TextPositionOperations {
|
|||||||
public static final double MAX_LINE_HEIGHT_FACTOR = 0.66; // multiplied with max word height
|
public static final double MAX_LINE_HEIGHT_FACTOR = 0.66; // multiplied with max word height
|
||||||
public static final double MAX_WORD_DISTANCE_FACTOR = 3.5; // multiplied with max word width
|
public static final double MAX_WORD_DISTANCE_FACTOR = 3.5; // multiplied with max word width
|
||||||
|
|
||||||
private static final double Y_THRESHOLD = 6;
|
private static final double Y_THRESHOLD = 5;
|
||||||
private static final double X_THRESHOLD = 3;
|
private static final double X_THRESHOLD = 3;
|
||||||
private static final Comparator<TextBoundingBox> COMPARATOR_DIR_ADJ = //
|
private static final Comparator<TextBoundingBox> COMPARATOR_DIR_ADJ = //
|
||||||
Comparator.comparing(TextBoundingBox::getDir)
|
Comparator.comparing(TextBoundingBox::getDir)
|
||||||
@ -53,13 +53,19 @@ public class TextPositionOperations {
|
|||||||
|
|
||||||
private List<TextPositionSequence> sortUsingLineDetection(Set<TextPositionSequence> sequences) {
|
private List<TextPositionSequence> sortUsingLineDetection(Set<TextPositionSequence> sequences) {
|
||||||
|
|
||||||
return groupByLine(sequences).stream()
|
return sortLines(groupByLine(sequences));
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public List<TextPositionSequence> sortLines(Collection<Set<TextPositionSequence>> lines) {
|
||||||
|
|
||||||
|
return lines.stream()
|
||||||
.map(TextPositionOperations::sortByXDirAdj)
|
.map(TextPositionOperations::sortByXDirAdj)
|
||||||
.filter(line -> !line.isEmpty())
|
.filter(line -> !line.isEmpty())
|
||||||
.sorted(Comparator.comparing(line -> line.get(0), COMPARATOR_DIR_ADJ))
|
.sorted(Comparator.comparing(line -> line.get(0), COMPARATOR_DIR_ADJ))
|
||||||
.flatMap(Collection::stream)
|
.flatMap(Collection::stream)
|
||||||
.toList();
|
.toList();
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -71,7 +77,7 @@ public class TextPositionOperations {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private Collection<Set<TextPositionSequence>> groupByLine(Set<TextPositionSequence> sequences) {
|
public Collection<Set<TextPositionSequence>> groupByLine(Set<TextPositionSequence> sequences) {
|
||||||
|
|
||||||
double maxLineDistance = sequences.stream()
|
double maxLineDistance = sequences.stream()
|
||||||
.map(TextPositionSequence::getBBoxDirAdj)
|
.map(TextPositionSequence::getBBoxDirAdj)
|
||||||
|
|||||||
@ -6,6 +6,7 @@ import java.awt.geom.Point2D;
|
|||||||
import java.awt.geom.Rectangle2D;
|
import java.awt.geom.Rectangle2D;
|
||||||
import java.util.Collection;
|
import java.util.Collection;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.Set;
|
||||||
import java.util.concurrent.atomic.AtomicInteger;
|
import java.util.concurrent.atomic.AtomicInteger;
|
||||||
|
|
||||||
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
|
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
|
||||||
@ -150,6 +151,23 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
public void addLineVisualizationsFromNestedTextPosition(Collection<Set<TextPositionSequence>> lines, int pageNumber) {
|
||||||
|
|
||||||
|
if (!active) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(pageNumber, this.lines);
|
||||||
|
visualizationsOnPage.getColoredRectangles()
|
||||||
|
.addAll(lines.stream()
|
||||||
|
.map(line -> line.stream()
|
||||||
|
.map(BoundingBox::getBBoxPdf)
|
||||||
|
.collect(RectangleTransformations.collectBBox()))
|
||||||
|
.map(line -> new ColoredRectangle(line, LINES_COLOR, 0.5f))
|
||||||
|
.toList());
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
public void addTextBlockVisualizations(List<TextPageBlock> textPageBlocks, int page) {
|
public void addTextBlockVisualizations(List<TextPageBlock> textPageBlocks, int page) {
|
||||||
|
|
||||||
if (!active) {
|
if (!active) {
|
||||||
@ -235,4 +253,5 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user