Compare commits

...

9 Commits

Author SHA1 Message Date
maverickstuder
f90cb20156 RED-8666 2024-03-04 15:18:13 +01:00
Dominique Eifländer
72202f63dc More 2024-02-16 14:51:26 +01:00
Dominique Eifländer
e14d953b04 More 2024-02-16 14:50:31 +01:00
Dominique Eifländer
9e5778d4b2 More 2024-02-16 14:08:59 +01:00
Dominique Eifländer
e394f2fa7c More refactoring 2024-02-16 13:48:03 +01:00
Dominique Eifländer
b2fb6829cb More refactoring 2024-02-16 11:15:44 +01:00
Dominique Eifländer
4871e55f2d More refactoring 2024-02-15 16:54:07 +01:00
Dominique Eifländer
4de6c12aec REmove more 2024-02-15 10:29:39 +01:00
Dominique Eifländer
4afa8daafa First working docstrum 2024-02-14 13:39:27 +01:00
40 changed files with 2320 additions and 18 deletions

View File

@ -10,6 +10,7 @@ import java.util.ArrayList;
import java.util.HashMap; import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicReference; import java.util.concurrent.atomic.AtomicReference;
import org.apache.pdfbox.Loader; import org.apache.pdfbox.Loader;
@ -26,6 +27,7 @@ import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsi
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationSection;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage; import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
@ -47,6 +49,7 @@ import com.knecon.fforesight.service.layoutparser.processor.services.blockificat
import com.knecon.fforesight.service.layoutparser.processor.services.classification.DocuMineClassificationService; import com.knecon.fforesight.service.layoutparser.processor.services.classification.DocuMineClassificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService; import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.classification.TaasClassificationService; import com.knecon.fforesight.service.layoutparser.processor.services.classification.TaasClassificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.DocstrumSegmentationService;
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory; import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper; import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper;
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.TaasDocumentDataMapper; import com.knecon.fforesight.service.layoutparser.processor.services.mapper.TaasDocumentDataMapper;
@ -86,6 +89,7 @@ public class LayoutParsingPipeline {
RedactManagerBlockificationService redactManagerBlockificationService; RedactManagerBlockificationService redactManagerBlockificationService;
LayoutGridService layoutGridService; LayoutGridService layoutGridService;
ObservationRegistry observationRegistry; ObservationRegistry observationRegistry;
DocstrumSegmentationService docstrumSegmentationService;
public LayoutParsingFinishedEvent parseLayoutAndSaveFilesToStorage(LayoutParsingRequest layoutParsingRequest) throws IOException { public LayoutParsingFinishedEvent parseLayoutAndSaveFilesToStorage(LayoutParsingRequest layoutParsingRequest) throws IOException {
@ -243,11 +247,37 @@ public class LayoutParsingPipeline {
PDRectangle cropbox = pdPage.getCropBox(); PDRectangle cropbox = pdPage.getCropBox();
CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(pdfTableCells.get(pageNumber), stripper.getRulings()); CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(pdfTableCells.get(pageNumber), stripper.getRulings());
ClassificationPage classificationPage = switch (layoutParsingType) { // Docstrum
case REDACT_MANAGER -> redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical()); AtomicInteger num = new AtomicInteger(pageNumber);
case TAAS -> taasBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical()); var zones = docstrumSegmentationService.segmentPage(stripper.getTextPositionSequences());
case DOCUMINE -> docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
}; List<AbstractPageBlock> pageBlocks = new ArrayList<>();
AtomicInteger numOnPage = new AtomicInteger(1);
// List<TextPositionSequence> textPositionSequences = new ArrayList<>();
zones.forEach(zone -> {
List<TextPositionSequence> textPositionSequences = new ArrayList<>();
zone.getLines().forEach(line -> {
line.getWords().forEach(word -> {
textPositionSequences.add(new TextPositionSequence(word.getTextPositions(), num.get()));
});
});
var cps = redactManagerBlockificationService.blockify(textPositionSequences, cleanRulings.getHorizontal(), cleanRulings.getVertical());
cps.getTextBlocks().forEach(cp -> {
pageBlocks.add(redactManagerBlockificationService.buildTextBlock(((TextPageBlock) cp).getSequences(), numOnPage.getAndIncrement()));
});
});
// ClassificationPage classificationPage = switch (layoutParsingType) {
// case REDACT_MANAGER -> redactManagerBlockificationService.blockify(textPositionSequences, cleanRulings.getHorizontal(), cleanRulings.getVertical());
// case TAAS -> taasBlockificationService.blockify(textPositionSequences, cleanRulings.getHorizontal(), cleanRulings.getVertical());
// case DOCUMINE -> docuMineBlockificationService.blockify(textPositionSequences, cleanRulings.getHorizontal(), cleanRulings.getVertical());
// };
ClassificationPage classificationPage = new ClassificationPage(pageBlocks);
classificationPage.setCleanRulings(cleanRulings); classificationPage.setCleanRulings(cleanRulings);
classificationPage.setRotation(rotation); classificationPage.setRotation(rotation);
classificationPage.setLandscape(isLandscape); classificationPage.setLandscape(isLandscape);
@ -283,9 +313,19 @@ public class LayoutParsingPipeline {
case REDACT_MANAGER -> redactManagerClassificationService.classifyDocument(classificationDocument); case REDACT_MANAGER -> redactManagerClassificationService.classifyDocument(classificationDocument);
} }
List<ClassificationSection> sections = new ArrayList<>();
for (var page : classificationPages) {
page.getTextBlocks().forEach(block -> {
block.setPage(page.getPageNumber());
var section = sectionsBuilderService.buildTextBlock(List.of(block), "a");
sections.add(section);
});
}
classificationDocument.setSections(sections);
log.info("Building Sections for {}", identifier); log.info("Building Sections for {}", identifier);
sectionsBuilderService.buildSections(classificationDocument); // sectionsBuilderService.buildSections(classificationDocument);
sectionsBuilderService.addImagesToSections(classificationDocument); // sectionsBuilderService.addImagesToSections(classificationDocument);
return classificationDocument; return classificationDocument;
} }

View File

@ -12,15 +12,16 @@ import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRul
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter; import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
import lombok.Data; import lombok.Data;
import lombok.Getter;
import lombok.NonNull; import lombok.NonNull;
import lombok.RequiredArgsConstructor; import lombok.RequiredArgsConstructor;
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
@Data @Data
@RequiredArgsConstructor @RequiredArgsConstructor
public class ClassificationPage { public class ClassificationPage {
@NonNull @NonNull
@Getter
private List<AbstractPageBlock> textBlocks; private List<AbstractPageBlock> textBlocks;
private List<ClassifiedImage> images = new ArrayList<>(); private List<ClassifiedImage> images = new ArrayList<>();

View File

@ -45,6 +45,9 @@ public class RedTextPosition {
@JsonIgnore @JsonIgnore
private String fontName; private String fontName;
@JsonIgnore
private RedTextPosition parent;
@SneakyThrows @SneakyThrows
public static RedTextPosition fromTextPosition(TextPosition textPosition) { public static RedTextPosition fromTextPosition(TextPosition textPosition) {

View File

@ -17,6 +17,7 @@ import lombok.AllArgsConstructor;
import lombok.Builder; import lombok.Builder;
import lombok.Data; import lombok.Data;
import lombok.EqualsAndHashCode; import lombok.EqualsAndHashCode;
import lombok.Getter;
import lombok.NoArgsConstructor; import lombok.NoArgsConstructor;
@EqualsAndHashCode(callSuper = true) @EqualsAndHashCode(callSuper = true)
@ -27,6 +28,7 @@ import lombok.NoArgsConstructor;
public class TextPageBlock extends AbstractPageBlock { public class TextPageBlock extends AbstractPageBlock {
@Builder.Default @Builder.Default
@Getter
private List<TextPositionSequence> sequences = new ArrayList<>(); private List<TextPositionSequence> sequences = new ArrayList<>();
@JsonIgnore @JsonIgnore
@ -73,7 +75,7 @@ public class TextPageBlock extends AbstractPageBlock {
return sequences.get(0).getPageWidth(); return sequences.get(0).getPageWidth();
} }
public static TextPageBlock merge(List<TextPageBlock> textBlocksToMerge) { public static TextPageBlock merge(List<TextPageBlock> textBlocksToMerge) {
@ -82,6 +84,7 @@ public class TextPageBlock extends AbstractPageBlock {
return fromTextPositionSequences(sequences); return fromTextPositionSequences(sequences);
} }
public static TextPageBlock fromTextPositionSequences(List<TextPositionSequence> wordBlockList) { public static TextPageBlock fromTextPositionSequences(List<TextPositionSequence> wordBlockList) {
TextPageBlock textBlock = null; TextPageBlock textBlock = null;
@ -133,7 +136,6 @@ public class TextPageBlock extends AbstractPageBlock {
} }
/** /**
* Returns the minX value in pdf coordinate system. * Returns the minX value in pdf coordinate system.
* Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation. * Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation.

View File

@ -55,6 +55,18 @@ public class TextPositionSequence implements CharSequence {
} }
public TextPositionSequence(List<RedTextPosition> textPositions, int page) {
this.textPositions = textPositions;
this.page = page;
this.dir = TextDirection.fromDegrees(textPositions.get(0).getDir());
this.rotation = textPositions.get(0).getRotation();
this.pageHeight = textPositions.get(0).getPageHeight();
this.pageWidth = textPositions.get(0).getPageWidth();
this.isParagraphStart = false;
}
@Override @Override
public int length() { public int length() {

View File

@ -240,7 +240,7 @@ public class SectionsBuilderService {
} }
private ClassificationSection buildTextBlock(List<AbstractPageBlock> wordBlockList, String lastHeadline) { public ClassificationSection buildTextBlock(List<AbstractPageBlock> wordBlockList, String lastHeadline) {
ClassificationSection section = new ClassificationSection(); ClassificationSection section = new ClassificationSection();

View File

@ -57,7 +57,7 @@ public class RedactManagerBlockificationService {
boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, horizontalRulingLines, verticalRulingLines); boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, horizontalRulingLines, verticalRulingLines);
boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir()); boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir());
if (prev != null && (lineSeparation || startFromTop || splitByX || splitByDir || isSplitByRuling)) { if (prev != null && (splitByDir || isSplitByRuling)) {
Orientation prevOrientation = null; Orientation prevOrientation = null;
if (!chunkBlockList.isEmpty()) { if (!chunkBlockList.isEmpty()) {
@ -167,7 +167,7 @@ public class RedactManagerBlockificationService {
} }
private TextPageBlock buildTextBlock(List<TextPositionSequence> wordBlockList, int indexOnPage) { public TextPageBlock buildTextBlock(List<TextPositionSequence> wordBlockList, int indexOnPage) {
TextPageBlock textBlock = null; TextPageBlock textBlock = null;

View File

@ -0,0 +1,48 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum;
import java.util.List;
import java.util.stream.Collectors;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Character;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Zone;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service.LineBuilderService;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service.NearestNeighbourService;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service.ReadingOrderService;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service.SpacingService;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service.ZoneBuilderService;
import lombok.RequiredArgsConstructor;
@Service
@RequiredArgsConstructor
public class DocstrumSegmentationService {
private final NearestNeighbourService nearestNeighbourService;
private final SpacingService spacingService;
private final LineBuilderService lineBuilderService;
private final ZoneBuilderService zoneBuilderService;
private final ReadingOrderService readingOrderService;
public List<Zone> segmentPage(List<TextPositionSequence> textPositions) {
var positions = textPositions.stream().map(TextPositionSequence::getTextPositions).flatMap(List::stream).toList();
var characters = positions.stream().map(Character::new).collect(Collectors.toList());
nearestNeighbourService.findNearestNeighbors(characters);
var characterSpacing = spacingService.computeCharacterSpacing(characters);
var lineSpacing = spacingService.computeLineSpacing(characters);
var lines = lineBuilderService.buildLines(characters, characterSpacing, lineSpacing);
var zones = zoneBuilderService.buildZones(lines, characterSpacing, lineSpacing);
return readingOrderService.resolve(zones);
}
}

View File

@ -0,0 +1,90 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model;
/**
 * Predicate over {@link Neighbor} objects that accepts a neighbor when its angle
 * lies inside a configured half-open range.
 */
public abstract class AngleFilter {

    private final double lowerAngle;
    private final double upperAngle;


    private AngleFilter(double lowerAngle, double upperAngle) {

        this.lowerAngle = lowerAngle;
        this.upperAngle = upperAngle;
    }


    /**
     * Creates a filter for the given range.
     * <p>
     * A lower bound below -pi/2 is shifted up by pi and an upper bound of pi/2 or more is
     * shifted down by pi. If the shifted lower bound then exceeds the upper bound, the
     * returned filter accepts angles on either side of the gap.
     *
     * @param lowerAngle minimum angle in range [-3*pi/2, pi/2)
     * @param upperAngle maximum angle in range [-pi/2, 3*pi/2)
     * @return newly constructed angle filter
     */
    public static AngleFilter newInstance(double lowerAngle, double upperAngle) {

        double lower = lowerAngle < -Math.PI / 2 ? lowerAngle + Math.PI : lowerAngle;
        double upper = upperAngle >= Math.PI / 2 ? upperAngle - Math.PI : upperAngle;

        return lower <= upper ? new AndFilter(lower, upper) : new OrFilter(lower, upper);
    }


    /**
     * @return the (possibly shifted) lower bound, inclusive
     */
    public double getLowerAngle() {

        return lowerAngle;
    }


    /**
     * @return the (possibly shifted) upper bound, exclusive
     */
    public double getUpperAngle() {

        return upperAngle;
    }


    /**
     * Tests whether the neighbor's angle is accepted by this filter.
     *
     * @param neighbor neighbor to test
     * @return true if the angle is accepted
     */
    public abstract boolean matches(Neighbor neighbor);


    /**
     * Filter for a contiguous range: the angle must satisfy both bounds.
     */
    public static final class AndFilter extends AngleFilter {

        private AndFilter(double lowerAngle, double upperAngle) {

            super(lowerAngle, upperAngle);
        }


        @Override
        public boolean matches(Neighbor neighbor) {

            double angle = neighbor.getAngle();
            return getLowerAngle() <= angle && angle < getUpperAngle();
        }

    }

    /**
     * Filter for a wrapped range: the angle must satisfy at least one bound.
     */
    public static final class OrFilter extends AngleFilter {

        private OrFilter(double lowerAngle, double upperAngle) {

            super(lowerAngle, upperAngle);
        }


        @Override
        public boolean matches(Neighbor neighbor) {

            double angle = neighbor.getAngle();
            return getLowerAngle() <= angle || angle < getUpperAngle();
        }

    }

}

View File

@ -0,0 +1,48 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model;
import java.awt.geom.Rectangle2D;
import lombok.Data;
/**
 * Base class for layout elements described by an axis-aligned rectangle.
 * <p>
 * The accessors below read the rectangle directly; they fail with a
 * {@link NullPointerException} while no rectangle has been set.
 */
@Data
public abstract class BoundingBox {

    private Rectangle2D bBox;


    /**
     * @return x coordinate of the rectangle's origin
     */
    public double getX() {

        return bBox.getX();
    }


    /**
     * @return y coordinate of the rectangle's origin
     */
    public double getY() {

        return bBox.getY();
    }


    /**
     * @return width of the rectangle
     */
    public double getWidth() {

        return bBox.getWidth();
    }


    /**
     * @return height of the rectangle
     */
    public double getHeight() {

        return bBox.getHeight();
    }


    /**
     * @return height times width of the rectangle
     */
    public double getArea() {

        // Read the rectangle itself rather than the overridable getters.
        double h = bBox.getHeight();
        double w = bBox.getWidth();
        return h * w;
    }


    /**
     * Checks whether another rectangle lies inside this box, allowing every edge of the
     * other rectangle to stick out by at most {@code tolerance}.
     *
     * @param contained rectangle to test
     * @param tolerance allowed overshoot per edge
     * @return true if all four edges are within tolerance
     */
    public boolean contains(Rectangle2D contained, double tolerance) {

        boolean leftOk = bBox.getX() <= contained.getX() + tolerance;
        boolean topOk = bBox.getY() <= contained.getY() + tolerance;
        boolean rightOk = bBox.getX() + bBox.getWidth() >= contained.getX() + contained.getWidth() - tolerance;
        boolean bottomOk = bBox.getY() + bBox.getHeight() >= contained.getY() + contained.getHeight() - tolerance;

        return leftOk && topOk && rightOk && bottomOk;
    }

}

View File

@ -0,0 +1,69 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model;
import java.util.ArrayList;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
import lombok.Data;
/**
 * A single glyph reduced to a reference point, used as the unit of the Docstrum analysis.
 * <p>
 * Note that this class shares its simple name with {@code java.lang.Character}.
 */
@Data
public class Character {

    private final double x;
    private final double y;
    private final RedTextPosition textPosition;

    private List<Neighbor> neighbors = new ArrayList<>();


    /**
     * Derives the reference point from the text position: its direction-adjusted x/y
     * plus half of its width/height.
     *
     * @param chunk the underlying text position
     */
    public Character(RedTextPosition chunk) {

        this.textPosition = chunk;
        this.x = chunk.getXDirAdj() + chunk.getWidthDirAdj() / 2;
        this.y = chunk.getYDirAdj() + chunk.getHeightDir() / 2;
    }


    /**
     * @return the height reported by the underlying text position
     */
    public double getHeight() {

        return textPosition.getHeightDir();
    }


    /**
     * @param character the other character
     * @return Euclidean distance between the two reference points
     */
    public double distance(Character character) {

        double deltaX = getX() - character.getX();
        double deltaY = getY() - character.getY();
        double squared = deltaX * deltaX + deltaY * deltaY;
        return Math.sqrt(squared);
    }


    /**
     * @param character the other character
     * @return absolute difference of the x coordinates
     */
    public double horizontalDistance(Character character) {

        double deltaX = getX() - character.getX();
        return Math.abs(deltaX);
    }


    /**
     * @param character the other character
     * @return absolute difference of the y coordinates
     */
    public double verticalDistance(Character character) {

        double deltaY = getY() - character.getY();
        return Math.abs(deltaY);
    }


    public void setNeighbors(List<Neighbor> neighbors) {

        this.neighbors = neighbors;
    }


    /**
     * Angle of the connection between the two reference points. The vector is always taken
     * from the character with the smaller x towards the one with the larger x, so the
     * result does not depend on which of the two is the receiver.
     *
     * @param character the other character
     * @return angle in radians as computed by {@link Math#atan2(double, double)}
     */
    public double angle(Character character) {

        boolean receiverIsFurtherRight = getX() > character.getX();
        Character from = receiverIsFurtherRight ? character : this;
        Character to = receiverIsFurtherRight ? this : character;

        return Math.atan2(to.getY() - from.getY(), to.getX() - from.getX());
    }

}

View File

@ -0,0 +1,212 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model;
import java.util.AbstractSet;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Set;
/**
 * Union-find structure over a fixed collection of elements.
 * <p>
 * Every element starts in its own singleton subset; subsets are merged with
 * {@link #union(Object, Object)}. Iterating the structure yields one {@link Set} view per
 * current subset.
 *
 * @param <E> element type; used as a {@link HashMap} key
 */
public class DisjointSets<E> implements Iterable<Set<E>> {

    private final Map<E, Entry<E>> map = new HashMap<>();


    /**
     * Constructs a new set of singletons.
     *
     * @param c elements of singleton sets
     */
    public DisjointSets(Collection<? extends E> c) {

        c.forEach(element -> map.put(element, new Entry<>(element)));
    }


    /**
     * Checks if elements are in the same subsets.
     *
     * @param e1 element from a subset
     * @param e2 element from a subset
     * @return true if elements are in the same subset; false otherwise
     */
    public boolean areTogether(E e1, E e2) {

        Entry<E> firstRoot = map.get(e1).findRepresentative();
        Entry<E> secondRoot = map.get(e2).findRepresentative();
        return firstRoot == secondRoot;
    }


    /**
     * Merges subsets which elements e1 and e2 belong to.
     *
     * @param e1 element from a subset
     * @param e2 element from a subset
     */
    public void union(E e1, E e2) {

        Entry<E> firstRoot = map.get(e1).findRepresentative();
        Entry<E> secondRoot = map.get(e2).findRepresentative();
        if (firstRoot == secondRoot) {
            return;
        }

        // The subset that is not smaller absorbs the other one; on a tie the second wins.
        boolean secondAbsorbs = firstRoot.size <= secondRoot.size;
        Entry<E> absorbing = secondAbsorbs ? secondRoot : firstRoot;
        Entry<E> absorbed = secondAbsorbs ? firstRoot : secondRoot;
        absorbing.mergeWith(absorbed);
    }


    /**
     * @return an iterator producing one set view per subset; removal is not supported
     */
    @Override
    public Iterator<Set<E>> iterator() {

        return new SubsetIterator();
    }


    /**
     * Walks all entries and stops at those that currently represent a subset.
     */
    private final class SubsetIterator implements Iterator<Set<E>> {

        private final Iterator<Entry<E>> entries = map.values().iterator();
        private Entry<E> upcoming = advance();


        @Override
        public boolean hasNext() {

            return upcoming != null;
        }


        @Override
        public Set<E> next() {

            if (upcoming == null) {
                throw new NoSuchElementException();
            }
            Entry<E> current = upcoming;
            upcoming = advance();
            return current.asSet();
        }


        @Override
        public void remove() {

            throw new UnsupportedOperationException();
        }


        /** Returns the next representative entry, or null when none is left. */
        private Entry<E> advance() {

            while (entries.hasNext()) {
                Entry<E> candidate = entries.next();
                if (candidate.isRepresentative()) {
                    return candidate;
                }
            }
            return null;
        }

    }

    /**
     * Node of the union-find forest. A representative additionally heads a singly linked
     * chain ({@code next}/{@code last}) of all members of its subset.
     */
    private static class Entry<E> {

        private int size = 1;
        private final E value;
        private Entry<E> parent = this;
        private Entry<E> next = null;
        private Entry<E> last = this;


        Entry(E value) {

            this.value = value;
        }


        /** Appends the other representative's member chain to this one and adopts it. */
        void mergeWith(Entry<E> otherRepresentative) {

            otherRepresentative.parent = this;
            last.next = otherRepresentative;
            last = otherRepresentative.last;
            size += otherRepresentative.size;
        }


        /** Finds the root of this entry and re-parents every entry on the way to it. */
        Entry<E> findRepresentative() {

            Entry<E> root = this;
            while (root.parent != root) {
                root = root.parent;
            }

            Entry<E> cursor = this;
            while (cursor != root) {
                Entry<E> formerParent = cursor.parent;
                cursor.parent = root;
                cursor = formerParent;
            }
            return root;
        }


        boolean isRepresentative() {

            return parent == this;
        }


        /** Returns a read-only view of the subset this entry belongs to. */
        Set<E> asSet() {

            return new AbstractSet<E>() {

                @Override
                public Iterator<E> iterator() {

                    return new MemberIterator(findRepresentative());
                }


                @Override
                public int size() {

                    return findRepresentative().size;
                }
            };
        }


        /**
         * Follows the member chain starting at a representative.
         */
        private final class MemberIterator implements Iterator<E> {

            private Entry<E> cursor;


            private MemberIterator(Entry<E> start) {

                this.cursor = start;
            }


            @Override
            public boolean hasNext() {

                return cursor != null;
            }


            @Override
            public E next() {

                if (cursor == null) {
                    throw new NoSuchElementException();
                }
                E result = cursor.value;
                cursor = cursor.next;
                return result;
            }


            @Override
            public void remove() {

                throw new UnsupportedOperationException();
            }

        }

    }

}

View File

@ -0,0 +1,199 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model;
import java.util.Iterator;
import java.util.NoSuchElementException;
/**
 * Fixed-range histogram with equally wide bins and support for kernel smoothing.
 * <p>
 * Iterating the histogram yields one {@link Bin} per bin, in ascending order.
 */
public class Histogram implements Iterable<Histogram.Bin> {

    /** Margin added on both sides of the range so that the boundary values fall inside a bin. */
    private static final double EPSILON = 1.0e-6;

    /** Two frequencies closer than this are treated as belonging to the same peak plateau. */
    private static final double PEAK_TOLERANCE = 0.0001;

    private final double min;
    private final double delta;
    private final double resolution;
    private double[] frequencies;


    /**
     * Constructs a new histogram for values in range [minValue, maxValue] with
     * given resolution.
     * <p>
     * The number of bins is the rounded quotient of range and resolution (at least one);
     * the effective bin width is then adjusted so that the bins cover the range exactly.
     *
     * @param minValue   - minimum allowed value
     * @param maxValue   - maximum allowed value
     * @param resolution - histogram's resolution
     */
    public Histogram(double minValue, double maxValue, double resolution) {

        this.min = minValue - EPSILON;
        this.delta = maxValue - minValue + 2 * EPSILON;
        int size = Math.max(1, (int) Math.round((maxValue - minValue) / resolution));
        this.resolution = this.delta / size;
        this.frequencies = new double[size];
    }


    /**
     * Convolves the frequencies with the given kernel. Samples that would lie outside the
     * histogram are skipped, i.e. the histogram is treated as zero beyond its borders.
     *
     * @param kernel kernel weights; the centre is at index {@code (kernel.length - 1) / 2}
     */
    public void kernelSmooth(double[] kernel) {

        double[] newFrequencies = new double[frequencies.length];
        int shift = (kernel.length - 1) / 2;
        for (int i = 0; i < kernel.length; i++) {
            int jStart = Math.max(0, i - shift);
            int jEnd = Math.min(frequencies.length, frequencies.length + i - shift);
            for (int j = jStart; j < jEnd; j++) {
                newFrequencies[j - i + shift] += kernel[i] * frequencies[j];
            }
        }
        frequencies = newFrequencies;
    }


    /**
     * Convolves the frequencies with the given kernel, treating the histogram as circular:
     * samples beyond one border are taken from the opposite side.
     * <p>
     * The wrap-around uses a true modulo, so kernels wider than the histogram are handled
     * as well (a single add/subtract of the length would leave the index out of range).
     *
     * @param kernel kernel weights; the centre is at index {@code (kernel.length - 1) / 2}
     */
    public void circularKernelSmooth(double[] kernel) {

        int binCount = frequencies.length;
        double[] newFrequencies = new double[binCount];
        int shift = (kernel.length - 1) / 2;
        for (int i = 0; i < binCount; i++) {
            for (int d = 0; d < kernel.length; d++) {
                int j = Math.floorMod(i + d - shift, binCount);
                newFrequencies[i] += kernel[d] * frequencies[j];
            }
        }
        frequencies = newFrequencies;
    }


    /**
     * Builds a Gaussian kernel whose weights sum to one.
     *
     * @param length       window length in value units; converted to bins via the resolution
     * @param stdDeviation standard deviation in value units
     * @return kernel with an odd number of weights, centred on its middle element
     */
    public double[] createGaussianKernel(double length, double stdDeviation) {

        int r = (int) Math.round(length / resolution) / 2;
        double sigmaInBins = stdDeviation / resolution;

        int size = 2 * r + 1;
        double[] kernel = new double[size];
        double sum = 0;
        double b = 2 * sigmaInBins * sigmaInBins;
        double a = 1 / Math.sqrt(Math.PI * b);
        for (int i = 0; i < size; i++) {
            double offset = i - r;
            kernel[i] = a * Math.exp(-(offset * offset) / b);
            sum += kernel[i];
        }
        for (int i = 0; i < size; i++) {
            kernel[i] /= sum;
        }
        return kernel;
    }


    /**
     * Circular smoothing with a Gaussian kernel.
     *
     * @param windowLength window length in value units
     * @param stdDeviation standard deviation in value units
     */
    public void circularGaussianSmooth(double windowLength, double stdDeviation) {

        circularKernelSmooth(createGaussianKernel(windowLength, stdDeviation));
    }


    /**
     * Non-circular smoothing with a Gaussian kernel.
     *
     * @param windowLength window length in value units
     * @param stdDeviation standard deviation in value units
     */
    public void gaussianSmooth(double windowLength, double stdDeviation) {

        kernelSmooth(createGaussianKernel(windowLength, stdDeviation));
    }


    /**
     * Adds single occurrence of given value to the histogram.
     * <p>
     * The value is expected to lie within the range given at construction; a value whose
     * bin index falls outside the array raises {@link ArrayIndexOutOfBoundsException}.
     *
     * @param value inserted values
     */
    public void add(double value) {

        frequencies[(int) ((value - min) / resolution)] += 1.0;
    }


    /**
     * Returns histogram's number of bins.
     *
     * @return number of bins
     */
    public int getSize() {

        return frequencies.length;
    }


    /**
     * Finds the histogram's peak value.
     * <p>
     * The peak is the first bin with the highest frequency. If the directly following bins
     * have (almost) the same frequency, the centre of that plateau is returned.
     *
     * @return peak value
     */
    public double getPeakValue() {

        int peakIndex = 0;
        for (int i = 1; i < frequencies.length; i++) {
            if (frequencies[i] > frequencies[peakIndex]) {
                peakIndex = i;
            }
        }
        int peakEndIndex = peakIndex + 1;
        while (peakEndIndex < frequencies.length && Math.abs(frequencies[peakEndIndex] - frequencies[peakIndex]) < PEAK_TOLERANCE) {
            peakEndIndex++;
        }
        return ((double) peakIndex + peakEndIndex) / 2 * resolution + min;
    }


    /**
     * @return an iterator over all bins in ascending order; removal is not supported
     */
    @Override
    public Iterator<Bin> iterator() {

        return new Iterator<Bin>() {

            private int index = 0;


            @Override
            public boolean hasNext() {

                return index < frequencies.length;
            }


            @Override
            public Bin next() {

                if (index >= frequencies.length) {
                    throw new NoSuchElementException();
                }
                return new Bin(index++);
            }


            @Override
            public void remove() {

                throw new UnsupportedOperationException("Not supported yet.");
            }
        };
    }


    /**
     * Handle for a single bin of the enclosing histogram.
     */
    public final class Bin {

        private final int index;


        private Bin(int index) {

            this.index = index;
        }


        /**
         * @return the value at the centre of this bin
         */
        public double getValue() {

            return (index + 0.5) * resolution + min;
        }

    }

}

View File

@ -0,0 +1,167 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import lombok.Data;
@Data
public class Line extends BoundingBox {
private static final double WORD_DISTANCE_MULTIPLIER = 0.2;
private final double x0;
private final double y0;
private final double x1;
private final double y1;
private final double height;
private final List<Character> characters;
private final List<TextPositionSequence> words = new ArrayList<>();
public Line(List<Character> characters, double wordSpacing) {
this.characters = characters;
if (characters.size() >= 2) {
// Simple linear regression
double sx = 0.0, sxx = 0.0, sxy = 0.0, sy = 0.0;
for (Character component : characters) {
sx += component.getX();
sxx += component.getX() * component.getX();
sxy += component.getX() * component.getY();
sy += component.getY();
}
double b = (characters.size() * sxy - sx * sy) / (characters.size() * sxx - sx * sx);
double a = (sy - b * sx) / characters.size();
this.x0 = characters.get(0).getX();
this.y0 = a + b * this.x0;
this.x1 = characters.get(characters.size() - 1).getX();
this.y1 = a + b * this.x1;
} else if (!characters.isEmpty()) {
Character component = characters.get(0);
double dx = component.getTextPosition().getWidthDirAdj() / 3;
double dy = dx * Math.tan(0);
this.x0 = component.getX() - dx;
this.x1 = component.getX() + dx;
this.y0 = component.getY() - dy;
this.y1 = component.getY() + dy;
} else {
throw new IllegalArgumentException("Component list must not be empty");
}
height = computeHeight();
computeWords(wordSpacing * WORD_DISTANCE_MULTIPLIER);
buildBox();
}
public double getAngle() {
return Math.atan2(y1 - y0, x1 - x0);
}
public double getLength() {
return Math.sqrt((x0 - x1) * (x0 - x1) + (y0 - y1) * (y0 - y1));
}
private double computeHeight() {
double sum = 0.0;
for (Character component : characters) {
sum += component.getHeight();
}
return sum / characters.size();
}
public double angularDifference(Line j) {
double diff = Math.abs(getAngle() - j.getAngle());
if (diff <= Math.PI / 2) {
return diff;
} else {
return Math.PI - diff;
}
}
public double horizontalDistance(Line other) {
double[] xs = new double[4];
xs[0] = x0;
xs[1] = x1;
xs[2] = other.x0;
xs[3] = other.x1;
boolean overlapping = xs[1] >= xs[2] && xs[3] >= xs[0];
Arrays.sort(xs);
return Math.abs(xs[2] - xs[1]) * (overlapping ? 1 : -1);
}
public double verticalDistance(Line other) {
double ym = (y0 + y1) / 2;
double yn = (other.y0 + other.y1) / 2;
return Math.abs(ym - yn) / Math.sqrt(1);
}
private void computeWords(double wordSpacing) {
TextPositionSequence word = new TextPositionSequence();
Character previous = null;
for (Character current : characters) {
if (previous != null) {
double dist = current.getTextPosition().getXDirAdj() - previous.getTextPosition().getXDirAdj() - previous.getTextPosition().getWidthDirAdj();
if (dist > wordSpacing) {
words.add(word);
word = new TextPositionSequence();
}
}
word.getTextPositions().add(current.getTextPosition());
previous = current;
}
words.add(word);
}
private void buildBox() {
double minX = Double.POSITIVE_INFINITY;
double minY = Double.POSITIVE_INFINITY;
double maxX = Double.NEGATIVE_INFINITY;
double maxY = Double.NEGATIVE_INFINITY;
for (Character character : characters) {
minX = Math.min(minX, character.getTextPosition().getXDirAdj());
minY = Math.min(minY, character.getTextPosition().getYDirAdj());
maxX = Math.max(maxX, character.getTextPosition().getXDirAdj() + character.getTextPosition().getWidthDirAdj());
maxY = Math.max(maxY, character.getTextPosition().getYDirAdj() + character.getTextPosition().getHeightDir());
}
this.setBBox(new Rectangle2D.Double(minX, minY, maxX - minX, maxY - minY));
}
public String toString() {
StringBuilder sb = new StringBuilder();
words.forEach(word -> sb.append(word.toString()).append(" "));
return sb.toString().trim();
}
}

View File

@ -0,0 +1,36 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model;
import lombok.Getter;
/**
 * Relation between an origin character and one character in its neighbourhood.
 * Distance and angle are computed once at construction time.
 */
public class Neighbor {

    @Getter
    private final double distance;

    @Getter
    private final double angle;

    private final Character originCharacter;

    @Getter
    private final Character character;


    /**
     * @param neighbor the neighbouring character
     * @param origin   the character the neighbourhood belongs to
     */
    public Neighbor(Character neighbor, Character origin) {

        this.character = neighbor;
        this.originCharacter = origin;
        this.distance = neighbor.distance(origin);
        this.angle = neighbor.angle(origin);
    }


    /**
     * @return horizontal distance between the neighbour and the origin character
     */
    public double getHorizontalDistance() {

        Character neighbour = character;
        return neighbour.horizontalDistance(originCharacter);
    }


    /**
     * @return vertical distance between the neighbour and the origin character
     */
    public double getVerticalDistance() {

        Character neighbour = character;
        return neighbour.verticalDistance(originCharacter);
    }

}

View File

@ -0,0 +1,50 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model;
import java.awt.geom.Rectangle2D;
import java.util.Comparator;
import java.util.List;
import lombok.Data;
@Data
public class Zone extends BoundingBox {
private List<Line> lines;
public Zone(List<Line> lines) {
lines.sort(Comparator.comparingDouble(Line::getY));
this.lines = lines;
buildBox();
}
public void buildBox() {
double minX = Double.POSITIVE_INFINITY;
double minY = Double.POSITIVE_INFINITY;
double maxX = Double.NEGATIVE_INFINITY;
double maxY = Double.NEGATIVE_INFINITY;
for (Line line : lines) {
minX = Math.min(minX, line.getX());
minY = Math.min(minY, line.getY());
maxX = Math.max(maxX, line.getX() + line.getWidth());
maxY = Math.max(maxY, line.getY() + line.getHeight());
}
this.setBBox(new Rectangle2D.Double(minX, minY, maxX - minX, maxY - minY));
}
public String toString() {
StringBuilder sb = new StringBuilder();
lines.forEach(line -> sb.append(line.toString()).append("\n"));
return sb.toString().trim();
}
}

View File

@ -0,0 +1,64 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.readingorder;
import java.awt.geom.Rectangle2D;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.BoundingBox;
/**
 * Inner node of a binary tree of bounding boxes: it holds two children and a bounding box that encloses both.
 */
public class BoundingBoxZoneGroup extends BoundingBox {

    private BoundingBox leftChild;
    private BoundingBox rightChild;

    /**
     * Creates a group whose box is the smallest rectangle enclosing both children.
     *
     * @param child1 stored as the left child
     * @param child2 stored as the right child
     */
    public BoundingBoxZoneGroup(BoundingBox child1, BoundingBox child2) {
        this.leftChild = child1;
        this.rightChild = child2;
        double minX = Math.min(child1.getX(), child2.getX());
        double minY = Math.min(child1.getY(), child2.getY());
        double maxX = Math.max(child1.getX() + child1.getWidth(), child2.getX() + child2.getWidth());
        double maxY = Math.max(child1.getY() + child1.getHeight(), child2.getY() + child2.getHeight());
        setBounds(minX, minY, maxX, maxY);
    }

    /**
     * Forwards the given rectangle to the superclass {@code setBBox}.
     *
     * @param bBox the new bounding box
     */
    public void setbBox(Rectangle2D bBox) {
        super.setBBox(bBox);
    }

    public BoundingBox getLeftChild() {
        return this.leftChild;
    }

    public BoundingBox getRightChild() {
        return this.rightChild;
    }

    /**
     * @param obj the new left child
     * @return this group, for chaining
     */
    public BoundingBoxZoneGroup setLeftChild(BoundingBox obj) {
        this.leftChild = obj;
        return this;
    }

    /**
     * @param obj the new right child
     * @return this group, for chaining
     */
    public BoundingBoxZoneGroup setRightChild(BoundingBox obj) {
        this.rightChild = obj;
        return this;
    }

    /**
     * Sets the bounding box from two opposite corners.
     *
     * @param x0 lower x bound
     * @param y0 lower y bound
     * @param x1 upper x bound, asserted to be at least {@code x0}
     * @param y1 upper y bound, asserted to be at least {@code y0}
     * @return this group, for chaining
     */
    public BoundingBoxZoneGroup setBounds(double x0, double y0, double x1, double y1) {
        assert x1 >= x0;
        assert y1 >= y0;
        double width = x1 - x0;
        double height = y1 - y0;
        this.setBBox(new Rectangle2D.Double(x0, y0, width, height));
        return this;
    }
}

View File

@ -0,0 +1,115 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.readingorder;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.utils.DoubleUtils;
/**
 * A pair of objects together with their distance and a boolean flag {@code c}.
 * <p>
 * Ordering ({@link #compareTo}) puts elements with {@code c == true} before those with {@code c == false}; among
 * elements with the same flag, distances are compared with a precision of 1E-3. Note that this ordering is coarser
 * than {@link #equals}, which compares the exact distance bits and both objects.
 *
 * @param <E> type of the paired objects
 */
public class DistElem<E> implements Comparable<DistElem<E>> {

    boolean c;
    double dist;
    E obj1;
    E obj2;

    public DistElem(boolean c, double dist, E obj1, E obj2) {
        this.c = c;
        this.dist = dist;
        this.obj1 = obj1;
        this.obj2 = obj2;
    }

    public boolean isC() {
        return c;
    }

    public void setC(boolean c) {
        this.c = c;
    }

    public double getDist() {
        return dist;
    }

    public E getObj1() {
        return obj1;
    }

    public E getObj2() {
        return obj2;
    }

    @Override
    public int hashCode() {
        int hash = 1;
        hash = 31 * hash + Boolean.hashCode(c);
        hash = 31 * hash + Double.hashCode(dist);
        hash = 31 * hash + (obj1 == null ? 0 : obj1.hashCode());
        hash = 31 * hash + (obj2 == null ? 0 : obj2.hashCode());
        return hash;
    }

    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (obj == null || getClass() != obj.getClass()) {
            return false;
        }
        DistElem<?> other = (DistElem<?>) obj;
        return c == other.c
                && Double.doubleToLongBits(dist) == Double.doubleToLongBits(other.dist)
                && nullSafeEquals(obj1, other.obj1)
                && nullSafeEquals(obj2, other.obj2);
    }

    /**
     * Equality that treats two {@code null} references as equal and delegates to {@code equals} otherwise.
     */
    private static boolean nullSafeEquals(Object first, Object second) {
        if (first == null) {
            return second == null;
        }
        return first.equals(second);
    }

    @Override
    public int compareTo(DistElem<E> compareObject) {
        if (c != compareObject.c) {
            // flagged elements sort first
            return c ? -1 : 1;
        }
        double eps = 1E-3;
        return DoubleUtils.compareDouble(dist, compareObject.dist, eps);
    }
}

View File

@ -0,0 +1,258 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.readingorder;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.BoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Zone;
/**
 * A set-like data structure for objects placed on a plane. Can efficiently find objects in a certain rectangular area.
 * <p>
 * Besides a flat list of all objects it maintains a grid index: the plane is divided into squares of
 * {@code gridSize} and every object is registered under each square its bounding box may touch. Area queries only
 * inspect the squares touched by the search rectangle.
 * <p>
 * The squares an object is registered under are derived from its bounding box at the time of {@link #add}; the
 * same computation is repeated in {@link #remove}, so the box must not change while the object is on the plane.
 *
 * @author Pawel Szostek
 */
public class DocumentPlane {

    /**
     * List of objects on the plane, in insertion order.
     */
    private final List<BoundingBox> objs;
    /**
     * Size of a grid square. If gridSize=50, then the plane is divided into squares of size 50. Each square contains
     * objects placed in a 50x50 area
     */
    private final int gridSize;
    /**
     * Redundant dictionary of objects on the plane. Allows efficient 2D space search. Keys are X-Y coordinates of a
     * grid square. Single object can be stored under several keys (depending on its physical size). Grid squares are
     * lazy-initialized.
     */
    private final Map<GridXY, List<BoundingBox>> grid;

    /**
     * Immutable X-Y index of a grid square, usable as a hash map key.
     */
    private static class GridXY {

        private final int x;
        private final int y;

        GridXY(int x, int y) {
            this.x = x;
            this.y = y;
        }

        @Override
        public int hashCode() {
            // x * y would map every square in row 0 or column 0 to the same bucket
            return 31 * x + y;
        }

        @Override
        public boolean equals(Object obj) {
            if (obj == null || getClass() != obj.getClass()) {
                return false;
            }
            GridXY other = (GridXY) obj;
            return x == other.x && y == other.y;
        }

        @Override
        public String toString() {
            return "(" + x + "," + y + ")";
        }
    }

    /**
     * @return the internal list of objects currently on the plane (not a copy)
     */
    public List<BoundingBox> getObjects() {
        return objs;
    }

    /**
     * Creates a plane and adds all given zones to it.
     *
     * @param objectList zones to place on the plane
     * @param gridSize   edge length of a grid square, must be positive
     * @throws IllegalArgumentException if {@code gridSize} is not positive
     */
    public DocumentPlane(List<Zone> objectList, int gridSize) {
        if (gridSize <= 0) {
            throw new IllegalArgumentException("gridSize must be positive but was " + gridSize);
        }
        this.grid = new HashMap<>();
        this.objs = new ArrayList<>();
        this.gridSize = gridSize;
        for (Zone obj : objectList) {
            add(obj);
        }
    }

    /**
     * Looks for objects placed between obj1 and obj2 excluding them. "Between" means overlapping the smallest
     * rectangle that encloses both objects.
     *
     * @param obj1 object
     * @param obj2 object
     * @return object list
     */
    public List<BoundingBox> findObjectsBetween(BoundingBox obj1, BoundingBox obj2) {
        double x0 = Math.min(obj1.getX(), obj2.getX());
        double y0 = Math.min(obj1.getY(), obj2.getY());
        double x1 = Math.max(obj1.getX() + obj1.getWidth(), obj2.getX() + obj2.getWidth());
        double y1 = Math.max(obj1.getY() + obj1.getHeight(), obj2.getY() + obj2.getHeight());
        assert x1 >= x0 && y1 >= y0;
        Rectangle2D searchBounds = new Rectangle2D.Double(x0, y0, x1 - x0, y1 - y0);
        List<BoundingBox> objsBetween = find(searchBounds);
        /*
         * the rectangle area must contain at least obj1 and obj2
         */
        objsBetween.remove(obj1);
        objsBetween.remove(obj2);
        return objsBetween;
    }

    /**
     * Checks if there is any object placed between obj1 and obj2
     *
     * @param obj1 object
     * @param obj2 object
     * @return true if anything is placed between, false otherwise
     */
    public boolean anyObjectsBetween(BoundingBox obj1, BoundingBox obj2) {
        return !findObjectsBetween(obj1, obj2).isEmpty();
    }

    /**
     * Adds object to the plane
     *
     * @param obj object
     * @return document plane
     */
    public DocumentPlane add(BoundingBox obj) {
        int objsBefore = this.objs.size();
        int firstRow = firstCell(obj.getY());
        int lastRow = lastCell(obj.getY(), obj.getHeight());
        int firstColumn = firstCell(obj.getX());
        int lastColumn = lastCell(obj.getX(), obj.getWidth());
        /*
         * iterate over grid squares, creating missing squares on the fly
         */
        for (int y = firstRow; y <= lastRow; ++y) {
            for (int x = firstColumn; x <= lastColumn; ++x) {
                grid.computeIfAbsent(new GridXY(x, y), key -> new ArrayList<>()).add(obj);
            }
        }
        objs.add(obj);
        /*
         * size of the object list should be incremented
         */
        assert objsBefore + 1 == objs.size();
        /*
         * object list must contain the same number of objects as object dictionary
         */
        assert objs.size() == elementsInGrid();
        return this;
    }

    /**
     * Removes object from the plane. Squares that were never created (for example because the object was never
     * added) are skipped instead of failing.
     *
     * @param obj object
     * @return document plane
     */
    public DocumentPlane remove(BoundingBox obj) {
        int firstRow = firstCell(obj.getY());
        int lastRow = lastCell(obj.getY(), obj.getHeight());
        int firstColumn = firstCell(obj.getX());
        int lastColumn = lastCell(obj.getX(), obj.getWidth());
        /*
         * iterate over grid squares
         */
        for (int y = firstRow; y <= lastRow; ++y) {
            for (int x = firstColumn; x <= lastColumn; ++x) {
                List<BoundingBox> cell = grid.get(new GridXY(x, y));
                if (cell != null) {
                    cell.remove(obj);
                }
            }
        }
        objs.remove(obj);
        assert objs.size() == elementsInGrid();
        return this;
    }

    /**
     * Find objects within search bounds
     *
     * @param searchBounds is a search rectangle
     * @return list of objects overlapping the search rectangle; objects that merely touch its border are excluded
     */
    public List<BoundingBox> find(Rectangle2D searchBounds) {
        List<BoundingBox> done = new ArrayList<>(); //contains already considered objects (wrt. optimization)
        List<BoundingBox> ret = new ArrayList<>();
        double x0 = searchBounds.getX();
        double y0 = searchBounds.getY();
        double y1 = searchBounds.getY() + searchBounds.getHeight();
        double x1 = searchBounds.getX() + searchBounds.getWidth();
        /*
         * Visit every square from the one containing (x0, y0) up to and including the one containing (x1, y1).
         * An exclusive bound of (int) (v + gridSize - 1) / gridSize is not a ceiling for fractional v and would
         * skip the last square, e.g. for v = 100.5 and gridSize = 50.
         */
        int firstRow = firstCell(y0);
        int lastRow = firstCell(y1);
        int firstColumn = firstCell(x0);
        int lastColumn = firstCell(x1);
        for (int y = firstRow; y <= lastRow; ++y) {
            for (int x = firstColumn; x <= lastColumn; ++x) {
                List<BoundingBox> cell = grid.get(new GridXY(x, y));
                if (cell == null) {
                    continue;
                }
                for (BoundingBox obj : cell) {
                    /*
                     * omit if already checked
                     */
                    if (done.contains(obj)) {
                        continue;
                    }
                    /*
                     * add to the checked objects
                     */
                    done.add(obj);
                    /*
                     * check if two objects overlap
                     */
                    boolean disjoint = obj.getX() + obj.getWidth() <= x0
                            || x1 <= obj.getX()
                            || obj.getY() + obj.getHeight() <= y0
                            || y1 <= obj.getY();
                    if (!disjoint) {
                        ret.add(obj);
                    }
                }
            }
        }
        return ret;
    }

    /**
     * Count distinct objects stored in objects dictionary
     *
     * @return number of elements
     */
    protected int elementsInGrid() {
        List<BoundingBox> distinct = new ArrayList<>();
        for (List<BoundingBox> cell : grid.values()) {
            for (BoundingBox obj : cell) {
                if (!distinct.contains(obj)) {
                    distinct.add(obj);
                }
            }
        }
        return distinct.size();
    }

    /**
     * Index of the grid square that contains the given coordinate (truncating integer division).
     */
    private int firstCell(double coordinate) {
        return ((int) coordinate) / gridSize;
    }

    /**
     * Index of the last grid square an object starting at {@code start} with the given extent is registered under.
     */
    private int lastCell(double start, double extent) {
        return ((int) (start + extent + gridSize - 1)) / gridSize;
    }
}

View File

@ -0,0 +1,29 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.readingorder;
import java.util.ArrayList;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Zone;
/**
 * Flattens a binary tree of zone groups into a list of zones.
 */
public class TreeToListConverter {

    /**
     * Collects the zones of the tree rooted at {@code obj}, visiting the left child before the right child at every
     * node.
     *
     * @param obj root of the (sub)tree
     * @return the zones found at the leaves, in traversal order
     */
    public List<Zone> convertToList(BoundingBoxZoneGroup obj) {
        List<Zone> ordered = new ArrayList<>();
        appendSubtree(obj.getLeftChild(), ordered);
        appendSubtree(obj.getRightChild(), ordered);
        return ordered;
    }

    /**
     * Appends {@code child} itself when it is a zone; otherwise treats it as a group and appends the zones below it.
     */
    private void appendSubtree(Object child, List<Zone> sink) {
        if (child instanceof Zone) {
            sink.add((Zone) child);
            return;
        }
        // anything that is not a Zone is expected to be a BoundingBoxZoneGroup
        sink.addAll(convertToList((BoundingBoxZoneGroup) child));
    }
}

View File

@ -0,0 +1,50 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.AngleFilter;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Character;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.DisjointSets;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Line;
/**
 * Groups characters into lines based on their precomputed neighbours.
 */
@Service
public class LineBuilderService {

    private static final double CHARACTER_SPACING_DISTANCE_MULTIPLIER = 3.5;
    private static final double MAX_VERTICAL_CHARACTER_DISTANCE = 0.67;
    private static final double ANGLE_TOLERANCE = Math.PI / 6;

    /**
     * Builds lines by uniting every character with those of its neighbours that pass the angle filter
     * (±{@code ANGLE_TOLERANCE}) and lie inside an ellipse whose half-axes are
     * {@code characterSpacing * 3.5} horizontally and {@code lineSpacing * 0.67} vertically.
     * Each resulting group is sorted by x and turned into one {@link Line}.
     *
     * @param characters       characters whose neighbours have already been set
     * @param characterSpacing character spacing used to scale the horizontal reach and passed on to each line
     * @param lineSpacing      line spacing used to scale the vertical reach
     * @return one line per connected group of characters
     */
    public List<Line> buildLines(List<Character> characters, double characterSpacing, double lineSpacing) {

        double horizontalReach = characterSpacing * CHARACTER_SPACING_DISTANCE_MULTIPLIER;
        double verticalReach = lineSpacing * MAX_VERTICAL_CHARACTER_DISTANCE;

        DisjointSets<Character> sets = new DisjointSets<>(characters);
        AngleFilter filter = AngleFilter.newInstance(-ANGLE_TOLERANCE, ANGLE_TOLERANCE);

        for (Character character : characters) {
            for (var neighbor : character.getNeighbors()) {
                // distances normalised so that the ellipse becomes the unit circle
                double x = neighbor.getHorizontalDistance() / horizontalReach;
                double y = neighbor.getVerticalDistance() / verticalReach;
                if (filter.matches(neighbor) && insideUnitCircle(x, y)) {
                    sets.union(character, neighbor.getCharacter());
                }
            }
        }

        List<Line> lines = new ArrayList<>();
        sets.forEach(group -> {
            List<Character> sortedByX = new ArrayList<>(group);
            sortedByX.sort(Comparator.comparingDouble(Character::getX));
            lines.add(new Line(sortedByX, characterSpacing));
        });
        return lines;
    }

    private static boolean insideUnitCircle(double x, double y) {
        return Math.pow(x, 2) + Math.pow(y, 2) <= 1;
    }
}

View File

@ -0,0 +1,78 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Character;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Neighbor;
/**
 * Computes, for every character, the list of its nearest neighbouring characters.
 */
@Service
public class NearestNeighbourService {

    private static final int NUMBER_OF_NEIGHBOURS = 8;
    private static final double STEP = 16.0;
    private static final Comparator<Neighbor> BY_DISTANCE = Comparator.comparingDouble(Neighbor::getDistance);

    /**
     * Finds up to {@code NUMBER_OF_NEIGHBOURS} nearest neighbours of every character and stores them on the
     * character via {@code setNeighbors}.
     * <p>
     * The given list is sorted in place by x. For each character the search window along x is widened in steps of
     * {@code STEP} to both sides until it is at least as wide as the distance to the farthest neighbour kept so
     * far, at which point no character outside the window can be closer.
     *
     * @param characters characters to process; reordered by ascending x as a side effect
     */
    public void findNearestNeighbors(List<Character> characters) {

        if (characters.isEmpty()) {
            return;
        }
        if (characters.size() == 1) {
            // A lone character has no neighbours. Without this guard the widening loop below would never find a
            // candidate, the target distance would stay infinite and the loop would not terminate.
            characters.get(0).setNeighbors(new ArrayList<>());
            return;
        }

        characters.sort(Comparator.comparingDouble(Character::getX));

        int maxNeighborCount = Math.min(NUMBER_OF_NEIGHBOURS, characters.size() - 1);

        for (int i = 0; i < characters.size(); i++) {
            Character origin = characters.get(i);
            List<Neighbor> candidates = new ArrayList<>();

            int start = i;
            int end = i + 1;

            double distance = Double.POSITIVE_INFINITY;
            for (double searchDistance = 0; searchDistance < distance; ) {
                searchDistance += STEP;
                boolean newCandidatesFound = false;

                // widen to the left
                while (start > 0 && origin.getX() - characters.get(start - 1).getX() < searchDistance) {
                    start--;
                    candidates.add(new Neighbor(characters.get(start), origin));
                    keepNearest(candidates, maxNeighborCount);
                    newCandidatesFound = true;
                }
                // widen to the right
                while (end < characters.size() && characters.get(end).getX() - origin.getX() < searchDistance) {
                    candidates.add(new Neighbor(characters.get(end), origin));
                    keepNearest(candidates, maxNeighborCount);
                    end++;
                    newCandidatesFound = true;
                }

                if (newCandidatesFound && candidates.size() >= maxNeighborCount) {
                    // The list is only sorted when it had to be trimmed, so sort before reading the farthest kept
                    // neighbour; otherwise an arbitrary (possibly too small) distance could end the search early.
                    candidates.sort(BY_DISTANCE);
                    distance = candidates.get(maxNeighborCount - 1).getDistance();
                }
            }
            keepNearest(candidates, maxNeighborCount);
            origin.setNeighbors(new ArrayList<>(candidates));
        }
    }

    /**
     * Trims {@code candidates} to the {@code maxNeighborCount} nearest ones. When trimming happens the list is left
     * sorted by ascending distance; otherwise it is left untouched.
     */
    private void keepNearest(List<Neighbor> candidates, int maxNeighborCount) {
        if (candidates.size() > maxNeighborCount) {
            candidates.sort(BY_DISTANCE);
            candidates.subList(maxNeighborCount, candidates.size()).clear();
        }
    }
}

View File

@ -0,0 +1,286 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.BoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Zone;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.readingorder.BoundingBoxZoneGroup;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.readingorder.DistElem;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.readingorder.DocumentPlane;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.readingorder.TreeToListConverter;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.utils.DoubleUtils;
@Service
public class ReadingOrderService {
static final int GRIDSIZE = 50;
static final double EPS = 0.01;
static final int MAX_ZONES = 1000;
static final Comparator<BoundingBox> Y_ASCENDING_ORDER = new Comparator<BoundingBox>() {
@Override
public int compare(BoundingBox o1, BoundingBox o2) {
return DoubleUtils.compareDouble(o1.getY(), o2.getY(), EPS);
}
};
static final Comparator<BoundingBox> X_ASCENDING_ORDER = new Comparator<BoundingBox>() {
@Override
public int compare(BoundingBox o1, BoundingBox o2) {
return DoubleUtils.compareDouble(o1.getX(), o2.getX(), EPS);
}
};
static final Comparator<BoundingBox> YX_ASCENDING_ORDER = new Comparator<BoundingBox>() {
@Override
public int compare(BoundingBox o1, BoundingBox o2) {
int yCompare = Y_ASCENDING_ORDER.compare(o1, o2);
return yCompare == 0 ? X_ASCENDING_ORDER.compare(o1, o2) : yCompare;
}
};
public List<Zone> resolve(List<Zone> zones) {
List<Zone> orderedZones;
if (zones.size() > MAX_ZONES) {
orderedZones = new ArrayList<>(zones);
Collections.sort(orderedZones, YX_ASCENDING_ORDER);
} else {
orderedZones = reorderZones(zones);
}
return orderedZones;
}
private List<Zone> reorderZones(List<Zone> unorderedZones) {
if (unorderedZones.isEmpty()) {
return new ArrayList<>();
} else if (unorderedZones.size() == 1) {
List<Zone> ret = new ArrayList<>(1);
ret.add(unorderedZones.get(0));
return ret;
} else {
BoundingBoxZoneGroup bxZonesTree = groupZonesHierarchically(unorderedZones);
sortGroupedZones(bxZonesTree);
TreeToListConverter treeConverter = new TreeToListConverter();
List<Zone> orderedZones = treeConverter.convertToList(bxZonesTree);
assert unorderedZones.size() == orderedZones.size();
return orderedZones;
}
}
/**
* Builds a binary tree of zones and groups of zones from a list of unordered zones. This is done in hierarchical
* clustering by joining two least distant nodes. Distance is calculated in the distance() method.
*
* @param zones is a list of unordered zones
* @return root of the zones clustered in a tree
*/
private BoundingBoxZoneGroup groupZonesHierarchically(List<Zone> zones) {
/*
* Distance tuples are stored sorted by ascending distance value
*/
List<DistElem<BoundingBox>> dists = new ArrayList<DistElem<BoundingBox>>(zones.size() * zones.size() / 2);
for (int idx1 = 0; idx1 < zones.size(); ++idx1) {
for (int idx2 = idx1 + 1; idx2 < zones.size(); ++idx2) {
Zone zone1 = zones.get(idx1);
Zone zone2 = zones.get(idx2);
dists.add(new DistElem<BoundingBox>(false, distance(zone1, zone2), zone1, zone2));
}
}
Collections.sort(dists);
DocumentPlane plane = new DocumentPlane(zones, GRIDSIZE);
while (!dists.isEmpty()) {
DistElem<BoundingBox> distElem = dists.get(0);
dists.remove(0);
if (!distElem.isC() && plane.anyObjectsBetween(distElem.getObj1(), distElem.getObj2())) {
dists.add(new DistElem<BoundingBox>(true, distElem.getDist(), distElem.getObj1(), distElem.getObj2()));
continue;
}
BoundingBoxZoneGroup newGroup = new BoundingBoxZoneGroup(distElem.getObj1(), distElem.getObj2());
plane.remove(distElem.getObj1()).remove(distElem.getObj2());
dists = removeDistElementsContainingObject(dists, distElem.getObj1());
dists = removeDistElementsContainingObject(dists, distElem.getObj2());
for (BoundingBox other : plane.getObjects()) {
dists.add(new DistElem<BoundingBox>(false, distance(other, newGroup), newGroup, other));
}
Collections.sort(dists);
plane.add(newGroup);
}
assert plane.getObjects().size() == 1 : "There should be one object left at the plane after grouping";
return (BoundingBoxZoneGroup) plane.getObjects().get(0);
}
/**
* Removes all distance tuples containing obj
*/
private List<DistElem<BoundingBox>> removeDistElementsContainingObject(Collection<DistElem<BoundingBox>> list, BoundingBox obj) {
List<DistElem<BoundingBox>> ret = new ArrayList<DistElem<BoundingBox>>();
for (DistElem<BoundingBox> distElem : list) {
if (distElem.getObj1() != obj && distElem.getObj2() != obj) {
ret.add(distElem);
}
}
return ret;
}
/**
* Swaps children of BxZoneGroup if necessary. A group with smaller sort factor is placed to the left (leftChild).
* An object with greater sort factor is placed on the right (rightChild). This plays an important role when
* traversing the tree in conversion to a one dimensional list.
*
* @param group
*/
private void sortGroupedZones(BoundingBoxZoneGroup group) {
BoundingBox leftChild = group.getLeftChild();
BoundingBox rightChild = group.getRightChild();
if (shouldBeSwapped(leftChild, rightChild)) {
// swap
group.setLeftChild(rightChild);
group.setRightChild(leftChild);
}
if (leftChild instanceof BoundingBoxZoneGroup) // if the child is a tree node, then recurse
{
sortGroupedZones((BoundingBoxZoneGroup) leftChild);
}
if (rightChild instanceof BoundingBoxZoneGroup) // as above - recurse
{
sortGroupedZones((BoundingBoxZoneGroup) rightChild);
}
}
private boolean shouldBeSwapped(BoundingBox first, BoundingBox second) {
double cx, cy, cw, ch, ox, oy, ow, oh;
cx = first.getBBox().getX();
cy = first.getBBox().getY();
cw = first.getBBox().getWidth();
ch = first.getBBox().getHeight();
ox = second.getBBox().getX();
oy = second.getBBox().getY();
ow = second.getBBox().getWidth();
oh = second.getBBox().getHeight();
// Determine Octant
//
// 0 | 1 | 2
// __|___|__
// 7 | 9 | 3 First is placed in 9th square
// __|___|__
// 6 | 5 | 4
if (cx + cw <= ox) { //2,3,4
return false;
} else if (ox + ow <= cx) { //0,6,7
return true; //6
} else if (cy + ch <= oy) {
return false; //5
} else if (oy + oh <= cy) {
return true; //1
} else { //two zones
double xdiff = ox + ow / 2 - cx - cw / 2;
double ydiff = oy + oh / 2 - cy - ch / 2;
return xdiff + ydiff < 0;
}
}
/**
* A distance function between two TextBoxes.
* <p>
* Consider the bounding rectangle for obj1 and obj2. Return its area minus the areas of obj1 and obj2, shown as
* 'www' below. This value may be negative. (x0,y0) +------+..........+ | obj1 |wwwwwwwwww: +------+www+------+
* :wwwwwwwwww| obj2 | +..........+------+ (x1,y1)
*
* @return distance value based on objects' coordinates and physical size on a plane
*/
private double distance(BoundingBox obj1, BoundingBox obj2) {
double x0 = Math.min(obj1.getX(), obj2.getX());
double y0 = Math.min(obj1.getY(), obj2.getY());
double x1 = Math.max(obj1.getX() + obj1.getWidth(), obj2.getX() + obj2.getWidth());
double y1 = Math.max(obj1.getY() + obj1.getHeight(), obj2.getY() + obj2.getHeight());
double dist = ((x1 - x0) * (y1 - y0) - obj1.getArea() - obj2.getArea());
double factor = ((x1 - x0)/x1) / ((y1 - y0)/y1);
double obj1X = obj1.getX();
double obj1Y_2 = obj1.getBBox().getMaxY();
double obj1X_2 = obj1.getBBox().getMaxX();
double obj1CenterX = obj1.getBBox().getCenterX();
double obj1CenterY = obj1.getBBox().getCenterY();
double obj2X = obj2.getX();
double obj2Y_2 = obj2.getBBox().getMaxY();
double obj2X_2 = obj2.getBBox().getMaxX();
double obj2CenterX = obj2.getBBox().getCenterX();
double obj2CenterY = obj2.getBBox().getCenterY();
double obj1obj2VectorCosineAbsLeft = Math.abs((obj2X - obj1X) / Math.sqrt((obj2X - obj1X) * (obj2X - obj1X) + (obj2CenterY - obj1CenterY) * (obj2CenterY - obj1CenterY)));
double obj1obj2VectorCosineAbsRight = Math.abs((obj2X_2 - obj1X_2) / Math.sqrt((obj2X_2 - obj1X_2) * (obj2X_2 - obj1X_2) + (obj2CenterY - obj1CenterY) * (obj2CenterY - obj1CenterY)));
double obj1obj2VectorCosineAbsCenter = Math.abs((obj2CenterX - obj1CenterX) / Math.sqrt((obj2CenterX - obj1CenterX) * (obj2CenterX - obj1CenterX) + (obj2CenterY - obj1CenterY) * (obj2CenterY - obj1CenterY)));
double cosine = Math.min(obj1obj2VectorCosineAbsLeft, Math.min(obj1obj2VectorCosineAbsRight, obj1obj2VectorCosineAbsCenter));
final double MAGIC_COEFF = 0.85;
//return dist * (MAGIC_COEFF + cosine);
return Math.sqrt(Math.pow((obj1X - obj2X), 2) + Math.pow((obj1Y_2 - obj2Y_2) * MAGIC_COEFF, 2));
/**if (Math.abs(obj1CenterX - obj2CenterX) >= Math.abs(obj1CenterY - obj2CenterY)) {
return dist * 2;
} else {
return dist;
}**/
}
private double distanceNew(BoundingBox obj1, BoundingBox obj2) {
if(obj1.getBBox().intersects(obj2.getBBox()))
return -1;
double minX0 = Math.min(obj1.getX(), obj2.getX());
double maxX0 = Math.max(obj1.getX(), obj2.getX());
double minY0 = Math.min(obj1.getY(), obj2.getY());
double maxY0 = Math.max(obj1.getY(), obj2.getY());
double minX1 = Math.min(obj1.getX() + obj1.getWidth(), obj2.getX() + obj2.getWidth());
double maxX1 = Math.max(obj1.getX() + obj1.getWidth(), obj2.getX() + obj2.getWidth());
double minY1 = Math.min(obj1.getY() + obj1.getHeight(), obj2.getY() + obj2.getHeight());
double maxY1 = Math.max(obj1.getY() + obj1.getHeight(), obj2.getY() + obj2.getHeight());
List<Double> xValues = new ArrayList<>(List.of(minX0, maxX0, minX1, maxX1));
Collections.sort(xValues);
List<Double> yValues = new ArrayList<>(List.of(minY0, maxY0, minY1, maxY1));
Collections.sort(yValues);
double yArea = (xValues.get(2) - xValues.get(1)) * (yValues.get(3) - yValues.get(0));
double xArea = (yValues.get(2) - yValues.get(1)) * (xValues.get(3) - xValues.get(0));
return Math.min(10*yArea, xArea);
}
}

View File

@ -0,0 +1,56 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service;
import java.util.List;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.AngleFilter;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Character;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Histogram;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Neighbor;
/**
 * Estimates typical spacings from the neighbour distances of a set of characters.
 */
@Service
public class SpacingService {

    private static final double SPACING_HISTOGRAM_RESOLUTION = 0.5;
    private static final double SPACING_HISTOGRAM_SMOOTHING_LENGTH = 2.5;
    private static final double SPACING_HIST_SMOOTHING_STANDARD_DEVIATION = 0.5;
    private static final double ANGLE_TOLERANCE = Math.PI / 6;

    /**
     * @param components characters with their neighbours set
     * @return the spacing estimate for neighbours around angle 0
     */
    public double computeCharacterSpacing(List<Character> components) {
        return computeSpacing(components, 0);
    }

    /**
     * @param components characters with their neighbours set
     * @return the spacing estimate for neighbours around angle {@code Math.PI / 2}
     */
    public double computeLineSpacing(List<Character> components) {
        return computeSpacing(components, Math.PI / 2);
    }

    /**
     * Builds a histogram of the distances of all neighbours accepted by an angle filter of
     * {@code angle ± ANGLE_TOLERANCE}, smooths it and returns its peak value. The histogram ranges from 0 to the
     * largest neighbour distance found among all neighbours (regardless of angle).
     */
    private double computeSpacing(List<Character> components, double angle) {

        Histogram histogram = new Histogram(0, largestNeighborDistance(components), SPACING_HISTOGRAM_RESOLUTION);
        AngleFilter filter = AngleFilter.newInstance(angle - ANGLE_TOLERANCE, angle + ANGLE_TOLERANCE);

        for (Character component : components) {
            for (Neighbor neighbor : component.getNeighbors()) {
                if (!filter.matches(neighbor)) {
                    continue;
                }
                histogram.add(neighbor.getDistance());
            }
        }

        histogram.gaussianSmooth(SPACING_HISTOGRAM_SMOOTHING_LENGTH, SPACING_HIST_SMOOTHING_STANDARD_DEVIATION);
        return histogram.getPeakValue();
    }

    /**
     * @return the largest distance over all neighbours of all components, or negative infinity if there are none
     */
    private static double largestNeighborDistance(List<Character> components) {

        double largest = Double.NEGATIVE_INFINITY;
        for (Character component : components) {
            for (Neighbor neighbor : component.getNeighbors()) {
                largest = Math.max(largest, neighbor.getDistance());
            }
        }
        return largest;
    }
}

View File

@ -0,0 +1,94 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service;
import java.util.ArrayList;
import java.util.List;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.DisjointSets;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Line;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Zone;
/**
 * Groups lines into zones.
 */
@Service
public class ZoneBuilderService {

    private static final double MIN_HORIZONTAL_DISTANCE_MULTIPLIER = -0.5;
    private static final double MAX_VERTICAL_DISTANCE_MULTIPLIER = 1.2;
    private static final double MIN_HORIZONTAL_MERGE_DISTANCE_MULTIPLIER = -3.0;
    private static final double MAX_VERTICAL_MERGE_DISTANCE_MULTIPLIER = 0.5;
    private static final double MIN_LINE_SIZE_SCALE = 0.9;
    private static final double MAX_LINE_SIZE_SCALE = 2.5;
    private static final double ANGLE_TOLERANCE = Math.PI / 6;

    public static final int MAX_ZONES = 300;

    /**
     * Builds zones by uniting pairs of lines whose angular difference is within {@code ANGLE_TOLERANCE} and whose
     * scaled horizontal and vertical distances satisfy one of two threshold pairs derived from the character and
     * line spacing. Distances are divided by a scale factor: the smaller height of the two lines relative to the
     * weighted mean line height, clamped to [{@code MIN_LINE_SIZE_SCALE}, {@code MAX_LINE_SIZE_SCALE}].
     * <p>
     * If more than {@code MAX_ZONES} zones result, all lines are put into one single zone, returned in an immutable
     * list.
     *
     * @param lines            lines to group
     * @param characterSpacing character spacing, scales the horizontal thresholds
     * @param lineSpacing      line spacing, scales the vertical thresholds
     * @return the zones built from the lines
     */
    public List<Zone> buildZones(List<Line> lines, double characterSpacing, double lineSpacing) {

        double minHorizontalDistance = characterSpacing * MIN_HORIZONTAL_DISTANCE_MULTIPLIER;
        double maxVerticalDistance = lineSpacing * MAX_VERTICAL_DISTANCE_MULTIPLIER;
        double minHorizontalMergeDistance = characterSpacing * MIN_HORIZONTAL_MERGE_DISTANCE_MULTIPLIER;
        double maxVerticalMergeDistance = lineSpacing * MAX_VERTICAL_MERGE_DISTANCE_MULTIPLIER;

        DisjointSets<Line> sets = new DisjointSets<>(lines);

        double meanHeight = calculateMeanHeight(lines);

        for (Line outerLine : lines) {
            for (Line innerLine : lines) {
                double rawScale = Math.min(outerLine.getHeight(), innerLine.getHeight()) / meanHeight;
                double scale = Math.max(MIN_LINE_SIZE_SCALE, Math.min(rawScale, MAX_LINE_SIZE_SCALE));

                if (sets.areTogether(outerLine, innerLine) || outerLine.angularDifference(innerLine) > ANGLE_TOLERANCE) {
                    continue;
                }

                double horizontalDistance = outerLine.horizontalDistance(innerLine) / scale;
                double verticalDistance = outerLine.verticalDistance(innerLine) / scale;

                // regular neighbourhood thresholds
                boolean closeEnough = minHorizontalDistance <= horizontalDistance && verticalDistance <= maxVerticalDistance;
                // split line that needs later merging
                boolean splitLine = minHorizontalMergeDistance <= horizontalDistance && verticalDistance <= maxVerticalMergeDistance;

                if (closeEnough || splitLine) {
                    sets.union(outerLine, innerLine);
                }
            }
        }

        List<Zone> zones = new ArrayList<>();
        sets.forEach(group -> zones.add(new Zone(new ArrayList<>(group))));

        if (zones.size() <= MAX_ZONES) {
            return zones;
        }

        // too many zones: collapse everything into a single one
        List<Line> oneZoneLines = new ArrayList<>();
        for (Zone zone : zones) {
            oneZoneLines.addAll(zone.getLines());
        }
        return List.of(new Zone(oneZoneLines));
    }

    /**
     * @return the mean line height weighted by line length; the division is not guarded, so the result is NaN when
     * the total length is zero
     */
    private double calculateMeanHeight(List<Line> lines) {

        double weightedHeights = 0.0;
        double totalWeight = 0.0;
        for (Line line : lines) {
            double weight = line.getLength();
            weightedHeights += line.getHeight() * weight;
            totalWeight += weight;
        }
        return weightedHeights / totalWeight;
    }
}

View File

@ -0,0 +1,18 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.utils;
/**
 * Helpers for comparing floating point numbers.
 */
public class DoubleUtils {

    /**
     * Compares two doubles after snapping both to multiples of {@code precision}.
     * <p>
     * Each value is divided by the precision and rounded to the nearest long; the rounded values are compared. A
     * precision of 0 is replaced by 1. If either value is NaN the result of {@link Double#compare(double, double)}
     * on the raw values is returned.
     *
     * @param d1        first value
     * @param d2        second value
     * @param precision grid size used for rounding
     * @return a negative number, zero or a positive number if d1 is less than, equal to or greater than d2
     */
    public static int compareDouble(double d1, double d2, double precision) {

        if (Double.isNaN(d1) || Double.isNaN(d2)) {
            return Double.compare(d1, d2);
        }
        double step = precision == 0 ? 1 : precision;
        long rounded1 = Math.round(d1 / step);
        long rounded2 = Math.round(d2 / step);
        return Long.compare(rounded1, rounded2);
    }
}

View File

@ -0,0 +1,270 @@
package com.knecon.fforesight.service.layoutparser.processor.services.visualization;
import java.awt.Color;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.pdmodel.font.PDType1Font;
import org.apache.pdfbox.pdmodel.font.Standard14Fonts;
import org.apache.pdfbox.util.Matrix;
import org.springframework.core.io.ClassPathResource;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.utils.PdfVisualisationUtility;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Getter;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
import lombok.experimental.UtilityClass;
/**
 * Drawing helpers for visual debugging of layout parsing results: rectangles, rulings, text labels
 * and document-graph nodes are painted on top of the existing content of PDF pages.
 * <p>
 * Page numbers passed to the methods of this class are 1-based; they are converted to PDFBox's
 * zero-based page index internally.
 */
@UtilityClass
public class PdfDraw {

    /**
     * Loads a PDF from the classpath, outlines the given rectangles on each page and saves the result.
     *
     * @param filename          classpath location of the PDF to draw on
     * @param rectanglesPerPage rectangles to outline, indexed by zero-based page index; an entry is read for
     *                          every page of the PDF
     * @param tmpFileName       path of the file the annotated PDF is written to
     * @throws IOException if the PDF cannot be loaded or the output file cannot be written
     */
    public static void drawRectanglesPerPage(String filename, List<List<Rectangle2D>> rectanglesPerPage, String tmpFileName) throws IOException {

        ClassPathResource pdfResource = new ClassPathResource(filename);
        try (PDDocument pdDocument = Loader.loadPDF(pdfResource.getFile()); var out = new FileOutputStream(tmpFileName)) {
            for (int pageNumber = 1; pageNumber <= pdDocument.getNumberOfPages(); pageNumber++) {
                PdfVisualisationUtility.drawRectangle2DList(pdDocument,
                        pageNumber,
                        rectanglesPerPage.get(pageNumber - 1),
                        PdfVisualisationUtility.Options.builder().stroke(true).build());
            }
            pdDocument.save(out);
        }
    }


    /**
     * Loads a PDF from the classpath, outlines the rectangles of every line and writes the zero-based
     * line index to the left of the first rectangle of each line, then saves the result.
     * <p>
     * Lines without any rectangle are skipped, because there is no position to anchor a label to; they
     * still consume their line index.
     *
     * @param filename          classpath location of the PDF to draw on
     * @param rectanglesPerPage per page (zero-based index) the list of lines, each line being a list of rectangles
     * @param tmpFileName       path of the file the annotated PDF is written to
     * @throws IOException if the PDF cannot be loaded or the output file cannot be written
     */
    public static void drawRectanglesPerPageNumberedByLine(String filename, List<List<List<Rectangle2D>>> rectanglesPerPage, String tmpFileName) throws IOException {

        ClassPathResource pdfResource = new ClassPathResource(filename);
        try (PDDocument pdDocument = Loader.loadPDF(pdfResource.getFile()); var out = new FileOutputStream(tmpFileName)) {
            for (int pageNumber = 1; pageNumber <= pdDocument.getNumberOfPages(); pageNumber++) {
                var rectanglesOnPage = rectanglesPerPage.get(pageNumber - 1);
                for (int lineNumber = 0; lineNumber < rectanglesOnPage.size(); lineNumber++) {
                    var rectanglesInLine = rectanglesOnPage.get(lineNumber);
                    if (rectanglesInLine.isEmpty()) {
                        // Nothing to outline and no first rectangle to place the label next to.
                        continue;
                    }
                    PdfVisualisationUtility.drawRectangle2DList(pdDocument, pageNumber, rectanglesInLine, PdfVisualisationUtility.Options.builder().stroke(true).build());

                    Rectangle2D firstInLine = rectanglesInLine.get(0);
                    double y = Math.min(firstInLine.getMinY(), firstInLine.getMaxY());
                    // Shift the label further left the more digits it has, so it does not overlap the rectangle.
                    double x = firstInLine.getX() - (5 + (5 * countNumberOfDigits(lineNumber)));
                    PdfVisualisationUtility.drawText(String.format("%d", lineNumber),
                            pdDocument,
                            new Point2D.Double(x, y + 2),
                            pageNumber,
                            PdfVisualisationUtility.Options.builder().stroke(true).build());
                }
            }
            pdDocument.save(out);
        }
    }


    /**
     * Counts the decimal digits of a number; the sign is ignored and {@code 0} has one digit.
     */
    private static int countNumberOfDigits(int num) {

        int digits = 1;
        for (int remaining = num / 10; remaining != 0; remaining /= 10) {
            digits++;
        }
        return digits;
    }


    /**
     * Draws every entry of the document tree of the given graph onto the PDF.
     *
     * @param document      PDF to draw on
     * @param documentGraph graph whose tree entries are visualised
     */
    public static void drawDocumentGraph(PDDocument document, Document documentGraph) {

        documentGraph.getDocumentTree().allEntriesInOrder().forEach(entry -> drawNode(document, entry));
    }


    /**
     * Draws the bounding box and a label for a single tree entry, coloured by the entry's node type.
     *
     * @param document PDF to draw on
     * @param entry    tree entry to visualise
     */
    public static void drawNode(PDDocument document, DocumentTree.Entry entry) {

        Options options = buildStandardOptionsForNodes(entry);
        drawBBoxAndLabelAndNumberOnPage(document, entry, options);
    }


    /**
     * Draws the positions of all atomic text blocks contained in the given text block.
     *
     * @param document  PDF to draw on
     * @param textBlock text block to visualise
     * @param options   stroke and fill settings
     */
    public static void drawTextBlock(PDDocument document, TextBlock textBlock, Options options) {

        textBlock.getAtomicTextBlocks().forEach(atb -> drawAtomicTextBlock(document, atb, options));
    }


    /**
     * Draws the positions of an atomic text block on the page the block belongs to.
     *
     * @param document        PDF to draw on
     * @param atomicTextBlock block whose positions are drawn
     * @param options         stroke and fill settings
     */
    public static void drawAtomicTextBlock(PDDocument document, AtomicTextBlock atomicTextBlock, Options options) {

        drawRectangle2DList(document, atomicTextBlock.getPage().getNumber(), atomicTextBlock.getPositions().stream().toList(), options);
    }


    /**
     * Writes a string in 10pt Helvetica at the given location, appended to the page's existing content.
     * The text is painted in the stroke colour of the options.
     *
     * @param string     text to write
     * @param document   PDF to draw on
     * @param location   start position of the text
     * @param pageNumber 1-based page number
     * @param options    supplies colour and line width
     * @param rotate     if {@code true} the text is rotated by 15 degrees around its start position
     */
    @SneakyThrows
    private static void drawText(String string, PDDocument document, Point2D location, Integer pageNumber, Options options, boolean rotate) {

        var pdPage = document.getPage(pageNumber - 1);
        // try-with-resources closes the content stream even if one of the drawing calls throws.
        try (var contentStream = new PDPageContentStream(document, pdPage, PDPageContentStream.AppendMode.APPEND, true)) {
            contentStream.setNonStrokingColor(options.getStrokeColor());
            contentStream.setLineWidth(options.getStrokeWidth());
            contentStream.beginText();
            if (rotate) {
                contentStream.setTextMatrix(Matrix.getRotateInstance(Math.toRadians(15), (float) location.getX(), (float) location.getY()));
            } else {
                contentStream.newLineAtOffset((float) location.getX(), (float) location.getY());
            }
            contentStream.setFont(new PDType1Font(Standard14Fonts.FontName.HELVETICA), 10);
            contentStream.showText(string);
            contentStream.endText();
        }
    }


    /**
     * Draws a list of rectangles on the given page.
     *
     * @param document       PDF to draw on
     * @param pageNumber     1-based page number
     * @param rectCollection rectangles to draw
     * @param options        stroke and fill settings
     */
    @SneakyThrows
    public static void drawRectangle2DList(PDDocument document, int pageNumber, List<Rectangle2D> rectCollection, Options options) {

        var pdPage = document.getPage(pageNumber - 1);
        drawRectangle2DList(document, rectCollection, options, pdPage);
    }


    /**
     * Draws a list of rectangles on the given page, appended to the page's existing content.
     * Each rectangle is stroked and/or filled according to the options; if neither is enabled the
     * rectangle path is added but never painted.
     *
     * @throws IOException if writing to the page's content stream fails
     */
    private static void drawRectangle2DList(PDDocument document, List<Rectangle2D> rectCollection, Options options, PDPage pdPage) throws IOException {

        // try-with-resources closes the content stream even if one of the drawing calls throws.
        try (var contentStream = new PDPageContentStream(document, pdPage, PDPageContentStream.AppendMode.APPEND, true)) {
            contentStream.setStrokingColor(options.getStrokeColor());
            contentStream.setNonStrokingColor(options.getFillColor());
            contentStream.setLineWidth(options.getStrokeWidth());

            for (var r : rectCollection) {
                contentStream.addRect((float) r.getMinX(), (float) r.getMinY(), (float) r.getWidth(), (float) r.getHeight());
                if (options.isStroke() && options.isFill()) {
                    contentStream.fillAndStroke();
                } else if (options.isStroke()) {
                    contentStream.stroke();
                } else if (options.isFill()) {
                    contentStream.fill();
                }
            }
        }
    }


    /**
     * Loads a PDF from the classpath, outlines two sets of rectangles on each page and saves the result.
     * Both sets are drawn as stroked rectangles: first {@code rectanglesPerPage}, then {@code list}.
     *
     * @param filename          classpath location of the PDF to draw on
     * @param list              second set of rectangles, indexed by zero-based page index
     * @param rectanglesPerPage first set of rectangles, indexed by zero-based page index
     * @param tmpFileName       path of the file the annotated PDF is written to
     */
    @SneakyThrows
    public static void drawRectanglesAndLinesPerPage(String filename, List<List<Rectangle2D>> list, List<List<Rectangle2D>> rectanglesPerPage, String tmpFileName) {

        ClassPathResource pdfResource = new ClassPathResource(filename);
        try (PDDocument pdDocument = Loader.loadPDF(pdfResource.getFile()); var out = new FileOutputStream(tmpFileName)) {
            for (int pageNumber = 1; pageNumber <= pdDocument.getNumberOfPages(); pageNumber++) {
                PdfVisualisationUtility.drawRectangle2DList(pdDocument,
                        pageNumber,
                        rectanglesPerPage.get(pageNumber - 1),
                        PdfVisualisationUtility.Options.builder().stroke(true).build());
                PdfVisualisationUtility.drawRectangle2DList(pdDocument, pageNumber, list.get(pageNumber - 1), PdfVisualisationUtility.Options.builder().stroke(true).build());
            }
            pdDocument.save(out);
        }
    }


    /**
     * Loads a PDF from the classpath, draws the given rulings in red on each page and saves the result.
     *
     * @param filename     classpath location of the PDF to draw on
     * @param linesPerPage rulings to draw, indexed by zero-based page index
     * @param tmpFileName  path of the file the annotated PDF is written to
     */
    @SneakyThrows
    public static void drawLinesPerPage(String filename, List<List<Ruling>> linesPerPage, String tmpFileName) {

        ClassPathResource pdfResource = new ClassPathResource(filename);
        try (PDDocument pdDocument = Loader.loadPDF(pdfResource.getFile()); var out = new FileOutputStream(tmpFileName)) {
            for (int pageNumber = 1; pageNumber <= pdDocument.getNumberOfPages(); pageNumber++) {
                PdfVisualisationUtility.drawLine2DList(pdDocument,
                        pageNumber,
                        linesPerPage.get(pageNumber - 1),
                        PdfVisualisationUtility.Options.builder().strokeColor(Color.RED).stroke(true).build());
            }
            pdDocument.save(out);
        }
    }


    /**
     * Immutable stroke and fill settings for the drawing methods of this class.
     * Stroke and fill are both disabled unless switched on through the builder.
     */
    @Builder
    @AllArgsConstructor
    @Getter
    @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
    public static class Options {

        // Whether the outline is drawn.
        boolean stroke;
        // Colour of the outline; also used as the text colour for labels.
        @Builder.Default
        Color strokeColor = Color.BLACK;
        // Line width of the outline.
        @Builder.Default
        float strokeWidth = 1f;
        // Whether the interior is filled.
        boolean fill;
        // Colour of the interior.
        @Builder.Default
        Color fillColor = Color.BLACK;

    }


    /**
     * Builds stroke-only options whose colour depends on the node type of the entry.
     */
    private static Options buildStandardOptionsForNodes(DocumentTree.Entry entry) {

        Color colorForType = switch (entry.getType()) {
            case DOCUMENT -> Color.LIGHT_GRAY;
            case HEADER, FOOTER -> Color.GREEN;
            case PARAGRAPH -> Color.BLUE;
            case HEADLINE -> Color.RED;
            case SECTION -> Color.BLACK;
            case TABLE -> Color.ORANGE;
            case TABLE_CELL -> Color.GRAY;
            case IMAGE -> Color.MAGENTA;
        };
        return Options.builder().stroke(true).strokeColor(colorForType).build();
    }


    /**
     * Draws the bounding box of the entry's node on every page it has a box for, together with a label
     * placed at the box's minimum x and 2 units beyond its maximum y.
     * Section boxes are padded via {@code RectangleTransformations.pad(box, 10, 10)} before drawing, and
     * labels of table cells are drawn rotated.
     */
    private static void drawBBoxAndLabelAndNumberOnPage(PDDocument document, DocumentTree.Entry entry, Options options) {

        Map<Page, Rectangle2D> rectanglesPerPage = entry.getNode().getBBox();
        // Iterate the entries directly instead of looking every key up again.
        for (Map.Entry<Page, Rectangle2D> pageAndBBox : rectanglesPerPage.entrySet()) {
            Page page = pageAndBBox.getKey();
            Rectangle2D rectangle2D = pageAndBBox.getValue();
            if (entry.getType() == NodeType.SECTION) {
                rectangle2D = RectangleTransformations.pad(rectangle2D, 10, 10);
            }
            drawRectangle2DList(document, page.getNumber(), List.of(rectangle2D), options);
            drawText(buildString(entry),
                    document,
                    new Point2D.Double(rectangle2D.getMinX(), rectangle2D.getMaxY() + 2),
                    page.getNumber(),
                    options,
                    entry.getType() == NodeType.TABLE_CELL);
        }
    }


    /**
     * Builds the label of an entry as {@code <number on page>: <tree id>: <node type>}.
     */
    private static String buildString(DocumentTree.Entry entry) {

        return entry.getNode().getNumberOnPage() + ": " + entry.getTreeId() + ": " + entry.getType();
    }

}

View File

@ -25,7 +25,9 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
@SneakyThrows @SneakyThrows
public void testLayoutParserEndToEnd() { public void testLayoutParserEndToEnd() {
prepareStorage("files/bdr/Wie weiter bei Kristeneinrichtungen.pdf"); String s = "";
String s1 = "";
prepareStorage("files/Minimal Examples/WrongOrderPage1.pdf");
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER); LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER);
LayoutParsingFinishedEvent finishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest); LayoutParsingFinishedEvent finishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest);
Arrays.stream(finishedEvent.message().split("\n")).forEach(log::info); Arrays.stream(finishedEvent.message().split("\n")).forEach(log::info);

View File

@ -25,7 +25,7 @@ public class ViewerDocumentTest extends BuildDocumentTest {
@SneakyThrows @SneakyThrows
public void testViewerDocument() { public void testViewerDocument() {
String fileName = "files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf"; String fileName = "files/Plenarprotokoll 1 (keine Druchsache!) (1).pdf";
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf"; String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
var documentFile = new ClassPathResource(fileName).getFile(); var documentFile = new ClassPathResource(fileName).getFile();
@ -35,9 +35,10 @@ public class ViewerDocumentTest extends BuildDocumentTest {
Document document = buildGraph(fileName, LayoutParsingType.REDACT_MANAGER); Document document = buildGraph(fileName, LayoutParsingType.REDACT_MANAGER);
long start = System.currentTimeMillis(); long start = System.currentTimeMillis();
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true); layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true);
System.out.printf("Total time: %.2fs%n", ((float) (System.currentTimeMillis() - start)) / 1000); System.out.printf("Total time: %.2fs%n", ((float) (System.currentTimeMillis() - start)) / 1000);
} }
@Test @Test
@Disabled @Disabled
@SneakyThrows @SneakyThrows
@ -51,7 +52,11 @@ public class ViewerDocumentTest extends BuildDocumentTest {
var tableResponse = mapper.readValue(new ClassPathResource(tableFileName).getInputStream(), TableServiceResponse.class); var tableResponse = mapper.readValue(new ClassPathResource(tableFileName).getInputStream(), TableServiceResponse.class);
var documentFile = new ClassPathResource(fileName).getFile(); var documentFile = new ClassPathResource(fileName).getFile();
var classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE, documentFile, new ImageServiceResponse(), tableResponse, Path.of(fileName).getFileName().toFile().toString()); var classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE,
documentFile,
new ImageServiceResponse(),
tableResponse,
Path.of(fileName).getFileName().toFile().toString());
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null); ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null);
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService); LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
Document document = DocumentGraphFactory.buildDocumentGraph(classificationDocument); Document document = DocumentGraphFactory.buildDocumentGraph(classificationDocument);

View File

@ -48,7 +48,7 @@ class GapAcrossLinesDetectionServiceTest {
@Test @Test
@Disabled //@Disabled
@SneakyThrows @SneakyThrows
public void testColumnDetection() { public void testColumnDetection() {