Compare commits

...

10 Commits
main ... rOrder

Author SHA1 Message Date
Dominique Eifländer
08e994d904 More refactoring 2024-02-20 14:04:28 +01:00
Dominique Eifländer
0c8c727303 Reading Order 2024-02-20 11:56:43 +01:00
Dominique Eifländer
72202f63dc More 2024-02-16 14:51:26 +01:00
Dominique Eifländer
e14d953b04 More 2024-02-16 14:50:31 +01:00
Dominique Eifländer
9e5778d4b2 More 2024-02-16 14:08:59 +01:00
Dominique Eifländer
e394f2fa7c More refactoring 2024-02-16 13:48:03 +01:00
Dominique Eifländer
b2fb6829cb More refactoring 2024-02-16 11:15:44 +01:00
Dominique Eifländer
4871e55f2d More refactoring 2024-02-15 16:54:07 +01:00
Dominique Eifländer
4de6c12aec REmove more 2024-02-15 10:29:39 +01:00
Dominique Eifländer
4afa8daafa First working docstrum 2024-02-14 13:39:27 +01:00
36 changed files with 1596 additions and 17 deletions

View File

@ -10,6 +10,7 @@ import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicReference;
import org.apache.pdfbox.Loader;
@ -26,6 +27,7 @@ import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsi
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationSection;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
@ -47,6 +49,7 @@ import com.knecon.fforesight.service.layoutparser.processor.services.blockificat
import com.knecon.fforesight.service.layoutparser.processor.services.classification.DocuMineClassificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.classification.TaasClassificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.DocstrumSegmentationService;
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper;
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.TaasDocumentDataMapper;
@ -86,6 +89,7 @@ public class LayoutParsingPipeline {
RedactManagerBlockificationService redactManagerBlockificationService;
LayoutGridService layoutGridService;
ObservationRegistry observationRegistry;
DocstrumSegmentationService docstrumSegmentationService;
public LayoutParsingFinishedEvent parseLayoutAndSaveFilesToStorage(LayoutParsingRequest layoutParsingRequest) throws IOException {
@ -243,11 +247,37 @@ public class LayoutParsingPipeline {
PDRectangle cropbox = pdPage.getCropBox();
CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(pdfTableCells.get(pageNumber), stripper.getRulings());
ClassificationPage classificationPage = switch (layoutParsingType) {
case REDACT_MANAGER -> redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
case TAAS -> taasBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
case DOCUMINE -> docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
};
// Docstrum
AtomicInteger num = new AtomicInteger(pageNumber);
var zones = docstrumSegmentationService.segmentPage(stripper.getTextPositionSequences());
List<AbstractPageBlock> pageBlocks = new ArrayList<>();
AtomicInteger numOnPage = new AtomicInteger(1);
// List<TextPositionSequence> textPositionSequences = new ArrayList<>();
zones.forEach(zone -> {
List<TextPositionSequence> textPositionSequences = new ArrayList<>();
zone.getLines().forEach(line -> {
line.getWords().forEach(word -> {
textPositionSequences.add(new TextPositionSequence(word.getTextPositions(), num.get()));
});
});
var cps = redactManagerBlockificationService.blockify(textPositionSequences, cleanRulings.getHorizontal(), cleanRulings.getVertical());
cps.getTextBlocks().forEach(cp -> {
pageBlocks.add(redactManagerBlockificationService.buildTextBlock(((TextPageBlock) cp).getSequences(), numOnPage.getAndIncrement()));
});
});
// ClassificationPage classificationPage = switch (layoutParsingType) {
// case REDACT_MANAGER -> redactManagerBlockificationService.blockify(textPositionSequences, cleanRulings.getHorizontal(), cleanRulings.getVertical());
// case TAAS -> taasBlockificationService.blockify(textPositionSequences, cleanRulings.getHorizontal(), cleanRulings.getVertical());
// case DOCUMINE -> docuMineBlockificationService.blockify(textPositionSequences, cleanRulings.getHorizontal(), cleanRulings.getVertical());
// };
ClassificationPage classificationPage = new ClassificationPage(pageBlocks);
classificationPage.setCleanRulings(cleanRulings);
classificationPage.setRotation(rotation);
classificationPage.setLandscape(isLandscape);
@ -283,9 +313,19 @@ public class LayoutParsingPipeline {
case REDACT_MANAGER -> redactManagerClassificationService.classifyDocument(classificationDocument);
}
List<ClassificationSection> sections = new ArrayList<>();
for (var page : classificationPages) {
page.getTextBlocks().forEach(block -> {
block.setPage(page.getPageNumber());
var section = sectionsBuilderService.buildTextBlock(List.of(block), "a");
sections.add(section);
});
}
classificationDocument.setSections(sections);
log.info("Building Sections for {}", identifier);
sectionsBuilderService.buildSections(classificationDocument);
sectionsBuilderService.addImagesToSections(classificationDocument);
// sectionsBuilderService.buildSections(classificationDocument);
// sectionsBuilderService.addImagesToSections(classificationDocument);
return classificationDocument;
}

View File

@ -12,15 +12,16 @@ import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRul
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
import lombok.Data;
import lombok.Getter;
import lombok.NonNull;
import lombok.RequiredArgsConstructor;
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
@Data
@RequiredArgsConstructor
public class ClassificationPage {
@NonNull
@Getter
private List<AbstractPageBlock> textBlocks;
private List<ClassifiedImage> images = new ArrayList<>();

View File

@ -45,6 +45,9 @@ public class RedTextPosition {
@JsonIgnore
private String fontName;
@JsonIgnore
private RedTextPosition parent;
@SneakyThrows
public static RedTextPosition fromTextPosition(TextPosition textPosition) {

View File

@ -17,6 +17,7 @@ import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.Getter;
import lombok.NoArgsConstructor;
@EqualsAndHashCode(callSuper = true)
@ -27,6 +28,7 @@ import lombok.NoArgsConstructor;
public class TextPageBlock extends AbstractPageBlock {
@Builder.Default
@Getter
private List<TextPositionSequence> sequences = new ArrayList<>();
@JsonIgnore
@ -73,7 +75,7 @@ public class TextPageBlock extends AbstractPageBlock {
return sequences.get(0).getPageWidth();
}
public static TextPageBlock merge(List<TextPageBlock> textBlocksToMerge) {
@ -82,6 +84,7 @@ public class TextPageBlock extends AbstractPageBlock {
return fromTextPositionSequences(sequences);
}
public static TextPageBlock fromTextPositionSequences(List<TextPositionSequence> wordBlockList) {
TextPageBlock textBlock = null;
@ -133,7 +136,6 @@ public class TextPageBlock extends AbstractPageBlock {
}
/**
* Returns the minX value in pdf coordinate system.
* Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation.

View File

@ -55,6 +55,18 @@ public class TextPositionSequence implements CharSequence {
}
public TextPositionSequence(List<RedTextPosition> textPositions, int page) {
this.textPositions = textPositions;
this.page = page;
this.dir = TextDirection.fromDegrees(textPositions.get(0).getDir());
this.rotation = textPositions.get(0).getRotation();
this.pageHeight = textPositions.get(0).getPageHeight();
this.pageWidth = textPositions.get(0).getPageWidth();
this.isParagraphStart = false;
}
@Override
public int length() {

View File

@ -240,7 +240,7 @@ public class SectionsBuilderService {
}
private ClassificationSection buildTextBlock(List<AbstractPageBlock> wordBlockList, String lastHeadline) {
public ClassificationSection buildTextBlock(List<AbstractPageBlock> wordBlockList, String lastHeadline) {
ClassificationSection section = new ClassificationSection();

View File

@ -57,7 +57,7 @@ public class RedactManagerBlockificationService {
boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, horizontalRulingLines, verticalRulingLines);
boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir());
if (prev != null && (lineSeparation || startFromTop || splitByX || splitByDir || isSplitByRuling)) {
if (prev != null && (splitByDir || isSplitByRuling)) {
Orientation prevOrientation = null;
if (!chunkBlockList.isEmpty()) {
@ -167,7 +167,7 @@ public class RedactManagerBlockificationService {
}
private TextPageBlock buildTextBlock(List<TextPositionSequence> wordBlockList, int indexOnPage) {
public TextPageBlock buildTextBlock(List<TextPositionSequence> wordBlockList, int indexOnPage) {
TextPageBlock textBlock = null;

View File

@ -0,0 +1,153 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Character;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.DisjointSets;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Line;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Zone;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service.LineBuilderService;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service.NearestNeighbourService;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service.ReadingOrderService;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service.SpacingService;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service.ZoneBuilderService;
import lombok.RequiredArgsConstructor;
@Service
@RequiredArgsConstructor
public class DocstrumSegmentationService {
private static final double MAX_VERTICAL_MERGE_DIST = 0.5;
private final NearestNeighbourService nearestNeighbourService;
private final SpacingService spacingService;
private final LineBuilderService lineBuilderService;
private final ZoneBuilderService zoneBuilderService;
private final ReadingOrderService readingOrderService;
public List<Zone> segmentPage(List<TextPositionSequence> textPositions) {
var positions = textPositions.stream().map(TextPositionSequence::getTextPositions).flatMap(List::stream).toList();
var characters = positions.stream().map(Character::new).collect(Collectors.toList());
nearestNeighbourService.findNearestNeighbors(characters);
var characterSpacing = spacingService.computeCharacterSpacing(characters);
var lineSpacing = spacingService.computeLineSpacing(characters);
var lines = lineBuilderService.buildLines(characters, characterSpacing, lineSpacing);
var zones = zoneBuilderService.buildZones(lines, characterSpacing, lineSpacing);
zones = mergeLinesInZones(zones, characterSpacing, Double.NEGATIVE_INFINITY, 0.0, 0.0, lineSpacing * MAX_VERTICAL_MERGE_DIST);
return readingOrderService.resolve(zones, false);
}
// private List<Zone> mergeZones(List<Zone> zones, double tolerance) {
//
// List<BxBounds> bounds = new ArrayList<BxBounds>(zones.size());
// for (List<ComponentLine> zone : zones) {
// BxBoundsBuilder builder = new BxBoundsBuilder();
// for (ComponentLine line : zone) {
// for (Component component : line.getComponents()) {
// builder.expand(component.getChunk().getBounds());
// }
// }
// bounds.add(builder.getBounds());
// }
//
// List<List<ComponentLine>> outputZones = new ArrayList<List<ComponentLine>>();
// mainFor:
// for (int i = 0; i < zones.size(); i++) {
// for (int j = 0; j < zones.size(); j++) {
// if (i == j || bounds.get(j) == null || bounds.get(i) == null) {
// continue;
// }
// if (BxModelUtils.contains(bounds.get(j), bounds.get(i), tolerance)) {
// zones.get(j).addAll(zones.get(i));
// bounds.set(i, null);
// continue mainFor;
// }
// }
// outputZones.add(zones.get(i));
// }
// return outputZones;
// }
private List<Zone> mergeLinesInZones(List<Zone> zones,
double wordSpacing,
double minHorizontalDistance,
double maxHorizontalDistance,
double minVerticalDistance,
double maxVerticalDistance) {
List<Zone> outputZones = new ArrayList<>(zones.size());
for (Zone zone : zones) {
outputZones.add(mergeLinesInZone(zone, wordSpacing, minHorizontalDistance, maxHorizontalDistance, minVerticalDistance, maxVerticalDistance));
}
return outputZones;
}
private Zone mergeLinesInZone(Zone zone,
double wordSpacing,
double minHorizontalDistance,
double maxHorizontalDistance,
double minVerticalDistance,
double maxVerticalDistance) {
DisjointSets<Line> sets = new DisjointSets<>(zone.getLines());
for (int i = 0; i < zone.getLines().size(); i++) {
Line li = zone.getLines().get(i);
for (int j = i + 1; j < zone.getLines().size(); j++) {
Line lj = zone.getLines().get(j);
double hDist = li.horizontalDistance(lj);
double vDist = li.verticalDistance(lj);
if (minHorizontalDistance <= hDist && hDist <= maxHorizontalDistance && minVerticalDistance <= vDist && vDist <= maxVerticalDistance) {
sets.union(li, lj);
} else if (minVerticalDistance <= vDist && vDist <= maxVerticalDistance && Math.abs(hDist - Math.min(li.getLength(), lj.getLength())) < 0.1) {
boolean componentOverlap = false;
int overlappingCount = 0;
for (Character ci : li.getCharacters()) {
for (Character cj : lj.getCharacters()) {
double dist = ci.overlappingDistance(cj);
if (dist > 2) {
componentOverlap = true;
}
if (dist > 0) {
overlappingCount++;
}
}
}
if (!componentOverlap && overlappingCount <= 2) {
sets.union(li, lj);
}
}
}
}
List<Line> outputZone = new ArrayList<>();
for (Set<Line> group : sets) {
List<Character> components = new ArrayList<>();
for (Line line : group) {
components.addAll(line.getCharacters());
}
components.sort(Comparator.comparingDouble(Character::getX));
outputZone.add(new Line(components, wordSpacing));
}
return new Zone(outputZone);
}
}

View File

@ -0,0 +1,32 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model;
/**
 * Accepts neighbours whose angle lies in the half-open range {@code [lowerAngle, upperAngle)}.
 * <p>
 * A lower bound below {@code -PI/2} is shifted up by {@code PI} and an upper bound of
 * {@code PI/2} or more is shifted down by {@code PI}. After that shift the lower bound may be
 * greater than the upper bound; the range is then treated as wrapping around.
 */
public class AngleFilter {

    protected double lowerAngle;
    protected double upperAngle;


    /**
     * @param lowerAngle inclusive lower bound in radians
     * @param upperAngle exclusive upper bound in radians
     */
    public AngleFilter(double lowerAngle, double upperAngle) {

        this.lowerAngle = lowerAngle < -Math.PI / 2 ? lowerAngle + Math.PI : lowerAngle;
        this.upperAngle = upperAngle >= Math.PI / 2 ? upperAngle - Math.PI : upperAngle;
    }


    /**
     * @param neighbor the neighbour whose angle is tested
     * @return {@code true} if the neighbour's angle falls into the (possibly wrapped) range
     */
    public boolean matches(Neighbor neighbor) {

        double angle = neighbor.getAngle();
        boolean atLeastLower = lowerAngle <= angle;
        boolean belowUpper = angle < upperAngle;

        // Regular range: both bounds must hold; wrapped range: either bound is enough.
        return lowerAngle <= upperAngle ? (atLeastLower && belowUpper) : (atLeastLower || belowUpper);
    }
}

View File

@ -0,0 +1,48 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model;
import java.awt.geom.Rectangle2D;
import lombok.Data;
/**
 * Base class for elements that are described by an axis-aligned rectangle.
 * All accessors read the rectangle directly; it must be set before they are used.
 */
@Data
public abstract class BoundingBox {

    private Rectangle2D bBox;


    /** @return the x coordinate of the rectangle */
    public double getX() {

        return bBox.getX();
    }


    /** @return the y coordinate of the rectangle */
    public double getY() {

        return bBox.getY();
    }


    /** @return the width of the rectangle */
    public double getWidth() {

        return bBox.getWidth();
    }


    /** @return the height of the rectangle */
    public double getHeight() {

        return bBox.getHeight();
    }


    /** @return height times width of the rectangle */
    public double getArea() {

        double boxHeight = bBox.getHeight();
        double boxWidth = bBox.getWidth();
        return boxHeight * boxWidth;
    }


    /**
     * Tests whether {@code contained} lies inside this box when every edge of this box
     * is allowed to be off by {@code tolerance}.
     *
     * @param contained the rectangle expected to lie inside
     * @param tolerance the slack granted on each side
     * @return {@code true} if all four edges satisfy the relaxed containment
     */
    public boolean contains(Rectangle2D contained, double tolerance) {

        boolean leftEdgeOk = bBox.getX() <= contained.getX() + tolerance;
        boolean topEdgeOk = bBox.getY() <= contained.getY() + tolerance;
        boolean rightEdgeOk = bBox.getX() + bBox.getWidth() >= contained.getX() + contained.getWidth() - tolerance;
        boolean bottomEdgeOk = bBox.getY() + bBox.getHeight() >= contained.getY() + contained.getHeight() - tolerance;

        return leftEdgeOk && topEdgeOk && rightEdgeOk && bottomEdgeOk;
    }
}

View File

@ -0,0 +1,84 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
import lombok.Data;
/**
 * A single glyph used by the Docstrum segmentation.
 * <p>
 * {@code x} and {@code y} are the direction-adjusted position of the wrapped text position
 * shifted by half its width and half its height.
 */
@Data
public class Character {

    private final double x;
    private final double y;
    private final RedTextPosition textPosition;

    private List<Neighbor> neighbors = new ArrayList<>();


    /**
     * @param chunk the text position to wrap
     */
    public Character(RedTextPosition chunk) {

        this.textPosition = chunk;
        this.x = chunk.getXDirAdj() + chunk.getWidthDirAdj() / 2;
        this.y = chunk.getYDirAdj() + chunk.getHeightDir() / 2;
    }


    /** @return the height of the wrapped text position */
    public double getHeight() {

        return textPosition.getHeightDir();
    }


    /** @return the Euclidean distance between the two characters' {@code (x, y)} points */
    public double distance(Character character) {

        double deltaX = getX() - character.getX();
        double deltaY = getY() - character.getY();
        return Math.sqrt(deltaX * deltaX + deltaY * deltaY);
    }


    /** @return the absolute difference of the x values */
    public double horizontalDistance(Character character) {

        double deltaX = getX() - character.getX();
        return Math.abs(deltaX);
    }


    /** @return the absolute difference of the y values */
    public double verticalDistance(Character character) {

        double deltaY = getY() - character.getY();
        return Math.abs(deltaY);
    }


    /**
     * Measures how far the horizontal extents of the two characters overlap.
     * The extent of a character is taken as {@code [x, x + width]}.
     * NOTE(review): {@code x} is already offset by half the width, so this interval does not
     * match the glyph's box exactly - confirm this is intended.
     *
     * @return the size of the gap between the two middle interval bounds; positive if the
     *         intervals overlap, negative if they are apart
     */
    public double overlappingDistance(Character other) {

        // Rotation by a fixed angle of zero; kept so the arithmetic stays unchanged.
        final double sin = Math.sin(-0);
        final double cos = Math.cos(-0);

        double thisStart = cos * x - sin * y;
        double thisEnd = cos * (x + textPosition.getWidthDirAdj()) - sin * (y + textPosition.getHeightDir());
        double otherStart = cos * other.x - sin * other.y;
        double otherEnd = cos * (other.x + other.textPosition.getWidthDirAdj()) - sin * (other.y + other.textPosition.getHeightDir());

        boolean overlapping = thisEnd >= otherStart && otherEnd >= thisStart;

        double[] bounds = {thisStart, thisEnd, otherStart, otherEnd};
        Arrays.sort(bounds);
        double innerGap = Math.abs(bounds[2] - bounds[1]);
        return overlapping ? innerGap : -innerGap;
    }


    /** Replaces the list of neighbours of this character. */
    public void setNeighbors(List<Neighbor> neighbors) {

        this.neighbors = neighbors;
    }


    /**
     * Angle of the connection between both characters, always measured from the character
     * with the smaller (or equal) x towards the other one.
     *
     * @return the angle in radians as returned by {@link Math#atan2(double, double)}
     */
    public double angle(Character character) {

        boolean thisIsFurtherRight = getX() > character.getX();
        Character from = thisIsFurtherRight ? character : this;
        Character to = thisIsFurtherRight ? this : character;
        return Math.atan2(to.getY() - from.getY(), to.getX() - from.getX());
    }
}

View File

@ -0,0 +1,194 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model;
import java.util.AbstractSet;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Set;
public class DisjointSets<E> implements Iterable<Set<E>> {
private final Map<E, Entry<E>> map = new HashMap<>();
public DisjointSets(Collection<? extends E> collection) {
for (E element : collection) {
map.put(element, new Entry<E>(element));
}
}
public boolean areTogether(E e1, E e2) {
return map.get(e1).findRepresentative() == map.get(e2).findRepresentative();
}
public void union(E e1, E e2) {
Entry<E> r1 = map.get(e1).findRepresentative();
Entry<E> r2 = map.get(e2).findRepresentative();
if (r1 != r2) {
if (r1.size <= r2.size) {
r2.mergeWith(r1);
} else {
r1.mergeWith(r2);
}
}
}
@Override
public Iterator<Set<E>> iterator() {
return new Iterator<>() {
private final Iterator<Entry<E>> iterator = map.values().iterator();
private Entry<E> nextRepresentative;
{
findNextRepresentative();
}
@Override
public boolean hasNext() {
return nextRepresentative != null;
}
@Override
public Set<E> next() {
if (nextRepresentative == null) {
throw new NoSuchElementException();
}
Set<E> result = nextRepresentative.asSet();
findNextRepresentative();
return result;
}
private void findNextRepresentative() {
while (iterator.hasNext()) {
Entry<E> candidate = iterator.next();
if (candidate.isRepresentative()) {
nextRepresentative = candidate;
return;
}
}
nextRepresentative = null;
}
@Override
public void remove() {
throw new UnsupportedOperationException();
}
};
}
private static class Entry<E> {
private int size = 1;
private final E value;
private Entry<E> parent = this;
private Entry<E> next = null;
private Entry<E> last = this;
Entry(E value) {
this.value = value;
}
void mergeWith(Entry<E> otherRepresentative) {
size += otherRepresentative.size;
last.next = otherRepresentative;
last = otherRepresentative.last;
otherRepresentative.parent = this;
}
Entry<E> findRepresentative() {
Entry<E> representative = parent;
while (representative.parent != representative) {
representative = representative.parent;
}
for (Entry<E> entry = this; entry != representative; ) {
Entry<E> nextEntry = entry.parent;
entry.parent = representative;
entry = nextEntry;
}
return representative;
}
boolean isRepresentative() {
return parent == this;
}
Set<E> asSet() {
return new AbstractSet<E>() {
@Override
public Iterator<E> iterator() {
return new Iterator<E>() {
private Entry<E> nextEntry = findRepresentative();
@Override
public boolean hasNext() {
return nextEntry != null;
}
@Override
public E next() {
if (nextEntry == null) {
throw new NoSuchElementException();
}
E result = nextEntry.value;
nextEntry = nextEntry.next;
return result;
}
@Override
public void remove() {
throw new UnsupportedOperationException();
}
};
}
@Override
public int size() {
return findRepresentative().size;
}
};
}
}
}

View File

@ -0,0 +1,91 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model;
/**
 * Fixed-range histogram with equally wide bins.
 * <p>
 * The covered range is widened by a small epsilon on both sides so that the minimum and
 * maximum value themselves fall into the first and last bin.
 */
public class Histogram {

    private static final double EPSILON = 1.0e-6;
    /** Two neighbouring bins closer than this are considered part of the same peak plateau. */
    private static final double PLATEAU_TOLERANCE = 0.0001;

    private final double min;
    private final double resolution;
    private double[] frequencies;


    /**
     * @param minValue smallest value that will be added
     * @param maxValue largest value that will be added
     * @param resolution requested bin width; the effective width is adjusted so that a whole
     *        number of bins (at least one) covers the padded range
     */
    public Histogram(double minValue, double maxValue, double resolution) {

        int binCount = Math.max(1, (int) Math.round((maxValue - minValue) / resolution));
        double paddedRange = maxValue - minValue + 2 * EPSILON;

        this.min = minValue - EPSILON;
        this.resolution = paddedRange / binCount;
        this.frequencies = new double[binCount];
    }


    /**
     * Replaces the frequencies by their convolution with {@code kernel}; the kernel is centred
     * on index {@code (kernel.length - 1) / 2} and contributions falling outside are dropped.
     */
    public void kernelSmooth(double[] kernel) {

        int binCount = frequencies.length;
        double[] smoothed = new double[binCount];
        int centre = (kernel.length - 1) / 2;

        for (int k = 0; k < kernel.length; k++) {
            int offset = k - centre;
            int from = Math.max(0, offset);
            int to = Math.min(binCount, binCount + offset);
            for (int source = from; source < to; source++) {
                smoothed[source - offset] += kernel[k] * frequencies[source];
            }
        }
        frequencies = smoothed;
    }


    /**
     * Builds a normalized Gaussian kernel in units of bins.
     *
     * @param length window length in value units
     * @param stdDeviation standard deviation in value units
     * @return a kernel with an odd number of entries that sum up to one
     */
    public double[] createGaussianKernel(double length, double stdDeviation) {

        int radius = (int) Math.round(length / resolution) / 2;
        double sigma = stdDeviation / resolution;
        double b = 2 * sigma * sigma;
        double a = 1 / Math.sqrt(Math.PI * b);

        double[] kernel = new double[2 * radius + 1];
        double total = 0;
        for (int i = 0; i < kernel.length; i++) {
            int offset = i - radius;
            kernel[i] = a * Math.exp(-offset * offset / b);
            total += kernel[i];
        }
        for (int i = 0; i < kernel.length; i++) {
            kernel[i] /= total;
        }
        return kernel;
    }


    /** Smooths the histogram with a Gaussian kernel of the given window length and deviation. */
    public void gaussianSmooth(double windowLength, double stdDeviation) {

        double[] gaussian = createGaussianKernel(windowLength, stdDeviation);
        kernelSmooth(gaussian);
    }


    /** Increments the bin that {@code value} falls into by one. */
    public void add(double value) {

        int bin = (int) ((value - min) / resolution);
        frequencies[bin] += 1.0;
    }


    /** @return the number of bins */
    public int getSize() {

        return frequencies.length;
    }


    /**
     * @return the value at the middle of the first highest bin, extended to the right over
     *         directly following bins of (nearly) the same frequency
     */
    public double getPeakValue() {

        int peakStart = 0;
        for (int bin = 1; bin < frequencies.length; bin++) {
            if (frequencies[bin] > frequencies[peakStart]) {
                peakStart = bin;
            }
        }

        int peakEnd = peakStart + 1;
        while (peakEnd < frequencies.length && Math.abs(frequencies[peakEnd] - frequencies[peakStart]) < PLATEAU_TOLERANCE) {
            peakEnd++;
        }

        return ((double) peakStart + peakEnd) / 2 * resolution + min;
    }
}

View File

@ -0,0 +1,165 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import lombok.Data;
@Data
public class Line extends BoundingBox {
private static final double WORD_DISTANCE_MULTIPLIER = 0.2;
private final double x0;
private final double y0;
private final double x1;
private final double y1;
private final double height;
private final List<Character> characters;
private final List<TextPositionSequence> words = new ArrayList<>();
public Line(List<Character> characters, double wordSpacing) {
this.characters = characters;
if (characters.size() >= 2) {
// linear regression
double sx = 0.0, sxx = 0.0, sxy = 0.0, sy = 0.0;
for (Character character : characters) {
sx += character.getX();
sxx += character.getX() * character.getX();
sxy += character.getX() * character.getY();
sy += character.getY();
}
double b = (characters.size() * sxy - sx * sy) / (characters.size() * sxx - sx * sx);
double a = (sy - b * sx) / characters.size();
this.x0 = characters.get(0).getX();
this.y0 = a + b * this.x0;
this.x1 = characters.get(characters.size() - 1).getX();
this.y1 = a + b * this.x1;
} else {
Character character = characters.get(0);
double dx = character.getTextPosition().getWidthDirAdj() / 3;
double dy = dx * Math.tan(0);
this.x0 = character.getX() - dx;
this.x1 = character.getX() + dx;
this.y0 = character.getY() - dy;
this.y1 = character.getY() + dy;
}
height = computeHeight();
computeWords(wordSpacing * WORD_DISTANCE_MULTIPLIER);
buildBBox();
}
public double getAngle() {
return Math.atan2(y1 - y0, x1 - x0);
}
public double getLength() {
return Math.sqrt((x0 - x1) * (x0 - x1) + (y0 - y1) * (y0 - y1));
}
private double computeHeight() {
double sum = 0.0;
for (Character component : characters) {
sum += component.getHeight();
}
return sum / characters.size();
}
public double angularDifference(Line j) {
double diff = Math.abs(getAngle() - j.getAngle());
if (diff <= Math.PI / 2) {
return diff;
} else {
return Math.PI - diff;
}
}
public double horizontalDistance(Line other) {
double[] xs = new double[4];
xs[0] = x0;
xs[1] = x1;
xs[2] = other.x0;
xs[3] = other.x1;
boolean overlapping = xs[1] >= xs[2] && xs[3] >= xs[0];
Arrays.sort(xs);
return Math.abs(xs[2] - xs[1]) * (overlapping ? 1 : -1);
}
public double verticalDistance(Line other) {
double ym = (y0 + y1) / 2;
double yn = (other.y0 + other.y1) / 2;
return Math.abs(ym - yn) / Math.sqrt(1);
}
private void computeWords(double wordSpacing) {
TextPositionSequence word = new TextPositionSequence();
Character previous = null;
for (Character current : characters) {
if (previous != null) {
double dist = current.getTextPosition().getXDirAdj() - previous.getTextPosition().getXDirAdj() - previous.getTextPosition().getWidthDirAdj();
if (dist > wordSpacing) {
words.add(word);
word = new TextPositionSequence();
}
}
word.getTextPositions().add(current.getTextPosition());
previous = current;
}
words.add(word);
}
private void buildBBox() {
double minX = Double.POSITIVE_INFINITY;
double minY = Double.POSITIVE_INFINITY;
double maxX = Double.NEGATIVE_INFINITY;
double maxY = Double.NEGATIVE_INFINITY;
for (Character character : characters) {
minX = Math.min(minX, character.getTextPosition().getXDirAdj());
minY = Math.min(minY, character.getTextPosition().getYDirAdj());
maxX = Math.max(maxX, character.getTextPosition().getXDirAdj() + character.getTextPosition().getWidthDirAdj());
maxY = Math.max(maxY, character.getTextPosition().getYDirAdj() + character.getTextPosition().getHeightDir());
}
this.setBBox(new Rectangle2D.Double(minX, minY, maxX - minX, maxY - minY));
}
public String toString() {
StringBuilder sb = new StringBuilder();
words.forEach(word -> sb.append(word.toString()).append(" "));
return sb.toString().trim();
}
}

View File

@ -0,0 +1,36 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model;
import lombok.Getter;
/**
 * Relation between an origin character and one of its neighbouring characters.
 * Distance and angle are computed once at construction time.
 */
public class Neighbor {

    private final Character originCharacter;

    @Getter
    private final Character character;

    @Getter
    private final double distance;

    @Getter
    private final double angle;


    /**
     * @param neighbor the neighbouring character this object describes
     * @param origin the character the neighbour is measured from
     */
    public Neighbor(Character neighbor, Character origin) {

        this.character = neighbor;
        this.originCharacter = origin;
        this.distance = neighbor.distance(origin);
        this.angle = neighbor.angle(origin);
    }


    /** @return the horizontal distance between neighbour and origin */
    public double getHorizontalDistance() {

        return character.horizontalDistance(originCharacter);
    }


    /** @return the vertical distance between neighbour and origin */
    public double getVerticalDistance() {

        return character.verticalDistance(originCharacter);
    }
}

View File

@ -0,0 +1,50 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model;
import java.awt.geom.Rectangle2D;
import java.util.Comparator;
import java.util.List;
import lombok.Data;
/**
 * A block of lines that belong together.
 * The lines are kept sorted by their y coordinate and the bounding box encloses all of them.
 */
@Data
public class Zone extends BoundingBox {

    private List<Line> lines;


    /**
     * @param lines the lines of the zone; the given list is sorted in place by y and stored
     *        by reference, so it has to be modifiable
     */
    public Zone(List<Line> lines) {

        this.lines = lines;
        this.lines.sort(Comparator.comparingDouble(Line::getY));
        buildBBox();
    }


    /**
     * Recomputes the bounding box from the current lines, using each line's
     * {@code getX()}, {@code getY()}, {@code getWidth()} and {@code getHeight()}.
     */
    public void buildBBox() {

        double left = lines.stream().mapToDouble(line -> line.getX()).min().orElse(Double.POSITIVE_INFINITY);
        double top = lines.stream().mapToDouble(line -> line.getY()).min().orElse(Double.POSITIVE_INFINITY);
        double right = lines.stream().mapToDouble(line -> line.getX() + line.getWidth()).max().orElse(Double.NEGATIVE_INFINITY);
        double bottom = lines.stream().mapToDouble(line -> line.getY() + line.getHeight()).max().orElse(Double.NEGATIVE_INFINITY);

        this.setBBox(new Rectangle2D.Double(left, top, right - left, bottom - top));
    }


    /** @return the text of all lines, one line per row */
    public String toString() {

        StringBuilder text = new StringBuilder();
        for (Line line : lines) {
            text.append(line.toString()).append("\n");
        }
        return text.toString().trim();
    }
}

View File

@ -0,0 +1,50 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.AngleFilter;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Character;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.DisjointSets;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Line;
/**
 * Groups characters into text lines based on their previously computed neighbours.
 */
@Service
public class LineBuilderService {

    private static final double CHARACTER_SPACING_DISTANCE_MULTIPLIER = 3.5;
    private static final double MAX_VERTICAL_CHARACTER_DISTANCE = 0.67;
    private static final double ANGLE_TOLERANCE = Math.PI / 6;


    /**
     * Joins every character with those neighbours that pass the angle filter
     * ({@code +-PI/6}) and lie inside an ellipse with the half axes
     * {@code characterSpacing * 3.5} (horizontal) and {@code lineSpacing * 0.67} (vertical).
     * Each resulting group becomes one {@link Line} with its characters sorted by x.
     *
     * @param characters the characters of the page with their neighbours already set
     * @param characterSpacing the estimated spacing between characters
     * @param lineSpacing the estimated spacing between lines
     * @return the lines built from the character groups
     */
    public List<Line> buildLines(List<Character> characters, double characterSpacing, double lineSpacing) {

        final double horizontalRadius = characterSpacing * CHARACTER_SPACING_DISTANCE_MULTIPLIER;
        final double verticalRadius = lineSpacing * MAX_VERTICAL_CHARACTER_DISTANCE;

        DisjointSets<Character> lineGroups = new DisjointSets<>(characters);
        AngleFilter angleFilter = new AngleFilter(-ANGLE_TOLERANCE, ANGLE_TOLERANCE);

        for (Character character : characters) {
            for (var neighbor : character.getNeighbors()) {
                double scaledDx = neighbor.getHorizontalDistance() / horizontalRadius;
                double scaledDy = neighbor.getVerticalDistance() / verticalRadius;
                if (!angleFilter.matches(neighbor)) {
                    continue;
                }
                // Inside the unit circle after scaling means inside the ellipse.
                if (Math.pow(scaledDx, 2) + Math.pow(scaledDy, 2) <= 1) {
                    lineGroups.union(character, neighbor.getCharacter());
                }
            }
        }

        List<Line> result = new ArrayList<>();
        for (var group : lineGroups) {
            List<Character> ordered = new ArrayList<>(group);
            ordered.sort(Comparator.comparingDouble(Character::getX));
            result.add(new Line(ordered, characterSpacing));
        }
        return result;
    }
}

View File

@ -0,0 +1,78 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Character;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Neighbor;
/**
 * Determines for every character its nearest neighbours by Euclidean distance.
 */
@Service
public class NearestNeighbourService {

    private static final int NUMBER_OF_NEIGHBOURS = 8;
    private static final double STEP = 16.0;
    private static final Comparator<Neighbor> BY_DISTANCE = Comparator.comparingDouble(Neighbor::getDistance);


    /**
     * Finds up to {@value #NUMBER_OF_NEIGHBOURS} nearest neighbours of each character and stores
     * them on the character, ordered by ascending distance.
     * <p>
     * The given list is sorted in place by x. For each character the search window along x is
     * widened in steps of {@value #STEP} until it is at least as wide as the distance to the
     * farthest neighbour kept so far; characters outside that window cannot be closer.
     *
     * @param characters the characters of a page; an empty list is left untouched
     */
    public void findNearestNeighbors(List<Character> characters) {

        if (characters.isEmpty()) {
            return;
        }
        if (characters.size() == 1) {
            // A lone character has no neighbours. Without this guard the search loop below
            // would never find a candidate and keep widening its window forever.
            characters.get(0).setNeighbors(new ArrayList<>());
            return;
        }

        characters.sort(Comparator.comparingDouble(Character::getX));

        int maxNeighborCount = Math.min(NUMBER_OF_NEIGHBOURS, characters.size() - 1);

        for (int i = 0; i < characters.size(); i++) {
            Character origin = characters.get(i);
            List<Neighbor> candidates = new ArrayList<>();

            // [start, end) is the range of characters already examined around index i.
            int start = i;
            int end = i + 1;

            double distance = Double.POSITIVE_INFINITY;

            for (double searchDistance = 0; searchDistance < distance; ) {
                searchDistance += STEP;
                boolean newCandidatesFound = false;

                while (start > 0 && origin.getX() - characters.get(start - 1).getX() < searchDistance) {
                    start--;
                    candidates.add(new Neighbor(characters.get(start), origin));
                    clearLeastDistant(candidates, maxNeighborCount);
                    newCandidatesFound = true;
                }
                while (end < characters.size() && characters.get(end).getX() - origin.getX() < searchDistance) {
                    candidates.add(new Neighbor(characters.get(end), origin));
                    clearLeastDistant(candidates, maxNeighborCount);
                    end++;
                    newCandidatesFound = true;
                }

                if (newCandidatesFound && candidates.size() >= maxNeighborCount) {
                    // The list is only sorted when it overflows, so sort before reading the
                    // farthest kept candidate; otherwise the bound could be too small and
                    // end the search before closer characters were examined.
                    candidates.sort(BY_DISTANCE);
                    distance = candidates.get(maxNeighborCount - 1).getDistance();
                }
            }
            clearLeastDistant(candidates, maxNeighborCount);
            candidates.sort(BY_DISTANCE);
            origin.setNeighbors(new ArrayList<>(candidates));
        }
    }


    /**
     * Trims the candidates to the {@code maxNeighborCount} closest ones, dropping the most
     * distant entries. Sorts the list by distance only if something has to be removed.
     */
    private void clearLeastDistant(List<Neighbor> candidates, int maxNeighborCount) {

        if (candidates.size() > maxNeighborCount) {
            candidates.sort(BY_DISTANCE);
            candidates.subList(maxNeighborCount, candidates.size()).clear();
        }
    }
}

View File

@ -0,0 +1,99 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.ListIterator;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.BoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Zone;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.utils.DoubleUtils;
/**
 * Puts the zones of a page into reading order, either by a plain top-to-bottom / left-to-right sort or by a
 * simple two-column heuristic.
 */
@Service
public class ReadingOrderService {

    private static final double THRESHOLD = 1;

    /** Orders by Y first and X second; coordinates are compared with a tolerance of {@link #THRESHOLD}. */
    private static final Comparator<BoundingBox> Y_THEN_X = Comparator.comparing(BoundingBox::getY,
                    (first, second) -> DoubleUtils.compareDouble(first, second, THRESHOLD))
            .thenComparing(BoundingBox::getX, (first, second) -> DoubleUtils.compareDouble(first, second, THRESHOLD));


    /**
     * Resolves the reading order of the given zones.
     *
     * @param zones   the zones of one page; sorted in place when {@code yxOrder} is set
     * @param yxOrder {@code true} to sort purely by Y then X, {@code false} to use the multi-column heuristic
     * @return the ordered zones; the input list itself when it has fewer than two entries or when {@code yxOrder} is set
     */
    public List<Zone> resolve(List<Zone> zones, boolean yxOrder) {

        if (zones.size() <= 1) {
            return zones;
        }
        if (!yxOrder) {
            return resolveMultiColumnReadingOrder(zones);
        }
        zones.sort(Y_THEN_X);
        return zones;
    }


    private List<Zone> resolveMultiColumnReadingOrder(List<Zone> zones) {

        // Simple reading order resolver for multi column page layout as described here : https://pub.towardsai.net/advanced-rag-02-unveiling-pdf-parsing-b84ae866344e
        // TODO implement a more fancy reading order resolver see https://github.com/BobLd/DocumentLayoutAnalysis/blob/master/README.md#reading-order

        // Horizontal extent covered by all zones.
        double leftMost = Double.POSITIVE_INFINITY;
        double rightMost = Double.NEGATIVE_INFINITY;
        for (Zone zone : zones) {
            if (zone.getX() < leftMost) {
                leftMost = zone.getX();
            }
            if (zone.getX() + zone.getWidth() > rightMost) {
                rightMost = zone.getX() + zone.getWidth();
            }
        }
        double midLineXCoordinate = (leftMost + rightMost) / 2;

        // Split into zones entirely left of the mid line, entirely right of it, and everything else.
        List<Zone> leftColumn = new ArrayList<>();
        List<Zone> rightColumn = new ArrayList<>();
        List<Zone> spanning = new ArrayList<>();
        for (Zone zone : zones) {
            boolean startsLeft = zone.getX() < midLineXCoordinate;
            if (startsLeft && zone.getX() + zone.getWidth() < midLineXCoordinate) {
                leftColumn.add(zone);
                continue;
            }
            if (zone.getX() > midLineXCoordinate && zone.getX() + zone.getWidth() > midLineXCoordinate) {
                rightColumn.add(zone);
                continue;
            }
            spanning.add(zone);
        }

        leftColumn.sort(Y_THEN_X);
        rightColumn.sort(Y_THEN_X);
        spanning.sort(Y_THEN_X);

        // Left column first, then right column.
        List<Zone> ordered = new ArrayList<>(leftColumn);
        ordered.addAll(rightColumn);

        // Every remaining zone goes in front of the first already ordered zone with a larger Y;
        // zones for which no such position exists are appended at the end.
        List<Zone> unplaced = new ArrayList<>();
        for (Zone candidate : spanning) {
            int position = indexOfFirstZoneWithLargerY(ordered, candidate);
            if (position >= 0) {
                ordered.add(position, candidate);
            } else {
                unplaced.add(candidate);
            }
        }
        ordered.addAll(unplaced);

        return ordered;
    }


    /** Returns the index of the first zone whose Y is strictly larger than the candidate's Y, or -1 if there is none. */
    private int indexOfFirstZoneWithLargerY(List<Zone> ordered, Zone candidate) {

        for (int index = 0; index < ordered.size(); index++) {
            if (candidate.getY() < ordered.get(index).getY()) {
                return index;
            }
        }
        return -1;
    }

}

View File

@ -0,0 +1,56 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service;
import java.util.List;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.AngleFilter;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Character;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Histogram;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Neighbor;
/**
 * Estimates the spacing between characters and between lines from the neighbour distances of the characters.
 * <p>
 * The distances of all neighbours passing an angle filter are collected in a histogram, the histogram is
 * smoothed, and its peak value is returned.
 */
@Service
public class SpacingService {

    private static final double SPACING_HISTOGRAM_RESOLUTION = 0.5;
    private static final double SPACING_HISTOGRAM_SMOOTHING_LENGTH = 2.5;
    private static final double SPACING_HIST_SMOOTHING_STANDARD_DEVIATION = 0.5;
    private static final double ANGLE_TOLERANCE = Math.PI / 6;


    /**
     * Estimates the character spacing, using neighbours within {@link #ANGLE_TOLERANCE} of angle {@code 0}.
     *
     * @param components characters with their neighbours assigned
     * @return the peak value of the smoothed distance histogram
     */
    public double computeCharacterSpacing(List<Character> components) {

        return computeSpacing(components, 0);
    }


    /**
     * Estimates the line spacing, using neighbours within {@link #ANGLE_TOLERANCE} of angle {@code PI / 2}.
     *
     * @param components characters with their neighbours assigned
     * @return the peak value of the smoothed distance histogram
     */
    public double computeLineSpacing(List<Character> components) {

        return computeSpacing(components, Math.PI / 2);
    }


    private double computeSpacing(List<Character> components, double angle) {

        // The histogram range is bounded by the largest distance over all neighbours, regardless of angle.
        double largestDistance = components.stream()
                .flatMap(component -> component.getNeighbors().stream())
                .mapToDouble(Neighbor::getDistance)
                .reduce(Double.NEGATIVE_INFINITY, Math::max);

        Histogram distances = new Histogram(0, largestDistance, SPACING_HISTOGRAM_RESOLUTION);
        AngleFilter directionFilter = new AngleFilter(angle - ANGLE_TOLERANCE, angle + ANGLE_TOLERANCE);

        components.stream()
                .flatMap(component -> component.getNeighbors().stream())
                .filter(neighbor -> directionFilter.matches(neighbor))
                .forEach(neighbor -> distances.add(neighbor.getDistance()));

        distances.gaussianSmooth(SPACING_HISTOGRAM_SMOOTHING_LENGTH, SPACING_HIST_SMOOTHING_STANDARD_DEVIATION);
        return distances.getPeakValue();
    }

}

View File

@ -0,0 +1,90 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service;
import java.util.ArrayList;
import java.util.List;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.DisjointSets;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Line;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Zone;
/**
 * Groups lines into zones.
 * <p>
 * Every pair of lines with a sufficiently small angular difference is tested against two distance rules
 * (a "neighbouring lines" rule and a "merge" rule); pairs passing either rule are put into the same zone.
 */
@Service
public class ZoneBuilderService {

    private static final double MIN_HORIZONTAL_DISTANCE_MULTIPLIER = -0.5;
    private static final double MAX_VERTICAL_DISTANCE_MULTIPLIER = 1.2;
    private static final double MIN_HORIZONTAL_MERGE_DISTANCE_MULTIPLIER = -3.0;
    private static final double MAX_VERTICAL_MERGE_DISTANCE_MULTIPLIER = 0.5;
    private static final double MIN_LINE_SIZE_SCALE = 0.9;
    private static final double MAX_LINE_SIZE_SCALE = 2.5;
    private static final double ANGLE_TOLERANCE = Math.PI / 6;

    public static final int MAX_ZONES = 300;


    /**
     * Builds zones out of the given lines.
     *
     * @param lines            the lines to group
     * @param characterSpacing spacing between characters, used to scale the horizontal thresholds
     * @param lineSpacing      spacing between lines, used to scale the vertical thresholds
     * @return the zones; a single zone holding all lines when more than {@link #MAX_ZONES} zones were found
     */
    public List<Zone> buildZones(List<Line> lines, double characterSpacing, double lineSpacing) {

        double minHorizontalDistance = characterSpacing * MIN_HORIZONTAL_DISTANCE_MULTIPLIER;
        double maxVerticalDistance = lineSpacing * MAX_VERTICAL_DISTANCE_MULTIPLIER;
        double minHorizontalMergeDistance = characterSpacing * MIN_HORIZONTAL_MERGE_DISTANCE_MULTIPLIER;
        double maxVerticalMergeDistance = lineSpacing * MAX_VERTICAL_MERGE_DISTANCE_MULTIPLIER;

        DisjointSets<Line> groups = new DisjointSets<>(lines);
        double meanHeight = calculateMeanHeight(lines);

        for (Line first : lines) {
            for (Line second : lines) {

                // Distances are expressed relative to the (clamped) size of the smaller line.
                double relativeHeight = Math.min(first.getHeight(), second.getHeight()) / meanHeight;
                double scale = Math.max(MIN_LINE_SIZE_SCALE, Math.min(relativeHeight, MAX_LINE_SIZE_SCALE));

                if (groups.areTogether(first, second)) {
                    continue;
                }
                if (first.angularDifference(second) <= ANGLE_TOLERANCE) {
                    double horizontalDistance = first.horizontalDistance(second) / scale;
                    double verticalDistance = first.verticalDistance(second) / scale;

                    boolean neighbouring = minHorizontalDistance <= horizontalDistance && verticalDistance <= maxVerticalDistance;
                    boolean mergeable = minHorizontalMergeDistance <= horizontalDistance && verticalDistance <= maxVerticalMergeDistance;
                    if (neighbouring || mergeable) {
                        groups.union(first, second);
                    }
                }
            }
        }

        List<Zone> zones = new ArrayList<>();
        groups.forEach(group -> zones.add(new Zone(new ArrayList<>(group))));

        if (zones.size() <= MAX_ZONES) {
            return zones;
        }

        // Too many zones: fall back to one zone containing every line.
        List<Line> allLines = new ArrayList<>();
        zones.forEach(zone -> allLines.addAll(zone.getLines()));
        return List.of(new Zone(allLines));
    }


    /**
     * Returns the mean line height, weighted by line length.
     * NOTE(review): the result is NaN when the total length is zero (for example an empty list) — confirm callers tolerate that.
     */
    private double calculateMeanHeight(List<Line> lines) {

        double weightedHeightSum = 0.0;
        double totalLength = 0.0;
        for (Line line : lines) {
            double length = line.getLength();
            weightedHeightSum += line.getHeight() * length;
            totalLength += length;
        }
        return weightedHeightSum / totalLength;
    }

}

View File

@ -0,0 +1,18 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.utils;
/**
 * Helper methods for comparing floating point numbers with a tolerance.
 */
public final class DoubleUtils {

    private DoubleUtils() {
        // static helpers only
    }


    /**
     * Compares two doubles after snapping each of them to the nearest multiple of {@code precision}.
     * <p>
     * Values that round to the same multiple are considered equal, which makes the result a consistent
     * ordering that can safely be used inside a {@link java.util.Comparator}.
     *
     * @param d1        the first value
     * @param d2        the second value
     * @param precision the grid size; {@code 0} is treated as {@code 1}, and only the magnitude is used so that
     *                  a negative value cannot invert the ordering
     * @return a negative number, zero, or a positive number as {@code d1} is less than, equal to, or greater
     * than {@code d2} on the grid; if either value is NaN the result of {@link Double#compare(double, double)}
     */
    public static int compareDouble(double d1, double d2, double precision) {

        if (Double.isNaN(d1) || Double.isNaN(d2)) {
            return Double.compare(d1, d2);
        }
        double step = precision == 0 ? 1 : Math.abs(precision);
        long bucket1 = Math.round(d1 / step);
        long bucket2 = Math.round(d2 / step);
        return Long.compare(bucket1, bucket2);
    }

}

View File

@ -0,0 +1,270 @@
package com.knecon.fforesight.service.layoutparser.processor.services.visualization;
import java.awt.Color;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.pdmodel.font.PDType1Font;
import org.apache.pdfbox.pdmodel.font.Standard14Fonts;
import org.apache.pdfbox.util.Matrix;
import org.springframework.core.io.ClassPathResource;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.utils.PdfVisualisationUtility;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Getter;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
import lombok.experimental.UtilityClass;
/**
 * Static helpers that draw rectangles, rulings, labels and document graph nodes onto PDF pages.
 * Page numbers passed to or used by these helpers are 1-based.
 */
@UtilityClass
public class PdfDraw {

    /**
     * Loads a PDF from the classpath, strokes the given rectangles on each page and saves the result.
     *
     * @param filename          classpath location of the source PDF
     * @param rectanglesPerPage rectangles to draw, one list per page in page order
     * @param tmpFileName       path of the file the annotated PDF is written to
     * @throws IOException if the PDF cannot be read or the result cannot be written
     */
    public static void drawRectanglesPerPage(String filename, List<List<Rectangle2D>> rectanglesPerPage, String tmpFileName) throws IOException {

        ClassPathResource pdfResource = new ClassPathResource(filename);
        try (PDDocument pdDocument = Loader.loadPDF(pdfResource.getFile()); var out = new FileOutputStream(tmpFileName)) {
            for (int pageNumber = 1; pageNumber <= pdDocument.getNumberOfPages(); pageNumber++) {
                PdfVisualisationUtility.drawRectangle2DList(pdDocument,
                        pageNumber,
                        rectanglesPerPage.get(pageNumber - 1),
                        PdfVisualisationUtility.Options.builder().stroke(true).build());
            }
            pdDocument.save(out);
        }
    }


    /**
     * Loads a PDF from the classpath, strokes the given rectangles line by line and writes the index of each
     * line to the left of the line's first rectangle.
     *
     * @param filename          classpath location of the source PDF
     * @param rectanglesPerPage per page, the list of lines, each line being a list of rectangles
     * @param tmpFileName       path of the file the annotated PDF is written to
     * @throws IOException if the PDF cannot be read or the result cannot be written
     */
    public static void drawRectanglesPerPageNumberedByLine(String filename, List<List<List<Rectangle2D>>> rectanglesPerPage, String tmpFileName) throws IOException {

        ClassPathResource pdfResource = new ClassPathResource(filename);
        try (PDDocument pdDocument = Loader.loadPDF(pdfResource.getFile()); var out = new FileOutputStream(tmpFileName)) {
            for (int pageNumber = 1; pageNumber <= pdDocument.getNumberOfPages(); pageNumber++) {

                var rectanglesOnPage = rectanglesPerPage.get(pageNumber - 1);
                for (int lineNumber = 0; lineNumber < rectanglesOnPage.size(); lineNumber++) {
                    var rectanglesInLine = rectanglesOnPage.get(lineNumber);
                    PdfVisualisationUtility.drawRectangle2DList(pdDocument, pageNumber, rectanglesInLine, PdfVisualisationUtility.Options.builder().stroke(true).build());

                    if (rectanglesInLine.isEmpty()) {
                        // Nothing to anchor the label to.
                        continue;
                    }
                    Rectangle2D firstRectangle = rectanglesInLine.get(0);
                    double y = Math.min(firstRectangle.getMinY(), firstRectangle.getMaxY());
                    // Shift the label further left the more digits it has.
                    double x = firstRectangle.getX() - (5 + (5 * countNumberOfDigits(lineNumber)));
                    PdfVisualisationUtility.drawText(String.format("%d", lineNumber),
                            pdDocument,
                            new Point2D.Double(x, y + 2),
                            pageNumber,
                            PdfVisualisationUtility.Options.builder().stroke(true).build());
                }
            }
            pdDocument.save(out);
        }
    }


    /** Returns the number of decimal digits of {@code num}, ignoring the sign; {@code 0} has one digit. */
    private static int countNumberOfDigits(int num) {

        if (num == 0) {
            return 1;
        }
        int count = 0;
        for (int remaining = num; remaining != 0; remaining /= 10) {
            count++;
        }
        return count;
    }


    /**
     * Draws every entry of the document tree, in tree order, onto the given PDF.
     *
     * @param document      the PDF to draw on
     * @param documentGraph the document graph whose nodes are drawn
     */
    public static void drawDocumentGraph(PDDocument document, Document documentGraph) {

        documentGraph.getDocumentTree().allEntriesInOrder().forEach(entry -> drawNode(document, entry));
    }


    /**
     * Draws the bounding box and a text label of a single tree entry, coloured by node type.
     *
     * @param document the PDF to draw on
     * @param entry    the tree entry to visualise
     */
    public static void drawNode(PDDocument document, DocumentTree.Entry entry) {

        Options options = buildStandardOptionsForNodes(entry);
        drawBBoxAndLabelAndNumberOnPage(document, entry, options);
    }


    /**
     * Draws the positions of all atomic text blocks of the given text block.
     *
     * @param document  the PDF to draw on
     * @param textBlock the text block to visualise
     * @param options   stroke and fill settings
     */
    public static void drawTextBlock(PDDocument document, TextBlock textBlock, Options options) {

        textBlock.getAtomicTextBlocks().forEach(atb -> drawAtomicTextBlock(document, atb, options));
    }


    /**
     * Draws the positions of one atomic text block on the page the block belongs to.
     *
     * @param document        the PDF to draw on
     * @param atomicTextBlock the atomic text block to visualise
     * @param options         stroke and fill settings
     */
    public static void drawAtomicTextBlock(PDDocument document, AtomicTextBlock atomicTextBlock, Options options) {

        drawRectangle2DList(document, atomicTextBlock.getPage().getNumber(), atomicTextBlock.getPositions().stream().toList(), options);
    }


    /**
     * Appends a Helvetica 10pt text to a page. The text is filled with the stroke colour of the options.
     *
     * @param rotate when set, the text is rotated by 15 degrees around {@code location}
     */
    @SneakyThrows
    private static void drawText(String string, PDDocument document, Point2D location, Integer pageNumber, Options options, boolean rotate) {

        var pdPage = document.getPage(pageNumber - 1);
        // try-with-resources: the content stream must be closed even if a drawing call fails.
        try (var contentStream = new PDPageContentStream(document, pdPage, PDPageContentStream.AppendMode.APPEND, true)) {
            contentStream.setNonStrokingColor(options.getStrokeColor());
            contentStream.setLineWidth(options.getStrokeWidth());

            contentStream.beginText();
            if (rotate) {
                contentStream.setTextMatrix(Matrix.getRotateInstance(Math.toRadians(15), (float) location.getX(), (float) location.getY()));
            } else {
                contentStream.newLineAtOffset((float) location.getX(), (float) location.getY());
            }
            contentStream.setFont(new PDType1Font(Standard14Fonts.FontName.HELVETICA), 10);
            contentStream.showText(string);
            contentStream.endText();
        }
    }


    /**
     * Draws the given rectangles onto a page.
     *
     * @param document       the PDF to draw on
     * @param pageNumber     1-based number of the page
     * @param rectCollection the rectangles to draw
     * @param options        stroke and fill settings
     */
    @SneakyThrows
    public static void drawRectangle2DList(PDDocument document, int pageNumber, List<Rectangle2D> rectCollection, Options options) {

        var pdPage = document.getPage(pageNumber - 1);
        drawRectangle2DList(document, rectCollection, options, pdPage);
    }


    /** Appends the rectangles to the page; each one is stroked and/or filled as requested by the options. */
    private static void drawRectangle2DList(PDDocument document, List<Rectangle2D> rectCollection, Options options, PDPage pdPage) throws IOException {

        // try-with-resources: the content stream must be closed even if a drawing call fails.
        try (var contentStream = new PDPageContentStream(document, pdPage, PDPageContentStream.AppendMode.APPEND, true)) {
            contentStream.setStrokingColor(options.getStrokeColor());
            contentStream.setNonStrokingColor(options.getFillColor());
            contentStream.setLineWidth(options.getStrokeWidth());

            for (var r : rectCollection) {
                contentStream.addRect((float) r.getMinX(), (float) r.getMinY(), (float) r.getWidth(), (float) r.getHeight());

                if (options.isStroke() && options.isFill()) {
                    contentStream.fillAndStroke();
                } else if (options.isStroke()) {
                    contentStream.stroke();
                } else if (options.isFill()) {
                    contentStream.fill();
                }
            }
        }
    }


    /**
     * Loads a PDF from the classpath, strokes two sets of rectangles on each page and saves the result.
     *
     * @param filename          classpath location of the source PDF
     * @param list              second set of rectangles, one list per page in page order
     * @param rectanglesPerPage first set of rectangles, one list per page in page order
     * @param tmpFileName       path of the file the annotated PDF is written to
     */
    @SneakyThrows
    public static void drawRectanglesAndLinesPerPage(String filename, List<List<Rectangle2D>> list, List<List<Rectangle2D>> rectanglesPerPage, String tmpFileName) {

        ClassPathResource pdfResource = new ClassPathResource(filename);
        try (PDDocument pdDocument = Loader.loadPDF(pdfResource.getFile()); var out = new FileOutputStream(tmpFileName)) {
            for (int pageNumber = 1; pageNumber <= pdDocument.getNumberOfPages(); pageNumber++) {
                PdfVisualisationUtility.drawRectangle2DList(pdDocument,
                        pageNumber,
                        rectanglesPerPage.get(pageNumber - 1),
                        PdfVisualisationUtility.Options.builder().stroke(true).build());
                PdfVisualisationUtility.drawRectangle2DList(pdDocument, pageNumber, list.get(pageNumber - 1), PdfVisualisationUtility.Options.builder().stroke(true).build());
            }
            pdDocument.save(out);
        }
    }


    /**
     * Loads a PDF from the classpath, draws the given rulings in red on each page and saves the result.
     *
     * @param filename     classpath location of the source PDF
     * @param linesPerPage rulings to draw, one list per page in page order
     * @param tmpFileName  path of the file the annotated PDF is written to
     */
    @SneakyThrows
    public static void drawLinesPerPage(String filename, List<List<Ruling>> linesPerPage, String tmpFileName) {

        ClassPathResource pdfResource = new ClassPathResource(filename);
        try (PDDocument pdDocument = Loader.loadPDF(pdfResource.getFile()); var out = new FileOutputStream(tmpFileName)) {
            for (int pageNumber = 1; pageNumber <= pdDocument.getNumberOfPages(); pageNumber++) {
                PdfVisualisationUtility.drawLine2DList(pdDocument,
                        pageNumber,
                        linesPerPage.get(pageNumber - 1),
                        PdfVisualisationUtility.Options.builder().strokeColor(Color.RED).stroke(true).build());
            }
            pdDocument.save(out);
        }
    }


    /** Stroke and fill settings used by the drawing helpers of this class. */
    @Builder
    @AllArgsConstructor
    @Getter
    @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
    public static class Options {

        boolean stroke;
        @Builder.Default
        Color strokeColor = Color.BLACK;
        @Builder.Default
        float strokeWidth = 1f;

        boolean fill;
        @Builder.Default
        Color fillColor = Color.BLACK;

    }


    /** Builds stroke-only options whose colour depends on the node type of the entry. */
    private static Options buildStandardOptionsForNodes(DocumentTree.Entry entry) {

        return Options.builder().stroke(true).strokeColor(switch (entry.getType()) {
            case DOCUMENT -> Color.LIGHT_GRAY;
            case HEADER, FOOTER -> Color.GREEN;
            case PARAGRAPH -> Color.BLUE;
            case HEADLINE -> Color.RED;
            case SECTION -> Color.BLACK;
            case TABLE -> Color.ORANGE;
            case TABLE_CELL -> Color.GRAY;
            case IMAGE -> Color.MAGENTA;
        }).build();
    }


    /**
     * Draws, for every page the node appears on, its bounding box and a label just above the box's maximum Y.
     * Section boxes are padded by 10 so they do not coincide with the boxes of their children; labels of table
     * cells are drawn rotated.
     */
    private static void drawBBoxAndLabelAndNumberOnPage(PDDocument document, DocumentTree.Entry entry, Options options) {

        Map<Page, Rectangle2D> rectanglesPerPage = entry.getNode().getBBox();
        for (Map.Entry<Page, Rectangle2D> pageAndBox : rectanglesPerPage.entrySet()) {
            Page page = pageAndBox.getKey();
            Rectangle2D rectangle2D = pageAndBox.getValue();
            if (entry.getType() == NodeType.SECTION) {
                rectangle2D = RectangleTransformations.pad(rectangle2D, 10, 10);
            }
            drawRectangle2DList(document, page.getNumber(), List.of(rectangle2D), options);
            drawText(buildString(entry),
                    document,
                    new Point2D.Double(rectangle2D.getMinX(), rectangle2D.getMaxY() + 2),
                    page.getNumber(),
                    options,
                    entry.getType() == NodeType.TABLE_CELL);
        }
    }


    /** Builds the label "numberOnPage: treeId: type" for an entry. */
    private static String buildString(DocumentTree.Entry entry) {

        return entry.getNode().getNumberOnPage() + ": " + entry.getTreeId() + ": " + entry.getType();
    }

}

View File

@ -25,7 +25,9 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
@SneakyThrows
public void testLayoutParserEndToEnd() {
prepareStorage("files/bdr/Wie weiter bei Kristeneinrichtungen.pdf");
String s = "";
String s1 = "";
prepareStorage("files/Minimal Examples/WrongOrderPage1.pdf");
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER);
LayoutParsingFinishedEvent finishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest);
Arrays.stream(finishedEvent.message().split("\n")).forEach(log::info);

View File

@ -25,7 +25,7 @@ public class ViewerDocumentTest extends BuildDocumentTest {
@SneakyThrows
public void testViewerDocument() {
String fileName = "files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
String fileName = "files/Plenarprotokoll 1 (keine Druchsache!) (1).pdf";
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
var documentFile = new ClassPathResource(fileName).getFile();
@ -35,9 +35,10 @@ public class ViewerDocumentTest extends BuildDocumentTest {
Document document = buildGraph(fileName, LayoutParsingType.REDACT_MANAGER);
long start = System.currentTimeMillis();
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true);
System.out.printf("Total time: %.2fs%n", ((float) (System.currentTimeMillis() - start)) / 1000);
System.out.printf("Total time: %.2fs%n", ((float) (System.currentTimeMillis() - start)) / 1000);
}
@Test
@Disabled
@SneakyThrows
@ -51,7 +52,11 @@ public class ViewerDocumentTest extends BuildDocumentTest {
var tableResponse = mapper.readValue(new ClassPathResource(tableFileName).getInputStream(), TableServiceResponse.class);
var documentFile = new ClassPathResource(fileName).getFile();
var classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE, documentFile, new ImageServiceResponse(), tableResponse, Path.of(fileName).getFileName().toFile().toString());
var classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE,
documentFile,
new ImageServiceResponse(),
tableResponse,
Path.of(fileName).getFileName().toFile().toString());
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null);
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
Document document = DocumentGraphFactory.buildDocumentGraph(classificationDocument);