More refactoring

This commit is contained in:
Dominique Eifländer 2024-02-15 16:54:07 +01:00
parent 4de6c12aec
commit 4871e55f2d
12 changed files with 727 additions and 81 deletions

View File

@ -90,7 +90,7 @@ public class LayoutParsingPipeline {
RedactManagerBlockificationService redactManagerBlockificationService;
LayoutGridService layoutGridService;
ObservationRegistry observationRegistry;
// DocstrumSegmenter docstrumSegmenter;
DocstrumSegmenter docstrumSegmenter;
HierarchicalReadingOrderResolver hierarchicalReadingOrderResolver;
@ -251,7 +251,7 @@ public class LayoutParsingPipeline {
// Docstrum
AtomicInteger num = new AtomicInteger(pageNumber);
var zones = new DocstrumSegmenter().segmentPage(stripper.getTextPositionSequences());
var zones = docstrumSegmenter.segmentPage(stripper.getTextPositionSequences());
zones = hierarchicalReadingOrderResolver.resolve(zones);
List<AbstractPageBlock> pageBlocks = new ArrayList<>();

View File

@ -22,9 +22,16 @@ import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.mo
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.utils.BoundingBoxBuilder;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.utils.ZoneUtils;
import lombok.RequiredArgsConstructor;
@Service
@RequiredArgsConstructor
public class DocstrumSegmenter {
private final NearestNeighbourService nearestNeighbourService;
private final SpacingService spacingService;
private final LineBuilderService lineBuilderService;
public static final int MAX_ZONES_PER_PAGE = 300;
private static final double DISTANCE_STEP = 16.0;
@ -167,18 +174,14 @@ public class DocstrumSegmenter {
var components = positions.stream().map(chunk -> new Character(chunk)).collect(Collectors.toList());
Character[] componentsArray = new Character[positions.size()];
components.toArray(componentsArray);
nearestNeighbourService.findNearestNeighbors(components);
Arrays.sort(componentsArray, Character.CharacterXComparator.getInstance());
findNeighbors(componentsArray);
double orientation = 0;
double orientation = computeInitialOrientation(components);
double characterSpacing = spacingService.computeCharacterSpacing(components);
double lineSpacing = spacingService.computeLineSpacing(components);
double characterSpacing = computeCharacterSpacing(components, orientation);
double lineSpacing = computeLineSpacing(components, orientation);
List<ComponentLine> lines = determineLines(components, characterSpacing * COMP_DIST_CHAR, lineSpacing * MAX_VERTICAL_COMP_DIST);
List<ComponentLine> lines = lineBuilderService.buildLines(components, characterSpacing, lineSpacing);
List<List<ComponentLine>> zones = determineZones(lines,
orientation,
@ -322,34 +325,52 @@ public class DocstrumSegmenter {
}
/**
* Groups components into text lines.
*
* @param components component list
* @param maxHorizontalDistance - maximum horizontal distance between components
* @param maxVerticalDistance - maximum vertical distance between components
* @return lines of components
*/
private List<ComponentLine> determineLines(List<Character> components, double maxHorizontalDistance, double maxVerticalDistance) {
private List<ComponentLine> determineLines(List<Character> characters, double characterSpacing, double lineSpacing) {
DisjointSets<Character> sets = new DisjointSets<Character>(components);
double maxHorizontalDistance = characterSpacing * COMP_DIST_CHAR;
double maxVerticalDistance = lineSpacing * MAX_VERTICAL_COMP_DIST;
// DisjointSets<Character> sets = new DisjointSets<Character>(characters);
// AngleFilter filter = AngleFilter.newInstance(-ANGLE_TOLERANCE, ANGLE_TOLERANCE);
// for (Character component : characters) {
// for (Neighbor neighbor : component.getNeighbors()) {
// double x = neighbor.getHorizontalDistance() / maxHorizontalDistance;
// double y = neighbor.getVerticalDistance() / maxVerticalDistance;
// if (filter.matches(neighbor) && x * x + y * y <= 1) {
// sets.union(component, neighbor.getCharacter());
// }
// }
// }
// List<ComponentLine> lines = new ArrayList<ComponentLine>();
// for (Set<Character> group : sets) {
// List<Character> lineComponents = new ArrayList<Character>(group);
// lineComponents.sort(Comparator.comparingDouble(Character::getX));
// lines.add(new ComponentLine(lineComponents));
// }
// return lines;
DisjointSets<Character> sets = new DisjointSets<>(characters);
AngleFilter filter = AngleFilter.newInstance(-ANGLE_TOLERANCE, ANGLE_TOLERANCE);
for (Character component : components) {
for (Neighbor neighbor : component.getNeighbors()) {
characters.forEach(character -> {
character.getNeighbors().forEach(neighbor -> {
double x = neighbor.getHorizontalDistance() / maxHorizontalDistance;
double y = neighbor.getVerticalDistance() / maxVerticalDistance;
if (filter.matches(neighbor) && x * x + y * y <= 1) {
sets.union(component, neighbor.getCharacter());
if (filter.matches(neighbor) && Math.pow(x, 2) + Math.pow(y, 2) <= 1) {
sets.union(character, neighbor.getCharacter());
}
}
}
List<ComponentLine> lines = new ArrayList<ComponentLine>();
for (Set<Character> group : sets) {
List<Character> lineComponents = new ArrayList<Character>(group);
Collections.sort(lineComponents, Character.CharacterXComparator.getInstance());
});
});
List<ComponentLine> lines = new ArrayList<>();
sets.forEach(group -> {
List<Character> lineComponents = new ArrayList<>(group);
lineComponents.sort(Comparator.comparingDouble(Character::getX));
lines.add(new ComponentLine(lineComponents));
}
});
return lines;
}
@ -508,7 +529,7 @@ public class DocstrumSegmenter {
this.y1 = a + b * this.x1;
} else if (!components.isEmpty()) {
Character component = components.get(0);
double dx = component.getChunk().getWidthDirAdj() / 3;
double dx = component.getTextPosition().getWidthDirAdj() / 3;
double dy = dx * Math.tan(0);
this.x0 = component.getX() - dx;
this.x1 = component.getX() + dx;
@ -590,14 +611,15 @@ public class DocstrumSegmenter {
Character previousComponent = null;
for (Character component : components) {
if (previousComponent != null) {
double dist = component.getChunk().getXDirAdj() - previousComponent.getChunk().getXDirAdj() - previousComponent.getChunk().getWidthDirAdj();
double dist = component.getTextPosition().getXDirAdj() - previousComponent.getTextPosition().getXDirAdj() - previousComponent.getTextPosition()
.getWidthDirAdj();
if (dist > wordSpacing) {
BoundingBoxBuilder.setBounds(word);
line.addWord(word);
word = new Word();
}
}
word.addChunk(component.getChunk());
word.addChunk(component.getTextPosition());
previousComponent = component;
}
BoundingBoxBuilder.setBounds(word);

View File

@ -0,0 +1,48 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.DisjointSets;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.docstrum.Character;
/**
 * Groups characters into text lines.
 * <p>
 * Two characters end up in the same line when one lists the other as a neighbour, the
 * neighbour passes the angle filter, and the neighbour lies inside an ellipse whose
 * half-axes are derived from the supplied character and line spacing.
 */
@Service
public class LineBuilderService {

    private static final double CHARACTER_SPACING_DISTANCE_MULTIPLIER = 3.5;
    private static final double MAX_VERTICAL_CHARACTER_DISTANCE = 0.67;
    private static final double ANGLE_TOLERANCE = Math.PI / 6;


    /**
     * Builds lines out of the given characters using their already assigned neighbours.
     *
     * @param characters       characters of one page
     * @param characterSpacing spacing value scaled by {@code CHARACTER_SPACING_DISTANCE_MULTIPLIER} to get the horizontal half-axis
     * @param lineSpacing      spacing value scaled by {@code MAX_VERTICAL_CHARACTER_DISTANCE} to get the vertical half-axis
     * @return one line per connected group, each with its characters ordered by x
     */
    public List<DocstrumSegmenter.ComponentLine> buildLines(List<Character> characters, double characterSpacing, double lineSpacing) {

        final double horizontalLimit = characterSpacing * CHARACTER_SPACING_DISTANCE_MULTIPLIER;
        final double verticalLimit = lineSpacing * MAX_VERTICAL_CHARACTER_DISTANCE;

        DisjointSets<Character> lineSets = new DisjointSets<>(characters);
        AngleFilter angleFilter = AngleFilter.newInstance(-ANGLE_TOLERANCE, ANGLE_TOLERANCE);

        for (Character current : characters) {
            for (var candidate : current.getNeighbors()) {
                // Normalised offsets: the candidate is close enough when it lies within the unit circle.
                double normalisedX = candidate.getHorizontalDistance() / horizontalLimit;
                double normalisedY = candidate.getVerticalDistance() / verticalLimit;
                boolean insideEllipse = Math.pow(normalisedX, 2) + Math.pow(normalisedY, 2) <= 1;
                if (angleFilter.matches(candidate) && insideEllipse) {
                    lineSets.union(current, candidate.getCharacter());
                }
            }
        }

        List<DocstrumSegmenter.ComponentLine> result = new ArrayList<>();
        for (var group : lineSets) {
            List<Character> ordered = new ArrayList<>(group);
            ordered.sort(Comparator.comparingDouble(Character::getX));
            result.add(new DocstrumSegmenter.ComponentLine(ordered));
        }
        return result;
    }

}

View File

@ -0,0 +1,78 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.docstrum.Character;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.docstrum.Neighbor;
/**
 * Assigns to every character its nearest neighbours on the page.
 * <p>
 * The characters are sorted by x and, for each character, a window is widened in steps of
 * {@code STEP} to both sides until the window is at least as wide as the distance to the
 * farthest neighbour kept so far.
 */
@Service
public class NearestNeighbourService {

    private static final int NUMBER_OF_NEIGHBOURS = 8;
    private static final double STEP = 16.0;
    private static final Comparator<Neighbor> BY_DISTANCE = Comparator.comparingDouble(Neighbor::getDistance);


    /**
     * Finds up to {@code NUMBER_OF_NEIGHBOURS} nearest neighbours for each character and stores
     * them via {@code setNeighbors}.
     * <p>
     * Side effect: the passed list is sorted in place by x coordinate.
     *
     * @param characters characters of one page; an empty list is a no-op
     */
    public void findNearestNeighbors(List<Character> characters) {

        if (characters.isEmpty()) {
            return;
        }
        if (characters.size() == 1) {
            // A lone character has no neighbours. Without this guard the neighbour limit would be 0,
            // no candidate would ever be found and the search loop below would never terminate.
            characters.get(0).setNeighbors(new ArrayList<>());
            return;
        }

        characters.sort(Comparator.comparingDouble(Character::getX));

        // With few characters on the page every other character is a neighbour.
        int maxNeighborCount = Math.min(NUMBER_OF_NEIGHBOURS, characters.size() - 1);

        for (int i = 0; i < characters.size(); i++) {
            Character current = characters.get(i);
            List<Neighbor> candidates = new ArrayList<>();

            int start = i;
            int end = i + 1;

            // Distance to the farthest kept neighbour; unknown until enough candidates were collected.
            double distance = Double.POSITIVE_INFINITY;

            for (double searchDistance = 0; searchDistance < distance; ) {
                searchDistance += STEP;
                boolean newCandidatesFound = false;

                // Widen the window to the left.
                while (start > 0 && current.getX() - characters.get(start - 1).getX() < searchDistance) {
                    start--;
                    candidates.add(new Neighbor(characters.get(start), current));
                    clearLeastDistant(candidates, maxNeighborCount);
                    newCandidatesFound = true;
                }

                // Widen the window to the right.
                while (end < characters.size() && characters.get(end).getX() - current.getX() < searchDistance) {
                    candidates.add(new Neighbor(characters.get(end), current));
                    clearLeastDistant(candidates, maxNeighborCount);
                    end++;
                    newCandidatesFound = true;
                }

                if (newCandidatesFound && candidates.size() >= maxNeighborCount) {
                    // The list is only sorted when it was truncated; sort here so the element
                    // read below really is the farthest of the kept neighbours.
                    candidates.sort(BY_DISTANCE);
                    distance = candidates.get(maxNeighborCount - 1).getDistance();
                }
            }
            clearLeastDistant(candidates, maxNeighborCount);
            current.setNeighbors(new ArrayList<>(candidates));
        }
    }


    /**
     * Keeps only the {@code maxNeighborCount} closest candidates, dropping the farthest ones.
     * Does nothing when the list is not over the limit.
     */
    private void clearLeastDistant(List<Neighbor> candidates, int maxNeighborCount) {

        if (candidates.size() > maxNeighborCount) {
            candidates.sort(BY_DISTANCE);
            candidates.subList(maxNeighborCount, candidates.size()).clear();
        }
    }

}

View File

@ -0,0 +1,55 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum;
import java.util.List;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Histogram;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.docstrum.Character;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.docstrum.Neighbor;
/**
 * Estimates character and line spacing of a page from the distances between
 * characters and their neighbours.
 */
@Service
public class SpacingService {

    private static final double SPACING_HISTOGRAM_RESOLUTION = 0.5;
    private static final double SPACING_HISTOGRAM_SMOOTHING_LENGTH = 2.5;
    private static final double SPACING_HIST_SMOOTHING_STANDARD_DEVIATION = 0.5;
    private static final double ANGLE_TOLERANCE = Math.PI / 6;


    /**
     * Spacing along angle 0.
     *
     * @param components characters with neighbours already assigned
     * @return peak of the smoothed neighbour-distance histogram around angle 0
     */
    public double computeCharacterSpacing(List<Character> components) {

        return computeSpacing(components, 0);
    }


    /**
     * Spacing along angle PI / 2.
     *
     * @param components characters with neighbours already assigned
     * @return peak of the smoothed neighbour-distance histogram around angle PI / 2
     */
    public double computeLineSpacing(List<Character> components) {

        return computeSpacing(components, Math.PI / 2);
    }


    /**
     * Collects the distances of all neighbours whose angle lies within {@code ANGLE_TOLERANCE}
     * of {@code angle} in a histogram, smooths it and returns its peak value.
     */
    private double computeSpacing(List<Character> components, double angle) {

        // The histogram range is bounded by the largest neighbour distance of any angle.
        double largestDistance = components.stream()
                .flatMap(component -> component.getNeighbors().stream())
                .mapToDouble(Neighbor::getDistance)
                .max()
                .orElse(Double.NEGATIVE_INFINITY);

        Histogram distances = new Histogram(0, largestDistance, SPACING_HISTOGRAM_RESOLUTION);
        AngleFilter angleFilter = AngleFilter.newInstance(angle - ANGLE_TOLERANCE, angle + ANGLE_TOLERANCE);

        components.stream()
                .flatMap(component -> component.getNeighbors().stream())
                .filter(neighbor -> angleFilter.matches(neighbor))
                .forEach(neighbor -> distances.add(neighbor.getDistance()));

        distances.gaussianSmooth(SPACING_HISTOGRAM_SMOOTHING_LENGTH, SPACING_HIST_SMOOTHING_STANDARD_DEVIATION);
        return distances.getPeakValue();
    }

}

View File

@ -0,0 +1,84 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum;
import java.util.ArrayList;
import java.util.List;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.DisjointSets;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.docstrum.CharacterLine;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.docstrum.CharacterZone;
/**
 * Groups lines into zones by uniting every pair of lines that is similarly oriented and
 * close enough, with the distance limits derived from character and line spacing.
 */
@Service
public class ZoneBuilderService {

    private static final double MIN_HORIZONTAL_DISTANCE_MULTIPLIER = -0.5;
    private static final double MAX_VERTICAL_DISTANCE_MULTIPLIER = 1.2;

    private static final double MIN_HORIZONTAL_MERGE_DISTANCE_MULTIPLIER = -3.0;
    private static final double MAX_VERTICAL_MERGE_DISTANCE_MULTIPLIER = 0.5;

    private static final double MIN_LINE_SIZE_SCALE = 0.9;
    private static final double MAX_LINE_SIZE_SCALE = 2.5;

    private static final double ANGLE_TOLERANCE = Math.PI / 6;


    /**
     * Builds zones from the given lines.
     *
     * @param lines            lines of one page
     * @param characterSpacing spacing value the horizontal limits are derived from
     * @param lineSpacing      spacing value the vertical limits are derived from
     * @return one zone per connected group of lines
     */
    public List<CharacterZone> buildZones(List<CharacterLine> lines, double characterSpacing, double lineSpacing) {

        final double stackedMinHorizontal = characterSpacing * MIN_HORIZONTAL_DISTANCE_MULTIPLIER;
        final double stackedMaxVertical = lineSpacing * MAX_VERTICAL_DISTANCE_MULTIPLIER;

        final double splitMinHorizontal = characterSpacing * MIN_HORIZONTAL_MERGE_DISTANCE_MULTIPLIER;
        final double splitMaxVertical = lineSpacing * MAX_VERTICAL_MERGE_DISTANCE_MULTIPLIER;

        DisjointSets<CharacterLine> zoneSets = new DisjointSets<>(lines);
        final double meanHeight = calculateMeanHeight(lines);

        for (CharacterLine first : lines) {
            for (CharacterLine second : lines) {
                if (!zoneSets.areTogether(first, second) && first.angularDifference(second) <= ANGLE_TOLERANCE) {

                    double scale = clampedScale(first, second, meanHeight);
                    double horizontal = first.horizontalDistance(second) / scale;
                    double vertical = first.verticalDistance(second) / scale;

                    // Line over or above
                    boolean stacked = stackedMinHorizontal <= horizontal && vertical <= stackedMaxVertical;
                    // Split line that needs later merging
                    boolean split = splitMinHorizontal <= horizontal && vertical <= splitMaxVertical;

                    if (stacked || split) {
                        zoneSets.union(first, second);
                    }
                }
            }
        }

        List<CharacterZone> result = new ArrayList<>();
        for (var group : zoneSets) {
            result.add(new CharacterZone(new ArrayList<>(group)));
        }
        return result;
    }


    /**
     * Ratio of the smaller of both line heights to the mean height, limited to
     * the range {@code MIN_LINE_SIZE_SCALE} to {@code MAX_LINE_SIZE_SCALE}.
     */
    private double clampedScale(CharacterLine first, CharacterLine second, double meanHeight) {

        double relativeHeight = Math.min(first.getHeight(), second.getHeight()) / meanHeight;
        return Math.max(MIN_LINE_SIZE_SCALE, Math.min(relativeHeight, MAX_LINE_SIZE_SCALE));
    }


    /**
     * Mean line height weighted by line length. The result is NaN when the lengths sum up to
     * zero, which includes the case of an empty list.
     */
    private double calculateMeanHeight(List<CharacterLine> lines) {

        double weightedHeightSum = 0.0;
        double lengthSum = 0.0;
        for (CharacterLine line : lines) {
            double length = line.getLength();
            weightedHeightSum += line.getHeight() * length;
            lengthSum += length;
        }
        return weightedHeightSum / lengthSum;
    }

}

View File

@ -1,35 +1,33 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.docstrum;
import java.util.Arrays;
import java.util.Comparator;
import java.util.ArrayList;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
import lombok.Data;
import lombok.Getter;
@Data
public class Character {
private final double x;
private final double y;
private final RedTextPosition chunk;
private final RedTextPosition textPosition;
private List<Neighbor> neighbors;
private List<Neighbor> neighbors = new ArrayList<>();
public Character(RedTextPosition chunk) {
this.x = chunk.getXDirAdj() + chunk.getWidthDirAdj() / 2;
this.y = chunk.getYDirAdj() + chunk.getHeightDir() / 2;
this.chunk = chunk;
this.textPosition = chunk;
}
public double getHeight() {
return chunk.getHeightDir();
return textPosition.getHeightDir();
}
@ -68,43 +66,4 @@ public class Character {
}
}
public double overlappingDistance(Character other, double orientation) {
double[] xs = new double[4];
double s = Math.sin(-orientation), c = Math.cos(-orientation);
xs[0] = c * x - s * y;
xs[1] = c * (x + chunk.getWidthDirAdj()) - s * (y + chunk.getHeightDir());
xs[2] = c * other.x - s * other.y;
xs[3] = c * (other.x + other.chunk.getWidthDirAdj()) - s * (other.y + other.chunk.getHeightDir());
boolean overlapping = xs[1] >= xs[2] && xs[3] >= xs[0];
Arrays.sort(xs);
return Math.abs(xs[2] - xs[1]) * (overlapping ? 1 : -1);
}
/**
* Component comparator based on x coordinate of the centroid of component.
* <p>
* The ordering is not consistent with equals.
*/
public static final class CharacterXComparator implements Comparator<Character> {
private CharacterXComparator() {
}
@Override
public int compare(Character o1, Character o2) {
return Double.compare(o1.getX(), o2.getX());
}
@Getter
private static final CharacterXComparator instance = new CharacterXComparator();
}
}

View File

@ -0,0 +1,111 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.docstrum;
import java.util.Arrays;
import java.util.List;
import lombok.Data;
/**
 * A text line represented by a straight segment from ({@code x0}, {@code y0}) to
 * ({@code x1}, {@code y1}) fitted through the centres of its characters.
 * <p>
 * All distance methods assume a page orientation of 0, i.e. horizontal text.
 */
@Data
public class CharacterLine {

    private final double x0;
    private final double y0;

    private final double x1;
    private final double y1;

    /** Mean height of the characters of this line. */
    private final double height;

    private final List<Character> characters;


    /**
     * Fits the line segment through the given characters.
     * <p>
     * With two or more characters a least-squares regression line is used and the segment
     * spans from the x of the first to the x of the last character. With a single character
     * the segment is horizontal and spans a third of the character's width to each side.
     *
     * @param characters characters of the line; the first and last entry define the x range
     * @throws IllegalArgumentException if {@code characters} is empty
     */
    public CharacterLine(List<Character> characters) {

        this.characters = characters;

        if (characters.size() >= 2) {
            // Simple linear regression
            double sx = 0.0, sxx = 0.0, sxy = 0.0, sy = 0.0;
            for (Character component : characters) {
                sx += component.getX();
                sxx += component.getX() * component.getX();
                sxy += component.getX() * component.getY();
                sy += component.getY();
            }
            // NOTE(review): the denominator is 0 when all characters share the same x,
            // which makes the slope NaN or infinite - confirm whether callers can produce such lines.
            double b = (characters.size() * sxy - sx * sy) / (characters.size() * sxx - sx * sx);
            double a = (sy - b * sx) / characters.size();

            this.x0 = characters.get(0).getX();
            this.y0 = a + b * this.x0;
            this.x1 = characters.get(characters.size() - 1).getX();
            this.y1 = a + b * this.x1;
        } else if (!characters.isEmpty()) {
            Character component = characters.get(0);
            double dx = component.getTextPosition().getWidthDirAdj() / 3;

            // Orientation 0: the segment is horizontal, so both end points share the character's y.
            this.x0 = component.getX() - dx;
            this.x1 = component.getX() + dx;
            this.y0 = component.getY();
            this.y1 = component.getY();
        } else {
            throw new IllegalArgumentException("Component list must not be empty");
        }
        height = computeHeight();
    }


    /**
     * @return angle of the segment in radians as given by {@code Math.atan2}
     */
    public double getAngle() {

        return Math.atan2(y1 - y0, x1 - x0);
    }


    /**
     * @return Euclidean length of the segment
     */
    public double getLength() {

        return Math.sqrt((x0 - x1) * (x0 - x1) + (y0 - y1) * (y0 - y1));
    }


    /** Arithmetic mean of the character heights. */
    private double computeHeight() {

        double sum = 0.0;
        for (Character component : characters) {
            sum += component.getHeight();
        }
        return sum / characters.size();
    }


    /**
     * Absolute difference between the angles of both lines; a difference above PI / 2
     * is replaced by its supplement (PI minus the difference).
     *
     * @param j the line to compare with
     * @return the angular difference in radians
     */
    public double angularDifference(CharacterLine j) {

        double diff = Math.abs(getAngle() - j.getAngle());
        if (diff <= Math.PI / 2) {
            return diff;
        } else {
            return Math.PI - diff;
        }
    }


    /**
     * Horizontal relation of both segments based on their x extents.
     *
     * @param other the line to compare with
     * @return the distance between the two middle values of the four sorted end point x
     * coordinates, positive when the comparison of the stored end points finds the extents
     * overlapping and negated otherwise
     */
    public double horizontalDistance(CharacterLine other) {

        // Orientation 0: the projection onto the text direction is the x coordinate itself.
        double[] xs = {x0, x1, other.x0, other.x1};
        boolean overlapping = xs[1] >= xs[2] && xs[3] >= xs[0];
        Arrays.sort(xs);
        return Math.abs(xs[2] - xs[1]) * (overlapping ? 1 : -1);
    }


    /**
     * Vertical distance between the midpoints of both segments.
     * <p>
     * For orientation 0 the point-to-line distance {@code |a * (xn - xm) + ym - yn| / sqrt(a * a + 1)}
     * has slope {@code a = 0}, so the horizontal midpoint offset must not contribute.
     *
     * @param other the line to compare with
     * @return absolute difference of the midpoint y coordinates
     */
    public double verticalDistance(CharacterLine other) {

        double ym = (y0 + y1) / 2;
        double yn = (other.y0 + other.y1) / 2;
        return Math.abs(ym - yn);
    }

}

View File

@ -0,0 +1,17 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.docstrum;
import java.util.ArrayList;
import java.util.List;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * Container for the {@link CharacterLine}s that were grouped into one zone.
 * <p>
 * Accessors and constructors are generated by Lombok: a no-args constructor and a
 * constructor taking the list of lines.
 */
@Data
@NoArgsConstructor
@AllArgsConstructor
public class CharacterZone {
// Lines of this zone; the field initializer provides an empty, mutable list.
private List<CharacterLine> lines = new ArrayList<>();
}

View File

@ -0,0 +1,270 @@
package com.knecon.fforesight.service.layoutparser.processor.services.visualization;
import java.awt.Color;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.pdmodel.font.PDType1Font;
import org.apache.pdfbox.pdmodel.font.Standard14Fonts;
import org.apache.pdfbox.util.Matrix;
import org.springframework.core.io.ClassPathResource;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.utils.PdfVisualisationUtility;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Getter;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
import lombok.experimental.UtilityClass;
/**
 * Helpers that draw rectangles, lines, labels and document-graph nodes onto PDF pages,
 * intended for visual inspection of layout parsing results.
 * <p>
 * The {@code draw...PerPage} methods load the PDF from the classpath, draw onto every page
 * and save the result to the given file. Page numbers are 1-based throughout.
 */
@UtilityClass
public class PdfDraw {

    /**
     * Draws the outlines of the given rectangles onto each page and saves the document.
     *
     * @param filename          classpath location of the PDF
     * @param rectanglesPerPage rectangles per page, indexed by page number - 1
     * @param tmpFileName       path of the file the result is written to
     * @throws IOException if loading or saving fails
     */
    public static void drawRectanglesPerPage(String filename, List<List<Rectangle2D>> rectanglesPerPage, String tmpFileName) throws IOException {

        ClassPathResource pdfResource = new ClassPathResource(filename);
        try (PDDocument pdDocument = Loader.loadPDF(pdfResource.getFile()); var out = new FileOutputStream(tmpFileName)) {
            for (int pageNumber = 1; pageNumber < pdDocument.getNumberOfPages() + 1; pageNumber++) {
                PdfVisualisationUtility.drawRectangle2DList(pdDocument,
                        pageNumber,
                        rectanglesPerPage.get(pageNumber - 1),
                        PdfVisualisationUtility.Options.builder().stroke(true).build());
            }
            pdDocument.save(out);
        }
    }


    /**
     * Draws the rectangles of every line and writes the line's index to the left of its first rectangle.
     *
     * @param filename          classpath location of the PDF
     * @param rectanglesPerPage rectangles per page and line, indexed by page number - 1
     * @param tmpFileName       path of the file the result is written to
     * @throws IOException if loading or saving fails
     */
    public static void drawRectanglesPerPageNumberedByLine(String filename, List<List<List<Rectangle2D>>> rectanglesPerPage, String tmpFileName) throws IOException {

        ClassPathResource pdfResource = new ClassPathResource(filename);
        try (PDDocument pdDocument = Loader.loadPDF(pdfResource.getFile()); var out = new FileOutputStream(tmpFileName)) {
            for (int pageNumber = 1; pageNumber < pdDocument.getNumberOfPages() + 1; pageNumber++) {

                var rectanglesOnPage = rectanglesPerPage.get(pageNumber - 1);
                for (int lineNumber = 0; lineNumber < rectanglesOnPage.size(); lineNumber++) {
                    var rectanglesInLine = rectanglesOnPage.get(lineNumber);
                    PdfVisualisationUtility.drawRectangle2DList(pdDocument, pageNumber, rectanglesInLine, PdfVisualisationUtility.Options.builder().stroke(true).build());

                    if (rectanglesInLine.isEmpty()) {
                        // Nothing to anchor the label to; get(0) below would throw.
                        continue;
                    }
                    Rectangle2D firstRectangle = rectanglesInLine.get(0);
                    double y = Math.min(firstRectangle.getMinY(), firstRectangle.getMaxY());
                    // Shift the label further left the more digits the line number has.
                    PdfVisualisationUtility.drawText(String.format("%d", lineNumber),
                            pdDocument,
                            new Point2D.Double(firstRectangle.getX() - (5 + (5 * countNumberOfDigits(lineNumber))), y + 2),
                            pageNumber,
                            PdfVisualisationUtility.Options.builder().stroke(true).build());
                }
            }
            pdDocument.save(out);
        }
    }


    /** Number of decimal digits of {@code num}, ignoring the sign; 1 for zero. */
    private static int countNumberOfDigits(int num) {

        if (num == 0) {
            return 1;
        }
        int count = 0;
        for (int remaining = num; remaining != 0; remaining /= 10) {
            count++;
        }
        return count;
    }


    /**
     * Draws every entry of the document tree, in the order returned by {@code allEntriesInOrder()}.
     *
     * @param document      the PDF to draw on
     * @param documentGraph the document whose tree is drawn
     */
    public static void drawDocumentGraph(PDDocument document, Document documentGraph) {

        documentGraph.getDocumentTree().allEntriesInOrder().forEach(entry -> drawNode(document, entry));
    }


    /**
     * Draws the bounding box and label of one tree entry using the standard colour for its type.
     *
     * @param document the PDF to draw on
     * @param entry    the tree entry to draw
     */
    public static void drawNode(PDDocument document, DocumentTree.Entry entry) {

        Options options = buildStandardOptionsForNodes(entry);

        drawBBoxAndLabelAndNumberOnPage(document, entry, options);

    }


    /**
     * Draws all atomic text blocks of the given text block.
     *
     * @param document  the PDF to draw on
     * @param textBlock the text block to draw
     * @param options   drawing options
     */
    public static void drawTextBlock(PDDocument document, TextBlock textBlock, Options options) {

        textBlock.getAtomicTextBlocks().forEach(atb -> drawAtomicTextBlock(document, atb, options));
    }


    /**
     * Draws the positions of the atomic text block onto the page the block belongs to.
     *
     * @param document        the PDF to draw on
     * @param atomicTextBlock the block whose positions are drawn
     * @param options         drawing options
     */
    public static void drawAtomicTextBlock(PDDocument document, AtomicTextBlock atomicTextBlock, Options options) {

        drawRectangle2DList(document, atomicTextBlock.getPage().getNumber(), atomicTextBlock.getPositions().stream().toList(), options);

    }


    /**
     * Writes {@code string} at {@code location} in 10pt Helvetica, using the stroke colour of
     * the options as text colour. With {@code rotate} the text is rotated by 15 degrees.
     */
    @SneakyThrows
    private static void drawText(String string, PDDocument document, Point2D location, Integer pageNumber, Options options, boolean rotate) {

        var pdPage = document.getPage(pageNumber - 1);
        // try-with-resources: the stream is closed even if one of the drawing calls throws.
        try (var contentStream = new PDPageContentStream(document, pdPage, PDPageContentStream.AppendMode.APPEND, true)) {
            contentStream.setNonStrokingColor(options.getStrokeColor());
            contentStream.setLineWidth(options.getStrokeWidth());

            contentStream.beginText();
            if (rotate) {
                contentStream.setTextMatrix(Matrix.getRotateInstance(Math.toRadians(15), (float) location.getX(), (float) location.getY()));
            } else {
                contentStream.newLineAtOffset((float) location.getX(), (float) location.getY());
            }
            contentStream.setFont(new PDType1Font(Standard14Fonts.FontName.HELVETICA), 10);
            contentStream.showText(string);
            contentStream.endText();
        }
    }


    /**
     * Draws the given rectangles onto one page.
     *
     * @param document       the PDF to draw on
     * @param pageNumber     1-based page number
     * @param rectCollection rectangles to draw
     * @param options        stroke/fill settings
     */
    @SneakyThrows
    public static void drawRectangle2DList(PDDocument document, int pageNumber, List<Rectangle2D> rectCollection, Options options) {

        var pdPage = document.getPage(pageNumber - 1);
        drawRectangle2DList(document, rectCollection, options, pdPage);
    }


    /**
     * Appends the rectangles to the page; each one is stroked and/or filled as the options
     * request. With neither stroke nor fill set, the path is added but not painted.
     */
    private static void drawRectangle2DList(PDDocument document, List<Rectangle2D> rectCollection, Options options, PDPage pdPage) throws IOException {

        // try-with-resources: the stream is closed even if one of the drawing calls throws.
        try (var contentStream = new PDPageContentStream(document, pdPage, PDPageContentStream.AppendMode.APPEND, true)) {
            contentStream.setStrokingColor(options.getStrokeColor());
            contentStream.setNonStrokingColor(options.getFillColor());
            contentStream.setLineWidth(options.getStrokeWidth());

            for (var r : rectCollection) {
                contentStream.addRect((float) r.getMinX(), (float) r.getMinY(), (float) r.getWidth(), (float) r.getHeight());

                if (options.isStroke() && options.isFill()) {
                    contentStream.fillAndStroke();
                } else if (options.isStroke()) {
                    contentStream.stroke();
                } else if (options.isFill()) {
                    contentStream.fill();
                }
            }
        }
    }


    /**
     * Draws two sets of rectangles onto each page and saves the document.
     *
     * @param filename          classpath location of the PDF
     * @param list              second set of rectangles per page, indexed by page number - 1
     * @param rectanglesPerPage first set of rectangles per page, indexed by page number - 1
     * @param tmpFileName       path of the file the result is written to
     */
    @SneakyThrows
    public static void drawRectanglesAndLinesPerPage(String filename, List<List<Rectangle2D>> list, List<List<Rectangle2D>> rectanglesPerPage, String tmpFileName) {

        ClassPathResource pdfResource = new ClassPathResource(filename);
        try (PDDocument pdDocument = Loader.loadPDF(pdfResource.getFile()); var out = new FileOutputStream(tmpFileName)) {
            for (int pageNumber = 1; pageNumber < pdDocument.getNumberOfPages() + 1; pageNumber++) {
                PdfVisualisationUtility.drawRectangle2DList(pdDocument,
                        pageNumber,
                        rectanglesPerPage.get(pageNumber - 1),
                        PdfVisualisationUtility.Options.builder().stroke(true).build());
                PdfVisualisationUtility.drawRectangle2DList(pdDocument, pageNumber, list.get(pageNumber - 1), PdfVisualisationUtility.Options.builder().stroke(true).build());
            }
            pdDocument.save(out);
        }
    }


    /**
     * Draws the given rulings in red onto each page and saves the document.
     *
     * @param filename     classpath location of the PDF
     * @param linesPerPage rulings per page, indexed by page number - 1
     * @param tmpFileName  path of the file the result is written to
     */
    @SneakyThrows
    public static void drawLinesPerPage(String filename, List<List<Ruling>> linesPerPage, String tmpFileName) {

        ClassPathResource pdfResource = new ClassPathResource(filename);
        try (PDDocument pdDocument = Loader.loadPDF(pdfResource.getFile()); var out = new FileOutputStream(tmpFileName)) {
            for (int pageNumber = 1; pageNumber < pdDocument.getNumberOfPages() + 1; pageNumber++) {
                PdfVisualisationUtility.drawLine2DList(pdDocument,
                        pageNumber,
                        linesPerPage.get(pageNumber - 1),
                        PdfVisualisationUtility.Options.builder().strokeColor(Color.RED).stroke(true).build());
            }
            pdDocument.save(out);
        }
    }


    /**
     * Stroke and fill settings. Colours default to black and the stroke width to 1;
     * stroke and fill are off unless enabled.
     */
    @Builder
    @AllArgsConstructor
    @Getter
    @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
    public static class Options {

        boolean stroke;
        @Builder.Default
        Color strokeColor = Color.BLACK;
        @Builder.Default
        float strokeWidth = 1f;

        boolean fill;
        @Builder.Default
        Color fillColor = Color.BLACK;

    }


    /** Stroke-only options with a colour chosen by the entry's node type. */
    private static Options buildStandardOptionsForNodes(DocumentTree.Entry entry) {

        return Options.builder().stroke(true).strokeColor(switch (entry.getType()) {
            case DOCUMENT -> Color.LIGHT_GRAY;
            case HEADER, FOOTER -> Color.GREEN;
            case PARAGRAPH -> Color.BLUE;
            case HEADLINE -> Color.RED;
            case SECTION -> Color.BLACK;
            case TABLE -> Color.ORANGE;
            case TABLE_CELL -> Color.GRAY;
            case IMAGE -> Color.MAGENTA;
        }).build();
    }


    /**
     * Draws the entry's bounding box on every page it covers together with a label just above
     * the box. Section boxes are padded by 10; table cell labels are drawn rotated.
     */
    private static void drawBBoxAndLabelAndNumberOnPage(PDDocument document, DocumentTree.Entry entry, Options options) {

        Map<Page, Rectangle2D> rectanglesPerPage = entry.getNode().getBBox();
        for (Map.Entry<Page, Rectangle2D> pageAndBox : rectanglesPerPage.entrySet()) {
            Page page = pageAndBox.getKey();
            Rectangle2D rectangle2D = pageAndBox.getValue();
            if (entry.getType() == NodeType.SECTION) {
                rectangle2D = RectangleTransformations.pad(rectangle2D, 10, 10);
            }
            drawRectangle2DList(document, page.getNumber(), List.of(rectangle2D), options);
            drawText(buildString(entry),
                    document,
                    new Point2D.Double(rectangle2D.getMinX(), rectangle2D.getMaxY() + 2),
                    page.getNumber(),
                    options,
                    entry.getType() == NodeType.TABLE_CELL);
        }
    }


    /** Label text: number on page, tree id and node type, separated by ": ". */
    private static String buildString(DocumentTree.Entry entry) {

        return entry.getNode().getNumberOnPage() + ": " + entry.getTreeId() + ": " + entry.getType();
    }

}

View File

@ -25,6 +25,8 @@ public class ViewerDocumentTest extends BuildDocumentTest {
@SneakyThrows
public void testViewerDocument() {
System.out.println("<<<<<<<<<<" + Math.sin(-0) + "aaa " + Math.cos(-0));
String fileName = "files/Plenarprotokoll 1 (keine Druchsache!) (1).pdf";
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";