More refactoring
This commit is contained in:
parent
4de6c12aec
commit
4871e55f2d
@ -90,7 +90,7 @@ public class LayoutParsingPipeline {
|
||||
RedactManagerBlockificationService redactManagerBlockificationService;
|
||||
LayoutGridService layoutGridService;
|
||||
ObservationRegistry observationRegistry;
|
||||
// DocstrumSegmenter docstrumSegmenter;
|
||||
DocstrumSegmenter docstrumSegmenter;
|
||||
HierarchicalReadingOrderResolver hierarchicalReadingOrderResolver;
|
||||
|
||||
|
||||
@ -251,7 +251,7 @@ public class LayoutParsingPipeline {
|
||||
|
||||
// Docstrum
|
||||
AtomicInteger num = new AtomicInteger(pageNumber);
|
||||
var zones = new DocstrumSegmenter().segmentPage(stripper.getTextPositionSequences());
|
||||
var zones = docstrumSegmenter.segmentPage(stripper.getTextPositionSequences());
|
||||
zones = hierarchicalReadingOrderResolver.resolve(zones);
|
||||
|
||||
List<AbstractPageBlock> pageBlocks = new ArrayList<>();
|
||||
|
||||
@ -22,9 +22,16 @@ import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.mo
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.utils.BoundingBoxBuilder;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.utils.ZoneUtils;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
public class DocstrumSegmenter {
|
||||
|
||||
private final NearestNeighbourService nearestNeighbourService;
|
||||
private final SpacingService spacingService;
|
||||
private final LineBuilderService lineBuilderService;
|
||||
|
||||
public static final int MAX_ZONES_PER_PAGE = 300;
|
||||
|
||||
private static final double DISTANCE_STEP = 16.0;
|
||||
@ -167,18 +174,14 @@ public class DocstrumSegmenter {
|
||||
|
||||
var components = positions.stream().map(chunk -> new Character(chunk)).collect(Collectors.toList());
|
||||
|
||||
Character[] componentsArray = new Character[positions.size()];
|
||||
components.toArray(componentsArray);
|
||||
nearestNeighbourService.findNearestNeighbors(components);
|
||||
|
||||
Arrays.sort(componentsArray, Character.CharacterXComparator.getInstance());
|
||||
findNeighbors(componentsArray);
|
||||
double orientation = 0;
|
||||
|
||||
double orientation = computeInitialOrientation(components);
|
||||
double characterSpacing = spacingService.computeCharacterSpacing(components);
|
||||
double lineSpacing = spacingService.computeLineSpacing(components);
|
||||
|
||||
double characterSpacing = computeCharacterSpacing(components, orientation);
|
||||
double lineSpacing = computeLineSpacing(components, orientation);
|
||||
|
||||
List<ComponentLine> lines = determineLines(components, characterSpacing * COMP_DIST_CHAR, lineSpacing * MAX_VERTICAL_COMP_DIST);
|
||||
List<ComponentLine> lines = lineBuilderService.buildLines(components, characterSpacing, lineSpacing);
|
||||
|
||||
List<List<ComponentLine>> zones = determineZones(lines,
|
||||
orientation,
|
||||
@ -322,34 +325,52 @@ public class DocstrumSegmenter {
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Groups components into text lines.
|
||||
*
|
||||
* @param components component list
|
||||
* @param maxHorizontalDistance - maximum horizontal distance between components
|
||||
* @param maxVerticalDistance - maximum vertical distance between components
|
||||
* @return lines of components
|
||||
*/
|
||||
private List<ComponentLine> determineLines(List<Character> components, double maxHorizontalDistance, double maxVerticalDistance) {
|
||||
private List<ComponentLine> determineLines(List<Character> characters, double characterSpacing, double lineSpacing) {
|
||||
|
||||
DisjointSets<Character> sets = new DisjointSets<Character>(components);
|
||||
double maxHorizontalDistance = characterSpacing * COMP_DIST_CHAR;
|
||||
double maxVerticalDistance = lineSpacing * MAX_VERTICAL_COMP_DIST;
|
||||
|
||||
// DisjointSets<Character> sets = new DisjointSets<Character>(characters);
|
||||
// AngleFilter filter = AngleFilter.newInstance(-ANGLE_TOLERANCE, ANGLE_TOLERANCE);
|
||||
// for (Character component : characters) {
|
||||
// for (Neighbor neighbor : component.getNeighbors()) {
|
||||
// double x = neighbor.getHorizontalDistance() / maxHorizontalDistance;
|
||||
// double y = neighbor.getVerticalDistance() / maxVerticalDistance;
|
||||
// if (filter.matches(neighbor) && x * x + y * y <= 1) {
|
||||
// sets.union(component, neighbor.getCharacter());
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// List<ComponentLine> lines = new ArrayList<ComponentLine>();
|
||||
// for (Set<Character> group : sets) {
|
||||
// List<Character> lineComponents = new ArrayList<Character>(group);
|
||||
// lineComponents.sort(Comparator.comparingDouble(Character::getX));
|
||||
// lines.add(new ComponentLine(lineComponents));
|
||||
// }
|
||||
// return lines;
|
||||
|
||||
DisjointSets<Character> sets = new DisjointSets<>(characters);
|
||||
AngleFilter filter = AngleFilter.newInstance(-ANGLE_TOLERANCE, ANGLE_TOLERANCE);
|
||||
for (Character component : components) {
|
||||
for (Neighbor neighbor : component.getNeighbors()) {
|
||||
|
||||
characters.forEach(character -> {
|
||||
character.getNeighbors().forEach(neighbor -> {
|
||||
double x = neighbor.getHorizontalDistance() / maxHorizontalDistance;
|
||||
double y = neighbor.getVerticalDistance() / maxVerticalDistance;
|
||||
if (filter.matches(neighbor) && x * x + y * y <= 1) {
|
||||
sets.union(component, neighbor.getCharacter());
|
||||
if (filter.matches(neighbor) && Math.pow(x, 2) + Math.pow(y, 2) <= 1) {
|
||||
sets.union(character, neighbor.getCharacter());
|
||||
}
|
||||
}
|
||||
}
|
||||
List<ComponentLine> lines = new ArrayList<ComponentLine>();
|
||||
for (Set<Character> group : sets) {
|
||||
List<Character> lineComponents = new ArrayList<Character>(group);
|
||||
Collections.sort(lineComponents, Character.CharacterXComparator.getInstance());
|
||||
});
|
||||
});
|
||||
|
||||
List<ComponentLine> lines = new ArrayList<>();
|
||||
sets.forEach(group -> {
|
||||
List<Character> lineComponents = new ArrayList<>(group);
|
||||
lineComponents.sort(Comparator.comparingDouble(Character::getX));
|
||||
lines.add(new ComponentLine(lineComponents));
|
||||
}
|
||||
});
|
||||
|
||||
return lines;
|
||||
|
||||
}
|
||||
|
||||
|
||||
@ -508,7 +529,7 @@ public class DocstrumSegmenter {
|
||||
this.y1 = a + b * this.x1;
|
||||
} else if (!components.isEmpty()) {
|
||||
Character component = components.get(0);
|
||||
double dx = component.getChunk().getWidthDirAdj() / 3;
|
||||
double dx = component.getTextPosition().getWidthDirAdj() / 3;
|
||||
double dy = dx * Math.tan(0);
|
||||
this.x0 = component.getX() - dx;
|
||||
this.x1 = component.getX() + dx;
|
||||
@ -590,14 +611,15 @@ public class DocstrumSegmenter {
|
||||
Character previousComponent = null;
|
||||
for (Character component : components) {
|
||||
if (previousComponent != null) {
|
||||
double dist = component.getChunk().getXDirAdj() - previousComponent.getChunk().getXDirAdj() - previousComponent.getChunk().getWidthDirAdj();
|
||||
double dist = component.getTextPosition().getXDirAdj() - previousComponent.getTextPosition().getXDirAdj() - previousComponent.getTextPosition()
|
||||
.getWidthDirAdj();
|
||||
if (dist > wordSpacing) {
|
||||
BoundingBoxBuilder.setBounds(word);
|
||||
line.addWord(word);
|
||||
word = new Word();
|
||||
}
|
||||
}
|
||||
word.addChunk(component.getChunk());
|
||||
word.addChunk(component.getTextPosition());
|
||||
previousComponent = component;
|
||||
}
|
||||
BoundingBoxBuilder.setBounds(word);
|
||||
|
||||
@ -0,0 +1,48 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.DisjointSets;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.docstrum.Character;
|
||||
|
||||
@Service
|
||||
public class LineBuilderService {
|
||||
|
||||
private static final double CHARACTER_SPACING_DISTANCE_MULTIPLIER = 3.5;
|
||||
private static final double MAX_VERTICAL_CHARACTER_DISTANCE = 0.67;
|
||||
private static final double ANGLE_TOLERANCE = Math.PI / 6;
|
||||
|
||||
|
||||
public List<DocstrumSegmenter.ComponentLine> buildLines(List<Character> characters, double characterSpacing, double lineSpacing) {
|
||||
|
||||
double maxHorizontalDistance = characterSpacing * CHARACTER_SPACING_DISTANCE_MULTIPLIER;
|
||||
double maxVerticalDistance = lineSpacing * MAX_VERTICAL_CHARACTER_DISTANCE;
|
||||
|
||||
DisjointSets<Character> sets = new DisjointSets<>(characters);
|
||||
AngleFilter filter = AngleFilter.newInstance(-ANGLE_TOLERANCE, ANGLE_TOLERANCE);
|
||||
|
||||
characters.forEach(character -> {
|
||||
character.getNeighbors().forEach(neighbor -> {
|
||||
double x = neighbor.getHorizontalDistance() / maxHorizontalDistance;
|
||||
double y = neighbor.getVerticalDistance() / maxVerticalDistance;
|
||||
if (filter.matches(neighbor) && Math.pow(x, 2) + Math.pow(y, 2) <= 1) {
|
||||
sets.union(character, neighbor.getCharacter());
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
List<DocstrumSegmenter.ComponentLine> lines = new ArrayList<>();
|
||||
sets.forEach(group -> {
|
||||
List<Character> lineComponents = new ArrayList<>(group);
|
||||
lineComponents.sort(Comparator.comparingDouble(Character::getX));
|
||||
lines.add(new DocstrumSegmenter.ComponentLine(lineComponents));
|
||||
});
|
||||
|
||||
return lines;
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,78 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.docstrum.Character;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.docstrum.Neighbor;
|
||||
|
||||
@Service
|
||||
public class NearestNeighbourService {
|
||||
|
||||
private static final int NUMBER_OF_NEIGHBOURS = 8;
|
||||
private static final double STEP = 16.0;
|
||||
|
||||
|
||||
public void findNearestNeighbors(List<Character> characters) {
|
||||
|
||||
if (characters.isEmpty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
characters.sort(Comparator.comparingDouble(Character::getX));
|
||||
|
||||
int maxNeighborCount = NUMBER_OF_NEIGHBOURS;
|
||||
if (characters.size() <= NUMBER_OF_NEIGHBOURS) {
|
||||
maxNeighborCount = characters.size() - 1;
|
||||
}
|
||||
|
||||
for (int i = 0; i < characters.size(); i++) {
|
||||
|
||||
List<Neighbor> candidates = new ArrayList<>();
|
||||
|
||||
int start = i;
|
||||
int end = i + 1;
|
||||
|
||||
double distance = Double.POSITIVE_INFINITY;
|
||||
|
||||
for (double searchDistance = 0; searchDistance < distance; ) {
|
||||
|
||||
searchDistance += STEP;
|
||||
boolean newCandidatesFound = false;
|
||||
|
||||
while (start > 0 && characters.get(i).getX() - characters.get(start - 1).getX() < searchDistance) {
|
||||
start--;
|
||||
candidates.add(new Neighbor(characters.get(start), characters.get(i)));
|
||||
clearLeastDistant(candidates, maxNeighborCount);
|
||||
newCandidatesFound = true;
|
||||
}
|
||||
|
||||
while (end < characters.size() && characters.get(end).getX() - characters.get(i).getX() < searchDistance) {
|
||||
candidates.add(new Neighbor(characters.get(end), characters.get(i)));
|
||||
clearLeastDistant(candidates, maxNeighborCount);
|
||||
end++;
|
||||
newCandidatesFound = true;
|
||||
}
|
||||
|
||||
if (newCandidatesFound && candidates.size() >= maxNeighborCount) {
|
||||
distance = candidates.get(maxNeighborCount - 1).getDistance();
|
||||
}
|
||||
}
|
||||
clearLeastDistant(candidates, maxNeighborCount);
|
||||
characters.get(i).setNeighbors(new ArrayList<>(candidates));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void clearLeastDistant(List<Neighbor> candidates, int maxNeighborCount) {
|
||||
|
||||
if (candidates.size() > maxNeighborCount) {
|
||||
candidates.sort(Comparator.comparingDouble(Neighbor::getDistance));
|
||||
candidates.remove(candidates.remove(candidates.size() - 1));
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,55 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Histogram;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.docstrum.Character;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.docstrum.Neighbor;
|
||||
|
||||
@Service
|
||||
public class SpacingService {
|
||||
|
||||
private static final double SPACING_HISTOGRAM_RESOLUTION = 0.5;
|
||||
private static final double SPACING_HISTOGRAM_SMOOTHING_LENGTH = 2.5;
|
||||
private static final double SPACING_HIST_SMOOTHING_STANDARD_DEVIATION = 0.5;
|
||||
private static final double ANGLE_TOLERANCE = Math.PI / 6;
|
||||
|
||||
|
||||
public double computeCharacterSpacing(List<Character> components) {
|
||||
|
||||
return computeSpacing(components, 0);
|
||||
}
|
||||
|
||||
|
||||
public double computeLineSpacing(List<Character> components) {
|
||||
|
||||
return computeSpacing(components, Math.PI / 2);
|
||||
}
|
||||
|
||||
|
||||
private double computeSpacing(List<Character> components, double angle) {
|
||||
|
||||
double maxDistance = Double.NEGATIVE_INFINITY;
|
||||
|
||||
for (Character component : components) {
|
||||
for (Neighbor neighbor : component.getNeighbors()) {
|
||||
maxDistance = Math.max(maxDistance, neighbor.getDistance());
|
||||
}
|
||||
}
|
||||
Histogram histogram = new Histogram(0, maxDistance, SPACING_HISTOGRAM_RESOLUTION);
|
||||
AngleFilter filter = AngleFilter.newInstance(angle - ANGLE_TOLERANCE, angle + ANGLE_TOLERANCE);
|
||||
for (Character component : components) {
|
||||
for (Neighbor neighbor : component.getNeighbors()) {
|
||||
if (filter.matches(neighbor)) {
|
||||
histogram.add(neighbor.getDistance());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
histogram.gaussianSmooth(SPACING_HISTOGRAM_SMOOTHING_LENGTH, SPACING_HIST_SMOOTHING_STANDARD_DEVIATION);
|
||||
return histogram.getPeakValue();
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,84 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.DisjointSets;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.docstrum.CharacterLine;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.docstrum.CharacterZone;
|
||||
|
||||
@Service
|
||||
public class ZoneBuilderService {
|
||||
|
||||
private static final double MIN_HORIZONTAL_DISTANCE_MULTIPLIER = -0.5;
|
||||
private static final double MAX_VERTICAL_DISTANCE_MULTIPLIER = 1.2;
|
||||
|
||||
private static final double MIN_HORIZONTAL_MERGE_DISTANCE_MULTIPLIER = -3.0;
|
||||
|
||||
private static final double MAX_VERTICAL_MERGE_DISTANCE_MULTIPLIER = 0.5;
|
||||
|
||||
private static final double MIN_LINE_SIZE_SCALE = 0.9;
|
||||
|
||||
private static final double MAX_LINE_SIZE_SCALE = 2.5;
|
||||
|
||||
private static final double ANGLE_TOLERANCE = Math.PI / 6;
|
||||
|
||||
|
||||
public List<CharacterZone> buildZones(List<CharacterLine> lines, double characterSpacing, double lineSpacing) {
|
||||
|
||||
double minHorizontalDistance = characterSpacing * MIN_HORIZONTAL_DISTANCE_MULTIPLIER;
|
||||
double maxVerticalDistance = lineSpacing * MAX_VERTICAL_DISTANCE_MULTIPLIER;
|
||||
double minHorizontalMergeDistance = characterSpacing * MIN_HORIZONTAL_MERGE_DISTANCE_MULTIPLIER;
|
||||
double maxVerticalMergeDistance = lineSpacing * MAX_VERTICAL_MERGE_DISTANCE_MULTIPLIER;
|
||||
|
||||
DisjointSets<CharacterLine> sets = new DisjointSets<>(lines);
|
||||
|
||||
double meanHeight = calculateMeanHeight(lines);
|
||||
|
||||
lines.forEach(outerLine -> //
|
||||
lines.forEach(innerLine -> {
|
||||
double scale = Math.min(outerLine.getHeight(), innerLine.getHeight()) / meanHeight;
|
||||
scale = Math.max(MIN_LINE_SIZE_SCALE, Math.min(scale, MAX_LINE_SIZE_SCALE));
|
||||
|
||||
if (!sets.areTogether(outerLine, innerLine) && outerLine.angularDifference(innerLine) <= ANGLE_TOLERANCE) {
|
||||
|
||||
double horizontalDistance = outerLine.horizontalDistance(innerLine) / scale;
|
||||
double verticalDistance = outerLine.verticalDistance(innerLine) / scale;
|
||||
|
||||
// Line over or above
|
||||
if (minHorizontalDistance <= horizontalDistance && verticalDistance <= maxVerticalDistance) {
|
||||
sets.union(outerLine, innerLine);
|
||||
}
|
||||
|
||||
// Split line that needs later merging
|
||||
else if (minHorizontalMergeDistance <= horizontalDistance && verticalDistance <= maxVerticalMergeDistance) {
|
||||
sets.union(outerLine, innerLine);
|
||||
}
|
||||
}
|
||||
}));
|
||||
|
||||
List<CharacterZone> zones = new ArrayList<>();
|
||||
sets.forEach(group -> {
|
||||
zones.add(new CharacterZone(new ArrayList<>(group)));
|
||||
});
|
||||
|
||||
return zones;
|
||||
}
|
||||
|
||||
|
||||
private double calculateMeanHeight(List<CharacterLine> lines) {
|
||||
|
||||
double meanHeight = 0.0;
|
||||
double weights = 0.0;
|
||||
for (CharacterLine line : lines) {
|
||||
double weight = line.getLength();
|
||||
meanHeight += line.getHeight() * weight;
|
||||
weights += weight;
|
||||
}
|
||||
meanHeight /= weights;
|
||||
return meanHeight;
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,35 +1,33 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.docstrum;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.Comparator;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.Getter;
|
||||
|
||||
@Data
|
||||
public class Character {
|
||||
|
||||
private final double x;
|
||||
private final double y;
|
||||
private final RedTextPosition chunk;
|
||||
private final RedTextPosition textPosition;
|
||||
|
||||
private List<Neighbor> neighbors;
|
||||
private List<Neighbor> neighbors = new ArrayList<>();
|
||||
|
||||
|
||||
public Character(RedTextPosition chunk) {
|
||||
|
||||
this.x = chunk.getXDirAdj() + chunk.getWidthDirAdj() / 2;
|
||||
this.y = chunk.getYDirAdj() + chunk.getHeightDir() / 2;
|
||||
this.chunk = chunk;
|
||||
this.textPosition = chunk;
|
||||
}
|
||||
|
||||
|
||||
public double getHeight() {
|
||||
|
||||
return chunk.getHeightDir();
|
||||
return textPosition.getHeightDir();
|
||||
}
|
||||
|
||||
|
||||
@ -68,43 +66,4 @@ public class Character {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public double overlappingDistance(Character other, double orientation) {
|
||||
|
||||
double[] xs = new double[4];
|
||||
double s = Math.sin(-orientation), c = Math.cos(-orientation);
|
||||
xs[0] = c * x - s * y;
|
||||
xs[1] = c * (x + chunk.getWidthDirAdj()) - s * (y + chunk.getHeightDir());
|
||||
xs[2] = c * other.x - s * other.y;
|
||||
xs[3] = c * (other.x + other.chunk.getWidthDirAdj()) - s * (other.y + other.chunk.getHeightDir());
|
||||
boolean overlapping = xs[1] >= xs[2] && xs[3] >= xs[0];
|
||||
Arrays.sort(xs);
|
||||
return Math.abs(xs[2] - xs[1]) * (overlapping ? 1 : -1);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Component comparator based on x coordinate of the centroid of component.
|
||||
* <p>
|
||||
* The ordering is not consistent with equals.
|
||||
*/
|
||||
public static final class CharacterXComparator implements Comparator<Character> {
|
||||
|
||||
private CharacterXComparator() {
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public int compare(Character o1, Character o2) {
|
||||
|
||||
return Double.compare(o1.getX(), o2.getX());
|
||||
}
|
||||
|
||||
|
||||
@Getter
|
||||
private static final CharacterXComparator instance = new CharacterXComparator();
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,111 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.docstrum;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
public class CharacterLine {
|
||||
|
||||
private final double x0;
|
||||
private final double y0;
|
||||
|
||||
private final double x1;
|
||||
private final double y1;
|
||||
|
||||
private final double height;
|
||||
|
||||
private final List<Character> characters;
|
||||
|
||||
|
||||
public CharacterLine(List<Character> characters) {
|
||||
|
||||
this.characters = characters;
|
||||
|
||||
if (characters.size() >= 2) {
|
||||
// Simple linear regression
|
||||
double sx = 0.0, sxx = 0.0, sxy = 0.0, sy = 0.0;
|
||||
for (Character component : characters) {
|
||||
sx += component.getX();
|
||||
sxx += component.getX() * component.getX();
|
||||
sxy += component.getX() * component.getY();
|
||||
sy += component.getY();
|
||||
}
|
||||
double b = (characters.size() * sxy - sx * sy) / (characters.size() * sxx - sx * sx);
|
||||
double a = (sy - b * sx) / characters.size();
|
||||
|
||||
this.x0 = characters.get(0).getX();
|
||||
this.y0 = a + b * this.x0;
|
||||
this.x1 = characters.get(characters.size() - 1).getX();
|
||||
this.y1 = a + b * this.x1;
|
||||
} else if (!characters.isEmpty()) {
|
||||
Character component = characters.get(0);
|
||||
double dx = component.getTextPosition().getWidthDirAdj() / 3;
|
||||
double dy = dx * Math.tan(0);
|
||||
this.x0 = component.getX() - dx;
|
||||
this.x1 = component.getX() + dx;
|
||||
this.y0 = component.getY() - dy;
|
||||
this.y1 = component.getY() + dy;
|
||||
} else {
|
||||
throw new IllegalArgumentException("Component list must not be empty");
|
||||
}
|
||||
height = computeHeight();
|
||||
}
|
||||
|
||||
|
||||
public double getAngle() {
|
||||
|
||||
return Math.atan2(y1 - y0, x1 - x0);
|
||||
}
|
||||
|
||||
|
||||
public double getLength() {
|
||||
|
||||
return Math.sqrt((x0 - x1) * (x0 - x1) + (y0 - y1) * (y0 - y1));
|
||||
}
|
||||
|
||||
|
||||
private double computeHeight() {
|
||||
|
||||
double sum = 0.0;
|
||||
for (Character component : characters) {
|
||||
sum += component.getHeight();
|
||||
}
|
||||
return sum / characters.size();
|
||||
}
|
||||
|
||||
|
||||
public double angularDifference(CharacterLine j) {
|
||||
|
||||
double diff = Math.abs(getAngle() - j.getAngle());
|
||||
if (diff <= Math.PI / 2) {
|
||||
return diff;
|
||||
} else {
|
||||
return Math.PI - diff;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public double horizontalDistance(CharacterLine other) {
|
||||
|
||||
double[] xs = new double[4];
|
||||
double s = 0, c = 1;
|
||||
xs[0] = c * x0 - s * y0;
|
||||
xs[1] = c * x1 - s * y1;
|
||||
xs[2] = c * other.x0 - s * other.y0;
|
||||
xs[3] = c * other.x1 - s * other.y1;
|
||||
boolean overlapping = xs[1] >= xs[2] && xs[3] >= xs[0];
|
||||
Arrays.sort(xs);
|
||||
return Math.abs(xs[2] - xs[1]) * (overlapping ? 1 : -1);
|
||||
}
|
||||
|
||||
|
||||
public double verticalDistance(CharacterLine other) {
|
||||
|
||||
double xm = (x0 + x1) / 2, ym = (y0 + y1) / 2;
|
||||
double xn = (other.x0 + other.x1) / 2, yn = (other.y0 + other.y1) / 2;
|
||||
return Math.abs((xn - xm) + ym - yn) / Math.sqrt(1);
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,17 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.docstrum;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
@Data
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
public class CharacterZone {
|
||||
|
||||
private List<CharacterLine> lines = new ArrayList<>();
|
||||
|
||||
}
|
||||
@ -0,0 +1,270 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.visualization;
|
||||
|
||||
import java.awt.Color;
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.PDPageContentStream;
|
||||
import org.apache.pdfbox.pdmodel.font.PDType1Font;
|
||||
import org.apache.pdfbox.pdmodel.font.Standard14Fonts;
|
||||
import org.apache.pdfbox.util.Matrix;
|
||||
import org.springframework.core.io.ClassPathResource;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.PdfVisualisationUtility;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Getter;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class PdfDraw {
|
||||
|
||||
public static void drawRectanglesPerPage(String filename, List<List<Rectangle2D>> rectanglesPerPage, String tmpFileName) throws IOException {
|
||||
|
||||
ClassPathResource pdfResource = new ClassPathResource(filename);
|
||||
try (PDDocument pdDocument = Loader.loadPDF(pdfResource.getFile()); var out = new FileOutputStream(tmpFileName)) {
|
||||
for (int pageNumber = 1; pageNumber < pdDocument.getNumberOfPages() + 1; pageNumber++) {
|
||||
PdfVisualisationUtility.drawRectangle2DList(pdDocument,
|
||||
pageNumber,
|
||||
rectanglesPerPage.get(pageNumber - 1),
|
||||
PdfVisualisationUtility.Options.builder().stroke(true).build());
|
||||
}
|
||||
pdDocument.save(out);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
public static void drawRectanglesPerPageNumberedByLine(String filename, List<List<List<Rectangle2D>>> rectanglesPerPage, String tmpFileName) throws IOException {
|
||||
|
||||
ClassPathResource pdfResource = new ClassPathResource(filename);
|
||||
try (PDDocument pdDocument = Loader.loadPDF(pdfResource.getFile()); var out = new FileOutputStream(tmpFileName)) {
|
||||
for (int pageNumber = 1; pageNumber < pdDocument.getNumberOfPages() + 1; pageNumber++) {
|
||||
var rectanglesOnPage = rectanglesPerPage.get(pageNumber - 1);
|
||||
for (int lineNumber = 0; lineNumber < rectanglesOnPage.size(); lineNumber++) {
|
||||
var rectanglesInLine = rectanglesOnPage.get(lineNumber);
|
||||
PdfVisualisationUtility.drawRectangle2DList(pdDocument, pageNumber, rectanglesInLine, PdfVisualisationUtility.Options.builder().stroke(true).build());
|
||||
double y = Math.min(rectanglesInLine.get(0).getMinY(), rectanglesInLine.get(0).getMaxY());
|
||||
PdfVisualisationUtility.drawText(String.format("%d", lineNumber),
|
||||
pdDocument,
|
||||
new Point2D.Double(rectanglesInLine.get(0).getX() - (5 + (5 * countNumberOfDigits(lineNumber))), y + 2),
|
||||
pageNumber,
|
||||
PdfVisualisationUtility.Options.builder().stroke(true).build());
|
||||
}
|
||||
}
|
||||
pdDocument.save(out);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
private static int countNumberOfDigits(int num) {
|
||||
|
||||
int final_num = num;
|
||||
if (final_num == 0) {
|
||||
return 1;
|
||||
}
|
||||
int count = 0;
|
||||
for (; final_num != 0; final_num /= 10) {
|
||||
count++;
|
||||
}
|
||||
return count;
|
||||
}
|
||||
|
||||
|
||||
public static void drawDocumentGraph(PDDocument document, Document documentGraph) {
|
||||
|
||||
documentGraph.getDocumentTree().allEntriesInOrder().forEach(entry -> drawNode(document, entry));
|
||||
}
|
||||
|
||||
|
||||
public static void drawNode(PDDocument document, DocumentTree.Entry entry) {
|
||||
|
||||
Options options = buildStandardOptionsForNodes(entry);
|
||||
|
||||
drawBBoxAndLabelAndNumberOnPage(document, entry, options);
|
||||
|
||||
}
|
||||
|
||||
|
||||
public static void drawTextBlock(PDDocument document, TextBlock textBlock, Options options) {
|
||||
|
||||
textBlock.getAtomicTextBlocks().forEach(atb -> drawAtomicTextBlock(document, atb, options));
|
||||
}
|
||||
|
||||
|
||||
public static void drawAtomicTextBlock(PDDocument document, AtomicTextBlock atomicTextBlock, Options options) {
|
||||
|
||||
drawRectangle2DList(document, atomicTextBlock.getPage().getNumber(), atomicTextBlock.getPositions().stream().toList(), options);
|
||||
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private static void drawText(String string, PDDocument document, Point2D location, Integer pageNumber, Options options, boolean rotate) {
|
||||
|
||||
var pdPage = document.getPage(pageNumber - 1);
|
||||
var contentStream = new PDPageContentStream(document, pdPage, PDPageContentStream.AppendMode.APPEND, true);
|
||||
|
||||
contentStream.setNonStrokingColor(options.getStrokeColor());
|
||||
contentStream.setLineWidth(options.getStrokeWidth());
|
||||
|
||||
contentStream.beginText();
|
||||
if (rotate) {
|
||||
contentStream.setTextMatrix(Matrix.getRotateInstance(Math.toRadians(15), (float) location.getX(), (float) location.getY()));
|
||||
} else {
|
||||
contentStream.newLineAtOffset((float) location.getX(), (float) location.getY());
|
||||
}
|
||||
contentStream.setFont(new PDType1Font(Standard14Fonts.FontName.HELVETICA), 10);
|
||||
contentStream.showText(string);
|
||||
contentStream.endText();
|
||||
contentStream.close();
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public static void drawRectangle2DList(PDDocument document, int pageNumber, List<Rectangle2D> rectCollection, Options options) {
|
||||
|
||||
var pdPage = document.getPage(pageNumber - 1);
|
||||
drawRectangle2DList(document, rectCollection, options, pdPage);
|
||||
}
|
||||
|
||||
|
||||
private static void drawRectangle2DList(PDDocument document, List<Rectangle2D> rectCollection, Options options, PDPage pdPage) throws IOException {
|
||||
|
||||
var contentStream = new PDPageContentStream(document, pdPage, PDPageContentStream.AppendMode.APPEND, true);
|
||||
|
||||
contentStream.setStrokingColor(options.getStrokeColor());
|
||||
contentStream.setNonStrokingColor(options.getFillColor());
|
||||
contentStream.setLineWidth(options.getStrokeWidth());
|
||||
|
||||
for (var r : rectCollection) {
|
||||
contentStream.addRect((float) r.getMinX(), (float) r.getMinY(), (float) r.getWidth(), (float) r.getHeight());
|
||||
|
||||
if (options.isStroke() && options.isFill()) {
|
||||
contentStream.fillAndStroke();
|
||||
} else if (options.isStroke()) {
|
||||
contentStream.stroke();
|
||||
} else if (options.isFill()) {
|
||||
contentStream.fill();
|
||||
}
|
||||
}
|
||||
contentStream.close();
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public static void drawRectanglesAndLinesPerPage(String filename, List<List<Rectangle2D>> list, List<List<Rectangle2D>> rectanglesPerPage, String tmpFileName) {
|
||||
|
||||
ClassPathResource pdfResource = new ClassPathResource(filename);
|
||||
try (PDDocument pdDocument = Loader.loadPDF(pdfResource.getFile()); var out = new FileOutputStream(tmpFileName)) {
|
||||
for (int pageNumber = 1; pageNumber < pdDocument.getNumberOfPages() + 1; pageNumber++) {
|
||||
// PdfVisualisationUtility.drawLine2DList(pdDocument,
|
||||
// pageNumber,
|
||||
// list.get(pageNumber - 1),
|
||||
// PdfVisualisationUtility.Options.builder().stroke(true).build());
|
||||
PdfVisualisationUtility.drawRectangle2DList(pdDocument,
|
||||
pageNumber,
|
||||
rectanglesPerPage.get(pageNumber - 1),
|
||||
PdfVisualisationUtility.Options.builder().stroke(true).build());
|
||||
PdfVisualisationUtility.drawRectangle2DList(pdDocument, pageNumber, list.get(pageNumber - 1), PdfVisualisationUtility.Options.builder().stroke(true).build());
|
||||
}
|
||||
pdDocument.save(out);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public static void drawLinesPerPage(String filename, List<List<Ruling>> linesPerPage, String tmpFileName) {
|
||||
|
||||
ClassPathResource pdfResource = new ClassPathResource(filename);
|
||||
try (PDDocument pdDocument = Loader.loadPDF(pdfResource.getFile()); var out = new FileOutputStream(tmpFileName)) {
|
||||
for (int pageNumber = 1; pageNumber < pdDocument.getNumberOfPages() + 1; pageNumber++) {
|
||||
PdfVisualisationUtility.drawLine2DList(pdDocument,
|
||||
pageNumber,
|
||||
linesPerPage.get(pageNumber - 1),
|
||||
PdfVisualisationUtility.Options.builder().strokeColor(Color.RED).stroke(true).build());
|
||||
}
|
||||
pdDocument.save(out);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@Getter
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public static class Options {
|
||||
|
||||
boolean stroke;
|
||||
@Builder.Default
|
||||
Color strokeColor = Color.BLACK;
|
||||
@Builder.Default
|
||||
float strokeWidth = 1f;
|
||||
|
||||
boolean fill;
|
||||
@Builder.Default
|
||||
Color fillColor = Color.BLACK;
|
||||
|
||||
}
|
||||
|
||||
|
||||
private static Options buildStandardOptionsForNodes(DocumentTree.Entry entry) {
|
||||
|
||||
return Options.builder().stroke(true).strokeColor(switch (entry.getType()) {
|
||||
case DOCUMENT -> Color.LIGHT_GRAY;
|
||||
case HEADER, FOOTER -> Color.GREEN;
|
||||
case PARAGRAPH -> Color.BLUE;
|
||||
case HEADLINE -> Color.RED;
|
||||
case SECTION -> Color.BLACK;
|
||||
case TABLE -> Color.ORANGE;
|
||||
case TABLE_CELL -> Color.GRAY;
|
||||
case IMAGE -> Color.MAGENTA;
|
||||
}).build();
|
||||
}
|
||||
|
||||
|
||||
private static void drawBBoxAndLabelAndNumberOnPage(PDDocument document, DocumentTree.Entry entry, Options options) {
|
||||
|
||||
Map<Page, Rectangle2D> rectanglesPerPage = entry.getNode().getBBox();
|
||||
for (Page page : rectanglesPerPage.keySet()) {
|
||||
Rectangle2D rectangle2D = rectanglesPerPage.get(page);
|
||||
if (entry.getType() == NodeType.SECTION) {
|
||||
rectangle2D = RectangleTransformations.pad(rectangle2D, 10, 10);
|
||||
}
|
||||
drawRectangle2DList(document, page.getNumber(), List.of(rectangle2D), options);
|
||||
drawText(buildString(entry),
|
||||
document,
|
||||
new Point2D.Double(rectangle2D.getMinX(), rectangle2D.getMaxY() + 2),
|
||||
page.getNumber(),
|
||||
options,
|
||||
entry.getType() == NodeType.TABLE_CELL);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private static String buildString(DocumentTree.Entry entry) {
|
||||
|
||||
return entry.getNode().getNumberOnPage() + ": " + entry.getTreeId() + ": " + entry.getType();
|
||||
}
|
||||
|
||||
}
|
||||
@ -25,6 +25,8 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
||||
@SneakyThrows
|
||||
public void testViewerDocument() {
|
||||
|
||||
System.out.println("<<<<<<<<<<" + Math.sin(-0) + "aaa " + Math.cos(-0));
|
||||
|
||||
String fileName = "files/Plenarprotokoll 1 (keine Druchsache!) (1).pdf";
|
||||
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
|
||||
|
||||
|
||||
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user