Compare commits
10 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
08e994d904 | ||
|
|
0c8c727303 | ||
|
|
72202f63dc | ||
|
|
e14d953b04 | ||
|
|
9e5778d4b2 | ||
|
|
e394f2fa7c | ||
|
|
b2fb6829cb | ||
|
|
4871e55f2d | ||
|
|
4de6c12aec | ||
|
|
4afa8daafa |
@ -10,6 +10,7 @@ import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
import java.util.concurrent.atomic.AtomicReference;
|
||||
|
||||
import org.apache.pdfbox.Loader;
|
||||
@ -26,6 +27,7 @@ import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsi
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationSection;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
@ -47,6 +49,7 @@ import com.knecon.fforesight.service.layoutparser.processor.services.blockificat
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.classification.DocuMineClassificationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.classification.TaasClassificationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.DocstrumSegmentationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.TaasDocumentDataMapper;
|
||||
@ -86,6 +89,7 @@ public class LayoutParsingPipeline {
|
||||
RedactManagerBlockificationService redactManagerBlockificationService;
|
||||
LayoutGridService layoutGridService;
|
||||
ObservationRegistry observationRegistry;
|
||||
DocstrumSegmentationService docstrumSegmentationService;
|
||||
|
||||
|
||||
public LayoutParsingFinishedEvent parseLayoutAndSaveFilesToStorage(LayoutParsingRequest layoutParsingRequest) throws IOException {
|
||||
@ -243,11 +247,37 @@ public class LayoutParsingPipeline {
|
||||
PDRectangle cropbox = pdPage.getCropBox();
|
||||
CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(pdfTableCells.get(pageNumber), stripper.getRulings());
|
||||
|
||||
ClassificationPage classificationPage = switch (layoutParsingType) {
|
||||
case REDACT_MANAGER -> redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
case TAAS -> taasBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
case DOCUMINE -> docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
};
|
||||
// Docstrum
|
||||
AtomicInteger num = new AtomicInteger(pageNumber);
|
||||
var zones = docstrumSegmentationService.segmentPage(stripper.getTextPositionSequences());
|
||||
|
||||
List<AbstractPageBlock> pageBlocks = new ArrayList<>();
|
||||
AtomicInteger numOnPage = new AtomicInteger(1);
|
||||
// List<TextPositionSequence> textPositionSequences = new ArrayList<>();
|
||||
zones.forEach(zone -> {
|
||||
|
||||
List<TextPositionSequence> textPositionSequences = new ArrayList<>();
|
||||
zone.getLines().forEach(line -> {
|
||||
line.getWords().forEach(word -> {
|
||||
textPositionSequences.add(new TextPositionSequence(word.getTextPositions(), num.get()));
|
||||
});
|
||||
});
|
||||
|
||||
var cps = redactManagerBlockificationService.blockify(textPositionSequences, cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
|
||||
cps.getTextBlocks().forEach(cp -> {
|
||||
pageBlocks.add(redactManagerBlockificationService.buildTextBlock(((TextPageBlock) cp).getSequences(), numOnPage.getAndIncrement()));
|
||||
});
|
||||
});
|
||||
|
||||
// ClassificationPage classificationPage = switch (layoutParsingType) {
|
||||
// case REDACT_MANAGER -> redactManagerBlockificationService.blockify(textPositionSequences, cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
// case TAAS -> taasBlockificationService.blockify(textPositionSequences, cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
// case DOCUMINE -> docuMineBlockificationService.blockify(textPositionSequences, cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
// };
|
||||
|
||||
ClassificationPage classificationPage = new ClassificationPage(pageBlocks);
|
||||
|
||||
classificationPage.setCleanRulings(cleanRulings);
|
||||
classificationPage.setRotation(rotation);
|
||||
classificationPage.setLandscape(isLandscape);
|
||||
@ -283,9 +313,19 @@ public class LayoutParsingPipeline {
|
||||
case REDACT_MANAGER -> redactManagerClassificationService.classifyDocument(classificationDocument);
|
||||
}
|
||||
|
||||
List<ClassificationSection> sections = new ArrayList<>();
|
||||
for (var page : classificationPages) {
|
||||
page.getTextBlocks().forEach(block -> {
|
||||
block.setPage(page.getPageNumber());
|
||||
var section = sectionsBuilderService.buildTextBlock(List.of(block), "a");
|
||||
sections.add(section);
|
||||
});
|
||||
}
|
||||
classificationDocument.setSections(sections);
|
||||
|
||||
log.info("Building Sections for {}", identifier);
|
||||
sectionsBuilderService.buildSections(classificationDocument);
|
||||
sectionsBuilderService.addImagesToSections(classificationDocument);
|
||||
// sectionsBuilderService.buildSections(classificationDocument);
|
||||
// sectionsBuilderService.addImagesToSections(classificationDocument);
|
||||
return classificationDocument;
|
||||
}
|
||||
|
||||
|
||||
@ -12,15 +12,16 @@ import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRul
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.Getter;
|
||||
import lombok.NonNull;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
|
||||
|
||||
@Data
|
||||
@RequiredArgsConstructor
|
||||
public class ClassificationPage {
|
||||
|
||||
@NonNull
|
||||
@Getter
|
||||
private List<AbstractPageBlock> textBlocks;
|
||||
|
||||
private List<ClassifiedImage> images = new ArrayList<>();
|
||||
|
||||
@ -45,6 +45,9 @@ public class RedTextPosition {
|
||||
@JsonIgnore
|
||||
private String fontName;
|
||||
|
||||
@JsonIgnore
|
||||
private RedTextPosition parent;
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public static RedTextPosition fromTextPosition(TextPosition textPosition) {
|
||||
|
||||
@ -17,6 +17,7 @@ import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.Getter;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
@EqualsAndHashCode(callSuper = true)
|
||||
@ -27,6 +28,7 @@ import lombok.NoArgsConstructor;
|
||||
public class TextPageBlock extends AbstractPageBlock {
|
||||
|
||||
@Builder.Default
|
||||
@Getter
|
||||
private List<TextPositionSequence> sequences = new ArrayList<>();
|
||||
|
||||
@JsonIgnore
|
||||
@ -73,7 +75,7 @@ public class TextPageBlock extends AbstractPageBlock {
|
||||
|
||||
return sequences.get(0).getPageWidth();
|
||||
}
|
||||
|
||||
|
||||
|
||||
public static TextPageBlock merge(List<TextPageBlock> textBlocksToMerge) {
|
||||
|
||||
@ -82,6 +84,7 @@ public class TextPageBlock extends AbstractPageBlock {
|
||||
return fromTextPositionSequences(sequences);
|
||||
}
|
||||
|
||||
|
||||
public static TextPageBlock fromTextPositionSequences(List<TextPositionSequence> wordBlockList) {
|
||||
|
||||
TextPageBlock textBlock = null;
|
||||
@ -133,7 +136,6 @@ public class TextPageBlock extends AbstractPageBlock {
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Returns the minX value in pdf coordinate system.
|
||||
* Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation.
|
||||
|
||||
@ -55,6 +55,18 @@ public class TextPositionSequence implements CharSequence {
|
||||
}
|
||||
|
||||
|
||||
public TextPositionSequence(List<RedTextPosition> textPositions, int page) {
|
||||
|
||||
this.textPositions = textPositions;
|
||||
this.page = page;
|
||||
this.dir = TextDirection.fromDegrees(textPositions.get(0).getDir());
|
||||
this.rotation = textPositions.get(0).getRotation();
|
||||
this.pageHeight = textPositions.get(0).getPageHeight();
|
||||
this.pageWidth = textPositions.get(0).getPageWidth();
|
||||
this.isParagraphStart = false;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public int length() {
|
||||
|
||||
|
||||
@ -240,7 +240,7 @@ public class SectionsBuilderService {
|
||||
}
|
||||
|
||||
|
||||
private ClassificationSection buildTextBlock(List<AbstractPageBlock> wordBlockList, String lastHeadline) {
|
||||
public ClassificationSection buildTextBlock(List<AbstractPageBlock> wordBlockList, String lastHeadline) {
|
||||
|
||||
ClassificationSection section = new ClassificationSection();
|
||||
|
||||
|
||||
@ -57,7 +57,7 @@ public class RedactManagerBlockificationService {
|
||||
boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, horizontalRulingLines, verticalRulingLines);
|
||||
boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir());
|
||||
|
||||
if (prev != null && (lineSeparation || startFromTop || splitByX || splitByDir || isSplitByRuling)) {
|
||||
if (prev != null && (splitByDir || isSplitByRuling)) {
|
||||
|
||||
Orientation prevOrientation = null;
|
||||
if (!chunkBlockList.isEmpty()) {
|
||||
@ -167,7 +167,7 @@ public class RedactManagerBlockificationService {
|
||||
}
|
||||
|
||||
|
||||
private TextPageBlock buildTextBlock(List<TextPositionSequence> wordBlockList, int indexOnPage) {
|
||||
public TextPageBlock buildTextBlock(List<TextPositionSequence> wordBlockList, int indexOnPage) {
|
||||
|
||||
TextPageBlock textBlock = null;
|
||||
|
||||
|
||||
@ -0,0 +1,153 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Character;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.DisjointSets;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Line;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Zone;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service.LineBuilderService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service.NearestNeighbourService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service.ReadingOrderService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service.SpacingService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service.ZoneBuilderService;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
public class DocstrumSegmentationService {
|
||||
|
||||
private static final double MAX_VERTICAL_MERGE_DIST = 0.5;
|
||||
|
||||
private final NearestNeighbourService nearestNeighbourService;
|
||||
private final SpacingService spacingService;
|
||||
private final LineBuilderService lineBuilderService;
|
||||
private final ZoneBuilderService zoneBuilderService;
|
||||
private final ReadingOrderService readingOrderService;
|
||||
|
||||
|
||||
public List<Zone> segmentPage(List<TextPositionSequence> textPositions) {
|
||||
|
||||
var positions = textPositions.stream().map(TextPositionSequence::getTextPositions).flatMap(List::stream).toList();
|
||||
|
||||
var characters = positions.stream().map(Character::new).collect(Collectors.toList());
|
||||
|
||||
nearestNeighbourService.findNearestNeighbors(characters);
|
||||
|
||||
var characterSpacing = spacingService.computeCharacterSpacing(characters);
|
||||
var lineSpacing = spacingService.computeLineSpacing(characters);
|
||||
|
||||
var lines = lineBuilderService.buildLines(characters, characterSpacing, lineSpacing);
|
||||
|
||||
var zones = zoneBuilderService.buildZones(lines, characterSpacing, lineSpacing);
|
||||
|
||||
zones = mergeLinesInZones(zones, characterSpacing, Double.NEGATIVE_INFINITY, 0.0, 0.0, lineSpacing * MAX_VERTICAL_MERGE_DIST);
|
||||
|
||||
return readingOrderService.resolve(zones, false);
|
||||
}
|
||||
|
||||
// private List<Zone> mergeZones(List<Zone> zones, double tolerance) {
|
||||
//
|
||||
// List<BxBounds> bounds = new ArrayList<BxBounds>(zones.size());
|
||||
// for (List<ComponentLine> zone : zones) {
|
||||
// BxBoundsBuilder builder = new BxBoundsBuilder();
|
||||
// for (ComponentLine line : zone) {
|
||||
// for (Component component : line.getComponents()) {
|
||||
// builder.expand(component.getChunk().getBounds());
|
||||
// }
|
||||
// }
|
||||
// bounds.add(builder.getBounds());
|
||||
// }
|
||||
//
|
||||
// List<List<ComponentLine>> outputZones = new ArrayList<List<ComponentLine>>();
|
||||
// mainFor:
|
||||
// for (int i = 0; i < zones.size(); i++) {
|
||||
// for (int j = 0; j < zones.size(); j++) {
|
||||
// if (i == j || bounds.get(j) == null || bounds.get(i) == null) {
|
||||
// continue;
|
||||
// }
|
||||
// if (BxModelUtils.contains(bounds.get(j), bounds.get(i), tolerance)) {
|
||||
// zones.get(j).addAll(zones.get(i));
|
||||
// bounds.set(i, null);
|
||||
// continue mainFor;
|
||||
// }
|
||||
// }
|
||||
// outputZones.add(zones.get(i));
|
||||
// }
|
||||
// return outputZones;
|
||||
// }
|
||||
|
||||
|
||||
private List<Zone> mergeLinesInZones(List<Zone> zones,
|
||||
double wordSpacing,
|
||||
double minHorizontalDistance,
|
||||
double maxHorizontalDistance,
|
||||
double minVerticalDistance,
|
||||
double maxVerticalDistance) {
|
||||
|
||||
List<Zone> outputZones = new ArrayList<>(zones.size());
|
||||
for (Zone zone : zones) {
|
||||
outputZones.add(mergeLinesInZone(zone, wordSpacing, minHorizontalDistance, maxHorizontalDistance, minVerticalDistance, maxVerticalDistance));
|
||||
}
|
||||
return outputZones;
|
||||
}
|
||||
|
||||
|
||||
private Zone mergeLinesInZone(Zone zone,
|
||||
double wordSpacing,
|
||||
double minHorizontalDistance,
|
||||
double maxHorizontalDistance,
|
||||
double minVerticalDistance,
|
||||
double maxVerticalDistance) {
|
||||
|
||||
DisjointSets<Line> sets = new DisjointSets<>(zone.getLines());
|
||||
for (int i = 0; i < zone.getLines().size(); i++) {
|
||||
Line li = zone.getLines().get(i);
|
||||
for (int j = i + 1; j < zone.getLines().size(); j++) {
|
||||
Line lj = zone.getLines().get(j);
|
||||
double hDist = li.horizontalDistance(lj);
|
||||
double vDist = li.verticalDistance(lj);
|
||||
if (minHorizontalDistance <= hDist && hDist <= maxHorizontalDistance && minVerticalDistance <= vDist && vDist <= maxVerticalDistance) {
|
||||
sets.union(li, lj);
|
||||
} else if (minVerticalDistance <= vDist && vDist <= maxVerticalDistance && Math.abs(hDist - Math.min(li.getLength(), lj.getLength())) < 0.1) {
|
||||
boolean componentOverlap = false;
|
||||
int overlappingCount = 0;
|
||||
for (Character ci : li.getCharacters()) {
|
||||
for (Character cj : lj.getCharacters()) {
|
||||
double dist = ci.overlappingDistance(cj);
|
||||
if (dist > 2) {
|
||||
componentOverlap = true;
|
||||
}
|
||||
if (dist > 0) {
|
||||
overlappingCount++;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!componentOverlap && overlappingCount <= 2) {
|
||||
sets.union(li, lj);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
List<Line> outputZone = new ArrayList<>();
|
||||
for (Set<Line> group : sets) {
|
||||
List<Character> components = new ArrayList<>();
|
||||
for (Line line : group) {
|
||||
components.addAll(line.getCharacters());
|
||||
}
|
||||
components.sort(Comparator.comparingDouble(Character::getX));
|
||||
|
||||
outputZone.add(new Line(components, wordSpacing));
|
||||
}
|
||||
return new Zone(outputZone);
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,32 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model;
|
||||
|
||||
public class AngleFilter {
|
||||
|
||||
protected double lowerAngle;
|
||||
protected double upperAngle;
|
||||
|
||||
|
||||
public AngleFilter(double lowerAngle, double upperAngle) {
|
||||
|
||||
if (lowerAngle < -Math.PI / 2) {
|
||||
lowerAngle += Math.PI;
|
||||
}
|
||||
if (upperAngle >= Math.PI / 2) {
|
||||
upperAngle -= Math.PI;
|
||||
}
|
||||
|
||||
this.lowerAngle = lowerAngle;
|
||||
this.upperAngle = upperAngle;
|
||||
}
|
||||
|
||||
|
||||
public boolean matches(Neighbor neighbor) {
|
||||
|
||||
if (lowerAngle <= upperAngle) {
|
||||
return lowerAngle <= neighbor.getAngle() && neighbor.getAngle() < upperAngle;
|
||||
} else {
|
||||
return lowerAngle <= neighbor.getAngle() || neighbor.getAngle() < upperAngle;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,48 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
public abstract class BoundingBox {
|
||||
|
||||
private Rectangle2D bBox;
|
||||
|
||||
|
||||
public double getX() {
|
||||
|
||||
return bBox.getX();
|
||||
}
|
||||
|
||||
|
||||
public double getY() {
|
||||
|
||||
return bBox.getY();
|
||||
}
|
||||
|
||||
|
||||
public double getWidth() {
|
||||
|
||||
return bBox.getWidth();
|
||||
}
|
||||
|
||||
|
||||
public double getHeight() {
|
||||
|
||||
return bBox.getHeight();
|
||||
}
|
||||
|
||||
|
||||
public double getArea() {
|
||||
|
||||
return (bBox.getHeight() * bBox.getWidth());
|
||||
}
|
||||
|
||||
|
||||
public boolean contains(Rectangle2D contained, double tolerance) {
|
||||
|
||||
return bBox.getX() <= contained.getX() + tolerance && bBox.getY() <= contained.getY() + tolerance && bBox.getX() + bBox.getWidth() >= contained.getX() + contained.getWidth() - tolerance && bBox.getY() + bBox.getHeight() >= contained.getY() + contained.getHeight() - tolerance;
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,84 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
public class Character {
|
||||
|
||||
private final double x;
|
||||
private final double y;
|
||||
private final RedTextPosition textPosition;
|
||||
|
||||
private List<Neighbor> neighbors = new ArrayList<>();
|
||||
|
||||
|
||||
public Character(RedTextPosition chunk) {
|
||||
|
||||
this.x = chunk.getXDirAdj() + chunk.getWidthDirAdj() / 2;
|
||||
this.y = chunk.getYDirAdj() + chunk.getHeightDir() / 2;
|
||||
this.textPosition = chunk;
|
||||
}
|
||||
|
||||
|
||||
public double getHeight() {
|
||||
|
||||
return textPosition.getHeightDir();
|
||||
}
|
||||
|
||||
|
||||
public double distance(Character character) {
|
||||
|
||||
double dx = getX() - character.getX();
|
||||
double dy = getY() - character.getY();
|
||||
return Math.sqrt(dx * dx + dy * dy);
|
||||
}
|
||||
|
||||
|
||||
public double horizontalDistance(Character character) {
|
||||
|
||||
return Math.abs(getX() - character.getX());
|
||||
}
|
||||
|
||||
|
||||
public double verticalDistance(Character character) {
|
||||
|
||||
return Math.abs(getY() - character.getY());
|
||||
}
|
||||
|
||||
|
||||
public double overlappingDistance(Character other) {
|
||||
|
||||
double[] xs = new double[4];
|
||||
double s = Math.sin(-0), c = Math.cos(-0);
|
||||
xs[0] = c * x - s * y;
|
||||
xs[1] = c * (x + textPosition.getWidthDirAdj()) - s * (y + textPosition.getHeightDir());
|
||||
xs[2] = c * other.x - s * other.y;
|
||||
xs[3] = c * (other.x + other.textPosition.getWidthDirAdj()) - s * (other.y + other.textPosition.getHeightDir());
|
||||
boolean overlapping = xs[1] >= xs[2] && xs[3] >= xs[0];
|
||||
Arrays.sort(xs);
|
||||
return Math.abs(xs[2] - xs[1]) * (overlapping ? 1 : -1);
|
||||
}
|
||||
|
||||
|
||||
public void setNeighbors(List<Neighbor> neighbors) {
|
||||
|
||||
this.neighbors = neighbors;
|
||||
}
|
||||
|
||||
|
||||
public double angle(Character character) {
|
||||
|
||||
if (getX() > character.getX()) {
|
||||
return Math.atan2(getY() - character.getY(), getX() - character.getX());
|
||||
} else {
|
||||
return Math.atan2(character.getY() - getY(), character.getX() - getX());
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,194 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model;
|
||||
|
||||
import java.util.AbstractSet;
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.Map;
|
||||
import java.util.NoSuchElementException;
|
||||
import java.util.Set;
|
||||
|
||||
public class DisjointSets<E> implements Iterable<Set<E>> {
|
||||
|
||||
private final Map<E, Entry<E>> map = new HashMap<>();
|
||||
|
||||
|
||||
public DisjointSets(Collection<? extends E> collection) {
|
||||
|
||||
for (E element : collection) {
|
||||
map.put(element, new Entry<E>(element));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public boolean areTogether(E e1, E e2) {
|
||||
|
||||
return map.get(e1).findRepresentative() == map.get(e2).findRepresentative();
|
||||
}
|
||||
|
||||
|
||||
public void union(E e1, E e2) {
|
||||
|
||||
Entry<E> r1 = map.get(e1).findRepresentative();
|
||||
Entry<E> r2 = map.get(e2).findRepresentative();
|
||||
if (r1 != r2) {
|
||||
if (r1.size <= r2.size) {
|
||||
r2.mergeWith(r1);
|
||||
} else {
|
||||
r1.mergeWith(r2);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Iterator<Set<E>> iterator() {
|
||||
|
||||
return new Iterator<>() {
|
||||
|
||||
private final Iterator<Entry<E>> iterator = map.values().iterator();
|
||||
private Entry<E> nextRepresentative;
|
||||
|
||||
{
|
||||
findNextRepresentative();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
|
||||
return nextRepresentative != null;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Set<E> next() {
|
||||
|
||||
if (nextRepresentative == null) {
|
||||
throw new NoSuchElementException();
|
||||
}
|
||||
Set<E> result = nextRepresentative.asSet();
|
||||
findNextRepresentative();
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
private void findNextRepresentative() {
|
||||
|
||||
while (iterator.hasNext()) {
|
||||
Entry<E> candidate = iterator.next();
|
||||
if (candidate.isRepresentative()) {
|
||||
nextRepresentative = candidate;
|
||||
return;
|
||||
}
|
||||
}
|
||||
nextRepresentative = null;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void remove() {
|
||||
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
private static class Entry<E> {
|
||||
|
||||
private int size = 1;
|
||||
private final E value;
|
||||
private Entry<E> parent = this;
|
||||
private Entry<E> next = null;
|
||||
private Entry<E> last = this;
|
||||
|
||||
|
||||
Entry(E value) {
|
||||
|
||||
this.value = value;
|
||||
}
|
||||
|
||||
|
||||
void mergeWith(Entry<E> otherRepresentative) {
|
||||
|
||||
size += otherRepresentative.size;
|
||||
last.next = otherRepresentative;
|
||||
last = otherRepresentative.last;
|
||||
otherRepresentative.parent = this;
|
||||
}
|
||||
|
||||
|
||||
Entry<E> findRepresentative() {
|
||||
|
||||
Entry<E> representative = parent;
|
||||
while (representative.parent != representative) {
|
||||
representative = representative.parent;
|
||||
}
|
||||
for (Entry<E> entry = this; entry != representative; ) {
|
||||
Entry<E> nextEntry = entry.parent;
|
||||
entry.parent = representative;
|
||||
entry = nextEntry;
|
||||
}
|
||||
return representative;
|
||||
}
|
||||
|
||||
|
||||
boolean isRepresentative() {
|
||||
|
||||
return parent == this;
|
||||
}
|
||||
|
||||
|
||||
Set<E> asSet() {
|
||||
|
||||
return new AbstractSet<E>() {
|
||||
|
||||
@Override
|
||||
public Iterator<E> iterator() {
|
||||
|
||||
return new Iterator<E>() {
|
||||
|
||||
private Entry<E> nextEntry = findRepresentative();
|
||||
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
|
||||
return nextEntry != null;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public E next() {
|
||||
|
||||
if (nextEntry == null) {
|
||||
throw new NoSuchElementException();
|
||||
}
|
||||
E result = nextEntry.value;
|
||||
nextEntry = nextEntry.next;
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void remove() {
|
||||
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public int size() {
|
||||
|
||||
return findRepresentative().size;
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,91 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model;
|
||||
|
||||
public class Histogram {
|
||||
|
||||
private static final double EPSILON = 1.0e-6;
|
||||
private final double min;
|
||||
private final double resolution;
|
||||
private double[] frequencies;
|
||||
|
||||
|
||||
public Histogram(double minValue, double maxValue, double resolution) {
|
||||
|
||||
this.min = minValue - EPSILON;
|
||||
double delta = maxValue - minValue + 2 * EPSILON;
|
||||
int size = Math.max(1, (int) Math.round((maxValue - minValue) / resolution));
|
||||
this.resolution = delta / size;
|
||||
this.frequencies = new double[size];
|
||||
}
|
||||
|
||||
|
||||
public void kernelSmooth(double[] kernel) {
|
||||
|
||||
double[] newFrequencies = new double[frequencies.length];
|
||||
int shift = (kernel.length - 1) / 2;
|
||||
for (int i = 0; i < kernel.length; i++) {
|
||||
int jStart = Math.max(0, i - shift);
|
||||
int jEnd = Math.min(frequencies.length, frequencies.length + i - shift);
|
||||
for (int j = jStart; j < jEnd; j++) {
|
||||
newFrequencies[j - i + shift] += kernel[i] * frequencies[j];
|
||||
}
|
||||
}
|
||||
frequencies = newFrequencies;
|
||||
}
|
||||
|
||||
|
||||
public double[] createGaussianKernel(double length, double stdDeviation) {
|
||||
|
||||
int r = (int) Math.round(length / resolution) / 2;
|
||||
stdDeviation /= resolution;
|
||||
|
||||
int size = 2 * r + 1;
|
||||
double[] kernel = new double[size];
|
||||
double sum = 0;
|
||||
double b = 2 * stdDeviation * stdDeviation;
|
||||
double a = 1 / Math.sqrt(Math.PI * b);
|
||||
for (int i = 0; i < size; i++) {
|
||||
kernel[i] = a * Math.exp(-(i - r) * (i - r) / b);
|
||||
sum += kernel[i];
|
||||
}
|
||||
for (int i = 0; i < size; i++) {
|
||||
kernel[i] /= sum;
|
||||
}
|
||||
return kernel;
|
||||
}
|
||||
|
||||
|
||||
public void gaussianSmooth(double windowLength, double stdDeviation) {
|
||||
|
||||
kernelSmooth(createGaussianKernel(windowLength, stdDeviation));
|
||||
}
|
||||
|
||||
|
||||
public void add(double value) {
|
||||
|
||||
frequencies[(int) ((value - min) / resolution)] += 1.0;
|
||||
}
|
||||
|
||||
|
||||
public int getSize() {
|
||||
|
||||
return frequencies.length;
|
||||
}
|
||||
|
||||
|
||||
public double getPeakValue() {
|
||||
|
||||
int peakIndex = 0;
|
||||
for (int i = 1; i < frequencies.length; i++) {
|
||||
if (frequencies[i] > frequencies[peakIndex]) {
|
||||
peakIndex = i;
|
||||
}
|
||||
}
|
||||
int peakEndIndex = peakIndex + 1;
|
||||
final double EPS = 0.0001;
|
||||
while (peakEndIndex < frequencies.length && Math.abs(frequencies[peakEndIndex] - frequencies[peakIndex]) < EPS) {
|
||||
peakEndIndex++;
|
||||
}
|
||||
return ((double) peakIndex + peakEndIndex) / 2 * resolution + min;
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,165 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
public class Line extends BoundingBox {
|
||||
|
||||
private static final double WORD_DISTANCE_MULTIPLIER = 0.2;
|
||||
|
||||
private final double x0;
|
||||
private final double y0;
|
||||
|
||||
private final double x1;
|
||||
private final double y1;
|
||||
|
||||
private final double height;
|
||||
|
||||
private final List<Character> characters;
|
||||
private final List<TextPositionSequence> words = new ArrayList<>();
|
||||
|
||||
|
||||
public Line(List<Character> characters, double wordSpacing) {
|
||||
|
||||
this.characters = characters;
|
||||
|
||||
if (characters.size() >= 2) {
|
||||
// linear regression
|
||||
double sx = 0.0, sxx = 0.0, sxy = 0.0, sy = 0.0;
|
||||
for (Character character : characters) {
|
||||
sx += character.getX();
|
||||
sxx += character.getX() * character.getX();
|
||||
sxy += character.getX() * character.getY();
|
||||
sy += character.getY();
|
||||
}
|
||||
double b = (characters.size() * sxy - sx * sy) / (characters.size() * sxx - sx * sx);
|
||||
double a = (sy - b * sx) / characters.size();
|
||||
|
||||
this.x0 = characters.get(0).getX();
|
||||
this.y0 = a + b * this.x0;
|
||||
this.x1 = characters.get(characters.size() - 1).getX();
|
||||
this.y1 = a + b * this.x1;
|
||||
} else {
|
||||
Character character = characters.get(0);
|
||||
double dx = character.getTextPosition().getWidthDirAdj() / 3;
|
||||
double dy = dx * Math.tan(0);
|
||||
this.x0 = character.getX() - dx;
|
||||
this.x1 = character.getX() + dx;
|
||||
this.y0 = character.getY() - dy;
|
||||
this.y1 = character.getY() + dy;
|
||||
}
|
||||
height = computeHeight();
|
||||
computeWords(wordSpacing * WORD_DISTANCE_MULTIPLIER);
|
||||
buildBBox();
|
||||
}
|
||||
|
||||
|
||||
public double getAngle() {
|
||||
|
||||
return Math.atan2(y1 - y0, x1 - x0);
|
||||
}
|
||||
|
||||
|
||||
public double getLength() {
|
||||
|
||||
return Math.sqrt((x0 - x1) * (x0 - x1) + (y0 - y1) * (y0 - y1));
|
||||
}
|
||||
|
||||
|
||||
private double computeHeight() {
|
||||
|
||||
double sum = 0.0;
|
||||
for (Character component : characters) {
|
||||
sum += component.getHeight();
|
||||
}
|
||||
return sum / characters.size();
|
||||
}
|
||||
|
||||
|
||||
public double angularDifference(Line j) {
|
||||
|
||||
double diff = Math.abs(getAngle() - j.getAngle());
|
||||
if (diff <= Math.PI / 2) {
|
||||
return diff;
|
||||
} else {
|
||||
return Math.PI - diff;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public double horizontalDistance(Line other) {
|
||||
|
||||
double[] xs = new double[4];
|
||||
xs[0] = x0;
|
||||
xs[1] = x1;
|
||||
xs[2] = other.x0;
|
||||
xs[3] = other.x1;
|
||||
boolean overlapping = xs[1] >= xs[2] && xs[3] >= xs[0];
|
||||
Arrays.sort(xs);
|
||||
return Math.abs(xs[2] - xs[1]) * (overlapping ? 1 : -1);
|
||||
}
|
||||
|
||||
|
||||
public double verticalDistance(Line other) {
|
||||
|
||||
double ym = (y0 + y1) / 2;
|
||||
double yn = (other.y0 + other.y1) / 2;
|
||||
return Math.abs(ym - yn) / Math.sqrt(1);
|
||||
}
|
||||
|
||||
|
||||
private void computeWords(double wordSpacing) {
|
||||
|
||||
TextPositionSequence word = new TextPositionSequence();
|
||||
Character previous = null;
|
||||
for (Character current : characters) {
|
||||
if (previous != null) {
|
||||
double dist = current.getTextPosition().getXDirAdj() - previous.getTextPosition().getXDirAdj() - previous.getTextPosition().getWidthDirAdj();
|
||||
if (dist > wordSpacing) {
|
||||
words.add(word);
|
||||
word = new TextPositionSequence();
|
||||
}
|
||||
}
|
||||
word.getTextPositions().add(current.getTextPosition());
|
||||
previous = current;
|
||||
}
|
||||
words.add(word);
|
||||
}
|
||||
|
||||
|
||||
private void buildBBox() {
|
||||
|
||||
double minX = Double.POSITIVE_INFINITY;
|
||||
double minY = Double.POSITIVE_INFINITY;
|
||||
double maxX = Double.NEGATIVE_INFINITY;
|
||||
double maxY = Double.NEGATIVE_INFINITY;
|
||||
|
||||
for (Character character : characters) {
|
||||
|
||||
minX = Math.min(minX, character.getTextPosition().getXDirAdj());
|
||||
minY = Math.min(minY, character.getTextPosition().getYDirAdj());
|
||||
maxX = Math.max(maxX, character.getTextPosition().getXDirAdj() + character.getTextPosition().getWidthDirAdj());
|
||||
maxY = Math.max(maxY, character.getTextPosition().getYDirAdj() + character.getTextPosition().getHeightDir());
|
||||
|
||||
}
|
||||
|
||||
this.setBBox(new Rectangle2D.Double(minX, minY, maxX - minX, maxY - minY));
|
||||
}
|
||||
|
||||
|
||||
public String toString() {
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
words.forEach(word -> sb.append(word.toString()).append(" "));
|
||||
return sb.toString().trim();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -0,0 +1,36 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model;
|
||||
|
||||
import lombok.Getter;
|
||||
|
||||
public class Neighbor {
|
||||
|
||||
@Getter
|
||||
private final double distance;
|
||||
@Getter
|
||||
private final double angle;
|
||||
private final Character originCharacter;
|
||||
@Getter
|
||||
private final Character character;
|
||||
|
||||
|
||||
public Neighbor(Character neighbor, Character origin) {
|
||||
|
||||
this.distance = neighbor.distance(origin);
|
||||
this.angle = neighbor.angle(origin);
|
||||
this.character = neighbor;
|
||||
this.originCharacter = origin;
|
||||
}
|
||||
|
||||
|
||||
public double getHorizontalDistance() {
|
||||
|
||||
return character.horizontalDistance(originCharacter);
|
||||
}
|
||||
|
||||
|
||||
public double getVerticalDistance() {
|
||||
|
||||
return character.verticalDistance(originCharacter);
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,50 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
public class Zone extends BoundingBox {
|
||||
|
||||
private List<Line> lines;
|
||||
|
||||
|
||||
public Zone(List<Line> lines) {
|
||||
|
||||
lines.sort(Comparator.comparingDouble(Line::getY));
|
||||
this.lines = lines;
|
||||
buildBBox();
|
||||
}
|
||||
|
||||
|
||||
public void buildBBox() {
|
||||
|
||||
double minX = Double.POSITIVE_INFINITY;
|
||||
double minY = Double.POSITIVE_INFINITY;
|
||||
double maxX = Double.NEGATIVE_INFINITY;
|
||||
double maxY = Double.NEGATIVE_INFINITY;
|
||||
|
||||
for (Line line : lines) {
|
||||
|
||||
minX = Math.min(minX, line.getX());
|
||||
minY = Math.min(minY, line.getY());
|
||||
maxX = Math.max(maxX, line.getX() + line.getWidth());
|
||||
maxY = Math.max(maxY, line.getY() + line.getHeight());
|
||||
|
||||
}
|
||||
|
||||
this.setBBox(new Rectangle2D.Double(minX, minY, maxX - minX, maxY - minY));
|
||||
}
|
||||
|
||||
|
||||
public String toString() {
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
lines.forEach(line -> sb.append(line.toString()).append("\n"));
|
||||
return sb.toString().trim();
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,50 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.AngleFilter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Character;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.DisjointSets;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Line;
|
||||
|
||||
@Service
|
||||
public class LineBuilderService {
|
||||
|
||||
private static final double CHARACTER_SPACING_DISTANCE_MULTIPLIER = 3.5;
|
||||
private static final double MAX_VERTICAL_CHARACTER_DISTANCE = 0.67;
|
||||
private static final double ANGLE_TOLERANCE = Math.PI / 6;
|
||||
|
||||
|
||||
public List<Line> buildLines(List<Character> characters, double characterSpacing, double lineSpacing) {
|
||||
|
||||
double maxHorizontalDistance = characterSpacing * CHARACTER_SPACING_DISTANCE_MULTIPLIER;
|
||||
double maxVerticalDistance = lineSpacing * MAX_VERTICAL_CHARACTER_DISTANCE;
|
||||
|
||||
DisjointSets<Character> sets = new DisjointSets<>(characters);
|
||||
AngleFilter filter = new AngleFilter(-ANGLE_TOLERANCE, ANGLE_TOLERANCE);
|
||||
|
||||
characters.forEach(character -> {
|
||||
character.getNeighbors().forEach(neighbor -> {
|
||||
double x = neighbor.getHorizontalDistance() / maxHorizontalDistance;
|
||||
double y = neighbor.getVerticalDistance() / maxVerticalDistance;
|
||||
if (filter.matches(neighbor) && Math.pow(x, 2) + Math.pow(y, 2) <= 1) {
|
||||
sets.union(character, neighbor.getCharacter());
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
List<Line> lines = new ArrayList<>();
|
||||
sets.forEach(group -> {
|
||||
List<Character> lineComponents = new ArrayList<>(group);
|
||||
lineComponents.sort(Comparator.comparingDouble(Character::getX));
|
||||
lines.add(new Line(lineComponents, characterSpacing));
|
||||
});
|
||||
|
||||
return lines;
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,78 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Character;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Neighbor;
|
||||
|
||||
@Service
|
||||
public class NearestNeighbourService {
|
||||
|
||||
private static final int NUMBER_OF_NEIGHBOURS = 8;
|
||||
private static final double STEP = 16.0;
|
||||
|
||||
|
||||
public void findNearestNeighbors(List<Character> characters) {
|
||||
|
||||
if (characters.isEmpty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
characters.sort(Comparator.comparingDouble(Character::getX));
|
||||
|
||||
int maxNeighborCount = NUMBER_OF_NEIGHBOURS;
|
||||
if (characters.size() <= NUMBER_OF_NEIGHBOURS) {
|
||||
maxNeighborCount = characters.size() - 1;
|
||||
}
|
||||
|
||||
for (int i = 0; i < characters.size(); i++) {
|
||||
|
||||
List<Neighbor> candidates = new ArrayList<>();
|
||||
|
||||
int start = i;
|
||||
int end = i + 1;
|
||||
|
||||
double distance = Double.POSITIVE_INFINITY;
|
||||
|
||||
for (double searchDistance = 0; searchDistance < distance; ) {
|
||||
|
||||
searchDistance += STEP;
|
||||
boolean newCandidatesFound = false;
|
||||
|
||||
while (start > 0 && characters.get(i).getX() - characters.get(start - 1).getX() < searchDistance) {
|
||||
start--;
|
||||
candidates.add(new Neighbor(characters.get(start), characters.get(i)));
|
||||
clearLeastDistant(candidates, maxNeighborCount);
|
||||
newCandidatesFound = true;
|
||||
}
|
||||
|
||||
while (end < characters.size() && characters.get(end).getX() - characters.get(i).getX() < searchDistance) {
|
||||
candidates.add(new Neighbor(characters.get(end), characters.get(i)));
|
||||
clearLeastDistant(candidates, maxNeighborCount);
|
||||
end++;
|
||||
newCandidatesFound = true;
|
||||
}
|
||||
|
||||
if (newCandidatesFound && candidates.size() >= maxNeighborCount) {
|
||||
distance = candidates.get(maxNeighborCount - 1).getDistance();
|
||||
}
|
||||
}
|
||||
clearLeastDistant(candidates, maxNeighborCount);
|
||||
characters.get(i).setNeighbors(new ArrayList<>(candidates));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void clearLeastDistant(List<Neighbor> candidates, int maxNeighborCount) {
|
||||
|
||||
if (candidates.size() > maxNeighborCount) {
|
||||
candidates.sort(Comparator.comparingDouble(Neighbor::getDistance));
|
||||
candidates.remove(candidates.remove(candidates.size() - 1));
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,99 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
import java.util.ListIterator;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.BoundingBox;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Zone;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.utils.DoubleUtils;
|
||||
|
||||
@Service
|
||||
public class ReadingOrderService {
|
||||
|
||||
private static final double THRESHOLD = 1;
|
||||
|
||||
|
||||
public List<Zone> resolve(List<Zone> zones, boolean yxOrder) {
|
||||
|
||||
if (zones.isEmpty() || zones.size() == 1) {
|
||||
return zones;
|
||||
}
|
||||
|
||||
if (yxOrder) {
|
||||
zones.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
|
||||
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
|
||||
return zones;
|
||||
}
|
||||
|
||||
return resolveMultiColumnReadingOder(zones);
|
||||
}
|
||||
|
||||
|
||||
private List<Zone> resolveMultiColumnReadingOder(List<Zone> zones) {
|
||||
|
||||
// Simple reading order resolver for multi column page layout as described here : https://pub.towardsai.net/advanced-rag-02-unveiling-pdf-parsing-b84ae866344e
|
||||
// TODO implement a more fancy reading order resolver see https://github.com/BobLd/DocumentLayoutAnalysis/blob/master/README.md#reading-order
|
||||
|
||||
double minX = Double.POSITIVE_INFINITY;
|
||||
double maxX = Double.NEGATIVE_INFINITY;
|
||||
|
||||
for (Zone zone : zones) {
|
||||
if (zone.getX() < minX) {
|
||||
minX = zone.getX();
|
||||
}
|
||||
if (zone.getX() + zone.getWidth() > maxX) {
|
||||
maxX = zone.getX() + zone.getWidth();
|
||||
}
|
||||
}
|
||||
|
||||
double midLineXCoordinate = (minX + maxX) / 2;
|
||||
|
||||
List<Zone> leftOf = new ArrayList<>();
|
||||
List<Zone> rightOf = new ArrayList<>();
|
||||
List<Zone> middle = new ArrayList<>();
|
||||
for (Zone zone : zones) {
|
||||
if (zone.getX() < midLineXCoordinate && zone.getX() + zone.getWidth() < midLineXCoordinate) {
|
||||
leftOf.add(zone);
|
||||
} else if (zone.getX() > midLineXCoordinate && zone.getX() + zone.getWidth() > midLineXCoordinate) {
|
||||
rightOf.add(zone);
|
||||
} else {
|
||||
middle.add(zone);
|
||||
}
|
||||
}
|
||||
|
||||
leftOf.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
|
||||
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
|
||||
|
||||
rightOf.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
|
||||
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
|
||||
|
||||
middle.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
|
||||
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
|
||||
|
||||
List<Zone> sortedZones = new ArrayList<>();
|
||||
sortedZones.addAll(leftOf);
|
||||
sortedZones.addAll(rightOf);
|
||||
|
||||
ListIterator<Zone> itty = middle.listIterator();
|
||||
|
||||
while (itty.hasNext()) {
|
||||
Zone current = itty.next();
|
||||
for (int i = 0; i < sortedZones.size(); i++) {
|
||||
if (current.getY() < sortedZones.get(i).getY()) {
|
||||
sortedZones.add(i, current);
|
||||
itty.remove();
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
sortedZones.addAll(middle);
|
||||
|
||||
return sortedZones;
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,56 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.AngleFilter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Character;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Histogram;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Neighbor;
|
||||
|
||||
@Service
|
||||
public class SpacingService {
|
||||
|
||||
private static final double SPACING_HISTOGRAM_RESOLUTION = 0.5;
|
||||
private static final double SPACING_HISTOGRAM_SMOOTHING_LENGTH = 2.5;
|
||||
private static final double SPACING_HIST_SMOOTHING_STANDARD_DEVIATION = 0.5;
|
||||
private static final double ANGLE_TOLERANCE = Math.PI / 6;
|
||||
|
||||
|
||||
public double computeCharacterSpacing(List<Character> components) {
|
||||
|
||||
return computeSpacing(components, 0);
|
||||
}
|
||||
|
||||
|
||||
public double computeLineSpacing(List<Character> components) {
|
||||
|
||||
return computeSpacing(components, Math.PI / 2);
|
||||
}
|
||||
|
||||
|
||||
private double computeSpacing(List<Character> components, double angle) {
|
||||
|
||||
double maxDistance = Double.NEGATIVE_INFINITY;
|
||||
|
||||
for (Character component : components) {
|
||||
for (Neighbor neighbor : component.getNeighbors()) {
|
||||
maxDistance = Math.max(maxDistance, neighbor.getDistance());
|
||||
}
|
||||
}
|
||||
Histogram histogram = new Histogram(0, maxDistance, SPACING_HISTOGRAM_RESOLUTION);
|
||||
AngleFilter filter = new AngleFilter(angle - ANGLE_TOLERANCE, angle + ANGLE_TOLERANCE);
|
||||
for (Character component : components) {
|
||||
for (Neighbor neighbor : component.getNeighbors()) {
|
||||
if (filter.matches(neighbor)) {
|
||||
histogram.add(neighbor.getDistance());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
histogram.gaussianSmooth(SPACING_HISTOGRAM_SMOOTHING_LENGTH, SPACING_HIST_SMOOTHING_STANDARD_DEVIATION);
|
||||
return histogram.getPeakValue();
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,90 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.DisjointSets;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Line;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Zone;
|
||||
|
||||
@Service
|
||||
public class ZoneBuilderService {
|
||||
|
||||
private static final double MIN_HORIZONTAL_DISTANCE_MULTIPLIER = -0.5;
|
||||
private static final double MAX_VERTICAL_DISTANCE_MULTIPLIER = 1.2;
|
||||
|
||||
private static final double MIN_HORIZONTAL_MERGE_DISTANCE_MULTIPLIER = -3.0;
|
||||
|
||||
private static final double MAX_VERTICAL_MERGE_DISTANCE_MULTIPLIER = 0.5;
|
||||
|
||||
private static final double MIN_LINE_SIZE_SCALE = 0.9;
|
||||
|
||||
private static final double MAX_LINE_SIZE_SCALE = 2.5;
|
||||
|
||||
private static final double ANGLE_TOLERANCE = Math.PI / 6;
|
||||
|
||||
public static final int MAX_ZONES = 300;
|
||||
|
||||
|
||||
public List<Zone> buildZones(List<Line> lines, double characterSpacing, double lineSpacing) {
|
||||
|
||||
double minHorizontalDistance = characterSpacing * MIN_HORIZONTAL_DISTANCE_MULTIPLIER;
|
||||
double maxVerticalDistance = lineSpacing * MAX_VERTICAL_DISTANCE_MULTIPLIER;
|
||||
double minHorizontalMergeDistance = characterSpacing * MIN_HORIZONTAL_MERGE_DISTANCE_MULTIPLIER;
|
||||
double maxVerticalMergeDistance = lineSpacing * MAX_VERTICAL_MERGE_DISTANCE_MULTIPLIER;
|
||||
|
||||
DisjointSets<Line> sets = new DisjointSets<>(lines);
|
||||
|
||||
double meanHeight = calculateMeanHeight(lines);
|
||||
|
||||
lines.forEach(outerLine -> //
|
||||
lines.forEach(innerLine -> {
|
||||
|
||||
double scale = Math.min(outerLine.getHeight(), innerLine.getHeight()) / meanHeight;
|
||||
scale = Math.max(MIN_LINE_SIZE_SCALE, Math.min(scale, MAX_LINE_SIZE_SCALE));
|
||||
|
||||
if (!sets.areTogether(outerLine, innerLine) && outerLine.angularDifference(innerLine) <= ANGLE_TOLERANCE) {
|
||||
|
||||
double horizontalDistance = outerLine.horizontalDistance(innerLine) / scale;
|
||||
double verticalDistance = outerLine.verticalDistance(innerLine) / scale;
|
||||
|
||||
if (minHorizontalDistance <= horizontalDistance && verticalDistance <= maxVerticalDistance //
|
||||
|| minHorizontalMergeDistance <= horizontalDistance && verticalDistance <= maxVerticalMergeDistance) {
|
||||
sets.union(outerLine, innerLine);
|
||||
}
|
||||
}
|
||||
}));
|
||||
|
||||
List<Zone> zones = new ArrayList<>();
|
||||
sets.forEach(group -> {
|
||||
zones.add(new Zone(new ArrayList<>(group)));
|
||||
});
|
||||
|
||||
if (zones.size() > MAX_ZONES) {
|
||||
List<Line> oneZoneLines = new ArrayList<>();
|
||||
for (Zone zone : zones) {
|
||||
oneZoneLines.addAll(zone.getLines());
|
||||
}
|
||||
return List.of(new Zone(oneZoneLines));
|
||||
}
|
||||
|
||||
return zones;
|
||||
}
|
||||
|
||||
|
||||
private double calculateMeanHeight(List<Line> lines) {
|
||||
|
||||
double meanHeight = 0.0;
|
||||
double weights = 0.0;
|
||||
for (Line line : lines) {
|
||||
double weight = line.getLength();
|
||||
meanHeight += line.getHeight() * weight;
|
||||
weights += weight;
|
||||
}
|
||||
meanHeight /= weights;
|
||||
return meanHeight;
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,18 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.utils;
|
||||
|
||||
public class DoubleUtils {
|
||||
|
||||
public static int compareDouble(double d1, double d2, double precision) {
|
||||
|
||||
if (Double.isNaN(d1) || Double.isNaN(d2)) {
|
||||
return Double.compare(d1, d2);
|
||||
}
|
||||
if (precision == 0) {
|
||||
precision = 1;
|
||||
}
|
||||
long i1 = Math.round(d1 / precision);
|
||||
long i2 = Math.round(d2 / precision);
|
||||
return Long.valueOf(i1).compareTo(i2);
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,270 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.visualization;
|
||||
|
||||
import java.awt.Color;
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.PDPageContentStream;
|
||||
import org.apache.pdfbox.pdmodel.font.PDType1Font;
|
||||
import org.apache.pdfbox.pdmodel.font.Standard14Fonts;
|
||||
import org.apache.pdfbox.util.Matrix;
|
||||
import org.springframework.core.io.ClassPathResource;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.PdfVisualisationUtility;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Getter;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class PdfDraw {
|
||||
|
||||
public static void drawRectanglesPerPage(String filename, List<List<Rectangle2D>> rectanglesPerPage, String tmpFileName) throws IOException {
|
||||
|
||||
ClassPathResource pdfResource = new ClassPathResource(filename);
|
||||
try (PDDocument pdDocument = Loader.loadPDF(pdfResource.getFile()); var out = new FileOutputStream(tmpFileName)) {
|
||||
for (int pageNumber = 1; pageNumber < pdDocument.getNumberOfPages() + 1; pageNumber++) {
|
||||
PdfVisualisationUtility.drawRectangle2DList(pdDocument,
|
||||
pageNumber,
|
||||
rectanglesPerPage.get(pageNumber - 1),
|
||||
PdfVisualisationUtility.Options.builder().stroke(true).build());
|
||||
}
|
||||
pdDocument.save(out);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
public static void drawRectanglesPerPageNumberedByLine(String filename, List<List<List<Rectangle2D>>> rectanglesPerPage, String tmpFileName) throws IOException {
|
||||
|
||||
ClassPathResource pdfResource = new ClassPathResource(filename);
|
||||
try (PDDocument pdDocument = Loader.loadPDF(pdfResource.getFile()); var out = new FileOutputStream(tmpFileName)) {
|
||||
for (int pageNumber = 1; pageNumber < pdDocument.getNumberOfPages() + 1; pageNumber++) {
|
||||
var rectanglesOnPage = rectanglesPerPage.get(pageNumber - 1);
|
||||
for (int lineNumber = 0; lineNumber < rectanglesOnPage.size(); lineNumber++) {
|
||||
var rectanglesInLine = rectanglesOnPage.get(lineNumber);
|
||||
PdfVisualisationUtility.drawRectangle2DList(pdDocument, pageNumber, rectanglesInLine, PdfVisualisationUtility.Options.builder().stroke(true).build());
|
||||
double y = Math.min(rectanglesInLine.get(0).getMinY(), rectanglesInLine.get(0).getMaxY());
|
||||
PdfVisualisationUtility.drawText(String.format("%d", lineNumber),
|
||||
pdDocument,
|
||||
new Point2D.Double(rectanglesInLine.get(0).getX() - (5 + (5 * countNumberOfDigits(lineNumber))), y + 2),
|
||||
pageNumber,
|
||||
PdfVisualisationUtility.Options.builder().stroke(true).build());
|
||||
}
|
||||
}
|
||||
pdDocument.save(out);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
private static int countNumberOfDigits(int num) {
|
||||
|
||||
int final_num = num;
|
||||
if (final_num == 0) {
|
||||
return 1;
|
||||
}
|
||||
int count = 0;
|
||||
for (; final_num != 0; final_num /= 10) {
|
||||
count++;
|
||||
}
|
||||
return count;
|
||||
}
|
||||
|
||||
|
||||
public static void drawDocumentGraph(PDDocument document, Document documentGraph) {
|
||||
|
||||
documentGraph.getDocumentTree().allEntriesInOrder().forEach(entry -> drawNode(document, entry));
|
||||
}
|
||||
|
||||
|
||||
public static void drawNode(PDDocument document, DocumentTree.Entry entry) {
|
||||
|
||||
Options options = buildStandardOptionsForNodes(entry);
|
||||
|
||||
drawBBoxAndLabelAndNumberOnPage(document, entry, options);
|
||||
|
||||
}
|
||||
|
||||
|
||||
public static void drawTextBlock(PDDocument document, TextBlock textBlock, Options options) {
|
||||
|
||||
textBlock.getAtomicTextBlocks().forEach(atb -> drawAtomicTextBlock(document, atb, options));
|
||||
}
|
||||
|
||||
|
||||
public static void drawAtomicTextBlock(PDDocument document, AtomicTextBlock atomicTextBlock, Options options) {
|
||||
|
||||
drawRectangle2DList(document, atomicTextBlock.getPage().getNumber(), atomicTextBlock.getPositions().stream().toList(), options);
|
||||
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private static void drawText(String string, PDDocument document, Point2D location, Integer pageNumber, Options options, boolean rotate) {
|
||||
|
||||
var pdPage = document.getPage(pageNumber - 1);
|
||||
var contentStream = new PDPageContentStream(document, pdPage, PDPageContentStream.AppendMode.APPEND, true);
|
||||
|
||||
contentStream.setNonStrokingColor(options.getStrokeColor());
|
||||
contentStream.setLineWidth(options.getStrokeWidth());
|
||||
|
||||
contentStream.beginText();
|
||||
if (rotate) {
|
||||
contentStream.setTextMatrix(Matrix.getRotateInstance(Math.toRadians(15), (float) location.getX(), (float) location.getY()));
|
||||
} else {
|
||||
contentStream.newLineAtOffset((float) location.getX(), (float) location.getY());
|
||||
}
|
||||
contentStream.setFont(new PDType1Font(Standard14Fonts.FontName.HELVETICA), 10);
|
||||
contentStream.showText(string);
|
||||
contentStream.endText();
|
||||
contentStream.close();
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public static void drawRectangle2DList(PDDocument document, int pageNumber, List<Rectangle2D> rectCollection, Options options) {
|
||||
|
||||
var pdPage = document.getPage(pageNumber - 1);
|
||||
drawRectangle2DList(document, rectCollection, options, pdPage);
|
||||
}
|
||||
|
||||
|
||||
private static void drawRectangle2DList(PDDocument document, List<Rectangle2D> rectCollection, Options options, PDPage pdPage) throws IOException {
|
||||
|
||||
var contentStream = new PDPageContentStream(document, pdPage, PDPageContentStream.AppendMode.APPEND, true);
|
||||
|
||||
contentStream.setStrokingColor(options.getStrokeColor());
|
||||
contentStream.setNonStrokingColor(options.getFillColor());
|
||||
contentStream.setLineWidth(options.getStrokeWidth());
|
||||
|
||||
for (var r : rectCollection) {
|
||||
contentStream.addRect((float) r.getMinX(), (float) r.getMinY(), (float) r.getWidth(), (float) r.getHeight());
|
||||
|
||||
if (options.isStroke() && options.isFill()) {
|
||||
contentStream.fillAndStroke();
|
||||
} else if (options.isStroke()) {
|
||||
contentStream.stroke();
|
||||
} else if (options.isFill()) {
|
||||
contentStream.fill();
|
||||
}
|
||||
}
|
||||
contentStream.close();
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public static void drawRectanglesAndLinesPerPage(String filename, List<List<Rectangle2D>> list, List<List<Rectangle2D>> rectanglesPerPage, String tmpFileName) {
|
||||
|
||||
ClassPathResource pdfResource = new ClassPathResource(filename);
|
||||
try (PDDocument pdDocument = Loader.loadPDF(pdfResource.getFile()); var out = new FileOutputStream(tmpFileName)) {
|
||||
for (int pageNumber = 1; pageNumber < pdDocument.getNumberOfPages() + 1; pageNumber++) {
|
||||
// PdfVisualisationUtility.drawLine2DList(pdDocument,
|
||||
// pageNumber,
|
||||
// list.get(pageNumber - 1),
|
||||
// PdfVisualisationUtility.Options.builder().stroke(true).build());
|
||||
PdfVisualisationUtility.drawRectangle2DList(pdDocument,
|
||||
pageNumber,
|
||||
rectanglesPerPage.get(pageNumber - 1),
|
||||
PdfVisualisationUtility.Options.builder().stroke(true).build());
|
||||
PdfVisualisationUtility.drawRectangle2DList(pdDocument, pageNumber, list.get(pageNumber - 1), PdfVisualisationUtility.Options.builder().stroke(true).build());
|
||||
}
|
||||
pdDocument.save(out);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public static void drawLinesPerPage(String filename, List<List<Ruling>> linesPerPage, String tmpFileName) {
|
||||
|
||||
ClassPathResource pdfResource = new ClassPathResource(filename);
|
||||
try (PDDocument pdDocument = Loader.loadPDF(pdfResource.getFile()); var out = new FileOutputStream(tmpFileName)) {
|
||||
for (int pageNumber = 1; pageNumber < pdDocument.getNumberOfPages() + 1; pageNumber++) {
|
||||
PdfVisualisationUtility.drawLine2DList(pdDocument,
|
||||
pageNumber,
|
||||
linesPerPage.get(pageNumber - 1),
|
||||
PdfVisualisationUtility.Options.builder().strokeColor(Color.RED).stroke(true).build());
|
||||
}
|
||||
pdDocument.save(out);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@Getter
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public static class Options {
|
||||
|
||||
boolean stroke;
|
||||
@Builder.Default
|
||||
Color strokeColor = Color.BLACK;
|
||||
@Builder.Default
|
||||
float strokeWidth = 1f;
|
||||
|
||||
boolean fill;
|
||||
@Builder.Default
|
||||
Color fillColor = Color.BLACK;
|
||||
|
||||
}
|
||||
|
||||
|
||||
private static Options buildStandardOptionsForNodes(DocumentTree.Entry entry) {
|
||||
|
||||
return Options.builder().stroke(true).strokeColor(switch (entry.getType()) {
|
||||
case DOCUMENT -> Color.LIGHT_GRAY;
|
||||
case HEADER, FOOTER -> Color.GREEN;
|
||||
case PARAGRAPH -> Color.BLUE;
|
||||
case HEADLINE -> Color.RED;
|
||||
case SECTION -> Color.BLACK;
|
||||
case TABLE -> Color.ORANGE;
|
||||
case TABLE_CELL -> Color.GRAY;
|
||||
case IMAGE -> Color.MAGENTA;
|
||||
}).build();
|
||||
}
|
||||
|
||||
|
||||
private static void drawBBoxAndLabelAndNumberOnPage(PDDocument document, DocumentTree.Entry entry, Options options) {
|
||||
|
||||
Map<Page, Rectangle2D> rectanglesPerPage = entry.getNode().getBBox();
|
||||
for (Page page : rectanglesPerPage.keySet()) {
|
||||
Rectangle2D rectangle2D = rectanglesPerPage.get(page);
|
||||
if (entry.getType() == NodeType.SECTION) {
|
||||
rectangle2D = RectangleTransformations.pad(rectangle2D, 10, 10);
|
||||
}
|
||||
drawRectangle2DList(document, page.getNumber(), List.of(rectangle2D), options);
|
||||
drawText(buildString(entry),
|
||||
document,
|
||||
new Point2D.Double(rectangle2D.getMinX(), rectangle2D.getMaxY() + 2),
|
||||
page.getNumber(),
|
||||
options,
|
||||
entry.getType() == NodeType.TABLE_CELL);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private static String buildString(DocumentTree.Entry entry) {
|
||||
|
||||
return entry.getNode().getNumberOnPage() + ": " + entry.getTreeId() + ": " + entry.getType();
|
||||
}
|
||||
|
||||
}
|
||||
@ -25,7 +25,9 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
|
||||
@SneakyThrows
|
||||
public void testLayoutParserEndToEnd() {
|
||||
|
||||
prepareStorage("files/bdr/Wie weiter bei Kristeneinrichtungen.pdf");
|
||||
String s = "(";
|
||||
String s1 = ")";
|
||||
prepareStorage("files/Minimal Examples/WrongOrderPage1.pdf");
|
||||
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER);
|
||||
LayoutParsingFinishedEvent finishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest);
|
||||
Arrays.stream(finishedEvent.message().split("\n")).forEach(log::info);
|
||||
|
||||
@ -25,7 +25,7 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
||||
@SneakyThrows
|
||||
public void testViewerDocument() {
|
||||
|
||||
String fileName = "files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
|
||||
String fileName = "files/Plenarprotokoll 1 (keine Druchsache!) (1).pdf";
|
||||
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
|
||||
|
||||
var documentFile = new ClassPathResource(fileName).getFile();
|
||||
@ -35,9 +35,10 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
||||
Document document = buildGraph(fileName, LayoutParsingType.REDACT_MANAGER);
|
||||
long start = System.currentTimeMillis();
|
||||
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true);
|
||||
System.out.printf("Total time: %.2fs%n", ((float) (System.currentTimeMillis() - start)) / 1000);
|
||||
System.out.printf("Total time: %.2fs%n", ((float) (System.currentTimeMillis() - start)) / 1000);
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@Disabled
|
||||
@SneakyThrows
|
||||
@ -51,7 +52,11 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
||||
var tableResponse = mapper.readValue(new ClassPathResource(tableFileName).getInputStream(), TableServiceResponse.class);
|
||||
var documentFile = new ClassPathResource(fileName).getFile();
|
||||
|
||||
var classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE, documentFile, new ImageServiceResponse(), tableResponse, Path.of(fileName).getFileName().toFile().toString());
|
||||
var classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE,
|
||||
documentFile,
|
||||
new ImageServiceResponse(),
|
||||
tableResponse,
|
||||
Path.of(fileName).getFileName().toFile().toString());
|
||||
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null);
|
||||
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
|
||||
Document document = DocumentGraphFactory.buildDocumentGraph(classificationDocument);
|
||||
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user