Compare commits
9 Commits
main
...
rOrder-bef
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
f90cb20156 | ||
|
|
72202f63dc | ||
|
|
e14d953b04 | ||
|
|
9e5778d4b2 | ||
|
|
e394f2fa7c | ||
|
|
b2fb6829cb | ||
|
|
4871e55f2d | ||
|
|
4de6c12aec | ||
|
|
4afa8daafa |
@ -10,6 +10,7 @@ import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
import java.util.concurrent.atomic.AtomicReference;
|
||||
|
||||
import org.apache.pdfbox.Loader;
|
||||
@ -26,6 +27,7 @@ import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsi
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationSection;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
@ -47,6 +49,7 @@ import com.knecon.fforesight.service.layoutparser.processor.services.blockificat
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.classification.DocuMineClassificationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.classification.TaasClassificationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.DocstrumSegmentationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.TaasDocumentDataMapper;
|
||||
@ -86,6 +89,7 @@ public class LayoutParsingPipeline {
|
||||
RedactManagerBlockificationService redactManagerBlockificationService;
|
||||
LayoutGridService layoutGridService;
|
||||
ObservationRegistry observationRegistry;
|
||||
DocstrumSegmentationService docstrumSegmentationService;
|
||||
|
||||
|
||||
public LayoutParsingFinishedEvent parseLayoutAndSaveFilesToStorage(LayoutParsingRequest layoutParsingRequest) throws IOException {
|
||||
@ -243,11 +247,37 @@ public class LayoutParsingPipeline {
|
||||
PDRectangle cropbox = pdPage.getCropBox();
|
||||
CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(pdfTableCells.get(pageNumber), stripper.getRulings());
|
||||
|
||||
ClassificationPage classificationPage = switch (layoutParsingType) {
|
||||
case REDACT_MANAGER -> redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
case TAAS -> taasBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
case DOCUMINE -> docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
};
|
||||
// Docstrum
|
||||
AtomicInteger num = new AtomicInteger(pageNumber);
|
||||
var zones = docstrumSegmentationService.segmentPage(stripper.getTextPositionSequences());
|
||||
|
||||
List<AbstractPageBlock> pageBlocks = new ArrayList<>();
|
||||
AtomicInteger numOnPage = new AtomicInteger(1);
|
||||
// List<TextPositionSequence> textPositionSequences = new ArrayList<>();
|
||||
zones.forEach(zone -> {
|
||||
|
||||
List<TextPositionSequence> textPositionSequences = new ArrayList<>();
|
||||
zone.getLines().forEach(line -> {
|
||||
line.getWords().forEach(word -> {
|
||||
textPositionSequences.add(new TextPositionSequence(word.getTextPositions(), num.get()));
|
||||
});
|
||||
});
|
||||
|
||||
var cps = redactManagerBlockificationService.blockify(textPositionSequences, cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
|
||||
cps.getTextBlocks().forEach(cp -> {
|
||||
pageBlocks.add(redactManagerBlockificationService.buildTextBlock(((TextPageBlock) cp).getSequences(), numOnPage.getAndIncrement()));
|
||||
});
|
||||
});
|
||||
|
||||
// ClassificationPage classificationPage = switch (layoutParsingType) {
|
||||
// case REDACT_MANAGER -> redactManagerBlockificationService.blockify(textPositionSequences, cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
// case TAAS -> taasBlockificationService.blockify(textPositionSequences, cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
// case DOCUMINE -> docuMineBlockificationService.blockify(textPositionSequences, cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
// };
|
||||
|
||||
ClassificationPage classificationPage = new ClassificationPage(pageBlocks);
|
||||
|
||||
classificationPage.setCleanRulings(cleanRulings);
|
||||
classificationPage.setRotation(rotation);
|
||||
classificationPage.setLandscape(isLandscape);
|
||||
@ -283,9 +313,19 @@ public class LayoutParsingPipeline {
|
||||
case REDACT_MANAGER -> redactManagerClassificationService.classifyDocument(classificationDocument);
|
||||
}
|
||||
|
||||
List<ClassificationSection> sections = new ArrayList<>();
|
||||
for (var page : classificationPages) {
|
||||
page.getTextBlocks().forEach(block -> {
|
||||
block.setPage(page.getPageNumber());
|
||||
var section = sectionsBuilderService.buildTextBlock(List.of(block), "a");
|
||||
sections.add(section);
|
||||
});
|
||||
}
|
||||
classificationDocument.setSections(sections);
|
||||
|
||||
log.info("Building Sections for {}", identifier);
|
||||
sectionsBuilderService.buildSections(classificationDocument);
|
||||
sectionsBuilderService.addImagesToSections(classificationDocument);
|
||||
// sectionsBuilderService.buildSections(classificationDocument);
|
||||
// sectionsBuilderService.addImagesToSections(classificationDocument);
|
||||
return classificationDocument;
|
||||
}
|
||||
|
||||
|
||||
@ -12,15 +12,16 @@ import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRul
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.Getter;
|
||||
import lombok.NonNull;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
|
||||
|
||||
@Data
|
||||
@RequiredArgsConstructor
|
||||
public class ClassificationPage {
|
||||
|
||||
@NonNull
|
||||
@Getter
|
||||
private List<AbstractPageBlock> textBlocks;
|
||||
|
||||
private List<ClassifiedImage> images = new ArrayList<>();
|
||||
|
||||
@ -45,6 +45,9 @@ public class RedTextPosition {
|
||||
@JsonIgnore
|
||||
private String fontName;
|
||||
|
||||
@JsonIgnore
|
||||
private RedTextPosition parent;
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public static RedTextPosition fromTextPosition(TextPosition textPosition) {
|
||||
|
||||
@ -17,6 +17,7 @@ import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.Getter;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
@EqualsAndHashCode(callSuper = true)
|
||||
@ -27,6 +28,7 @@ import lombok.NoArgsConstructor;
|
||||
public class TextPageBlock extends AbstractPageBlock {
|
||||
|
||||
@Builder.Default
|
||||
@Getter
|
||||
private List<TextPositionSequence> sequences = new ArrayList<>();
|
||||
|
||||
@JsonIgnore
|
||||
@ -73,7 +75,7 @@ public class TextPageBlock extends AbstractPageBlock {
|
||||
|
||||
return sequences.get(0).getPageWidth();
|
||||
}
|
||||
|
||||
|
||||
|
||||
public static TextPageBlock merge(List<TextPageBlock> textBlocksToMerge) {
|
||||
|
||||
@ -82,6 +84,7 @@ public class TextPageBlock extends AbstractPageBlock {
|
||||
return fromTextPositionSequences(sequences);
|
||||
}
|
||||
|
||||
|
||||
public static TextPageBlock fromTextPositionSequences(List<TextPositionSequence> wordBlockList) {
|
||||
|
||||
TextPageBlock textBlock = null;
|
||||
@ -133,7 +136,6 @@ public class TextPageBlock extends AbstractPageBlock {
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Returns the minX value in pdf coordinate system.
|
||||
* Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation.
|
||||
|
||||
@ -55,6 +55,18 @@ public class TextPositionSequence implements CharSequence {
|
||||
}
|
||||
|
||||
|
||||
/**
 * Creates a sequence from already extracted text positions on the given page.
 * Direction, rotation and page dimensions are copied from the first text position,
 * so the list must contain at least one element.
 *
 * @param textPositions text positions that make up the sequence
 * @param page          page number stored on the sequence
 */
public TextPositionSequence(List<RedTextPosition> textPositions, int page) {

    RedTextPosition first = textPositions.get(0);

    this.textPositions = textPositions;
    this.page = page;
    this.dir = TextDirection.fromDegrees(first.getDir());
    this.rotation = first.getRotation();
    this.pageHeight = first.getPageHeight();
    this.pageWidth = first.getPageWidth();
    this.isParagraphStart = false;
}
|
||||
|
||||
|
||||
@Override
|
||||
public int length() {
|
||||
|
||||
|
||||
@ -240,7 +240,7 @@ public class SectionsBuilderService {
|
||||
}
|
||||
|
||||
|
||||
private ClassificationSection buildTextBlock(List<AbstractPageBlock> wordBlockList, String lastHeadline) {
|
||||
public ClassificationSection buildTextBlock(List<AbstractPageBlock> wordBlockList, String lastHeadline) {
|
||||
|
||||
ClassificationSection section = new ClassificationSection();
|
||||
|
||||
|
||||
@ -57,7 +57,7 @@ public class RedactManagerBlockificationService {
|
||||
boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, horizontalRulingLines, verticalRulingLines);
|
||||
boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir());
|
||||
|
||||
if (prev != null && (lineSeparation || startFromTop || splitByX || splitByDir || isSplitByRuling)) {
|
||||
if (prev != null && (splitByDir || isSplitByRuling)) {
|
||||
|
||||
Orientation prevOrientation = null;
|
||||
if (!chunkBlockList.isEmpty()) {
|
||||
@ -167,7 +167,7 @@ public class RedactManagerBlockificationService {
|
||||
}
|
||||
|
||||
|
||||
private TextPageBlock buildTextBlock(List<TextPositionSequence> wordBlockList, int indexOnPage) {
|
||||
public TextPageBlock buildTextBlock(List<TextPositionSequence> wordBlockList, int indexOnPage) {
|
||||
|
||||
TextPageBlock textBlock = null;
|
||||
|
||||
|
||||
@ -0,0 +1,48 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Character;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Zone;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service.LineBuilderService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service.NearestNeighbourService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service.ReadingOrderService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service.SpacingService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service.ZoneBuilderService;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
public class DocstrumSegmentationService {
|
||||
|
||||
private final NearestNeighbourService nearestNeighbourService;
|
||||
private final SpacingService spacingService;
|
||||
private final LineBuilderService lineBuilderService;
|
||||
private final ZoneBuilderService zoneBuilderService;
|
||||
private final ReadingOrderService readingOrderService;
|
||||
|
||||
|
||||
public List<Zone> segmentPage(List<TextPositionSequence> textPositions) {
|
||||
|
||||
var positions = textPositions.stream().map(TextPositionSequence::getTextPositions).flatMap(List::stream).toList();
|
||||
|
||||
var characters = positions.stream().map(Character::new).collect(Collectors.toList());
|
||||
|
||||
nearestNeighbourService.findNearestNeighbors(characters);
|
||||
|
||||
var characterSpacing = spacingService.computeCharacterSpacing(characters);
|
||||
var lineSpacing = spacingService.computeLineSpacing(characters);
|
||||
|
||||
var lines = lineBuilderService.buildLines(characters, characterSpacing, lineSpacing);
|
||||
|
||||
var zones = zoneBuilderService.buildZones(lines, characterSpacing, lineSpacing);
|
||||
|
||||
return readingOrderService.resolve(zones);
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,90 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model;
|
||||
|
||||
/**
|
||||
* Filter class for neighbor objects that checks if the angle of the
|
||||
* neighbor is within specified range.
|
||||
*/
|
||||
public abstract class AngleFilter {
|
||||
|
||||
private final double lowerAngle;
|
||||
private final double upperAngle;
|
||||
|
||||
|
||||
private AngleFilter(double lowerAngle, double upperAngle) {
|
||||
|
||||
this.lowerAngle = lowerAngle;
|
||||
this.upperAngle = upperAngle;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Constructs new angle filter.
|
||||
*
|
||||
* @param lowerAngle minimum angle in range [-3*pi/2, pi/2)
|
||||
* @param upperAngle maximum angle in range [-pi/2, 3*pi/2)
|
||||
* @return newly constructed angle filter
|
||||
*/
|
||||
public static AngleFilter newInstance(double lowerAngle, double upperAngle) {
|
||||
|
||||
if (lowerAngle < -Math.PI / 2) {
|
||||
lowerAngle += Math.PI;
|
||||
}
|
||||
if (upperAngle >= Math.PI / 2) {
|
||||
upperAngle -= Math.PI;
|
||||
}
|
||||
if (lowerAngle <= upperAngle) {
|
||||
return new AndFilter(lowerAngle, upperAngle);
|
||||
} else {
|
||||
return new OrFilter(lowerAngle, upperAngle);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public double getLowerAngle() {
|
||||
|
||||
return lowerAngle;
|
||||
}
|
||||
|
||||
|
||||
public double getUpperAngle() {
|
||||
|
||||
return upperAngle;
|
||||
}
|
||||
|
||||
|
||||
public abstract boolean matches(Neighbor neighbor);
|
||||
|
||||
|
||||
public static final class AndFilter extends AngleFilter {
|
||||
|
||||
private AndFilter(double lowerAngle, double upperAngle) {
|
||||
|
||||
super(lowerAngle, upperAngle);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public boolean matches(Neighbor neighbor) {
|
||||
|
||||
return getLowerAngle() <= neighbor.getAngle() && neighbor.getAngle() < getUpperAngle();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public static final class OrFilter extends AngleFilter {
|
||||
|
||||
private OrFilter(double lowerAngle, double upperAngle) {
|
||||
|
||||
super(lowerAngle, upperAngle);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public boolean matches(Neighbor neighbor) {
|
||||
|
||||
return getLowerAngle() <= neighbor.getAngle() || neighbor.getAngle() < getUpperAngle();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,48 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
public abstract class BoundingBox {
|
||||
|
||||
private Rectangle2D bBox;
|
||||
|
||||
|
||||
public double getX() {
|
||||
|
||||
return bBox.getX();
|
||||
}
|
||||
|
||||
|
||||
public double getY() {
|
||||
|
||||
return bBox.getY();
|
||||
}
|
||||
|
||||
|
||||
public double getWidth() {
|
||||
|
||||
return bBox.getWidth();
|
||||
}
|
||||
|
||||
|
||||
public double getHeight() {
|
||||
|
||||
return bBox.getHeight();
|
||||
}
|
||||
|
||||
|
||||
public double getArea() {
|
||||
|
||||
return (bBox.getHeight() * bBox.getWidth());
|
||||
}
|
||||
|
||||
|
||||
public boolean contains(Rectangle2D contained, double tolerance) {
|
||||
|
||||
return bBox.getX() <= contained.getX() + tolerance && bBox.getY() <= contained.getY() + tolerance && bBox.getX() + bBox.getWidth() >= contained.getX() + contained.getWidth() - tolerance && bBox.getY() + bBox.getHeight() >= contained.getY() + contained.getHeight() - tolerance;
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,69 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
public class Character {
|
||||
|
||||
private final double x;
|
||||
private final double y;
|
||||
private final RedTextPosition textPosition;
|
||||
|
||||
private List<Neighbor> neighbors = new ArrayList<>();
|
||||
|
||||
|
||||
public Character(RedTextPosition chunk) {
|
||||
|
||||
this.x = chunk.getXDirAdj() + chunk.getWidthDirAdj() / 2;
|
||||
this.y = chunk.getYDirAdj() + chunk.getHeightDir() / 2;
|
||||
this.textPosition = chunk;
|
||||
}
|
||||
|
||||
|
||||
public double getHeight() {
|
||||
|
||||
return textPosition.getHeightDir();
|
||||
}
|
||||
|
||||
|
||||
public double distance(Character character) {
|
||||
|
||||
double dx = getX() - character.getX();
|
||||
double dy = getY() - character.getY();
|
||||
return Math.sqrt(dx * dx + dy * dy);
|
||||
}
|
||||
|
||||
|
||||
public double horizontalDistance(Character character) {
|
||||
|
||||
return Math.abs(getX() - character.getX());
|
||||
}
|
||||
|
||||
|
||||
public double verticalDistance(Character character) {
|
||||
|
||||
return Math.abs(getY() - character.getY());
|
||||
}
|
||||
|
||||
|
||||
public void setNeighbors(List<Neighbor> neighbors) {
|
||||
|
||||
this.neighbors = neighbors;
|
||||
}
|
||||
|
||||
|
||||
public double angle(Character character) {
|
||||
|
||||
if (getX() > character.getX()) {
|
||||
return Math.atan2(getY() - character.getY(), getX() - character.getX());
|
||||
} else {
|
||||
return Math.atan2(character.getY() - getY(), character.getX() - getX());
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,212 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model;
|
||||
|
||||
import java.util.AbstractSet;
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.Map;
|
||||
import java.util.NoSuchElementException;
|
||||
import java.util.Set;
|
||||
|
||||
/**
 * Partition of a fixed collection of elements into disjoint subsets that can be merged
 * pairwise. Every element handed to the constructor starts in its own singleton subset;
 * iterating this object yields one {@link Set} view per current subset.
 *
 * @param <E> element type; elements are used as keys of a {@link HashMap}
 */
public class DisjointSets<E> implements Iterable<Set<E>> {

    private final Map<E, Entry<E>> map = new HashMap<>();


    /**
     * Constructs a new set of singletons.
     *
     * @param c elements of singleton sets
     */
    public DisjointSets(Collection<? extends E> c) {

        c.forEach(element -> map.put(element, new Entry<>(element)));
    }


    /**
     * Checks if elements are in the same subsets.
     *
     * @param e1 element from a subset
     * @param e2 element from a subset
     * @return true if elements are in the same subset; false otherwise
     */
    public boolean areTogether(E e1, E e2) {

        Entry<E> firstRoot = map.get(e1).findRepresentative();
        Entry<E> secondRoot = map.get(e2).findRepresentative();
        return firstRoot == secondRoot;
    }


    /**
     * Merges subsets which elements e1 and e2 belong to.
     *
     * @param e1 element from a subset
     * @param e2 element from a subset
     */
    public void union(E e1, E e2) {

        Entry<E> firstRoot = map.get(e1).findRepresentative();
        Entry<E> secondRoot = map.get(e2).findRepresentative();
        if (firstRoot == secondRoot) {
            return;
        }
        // The strictly larger subset absorbs the other one; on a tie the second root absorbs the first.
        if (firstRoot.size > secondRoot.size) {
            firstRoot.mergeWith(secondRoot);
        } else {
            secondRoot.mergeWith(firstRoot);
        }
    }


    /**
     * @return iterator over the current subsets, one set view per representative entry
     */
    @Override
    public Iterator<Set<E>> iterator() {

        return new SubsetIterator();
    }


    /**
     * Walks all entries of the map and reports only those that currently represent a subset.
     */
    private final class SubsetIterator implements Iterator<Set<E>> {

        private final Iterator<Entry<E>> entries = map.values().iterator();
        private Entry<E> upcoming = advance();


        @Override
        public boolean hasNext() {

            return upcoming != null;
        }


        @Override
        public Set<E> next() {

            if (upcoming == null) {
                throw new NoSuchElementException();
            }
            Set<E> subset = upcoming.asSet();
            upcoming = advance();
            return subset;
        }


        @Override
        public void remove() {

            throw new UnsupportedOperationException();
        }


        /** Returns the next representative entry, or null when the map is exhausted. */
        private Entry<E> advance() {

            while (entries.hasNext()) {
                Entry<E> candidate = entries.next();
                if (candidate.isRepresentative()) {
                    return candidate;
                }
            }
            return null;
        }

    }

    /**
     * Node of the partition. A representative is its own parent and heads a singly linked
     * list (next/last) of all members of its subset.
     */
    private static class Entry<E> {

        private int size = 1;
        private final E value;
        private Entry<E> parent = this;
        private Entry<E> next = null;
        private Entry<E> last = this;


        Entry(E value) {

            this.value = value;
        }


        /** Appends the other representative's member list to this one and becomes its parent. */
        void mergeWith(Entry<E> otherRepresentative) {

            size += otherRepresentative.size;
            last.next = otherRepresentative;
            last = otherRepresentative.last;
            otherRepresentative.parent = this;
        }


        /** Finds the representative of this entry and re-parents every entry on the way to it. */
        Entry<E> findRepresentative() {

            Entry<E> root = parent;
            while (root.parent != root) {
                root = root.parent;
            }

            Entry<E> current = this;
            while (current != root) {
                Entry<E> formerParent = current.parent;
                current.parent = root;
                current = formerParent;
            }
            return root;
        }


        boolean isRepresentative() {

            return parent == this;
        }


        /** Returns a read-only view of the subset this entry currently belongs to. */
        Set<E> asSet() {

            return new MemberView();
        }


        /**
         * Set view that resolves the representative on every call.
         */
        private final class MemberView extends AbstractSet<E> {

            @Override
            public Iterator<E> iterator() {

                return new MemberIterator<>(findRepresentative());
            }


            @Override
            public int size() {

                return findRepresentative().size;
            }

        }

        /**
         * Follows the next-links of a member list starting at the given entry.
         */
        private static final class MemberIterator<T> implements Iterator<T> {

            private Entry<T> cursor;


            MemberIterator(Entry<T> start) {

                this.cursor = start;
            }


            @Override
            public boolean hasNext() {

                return cursor != null;
            }


            @Override
            public T next() {

                if (cursor == null) {
                    throw new NoSuchElementException();
                }
                T current = cursor.value;
                cursor = cursor.next;
                return current;
            }


            @Override
            public void remove() {

                throw new UnsupportedOperationException();
            }

        }

    }

}
|
||||
@ -0,0 +1,199 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model;
|
||||
|
||||
import java.util.Iterator;
|
||||
import java.util.NoSuchElementException;
|
||||
|
||||
/**
 * Histogram with equally sized bins over a value range, supporting linear and circular
 * kernel smoothing and peak lookup. Iterating the histogram yields one {@link Bin} per bin,
 * in index order.
 */
public class Histogram implements Iterable<Histogram.Bin> {

    /** Margin added on both sides of the range so that minValue and maxValue fall inside a bin. */
    private static final double EPSILON = 1.0e-6;

    /** Frequencies closer than this to the peak frequency are treated as part of the peak plateau. */
    private static final double PEAK_TOLERANCE = 0.0001;

    private final double min;
    private final double delta;
    private final double resolution;

    private double[] frequencies;


    /**
     * Constructs a new histogram for values in range [minValue, maxValue] with
     * given resolution. The effective bin width is the widened range divided by the
     * rounded bin count (at least one bin), so it can differ slightly from the requested one.
     *
     * @param minValue   - minimum allowed value
     * @param maxValue   - maximum allowed value
     * @param resolution - histogram's resolution
     */
    public Histogram(double minValue, double maxValue, double resolution) {

        this.min = minValue - EPSILON;
        this.delta = maxValue - minValue + 2 * EPSILON;
        int size = Math.max(1, (int) Math.round((maxValue - minValue) / resolution));
        this.resolution = this.delta / size;
        this.frequencies = new double[size];
    }


    /**
     * Smooths the frequencies with the given kernel. Kernel taps that would read outside
     * the histogram are skipped, i.e. the histogram is treated as zero beyond its borders.
     *
     * @param kernel kernel weights; the middle element is aligned with the bin being computed
     */
    public void kernelSmooth(double[] kernel) {

        double[] newFrequencies = new double[frequencies.length];
        int shift = (kernel.length - 1) / 2;
        for (int i = 0; i < kernel.length; i++) {
            // Restrict j so that the target index j - i + shift stays inside the array.
            int jStart = Math.max(0, i - shift);
            int jEnd = Math.min(frequencies.length, frequencies.length + i - shift);
            for (int j = jStart; j < jEnd; j++) {
                newFrequencies[j - i + shift] += kernel[i] * frequencies[j];
            }
        }
        frequencies = newFrequencies;
    }


    /**
     * Smooths the frequencies with the given kernel, treating the histogram as circular:
     * taps that leave the array on one side re-enter on the other side.
     *
     * @param kernel kernel weights; the middle element is aligned with the bin being computed
     */
    public void circularKernelSmooth(double[] kernel) {

        int binCount = frequencies.length;
        double[] newFrequencies = new double[binCount];
        int shift = (kernel.length - 1) / 2;
        for (int i = 0; i < binCount; i++) {
            for (int d = 0; d < kernel.length; d++) {
                // floorMod wraps correctly even when the kernel reaches further than one full
                // turn around the histogram; a single +/- binCount correction would then leave
                // the index out of bounds.
                int j = Math.floorMod(i + d - shift, binCount);
                newFrequencies[i] += kernel[d] * frequencies[j];
            }
        }
        frequencies = newFrequencies;
    }


    /**
     * Creates a Gaussian kernel whose weights sum to one. Both arguments are given in value
     * units and converted to bins using this histogram's bin width.
     *
     * @param length       window length in value units; the kernel has an odd number of entries
     * @param stdDeviation standard deviation in value units
     * @return normalized kernel weights
     */
    public double[] createGaussianKernel(double length, double stdDeviation) {

        int r = (int) Math.round(length / resolution) / 2;
        stdDeviation /= resolution;

        int size = 2 * r + 1;
        double[] kernel = new double[size];
        double sum = 0;
        double b = 2 * stdDeviation * stdDeviation;
        double a = 1 / Math.sqrt(Math.PI * b);
        for (int i = 0; i < size; i++) {
            kernel[i] = a * Math.exp(-(i - r) * (i - r) / b);
            sum += kernel[i];
        }
        for (int i = 0; i < size; i++) {
            kernel[i] /= sum;
        }
        return kernel;
    }


    /**
     * Applies {@link #circularKernelSmooth(double[])} with a Gaussian kernel.
     *
     * @param windowLength window length in value units
     * @param stdDeviation standard deviation in value units
     */
    public void circularGaussianSmooth(double windowLength, double stdDeviation) {

        circularKernelSmooth(createGaussianKernel(windowLength, stdDeviation));
    }


    /**
     * Applies {@link #kernelSmooth(double[])} with a Gaussian kernel.
     *
     * @param windowLength window length in value units
     * @param stdDeviation standard deviation in value units
     */
    public void gaussianSmooth(double windowLength, double stdDeviation) {

        kernelSmooth(createGaussianKernel(windowLength, stdDeviation));
    }


    /**
     * Adds single occurrence of given value to the histogram.
     *
     * @param value inserted values
     */
    public void add(double value) {

        frequencies[(int) ((value - min) / resolution)] += 1.0;
    }


    /**
     * Returns histogram's number of bins.
     *
     * @return number of bins
     */
    public int getSize() {

        return frequencies.length;
    }


    /**
     * Finds the histogram's peak value. When the highest frequency is followed by bins of
     * (almost) the same frequency, the middle of that plateau is returned.
     *
     * @return peak value
     */
    public double getPeakValue() {

        int peakIndex = 0;
        for (int i = 1; i < frequencies.length; i++) {
            if (frequencies[i] > frequencies[peakIndex]) {
                peakIndex = i;
            }
        }
        int peakEndIndex = peakIndex + 1;
        while (peakEndIndex < frequencies.length && Math.abs(frequencies[peakEndIndex] - frequencies[peakIndex]) < PEAK_TOLERANCE) {
            peakEndIndex++;
        }
        return ((double) peakIndex + peakEndIndex) / 2 * resolution + min;
    }


    /**
     * @return iterator over all bins in index order; removal is not supported
     */
    @Override
    public Iterator<Bin> iterator() {

        return new Iterator<Bin>() {

            private int index = 0;


            @Override
            public boolean hasNext() {

                return index < frequencies.length;
            }


            @Override
            public Bin next() {

                if (index >= frequencies.length) {
                    throw new NoSuchElementException();
                }
                return new Bin(index++);
            }


            @Override
            public void remove() {

                throw new UnsupportedOperationException("Not supported yet.");
            }

        };
    }


    /**
     * A single bin of the enclosing histogram, identified by its index.
     */
    public final class Bin {

        private final int index;


        private Bin(int index) {

            this.index = index;
        }


        /**
         * @return the value at the centre of this bin
         */
        public double getValue() {

            return (index + 0.5) * resolution + min;
        }

    }

}
|
||||
@ -0,0 +1,167 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
public class Line extends BoundingBox {
|
||||
|
||||
private static final double WORD_DISTANCE_MULTIPLIER = 0.2;
|
||||
|
||||
private final double x0;
|
||||
private final double y0;
|
||||
|
||||
private final double x1;
|
||||
private final double y1;
|
||||
|
||||
private final double height;
|
||||
|
||||
private final List<Character> characters;
|
||||
private final List<TextPositionSequence> words = new ArrayList<>();
|
||||
|
||||
|
||||
public Line(List<Character> characters, double wordSpacing) {
|
||||
|
||||
this.characters = characters;
|
||||
|
||||
if (characters.size() >= 2) {
|
||||
// Simple linear regression
|
||||
double sx = 0.0, sxx = 0.0, sxy = 0.0, sy = 0.0;
|
||||
for (Character component : characters) {
|
||||
sx += component.getX();
|
||||
sxx += component.getX() * component.getX();
|
||||
sxy += component.getX() * component.getY();
|
||||
sy += component.getY();
|
||||
}
|
||||
double b = (characters.size() * sxy - sx * sy) / (characters.size() * sxx - sx * sx);
|
||||
double a = (sy - b * sx) / characters.size();
|
||||
|
||||
this.x0 = characters.get(0).getX();
|
||||
this.y0 = a + b * this.x0;
|
||||
this.x1 = characters.get(characters.size() - 1).getX();
|
||||
this.y1 = a + b * this.x1;
|
||||
} else if (!characters.isEmpty()) {
|
||||
Character component = characters.get(0);
|
||||
double dx = component.getTextPosition().getWidthDirAdj() / 3;
|
||||
double dy = dx * Math.tan(0);
|
||||
this.x0 = component.getX() - dx;
|
||||
this.x1 = component.getX() + dx;
|
||||
this.y0 = component.getY() - dy;
|
||||
this.y1 = component.getY() + dy;
|
||||
} else {
|
||||
throw new IllegalArgumentException("Component list must not be empty");
|
||||
}
|
||||
height = computeHeight();
|
||||
computeWords(wordSpacing * WORD_DISTANCE_MULTIPLIER);
|
||||
buildBox();
|
||||
}
|
||||
|
||||
|
||||
public double getAngle() {
|
||||
|
||||
return Math.atan2(y1 - y0, x1 - x0);
|
||||
}
|
||||
|
||||
|
||||
public double getLength() {
|
||||
|
||||
return Math.sqrt((x0 - x1) * (x0 - x1) + (y0 - y1) * (y0 - y1));
|
||||
}
|
||||
|
||||
|
||||
private double computeHeight() {
|
||||
|
||||
double sum = 0.0;
|
||||
for (Character component : characters) {
|
||||
sum += component.getHeight();
|
||||
}
|
||||
return sum / characters.size();
|
||||
}
|
||||
|
||||
|
||||
public double angularDifference(Line j) {
|
||||
|
||||
double diff = Math.abs(getAngle() - j.getAngle());
|
||||
if (diff <= Math.PI / 2) {
|
||||
return diff;
|
||||
} else {
|
||||
return Math.PI - diff;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public double horizontalDistance(Line other) {
|
||||
|
||||
double[] xs = new double[4];
|
||||
xs[0] = x0;
|
||||
xs[1] = x1;
|
||||
xs[2] = other.x0;
|
||||
xs[3] = other.x1;
|
||||
boolean overlapping = xs[1] >= xs[2] && xs[3] >= xs[0];
|
||||
Arrays.sort(xs);
|
||||
return Math.abs(xs[2] - xs[1]) * (overlapping ? 1 : -1);
|
||||
}
|
||||
|
||||
|
||||
public double verticalDistance(Line other) {
|
||||
|
||||
double ym = (y0 + y1) / 2;
|
||||
double yn = (other.y0 + other.y1) / 2;
|
||||
return Math.abs(ym - yn) / Math.sqrt(1);
|
||||
}
|
||||
|
||||
|
||||
private void computeWords(double wordSpacing) {
|
||||
|
||||
TextPositionSequence word = new TextPositionSequence();
|
||||
Character previous = null;
|
||||
for (Character current : characters) {
|
||||
if (previous != null) {
|
||||
double dist = current.getTextPosition().getXDirAdj() - previous.getTextPosition().getXDirAdj() - previous.getTextPosition().getWidthDirAdj();
|
||||
if (dist > wordSpacing) {
|
||||
words.add(word);
|
||||
word = new TextPositionSequence();
|
||||
}
|
||||
}
|
||||
word.getTextPositions().add(current.getTextPosition());
|
||||
previous = current;
|
||||
}
|
||||
words.add(word);
|
||||
}
|
||||
|
||||
|
||||
private void buildBox() {
|
||||
|
||||
double minX = Double.POSITIVE_INFINITY;
|
||||
double minY = Double.POSITIVE_INFINITY;
|
||||
double maxX = Double.NEGATIVE_INFINITY;
|
||||
double maxY = Double.NEGATIVE_INFINITY;
|
||||
|
||||
for (Character character : characters) {
|
||||
|
||||
minX = Math.min(minX, character.getTextPosition().getXDirAdj());
|
||||
minY = Math.min(minY, character.getTextPosition().getYDirAdj());
|
||||
maxX = Math.max(maxX, character.getTextPosition().getXDirAdj() + character.getTextPosition().getWidthDirAdj());
|
||||
maxY = Math.max(maxY, character.getTextPosition().getYDirAdj() + character.getTextPosition().getHeightDir());
|
||||
|
||||
}
|
||||
|
||||
this.setBBox(new Rectangle2D.Double(minX, minY, maxX - minX, maxY - minY));
|
||||
}
|
||||
|
||||
|
||||
public String toString() {
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
words.forEach(word -> sb.append(word.toString()).append(" "));
|
||||
return sb.toString().trim();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -0,0 +1,36 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model;
|
||||
|
||||
import lombok.Getter;
|
||||
|
||||
public class Neighbor {
|
||||
|
||||
@Getter
|
||||
private final double distance;
|
||||
@Getter
|
||||
private final double angle;
|
||||
private final Character originCharacter;
|
||||
@Getter
|
||||
private final Character character;
|
||||
|
||||
|
||||
public Neighbor(Character neighbor, Character origin) {
|
||||
|
||||
this.distance = neighbor.distance(origin);
|
||||
this.angle = neighbor.angle(origin);
|
||||
this.character = neighbor;
|
||||
this.originCharacter = origin;
|
||||
}
|
||||
|
||||
|
||||
public double getHorizontalDistance() {
|
||||
|
||||
return character.horizontalDistance(originCharacter);
|
||||
}
|
||||
|
||||
|
||||
public double getVerticalDistance() {
|
||||
|
||||
return character.verticalDistance(originCharacter);
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,50 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
public class Zone extends BoundingBox {
|
||||
|
||||
private List<Line> lines;
|
||||
|
||||
|
||||
public Zone(List<Line> lines) {
|
||||
|
||||
lines.sort(Comparator.comparingDouble(Line::getY));
|
||||
this.lines = lines;
|
||||
buildBox();
|
||||
}
|
||||
|
||||
|
||||
public void buildBox() {
|
||||
|
||||
double minX = Double.POSITIVE_INFINITY;
|
||||
double minY = Double.POSITIVE_INFINITY;
|
||||
double maxX = Double.NEGATIVE_INFINITY;
|
||||
double maxY = Double.NEGATIVE_INFINITY;
|
||||
|
||||
for (Line line : lines) {
|
||||
|
||||
minX = Math.min(minX, line.getX());
|
||||
minY = Math.min(minY, line.getY());
|
||||
maxX = Math.max(maxX, line.getX() + line.getWidth());
|
||||
maxY = Math.max(maxY, line.getY() + line.getHeight());
|
||||
|
||||
}
|
||||
|
||||
this.setBBox(new Rectangle2D.Double(minX, minY, maxX - minX, maxY - minY));
|
||||
}
|
||||
|
||||
|
||||
public String toString() {
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
lines.forEach(line -> sb.append(line.toString()).append("\n"));
|
||||
return sb.toString().trim();
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,64 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.readingorder;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.BoundingBox;
|
||||
|
||||
public class BoundingBoxZoneGroup extends BoundingBox {
|
||||
|
||||
private BoundingBox leftChild;
|
||||
private BoundingBox rightChild;
|
||||
|
||||
|
||||
public BoundingBoxZoneGroup(BoundingBox child1, BoundingBox child2) {
|
||||
|
||||
this.leftChild = child1;
|
||||
this.rightChild = child2;
|
||||
setBounds(Math.min(child1.getX(), child2.getX()),
|
||||
Math.min(child1.getY(), child2.getY()),
|
||||
Math.max(child1.getX() + child1.getWidth(), child2.getX() + child2.getWidth()),
|
||||
Math.max(child1.getY() + child1.getHeight(), child2.getY() + child2.getHeight()));
|
||||
}
|
||||
|
||||
|
||||
public void setbBox(Rectangle2D bBox) {
|
||||
|
||||
super.setBBox(bBox);
|
||||
}
|
||||
|
||||
|
||||
public BoundingBox getLeftChild() {
|
||||
|
||||
return leftChild;
|
||||
}
|
||||
|
||||
|
||||
public BoundingBox getRightChild() {
|
||||
|
||||
return rightChild;
|
||||
}
|
||||
|
||||
|
||||
public BoundingBoxZoneGroup setLeftChild(BoundingBox obj) {
|
||||
|
||||
this.leftChild = obj;
|
||||
return this;
|
||||
}
|
||||
|
||||
|
||||
public BoundingBoxZoneGroup setRightChild(BoundingBox obj) {
|
||||
|
||||
this.rightChild = obj;
|
||||
return this;
|
||||
}
|
||||
|
||||
|
||||
public BoundingBoxZoneGroup setBounds(double x0, double y0, double x1, double y1) {
|
||||
|
||||
assert x1 >= x0;
|
||||
assert y1 >= y0;
|
||||
this.setBBox(new Rectangle2D.Double(x0, y0, x1 - x0, y1 - y0));
|
||||
return this;
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,115 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.readingorder;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.utils.DoubleUtils;
|
||||
|
||||
public class DistElem<E> implements Comparable<DistElem<E>> {
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
|
||||
final int prime = 31;
|
||||
int result = 1;
|
||||
result = prime * result + (c ? 1231 : 1237);
|
||||
long temp;
|
||||
temp = Double.doubleToLongBits(dist);
|
||||
result = prime * result + (int) (temp ^ (temp >>> 32));
|
||||
result = prime * result + ((obj1 == null) ? 0 : obj1.hashCode());
|
||||
result = prime * result + ((obj2 == null) ? 0 : obj2.hashCode());
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public boolean equals(Object obj) {
|
||||
|
||||
if (this == obj) {
|
||||
return true;
|
||||
}
|
||||
if (obj == null) {
|
||||
return false;
|
||||
}
|
||||
if (getClass() != obj.getClass()) {
|
||||
return false;
|
||||
}
|
||||
DistElem other = (DistElem) obj;
|
||||
if (c != other.c) {
|
||||
return false;
|
||||
}
|
||||
if (Double.doubleToLongBits(dist) != Double.doubleToLongBits(other.dist)) {
|
||||
return false;
|
||||
}
|
||||
if (obj1 == null) {
|
||||
if (other.obj1 != null) {
|
||||
return false;
|
||||
}
|
||||
} else if (!obj1.equals(other.obj1)) {
|
||||
return false;
|
||||
}
|
||||
if (obj2 == null) {
|
||||
if (other.obj2 != null) {
|
||||
return false;
|
||||
}
|
||||
} else if (!obj2.equals(other.obj2)) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
boolean c;
|
||||
double dist;
|
||||
E obj1;
|
||||
E obj2;
|
||||
|
||||
|
||||
public boolean isC() {
|
||||
|
||||
return c;
|
||||
}
|
||||
|
||||
|
||||
public void setC(boolean c) {
|
||||
|
||||
this.c = c;
|
||||
}
|
||||
|
||||
|
||||
public double getDist() {
|
||||
|
||||
return dist;
|
||||
}
|
||||
|
||||
|
||||
public E getObj1() {
|
||||
|
||||
return obj1;
|
||||
}
|
||||
|
||||
|
||||
public E getObj2() {
|
||||
|
||||
return obj2;
|
||||
}
|
||||
|
||||
|
||||
public DistElem(boolean c, double dist, E obj1, E obj2) {
|
||||
|
||||
this.c = c;
|
||||
this.dist = dist;
|
||||
this.obj1 = obj1;
|
||||
this.obj2 = obj2;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public int compareTo(DistElem<E> compareObject) {
|
||||
|
||||
double eps = 1E-3;
|
||||
if (c == compareObject.c) {
|
||||
return DoubleUtils.compareDouble(dist, compareObject.dist, eps);
|
||||
} else {
|
||||
return c ? -1 : 1;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,258 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.readingorder;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.BoundingBox;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Zone;
|
||||
|
||||
/**
|
||||
* A set-like data structure for objects placed on a plane. Can efficiently find objects in a certain rectangular area.
|
||||
* It maintains two parallel lists of objects, each of which is sorted by its x or y coordinate.
|
||||
*
|
||||
* @author Pawel Szostek
|
||||
*/
|
||||
public class DocumentPlane {
|
||||
|
||||
/**
|
||||
* List of objects on the plane. Stored in a random order
|
||||
*/
|
||||
private final List<BoundingBox> objs;
|
||||
/**
|
||||
* Size of a grid square. If gridSize=50, then the plane is divided into squares of size 50. Each square contains
|
||||
* objects placed in a 50x50 area
|
||||
*/
|
||||
private final int gridSize;
|
||||
/**
|
||||
* Redundant dictionary of objects on the plane. Allows efficient 2D space search. Keys are X-Y coordinates of a
|
||||
* grid square. Single object can be stored under several keys (depending on its physical size). Grid squares are
|
||||
* lazy-initialized.
|
||||
*/
|
||||
private final Map<GridXY, List<BoundingBox>> grid;
|
||||
|
||||
/**
|
||||
* Representation of XY coordinates
|
||||
*/
|
||||
private static class GridXY {
|
||||
|
||||
public int x;
|
||||
public int y;
|
||||
|
||||
|
||||
public GridXY(int x, int y) {
|
||||
|
||||
this.x = x;
|
||||
this.y = y;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
|
||||
return x * y;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public boolean equals(Object obj) {
|
||||
|
||||
if (obj == null || getClass() != obj.getClass()) {
|
||||
return false;
|
||||
}
|
||||
GridXY comparedObj = (GridXY) obj;
|
||||
return x == comparedObj.x && y == comparedObj.y;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return "(" + x + "," + y + ")";
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
public List<BoundingBox> getObjects() {
|
||||
|
||||
return objs;
|
||||
}
|
||||
|
||||
|
||||
public DocumentPlane(List<Zone> objectList, int gridSize) {
|
||||
|
||||
this.grid = new HashMap<GridXY, List<BoundingBox>>();
|
||||
this.objs = new ArrayList<BoundingBox>();
|
||||
this.gridSize = gridSize;
|
||||
for (Zone obj : objectList) {
|
||||
add(obj);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Looks for objects placed between obj1 and obj2 excluding them
|
||||
*
|
||||
* @param obj1 object
|
||||
* @param obj2 object
|
||||
* @return object list
|
||||
*/
|
||||
public List<BoundingBox> findObjectsBetween(BoundingBox obj1, BoundingBox obj2) {
|
||||
|
||||
double x0 = Math.min(obj1.getX(), obj2.getX());
|
||||
double y0 = Math.min(obj1.getY(), obj2.getY());
|
||||
double x1 = Math.max(obj1.getX() + obj1.getWidth(), obj2.getX() + obj2.getWidth());
|
||||
double y1 = Math.max(obj1.getY() + obj1.getHeight(), obj2.getY() + obj2.getHeight());
|
||||
assert x1 >= x0 && y1 >= y0;
|
||||
Rectangle2D searchBounds = new Rectangle2D.Double(x0, y0, x1 - x0, y1 - y0);
|
||||
List<BoundingBox> objsBetween = find(searchBounds);
|
||||
/*
|
||||
* the rectangle area must contain at least obj1 and obj2
|
||||
*/
|
||||
objsBetween.remove(obj1);
|
||||
objsBetween.remove(obj2);
|
||||
return objsBetween;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Checks if there is any object placed between obj1 and obj2
|
||||
*
|
||||
* @param obj1 object
|
||||
* @param obj2 object
|
||||
* @return true if anything is placed between, false otherwise
|
||||
*/
|
||||
public boolean anyObjectsBetween(BoundingBox obj1, BoundingBox obj2) {
|
||||
|
||||
List<BoundingBox> lObjs = findObjectsBetween(obj1, obj2);
|
||||
return !(lObjs.isEmpty());
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Adds object to the plane
|
||||
*
|
||||
* @param obj object
|
||||
* @return document plane
|
||||
*/
|
||||
public DocumentPlane add(BoundingBox obj) {
|
||||
|
||||
int objsBefore = this.objs.size();
|
||||
/*
|
||||
* iterate over grid squares
|
||||
*/
|
||||
for (int y = ((int) obj.getY()) / gridSize; y <= ((int) (obj.getY() + obj.getHeight() + gridSize - 1)) / gridSize; ++y) {
|
||||
for (int x = ((int) obj.getX()) / gridSize; x <= ((int) (obj.getX() + obj.getWidth() + gridSize - 1)) / gridSize; ++x) {
|
||||
GridXY xy = new GridXY(x, y);
|
||||
if (!grid.keySet().contains(xy)) {
|
||||
/*
|
||||
* add the non-existing key
|
||||
*/
|
||||
grid.put(xy, new ArrayList<BoundingBox>());
|
||||
grid.get(xy).add(obj);
|
||||
assert grid.get(xy).size() == 1;
|
||||
} else {
|
||||
grid.get(xy).add(obj);
|
||||
}
|
||||
}
|
||||
}
|
||||
objs.add(obj);
|
||||
/*
|
||||
* size of the object list should be incremented
|
||||
*/
|
||||
assert objsBefore + 1 == objs.size();
|
||||
/*
|
||||
* object list must contain the same number of objects as object dictionary
|
||||
*/
|
||||
assert objs.size() == elementsInGrid();
|
||||
return this;
|
||||
}
|
||||
|
||||
|
||||
public DocumentPlane remove(BoundingBox obj) {
|
||||
/*
|
||||
* iterate over grid squares
|
||||
*/
|
||||
for (int y = ((int) obj.getY()) / gridSize; y <= ((int) (obj.getY() + obj.getHeight() + gridSize - 1)) / gridSize; ++y) {
|
||||
for (int x = ((int) obj.getX()) / gridSize; x <= ((int) (obj.getX() + obj.getWidth() + gridSize - 1)) / gridSize; ++x) {
|
||||
GridXY xy = new GridXY(x, y);
|
||||
if (grid.get(xy).contains(obj)) {
|
||||
grid.get(xy).remove(obj);
|
||||
}
|
||||
}
|
||||
}
|
||||
objs.remove(obj);
|
||||
assert objs.size() == elementsInGrid();
|
||||
return this;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Find objects within search bounds
|
||||
*
|
||||
* @param searchBounds is a search rectangle
|
||||
* @return list of objects in!side search rectangle
|
||||
*/
|
||||
public List<BoundingBox> find(Rectangle2D searchBounds) {
|
||||
|
||||
List<BoundingBox> done = new ArrayList<BoundingBox>(); //contains already considered objects (wrt. optimization)
|
||||
List<BoundingBox> ret = new ArrayList<BoundingBox>();
|
||||
double x0 = searchBounds.getX();
|
||||
double y0 = searchBounds.getY();
|
||||
double y1 = searchBounds.getY() + searchBounds.getHeight();
|
||||
double x1 = searchBounds.getX() + searchBounds.getWidth();
|
||||
/*
|
||||
* iterate over grid squares
|
||||
*/
|
||||
for (int y = (int) y0 / gridSize; y < ((int) (y1 + gridSize - 1)) / gridSize; ++y) {
|
||||
for (int x = (int) x0 / gridSize; x < ((int) (x1 + gridSize - 1)) / gridSize; ++x) {
|
||||
GridXY xy = new GridXY(x, y);
|
||||
if (!grid.containsKey(xy)) {
|
||||
continue;
|
||||
}
|
||||
for (BoundingBox obj : grid.get(xy)) {
|
||||
if (done.contains(obj)) /*
|
||||
* omit if already checked
|
||||
*/ {
|
||||
continue;
|
||||
}
|
||||
/*
|
||||
* add to the checked objects
|
||||
*/
|
||||
done.add(obj);
|
||||
/*
|
||||
* check if two objects overlap
|
||||
*/
|
||||
if (obj.getX() + obj.getWidth() <= x0 || x1 <= obj.getX() || obj.getY() + obj.getHeight() <= y0 || y1 <= obj.getY()) {
|
||||
continue;
|
||||
}
|
||||
ret.add(obj);
|
||||
}
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Count objects stored in objects dictionary
|
||||
*
|
||||
* @return number of elements
|
||||
*/
|
||||
protected int elementsInGrid() {
|
||||
|
||||
List<BoundingBox> objs_ = new ArrayList<BoundingBox>();
|
||||
for (GridXY coord : grid.keySet()) {
|
||||
for (BoundingBox obj : grid.get(coord)) {
|
||||
if (!objs_.contains(obj)) {
|
||||
objs_.add(obj);
|
||||
}
|
||||
}
|
||||
}
|
||||
return objs_.size();
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,29 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.readingorder;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Zone;
|
||||
|
||||
public class TreeToListConverter {
|
||||
|
||||
public List<Zone> convertToList(BoundingBoxZoneGroup obj) {
|
||||
|
||||
List<Zone> ret = new ArrayList<>();
|
||||
if (obj.getLeftChild() instanceof Zone) {
|
||||
Zone zone = (Zone) obj.getLeftChild();
|
||||
ret.add(zone);
|
||||
} else { // obj.getLeftChild() instanceof BxZoneGroup
|
||||
ret.addAll(convertToList((BoundingBoxZoneGroup) obj.getLeftChild()));
|
||||
}
|
||||
|
||||
if (obj.getRightChild() instanceof Zone) {
|
||||
Zone zone = (Zone) obj.getRightChild();
|
||||
ret.add(zone);
|
||||
} else { // obj.getRightChild() instanceof BxZoneGroup
|
||||
ret.addAll(convertToList((BoundingBoxZoneGroup) obj.getRightChild()));
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,50 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.AngleFilter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Character;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.DisjointSets;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Line;
|
||||
|
||||
@Service
|
||||
public class LineBuilderService {
|
||||
|
||||
private static final double CHARACTER_SPACING_DISTANCE_MULTIPLIER = 3.5;
|
||||
private static final double MAX_VERTICAL_CHARACTER_DISTANCE = 0.67;
|
||||
private static final double ANGLE_TOLERANCE = Math.PI / 6;
|
||||
|
||||
|
||||
public List<Line> buildLines(List<Character> characters, double characterSpacing, double lineSpacing) {
|
||||
|
||||
double maxHorizontalDistance = characterSpacing * CHARACTER_SPACING_DISTANCE_MULTIPLIER;
|
||||
double maxVerticalDistance = lineSpacing * MAX_VERTICAL_CHARACTER_DISTANCE;
|
||||
|
||||
DisjointSets<Character> sets = new DisjointSets<>(characters);
|
||||
AngleFilter filter = AngleFilter.newInstance(-ANGLE_TOLERANCE, ANGLE_TOLERANCE);
|
||||
|
||||
characters.forEach(character -> {
|
||||
character.getNeighbors().forEach(neighbor -> {
|
||||
double x = neighbor.getHorizontalDistance() / maxHorizontalDistance;
|
||||
double y = neighbor.getVerticalDistance() / maxVerticalDistance;
|
||||
if (filter.matches(neighbor) && Math.pow(x, 2) + Math.pow(y, 2) <= 1) {
|
||||
sets.union(character, neighbor.getCharacter());
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
List<Line> lines = new ArrayList<>();
|
||||
sets.forEach(group -> {
|
||||
List<Character> lineComponents = new ArrayList<>(group);
|
||||
lineComponents.sort(Comparator.comparingDouble(Character::getX));
|
||||
lines.add(new Line(lineComponents, characterSpacing));
|
||||
});
|
||||
|
||||
return lines;
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,78 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Character;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Neighbor;
|
||||
|
||||
@Service
|
||||
public class NearestNeighbourService {
|
||||
|
||||
private static final int NUMBER_OF_NEIGHBOURS = 8;
|
||||
private static final double STEP = 16.0;
|
||||
|
||||
|
||||
public void findNearestNeighbors(List<Character> characters) {
|
||||
|
||||
if (characters.isEmpty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
characters.sort(Comparator.comparingDouble(Character::getX));
|
||||
|
||||
int maxNeighborCount = NUMBER_OF_NEIGHBOURS;
|
||||
if (characters.size() <= NUMBER_OF_NEIGHBOURS) {
|
||||
maxNeighborCount = characters.size() - 1;
|
||||
}
|
||||
|
||||
for (int i = 0; i < characters.size(); i++) {
|
||||
|
||||
List<Neighbor> candidates = new ArrayList<>();
|
||||
|
||||
int start = i;
|
||||
int end = i + 1;
|
||||
|
||||
double distance = Double.POSITIVE_INFINITY;
|
||||
|
||||
for (double searchDistance = 0; searchDistance < distance; ) {
|
||||
|
||||
searchDistance += STEP;
|
||||
boolean newCandidatesFound = false;
|
||||
|
||||
while (start > 0 && characters.get(i).getX() - characters.get(start - 1).getX() < searchDistance) {
|
||||
start--;
|
||||
candidates.add(new Neighbor(characters.get(start), characters.get(i)));
|
||||
clearLeastDistant(candidates, maxNeighborCount);
|
||||
newCandidatesFound = true;
|
||||
}
|
||||
|
||||
while (end < characters.size() && characters.get(end).getX() - characters.get(i).getX() < searchDistance) {
|
||||
candidates.add(new Neighbor(characters.get(end), characters.get(i)));
|
||||
clearLeastDistant(candidates, maxNeighborCount);
|
||||
end++;
|
||||
newCandidatesFound = true;
|
||||
}
|
||||
|
||||
if (newCandidatesFound && candidates.size() >= maxNeighborCount) {
|
||||
distance = candidates.get(maxNeighborCount - 1).getDistance();
|
||||
}
|
||||
}
|
||||
clearLeastDistant(candidates, maxNeighborCount);
|
||||
characters.get(i).setNeighbors(new ArrayList<>(candidates));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void clearLeastDistant(List<Neighbor> candidates, int maxNeighborCount) {
|
||||
|
||||
if (candidates.size() > maxNeighborCount) {
|
||||
candidates.sort(Comparator.comparingDouble(Neighbor::getDistance));
|
||||
candidates.remove(candidates.remove(candidates.size() - 1));
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,286 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.BoundingBox;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Zone;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.readingorder.BoundingBoxZoneGroup;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.readingorder.DistElem;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.readingorder.DocumentPlane;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.readingorder.TreeToListConverter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.utils.DoubleUtils;
|
||||
|
||||
@Service
|
||||
public class ReadingOrderService {
|
||||
|
||||
static final int GRIDSIZE = 50;
|
||||
static final double EPS = 0.01;
|
||||
static final int MAX_ZONES = 1000;
|
||||
static final Comparator<BoundingBox> Y_ASCENDING_ORDER = new Comparator<BoundingBox>() {
|
||||
|
||||
@Override
|
||||
public int compare(BoundingBox o1, BoundingBox o2) {
|
||||
|
||||
return DoubleUtils.compareDouble(o1.getY(), o2.getY(), EPS);
|
||||
}
|
||||
};
|
||||
|
||||
static final Comparator<BoundingBox> X_ASCENDING_ORDER = new Comparator<BoundingBox>() {
|
||||
|
||||
@Override
|
||||
public int compare(BoundingBox o1, BoundingBox o2) {
|
||||
|
||||
return DoubleUtils.compareDouble(o1.getX(), o2.getX(), EPS);
|
||||
}
|
||||
};
|
||||
|
||||
static final Comparator<BoundingBox> YX_ASCENDING_ORDER = new Comparator<BoundingBox>() {
|
||||
|
||||
@Override
|
||||
public int compare(BoundingBox o1, BoundingBox o2) {
|
||||
|
||||
int yCompare = Y_ASCENDING_ORDER.compare(o1, o2);
|
||||
return yCompare == 0 ? X_ASCENDING_ORDER.compare(o1, o2) : yCompare;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
public List<Zone> resolve(List<Zone> zones) {
|
||||
|
||||
List<Zone> orderedZones;
|
||||
if (zones.size() > MAX_ZONES) {
|
||||
orderedZones = new ArrayList<>(zones);
|
||||
Collections.sort(orderedZones, YX_ASCENDING_ORDER);
|
||||
} else {
|
||||
orderedZones = reorderZones(zones);
|
||||
}
|
||||
return orderedZones;
|
||||
}
|
||||
|
||||
|
||||
private List<Zone> reorderZones(List<Zone> unorderedZones) {
|
||||
|
||||
if (unorderedZones.isEmpty()) {
|
||||
return new ArrayList<>();
|
||||
} else if (unorderedZones.size() == 1) {
|
||||
List<Zone> ret = new ArrayList<>(1);
|
||||
ret.add(unorderedZones.get(0));
|
||||
return ret;
|
||||
} else {
|
||||
BoundingBoxZoneGroup bxZonesTree = groupZonesHierarchically(unorderedZones);
|
||||
sortGroupedZones(bxZonesTree);
|
||||
TreeToListConverter treeConverter = new TreeToListConverter();
|
||||
List<Zone> orderedZones = treeConverter.convertToList(bxZonesTree);
|
||||
assert unorderedZones.size() == orderedZones.size();
|
||||
return orderedZones;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Builds a binary tree of zones and groups of zones from a list of unordered zones. This is done in hierarchical
|
||||
* clustering by joining two least distant nodes. Distance is calculated in the distance() method.
|
||||
*
|
||||
* @param zones is a list of unordered zones
|
||||
* @return root of the zones clustered in a tree
|
||||
*/
|
||||
private BoundingBoxZoneGroup groupZonesHierarchically(List<Zone> zones) {
|
||||
/*
|
||||
* Distance tuples are stored sorted by ascending distance value
|
||||
*/
|
||||
List<DistElem<BoundingBox>> dists = new ArrayList<DistElem<BoundingBox>>(zones.size() * zones.size() / 2);
|
||||
for (int idx1 = 0; idx1 < zones.size(); ++idx1) {
|
||||
for (int idx2 = idx1 + 1; idx2 < zones.size(); ++idx2) {
|
||||
Zone zone1 = zones.get(idx1);
|
||||
Zone zone2 = zones.get(idx2);
|
||||
dists.add(new DistElem<BoundingBox>(false, distance(zone1, zone2), zone1, zone2));
|
||||
}
|
||||
}
|
||||
Collections.sort(dists);
|
||||
DocumentPlane plane = new DocumentPlane(zones, GRIDSIZE);
|
||||
while (!dists.isEmpty()) {
|
||||
DistElem<BoundingBox> distElem = dists.get(0);
|
||||
dists.remove(0);
|
||||
if (!distElem.isC() && plane.anyObjectsBetween(distElem.getObj1(), distElem.getObj2())) {
|
||||
dists.add(new DistElem<BoundingBox>(true, distElem.getDist(), distElem.getObj1(), distElem.getObj2()));
|
||||
continue;
|
||||
}
|
||||
BoundingBoxZoneGroup newGroup = new BoundingBoxZoneGroup(distElem.getObj1(), distElem.getObj2());
|
||||
plane.remove(distElem.getObj1()).remove(distElem.getObj2());
|
||||
dists = removeDistElementsContainingObject(dists, distElem.getObj1());
|
||||
dists = removeDistElementsContainingObject(dists, distElem.getObj2());
|
||||
for (BoundingBox other : plane.getObjects()) {
|
||||
dists.add(new DistElem<BoundingBox>(false, distance(other, newGroup), newGroup, other));
|
||||
}
|
||||
Collections.sort(dists);
|
||||
plane.add(newGroup);
|
||||
}
|
||||
|
||||
assert plane.getObjects().size() == 1 : "There should be one object left at the plane after grouping";
|
||||
return (BoundingBoxZoneGroup) plane.getObjects().get(0);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Removes all distance tuples containing obj
|
||||
*/
|
||||
private List<DistElem<BoundingBox>> removeDistElementsContainingObject(Collection<DistElem<BoundingBox>> list, BoundingBox obj) {
|
||||
|
||||
List<DistElem<BoundingBox>> ret = new ArrayList<DistElem<BoundingBox>>();
|
||||
for (DistElem<BoundingBox> distElem : list) {
|
||||
if (distElem.getObj1() != obj && distElem.getObj2() != obj) {
|
||||
ret.add(distElem);
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Swaps children of BxZoneGroup if necessary. A group with smaller sort factor is placed to the left (leftChild).
|
||||
* An object with greater sort factor is placed on the right (rightChild). This plays an important role when
|
||||
* traversing the tree in conversion to a one dimensional list.
|
||||
*
|
||||
* @param group
|
||||
*/
|
||||
private void sortGroupedZones(BoundingBoxZoneGroup group) {
|
||||
|
||||
BoundingBox leftChild = group.getLeftChild();
|
||||
BoundingBox rightChild = group.getRightChild();
|
||||
if (shouldBeSwapped(leftChild, rightChild)) {
|
||||
// swap
|
||||
group.setLeftChild(rightChild);
|
||||
group.setRightChild(leftChild);
|
||||
}
|
||||
|
||||
if (leftChild instanceof BoundingBoxZoneGroup) // if the child is a tree node, then recurse
|
||||
{
|
||||
sortGroupedZones((BoundingBoxZoneGroup) leftChild);
|
||||
}
|
||||
if (rightChild instanceof BoundingBoxZoneGroup) // as above - recurse
|
||||
{
|
||||
sortGroupedZones((BoundingBoxZoneGroup) rightChild);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private boolean shouldBeSwapped(BoundingBox first, BoundingBox second) {
|
||||
|
||||
double cx, cy, cw, ch, ox, oy, ow, oh;
|
||||
cx = first.getBBox().getX();
|
||||
cy = first.getBBox().getY();
|
||||
cw = first.getBBox().getWidth();
|
||||
ch = first.getBBox().getHeight();
|
||||
|
||||
ox = second.getBBox().getX();
|
||||
oy = second.getBBox().getY();
|
||||
ow = second.getBBox().getWidth();
|
||||
oh = second.getBBox().getHeight();
|
||||
|
||||
// Determine Octant
|
||||
//
|
||||
// 0 | 1 | 2
|
||||
// __|___|__
|
||||
// 7 | 9 | 3 First is placed in 9th square
|
||||
// __|___|__
|
||||
// 6 | 5 | 4
|
||||
|
||||
if (cx + cw <= ox) { //2,3,4
|
||||
return false;
|
||||
} else if (ox + ow <= cx) { //0,6,7
|
||||
return true; //6
|
||||
} else if (cy + ch <= oy) {
|
||||
return false; //5
|
||||
} else if (oy + oh <= cy) {
|
||||
return true; //1
|
||||
} else { //two zones
|
||||
double xdiff = ox + ow / 2 - cx - cw / 2;
|
||||
double ydiff = oy + oh / 2 - cy - ch / 2;
|
||||
return xdiff + ydiff < 0;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* A distance function between two TextBoxes.
|
||||
* <p>
|
||||
* Consider the bounding rectangle for obj1 and obj2. Return its area minus the areas of obj1 and obj2, shown as
|
||||
* 'www' below. This value may be negative. (x0,y0) +------+..........+ | obj1 |wwwwwwwwww: +------+www+------+
|
||||
* :wwwwwwwwww| obj2 | +..........+------+ (x1,y1)
|
||||
*
|
||||
* @return distance value based on objects' coordinates and physical size on a plane
|
||||
*/
|
||||
private double distance(BoundingBox obj1, BoundingBox obj2) {
|
||||
|
||||
double x0 = Math.min(obj1.getX(), obj2.getX());
|
||||
double y0 = Math.min(obj1.getY(), obj2.getY());
|
||||
double x1 = Math.max(obj1.getX() + obj1.getWidth(), obj2.getX() + obj2.getWidth());
|
||||
double y1 = Math.max(obj1.getY() + obj1.getHeight(), obj2.getY() + obj2.getHeight());
|
||||
double dist = ((x1 - x0) * (y1 - y0) - obj1.getArea() - obj2.getArea());
|
||||
|
||||
double factor = ((x1 - x0)/x1) / ((y1 - y0)/y1);
|
||||
|
||||
double obj1X = obj1.getX();
|
||||
double obj1Y_2 = obj1.getBBox().getMaxY();
|
||||
double obj1X_2 = obj1.getBBox().getMaxX();
|
||||
double obj1CenterX = obj1.getBBox().getCenterX();
|
||||
double obj1CenterY = obj1.getBBox().getCenterY();
|
||||
double obj2X = obj2.getX();
|
||||
double obj2Y_2 = obj2.getBBox().getMaxY();
|
||||
double obj2X_2 = obj2.getBBox().getMaxX();
|
||||
double obj2CenterX = obj2.getBBox().getCenterX();
|
||||
double obj2CenterY = obj2.getBBox().getCenterY();
|
||||
|
||||
double obj1obj2VectorCosineAbsLeft = Math.abs((obj2X - obj1X) / Math.sqrt((obj2X - obj1X) * (obj2X - obj1X) + (obj2CenterY - obj1CenterY) * (obj2CenterY - obj1CenterY)));
|
||||
double obj1obj2VectorCosineAbsRight = Math.abs((obj2X_2 - obj1X_2) / Math.sqrt((obj2X_2 - obj1X_2) * (obj2X_2 - obj1X_2) + (obj2CenterY - obj1CenterY) * (obj2CenterY - obj1CenterY)));
|
||||
double obj1obj2VectorCosineAbsCenter = Math.abs((obj2CenterX - obj1CenterX) / Math.sqrt((obj2CenterX - obj1CenterX) * (obj2CenterX - obj1CenterX) + (obj2CenterY - obj1CenterY) * (obj2CenterY - obj1CenterY)));
|
||||
|
||||
double cosine = Math.min(obj1obj2VectorCosineAbsLeft, Math.min(obj1obj2VectorCosineAbsRight, obj1obj2VectorCosineAbsCenter));
|
||||
|
||||
final double MAGIC_COEFF = 0.85;
|
||||
//return dist * (MAGIC_COEFF + cosine);
|
||||
|
||||
return Math.sqrt(Math.pow((obj1X - obj2X), 2) + Math.pow((obj1Y_2 - obj2Y_2) * MAGIC_COEFF, 2));
|
||||
|
||||
|
||||
/**if (Math.abs(obj1CenterX - obj2CenterX) >= Math.abs(obj1CenterY - obj2CenterY)) {
|
||||
return dist * 2;
|
||||
} else {
|
||||
return dist;
|
||||
}**/
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
private double distanceNew(BoundingBox obj1, BoundingBox obj2) {
|
||||
|
||||
if(obj1.getBBox().intersects(obj2.getBBox()))
|
||||
return -1;
|
||||
|
||||
double minX0 = Math.min(obj1.getX(), obj2.getX());
|
||||
double maxX0 = Math.max(obj1.getX(), obj2.getX());
|
||||
double minY0 = Math.min(obj1.getY(), obj2.getY());
|
||||
double maxY0 = Math.max(obj1.getY(), obj2.getY());
|
||||
double minX1 = Math.min(obj1.getX() + obj1.getWidth(), obj2.getX() + obj2.getWidth());
|
||||
double maxX1 = Math.max(obj1.getX() + obj1.getWidth(), obj2.getX() + obj2.getWidth());
|
||||
double minY1 = Math.min(obj1.getY() + obj1.getHeight(), obj2.getY() + obj2.getHeight());
|
||||
double maxY1 = Math.max(obj1.getY() + obj1.getHeight(), obj2.getY() + obj2.getHeight());
|
||||
List<Double> xValues = new ArrayList<>(List.of(minX0, maxX0, minX1, maxX1));
|
||||
Collections.sort(xValues);
|
||||
List<Double> yValues = new ArrayList<>(List.of(minY0, maxY0, minY1, maxY1));
|
||||
Collections.sort(yValues);
|
||||
|
||||
double yArea = (xValues.get(2) - xValues.get(1)) * (yValues.get(3) - yValues.get(0));
|
||||
double xArea = (yValues.get(2) - yValues.get(1)) * (xValues.get(3) - xValues.get(0));
|
||||
|
||||
return Math.min(10*yArea, xArea);
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,56 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.AngleFilter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Character;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Histogram;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Neighbor;
|
||||
|
||||
@Service
|
||||
public class SpacingService {
|
||||
|
||||
private static final double SPACING_HISTOGRAM_RESOLUTION = 0.5;
|
||||
private static final double SPACING_HISTOGRAM_SMOOTHING_LENGTH = 2.5;
|
||||
private static final double SPACING_HIST_SMOOTHING_STANDARD_DEVIATION = 0.5;
|
||||
private static final double ANGLE_TOLERANCE = Math.PI / 6;
|
||||
|
||||
|
||||
public double computeCharacterSpacing(List<Character> components) {
|
||||
|
||||
return computeSpacing(components, 0);
|
||||
}
|
||||
|
||||
|
||||
public double computeLineSpacing(List<Character> components) {
|
||||
|
||||
return computeSpacing(components, Math.PI / 2);
|
||||
}
|
||||
|
||||
|
||||
private double computeSpacing(List<Character> components, double angle) {
|
||||
|
||||
double maxDistance = Double.NEGATIVE_INFINITY;
|
||||
|
||||
for (Character component : components) {
|
||||
for (Neighbor neighbor : component.getNeighbors()) {
|
||||
maxDistance = Math.max(maxDistance, neighbor.getDistance());
|
||||
}
|
||||
}
|
||||
Histogram histogram = new Histogram(0, maxDistance, SPACING_HISTOGRAM_RESOLUTION);
|
||||
AngleFilter filter = AngleFilter.newInstance(angle - ANGLE_TOLERANCE, angle + ANGLE_TOLERANCE);
|
||||
for (Character component : components) {
|
||||
for (Neighbor neighbor : component.getNeighbors()) {
|
||||
if (filter.matches(neighbor)) {
|
||||
histogram.add(neighbor.getDistance());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
histogram.gaussianSmooth(SPACING_HISTOGRAM_SMOOTHING_LENGTH, SPACING_HIST_SMOOTHING_STANDARD_DEVIATION);
|
||||
return histogram.getPeakValue();
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,94 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.DisjointSets;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Line;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Zone;
|
||||
|
||||
@Service
|
||||
public class ZoneBuilderService {
|
||||
|
||||
private static final double MIN_HORIZONTAL_DISTANCE_MULTIPLIER = -0.5;
|
||||
private static final double MAX_VERTICAL_DISTANCE_MULTIPLIER = 1.2;
|
||||
|
||||
private static final double MIN_HORIZONTAL_MERGE_DISTANCE_MULTIPLIER = -3.0;
|
||||
|
||||
private static final double MAX_VERTICAL_MERGE_DISTANCE_MULTIPLIER = 0.5;
|
||||
|
||||
private static final double MIN_LINE_SIZE_SCALE = 0.9;
|
||||
|
||||
private static final double MAX_LINE_SIZE_SCALE = 2.5;
|
||||
|
||||
private static final double ANGLE_TOLERANCE = Math.PI / 6;
|
||||
|
||||
public static final int MAX_ZONES = 300;
|
||||
|
||||
|
||||
public List<Zone> buildZones(List<Line> lines, double characterSpacing, double lineSpacing) {
|
||||
|
||||
double minHorizontalDistance = characterSpacing * MIN_HORIZONTAL_DISTANCE_MULTIPLIER;
|
||||
double maxVerticalDistance = lineSpacing * MAX_VERTICAL_DISTANCE_MULTIPLIER;
|
||||
double minHorizontalMergeDistance = characterSpacing * MIN_HORIZONTAL_MERGE_DISTANCE_MULTIPLIER;
|
||||
double maxVerticalMergeDistance = lineSpacing * MAX_VERTICAL_MERGE_DISTANCE_MULTIPLIER;
|
||||
|
||||
DisjointSets<Line> sets = new DisjointSets<>(lines);
|
||||
|
||||
double meanHeight = calculateMeanHeight(lines);
|
||||
|
||||
lines.forEach(outerLine -> //
|
||||
lines.forEach(innerLine -> {
|
||||
double scale = Math.min(outerLine.getHeight(), innerLine.getHeight()) / meanHeight;
|
||||
scale = Math.max(MIN_LINE_SIZE_SCALE, Math.min(scale, MAX_LINE_SIZE_SCALE));
|
||||
|
||||
if (!sets.areTogether(outerLine, innerLine) && outerLine.angularDifference(innerLine) <= ANGLE_TOLERANCE) {
|
||||
|
||||
double horizontalDistance = outerLine.horizontalDistance(innerLine) / scale;
|
||||
double verticalDistance = outerLine.verticalDistance(innerLine) / scale;
|
||||
|
||||
// Line over or above
|
||||
if (minHorizontalDistance <= horizontalDistance && verticalDistance <= maxVerticalDistance) {
|
||||
sets.union(outerLine, innerLine);
|
||||
}
|
||||
|
||||
// Split line that needs later merging
|
||||
else if (minHorizontalMergeDistance <= horizontalDistance && verticalDistance <= maxVerticalMergeDistance) {
|
||||
sets.union(outerLine, innerLine);
|
||||
}
|
||||
}
|
||||
}));
|
||||
|
||||
List<Zone> zones = new ArrayList<>();
|
||||
sets.forEach(group -> {
|
||||
zones.add(new Zone(new ArrayList<>(group)));
|
||||
});
|
||||
|
||||
if (zones.size() > MAX_ZONES) {
|
||||
List<Line> oneZoneLines = new ArrayList<>();
|
||||
for (Zone zone : zones) {
|
||||
oneZoneLines.addAll(zone.getLines());
|
||||
}
|
||||
return List.of(new Zone(oneZoneLines));
|
||||
}
|
||||
|
||||
return zones;
|
||||
}
|
||||
|
||||
|
||||
private double calculateMeanHeight(List<Line> lines) {
|
||||
|
||||
double meanHeight = 0.0;
|
||||
double weights = 0.0;
|
||||
for (Line line : lines) {
|
||||
double weight = line.getLength();
|
||||
meanHeight += line.getHeight() * weight;
|
||||
weights += weight;
|
||||
}
|
||||
meanHeight /= weights;
|
||||
return meanHeight;
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,18 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.utils;
|
||||
|
||||
/**
 * Helpers for comparing floating point values with a tolerance.
 */
public class DoubleUtils {

    /**
     * Compares two doubles after snapping each of them to the nearest multiple of {@code precision}.
     * <p>
     * Two values that round to the same multiple compare as equal. If either value is NaN the
     * result is that of {@link Double#compare(double, double)}.
     *
     * @param d1        first value
     * @param d2        second value
     * @param precision grid size used for rounding; {@code 0} is treated as {@code 1}, and only the
     *                  magnitude is used so that a negative value cannot invert the ordering
     * @return a negative value, zero or a positive value if {@code d1} is less than, equal to or
     *         greater than {@code d2} at the given precision
     */
    public static int compareDouble(double d1, double d2, double precision) {

        if (Double.isNaN(d1) || Double.isNaN(d2)) {
            return Double.compare(d1, d2);
        }

        // Dividing by a negative step would flip the sign of both quotients and reverse the result.
        double step = precision == 0 ? 1 : Math.abs(precision);

        return Long.compare(Math.round(d1 / step), Math.round(d2 / step));
    }

}
|
||||
@ -0,0 +1,270 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.visualization;
|
||||
|
||||
import java.awt.Color;
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.PDPageContentStream;
|
||||
import org.apache.pdfbox.pdmodel.font.PDType1Font;
|
||||
import org.apache.pdfbox.pdmodel.font.Standard14Fonts;
|
||||
import org.apache.pdfbox.util.Matrix;
|
||||
import org.springframework.core.io.ClassPathResource;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.PdfVisualisationUtility;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Getter;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class PdfDraw {
|
||||
|
||||
public static void drawRectanglesPerPage(String filename, List<List<Rectangle2D>> rectanglesPerPage, String tmpFileName) throws IOException {
|
||||
|
||||
ClassPathResource pdfResource = new ClassPathResource(filename);
|
||||
try (PDDocument pdDocument = Loader.loadPDF(pdfResource.getFile()); var out = new FileOutputStream(tmpFileName)) {
|
||||
for (int pageNumber = 1; pageNumber < pdDocument.getNumberOfPages() + 1; pageNumber++) {
|
||||
PdfVisualisationUtility.drawRectangle2DList(pdDocument,
|
||||
pageNumber,
|
||||
rectanglesPerPage.get(pageNumber - 1),
|
||||
PdfVisualisationUtility.Options.builder().stroke(true).build());
|
||||
}
|
||||
pdDocument.save(out);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
public static void drawRectanglesPerPageNumberedByLine(String filename, List<List<List<Rectangle2D>>> rectanglesPerPage, String tmpFileName) throws IOException {
|
||||
|
||||
ClassPathResource pdfResource = new ClassPathResource(filename);
|
||||
try (PDDocument pdDocument = Loader.loadPDF(pdfResource.getFile()); var out = new FileOutputStream(tmpFileName)) {
|
||||
for (int pageNumber = 1; pageNumber < pdDocument.getNumberOfPages() + 1; pageNumber++) {
|
||||
var rectanglesOnPage = rectanglesPerPage.get(pageNumber - 1);
|
||||
for (int lineNumber = 0; lineNumber < rectanglesOnPage.size(); lineNumber++) {
|
||||
var rectanglesInLine = rectanglesOnPage.get(lineNumber);
|
||||
PdfVisualisationUtility.drawRectangle2DList(pdDocument, pageNumber, rectanglesInLine, PdfVisualisationUtility.Options.builder().stroke(true).build());
|
||||
double y = Math.min(rectanglesInLine.get(0).getMinY(), rectanglesInLine.get(0).getMaxY());
|
||||
PdfVisualisationUtility.drawText(String.format("%d", lineNumber),
|
||||
pdDocument,
|
||||
new Point2D.Double(rectanglesInLine.get(0).getX() - (5 + (5 * countNumberOfDigits(lineNumber))), y + 2),
|
||||
pageNumber,
|
||||
PdfVisualisationUtility.Options.builder().stroke(true).build());
|
||||
}
|
||||
}
|
||||
pdDocument.save(out);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
private static int countNumberOfDigits(int num) {
|
||||
|
||||
int final_num = num;
|
||||
if (final_num == 0) {
|
||||
return 1;
|
||||
}
|
||||
int count = 0;
|
||||
for (; final_num != 0; final_num /= 10) {
|
||||
count++;
|
||||
}
|
||||
return count;
|
||||
}
|
||||
|
||||
|
||||
public static void drawDocumentGraph(PDDocument document, Document documentGraph) {
|
||||
|
||||
documentGraph.getDocumentTree().allEntriesInOrder().forEach(entry -> drawNode(document, entry));
|
||||
}
|
||||
|
||||
|
||||
public static void drawNode(PDDocument document, DocumentTree.Entry entry) {
|
||||
|
||||
Options options = buildStandardOptionsForNodes(entry);
|
||||
|
||||
drawBBoxAndLabelAndNumberOnPage(document, entry, options);
|
||||
|
||||
}
|
||||
|
||||
|
||||
public static void drawTextBlock(PDDocument document, TextBlock textBlock, Options options) {
|
||||
|
||||
textBlock.getAtomicTextBlocks().forEach(atb -> drawAtomicTextBlock(document, atb, options));
|
||||
}
|
||||
|
||||
|
||||
public static void drawAtomicTextBlock(PDDocument document, AtomicTextBlock atomicTextBlock, Options options) {
|
||||
|
||||
drawRectangle2DList(document, atomicTextBlock.getPage().getNumber(), atomicTextBlock.getPositions().stream().toList(), options);
|
||||
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private static void drawText(String string, PDDocument document, Point2D location, Integer pageNumber, Options options, boolean rotate) {
|
||||
|
||||
var pdPage = document.getPage(pageNumber - 1);
|
||||
var contentStream = new PDPageContentStream(document, pdPage, PDPageContentStream.AppendMode.APPEND, true);
|
||||
|
||||
contentStream.setNonStrokingColor(options.getStrokeColor());
|
||||
contentStream.setLineWidth(options.getStrokeWidth());
|
||||
|
||||
contentStream.beginText();
|
||||
if (rotate) {
|
||||
contentStream.setTextMatrix(Matrix.getRotateInstance(Math.toRadians(15), (float) location.getX(), (float) location.getY()));
|
||||
} else {
|
||||
contentStream.newLineAtOffset((float) location.getX(), (float) location.getY());
|
||||
}
|
||||
contentStream.setFont(new PDType1Font(Standard14Fonts.FontName.HELVETICA), 10);
|
||||
contentStream.showText(string);
|
||||
contentStream.endText();
|
||||
contentStream.close();
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public static void drawRectangle2DList(PDDocument document, int pageNumber, List<Rectangle2D> rectCollection, Options options) {
|
||||
|
||||
var pdPage = document.getPage(pageNumber - 1);
|
||||
drawRectangle2DList(document, rectCollection, options, pdPage);
|
||||
}
|
||||
|
||||
|
||||
private static void drawRectangle2DList(PDDocument document, List<Rectangle2D> rectCollection, Options options, PDPage pdPage) throws IOException {
|
||||
|
||||
var contentStream = new PDPageContentStream(document, pdPage, PDPageContentStream.AppendMode.APPEND, true);
|
||||
|
||||
contentStream.setStrokingColor(options.getStrokeColor());
|
||||
contentStream.setNonStrokingColor(options.getFillColor());
|
||||
contentStream.setLineWidth(options.getStrokeWidth());
|
||||
|
||||
for (var r : rectCollection) {
|
||||
contentStream.addRect((float) r.getMinX(), (float) r.getMinY(), (float) r.getWidth(), (float) r.getHeight());
|
||||
|
||||
if (options.isStroke() && options.isFill()) {
|
||||
contentStream.fillAndStroke();
|
||||
} else if (options.isStroke()) {
|
||||
contentStream.stroke();
|
||||
} else if (options.isFill()) {
|
||||
contentStream.fill();
|
||||
}
|
||||
}
|
||||
contentStream.close();
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public static void drawRectanglesAndLinesPerPage(String filename, List<List<Rectangle2D>> list, List<List<Rectangle2D>> rectanglesPerPage, String tmpFileName) {
|
||||
|
||||
ClassPathResource pdfResource = new ClassPathResource(filename);
|
||||
try (PDDocument pdDocument = Loader.loadPDF(pdfResource.getFile()); var out = new FileOutputStream(tmpFileName)) {
|
||||
for (int pageNumber = 1; pageNumber < pdDocument.getNumberOfPages() + 1; pageNumber++) {
|
||||
// PdfVisualisationUtility.drawLine2DList(pdDocument,
|
||||
// pageNumber,
|
||||
// list.get(pageNumber - 1),
|
||||
// PdfVisualisationUtility.Options.builder().stroke(true).build());
|
||||
PdfVisualisationUtility.drawRectangle2DList(pdDocument,
|
||||
pageNumber,
|
||||
rectanglesPerPage.get(pageNumber - 1),
|
||||
PdfVisualisationUtility.Options.builder().stroke(true).build());
|
||||
PdfVisualisationUtility.drawRectangle2DList(pdDocument, pageNumber, list.get(pageNumber - 1), PdfVisualisationUtility.Options.builder().stroke(true).build());
|
||||
}
|
||||
pdDocument.save(out);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public static void drawLinesPerPage(String filename, List<List<Ruling>> linesPerPage, String tmpFileName) {
|
||||
|
||||
ClassPathResource pdfResource = new ClassPathResource(filename);
|
||||
try (PDDocument pdDocument = Loader.loadPDF(pdfResource.getFile()); var out = new FileOutputStream(tmpFileName)) {
|
||||
for (int pageNumber = 1; pageNumber < pdDocument.getNumberOfPages() + 1; pageNumber++) {
|
||||
PdfVisualisationUtility.drawLine2DList(pdDocument,
|
||||
pageNumber,
|
||||
linesPerPage.get(pageNumber - 1),
|
||||
PdfVisualisationUtility.Options.builder().strokeColor(Color.RED).stroke(true).build());
|
||||
}
|
||||
pdDocument.save(out);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@Getter
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public static class Options {
|
||||
|
||||
boolean stroke;
|
||||
@Builder.Default
|
||||
Color strokeColor = Color.BLACK;
|
||||
@Builder.Default
|
||||
float strokeWidth = 1f;
|
||||
|
||||
boolean fill;
|
||||
@Builder.Default
|
||||
Color fillColor = Color.BLACK;
|
||||
|
||||
}
|
||||
|
||||
|
||||
private static Options buildStandardOptionsForNodes(DocumentTree.Entry entry) {
|
||||
|
||||
return Options.builder().stroke(true).strokeColor(switch (entry.getType()) {
|
||||
case DOCUMENT -> Color.LIGHT_GRAY;
|
||||
case HEADER, FOOTER -> Color.GREEN;
|
||||
case PARAGRAPH -> Color.BLUE;
|
||||
case HEADLINE -> Color.RED;
|
||||
case SECTION -> Color.BLACK;
|
||||
case TABLE -> Color.ORANGE;
|
||||
case TABLE_CELL -> Color.GRAY;
|
||||
case IMAGE -> Color.MAGENTA;
|
||||
}).build();
|
||||
}
|
||||
|
||||
|
||||
private static void drawBBoxAndLabelAndNumberOnPage(PDDocument document, DocumentTree.Entry entry, Options options) {
|
||||
|
||||
Map<Page, Rectangle2D> rectanglesPerPage = entry.getNode().getBBox();
|
||||
for (Page page : rectanglesPerPage.keySet()) {
|
||||
Rectangle2D rectangle2D = rectanglesPerPage.get(page);
|
||||
if (entry.getType() == NodeType.SECTION) {
|
||||
rectangle2D = RectangleTransformations.pad(rectangle2D, 10, 10);
|
||||
}
|
||||
drawRectangle2DList(document, page.getNumber(), List.of(rectangle2D), options);
|
||||
drawText(buildString(entry),
|
||||
document,
|
||||
new Point2D.Double(rectangle2D.getMinX(), rectangle2D.getMaxY() + 2),
|
||||
page.getNumber(),
|
||||
options,
|
||||
entry.getType() == NodeType.TABLE_CELL);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private static String buildString(DocumentTree.Entry entry) {
|
||||
|
||||
return entry.getNode().getNumberOnPage() + ": " + entry.getTreeId() + ": " + entry.getType();
|
||||
}
|
||||
|
||||
}
|
||||
@ -25,7 +25,9 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
|
||||
@SneakyThrows
|
||||
public void testLayoutParserEndToEnd() {
|
||||
|
||||
prepareStorage("files/bdr/Wie weiter bei Kristeneinrichtungen.pdf");
|
||||
String s = "(";
|
||||
String s1 = ")";
|
||||
prepareStorage("files/Minimal Examples/WrongOrderPage1.pdf");
|
||||
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER);
|
||||
LayoutParsingFinishedEvent finishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest);
|
||||
Arrays.stream(finishedEvent.message().split("\n")).forEach(log::info);
|
||||
|
||||
@ -25,7 +25,7 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
||||
@SneakyThrows
|
||||
public void testViewerDocument() {
|
||||
|
||||
String fileName = "files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
|
||||
String fileName = "files/Plenarprotokoll 1 (keine Druchsache!) (1).pdf";
|
||||
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
|
||||
|
||||
var documentFile = new ClassPathResource(fileName).getFile();
|
||||
@ -35,9 +35,10 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
||||
Document document = buildGraph(fileName, LayoutParsingType.REDACT_MANAGER);
|
||||
long start = System.currentTimeMillis();
|
||||
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true);
|
||||
System.out.printf("Total time: %.2fs%n", ((float) (System.currentTimeMillis() - start)) / 1000);
|
||||
System.out.printf("Total time: %.2fs%n", ((float) (System.currentTimeMillis() - start)) / 1000);
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@Disabled
|
||||
@SneakyThrows
|
||||
@ -51,7 +52,11 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
||||
var tableResponse = mapper.readValue(new ClassPathResource(tableFileName).getInputStream(), TableServiceResponse.class);
|
||||
var documentFile = new ClassPathResource(fileName).getFile();
|
||||
|
||||
var classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE, documentFile, new ImageServiceResponse(), tableResponse, Path.of(fileName).getFileName().toFile().toString());
|
||||
var classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE,
|
||||
documentFile,
|
||||
new ImageServiceResponse(),
|
||||
tableResponse,
|
||||
Path.of(fileName).getFileName().toFile().toString());
|
||||
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null);
|
||||
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
|
||||
Document document = DocumentGraphFactory.buildDocumentGraph(classificationDocument);
|
||||
|
||||
@ -48,7 +48,7 @@ class GapAcrossLinesDetectionServiceTest {
|
||||
|
||||
|
||||
@Test
|
||||
@Disabled
|
||||
//@Disabled
|
||||
@SneakyThrows
|
||||
public void testColumnDetection() {
|
||||
|
||||
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user