RED-7141: Implemented docstrum layout parsing

This commit is contained in:
Dominique Eifländer 2024-02-22 11:02:50 +01:00
parent f4b6386e1c
commit 240ef82def
21 changed files with 1539 additions and 52 deletions

View File

@ -3,5 +3,8 @@ package com.knecon.fforesight.service.layoutparser.internal.api.queue;
public enum LayoutParsingType {
REDACT_MANAGER,
TAAS,
DOCUMINE
DOCUMINE,
DOCSTRUM,
DOCSTRUM_XY
}

View File

@ -1,5 +1,7 @@
package com.knecon.fforesight.service.layoutparser.processor;
import static com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType.DOCSTRUM;
import static com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType.DOCSTRUM_XY;
import static java.lang.String.format;
import java.awt.geom.Rectangle2D;
@ -26,6 +28,7 @@ import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsi
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationSection;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
@ -43,6 +46,7 @@ import com.knecon.fforesight.service.layoutparser.processor.services.RulingClean
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
import com.knecon.fforesight.service.layoutparser.processor.services.SimplifiedSectionTextService;
import com.knecon.fforesight.service.layoutparser.processor.services.TableExtractionService;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocstrumBlockificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocuMineBlockificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.RedactManagerBlockificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.TaasBlockificationService;
@ -86,6 +90,7 @@ public class LayoutParsingPipeline {
TaasBlockificationService taasBlockificationService;
DocuMineBlockificationService docuMineBlockificationService;
RedactManagerBlockificationService redactManagerBlockificationService;
DocstrumBlockificationService docstrumBlockificationService;
LayoutGridService layoutGridService;
ObservationRegistry observationRegistry;
VisualLayoutParsingAdapter visualLayoutParsingAdapter;
@ -97,8 +102,7 @@ public class LayoutParsingPipeline {
log.info("Starting layout parsing for {}", layoutParsingRequest.identifier());
File originFile = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId());
File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId())
.orElse(originFile);
File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId()).orElse(originFile);
VisualLayoutParsingResponse visualLayoutParsingResponse = new VisualLayoutParsingResponse();
if (layoutParsingRequest.visualLayoutParsingFileId().isPresent()) {
@ -106,24 +110,20 @@ public class LayoutParsingPipeline {
}
ImageServiceResponse imageServiceResponse = new ImageServiceResponse();
if (layoutParsingRequest.imagesFileStorageId()
.isPresent()) {
imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId()
.get());
if (layoutParsingRequest.imagesFileStorageId().isPresent()) {
imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId().get());
}
TableServiceResponse tableServiceResponse = new TableServiceResponse();
if (layoutParsingRequest.tablesFileStorageId()
.isPresent()) {
tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId()
.get());
if (layoutParsingRequest.tablesFileStorageId().isPresent()) {
tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId().get());
}
ClassificationDocument classificationDocument = parseLayout(layoutParsingRequest.layoutParsingType(),
originFile,
imageServiceResponse,
tableServiceResponse,
visualLayoutParsingResponse,
originFile,
imageServiceResponse,
tableServiceResponse,
visualLayoutParsingResponse,
layoutParsingRequest.identifier().toString());
log.info("Building document graph for {}", layoutParsingRequest.identifier());
@ -156,25 +156,25 @@ public class LayoutParsingPipeline {
.numberOfPages(documentGraph.getNumberOfPages())
.duration(System.currentTimeMillis() - start)
.message(format("""
Layout parsing has finished in %.02f s.
identifiers: %s
%s
Files have been saved with Ids:
Structure: %s
Text: %s
Positions: %s
PageData: %s
Simplified Text: %s
Viewer Doc: %s""",
((float) (System.currentTimeMillis() - start)) / 1000,
layoutParsingRequest.identifier(),
buildSemanticNodeCountMessage(documentGraph.getNumberOfPages(), documentGraph.buildSemanticNodeCounts()),
layoutParsingRequest.structureFileStorageId(),
layoutParsingRequest.textBlockFileStorageId(),
layoutParsingRequest.positionBlockFileStorageId(),
layoutParsingRequest.pageFileStorageId(),
layoutParsingRequest.simplifiedTextStorageId(),
layoutParsingRequest.viewerDocumentStorageId()))
Layout parsing has finished in %.02f s.
identifiers: %s
%s
Files have been saved with Ids:
Structure: %s
Text: %s
Positions: %s
PageData: %s
Simplified Text: %s
Viewer Doc: %s""",
((float) (System.currentTimeMillis() - start)) / 1000,
layoutParsingRequest.identifier(),
buildSemanticNodeCountMessage(documentGraph.getNumberOfPages(), documentGraph.buildSemanticNodeCounts()),
layoutParsingRequest.structureFileStorageId(),
layoutParsingRequest.textBlockFileStorageId(),
layoutParsingRequest.positionBlockFileStorageId(),
layoutParsingRequest.pageFileStorageId(),
layoutParsingRequest.simplifiedTextStorageId(),
layoutParsingRequest.viewerDocumentStorageId()))
.build();
}
@ -195,14 +195,14 @@ public class LayoutParsingPipeline {
private String buildSemanticNodeCountMessage(int numberOfPages, Map<NodeType, Long> semanticNodeCounts) {
return String.format("%d pages with %d sections, %d headlines, %d paragraphs, %d tables with %d cells, %d headers, and %d footers parsed",
numberOfPages,
semanticNodeCounts.get(NodeType.SECTION) == null ? 0 : semanticNodeCounts.get(NodeType.SECTION),
semanticNodeCounts.get(NodeType.HEADLINE) == null ? 0 : semanticNodeCounts.get(NodeType.HEADLINE),
semanticNodeCounts.get(NodeType.PARAGRAPH) == null ? 0 : semanticNodeCounts.get(NodeType.PARAGRAPH),
semanticNodeCounts.get(NodeType.TABLE) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE),
semanticNodeCounts.get(NodeType.TABLE_CELL) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE_CELL),
semanticNodeCounts.get(NodeType.HEADER) == null ? 0 : semanticNodeCounts.get(NodeType.HEADER),
semanticNodeCounts.get(NodeType.FOOTER) == null ? 0 : semanticNodeCounts.get(NodeType.FOOTER));
numberOfPages,
semanticNodeCounts.get(NodeType.SECTION) == null ? 0 : semanticNodeCounts.get(NodeType.SECTION),
semanticNodeCounts.get(NodeType.HEADLINE) == null ? 0 : semanticNodeCounts.get(NodeType.HEADLINE),
semanticNodeCounts.get(NodeType.PARAGRAPH) == null ? 0 : semanticNodeCounts.get(NodeType.PARAGRAPH),
semanticNodeCounts.get(NodeType.TABLE) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE),
semanticNodeCounts.get(NodeType.TABLE_CELL) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE_CELL),
semanticNodeCounts.get(NodeType.HEADER) == null ? 0 : semanticNodeCounts.get(NodeType.HEADER),
semanticNodeCounts.get(NodeType.FOOTER) == null ? 0 : semanticNodeCounts.get(NodeType.FOOTER));
}
@ -220,7 +220,7 @@ public class LayoutParsingPipeline {
Map<Integer, List<TableCells>> pdfTableCells = cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse);
Map<Integer, List<ClassifiedImage>> pdfImages = imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse);
Map<Integer, List<ClassifiedImage>> signatures = new HashMap<>();
if(signatures.size() > 0) {
if (signatures.size() > 0) {
visualLayoutParsingAdapter.buildExtractedSignaturesPerPage(visualLayoutParsingResponse);
}
@ -266,6 +266,8 @@ public class LayoutParsingPipeline {
case REDACT_MANAGER -> redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
case TAAS -> taasBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
case DOCUMINE -> docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
case DOCSTRUM -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical(), false);
case DOCSTRUM_XY -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical(), true);
};
classificationPage.setCleanRulings(cleanRulings);
classificationPage.setRotation(rotation);
@ -283,12 +285,16 @@ public class LayoutParsingPipeline {
imageServiceResponseAdapter.findOcr(classificationPage);
}
if(signatures.containsKey(pageNumber)) {
if (signatures.containsKey(pageNumber)) {
classificationPage.setImages(signatures.get(pageNumber));
}
tableExtractionService.extractTables(cleanRulings, classificationPage);
if (layoutParsingType == DOCSTRUM || layoutParsingType == DOCSTRUM_XY) {
docstrumBlockificationService.combineBlocks(classificationPage);
}
buildPageStatistics(classificationPage);
increaseDocumentStatistics(classificationPage, classificationDocument);
@ -304,11 +310,26 @@ public class LayoutParsingPipeline {
case TAAS -> taasClassificationService.classifyDocument(classificationDocument);
case DOCUMINE -> docuMineClassificationService.classifyDocument(classificationDocument);
case REDACT_MANAGER -> redactManagerClassificationService.classifyDocument(classificationDocument);
case DOCSTRUM_XY -> redactManagerClassificationService.classifyDocument(classificationDocument);
}
log.info("Building Sections for {}", identifier);
sectionsBuilderService.buildSections(classificationDocument);
sectionsBuilderService.addImagesToSections(classificationDocument);
if (layoutParsingType == DOCSTRUM || layoutParsingType == DOCSTRUM_XY) {
// Currently for debugging return paragraphs as sections, becaus there is a merging logic in sectionBuilder
List<ClassificationSection> sections = new ArrayList<>();
for (var page : classificationPages) {
page.getTextBlocks().forEach(block -> {
block.setPage(page.getPageNumber());
var section = sectionsBuilderService.buildTextBlock(List.of(block), "a");
sections.add(section);
});
}
classificationDocument.setSections(sections);
} else {
sectionsBuilderService.buildSections(classificationDocument);
sectionsBuilderService.addImagesToSections(classificationDocument);
}
return classificationDocument;
}

View File

@ -55,6 +55,17 @@ public class TextPositionSequence implements CharSequence {
}
public TextPositionSequence(List<RedTextPosition> textPositions, int page) {
this.textPositions = textPositions;
this.page = page;
this.dir = TextDirection.fromDegrees(textPositions.get(0).getDir());
this.rotation = textPositions.get(0).getRotation();
this.pageHeight = textPositions.get(0).getPageHeight();
this.pageWidth = textPositions.get(0).getPageWidth();
}
@Override
public int length() {

View File

@ -240,7 +240,7 @@ public class SectionsBuilderService {
}
private ClassificationSection buildTextBlock(List<AbstractPageBlock> wordBlockList, String lastHeadline) {
public ClassificationSection buildTextBlock(List<AbstractPageBlock> wordBlockList, String lastHeadline) {
ClassificationSection section = new ClassificationSection();

View File

@ -0,0 +1,252 @@
package com.knecon.fforesight.service.layoutparser.processor.services.blockification;
import static java.util.stream.Collectors.toSet;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.ListIterator;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.DocstrumSegmentationService;
import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil;
import lombok.RequiredArgsConstructor;
@SuppressWarnings("all")
@Service
@RequiredArgsConstructor
public class DocstrumBlockificationService {
private final DocstrumSegmentationService docstrumSegmentationService;
static final float THRESHOLD = 1f;
Pattern pattern = Pattern.compile("^(\\p{Digit}{1,3}\\.){0,3}\\p{Digit}{1,3}[\\p{Lower}.]?", Pattern.CASE_INSENSITIVE);
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines, boolean xyOder) {
List<AbstractPageBlock> abstractPageBlocks = new ArrayList<>();
var zones = docstrumSegmentationService.segmentPage(textPositions, xyOder);
zones.forEach(zone -> {
List<TextPositionSequence> textPositionSequences = new ArrayList<>();
zone.getLines().forEach(line -> {
line.getWords().forEach(word -> {
textPositionSequences.add(new TextPositionSequence(word.getTextPositions(), word.getPage()));
});
});
abstractPageBlocks.addAll(splitZonesAtRulings(textPositionSequences, horizontalRulingLines, verticalRulingLines));
});
return new ClassificationPage(abstractPageBlocks);
}
public void combineBlocks(ClassificationPage page) {
TextPageBlock previous = new TextPageBlock();
ListIterator<AbstractPageBlock> itty = page.getTextBlocks().listIterator();
while (itty.hasNext()) {
AbstractPageBlock block = itty.next();
if (block instanceof TablePageBlock) {
continue;
}
TextPageBlock current = (TextPageBlock) block;
if (previous != null) {
Matcher matcher = pattern.matcher(previous.getText().toString());
if (matcher.matches() && Math.abs(previous.getMinY() - current.getMinY()) < 1) {
previous.getSequences().addAll(current.getSequences());
previous = buildTextBlock(previous.getSequences(), 0);
itty.remove();
}
}
previous = current;
}
}
public List<AbstractPageBlock> splitZonesAtRulings(List<TextPositionSequence> textPositions, List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
int indexOnPage = 0;
List<TextPositionSequence> chunkWords = new ArrayList<>();
List<AbstractPageBlock> chunkBlockList = new ArrayList<>();
float minX = 1000, maxX = 0, minY = 1000, maxY = 0;
TextPositionSequence prev = null;
Float splitX1 = null;
for (TextPositionSequence word : textPositions) {
boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, horizontalRulingLines, verticalRulingLines);
boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir());
if (prev != null && (splitByDir || isSplitByRuling)) {
TextPageBlock cb1 = buildTextBlock(chunkWords, indexOnPage);
indexOnPage++;
chunkBlockList.add(cb1);
chunkWords = new ArrayList<>();
minX = 1000;
maxX = 0;
minY = 1000;
maxY = 0;
prev = null;
}
chunkWords.add(word);
prev = word;
if (word.getMinXDirAdj() < minX) {
minX = word.getMinXDirAdj();
}
if (word.getMaxXDirAdj() > maxX) {
maxX = word.getMaxXDirAdj();
}
if (word.getMinYDirAdj() < minY) {
minY = word.getMinYDirAdj();
}
if (word.getMaxYDirAdj() > maxY) {
maxY = word.getMaxYDirAdj();
}
}
TextPageBlock cb1 = buildTextBlock(chunkWords, indexOnPage);
if (cb1 != null) {
chunkBlockList.add(cb1);
}
return chunkBlockList;
}
private boolean equalsWithThreshold(float f1, float f2) {
return Math.abs(f1 - f2) < THRESHOLD;
}
private TextPageBlock buildTextBlock(List<TextPositionSequence> wordBlockList, int indexOnPage) {
TextPageBlock textBlock = null;
FloatFrequencyCounter lineHeightFrequencyCounter = new FloatFrequencyCounter();
FloatFrequencyCounter fontSizeFrequencyCounter = new FloatFrequencyCounter();
FloatFrequencyCounter spaceFrequencyCounter = new FloatFrequencyCounter();
StringFrequencyCounter fontFrequencyCounter = new StringFrequencyCounter();
StringFrequencyCounter styleFrequencyCounter = new StringFrequencyCounter();
for (TextPositionSequence wordBlock : wordBlockList) {
lineHeightFrequencyCounter.add(wordBlock.getTextHeight());
fontSizeFrequencyCounter.add(wordBlock.getFontSize());
spaceFrequencyCounter.add(wordBlock.getSpaceWidth());
fontFrequencyCounter.add(wordBlock.getFont());
styleFrequencyCounter.add(wordBlock.getFontStyle());
if (textBlock == null) {
textBlock = new TextPageBlock(wordBlock.getMinXDirAdj(),
wordBlock.getMaxXDirAdj(),
wordBlock.getMinYDirAdj(),
wordBlock.getMaxYDirAdj(),
wordBlockList,
wordBlock.getRotation());
} else {
TextPageBlock spatialEntity = textBlock.union(wordBlock);
textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight());
}
}
if (textBlock != null) {
textBlock.setMostPopularWordFont(fontFrequencyCounter.getMostPopular());
textBlock.setMostPopularWordStyle(styleFrequencyCounter.getMostPopular());
textBlock.setMostPopularWordFontSize(fontSizeFrequencyCounter.getMostPopular());
textBlock.setMostPopularWordHeight(lineHeightFrequencyCounter.getMostPopular());
textBlock.setMostPopularWordSpaceWidth(spaceFrequencyCounter.getMostPopular());
textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest());
}
if (textBlock != null && textBlock.getSequences() != null && textBlock.getSequences().stream().map(t -> round(t.getMinYDirAdj(), 3)).collect(toSet()).size() == 1) {
textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj));
}
return textBlock;
}
private boolean isSplitByRuling(float minX,
float minY,
float maxX,
float maxY,
TextPositionSequence word,
List<Ruling> horizontalRulingLines,
List<Ruling> verticalRulingLines) {
return isSplitByRuling(maxX,
minY,
word.getMinXDirAdj(),
word.getMinYDirAdj(),
verticalRulingLines,
word.getDir().getDegrees(),
word.getPageWidth(),
word.getPageHeight()) //
|| isSplitByRuling(minX,
minY,
word.getMinXDirAdj(),
word.getMaxYDirAdj(),
horizontalRulingLines,
word.getDir().getDegrees(),
word.getPageWidth(),
word.getPageHeight()) //
|| isSplitByRuling(maxX,
minY,
word.getMinXDirAdj(),
word.getMinYDirAdj(),
horizontalRulingLines,
word.getDir().getDegrees(),
word.getPageWidth(),
word.getPageHeight()) //
|| isSplitByRuling(minX,
minY,
word.getMinXDirAdj(),
word.getMaxYDirAdj(),
verticalRulingLines,
word.getDir().getDegrees(),
word.getPageWidth(),
word.getPageHeight());
}
private boolean isSplitByRuling(float previousX2, float previousY1, float currentX1, float currentY1, List<Ruling> rulingLines, float dir, float pageWidth, float pageHeight) {
for (Ruling ruling : rulingLines) {
var line = RulingTextDirAdjustUtil.convertToDirAdj(ruling, dir, pageWidth, pageHeight);
if (line.intersectsLine(previousX2, previousY1, currentX1, currentY1)) {
return true;
}
}
return false;
}
private double round(float value, int decimalPoints) {
var d = Math.pow(10, decimalPoints);
return Math.round(value * d) / d;
}
}

View File

@ -0,0 +1,48 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum;
import java.util.List;
import java.util.stream.Collectors;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Character;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Zone;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service.LineBuilderService;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service.NearestNeighbourService;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service.ReadingOrderService;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service.SpacingService;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service.ZoneBuilderService;
import lombok.RequiredArgsConstructor;
@Service
@RequiredArgsConstructor
public class DocstrumSegmentationService {
private final NearestNeighbourService nearestNeighbourService;
private final SpacingService spacingService;
private final LineBuilderService lineBuilderService;
private final ZoneBuilderService zoneBuilderService;
private final ReadingOrderService readingOrderService;
public List<Zone> segmentPage(List<TextPositionSequence> textPositions, boolean xyOder) {
var positions = textPositions.stream().map(TextPositionSequence::getTextPositions).flatMap(List::stream).toList();
var characters = positions.stream().map(Character::new).collect(Collectors.toList());
nearestNeighbourService.findNearestNeighbors(characters);
var characterSpacing = spacingService.computeCharacterSpacing(characters);
var lineSpacing = spacingService.computeLineSpacing(characters);
var lines = lineBuilderService.buildLines(characters, characterSpacing, lineSpacing);
var zones = zoneBuilderService.buildZones(lines, characterSpacing, lineSpacing);
return readingOrderService.resolve(zones, xyOder);
}
}

View File

@ -0,0 +1,32 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model;
public class AngleFilter {
protected double lowerAngle;
protected double upperAngle;
public AngleFilter(double lowerAngle, double upperAngle) {
if (lowerAngle < -Math.PI / 2) {
lowerAngle += Math.PI;
}
if (upperAngle >= Math.PI / 2) {
upperAngle -= Math.PI;
}
this.lowerAngle = lowerAngle;
this.upperAngle = upperAngle;
}
public boolean matches(Neighbor neighbor) {
if (lowerAngle <= upperAngle) {
return lowerAngle <= neighbor.getAngle() && neighbor.getAngle() < upperAngle;
} else {
return lowerAngle <= neighbor.getAngle() || neighbor.getAngle() < upperAngle;
}
}
}

View File

@ -0,0 +1,48 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model;
import java.awt.geom.Rectangle2D;
import lombok.Data;
@Data
public abstract class BoundingBox {
private Rectangle2D bBox;
public double getX() {
return bBox.getX();
}
public double getY() {
return bBox.getY();
}
public double getWidth() {
return bBox.getWidth();
}
public double getHeight() {
return bBox.getHeight();
}
public double getArea() {
return (bBox.getHeight() * bBox.getWidth());
}
public boolean contains(Rectangle2D contained, double tolerance) {
return bBox.getX() <= contained.getX() + tolerance && bBox.getY() <= contained.getY() + tolerance && bBox.getX() + bBox.getWidth() >= contained.getX() + contained.getWidth() - tolerance && bBox.getY() + bBox.getHeight() >= contained.getY() + contained.getHeight() - tolerance;
}
}

View File

@ -0,0 +1,84 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
import lombok.Data;
@Data
public class Character {
private final double x;
private final double y;
private final RedTextPosition textPosition;
private List<Neighbor> neighbors = new ArrayList<>();
public Character(RedTextPosition chunk) {
this.x = chunk.getXDirAdj() + chunk.getWidthDirAdj() / 2;
this.y = chunk.getYDirAdj() + chunk.getHeightDir() / 2;
this.textPosition = chunk;
}
public double getHeight() {
return textPosition.getHeightDir();
}
public double distance(Character character) {
double dx = getX() - character.getX();
double dy = getY() - character.getY();
return Math.sqrt(dx * dx + dy * dy);
}
public double horizontalDistance(Character character) {
return Math.abs(getX() - character.getX());
}
public double verticalDistance(Character character) {
return Math.abs(getY() - character.getY());
}
public double overlappingDistance(Character other) {
double[] xs = new double[4];
double s = Math.sin(-0), c = Math.cos(-0);
xs[0] = c * x - s * y;
xs[1] = c * (x + textPosition.getWidthDirAdj()) - s * (y + textPosition.getHeightDir());
xs[2] = c * other.x - s * other.y;
xs[3] = c * (other.x + other.textPosition.getWidthDirAdj()) - s * (other.y + other.textPosition.getHeightDir());
boolean overlapping = xs[1] >= xs[2] && xs[3] >= xs[0];
Arrays.sort(xs);
return Math.abs(xs[2] - xs[1]) * (overlapping ? 1 : -1);
}
public void setNeighbors(List<Neighbor> neighbors) {
this.neighbors = neighbors;
}
public double angle(Character character) {
if (getX() > character.getX()) {
return Math.atan2(getY() - character.getY(), getX() - character.getX());
} else {
return Math.atan2(character.getY() - getY(), character.getX() - getX());
}
}
}

View File

@ -0,0 +1,194 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model;
import java.util.AbstractSet;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Set;
public class DisjointSets<E> implements Iterable<Set<E>> {
private final Map<E, Entry<E>> map = new HashMap<>();
public DisjointSets(Collection<? extends E> collection) {
for (E element : collection) {
map.put(element, new Entry<E>(element));
}
}
public boolean areTogether(E e1, E e2) {
return map.get(e1).findRepresentative() == map.get(e2).findRepresentative();
}
public void union(E e1, E e2) {
Entry<E> r1 = map.get(e1).findRepresentative();
Entry<E> r2 = map.get(e2).findRepresentative();
if (r1 != r2) {
if (r1.size <= r2.size) {
r2.mergeWith(r1);
} else {
r1.mergeWith(r2);
}
}
}
@Override
public Iterator<Set<E>> iterator() {
return new Iterator<>() {
private final Iterator<Entry<E>> iterator = map.values().iterator();
private Entry<E> nextRepresentative;
{
findNextRepresentative();
}
@Override
public boolean hasNext() {
return nextRepresentative != null;
}
@Override
public Set<E> next() {
if (nextRepresentative == null) {
throw new NoSuchElementException();
}
Set<E> result = nextRepresentative.asSet();
findNextRepresentative();
return result;
}
private void findNextRepresentative() {
while (iterator.hasNext()) {
Entry<E> candidate = iterator.next();
if (candidate.isRepresentative()) {
nextRepresentative = candidate;
return;
}
}
nextRepresentative = null;
}
@Override
public void remove() {
throw new UnsupportedOperationException();
}
};
}
private static class Entry<E> {
private int size = 1;
private final E value;
private Entry<E> parent = this;
private Entry<E> next = null;
private Entry<E> last = this;
Entry(E value) {
this.value = value;
}
void mergeWith(Entry<E> otherRepresentative) {
size += otherRepresentative.size;
last.next = otherRepresentative;
last = otherRepresentative.last;
otherRepresentative.parent = this;
}
Entry<E> findRepresentative() {
Entry<E> representative = parent;
while (representative.parent != representative) {
representative = representative.parent;
}
for (Entry<E> entry = this; entry != representative; ) {
Entry<E> nextEntry = entry.parent;
entry.parent = representative;
entry = nextEntry;
}
return representative;
}
boolean isRepresentative() {
return parent == this;
}
Set<E> asSet() {
return new AbstractSet<E>() {
@Override
public Iterator<E> iterator() {
return new Iterator<E>() {
private Entry<E> nextEntry = findRepresentative();
@Override
public boolean hasNext() {
return nextEntry != null;
}
@Override
public E next() {
if (nextEntry == null) {
throw new NoSuchElementException();
}
E result = nextEntry.value;
nextEntry = nextEntry.next;
return result;
}
@Override
public void remove() {
throw new UnsupportedOperationException();
}
};
}
@Override
public int size() {
return findRepresentative().size;
}
};
}
}
}

View File

@ -0,0 +1,91 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model;
public class Histogram {
private static final double EPSILON = 1.0e-6;
private final double min;
private final double resolution;
private double[] frequencies;
public Histogram(double minValue, double maxValue, double resolution) {
this.min = minValue - EPSILON;
double delta = maxValue - minValue + 2 * EPSILON;
int size = Math.max(1, (int) Math.round((maxValue - minValue) / resolution));
this.resolution = delta / size;
this.frequencies = new double[size];
}
public void kernelSmooth(double[] kernel) {
double[] newFrequencies = new double[frequencies.length];
int shift = (kernel.length - 1) / 2;
for (int i = 0; i < kernel.length; i++) {
int jStart = Math.max(0, i - shift);
int jEnd = Math.min(frequencies.length, frequencies.length + i - shift);
for (int j = jStart; j < jEnd; j++) {
newFrequencies[j - i + shift] += kernel[i] * frequencies[j];
}
}
frequencies = newFrequencies;
}
public double[] createGaussianKernel(double length, double stdDeviation) {
int r = (int) Math.round(length / resolution) / 2;
stdDeviation /= resolution;
int size = 2 * r + 1;
double[] kernel = new double[size];
double sum = 0;
double b = 2 * stdDeviation * stdDeviation;
double a = 1 / Math.sqrt(Math.PI * b);
for (int i = 0; i < size; i++) {
kernel[i] = a * Math.exp(-(i - r) * (i - r) / b);
sum += kernel[i];
}
for (int i = 0; i < size; i++) {
kernel[i] /= sum;
}
return kernel;
}
public void gaussianSmooth(double windowLength, double stdDeviation) {
kernelSmooth(createGaussianKernel(windowLength, stdDeviation));
}
public void add(double value) {
frequencies[(int) ((value - min) / resolution)] += 1.0;
}
public int getSize() {
return frequencies.length;
}
public double getPeakValue() {
int peakIndex = 0;
for (int i = 1; i < frequencies.length; i++) {
if (frequencies[i] > frequencies[peakIndex]) {
peakIndex = i;
}
}
int peakEndIndex = peakIndex + 1;
final double EPS = 0.0001;
while (peakEndIndex < frequencies.length && Math.abs(frequencies[peakEndIndex] - frequencies[peakIndex]) < EPS) {
peakEndIndex++;
}
return ((double) peakIndex + peakEndIndex) / 2 * resolution + min;
}
}

View File

@ -0,0 +1,165 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import lombok.Data;
@Data
public class Line extends BoundingBox {
private static final double WORD_DISTANCE_MULTIPLIER = 0.2;
private final double x0;
private final double y0;
private final double x1;
private final double y1;
private final double height;
private final List<Character> characters;
private final List<TextPositionSequence> words = new ArrayList<>();
public Line(List<Character> characters, double wordSpacing) {
this.characters = characters;
if (characters.size() >= 2) {
// linear regression
double sx = 0.0, sxx = 0.0, sxy = 0.0, sy = 0.0;
for (Character character : characters) {
sx += character.getX();
sxx += character.getX() * character.getX();
sxy += character.getX() * character.getY();
sy += character.getY();
}
double b = (characters.size() * sxy - sx * sy) / (characters.size() * sxx - sx * sx);
double a = (sy - b * sx) / characters.size();
this.x0 = characters.get(0).getX();
this.y0 = a + b * this.x0;
this.x1 = characters.get(characters.size() - 1).getX();
this.y1 = a + b * this.x1;
} else {
Character character = characters.get(0);
double dx = character.getTextPosition().getWidthDirAdj() / 3;
double dy = dx * Math.tan(0);
this.x0 = character.getX() - dx;
this.x1 = character.getX() + dx;
this.y0 = character.getY() - dy;
this.y1 = character.getY() + dy;
}
height = computeHeight();
computeWords(wordSpacing * WORD_DISTANCE_MULTIPLIER);
buildBBox();
}
public double getAngle() {
return Math.atan2(y1 - y0, x1 - x0);
}
public double getLength() {
return Math.sqrt((x0 - x1) * (x0 - x1) + (y0 - y1) * (y0 - y1));
}
private double computeHeight() {
double sum = 0.0;
for (Character component : characters) {
sum += component.getHeight();
}
return sum / characters.size();
}
public double angularDifference(Line j) {
double diff = Math.abs(getAngle() - j.getAngle());
if (diff <= Math.PI / 2) {
return diff;
} else {
return Math.PI - diff;
}
}
public double horizontalDistance(Line other) {
double[] xs = new double[4];
xs[0] = x0;
xs[1] = x1;
xs[2] = other.x0;
xs[3] = other.x1;
boolean overlapping = xs[1] >= xs[2] && xs[3] >= xs[0];
Arrays.sort(xs);
return Math.abs(xs[2] - xs[1]) * (overlapping ? 1 : -1);
}
public double verticalDistance(Line other) {
double ym = (y0 + y1) / 2;
double yn = (other.y0 + other.y1) / 2;
return Math.abs(ym - yn) / Math.sqrt(1);
}
private void computeWords(double wordSpacing) {
TextPositionSequence word = new TextPositionSequence();
Character previous = null;
for (Character current : characters) {
if (previous != null) {
double dist = current.getTextPosition().getXDirAdj() - previous.getTextPosition().getXDirAdj() - previous.getTextPosition().getWidthDirAdj();
if (dist > wordSpacing) {
words.add(word);
word = new TextPositionSequence();
}
}
word.getTextPositions().add(current.getTextPosition());
previous = current;
}
words.add(word);
}
private void buildBBox() {
double minX = Double.POSITIVE_INFINITY;
double minY = Double.POSITIVE_INFINITY;
double maxX = Double.NEGATIVE_INFINITY;
double maxY = Double.NEGATIVE_INFINITY;
for (Character character : characters) {
minX = Math.min(minX, character.getTextPosition().getXDirAdj());
minY = Math.min(minY, character.getTextPosition().getYDirAdj());
maxX = Math.max(maxX, character.getTextPosition().getXDirAdj() + character.getTextPosition().getWidthDirAdj());
maxY = Math.max(maxY, character.getTextPosition().getYDirAdj() + character.getTextPosition().getHeightDir());
}
this.setBBox(new Rectangle2D.Double(minX, minY, maxX - minX, maxY - minY));
}
public String toString() {
StringBuilder sb = new StringBuilder();
words.forEach(word -> sb.append(word.toString()).append(" "));
return sb.toString().trim();
}
}

View File

@ -0,0 +1,36 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model;
import lombok.Getter;
public class Neighbor {
@Getter
private final double distance;
@Getter
private final double angle;
private final Character originCharacter;
@Getter
private final Character character;
public Neighbor(Character neighbor, Character origin) {
this.distance = neighbor.distance(origin);
this.angle = neighbor.angle(origin);
this.character = neighbor;
this.originCharacter = origin;
}
public double getHorizontalDistance() {
return character.horizontalDistance(originCharacter);
}
public double getVerticalDistance() {
return character.verticalDistance(originCharacter);
}
}

View File

@ -0,0 +1,50 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model;
import java.awt.geom.Rectangle2D;
import java.util.Comparator;
import java.util.List;
import lombok.Data;
@Data
public class Zone extends BoundingBox {
private List<Line> lines;
public Zone(List<Line> lines) {
lines.sort(Comparator.comparingDouble(Line::getY));
this.lines = lines;
buildBBox();
}
public void buildBBox() {
double minX = Double.POSITIVE_INFINITY;
double minY = Double.POSITIVE_INFINITY;
double maxX = Double.NEGATIVE_INFINITY;
double maxY = Double.NEGATIVE_INFINITY;
for (Line line : lines) {
minX = Math.min(minX, line.getX());
minY = Math.min(minY, line.getY());
maxX = Math.max(maxX, line.getX() + line.getWidth());
maxY = Math.max(maxY, line.getY() + line.getHeight());
}
this.setBBox(new Rectangle2D.Double(minX, minY, maxX - minX, maxY - minY));
}
public String toString() {
StringBuilder sb = new StringBuilder();
lines.forEach(line -> sb.append(line.toString()).append("\n"));
return sb.toString().trim();
}
}

View File

@ -0,0 +1,50 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.AngleFilter;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Character;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.DisjointSets;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Line;
@Service
public class LineBuilderService {
private static final double CHARACTER_SPACING_DISTANCE_MULTIPLIER = 3.5;
private static final double MAX_VERTICAL_CHARACTER_DISTANCE = 0.67;
private static final double ANGLE_TOLERANCE = Math.PI / 6;
public List<Line> buildLines(List<Character> characters, double characterSpacing, double lineSpacing) {
double maxHorizontalDistance = characterSpacing * CHARACTER_SPACING_DISTANCE_MULTIPLIER;
double maxVerticalDistance = lineSpacing * MAX_VERTICAL_CHARACTER_DISTANCE;
DisjointSets<Character> sets = new DisjointSets<>(characters);
AngleFilter filter = new AngleFilter(-ANGLE_TOLERANCE, ANGLE_TOLERANCE);
characters.forEach(character -> {
character.getNeighbors().forEach(neighbor -> {
double x = neighbor.getHorizontalDistance() / maxHorizontalDistance;
double y = neighbor.getVerticalDistance() / maxVerticalDistance;
if (filter.matches(neighbor) && Math.pow(x, 2) + Math.pow(y, 2) <= 1) {
sets.union(character, neighbor.getCharacter());
}
});
});
List<Line> lines = new ArrayList<>();
sets.forEach(group -> {
List<Character> lineCharacters = new ArrayList<>(group);
lineCharacters.sort(Comparator.comparingDouble(Character::getX));
lines.add(new Line(lineCharacters, characterSpacing));
});
return lines;
}
}

View File

@ -0,0 +1,78 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Character;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Neighbor;
@Service
public class NearestNeighbourService {
private static final int NUMBER_OF_NEIGHBOURS = 8;
private static final double STEP = 16.0;
public void findNearestNeighbors(List<Character> characters) {
if (characters.isEmpty()) {
return;
}
characters.sort(Comparator.comparingDouble(Character::getX));
int maxNeighborCount = NUMBER_OF_NEIGHBOURS;
if (characters.size() <= NUMBER_OF_NEIGHBOURS) {
maxNeighborCount = characters.size() - 1;
}
for (int i = 0; i < characters.size(); i++) {
List<Neighbor> candidates = new ArrayList<>();
int start = i;
int end = i + 1;
double distance = Double.POSITIVE_INFINITY;
for (double searchDistance = 0; searchDistance < distance; ) {
searchDistance += STEP;
boolean newCandidatesFound = false;
while (start > 0 && characters.get(i).getX() - characters.get(start - 1).getX() < searchDistance) {
start--;
candidates.add(new Neighbor(characters.get(start), characters.get(i)));
clearLeastDistant(candidates, maxNeighborCount);
newCandidatesFound = true;
}
while (end < characters.size() && characters.get(end).getX() - characters.get(i).getX() < searchDistance) {
candidates.add(new Neighbor(characters.get(end), characters.get(i)));
clearLeastDistant(candidates, maxNeighborCount);
end++;
newCandidatesFound = true;
}
if (newCandidatesFound && candidates.size() >= maxNeighborCount) {
distance = candidates.get(maxNeighborCount - 1).getDistance();
}
}
clearLeastDistant(candidates, maxNeighborCount);
characters.get(i).setNeighbors(new ArrayList<>(candidates));
}
}
private void clearLeastDistant(List<Neighbor> candidates, int maxNeighborCount) {
if (candidates.size() > maxNeighborCount) {
candidates.sort(Comparator.comparingDouble(Neighbor::getDistance));
candidates.remove(candidates.remove(candidates.size() - 1));
}
}
}

View File

@ -0,0 +1,99 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.ListIterator;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.BoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Zone;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.utils.DoubleUtils;
@Service
public class ReadingOrderService {
private static final double THRESHOLD = 1;
public List<Zone> resolve(List<Zone> zones, boolean xyOrder) {
if (zones.isEmpty() || zones.size() == 1) {
return zones;
}
if (xyOrder) {
zones.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
return zones;
}
return resolveMultiColumnReadingOder(zones);
}
private List<Zone> resolveMultiColumnReadingOder(List<Zone> zones) {
// Simple reading order resolver for multi column page layout as described here : https://pub.towardsai.net/advanced-rag-02-unveiling-pdf-parsing-b84ae866344e
// TODO implement a more fancy reading order resolver see https://github.com/BobLd/DocumentLayoutAnalysis/blob/master/README.md#reading-order
double minX = Double.POSITIVE_INFINITY;
double maxX = Double.NEGATIVE_INFINITY;
for (Zone zone : zones) {
if (zone.getX() < minX) {
minX = zone.getX();
}
if (zone.getX() + zone.getWidth() > maxX) {
maxX = zone.getX() + zone.getWidth();
}
}
double midLineXCoordinate = (minX + maxX) / 2;
List<Zone> leftOf = new ArrayList<>();
List<Zone> rightOf = new ArrayList<>();
List<Zone> middle = new ArrayList<>();
for (Zone zone : zones) {
if (zone.getX() < midLineXCoordinate && zone.getX() + zone.getWidth() < midLineXCoordinate) {
leftOf.add(zone);
} else if (zone.getX() > midLineXCoordinate && zone.getX() + zone.getWidth() > midLineXCoordinate) {
rightOf.add(zone);
} else {
middle.add(zone);
}
}
leftOf.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
rightOf.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
middle.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
List<Zone> sortedZones = new ArrayList<>();
sortedZones.addAll(leftOf);
sortedZones.addAll(rightOf);
ListIterator<Zone> itty = middle.listIterator();
while (itty.hasNext()) {
Zone current = itty.next();
for (int i = 0; i < sortedZones.size(); i++) {
if (current.getY() < sortedZones.get(i).getY()) {
sortedZones.add(i, current);
itty.remove();
break;
}
}
}
sortedZones.addAll(middle);
return sortedZones;
}
}

View File

@ -0,0 +1,56 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service;
import java.util.List;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.AngleFilter;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Character;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Histogram;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Neighbor;
@Service
public class SpacingService {
private static final double SPACING_HISTOGRAM_RESOLUTION = 0.5;
private static final double SPACING_HISTOGRAM_SMOOTHING_LENGTH = 2.5;
private static final double SPACING_HIST_SMOOTHING_STANDARD_DEVIATION = 0.5;
private static final double ANGLE_TOLERANCE = Math.PI / 6;
public double computeCharacterSpacing(List<Character> characters) {
return computeSpacing(characters, 0);
}
public double computeLineSpacing(List<Character> characters) {
return computeSpacing(characters, Math.PI / 2);
}
private double computeSpacing(List<Character> characters, double angle) {
double maxDistance = Double.NEGATIVE_INFINITY;
for (Character character : characters) {
for (Neighbor neighbor : character.getNeighbors()) {
maxDistance = Math.max(maxDistance, neighbor.getDistance());
}
}
Histogram histogram = new Histogram(0, maxDistance, SPACING_HISTOGRAM_RESOLUTION);
AngleFilter angleFilter = new AngleFilter(angle - ANGLE_TOLERANCE, angle + ANGLE_TOLERANCE);
for (Character character : characters) {
for (Neighbor neighbor : character.getNeighbors()) {
if (angleFilter.matches(neighbor)) {
histogram.add(neighbor.getDistance());
}
}
}
histogram.gaussianSmooth(SPACING_HISTOGRAM_SMOOTHING_LENGTH, SPACING_HIST_SMOOTHING_STANDARD_DEVIATION);
return histogram.getPeakValue();
}
}

View File

@ -0,0 +1,150 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.Set;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Character;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.DisjointSets;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Line;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Zone;
@Service
public class ZoneBuilderService {
private static final double MIN_HORIZONTAL_DISTANCE_MULTIPLIER = -0.5;
private static final double MAX_VERTICAL_DISTANCE_MULTIPLIER = 1.2;
private static final double MIN_HORIZONTAL_MERGE_DISTANCE_MULTIPLIER = -3.0;
private static final double MAX_VERTICAL_MERGE_DISTANCE_MULTIPLIER = 0.5;
private static final double MIN_LINE_SIZE_SCALE = 0.9;
private static final double MAX_LINE_SIZE_SCALE = 2.5;
private static final double ANGLE_TOLERANCE = Math.PI / 6;
private static final int MAX_ZONES = 300;
private static final double MAX_VERTICAL_MERGE_DISTANCE = 0.5;
public List<Zone> buildZones(List<Line> lines, double characterSpacing, double lineSpacing) {
double minHorizontalDistance = characterSpacing * MIN_HORIZONTAL_DISTANCE_MULTIPLIER;
double maxVerticalDistance = lineSpacing * MAX_VERTICAL_DISTANCE_MULTIPLIER;
double minHorizontalMergeDistance = characterSpacing * MIN_HORIZONTAL_MERGE_DISTANCE_MULTIPLIER;
double maxVerticalMergeDistance = lineSpacing * MAX_VERTICAL_MERGE_DISTANCE_MULTIPLIER;
DisjointSets<Line> sets = new DisjointSets<>(lines);
double meanHeight = calculateMeanHeight(lines);
lines.forEach(outerLine -> //
lines.forEach(innerLine -> {
double scale = Math.min(outerLine.getHeight(), innerLine.getHeight()) / meanHeight;
scale = Math.max(MIN_LINE_SIZE_SCALE, Math.min(scale, MAX_LINE_SIZE_SCALE));
if (!sets.areTogether(outerLine, innerLine) && outerLine.angularDifference(innerLine) <= ANGLE_TOLERANCE) {
double horizontalDistance = outerLine.horizontalDistance(innerLine) / scale;
double verticalDistance = outerLine.verticalDistance(innerLine) / scale;
if (minHorizontalDistance <= horizontalDistance && verticalDistance <= maxVerticalDistance //
|| minHorizontalMergeDistance <= horizontalDistance && verticalDistance <= maxVerticalMergeDistance) {
sets.union(outerLine, innerLine);
}
}
}));
List<Zone> zones = new ArrayList<>();
sets.forEach(group -> {
zones.add(mergeLinesInZone(new ArrayList<>(group), characterSpacing, lineSpacing));
});
if (zones.size() > MAX_ZONES) {
List<Line> oneZoneLines = new ArrayList<>();
for (Zone zone : zones) {
oneZoneLines.addAll(zone.getLines());
}
return List.of(mergeLinesInZone(oneZoneLines, characterSpacing, lineSpacing));
}
return zones;
}
private double calculateMeanHeight(List<Line> lines) {
double meanHeight = 0.0;
double weights = 0.0;
for (Line line : lines) {
double weight = line.getLength();
meanHeight += line.getHeight() * weight;
weights += weight;
}
meanHeight /= weights;
return meanHeight;
}
private Zone mergeLinesInZone(List<Line> lines, double characterSpacing, double lineSpacing) {
double maxHorizontalDistance = 0;
double minVerticalDistance = 0;
double maxVerticalDistance = lineSpacing * MAX_VERTICAL_MERGE_DISTANCE;
DisjointSets<Line> sets = new DisjointSets<>(lines);
lines.forEach(outer -> {
lines.forEach(inner -> {
if (inner != outer) {
double horizontalDistance = outer.horizontalDistance(inner);
double verticalDistance = outer.verticalDistance(inner);
if (horizontalDistance <= maxHorizontalDistance && minVerticalDistance <= verticalDistance && verticalDistance <= maxVerticalDistance) {
sets.union(outer, inner);
} else if (minVerticalDistance <= verticalDistance && verticalDistance <= maxVerticalDistance && Math.abs(horizontalDistance - Math.min(outer.getLength(),
inner.getLength())) < 0.1) {
boolean characterOverlap = false;
int overlappingCount = 0;
for (Character outerCharacter : outer.getCharacters()) {
for (Character innerCharacter : inner.getCharacters()) {
double characterOverlapDistance = outerCharacter.overlappingDistance(innerCharacter);
if (characterOverlapDistance > 2) {
characterOverlap = true;
}
if (characterOverlapDistance > 0) {
overlappingCount++;
}
}
}
if (!characterOverlap && overlappingCount <= 2) {
sets.union(outer, inner);
}
}
}
});
});
List<Line> outputZone = new ArrayList<>();
for (Set<Line> group : sets) {
List<Character> components = new ArrayList<>();
for (Line line : group) {
components.addAll(line.getCharacters());
}
components.sort(Comparator.comparingDouble(Character::getX));
outputZone.add(new Line(components, characterSpacing));
}
return new Zone(outputZone);
}
}

View File

@ -0,0 +1,18 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.utils;
public class DoubleUtils {
public static int compareDouble(double d1, double d2, double precision) {
if (Double.isNaN(d1) || Double.isNaN(d2)) {
return Double.compare(d1, d2);
}
if (precision == 0) {
precision = 1;
}
long i1 = Math.round(d1 / precision);
long i2 = Math.round(d2 / precision);
return Long.compare(i1, i2);
}
}

View File

@ -33,7 +33,7 @@ public class ViewerDocumentTest extends BuildDocumentTest {
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null);
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
Document document = buildGraph(fileName, LayoutParsingType.REDACT_MANAGER);
Document document = buildGraph(fileName, LayoutParsingType.DOCSTRUM_XY);
long start = System.currentTimeMillis();
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true);
System.out.printf("Total time: %.2fs%n", ((float) (System.currentTimeMillis() - start)) / 1000);
@ -54,10 +54,11 @@ public class ViewerDocumentTest extends BuildDocumentTest {
var documentFile = new ClassPathResource(fileName).getFile();
var classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE,
documentFile,
new ImageServiceResponse(),
tableResponse,
new VisualLayoutParsingResponse(),Path.of(fileName).getFileName().toFile().toString());
documentFile,
new ImageServiceResponse(),
tableResponse,
new VisualLayoutParsingResponse(),
Path.of(fileName).getFileName().toFile().toString());
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null);
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
Document document = DocumentGraphFactory.buildDocumentGraph(classificationDocument);