Compare commits
11 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
bb5b631950 | ||
|
|
2567d89fbb | ||
|
|
aef1146e8f | ||
|
|
7f56ed15c8 | ||
|
|
91401361e9 | ||
|
|
2ab60195e4 | ||
|
|
32c877e8f7 | ||
|
|
385d4b399e | ||
|
|
d0e1af3a44 | ||
|
|
d06933ed17 | ||
|
|
240ef82def |
@ -3,5 +3,8 @@ package com.knecon.fforesight.service.layoutparser.internal.api.queue;
|
|||||||
public enum LayoutParsingType {
|
public enum LayoutParsingType {
|
||||||
REDACT_MANAGER,
|
REDACT_MANAGER,
|
||||||
TAAS,
|
TAAS,
|
||||||
DOCUMINE
|
DOCUMINE,
|
||||||
|
|
||||||
|
DOCSTRUM,
|
||||||
|
DOCSTRUM_ROW_WISE
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,5 +1,7 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor;
|
package com.knecon.fforesight.service.layoutparser.processor;
|
||||||
|
|
||||||
|
import static com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType.DOCSTRUM;
|
||||||
|
import static com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType.DOCSTRUM_ROW_WISE;
|
||||||
import static java.lang.String.format;
|
import static java.lang.String.format;
|
||||||
|
|
||||||
import java.awt.geom.Rectangle2D;
|
import java.awt.geom.Rectangle2D;
|
||||||
@ -26,6 +28,7 @@ import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsi
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationSection;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||||
@ -43,6 +46,7 @@ import com.knecon.fforesight.service.layoutparser.processor.services.RulingClean
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.SimplifiedSectionTextService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.SimplifiedSectionTextService;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.TableExtractionService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.TableExtractionService;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocstrumBlockificationService;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocuMineBlockificationService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocuMineBlockificationService;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.RedactManagerBlockificationService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.RedactManagerBlockificationService;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.TaasBlockificationService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.TaasBlockificationService;
|
||||||
@ -86,6 +90,7 @@ public class LayoutParsingPipeline {
|
|||||||
TaasBlockificationService taasBlockificationService;
|
TaasBlockificationService taasBlockificationService;
|
||||||
DocuMineBlockificationService docuMineBlockificationService;
|
DocuMineBlockificationService docuMineBlockificationService;
|
||||||
RedactManagerBlockificationService redactManagerBlockificationService;
|
RedactManagerBlockificationService redactManagerBlockificationService;
|
||||||
|
DocstrumBlockificationService docstrumBlockificationService;
|
||||||
LayoutGridService layoutGridService;
|
LayoutGridService layoutGridService;
|
||||||
ObservationRegistry observationRegistry;
|
ObservationRegistry observationRegistry;
|
||||||
VisualLayoutParsingAdapter visualLayoutParsingAdapter;
|
VisualLayoutParsingAdapter visualLayoutParsingAdapter;
|
||||||
@ -97,8 +102,7 @@ public class LayoutParsingPipeline {
|
|||||||
log.info("Starting layout parsing for {}", layoutParsingRequest.identifier());
|
log.info("Starting layout parsing for {}", layoutParsingRequest.identifier());
|
||||||
|
|
||||||
File originFile = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId());
|
File originFile = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId());
|
||||||
File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId())
|
File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId()).orElse(originFile);
|
||||||
.orElse(originFile);
|
|
||||||
|
|
||||||
VisualLayoutParsingResponse visualLayoutParsingResponse = new VisualLayoutParsingResponse();
|
VisualLayoutParsingResponse visualLayoutParsingResponse = new VisualLayoutParsingResponse();
|
||||||
if (layoutParsingRequest.visualLayoutParsingFileId().isPresent()) {
|
if (layoutParsingRequest.visualLayoutParsingFileId().isPresent()) {
|
||||||
@ -106,24 +110,20 @@ public class LayoutParsingPipeline {
|
|||||||
}
|
}
|
||||||
|
|
||||||
ImageServiceResponse imageServiceResponse = new ImageServiceResponse();
|
ImageServiceResponse imageServiceResponse = new ImageServiceResponse();
|
||||||
if (layoutParsingRequest.imagesFileStorageId()
|
if (layoutParsingRequest.imagesFileStorageId().isPresent()) {
|
||||||
.isPresent()) {
|
imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId().get());
|
||||||
imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId()
|
|
||||||
.get());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
TableServiceResponse tableServiceResponse = new TableServiceResponse();
|
TableServiceResponse tableServiceResponse = new TableServiceResponse();
|
||||||
if (layoutParsingRequest.tablesFileStorageId()
|
if (layoutParsingRequest.tablesFileStorageId().isPresent()) {
|
||||||
.isPresent()) {
|
tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId().get());
|
||||||
tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId()
|
|
||||||
.get());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
ClassificationDocument classificationDocument = parseLayout(layoutParsingRequest.layoutParsingType(),
|
ClassificationDocument classificationDocument = parseLayout(layoutParsingRequest.layoutParsingType(),
|
||||||
originFile,
|
originFile,
|
||||||
imageServiceResponse,
|
imageServiceResponse,
|
||||||
tableServiceResponse,
|
tableServiceResponse,
|
||||||
visualLayoutParsingResponse,
|
visualLayoutParsingResponse,
|
||||||
layoutParsingRequest.identifier().toString());
|
layoutParsingRequest.identifier().toString());
|
||||||
|
|
||||||
log.info("Building document graph for {}", layoutParsingRequest.identifier());
|
log.info("Building document graph for {}", layoutParsingRequest.identifier());
|
||||||
@ -156,25 +156,25 @@ public class LayoutParsingPipeline {
|
|||||||
.numberOfPages(documentGraph.getNumberOfPages())
|
.numberOfPages(documentGraph.getNumberOfPages())
|
||||||
.duration(System.currentTimeMillis() - start)
|
.duration(System.currentTimeMillis() - start)
|
||||||
.message(format("""
|
.message(format("""
|
||||||
Layout parsing has finished in %.02f s.
|
Layout parsing has finished in %.02f s.
|
||||||
identifiers: %s
|
identifiers: %s
|
||||||
%s
|
%s
|
||||||
Files have been saved with Ids:
|
Files have been saved with Ids:
|
||||||
Structure: %s
|
Structure: %s
|
||||||
Text: %s
|
Text: %s
|
||||||
Positions: %s
|
Positions: %s
|
||||||
PageData: %s
|
PageData: %s
|
||||||
Simplified Text: %s
|
Simplified Text: %s
|
||||||
Viewer Doc: %s""",
|
Viewer Doc: %s""",
|
||||||
((float) (System.currentTimeMillis() - start)) / 1000,
|
((float) (System.currentTimeMillis() - start)) / 1000,
|
||||||
layoutParsingRequest.identifier(),
|
layoutParsingRequest.identifier(),
|
||||||
buildSemanticNodeCountMessage(documentGraph.getNumberOfPages(), documentGraph.buildSemanticNodeCounts()),
|
buildSemanticNodeCountMessage(documentGraph.getNumberOfPages(), documentGraph.buildSemanticNodeCounts()),
|
||||||
layoutParsingRequest.structureFileStorageId(),
|
layoutParsingRequest.structureFileStorageId(),
|
||||||
layoutParsingRequest.textBlockFileStorageId(),
|
layoutParsingRequest.textBlockFileStorageId(),
|
||||||
layoutParsingRequest.positionBlockFileStorageId(),
|
layoutParsingRequest.positionBlockFileStorageId(),
|
||||||
layoutParsingRequest.pageFileStorageId(),
|
layoutParsingRequest.pageFileStorageId(),
|
||||||
layoutParsingRequest.simplifiedTextStorageId(),
|
layoutParsingRequest.simplifiedTextStorageId(),
|
||||||
layoutParsingRequest.viewerDocumentStorageId()))
|
layoutParsingRequest.viewerDocumentStorageId()))
|
||||||
.build();
|
.build();
|
||||||
|
|
||||||
}
|
}
|
||||||
@ -195,14 +195,14 @@ public class LayoutParsingPipeline {
|
|||||||
private String buildSemanticNodeCountMessage(int numberOfPages, Map<NodeType, Long> semanticNodeCounts) {
|
private String buildSemanticNodeCountMessage(int numberOfPages, Map<NodeType, Long> semanticNodeCounts) {
|
||||||
|
|
||||||
return String.format("%d pages with %d sections, %d headlines, %d paragraphs, %d tables with %d cells, %d headers, and %d footers parsed",
|
return String.format("%d pages with %d sections, %d headlines, %d paragraphs, %d tables with %d cells, %d headers, and %d footers parsed",
|
||||||
numberOfPages,
|
numberOfPages,
|
||||||
semanticNodeCounts.get(NodeType.SECTION) == null ? 0 : semanticNodeCounts.get(NodeType.SECTION),
|
semanticNodeCounts.get(NodeType.SECTION) == null ? 0 : semanticNodeCounts.get(NodeType.SECTION),
|
||||||
semanticNodeCounts.get(NodeType.HEADLINE) == null ? 0 : semanticNodeCounts.get(NodeType.HEADLINE),
|
semanticNodeCounts.get(NodeType.HEADLINE) == null ? 0 : semanticNodeCounts.get(NodeType.HEADLINE),
|
||||||
semanticNodeCounts.get(NodeType.PARAGRAPH) == null ? 0 : semanticNodeCounts.get(NodeType.PARAGRAPH),
|
semanticNodeCounts.get(NodeType.PARAGRAPH) == null ? 0 : semanticNodeCounts.get(NodeType.PARAGRAPH),
|
||||||
semanticNodeCounts.get(NodeType.TABLE) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE),
|
semanticNodeCounts.get(NodeType.TABLE) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE),
|
||||||
semanticNodeCounts.get(NodeType.TABLE_CELL) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE_CELL),
|
semanticNodeCounts.get(NodeType.TABLE_CELL) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE_CELL),
|
||||||
semanticNodeCounts.get(NodeType.HEADER) == null ? 0 : semanticNodeCounts.get(NodeType.HEADER),
|
semanticNodeCounts.get(NodeType.HEADER) == null ? 0 : semanticNodeCounts.get(NodeType.HEADER),
|
||||||
semanticNodeCounts.get(NodeType.FOOTER) == null ? 0 : semanticNodeCounts.get(NodeType.FOOTER));
|
semanticNodeCounts.get(NodeType.FOOTER) == null ? 0 : semanticNodeCounts.get(NodeType.FOOTER));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -220,7 +220,7 @@ public class LayoutParsingPipeline {
|
|||||||
Map<Integer, List<TableCells>> pdfTableCells = cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse);
|
Map<Integer, List<TableCells>> pdfTableCells = cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse);
|
||||||
Map<Integer, List<ClassifiedImage>> pdfImages = imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse);
|
Map<Integer, List<ClassifiedImage>> pdfImages = imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse);
|
||||||
Map<Integer, List<ClassifiedImage>> signatures = new HashMap<>();
|
Map<Integer, List<ClassifiedImage>> signatures = new HashMap<>();
|
||||||
if(signatures.size() > 0) {
|
if (signatures.size() > 0) {
|
||||||
visualLayoutParsingAdapter.buildExtractedSignaturesPerPage(visualLayoutParsingResponse);
|
visualLayoutParsingAdapter.buildExtractedSignaturesPerPage(visualLayoutParsingResponse);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -266,6 +266,8 @@ public class LayoutParsingPipeline {
|
|||||||
case REDACT_MANAGER -> redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
case REDACT_MANAGER -> redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||||
case TAAS -> taasBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
case TAAS -> taasBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||||
case DOCUMINE -> docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
case DOCUMINE -> docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||||
|
case DOCSTRUM -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical(), true);
|
||||||
|
case DOCSTRUM_ROW_WISE -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical(), false);
|
||||||
};
|
};
|
||||||
classificationPage.setCleanRulings(cleanRulings);
|
classificationPage.setCleanRulings(cleanRulings);
|
||||||
classificationPage.setRotation(rotation);
|
classificationPage.setRotation(rotation);
|
||||||
@ -283,12 +285,16 @@ public class LayoutParsingPipeline {
|
|||||||
imageServiceResponseAdapter.findOcr(classificationPage);
|
imageServiceResponseAdapter.findOcr(classificationPage);
|
||||||
}
|
}
|
||||||
|
|
||||||
if(signatures.containsKey(pageNumber)) {
|
if (signatures.containsKey(pageNumber)) {
|
||||||
classificationPage.setImages(signatures.get(pageNumber));
|
classificationPage.setImages(signatures.get(pageNumber));
|
||||||
}
|
}
|
||||||
|
|
||||||
tableExtractionService.extractTables(cleanRulings, classificationPage);
|
tableExtractionService.extractTables(cleanRulings, classificationPage);
|
||||||
|
|
||||||
|
if (layoutParsingType == DOCSTRUM || layoutParsingType == DOCSTRUM_ROW_WISE) {
|
||||||
|
// docstrumBlockificationService.combineBlocks(classificationPage); //todo 8666
|
||||||
|
}
|
||||||
|
|
||||||
buildPageStatistics(classificationPage);
|
buildPageStatistics(classificationPage);
|
||||||
increaseDocumentStatistics(classificationPage, classificationDocument);
|
increaseDocumentStatistics(classificationPage, classificationDocument);
|
||||||
|
|
||||||
@ -304,11 +310,26 @@ public class LayoutParsingPipeline {
|
|||||||
case TAAS -> taasClassificationService.classifyDocument(classificationDocument);
|
case TAAS -> taasClassificationService.classifyDocument(classificationDocument);
|
||||||
case DOCUMINE -> docuMineClassificationService.classifyDocument(classificationDocument);
|
case DOCUMINE -> docuMineClassificationService.classifyDocument(classificationDocument);
|
||||||
case REDACT_MANAGER -> redactManagerClassificationService.classifyDocument(classificationDocument);
|
case REDACT_MANAGER -> redactManagerClassificationService.classifyDocument(classificationDocument);
|
||||||
|
case DOCSTRUM_ROW_WISE -> redactManagerClassificationService.classifyDocument(classificationDocument);
|
||||||
}
|
}
|
||||||
|
|
||||||
log.info("Building Sections for {}", identifier);
|
log.info("Building Sections for {}", identifier);
|
||||||
sectionsBuilderService.buildSections(classificationDocument);
|
|
||||||
sectionsBuilderService.addImagesToSections(classificationDocument);
|
if (layoutParsingType == DOCSTRUM || layoutParsingType == DOCSTRUM_ROW_WISE) {
|
||||||
|
// Currently for debugging return paragraphs as sections, because there is a merging logic in sectionBuilder
|
||||||
|
List<ClassificationSection> sections = new ArrayList<>();
|
||||||
|
for (var page : classificationPages) {
|
||||||
|
page.getTextBlocks().forEach(block -> {
|
||||||
|
block.setPage(page.getPageNumber());
|
||||||
|
var section = sectionsBuilderService.buildTextBlock(List.of(block), "a");
|
||||||
|
sections.add(section);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
classificationDocument.setSections(sections);
|
||||||
|
} else {
|
||||||
|
sectionsBuilderService.buildSections(classificationDocument);
|
||||||
|
sectionsBuilderService.addImagesToSections(classificationDocument);
|
||||||
|
}
|
||||||
return classificationDocument;
|
return classificationDocument;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -45,6 +45,12 @@ public abstract class AbstractPageBlock {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean containsBlock(TextPageBlock other, float threshold) {
|
||||||
|
|
||||||
|
return this.minX <= other.getMinX() + threshold && this.maxX >= other.getMaxX() - threshold && this.minY <= other.getMinY() + threshold && this.maxY >= other.getMaxY() - threshold;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
public boolean contains(AbstractPageBlock other) {
|
public boolean contains(AbstractPageBlock other) {
|
||||||
|
|
||||||
return this.minX <= other.minX && this.maxX >= other.maxX && this.minY >= other.minY && this.maxY <= other.maxY;
|
return this.minX <= other.minX && this.maxX >= other.maxX && this.minY >= other.minY && this.maxY <= other.maxY;
|
||||||
@ -96,6 +102,12 @@ public abstract class AbstractPageBlock {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean intersects(AbstractPageBlock apb) {
|
||||||
|
|
||||||
|
return this.minY < apb.getMaxY() && this.maxY >= apb.getMinY() && this.minX < apb.getMaxX() && this.maxX > apb.getMinX();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
public abstract boolean isEmpty();
|
public abstract boolean isEmpty();
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -46,8 +46,12 @@ public class RedTextPosition {
|
|||||||
private String fontName;
|
private String fontName;
|
||||||
|
|
||||||
|
|
||||||
|
@JsonIgnore
|
||||||
|
private int textSequence;
|
||||||
|
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public static RedTextPosition fromTextPosition(TextPosition textPosition) {
|
public static RedTextPosition fromTextPosition(TextPosition textPosition, int textSequence) {
|
||||||
|
|
||||||
var pos = new RedTextPosition();
|
var pos = new RedTextPosition();
|
||||||
BeanUtils.copyProperties(textPosition, pos);
|
BeanUtils.copyProperties(textPosition, pos);
|
||||||
@ -63,6 +67,7 @@ public class RedTextPosition {
|
|||||||
position[3] = textPosition.getHeightDir();
|
position[3] = textPosition.getHeightDir();
|
||||||
|
|
||||||
pos.setPosition(position);
|
pos.setPosition(position);
|
||||||
|
pos.setTextSequence(textSequence);
|
||||||
return pos;
|
return pos;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -73,7 +73,7 @@ public class TextPageBlock extends AbstractPageBlock {
|
|||||||
|
|
||||||
return sequences.get(0).getPageWidth();
|
return sequences.get(0).getPageWidth();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public static TextPageBlock merge(List<TextPageBlock> textBlocksToMerge) {
|
public static TextPageBlock merge(List<TextPageBlock> textBlocksToMerge) {
|
||||||
|
|
||||||
@ -82,6 +82,7 @@ public class TextPageBlock extends AbstractPageBlock {
|
|||||||
return fromTextPositionSequences(sequences);
|
return fromTextPositionSequences(sequences);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public static TextPageBlock fromTextPositionSequences(List<TextPositionSequence> wordBlockList) {
|
public static TextPageBlock fromTextPositionSequences(List<TextPositionSequence> wordBlockList) {
|
||||||
|
|
||||||
TextPageBlock textBlock = null;
|
TextPageBlock textBlock = null;
|
||||||
@ -133,7 +134,6 @@ public class TextPageBlock extends AbstractPageBlock {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns the minX value in pdf coordinate system.
|
* Returns the minX value in pdf coordinate system.
|
||||||
* Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation.
|
* Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation.
|
||||||
@ -362,7 +362,22 @@ public class TextPageBlock extends AbstractPageBlock {
|
|||||||
}
|
}
|
||||||
|
|
||||||
return TextNormalizationUtilities.removeHyphenLineBreaks(sb.toString());
|
return TextNormalizationUtilities.removeHyphenLineBreaks(sb.toString());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public int getNumberOfLines() {
|
||||||
|
|
||||||
|
int numberOfLines = 1;
|
||||||
|
TextPositionSequence previous = null;
|
||||||
|
for (TextPositionSequence word : sequences) {
|
||||||
|
if (previous != null) {
|
||||||
|
if (word.getMaxYDirAdj() - previous.getMaxYDirAdj() > word.getTextHeight()) {
|
||||||
|
numberOfLines++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
previous = word;
|
||||||
|
}
|
||||||
|
return numberOfLines;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -43,9 +43,9 @@ public class TextPositionSequence implements CharSequence {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public TextPositionSequence(List<TextPosition> textPositions, int page, boolean isParagraphStart) {
|
public TextPositionSequence(List<TextPosition> textPositions, int page, boolean isParagraphStart, int textSequence) {
|
||||||
|
|
||||||
this.textPositions = textPositions.stream().map(RedTextPosition::fromTextPosition).collect(Collectors.toList());
|
this.textPositions = textPositions.stream().map(textPosition -> RedTextPosition.fromTextPosition(textPosition, textSequence)).collect(Collectors.toList());
|
||||||
this.page = page;
|
this.page = page;
|
||||||
this.dir = TextDirection.fromDegrees(textPositions.get(0).getDir());
|
this.dir = TextDirection.fromDegrees(textPositions.get(0).getDir());
|
||||||
this.rotation = textPositions.get(0).getRotation();
|
this.rotation = textPositions.get(0).getRotation();
|
||||||
@ -55,6 +55,17 @@ public class TextPositionSequence implements CharSequence {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public TextPositionSequence(List<RedTextPosition> textPositions, int page) {
|
||||||
|
|
||||||
|
this.textPositions = textPositions;
|
||||||
|
this.page = page;
|
||||||
|
this.dir = TextDirection.fromDegrees(textPositions.get(0).getDir());
|
||||||
|
this.rotation = textPositions.get(0).getRotation();
|
||||||
|
this.pageHeight = textPositions.get(0).getPageHeight();
|
||||||
|
this.pageWidth = textPositions.get(0).getPageWidth();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public int length() {
|
public int length() {
|
||||||
|
|
||||||
@ -122,9 +133,9 @@ public class TextPositionSequence implements CharSequence {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public void add(TextPosition textPosition) {
|
public void add(TextPosition textPosition, int textSequence) {
|
||||||
|
|
||||||
this.textPositions.add(RedTextPosition.fromTextPosition(textPosition));
|
this.textPositions.add(RedTextPosition.fromTextPosition(textPosition, textSequence));
|
||||||
|
|
||||||
this.dir = TextDirection.fromDegrees(textPositions.get(0).getDir());
|
this.dir = TextDirection.fromDegrees(textPositions.get(0).getDir());
|
||||||
this.rotation = textPositions.get(0).getRotation();
|
this.rotation = textPositions.get(0).getRotation();
|
||||||
|
|||||||
@ -240,7 +240,7 @@ public class SectionsBuilderService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private ClassificationSection buildTextBlock(List<AbstractPageBlock> wordBlockList, String lastHeadline) {
|
public ClassificationSection buildTextBlock(List<AbstractPageBlock> wordBlockList, String lastHeadline) {
|
||||||
|
|
||||||
ClassificationSection section = new ClassificationSection();
|
ClassificationSection section = new ClassificationSection();
|
||||||
|
|
||||||
|
|||||||
@ -0,0 +1,310 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.services.blockification;
|
||||||
|
|
||||||
|
import static java.util.stream.Collectors.toSet;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Comparator;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.ListIterator;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.DocstrumSegmentationService;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.utils.QuickSort;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionSequenceComparator;
|
||||||
|
|
||||||
|
import lombok.RequiredArgsConstructor;
|
||||||
|
|
||||||
|
@SuppressWarnings("all")
|
||||||
|
@Service
|
||||||
|
@RequiredArgsConstructor
|
||||||
|
public class DocstrumBlockificationService {
|
||||||
|
|
||||||
|
private final DocstrumSegmentationService docstrumSegmentationService;
|
||||||
|
|
||||||
|
static final float THRESHOLD = 2f;
|
||||||
|
|
||||||
|
|
||||||
|
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines, boolean columnWise) {
|
||||||
|
|
||||||
|
List<AbstractPageBlock> abstractPageBlocks = new ArrayList<>();
|
||||||
|
var zones = docstrumSegmentationService.segmentPage(textPositions, columnWise);
|
||||||
|
zones.forEach(zone -> {
|
||||||
|
|
||||||
|
List<TextPositionSequence> textPositionSequences = new ArrayList<>();
|
||||||
|
zone.getLines().forEach(line -> {
|
||||||
|
line.getWords().forEach(word -> {
|
||||||
|
textPositionSequences.add(new TextPositionSequence(word.getTextPositions(), word.getPage()));
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
abstractPageBlocks.addAll(splitZonesAtRulings(textPositionSequences, horizontalRulingLines, verticalRulingLines));
|
||||||
|
// abstractPageBlocks.add(buildTextBlock(textPositionSequences, 0));
|
||||||
|
});
|
||||||
|
|
||||||
|
return new ClassificationPage(abstractPageBlocks);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void combineBlocks(ClassificationPage page) {
|
||||||
|
|
||||||
|
mergeZones(page.getTextBlocks());
|
||||||
|
|
||||||
|
TextPageBlock previous = new TextPageBlock();
|
||||||
|
ListIterator<AbstractPageBlock> itty = page.getTextBlocks().listIterator();
|
||||||
|
while (itty.hasNext()) {
|
||||||
|
AbstractPageBlock block = itty.next();
|
||||||
|
if (block instanceof TablePageBlock) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
TextPageBlock current = (TextPageBlock) block;
|
||||||
|
|
||||||
|
if (previous != null && !previous.getSequences().isEmpty()) {
|
||||||
|
|
||||||
|
if (current.getDir() == previous.getDir() && (Math.abs(previous.getMaxY() - current.getMaxY()) < THRESHOLD || Math.abs(previous.getMinY() - current.getMinY()) < THRESHOLD) && (previous.getNumberOfLines() >= 2 && current.getNumberOfLines() >= 2 /* && current.getNumberOfLines() <= 10 */ && previous.getNumberOfLines() <= current.getNumberOfLines())) {
|
||||||
|
previous.getSequences().addAll(current.getSequences());
|
||||||
|
previous = buildTextBlock(previous.getSequences(), 0);
|
||||||
|
itty.remove();
|
||||||
|
|
||||||
|
// Might be a left/right mapping add one sorted as well
|
||||||
|
var sortedDublicate = buildTextBlock(previous.getSequences().stream().sorted(new TextPositionSequenceComparator()).collect(Collectors.toList()), 0);
|
||||||
|
itty.add(sortedDublicate);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (current.getDir() == previous.getDir() && (Math.abs(previous.getMaxY() - current.getMaxY()) < THRESHOLD || Math.abs(previous.getMinY() - current.getMinY()) < THRESHOLD) && (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1 /* && current.getNumberOfLines() <= 10 */ || previous.getNumberOfLines() == 2 && current.getNumberOfLines() == 1)) {
|
||||||
|
previous.getSequences().addAll(current.getSequences());
|
||||||
|
previous = buildTextBlock(previous.getSequences(), 0);
|
||||||
|
itty.remove();
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (current.getDir() == previous.getDir() && previous.containsBlock(current, THRESHOLD)) {
|
||||||
|
previous.getSequences().addAll(current.getSequences());
|
||||||
|
QuickSort.sort(previous.getSequences(), new TextPositionSequenceComparator());
|
||||||
|
previous = buildTextBlock(previous.getSequences(), 0);
|
||||||
|
itty.remove();
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
previous = current;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private void mergeZones(List<AbstractPageBlock> zones) {
|
||||||
|
|
||||||
|
ListIterator<AbstractPageBlock> itty = zones.listIterator();
|
||||||
|
|
||||||
|
while (itty.hasNext()) {
|
||||||
|
AbstractPageBlock block = itty.next();
|
||||||
|
if (block instanceof TablePageBlock) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
TextPageBlock current = (TextPageBlock) block;
|
||||||
|
|
||||||
|
List<AbstractPageBlock> toBeRemoved = new ArrayList<>();
|
||||||
|
for (AbstractPageBlock innerZone : zones) {
|
||||||
|
if (innerZone == current) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (innerZone instanceof TablePageBlock) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
TextPageBlock inner = (TextPageBlock) innerZone;
|
||||||
|
|
||||||
|
if (current.getDir() == inner.getDir() && current.almostIntersects(inner, 0, 0)) {
|
||||||
|
|
||||||
|
current.getSequences().addAll(inner.getSequences());
|
||||||
|
current = buildTextBlock(inner.getSequences().stream().sorted(new TextPositionSequenceComparator()).collect(Collectors.toList()), 0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
zones.removeAll(toBeRemoved);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public List<AbstractPageBlock> splitZonesAtRulings(List<TextPositionSequence> textPositions, List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
|
||||||
|
|
||||||
|
int indexOnPage = 0;
|
||||||
|
List<TextPositionSequence> chunkWords = new ArrayList<>();
|
||||||
|
List<AbstractPageBlock> chunkBlockList = new ArrayList<>();
|
||||||
|
|
||||||
|
float minX = 1000, maxX = 0, minY = 1000, maxY = 0;
|
||||||
|
TextPositionSequence prev = null;
|
||||||
|
|
||||||
|
Float splitX1 = null;
|
||||||
|
for (TextPositionSequence word : textPositions) {
|
||||||
|
|
||||||
|
boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, horizontalRulingLines, verticalRulingLines);
|
||||||
|
boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir());
|
||||||
|
|
||||||
|
if (prev != null && (splitByDir || isSplitByRuling)) {
|
||||||
|
|
||||||
|
TextPageBlock cb1 = buildTextBlock(chunkWords, indexOnPage);
|
||||||
|
indexOnPage++;
|
||||||
|
|
||||||
|
chunkBlockList.add(cb1);
|
||||||
|
chunkWords = new ArrayList<>();
|
||||||
|
|
||||||
|
minX = 1000;
|
||||||
|
maxX = 0;
|
||||||
|
minY = 1000;
|
||||||
|
maxY = 0;
|
||||||
|
prev = null;
|
||||||
|
}
|
||||||
|
|
||||||
|
chunkWords.add(word);
|
||||||
|
|
||||||
|
prev = word;
|
||||||
|
if (word.getMinXDirAdj() < minX) {
|
||||||
|
minX = word.getMinXDirAdj();
|
||||||
|
}
|
||||||
|
if (word.getMaxXDirAdj() > maxX) {
|
||||||
|
maxX = word.getMaxXDirAdj();
|
||||||
|
}
|
||||||
|
if (word.getMinYDirAdj() < minY) {
|
||||||
|
minY = word.getMinYDirAdj();
|
||||||
|
}
|
||||||
|
if (word.getMaxYDirAdj() > maxY) {
|
||||||
|
maxY = word.getMaxYDirAdj();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
TextPageBlock cb1 = buildTextBlock(chunkWords, indexOnPage);
|
||||||
|
if (cb1 != null) {
|
||||||
|
chunkBlockList.add(cb1);
|
||||||
|
}
|
||||||
|
|
||||||
|
return chunkBlockList;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private boolean equalsWithThreshold(float f1, float f2) {
|
||||||
|
|
||||||
|
return Math.abs(f1 - f2) < THRESHOLD;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private TextPageBlock buildTextBlock(List<TextPositionSequence> wordBlockList, int indexOnPage) {
|
||||||
|
|
||||||
|
TextPageBlock textBlock = null;
|
||||||
|
|
||||||
|
FloatFrequencyCounter lineHeightFrequencyCounter = new FloatFrequencyCounter();
|
||||||
|
FloatFrequencyCounter fontSizeFrequencyCounter = new FloatFrequencyCounter();
|
||||||
|
FloatFrequencyCounter spaceFrequencyCounter = new FloatFrequencyCounter();
|
||||||
|
StringFrequencyCounter fontFrequencyCounter = new StringFrequencyCounter();
|
||||||
|
StringFrequencyCounter styleFrequencyCounter = new StringFrequencyCounter();
|
||||||
|
|
||||||
|
for (TextPositionSequence wordBlock : wordBlockList) {
|
||||||
|
|
||||||
|
lineHeightFrequencyCounter.add(wordBlock.getTextHeight());
|
||||||
|
fontSizeFrequencyCounter.add(wordBlock.getFontSize());
|
||||||
|
spaceFrequencyCounter.add(wordBlock.getSpaceWidth());
|
||||||
|
fontFrequencyCounter.add(wordBlock.getFont());
|
||||||
|
styleFrequencyCounter.add(wordBlock.getFontStyle());
|
||||||
|
|
||||||
|
if (textBlock == null) {
|
||||||
|
textBlock = new TextPageBlock(wordBlock.getMinXDirAdj(),
|
||||||
|
wordBlock.getMaxXDirAdj(),
|
||||||
|
wordBlock.getMinYDirAdj(),
|
||||||
|
wordBlock.getMaxYDirAdj(),
|
||||||
|
wordBlockList,
|
||||||
|
wordBlock.getRotation());
|
||||||
|
} else {
|
||||||
|
TextPageBlock spatialEntity = textBlock.union(wordBlock);
|
||||||
|
textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (textBlock != null) {
|
||||||
|
textBlock.setMostPopularWordFont(fontFrequencyCounter.getMostPopular());
|
||||||
|
textBlock.setMostPopularWordStyle(styleFrequencyCounter.getMostPopular());
|
||||||
|
textBlock.setMostPopularWordFontSize(fontSizeFrequencyCounter.getMostPopular());
|
||||||
|
textBlock.setMostPopularWordHeight(lineHeightFrequencyCounter.getMostPopular());
|
||||||
|
textBlock.setMostPopularWordSpaceWidth(spaceFrequencyCounter.getMostPopular());
|
||||||
|
textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest());
|
||||||
|
}
|
||||||
|
|
||||||
|
if (textBlock != null && textBlock.getSequences() != null && textBlock.getSequences().stream().map(t -> round(t.getMinYDirAdj(), 3)).collect(toSet()).size() == 1) {
|
||||||
|
textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj));
|
||||||
|
}
|
||||||
|
return textBlock;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private boolean isSplitByRuling(float minX,
|
||||||
|
float minY,
|
||||||
|
float maxX,
|
||||||
|
float maxY,
|
||||||
|
TextPositionSequence word,
|
||||||
|
List<Ruling> horizontalRulingLines,
|
||||||
|
List<Ruling> verticalRulingLines) {
|
||||||
|
|
||||||
|
return isSplitByRuling(maxX,
|
||||||
|
minY,
|
||||||
|
word.getMinXDirAdj(),
|
||||||
|
word.getMinYDirAdj(),
|
||||||
|
verticalRulingLines,
|
||||||
|
word.getDir().getDegrees(),
|
||||||
|
word.getPageWidth(),
|
||||||
|
word.getPageHeight()) //
|
||||||
|
|| isSplitByRuling(minX,
|
||||||
|
minY,
|
||||||
|
word.getMinXDirAdj(),
|
||||||
|
word.getMaxYDirAdj(),
|
||||||
|
horizontalRulingLines,
|
||||||
|
word.getDir().getDegrees(),
|
||||||
|
word.getPageWidth(),
|
||||||
|
word.getPageHeight()) //
|
||||||
|
|| isSplitByRuling(maxX,
|
||||||
|
minY,
|
||||||
|
word.getMinXDirAdj(),
|
||||||
|
word.getMinYDirAdj(),
|
||||||
|
horizontalRulingLines,
|
||||||
|
word.getDir().getDegrees(),
|
||||||
|
word.getPageWidth(),
|
||||||
|
word.getPageHeight()) //
|
||||||
|
|| isSplitByRuling(minX,
|
||||||
|
minY,
|
||||||
|
word.getMinXDirAdj(),
|
||||||
|
word.getMaxYDirAdj(),
|
||||||
|
verticalRulingLines,
|
||||||
|
word.getDir().getDegrees(),
|
||||||
|
word.getPageWidth(),
|
||||||
|
word.getPageHeight());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private boolean isSplitByRuling(float previousX2, float previousY1, float currentX1, float currentY1, List<Ruling> rulingLines, float dir, float pageWidth, float pageHeight) {
|
||||||
|
|
||||||
|
for (Ruling ruling : rulingLines) {
|
||||||
|
var line = RulingTextDirAdjustUtil.convertToDirAdj(ruling, dir, pageWidth, pageHeight);
|
||||||
|
if (line.intersectsLine(previousX2, previousY1, currentX1, currentY1)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private double round(float value, int decimalPoints) {
|
||||||
|
|
||||||
|
var d = Math.pow(10, decimalPoints);
|
||||||
|
return Math.round(value * d) / d;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,59 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Character;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Zone;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service.LineBuilderService;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service.NearestNeighbourService;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service.ReadingOrderService;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service.SpacingService;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service.ZoneBuilderService;
|
||||||
|
|
||||||
|
import lombok.RequiredArgsConstructor;
|
||||||
|
|
||||||
|
@Service
|
||||||
|
@RequiredArgsConstructor
|
||||||
|
public class DocstrumSegmentationService {
|
||||||
|
|
||||||
|
private final NearestNeighbourService nearestNeighbourService;
|
||||||
|
private final SpacingService spacingService;
|
||||||
|
private final LineBuilderService lineBuilderService;
|
||||||
|
private final ZoneBuilderService zoneBuilderService;
|
||||||
|
private final ReadingOrderService readingOrderService;
|
||||||
|
|
||||||
|
|
||||||
|
public List<Zone> segmentPage(List<TextPositionSequence> textPositions, boolean columnWise) {
|
||||||
|
|
||||||
|
List<Zone> zones = new ArrayList<>();
|
||||||
|
zones.addAll(computeZones(textPositions, TextDirection.ZERO));
|
||||||
|
zones.addAll(computeZones(textPositions, TextDirection.QUARTER_CIRCLE));
|
||||||
|
zones.addAll(computeZones(textPositions, TextDirection.HALF_CIRCLE));
|
||||||
|
zones.addAll(computeZones(textPositions, TextDirection.THREE_QUARTER_CIRCLE));
|
||||||
|
|
||||||
|
return readingOrderService.resolve(zones, columnWise);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private List<Zone> computeZones(List<TextPositionSequence> textPositions, TextDirection direction) {
|
||||||
|
|
||||||
|
var positions = textPositions.stream().filter(t -> t.getDir() == direction).map(TextPositionSequence::getTextPositions).flatMap(List::stream).toList();
|
||||||
|
|
||||||
|
var characters = positions.stream().map(Character::new).collect(Collectors.toList());
|
||||||
|
|
||||||
|
nearestNeighbourService.findNearestNeighbors(characters);
|
||||||
|
|
||||||
|
var characterSpacing = spacingService.computeCharacterSpacing(characters);
|
||||||
|
var lineSpacing = Math.min(spacingService.computeLineSpacing(characters), 20);
|
||||||
|
|
||||||
|
var lines = lineBuilderService.buildLines(characters, characterSpacing, lineSpacing);
|
||||||
|
return zoneBuilderService.buildZones(lines, characterSpacing, lineSpacing);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,32 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model;
|
||||||
|
|
||||||
|
public class AngleFilter {
|
||||||
|
|
||||||
|
protected double lowerAngle;
|
||||||
|
protected double upperAngle;
|
||||||
|
|
||||||
|
|
||||||
|
public AngleFilter(double lowerAngle, double upperAngle) {
|
||||||
|
|
||||||
|
if (lowerAngle < -Math.PI / 2) {
|
||||||
|
lowerAngle += Math.PI;
|
||||||
|
}
|
||||||
|
if (upperAngle >= Math.PI / 2) {
|
||||||
|
upperAngle -= Math.PI;
|
||||||
|
}
|
||||||
|
|
||||||
|
this.lowerAngle = lowerAngle;
|
||||||
|
this.upperAngle = upperAngle;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean matches(Neighbor neighbor) {
|
||||||
|
|
||||||
|
if (lowerAngle <= upperAngle) {
|
||||||
|
return lowerAngle <= neighbor.getAngle() && neighbor.getAngle() < upperAngle;
|
||||||
|
} else {
|
||||||
|
return lowerAngle <= neighbor.getAngle() || neighbor.getAngle() < upperAngle;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,56 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model;
|
||||||
|
|
||||||
|
import java.awt.geom.Rectangle2D;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||||
|
|
||||||
|
import lombok.Data;
|
||||||
|
|
||||||
|
@Data
|
||||||
|
public abstract class BoundingBox {
|
||||||
|
|
||||||
|
private Rectangle2D bBox;
|
||||||
|
|
||||||
|
|
||||||
|
public double getX() {
|
||||||
|
|
||||||
|
return bBox.getX();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public double getY() {
|
||||||
|
|
||||||
|
return bBox.getY();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public double getWidth() {
|
||||||
|
|
||||||
|
return bBox.getWidth();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public double getHeight() {
|
||||||
|
|
||||||
|
return bBox.getHeight();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public double getArea() {
|
||||||
|
|
||||||
|
return (bBox.getHeight() * bBox.getWidth());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean contains(Rectangle2D contained, double tolerance) {
|
||||||
|
|
||||||
|
return bBox.getX() <= contained.getX() + tolerance && bBox.getY() <= contained.getY() + tolerance && bBox.getX() + bBox.getWidth() >= contained.getX() + contained.getWidth() - tolerance && bBox.getY() + bBox.getHeight() >= contained.getY() + contained.getHeight() - tolerance;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean intersectsY(BoundingBox other) {
|
||||||
|
|
||||||
|
return this.getBBox().getMinY() <= other.getBBox().getMaxY() && this.getBBox().getMaxY() >= other.getBBox().getMinY();
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,84 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
|
||||||
|
|
||||||
|
import lombok.Data;
|
||||||
|
|
||||||
|
@Data
|
||||||
|
public class Character {
|
||||||
|
|
||||||
|
private final double x;
|
||||||
|
private final double y;
|
||||||
|
private final RedTextPosition textPosition;
|
||||||
|
|
||||||
|
private List<Neighbor> neighbors = new ArrayList<>();
|
||||||
|
|
||||||
|
|
||||||
|
public Character(RedTextPosition chunk) {
|
||||||
|
|
||||||
|
this.x = chunk.getXDirAdj() + chunk.getWidthDirAdj() / 2;
|
||||||
|
this.y = chunk.getYDirAdj() + chunk.getHeightDir() / 2;
|
||||||
|
this.textPosition = chunk;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public double getHeight() {
|
||||||
|
|
||||||
|
return textPosition.getHeightDir();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public double distance(Character character) {
|
||||||
|
|
||||||
|
double dx = getX() - character.getX();
|
||||||
|
double dy = getY() - character.getY();
|
||||||
|
return Math.sqrt(dx * dx + dy * dy);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public double horizontalDistance(Character character) {
|
||||||
|
|
||||||
|
return Math.abs(getX() - character.getX());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public double verticalDistance(Character character) {
|
||||||
|
|
||||||
|
return Math.abs(getY() - character.getY());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public double overlappingDistance(Character other) {
|
||||||
|
|
||||||
|
double[] xs = new double[4];
|
||||||
|
double s = Math.sin(-0), c = Math.cos(-0);
|
||||||
|
xs[0] = c * x - s * y;
|
||||||
|
xs[1] = c * (x + textPosition.getWidthDirAdj()) - s * (y + textPosition.getHeightDir());
|
||||||
|
xs[2] = c * other.x - s * other.y;
|
||||||
|
xs[3] = c * (other.x + other.textPosition.getWidthDirAdj()) - s * (other.y + other.textPosition.getHeightDir());
|
||||||
|
boolean overlapping = xs[1] >= xs[2] && xs[3] >= xs[0];
|
||||||
|
Arrays.sort(xs);
|
||||||
|
return Math.abs(xs[2] - xs[1]) * (overlapping ? 1 : -1);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void setNeighbors(List<Neighbor> neighbors) {
|
||||||
|
|
||||||
|
this.neighbors = neighbors;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public double angle(Character character) {
|
||||||
|
|
||||||
|
if (getX() > character.getX()) {
|
||||||
|
return Math.atan2(getY() - character.getY(), getX() - character.getX());
|
||||||
|
} else {
|
||||||
|
return Math.atan2(character.getY() - getY(), character.getX() - getX());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,194 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model;
|
||||||
|
|
||||||
|
import java.util.AbstractSet;
|
||||||
|
import java.util.Collection;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.Iterator;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.NoSuchElementException;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
/**
 * Union-find structure over a fixed collection of elements. Every element starts in its own set; sets are
 * joined with {@link #union(Object, Object)}. Iterating yields one {@link Set} view per remaining set.
 *
 * @param <E> element type; elements are used as keys of a {@link HashMap}
 */
public class DisjointSets<E> implements Iterable<Set<E>> {

    private final Map<E, Entry<E>> map = new HashMap<>();


    /**
     * Creates one singleton set per element.
     *
     * @param collection elements to manage
     */
    public DisjointSets(Collection<? extends E> collection) {

        for (E member : collection) {
            map.put(member, new Entry<>(member));
        }
    }


    /**
     * @param e1 first element, must have been part of the constructor collection
     * @param e2 second element, must have been part of the constructor collection
     * @return {@code true} if both elements currently belong to the same set
     */
    public boolean areTogether(E e1, E e2) {

        Entry<E> first = map.get(e1).findRepresentative();
        return first == map.get(e2).findRepresentative();
    }


    /**
     * Joins the sets of two elements; nothing happens if they already share a set. The representative of the
     * smaller set is attached to the representative of the larger one; on equal size the set of {@code e2} keeps
     * its representative.
     *
     * @param e1 first element, must have been part of the constructor collection
     * @param e2 second element, must have been part of the constructor collection
     */
    public void union(E e1, E e2) {

        Entry<E> first = map.get(e1).findRepresentative();
        Entry<E> second = map.get(e2).findRepresentative();
        if (first == second) {
            return;
        }

        if (first.size > second.size) {
            first.mergeWith(second);
        } else {
            second.mergeWith(first);
        }
    }


    /**
     * @return iterator over the current sets, in the iteration order of the backing map; removal is not supported
     */
    @Override
    public Iterator<Set<E>> iterator() {

        return new RepresentativeIterator();
    }


    /**
     * Walks over all entries of the map and only stops at those that represent a set.
     */
    private final class RepresentativeIterator implements Iterator<Set<E>> {

        private final Iterator<Entry<E>> entries = map.values().iterator();
        private Entry<E> upcoming = advance();


        @Override
        public boolean hasNext() {

            return upcoming != null;
        }


        @Override
        public Set<E> next() {

            if (upcoming == null) {
                throw new NoSuchElementException();
            }
            Set<E> members = upcoming.asSet();
            upcoming = advance();
            return members;
        }


        @Override
        public void remove() {

            throw new UnsupportedOperationException();
        }


        /**
         * @return the next entry that is a representative, or {@code null} when the map is exhausted
         */
        private Entry<E> advance() {

            while (entries.hasNext()) {
                Entry<E> entry = entries.next();
                if (entry.isRepresentative()) {
                    return entry;
                }
            }
            return null;
        }

    }


    /**
     * Node of the structure. The members of a set are chained through {@code next}, starting at the
     * representative; {@code size} and {@code last} are only maintained on representatives.
     */
    private static class Entry<E> {

        private int size = 1;
        private final E value;
        private Entry<E> parent = this;
        private Entry<E> next = null;
        private Entry<E> last = this;


        Entry(E value) {

            this.value = value;
        }


        /**
         * Appends the member chain of another representative to this one and becomes its parent.
         *
         * @param otherRepresentative representative of the set that is absorbed
         */
        void mergeWith(Entry<E> otherRepresentative) {

            otherRepresentative.parent = this;
            last.next = otherRepresentative;
            last = otherRepresentative.last;
            size += otherRepresentative.size;
        }


        /**
         * Finds the representative of this entry's set and re-parents every entry on the way directly to it.
         *
         * @return the representative
         */
        Entry<E> findRepresentative() {

            Entry<E> root = this;
            while (root.parent != root) {
                root = root.parent;
            }

            Entry<E> cursor = this;
            while (cursor != root) {
                Entry<E> formerParent = cursor.parent;
                cursor.parent = root;
                cursor = formerParent;
            }
            return root;
        }


        boolean isRepresentative() {

            return parent == this;
        }


        /**
         * @return read-only view of the set this entry belongs to; the view follows later unions
         */
        Set<E> asSet() {

            return new AbstractSet<E>() {

                @Override
                public Iterator<E> iterator() {

                    return new Iterator<E>() {

                        private Entry<E> cursor = findRepresentative();


                        @Override
                        public boolean hasNext() {

                            return cursor != null;
                        }


                        @Override
                        public E next() {

                            if (cursor == null) {
                                throw new NoSuchElementException();
                            }
                            E member = cursor.value;
                            cursor = cursor.next;
                            return member;
                        }


                        @Override
                        public void remove() {

                            throw new UnsupportedOperationException();
                        }

                    };
                }


                @Override
                public int size() {

                    return findRepresentative().size;
                }

            };
        }

    }

}
|
||||||
@ -0,0 +1,91 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model;
|
||||||
|
|
||||||
|
/**
 * Histogram with equally wide bins over a value range, with optional smoothing and peak lookup.
 */
public class Histogram {

    private static final double EPSILON = 1.0e-6;

    private final double min;
    private final double resolution;
    private double[] frequencies;


    /**
     * Creates an empty histogram. The covered range is widened by a small epsilon on both sides, the number of
     * bins is the rounded quotient of range and requested resolution (at least one), and the effective bin
     * width is derived from that bin count.
     *
     * @param minValue   smallest value that will be added
     * @param maxValue   largest value that will be added
     * @param resolution requested bin width
     */
    public Histogram(double minValue, double maxValue, double resolution) {

        int bins = Math.max(1, (int) Math.round((maxValue - minValue) / resolution));
        double coveredRange = maxValue - minValue + 2 * EPSILON;

        this.min = minValue - EPSILON;
        this.resolution = coveredRange / bins;
        this.frequencies = new double[bins];
    }


    /**
     * Replaces the frequencies by their convolution with the given kernel; contributions that would fall
     * outside of the histogram are dropped.
     *
     * @param kernel kernel weights, centred on index {@code (kernel.length - 1) / 2}
     */
    public void kernelSmooth(double[] kernel) {

        final int bins = frequencies.length;
        final int center = (kernel.length - 1) / 2;
        double[] smoothed = new double[bins];

        for (int k = 0; k < kernel.length; k++) {
            int offset = k - center;
            int from = Math.max(0, offset);
            int to = Math.min(bins, bins + offset);
            for (int source = from; source < to; source++) {
                smoothed[source - offset] += kernel[k] * frequencies[source];
            }
        }

        frequencies = smoothed;
    }


    /**
     * Builds a normalised Gaussian kernel whose size is expressed in value units and converted to bins with
     * the bin width of this histogram.
     *
     * @param length       window length in value units
     * @param stdDeviation standard deviation in value units
     * @return kernel with an odd number of weights that sum up to one
     */
    public double[] createGaussianKernel(double length, double stdDeviation) {

        final int radius = (int) Math.round(length / resolution) / 2;
        final double sigma = stdDeviation / resolution;
        final double b = 2 * sigma * sigma;
        final double a = 1 / Math.sqrt(Math.PI * b);

        double[] kernel = new double[2 * radius + 1];
        double total = 0;
        for (int i = 0; i < kernel.length; i++) {
            int distance = i - radius;
            kernel[i] = a * Math.exp(-distance * distance / b);
            total += kernel[i];
        }

        // Normalise so that smoothing keeps the overall weight.
        for (int i = 0; i < kernel.length; i++) {
            kernel[i] /= total;
        }
        return kernel;
    }


    /**
     * Smooths the histogram with a Gaussian kernel.
     *
     * @param windowLength window length in value units
     * @param stdDeviation standard deviation in value units
     */
    public void gaussianSmooth(double windowLength, double stdDeviation) {

        double[] kernel = createGaussianKernel(windowLength, stdDeviation);
        kernelSmooth(kernel);
    }


    /**
     * Counts a value in its bin.
     *
     * @param value value inside the range given to the constructor
     */
    public void add(double value) {

        int bin = (int) ((value - min) / resolution);
        frequencies[bin] += 1.0;
    }


    /**
     * @return number of bins
     */
    public int getSize() {

        return frequencies.length;
    }


    /**
     * Looks up the first bin with the highest frequency and extends it over directly following bins of
     * (almost) the same frequency.
     *
     * @return the value in the middle of that run of bins
     */
    public double getPeakValue() {

        final double EPS = 0.0001;

        int peakStart = 0;
        for (int bin = 1; bin < frequencies.length; bin++) {
            if (frequencies[bin] > frequencies[peakStart]) {
                peakStart = bin;
            }
        }

        int peakEnd = peakStart + 1;
        while (peakEnd < frequencies.length && Math.abs(frequencies[peakEnd] - frequencies[peakStart]) < EPS) {
            peakEnd++;
        }

        return ((double) peakStart + peakEnd) / 2 * resolution + min;
    }

}
|
||||||
@ -0,0 +1,165 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model;
|
||||||
|
|
||||||
|
import java.awt.geom.Rectangle2D;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||||
|
|
||||||
|
import lombok.Data;
|
||||||
|
|
||||||
|
@Data
|
||||||
|
public class Line extends BoundingBox {
|
||||||
|
|
||||||
|
private static final double WORD_DISTANCE_MULTIPLIER = 0.2;
|
||||||
|
|
||||||
|
private final double x0;
|
||||||
|
private final double y0;
|
||||||
|
|
||||||
|
private final double x1;
|
||||||
|
private final double y1;
|
||||||
|
|
||||||
|
private final double height;
|
||||||
|
|
||||||
|
private final List<Character> characters;
|
||||||
|
private final List<TextPositionSequence> words = new ArrayList<>();
|
||||||
|
|
||||||
|
|
||||||
|
public Line(List<Character> characters, double wordSpacing) {
|
||||||
|
|
||||||
|
this.characters = characters;
|
||||||
|
|
||||||
|
if (characters.size() >= 2) {
|
||||||
|
// linear regression
|
||||||
|
double sx = 0.0, sxx = 0.0, sxy = 0.0, sy = 0.0;
|
||||||
|
for (Character character : characters) {
|
||||||
|
sx += character.getX();
|
||||||
|
sxx += character.getX() * character.getX();
|
||||||
|
sxy += character.getX() * character.getY();
|
||||||
|
sy += character.getY();
|
||||||
|
}
|
||||||
|
double b = (characters.size() * sxy - sx * sy) / (characters.size() * sxx - sx * sx);
|
||||||
|
double a = (sy - b * sx) / characters.size();
|
||||||
|
|
||||||
|
this.x0 = characters.get(0).getX();
|
||||||
|
this.y0 = a + b * this.x0;
|
||||||
|
this.x1 = characters.get(characters.size() - 1).getX();
|
||||||
|
this.y1 = a + b * this.x1;
|
||||||
|
} else {
|
||||||
|
Character character = characters.get(0);
|
||||||
|
double dx = character.getTextPosition().getWidthDirAdj() / 3;
|
||||||
|
double dy = dx * Math.tan(0);
|
||||||
|
this.x0 = character.getX() - dx;
|
||||||
|
this.x1 = character.getX() + dx;
|
||||||
|
this.y0 = character.getY() - dy;
|
||||||
|
this.y1 = character.getY() + dy;
|
||||||
|
}
|
||||||
|
height = computeHeight();
|
||||||
|
computeWords(wordSpacing * WORD_DISTANCE_MULTIPLIER);
|
||||||
|
buildBBox();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public double getAngle() {
|
||||||
|
|
||||||
|
return Math.atan2(y1 - y0, x1 - x0);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public double getLength() {
|
||||||
|
|
||||||
|
return Math.sqrt((x0 - x1) * (x0 - x1) + (y0 - y1) * (y0 - y1));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private double computeHeight() {
|
||||||
|
|
||||||
|
double sum = 0.0;
|
||||||
|
for (Character component : characters) {
|
||||||
|
sum += component.getHeight();
|
||||||
|
}
|
||||||
|
return sum / characters.size();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public double angularDifference(Line j) {
|
||||||
|
|
||||||
|
double diff = Math.abs(getAngle() - j.getAngle());
|
||||||
|
if (diff <= Math.PI / 2) {
|
||||||
|
return diff;
|
||||||
|
} else {
|
||||||
|
return Math.PI - diff;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public double horizontalDistance(Line other) {
|
||||||
|
|
||||||
|
double[] xs = new double[4];
|
||||||
|
xs[0] = x0;
|
||||||
|
xs[1] = x1;
|
||||||
|
xs[2] = other.x0;
|
||||||
|
xs[3] = other.x1;
|
||||||
|
boolean overlapping = xs[1] >= xs[2] && xs[3] >= xs[0];
|
||||||
|
Arrays.sort(xs);
|
||||||
|
return Math.abs(xs[2] - xs[1]) * (overlapping ? 1 : -1);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public double verticalDistance(Line other) {
|
||||||
|
|
||||||
|
double ym = (y0 + y1) / 2;
|
||||||
|
double yn = (other.y0 + other.y1) / 2;
|
||||||
|
return Math.abs(ym - yn) / Math.sqrt(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private void computeWords(double wordSpacing) {
|
||||||
|
|
||||||
|
TextPositionSequence word = new TextPositionSequence();
|
||||||
|
Character previous = null;
|
||||||
|
for (Character current : characters) {
|
||||||
|
if (previous != null) {
|
||||||
|
double dist = current.getTextPosition().getXDirAdj() - previous.getTextPosition().getXDirAdj() - previous.getTextPosition().getWidthDirAdj();
|
||||||
|
if (dist > wordSpacing) {
|
||||||
|
words.add(word);
|
||||||
|
word = new TextPositionSequence();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
word.getTextPositions().add(current.getTextPosition());
|
||||||
|
previous = current;
|
||||||
|
}
|
||||||
|
words.add(word);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private void buildBBox() {
|
||||||
|
|
||||||
|
double minX = Double.POSITIVE_INFINITY;
|
||||||
|
double minY = Double.POSITIVE_INFINITY;
|
||||||
|
double maxX = Double.NEGATIVE_INFINITY;
|
||||||
|
double maxY = Double.NEGATIVE_INFINITY;
|
||||||
|
|
||||||
|
for (Character character : characters) {
|
||||||
|
|
||||||
|
minX = Math.min(minX, character.getTextPosition().getXDirAdj());
|
||||||
|
minY = Math.min(minY, character.getTextPosition().getYDirAdj());
|
||||||
|
maxX = Math.max(maxX, character.getTextPosition().getXDirAdj() + character.getTextPosition().getWidthDirAdj());
|
||||||
|
maxY = Math.max(maxY, character.getTextPosition().getYDirAdj() + character.getTextPosition().getHeightDir());
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
this.setBBox(new Rectangle2D.Double(minX, minY, maxX - minX, maxY - minY));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public String toString() {
|
||||||
|
|
||||||
|
StringBuilder sb = new StringBuilder();
|
||||||
|
words.forEach(word -> sb.append(word.toString()).append(" "));
|
||||||
|
return sb.toString().trim();
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
@ -0,0 +1,36 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model;
|
||||||
|
|
||||||
|
import lombok.Getter;
|
||||||
|
|
||||||
|
public class Neighbor {
|
||||||
|
|
||||||
|
@Getter
|
||||||
|
private final double distance;
|
||||||
|
@Getter
|
||||||
|
private final double angle;
|
||||||
|
private final Character originCharacter;
|
||||||
|
@Getter
|
||||||
|
private final Character character;
|
||||||
|
|
||||||
|
|
||||||
|
public Neighbor(Character neighbor, Character origin) {
|
||||||
|
|
||||||
|
this.distance = neighbor.distance(origin);
|
||||||
|
this.angle = neighbor.angle(origin);
|
||||||
|
this.character = neighbor;
|
||||||
|
this.originCharacter = origin;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public double getHorizontalDistance() {
|
||||||
|
|
||||||
|
return character.horizontalDistance(originCharacter);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public double getVerticalDistance() {
|
||||||
|
|
||||||
|
return character.verticalDistance(originCharacter);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,51 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model;
|
||||||
|
|
||||||
|
import java.awt.geom.Rectangle2D;
|
||||||
|
import java.util.Comparator;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import lombok.Data;
|
||||||
|
|
||||||
|
@Data
|
||||||
|
public class Zone extends BoundingBox {
|
||||||
|
|
||||||
|
private List<Line> lines;
|
||||||
|
|
||||||
|
private int readingOrder = -1;
|
||||||
|
|
||||||
|
public Zone(List<Line> lines) {
|
||||||
|
|
||||||
|
lines.sort(Comparator.comparingDouble(Line::getY));
|
||||||
|
this.lines = lines;
|
||||||
|
buildBBox();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void buildBBox() {
|
||||||
|
|
||||||
|
double minX = Double.POSITIVE_INFINITY;
|
||||||
|
double minY = Double.POSITIVE_INFINITY;
|
||||||
|
double maxX = Double.NEGATIVE_INFINITY;
|
||||||
|
double maxY = Double.NEGATIVE_INFINITY;
|
||||||
|
|
||||||
|
for (Line line : lines) {
|
||||||
|
|
||||||
|
minX = Math.min(minX, line.getX());
|
||||||
|
minY = Math.min(minY, line.getY());
|
||||||
|
maxX = Math.max(maxX, line.getX() + line.getWidth());
|
||||||
|
maxY = Math.max(maxY, line.getY() + line.getHeight());
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
this.setBBox(new Rectangle2D.Double(minX, minY, maxX - minX, maxY - minY));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public String toString() {
|
||||||
|
|
||||||
|
StringBuilder sb = new StringBuilder();
|
||||||
|
lines.forEach(line -> sb.append(line.toString()).append("\n"));
|
||||||
|
return sb.toString().trim();
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,59 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.readingorder;
|
||||||
|
|
||||||
|
/**
 * Relations that can hold between two one-dimensional intervals X and Y.
 *
 * <p>The {@code *_INVERSE} constants describe the same relation with the roles of X and Y swapped.
 */
public enum IntervalRelations {

    /** Unknown interval relation. */
    UNKNOWN,

    /**
     * X takes place before Y.
     * <pre>
     * |____X____|......................
     * ......................|____Y____|
     * </pre>
     */
    PRECEDES,

    /**
     * X meets Y.
     * <pre>
     * |____X____|.................
     * ................|____Y____|
     * </pre>
     */
    MEETS,

    /**
     * X overlaps with Y.
     * <pre>
     * |______X______|.................
     * ................|______Y______|
     * </pre>
     */
    OVERLAPS,

    /**
     * X starts Y.
     * <pre>
     * |____X____|.................
     * |_____Y_____|..............
     * </pre>
     */
    STARTS,

    /**
     * X during Y.
     * <pre>
     * ........|____X____|.........
     * .....|______Y______|.....
     * </pre>
     */
    DURING,

    /**
     * X finishes Y.
     * <pre>
     * .................|____X____|
     * ..............|_____Y_____|
     * </pre>
     */
    FINISHES,

    /** Inverse of {@link #PRECEDES}. */
    PRECEDES_INVERSE,

    /** Inverse of {@link #MEETS}. */
    MEETS_INVERSE,

    /** Inverse of {@link #OVERLAPS}. */
    OVERLAPS_INVERSE,

    /** Inverse of {@link #STARTS}. */
    STARTS_INVERSE,

    /** Inverse of {@link #DURING}. */
    DURING_INVERSE,

    /** Inverse of {@link #FINISHES}. */
    FINISHES_INVERSE,

    /**
     * X is equal to Y.
     * <pre>
     * ..........|____X____|............
     * ..........|____Y____|............
     * </pre>
     */
    EQUALS
}
|
||||||
@ -0,0 +1,11 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.readingorder;
|
||||||
|
|
||||||
|
import java.util.Collection;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Zone;
|
||||||
|
|
||||||
|
public interface ReadingOrderDetector {
|
||||||
|
|
||||||
|
Collection<Zone> get(Collection<Zone> zones);
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,10 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.readingorder;
|
||||||
|
|
||||||
|
/**
 * Rule sets that decide which spatial relations count as "before" when ordering zones.
 */
public enum SpatialReasoningRules {

    /** In western culture the reading order is from left to right and from top to bottom. */
    BASIC,

    /** The diagonal direction 'left-bottom to top-right' cannot be present among the Basic relations allowed. */
    ROW_WISE,

    /** The diagonal direction 'right-top to bottom-left' cannot be present among the Basic relations allowed. */
    COLUMN_WISE
}
|
||||||
@ -0,0 +1,261 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.readingorder;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.Optional;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import org.apache.commons.lang3.tuple.ImmutablePair;
|
||||||
|
import org.apache.commons.lang3.tuple.Pair;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Zone;
|
||||||
|
|
||||||
|
import lombok.Getter;
|
||||||
|
|
||||||
|
public class UnsupervisedReadingOrderDetector {
|
||||||
|
|
||||||
|
private boolean useRenderingOrder = true;
|
||||||
|
@Getter
|
||||||
|
private SpatialReasoningRules spatialReasoningRule = SpatialReasoningRules.COLUMN_WISE;
|
||||||
|
private double tolerance = 5;
|
||||||
|
private ZoneComparator zoneComparator;
|
||||||
|
|
||||||
|
|
||||||
|
public boolean useRenderingOrder() {
|
||||||
|
|
||||||
|
return useRenderingOrder;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public UnsupervisedReadingOrderDetector() {
|
||||||
|
|
||||||
|
configureComparator();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public UnsupervisedReadingOrderDetector(double tolerance, SpatialReasoningRules spatialReasoningRule, boolean useRenderingOrder) {
|
||||||
|
|
||||||
|
this.tolerance = tolerance;
|
||||||
|
this.spatialReasoningRule = spatialReasoningRule;
|
||||||
|
this.useRenderingOrder = useRenderingOrder;
|
||||||
|
|
||||||
|
configureComparator();
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void configureComparator() {
|
||||||
|
|
||||||
|
switch (spatialReasoningRule) {
|
||||||
|
case COLUMN_WISE:
|
||||||
|
if (useRenderingOrder) {
|
||||||
|
zoneComparator = (Zone z1, Zone z2, double t) -> getBeforeInReadingColumnWise(z1, z2, t) || getBeforeInRendering(z1, z2);
|
||||||
|
} else {
|
||||||
|
zoneComparator = this::getBeforeInReadingColumnWise;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
|
||||||
|
case ROW_WISE:
|
||||||
|
if (useRenderingOrder) {
|
||||||
|
zoneComparator = (Zone z1, Zone z2, double t) -> getBeforeInReadingRowWise(z1, z2, t) || getBeforeInRendering(z1, z2);
|
||||||
|
} else {
|
||||||
|
zoneComparator = this::getBeforeInReadingRowWise;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
|
||||||
|
case BASIC:
|
||||||
|
default:
|
||||||
|
if (useRenderingOrder) {
|
||||||
|
zoneComparator = (Zone z1, Zone z2, double t) -> getBeforeInReading(z1, z2, t) || getBeforeInRendering(z1, z2);
|
||||||
|
} else {
|
||||||
|
zoneComparator = this::getBeforeInReading;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public List<Zone> get(List<Zone> zones) {
|
||||||
|
|
||||||
|
int readingOrder = 0;
|
||||||
|
Map<Integer, List<Integer>> graph = buildGraph(zones);
|
||||||
|
|
||||||
|
List<Zone> orderedZones = new ArrayList<>();
|
||||||
|
|
||||||
|
while (!graph.isEmpty()) {
|
||||||
|
int maxCount = graph.values()
|
||||||
|
.stream()
|
||||||
|
.mapToInt(List::size)
|
||||||
|
.max()
|
||||||
|
.orElse(0);
|
||||||
|
|
||||||
|
Map.Entry<Integer, List<Integer>> current = graph.entrySet()
|
||||||
|
.stream()
|
||||||
|
.filter(entry -> entry.getValue().size() == maxCount)
|
||||||
|
.findFirst()
|
||||||
|
.orElse(null);
|
||||||
|
|
||||||
|
if (current != null) {
|
||||||
|
int index = current.getKey();
|
||||||
|
graph.remove(index);
|
||||||
|
|
||||||
|
for (List<Integer> valueList : graph.values()) {
|
||||||
|
valueList.remove(Integer.valueOf(index));
|
||||||
|
}
|
||||||
|
|
||||||
|
Zone zone = zones.get(index);
|
||||||
|
zone.setReadingOrder(readingOrder++);
|
||||||
|
orderedZones.add(zone);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return orderedZones;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private Map<Integer, List<Integer>> buildGraph(List<Zone> zones) {
|
||||||
|
|
||||||
|
Map<Integer, List<Integer>> graph = new HashMap<>();
|
||||||
|
|
||||||
|
for (int i = 0; i < zones.size(); i++) {
|
||||||
|
graph.put(i, new ArrayList<>());
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int i = 0; i < zones.size(); i++) {
|
||||||
|
Zone zone1 = zones.get(i);
|
||||||
|
for (int j = 0; j < zones.size(); j++) {
|
||||||
|
if (i == j) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
Zone zone2 = zones.get(j);
|
||||||
|
|
||||||
|
if (zoneComparator.isBefore(zone1, zone2, tolerance)) {
|
||||||
|
graph.get(i).add(j);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return graph;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private boolean getBeforeInRendering(Zone z1, Zone z2) {
|
||||||
|
|
||||||
|
double avgTextSequenceZ1 = z1.getLines()
|
||||||
|
.stream()
|
||||||
|
.flatMap(line -> line.getCharacters()
|
||||||
|
.stream())
|
||||||
|
.map(character -> character.getTextPosition().getTextSequence())
|
||||||
|
.collect(Collectors.averagingDouble(Integer::intValue));
|
||||||
|
|
||||||
|
double avgTextSequenceZ2 = z2.getLines()
|
||||||
|
.stream()
|
||||||
|
.flatMap(line -> line.getCharacters()
|
||||||
|
.stream())
|
||||||
|
.map(character -> character.getTextPosition().getTextSequence())
|
||||||
|
.collect(Collectors.averagingDouble(Integer::intValue));
|
||||||
|
|
||||||
|
return avgTextSequenceZ1 < avgTextSequenceZ2;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private boolean getBeforeInReading(Zone z1, Zone z2, double tolerance) {
|
||||||
|
|
||||||
|
IntervalRelations xRelation = getIntervalRelationX(z1, z2, tolerance).get(0);
|
||||||
|
IntervalRelations yRelation = getIntervalRelationY(z1, z2, tolerance).get(0);
|
||||||
|
|
||||||
|
return xRelation == IntervalRelations.PRECEDES
|
||||||
|
|| yRelation == IntervalRelations.PRECEDES
|
||||||
|
|| xRelation == IntervalRelations.MEETS
|
||||||
|
|| yRelation == IntervalRelations.MEETS
|
||||||
|
|| xRelation == IntervalRelations.OVERLAPS
|
||||||
|
|| yRelation == IntervalRelations.OVERLAPS;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private boolean getBeforeInReadingColumnWise(Zone z1, Zone z2, double tolerance) {
|
||||||
|
|
||||||
|
IntervalRelations xRelation = getIntervalRelationX(z1, z2, tolerance).get(0);
|
||||||
|
IntervalRelations yRelation = getIntervalRelationY(z1, z2, tolerance).get(0);
|
||||||
|
|
||||||
|
return getIntervalRelations(xRelation, yRelation);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static boolean getIntervalRelations(IntervalRelations relation1, IntervalRelations relation2) {
|
||||||
|
|
||||||
|
return relation1 == IntervalRelations.PRECEDES //
|
||||||
|
|| relation1 == IntervalRelations.MEETS //
|
||||||
|
|| relation1 == IntervalRelations.OVERLAPS && //
|
||||||
|
(relation2 == IntervalRelations.PRECEDES //
|
||||||
|
|| relation2 == IntervalRelations.MEETS //
|
||||||
|
|| relation2 == IntervalRelations.OVERLAPS) //
|
||||||
|
|| ((relation2 == IntervalRelations.PRECEDES || relation2 == IntervalRelations.MEETS || relation2 == IntervalRelations.OVERLAPS) && //
|
||||||
|
(relation1 == IntervalRelations.STARTS //
|
||||||
|
|| relation1 == IntervalRelations.FINISHES_INVERSE //
|
||||||
|
|| relation1 == IntervalRelations.EQUALS //
|
||||||
|
|| relation1 == IntervalRelations.DURING //
|
||||||
|
|| relation1 == IntervalRelations.DURING_INVERSE //
|
||||||
|
|| relation1 == IntervalRelations.FINISHES //
|
||||||
|
|| relation1 == IntervalRelations.STARTS_INVERSE //
|
||||||
|
|| relation1 == IntervalRelations.OVERLAPS_INVERSE));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
private boolean getBeforeInReadingRowWise(Zone z1, Zone z2, double tolerance) {
|
||||||
|
|
||||||
|
IntervalRelations xRelations = getIntervalRelationX(z1, z2, tolerance).get(0);
|
||||||
|
IntervalRelations yRelations = getIntervalRelationY(z1, z2, tolerance).get(0);
|
||||||
|
|
||||||
|
return getIntervalRelations(yRelations, xRelations);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static List<IntervalRelations> getIntervalRelationX(Zone z1, Zone z2, double t) {
|
||||||
|
|
||||||
|
return getIntervalRelation(new ImmutablePair<>(z1.getX(), z1.getX() + z1.getWidth()), new ImmutablePair<>(z2.getX(), z2.getX() + z2.getWidth()), t);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static List<IntervalRelations> getIntervalRelationY(Zone z1, Zone z2, double t) {
|
||||||
|
|
||||||
|
return getIntervalRelation(new ImmutablePair<>(z1.getY(), z1.getY() + z1.getHeight()), new ImmutablePair<>(z2.getY(), z2.getY() + z2.getHeight()), t);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static List<IntervalRelations> getIntervalRelation(Pair<Double, Double> a, Pair<Double, Double> b, double t) {
|
||||||
|
|
||||||
|
var intervalRelations = getIntervalRelation(a, b, t, false);
|
||||||
|
intervalRelations.addAll(getIntervalRelation(b, a, t, true));
|
||||||
|
if ((b.getLeft() - t <= a.getLeft() && a.getLeft() <= b.getLeft() + t) && (b.getRight() - t <= a.getRight() && a.getRight() <= b.getRight() + t)) {
|
||||||
|
intervalRelations.add(IntervalRelations.EQUALS);
|
||||||
|
}
|
||||||
|
return intervalRelations;
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static List<IntervalRelations> getIntervalRelation(Pair<Double, Double> a, Pair<Double, Double> b, double t, boolean inverse) {
|
||||||
|
|
||||||
|
List<IntervalRelations> intervalRelations = new ArrayList<>();
|
||||||
|
if (a.getRight() < b.getLeft() - t) {
|
||||||
|
intervalRelations.add(inverse ? IntervalRelations.PRECEDES_INVERSE : IntervalRelations.PRECEDES);
|
||||||
|
} if (b.getLeft() - t <= a.getRight() && a.getRight() <= b.getLeft() + t) {
|
||||||
|
intervalRelations.add(inverse ? IntervalRelations.MEETS_INVERSE : IntervalRelations.MEETS);
|
||||||
|
} if (a.getLeft() < b.getLeft() - t && (b.getLeft() + t < a.getRight() && a.getRight() < b.getRight() - t)) {
|
||||||
|
intervalRelations.add(inverse ? IntervalRelations.OVERLAPS_INVERSE : IntervalRelations.OVERLAPS);
|
||||||
|
} if ((b.getLeft() - t <= a.getLeft() && a.getLeft() <= b.getLeft() + t) && a.getRight() < b.getRight() - t) {
|
||||||
|
intervalRelations.add(inverse ? IntervalRelations.STARTS_INVERSE : IntervalRelations.STARTS);
|
||||||
|
} if (a.getLeft() > b.getLeft() + t && a.getRight() < b.getRight() + t) {
|
||||||
|
intervalRelations.add(inverse ? IntervalRelations.DURING_INVERSE : IntervalRelations.DURING);
|
||||||
|
} if (a.getLeft() > b.getLeft() + t && (b.getRight() - t <= a.getRight() && a.getRight() <= b.getRight() + t)) {
|
||||||
|
intervalRelations.add(inverse ? IntervalRelations.FINISHES_INVERSE : IntervalRelations.FINISHES);
|
||||||
|
}
|
||||||
|
return intervalRelations;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,8 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.readingorder;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Zone;
|
||||||
|
|
||||||
|
@FunctionalInterface
|
||||||
|
public interface ZoneComparator {
|
||||||
|
boolean isBefore(Zone zone1, Zone zone2, double tolerance);
|
||||||
|
}
|
||||||
@ -0,0 +1,52 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Comparator;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.AngleFilter;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Character;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.DisjointSets;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Line;
|
||||||
|
|
||||||
|
@Service
|
||||||
|
public class LineBuilderService {
|
||||||
|
|
||||||
|
private static final double CHARACTER_SPACING_DISTANCE_MULTIPLIER = 3.5;
|
||||||
|
private static final double MAX_VERTICAL_CHARACTER_DISTANCE = 0.67;
|
||||||
|
private static final double ANGLE_TOLERANCE = Math.PI / 6;
|
||||||
|
|
||||||
|
|
||||||
|
public List<Line> buildLines(List<Character> characters, double characterSpacing, double lineSpacing) {
|
||||||
|
|
||||||
|
double maxHorizontalDistance = characterSpacing * CHARACTER_SPACING_DISTANCE_MULTIPLIER;
|
||||||
|
double maxVerticalDistance = lineSpacing * MAX_VERTICAL_CHARACTER_DISTANCE;
|
||||||
|
|
||||||
|
DisjointSets<Character> sets = new DisjointSets<>(characters);
|
||||||
|
AngleFilter filter = new AngleFilter(-ANGLE_TOLERANCE, ANGLE_TOLERANCE);
|
||||||
|
|
||||||
|
characters.forEach(character -> {
|
||||||
|
character.getNeighbors().forEach(neighbor -> {
|
||||||
|
double x = neighbor.getHorizontalDistance() / maxHorizontalDistance;
|
||||||
|
double y = neighbor.getVerticalDistance() / maxVerticalDistance;
|
||||||
|
if (character.getTextPosition().getDir() == neighbor.getCharacter().getTextPosition().getDir() && filter.matches(neighbor) && Math.pow(x, 2) + Math.pow(y,
|
||||||
|
2) <= 1) {
|
||||||
|
sets.union(character, neighbor.getCharacter());
|
||||||
|
}
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
List<Line> lines = new ArrayList<>();
|
||||||
|
sets.forEach(group -> {
|
||||||
|
List<Character> lineCharacters = new ArrayList<>(group);
|
||||||
|
// QuickSort.sort(lineCharacters, new CharacterComparator());
|
||||||
|
lineCharacters.sort(Comparator.comparingDouble(Character::getX));
|
||||||
|
lines.add(new Line(lineCharacters, characterSpacing));
|
||||||
|
});
|
||||||
|
|
||||||
|
return lines;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,78 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Comparator;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Character;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Neighbor;
|
||||||
|
|
||||||
|
@Service
|
||||||
|
public class NearestNeighbourService {
|
||||||
|
|
||||||
|
private static final int NUMBER_OF_NEIGHBOURS = 8;
|
||||||
|
private static final double STEP = 16.0;
|
||||||
|
|
||||||
|
|
||||||
|
public void findNearestNeighbors(List<Character> characters) {
|
||||||
|
|
||||||
|
if (characters.isEmpty()) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
characters.sort(Comparator.comparingDouble(Character::getX));
|
||||||
|
|
||||||
|
int maxNeighborCount = NUMBER_OF_NEIGHBOURS;
|
||||||
|
if (characters.size() <= NUMBER_OF_NEIGHBOURS) {
|
||||||
|
maxNeighborCount = characters.size() - 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int i = 0; i < characters.size(); i++) {
|
||||||
|
|
||||||
|
List<Neighbor> candidates = new ArrayList<>();
|
||||||
|
|
||||||
|
int start = i;
|
||||||
|
int end = i + 1;
|
||||||
|
|
||||||
|
double distance = Double.POSITIVE_INFINITY;
|
||||||
|
|
||||||
|
for (double searchDistance = 0; searchDistance < distance; ) {
|
||||||
|
|
||||||
|
searchDistance += STEP;
|
||||||
|
boolean newCandidatesFound = false;
|
||||||
|
|
||||||
|
while (start > 0 && characters.get(i).getX() - characters.get(start - 1).getX() < searchDistance) {
|
||||||
|
start--;
|
||||||
|
candidates.add(new Neighbor(characters.get(start), characters.get(i)));
|
||||||
|
clearLeastDistant(candidates, maxNeighborCount);
|
||||||
|
newCandidatesFound = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
while (end < characters.size() && characters.get(end).getX() - characters.get(i).getX() < searchDistance) {
|
||||||
|
candidates.add(new Neighbor(characters.get(end), characters.get(i)));
|
||||||
|
clearLeastDistant(candidates, maxNeighborCount);
|
||||||
|
end++;
|
||||||
|
newCandidatesFound = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (newCandidatesFound && candidates.size() >= maxNeighborCount) {
|
||||||
|
distance = candidates.get(maxNeighborCount - 1).getDistance();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
clearLeastDistant(candidates, maxNeighborCount);
|
||||||
|
characters.get(i).setNeighbors(new ArrayList<>(candidates));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private void clearLeastDistant(List<Neighbor> candidates, int maxNeighborCount) {
|
||||||
|
|
||||||
|
if (candidates.size() > maxNeighborCount) {
|
||||||
|
candidates.sort(Comparator.comparingDouble(Neighbor::getDistance));
|
||||||
|
candidates.remove(candidates.remove(candidates.size() - 1));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,167 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Comparator;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.ListIterator;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.BoundingBox;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Zone;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.readingorder.SpatialReasoningRules;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.readingorder.UnsupervisedReadingOrderDetector;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.utils.DoubleUtils;
|
||||||
|
|
||||||
|
@Service
|
||||||
|
public class ReadingOrderService {
|
||||||
|
|
||||||
|
private static final double THRESHOLD = 1;
|
||||||
|
|
||||||
|
|
||||||
|
public List<Zone> resolveNew(List<Zone> zones, boolean columnWise) {
|
||||||
|
|
||||||
|
if (zones.isEmpty() || zones.size() == 1) {
|
||||||
|
return zones;
|
||||||
|
}
|
||||||
|
|
||||||
|
SpatialReasoningRules spatialReasoningRules = columnWise ? SpatialReasoningRules.COLUMN_WISE : SpatialReasoningRules.ROW_WISE;
|
||||||
|
|
||||||
|
var unsupervisedReadingOrderDetector = new UnsupervisedReadingOrderDetector(1, spatialReasoningRules, false);
|
||||||
|
|
||||||
|
return unsupervisedReadingOrderDetector.get(zones);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public List<Zone> resolve(List<Zone> zones, boolean columnWise) {
|
||||||
|
|
||||||
|
if (zones.isEmpty() || zones.size() == 1) {
|
||||||
|
return zones;
|
||||||
|
}
|
||||||
|
|
||||||
|
Map<Long, Integer> histogram = new HashMap<>();
|
||||||
|
|
||||||
|
for (Zone zone : zones) {
|
||||||
|
long minY = Math.round(zone.getBBox().getMinY());
|
||||||
|
long maxY = Math.round(zone.getBBox().getMaxY());
|
||||||
|
for (long i = minY; i <= maxY; i++) {
|
||||||
|
histogram.put(i, histogram.getOrDefault(i, 0) + 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
columnWise = histogram.values()
|
||||||
|
.stream()
|
||||||
|
.mapToInt(Integer::intValue).average()
|
||||||
|
.orElse(1) > 1.5;
|
||||||
|
|
||||||
|
if (!columnWise) {
|
||||||
|
zones.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
|
||||||
|
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
|
||||||
|
return zones;
|
||||||
|
}
|
||||||
|
|
||||||
|
return resolveMultiColumnReadingOder(zones);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private List<Zone> resolveMultiColumnReadingOder(List<Zone> zones) {
|
||||||
|
|
||||||
|
// Simple reading order resolver for multi column page layout as described here : https://pub.towardsai.net/advanced-rag-02-unveiling-pdf-parsing-b84ae866344e
|
||||||
|
// TODO implement a more fancy reading order resolver see https://github.com/BobLd/DocumentLayoutAnalysis/blob/master/README.md#reading-order
|
||||||
|
|
||||||
|
double minX = Double.POSITIVE_INFINITY;
|
||||||
|
double maxX = Double.NEGATIVE_INFINITY;
|
||||||
|
|
||||||
|
for (Zone zone : zones) {
|
||||||
|
if (zone.getX() < minX) {
|
||||||
|
minX = zone.getX();
|
||||||
|
}
|
||||||
|
if (zone.getX() + zone.getWidth() > maxX) {
|
||||||
|
maxX = zone.getX() + zone.getWidth();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
double midLineXCoordinate = (minX + maxX) / 2;
|
||||||
|
|
||||||
|
List<Zone> leftOf = new ArrayList<>();
|
||||||
|
List<Zone> rightOf = new ArrayList<>();
|
||||||
|
List<Zone> middle = new ArrayList<>();
|
||||||
|
for (Zone zone : zones) {
|
||||||
|
if (zone.getX() < midLineXCoordinate && zone.getX() + zone.getWidth() < midLineXCoordinate) {
|
||||||
|
leftOf.add(zone);
|
||||||
|
} else if (zone.getX() > midLineXCoordinate && zone.getX() + zone.getWidth() > midLineXCoordinate) {
|
||||||
|
rightOf.add(zone);
|
||||||
|
} else {
|
||||||
|
middle.add(zone);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
List<Zone> leftNotIntersecting = new ArrayList<>();
|
||||||
|
for (Zone leftZone : leftOf) {
|
||||||
|
boolean intersects = false;
|
||||||
|
for (Zone rightZone : rightOf) {
|
||||||
|
if (leftZone.intersectsY(rightZone)) {
|
||||||
|
intersects = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!intersects) {
|
||||||
|
leftNotIntersecting.add(leftZone);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
List<Zone> rightNotIntersecting = new ArrayList<>();
|
||||||
|
for (Zone rightZone : rightOf) {
|
||||||
|
boolean intersects = false;
|
||||||
|
for (Zone leftZone : leftOf) {
|
||||||
|
if (rightZone.intersectsY(leftZone)) {
|
||||||
|
intersects = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!intersects) {
|
||||||
|
rightNotIntersecting.add(rightZone);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
leftOf.removeAll(leftNotIntersecting);
|
||||||
|
rightOf.removeAll(rightNotIntersecting);
|
||||||
|
|
||||||
|
middle.addAll(leftNotIntersecting);
|
||||||
|
middle.addAll(rightNotIntersecting);
|
||||||
|
|
||||||
|
leftOf.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
|
||||||
|
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
|
||||||
|
|
||||||
|
rightOf.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
|
||||||
|
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
|
||||||
|
|
||||||
|
middle.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
|
||||||
|
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
|
||||||
|
|
||||||
|
List<Zone> sortedZones = new ArrayList<>();
|
||||||
|
sortedZones.addAll(leftOf);
|
||||||
|
sortedZones.addAll(rightOf);
|
||||||
|
|
||||||
|
ListIterator<Zone> itty = middle.listIterator();
|
||||||
|
|
||||||
|
while (itty.hasNext()) {
|
||||||
|
Zone current = itty.next();
|
||||||
|
for (int i = 0; i < sortedZones.size(); i++) {
|
||||||
|
if (current.getY() < sortedZones.get(i).getY()) {
|
||||||
|
sortedZones.add(i, current);
|
||||||
|
itty.remove();
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
sortedZones.addAll(middle);
|
||||||
|
|
||||||
|
return sortedZones;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,56 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.AngleFilter;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Character;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Histogram;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Neighbor;
|
||||||
|
|
||||||
|
@Service
|
||||||
|
public class SpacingService {
|
||||||
|
|
||||||
|
private static final double SPACING_HISTOGRAM_RESOLUTION = 0.5;
|
||||||
|
private static final double SPACING_HISTOGRAM_SMOOTHING_LENGTH = 2.5;
|
||||||
|
private static final double SPACING_HIST_SMOOTHING_STANDARD_DEVIATION = 0.5;
|
||||||
|
private static final double ANGLE_TOLERANCE = Math.PI / 6;
|
||||||
|
|
||||||
|
|
||||||
|
public double computeCharacterSpacing(List<Character> characters) {
|
||||||
|
|
||||||
|
return computeSpacing(characters, 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public double computeLineSpacing(List<Character> characters) {
|
||||||
|
|
||||||
|
return computeSpacing(characters, Math.PI / 2);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private double computeSpacing(List<Character> characters, double angle) {
|
||||||
|
|
||||||
|
double maxDistance = Double.NEGATIVE_INFINITY;
|
||||||
|
|
||||||
|
for (Character character : characters) {
|
||||||
|
for (Neighbor neighbor : character.getNeighbors()) {
|
||||||
|
maxDistance = Math.max(maxDistance, neighbor.getDistance());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Histogram histogram = new Histogram(0, maxDistance, SPACING_HISTOGRAM_RESOLUTION);
|
||||||
|
AngleFilter angleFilter = new AngleFilter(angle - ANGLE_TOLERANCE, angle + ANGLE_TOLERANCE);
|
||||||
|
for (Character character : characters) {
|
||||||
|
for (Neighbor neighbor : character.getNeighbors()) {
|
||||||
|
if (angleFilter.matches(neighbor)) {
|
||||||
|
histogram.add(neighbor.getDistance());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
histogram.gaussianSmooth(SPACING_HISTOGRAM_SMOOTHING_LENGTH, SPACING_HIST_SMOOTHING_STANDARD_DEVIATION);
|
||||||
|
return histogram.getPeakValue();
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,190 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Comparator;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.ListIterator;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Character;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.DisjointSets;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Line;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Zone;
|
||||||
|
|
||||||
|
@Service
|
||||||
|
public class ZoneBuilderService {
|
||||||
|
|
||||||
|
private static final double MIN_HORIZONTAL_DISTANCE_MULTIPLIER = -0.5;
|
||||||
|
private static final double MAX_VERTICAL_DISTANCE_MULTIPLIER = 1.2;
|
||||||
|
|
||||||
|
private static final double MIN_HORIZONTAL_MERGE_DISTANCE_MULTIPLIER = -3.0;
|
||||||
|
|
||||||
|
private static final double MAX_VERTICAL_MERGE_DISTANCE_MULTIPLIER = 0.5;
|
||||||
|
|
||||||
|
private static final double MIN_LINE_SIZE_SCALE = 0.9;
|
||||||
|
|
||||||
|
private static final double MAX_LINE_SIZE_SCALE = 2.5;
|
||||||
|
|
||||||
|
private static final double ANGLE_TOLERANCE = Math.PI / 6;
|
||||||
|
|
||||||
|
private static final int MAX_ZONES = 300;
|
||||||
|
|
||||||
|
private static final double MAX_VERTICAL_MERGE_DISTANCE = 0.5;
|
||||||
|
|
||||||
|
|
||||||
|
public List<Zone> buildZones(List<Line> lines, double characterSpacing, double lineSpacing) {
|
||||||
|
|
||||||
|
double minHorizontalDistance = characterSpacing * MIN_HORIZONTAL_DISTANCE_MULTIPLIER;
|
||||||
|
double maxVerticalDistance = lineSpacing * MAX_VERTICAL_DISTANCE_MULTIPLIER;
|
||||||
|
double minHorizontalMergeDistance = characterSpacing * MIN_HORIZONTAL_MERGE_DISTANCE_MULTIPLIER;
|
||||||
|
double maxVerticalMergeDistance = lineSpacing * MAX_VERTICAL_MERGE_DISTANCE_MULTIPLIER;
|
||||||
|
|
||||||
|
DisjointSets<Line> sets = new DisjointSets<>(lines);
|
||||||
|
|
||||||
|
double meanHeight = calculateMeanHeight(lines);
|
||||||
|
|
||||||
|
lines.forEach(outerLine -> //
|
||||||
|
lines.forEach(innerLine -> {
|
||||||
|
|
||||||
|
double scale = Math.min(outerLine.getHeight(), innerLine.getHeight()) / meanHeight;
|
||||||
|
scale = Math.max(MIN_LINE_SIZE_SCALE, Math.min(scale, MAX_LINE_SIZE_SCALE));
|
||||||
|
|
||||||
|
if (!sets.areTogether(outerLine, innerLine) && outerLine.angularDifference(innerLine) <= ANGLE_TOLERANCE) {
|
||||||
|
|
||||||
|
double horizontalDistance = outerLine.horizontalDistance(innerLine) / scale;
|
||||||
|
double verticalDistance = outerLine.verticalDistance(innerLine) / scale;
|
||||||
|
|
||||||
|
if (minHorizontalDistance <= horizontalDistance && verticalDistance <= maxVerticalDistance //
|
||||||
|
|| minHorizontalMergeDistance <= horizontalDistance && verticalDistance <= maxVerticalMergeDistance) {
|
||||||
|
sets.union(outerLine, innerLine);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}));
|
||||||
|
|
||||||
|
List<Zone> zones = new ArrayList<>();
|
||||||
|
sets.forEach(group -> {
|
||||||
|
zones.add(new Zone(new ArrayList<>(group)));
|
||||||
|
});
|
||||||
|
|
||||||
|
// List<Zone> mergedZones = mergeZones(zones);
|
||||||
|
|
||||||
|
List<Zone> finalZones = zones;
|
||||||
|
|
||||||
|
if (finalZones.size() > MAX_ZONES) {
|
||||||
|
List<Line> oneZoneLines = new ArrayList<>();
|
||||||
|
for (Zone zone : finalZones) {
|
||||||
|
oneZoneLines.addAll(zone.getLines());
|
||||||
|
}
|
||||||
|
return List.of(mergeLinesInZone(oneZoneLines, characterSpacing, lineSpacing));
|
||||||
|
}
|
||||||
|
|
||||||
|
return finalZones;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private List<Zone> mergeZones(List<Zone> zones) {
|
||||||
|
|
||||||
|
ListIterator<Zone> itty = zones.listIterator();
|
||||||
|
|
||||||
|
while (itty.hasNext()) {
|
||||||
|
|
||||||
|
Zone current = itty.next();
|
||||||
|
|
||||||
|
for (Zone inner : zones) {
|
||||||
|
if (inner == current) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (current.getBBox().intersects(inner.getBBox())) {
|
||||||
|
inner.getLines().addAll(current.getLines());
|
||||||
|
inner.buildBBox();
|
||||||
|
itty.remove();
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return zones;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private double calculateMeanHeight(List<Line> lines) {
|
||||||
|
|
||||||
|
double meanHeight = 0.0;
|
||||||
|
double weights = 0.0;
|
||||||
|
for (Line line : lines) {
|
||||||
|
double weight = line.getLength();
|
||||||
|
meanHeight += line.getHeight() * weight;
|
||||||
|
weights += weight;
|
||||||
|
}
|
||||||
|
meanHeight /= weights;
|
||||||
|
return meanHeight;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private List<Zone> mergeLinesInZones(List<Zone> zones, double characterSpacing, double lineSpacing) {
|
||||||
|
|
||||||
|
List<Zone> merged = new ArrayList<>();
|
||||||
|
for (Zone zone : zones) {
|
||||||
|
merged.add(mergeLinesInZone(zone.getLines(), characterSpacing, lineSpacing));
|
||||||
|
}
|
||||||
|
return merged;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private Zone mergeLinesInZone(List<Line> lines, double characterSpacing, double lineSpacing) {
|
||||||
|
|
||||||
|
double maxHorizontalDistance = 0;
|
||||||
|
double minVerticalDistance = 0;
|
||||||
|
double maxVerticalDistance = lineSpacing * MAX_VERTICAL_MERGE_DISTANCE;
|
||||||
|
|
||||||
|
DisjointSets<Line> sets = new DisjointSets<>(lines);
|
||||||
|
|
||||||
|
lines.forEach(outer -> {
|
||||||
|
|
||||||
|
lines.forEach(inner -> {
|
||||||
|
if (inner != outer) {
|
||||||
|
|
||||||
|
double horizontalDistance = outer.horizontalDistance(inner);
|
||||||
|
double verticalDistance = outer.verticalDistance(inner);
|
||||||
|
|
||||||
|
if (horizontalDistance <= maxHorizontalDistance && minVerticalDistance <= verticalDistance && verticalDistance <= maxVerticalDistance) {
|
||||||
|
sets.union(outer, inner);
|
||||||
|
} else if (minVerticalDistance <= verticalDistance && verticalDistance <= maxVerticalDistance && Math.abs(horizontalDistance - Math.min(outer.getLength(),
|
||||||
|
inner.getLength())) < 0.1) {
|
||||||
|
boolean characterOverlap = false;
|
||||||
|
int overlappingCount = 0;
|
||||||
|
for (Character outerCharacter : outer.getCharacters()) {
|
||||||
|
for (Character innerCharacter : inner.getCharacters()) {
|
||||||
|
double characterOverlapDistance = outerCharacter.overlappingDistance(innerCharacter);
|
||||||
|
if (characterOverlapDistance > 2) {
|
||||||
|
characterOverlap = true;
|
||||||
|
}
|
||||||
|
if (characterOverlapDistance > 0) {
|
||||||
|
overlappingCount++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!characterOverlap && overlappingCount <= 2) {
|
||||||
|
sets.union(outer, inner);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
List<Line> outputZone = new ArrayList<>();
|
||||||
|
for (Set<Line> group : sets) {
|
||||||
|
List<Character> components = new ArrayList<>();
|
||||||
|
for (Line line : group) {
|
||||||
|
components.addAll(line.getCharacters());
|
||||||
|
}
|
||||||
|
components.sort(Comparator.comparingDouble(Character::getX));
|
||||||
|
|
||||||
|
outputZone.add(new Line(components, characterSpacing));
|
||||||
|
}
|
||||||
|
return new Zone(outputZone);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,40 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.utils;
|
||||||
|
|
||||||
|
import java.util.Comparator;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Character;
|
||||||
|
|
||||||
|
public class CharacterComparator implements Comparator<Character> {
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int compare(Character pos1, Character pos2) {
|
||||||
|
// only compare text that is in the same direction
|
||||||
|
int cmp1 = Float.compare(pos1.getTextPosition().getDir(), pos2.getTextPosition().getDir());
|
||||||
|
if (cmp1 != 0) {
|
||||||
|
return cmp1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// get the text direction adjusted coordinates
|
||||||
|
float x1 = pos1.getTextPosition().getXDirAdj();
|
||||||
|
float x2 = pos2.getTextPosition().getXDirAdj();
|
||||||
|
|
||||||
|
float pos1YBottom = pos1.getTextPosition().getYDirAdj();
|
||||||
|
float pos2YBottom = pos2.getTextPosition().getYDirAdj();
|
||||||
|
|
||||||
|
// note that the coordinates have been adjusted so 0,0 is in upper left
|
||||||
|
float pos1YTop = pos1YBottom - pos1.getTextPosition().getHeightDir();
|
||||||
|
float pos2YTop = pos2YBottom - pos2.getTextPosition().getHeightDir();
|
||||||
|
|
||||||
|
float yDifference = Math.abs(pos1YBottom - pos2YBottom);
|
||||||
|
|
||||||
|
// we will do a simple tolerance comparison
|
||||||
|
if (yDifference < .1 || pos2YBottom >= pos1YTop && pos2YBottom <= pos1YBottom || pos1YBottom >= pos2YTop && pos1YBottom <= pos2YBottom) {
|
||||||
|
return Float.compare(x1, x2);
|
||||||
|
} else if (pos1YBottom < pos2YBottom) {
|
||||||
|
return -1;
|
||||||
|
} else {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,18 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.utils;
|
||||||
|
|
||||||
|
/**
 * Helpers for comparing floating point numbers.
 */
public class DoubleUtils {

    // 2^63: quotients at or beyond this magnitude cannot be represented as a long by Math.round.
    private static final double LONG_RANGE_LIMIT = 0x1p63;


    /**
     * Compares two doubles after rounding each of them to a multiple of {@code precision}.
     *
     * <p>If either value is NaN the result is that of {@link Double#compare(double, double)}. A
     * precision of 0 is treated as 1. The sign of the precision is ignored, so a negative precision
     * no longer inverts the ordering. Values whose quotient by the precision does not fit into a
     * {@code long} (including infinities) are compared directly instead of being saturated to
     * {@code Long.MAX_VALUE}/{@code Long.MIN_VALUE}, which previously made distinct huge values equal.
     *
     * @param d1        first value
     * @param d2        second value
     * @param precision grid size used for rounding
     * @return a negative number, zero or a positive number if {@code d1} is less than, equal to or
     * greater than {@code d2} at the given precision
     */
    public static int compareDouble(double d1, double d2, double precision) {

        if (Double.isNaN(d1) || Double.isNaN(d2)) {
            return Double.compare(d1, d2);
        }

        double step = precision == 0 ? 1 : Math.abs(precision);

        double q1 = d1 / step;
        double q2 = d2 / step;

        if (Math.abs(q1) >= LONG_RANGE_LIMIT || Math.abs(q2) >= LONG_RANGE_LIMIT) {
            // Doubles of this magnitude are already integral, so no rounding is needed.
            return Double.compare(q1, q2);
        }

        return Long.compare(Math.round(q1), Math.round(q2));
    }

}
|
||||||
@ -0,0 +1,39 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.utils;
|
||||||
|
|
||||||
|
import java.util.Comparator;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||||
|
|
||||||
|
public class ReadingOrderHelper {
|
||||||
|
|
||||||
|
public static List<TextPositionSequence> orderByReadingOrder(List<TextPositionSequence> words) {
|
||||||
|
if (words.size() <= 1) {
|
||||||
|
return words;
|
||||||
|
}
|
||||||
|
|
||||||
|
int textOrientation = words.get(0).getRotation();
|
||||||
|
|
||||||
|
switch (textOrientation) {
|
||||||
|
case 0:
|
||||||
|
return words.stream()
|
||||||
|
.sorted(Comparator.comparingDouble(w -> w.getRectangle().getTopLeft().getX()))
|
||||||
|
.collect(Collectors.toList());
|
||||||
|
case 90:
|
||||||
|
return words.stream()
|
||||||
|
.sorted((w1, w2) -> -Double.compare(w1.getRectangle().getTopLeft().getY(), w2.getRectangle().getTopLeft().getY()))
|
||||||
|
.collect(Collectors.toList());
|
||||||
|
case 180:
|
||||||
|
return words.stream()
|
||||||
|
.sorted((w1, w2) -> -Double.compare(w1.getRectangle().getTopLeft().getX(), w2.getRectangle().getTopLeft().getX()))
|
||||||
|
.collect(Collectors.toList());
|
||||||
|
case 270:
|
||||||
|
return words.stream()
|
||||||
|
.sorted(Comparator.comparingDouble(w -> w.getRectangle().getTopLeft().getY()))
|
||||||
|
.collect(Collectors.toList());
|
||||||
|
default:
|
||||||
|
throw new IllegalArgumentException("Not sure what to do with this text rotation...");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
@ -237,8 +237,13 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
|||||||
int startIndex = 0;
|
int startIndex = 0;
|
||||||
RedTextPosition previous = null;
|
RedTextPosition previous = null;
|
||||||
|
|
||||||
|
float direction = -1;
|
||||||
for (int i = 0; i <= textPositions.size() - 1; i++) {
|
for (int i = 0; i <= textPositions.size() - 1; i++) {
|
||||||
|
|
||||||
|
if (direction == -1) {
|
||||||
|
direction = textPositions.get(i).getDir();
|
||||||
|
}
|
||||||
|
|
||||||
if (!textPositionSequences.isEmpty()) {
|
if (!textPositionSequences.isEmpty()) {
|
||||||
previous = textPositionSequences.get(textPositionSequences.size() - 1)
|
previous = textPositionSequences.get(textPositionSequences.size() - 1)
|
||||||
.getTextPositions()
|
.getTextPositions()
|
||||||
@ -250,11 +255,18 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (textPositions.get(i).getDir() != direction && startIndex != i) {
|
||||||
|
List<TextPosition> sublist = textPositions.subList(startIndex, i);
|
||||||
|
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart, textPositionSequences.size()));
|
||||||
|
startIndex = i;
|
||||||
|
direction = textPositions.get(i).getDir();
|
||||||
|
}
|
||||||
|
|
||||||
// Strange but sometimes this is happening, for example: Metolachlor2.pdf
|
// Strange but sometimes this is happening, for example: Metolachlor2.pdf
|
||||||
if (checkIfCurrentPositionIsToTheRightOfPreviousPosition(i, textPositions)) {
|
if (checkIfCurrentPositionIsToTheRightOfPreviousPosition(i, textPositions)) {
|
||||||
List<TextPosition> sublist = textPositions.subList(startIndex, i);
|
List<TextPosition> sublist = textPositions.subList(startIndex, i);
|
||||||
if (checkIfSequenceContainsOnlyWhitespaces(sublist)) {
|
if (checkIfSequenceContainsOnlyWhitespaces(sublist)) {
|
||||||
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart));
|
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart, textPositionSequences.size()));
|
||||||
}
|
}
|
||||||
startIndex = i;
|
startIndex = i;
|
||||||
}
|
}
|
||||||
@ -262,7 +274,7 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
|||||||
if (textPositions.get(i).getRotation() == 0 && i > 0 && textPositions.get(i).getX() > textPositions.get(i - 1).getEndX() + 1) {
|
if (textPositions.get(i).getRotation() == 0 && i > 0 && textPositions.get(i).getX() > textPositions.get(i - 1).getEndX() + 1) {
|
||||||
List<TextPosition> sublist = textPositions.subList(startIndex, i);
|
List<TextPosition> sublist = textPositions.subList(startIndex, i);
|
||||||
if (checkIfSequenceContainsOnlyWhitespaces(sublist)) {
|
if (checkIfSequenceContainsOnlyWhitespaces(sublist)) {
|
||||||
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart));
|
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart, textPositionSequences.size()));
|
||||||
}
|
}
|
||||||
startIndex = i;
|
startIndex = i;
|
||||||
}
|
}
|
||||||
@ -276,10 +288,10 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
|||||||
// Remove false sequence ends (whitespaces)
|
// Remove false sequence ends (whitespaces)
|
||||||
if (checkIfGapSizeBetweenCharactersSmallerThanMaximum(previous, sublist, 0.01f)) {
|
if (checkIfGapSizeBetweenCharactersSmallerThanMaximum(previous, sublist, 0.01f)) {
|
||||||
for (TextPosition t : sublist) {
|
for (TextPosition t : sublist) {
|
||||||
textPositionSequences.get(textPositionSequences.size() - 1).add(t);
|
textPositionSequences.get(textPositionSequences.size() - 1).add(t, textPositionSequences.size());
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart));
|
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart, textPositionSequences.size()));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
startIndex = i + 1;
|
startIndex = i + 1;
|
||||||
@ -299,10 +311,10 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
|||||||
if (previous != null && sublist.get(0).getYDirAdj() == previous.getYDirAdj() && sublist.get(0)
|
if (previous != null && sublist.get(0).getYDirAdj() == previous.getYDirAdj() && sublist.get(0)
|
||||||
.getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()) < 0.01) {
|
.getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()) < 0.01) {
|
||||||
for (TextPosition t : sublist) {
|
for (TextPosition t : sublist) {
|
||||||
textPositionSequences.get(textPositionSequences.size() - 1).add(t);
|
textPositionSequences.get(textPositionSequences.size() - 1).add(t, textPositionSequences.size());
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, isParagraphStart));
|
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, isParagraphStart, textPositionSequences.size()));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
super.writeString(text);
|
super.writeString(text);
|
||||||
@ -329,6 +341,7 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
|||||||
.getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()) < maximumGapSize;
|
.getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()) < maximumGapSize;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String getText(PDDocument doc) throws IOException {
|
public String getText(PDDocument doc) throws IOException {
|
||||||
|
|
||||||
|
|||||||
@ -43,7 +43,7 @@ public class MarkedContentUtils {
|
|||||||
|
|
||||||
return markedContentByYPosition.values().stream()
|
return markedContentByYPosition.values().stream()
|
||||||
.map(textPositions -> new TextPositionSequence(textPositions.stream()
|
.map(textPositions -> new TextPositionSequence(textPositions.stream()
|
||||||
.toList(), 0, true)
|
.toList(), 0, true, 0)
|
||||||
.getRectangle())
|
.getRectangle())
|
||||||
.map(t -> new Rectangle2D.Float(t.getTopLeft().getX(), t.getTopLeft().getY() - Math.abs(t.getHeight()), t.getWidth(), Math.abs(t.getHeight()))).collect(Collectors.toList());
|
.map(t -> new Rectangle2D.Float(t.getTopLeft().getX(), t.getTopLeft().getY() - Math.abs(t.getHeight()), t.getWidth(), Math.abs(t.getHeight()))).collect(Collectors.toList());
|
||||||
}
|
}
|
||||||
|
|||||||
@ -26,14 +26,14 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
|||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public void testViewerDocument() {
|
public void testViewerDocument() {
|
||||||
|
|
||||||
String fileName = "files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
|
String fileName = "files/Plenarprotokoll 1 (keine Druchsache!) (1).pdf";
|
||||||
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
|
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
|
||||||
|
|
||||||
var documentFile = new ClassPathResource(fileName).getFile();
|
var documentFile = new ClassPathResource(fileName).getFile();
|
||||||
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null);
|
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null);
|
||||||
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
|
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
|
||||||
|
|
||||||
Document document = buildGraph(fileName, LayoutParsingType.REDACT_MANAGER);
|
Document document = buildGraph(fileName, LayoutParsingType.DOCSTRUM);
|
||||||
long start = System.currentTimeMillis();
|
long start = System.currentTimeMillis();
|
||||||
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true);
|
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true);
|
||||||
System.out.printf("Total time: %.2fs%n", ((float) (System.currentTimeMillis() - start)) / 1000);
|
System.out.printf("Total time: %.2fs%n", ((float) (System.currentTimeMillis() - start)) / 1000);
|
||||||
@ -54,10 +54,11 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
|||||||
var documentFile = new ClassPathResource(fileName).getFile();
|
var documentFile = new ClassPathResource(fileName).getFile();
|
||||||
|
|
||||||
var classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE,
|
var classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE,
|
||||||
documentFile,
|
documentFile,
|
||||||
new ImageServiceResponse(),
|
new ImageServiceResponse(),
|
||||||
tableResponse,
|
tableResponse,
|
||||||
new VisualLayoutParsingResponse(),Path.of(fileName).getFileName().toFile().toString());
|
new VisualLayoutParsingResponse(),
|
||||||
|
Path.of(fileName).getFileName().toFile().toString());
|
||||||
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null);
|
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null);
|
||||||
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
|
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
|
||||||
Document document = DocumentGraphFactory.buildDocumentGraph(classificationDocument);
|
Document document = DocumentGraphFactory.buildDocumentGraph(classificationDocument);
|
||||||
|
|||||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user