Compare commits

...

11 Commits

Author SHA1 Message Date
maverickstuder
bb5b631950 RED-8666 2024-03-04 16:31:54 +01:00
maverickstuder
2567d89fbb RED-8666 2024-03-04 09:23:30 +01:00
maverickstuder
aef1146e8f RED-8666 2024-02-29 18:30:22 +01:00
maverickstuder
7f56ed15c8 RED-8666 2024-02-29 13:01:36 +01:00
maverickstuder
91401361e9 RED-8666 2024-02-28 17:49:23 +01:00
maverickstuder
2ab60195e4 RED-8666 2024-02-28 15:28:13 +01:00
Dominique Eifländer
32c877e8f7 RED-7141: Fixed problem with different text directions 2024-02-26 13:08:52 +01:00
Dominique Eifländer
385d4b399e RED-7141: Improved basic block combination logic 2024-02-23 10:01:28 +01:00
Dominique Eifländer
d0e1af3a44 RED-7141: Improved basic block combination logic 2024-02-22 16:57:24 +01:00
Dominique Eifländer
d06933ed17 RED-7141: Added basic block combination logic 2024-02-22 15:50:41 +01:00
Dominique Eifländer
240ef82def RED-7141: Implemented docstrum layout parsing 2024-02-22 13:04:08 +01:00
37 changed files with 2215 additions and 67 deletions

View File

@ -3,5 +3,8 @@ package com.knecon.fforesight.service.layoutparser.internal.api.queue;
public enum LayoutParsingType { public enum LayoutParsingType {
REDACT_MANAGER, REDACT_MANAGER,
TAAS, TAAS,
DOCUMINE DOCUMINE,
DOCSTRUM,
DOCSTRUM_ROW_WISE
} }

View File

@ -1,5 +1,7 @@
package com.knecon.fforesight.service.layoutparser.processor; package com.knecon.fforesight.service.layoutparser.processor;
import static com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType.DOCSTRUM;
import static com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType.DOCSTRUM_ROW_WISE;
import static java.lang.String.format; import static java.lang.String.format;
import java.awt.geom.Rectangle2D; import java.awt.geom.Rectangle2D;
@ -26,6 +28,7 @@ import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsi
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationSection;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage; import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
@ -43,6 +46,7 @@ import com.knecon.fforesight.service.layoutparser.processor.services.RulingClean
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService; import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
import com.knecon.fforesight.service.layoutparser.processor.services.SimplifiedSectionTextService; import com.knecon.fforesight.service.layoutparser.processor.services.SimplifiedSectionTextService;
import com.knecon.fforesight.service.layoutparser.processor.services.TableExtractionService; import com.knecon.fforesight.service.layoutparser.processor.services.TableExtractionService;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocstrumBlockificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocuMineBlockificationService; import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocuMineBlockificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.RedactManagerBlockificationService; import com.knecon.fforesight.service.layoutparser.processor.services.blockification.RedactManagerBlockificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.TaasBlockificationService; import com.knecon.fforesight.service.layoutparser.processor.services.blockification.TaasBlockificationService;
@ -86,6 +90,7 @@ public class LayoutParsingPipeline {
TaasBlockificationService taasBlockificationService; TaasBlockificationService taasBlockificationService;
DocuMineBlockificationService docuMineBlockificationService; DocuMineBlockificationService docuMineBlockificationService;
RedactManagerBlockificationService redactManagerBlockificationService; RedactManagerBlockificationService redactManagerBlockificationService;
DocstrumBlockificationService docstrumBlockificationService;
LayoutGridService layoutGridService; LayoutGridService layoutGridService;
ObservationRegistry observationRegistry; ObservationRegistry observationRegistry;
VisualLayoutParsingAdapter visualLayoutParsingAdapter; VisualLayoutParsingAdapter visualLayoutParsingAdapter;
@ -97,8 +102,7 @@ public class LayoutParsingPipeline {
log.info("Starting layout parsing for {}", layoutParsingRequest.identifier()); log.info("Starting layout parsing for {}", layoutParsingRequest.identifier());
File originFile = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId()); File originFile = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId());
File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId()) File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId()).orElse(originFile);
.orElse(originFile);
VisualLayoutParsingResponse visualLayoutParsingResponse = new VisualLayoutParsingResponse(); VisualLayoutParsingResponse visualLayoutParsingResponse = new VisualLayoutParsingResponse();
if (layoutParsingRequest.visualLayoutParsingFileId().isPresent()) { if (layoutParsingRequest.visualLayoutParsingFileId().isPresent()) {
@ -106,24 +110,20 @@ public class LayoutParsingPipeline {
} }
ImageServiceResponse imageServiceResponse = new ImageServiceResponse(); ImageServiceResponse imageServiceResponse = new ImageServiceResponse();
if (layoutParsingRequest.imagesFileStorageId() if (layoutParsingRequest.imagesFileStorageId().isPresent()) {
.isPresent()) { imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId().get());
imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId()
.get());
} }
TableServiceResponse tableServiceResponse = new TableServiceResponse(); TableServiceResponse tableServiceResponse = new TableServiceResponse();
if (layoutParsingRequest.tablesFileStorageId() if (layoutParsingRequest.tablesFileStorageId().isPresent()) {
.isPresent()) { tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId().get());
tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId()
.get());
} }
ClassificationDocument classificationDocument = parseLayout(layoutParsingRequest.layoutParsingType(), ClassificationDocument classificationDocument = parseLayout(layoutParsingRequest.layoutParsingType(),
originFile, originFile,
imageServiceResponse, imageServiceResponse,
tableServiceResponse, tableServiceResponse,
visualLayoutParsingResponse, visualLayoutParsingResponse,
layoutParsingRequest.identifier().toString()); layoutParsingRequest.identifier().toString());
log.info("Building document graph for {}", layoutParsingRequest.identifier()); log.info("Building document graph for {}", layoutParsingRequest.identifier());
@ -156,25 +156,25 @@ public class LayoutParsingPipeline {
.numberOfPages(documentGraph.getNumberOfPages()) .numberOfPages(documentGraph.getNumberOfPages())
.duration(System.currentTimeMillis() - start) .duration(System.currentTimeMillis() - start)
.message(format(""" .message(format("""
Layout parsing has finished in %.02f s. Layout parsing has finished in %.02f s.
identifiers: %s identifiers: %s
%s %s
Files have been saved with Ids: Files have been saved with Ids:
Structure: %s Structure: %s
Text: %s Text: %s
Positions: %s Positions: %s
PageData: %s PageData: %s
Simplified Text: %s Simplified Text: %s
Viewer Doc: %s""", Viewer Doc: %s""",
((float) (System.currentTimeMillis() - start)) / 1000, ((float) (System.currentTimeMillis() - start)) / 1000,
layoutParsingRequest.identifier(), layoutParsingRequest.identifier(),
buildSemanticNodeCountMessage(documentGraph.getNumberOfPages(), documentGraph.buildSemanticNodeCounts()), buildSemanticNodeCountMessage(documentGraph.getNumberOfPages(), documentGraph.buildSemanticNodeCounts()),
layoutParsingRequest.structureFileStorageId(), layoutParsingRequest.structureFileStorageId(),
layoutParsingRequest.textBlockFileStorageId(), layoutParsingRequest.textBlockFileStorageId(),
layoutParsingRequest.positionBlockFileStorageId(), layoutParsingRequest.positionBlockFileStorageId(),
layoutParsingRequest.pageFileStorageId(), layoutParsingRequest.pageFileStorageId(),
layoutParsingRequest.simplifiedTextStorageId(), layoutParsingRequest.simplifiedTextStorageId(),
layoutParsingRequest.viewerDocumentStorageId())) layoutParsingRequest.viewerDocumentStorageId()))
.build(); .build();
} }
@ -195,14 +195,14 @@ public class LayoutParsingPipeline {
private String buildSemanticNodeCountMessage(int numberOfPages, Map<NodeType, Long> semanticNodeCounts) { private String buildSemanticNodeCountMessage(int numberOfPages, Map<NodeType, Long> semanticNodeCounts) {
return String.format("%d pages with %d sections, %d headlines, %d paragraphs, %d tables with %d cells, %d headers, and %d footers parsed", return String.format("%d pages with %d sections, %d headlines, %d paragraphs, %d tables with %d cells, %d headers, and %d footers parsed",
numberOfPages, numberOfPages,
semanticNodeCounts.get(NodeType.SECTION) == null ? 0 : semanticNodeCounts.get(NodeType.SECTION), semanticNodeCounts.get(NodeType.SECTION) == null ? 0 : semanticNodeCounts.get(NodeType.SECTION),
semanticNodeCounts.get(NodeType.HEADLINE) == null ? 0 : semanticNodeCounts.get(NodeType.HEADLINE), semanticNodeCounts.get(NodeType.HEADLINE) == null ? 0 : semanticNodeCounts.get(NodeType.HEADLINE),
semanticNodeCounts.get(NodeType.PARAGRAPH) == null ? 0 : semanticNodeCounts.get(NodeType.PARAGRAPH), semanticNodeCounts.get(NodeType.PARAGRAPH) == null ? 0 : semanticNodeCounts.get(NodeType.PARAGRAPH),
semanticNodeCounts.get(NodeType.TABLE) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE), semanticNodeCounts.get(NodeType.TABLE) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE),
semanticNodeCounts.get(NodeType.TABLE_CELL) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE_CELL), semanticNodeCounts.get(NodeType.TABLE_CELL) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE_CELL),
semanticNodeCounts.get(NodeType.HEADER) == null ? 0 : semanticNodeCounts.get(NodeType.HEADER), semanticNodeCounts.get(NodeType.HEADER) == null ? 0 : semanticNodeCounts.get(NodeType.HEADER),
semanticNodeCounts.get(NodeType.FOOTER) == null ? 0 : semanticNodeCounts.get(NodeType.FOOTER)); semanticNodeCounts.get(NodeType.FOOTER) == null ? 0 : semanticNodeCounts.get(NodeType.FOOTER));
} }
@ -220,7 +220,7 @@ public class LayoutParsingPipeline {
Map<Integer, List<TableCells>> pdfTableCells = cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse); Map<Integer, List<TableCells>> pdfTableCells = cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse);
Map<Integer, List<ClassifiedImage>> pdfImages = imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse); Map<Integer, List<ClassifiedImage>> pdfImages = imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse);
Map<Integer, List<ClassifiedImage>> signatures = new HashMap<>(); Map<Integer, List<ClassifiedImage>> signatures = new HashMap<>();
if(signatures.size() > 0) { if (signatures.size() > 0) {
visualLayoutParsingAdapter.buildExtractedSignaturesPerPage(visualLayoutParsingResponse); visualLayoutParsingAdapter.buildExtractedSignaturesPerPage(visualLayoutParsingResponse);
} }
@ -266,6 +266,8 @@ public class LayoutParsingPipeline {
case REDACT_MANAGER -> redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical()); case REDACT_MANAGER -> redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
case TAAS -> taasBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical()); case TAAS -> taasBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
case DOCUMINE -> docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical()); case DOCUMINE -> docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
case DOCSTRUM -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical(), true);
case DOCSTRUM_ROW_WISE -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical(), false);
}; };
classificationPage.setCleanRulings(cleanRulings); classificationPage.setCleanRulings(cleanRulings);
classificationPage.setRotation(rotation); classificationPage.setRotation(rotation);
@ -283,12 +285,16 @@ public class LayoutParsingPipeline {
imageServiceResponseAdapter.findOcr(classificationPage); imageServiceResponseAdapter.findOcr(classificationPage);
} }
if(signatures.containsKey(pageNumber)) { if (signatures.containsKey(pageNumber)) {
classificationPage.setImages(signatures.get(pageNumber)); classificationPage.setImages(signatures.get(pageNumber));
} }
tableExtractionService.extractTables(cleanRulings, classificationPage); tableExtractionService.extractTables(cleanRulings, classificationPage);
if (layoutParsingType == DOCSTRUM || layoutParsingType == DOCSTRUM_ROW_WISE) {
// docstrumBlockificationService.combineBlocks(classificationPage); //todo 8666
}
buildPageStatistics(classificationPage); buildPageStatistics(classificationPage);
increaseDocumentStatistics(classificationPage, classificationDocument); increaseDocumentStatistics(classificationPage, classificationDocument);
@ -304,11 +310,26 @@ public class LayoutParsingPipeline {
case TAAS -> taasClassificationService.classifyDocument(classificationDocument); case TAAS -> taasClassificationService.classifyDocument(classificationDocument);
case DOCUMINE -> docuMineClassificationService.classifyDocument(classificationDocument); case DOCUMINE -> docuMineClassificationService.classifyDocument(classificationDocument);
case REDACT_MANAGER -> redactManagerClassificationService.classifyDocument(classificationDocument); case REDACT_MANAGER -> redactManagerClassificationService.classifyDocument(classificationDocument);
case DOCSTRUM_ROW_WISE -> redactManagerClassificationService.classifyDocument(classificationDocument);
} }
log.info("Building Sections for {}", identifier); log.info("Building Sections for {}", identifier);
sectionsBuilderService.buildSections(classificationDocument);
sectionsBuilderService.addImagesToSections(classificationDocument); if (layoutParsingType == DOCSTRUM || layoutParsingType == DOCSTRUM_ROW_WISE) {
// Currently, for debugging, every paragraph is returned as its own section, because the section builder contains merging logic
List<ClassificationSection> sections = new ArrayList<>();
for (var page : classificationPages) {
page.getTextBlocks().forEach(block -> {
block.setPage(page.getPageNumber());
var section = sectionsBuilderService.buildTextBlock(List.of(block), "a");
sections.add(section);
});
}
classificationDocument.setSections(sections);
} else {
sectionsBuilderService.buildSections(classificationDocument);
sectionsBuilderService.addImagesToSections(classificationDocument);
}
return classificationDocument; return classificationDocument;
} }

View File

@ -45,6 +45,12 @@ public abstract class AbstractPageBlock {
} }
/**
 * Checks whether {@code other} lies within this block when each edge comparison is relaxed by {@code threshold}.
 *
 * @param other     the text block to test
 * @param threshold tolerance added to the other block's minimum edges and subtracted from its maximum edges
 * @return {@code true} if all four relaxed edge comparisons hold
 */
public boolean containsBlock(TextPageBlock other, float threshold) {

    boolean coversHorizontally = this.minX <= other.getMinX() + threshold && this.maxX >= other.getMaxX() - threshold;
    if (!coversHorizontally) {
        return false;
    }
    return this.minY <= other.getMinY() + threshold && this.maxY >= other.getMaxY() - threshold;
}
public boolean contains(AbstractPageBlock other) { public boolean contains(AbstractPageBlock other) {
return this.minX <= other.minX && this.maxX >= other.maxX && this.minY >= other.minY && this.maxY <= other.maxY; return this.minX <= other.minX && this.maxX >= other.maxX && this.minY >= other.minY && this.maxY <= other.maxY;
@ -96,6 +102,12 @@ public abstract class AbstractPageBlock {
} }
/**
 * Checks whether the bounds of this block overlap the bounds of {@code apb}.
 * Three of the comparisons are strict; only {@code this.maxY >= apb.getMinY()} also accepts equal values.
 *
 * @param apb the block to test against
 * @return {@code true} if the two blocks overlap on both axes
 */
public boolean intersects(AbstractPageBlock apb) {

    boolean overlapsVertically = this.minY < apb.getMaxY() && this.maxY >= apb.getMinY();
    if (!overlapsVertically) {
        return false;
    }
    return this.minX < apb.getMaxX() && this.maxX > apb.getMinX();
}
public abstract boolean isEmpty(); public abstract boolean isEmpty();
} }

View File

@ -46,8 +46,12 @@ public class RedTextPosition {
private String fontName; private String fontName;
@JsonIgnore
private int textSequence;
@SneakyThrows @SneakyThrows
public static RedTextPosition fromTextPosition(TextPosition textPosition) { public static RedTextPosition fromTextPosition(TextPosition textPosition, int textSequence) {
var pos = new RedTextPosition(); var pos = new RedTextPosition();
BeanUtils.copyProperties(textPosition, pos); BeanUtils.copyProperties(textPosition, pos);
@ -63,6 +67,7 @@ public class RedTextPosition {
position[3] = textPosition.getHeightDir(); position[3] = textPosition.getHeightDir();
pos.setPosition(position); pos.setPosition(position);
pos.setTextSequence(textSequence);
return pos; return pos;
} }

View File

@ -73,7 +73,7 @@ public class TextPageBlock extends AbstractPageBlock {
return sequences.get(0).getPageWidth(); return sequences.get(0).getPageWidth();
} }
public static TextPageBlock merge(List<TextPageBlock> textBlocksToMerge) { public static TextPageBlock merge(List<TextPageBlock> textBlocksToMerge) {
@ -82,6 +82,7 @@ public class TextPageBlock extends AbstractPageBlock {
return fromTextPositionSequences(sequences); return fromTextPositionSequences(sequences);
} }
public static TextPageBlock fromTextPositionSequences(List<TextPositionSequence> wordBlockList) { public static TextPageBlock fromTextPositionSequences(List<TextPositionSequence> wordBlockList) {
TextPageBlock textBlock = null; TextPageBlock textBlock = null;
@ -133,7 +134,6 @@ public class TextPageBlock extends AbstractPageBlock {
} }
/** /**
* Returns the minX value in pdf coordinate system. * Returns the minX value in pdf coordinate system.
* Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation. * Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation.
@ -362,7 +362,22 @@ public class TextPageBlock extends AbstractPageBlock {
} }
return TextNormalizationUtilities.removeHyphenLineBreaks(sb.toString()); return TextNormalizationUtilities.removeHyphenLineBreaks(sb.toString());
}
/**
 * Estimates the number of text lines in this block by walking {@code sequences} in their stored order.
 * The count starts at 1, so a block without sequences is still reported as one line.
 *
 * @return the estimated number of lines, always at least 1
 */
public int getNumberOfLines() {
int numberOfLines = 1;
TextPositionSequence previous = null;
for (TextPositionSequence word : sequences) {
if (previous != null) {
// A new line is counted when maxYDirAdj grows by more than the current word's text height.
// Only increases are considered; the difference is not taken as an absolute value.
if (word.getMaxYDirAdj() - previous.getMaxYDirAdj() > word.getTextHeight()) {
numberOfLines++;
}
}
previous = word;
}
return numberOfLines;
} }

View File

@ -43,9 +43,9 @@ public class TextPositionSequence implements CharSequence {
} }
public TextPositionSequence(List<TextPosition> textPositions, int page, boolean isParagraphStart) { public TextPositionSequence(List<TextPosition> textPositions, int page, boolean isParagraphStart, int textSequence) {
this.textPositions = textPositions.stream().map(RedTextPosition::fromTextPosition).collect(Collectors.toList()); this.textPositions = textPositions.stream().map(textPosition -> RedTextPosition.fromTextPosition(textPosition, textSequence)).collect(Collectors.toList());
this.page = page; this.page = page;
this.dir = TextDirection.fromDegrees(textPositions.get(0).getDir()); this.dir = TextDirection.fromDegrees(textPositions.get(0).getDir());
this.rotation = textPositions.get(0).getRotation(); this.rotation = textPositions.get(0).getRotation();
@ -55,6 +55,17 @@ public class TextPositionSequence implements CharSequence {
} }
/**
 * Creates a sequence from already converted text positions.
 * Direction, rotation and page dimensions are taken from the first position, so the list must not be empty.
 *
 * @param textPositions the positions forming this sequence; stored as-is, not copied
 * @param page          the page number the sequence belongs to
 */
public TextPositionSequence(List<RedTextPosition> textPositions, int page) {

    RedTextPosition first = textPositions.get(0);

    this.textPositions = textPositions;
    this.page = page;
    this.dir = TextDirection.fromDegrees(first.getDir());
    this.rotation = first.getRotation();
    this.pageHeight = first.getPageHeight();
    this.pageWidth = first.getPageWidth();
}
@Override @Override
public int length() { public int length() {
@ -122,9 +133,9 @@ public class TextPositionSequence implements CharSequence {
} }
public void add(TextPosition textPosition) { public void add(TextPosition textPosition, int textSequence) {
this.textPositions.add(RedTextPosition.fromTextPosition(textPosition)); this.textPositions.add(RedTextPosition.fromTextPosition(textPosition, textSequence));
this.dir = TextDirection.fromDegrees(textPositions.get(0).getDir()); this.dir = TextDirection.fromDegrees(textPositions.get(0).getDir());
this.rotation = textPositions.get(0).getRotation(); this.rotation = textPositions.get(0).getRotation();

View File

@ -240,7 +240,7 @@ public class SectionsBuilderService {
} }
private ClassificationSection buildTextBlock(List<AbstractPageBlock> wordBlockList, String lastHeadline) { public ClassificationSection buildTextBlock(List<AbstractPageBlock> wordBlockList, String lastHeadline) {
ClassificationSection section = new ClassificationSection(); ClassificationSection section = new ClassificationSection();

View File

@ -0,0 +1,310 @@
package com.knecon.fforesight.service.layoutparser.processor.services.blockification;
import static java.util.stream.Collectors.toSet;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.ListIterator;
import java.util.stream.Collectors;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.DocstrumSegmentationService;
import com.knecon.fforesight.service.layoutparser.processor.utils.QuickSort;
import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil;
import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionSequenceComparator;
import lombok.RequiredArgsConstructor;
@SuppressWarnings("all")
@Service
@RequiredArgsConstructor
public class DocstrumBlockificationService {
private final DocstrumSegmentationService docstrumSegmentationService;
static final float THRESHOLD = 2f;
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines, boolean columnWise) {
List<AbstractPageBlock> abstractPageBlocks = new ArrayList<>();
var zones = docstrumSegmentationService.segmentPage(textPositions, columnWise);
zones.forEach(zone -> {
List<TextPositionSequence> textPositionSequences = new ArrayList<>();
zone.getLines().forEach(line -> {
line.getWords().forEach(word -> {
textPositionSequences.add(new TextPositionSequence(word.getTextPositions(), word.getPage()));
});
});
abstractPageBlocks.addAll(splitZonesAtRulings(textPositionSequences, horizontalRulingLines, verticalRulingLines));
// abstractPageBlocks.add(buildTextBlock(textPositionSequences, 0));
});
return new ClassificationPage(abstractPageBlocks);
}
public void combineBlocks(ClassificationPage page) {
mergeZones(page.getTextBlocks());
TextPageBlock previous = new TextPageBlock();
ListIterator<AbstractPageBlock> itty = page.getTextBlocks().listIterator();
while (itty.hasNext()) {
AbstractPageBlock block = itty.next();
if (block instanceof TablePageBlock) {
continue;
}
TextPageBlock current = (TextPageBlock) block;
if (previous != null && !previous.getSequences().isEmpty()) {
if (current.getDir() == previous.getDir() && (Math.abs(previous.getMaxY() - current.getMaxY()) < THRESHOLD || Math.abs(previous.getMinY() - current.getMinY()) < THRESHOLD) && (previous.getNumberOfLines() >= 2 && current.getNumberOfLines() >= 2 /* && current.getNumberOfLines() <= 10 */ && previous.getNumberOfLines() <= current.getNumberOfLines())) {
previous.getSequences().addAll(current.getSequences());
previous = buildTextBlock(previous.getSequences(), 0);
itty.remove();
// Might be a left/right mapping add one sorted as well
var sortedDublicate = buildTextBlock(previous.getSequences().stream().sorted(new TextPositionSequenceComparator()).collect(Collectors.toList()), 0);
itty.add(sortedDublicate);
continue;
}
if (current.getDir() == previous.getDir() && (Math.abs(previous.getMaxY() - current.getMaxY()) < THRESHOLD || Math.abs(previous.getMinY() - current.getMinY()) < THRESHOLD) && (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1 /* && current.getNumberOfLines() <= 10 */ || previous.getNumberOfLines() == 2 && current.getNumberOfLines() == 1)) {
previous.getSequences().addAll(current.getSequences());
previous = buildTextBlock(previous.getSequences(), 0);
itty.remove();
continue;
}
if (current.getDir() == previous.getDir() && previous.containsBlock(current, THRESHOLD)) {
previous.getSequences().addAll(current.getSequences());
QuickSort.sort(previous.getSequences(), new TextPositionSequenceComparator());
previous = buildTextBlock(previous.getSequences(), 0);
itty.remove();
continue;
}
}
previous = current;
}
}
private void mergeZones(List<AbstractPageBlock> zones) {
ListIterator<AbstractPageBlock> itty = zones.listIterator();
while (itty.hasNext()) {
AbstractPageBlock block = itty.next();
if (block instanceof TablePageBlock) {
continue;
}
TextPageBlock current = (TextPageBlock) block;
List<AbstractPageBlock> toBeRemoved = new ArrayList<>();
for (AbstractPageBlock innerZone : zones) {
if (innerZone == current) {
continue;
}
if (innerZone instanceof TablePageBlock) {
continue;
}
TextPageBlock inner = (TextPageBlock) innerZone;
if (current.getDir() == inner.getDir() && current.almostIntersects(inner, 0, 0)) {
current.getSequences().addAll(inner.getSequences());
current = buildTextBlock(inner.getSequences().stream().sorted(new TextPositionSequenceComparator()).collect(Collectors.toList()), 0);
}
}
zones.removeAll(toBeRemoved);
}
}
public List<AbstractPageBlock> splitZonesAtRulings(List<TextPositionSequence> textPositions, List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
int indexOnPage = 0;
List<TextPositionSequence> chunkWords = new ArrayList<>();
List<AbstractPageBlock> chunkBlockList = new ArrayList<>();
float minX = 1000, maxX = 0, minY = 1000, maxY = 0;
TextPositionSequence prev = null;
Float splitX1 = null;
for (TextPositionSequence word : textPositions) {
boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, horizontalRulingLines, verticalRulingLines);
boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir());
if (prev != null && (splitByDir || isSplitByRuling)) {
TextPageBlock cb1 = buildTextBlock(chunkWords, indexOnPage);
indexOnPage++;
chunkBlockList.add(cb1);
chunkWords = new ArrayList<>();
minX = 1000;
maxX = 0;
minY = 1000;
maxY = 0;
prev = null;
}
chunkWords.add(word);
prev = word;
if (word.getMinXDirAdj() < minX) {
minX = word.getMinXDirAdj();
}
if (word.getMaxXDirAdj() > maxX) {
maxX = word.getMaxXDirAdj();
}
if (word.getMinYDirAdj() < minY) {
minY = word.getMinYDirAdj();
}
if (word.getMaxYDirAdj() > maxY) {
maxY = word.getMaxYDirAdj();
}
}
TextPageBlock cb1 = buildTextBlock(chunkWords, indexOnPage);
if (cb1 != null) {
chunkBlockList.add(cb1);
}
return chunkBlockList;
}
private boolean equalsWithThreshold(float f1, float f2) {
return Math.abs(f1 - f2) < THRESHOLD;
}
private TextPageBlock buildTextBlock(List<TextPositionSequence> wordBlockList, int indexOnPage) {
TextPageBlock textBlock = null;
FloatFrequencyCounter lineHeightFrequencyCounter = new FloatFrequencyCounter();
FloatFrequencyCounter fontSizeFrequencyCounter = new FloatFrequencyCounter();
FloatFrequencyCounter spaceFrequencyCounter = new FloatFrequencyCounter();
StringFrequencyCounter fontFrequencyCounter = new StringFrequencyCounter();
StringFrequencyCounter styleFrequencyCounter = new StringFrequencyCounter();
for (TextPositionSequence wordBlock : wordBlockList) {
lineHeightFrequencyCounter.add(wordBlock.getTextHeight());
fontSizeFrequencyCounter.add(wordBlock.getFontSize());
spaceFrequencyCounter.add(wordBlock.getSpaceWidth());
fontFrequencyCounter.add(wordBlock.getFont());
styleFrequencyCounter.add(wordBlock.getFontStyle());
if (textBlock == null) {
textBlock = new TextPageBlock(wordBlock.getMinXDirAdj(),
wordBlock.getMaxXDirAdj(),
wordBlock.getMinYDirAdj(),
wordBlock.getMaxYDirAdj(),
wordBlockList,
wordBlock.getRotation());
} else {
TextPageBlock spatialEntity = textBlock.union(wordBlock);
textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight());
}
}
if (textBlock != null) {
textBlock.setMostPopularWordFont(fontFrequencyCounter.getMostPopular());
textBlock.setMostPopularWordStyle(styleFrequencyCounter.getMostPopular());
textBlock.setMostPopularWordFontSize(fontSizeFrequencyCounter.getMostPopular());
textBlock.setMostPopularWordHeight(lineHeightFrequencyCounter.getMostPopular());
textBlock.setMostPopularWordSpaceWidth(spaceFrequencyCounter.getMostPopular());
textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest());
}
if (textBlock != null && textBlock.getSequences() != null && textBlock.getSequences().stream().map(t -> round(t.getMinYDirAdj(), 3)).collect(toSet()).size() == 1) {
textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj));
}
return textBlock;
}
private boolean isSplitByRuling(float minX,
float minY,
float maxX,
float maxY,
TextPositionSequence word,
List<Ruling> horizontalRulingLines,
List<Ruling> verticalRulingLines) {
return isSplitByRuling(maxX,
minY,
word.getMinXDirAdj(),
word.getMinYDirAdj(),
verticalRulingLines,
word.getDir().getDegrees(),
word.getPageWidth(),
word.getPageHeight()) //
|| isSplitByRuling(minX,
minY,
word.getMinXDirAdj(),
word.getMaxYDirAdj(),
horizontalRulingLines,
word.getDir().getDegrees(),
word.getPageWidth(),
word.getPageHeight()) //
|| isSplitByRuling(maxX,
minY,
word.getMinXDirAdj(),
word.getMinYDirAdj(),
horizontalRulingLines,
word.getDir().getDegrees(),
word.getPageWidth(),
word.getPageHeight()) //
|| isSplitByRuling(minX,
minY,
word.getMinXDirAdj(),
word.getMaxYDirAdj(),
verticalRulingLines,
word.getDir().getDegrees(),
word.getPageWidth(),
word.getPageHeight());
}
/**
 * Tests whether the segment (previousX2, previousY1) -> (currentX1, currentY1) intersects
 * any of the given rulings after each ruling has been converted with
 * {@code RulingTextDirAdjustUtil.convertToDirAdj} for the given direction and page size.
 *
 * @return {@code true} if at least one converted ruling intersects the segment
 */
private boolean isSplitByRuling(float previousX2, float previousY1, float currentX1, float currentY1, List<Ruling> rulingLines, float dir, float pageWidth, float pageHeight) {

    return rulingLines.stream()
            .map(ruling -> RulingTextDirAdjustUtil.convertToDirAdj(ruling, dir, pageWidth, pageHeight))
            .anyMatch(adjusted -> adjusted.intersectsLine(previousX2, previousY1, currentX1, currentY1));
}
/**
 * Rounds {@code value} to {@code decimalPoints} decimal places.
 *
 * @param value         number to round
 * @param decimalPoints number of decimal places to keep
 * @return the rounded value as a double
 */
private double round(float value, int decimalPoints) {

    final double scale = Math.pow(10, decimalPoints);
    final long scaled = Math.round(value * scale);
    return scaled / scale;
}
}

View File

@ -0,0 +1,59 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Character;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Zone;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service.LineBuilderService;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service.NearestNeighbourService;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service.ReadingOrderService;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service.SpacingService;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service.ZoneBuilderService;
import lombok.RequiredArgsConstructor;
/**
 * Segments the text of a page into {@link Zone}s.
 * <p>
 * Text positions are processed separately per {@link TextDirection}; for every direction the
 * characters go through nearest-neighbour search, spacing estimation, line building and zone
 * building. The zones of all directions are then passed to the {@link ReadingOrderService}.
 */
@Service
@RequiredArgsConstructor
public class DocstrumSegmentationService {

    /** Upper bound applied to the line spacing returned by the {@link SpacingService}. */
    private static final int MAX_LINE_SPACING = 20;

    private final NearestNeighbourService nearestNeighbourService;
    private final SpacingService spacingService;
    private final LineBuilderService lineBuilderService;
    private final ZoneBuilderService zoneBuilderService;
    private final ReadingOrderService readingOrderService;


    /**
     * Builds the zones of a page and hands them to the reading order resolution.
     *
     * @param textPositions all text position sequences of the page
     * @param columnWise    forwarded to {@link ReadingOrderService#resolve}
     * @return the zones as returned by the reading order service
     */
    public List<Zone> segmentPage(List<TextPositionSequence> textPositions, boolean columnWise) {

        List<Zone> zones = new ArrayList<>();
        for (TextDirection direction : List.of(TextDirection.ZERO, TextDirection.QUARTER_CIRCLE, TextDirection.HALF_CIRCLE, TextDirection.THREE_QUARTER_CIRCLE)) {
            zones.addAll(computeZones(textPositions, direction));
        }
        return readingOrderService.resolve(zones, columnWise);
    }


    /**
     * Computes the zones formed by the text positions written in the given direction.
     *
     * @return the zones of that direction; empty when no text uses the direction
     */
    private List<Zone> computeZones(List<TextPositionSequence> textPositions, TextDirection direction) {

        // The nearest-neighbour search sorts this list in place, so it has to be a mutable list.
        List<Character> characters = textPositions.stream()
                .filter(t -> t.getDir() == direction)
                .map(TextPositionSequence::getTextPositions)
                .flatMap(List::stream)
                .map(Character::new)
                .collect(Collectors.toCollection(ArrayList::new));

        // Most pages use a single direction: skip the spacing statistics for empty samples.
        if (characters.isEmpty()) {
            return List.of();
        }

        nearestNeighbourService.findNearestNeighbors(characters);

        var characterSpacing = spacingService.computeCharacterSpacing(characters);
        var lineSpacing = Math.min(spacingService.computeLineSpacing(characters), MAX_LINE_SPACING);

        var lines = lineBuilderService.buildLines(characters, characterSpacing, lineSpacing);
        return zoneBuilderService.buildZones(lines, characterSpacing, lineSpacing);
    }

}

View File

@ -0,0 +1,32 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model;
/**
 * Accepts neighbours whose angle lies in the half-open range
 * {@code [lowerAngle, upperAngle)}; the range may wrap around.
 */
public class AngleFilter {

    protected double lowerAngle;
    protected double upperAngle;


    /**
     * Creates the filter. A lower bound below -PI/2 is shifted up by PI and an upper bound
     * of at least PI/2 is shifted down by PI before being stored.
     *
     * @param lowerAngle inclusive lower bound in radians
     * @param upperAngle exclusive upper bound in radians
     */
    public AngleFilter(double lowerAngle, double upperAngle) {

        this.lowerAngle = lowerAngle < -Math.PI / 2 ? lowerAngle + Math.PI : lowerAngle;
        this.upperAngle = upperAngle >= Math.PI / 2 ? upperAngle - Math.PI : upperAngle;
    }


    /**
     * @param neighbor neighbour whose angle is tested
     * @return {@code true} if the neighbour's angle falls inside the (possibly wrapped) range
     */
    public boolean matches(Neighbor neighbor) {

        final double angle = neighbor.getAngle();
        final boolean atLeastLower = lowerAngle <= angle;
        final boolean belowUpper = angle < upperAngle;

        // A regular range needs both bounds to hold, a wrapped range only one of them.
        return lowerAngle <= upperAngle ? (atLeastLower && belowUpper) : (atLeastLower || belowUpper);
    }

}

View File

@ -0,0 +1,56 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model;
import java.awt.geom.Rectangle2D;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import lombok.Data;
/**
 * Base class for layout elements that are described by a rectangular bounding box.
 */
@Data
public abstract class BoundingBox {

    private Rectangle2D bBox;


    /** @return x of the bounding box */
    public double getX() {

        return bBox.getX();
    }


    /** @return y of the bounding box */
    public double getY() {

        return bBox.getY();
    }


    /** @return width of the bounding box */
    public double getWidth() {

        return bBox.getWidth();
    }


    /** @return height of the bounding box */
    public double getHeight() {

        return bBox.getHeight();
    }


    /** @return height multiplied by width of the bounding box */
    public double getArea() {

        return bBox.getHeight() * bBox.getWidth();
    }


    /**
     * Checks whether {@code contained} lies inside this bounding box when every edge of
     * {@code contained} may stick out by at most {@code tolerance}.
     *
     * @param contained rectangle to test
     * @param tolerance allowed overshoot per edge
     * @return {@code true} if all four edges are within the tolerance
     */
    public boolean contains(Rectangle2D contained, double tolerance) {

        final boolean minXInside = bBox.getX() <= contained.getX() + tolerance;
        final boolean minYInside = bBox.getY() <= contained.getY() + tolerance;
        final boolean maxXInside = bBox.getX() + bBox.getWidth() >= contained.getX() + contained.getWidth() - tolerance;
        final boolean maxYInside = bBox.getY() + bBox.getHeight() >= contained.getY() + contained.getHeight() - tolerance;

        return minXInside && minYInside && maxXInside && maxYInside;
    }


    /**
     * @param other bounding box to compare with
     * @return {@code true} if the y ranges of both boxes overlap or touch
     */
    public boolean intersectsY(BoundingBox other) {

        final Rectangle2D own = this.getBBox();
        final Rectangle2D theirs = other.getBBox();
        return own.getMinY() <= theirs.getMaxY() && own.getMaxY() >= theirs.getMinY();
    }

}

View File

@ -0,0 +1,84 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
import lombok.Data;
/**
 * A single glyph of the page, reduced to the centre point of its direction-adjusted box,
 * together with the neighbours found for it.
 */
@Data
public class Character {

    private final double x;
    private final double y;
    private final RedTextPosition textPosition;

    private List<Neighbor> neighbors = new ArrayList<>();


    /**
     * @param chunk text position whose box centre becomes the coordinates of this character
     */
    public Character(RedTextPosition chunk) {

        this.textPosition = chunk;
        this.x = chunk.getXDirAdj() + chunk.getWidthDirAdj() / 2;
        this.y = chunk.getYDirAdj() + chunk.getHeightDir() / 2;
    }


    /** @return the height of the underlying text position */
    public double getHeight() {

        return textPosition.getHeightDir();
    }


    /** @return Euclidean distance between the centres of both characters */
    public double distance(Character character) {

        final double deltaX = getX() - character.getX();
        final double deltaY = getY() - character.getY();
        return Math.sqrt(deltaX * deltaX + deltaY * deltaY);
    }


    /** @return absolute difference of the x coordinates */
    public double horizontalDistance(Character character) {

        return Math.abs(getX() - character.getX());
    }


    /** @return absolute difference of the y coordinates */
    public double verticalDistance(Character character) {

        return Math.abs(getY() - character.getY());
    }


    /**
     * Projects both characters onto the axis given by a rotation of zero and measures how far
     * the two projected intervals overlap.
     *
     * @return the length of the overlap (positive) or of the gap (negative) between the intervals
     */
    public double overlappingDistance(Character other) {

        final double sin = Math.sin(-0);
        final double cos = Math.cos(-0);

        final double ownStart = cos * x - sin * y;
        final double ownEnd = cos * (x + textPosition.getWidthDirAdj()) - sin * (y + textPosition.getHeightDir());
        final double otherStart = cos * other.x - sin * other.y;
        final double otherEnd = cos * (other.x + other.textPosition.getWidthDirAdj()) - sin * (other.y + other.textPosition.getHeightDir());

        final boolean overlapping = ownEnd >= otherStart && otherEnd >= ownStart;

        // After sorting, the two middle values delimit either the overlap or the gap.
        final double[] bounds = {ownStart, ownEnd, otherStart, otherEnd};
        Arrays.sort(bounds);
        return Math.abs(bounds[2] - bounds[1]) * (overlapping ? 1 : -1);
    }


    public void setNeighbors(List<Neighbor> neighbors) {

        this.neighbors = neighbors;
    }


    /**
     * Angle of the connection between both characters, always measured from the character
     * with the smaller x towards the one with the larger x.
     *
     * @return the angle in radians
     */
    public double angle(Character character) {

        final boolean thisHasLargerX = getX() > character.getX();
        final Character from = thisHasLargerX ? character : this;
        final Character to = thisHasLargerX ? this : character;
        return Math.atan2(to.getY() - from.getY(), to.getX() - from.getX());
    }

}

View File

@ -0,0 +1,194 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model;
import java.util.AbstractSet;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Set;
/**
 * Union-find structure over a fixed collection of elements.
 * <p>
 * Every element starts in its own set; {@link #union} merges sets (smaller into larger) and
 * iterating yields one read-only {@link Set} view per remaining set. Sets are yielded in the
 * order in which their representative elements were supplied to the constructor, so the
 * result does not depend on the elements' hash codes.
 *
 * @param <E> element type; must be usable as a map key
 */
public class DisjointSets<E> implements Iterable<Set<E>> {

    // Insertion-ordered on purpose: iteration order must be reproducible between runs.
    private final Map<E, Entry<E>> map = new LinkedHashMap<>();


    /**
     * @param collection elements to manage; each one initially forms a singleton set
     */
    public DisjointSets(Collection<? extends E> collection) {

        for (E element : collection) {
            map.put(element, new Entry<>(element));
        }
    }


    /**
     * @return {@code true} if both elements currently belong to the same set
     * @throws NullPointerException if one of the elements was not part of the constructor collection
     */
    public boolean areTogether(E e1, E e2) {

        return map.get(e1).findRepresentative() == map.get(e2).findRepresentative();
    }


    /**
     * Merges the sets of both elements; does nothing if they already share a set.
     *
     * @throws NullPointerException if one of the elements was not part of the constructor collection
     */
    public void union(E e1, E e2) {

        Entry<E> r1 = map.get(e1).findRepresentative();
        Entry<E> r2 = map.get(e2).findRepresentative();
        if (r1 != r2) {
            // Union by size: the smaller set is appended to the larger one.
            if (r1.size <= r2.size) {
                r2.mergeWith(r1);
            } else {
                r1.mergeWith(r2);
            }
        }
    }


    /**
     * @return an iterator over the current sets; removal is not supported
     */
    @Override
    public Iterator<Set<E>> iterator() {

        return new Iterator<>() {

            private final Iterator<Entry<E>> iterator = map.values().iterator();
            private Entry<E> nextRepresentative;

            {
                findNextRepresentative();
            }

            @Override
            public boolean hasNext() {

                return nextRepresentative != null;
            }


            @Override
            public Set<E> next() {

                if (nextRepresentative == null) {
                    throw new NoSuchElementException();
                }
                Set<E> result = nextRepresentative.asSet();
                findNextRepresentative();
                return result;
            }


            // Advances to the next entry that is the representative of its set.
            private void findNextRepresentative() {

                while (iterator.hasNext()) {
                    Entry<E> candidate = iterator.next();
                    if (candidate.isRepresentative()) {
                        nextRepresentative = candidate;
                        return;
                    }
                }
                nextRepresentative = null;
            }


            @Override
            public void remove() {

                throw new UnsupportedOperationException();
            }
        };
    }


    /**
     * Node of the union-find forest. The members of a set are additionally chained through
     * {@code next}, starting at the representative, so a set can be listed without a scan.
     */
    private static class Entry<E> {

        private int size = 1;
        private final E value;
        private Entry<E> parent = this;
        private Entry<E> next = null;
        private Entry<E> last = this;


        Entry(E value) {

            this.value = value;
        }


        // Appends the member chain of the other representative to this one and adopts it.
        void mergeWith(Entry<E> otherRepresentative) {

            size += otherRepresentative.size;
            last.next = otherRepresentative;
            last = otherRepresentative.last;
            otherRepresentative.parent = this;
        }


        // Finds the root and re-points every entry on the path directly to it.
        Entry<E> findRepresentative() {

            Entry<E> representative = parent;
            while (representative.parent != representative) {
                representative = representative.parent;
            }
            for (Entry<E> entry = this; entry != representative; ) {
                Entry<E> nextEntry = entry.parent;
                entry.parent = representative;
                entry = nextEntry;
            }
            return representative;
        }


        boolean isRepresentative() {

            return parent == this;
        }


        // Read-only view of the set this entry belongs to, representative first.
        Set<E> asSet() {

            return new AbstractSet<E>() {

                @Override
                public Iterator<E> iterator() {

                    return new Iterator<E>() {

                        private Entry<E> nextEntry = findRepresentative();


                        @Override
                        public boolean hasNext() {

                            return nextEntry != null;
                        }


                        @Override
                        public E next() {

                            if (nextEntry == null) {
                                throw new NoSuchElementException();
                            }
                            E result = nextEntry.value;
                            nextEntry = nextEntry.next;
                            return result;
                        }


                        @Override
                        public void remove() {

                            throw new UnsupportedOperationException();
                        }
                    };
                }


                @Override
                public int size() {

                    return findRepresentative().size;
                }
            };
        }

    }

}

View File

@ -0,0 +1,91 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model;
/**
 * Fixed-range histogram with equally wide bins, optional kernel smoothing and peak lookup.
 */
public class Histogram {

    private static final double EPSILON = 1.0e-6;

    private final double min;
    private final double resolution;
    private double[] frequencies;


    /**
     * Creates an empty histogram. The covered range is widened by a small epsilon on both
     * sides and split into {@code round((maxValue - minValue) / resolution)} bins (at least one).
     *
     * @param minValue   smallest value that will be added
     * @param maxValue   largest value that will be added
     * @param resolution requested bin width; the effective width is derived from the bin count
     */
    public Histogram(double minValue, double maxValue, double resolution) {

        final int binCount = Math.max(1, (int) Math.round((maxValue - minValue) / resolution));
        final double paddedRange = maxValue - minValue + 2 * EPSILON;

        this.min = minValue - EPSILON;
        this.resolution = paddedRange / binCount;
        this.frequencies = new double[binCount];
    }


    /**
     * Replaces the frequencies by their convolution with {@code kernel}, centred on index
     * {@code (kernel.length - 1) / 2}. Bins outside the histogram contribute nothing.
     *
     * @param kernel convolution weights
     */
    public void kernelSmooth(double[] kernel) {

        final int binCount = frequencies.length;
        final int shift = (kernel.length - 1) / 2;
        final double[] smoothed = new double[binCount];

        for (int target = 0; target < binCount; target++) {
            double accumulated = 0.0;
            for (int k = 0; k < kernel.length; k++) {
                int source = target + k - shift;
                if (source >= 0 && source < binCount) {
                    accumulated += kernel[k] * frequencies[source];
                }
            }
            smoothed[target] = accumulated;
        }
        frequencies = smoothed;
    }


    /**
     * Builds a normalised Gaussian kernel expressed in bins of this histogram.
     *
     * @param length       window length in value units
     * @param stdDeviation standard deviation in value units
     * @return kernel with an odd number of weights that sum up to one
     */
    public double[] createGaussianKernel(double length, double stdDeviation) {

        final int radius = (int) Math.round(length / resolution) / 2;
        final double deviationInBins = stdDeviation / resolution;
        final double b = 2 * deviationInBins * deviationInBins;
        final double a = 1 / Math.sqrt(Math.PI * b);

        final double[] kernel = new double[2 * radius + 1];
        double total = 0;
        for (int i = 0; i < kernel.length; i++) {
            int offset = i - radius;
            kernel[i] = a * Math.exp(-offset * offset / b);
            total += kernel[i];
        }
        for (int i = 0; i < kernel.length; i++) {
            kernel[i] /= total;
        }
        return kernel;
    }


    /** Smooths the histogram with a Gaussian kernel of the given window length and deviation. */
    public void gaussianSmooth(double windowLength, double stdDeviation) {

        double[] kernel = createGaussianKernel(windowLength, stdDeviation);
        kernelSmooth(kernel);
    }


    /**
     * Increments the bin that contains {@code value}.
     *
     * @throws ArrayIndexOutOfBoundsException if the value maps to a bin outside the histogram
     */
    public void add(double value) {

        int bin = (int) ((value - min) / resolution);
        frequencies[bin] += 1.0;
    }


    /** @return the number of bins */
    public int getSize() {

        return frequencies.length;
    }


    /**
     * Finds the first bin with the highest frequency, extends it over directly following bins
     * of (almost) the same frequency and returns the value at the centre of that plateau.
     *
     * @return value at the centre of the peak
     */
    public double getPeakValue() {

        int plateauStart = 0;
        for (int i = 1; i < frequencies.length; i++) {
            if (frequencies[i] > frequencies[plateauStart]) {
                plateauStart = i;
            }
        }

        final double plateauTolerance = 0.0001;
        int plateauEnd = plateauStart + 1;
        while (plateauEnd < frequencies.length && Math.abs(frequencies[plateauEnd] - frequencies[plateauStart]) < plateauTolerance) {
            plateauEnd++;
        }

        return ((double) plateauStart + plateauEnd) / 2 * resolution + min;
    }

}

View File

@ -0,0 +1,165 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import lombok.Data;
@Data
public class Line extends BoundingBox {
private static final double WORD_DISTANCE_MULTIPLIER = 0.2;
private final double x0;
private final double y0;
private final double x1;
private final double y1;
private final double height;
private final List<Character> characters;
private final List<TextPositionSequence> words = new ArrayList<>();
public Line(List<Character> characters, double wordSpacing) {
this.characters = characters;
if (characters.size() >= 2) {
// linear regression
double sx = 0.0, sxx = 0.0, sxy = 0.0, sy = 0.0;
for (Character character : characters) {
sx += character.getX();
sxx += character.getX() * character.getX();
sxy += character.getX() * character.getY();
sy += character.getY();
}
double b = (characters.size() * sxy - sx * sy) / (characters.size() * sxx - sx * sx);
double a = (sy - b * sx) / characters.size();
this.x0 = characters.get(0).getX();
this.y0 = a + b * this.x0;
this.x1 = characters.get(characters.size() - 1).getX();
this.y1 = a + b * this.x1;
} else {
Character character = characters.get(0);
double dx = character.getTextPosition().getWidthDirAdj() / 3;
double dy = dx * Math.tan(0);
this.x0 = character.getX() - dx;
this.x1 = character.getX() + dx;
this.y0 = character.getY() - dy;
this.y1 = character.getY() + dy;
}
height = computeHeight();
computeWords(wordSpacing * WORD_DISTANCE_MULTIPLIER);
buildBBox();
}
public double getAngle() {
return Math.atan2(y1 - y0, x1 - x0);
}
public double getLength() {
return Math.sqrt((x0 - x1) * (x0 - x1) + (y0 - y1) * (y0 - y1));
}
private double computeHeight() {
double sum = 0.0;
for (Character component : characters) {
sum += component.getHeight();
}
return sum / characters.size();
}
public double angularDifference(Line j) {
double diff = Math.abs(getAngle() - j.getAngle());
if (diff <= Math.PI / 2) {
return diff;
} else {
return Math.PI - diff;
}
}
public double horizontalDistance(Line other) {
double[] xs = new double[4];
xs[0] = x0;
xs[1] = x1;
xs[2] = other.x0;
xs[3] = other.x1;
boolean overlapping = xs[1] >= xs[2] && xs[3] >= xs[0];
Arrays.sort(xs);
return Math.abs(xs[2] - xs[1]) * (overlapping ? 1 : -1);
}
public double verticalDistance(Line other) {
double ym = (y0 + y1) / 2;
double yn = (other.y0 + other.y1) / 2;
return Math.abs(ym - yn) / Math.sqrt(1);
}
private void computeWords(double wordSpacing) {
TextPositionSequence word = new TextPositionSequence();
Character previous = null;
for (Character current : characters) {
if (previous != null) {
double dist = current.getTextPosition().getXDirAdj() - previous.getTextPosition().getXDirAdj() - previous.getTextPosition().getWidthDirAdj();
if (dist > wordSpacing) {
words.add(word);
word = new TextPositionSequence();
}
}
word.getTextPositions().add(current.getTextPosition());
previous = current;
}
words.add(word);
}
private void buildBBox() {
double minX = Double.POSITIVE_INFINITY;
double minY = Double.POSITIVE_INFINITY;
double maxX = Double.NEGATIVE_INFINITY;
double maxY = Double.NEGATIVE_INFINITY;
for (Character character : characters) {
minX = Math.min(minX, character.getTextPosition().getXDirAdj());
minY = Math.min(minY, character.getTextPosition().getYDirAdj());
maxX = Math.max(maxX, character.getTextPosition().getXDirAdj() + character.getTextPosition().getWidthDirAdj());
maxY = Math.max(maxY, character.getTextPosition().getYDirAdj() + character.getTextPosition().getHeightDir());
}
this.setBBox(new Rectangle2D.Double(minX, minY, maxX - minX, maxY - minY));
}
public String toString() {
StringBuilder sb = new StringBuilder();
words.forEach(word -> sb.append(word.toString()).append(" "));
return sb.toString().trim();
}
}

View File

@ -0,0 +1,36 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model;
import lombok.Getter;
/**
 * Relation between an origin character and one of its neighbouring characters, with the
 * distance and angle between the two computed once at construction time.
 */
public class Neighbor {

    @Getter
    private final double distance;
    @Getter
    private final double angle;
    private final Character originCharacter;
    @Getter
    private final Character character;


    /**
     * @param neighbor the neighbouring character
     * @param origin   the character the neighbour belongs to
     */
    public Neighbor(Character neighbor, Character origin) {

        this.character = neighbor;
        this.originCharacter = origin;
        this.distance = this.character.distance(this.originCharacter);
        this.angle = this.character.angle(this.originCharacter);
    }


    /** @return absolute x distance between neighbour and origin */
    public double getHorizontalDistance() {

        return this.character.horizontalDistance(this.originCharacter);
    }


    /** @return absolute y distance between neighbour and origin */
    public double getVerticalDistance() {

        return this.character.verticalDistance(this.originCharacter);
    }

}

View File

@ -0,0 +1,51 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model;
import java.awt.geom.Rectangle2D;
import java.util.Comparator;
import java.util.List;
import lombok.Data;
@Data
public class Zone extends BoundingBox {
private List<Line> lines;
private int readingOrder = -1;
public Zone(List<Line> lines) {
lines.sort(Comparator.comparingDouble(Line::getY));
this.lines = lines;
buildBBox();
}
public void buildBBox() {
double minX = Double.POSITIVE_INFINITY;
double minY = Double.POSITIVE_INFINITY;
double maxX = Double.NEGATIVE_INFINITY;
double maxY = Double.NEGATIVE_INFINITY;
for (Line line : lines) {
minX = Math.min(minX, line.getX());
minY = Math.min(minY, line.getY());
maxX = Math.max(maxX, line.getX() + line.getWidth());
maxY = Math.max(maxY, line.getY() + line.getHeight());
}
this.setBBox(new Rectangle2D.Double(minX, minY, maxX - minX, maxY - minY));
}
public String toString() {
StringBuilder sb = new StringBuilder();
lines.forEach(line -> sb.append(line.toString()).append("\n"));
return sb.toString().trim();
}
}

View File

@ -0,0 +1,59 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.readingorder;
/**
 * Possible relations between two one-dimensional intervals X and Y.
 * <p>
 * Besides the six basic relations and {@link #EQUALS}, every basic relation has an
 * {@code _INVERSE} counterpart that describes the same situation with the roles of X and Y
 * swapped. The diagrams below show X on the first and Y on the second row.
 */
public enum IntervalRelations {
// No relation could be determined.
UNKNOWN,
// X ends before Y starts.
// |____X____|......................
// ......................|____Y____|
PRECEDES,
// X ends where Y starts.
// |____X____|.................
// ................|____Y____|
MEETS,
// X starts before Y and ends inside Y.
// |______X______|.................
// ................|______Y______|
OVERLAPS,
// X and Y start together, X ends first.
// |____X____|.................
// |_____Y_____|..............
STARTS,
// X lies completely inside Y.
// ........|____X____|.........
// .....|______Y______|.....
DURING,
// X starts after Y, both end together.
// .................|____X____|
// ..............|_____Y_____|
FINISHES,
// Inverse of PRECEDES: Y ends before X starts.
PRECEDES_INVERSE,
// Inverse of MEETS: Y ends where X starts.
MEETS_INVERSE,
// Inverse of OVERLAPS: Y starts before X and ends inside X.
OVERLAPS_INVERSE,
// Inverse of STARTS: X and Y start together, Y ends first.
STARTS_INVERSE,
// Inverse of DURING: Y lies completely inside X.
DURING_INVERSE,
// Inverse of FINISHES: Y starts after X, both end together.
FINISHES_INVERSE,
// X and Y start and end together.
// ..........|____X____|............
// ..........|____Y____|............
EQUALS
}

View File

@ -0,0 +1,11 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.readingorder;
import java.util.Collection;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Zone;
/**
 * Strategy that takes the zones of a page and returns a collection of zones.
 * NOTE(review): presumably the returned collection is in reading order - confirm against
 * the implementations.
 */
public interface ReadingOrderDetector {
/**
 * @param zones zones to process
 * @return the resulting zones
 */
Collection<Zone> get(Collection<Zone> zones);
}

View File

@ -0,0 +1,10 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.readingorder;
/**
 * Rule sets the {@code UnsupervisedReadingOrderDetector} can use to decide whether one zone
 * is read before another.
 */
public enum SpatialReasoningRules {
// In western culture the reading order is from left to right and from top to bottom.
BASIC,
// Like BASIC, but the diagonal direction 'bottom-left to top-right' is not allowed among the basic relations.
ROW_WISE,
// Like BASIC, but the diagonal direction 'top-right to bottom-left' is not allowed among the basic relations.
COLUMN_WISE;
}

View File

@ -0,0 +1,261 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.readingorder;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.stream.Collectors;
import org.apache.commons.lang3.tuple.ImmutablePair;
import org.apache.commons.lang3.tuple.Pair;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Zone;
import lombok.Getter;
/**
 * Orders zones by spatial reasoning.
 * <p>
 * For every pair of zones a "is read before" decision is derived from the interval relations
 * of their x and y extents (see {@link IntervalRelations}) and, optionally, from the order
 * in which their characters appear in the content stream. The zones are then emitted one by
 * one, always taking the zone that is "before" the largest number of remaining zones.
 */
public class UnsupervisedReadingOrderDetector {

    private boolean useRenderingOrder = true;
    @Getter
    private SpatialReasoningRules spatialReasoningRule = SpatialReasoningRules.COLUMN_WISE;
    private double tolerance = 5;

    private ZoneComparator zoneComparator;


    /** @return whether the rendering order is used in addition to the spatial rules */
    public boolean useRenderingOrder() {

        return useRenderingOrder;
    }


    /** Creates a column-wise detector with a tolerance of 5 that also uses the rendering order. */
    public UnsupervisedReadingOrderDetector() {

        configureComparator();
    }


    /**
     * @param tolerance            tolerance applied when comparing interval bounds
     * @param spatialReasoningRule rule set used for the spatial decision
     * @param useRenderingOrder    whether a zone also counts as "before" when its characters have
     *                             the lower average text sequence number
     */
    public UnsupervisedReadingOrderDetector(double tolerance, SpatialReasoningRules spatialReasoningRule, boolean useRenderingOrder) {

        this.tolerance = tolerance;
        this.spatialReasoningRule = spatialReasoningRule;
        this.useRenderingOrder = useRenderingOrder;
        configureComparator();
    }


    /**
     * Selects the comparator matching the configured rule set and, if enabled, combines it
     * with the rendering order (logical OR).
     */
    public void configureComparator() {

        final ZoneComparator spatialComparator = switch (spatialReasoningRule) {
            case COLUMN_WISE -> this::getBeforeInReadingColumnWise;
            case ROW_WISE -> this::getBeforeInReadingRowWise;
            default -> this::getBeforeInReading;
        };

        if (useRenderingOrder) {
            zoneComparator = (Zone z1, Zone z2, double t) -> spatialComparator.isBefore(z1, z2, t) || getBeforeInRendering(z1, z2);
        } else {
            zoneComparator = spatialComparator;
        }
    }


    /**
     * Orders the zones and stores the resulting position in each zone.
     *
     * @param zones zones of a page
     * @return the same zones in reading order
     */
    public List<Zone> get(List<Zone> zones) {

        Map<Integer, List<Integer>> graph = buildGraph(zones);
        List<Zone> orderedZones = new ArrayList<>(zones.size());
        int readingOrder = 0;

        while (!graph.isEmpty()) {
            // Take the first zone that precedes the largest number of remaining zones.
            Integer next = null;
            int mostSuccessors = -1;
            for (Map.Entry<Integer, List<Integer>> entry : graph.entrySet()) {
                if (entry.getValue().size() > mostSuccessors) {
                    mostSuccessors = entry.getValue().size();
                    next = entry.getKey();
                }
            }

            graph.remove(next);
            for (List<Integer> successors : graph.values()) {
                successors.remove((Object) next);
            }

            Zone zone = zones.get(next);
            zone.setReadingOrder(readingOrder++);
            orderedZones.add(zone);
        }
        return orderedZones;
    }


    // Maps every zone index to the indices of all zones it is "before".
    private Map<Integer, List<Integer>> buildGraph(List<Zone> zones) {

        Map<Integer, List<Integer>> graph = new HashMap<>();

        for (int i = 0; i < zones.size(); i++) {
            graph.put(i, new ArrayList<>());
        }

        for (int i = 0; i < zones.size(); i++) {
            Zone zone1 = zones.get(i);
            for (int j = 0; j < zones.size(); j++) {
                if (i == j) {
                    continue;
                }
                Zone zone2 = zones.get(j);
                if (zoneComparator.isBefore(zone1, zone2, tolerance)) {
                    graph.get(i).add(j);
                }
            }
        }

        return graph;
    }


    // A zone is rendered before another if its characters have the lower average text sequence.
    private boolean getBeforeInRendering(Zone z1, Zone z2) {

        return averageTextSequence(z1) < averageTextSequence(z2);
    }


    // Average text sequence number over all characters of the zone.
    private static double averageTextSequence(Zone zone) {

        return zone.getLines()
                .stream()
                .flatMap(line -> line.getCharacters()
                        .stream())
                .map(character -> character.getTextPosition().getTextSequence())
                .collect(Collectors.averagingDouble(Integer::intValue));
    }


    // BASIC rule: z1 is before z2 if it precedes, meets or overlaps z2 on either axis.
    private boolean getBeforeInReading(Zone z1, Zone z2, double tolerance) {

        IntervalRelations xRelation = primaryRelation(getIntervalRelationX(z1, z2, tolerance));
        IntervalRelations yRelation = primaryRelation(getIntervalRelationY(z1, z2, tolerance));

        return isPrecedesMeetsOrOverlaps(xRelation) || isPrecedesMeetsOrOverlaps(yRelation);
    }


    // COLUMN_WISE rule: the x relation is the primary axis, the y relation the secondary one.
    private boolean getBeforeInReadingColumnWise(Zone z1, Zone z2, double tolerance) {

        IntervalRelations xRelation = primaryRelation(getIntervalRelationX(z1, z2, tolerance));
        IntervalRelations yRelation = primaryRelation(getIntervalRelationY(z1, z2, tolerance));

        return getIntervalRelations(xRelation, yRelation);
    }


    // ROW_WISE rule: the y relation is the primary axis, the x relation the secondary one.
    private boolean getBeforeInReadingRowWise(Zone z1, Zone z2, double tolerance) {

        IntervalRelations xRelation = primaryRelation(getIntervalRelationX(z1, z2, tolerance));
        IntervalRelations yRelation = primaryRelation(getIntervalRelationY(z1, z2, tolerance));

        return getIntervalRelations(yRelation, xRelation);
    }


    /**
     * Decides "before" for a primary axis relation and a secondary axis relation:
     * the primary relation precedes or meets; or it overlaps while the secondary relation
     * precedes, meets or overlaps; or the secondary relation precedes, meets or overlaps while
     * the primary intervals share extent (starts, during, finishes, equals, inverse overlaps
     * and the listed inverse relations).
     */
    private static boolean getIntervalRelations(IntervalRelations relation1, IntervalRelations relation2) {

        final boolean secondaryBefore = isPrecedesMeetsOrOverlaps(relation2);

        final boolean primaryShared = relation1 == IntervalRelations.STARTS //
                || relation1 == IntervalRelations.FINISHES_INVERSE //
                || relation1 == IntervalRelations.EQUALS //
                || relation1 == IntervalRelations.DURING //
                || relation1 == IntervalRelations.DURING_INVERSE //
                || relation1 == IntervalRelations.FINISHES //
                || relation1 == IntervalRelations.STARTS_INVERSE //
                || relation1 == IntervalRelations.OVERLAPS_INVERSE;

        return relation1 == IntervalRelations.PRECEDES //
                || relation1 == IntervalRelations.MEETS //
                || (relation1 == IntervalRelations.OVERLAPS && secondaryBefore) //
                || (secondaryBefore && primaryShared);
    }


    private static boolean isPrecedesMeetsOrOverlaps(IntervalRelations relation) {

        return relation == IntervalRelations.PRECEDES || relation == IntervalRelations.MEETS || relation == IntervalRelations.OVERLAPS;
    }


    // First matching relation, or UNKNOWN if the intervals matched none (e.g. NaN bounds).
    private static IntervalRelations primaryRelation(List<IntervalRelations> relations) {

        return relations.isEmpty() ? IntervalRelations.UNKNOWN : relations.get(0);
    }


    private static List<IntervalRelations> getIntervalRelationX(Zone z1, Zone z2, double t) {

        return getIntervalRelation(new ImmutablePair<>(z1.getX(), z1.getX() + z1.getWidth()), new ImmutablePair<>(z2.getX(), z2.getX() + z2.getWidth()), t);
    }


    private static List<IntervalRelations> getIntervalRelationY(Zone z1, Zone z2, double t) {

        return getIntervalRelation(new ImmutablePair<>(z1.getY(), z1.getY() + z1.getHeight()), new ImmutablePair<>(z2.getY(), z2.getY() + z2.getHeight()), t);
    }


    /**
     * Collects all relations that hold between interval {@code a} and interval {@code b}:
     * first the basic relations, then the inverse ones, and finally EQUALS.
     *
     * @param a interval as (start, end)
     * @param b interval as (start, end)
     * @param t tolerance within which two bounds count as equal
     */
    private static List<IntervalRelations> getIntervalRelation(Pair<Double, Double> a, Pair<Double, Double> b, double t) {

        var intervalRelations = getIntervalRelation(a, b, t, false);
        intervalRelations.addAll(getIntervalRelation(b, a, t, true));

        boolean sameStart = b.getLeft() - t <= a.getLeft() && a.getLeft() <= b.getLeft() + t;
        boolean sameEnd = b.getRight() - t <= a.getRight() && a.getRight() <= b.getRight() + t;
        if (sameStart && sameEnd) {
            intervalRelations.add(IntervalRelations.EQUALS);
        }

        return intervalRelations;
    }


    /**
     * Tests the six basic relations of {@code a} with respect to {@code b}. The tests are
     * independent of each other, so more than one relation may be reported.
     *
     * @param inverse if {@code true} the {@code _INVERSE} constants are reported instead
     */
    private static List<IntervalRelations> getIntervalRelation(Pair<Double, Double> a, Pair<Double, Double> b, double t, boolean inverse) {

        List<IntervalRelations> intervalRelations = new ArrayList<>();

        if (a.getRight() < b.getLeft() - t) {
            intervalRelations.add(inverse ? IntervalRelations.PRECEDES_INVERSE : IntervalRelations.PRECEDES);
        }
        if (b.getLeft() - t <= a.getRight() && a.getRight() <= b.getLeft() + t) {
            intervalRelations.add(inverse ? IntervalRelations.MEETS_INVERSE : IntervalRelations.MEETS);
        }
        if (a.getLeft() < b.getLeft() - t && (b.getLeft() + t < a.getRight() && a.getRight() < b.getRight() - t)) {
            intervalRelations.add(inverse ? IntervalRelations.OVERLAPS_INVERSE : IntervalRelations.OVERLAPS);
        }
        if ((b.getLeft() - t <= a.getLeft() && a.getLeft() <= b.getLeft() + t) && a.getRight() < b.getRight() - t) {
            intervalRelations.add(inverse ? IntervalRelations.STARTS_INVERSE : IntervalRelations.STARTS);
        }
        // 'During' requires a to end strictly before b; an end within the tolerance of b's end
        // is covered by 'finishes' below.
        if (a.getLeft() > b.getLeft() + t && a.getRight() < b.getRight() - t) {
            intervalRelations.add(inverse ? IntervalRelations.DURING_INVERSE : IntervalRelations.DURING);
        }
        if (a.getLeft() > b.getLeft() + t && (b.getRight() - t <= a.getRight() && a.getRight() <= b.getRight() + t)) {
            intervalRelations.add(inverse ? IntervalRelations.FINISHES_INVERSE : IntervalRelations.FINISHES);
        }

        return intervalRelations;
    }

}

View File

@ -0,0 +1,8 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.readingorder;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Zone;
/**
 * Decides whether one zone comes before another one.
 */
@FunctionalInterface
public interface ZoneComparator {
/**
 * @param zone1     first zone
 * @param zone2     second zone
 * @param tolerance tolerance the implementation may apply when comparing positions
 * @return {@code true} if {@code zone1} is considered to be before {@code zone2}
 */
boolean isBefore(Zone zone1, Zone zone2, double tolerance);
}

View File

@ -0,0 +1,52 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.AngleFilter;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Character;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.DisjointSets;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Line;
/**
 * Groups characters into text lines by merging each character with those of its neighbours
 * that lie close enough and roughly on the same horizontal axis.
 */
@Service
public class LineBuilderService {

    private static final double CHARACTER_SPACING_DISTANCE_MULTIPLIER = 3.5;
    private static final double MAX_VERTICAL_CHARACTER_DISTANCE = 0.67;
    private static final double ANGLE_TOLERANCE = Math.PI / 6;


    /**
     * Builds the lines for the given characters. The neighbours of every character must
     * already have been determined.
     *
     * @param characters       characters to group
     * @param characterSpacing spacing estimate between characters; scaled to the maximum horizontal distance
     * @param lineSpacing      spacing estimate between lines; scaled to the maximum vertical distance
     * @return one line per group of connected characters, each with its characters sorted by x
     */
    public List<Line> buildLines(List<Character> characters, double characterSpacing, double lineSpacing) {

        final double horizontalRadius = characterSpacing * CHARACTER_SPACING_DISTANCE_MULTIPLIER;
        final double verticalRadius = lineSpacing * MAX_VERTICAL_CHARACTER_DISTANCE;

        final DisjointSets<Character> groups = new DisjointSets<>(characters);
        final AngleFilter angleFilter = new AngleFilter(-ANGLE_TOLERANCE, ANGLE_TOLERANCE);

        for (Character character : characters) {
            for (var neighbor : character.getNeighbors()) {
                if (character.getTextPosition().getDir() != neighbor.getCharacter().getTextPosition().getDir()) {
                    continue;
                }
                if (!angleFilter.matches(neighbor)) {
                    continue;
                }
                // The neighbour has to lie inside the ellipse spanned by the two maximum distances.
                double x = neighbor.getHorizontalDistance() / horizontalRadius;
                double y = neighbor.getVerticalDistance() / verticalRadius;
                if (Math.pow(x, 2) + Math.pow(y, 2) <= 1) {
                    groups.union(character, neighbor.getCharacter());
                }
            }
        }

        final List<Line> lines = new ArrayList<>();
        for (var group : groups) {
            List<Character> lineCharacters = new ArrayList<>(group);
            lineCharacters.sort(Comparator.comparingDouble(Character::getX));
            lines.add(new Line(lineCharacters, characterSpacing));
        }
        return lines;
    }

}

View File

@ -0,0 +1,78 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Character;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Neighbor;
/**
 * Determines for every character its nearest neighbours (at most
 * {@value #NUMBER_OF_NEIGHBOURS}) by Euclidean distance.
 */
@Service
public class NearestNeighbourService {

    private static final int NUMBER_OF_NEIGHBOURS = 8;
    private static final double STEP = 16.0;


    /**
     * Sorts {@code characters} by x in place and stores the nearest neighbours of every
     * character on the character itself.
     * <p>
     * For each character the search window along the x axis is widened in increments of
     * {@link #STEP} until it is at least as wide as the distance to the farthest neighbour
     * kept so far, or until every other character has been examined.
     *
     * @param characters characters of one page and direction; must be a mutable list
     */
    public void findNearestNeighbors(List<Character> characters) {

        if (characters.isEmpty()) {
            return;
        }
        if (characters.size() == 1) {
            // A lone character has no neighbours; without this guard the search below
            // would never find a candidate and therefore never terminate.
            characters.get(0).setNeighbors(new ArrayList<>());
            return;
        }

        characters.sort(Comparator.comparingDouble(Character::getX));

        final int maxNeighborCount = Math.min(NUMBER_OF_NEIGHBOURS, characters.size() - 1);

        for (int i = 0; i < characters.size(); i++) {
            final Character origin = characters.get(i);

            List<Neighbor> candidates = new ArrayList<>();

            int start = i;
            int end = i + 1;

            double distance = Double.POSITIVE_INFINITY;

            for (double searchDistance = 0; searchDistance < distance; ) {
                searchDistance += STEP;
                boolean newCandidatesFound = false;

                while (start > 0 && origin.getX() - characters.get(start - 1).getX() < searchDistance) {
                    start--;
                    candidates.add(new Neighbor(characters.get(start), origin));
                    clearLeastDistant(candidates, maxNeighborCount);
                    newCandidatesFound = true;
                }
                while (end < characters.size() && characters.get(end).getX() - origin.getX() < searchDistance) {
                    candidates.add(new Neighbor(characters.get(end), origin));
                    clearLeastDistant(candidates, maxNeighborCount);
                    end++;
                    newCandidatesFound = true;
                }

                if (newCandidatesFound && candidates.size() >= maxNeighborCount) {
                    // The list is only sorted when it gets trimmed, so sort before reading the
                    // distance of the farthest neighbour that is kept.
                    candidates.sort(Comparator.comparingDouble(Neighbor::getDistance));
                    distance = candidates.get(maxNeighborCount - 1).getDistance();
                }

                if (start == 0 && end == characters.size()) {
                    // Every other character has been examined; widening further changes nothing.
                    break;
                }
            }
            clearLeastDistant(candidates, maxNeighborCount);
            origin.setNeighbors(new ArrayList<>(candidates));
        }
    }


    // Keeps only the maxNeighborCount candidates with the smallest distance.
    private void clearLeastDistant(List<Neighbor> candidates, int maxNeighborCount) {

        if (candidates.size() > maxNeighborCount) {
            candidates.sort(Comparator.comparingDouble(Neighbor::getDistance));
            candidates.subList(maxNeighborCount, candidates.size()).clear();
        }
    }

}

View File

@ -0,0 +1,167 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.ListIterator;
import java.util.Map;
import java.util.stream.Collectors;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.BoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Zone;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.readingorder.SpatialReasoningRules;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.readingorder.UnsupervisedReadingOrderDetector;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.utils.DoubleUtils;
/**
 * Brings the zones of a page into reading order.
 */
@Service
public class ReadingOrderService {

    private static final double THRESHOLD = 1;

    /** Orders boxes by y first and x second, both compared on a grid of width {@link #THRESHOLD}. */
    private static final Comparator<BoundingBox> BY_Y_THEN_X = Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
            .thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD));


    /**
     * Orders the zones with the {@link UnsupervisedReadingOrderDetector}.
     *
     * @param zones      zones of one page
     * @param columnWise selects {@link SpatialReasoningRules#COLUMN_WISE} when true, otherwise {@link SpatialReasoningRules#ROW_WISE}
     * @return the given list itself if it holds at most one zone, otherwise the detector's result
     */
    public List<Zone> resolveNew(List<Zone> zones, boolean columnWise) {

        if (zones.size() <= 1) {
            return zones;
        }

        SpatialReasoningRules rules;
        if (columnWise) {
            rules = SpatialReasoningRules.COLUMN_WISE;
        } else {
            rules = SpatialReasoningRules.ROW_WISE;
        }
        return new UnsupervisedReadingOrderDetector(1, rules, false).get(zones);
    }


    /**
     * Orders the zones either by a plain y/x sort or by the multi column heuristic.
     *
     * <p>Note: the {@code columnWise} argument is not consulted. The decision is derived from the
     * zones themselves: the multi column heuristic is used when, on average, more than 1.5 zones
     * cover each occupied (rounded) y position.
     *
     * @param zones      zones of one page; sorted in place when the plain sort is used
     * @param columnWise ignored, see above
     * @return the given list (possibly re-sorted) or a new list produced by the multi column heuristic
     */
    public List<Zone> resolve(List<Zone> zones, boolean columnWise) {

        if (zones.size() <= 1) {
            return zones;
        }

        boolean multiColumn = averageZonesPerRow(zones) > 1.5;
        if (multiColumn) {
            return resolveMultiColumnReadingOrder(zones);
        }

        zones.sort(BY_Y_THEN_X);
        return zones;
    }


    /**
     * Counts, for every integer y position covered by at least one zone, how many zones cover it,
     * and returns the mean of those counts (1 if there are none).
     */
    private double averageZonesPerRow(List<Zone> zones) {

        Map<Long, Integer> zonesPerRow = new HashMap<>();
        for (Zone zone : zones) {
            long firstRow = Math.round(zone.getBBox().getMinY());
            long lastRow = Math.round(zone.getBBox().getMaxY());
            for (long row = firstRow; row <= lastRow; row++) {
                zonesPerRow.merge(row, 1, Integer::sum);
            }
        }
        return zonesPerRow.values()
                .stream()
                .mapToInt(Integer::intValue).average()
                .orElse(1);
    }


    /**
     * Simple reading order resolver for multi column page layout as described here:
     * https://pub.towardsai.net/advanced-rag-02-unveiling-pdf-parsing-b84ae866344e
     *
     * <p>Zones are split at the horizontal mid line into a left and a right group; zones crossing
     * the mid line, and zones that have no vertical overlap with any zone of the opposite group,
     * are handled separately and inserted by their y coordinate.
     */
    // TODO implement a more fancy reading order resolver see https://github.com/BobLd/DocumentLayoutAnalysis/blob/master/README.md#reading-order
    private List<Zone> resolveMultiColumnReadingOrder(List<Zone> zones) {

        // Horizontal extent covered by all zones.
        double leftEdge = Double.POSITIVE_INFINITY;
        double rightEdge = Double.NEGATIVE_INFINITY;
        for (Zone zone : zones) {
            if (zone.getX() < leftEdge) {
                leftEdge = zone.getX();
            }
            if (zone.getX() + zone.getWidth() > rightEdge) {
                rightEdge = zone.getX() + zone.getWidth();
            }
        }
        double midLineXCoordinate = (leftEdge + rightEdge) / 2;

        // Split into zones entirely left of, entirely right of, or touching/crossing the mid line.
        List<Zone> leftOf = new ArrayList<>();
        List<Zone> rightOf = new ArrayList<>();
        List<Zone> middle = new ArrayList<>();
        for (Zone zone : zones) {
            boolean entirelyLeft = zone.getX() < midLineXCoordinate && zone.getX() + zone.getWidth() < midLineXCoordinate;
            boolean entirelyRight = zone.getX() > midLineXCoordinate && zone.getX() + zone.getWidth() > midLineXCoordinate;
            if (entirelyLeft) {
                leftOf.add(zone);
            } else if (entirelyRight) {
                rightOf.add(zone);
            } else {
                middle.add(zone);
            }
        }

        // Both lookups must run against the unmodified groups, hence before any removal.
        List<Zone> leftNotIntersecting = withoutVerticalCounterpart(leftOf, rightOf);
        List<Zone> rightNotIntersecting = withoutVerticalCounterpart(rightOf, leftOf);

        leftOf.removeAll(leftNotIntersecting);
        rightOf.removeAll(rightNotIntersecting);
        middle.addAll(leftNotIntersecting);
        middle.addAll(rightNotIntersecting);

        leftOf.sort(BY_Y_THEN_X);
        rightOf.sort(BY_Y_THEN_X);
        middle.sort(BY_Y_THEN_X);

        List<Zone> sortedZones = new ArrayList<>(leftOf);
        sortedZones.addAll(rightOf);

        // Each middle zone goes in front of the first already placed zone with a larger y;
        // zones for which no such position exists are appended at the end, in sorted order.
        List<Zone> unplaced = new ArrayList<>();
        for (Zone current : middle) {
            int position = firstIndexWithLargerY(sortedZones, current);
            if (position >= 0) {
                sortedZones.add(position, current);
            } else {
                unplaced.add(current);
            }
        }
        sortedZones.addAll(unplaced);

        return sortedZones;
    }


    /** Returns the candidates for which {@code intersectsY} is false against every zone in {@code others}. */
    private List<Zone> withoutVerticalCounterpart(List<Zone> candidates, List<Zone> others) {

        List<Zone> isolated = new ArrayList<>();
        for (Zone candidate : candidates) {
            if (others.stream().noneMatch(other -> candidate.intersectsY(other))) {
                isolated.add(candidate);
            }
        }
        return isolated;
    }


    /** Returns the index of the first zone whose y is greater than that of {@code zone}, or -1 if there is none. */
    private int firstIndexWithLargerY(List<Zone> placed, Zone zone) {

        for (int i = 0; i < placed.size(); i++) {
            if (zone.getY() < placed.get(i).getY()) {
                return i;
            }
        }
        return -1;
    }

}

View File

@ -0,0 +1,56 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service;
import java.util.List;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.AngleFilter;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Character;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Histogram;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Neighbor;
/**
 * Estimates typical spacings of a page from the neighbour distances stored on its characters.
 */
@Service
public class SpacingService {

    private static final double SPACING_HISTOGRAM_RESOLUTION = 0.5;
    private static final double SPACING_HISTOGRAM_SMOOTHING_LENGTH = 2.5;
    private static final double SPACING_HIST_SMOOTHING_STANDARD_DEVIATION = 0.5;
    private static final double ANGLE_TOLERANCE = Math.PI / 6;


    /**
     * Spacing estimate for neighbours around angle 0.
     *
     * @param characters characters whose neighbours have already been assigned
     * @return the peak of the smoothed distance histogram
     */
    public double computeCharacterSpacing(List<Character> characters) {

        return computeSpacing(characters, 0);
    }


    /**
     * Spacing estimate for neighbours around angle pi/2.
     *
     * @param characters characters whose neighbours have already been assigned
     * @return the peak of the smoothed distance histogram
     */
    public double computeLineSpacing(List<Character> characters) {

        return computeSpacing(characters, Math.PI / 2);
    }


    /**
     * Builds a histogram over the distances of all neighbours accepted by an angle filter of
     * {@code angle} +/- {@link #ANGLE_TOLERANCE}, smooths it and returns its peak value.
     */
    private double computeSpacing(List<Character> characters, double angle) {

        // The histogram range is sized by the largest distance of any neighbour, filtered or not.
        Histogram spacings = new Histogram(0, largestNeighborDistance(characters), SPACING_HISTOGRAM_RESOLUTION);
        AngleFilter directionFilter = new AngleFilter(angle - ANGLE_TOLERANCE, angle + ANGLE_TOLERANCE);

        for (Character character : characters) {
            for (Neighbor neighbor : character.getNeighbors()) {
                if (!directionFilter.matches(neighbor)) {
                    continue;
                }
                spacings.add(neighbor.getDistance());
            }
        }

        spacings.gaussianSmooth(SPACING_HISTOGRAM_SMOOTHING_LENGTH, SPACING_HIST_SMOOTHING_STANDARD_DEVIATION);
        return spacings.getPeakValue();
    }


    /** Largest neighbour distance over all characters; negative infinity when there is no neighbour at all. */
    private double largestNeighborDistance(List<Character> characters) {

        return characters.stream()
                .flatMap(character -> character.getNeighbors().stream())
                .mapToDouble(Neighbor::getDistance)
                .reduce(Double.NEGATIVE_INFINITY, Math::max);
    }

}

View File

@ -0,0 +1,190 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.ListIterator;
import java.util.Set;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Character;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.DisjointSets;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Line;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Zone;
/**
 * Groups text lines into zones by unioning lines that are close enough to each other.
 */
@Service
public class ZoneBuilderService {

    private static final double MIN_HORIZONTAL_DISTANCE_MULTIPLIER = -0.5;
    private static final double MAX_VERTICAL_DISTANCE_MULTIPLIER = 1.2;

    private static final double MIN_HORIZONTAL_MERGE_DISTANCE_MULTIPLIER = -3.0;
    private static final double MAX_VERTICAL_MERGE_DISTANCE_MULTIPLIER = 0.5;

    private static final double MIN_LINE_SIZE_SCALE = 0.9;
    private static final double MAX_LINE_SIZE_SCALE = 2.5;

    private static final double ANGLE_TOLERANCE = Math.PI / 6;

    private static final int MAX_ZONES = 300;

    private static final double MAX_VERTICAL_MERGE_DISTANCE = 0.5;


    /**
     * Builds zones from the given lines.
     *
     * <p>Two lines end up in the same zone when their angular difference is within
     * {@link #ANGLE_TOLERANCE} and their scaled horizontal/vertical distances satisfy one of the two
     * threshold pairs derived from {@code characterSpacing} and {@code lineSpacing}. If more than
     * {@link #MAX_ZONES} zones result, all lines are instead merged into one single zone.
     *
     * @param lines            lines of one page
     * @param characterSpacing spacing used to derive the horizontal thresholds
     * @param lineSpacing      spacing used to derive the vertical thresholds
     * @return a new, modifiable list of zones
     */
    public List<Zone> buildZones(List<Line> lines, double characterSpacing, double lineSpacing) {

        double minHorizontalDistance = characterSpacing * MIN_HORIZONTAL_DISTANCE_MULTIPLIER;
        double maxVerticalDistance = lineSpacing * MAX_VERTICAL_DISTANCE_MULTIPLIER;
        double minHorizontalMergeDistance = characterSpacing * MIN_HORIZONTAL_MERGE_DISTANCE_MULTIPLIER;
        double maxVerticalMergeDistance = lineSpacing * MAX_VERTICAL_MERGE_DISTANCE_MULTIPLIER;

        DisjointSets<Line> sets = new DisjointSets<>(lines);

        double meanHeight = calculateMeanHeight(lines);

        for (Line outerLine : lines) {
            for (Line innerLine : lines) {
                if (!sets.areTogether(outerLine, innerLine) && outerLine.angularDifference(innerLine) <= ANGLE_TOLERANCE) {

                    // Distances are normalised by the relative height of the smaller line,
                    // clamped to [MIN_LINE_SIZE_SCALE, MAX_LINE_SIZE_SCALE].
                    double scale = Math.min(outerLine.getHeight(), innerLine.getHeight()) / meanHeight;
                    scale = Math.max(MIN_LINE_SIZE_SCALE, Math.min(scale, MAX_LINE_SIZE_SCALE));

                    double horizontalDistance = outerLine.horizontalDistance(innerLine) / scale;
                    double verticalDistance = outerLine.verticalDistance(innerLine) / scale;

                    boolean withinRegularLimits = minHorizontalDistance <= horizontalDistance && verticalDistance <= maxVerticalDistance;
                    boolean withinMergeLimits = minHorizontalMergeDistance <= horizontalDistance && verticalDistance <= maxVerticalMergeDistance;
                    if (withinRegularLimits || withinMergeLimits) {
                        sets.union(outerLine, innerLine);
                    }
                }
            }
        }

        List<Zone> zones = new ArrayList<>();
        for (Set<Line> group : sets) {
            zones.add(new Zone(new ArrayList<>(group)));
        }

        // The additional mergeZones(...) pass is intentionally not applied at the moment.

        if (zones.size() > MAX_ZONES) {
            List<Line> oneZoneLines = new ArrayList<>();
            for (Zone zone : zones) {
                oneZoneLines.addAll(zone.getLines());
            }
            // Return a modifiable list here as well, so callers can treat both outcomes alike
            // (an immutable list would make in-place sorting or removal fail only on such pages).
            List<Zone> singleZone = new ArrayList<>();
            singleZone.add(mergeLinesInZone(oneZoneLines, characterSpacing, lineSpacing));
            return singleZone;
        }

        return zones;
    }


    /**
     * Folds every zone whose bounding box intersects that of another zone into that other zone.
     * Works in place on {@code zones}. Currently not called from within this class.
     */
    private List<Zone> mergeZones(List<Zone> zones) {

        ListIterator<Zone> itty = zones.listIterator();
        while (itty.hasNext()) {
            Zone current = itty.next();
            for (Zone inner : zones) {
                if (inner == current) {
                    continue;
                }
                if (current.getBBox().intersects(inner.getBBox())) {
                    inner.getLines().addAll(current.getLines());
                    inner.buildBBox();
                    // Leave the inner loop right after removing: its iterator is stale now.
                    itty.remove();
                    break;
                }
            }
        }
        return zones;
    }


    /**
     * Mean line height, weighted by line length.
     * Evaluates to NaN when the total length is zero (for example for an empty list).
     */
    private double calculateMeanHeight(List<Line> lines) {

        double weightedHeightSum = 0.0;
        double weights = 0.0;
        for (Line line : lines) {
            double weight = line.getLength();
            weightedHeightSum += line.getHeight() * weight;
            weights += weight;
        }
        return weightedHeightSum / weights;
    }


    /**
     * Applies {@link #mergeLinesInZone(List, double, double)} to every zone.
     * Currently not called from within this class.
     */
    private List<Zone> mergeLinesInZones(List<Zone> zones, double characterSpacing, double lineSpacing) {

        List<Zone> merged = new ArrayList<>();
        for (Zone zone : zones) {
            merged.add(mergeLinesInZone(zone.getLines(), characterSpacing, lineSpacing));
        }
        return merged;
    }


    /**
     * Re-groups the given lines and returns them as one zone.
     *
     * <p>Two distinct lines are joined when their vertical distance lies in
     * [0, {@code lineSpacing * MAX_VERTICAL_MERGE_DISTANCE}] and either their horizontal distance
     * is at most 0, or the horizontal distance is within 0.1 of the shorter line's length and
     * their characters do not collide (see {@link #charactersCollide(Line, Line)}). The characters
     * of every group are sorted by x and turned into one new line.
     */
    private Zone mergeLinesInZone(List<Line> lines, double characterSpacing, double lineSpacing) {

        double maxHorizontalDistance = 0;
        double minVerticalDistance = 0;
        double maxVerticalDistance = lineSpacing * MAX_VERTICAL_MERGE_DISTANCE;

        DisjointSets<Line> sets = new DisjointSets<>(lines);

        for (Line outer : lines) {
            for (Line inner : lines) {
                if (inner == outer) {
                    continue;
                }

                double horizontalDistance = outer.horizontalDistance(inner);
                double verticalDistance = outer.verticalDistance(inner);

                // Both merge rules require the vertical distance to be inside the allowed band.
                boolean verticallyClose = minVerticalDistance <= verticalDistance && verticalDistance <= maxVerticalDistance;
                if (!verticallyClose) {
                    continue;
                }

                if (horizontalDistance <= maxHorizontalDistance) {
                    sets.union(outer, inner);
                    continue;
                }

                boolean distanceMatchesShorterLine = Math.abs(horizontalDistance - Math.min(outer.getLength(), inner.getLength())) < 0.1;
                if (distanceMatchesShorterLine && !charactersCollide(outer, inner)) {
                    sets.union(outer, inner);
                }
            }
        }

        List<Line> outputZone = new ArrayList<>();
        for (Set<Line> group : sets) {
            List<Character> components = new ArrayList<>();
            for (Line line : group) {
                components.addAll(line.getCharacters());
            }
            components.sort(Comparator.comparingDouble(Character::getX));

            outputZone.add(new Line(components, characterSpacing));
        }

        return new Zone(outputZone);
    }


    /**
     * Returns true when any character pair of the two lines has an overlapping distance above 2,
     * or when more than two pairs have a positive overlapping distance.
     */
    private boolean charactersCollide(Line outer, Line inner) {

        int overlappingCount = 0;
        for (Character outerCharacter : outer.getCharacters()) {
            for (Character innerCharacter : inner.getCharacters()) {
                double characterOverlapDistance = outerCharacter.overlappingDistance(innerCharacter);
                if (characterOverlapDistance > 2) {
                    return true;
                }
                if (characterOverlapDistance > 0) {
                    overlappingCount++;
                    if (overlappingCount > 2) {
                        return true;
                    }
                }
            }
        }
        return false;
    }

}

View File

@ -0,0 +1,40 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.utils;
import java.util.Comparator;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Character;
/**
 * Orders characters by text direction first, then line by line, and by x within a line.
 * All coordinates used are the direction adjusted ones, where 0,0 is in the upper left.
 */
public class CharacterComparator implements Comparator<Character> {

    @Override
    public int compare(Character pos1, Character pos2) {

        var first = pos1.getTextPosition();
        var second = pos2.getTextPosition();

        // Characters with different text directions are ordered by direction alone.
        int directionOrder = Float.compare(first.getDir(), second.getDir());
        if (directionOrder != 0) {
            return directionOrder;
        }

        float firstBottom = first.getYDirAdj();
        float secondBottom = second.getYDirAdj();
        float firstTop = firstBottom - first.getHeightDir();
        float secondTop = secondBottom - second.getHeightDir();

        // Simple tolerance check: both count as being on one line if their bottoms nearly
        // coincide, or if the bottom of one lies within the vertical span of the other.
        boolean bottomsCoincide = Math.abs(firstBottom - secondBottom) < .1;
        boolean secondBottomWithinFirst = secondBottom >= firstTop && secondBottom <= firstBottom;
        boolean firstBottomWithinSecond = firstBottom >= secondTop && firstBottom <= secondBottom;

        if (bottomsCoincide || secondBottomWithinFirst || firstBottomWithinSecond) {
            return Float.compare(first.getXDirAdj(), second.getXDirAdj());
        }
        return firstBottom < secondBottom ? -1 : 1;
    }

}

View File

@ -0,0 +1,18 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.utils;
/**
 * Helpers for comparing floating point values with a tolerance.
 */
public class DoubleUtils {

    /**
     * Compares two values after snapping both onto a grid of width {@code precision}.
     *
     * <p>Each value is divided by the grid width and rounded to the nearest {@code long}; the
     * rounded values are then compared. Two values are therefore equal when they round to the
     * same grid cell, which is not the same as "differ by less than {@code precision}"
     * (with a precision of 1, 2.4 and 2.6 compare as different).
     *
     * @param d1        first value
     * @param d2        second value
     * @param precision grid width; only its magnitude is used, and 0 is treated as 1
     * @return a negative value, zero or a positive value as {@code d1} is less than, equal to or
     *         greater than {@code d2} on the grid; if either value is NaN the result of
     *         {@link Double#compare(double, double)} is returned
     */
    public static int compareDouble(double d1, double d2, double precision) {

        if (Double.isNaN(d1) || Double.isNaN(d2)) {
            return Double.compare(d1, d2);
        }

        // The sign of the grid width carries no meaning, but dividing by a negative width
        // would silently invert the resulting order.
        double gridWidth = Math.abs(precision);
        if (gridWidth == 0) {
            gridWidth = 1;
        }

        long cell1 = Math.round(d1 / gridWidth);
        long cell2 = Math.round(d2 / gridWidth);
        return Long.compare(cell1, cell2);
    }

}

View File

@ -0,0 +1,39 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.utils;
import java.util.Comparator;
import java.util.List;
import java.util.stream.Collectors;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
/**
 * Sorts the words of a line into reading order, depending on the text rotation.
 */
public class ReadingOrderHelper {

    /**
     * Returns the words ordered according to the rotation of the first word.
     *
     * @param words words to order; all are treated as having the rotation of the first one
     * @return the given list itself if it holds at most one word, otherwise a new sorted list
     * @throws IllegalArgumentException if the rotation is not one of 0, 90, 180 or 270
     */
    public static List<TextPositionSequence> orderByReadingOrder(List<TextPositionSequence> words) {

        if (words.size() <= 1) {
            return words;
        }

        Comparator<TextPositionSequence> readingOrder = comparatorForRotation(words.get(0).getRotation());
        return words.stream()
                .sorted(readingOrder)
                .collect(Collectors.toList());
    }


    /**
     * Picks the sort order for a rotation: ascending x for 0, descending y for 90,
     * descending x for 180 and ascending y for 270 (all on the top left corner).
     */
    private static Comparator<TextPositionSequence> comparatorForRotation(int rotation) {

        Comparator<TextPositionSequence> ascendingX = Comparator.comparingDouble(ReadingOrderHelper::topLeftX);
        Comparator<TextPositionSequence> ascendingY = Comparator.comparingDouble(ReadingOrderHelper::topLeftY);

        switch (rotation) {
            case 0:
                return ascendingX;
            case 90:
                return ascendingY.reversed();
            case 180:
                return ascendingX.reversed();
            case 270:
                return ascendingY;
            default:
                throw new IllegalArgumentException("Not sure what to do with this text rotation...");
        }
    }


    private static double topLeftX(TextPositionSequence word) {

        return word.getRectangle().getTopLeft().getX();
    }


    private static double topLeftY(TextPositionSequence word) {

        return word.getRectangle().getTopLeft().getY();
    }

}

View File

@ -237,8 +237,13 @@ public class PDFLinesTextStripper extends PDFTextStripper {
int startIndex = 0; int startIndex = 0;
RedTextPosition previous = null; RedTextPosition previous = null;
float direction = -1;
for (int i = 0; i <= textPositions.size() - 1; i++) { for (int i = 0; i <= textPositions.size() - 1; i++) {
if (direction == -1) {
direction = textPositions.get(i).getDir();
}
if (!textPositionSequences.isEmpty()) { if (!textPositionSequences.isEmpty()) {
previous = textPositionSequences.get(textPositionSequences.size() - 1) previous = textPositionSequences.get(textPositionSequences.size() - 1)
.getTextPositions() .getTextPositions()
@ -250,11 +255,18 @@ public class PDFLinesTextStripper extends PDFTextStripper {
continue; continue;
} }
if (textPositions.get(i).getDir() != direction && startIndex != i) {
List<TextPosition> sublist = textPositions.subList(startIndex, i);
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart, textPositionSequences.size()));
startIndex = i;
direction = textPositions.get(i).getDir();
}
// Strange but sometimes this is happening, for example: Metolachlor2.pdf // Strange but sometimes this is happening, for example: Metolachlor2.pdf
if (checkIfCurrentPositionIsToTheRightOfPreviousPosition(i, textPositions)) { if (checkIfCurrentPositionIsToTheRightOfPreviousPosition(i, textPositions)) {
List<TextPosition> sublist = textPositions.subList(startIndex, i); List<TextPosition> sublist = textPositions.subList(startIndex, i);
if (checkIfSequenceContainsOnlyWhitespaces(sublist)) { if (checkIfSequenceContainsOnlyWhitespaces(sublist)) {
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart)); textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart, textPositionSequences.size()));
} }
startIndex = i; startIndex = i;
} }
@ -262,7 +274,7 @@ public class PDFLinesTextStripper extends PDFTextStripper {
if (textPositions.get(i).getRotation() == 0 && i > 0 && textPositions.get(i).getX() > textPositions.get(i - 1).getEndX() + 1) { if (textPositions.get(i).getRotation() == 0 && i > 0 && textPositions.get(i).getX() > textPositions.get(i - 1).getEndX() + 1) {
List<TextPosition> sublist = textPositions.subList(startIndex, i); List<TextPosition> sublist = textPositions.subList(startIndex, i);
if (checkIfSequenceContainsOnlyWhitespaces(sublist)) { if (checkIfSequenceContainsOnlyWhitespaces(sublist)) {
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart)); textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart, textPositionSequences.size()));
} }
startIndex = i; startIndex = i;
} }
@ -276,10 +288,10 @@ public class PDFLinesTextStripper extends PDFTextStripper {
// Remove false sequence ends (whitespaces) // Remove false sequence ends (whitespaces)
if (checkIfGapSizeBetweenCharactersSmallerThanMaximum(previous, sublist, 0.01f)) { if (checkIfGapSizeBetweenCharactersSmallerThanMaximum(previous, sublist, 0.01f)) {
for (TextPosition t : sublist) { for (TextPosition t : sublist) {
textPositionSequences.get(textPositionSequences.size() - 1).add(t); textPositionSequences.get(textPositionSequences.size() - 1).add(t, textPositionSequences.size());
} }
} else { } else {
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart)); textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart, textPositionSequences.size()));
} }
} }
startIndex = i + 1; startIndex = i + 1;
@ -299,10 +311,10 @@ public class PDFLinesTextStripper extends PDFTextStripper {
if (previous != null && sublist.get(0).getYDirAdj() == previous.getYDirAdj() && sublist.get(0) if (previous != null && sublist.get(0).getYDirAdj() == previous.getYDirAdj() && sublist.get(0)
.getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()) < 0.01) { .getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()) < 0.01) {
for (TextPosition t : sublist) { for (TextPosition t : sublist) {
textPositionSequences.get(textPositionSequences.size() - 1).add(t); textPositionSequences.get(textPositionSequences.size() - 1).add(t, textPositionSequences.size());
} }
} else { } else {
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, isParagraphStart)); textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, isParagraphStart, textPositionSequences.size()));
} }
} }
super.writeString(text); super.writeString(text);
@ -329,6 +341,7 @@ public class PDFLinesTextStripper extends PDFTextStripper {
.getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()) < maximumGapSize; .getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()) < maximumGapSize;
} }
@Override @Override
public String getText(PDDocument doc) throws IOException { public String getText(PDDocument doc) throws IOException {

View File

@ -43,7 +43,7 @@ public class MarkedContentUtils {
return markedContentByYPosition.values().stream() return markedContentByYPosition.values().stream()
.map(textPositions -> new TextPositionSequence(textPositions.stream() .map(textPositions -> new TextPositionSequence(textPositions.stream()
.toList(), 0, true) .toList(), 0, true, 0)
.getRectangle()) .getRectangle())
.map(t -> new Rectangle2D.Float(t.getTopLeft().getX(), t.getTopLeft().getY() - Math.abs(t.getHeight()), t.getWidth(), Math.abs(t.getHeight()))).collect(Collectors.toList()); .map(t -> new Rectangle2D.Float(t.getTopLeft().getX(), t.getTopLeft().getY() - Math.abs(t.getHeight()), t.getWidth(), Math.abs(t.getHeight()))).collect(Collectors.toList());
} }

View File

@ -26,14 +26,14 @@ public class ViewerDocumentTest extends BuildDocumentTest {
@SneakyThrows @SneakyThrows
public void testViewerDocument() { public void testViewerDocument() {
String fileName = "files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf"; String fileName = "files/Plenarprotokoll 1 (keine Druchsache!) (1).pdf";
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf"; String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
var documentFile = new ClassPathResource(fileName).getFile(); var documentFile = new ClassPathResource(fileName).getFile();
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null); ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null);
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService); LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
Document document = buildGraph(fileName, LayoutParsingType.REDACT_MANAGER); Document document = buildGraph(fileName, LayoutParsingType.DOCSTRUM);
long start = System.currentTimeMillis(); long start = System.currentTimeMillis();
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true); layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true);
System.out.printf("Total time: %.2fs%n", ((float) (System.currentTimeMillis() - start)) / 1000); System.out.printf("Total time: %.2fs%n", ((float) (System.currentTimeMillis() - start)) / 1000);
@ -54,10 +54,11 @@ public class ViewerDocumentTest extends BuildDocumentTest {
var documentFile = new ClassPathResource(fileName).getFile(); var documentFile = new ClassPathResource(fileName).getFile();
var classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE, var classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE,
documentFile, documentFile,
new ImageServiceResponse(), new ImageServiceResponse(),
tableResponse, tableResponse,
new VisualLayoutParsingResponse(),Path.of(fileName).getFileName().toFile().toString()); new VisualLayoutParsingResponse(),
Path.of(fileName).getFileName().toFile().toString());
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null); ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null);
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService); LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
Document document = DocumentGraphFactory.buildDocumentGraph(classificationDocument); Document document = DocumentGraphFactory.buildDocumentGraph(classificationDocument);