From 240ef82def69fe80be39349a4353a4aaeb3c88a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dominique=20Eifl=C3=A4nder?= Date: Thu, 22 Feb 2024 11:02:50 +0100 Subject: [PATCH] RED-7141: Implemented docstrum layout parsing --- .../internal/api/queue/LayoutParsingType.java | 5 +- .../processor/LayoutParsingPipeline.java | 111 ++++---- .../model/text/TextPositionSequence.java | 11 + .../services/SectionsBuilderService.java | 2 +- .../DocstrumBlockificationService.java | 252 ++++++++++++++++++ .../docstrum/DocstrumSegmentationService.java | 48 ++++ .../services/docstrum/model/AngleFilter.java | 32 +++ .../services/docstrum/model/BoundingBox.java | 48 ++++ .../services/docstrum/model/Character.java | 84 ++++++ .../services/docstrum/model/DisjointSets.java | 194 ++++++++++++++ .../services/docstrum/model/Histogram.java | 91 +++++++ .../services/docstrum/model/Line.java | 165 ++++++++++++ .../services/docstrum/model/Neighbor.java | 36 +++ .../services/docstrum/model/Zone.java | 50 ++++ .../docstrum/service/LineBuilderService.java | 50 ++++ .../service/NearestNeighbourService.java | 78 ++++++ .../docstrum/service/ReadingOrderService.java | 99 +++++++ .../docstrum/service/SpacingService.java | 56 ++++ .../docstrum/service/ZoneBuilderService.java | 150 +++++++++++ .../services/docstrum/utils/DoubleUtils.java | 18 ++ .../server/graph/ViewerDocumentTest.java | 11 +- 21 files changed, 1539 insertions(+), 52 deletions(-) create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/DocstrumSegmentationService.java create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/AngleFilter.java create 
mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/BoundingBox.java create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/Character.java create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/DisjointSets.java create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/Histogram.java create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/Line.java create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/Neighbor.java create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/Zone.java create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/service/LineBuilderService.java create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/service/NearestNeighbourService.java create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/service/ReadingOrderService.java create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/service/SpacingService.java create mode 100644 
layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/service/ZoneBuilderService.java create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/utils/DoubleUtils.java diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/queue/LayoutParsingType.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/queue/LayoutParsingType.java index 7598d29..9e915de 100644 --- a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/queue/LayoutParsingType.java +++ b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/queue/LayoutParsingType.java @@ -3,5 +3,8 @@ package com.knecon.fforesight.service.layoutparser.internal.api.queue; public enum LayoutParsingType { REDACT_MANAGER, TAAS, - DOCUMINE + DOCUMINE, + + DOCSTRUM, + DOCSTRUM_XY } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java index 25cf3f8..3344f7b 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java @@ -1,5 +1,7 @@ package com.knecon.fforesight.service.layoutparser.processor; +import static 
com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType.DOCSTRUM; +import static com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType.DOCSTRUM_XY; import static java.lang.String.format; import java.awt.geom.Rectangle2D; @@ -26,6 +28,7 @@ import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsi import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationSection; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage; import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; @@ -43,6 +46,7 @@ import com.knecon.fforesight.service.layoutparser.processor.services.RulingClean import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService; import com.knecon.fforesight.service.layoutparser.processor.services.SimplifiedSectionTextService; import com.knecon.fforesight.service.layoutparser.processor.services.TableExtractionService; +import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocstrumBlockificationService; import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocuMineBlockificationService; import com.knecon.fforesight.service.layoutparser.processor.services.blockification.RedactManagerBlockificationService; import com.knecon.fforesight.service.layoutparser.processor.services.blockification.TaasBlockificationService; @@ -86,6 +90,7 @@ public class LayoutParsingPipeline { TaasBlockificationService taasBlockificationService; DocuMineBlockificationService docuMineBlockificationService; 
RedactManagerBlockificationService redactManagerBlockificationService; + DocstrumBlockificationService docstrumBlockificationService; LayoutGridService layoutGridService; ObservationRegistry observationRegistry; VisualLayoutParsingAdapter visualLayoutParsingAdapter; @@ -97,8 +102,7 @@ public class LayoutParsingPipeline { log.info("Starting layout parsing for {}", layoutParsingRequest.identifier()); File originFile = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId()); - File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId()) - .orElse(originFile); + File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId()).orElse(originFile); VisualLayoutParsingResponse visualLayoutParsingResponse = new VisualLayoutParsingResponse(); if (layoutParsingRequest.visualLayoutParsingFileId().isPresent()) { @@ -106,24 +110,20 @@ public class LayoutParsingPipeline { } ImageServiceResponse imageServiceResponse = new ImageServiceResponse(); - if (layoutParsingRequest.imagesFileStorageId() - .isPresent()) { - imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId() - .get()); + if (layoutParsingRequest.imagesFileStorageId().isPresent()) { + imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId().get()); } TableServiceResponse tableServiceResponse = new TableServiceResponse(); - if (layoutParsingRequest.tablesFileStorageId() - .isPresent()) { - tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId() - .get()); + if (layoutParsingRequest.tablesFileStorageId().isPresent()) { + tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId().get()); } ClassificationDocument classificationDocument = 
parseLayout(layoutParsingRequest.layoutParsingType(), - originFile, - imageServiceResponse, - tableServiceResponse, - visualLayoutParsingResponse, + originFile, + imageServiceResponse, + tableServiceResponse, + visualLayoutParsingResponse, layoutParsingRequest.identifier().toString()); log.info("Building document graph for {}", layoutParsingRequest.identifier()); @@ -156,25 +156,25 @@ public class LayoutParsingPipeline { .numberOfPages(documentGraph.getNumberOfPages()) .duration(System.currentTimeMillis() - start) .message(format(""" - Layout parsing has finished in %.02f s. - identifiers: %s - %s - Files have been saved with Ids: - Structure: %s - Text: %s - Positions: %s - PageData: %s - Simplified Text: %s - Viewer Doc: %s""", - ((float) (System.currentTimeMillis() - start)) / 1000, - layoutParsingRequest.identifier(), - buildSemanticNodeCountMessage(documentGraph.getNumberOfPages(), documentGraph.buildSemanticNodeCounts()), - layoutParsingRequest.structureFileStorageId(), - layoutParsingRequest.textBlockFileStorageId(), - layoutParsingRequest.positionBlockFileStorageId(), - layoutParsingRequest.pageFileStorageId(), - layoutParsingRequest.simplifiedTextStorageId(), - layoutParsingRequest.viewerDocumentStorageId())) + Layout parsing has finished in %.02f s. 
+ identifiers: %s + %s + Files have been saved with Ids: + Structure: %s + Text: %s + Positions: %s + PageData: %s + Simplified Text: %s + Viewer Doc: %s""", + ((float) (System.currentTimeMillis() - start)) / 1000, + layoutParsingRequest.identifier(), + buildSemanticNodeCountMessage(documentGraph.getNumberOfPages(), documentGraph.buildSemanticNodeCounts()), + layoutParsingRequest.structureFileStorageId(), + layoutParsingRequest.textBlockFileStorageId(), + layoutParsingRequest.positionBlockFileStorageId(), + layoutParsingRequest.pageFileStorageId(), + layoutParsingRequest.simplifiedTextStorageId(), + layoutParsingRequest.viewerDocumentStorageId())) .build(); } @@ -195,14 +195,14 @@ public class LayoutParsingPipeline { private String buildSemanticNodeCountMessage(int numberOfPages, Map semanticNodeCounts) { return String.format("%d pages with %d sections, %d headlines, %d paragraphs, %d tables with %d cells, %d headers, and %d footers parsed", - numberOfPages, - semanticNodeCounts.get(NodeType.SECTION) == null ? 0 : semanticNodeCounts.get(NodeType.SECTION), - semanticNodeCounts.get(NodeType.HEADLINE) == null ? 0 : semanticNodeCounts.get(NodeType.HEADLINE), - semanticNodeCounts.get(NodeType.PARAGRAPH) == null ? 0 : semanticNodeCounts.get(NodeType.PARAGRAPH), - semanticNodeCounts.get(NodeType.TABLE) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE), - semanticNodeCounts.get(NodeType.TABLE_CELL) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE_CELL), - semanticNodeCounts.get(NodeType.HEADER) == null ? 0 : semanticNodeCounts.get(NodeType.HEADER), - semanticNodeCounts.get(NodeType.FOOTER) == null ? 0 : semanticNodeCounts.get(NodeType.FOOTER)); + numberOfPages, + semanticNodeCounts.get(NodeType.SECTION) == null ? 0 : semanticNodeCounts.get(NodeType.SECTION), + semanticNodeCounts.get(NodeType.HEADLINE) == null ? 0 : semanticNodeCounts.get(NodeType.HEADLINE), + semanticNodeCounts.get(NodeType.PARAGRAPH) == null ? 
0 : semanticNodeCounts.get(NodeType.PARAGRAPH), + semanticNodeCounts.get(NodeType.TABLE) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE), + semanticNodeCounts.get(NodeType.TABLE_CELL) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE_CELL), + semanticNodeCounts.get(NodeType.HEADER) == null ? 0 : semanticNodeCounts.get(NodeType.HEADER), + semanticNodeCounts.get(NodeType.FOOTER) == null ? 0 : semanticNodeCounts.get(NodeType.FOOTER)); } @@ -220,7 +220,7 @@ public class LayoutParsingPipeline { Map> pdfTableCells = cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse); Map> pdfImages = imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse); Map> signatures = new HashMap<>(); - if(signatures.size() > 0) { + if (signatures.size() > 0) { visualLayoutParsingAdapter.buildExtractedSignaturesPerPage(visualLayoutParsingResponse); } @@ -266,6 +266,8 @@ public class LayoutParsingPipeline { case REDACT_MANAGER -> redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical()); case TAAS -> taasBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical()); case DOCUMINE -> docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical()); + case DOCSTRUM -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical(), false); + case DOCSTRUM_XY -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical(), true); }; classificationPage.setCleanRulings(cleanRulings); classificationPage.setRotation(rotation); @@ -283,12 +285,16 @@ public class LayoutParsingPipeline { imageServiceResponseAdapter.findOcr(classificationPage); } - if(signatures.containsKey(pageNumber)) { + if 
(signatures.containsKey(pageNumber)) { classificationPage.setImages(signatures.get(pageNumber)); } tableExtractionService.extractTables(cleanRulings, classificationPage); + if (layoutParsingType == DOCSTRUM || layoutParsingType == DOCSTRUM_XY) { + docstrumBlockificationService.combineBlocks(classificationPage); + } + buildPageStatistics(classificationPage); increaseDocumentStatistics(classificationPage, classificationDocument); @@ -304,11 +310,26 @@ public class LayoutParsingPipeline { case TAAS -> taasClassificationService.classifyDocument(classificationDocument); case DOCUMINE -> docuMineClassificationService.classifyDocument(classificationDocument); case REDACT_MANAGER -> redactManagerClassificationService.classifyDocument(classificationDocument); + case DOCSTRUM_XY -> redactManagerClassificationService.classifyDocument(classificationDocument); } log.info("Building Sections for {}", identifier); - sectionsBuilderService.buildSections(classificationDocument); - sectionsBuilderService.addImagesToSections(classificationDocument); + + if (layoutParsingType == DOCSTRUM || layoutParsingType == DOCSTRUM_XY) { + // Currently, for debugging, return paragraphs as sections, because there is merging logic in sectionBuilder + List sections = new ArrayList<>(); + for (var page : classificationPages) { + page.getTextBlocks().forEach(block -> { + block.setPage(page.getPageNumber()); + var section = sectionsBuilderService.buildTextBlock(List.of(block), "a"); + sections.add(section); + }); + } + classificationDocument.setSections(sections); + } else { + sectionsBuilderService.buildSections(classificationDocument); + sectionsBuilderService.addImagesToSections(classificationDocument); + } return classificationDocument; } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPositionSequence.java 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPositionSequence.java index 82829c6..dc77c45 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPositionSequence.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPositionSequence.java @@ -55,6 +55,17 @@ public class TextPositionSequence implements CharSequence { } + public TextPositionSequence(List textPositions, int page) { + + this.textPositions = textPositions; + this.page = page; + this.dir = TextDirection.fromDegrees(textPositions.get(0).getDir()); + this.rotation = textPositions.get(0).getRotation(); + this.pageHeight = textPositions.get(0).getPageHeight(); + this.pageWidth = textPositions.get(0).getPageWidth(); + } + + @Override public int length() { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/SectionsBuilderService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/SectionsBuilderService.java index 04cc930..ae8be2a 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/SectionsBuilderService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/SectionsBuilderService.java @@ -240,7 +240,7 @@ public class SectionsBuilderService { } - private ClassificationSection buildTextBlock(List wordBlockList, String lastHeadline) { + public ClassificationSection buildTextBlock(List wordBlockList, String lastHeadline) { ClassificationSection section = new ClassificationSection(); diff --git 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java new file mode 100644 index 0000000..5c45105 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java @@ -0,0 +1,252 @@ +package com.knecon.fforesight.service.layoutparser.processor.services.blockification; + +import static java.util.stream.Collectors.toSet; + +import java.util.ArrayList; +import java.util.Comparator; +import java.util.List; +import java.util.ListIterator; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.springframework.stereotype.Service; + +import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; +import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter; +import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; +import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.DocstrumSegmentationService; +import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil; + +import lombok.RequiredArgsConstructor; + +@SuppressWarnings("all") +@Service 
+@RequiredArgsConstructor +public class DocstrumBlockificationService { + + private final DocstrumSegmentationService docstrumSegmentationService; + + static final float THRESHOLD = 1f; + Pattern pattern = Pattern.compile("^(\\p{Digit}{1,3}\\.){0,3}\\p{Digit}{1,3}[\\p{Lower}.]?", Pattern.CASE_INSENSITIVE); + + + public ClassificationPage blockify(List textPositions, List horizontalRulingLines, List verticalRulingLines, boolean xyOder) { + + List abstractPageBlocks = new ArrayList<>(); + var zones = docstrumSegmentationService.segmentPage(textPositions, xyOder); + zones.forEach(zone -> { + + List textPositionSequences = new ArrayList<>(); + zone.getLines().forEach(line -> { + line.getWords().forEach(word -> { + textPositionSequences.add(new TextPositionSequence(word.getTextPositions(), word.getPage())); + }); + }); + + abstractPageBlocks.addAll(splitZonesAtRulings(textPositionSequences, horizontalRulingLines, verticalRulingLines)); + }); + + return new ClassificationPage(abstractPageBlocks); + } + + + public void combineBlocks(ClassificationPage page) { + + TextPageBlock previous = new TextPageBlock(); + ListIterator itty = page.getTextBlocks().listIterator(); + while (itty.hasNext()) { + AbstractPageBlock block = itty.next(); + if (block instanceof TablePageBlock) { + continue; + } + TextPageBlock current = (TextPageBlock) block; + + if (previous != null) { + Matcher matcher = pattern.matcher(previous.getText().toString()); + if (matcher.matches() && Math.abs(previous.getMinY() - current.getMinY()) < 1) { + previous.getSequences().addAll(current.getSequences()); + previous = buildTextBlock(previous.getSequences(), 0); + itty.remove(); + } + } + previous = current; + } + } + + + public List splitZonesAtRulings(List textPositions, List horizontalRulingLines, List verticalRulingLines) { + + int indexOnPage = 0; + List chunkWords = new ArrayList<>(); + List chunkBlockList = new ArrayList<>(); + + float minX = 1000, maxX = 0, minY = 1000, maxY = 0; + 
TextPositionSequence prev = null; + + Float splitX1 = null; + for (TextPositionSequence word : textPositions) { + + boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, horizontalRulingLines, verticalRulingLines); + boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir()); + + if (prev != null && (splitByDir || isSplitByRuling)) { + + TextPageBlock cb1 = buildTextBlock(chunkWords, indexOnPage); + indexOnPage++; + + chunkBlockList.add(cb1); + chunkWords = new ArrayList<>(); + + minX = 1000; + maxX = 0; + minY = 1000; + maxY = 0; + prev = null; + } + + chunkWords.add(word); + + prev = word; + if (word.getMinXDirAdj() < minX) { + minX = word.getMinXDirAdj(); + } + if (word.getMaxXDirAdj() > maxX) { + maxX = word.getMaxXDirAdj(); + } + if (word.getMinYDirAdj() < minY) { + minY = word.getMinYDirAdj(); + } + if (word.getMaxYDirAdj() > maxY) { + maxY = word.getMaxYDirAdj(); + } + } + + TextPageBlock cb1 = buildTextBlock(chunkWords, indexOnPage); + if (cb1 != null) { + chunkBlockList.add(cb1); + } + + return chunkBlockList; + } + + + private boolean equalsWithThreshold(float f1, float f2) { + + return Math.abs(f1 - f2) < THRESHOLD; + } + + + private TextPageBlock buildTextBlock(List wordBlockList, int indexOnPage) { + + TextPageBlock textBlock = null; + + FloatFrequencyCounter lineHeightFrequencyCounter = new FloatFrequencyCounter(); + FloatFrequencyCounter fontSizeFrequencyCounter = new FloatFrequencyCounter(); + FloatFrequencyCounter spaceFrequencyCounter = new FloatFrequencyCounter(); + StringFrequencyCounter fontFrequencyCounter = new StringFrequencyCounter(); + StringFrequencyCounter styleFrequencyCounter = new StringFrequencyCounter(); + + for (TextPositionSequence wordBlock : wordBlockList) { + + lineHeightFrequencyCounter.add(wordBlock.getTextHeight()); + fontSizeFrequencyCounter.add(wordBlock.getFontSize()); + spaceFrequencyCounter.add(wordBlock.getSpaceWidth()); + fontFrequencyCounter.add(wordBlock.getFont()); + 
styleFrequencyCounter.add(wordBlock.getFontStyle()); + + if (textBlock == null) { + textBlock = new TextPageBlock(wordBlock.getMinXDirAdj(), + wordBlock.getMaxXDirAdj(), + wordBlock.getMinYDirAdj(), + wordBlock.getMaxYDirAdj(), + wordBlockList, + wordBlock.getRotation()); + } else { + TextPageBlock spatialEntity = textBlock.union(wordBlock); + textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight()); + } + } + + if (textBlock != null) { + textBlock.setMostPopularWordFont(fontFrequencyCounter.getMostPopular()); + textBlock.setMostPopularWordStyle(styleFrequencyCounter.getMostPopular()); + textBlock.setMostPopularWordFontSize(fontSizeFrequencyCounter.getMostPopular()); + textBlock.setMostPopularWordHeight(lineHeightFrequencyCounter.getMostPopular()); + textBlock.setMostPopularWordSpaceWidth(spaceFrequencyCounter.getMostPopular()); + textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest()); + } + + if (textBlock != null && textBlock.getSequences() != null && textBlock.getSequences().stream().map(t -> round(t.getMinYDirAdj(), 3)).collect(toSet()).size() == 1) { + textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj)); + } + return textBlock; + } + + + private boolean isSplitByRuling(float minX, + float minY, + float maxX, + float maxY, + TextPositionSequence word, + List horizontalRulingLines, + List verticalRulingLines) { + + return isSplitByRuling(maxX, + minY, + word.getMinXDirAdj(), + word.getMinYDirAdj(), + verticalRulingLines, + word.getDir().getDegrees(), + word.getPageWidth(), + word.getPageHeight()) // + || isSplitByRuling(minX, + minY, + word.getMinXDirAdj(), + word.getMaxYDirAdj(), + horizontalRulingLines, + word.getDir().getDegrees(), + word.getPageWidth(), + word.getPageHeight()) // + || isSplitByRuling(maxX, + minY, + word.getMinXDirAdj(), + word.getMinYDirAdj(), + horizontalRulingLines, + word.getDir().getDegrees(), + word.getPageWidth(), + 
word.getPageHeight()) // + || isSplitByRuling(minX, + minY, + word.getMinXDirAdj(), + word.getMaxYDirAdj(), + verticalRulingLines, + word.getDir().getDegrees(), + word.getPageWidth(), + word.getPageHeight()); + } + + + private boolean isSplitByRuling(float previousX2, float previousY1, float currentX1, float currentY1, List rulingLines, float dir, float pageWidth, float pageHeight) { + + for (Ruling ruling : rulingLines) { + var line = RulingTextDirAdjustUtil.convertToDirAdj(ruling, dir, pageWidth, pageHeight); + if (line.intersectsLine(previousX2, previousY1, currentX1, currentY1)) { + return true; + } + } + return false; + } + + + private double round(float value, int decimalPoints) { + + var d = Math.pow(10, decimalPoints); + return Math.round(value * d) / d; + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/DocstrumSegmentationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/DocstrumSegmentationService.java new file mode 100644 index 0000000..fbf92da --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/DocstrumSegmentationService.java @@ -0,0 +1,48 @@ +package com.knecon.fforesight.service.layoutparser.processor.services.docstrum; + +import java.util.List; +import java.util.stream.Collectors; + +import org.springframework.stereotype.Service; + +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Character; +import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Zone; +import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service.LineBuilderService; +import 
com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service.NearestNeighbourService; +import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service.ReadingOrderService; +import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service.SpacingService; +import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service.ZoneBuilderService; + +import lombok.RequiredArgsConstructor; + +@Service +@RequiredArgsConstructor +public class DocstrumSegmentationService { + + private final NearestNeighbourService nearestNeighbourService; + private final SpacingService spacingService; + private final LineBuilderService lineBuilderService; + private final ZoneBuilderService zoneBuilderService; + private final ReadingOrderService readingOrderService; + + + public List segmentPage(List textPositions, boolean xyOder) { + + var positions = textPositions.stream().map(TextPositionSequence::getTextPositions).flatMap(List::stream).toList(); + + var characters = positions.stream().map(Character::new).collect(Collectors.toList()); + + nearestNeighbourService.findNearestNeighbors(characters); + + var characterSpacing = spacingService.computeCharacterSpacing(characters); + var lineSpacing = spacingService.computeLineSpacing(characters); + + var lines = lineBuilderService.buildLines(characters, characterSpacing, lineSpacing); + + var zones = zoneBuilderService.buildZones(lines, characterSpacing, lineSpacing); + + return readingOrderService.resolve(zones, xyOder); + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/AngleFilter.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/AngleFilter.java new file mode 100644 index 0000000..de0f4f9 --- /dev/null +++ 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/AngleFilter.java
@@ -0,0 +1,48 @@
+package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model;
+
+/**
+ * Half-open angular interval {@code [lowerAngle, upperAngle)} used to select neighbours by direction.
+ * Bounds are folded once into {@code [-PI/2, PI/2)}; when folding makes the lower bound exceed the upper
+ * bound the interval is treated as wrapping around the +/-PI/2 seam.
+ */
+public class AngleFilter {
+
+    protected double lowerAngle;
+    protected double upperAngle;
+
+
+    /**
+     * @param lowerAngle inclusive lower bound in radians; shifted up by PI if below -PI/2
+     * @param upperAngle exclusive upper bound in radians; shifted down by PI if at or above PI/2
+     */
+    public AngleFilter(double lowerAngle, double upperAngle) {
+
+        if (lowerAngle < -Math.PI / 2) {
+            lowerAngle += Math.PI;
+        }
+        if (upperAngle >= Math.PI / 2) {
+            upperAngle -= Math.PI;
+        }
+
+        this.lowerAngle = lowerAngle;
+        this.upperAngle = upperAngle;
+    }
+
+
+    /**
+     * Tests whether the neighbour's angle lies inside the interval.
+     *
+     * @param neighbor neighbour whose {@link Neighbor#getAngle()} is tested
+     * @return {@code true} if the angle is accepted by this filter
+     */
+    public boolean matches(Neighbor neighbor) {
+
+        double angle = neighbor.getAngle();
+        if (lowerAngle <= upperAngle) {
+            return lowerAngle <= angle && angle < upperAngle;
+        }
+        // Wrapped interval: accept everything outside the gap [upperAngle, lowerAngle).
+        return lowerAngle <= angle || angle < upperAngle;
+    }
+
+}
\ No newline at end of file
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/BoundingBox.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/BoundingBox.java
new file mode 100644
index 0000000..5215d6f
--- /dev/null
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/BoundingBox.java
@@ -0,0 +1,62 @@
+package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model;
+
+import java.awt.geom.Rectangle2D;
+
+import lombok.Data;
+
+/**
+ * Base class for layout elements that carry an axis-aligned bounding rectangle.
+ * The accessors below simply delegate to {@link #bBox}.
+ */
+@Data
+public abstract class BoundingBox {
+
+    private Rectangle2D bBox;
+
+
+    public double getX() {
+
+        return bBox.getX();
+    }
+
+
+    public double getY() {
+
+        return bBox.getY();
+    }
+
+
+    public double getWidth() {
+
+        return bBox.getWidth();
+    }
+
+
+    public double getHeight() {
+
+        return bBox.getHeight();
+    }
+
+
+    public double getArea() {
+
+        return (bBox.getHeight() * bBox.getWidth());
+    }
+
+
+    /**
+     * Checks whether {@code contained} lies inside this box when every edge may be off by {@code tolerance}.
+     *
+     * @param contained rectangle to test
+     * @param tolerance slack allowed on each of the four edges
+     * @return {@code true} if all four edges of {@code contained} are within the box, allowing the tolerance
+     */
+    public boolean contains(Rectangle2D contained, double tolerance) {
+
+        return bBox.getX() <= contained.getX() + tolerance
+                && bBox.getY() <= contained.getY() + tolerance
+                && bBox.getX() + bBox.getWidth() >= contained.getX() + contained.getWidth() - tolerance
+                && bBox.getY() + bBox.getHeight() >= contained.getY() + contained.getHeight() - tolerance;
+    }
+
+}
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/Character.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/Character.java
new file mode 100644
index 0000000..987665a
--- /dev/null
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/Character.java
@@ -0,0 +1,100 @@
+package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
+
+import lombok.Data;
+
+/**
+ * A single glyph reduced to the centre point of its text position, plus its nearest neighbours.
+ */
+@Data
+public class Character {
+
+    private final double x;
+    private final double y;
+    private final RedTextPosition textPosition;
+
+    private List<Neighbor> neighbors = new ArrayList<>();
+
+
+    /**
+     * @param chunk text position whose direction-adjusted box defines the centre ({@link #x}, {@link #y})
+     */
+    public Character(RedTextPosition chunk) {
+
+        this.x = chunk.getXDirAdj() + chunk.getWidthDirAdj() / 2;
+        this.y = chunk.getYDirAdj() + chunk.getHeightDir() / 2;
+        this.textPosition = chunk;
+    }
+
+
+    public double getHeight() {
+
+        return textPosition.getHeightDir();
+    }
+
+
+    /** Euclidean distance between the two centre points. */
+    public double distance(Character character) {
+
+        double dx = getX() - character.getX();
+        double dy = getY() - character.getY();
+        return Math.sqrt(dx * dx + dy * dy);
+    }
+
+
+    public double horizontalDistance(Character character) {
+
+        return Math.abs(getX() - character.getX());
+    }
+
+
+    public double verticalDistance(Character character) {
+
+        return Math.abs(getY() - character.getY());
+    }
+
+
+    /**
+     * Signed overlap of the two horizontal extents {@code [x, x + width]}: the length of the shared span
+     * when they overlap, otherwise the negated size of the gap between them.
+     * NOTE(review): the extents start at the centre {@link #x}, not at the glyph's left edge - confirm intended.
+     */
+    public double overlappingDistance(Character other) {
+
+        double[] xs = new double[4];
+        // Rotation by angle 0, i.e. s == 0 and c == 1, so only the x terms contribute.
+        double s = Math.sin(-0), c = Math.cos(-0);
+        xs[0] = c * x - s * y;
+        xs[1] = c * (x + textPosition.getWidthDirAdj()) - s * (y + textPosition.getHeightDir());
+        xs[2] = c * other.x - s * other.y;
+        xs[3] = c * (other.x + other.textPosition.getWidthDirAdj()) - s * (other.y + other.textPosition.getHeightDir());
+        boolean overlapping = xs[1] >= xs[2] && xs[3] >= xs[0];
+        Arrays.sort(xs);
+        return Math.abs(xs[2] - xs[1]) * (overlapping ? 1 : -1);
+    }
+
+
+    public void setNeighbors(List<Neighbor> neighbors) {
+
+        this.neighbors = neighbors;
+    }
+
+
+    /**
+     * Angle of the segment joining the two centres, always measured from the left point to the right point.
+     */
+    public double angle(Character character) {
+
+        if (getX() > character.getX()) {
+            return Math.atan2(getY() - character.getY(), getX() - character.getX());
+        } else {
+            return Math.atan2(character.getY() - getY(), character.getX() - getX());
+        }
+    }
+
+}
\ No newline at end of file
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/DisjointSets.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/DisjointSets.java
new file mode 100644
index 0000000..bc20874
--- /dev/null
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/DisjointSets.java
@@ -0,0 +1,207 @@
+package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model;
+
+import java.util.AbstractSet;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.NoSuchElementException;
+import java.util.Set;
+
+/**
+ * Union-find over a fixed collection of elements; iterating yields one read-only {@link Set} per group.
+ * Elements are looked up through a {@link HashMap}, so they must keep stable {@code equals}/{@code hashCode}.
+ *
+ * @param <E> element type
+ */
+public class DisjointSets<E> implements Iterable<Set<E>> {
+
+    private final Map<E, Entry<E>> map = new HashMap<>();
+
+
+    /** Creates one singleton group per element of {@code collection}. */
+    public DisjointSets(Collection<? extends E> collection) {
+
+        for (E element : collection) {
+            map.put(element, new Entry<>(element));
+        }
+    }
+
+
+    /** Returns whether both elements currently belong to the same group. */
+    public boolean areTogether(E e1, E e2) {
+
+        return map.get(e1).findRepresentative() == map.get(e2).findRepresentative();
+    }
+
+
+    /** Merges the groups of {@code e1} and {@code e2}; the smaller group is attached to the larger one. */
+    public void union(E e1, E e2) {
+
+        Entry<E> r1 = map.get(e1).findRepresentative();
+        Entry<E> r2 = map.get(e2).findRepresentative();
+        if (r1 != r2) {
+            if (r1.size <= r2.size) {
+                r2.mergeWith(r1);
+            } else {
+                r1.mergeWith(r2);
+            }
+        }
+    }
+
+
+    @Override
+    public Iterator<Set<E>> iterator() {
+
+        return new Iterator<>() {
+
+            private final Iterator<Entry<E>> iterator = map.values().iterator();
+            private Entry<E> nextRepresentative;
+
+            {
+                findNextRepresentative();
+            }
+
+            @Override
+            public boolean hasNext() {
+
+                return nextRepresentative != null;
+            }
+
+
+            @Override
+            public Set<E> next() {
+
+                if (nextRepresentative == null) {
+                    throw new NoSuchElementException();
+                }
+                Set<E> result = nextRepresentative.asSet();
+                findNextRepresentative();
+                return result;
+            }
+
+
+            private void findNextRepresentative() {
+
+                while (iterator.hasNext()) {
+                    Entry<E> candidate = iterator.next();
+                    if (candidate.isRepresentative()) {
+                        nextRepresentative = candidate;
+                        return;
+                    }
+                }
+                nextRepresentative = null;
+            }
+
+
+            @Override
+            public void remove() {
+
+                throw new UnsupportedOperationException();
+            }
+
+        };
+    }
+
+
+    /** Node of the union-find forest; group members are additionally chained through {@link #next}. */
+    private static class Entry<E> {
+
+        private int size = 1;
+        private final E value;
+        private Entry<E> parent = this;
+        private Entry<E> next = null;
+        private Entry<E> last = this;
+
+
+        Entry(E value) {
+
+            this.value = value;
+        }
+
+
+        /** Appends the other group's member chain to this one and makes this entry its parent. */
+        void mergeWith(Entry<E> otherRepresentative) {
+
+            size += otherRepresentative.size;
+            last.next = otherRepresentative;
+            last = otherRepresentative.last;
+            otherRepresentative.parent = this;
+        }
+
+
+        /** Finds the root of this entry's tree and re-points every entry on the path directly at it. */
+        Entry<E> findRepresentative() {
+
+            Entry<E> representative = parent;
+            while (representative.parent != representative) {
+                representative = representative.parent;
+            }
+            for (Entry<E> entry = this; entry != representative; ) {
+                Entry<E> nextEntry = entry.parent;
+                entry.parent = representative;
+                entry = nextEntry;
+            }
+            return representative;
+        }
+
+
+        boolean isRepresentative() {
+
+            return parent == this;
+        }
+
+
+        /** Read-only view over all values of this entry's group. */
+        Set<E> asSet() {
+
+            return new AbstractSet<E>() {
+
+                @Override
+                public Iterator<E> iterator() {
+
+                    return new Iterator<E>() {
+
+                        private Entry<E> nextEntry = findRepresentative();
+
+
+                        @Override
+                        public boolean hasNext() {
+
+                            return nextEntry != null;
+                        }
+
+
+                        @Override
+                        public E next() {
+
+                            if (nextEntry == null) {
+                                throw new NoSuchElementException();
+                            }
+                            E result = nextEntry.value;
+                            nextEntry = nextEntry.next;
+                            return result;
+                        }
+
+
+                        @Override
+                        public void remove() {
+
+                            throw new UnsupportedOperationException();
+                        }
+
+                    };
+                }
+
+
+                @Override
+                public int size() {
+
+                    return findRepresentative().size;
+                }
+            };
+        }
+
+    }
+
+}
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/Histogram.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/Histogram.java
new file mode 100644
index 0000000..4b6913a
--- /dev/null
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/Histogram.java
@@ -0,0 +1,115 @@
+package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model;
+
+/**
+ * Fixed-range histogram with equally wide bins, Gaussian smoothing and peak lookup.
+ */
+public class Histogram {
+
+    private static final double EPSILON = 1.0e-6;
+    private final double min;
+    private final double resolution;
+    private double[] frequencies;
+
+
+    /**
+     * @param minValue smallest value that will be added
+     * @param maxValue largest value that will be added
+     * @param resolution requested bin width; the effective width is adjusted so the bins tile the range exactly
+     */
+    public Histogram(double minValue, double maxValue, double resolution) {
+
+        // The range is widened by EPSILON on both sides so minValue and maxValue fall strictly inside a bin.
+        this.min = minValue - EPSILON;
+        double delta = maxValue - minValue + 2 * EPSILON;
+        int size = Math.max(1, (int) Math.round((maxValue - minValue) / resolution));
+        this.resolution = delta / size;
+        this.frequencies = new double[size];
+    }
+
+
+    /**
+     * Replaces the bin values by the result of sliding {@code kernel} over them; contributions that would
+     * fall outside the histogram are dropped.
+     */
+    public void kernelSmooth(double[] kernel) {
+
+        double[] newFrequencies = new double[frequencies.length];
+        int shift = (kernel.length - 1) / 2;
+        for (int i = 0; i < kernel.length; i++) {
+            int jStart = Math.max(0, i - shift);
+            int jEnd = Math.min(frequencies.length, frequencies.length + i - shift);
+            for (int j = jStart; j < jEnd; j++) {
+                newFrequencies[j - i + shift] += kernel[i] * frequencies[j];
+            }
+        }
+        frequencies = newFrequencies;
+    }
+
+
+    /**
+     * Builds a normalised Gaussian kernel (entries sum to 1) with an odd number of entries.
+     *
+     * @param length window length in value units, converted to bins using the bin width
+     * @param stdDeviation standard deviation in value units
+     */
+    public double[] createGaussianKernel(double length, double stdDeviation) {
+
+        int r = (int) Math.round(length / resolution) / 2;
+        stdDeviation /= resolution;
+
+        int size = 2 * r + 1;
+        double[] kernel = new double[size];
+        double sum = 0;
+        double b = 2 * stdDeviation * stdDeviation;
+        double a = 1 / Math.sqrt(Math.PI * b);
+        for (int i = 0; i < size; i++) {
+            kernel[i] = a * Math.exp(-(i - r) * (i - r) / b);
+            sum += kernel[i];
+        }
+        for (int i = 0; i < size; i++) {
+            kernel[i] /= sum;
+        }
+        return kernel;
+    }
+
+
+    public void gaussianSmooth(double windowLength, double stdDeviation) {
+
+        kernelSmooth(createGaussianKernel(windowLength, stdDeviation));
+    }
+
+
+    /** Increments the bin containing {@code value}; the value must lie within the constructor's range. */
+    public void add(double value) {
+
+        frequencies[(int) ((value - min) / resolution)] += 1.0;
+    }
+
+
+    public int getSize() {
+
+        return frequencies.length;
+    }
+
+
+    /**
+     * Returns the value at the centre of the highest bin; when the bins right after it hold the same
+     * frequency (within 1e-4) the centre of that whole plateau is returned.
+     */
+    public double getPeakValue() {
+
+        int peakIndex = 0;
+        for (int i = 1; i < frequencies.length; i++) {
+            if (frequencies[i] > frequencies[peakIndex]) {
+                peakIndex = i;
+            }
+        }
+        int peakEndIndex = peakIndex + 1;
+        final double EPS = 0.0001;
+        while (peakEndIndex < frequencies.length && Math.abs(frequencies[peakEndIndex] - frequencies[peakIndex]) < EPS) {
+            peakEndIndex++;
+        }
+        return ((double) peakIndex + peakEndIndex) / 2 * resolution + min;
+    }
+
+}
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/Line.java
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/Line.java
new file mode 100644
index 0000000..70b8c75
--- /dev/null
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/Line.java
@@ -0,0 +1,197 @@
+package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model;
+
+import java.awt.geom.Rectangle2D;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
+
+import lombok.Data;
+
+/**
+ * A text line: its characters, the straight segment fitted through their centres, the words obtained
+ * by splitting at large gaps, and the bounding box of all glyphs.
+ */
+@Data
+public class Line extends BoundingBox {
+
+    private static final double WORD_DISTANCE_MULTIPLIER = 0.2;
+
+    private final double x0;
+    private final double y0;
+
+    private final double x1;
+    private final double y1;
+
+    private final double height;
+
+    private final List<Character> characters;
+    private final List<TextPositionSequence> words = new ArrayList<>();
+
+
+    /**
+     * @param characters non-empty list of characters; the first and last entry define the segment's x range
+     * @param wordSpacing reference spacing; gaps wider than {@code wordSpacing * WORD_DISTANCE_MULTIPLIER} start a new word
+     */
+    public Line(List<Character> characters, double wordSpacing) {
+
+        this.characters = characters;
+
+        if (characters.size() >= 2) {
+            // linear regression y = a + b * x through the character centres
+            double sx = 0.0, sxx = 0.0, sxy = 0.0, sy = 0.0;
+            double minX = Double.POSITIVE_INFINITY, maxX = Double.NEGATIVE_INFINITY;
+            for (Character character : characters) {
+                sx += character.getX();
+                sxx += character.getX() * character.getX();
+                sxy += character.getX() * character.getY();
+                sy += character.getY();
+                minX = Math.min(minX, character.getX());
+                maxX = Math.max(maxX, character.getX());
+            }
+            // When all centres share one x (e.g. glyphs printed on top of each other) the slope is 0/0.
+            // The resulting NaN would spread into y0/y1, the angle and the length, so use a flat line instead.
+            double b = 0.0;
+            if (maxX > minX) {
+                b = (characters.size() * sxy - sx * sy) / (characters.size() * sxx - sx * sx);
+                if (!Double.isFinite(b)) {
+                    b = 0.0;
+                }
+            }
+            double a = (sy - b * sx) / characters.size();
+
+            this.x0 = characters.get(0).getX();
+            this.y0 = a + b * this.x0;
+            this.x1 = characters.get(characters.size() - 1).getX();
+            this.y1 = a + b * this.x1;
+        } else {
+            // Single character: a short horizontal segment centred on it.
+            Character character = characters.get(0);
+            double dx = character.getTextPosition().getWidthDirAdj() / 3;
+            double dy = dx * Math.tan(0);
+            this.x0 = character.getX() - dx;
+            this.x1 = character.getX() + dx;
+            this.y0 = character.getY() - dy;
+            this.y1 = character.getY() + dy;
+        }
+        height = computeHeight();
+        computeWords(wordSpacing * WORD_DISTANCE_MULTIPLIER);
+        buildBBox();
+    }
+
+
+    /** Angle of the fitted segment in radians. */
+    public double getAngle() {
+
+        return Math.atan2(y1 - y0, x1 - x0);
+    }
+
+
+    /** Length of the fitted segment. */
+    public double getLength() {
+
+        return Math.sqrt((x0 - x1) * (x0 - x1) + (y0 - y1) * (y0 - y1));
+    }
+
+
+    /** Mean character height; stored in {@link #height}. */
+    private double computeHeight() {
+
+        double sum = 0.0;
+        for (Character component : characters) {
+            sum += component.getHeight();
+        }
+        return sum / characters.size();
+    }
+
+
+    /** Absolute difference of the two line angles; differences above PI/2 are mirrored as PI - diff. */
+    public double angularDifference(Line j) {
+
+        double diff = Math.abs(getAngle() - j.getAngle());
+        if (diff <= Math.PI / 2) {
+            return diff;
+        } else {
+            return Math.PI - diff;
+        }
+    }
+
+
+    /**
+     * Signed overlap of the x ranges {@code [x0, x1]} of both lines: the shared length when they overlap,
+     * otherwise the negated gap between them.
+     */
+    public double horizontalDistance(Line other) {
+
+        double[] xs = new double[4];
+        xs[0] = x0;
+        xs[1] = x1;
+        xs[2] = other.x0;
+        xs[3] = other.x1;
+        boolean overlapping = xs[1] >= xs[2] && xs[3] >= xs[0];
+        Arrays.sort(xs);
+        return Math.abs(xs[2] - xs[1]) * (overlapping ? 1 : -1);
+    }
+
+
+    /** Distance between the vertical midpoints of the two fitted segments. */
+    public double verticalDistance(Line other) {
+
+        double ym = (y0 + y1) / 2;
+        double yn = (other.y0 + other.y1) / 2;
+        return Math.abs(ym - yn) / Math.sqrt(1);
+    }
+
+
+    /** Splits the characters into words wherever the gap to the previous glyph exceeds {@code wordSpacing}. */
+    private void computeWords(double wordSpacing) {
+
+        TextPositionSequence word = new TextPositionSequence();
+        Character previous = null;
+        for (Character current : characters) {
+            if (previous != null) {
+                double dist = current.getTextPosition().getXDirAdj() - previous.getTextPosition().getXDirAdj() - previous.getTextPosition().getWidthDirAdj();
+                if (dist > wordSpacing) {
+                    words.add(word);
+                    word = new TextPositionSequence();
+                }
+            }
+            word.getTextPositions().add(current.getTextPosition());
+            previous = current;
+        }
+        words.add(word);
+    }
+
+
+    /** Sets the bounding box to the union of all glyph rectangles. */
+    private void buildBBox() {
+
+        double minX = Double.POSITIVE_INFINITY;
+        double minY = Double.POSITIVE_INFINITY;
+        double maxX = Double.NEGATIVE_INFINITY;
+        double maxY = Double.NEGATIVE_INFINITY;
+
+        for (Character character : characters) {
+
+            minX = Math.min(minX, character.getTextPosition().getXDirAdj());
+            minY = Math.min(minY, character.getTextPosition().getYDirAdj());
+            maxX = Math.max(maxX, character.getTextPosition().getXDirAdj() + character.getTextPosition().getWidthDirAdj());
+            maxY = Math.max(maxY, character.getTextPosition().getYDirAdj() + character.getTextPosition().getHeightDir());
+
+        }
+
+        this.setBBox(new Rectangle2D.Double(minX, minY, maxX - minX, maxY - minY));
+    }
+
+
+    @Override
+    public String toString() {
+
+        StringBuilder sb = new StringBuilder();
+        words.forEach(word -> sb.append(word.toString()).append(" "));
+        return sb.toString().trim();
+    }
+
+}
+
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/Neighbor.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/Neighbor.java
new file mode 100644
index 0000000..b2b4174
--- /dev/null
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/Neighbor.java
@@ -0,0 +1,44 @@
+package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model;
+
+import lombok.Getter;
+
+/**
+ * Relation between an origin character and one of its neighbours, with the distance and angle
+ * between their centres computed once at construction.
+ */
+public class Neighbor {
+
+    @Getter
+    private final double distance;
+    @Getter
+    private final double angle;
+    private final Character originCharacter;
+    @Getter
+    private final Character character;
+
+
+    /**
+     * @param neighbor the neighbouring character
+     * @param origin the character whose neighbourhood is being described
+     */
+    public Neighbor(Character neighbor, Character origin) {
+
+        this.distance = neighbor.distance(origin);
+        this.angle = neighbor.angle(origin);
+        this.character = neighbor;
+        this.originCharacter = origin;
+    }
+
+
+    public double getHorizontalDistance() {
+
+        return character.horizontalDistance(originCharacter);
+    }
+
+
+    public double getVerticalDistance() {
+
+        return character.verticalDistance(originCharacter);
+    }
+
+}
\ No newline at end of file
diff --git
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/Zone.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/Zone.java
new file mode 100644
index 0000000..f49b6d1
--- /dev/null
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/Zone.java
@@ -0,0 +1,61 @@
+package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model;
+
+import java.awt.geom.Rectangle2D;
+import java.util.Comparator;
+import java.util.List;
+
+import lombok.Data;
+
+/**
+ * A block of text lines, kept in ascending y order, together with the box enclosing all of them.
+ */
+@Data
+public class Zone extends BoundingBox {
+
+    private List<Line> lines;
+
+
+    /**
+     * @param lines lines of the zone; the list is sorted in place by y and stored as is (no copy)
+     */
+    public Zone(List<Line> lines) {
+
+        lines.sort(Comparator.comparingDouble(Line::getY));
+        this.lines = lines;
+        buildBBox();
+    }
+
+
+    /** Recomputes the bounding box as the union of the bounding boxes of all lines. */
+    public void buildBBox() {
+
+        double minX = Double.POSITIVE_INFINITY;
+        double minY = Double.POSITIVE_INFINITY;
+        double maxX = Double.NEGATIVE_INFINITY;
+        double maxY = Double.NEGATIVE_INFINITY;
+
+        for (Line line : lines) {
+
+            // Read the rectangle directly: Line declares its own 'height' field (mean glyph height) under
+            // @Data, so line.getHeight() is not the height of the line's bounding box.
+            Rectangle2D lineBox = line.getBBox();
+            minX = Math.min(minX, lineBox.getMinX());
+            minY = Math.min(minY, lineBox.getMinY());
+            maxX = Math.max(maxX, lineBox.getMaxX());
+            maxY = Math.max(maxY, lineBox.getMaxY());
+
+        }
+
+        this.setBBox(new Rectangle2D.Double(minX, minY, maxX - minX, maxY - minY));
+    }
+
+
+    @Override
+    public String toString() {
+
+        StringBuilder sb = new StringBuilder();
+        lines.forEach(line -> sb.append(line.toString()).append("\n"));
+        return sb.toString().trim();
+    }
+
+}
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/service/LineBuilderService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/service/LineBuilderService.java
new file mode 100644
index 0000000..19ee66c
--- /dev/null
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/service/LineBuilderService.java
@@ -0,0 +1,61 @@
+package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service;
+
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.List;
+
+import org.springframework.stereotype.Service;
+
+import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.AngleFilter;
+import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Character;
+import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.DisjointSets;
+import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Line;
+
+/**
+ * Groups characters into text lines by uniting each character with those of its nearest neighbours
+ * that lie roughly beside it.
+ */
+@Service
+public class LineBuilderService {
+
+    private static final double CHARACTER_SPACING_DISTANCE_MULTIPLIER = 3.5;
+    private static final double MAX_VERTICAL_CHARACTER_DISTANCE = 0.67;
+    private static final double ANGLE_TOLERANCE = Math.PI / 6;
+
+
+    /**
+     * @param characters characters whose neighbours have already been assigned
+     * @param characterSpacing typical distance between adjacent characters
+     * @param lineSpacing typical distance between adjacent lines
+     * @return one {@link Line} per group, each with its characters sorted by x
+     */
+    public List<Line> buildLines(List<Character> characters, double characterSpacing, double lineSpacing) {
+
+        double maxHorizontalDistance = characterSpacing * CHARACTER_SPACING_DISTANCE_MULTIPLIER;
+        double maxVerticalDistance = lineSpacing * MAX_VERTICAL_CHARACTER_DISTANCE;
+
+        DisjointSets<Character> sets = new DisjointSets<>(characters);
+        AngleFilter filter = new AngleFilter(-ANGLE_TOLERANCE, ANGLE_TOLERANCE);
+
+        characters.forEach(character -> {
+            character.getNeighbors().forEach(neighbor -> {
+                // Normalised offsets: the neighbour must lie inside the ellipse spanned by the two maxima.
+                double x = neighbor.getHorizontalDistance() / maxHorizontalDistance;
+                double y = neighbor.getVerticalDistance() / maxVerticalDistance;
+                if (filter.matches(neighbor) && Math.pow(x, 2) + Math.pow(y, 2) <= 1) {
+                    sets.union(character, neighbor.getCharacter());
+                }
+            });
+        });
+
+        List<Line> lines = new ArrayList<>();
+        sets.forEach(group -> {
+            List<Character> lineCharacters = new ArrayList<>(group);
+            lineCharacters.sort(Comparator.comparingDouble(Character::getX));
+            lines.add(new Line(lineCharacters, characterSpacing));
+        });
+
+        return lines;
+    }
+
+}
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/service/NearestNeighbourService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/service/NearestNeighbourService.java
new file mode 100644
index 0000000..1a3f6e2
--- /dev/null
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/service/NearestNeighbourService.java
@@ -0,0 +1,109 @@
+package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service;
+
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.List;
+
+import org.springframework.stereotype.Service;
+
+import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Character;
+import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Neighbor;
+
+/**
+ * Assigns to every character its nearest neighbours by Euclidean distance, using a sweep over the
+ * characters sorted by x.
+ */
+@Service
+public class NearestNeighbourService {
+
+    private static final int NUMBER_OF_NEIGHBOURS = 8;
+    private static final double STEP = 16.0;
+
+
+    /**
+     * Sorts {@code characters} by x in place and stores on each of them its nearest neighbours: at most
+     * {@value #NUMBER_OF_NEIGHBOURS}, or all other characters when the list is not larger than that.
+     *
+     * @param characters characters to link; an empty list is left untouched
+     */
+    public void findNearestNeighbors(List<Character> characters) {
+
+        if (characters.isEmpty()) {
+            return;
+        }
+
+        characters.sort(Comparator.comparingDouble(Character::getX));
+
+        int maxNeighborCount = NUMBER_OF_NEIGHBOURS;
+        if (characters.size() <= NUMBER_OF_NEIGHBOURS) {
+            maxNeighborCount = characters.size() - 1;
+        }
+
+        for (int i = 0; i < characters.size(); i++) {
+
+            List<Neighbor> candidates = new ArrayList<>();
+
+            int start = i;
+            int end = i + 1;
+
+            double distance = Double.POSITIVE_INFINITY;
+
+            // Widen the x window [start, end) in STEP increments. Stop once the window reaches as far as the
+            // furthest kept candidate (nothing outside can be closer) or once it spans every character.
+            // Without the second condition a single-character list never leaves this loop: no candidate is
+            // ever found, so 'distance' stays infinite.
+            for (double searchDistance = 0; searchDistance < distance && (start > 0 || end < characters.size()); ) {
+
+                searchDistance += STEP;
+                boolean newCandidatesFound = false;
+
+                while (start > 0 && characters.get(i).getX() - characters.get(start - 1).getX() < searchDistance) {
+                    start--;
+                    candidates.add(new Neighbor(characters.get(start), characters.get(i)));
+                    clearLeastDistant(candidates, maxNeighborCount);
+                    newCandidatesFound = true;
+                }
+
+                while (end < characters.size() && characters.get(end).getX() - characters.get(i).getX() < searchDistance) {
+                    candidates.add(new Neighbor(characters.get(end), characters.get(i)));
+                    clearLeastDistant(candidates, maxNeighborCount);
+                    end++;
+                    newCandidatesFound = true;
+                }
+
+                if (newCandidatesFound && candidates.size() >= maxNeighborCount) {
+                    // The list is only sorted after an overflow, so take the maximum rather than the last entry.
+                    distance = furthestDistance(candidates);
+                }
+            }
+            clearLeastDistant(candidates, maxNeighborCount);
+            characters.get(i).setNeighbors(new ArrayList<>(candidates));
+        }
+    }
+
+
+    /** Returns the largest distance among {@code candidates}, or 0 for an empty list. */
+    private double furthestDistance(List<Neighbor> candidates) {
+
+        double furthest = 0.0;
+        for (Neighbor candidate : candidates) {
+            furthest = Math.max(furthest, candidate.getDistance());
+        }
+        return furthest;
+    }
+
+
+    /**
+     * Keeps the candidate list bounded: once it holds more than {@code maxNeighborCount} entries it is
+     * sorted by distance and the furthest entry is dropped (despite the method name).
+     */
+    private void clearLeastDistant(List<Neighbor> candidates, int maxNeighborCount) {
+
+        if (candidates.size() > maxNeighborCount) {
+            candidates.sort(Comparator.comparingDouble(Neighbor::getDistance));
+            // remove(int) alone drops the entry; the former remove(remove(...)) searched for it a second time.
+            candidates.remove(candidates.size() - 1);
+        }
+    }
+
+}
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/service/ReadingOrderService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/service/ReadingOrderService.java
new file mode 100644
index 0000000..aaae95f
--- /dev/null
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/service/ReadingOrderService.java
@@ -0,0 +1,109 @@
+package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service;
+
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.List;
+import java.util.ListIterator;
+
+import org.springframework.stereotype.Service;
+
+import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.BoundingBox;
+import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Zone;
+import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.utils.DoubleUtils;
+
+/**
+ * Puts zones into reading order, either plainly by position or with a simple two-column heuristic.
+ */
+@Service
+public class ReadingOrderService {
+
+    private static final double THRESHOLD = 1;
+
+    /** Orders by y and then by x; both coordinates are compared via DoubleUtils with THRESHOLD as precision. */
+    private static final Comparator<BoundingBox> Y_THEN_X = Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
+            .thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD));
+
+
+    /**
+     * @param zones zones to order; may be sorted in place
+     * @param xyOrder {@code true} to sort by y then x, {@code false} to use the two-column heuristic
+     * @return the ordered zones (the input list itself when it is sorted in place or has fewer than two entries)
+     */
+    public List<Zone> resolve(List<Zone> zones, boolean xyOrder) {
+
+        if (zones.size() <= 1) {
+            return zones;
+        }
+
+        if (xyOrder) {
+            zones.sort(Y_THEN_X);
+            return zones;
+        }
+
+        return resolveMultiColumnReadingOrder(zones);
+    }
+
+
+    private List<Zone> resolveMultiColumnReadingOrder(List<Zone> zones) {
+
+        // Simple reading order resolver for multi column page layout as described here: https://pub.towardsai.net/advanced-rag-02-unveiling-pdf-parsing-b84ae866344e
+        // TODO implement a more fancy reading order resolver see https://github.com/BobLd/DocumentLayoutAnalysis/blob/master/README.md#reading-order
+
+        double minX = Double.POSITIVE_INFINITY;
+        double maxX = Double.NEGATIVE_INFINITY;
+
+        for (Zone zone : zones) {
+            if (zone.getX() < minX) {
+                minX = zone.getX();
+            }
+            if (zone.getX() + zone.getWidth() > maxX) {
+                maxX = zone.getX() + zone.getWidth();
+            }
+        }
+
+        // Zones entirely left or right of the vertical centre line form the two columns;
+        // zones touching or crossing it are merged in by y afterwards.
+        double midLineXCoordinate = (minX + maxX) / 2;
+
+        List<Zone> leftOf = new ArrayList<>();
+        List<Zone> rightOf = new ArrayList<>();
+        List<Zone> middle = new ArrayList<>();
+        for (Zone zone : zones) {
+            if (zone.getX() < midLineXCoordinate && zone.getX() + zone.getWidth() < midLineXCoordinate) {
+                leftOf.add(zone);
+            } else if (zone.getX() > midLineXCoordinate && zone.getX() + zone.getWidth() > midLineXCoordinate) {
+                rightOf.add(zone);
+            } else {
+                middle.add(zone);
+            }
+        }
+
+        leftOf.sort(Y_THEN_X);
+        rightOf.sort(Y_THEN_X);
+        middle.sort(Y_THEN_X);
+
+        List<Zone> sortedZones = new ArrayList<>();
+        sortedZones.addAll(leftOf);
+        sortedZones.addAll(rightOf);
+
+        // Insert each spanning zone in front of the first already placed zone with a larger y;
+        // zones for which no such position exists stay in 'middle' and are appended at the end.
+        ListIterator<Zone> itty = middle.listIterator();
+
+        while (itty.hasNext()) {
+            Zone current = itty.next();
+            for (int i = 0; i < sortedZones.size(); i++) {
+                if (current.getY() < sortedZones.get(i).getY()) {
+                    sortedZones.add(i, current);
+                    itty.remove();
+                    break;
+                }
+            }
+        }
+
+        sortedZones.addAll(middle);
+
+        return sortedZones;
+    }
+
+}
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/service/SpacingService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/service/SpacingService.java
new file mode 100644
index 0000000..2aab22d
--- /dev/null
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/service/SpacingService.java
@@ -0,0 +1,68 @@
+package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service;
+
+import java.util.List;
+
+import org.springframework.stereotype.Service;
+
+import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.AngleFilter;
+import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Character;
+import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Histogram;
+import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Neighbor;
+
+/**
+ * Estimates typical spacings as the peak of a smoothed histogram of neighbour distances in one direction.
+ */
+@Service
+public class SpacingService {
+
+    private static final double SPACING_HISTOGRAM_RESOLUTION = 0.5;
+    private static final double SPACING_HISTOGRAM_SMOOTHING_LENGTH = 2.5;
+    private static final double SPACING_HIST_SMOOTHING_STANDARD_DEVIATION = 0.5;
+    private static final double ANGLE_TOLERANCE = Math.PI / 6;
+
+
+    /** Spacing along direction 0 (neighbours roughly beside each other). */
+    public double computeCharacterSpacing(List<Character> characters) {
+
+        return computeSpacing(characters, 0);
+    }
+
+
+    /** Spacing along direction PI/2 (neighbours roughly above or below each other). */
+    public double computeLineSpacing(List<Character> characters) {
+
+        return computeSpacing(characters, Math.PI / 2);
+    }
+
+
+    /**
+     * @param characters characters with their neighbours already assigned
+     * @param angle direction in radians; neighbours within {@code ANGLE_TOLERANCE} of it are counted
+     * @return the most frequent neighbour distance in that direction.
+     *         NOTE(review): if no character has any neighbour the histogram range is degenerate and the
+     *         result is not a usable spacing - confirm callers never hit this case
+     */
+    private double computeSpacing(List<Character> characters, double angle) {
+
+        double maxDistance = Double.NEGATIVE_INFINITY;
+
+        for (Character character : characters) {
+            for (Neighbor neighbor : character.getNeighbors()) {
+                maxDistance = Math.max(maxDistance, neighbor.getDistance());
+            }
+        }
+        Histogram histogram = new Histogram(0, maxDistance, SPACING_HISTOGRAM_RESOLUTION);
+        AngleFilter angleFilter = new AngleFilter(angle - ANGLE_TOLERANCE, angle + ANGLE_TOLERANCE);
+        for (Character character : characters) {
+            for (Neighbor neighbor : character.getNeighbors()) {
+                if (angleFilter.matches(neighbor)) {
+                    histogram.add(neighbor.getDistance());
+                }
+            }
+        }
+
+        histogram.gaussianSmooth(SPACING_HISTOGRAM_SMOOTHING_LENGTH, SPACING_HIST_SMOOTHING_STANDARD_DEVIATION);
+        return histogram.getPeakValue();
+    }
+
+}
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/service/ZoneBuilderService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/service/ZoneBuilderService.java
new file mode 100644
index 0000000..3ddc2d2
--- /dev/null
+++
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/service/ZoneBuilderService.java
@@ -0,0 +1,179 @@
+package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service;
+
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.List;
+import java.util.Set;
+
+import org.springframework.stereotype.Service;
+
+import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Character;
+import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.DisjointSets;
+import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Line;
+import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Zone;
+
+/**
+ * Groups lines into zones and, inside each zone, joins line fragments that belong to the same text line.
+ */
+@Service
+public class ZoneBuilderService {
+
+    private static final double MIN_HORIZONTAL_DISTANCE_MULTIPLIER = -0.5;
+    private static final double MAX_VERTICAL_DISTANCE_MULTIPLIER = 1.2;
+
+    private static final double MIN_HORIZONTAL_MERGE_DISTANCE_MULTIPLIER = -3.0;
+
+    private static final double MAX_VERTICAL_MERGE_DISTANCE_MULTIPLIER = 0.5;
+
+    private static final double MIN_LINE_SIZE_SCALE = 0.9;
+
+    private static final double MAX_LINE_SIZE_SCALE = 2.5;
+
+    private static final double ANGLE_TOLERANCE = Math.PI / 6;
+
+    private static final int MAX_ZONES = 300;
+
+    private static final double MAX_VERTICAL_MERGE_DISTANCE = 0.5;
+
+
+    /**
+     * @param lines lines to group
+     * @param characterSpacing typical distance between adjacent characters
+     * @param lineSpacing typical distance between adjacent lines
+     * @return the zones, in a modifiable list; if more than {@code MAX_ZONES} would result, a single zone
+     *         holding all lines is returned instead
+     */
+    public List<Zone> buildZones(List<Line> lines, double characterSpacing, double lineSpacing) {
+
+        double minHorizontalDistance = characterSpacing * MIN_HORIZONTAL_DISTANCE_MULTIPLIER;
+        double maxVerticalDistance = lineSpacing * MAX_VERTICAL_DISTANCE_MULTIPLIER;
+        double minHorizontalMergeDistance = characterSpacing * MIN_HORIZONTAL_MERGE_DISTANCE_MULTIPLIER;
+        double maxVerticalMergeDistance = lineSpacing * MAX_VERTICAL_MERGE_DISTANCE_MULTIPLIER;
+
+        DisjointSets<Line> sets = new DisjointSets<>(lines);
+
+        double meanHeight = calculateMeanHeight(lines);
+
+        lines.forEach(outerLine -> //
+                lines.forEach(innerLine -> {
+
+                    // Both distances are divided by the smaller line height relative to the mean, clamped to the scale limits.
+                    double scale = Math.min(outerLine.getHeight(), innerLine.getHeight()) / meanHeight;
+                    scale = Math.max(MIN_LINE_SIZE_SCALE, Math.min(scale, MAX_LINE_SIZE_SCALE));
+
+                    if (!sets.areTogether(outerLine, innerLine) && outerLine.angularDifference(innerLine) <= ANGLE_TOLERANCE) {
+
+                        double horizontalDistance = outerLine.horizontalDistance(innerLine) / scale;
+                        double verticalDistance = outerLine.verticalDistance(innerLine) / scale;
+
+                        if (minHorizontalDistance <= horizontalDistance && verticalDistance <= maxVerticalDistance //
+                                || minHorizontalMergeDistance <= horizontalDistance && verticalDistance <= maxVerticalMergeDistance) {
+                            sets.union(outerLine, innerLine);
+                        }
+                    }
+                }));
+
+        List<Zone> zones = new ArrayList<>();
+        sets.forEach(group -> {
+            zones.add(mergeLinesInZone(new ArrayList<>(group), characterSpacing, lineSpacing));
+        });
+
+        if (zones.size() > MAX_ZONES) {
+            List<Line> oneZoneLines = new ArrayList<>();
+            for (Zone zone : zones) {
+                oneZoneLines.addAll(zone.getLines());
+            }
+            // Same kind of list as the regular path, so callers may sort or extend either result.
+            List<Zone> singleZone = new ArrayList<>();
+            singleZone.add(mergeLinesInZone(oneZoneLines, characterSpacing, lineSpacing));
+            return singleZone;
+        }
+
+        return zones;
+    }
+
+
+    /**
+     * Mean line height weighted by line length. Lines whose length is zero or not finite carry no weight;
+     * if no line carries weight the plain average is used, so a line of undefined length cannot make it NaN.
+     */
+    private double calculateMeanHeight(List<Line> lines) {
+
+        double weightedSum = 0.0;
+        double weights = 0.0;
+        double plainSum = 0.0;
+        for (Line line : lines) {
+            plainSum += line.getHeight();
+            double weight = line.getLength();
+            if (weight > 0 && Double.isFinite(weight)) {
+                weightedSum += line.getHeight() * weight;
+                weights += weight;
+            }
+        }
+        if (weights > 0) {
+            return weightedSum / weights;
+        }
+        return lines.isEmpty() ? 0.0 : plainSum / lines.size();
+    }
+
+
+    /**
+     * Joins the given lines into longer lines where they sit at (nearly) the same height, then wraps the result in a zone.
+     */
+    private Zone mergeLinesInZone(List<Line> lines, double characterSpacing, double lineSpacing) {
+
+        double maxHorizontalDistance = 0;
+        double minVerticalDistance = 0;
+        double maxVerticalDistance = lineSpacing * MAX_VERTICAL_MERGE_DISTANCE;
+
+        DisjointSets<Line> sets = new DisjointSets<>(lines);
+
+        lines.forEach(outer -> {
+
+            lines.forEach(inner -> {
+                if (inner != outer) {
+
+                    double horizontalDistance = outer.horizontalDistance(inner);
+                    double verticalDistance = outer.verticalDistance(inner);
+
+                    if (horizontalDistance <= maxHorizontalDistance && minVerticalDistance <= verticalDistance && verticalDistance <= maxVerticalDistance) {
+                        // x ranges do not overlap and the lines are vertically close: fragments of one line
+                        sets.union(outer, inner);
+                    } else if (minVerticalDistance <= verticalDistance && verticalDistance <= maxVerticalDistance && Math.abs(horizontalDistance - Math.min(outer.getLength(),
+                            inner.getLength())) < 0.1) {
+                        // The overlap is about as long as the shorter line: only join if the glyphs themselves barely overlap.
+                        boolean characterOverlap = false;
+                        int overlappingCount = 0;
+                        for (Character outerCharacter : outer.getCharacters()) {
+                            for (Character innerCharacter : inner.getCharacters()) {
+                                double characterOverlapDistance = outerCharacter.overlappingDistance(innerCharacter);
+                                if (characterOverlapDistance > 2) {
+                                    characterOverlap = true;
+                                }
+                                if (characterOverlapDistance > 0) {
+                                    overlappingCount++;
+                                }
+                            }
+                        }
+                        if (!characterOverlap && overlappingCount <= 2) {
+                            sets.union(outer, inner);
+                        }
+                    }
+                }
+            });
+        });
+
+        List<Line> outputZone = new ArrayList<>();
+        for (Set<Line> group : sets) {
+            List<Character> components = new ArrayList<>();
+            for (Line line : group) {
+                components.addAll(line.getCharacters());
+            }
+            components.sort(Comparator.comparingDouble(Character::getX));
+
+            outputZone.add(new Line(components, characterSpacing));
+        }
+        return new Zone(outputZone);
+    }
+
+}
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/utils/DoubleUtils.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/utils/DoubleUtils.java
new file mode 100644
index 0000000..c1853c9
--- /dev/null
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/utils/DoubleUtils.java
@@ -0,0 +1,31 @@
+package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.utils;
+
+/**
+ * Helper for comparing doubles on a rounding grid.
+ */
+public class DoubleUtils {
+
+    /**
+     * Compares two doubles after dividing each by {@code precision} and rounding to the nearest long, so
+     * values that round to the same grid step compare as equal.
+     *
+     * @param d1 first value
+     * @param d2 second value
+     * @param precision grid size; 0 is treated as 1
+     * @return negative, zero or positive as for {@link Long#compare(long, long)} on the rounded values; if
+     *         either input is NaN, the result of {@link Double#compare(double, double)} on the raw values
+     */
+    public static int compareDouble(double d1, double d2, double precision) {
+
+        if (Double.isNaN(d1) || Double.isNaN(d2)) {
+            return Double.compare(d1, d2);
+        }
+        if (precision == 0) {
+            precision = 1;
+        }
+        long i1 = Math.round(d1 / precision);
+        long i2 = Math.round(d2 / precision);
+        return Long.compare(i1, i2);
+    }
+
+}
diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java index 0e28417..264e2da 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java @@ -33,7 +33,7 @@ public class ViewerDocumentTest extends BuildDocumentTest { ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null); LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService); - Document document = buildGraph(fileName, LayoutParsingType.REDACT_MANAGER); + Document document = buildGraph(fileName, LayoutParsingType.DOCSTRUM_XY); long start = System.currentTimeMillis(); layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true); System.out.printf("Total time: %.2fs%n", ((float) (System.currentTimeMillis() - start)) / 1000); @@ -54,10 +54,11 @@ public class ViewerDocumentTest extends BuildDocumentTest { var documentFile = new ClassPathResource(fileName).getFile(); var classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE, - documentFile, - new ImageServiceResponse(), - tableResponse, - new VisualLayoutParsingResponse(),Path.of(fileName).getFileName().toFile().toString()); + documentFile, + new ImageServiceResponse(), + tableResponse, + new VisualLayoutParsingResponse(), +
Path.of(fileName).getFileName().toFile().toString()); ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null); LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService); Document document = DocumentGraphFactory.buildDocumentGraph(classificationDocument);