From 1d1bd321c28c5b3f8a2e7cfba03c18f9e0753e1d Mon Sep 17 00:00:00 2001 From: Kilian Schuettler Date: Tue, 26 Nov 2024 16:56:52 +0100 Subject: [PATCH] RED-8670: add tables to idp result * apparently i've fixed some error, where the ocr-service sometimes hangs --- .../ocr/v1/api/model/AzureAnalyzeResult.java | 25 -- .../service/ocr/v1/api/model/Figure.java | 3 +- .../service/ocr/v1/api/model/IdpResult.java | 23 ++ .../service/ocr/v1/api/model/Table.java | 7 + .../service/ocr/v1/api/model/TableCell.java | 5 + .../ocr/v1/api/model/TableCellType.java | 5 + .../NativeLibrariesInitializer.java | 6 +- .../processor/model/DocumentSpanLookup.java | 102 ++++++++ .../ocr/processor/service/OCRService.java | 15 ++ .../OcrResultPostProcessingPipeline.java | 1 + .../ImageProcessingService.java | 1 + .../utils/StringCleaningUtility.java | 40 +++ .../visualizations/AnalyzeResultMapper.java | 4 +- .../layers/IdpResultFactory.java | 236 ++++++++++++++++++ .../visualizations/layers/LayerFactory.java | 16 +- .../visualizations/layers/OcrResult.java | 3 +- .../utils/RotationCorrectionUtility.java | 32 ++- .../ocr/v1/server/FileStorageService.java | 12 +- .../src/main/resources/application.yml | 2 + .../v1/server/OcrServiceIntegrationTest.java | 6 +- .../src/test/resources/application.yml | 9 +- 21 files changed, 497 insertions(+), 56 deletions(-) delete mode 100644 azure-ocr-service/azure-ocr-service-api/src/main/java/com/knecon/fforesight/service/ocr/v1/api/model/AzureAnalyzeResult.java create mode 100644 azure-ocr-service/azure-ocr-service-api/src/main/java/com/knecon/fforesight/service/ocr/v1/api/model/IdpResult.java create mode 100644 azure-ocr-service/azure-ocr-service-api/src/main/java/com/knecon/fforesight/service/ocr/v1/api/model/Table.java create mode 100644 azure-ocr-service/azure-ocr-service-api/src/main/java/com/knecon/fforesight/service/ocr/v1/api/model/TableCell.java create mode 100644 
azure-ocr-service/azure-ocr-service-api/src/main/java/com/knecon/fforesight/service/ocr/v1/api/model/TableCellType.java create mode 100644 azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/DocumentSpanLookup.java create mode 100644 azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/StringCleaningUtility.java create mode 100644 azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/visualizations/layers/IdpResultFactory.java diff --git a/azure-ocr-service/azure-ocr-service-api/src/main/java/com/knecon/fforesight/service/ocr/v1/api/model/AzureAnalyzeResult.java b/azure-ocr-service/azure-ocr-service-api/src/main/java/com/knecon/fforesight/service/ocr/v1/api/model/AzureAnalyzeResult.java deleted file mode 100644 index 2616673..0000000 --- a/azure-ocr-service/azure-ocr-service-api/src/main/java/com/knecon/fforesight/service/ocr/v1/api/model/AzureAnalyzeResult.java +++ /dev/null @@ -1,25 +0,0 @@ -package com.knecon.fforesight.service.ocr.v1.api.model; - -import java.util.ArrayList; -import java.util.List; - -import lombok.AccessLevel; -import lombok.AllArgsConstructor; -import lombok.Builder; -import lombok.Getter; -import lombok.experimental.FieldDefaults; - -@Getter -@Builder -@AllArgsConstructor -@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) -public class AzureAnalyzeResult { - - @Builder.Default - List keyValuePairs = new ArrayList<>(); - @Builder.Default - List handWrittenText = new ArrayList<>(); - @Builder.Default - List
figures = new ArrayList<>(); - -} diff --git a/azure-ocr-service/azure-ocr-service-api/src/main/java/com/knecon/fforesight/service/ocr/v1/api/model/Figure.java b/azure-ocr-service/azure-ocr-service-api/src/main/java/com/knecon/fforesight/service/ocr/v1/api/model/Figure.java index d340c44..872fca5 100644 --- a/azure-ocr-service/azure-ocr-service-api/src/main/java/com/knecon/fforesight/service/ocr/v1/api/model/Figure.java +++ b/azure-ocr-service/azure-ocr-service-api/src/main/java/com/knecon/fforesight/service/ocr/v1/api/model/Figure.java @@ -1,10 +1,11 @@ package com.knecon.fforesight.service.ocr.v1.api.model; +import java.util.List; import java.util.Optional; import lombok.Builder; @Builder -public record Figure(Optional caption, Region image) { +public record Figure(TextRegion caption, Region image, List footnotes) { } diff --git a/azure-ocr-service/azure-ocr-service-api/src/main/java/com/knecon/fforesight/service/ocr/v1/api/model/IdpResult.java b/azure-ocr-service/azure-ocr-service-api/src/main/java/com/knecon/fforesight/service/ocr/v1/api/model/IdpResult.java new file mode 100644 index 0000000..f1cf8e9 --- /dev/null +++ b/azure-ocr-service/azure-ocr-service-api/src/main/java/com/knecon/fforesight/service/ocr/v1/api/model/IdpResult.java @@ -0,0 +1,23 @@ +package com.knecon.fforesight.service.ocr.v1.api.model; + +import java.util.Collections; +import java.util.LinkedList; +import java.util.List; + +public record IdpResult(List keyValuePairs, List handWrittenText, List
figures, List tables) { + + public static IdpResult initSynchronized() { + + return new IdpResult(Collections.synchronizedList(new LinkedList<>()), + Collections.synchronizedList(new LinkedList<>()), + Collections.synchronizedList(new LinkedList<>()), + Collections.synchronizedList(new LinkedList<>())); + } + + + public static IdpResult empty() { + + return new IdpResult(Collections.emptyList(), Collections.emptyList(), Collections.emptyList(), Collections.emptyList()); + } + +} diff --git a/azure-ocr-service/azure-ocr-service-api/src/main/java/com/knecon/fforesight/service/ocr/v1/api/model/Table.java b/azure-ocr-service/azure-ocr-service-api/src/main/java/com/knecon/fforesight/service/ocr/v1/api/model/Table.java new file mode 100644 index 0000000..d989381 --- /dev/null +++ b/azure-ocr-service/azure-ocr-service-api/src/main/java/com/knecon/fforesight/service/ocr/v1/api/model/Table.java @@ -0,0 +1,7 @@ +package com.knecon.fforesight.service.ocr.v1.api.model; + +import java.util.List; + +public record Table(TextRegion caption, int numberOfCols, int numberOfRows, List cells, List footnotes, List bboxes) { + +} diff --git a/azure-ocr-service/azure-ocr-service-api/src/main/java/com/knecon/fforesight/service/ocr/v1/api/model/TableCell.java b/azure-ocr-service/azure-ocr-service-api/src/main/java/com/knecon/fforesight/service/ocr/v1/api/model/TableCell.java new file mode 100644 index 0000000..668d3df --- /dev/null +++ b/azure-ocr-service/azure-ocr-service-api/src/main/java/com/knecon/fforesight/service/ocr/v1/api/model/TableCell.java @@ -0,0 +1,5 @@ +package com.knecon.fforesight.service.ocr.v1.api.model; + +public record TableCell(TextRegion textRegion, int row, int col, TableCellType kind) { + +} diff --git a/azure-ocr-service/azure-ocr-service-api/src/main/java/com/knecon/fforesight/service/ocr/v1/api/model/TableCellType.java b/azure-ocr-service/azure-ocr-service-api/src/main/java/com/knecon/fforesight/service/ocr/v1/api/model/TableCellType.java new file mode 100644 
index 0000000..87767dd --- /dev/null +++ b/azure-ocr-service/azure-ocr-service-api/src/main/java/com/knecon/fforesight/service/ocr/v1/api/model/TableCellType.java @@ -0,0 +1,5 @@ +package com.knecon.fforesight.service.ocr.v1.api.model; + +public enum TableCellType { + ROW_HEADER, COLUMN_HEADER, CONTENT, STUB_HEAD, DESCRIPTION +} diff --git a/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/initializer/NativeLibrariesInitializer.java b/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/initializer/NativeLibrariesInitializer.java index a7e5fcc..8c20bbc 100644 --- a/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/initializer/NativeLibrariesInitializer.java +++ b/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/initializer/NativeLibrariesInitializer.java @@ -21,6 +21,8 @@ public class NativeLibrariesInitializer { @Value("${pdftron.license:}") private String pdftronLicense; + @Value("${native-libs.path:}") + private String nativeLibsPath; @SneakyThrows @PostConstruct @@ -32,8 +34,8 @@ public class NativeLibrariesInitializer { PDFNet.setTempPath("/tmp/pdftron"); PDFNet.initialize(pdftronLicense); - log.info("Setting jna.library.path: {}", System.getenv("VCPKG_DYNAMIC_LIB")); - System.setProperty("jna.library.path", System.getenv("VCPKG_DYNAMIC_LIB")); + log.info("Setting jna.library.path: {}", nativeLibsPath); + System.setProperty("jna.library.path", nativeLibsPath); log.info("Asserting Native Libraries loaded"); diff --git a/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/DocumentSpanLookup.java b/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/DocumentSpanLookup.java new file mode 100644 index 0000000..1266516 --- /dev/null +++ 
b/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/DocumentSpanLookup.java @@ -0,0 +1,102 @@ +package com.knecon.fforesight.service.ocr.processor.model; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.List; +import java.util.Map; +import java.util.TreeMap; +import java.util.stream.Collectors; + +import com.azure.ai.documentintelligence.models.AnalyzeResult; +import com.azure.ai.documentintelligence.models.DocumentPage; +import com.azure.ai.documentintelligence.models.DocumentSpan; +import com.azure.ai.documentintelligence.models.DocumentWord; + +public class DocumentSpanLookup { + + List documentWordLookup; + + + public DocumentSpanLookup(AnalyzeResult analyzeResult) { + + documentWordLookup = new ArrayList<>(analyzeResult.getPages().size()); + int offset = 0; + for (DocumentPage page : analyzeResult.getPages()) { + + if (page.getWords() == null || page.getWords().isEmpty()) { + documentWordLookup.add(new PageSpanLookup(offset, offset, null)); + } + int start = page.getWords() + .get(0).getSpan().getOffset(); + DocumentSpan span = page.getWords() + .get(page.getWords().size() - 1).getSpan(); + int end = span.getOffset() + span.getLength(); + SpanLookup pageWords = new SpanLookup<>(page.getWords() + .stream(), DocumentWord::getSpan); + documentWordLookup.add(new PageSpanLookup(start, end, pageWords)); + offset = end + 1; + } + } + + + public List findWordsOnPages(DocumentSpan documentSpan) { + + if (documentSpan == null) { + return Collections.emptyList(); + } + int firstSmallerIdx = findIdxOfFirstSmallerObject(documentSpan); + PageSpanLookup firstPage = documentWordLookup.get(firstSmallerIdx); + List wordsOnPages = new ArrayList<>(); + for (int pageNumber = firstSmallerIdx; pageNumber < documentWordLookup.size(); pageNumber++) { + PageSpanLookup page = documentWordLookup.get(pageNumber); + if (page.end >= documentSpan.getOffset()) { + break; + 
} + firstPage.wordSpanLookup.findElementsContainedInSpan(documentSpan) + .stream() + .map(documentWord -> new WordOnPage(documentWord, firstSmallerIdx)) + .forEach(wordsOnPages::add); + } + return wordsOnPages; + } + + + private int findIdxOfFirstSmallerObject(DocumentSpan documentSpan) { + + int idx = Collections.binarySearch(documentWordLookup, new PageSpanLookup(documentSpan.getOffset(), -1, null), Comparator.comparing(PageSpanLookup::start)); + + if (idx >= 0) { + return idx; + } else { + int insertionPoint = -(idx + 1); + + if (insertionPoint == 0) { + return -1; + } + var lastSmaller = documentWordLookup.get(insertionPoint - 1); + for (int resultIdx = insertionPoint - 2; resultIdx >= 0; resultIdx--) { + if (documentWordLookup.get(resultIdx).compareTo(lastSmaller) == 0) { + return resultIdx + 1; + } + } + return 0; + } + } + + + public record WordOnPage(DocumentWord documentWord, int pageNumber) { + + } + + private record PageSpanLookup(int start, int end, SpanLookup wordSpanLookup) implements Comparable { + + @Override + public int compareTo(PageSpanLookup o) { + + return Integer.compare(start, o.start); + } + + } + +} diff --git a/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OCRService.java b/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OCRService.java index c87a5e0..fd83348 100644 --- a/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OCRService.java +++ b/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OCRService.java @@ -5,6 +5,7 @@ import static com.knecon.fforesight.service.ocr.processor.model.Statistics.human import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; +import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; import 
java.nio.file.StandardCopyOption; @@ -13,6 +14,7 @@ import java.util.Set; import org.springframework.stereotype.Service; +import com.fasterxml.jackson.databind.ObjectMapper; import com.iqser.red.pdftronlogic.commons.InvisibleElementRemovalService; import com.iqser.red.pdftronlogic.commons.OCGWatermarkRemovalService; import com.iqser.red.pdftronlogic.commons.WatermarkRemovalService; @@ -45,6 +47,7 @@ public class OCRService { BatchFactory batchFactory; AsyncOcrService asyncOcrService; OcrServiceSettings settings; + ObjectMapper mapper; /** @@ -145,6 +148,10 @@ public class OCRService { RotationCorrectionUtility.rotatePages(viewerDocumentFile.toPath(), viewerDocumentFile.toPath(), ocrResult.anglesPerPage()); } + if (features.contains(AzureOcrFeature.IDP)) { + saveAnalyzeResultFile(analyzeResultFile, ocrResult); + } + supervisor.getStatistics().drawingPdfFinished(); supervisor.sendFinished(); @@ -154,4 +161,12 @@ public class OCRService { } + + private void saveAnalyzeResultFile(File analyzeResultFile, OcrResult ocrResult) throws IOException { + + try (var out = new FileOutputStream(analyzeResultFile)) { + mapper.writeValue(out, ocrResult.idpResult()); + } + } + } diff --git a/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OcrResultPostProcessingPipeline.java b/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OcrResultPostProcessingPipeline.java index 5133928..c153e07 100644 --- a/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OcrResultPostProcessingPipeline.java +++ b/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OcrResultPostProcessingPipeline.java @@ -106,6 +106,7 @@ public class OcrResultPostProcessingPipeline { writableOcrResultList.add(builder.build()); } + log.debug("Batch {}: finished post-processing.", 
batch.getIndex()); return writableOcrResultList; } diff --git a/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/imageprocessing/ImageProcessingService.java b/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/imageprocessing/ImageProcessingService.java index 87dc937..81262d7 100644 --- a/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/imageprocessing/ImageProcessingService.java +++ b/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/imageprocessing/ImageProcessingService.java @@ -90,6 +90,7 @@ public class ImageProcessingService { supervisor.markError(e.getMessage()); } finally { supervisor.markPageFinished(processedImage); + log.debug("Finished page: {}", processedImage.pageNumber()); } } diff --git a/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/StringCleaningUtility.java b/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/StringCleaningUtility.java new file mode 100644 index 0000000..06aa59c --- /dev/null +++ b/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/StringCleaningUtility.java @@ -0,0 +1,40 @@ +package com.knecon.fforesight.service.ocr.processor.utils; + +import java.util.regex.Pattern; + +import lombok.experimental.UtilityClass; + +@UtilityClass +public class StringCleaningUtility { + + public static final Pattern hyphenLineBreaks = Pattern.compile("[-~‐‒⁻−﹣゠⁓‑\\u00AD][\\r\\n]+"); + public static final Pattern linebreaks = Pattern.compile("[\\r\\n]+"); + public static final Pattern doubleWhitespaces = Pattern.compile("\\s{2,}"); + + + public static String cleanString(String value) { + + String noHyphenLinebreaks = 
removeHyphenLinebreaks(value); + String noLinebreaks = removeLinebreaks(noHyphenLinebreaks); + return removeMultipleWhitespaces(noLinebreaks); + } + + + private String removeHyphenLinebreaks(String value) { + + return hyphenLineBreaks.matcher(value).replaceAll(""); + } + + + private String removeMultipleWhitespaces(String value) { + + return doubleWhitespaces.matcher(value).replaceAll(" "); + } + + + private String removeLinebreaks(String value) { + + return linebreaks.matcher(value).replaceAll(" "); + } + +} diff --git a/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/visualizations/AnalyzeResultMapper.java b/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/visualizations/AnalyzeResultMapper.java index 0570b92..e2ea13b 100644 --- a/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/visualizations/AnalyzeResultMapper.java +++ b/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/visualizations/AnalyzeResultMapper.java @@ -1,14 +1,14 @@ package com.knecon.fforesight.service.ocr.processor.visualizations; import com.azure.ai.documentintelligence.models.AnalyzeResult; -import com.knecon.fforesight.service.ocr.v1.api.model.AzureAnalyzeResult; +import com.knecon.fforesight.service.ocr.v1.api.model.IdpResult; import lombok.experimental.UtilityClass; @UtilityClass public class AnalyzeResultMapper { - public AzureAnalyzeResult map(AnalyzeResult analyzeResult) { + public IdpResult map(AnalyzeResult analyzeResult) { return null; } diff --git a/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/visualizations/layers/IdpResultFactory.java b/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/visualizations/layers/IdpResultFactory.java new file mode 100644 index 
0000000..cfd0ad4 --- /dev/null +++ b/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/visualizations/layers/IdpResultFactory.java @@ -0,0 +1,236 @@ +package com.knecon.fforesight.service.ocr.processor.visualizations.layers; + +import static com.knecon.fforesight.service.ocr.processor.utils.StringCleaningUtility.cleanString; + +import java.awt.geom.AffineTransform; +import java.util.Collections; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Set; + +import com.azure.ai.documentintelligence.models.AnalyzeResult; +import com.azure.ai.documentintelligence.models.BoundingRegion; +import com.azure.ai.documentintelligence.models.DocumentCaption; +import com.azure.ai.documentintelligence.models.DocumentFigure; +import com.azure.ai.documentintelligence.models.DocumentFootnote; +import com.azure.ai.documentintelligence.models.DocumentKeyValuePair; +import com.azure.ai.documentintelligence.models.DocumentTable; +import com.azure.ai.documentintelligence.models.DocumentTableCell; +import com.knecon.fforesight.service.ocr.processor.model.DocumentSpanLookup; +import com.knecon.fforesight.service.ocr.processor.model.PageBatch; +import com.knecon.fforesight.service.ocr.processor.model.PageInformation; +import com.knecon.fforesight.service.ocr.processor.visualizations.utils.Rectangle2DBBoxCollector; +import com.knecon.fforesight.service.ocr.processor.visualizations.utils.RotationCorrectionUtility; +import com.knecon.fforesight.service.ocr.v1.api.model.IdpResult; +import com.knecon.fforesight.service.ocr.v1.api.model.AzureOcrFeature; +import com.knecon.fforesight.service.ocr.v1.api.model.Figure; +import com.knecon.fforesight.service.ocr.v1.api.model.KeyValuePair; +import com.knecon.fforesight.service.ocr.v1.api.model.QuadPoint; +import com.knecon.fforesight.service.ocr.v1.api.model.Region; +import com.knecon.fforesight.service.ocr.v1.api.model.Table; 
+import com.knecon.fforesight.service.ocr.v1.api.model.TableCell; +import com.knecon.fforesight.service.ocr.v1.api.model.TableCellType; +import com.knecon.fforesight.service.ocr.v1.api.model.TextRegion; + +import lombok.AccessLevel; +import lombok.Getter; +import lombok.experimental.FieldDefaults; + +@Getter +@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) +public class IdpResultFactory { + + IdpResult idpResult; + Map resultToPageTransforms; + Map pageInformation; + Map angles; + boolean rotationCorrection; + + + public IdpResultFactory(Map resultToPageTransforms, + Map pageInformation, + Map angles, + Set features) { + + this.angles = angles; + + this.rotationCorrection = features.contains(AzureOcrFeature.ROTATION_CORRECTION); + this.resultToPageTransforms = resultToPageTransforms; + this.pageInformation = pageInformation; + this.idpResult = IdpResult.initSynchronized(); + } + + + public AffineTransform getResultToPageTransform(Integer pageNumber) { + + AffineTransform transform = resultToPageTransforms.get(pageNumber); + if (rotationCorrection) { + PageInformation page = pageInformation.get(pageNumber); + transform.preConcatenate(RotationCorrectionUtility.buildTransform(angles.get(pageNumber), page.width(), page.height(), false)); + } + return transform; + } + + + public void addAnalyzeResult(AnalyzeResult analyzeResult, PageBatch batch) { + + DocumentSpanLookup words = new DocumentSpanLookup(analyzeResult); + analyzeResult.getTables() + .forEach(documentTable -> addTable(documentTable, words, batch)); + analyzeResult.getKeyValuePairs() + .forEach(documentKeyValuePair -> addKeyValuePair(documentKeyValuePair, batch)); + analyzeResult.getFigures() + .forEach(documentFigure -> addFigure(documentFigure, batch, words)); + } + + + private void addFigure(DocumentFigure documentFigure, PageBatch batch, DocumentSpanLookup words) { + + List footNotes = new LinkedList<>(); + if (documentFigure.getFootnotes() != null) { + documentFigure.getFootnotes() + 
.stream() + .map(footNote -> toTextRegion(footNote, batch)) + .filter(Objects::nonNull) + .forEach(footNotes::add); + } + int batchPageNumber = documentFigure.getBoundingRegions() + .get(0).getPageNumber(); + Region bbox = toRegionFromRegions(batch.getPageNumber(batchPageNumber), documentFigure.getBoundingRegions()); + TextRegion caption = toTextRegion(documentFigure.getCaption(), batch); + idpResult.figures().add(new Figure(caption, bbox, footNotes)); + + } + + + private void addKeyValuePair(DocumentKeyValuePair documentKeyValuePair, PageBatch batch) { + + TextRegion key = null; + if (documentKeyValuePair.getKey() != null) { + Region region = toRegionFromRegions(batch, documentKeyValuePair.getKey().getBoundingRegions()); + key = new TextRegion(region, cleanString(documentKeyValuePair.getKey().getContent())); + } + TextRegion value = null; + if (documentKeyValuePair.getValue() != null) { + Region region = toRegionFromRegions(batch, documentKeyValuePair.getValue().getBoundingRegions()); + value = new TextRegion(region, cleanString(documentKeyValuePair.getValue().getContent())); + } + + idpResult.keyValuePairs().add(new KeyValuePair(key, value)); + } + + + private void addTable(DocumentTable documentTable, DocumentSpanLookup words, PageBatch batch) { + + TextRegion caption = toTextRegion(documentTable.getCaption(), batch); + List tableCells = documentTable.getCells() + .stream() + .map(documentTableCell -> toTableCell(documentTableCell, words, batch)) + .toList(); + List footNotes = new LinkedList<>(); + + if (documentTable.getFootnotes() != null) { + documentTable.getFootnotes() + .stream() + .map(footNote -> toTextRegion(footNote, batch)) + .filter(Objects::nonNull) + .forEach(footNotes::add); + } + List bbox = documentTable.getBoundingRegions() + .stream() + .map(b -> toRegion(b, batch)) + .toList(); + Table table = new Table(caption, documentTable.getColumnCount(), documentTable.getRowCount(), tableCells, footNotes, bbox); + idpResult.tables().add(table); + } + + 
+ private TextRegion toTextRegion(DocumentFootnote footNote, PageBatch batch) { + + if (footNote == null || footNote.getBoundingRegions().isEmpty()) { + return null; + } + + Region region = toRegionFromRegions(batch, footNote.getBoundingRegions()); + return new TextRegion(region, cleanString(footNote.getContent())); + } + + + private TextRegion toTextRegion(DocumentCaption caption, PageBatch batch) { + + if (caption == null || caption.getBoundingRegions().isEmpty()) { + return null; + } + + Region region = toRegionFromRegions(batch, caption.getBoundingRegions()); + return new TextRegion(region, cleanString(caption.getContent())); + } + + + private TableCell toTableCell(DocumentTableCell documentTableCell, DocumentSpanLookup words, PageBatch batch) { + + int batchPageNumber = documentTableCell.getBoundingRegions() + .get(0).getPageNumber(); + Region region = toRegionFromRegions(batch.getPageNumber(batchPageNumber), documentTableCell.getBoundingRegions()); + TableCellType kind = mapTableCellType(documentTableCell); + return new TableCell(new TextRegion(region, cleanString(documentTableCell.getContent())), documentTableCell.getRowIndex(), documentTableCell.getColumnIndex(), kind); + } + + + private static TableCellType mapTableCellType(DocumentTableCell documentTableCell) { + + if (documentTableCell.getKind() == null) { + return TableCellType.CONTENT; + } + return switch (documentTableCell.getKind().toString()) { + case "columnHeader" -> TableCellType.COLUMN_HEADER; + case "rowHeader" -> TableCellType.ROW_HEADER; + case "description" -> TableCellType.DESCRIPTION; + case "stubHead" -> TableCellType.STUB_HEAD; + default -> TableCellType.CONTENT; + }; + } + + + private Region toRegion(BoundingRegion boundingRegion, PageBatch batch) { + + int pageNumber = batch.getPageNumber(boundingRegion.getPageNumber()); + QuadPoint qp = QuadPoint.fromPolygons(boundingRegion.getPolygon()).getTransformed(getResultToPageTransform(pageNumber)); + return new Region(pageNumber, 
qp.toData()); + } + + + private Region toRegionFromRegions(int pageNumber, List regions) { + + QuadPoint bbox = QuadPoint.fromRectangle2D(regions.stream() + .map(BoundingRegion::getPolygon) + .map(QuadPoint::fromPolygons) + .map(qp -> qp.getTransformed(getResultToPageTransform(pageNumber)).getBounds2D()) + .collect(new Rectangle2DBBoxCollector())); + + return new Region(pageNumber, bbox.toData()); + } + + + private Region toRegionFromRegions(PageBatch batch, List regions) { + + assert !regions.isEmpty(); + int batchPageNumber = regions.get(0).getPageNumber(); + if (!regions.stream() + .map(BoundingRegion::getPageNumber) + .allMatch(number -> number == batchPageNumber)) { + throw new AssertionError(); + } + int pageNumber = batch.getPageNumber(batchPageNumber); + QuadPoint bbox = QuadPoint.fromRectangle2D(regions.stream() + .map(BoundingRegion::getPolygon) + .map(QuadPoint::fromPolygons) + .map(qp -> qp.getTransformed(getResultToPageTransform(pageNumber)).getBounds2D()) + .collect(new Rectangle2DBBoxCollector())); + + return new Region(pageNumber, bbox.toData()); + } + +} diff --git a/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/visualizations/layers/LayerFactory.java b/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/visualizations/layers/LayerFactory.java index 8ca934f..8ec499d 100644 --- a/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/visualizations/layers/LayerFactory.java +++ b/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/visualizations/layers/LayerFactory.java @@ -8,19 +8,22 @@ import java.util.Map; import java.util.Set; import com.azure.ai.documentintelligence.models.AnalyzeResult; +import com.knecon.fforesight.service.ocr.processor.OcrServiceSettings; import com.knecon.fforesight.service.ocr.processor.model.PageBatch; import 
com.knecon.fforesight.service.ocr.processor.model.PageInformation; import com.knecon.fforesight.service.ocr.processor.service.OcrExecutionSupervisor; -import com.knecon.fforesight.service.ocr.processor.OcrServiceSettings; +import com.knecon.fforesight.service.ocr.processor.service.OcrResultPostProcessingPipeline; import com.knecon.fforesight.service.ocr.processor.service.imageprocessing.ImageProcessingPipeline; import com.knecon.fforesight.service.ocr.processor.visualizations.WritableOcrResult; -import com.knecon.fforesight.service.ocr.processor.service.OcrResultPostProcessingPipeline; +import com.knecon.fforesight.service.ocr.v1.api.model.IdpResult; import com.knecon.fforesight.service.ocr.v1.api.model.AzureOcrFeature; import com.knecon.fforesight.service.viewerdoc.layers.LayerGroup; import lombok.AccessLevel; import lombok.experimental.FieldDefaults; +import lombok.extern.slf4j.Slf4j; +@Slf4j @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) public class LayerFactory { @@ -29,6 +32,7 @@ public class LayerFactory { IdpLayerFactory idpLayerFactory; OcrDebugLayerFactory ocrDebugLayerFactory; OcrTextLayerFactory ocrTextLayerFactory; + IdpResultFactory idpResultFactory; OcrServiceSettings settings; Set features; Map angles; @@ -48,13 +52,13 @@ public class LayerFactory { this.features = features; this.supervisor = supervisor; this.angles = Collections.synchronizedMap(new HashMap<>()); + this.idpResultFactory = new IdpResultFactory(ocrResultPostProcessingPipeline.getResultToPageTransforms(), pageInformation, angles, features); } public void processAnalyzeResult(PageBatch batch, AnalyzeResult analyzeResult) throws InterruptedException { List results = ocrResultPostProcessingPipeline.processAnalyzeResult(analyzeResult, batch); - results.forEach(result -> angles.put(result.getPageNumber(), result.getAngle())); ocrTextLayerFactory.addWritableOcrResult(results); @@ -63,7 +67,10 @@ public class LayerFactory { ocrDebugLayerFactory.addAnalysisResult(results); } if 
(features.contains(AzureOcrFeature.IDP)) { + log.info("Batch {}: Start building IDP stuff", batch.getIndex()); idpLayerFactory.addAnalyzeResult(analyzeResult, batch); + idpResultFactory.addAnalyzeResult(analyzeResult, batch); + log.info("Batch {}: Finished building IDP stuff", batch.getIndex()); } this.supervisor.finishMappingResult(batch); @@ -82,7 +89,8 @@ public class LayerFactory { if (features.contains(AzureOcrFeature.IDP)) { debugLayers.add(idpLayerFactory.getIdpLayer()); } - return new OcrResult(List.of(ocrTextLayer), debugLayers, angles); + IdpResult idpResult = features.contains(AzureOcrFeature.IDP) ? idpResultFactory.getIdpResult() : null; + return new OcrResult(List.of(ocrTextLayer), debugLayers, angles, idpResult); } } diff --git a/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/visualizations/layers/OcrResult.java b/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/visualizations/layers/OcrResult.java index bcefda0..f57e4e9 100644 --- a/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/visualizations/layers/OcrResult.java +++ b/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/visualizations/layers/OcrResult.java @@ -3,8 +3,9 @@ package com.knecon.fforesight.service.ocr.processor.visualizations.layers; import java.util.List; import java.util.Map; +import com.knecon.fforesight.service.ocr.v1.api.model.IdpResult; import com.knecon.fforesight.service.viewerdoc.layers.LayerGroup; -public record OcrResult(List regularLayers, List debugLayers, Map anglesPerPage) { +public record OcrResult(List regularLayers, List debugLayers, Map anglesPerPage, IdpResult idpResult) { } \ No newline at end of file diff --git 
a/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/visualizations/utils/RotationCorrectionUtility.java b/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/visualizations/utils/RotationCorrectionUtility.java index bb0e37c..88371aa 100644 --- a/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/visualizations/utils/RotationCorrectionUtility.java +++ b/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/visualizations/utils/RotationCorrectionUtility.java @@ -133,22 +133,32 @@ public class RotationCorrectionUtility { public static AffineTransform buildTransform(double angle, double originalWidth, double originalHeight) { + return buildTransform(angle, originalWidth, originalHeight, true); + } + + + public static AffineTransform buildTransform(double angle, double originalWidth, double originalHeight, boolean quadrantRotation) { + int quadrants = getQuadrantRotation(angle); double h = originalHeight; double w = originalWidth; - if (quadrants == 1 || quadrants == 3) { - w = originalHeight; - h = originalWidth; - } + AffineTransform quadrantRotationTransform = new AffineTransform(); + if (quadrantRotation) { - AffineTransform quadrantRotation = switch (quadrants) { - case 1 -> new AffineTransform(0, 1, -1, 0, h, 0); - case 2 -> new AffineTransform(-1, 0, 0, -1, w, h); - case 3 -> new AffineTransform(0, -1, 1, 0, w - h, h); - default -> new AffineTransform(); - }; + if (quadrants == 1 || quadrants == 3) { + w = originalHeight; + h = originalWidth; + } + + quadrantRotationTransform = switch (quadrants) { + case 1 -> new AffineTransform(0, 1, -1, 0, h, 0); + case 2 -> new AffineTransform(-1, 0, 0, -1, w, h); + case 3 -> new AffineTransform(0, -1, 1, 0, w - h, h); + default -> new AffineTransform(); + }; + } double remainder = getRemainingAngle(angle, quadrants); 
double scale = getScalingFactor(remainder, w, h); @@ -158,7 +168,7 @@ public class RotationCorrectionUtility { transform.rotate(Math.toRadians(remainder)); transform.scale(scale, scale); transform.translate(-w / 2, -h / 2); - transform.concatenate(quadrantRotation); + transform.concatenate(quadrantRotationTransform); return transform; } diff --git a/azure-ocr-service/azure-ocr-service-server/src/main/java/com/knecon/fforesight/service/ocr/v1/server/FileStorageService.java b/azure-ocr-service/azure-ocr-service-server/src/main/java/com/knecon/fforesight/service/ocr/v1/server/FileStorageService.java index 3a53cf9..5bd6d97 100644 --- a/azure-ocr-service/azure-ocr-service-server/src/main/java/com/knecon/fforesight/service/ocr/v1/server/FileStorageService.java +++ b/azure-ocr-service/azure-ocr-service-server/src/main/java/com/knecon/fforesight/service/ocr/v1/server/FileStorageService.java @@ -33,20 +33,23 @@ public class FileStorageService { public void storeFiles(DocumentRequest request, File documentFile, File viewerDocumentFile, File analyzeResultFile) { try (var in = new FileInputStream(viewerDocumentFile)) { - if (request.optionalViewerDocumentId().isPresent()) { + if (request.optionalViewerDocumentId() + .isPresent()) { storageService.storeObject(TenantContext.getTenantId(), request.getViewerDocId(), in); } else { storageService.storeObject(TenantContext.getTenantId(), getStorageId(request.getDossierId(), request.getFileId(), FileType.VIEWER_DOCUMENT), in); } } try (var in = new FileInputStream(documentFile)) { - if (request.optionalOriginDocumentId().isPresent()) { + if (request.optionalOriginDocumentId() + .isPresent()) { storageService.storeObject(TenantContext.getTenantId(), request.getOriginDocumentId(), in); } else { storageService.storeObject(TenantContext.getTenantId(), getStorageId(request.getDossierId(), request.getFileId(), FileType.ORIGIN), in); } } - if (request.optionalIdpResultId().isPresent()) { + if (request.optionalIdpResultId() + .isPresent() && 
analyzeResultFile.exists()) { try (var in = new FileInputStream(analyzeResultFile)) { storageService.storeObject(TenantContext.getTenantId(), request.getIdpResultId(), in); } @@ -59,7 +62,8 @@ public class FileStorageService { Files.createDirectories(documentFile.getParentFile().toPath()); - String originDocumentId = request.optionalOriginDocumentId().orElse(getStorageId(request.getDossierId(), request.getFileId(), FileType.ORIGIN)); + String originDocumentId = request.optionalOriginDocumentId() + .orElse(getStorageId(request.getDossierId(), request.getFileId(), FileType.ORIGIN)); storageService.downloadTo(TenantContext.getTenantId(), originDocumentId, documentFile); diff --git a/azure-ocr-service/azure-ocr-service-server/src/main/resources/application.yml b/azure-ocr-service/azure-ocr-service-server/src/main/resources/application.yml index 25fa477..19d1431 100644 --- a/azure-ocr-service/azure-ocr-service-server/src/main/resources/application.yml +++ b/azure-ocr-service/azure-ocr-service-server/src/main/resources/application.yml @@ -63,3 +63,5 @@ azure: ocrService: sendStatusUpdates: true + +native-libs.path: ${VCPKG_DYNAMIC_LIB} \ No newline at end of file diff --git a/azure-ocr-service/azure-ocr-service-server/src/test/java/com/knecon/fforesight/service/ocr/v1/server/OcrServiceIntegrationTest.java b/azure-ocr-service/azure-ocr-service-server/src/test/java/com/knecon/fforesight/service/ocr/v1/server/OcrServiceIntegrationTest.java index cbb7236..869e02e 100644 --- a/azure-ocr-service/azure-ocr-service-server/src/test/java/com/knecon/fforesight/service/ocr/v1/server/OcrServiceIntegrationTest.java +++ b/azure-ocr-service/azure-ocr-service-server/src/test/java/com/knecon/fforesight/service/ocr/v1/server/OcrServiceIntegrationTest.java @@ -26,11 +26,11 @@ import com.knecon.fforesight.service.ocr.v1.api.model.AzureOcrFeature; import lombok.SneakyThrows; // in order to run, the azure.key must be set first in the application.yml and you must set the env variable 
VCPKG_DYNAMIC_LIB to your tesseract and leptonica installation folder @Disabled @SpringBootTest() public class OcrServiceIntegrationTest extends AbstractTest { - public static final Set FEATURES = Set.of(AzureOcrFeature.ROTATION_CORRECTION, AzureOcrFeature.FONT_STYLE_DETECTION); + public static final Set FEATURES = Set.of(AzureOcrFeature.ROTATION_CORRECTION, AzureOcrFeature.FONT_STYLE_DETECTION, AzureOcrFeature.IDP); @Autowired private OCRService ocrService; @@ -55,7 +55,7 @@ public class OcrServiceIntegrationTest extends AbstractTest { @SneakyThrows public void testOcrWithFile() { - testOCR("/home/kschuettler/Dokumente/LayoutparsingEvaluation/RAW_FILES/Difficult Headlines/VV-284053.pdf/VV-284053.pdf.ORIGIN.pdf"); + testOCR("/home/kschuettler/Dokumente/402Study.pdf"); } diff --git a/azure-ocr-service/azure-ocr-service-server/src/test/resources/application.yml b/azure-ocr-service/azure-ocr-service-server/src/test/resources/application.yml index 5ea1a41..fd3da16 100644 --- a/azure-ocr-service/azure-ocr-service-server/src/test/resources/application.yml +++ b/azure-ocr-service/azure-ocr-service-server/src/test/resources/application.yml @@ -2,8 +2,10 @@ persistence-service.url: "http://persistence-service-v1:8080" pdftron.license: demo:1650351709282:7bd235e003000000004ec28a6743e1163a085e2115de2536ab6e2cfe5a azure: - endpoint: https://ff-ocr-test.cognitiveservices.azure.com/ - key: # find key in Bitwarden under: Azure IDP Test Key + endpoint: https://ff-ocr-dev.cognitiveservices.azure.com/ + key: # find key in Bitwarden under: Azure IDP Test Key +native-libs: + logging.type: ${LOGGING_TYPE:CONSOLE} @@ -19,4 +21,5 @@ management: endpoints.web.exposure.include: prometheus, health, metrics metrics.export.prometheus.enabled: true -POD_NAME: azure-ocr-service \ No newline at end of file +POD_NAME: azure-ocr-service +native-libs.path: ${VCPKG_DYNAMIC_LIB}