diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/LegacyPDFStreamEngine.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/LegacyPDFStreamEngine.java
index 4bc6d0a0..5c6fea21 100644
--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/LegacyPDFStreamEngine.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/LegacyPDFStreamEngine.java
@@ -301,12 +301,29 @@ class LegacyPDFStreamEngine extends PDFStreamEngine
             nextY -= pageSize.getLowerLeftY();
         }
 
-        processTextPosition(new TextPosition(pageRotation, pageSize.getWidth(),
-                pageSize.getHeight(), translatedTextRenderingMatrix, nextX, nextY,
-                Math.abs(dyDisplay), dxDisplay,
-                Math.abs(spaceWidthDisplay), unicodeMapping, new int[] { code }, font,
-                fontSize,
-                (int)(fontSize * textMatrix.getScalingFactorX())));
+        // This is a hack for a glyph code whose unicode mapping consists of two characters, e.g. "RA"
+        // (see unicodeProblem.pdf): one TextPosition is emitted per character, both built from the
+        // same matrix, position, size and glyph code.
+        // A surrogate pair also has length() == 2 but encodes a single code point, so it is left
+        // intact; splitting it would produce two lone surrogates.
+        if (unicodeMapping.length() == 2
+                && !Character.isSurrogatePair(unicodeMapping.charAt(0), unicodeMapping.charAt(1))) {
+            processTextPosition(new TextPosition(pageRotation, pageSize.getWidth(),
+                    pageSize.getHeight(), translatedTextRenderingMatrix, nextX, nextY,
+                    Math.abs(dyDisplay), dxDisplay,
+                    Math.abs(spaceWidthDisplay), Character.toString(unicodeMapping.charAt(0)), new int[] { code }, font,
+                    fontSize,
+                    (int)(fontSize * textMatrix.getScalingFactorX())));
+            processTextPosition(new TextPosition(pageRotation, pageSize.getWidth(),
+                    pageSize.getHeight(), translatedTextRenderingMatrix, nextX, nextY,
+                    Math.abs(dyDisplay), dxDisplay,
+                    Math.abs(spaceWidthDisplay), Character.toString(unicodeMapping.charAt(1)), new int[] { code }, font,
+                    fontSize,
+                    (int)(fontSize * textMatrix.getScalingFactorX())));
+        } else {
+
+            processTextPosition(new TextPosition(pageRotation, pageSize.getWidth(), pageSize.getHeight(), translatedTextRenderingMatrix, nextX, nextY, Math.abs(dyDisplay), dxDisplay, Math.abs(spaceWidthDisplay), unicodeMapping, new int[]{code}, font, fontSize, (int) (fontSize * textMatrix.getScalingFactorX())));
+        }
     }
 
     /**
diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java
index bd1029ac..a1ee95aa 100644
--- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java
@@ -807,6 +807,35 @@ public class RedactionIntegrationTest {
     }
 
 
+    /**
+     * Smoke test for files/new/unicodeProblem.pdf: runs structure analysis, analysis and annotation,
+     * writes the returned document to the temporary directory as Annotated.pdf and prints the
+     * duration and page count. NOTE(review): there are no assertions, so it only fails if a call throws.
+     */
+    @Test
+    public void testUnicodeProblem() throws IOException {
+
+        long start = System.currentTimeMillis();
+
+        AnalyzeRequest request = prepareStorage("files/new/unicodeProblem.pdf");
+        analyzeService.analyzeDocumentStructure(new StructureAnalyzeRequest(request.getDossierId(), request.getFileId()));
+        AnalyzeResult result = analyzeService.analyze(request);
+
+        AnnotateResponse annotateResponse = annotationService.annotate(AnnotateRequest.builder()
+                .dossierId(TEST_DOSSIER_ID)
+                .fileId(TEST_FILE_ID)
+                .build());
+
+        try (FileOutputStream fileOutputStream = new FileOutputStream(OsUtils.getTemporaryDirectory() + "/Annotated.pdf")) {
+            fileOutputStream.write(annotateResponse.getDocument());
+        }
+        long end = System.currentTimeMillis();
+
+        System.out.println("duration: " + (end - start));
+        System.out.println("numberOfPages: " + result.getNumberOfPages());
+    }
+
+
     @Test
     public void testRotations() throws IOException {
 
diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/dictionaries/PII.txt b/redaction-service-v1/redaction-service-server-v1/src/test/resources/dictionaries/PII.txt
index d3d1ad7d..182a08a0 100644
--- a/redaction-service-v1/redaction-service-server-v1/src/test/resources/dictionaries/PII.txt
+++ b/redaction-service-v1/redaction-service-server-v1/src/test/resources/dictionaries/PII.txt
@@ -12,4 +12,5 @@ Xinyi Y. Tao
 Dorn
 Prasher
 David
-annotation
\ No newline at end of file
+annotation
+J.B. RASCLE
\ No newline at end of file
diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/new/unicodeProblem.pdf b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/new/unicodeProblem.pdf
new file mode 100644
index 00000000..543cc6a3
Binary files /dev/null and b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/new/unicodeProblem.pdf differ