diff --git a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java index dfd45f9..47926f2 100644 --- a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java +++ b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java @@ -35,6 +35,7 @@ import com.pdftron.pdf.Optimizer; import com.pdftron.pdf.PDFDoc; import com.pdftron.pdf.Page; import com.pdftron.pdf.PageIterator; +import com.pdftron.pdf.PathData; import com.pdftron.pdf.Rect; import com.pdftron.pdf.RectCollection; import com.pdftron.sdf.Obj; @@ -232,10 +233,6 @@ public class OCRService { for (Element element = reader.next(); element != null; element = reader.next()) switch (element.getType()) { - case Element.e_image: - case Element.e_inline_image: - processImage(element, writer, isInForm); - break; case Element.e_text: processText(element, writer, filledRectangles); @@ -255,18 +252,11 @@ public class OCRService { } - @SneakyThrows - private void processImage(Element element, ElementWriter writer, boolean isInForm) { - - if (!isInForm || !settings.isRemoveWatermark()) { - writer.writeElement(element); - } - } - @SneakyThrows private void processText(Element element, ElementWriter writer, Set filledRectangles) { + if (element.getBBox() == null) { writer.writeElement(element); return; @@ -282,21 +272,112 @@ public class OCRService { } }); + var gState = element.getGState(); + //See PDF Reference 5.3 Text rendering modes, 3 = Invisible, however this ocr does not use it. - if (!filledRectangleIntersection && gState.getTextRenderMode() != 3 || filledRectangleIntersection && gState.getTextRenderMode() == 0) { + if (!filledRectangleIntersection && gState.getTextRenderMode() != 3) { writer.writeElement(element); } } @SneakyThrows - private void processPath(Element element, ElementWriter writer, Set filledRectangles) { + private void processPath(Element path, ElementWriter writer, Set filledRectangles) { - writer.writeElement(element); - if (element.getPathData() != null && element.getPathData().getPoints().length > 4) { - filledRectangles.add(element.getBBox()); + System.out.println("New Path"); + + if (path.isClippingPath()) { + System.out.println(" This is a clipping path"); + } + + System.out.println("ClipWindingFill: " + path.isClipWindingFill()); + System.out.println("WindingFill: " + path.isWindingFill()); + System.out.println("OCVisible: " + path.isOCVisible()); + + System.out.println("Filled: " + path.isFilled()); + + + PathData pathData = path.getPathData(); + double[] data = pathData.getPoints(); + byte[] opr = pathData.getOperators(); + + double x1, y1, x2, y2, x3, y3; + + int data_index = 0; + for (int opr_index = 0; opr_index < opr.length; ++opr_index) { + switch (opr[opr_index]) { + case PathData.e_moveto: + x1 = data[data_index]; + ++data_index; + y1 = data[data_index]; + ++data_index; + System.out.println(" M" + x1 + " " + y1); + break; + case PathData.e_lineto: + x1 = data[data_index]; + ++data_index; + y1 = data[data_index]; + ++data_index; + System.out.println(" L" + x1 + " " + y1); + + break; + case PathData.e_cubicto: + x1 = data[data_index]; + ++data_index; + y1 = data[data_index]; + ++data_index; + x2 = data[data_index]; + ++data_index; + y2 = data[data_index]; + ++data_index; + x3 = data[data_index]; + ++data_index; + y3 = data[data_index]; + ++data_index; + System.out.println(" CU P1 " + x1 + " " + y1 + " P2 " + x2 + " " + y2 + " P3 " + x3 + " " + y3); + break; + case PathData.e_rect: { + x1 = data[data_index]; + ++data_index; + y1 = data[data_index]; + ++data_index; + double w = data[data_index]; + ++data_index; + double h = data[data_index]; + ++data_index; + x2 = x1 + w; + y2 = y1; + x3 = x2; + y3 = y1 + h; + double x4 = x1; + double y4 = y3; + System.out.println(" RE P1 " + x1 + " " + y1 + " P2 " + x2 + " " + y2 + " P3 " + x3 + " " + y3 + " P4 " + x4 + " " + y4); + } + + break; + case PathData.e_closepath: + System.out.println(" Close Path"); + break; + default: + throw new PDFNetException("Invalid Element Type", 0, "", "", ""); + } + + + } + + + + + System.out.println("End Path"); + + + + + writer.writeElement(path); + if (path.getPathData() != null && path.getPathData().getPoints().length > 4 && path.isClippingPath()) { + filledRectangles.add(path.getBBox()); } } @@ -309,6 +390,7 @@ public class OCRService { if (!visited.contains((int) formObj.getObjNum())) { visited.add((int) formObj.getObjNum()); + System.out.println("Form num:" +(int) formObj.getObjNum()); ElementWriter new_writer = new ElementWriter(); reader.formBegin(); new_writer.begin(formObj); diff --git a/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/OcrServiceIntegrationTest.java b/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/OcrServiceIntegrationTest.java index 6293a62..e8ffe00 100644 --- a/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/OcrServiceIntegrationTest.java +++ b/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/OcrServiceIntegrationTest.java @@ -81,6 +81,34 @@ public class OcrServiceIntegrationTest { } + @Test + @Disabled // OCRModule is not available on build server. If you want to run the test set the property at the top. + @SneakyThrows + public void testRemoveInvisibleText() { + + String fileName = "InvisibleText"; +// String fileName = "InvisiblePathElements"; + + ClassPathResource imageInfoResource = new ClassPathResource("files/" + fileName + ".IMAGE_INFO.json"); + ClassPathResource pdfFileResource = new ClassPathResource("files/" + fileName + ".pdf"); + + var originId = FileStorageService.getStorageId("dossier", "file", FileType.ORIGIN); + storageService.storeObject(originId, pdfFileResource.getInputStream()); + + var imageId = FileStorageService.getStorageId("dossier", "file", FileType.IMAGE_INFO); + storageService.storeObject(imageId, imageInfoResource.getInputStream()); + + var response = ocrService.ocrDocument("dossier", "file"); + + var out = FileUtils.openOutputStream(new File(getTemporaryDirectory() + "/" + fileName + ".pdf")); + IOUtils.copy(response, out); + + System.out.println("File:" + getTemporaryDirectory() + "/" + fileName + ".pdf"); + } + + + + @SneakyThrows public void dummyTest() { diff --git a/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/InvisiblePathElements.IMAGE_INFO.json b/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/InvisiblePathElements.IMAGE_INFO.json new file mode 100644 index 0000000..3a1becb --- /dev/null +++ b/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/InvisiblePathElements.IMAGE_INFO.json @@ -0,0 +1 @@ +{"dossierId": "55547c91-6b0e-4aa6-9009-2e7c4cd90f13", "fileId": "917b9d9c9f548f85fef3679db45ff46c", "targetFileExtension": "ORIGIN.pdf.gz", "responseFileExtension": "IMAGE_INFO.json.gz", "data": []} \ No newline at end of file diff --git a/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/InvisiblePathElements.pdf b/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/InvisiblePathElements.pdf new file mode 100644 index 0000000..59737aa Binary files /dev/null and b/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/InvisiblePathElements.pdf differ diff --git a/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/InvisiblePathElements34.pdf b/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/InvisiblePathElements34.pdf new file mode 100644 index 0000000..86fe3ec Binary files /dev/null and b/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/InvisiblePathElements34.pdf differ diff --git a/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/InvisiblePathElementsNoW1.pdf b/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/InvisiblePathElementsNoW1.pdf new file mode 100644 index 0000000..588eca0 Binary files /dev/null and b/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/InvisiblePathElementsNoW1.pdf differ diff --git a/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/InvisibleText.IMAGE_INFO.json b/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/InvisibleText.IMAGE_INFO.json new file mode 100644 index 0000000..3a1becb --- /dev/null +++ b/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/InvisibleText.IMAGE_INFO.json @@ -0,0 +1 @@ +{"dossierId": "55547c91-6b0e-4aa6-9009-2e7c4cd90f13", "fileId": "917b9d9c9f548f85fef3679db45ff46c", "targetFileExtension": "ORIGIN.pdf.gz", "responseFileExtension": "IMAGE_INFO.json.gz", "data": []} \ No newline at end of file diff --git a/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/InvisibleText.pdf b/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/InvisibleText.pdf new file mode 100644 index 0000000..0eb7128 Binary files /dev/null and b/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/InvisibleText.pdf differ diff --git a/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/Untouched Kopie 2A.pdf b/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/Untouched Kopie 2A.pdf new file mode 100644 index 0000000..bc6ced9 Binary files /dev/null and b/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/Untouched Kopie 2A.pdf differ diff --git a/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/abc b/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/abc new file mode 100644 index 0000000..6271fa1 --- /dev/null +++ b/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/abc @@ -0,0 +1,235 @@ + +0 TL +q + q + 150 0 m + 400 0 l + 400 300 l + 150 300 l + h + 58 50 m + 58 200 l + 570 200 l + 570 50 l + F + W + n + BT + -0.011 Tc + 0 Tw + 100 Tz + /C2_0 12 Tf + 0 Tr + 0 Ts + 485.52 66.48 Td + (\0003\000D\000J\000H\000\003) Tj + 0 Tc + 26.4 0 Td + (\000\024) Tj + 0.24 Tc + 8.88 0 Td + (\000R\000I) Tj + 0 Tc + 9.84 0 Td + [(\000\003)-10(\000\030)] TJ + 0.078 Tc + -440.88 -27.6 Td + (\0006\000W\000X\000G\000\\) Tj + -0.007 Tc + 30.96 0 Td + (\0001\000X\000P\000E\000H\000U\000\035\000\003\000\003) Tj + -0.014 Tc + 48.72 0 Td + (\0007\0000\0005\000\023\000\024\000\027\000\025) Tj + ET + q + 154.32 0 0 76.32 385.68 643.68 cm + /Im0 Do + Q + BT + 0.013 Tc + /C2_1 12 Tf + 288.96 601.44 Td + (\0006\000X\000E\000V\000W\000D\000Q\000F\000H) Tj + 0 Tc + -168.24 -27.6 Td + [(\0006\000<\0001\000\030\000\027\000\031\000\026\000\026\000\023\000\003)-10(\000\261)] TJ + ET + q + 192.96 584.88 317.52 -13.92 re + W + n + BT + -0.006 Tc + 192.96 573.84 Td + (\0007\000R\000[\000L\000F\000L\000W\000\\\000\003\0006\000W\000X\000G\000\\\000\003\000E\000\\\000\003\000'\000H\000U\000P\000D\000O\000\003\000$\000G\000P\000L\000Q\000L\000V\000W\000U\000D\000W\000L\000R\000Q\000\003\000W\000R\000\003\000+\000D\000Q\000\003\000:\000L\000V\000W\000D\000U\000\003\0005\000D\000W\000V\000\003) Tj + ET + Q + BT + -0.024 Tc + 284.4 560.16 Td + (\000I\000R\000U\000\003\000\027\000\003\000:\000H\000H\000N\000V) Tj + -0.011 Tc + -14.4 -27.6 Td + (\0003\000D\000W\000K\000R\000O\000R\000J\000\\\000\003\0005\000H\000S\000R\000U\000W) Tj + ET + 117.12 630 0.72 -3.6 re + f* + 117.12 630 2.16 -0.72 re + f* + 118.56 628.56 0.72 -2.16 re + f* + 118.56 628.56 0.72 -0.72 re + f* + 119.28 630 390.96 -0.72 re + f* + 119.28 628.56 390.96 -0.72 re + f* + 511.68 630 0.72 -3.6 re + f* + 510.24 630 2.16 -0.72 re + f* + 510.24 628.56 0.72 -2.16 re + f* + 510.24 628.56 0.72 -0.72 re + f* + 118.56 626.4 0.72 -111.84 re + f* + 117.12 626.4 0.72 -111.84 re + f* + 117.12 514.56 0.72 -2.16 re + f* + 117.12 513.12 2.16 -0.72 re + f* + 118.56 514.56 0.72 -0.72 re + f* + 118.56 514.56 0.72 -0.72 re + f* + 119.28 513.12 390.96 -0.72 re + f* + 119.28 514.56 390.96 -0.72 re + f* + 511.68 626.4 0.72 -111.84 re + f* + 510.24 626.4 0.72 -111.84 re + f* + 511.68 514.56 0.72 -2.16 re + f* + 510.24 513.12 2.16 -0.72 re + f* + 510.24 514.56 0.72 -0.72 re + f* + 510.24 514.56 0.72 -0.72 re + f* + BT + 0.011 Tc + 89.76 266.64 Td + (\000$\0008\0007\000+\0002\0005\000\013\0006\000\f\000\035) Tj + -0.012 Tc + /C2_0 12 Tf + 184.8 0.24 Td + (\0000\000L\000F\000K\000H\000O\000D\000\003\000*\000U\000H\000J\000R\000U\000L\000\003) Tj + -0.037 Tc + 82.56 0 Td + (\000'\0009\0000\000\003\0003\000K\000'\000\003) Tj + 0.101 Tc + 55.2 0 Td + (\0003\000D\000W\000K\000R\000O) Tj + -0.02 Tc + 30.72 0 Td + (\000R\000J\000L\000V\000W) Tj + -0.01 Tc + /C2_1 12 Tf + -353.28 -27.84 Td + (\000&\0002\0000\0003\000/\000\(\0007\000,\0002\0001\000\003\000'\000$\0007\000\(\000\035) Tj + -0.009 Tc + /C2_0 12 Tf + 184.8 0.24 Td + (\000\023\000\030\000\003\0000\000D\000U\000F\000K\000\003\000\025\000\023\000\024\000\033) Tj + /C2_1 12 Tf + -184.8 -27.84 Td + (\000/\000$\000%\0002\0005\000$\0007\0002\0005\000<\000\003\0003\0005\0002\000-\000\(\000&\0007\000\003\000,) Tj + -0.026 Tc + 152.4 0 Td + (\000'\000\035) Tj + 0.009 Tc + /C2_0 12 Tf + 32.4 0.24 Td + (\0005\000H\000S\000R\000U\000W\000\003\0001\000X\000P\000E) Tj + 0.021 Tc + 65.52 0 Td + (\000H\000U\000\035\000\003) Tj + -0.014 Tc + 15.84 0 Td + (\0007\0000\0005\000\023\000\024\000\027\000\025) Tj + 0.078 Tc + -81.36 -13.68 Td + (\0006\000W\000X\000G\000\\) Tj + -0.011 Tc + 27.84 0 Td + (\000\003\0001\000X\000P\000E\000H\000U\000\035) Tj + -0.014 Tc + 48.72 0 Td + (\0007\0000\0005\000\023\000\024\000\027\000\025) Tj + -0.008 Tc + -76.56 -13.92 Td + (\0007\000D\000V\000N\000\003\0001\000X\000P\000E\000H\000U\000\035\000\003) Tj + -0.019 Tc + 72 0 Td + (\0007\000.\000\023\000\025\000\024\000\023\000\024\000\031\000\033) Tj + 0 Tc + /C2_1 12 Tf + -256.8 -27.84 Td + [(\0009\0002\000/\0008\0000\000\(\000\003)-10(\000\024)] TJ + -0.052 Tc + 66.24 0 Td + (\0002\000\)\000\003) Tj + 0 Tc + 19.68 0 Td + (\000\024) Tj + -0.01 Tc + 8.88 0 Td + (\0002\000\)\000\003\0006\0007\0008\000'\000<) Tj + ET + 146.88 155.04 6 -1.2 re + f* + 175.68 155.04 5.76 -1.2 re + f* + BT + -0.053 Tc + 89.76 142.56 Td + (\0003\000$\000*\000\(\000\003) Tj + 0 Tc + 36.24 0 Td + (\000\024) Tj + -0.052 Tc + 9.12 0 Td + (\0002\000\)\000\003) Tj + 0 Tc + 19.68 0 Td + (\000\030) Tj + ET + 126 141.36 6 -1.2 re + f* + 154.8 141.36 6 -1.2 re + f* + /Artifact <> BDC + q + /G0 gs + 0.940613 0 0 0.940613 26.0628 0 cm + 0 0 0 RG + 0 w + /Fm0 Do + Q + EMC + /Artifact <> BDC + q + /G0 gs + 1 0 0 1 458.67 48.412 cm + 0 0 0 RG + 0 w + /Fm1 Do + Q + EMC + Q +Q