RED-6019 InvisibleText

This commit is contained in:
deiflaender 2023-01-17 09:13:23 +01:00
parent e535861da8
commit 579e6a5c67
10 changed files with 364 additions and 17 deletions

View File

@ -35,6 +35,7 @@ import com.pdftron.pdf.Optimizer;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.Page;
import com.pdftron.pdf.PageIterator;
import com.pdftron.pdf.PathData;
import com.pdftron.pdf.Rect;
import com.pdftron.pdf.RectCollection;
import com.pdftron.sdf.Obj;
@ -232,10 +233,6 @@ public class OCRService {
for (Element element = reader.next(); element != null; element = reader.next())
switch (element.getType()) {
case Element.e_image:
case Element.e_inline_image:
processImage(element, writer, isInForm);
break;
case Element.e_text:
processText(element, writer, filledRectangles);
@ -255,18 +252,11 @@ public class OCRService {
}
/**
 * Copies an image element to the writer, except when the image is inside a form and the
 * remove-watermark setting is switched on; in that case the element is dropped.
 *
 * @param element the image (or inline image) element to handle
 * @param writer receives the element when it is kept
 * @param isInForm whether the element was read from inside a form
 */
@SneakyThrows
private void processImage(Element element, ElementWriter writer, boolean isInForm) {
    // The setting is only consulted for elements inside a form (short-circuit).
    boolean dropImage = isInForm && settings.isRemoveWatermark();
    if (dropImage) {
        return;
    }
    writer.writeElement(element);
}
@SneakyThrows
private void processText(Element element, ElementWriter writer, Set<Rect> filledRectangles) {
if (element.getBBox() == null) {
writer.writeElement(element);
return;
@ -282,21 +272,112 @@ public class OCRService {
}
});
var gState = element.getGState();
// See PDF Reference 5.3 (Text rendering modes): 3 = invisible; however, this OCR does not use it.
if (!filledRectangleIntersection && gState.getTextRenderMode() != 3 || filledRectangleIntersection && gState.getTextRenderMode() == 0) {
if (!filledRectangleIntersection && gState.getTextRenderMode() != 3) {
writer.writeElement(element);
}
}
@SneakyThrows
private void processPath(Element element, ElementWriter writer, Set<Rect> filledRectangles) {
private void processPath(Element path, ElementWriter writer, Set<Rect> filledRectangles) {
writer.writeElement(element);
if (element.getPathData() != null && element.getPathData().getPoints().length > 4) {
filledRectangles.add(element.getBBox());
System.out.println("New Path");
if (path.isClippingPath()) {
System.out.println(" This is a clipping path");
}
System.out.println("ClipWindingFill: " + path.isClipWindingFill());
System.out.println("WindingFill: " + path.isWindingFill());
System.out.println("OCVisible: " + path.isOCVisible());
System.out.println("Filled: " + path.isFilled());
PathData pathData = path.getPathData();
double[] data = pathData.getPoints();
byte[] opr = pathData.getOperators();
double x1, y1, x2, y2, x3, y3;
int data_index = 0;
for (int opr_index = 0; opr_index < opr.length; ++opr_index) {
switch (opr[opr_index]) {
case PathData.e_moveto:
x1 = data[data_index];
++data_index;
y1 = data[data_index];
++data_index;
System.out.println(" M" + x1 + " " + y1);
break;
case PathData.e_lineto:
x1 = data[data_index];
++data_index;
y1 = data[data_index];
++data_index;
System.out.println(" L" + x1 + " " + y1);
break;
case PathData.e_cubicto:
x1 = data[data_index];
++data_index;
y1 = data[data_index];
++data_index;
x2 = data[data_index];
++data_index;
y2 = data[data_index];
++data_index;
x3 = data[data_index];
++data_index;
y3 = data[data_index];
++data_index;
System.out.println(" CU P1 " + x1 + " " + y1 + " P2 " + x2 + " " + y2 + " P3 " + x3 + " " + y3);
break;
case PathData.e_rect: {
x1 = data[data_index];
++data_index;
y1 = data[data_index];
++data_index;
double w = data[data_index];
++data_index;
double h = data[data_index];
++data_index;
x2 = x1 + w;
y2 = y1;
x3 = x2;
y3 = y1 + h;
double x4 = x1;
double y4 = y3;
System.out.println(" RE P1 " + x1 + " " + y1 + " P2 " + x2 + " " + y2 + " P3 " + x3 + " " + y3 + " P4 " + x4 + " " + y4);
}
break;
case PathData.e_closepath:
System.out.println(" Close Path");
break;
default:
throw new PDFNetException("Invalid Element Type", 0, "", "", "");
}
}
System.out.println("End Path");
writer.writeElement(path);
if (path.getPathData() != null && path.getPathData().getPoints().length > 4 && path.isClippingPath()) {
filledRectangles.add(path.getBBox());
}
}
@ -309,6 +390,7 @@ public class OCRService {
if (!visited.contains((int) formObj.getObjNum())) {
visited.add((int) formObj.getObjNum());
System.out.println("Form num:" +(int) formObj.getObjNum());
ElementWriter new_writer = new ElementWriter();
reader.formBegin();
new_writer.begin(formObj);

View File

@ -81,6 +81,34 @@ public class OcrServiceIntegrationTest {
}
/**
 * Stores the selected PDF fixture and its image info, runs OCR on it and writes the returned
 * document to the temporary directory so the result can be inspected manually.
 */
@Test
@Disabled // OCRModule is not available on build server. If you want to run the test set the property at the top.
@SneakyThrows
public void testRemoveInvisibleText() {

    String fileName = "InvisibleText";
    // String fileName = "InvisiblePathElements";

    ClassPathResource imageInfoResource = new ClassPathResource("files/" + fileName + ".IMAGE_INFO.json");
    ClassPathResource pdfFileResource = new ClassPathResource("files/" + fileName + ".pdf");

    var originId = FileStorageService.getStorageId("dossier", "file", FileType.ORIGIN);
    storageService.storeObject(originId, pdfFileResource.getInputStream());

    var imageId = FileStorageService.getStorageId("dossier", "file", FileType.IMAGE_INFO);
    storageService.storeObject(imageId, imageInfoResource.getInputStream());

    var response = ocrService.ocrDocument("dossier", "file");

    String outputPath = getTemporaryDirectory() + "/" + fileName + ".pdf";
    // Close the stream so the file handle is released and the PDF is fully written before
    // its location is reported.
    try (var out = FileUtils.openOutputStream(new File(outputPath))) {
        IOUtils.copy(response, out);
    }

    System.out.println("File:" + outputPath);
}
@SneakyThrows
public void dummyTest() {

View File

@ -0,0 +1 @@
{"dossierId": "55547c91-6b0e-4aa6-9009-2e7c4cd90f13", "fileId": "917b9d9c9f548f85fef3679db45ff46c", "targetFileExtension": "ORIGIN.pdf.gz", "responseFileExtension": "IMAGE_INFO.json.gz", "data": []}

View File

@ -0,0 +1 @@
{"dossierId": "55547c91-6b0e-4aa6-9009-2e7c4cd90f13", "fileId": "917b9d9c9f548f85fef3679db45ff46c", "targetFileExtension": "ORIGIN.pdf.gz", "responseFileExtension": "IMAGE_INFO.json.gz", "data": []}

View File

@ -0,0 +1,235 @@
0 TL
q
q
150 0 m
400 0 l
400 300 l
150 300 l
h
58 50 m
58 200 l
570 200 l
570 50 l
F
W
n
BT
-0.011 Tc
0 Tw
100 Tz
/C2_0 12 Tf
0 Tr
0 Ts
485.52 66.48 Td
(\0003\000D\000J\000H\000\003) Tj
0 Tc
26.4 0 Td
(\000\024) Tj
0.24 Tc
8.88 0 Td
(\000R\000I) Tj
0 Tc
9.84 0 Td
[(\000\003)-10(\000\030)] TJ
0.078 Tc
-440.88 -27.6 Td
(\0006\000W\000X\000G\000\\) Tj
-0.007 Tc
30.96 0 Td
(\0001\000X\000P\000E\000H\000U\000\035\000\003\000\003) Tj
-0.014 Tc
48.72 0 Td
(\0007\0000\0005\000\023\000\024\000\027\000\025) Tj
ET
q
154.32 0 0 76.32 385.68 643.68 cm
/Im0 Do
Q
BT
0.013 Tc
/C2_1 12 Tf
288.96 601.44 Td
(\0006\000X\000E\000V\000W\000D\000Q\000F\000H) Tj
0 Tc
-168.24 -27.6 Td
[(\0006\000<\0001\000\030\000\027\000\031\000\026\000\026\000\023\000\003)-10(\000\261)] TJ
ET
q
192.96 584.88 317.52 -13.92 re
W
n
BT
-0.006 Tc
192.96 573.84 Td
(\0007\000R\000[\000L\000F\000L\000W\000\\\000\003\0006\000W\000X\000G\000\\\000\003\000E\000\\\000\003\000'\000H\000U\000P\000D\000O\000\003\000$\000G\000P\000L\000Q\000L\000V\000W\000U\000D\000W\000L\000R\000Q\000\003\000W\000R\000\003\000+\000D\000Q\000\003\000:\000L\000V\000W\000D\000U\000\003\0005\000D\000W\000V\000\003) Tj
ET
Q
BT
-0.024 Tc
284.4 560.16 Td
(\000I\000R\000U\000\003\000\027\000\003\000:\000H\000H\000N\000V) Tj
-0.011 Tc
-14.4 -27.6 Td
(\0003\000D\000W\000K\000R\000O\000R\000J\000\\\000\003\0005\000H\000S\000R\000U\000W) Tj
ET
117.12 630 0.72 -3.6 re
f*
117.12 630 2.16 -0.72 re
f*
118.56 628.56 0.72 -2.16 re
f*
118.56 628.56 0.72 -0.72 re
f*
119.28 630 390.96 -0.72 re
f*
119.28 628.56 390.96 -0.72 re
f*
511.68 630 0.72 -3.6 re
f*
510.24 630 2.16 -0.72 re
f*
510.24 628.56 0.72 -2.16 re
f*
510.24 628.56 0.72 -0.72 re
f*
118.56 626.4 0.72 -111.84 re
f*
117.12 626.4 0.72 -111.84 re
f*
117.12 514.56 0.72 -2.16 re
f*
117.12 513.12 2.16 -0.72 re
f*
118.56 514.56 0.72 -0.72 re
f*
118.56 514.56 0.72 -0.72 re
f*
119.28 513.12 390.96 -0.72 re
f*
119.28 514.56 390.96 -0.72 re
f*
511.68 626.4 0.72 -111.84 re
f*
510.24 626.4 0.72 -111.84 re
f*
511.68 514.56 0.72 -2.16 re
f*
510.24 513.12 2.16 -0.72 re
f*
510.24 514.56 0.72 -0.72 re
f*
510.24 514.56 0.72 -0.72 re
f*
BT
0.011 Tc
89.76 266.64 Td
(\000$\0008\0007\000+\0002\0005\000\013\0006\000\f\000\035) Tj
-0.012 Tc
/C2_0 12 Tf
184.8 0.24 Td
(\0000\000L\000F\000K\000H\000O\000D\000\003\000*\000U\000H\000J\000R\000U\000L\000\003) Tj
-0.037 Tc
82.56 0 Td
(\000'\0009\0000\000\003\0003\000K\000'\000\003) Tj
0.101 Tc
55.2 0 Td
(\0003\000D\000W\000K\000R\000O) Tj
-0.02 Tc
30.72 0 Td
(\000R\000J\000L\000V\000W) Tj
-0.01 Tc
/C2_1 12 Tf
-353.28 -27.84 Td
(\000&\0002\0000\0003\000/\000\(\0007\000,\0002\0001\000\003\000'\000$\0007\000\(\000\035) Tj
-0.009 Tc
/C2_0 12 Tf
184.8 0.24 Td
(\000\023\000\030\000\003\0000\000D\000U\000F\000K\000\003\000\025\000\023\000\024\000\033) Tj
/C2_1 12 Tf
-184.8 -27.84 Td
(\000/\000$\000%\0002\0005\000$\0007\0002\0005\000<\000\003\0003\0005\0002\000-\000\(\000&\0007\000\003\000,) Tj
-0.026 Tc
152.4 0 Td
(\000'\000\035) Tj
0.009 Tc
/C2_0 12 Tf
32.4 0.24 Td
(\0005\000H\000S\000R\000U\000W\000\003\0001\000X\000P\000E) Tj
0.021 Tc
65.52 0 Td
(\000H\000U\000\035\000\003) Tj
-0.014 Tc
15.84 0 Td
(\0007\0000\0005\000\023\000\024\000\027\000\025) Tj
0.078 Tc
-81.36 -13.68 Td
(\0006\000W\000X\000G\000\\) Tj
-0.011 Tc
27.84 0 Td
(\000\003\0001\000X\000P\000E\000H\000U\000\035) Tj
-0.014 Tc
48.72 0 Td
(\0007\0000\0005\000\023\000\024\000\027\000\025) Tj
-0.008 Tc
-76.56 -13.92 Td
(\0007\000D\000V\000N\000\003\0001\000X\000P\000E\000H\000U\000\035\000\003) Tj
-0.019 Tc
72 0 Td
(\0007\000.\000\023\000\025\000\024\000\023\000\024\000\031\000\033) Tj
0 Tc
/C2_1 12 Tf
-256.8 -27.84 Td
[(\0009\0002\000/\0008\0000\000\(\000\003)-10(\000\024)] TJ
-0.052 Tc
66.24 0 Td
(\0002\000\)\000\003) Tj
0 Tc
19.68 0 Td
(\000\024) Tj
-0.01 Tc
8.88 0 Td
(\0002\000\)\000\003\0006\0007\0008\000'\000<) Tj
ET
146.88 155.04 6 -1.2 re
f*
175.68 155.04 5.76 -1.2 re
f*
BT
-0.053 Tc
89.76 142.56 Td
(\0003\000$\000*\000\(\000\003) Tj
0 Tc
36.24 0 Td
(\000\024) Tj
-0.052 Tc
9.12 0 Td
(\0002\000\)\000\003) Tj
0 Tc
19.68 0 Td
(\000\030) Tj
ET
126 141.36 6 -1.2 re
f*
154.8 141.36 6 -1.2 re
f*
/Artifact <</Subtype/Watermark/Type/Pagination>> BDC
q
/G0 gs
0.940613 0 0 0.940613 26.0628 0 cm
0 0 0 RG
0 w
/Fm0 Do
Q
EMC
/Artifact <</Contents( Page 250 of 256)/Subtype/Header/Type/Pagination>> BDC
q
/G0 gs
1 0 0 1 458.67 48.412 cm
0 0 0 RG
0 w
/Fm1 Do
Q
EMC
Q
Q