RED-6019 InvisibleText
This commit is contained in:
parent
e535861da8
commit
579e6a5c67
@ -35,6 +35,7 @@ import com.pdftron.pdf.Optimizer;
|
||||
import com.pdftron.pdf.PDFDoc;
|
||||
import com.pdftron.pdf.Page;
|
||||
import com.pdftron.pdf.PageIterator;
|
||||
import com.pdftron.pdf.PathData;
|
||||
import com.pdftron.pdf.Rect;
|
||||
import com.pdftron.pdf.RectCollection;
|
||||
import com.pdftron.sdf.Obj;
|
||||
@ -232,10 +233,6 @@ public class OCRService {
|
||||
for (Element element = reader.next(); element != null; element = reader.next())
|
||||
|
||||
switch (element.getType()) {
|
||||
case Element.e_image:
|
||||
case Element.e_inline_image:
|
||||
processImage(element, writer, isInForm);
|
||||
break;
|
||||
|
||||
case Element.e_text:
|
||||
processText(element, writer, filledRectangles);
|
||||
@ -255,18 +252,11 @@ public class OCRService {
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void processImage(Element element, ElementWriter writer, boolean isInForm) {
|
||||
|
||||
if (!isInForm || !settings.isRemoveWatermark()) {
|
||||
writer.writeElement(element);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void processText(Element element, ElementWriter writer, Set<Rect> filledRectangles) {
|
||||
|
||||
|
||||
if (element.getBBox() == null) {
|
||||
writer.writeElement(element);
|
||||
return;
|
||||
@ -282,21 +272,112 @@ public class OCRService {
|
||||
}
|
||||
});
|
||||
|
||||
|
||||
var gState = element.getGState();
|
||||
|
||||
|
||||
//See PDF Reference 5.3 Text rendering modes, 3 = Invisible, however this ocr does not use it.
|
||||
if (!filledRectangleIntersection && gState.getTextRenderMode() != 3 || filledRectangleIntersection && gState.getTextRenderMode() == 0) {
|
||||
if (!filledRectangleIntersection && gState.getTextRenderMode() != 3) {
|
||||
writer.writeElement(element);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void processPath(Element element, ElementWriter writer, Set<Rect> filledRectangles) {
|
||||
private void processPath(Element path, ElementWriter writer, Set<Rect> filledRectangles) {
|
||||
|
||||
writer.writeElement(element);
|
||||
if (element.getPathData() != null && element.getPathData().getPoints().length > 4) {
|
||||
filledRectangles.add(element.getBBox());
|
||||
System.out.println("New Path");
|
||||
|
||||
if (path.isClippingPath()) {
|
||||
System.out.println(" This is a clipping path");
|
||||
}
|
||||
|
||||
System.out.println("ClipWindingFill: " + path.isClipWindingFill());
|
||||
System.out.println("WindingFill: " + path.isWindingFill());
|
||||
System.out.println("OCVisible: " + path.isOCVisible());
|
||||
|
||||
System.out.println("Filled: " + path.isFilled());
|
||||
|
||||
|
||||
PathData pathData = path.getPathData();
|
||||
double[] data = pathData.getPoints();
|
||||
byte[] opr = pathData.getOperators();
|
||||
|
||||
double x1, y1, x2, y2, x3, y3;
|
||||
|
||||
int data_index = 0;
|
||||
for (int opr_index = 0; opr_index < opr.length; ++opr_index) {
|
||||
switch (opr[opr_index]) {
|
||||
case PathData.e_moveto:
|
||||
x1 = data[data_index];
|
||||
++data_index;
|
||||
y1 = data[data_index];
|
||||
++data_index;
|
||||
System.out.println(" M" + x1 + " " + y1);
|
||||
break;
|
||||
case PathData.e_lineto:
|
||||
x1 = data[data_index];
|
||||
++data_index;
|
||||
y1 = data[data_index];
|
||||
++data_index;
|
||||
System.out.println(" L" + x1 + " " + y1);
|
||||
|
||||
break;
|
||||
case PathData.e_cubicto:
|
||||
x1 = data[data_index];
|
||||
++data_index;
|
||||
y1 = data[data_index];
|
||||
++data_index;
|
||||
x2 = data[data_index];
|
||||
++data_index;
|
||||
y2 = data[data_index];
|
||||
++data_index;
|
||||
x3 = data[data_index];
|
||||
++data_index;
|
||||
y3 = data[data_index];
|
||||
++data_index;
|
||||
System.out.println(" CU P1 " + x1 + " " + y1 + " P2 " + x2 + " " + y2 + " P3 " + x3 + " " + y3);
|
||||
break;
|
||||
case PathData.e_rect: {
|
||||
x1 = data[data_index];
|
||||
++data_index;
|
||||
y1 = data[data_index];
|
||||
++data_index;
|
||||
double w = data[data_index];
|
||||
++data_index;
|
||||
double h = data[data_index];
|
||||
++data_index;
|
||||
x2 = x1 + w;
|
||||
y2 = y1;
|
||||
x3 = x2;
|
||||
y3 = y1 + h;
|
||||
double x4 = x1;
|
||||
double y4 = y3;
|
||||
System.out.println(" RE P1 " + x1 + " " + y1 + " P2 " + x2 + " " + y2 + " P3 " + x3 + " " + y3 + " P4 " + x4 + " " + y4);
|
||||
}
|
||||
|
||||
break;
|
||||
case PathData.e_closepath:
|
||||
System.out.println(" Close Path");
|
||||
break;
|
||||
default:
|
||||
throw new PDFNetException("Invalid Element Type", 0, "", "", "");
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
System.out.println("End Path");
|
||||
|
||||
|
||||
|
||||
|
||||
writer.writeElement(path);
|
||||
if (path.getPathData() != null && path.getPathData().getPoints().length > 4 && path.isClippingPath()) {
|
||||
filledRectangles.add(path.getBBox());
|
||||
}
|
||||
}
|
||||
|
||||
@ -309,6 +390,7 @@ public class OCRService {
|
||||
|
||||
if (!visited.contains((int) formObj.getObjNum())) {
|
||||
visited.add((int) formObj.getObjNum());
|
||||
System.out.println("Form num:" +(int) formObj.getObjNum());
|
||||
ElementWriter new_writer = new ElementWriter();
|
||||
reader.formBegin();
|
||||
new_writer.begin(formObj);
|
||||
|
||||
@ -81,6 +81,34 @@ public class OcrServiceIntegrationTest {
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@Disabled // OCRModule is not available on build server. If you want to run the test set the property at the top.
|
||||
@SneakyThrows
|
||||
public void testRemoveInvisibleText() {
|
||||
|
||||
String fileName = "InvisibleText";
|
||||
// String fileName = "InvisiblePathElements";
|
||||
|
||||
ClassPathResource imageInfoResource = new ClassPathResource("files/" + fileName + ".IMAGE_INFO.json");
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/" + fileName + ".pdf");
|
||||
|
||||
var originId = FileStorageService.getStorageId("dossier", "file", FileType.ORIGIN);
|
||||
storageService.storeObject(originId, pdfFileResource.getInputStream());
|
||||
|
||||
var imageId = FileStorageService.getStorageId("dossier", "file", FileType.IMAGE_INFO);
|
||||
storageService.storeObject(imageId, imageInfoResource.getInputStream());
|
||||
|
||||
var response = ocrService.ocrDocument("dossier", "file");
|
||||
|
||||
var out = FileUtils.openOutputStream(new File(getTemporaryDirectory() + "/" + fileName + ".pdf"));
|
||||
IOUtils.copy(response, out);
|
||||
|
||||
System.out.println("File:" + getTemporaryDirectory() + "/" + fileName + ".pdf");
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public void dummyTest() {
|
||||
|
||||
|
||||
@ -0,0 +1 @@
|
||||
{"dossierId": "55547c91-6b0e-4aa6-9009-2e7c4cd90f13", "fileId": "917b9d9c9f548f85fef3679db45ff46c", "targetFileExtension": "ORIGIN.pdf.gz", "responseFileExtension": "IMAGE_INFO.json.gz", "data": []}
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -0,0 +1 @@
|
||||
{"dossierId": "55547c91-6b0e-4aa6-9009-2e7c4cd90f13", "fileId": "917b9d9c9f548f85fef3679db45ff46c", "targetFileExtension": "ORIGIN.pdf.gz", "responseFileExtension": "IMAGE_INFO.json.gz", "data": []}
|
||||
Binary file not shown.
Binary file not shown.
@ -0,0 +1,235 @@
|
||||
|
||||
0 TL
|
||||
q
|
||||
q
|
||||
150 0 m
|
||||
400 0 l
|
||||
400 300 l
|
||||
150 300 l
|
||||
h
|
||||
58 50 m
|
||||
58 200 l
|
||||
570 200 l
|
||||
570 50 l
|
||||
F
|
||||
W
|
||||
n
|
||||
BT
|
||||
-0.011 Tc
|
||||
0 Tw
|
||||
100 Tz
|
||||
/C2_0 12 Tf
|
||||
0 Tr
|
||||
0 Ts
|
||||
485.52 66.48 Td
|
||||
(\0003\000D\000J\000H\000\003) Tj
|
||||
0 Tc
|
||||
26.4 0 Td
|
||||
(\000\024) Tj
|
||||
0.24 Tc
|
||||
8.88 0 Td
|
||||
(\000R\000I) Tj
|
||||
0 Tc
|
||||
9.84 0 Td
|
||||
[(\000\003)-10(\000\030)] TJ
|
||||
0.078 Tc
|
||||
-440.88 -27.6 Td
|
||||
(\0006\000W\000X\000G\000\\) Tj
|
||||
-0.007 Tc
|
||||
30.96 0 Td
|
||||
(\0001\000X\000P\000E\000H\000U\000\035\000\003\000\003) Tj
|
||||
-0.014 Tc
|
||||
48.72 0 Td
|
||||
(\0007\0000\0005\000\023\000\024\000\027\000\025) Tj
|
||||
ET
|
||||
q
|
||||
154.32 0 0 76.32 385.68 643.68 cm
|
||||
/Im0 Do
|
||||
Q
|
||||
BT
|
||||
0.013 Tc
|
||||
/C2_1 12 Tf
|
||||
288.96 601.44 Td
|
||||
(\0006\000X\000E\000V\000W\000D\000Q\000F\000H) Tj
|
||||
0 Tc
|
||||
-168.24 -27.6 Td
|
||||
[(\0006\000<\0001\000\030\000\027\000\031\000\026\000\026\000\023\000\003)-10(\000\261)] TJ
|
||||
ET
|
||||
q
|
||||
192.96 584.88 317.52 -13.92 re
|
||||
W
|
||||
n
|
||||
BT
|
||||
-0.006 Tc
|
||||
192.96 573.84 Td
|
||||
(\0007\000R\000[\000L\000F\000L\000W\000\\\000\003\0006\000W\000X\000G\000\\\000\003\000E\000\\\000\003\000'\000H\000U\000P\000D\000O\000\003\000$\000G\000P\000L\000Q\000L\000V\000W\000U\000D\000W\000L\000R\000Q\000\003\000W\000R\000\003\000+\000D\000Q\000\003\000:\000L\000V\000W\000D\000U\000\003\0005\000D\000W\000V\000\003) Tj
|
||||
ET
|
||||
Q
|
||||
BT
|
||||
-0.024 Tc
|
||||
284.4 560.16 Td
|
||||
(\000I\000R\000U\000\003\000\027\000\003\000:\000H\000H\000N\000V) Tj
|
||||
-0.011 Tc
|
||||
-14.4 -27.6 Td
|
||||
(\0003\000D\000W\000K\000R\000O\000R\000J\000\\\000\003\0005\000H\000S\000R\000U\000W) Tj
|
||||
ET
|
||||
117.12 630 0.72 -3.6 re
|
||||
f*
|
||||
117.12 630 2.16 -0.72 re
|
||||
f*
|
||||
118.56 628.56 0.72 -2.16 re
|
||||
f*
|
||||
118.56 628.56 0.72 -0.72 re
|
||||
f*
|
||||
119.28 630 390.96 -0.72 re
|
||||
f*
|
||||
119.28 628.56 390.96 -0.72 re
|
||||
f*
|
||||
511.68 630 0.72 -3.6 re
|
||||
f*
|
||||
510.24 630 2.16 -0.72 re
|
||||
f*
|
||||
510.24 628.56 0.72 -2.16 re
|
||||
f*
|
||||
510.24 628.56 0.72 -0.72 re
|
||||
f*
|
||||
118.56 626.4 0.72 -111.84 re
|
||||
f*
|
||||
117.12 626.4 0.72 -111.84 re
|
||||
f*
|
||||
117.12 514.56 0.72 -2.16 re
|
||||
f*
|
||||
117.12 513.12 2.16 -0.72 re
|
||||
f*
|
||||
118.56 514.56 0.72 -0.72 re
|
||||
f*
|
||||
118.56 514.56 0.72 -0.72 re
|
||||
f*
|
||||
119.28 513.12 390.96 -0.72 re
|
||||
f*
|
||||
119.28 514.56 390.96 -0.72 re
|
||||
f*
|
||||
511.68 626.4 0.72 -111.84 re
|
||||
f*
|
||||
510.24 626.4 0.72 -111.84 re
|
||||
f*
|
||||
511.68 514.56 0.72 -2.16 re
|
||||
f*
|
||||
510.24 513.12 2.16 -0.72 re
|
||||
f*
|
||||
510.24 514.56 0.72 -0.72 re
|
||||
f*
|
||||
510.24 514.56 0.72 -0.72 re
|
||||
f*
|
||||
BT
|
||||
0.011 Tc
|
||||
89.76 266.64 Td
|
||||
(\000$\0008\0007\000+\0002\0005\000\013\0006\000\f\000\035) Tj
|
||||
-0.012 Tc
|
||||
/C2_0 12 Tf
|
||||
184.8 0.24 Td
|
||||
(\0000\000L\000F\000K\000H\000O\000D\000\003\000*\000U\000H\000J\000R\000U\000L\000\003) Tj
|
||||
-0.037 Tc
|
||||
82.56 0 Td
|
||||
(\000'\0009\0000\000\003\0003\000K\000'\000\003) Tj
|
||||
0.101 Tc
|
||||
55.2 0 Td
|
||||
(\0003\000D\000W\000K\000R\000O) Tj
|
||||
-0.02 Tc
|
||||
30.72 0 Td
|
||||
(\000R\000J\000L\000V\000W) Tj
|
||||
-0.01 Tc
|
||||
/C2_1 12 Tf
|
||||
-353.28 -27.84 Td
|
||||
(\000&\0002\0000\0003\000/\000\(\0007\000,\0002\0001\000\003\000'\000$\0007\000\(\000\035) Tj
|
||||
-0.009 Tc
|
||||
/C2_0 12 Tf
|
||||
184.8 0.24 Td
|
||||
(\000\023\000\030\000\003\0000\000D\000U\000F\000K\000\003\000\025\000\023\000\024\000\033) Tj
|
||||
/C2_1 12 Tf
|
||||
-184.8 -27.84 Td
|
||||
(\000/\000$\000%\0002\0005\000$\0007\0002\0005\000<\000\003\0003\0005\0002\000-\000\(\000&\0007\000\003\000,) Tj
|
||||
-0.026 Tc
|
||||
152.4 0 Td
|
||||
(\000'\000\035) Tj
|
||||
0.009 Tc
|
||||
/C2_0 12 Tf
|
||||
32.4 0.24 Td
|
||||
(\0005\000H\000S\000R\000U\000W\000\003\0001\000X\000P\000E) Tj
|
||||
0.021 Tc
|
||||
65.52 0 Td
|
||||
(\000H\000U\000\035\000\003) Tj
|
||||
-0.014 Tc
|
||||
15.84 0 Td
|
||||
(\0007\0000\0005\000\023\000\024\000\027\000\025) Tj
|
||||
0.078 Tc
|
||||
-81.36 -13.68 Td
|
||||
(\0006\000W\000X\000G\000\\) Tj
|
||||
-0.011 Tc
|
||||
27.84 0 Td
|
||||
(\000\003\0001\000X\000P\000E\000H\000U\000\035) Tj
|
||||
-0.014 Tc
|
||||
48.72 0 Td
|
||||
(\0007\0000\0005\000\023\000\024\000\027\000\025) Tj
|
||||
-0.008 Tc
|
||||
-76.56 -13.92 Td
|
||||
(\0007\000D\000V\000N\000\003\0001\000X\000P\000E\000H\000U\000\035\000\003) Tj
|
||||
-0.019 Tc
|
||||
72 0 Td
|
||||
(\0007\000.\000\023\000\025\000\024\000\023\000\024\000\031\000\033) Tj
|
||||
0 Tc
|
||||
/C2_1 12 Tf
|
||||
-256.8 -27.84 Td
|
||||
[(\0009\0002\000/\0008\0000\000\(\000\003)-10(\000\024)] TJ
|
||||
-0.052 Tc
|
||||
66.24 0 Td
|
||||
(\0002\000\)\000\003) Tj
|
||||
0 Tc
|
||||
19.68 0 Td
|
||||
(\000\024) Tj
|
||||
-0.01 Tc
|
||||
8.88 0 Td
|
||||
(\0002\000\)\000\003\0006\0007\0008\000'\000<) Tj
|
||||
ET
|
||||
146.88 155.04 6 -1.2 re
|
||||
f*
|
||||
175.68 155.04 5.76 -1.2 re
|
||||
f*
|
||||
BT
|
||||
-0.053 Tc
|
||||
89.76 142.56 Td
|
||||
(\0003\000$\000*\000\(\000\003) Tj
|
||||
0 Tc
|
||||
36.24 0 Td
|
||||
(\000\024) Tj
|
||||
-0.052 Tc
|
||||
9.12 0 Td
|
||||
(\0002\000\)\000\003) Tj
|
||||
0 Tc
|
||||
19.68 0 Td
|
||||
(\000\030) Tj
|
||||
ET
|
||||
126 141.36 6 -1.2 re
|
||||
f*
|
||||
154.8 141.36 6 -1.2 re
|
||||
f*
|
||||
/Artifact <</Subtype/Watermark/Type/Pagination>> BDC
|
||||
q
|
||||
/G0 gs
|
||||
0.940613 0 0 0.940613 26.0628 0 cm
|
||||
0 0 0 RG
|
||||
0 w
|
||||
/Fm0 Do
|
||||
Q
|
||||
EMC
|
||||
/Artifact <</Contents( Page 250 of 256)/Subtype/Header/Type/Pagination>> BDC
|
||||
q
|
||||
/G0 gs
|
||||
1 0 0 1 458.67 48.412 cm
|
||||
0 0 0 RG
|
||||
0 w
|
||||
/Fm1 Do
|
||||
Q
|
||||
EMC
|
||||
Q
|
||||
Q
|
||||
Loading…
x
Reference in New Issue
Block a user