RED-4254: Added a dirty hack in the pdfbox classes to find words that contain Unicode characters mapped to 2 chars, like 'RA'

This commit is contained in:
deiflaender 2022-06-28 15:11:33 +02:00
parent 6b89572c87
commit 264d7e3a87
4 changed files with 44 additions and 7 deletions

View File

@ -301,12 +301,24 @@ class LegacyPDFStreamEngine extends PDFStreamEngine
nextY -= pageSize.getLowerLeftY();
}
processTextPosition(new TextPosition(pageRotation, pageSize.getWidth(),
pageSize.getHeight(), translatedTextRenderingMatrix, nextX, nextY,
Math.abs(dyDisplay), dxDisplay,
Math.abs(spaceWidthDisplay), unicodeMapping, new int[] { code }, font,
fontSize,
(int)(fontSize * textMatrix.getScalingFactorX())));
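// Hack: when a single character code maps to a two-character Unicode string (e.g. "RA", see unicodeProblem.pdf), emit one TextPosition per character instead of one for the whole string.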
if(unicodeMapping.length() == 2){
processTextPosition(new TextPosition(pageRotation, pageSize.getWidth(),
pageSize.getHeight(), translatedTextRenderingMatrix, nextX, nextY,
Math.abs(dyDisplay), dxDisplay,
Math.abs(spaceWidthDisplay), Character.toString(unicodeMapping.charAt(0)), new int[] { code }, font,
fontSize,
(int)(fontSize * textMatrix.getScalingFactorX())));
processTextPosition(new TextPosition(pageRotation, pageSize.getWidth(),
pageSize.getHeight(), translatedTextRenderingMatrix, nextX, nextY,
Math.abs(dyDisplay), dxDisplay,
Math.abs(spaceWidthDisplay), Character.toString(unicodeMapping.charAt(1)), new int[] { code }, font,
fontSize,
(int)(fontSize * textMatrix.getScalingFactorX())));
} else {
processTextPosition(new TextPosition(pageRotation, pageSize.getWidth(), pageSize.getHeight(), translatedTextRenderingMatrix, nextX, nextY, Math.abs(dyDisplay), dxDisplay, Math.abs(spaceWidthDisplay), unicodeMapping, new int[]{code}, font, fontSize, (int) (fontSize * textMatrix.getScalingFactorX())));
}
}
/**

View File

@ -807,6 +807,30 @@ public class RedactionIntegrationTest {
}
/**
 * Runs structure analysis, analysis and annotation for
 * {@code files/new/unicodeProblem.pdf}, writes the annotated document to
 * {@code Annotated.pdf} in the temporary directory, and prints the elapsed
 * time and the page count.
 *
 * <p>The test contains no assertions: it passes as long as no step throws.
 * The written PDF is meant for manual inspection.
 *
 * @throws IOException if the annotated document cannot be written
 */
@Test
public void testUnicodeProblem() throws IOException {

    // Wall-clock start; covers everything up to and including the file write.
    final long startedAt = System.currentTimeMillis();

    AnalyzeRequest analyzeRequest = prepareStorage("files/new/unicodeProblem.pdf");

    // Structure analysis is invoked before the actual analysis, using the ids of the prepared request.
    StructureAnalyzeRequest structureRequest = new StructureAnalyzeRequest(analyzeRequest.getDossierId(), analyzeRequest.getFileId());
    analyzeService.analyzeDocumentStructure(structureRequest);
    AnalyzeResult analysis = analyzeService.analyze(analyzeRequest);

    // NOTE(review): annotation uses the TEST_* constants rather than the ids from analyzeRequest;
    // presumably prepareStorage() stores the file under those same ids - confirm.
    AnnotateResponse annotated = annotationService.annotate(
            AnnotateRequest.builder().dossierId(TEST_DOSSIER_ID).fileId(TEST_FILE_ID).build());

    String annotatedPdfPath = OsUtils.getTemporaryDirectory() + "/Annotated.pdf";
    try (FileOutputStream out = new FileOutputStream(annotatedPdfPath)) {
        out.write(annotated.getDocument());
    }

    long elapsedMillis = System.currentTimeMillis() - startedAt;
    System.out.println("duration: " + elapsedMillis);
    System.out.println("numberOfPages: " + analysis.getNumberOfPages());
}
@Test
public void testRotations() throws IOException {

View File

@ -13,3 +13,4 @@ Dorn
Prasher
David
annotation
J.B. RASCLE