RED-2536: Treat \t same as whitespace

This commit is contained in:
Dominique Eifländer 2021-10-28 12:42:18 +02:00
parent 699cad7666
commit ca04c78724
4 changed files with 16 additions and 9 deletions

View File

@ -295,7 +295,9 @@ public class PDFLinesTextStripper extends PDFTextStripper {
if (i == 0 && (textPositions.get(i).getUnicode().equals(" ") || textPositions.get(i)
.getUnicode()
.equals("\u00A0"))) {
.equals("\u00A0") || textPositions.get(i)
.getUnicode()
.equals("\t"))) {
startIndex++;
continue;
}
@ -305,7 +307,7 @@ public class PDFLinesTextStripper extends PDFTextStripper {
List<TextPosition> sublist = textPositions.subList(startIndex, i);
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0)
.getUnicode()
.equals(" ") || sublist.get(0).getUnicode().equals("\u00A0")))) {
.equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0).getUnicode().equals("\t")))) {
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber));
}
startIndex = i;
@ -316,7 +318,7 @@ public class PDFLinesTextStripper extends PDFTextStripper {
List<TextPosition> sublist = textPositions.subList(startIndex, i);
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0)
.getUnicode()
.equals(" ") || sublist.get(0).getUnicode().equals("\u00A0")))) {
.equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0).getUnicode().equals("\t")))) {
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber));
}
startIndex = i;
@ -324,11 +326,13 @@ public class PDFLinesTextStripper extends PDFTextStripper {
if (i > 0 && (textPositions.get(i).getUnicode().equals(" ") || textPositions.get(i)
.getUnicode()
.equals("\u00A0")) && i <= textPositions.size() - 2) {
.equals("\u00A0") || textPositions.get(i)
.getUnicode()
.equals("\t")) && i <= textPositions.size() - 2) {
List<TextPosition> sublist = textPositions.subList(startIndex, i);
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0)
.getUnicode()
.equals(" ") || sublist.get(0).getUnicode().equals("\u00A0")))) {
.equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0).getUnicode().equals("\t")))) {
// Remove false sequence ends (whitespaces)
if (previous != null && sublist.get(0).getYDirAdj() == previous.getYDirAdj() && sublist.get(0)
@ -347,13 +351,15 @@ public class PDFLinesTextStripper extends PDFTextStripper {
List<TextPosition> sublist = textPositions.subList(startIndex, textPositions.size());
if (!sublist.isEmpty() && (sublist.get(sublist.size() - 1)
.getUnicode()
.equals(" ") || sublist.get(sublist.size() - 1).getUnicode().equals("\u00A0"))) {
.equals(" ") || sublist.get(sublist.size() - 1).getUnicode().equals("\u00A0") || sublist.get(sublist.size() - 1).getUnicode().equals("\t"))) {
sublist = sublist.subList(0, sublist.size() - 1);
}
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0)
.getUnicode()
.equals("\u00A0")))) {
.equals("\u00A0") || sublist.get(0)
.getUnicode()
.equals("\t")))) {
if (previous != null && sublist.get(0).getYDirAdj() == previous.getYDirAdj() && sublist.get(0)
.getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()) < 0.01) {
for (TextPosition t : sublist) {

View File

@ -656,7 +656,7 @@ public class RedactionIntegrationTest {
public void redactionTest() throws IOException {
long start = System.currentTimeMillis();
ClassPathResource pdfFileResource = new ClassPathResource("files/new/Single Study - Oral (Gavage) Mouse.pdf");
ClassPathResource pdfFileResource = new ClassPathResource("files/new/VV-919901.pdf");
AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream());
request.setExcludedPages(Set.of(1));

View File

@ -1,4 +1,5 @@
Amendment 1
Report Number: 33168
Page
Report Number: BFI0714
Report Number: BFI0714
Tesh Consultants International