RED-2536: Treat \t (tab) the same as the other whitespace characters (space and \u00A0)
This commit is contained in:
parent
699cad7666
commit
ca04c78724
@ -295,7 +295,9 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
|
||||
if (i == 0 && (textPositions.get(i).getUnicode().equals(" ") || textPositions.get(i)
|
||||
.getUnicode()
|
||||
.equals("\u00A0"))) {
|
||||
.equals("\u00A0") || textPositions.get(i)
|
||||
.getUnicode()
|
||||
.equals("\t"))) {
|
||||
startIndex++;
|
||||
continue;
|
||||
}
|
||||
@ -305,7 +307,7 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
List<TextPosition> sublist = textPositions.subList(startIndex, i);
|
||||
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0)
|
||||
.getUnicode()
|
||||
.equals(" ") || sublist.get(0).getUnicode().equals("\u00A0")))) {
|
||||
.equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0).getUnicode().equals("\t")))) {
|
||||
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber));
|
||||
}
|
||||
startIndex = i;
|
||||
@ -316,7 +318,7 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
List<TextPosition> sublist = textPositions.subList(startIndex, i);
|
||||
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0)
|
||||
.getUnicode()
|
||||
.equals(" ") || sublist.get(0).getUnicode().equals("\u00A0")))) {
|
||||
.equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0).getUnicode().equals("\t")))) {
|
||||
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber));
|
||||
}
|
||||
startIndex = i;
|
||||
@ -324,11 +326,13 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
|
||||
if (i > 0 && (textPositions.get(i).getUnicode().equals(" ") || textPositions.get(i)
|
||||
.getUnicode()
|
||||
.equals("\u00A0")) && i <= textPositions.size() - 2) {
|
||||
.equals("\u00A0") || textPositions.get(i)
|
||||
.getUnicode()
|
||||
.equals("\t")) && i <= textPositions.size() - 2) {
|
||||
List<TextPosition> sublist = textPositions.subList(startIndex, i);
|
||||
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0)
|
||||
.getUnicode()
|
||||
.equals(" ") || sublist.get(0).getUnicode().equals("\u00A0")))) {
|
||||
.equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0).getUnicode().equals("\t")))) {
|
||||
|
||||
// Remove false sequence ends (whitespaces)
|
||||
if (previous != null && sublist.get(0).getYDirAdj() == previous.getYDirAdj() && sublist.get(0)
|
||||
@ -347,13 +351,15 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
List<TextPosition> sublist = textPositions.subList(startIndex, textPositions.size());
|
||||
if (!sublist.isEmpty() && (sublist.get(sublist.size() - 1)
|
||||
.getUnicode()
|
||||
.equals(" ") || sublist.get(sublist.size() - 1).getUnicode().equals("\u00A0"))) {
|
||||
.equals(" ") || sublist.get(sublist.size() - 1).getUnicode().equals("\u00A0") || sublist.get(sublist.size() - 1).getUnicode().equals("\t"))) {
|
||||
sublist = sublist.subList(0, sublist.size() - 1);
|
||||
}
|
||||
|
||||
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0)
|
||||
.getUnicode()
|
||||
.equals("\u00A0")))) {
|
||||
.equals("\u00A0") || sublist.get(0)
|
||||
.getUnicode()
|
||||
.equals("\t")))) {
|
||||
if (previous != null && sublist.get(0).getYDirAdj() == previous.getYDirAdj() && sublist.get(0)
|
||||
.getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()) < 0.01) {
|
||||
for (TextPosition t : sublist) {
|
||||
|
||||
@ -656,7 +656,7 @@ public class RedactionIntegrationTest {
|
||||
public void redactionTest() throws IOException {
|
||||
|
||||
long start = System.currentTimeMillis();
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/new/Single Study - Oral (Gavage) Mouse.pdf");
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/new/VV-919901.pdf");
|
||||
AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream());
|
||||
request.setExcludedPages(Set.of(1));
|
||||
|
||||
|
||||
@ -1,4 +1,5 @@
|
||||
Amendment 1
|
||||
Report Number: 33168
|
||||
Page
|
||||
Report Number: BFI0714
|
||||
Report Number: BFI0714
|
||||
Tesh Consultants International
|
||||
|
||||
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user