diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/PDFLinesTextStripper.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/PDFLinesTextStripper.java index d68a1f97..95fdad6a 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/PDFLinesTextStripper.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/PDFLinesTextStripper.java @@ -295,7 +295,9 @@ public class PDFLinesTextStripper extends PDFTextStripper { if (i == 0 && (textPositions.get(i).getUnicode().equals(" ") || textPositions.get(i) .getUnicode() - .equals("\u00A0"))) { + .equals("\u00A0") || textPositions.get(i) + .getUnicode() + .equals("\t"))) { startIndex++; continue; } @@ -305,7 +307,7 @@ public class PDFLinesTextStripper extends PDFTextStripper { List sublist = textPositions.subList(startIndex, i); if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0) .getUnicode() - .equals(" ") || sublist.get(0).getUnicode().equals("\u00A0")))) { + .equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0).getUnicode().equals("\t")))) { textPositionSequences.add(new TextPositionSequence(sublist, pageNumber)); } startIndex = i; @@ -316,7 +318,7 @@ public class PDFLinesTextStripper extends PDFTextStripper { List sublist = textPositions.subList(startIndex, i); if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0) .getUnicode() - .equals(" ") || sublist.get(0).getUnicode().equals("\u00A0")))) { + .equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0).getUnicode().equals("\t")))) { textPositionSequences.add(new TextPositionSequence(sublist, pageNumber)); } startIndex = i; @@ -324,11 +326,13 @@ public class PDFLinesTextStripper extends PDFTextStripper { if (i > 0 && (textPositions.get(i).getUnicode().equals(" ") || textPositions.get(i) .getUnicode() - .equals("\u00A0")) && i <= textPositions.size() - 2) { + .equals("\u00A0") || textPositions.get(i) + .getUnicode() + .equals("\t")) && i <= textPositions.size() - 2) { List sublist = textPositions.subList(startIndex, i); if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0) .getUnicode() - .equals(" ") || sublist.get(0).getUnicode().equals("\u00A0")))) { + .equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0).getUnicode().equals("\t")))) { // Remove false sequence ends (whitespaces) if (previous != null && sublist.get(0).getYDirAdj() == previous.getYDirAdj() && sublist.get(0) @@ -347,13 +351,15 @@ public class PDFLinesTextStripper extends PDFTextStripper { List sublist = textPositions.subList(startIndex, textPositions.size()); if (!sublist.isEmpty() && (sublist.get(sublist.size() - 1) .getUnicode() - .equals(" ") || sublist.get(sublist.size() - 1).getUnicode().equals("\u00A0"))) { + .equals(" ") || sublist.get(sublist.size() - 1).getUnicode().equals("\u00A0") || sublist.get(sublist.size() - 1).getUnicode().equals("\t"))) { sublist = sublist.subList(0, sublist.size() - 1); } if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0) .getUnicode() - .equals("\u00A0")))) { + .equals("\u00A0") || sublist.get(0) + .getUnicode() + .equals("\t")))) { if (previous != null && sublist.get(0).getYDirAdj() == previous.getYDirAdj() && sublist.get(0) .getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()) < 0.01) { for (TextPosition t : sublist) { diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java index 6949ff40..8af6fa0b 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java @@ -656,7 +656,7 @@ public class RedactionIntegrationTest { public void redactionTest() throws IOException { long start = System.currentTimeMillis(); - ClassPathResource pdfFileResource = new ClassPathResource("files/new/Single Study - Oral (Gavage) Mouse.pdf"); + ClassPathResource pdfFileResource = new ClassPathResource("files/new/VV-919901.pdf"); AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream()); request.setExcludedPages(Set.of(1)); diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/dictionaries/PII.txt b/redaction-service-v1/redaction-service-server-v1/src/test/resources/dictionaries/PII.txt index 303a4a54..6c81517d 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/resources/dictionaries/PII.txt +++ b/redaction-service-v1/redaction-service-server-v1/src/test/resources/dictionaries/PII.txt @@ -1,4 +1,5 @@ Amendment 1 Report Number: 33168 Page -Report Number: BFI0714 \ No newline at end of file +Report Number: BFI0714 +Tesh Consultants International diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/new/VV-919901.pdf b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/new/VV-919901.pdf new file mode 100644 index 00000000..48345622 Binary files /dev/null and b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/new/VV-919901.pdf differ