From 35dec94ccdfc569f24d6983c5b2810dfd0f80e7c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dominique=20Eifl=C3=A4nder?= Date: Fri, 28 May 2021 13:42:36 +0200 Subject: [PATCH] Fixed missing whitespaces --- .../v1/server/parsing/PDFAreaTextStripper.java | 11 +++++++++++ .../v1/server/parsing/PDFLinesTextStripper.java | 12 ++++++++++++ 2 files changed, 23 insertions(+) diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/PDFAreaTextStripper.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/PDFAreaTextStripper.java index 9b52bf7b..8925d426 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/PDFAreaTextStripper.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/PDFAreaTextStripper.java @@ -46,6 +46,17 @@ public class PDFAreaTextStripper extends PDFTextStripperByArea { startIndex = i; } + + if (textPositions.get(i).getRotation() == 0 && i > 0 && textPositions.get(i).getX() > textPositions.get(i - 1).getEndX() + 1) { + List<TextPosition> sublist = textPositions.subList(startIndex, i); + if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0) + .getUnicode() + .equals(" ") || sublist.get(0).getUnicode().equals("\u00A0")))) { + textPositionSequences.add(new TextPositionSequence(sublist, pageNumber)); + } + startIndex = i; + } + if (i > 0 && (textPositions.get(i).getUnicode().equals(" ") || textPositions.get(i) .getUnicode() .equals("\u00A0")) && i <= textPositions.size() - 2) { diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/PDFLinesTextStripper.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/PDFLinesTextStripper.java index aa69cbbc..45bcef6a 100644 
--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/PDFLinesTextStripper.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/PDFLinesTextStripper.java @@ -300,6 +300,18 @@ public class PDFLinesTextStripper extends PDFTextStripper { startIndex = i; } + + if (textPositions.get(i).getRotation() == 0 && i > 0 && textPositions.get(i).getX() > textPositions.get(i - 1).getEndX() + 1) { + List<TextPosition> sublist = textPositions.subList(startIndex, i); + if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0) + .getUnicode() + .equals(" ") || sublist.get(0).getUnicode().equals("\u00A0")))) { + textPositionSequences.add(new TextPositionSequence(sublist, pageNumber)); + } + startIndex = i; + } + + if (i > 0 && (textPositions.get(i).getUnicode().equals(" ") || textPositions.get(i) .getUnicode() .equals("\u00A0")) && i <= textPositions.size() - 2) {