Handle '\u00A0' (non-breaking space) character the same way as ' '

This commit is contained in:
Dominique Eifländer 2020-12-23 10:57:42 +01:00
parent b173975ff5
commit 79b57e85cd

View File

@@ -212,7 +212,7 @@ public class PDFLinesTextStripper extends PDFTextStripper {
maxCharWidths = charWidth;
}
if (i == 0 && textPositions.get(i).getUnicode().equals(" ")) {
if (i == 0 && (textPositions.get(i).getUnicode().equals(" ") || textPositions.get(i).getUnicode().equals("\u00A0"))) {
startIndex++;
continue;
}
@@ -220,15 +220,15 @@ public class PDFLinesTextStripper extends PDFTextStripper {
// Strange but sometimes this is happening, for example: Metolachlor2.pdf
if (i > 0 && textPositions.get(i).getX() < textPositions.get(i - 1).getX()) {
List<TextPosition> sublist = textPositions.subList(startIndex, i);
if (!(sublist.isEmpty() || sublist.size() == 1 && sublist.get(0).getUnicode().equals(" "))) {
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0")))) {
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber));
}
startIndex = i;
}
if (i > 0 && textPositions.get(i).getUnicode().equals(" ") && i <= textPositions.size() - 2) {
if (i > 0 && (textPositions.get(i).getUnicode().equals(" ") || textPositions.get(i).getUnicode().equals("\u00A0")) && i <= textPositions.size() - 2) {
List<TextPosition> sublist = textPositions.subList(startIndex, i);
if (!(sublist.isEmpty() || sublist.size() == 1 && sublist.get(0).getUnicode().equals(" "))) {
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0")))) {
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber));
}
startIndex = i + 1;
@@ -236,14 +236,13 @@ public class PDFLinesTextStripper extends PDFTextStripper {
}
List<TextPosition> sublist = textPositions.subList(startIndex, textPositions.size());
if (!sublist.isEmpty() && sublist.get(sublist.size() - 1).getUnicode().equals(" ")) {
if (!sublist.isEmpty() && (sublist.get(sublist.size() - 1).getUnicode().equals(" ") || sublist.get(sublist.size() - 1).getUnicode().equals("\u00A0"))) {
sublist = sublist.subList(0, sublist.size() - 1);
}
if (!(sublist.isEmpty() || sublist.size() == 1 && sublist.get(0).getUnicode().equals(" "))) {
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0")))) {
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber));
}
super.writeString(text);
}
@Override