Pull request #498: RED-5562 Japanese space characters

Merge in RED/redaction-service from RED-5562-mst to master

* commit '623b8df5e6136d2ae6649e5016933abdc40454cc':
  RED-5562 improved pattern compile
  RED-5562 Japanese space characters - fixed PMD
  RED-5562 Japanese space characters
This commit is contained in:
Timo Bejan 2022-11-15 10:05:33 +01:00
commit 91e227248d

View File

@ -1,16 +1,26 @@
package com.iqser.red.service.redaction.v1.server.redaction.utils;
import lombok.extern.slf4j.Slf4j;
import java.util.Set;
import java.util.regex.Pattern;
@Slf4j
/**
 * Static helpers that decide whether a single character separates tokens.
 *
 * <p>A character counts as a separator when it is whitespace, ASCII punctuation,
 * one of a fixed set of quotation marks, or one of the Japanese space and
 * punctuation characters recognised by {@link #isJapaneseSeparator(char)}.
 */
public class SeparatorUtils {

    /** Straight and typographic quotation marks that are treated as separators. */
    private static final Set<Character> QUOTES = Set.of(
            '\'', '\u0022', '\u00AB', '\u00BB',
            '\u2018', '\u2019', '\u201A',
            '\u201C', '\u201D', '\u201E',
            '\u2039', '\u203A');

    /** Compiled once so that the expression is not re-parsed for every character checked. */
    private static final Pattern PUNCTUATION_PATTERN = Pattern.compile("\\p{Punct}");

    /** First code point of the contiguous separator range: U+3000 IDEOGRAPHIC SPACE. */
    private static final int CJK_SEPARATOR_RANGE_START = 0x3000;

    /** Last code point (inclusive) of the contiguous separator range: U+3030 WAVY DASH. */
    private static final int CJK_SEPARATOR_RANGE_END = 0x3030;

    /** Individual code points outside the contiguous range that also count as separators. */
    private static final Set<Integer> JAPANESE_ALT_PUNCTUATION_MARKS = Set.of(
            0xFF08, // FULLWIDTH LEFT PARENTHESIS
            0xFF09, // FULLWIDTH RIGHT PARENTHESIS
            0xFF0E, // FULLWIDTH FULL STOP
            0xFF3B, // FULLWIDTH LEFT SQUARE BRACKET
            0xFF3D, // FULLWIDTH RIGHT SQUARE BRACKET
            0xFF5B, // FULLWIDTH LEFT CURLY BRACKET
            0xFF5D, // FULLWIDTH RIGHT CURLY BRACKET
            0xFF5F, // FULLWIDTH LEFT WHITE PARENTHESIS
            0xFF60, // FULLWIDTH RIGHT WHITE PARENTHESIS
            0x309B, // KATAKANA-HIRAGANA VOICED SOUND MARK
            0x309C, // KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
            0xFF1D  // FULLWIDTH EQUALS SIGN
    );

    /**
     * Tells whether {@code c} is a separator character.
     *
     * @param c the character to classify
     * @return {@code true} if {@code c} is whitespace according to
     *         {@link Character#isWhitespace(char)}, matches {@code \p{Punct}},
     *         is one of the known quotation marks, or is a Japanese separator
     */
    public static boolean isSeparator(char c) {
        return Character.isWhitespace(c)
                || PUNCTUATION_PATTERN.matcher(String.valueOf(c)).matches()
                || QUOTES.contains(c)
                || isJapaneseSeparator(c);
    }

    /**
     * Tells whether {@code c} is one of the Japanese space or punctuation characters.
     *
     * @param c the character to classify
     * @return {@code true} if the code of {@code c} lies in U+3000..U+3030 (inclusive)
     *         or is one of the additional listed punctuation code points
     */
    public static boolean isJapaneseSeparator(char c) {
        int codePoint = c;
        // Parentheses spell out the precedence the expression already had: range test OR set lookup.
        return (codePoint >= CJK_SEPARATOR_RANGE_START && codePoint <= CJK_SEPARATOR_RANGE_END)
                || JAPANESE_ALT_PUNCTUATION_MARKS.contains(codePoint);
    }
}