/**
 * Static helpers that decide whether a single character acts as a separator
 * (whitespace, ASCII punctuation, a quotation mark, or a Japanese punctuation mark).
 */
public class SeparatorUtils {

    /**
     * Matches exactly one ASCII punctuation character ({@code \p{Punct}}).
     * Compiled once because {@link #isSeparator(char)} is called per character.
     */
    private static final Pattern PUNCTUATION_PATTERN = Pattern.compile("\\p{Punct}");

    /**
     * Straight and typographic quotation marks treated as separators:
     * apostrophe, double quote, guillemets (U+00AB, U+00BB), curly and low-9 quotes
     * (U+2018, U+2019, U+201A, U+201C, U+201D, U+201E) and single guillemets (U+2039, U+203A).
     */
    private static final Set<Character> QUOTES = Set.of(
            '\'', '\u0022', '\u00AB', '\u00BB',
            '\u2018', '\u2019', '\u201A',
            '\u201C', '\u201D', '\u201E',
            '\u2039', '\u203A');

    /** First code point of the contiguous range treated as Japanese separators (U+3000, decimal 12288). */
    private static final char JAPANESE_RANGE_FIRST = '\u3000';

    /** Last code point of that range, inclusive (U+3030, decimal 12336). */
    private static final char JAPANESE_RANGE_LAST = '\u3030';

    /**
     * Additional marks outside the contiguous range that also count as Japanese separators:
     * fullwidth parentheses (U+FF08, U+FF09), fullwidth full stop (U+FF0E),
     * fullwidth square brackets (U+FF3B, U+FF3D), fullwidth curly brackets (U+FF5B, U+FF5D),
     * fullwidth white parentheses (U+FF5F, U+FF60), the voiced and semi-voiced sound marks
     * (U+309B, U+309C) and the fullwidth equals sign (U+FF1D).
     */
    private static final Set<Character> JAPANESE_ALT_PUNCTUATION_MARKS = Set.of(
            '\uFF08', '\uFF09', '\uFF0E',
            '\uFF3B', '\uFF3D',
            '\uFF5B', '\uFF5D',
            '\uFF5F', '\uFF60',
            '\u309B', '\u309C',
            '\uFF1D');

    /**
     * Tells whether the given character is a separator.
     *
     * @param c the character to classify
     * @return {@code true} if {@code c} is whitespace according to
     *         {@link Character#isWhitespace(char)}, an ASCII punctuation character,
     *         one of the known quotation marks, or a Japanese separator
     *         (see {@link #isJapaneseSeparator(char)}); {@code false} otherwise
     */
    public static boolean isSeparator(char c) {
        return Character.isWhitespace(c)
                || PUNCTUATION_PATTERN.matcher(String.valueOf(c)).matches()
                || QUOTES.contains(c)
                || isJapaneseSeparator(c);
    }

    /**
     * Tells whether the given character is a Japanese punctuation mark.
     *
     * @param c the character to classify
     * @return {@code true} if {@code c} lies in the inclusive range U+3000..U+3030
     *         or is one of the additional fullwidth / sound marks; {@code false} otherwise
     */
    public static boolean isJapaneseSeparator(char c) {
        // Parenthesised explicitly: the range test binds tighter than the set lookup.
        return (c >= JAPANESE_RANGE_FIRST && c <= JAPANESE_RANGE_LAST)
                || JAPANESE_ALT_PUNCTUATION_MARKS.contains(c);
    }
}