diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/SearchableText.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/SearchableText.java index 8ed10a20..82debd10 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/SearchableText.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/SearchableText.java @@ -91,8 +91,8 @@ public class SearchableText { if (searchSpace.get(i).charAt(j, caseInsensitive) == searchChars[counter] || counter != 0 && searchSpace.get(i).charAt(j, caseInsensitive) == '-') { - if (counter != 0 || i == 0 && j == 0 || j != 0 && SeparatorUtils.isSeparator(searchSpace.get(i) - .charAt(j - 1, caseInsensitive)) || j == 0 && i != 0 && SeparatorUtils.isSeparator(searchSpace.get(i - 1) + if (counter != 0 || i == 0 && j == 0 || j != 0 && (SeparatorUtils.isSeparator(searchSpace.get(i).charAt(j - 1, caseInsensitive)) || SeparatorUtils.isSeparator( + searchSpace.get(i).charAt(j, caseInsensitive))) || j == 0 && i != 0 && SeparatorUtils.isSeparator(searchSpace.get(i - 1) .charAt(searchSpace.get(i - 1).length() - 1, caseInsensitive)) || j == 0 && i != 0 && searchSpace.get(i - 1) .charAt(searchSpace.get(i - 1).length() - 1, caseInsensitive) != ' ' && searchSpace.get(i).charAt(j, caseInsensitive) != ' ') { partMatch.add(searchSpace.get(i), searchSpace.get(i).textPositionAt(j)); @@ -105,9 +105,10 @@ public class SearchableText { crossSequenceParts.add(partMatch); if (i == searchSpace.size() - 1 && j == searchSpace.get(i).length() - 1 || j != searchSpace.get(i) - .length() - 1 && SeparatorUtils.isSeparator(searchSpace.get(i).charAt(j + 1, caseInsensitive)) || j == searchSpace.get(i) - .length() - 1 && SeparatorUtils.isSeparator(searchSpace.get(i + 1).charAt(0, caseInsensitive)) || j == searchSpace.get(i) - .length() - 1 && searchSpace.get(i).charAt(j, caseInsensitive) != ' ' && searchSpace.get(i + 1).charAt(0, caseInsensitive) != ' ') { + .length() - 1 && (SeparatorUtils.isSeparator(searchSpace.get(i).charAt(j + 1, caseInsensitive)) || SeparatorUtils.isSeparator(searchSpace.get(i) + .charAt(j, caseInsensitive))) || j == searchSpace.get(i).length() - 1 && SeparatorUtils.isSeparator(searchSpace.get(i + 1) + .charAt(0, caseInsensitive)) || j == searchSpace.get(i).length() - 1 && searchSpace.get(i) + .charAt(j, caseInsensitive) != ' ' && searchSpace.get(i + 1).charAt(0, caseInsensitive) != ' ') { finalMatches.addAll(buildEntityPositionSequence(crossSequenceParts, normalizedSearchString)); } @@ -118,7 +119,7 @@ public class SearchableText { } else { counter = 0; if (!crossSequenceParts.isEmpty()) { - j--; + j = j - partMatch.length() - 1; } crossSequenceParts = new ArrayList<>(); partMatch = new TextPositionSequence(searchSpace.get(i).getPage()); diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/EntitySearchUtils.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/EntitySearchUtils.java index d7e70cac..510e2276 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/EntitySearchUtils.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/EntitySearchUtils.java @@ -71,8 +71,8 @@ public final class EntitySearchUtils { private void validateAndAddEntity(Set entities, FindEntityDetails findEntityDetails, String inputString, int startIndex, int stopIndex) { - if ((startIndex == 0 || SeparatorUtils.isSeparator(inputString.charAt(startIndex - 1))) && (stopIndex == inputString.length() || SeparatorUtils.isSeparator(inputString.charAt( - stopIndex)))) { + if ((startIndex == 0 || SeparatorUtils.isSeparator(inputString.charAt(startIndex - 1)) || SeparatorUtils.isSeparator(inputString.charAt(startIndex))) && (stopIndex == inputString.length() || SeparatorUtils.isSeparator( + inputString.charAt(stopIndex)) || SeparatorUtils.isSeparator(inputString.charAt(stopIndex - 1)))) { entities.add(new Entity(inputString.substring(startIndex, stopIndex), findEntityDetails.getType(), startIndex, @@ -305,9 +305,12 @@ public final class EntitySearchUtils { .get(0) .getSequences() .get(0) - .getMinXDirAdj() && image.getPosition().getX() + image.getPosition().getWidth() > entity.getPositionSequences().get(0).getSequences().get(0).getMaxXDirAdj() && image.getPosition() - .getY() < entity.getPositionSequences().get(0).getSequences().get(0).getMinYDirAdj() && image.getPosition().getY() + image.getPosition() - .getHeight() > entity.getPositionSequences().get(0).getSequences().get(0).getMaxYDirAdj(); + .getMinXDirAdj() && image.getPosition().getX() + image.getPosition().getWidth() > entity.getPositionSequences() + .get(0) + .getSequences() + .get(0) + .getMaxXDirAdj() && image.getPosition().getY() < entity.getPositionSequences().get(0).getSequences().get(0).getMinYDirAdj() && image.getPosition() + .getY() + image.getPosition().getHeight() > entity.getPositionSequences().get(0).getSequences().get(0).getMaxYDirAdj(); } diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java index 9898a85c..d4972a98 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java @@ -364,7 +364,7 @@ public class RedactionIntegrationTest { @Test public void titleExtraction() throws IOException { - AnalyzeRequest request = prepareStorage("files/new/table-with-merged-cells.pdf"); + AnalyzeRequest request = prepareStorage("files/new/APN3_Clean_6.1 (6.4.3.01-02)_Apple_211029.pdf"); analyzeService.analyzeDocumentStructure(new StructureAnalyzeRequest(request.getDossierId(), request.getFileId())); AnalyzeResult result = analyzeService.analyze(request); diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/dictionaries/PII.txt b/redaction-service-v1/redaction-service-server-v1/src/test/resources/dictionaries/PII.txt index 182a08a0..1a480892 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/resources/dictionaries/PII.txt +++ b/redaction-service-v1/redaction-service-server-v1/src/test/resources/dictionaries/PII.txt @@ -13,4 +13,20 @@ Dorn Prasher David annotation -J.B. RASCLE \ No newline at end of file +J.B. RASCLE +(果梗を除去したもの) +(青森植) +逸脱: +(青森植)、ふじ(岩手植)、 +ひろさきふじ(青森植)、ふじ(岩手植)、つがる(長野植須坂) +ひろさきふじ(青森植)、ふじ(岩手植)、つがる(長野植須坂) 学名 +りんご 品種 :ひろさきふじ(青森植)、ふじ(岩手植)、つがる(長野植須坂) 学名 +要約 +:準拠 +作物残留試験において、 +日間保存した。 +-20℃ +青森植 +サンプル量 +供試試料 (無処理 区) +材料 diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/new/APN3_Clean_6.1 (6.4.3.01-02)_Apple_211029.pdf b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/new/APN3_Clean_6.1 (6.4.3.01-02)_Apple_211029.pdf new file mode 100644 index 00000000..83fab663 Binary files /dev/null and b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/new/APN3_Clean_6.1 (6.4.3.01-02)_Apple_211029.pdf differ diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/new/JapanWord1.pdf b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/new/JapanWord1.pdf new file mode 100644 index 00000000..8b1da24a Binary files /dev/null and b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/new/JapanWord1.pdf differ