RED-5664: Enable redaction of words that start or end with a separator, needed for Japanese documents

This commit is contained in:
deiflaender 2023-02-13 10:33:22 +01:00
parent 0e925f2f24
commit 1fca62f578
6 changed files with 33 additions and 13 deletions

View File

@ -91,8 +91,8 @@ public class SearchableText {
if (searchSpace.get(i).charAt(j, caseInsensitive) == searchChars[counter] || counter != 0 && searchSpace.get(i).charAt(j, caseInsensitive) == '-') {
if (counter != 0 || i == 0 && j == 0 || j != 0 && SeparatorUtils.isSeparator(searchSpace.get(i)
.charAt(j - 1, caseInsensitive)) || j == 0 && i != 0 && SeparatorUtils.isSeparator(searchSpace.get(i - 1)
if (counter != 0 || i == 0 && j == 0 || j != 0 && (SeparatorUtils.isSeparator(searchSpace.get(i).charAt(j - 1, caseInsensitive)) || SeparatorUtils.isSeparator(
searchSpace.get(i).charAt(j, caseInsensitive))) || j == 0 && i != 0 && SeparatorUtils.isSeparator(searchSpace.get(i - 1)
.charAt(searchSpace.get(i - 1).length() - 1, caseInsensitive)) || j == 0 && i != 0 && searchSpace.get(i - 1)
.charAt(searchSpace.get(i - 1).length() - 1, caseInsensitive) != ' ' && searchSpace.get(i).charAt(j, caseInsensitive) != ' ') {
partMatch.add(searchSpace.get(i), searchSpace.get(i).textPositionAt(j));
@ -105,9 +105,10 @@ public class SearchableText {
crossSequenceParts.add(partMatch);
if (i == searchSpace.size() - 1 && j == searchSpace.get(i).length() - 1 || j != searchSpace.get(i)
.length() - 1 && SeparatorUtils.isSeparator(searchSpace.get(i).charAt(j + 1, caseInsensitive)) || j == searchSpace.get(i)
.length() - 1 && SeparatorUtils.isSeparator(searchSpace.get(i + 1).charAt(0, caseInsensitive)) || j == searchSpace.get(i)
.length() - 1 && searchSpace.get(i).charAt(j, caseInsensitive) != ' ' && searchSpace.get(i + 1).charAt(0, caseInsensitive) != ' ') {
.length() - 1 && (SeparatorUtils.isSeparator(searchSpace.get(i).charAt(j + 1, caseInsensitive)) || SeparatorUtils.isSeparator(searchSpace.get(i)
.charAt(j, caseInsensitive))) || j == searchSpace.get(i).length() - 1 && SeparatorUtils.isSeparator(searchSpace.get(i + 1)
.charAt(0, caseInsensitive)) || j == searchSpace.get(i).length() - 1 && searchSpace.get(i)
.charAt(j, caseInsensitive) != ' ' && searchSpace.get(i + 1).charAt(0, caseInsensitive) != ' ') {
finalMatches.addAll(buildEntityPositionSequence(crossSequenceParts, normalizedSearchString));
}
@ -118,7 +119,7 @@ public class SearchableText {
} else {
counter = 0;
if (!crossSequenceParts.isEmpty()) {
j--;
j = j - partMatch.length() - 1;
}
crossSequenceParts = new ArrayList<>();
partMatch = new TextPositionSequence(searchSpace.get(i).getPage());

View File

@ -71,8 +71,8 @@ public final class EntitySearchUtils {
private void validateAndAddEntity(Set<Entity> entities, FindEntityDetails findEntityDetails, String inputString, int startIndex, int stopIndex) {
if ((startIndex == 0 || SeparatorUtils.isSeparator(inputString.charAt(startIndex - 1))) && (stopIndex == inputString.length() || SeparatorUtils.isSeparator(inputString.charAt(
stopIndex)))) {
if ((startIndex == 0 || SeparatorUtils.isSeparator(inputString.charAt(startIndex - 1)) || SeparatorUtils.isSeparator(inputString.charAt(startIndex))) && (stopIndex == inputString.length() || SeparatorUtils.isSeparator(
inputString.charAt(stopIndex)) || SeparatorUtils.isSeparator(inputString.charAt(stopIndex - 1)))) {
entities.add(new Entity(inputString.substring(startIndex, stopIndex),
findEntityDetails.getType(),
startIndex,
@ -305,9 +305,12 @@ public final class EntitySearchUtils {
.get(0)
.getSequences()
.get(0)
.getMinXDirAdj() && image.getPosition().getX() + image.getPosition().getWidth() > entity.getPositionSequences().get(0).getSequences().get(0).getMaxXDirAdj() && image.getPosition()
.getY() < entity.getPositionSequences().get(0).getSequences().get(0).getMinYDirAdj() && image.getPosition().getY() + image.getPosition()
.getHeight() > entity.getPositionSequences().get(0).getSequences().get(0).getMaxYDirAdj();
.getMinXDirAdj() && image.getPosition().getX() + image.getPosition().getWidth() > entity.getPositionSequences()
.get(0)
.getSequences()
.get(0)
.getMaxXDirAdj() && image.getPosition().getY() < entity.getPositionSequences().get(0).getSequences().get(0).getMinYDirAdj() && image.getPosition()
.getY() + image.getPosition().getHeight() > entity.getPositionSequences().get(0).getSequences().get(0).getMaxYDirAdj();
}

View File

@ -364,7 +364,7 @@ public class RedactionIntegrationTest {
@Test
public void titleExtraction() throws IOException {
AnalyzeRequest request = prepareStorage("files/new/table-with-merged-cells.pdf");
AnalyzeRequest request = prepareStorage("files/new/APN3_Clean_6.1 (6.4.3.01-02)_Apple_211029.pdf");
analyzeService.analyzeDocumentStructure(new StructureAnalyzeRequest(request.getDossierId(), request.getFileId()));
AnalyzeResult result = analyzeService.analyze(request);

View File

@ -13,4 +13,20 @@ Dorn
Prasher
David
annotation
J.B. RASCLE
J.B. RASCLE
(果梗を除去したもの)
(青森植)
逸脱:
(青森植)、ふじ(岩手植)、
ひろさきふじ(青森植)、ふじ(岩手植)、つがる(長野植須坂)
ひろさきふじ(青森植)、ふじ(岩手植)、つがる(長野植須坂) 学名
りんご 品種 :ひろさきふじ(青森植)、ふじ(岩手植)、つがる(長野植須坂) 学名
要約
:準拠
作物残留試験において、
日間保存した。
-20℃
青森植
サンプル量
供試試料 (無処理 区)
材料