RED-3059: Fixed initial expansion overlaps

This commit is contained in:
deiflaender 2021-12-10 10:54:04 +01:00
parent 4f8d15e13e
commit 870eacfacf
5 changed files with 61 additions and 18 deletions

View File

@ -43,7 +43,8 @@ public class EntitySearchUtils {
startIndex = inputString.indexOf(cleanValue, stopIndex);
stopIndex = startIndex + cleanValue.length();
if (startIndex > -1 && (startIndex == 0 || Character.isWhitespace(inputString.charAt(startIndex - 1)) || isSeparator(inputString.charAt(startIndex - 1))) && (stopIndex == inputString.length() || isSeparator(inputString.charAt(stopIndex)))) {
if (startIndex > -1 && (startIndex == 0 || Character.isWhitespace(inputString.charAt(startIndex - 1)) || isSeparator(inputString
.charAt(startIndex - 1))) && (stopIndex == inputString.length() || isSeparator(inputString.charAt(stopIndex)))) {
return true;
}
} while (startIndex > -1);
@ -71,7 +72,8 @@ public class EntitySearchUtils {
startIndex = inputString.indexOf(cleanValue, stopIndex);
stopIndex = startIndex + cleanValue.length();
if (startIndex > -1 && (startIndex == 0 || Character.isWhitespace(inputString.charAt(startIndex - 1)) || isSeparator(inputString.charAt(startIndex - 1))) && (stopIndex == inputString.length() || isSeparator(inputString.charAt(stopIndex)))) {
if (startIndex > -1 && (startIndex == 0 || Character.isWhitespace(inputString.charAt(startIndex - 1)) || isSeparator(inputString
.charAt(startIndex - 1))) && (stopIndex == inputString.length() || isSeparator(inputString.charAt(stopIndex)))) {
found.add(new Entity(inputString.substring(startIndex, stopIndex), type, startIndex, stopIndex, headline, sectionNumber, !local, isDossierDictionary));
}
} while (startIndex > -1);
@ -125,7 +127,8 @@ public class EntitySearchUtils {
for (Entity word : entities) {
for (Entity inner : entities) {
if (inner.getWord().length() < word.getWord()
.length() && inner.getStart() >= word.getStart() && inner.getEnd() <= word.getEnd() && word != inner && word.getSectionNumber() == inner.getSectionNumber()) {
.length() && inner.getStart() >= word.getStart() && inner.getEnd() <= word.getEnd() && word != inner && word
.getSectionNumber() == inner.getSectionNumber()) {
wordsToRemove.add(inner);
}
}
@ -163,15 +166,35 @@ public class EntitySearchUtils {
Set<Entity> result = new HashSet<>();
if (existingEntities != null && foundEntities != null) {
for (Entity existingEntity : existingEntities) {
for (Entity foundEntity : foundEntities) {
if (existingEntity.getEnd() < foundEntity.getStart() || foundEntity.getEnd() < existingEntity.getStart()) {
result.add(foundEntity);
}
for (Entity foundEntity : foundEntities) {
if (!overlaps(existingEntities, foundEntity)) {
result.add(foundEntity);
}
}
}
return result;
}
/**
 * Checks whether {@code found} shares at least one position with any of the existing entities.
 * <p>
 * Start and end are both treated as inclusive bounds. An existing entity that begins at exactly
 * the same start position as {@code found} is skipped and never counts as an overlap.
 *
 * @param existingEntities entities to compare against
 * @param found            the candidate entity
 * @return {@code true} if the inclusive range of {@code found} intersects the inclusive range of at
 * least one existing entity with a different start position, {@code false} otherwise
 */
private boolean overlaps(Set<Entity> existingEntities, Entity found) {

    for (Entity existing : existingEntities) {
        if (existing.getStart().equals(found.getStart())) {
            // Same start position: deliberately not treated as an overlap.
            continue;
        }

        int existingStart = existing.getStart();
        int existingEnd = existing.getEnd();
        int foundStart = found.getStart();
        int foundEnd = found.getEnd();

        // Two closed integer ranges intersect exactly when the larger start does not lie behind
        // the smaller end. This also yields false for an empty range (start > end).
        if (Math.max(existingStart, foundStart) <= Math.min(existingEnd, foundEnd)) {
            return true;
        }
    }
    return false;
}
}

View File

@ -652,11 +652,28 @@ public class RedactionIntegrationTest {
}
/**
 * Analyzes the minimal expansion example PDF and verifies that the redaction log contains
 * exactly the expected values, regardless of their order.
 */
@Test
public void redactionExpansionOverlap() throws IOException {

    AnalyzeRequest analyzeRequest = prepareStorage(new ClassPathResource("files/Minimal Examples/ExpansionTest.pdf").getInputStream());
    reanalyzeService.analyze(analyzeRequest);

    var redactedValues = redactionStorageService.getRedactionLog(TEST_DOSSIER_ID, TEST_FILE_ID)
            .getRedactionLogEntry()
            .stream()
            .map(RedactionLogEntry::getValue)
            .collect(Collectors.toList());

    assertThat(redactedValues).containsExactlyInAnyOrder(
            "Lastname M.",
            "Doe",
            "Doe J.",
            "M. Mustermann",
            "Mustermann M.",
            "F. Lastname");
}
@Test
public void redactionTest() throws IOException {
long start = System.currentTimeMillis();
ClassPathResource pdfFileResource = new ClassPathResource("files/new/VV-919901.pdf");
ClassPathResource pdfFileResource = new ClassPathResource("files/Metolachlor/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf");
AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream());
request.setExcludedPages(Set.of(1));
@ -1121,10 +1138,10 @@ public class RedactionIntegrationTest {
private static String getTemporaryDirectory() {
String tmpdir = System.getProperty("java.io.tmpdir");
if (StringUtils.isNotBlank(tmpdir)) {
return tmpdir;
}
// String tmpdir = System.getProperty("java.io.tmpdir");
// if (StringUtils.isNotBlank(tmpdir)) {
// return tmpdir;
// }
return "/tmp";
}

View File

@ -134,8 +134,7 @@ public class EntitySearchUtilsTest {
Set<Entity> result = EntitySearchUtils.findNonOverlappingMatchEntities(existingEntities, foundEntities);
// Assert
assertThat(result.size()).isEqualTo(1);
assertThat(result).contains(foundEntities2);
assertThat(result.size()).isEqualTo(0);
}
@ -165,8 +164,7 @@ public class EntitySearchUtilsTest {
Set<Entity> result = EntitySearchUtils.findNonOverlappingMatchEntities(existingEntities, foundEntities);
// Assert
assertThat(result.size()).isEqualTo(1);
assertThat(result).contains(foundEntities2);
assertThat(result.size()).isEqualTo(0);
}

View File

@ -8585,4 +8585,9 @@ Zoriki Hosomi R.
Zoriki Hosomi Rosana
Zuberer D
Zubrod J
Zwicker R.E.
Zwicker R.E.
Doe
M. Mustermann
F. Lastname
Mustermann
Lastname