RED-3059: Fixed initial expansion overlaps
This commit is contained in:
parent
4f8d15e13e
commit
870eacfacf
@ -43,7 +43,8 @@ public class EntitySearchUtils {
|
||||
startIndex = inputString.indexOf(cleanValue, stopIndex);
|
||||
stopIndex = startIndex + cleanValue.length();
|
||||
|
||||
if (startIndex > -1 && (startIndex == 0 || Character.isWhitespace(inputString.charAt(startIndex - 1)) || isSeparator(inputString.charAt(startIndex - 1))) && (stopIndex == inputString.length() || isSeparator(inputString.charAt(stopIndex)))) {
|
||||
if (startIndex > -1 && (startIndex == 0 || Character.isWhitespace(inputString.charAt(startIndex - 1)) || isSeparator(inputString
|
||||
.charAt(startIndex - 1))) && (stopIndex == inputString.length() || isSeparator(inputString.charAt(stopIndex)))) {
|
||||
return true;
|
||||
}
|
||||
} while (startIndex > -1);
|
||||
@ -71,7 +72,8 @@ public class EntitySearchUtils {
|
||||
startIndex = inputString.indexOf(cleanValue, stopIndex);
|
||||
stopIndex = startIndex + cleanValue.length();
|
||||
|
||||
if (startIndex > -1 && (startIndex == 0 || Character.isWhitespace(inputString.charAt(startIndex - 1)) || isSeparator(inputString.charAt(startIndex - 1))) && (stopIndex == inputString.length() || isSeparator(inputString.charAt(stopIndex)))) {
|
||||
if (startIndex > -1 && (startIndex == 0 || Character.isWhitespace(inputString.charAt(startIndex - 1)) || isSeparator(inputString
|
||||
.charAt(startIndex - 1))) && (stopIndex == inputString.length() || isSeparator(inputString.charAt(stopIndex)))) {
|
||||
found.add(new Entity(inputString.substring(startIndex, stopIndex), type, startIndex, stopIndex, headline, sectionNumber, !local, isDossierDictionary));
|
||||
}
|
||||
} while (startIndex > -1);
|
||||
@ -125,7 +127,8 @@ public class EntitySearchUtils {
|
||||
for (Entity word : entities) {
|
||||
for (Entity inner : entities) {
|
||||
if (inner.getWord().length() < word.getWord()
|
||||
.length() && inner.getStart() >= word.getStart() && inner.getEnd() <= word.getEnd() && word != inner && word.getSectionNumber() == inner.getSectionNumber()) {
|
||||
.length() && inner.getStart() >= word.getStart() && inner.getEnd() <= word.getEnd() && word != inner && word
|
||||
.getSectionNumber() == inner.getSectionNumber()) {
|
||||
wordsToRemove.add(inner);
|
||||
}
|
||||
}
|
||||
@ -163,15 +166,35 @@ public class EntitySearchUtils {
|
||||
|
||||
Set<Entity> result = new HashSet<>();
|
||||
if (existingEntities != null && foundEntities != null) {
|
||||
for (Entity existingEntity : existingEntities) {
|
||||
for (Entity foundEntity : foundEntities) {
|
||||
if (existingEntity.getEnd() < foundEntity.getStart() || foundEntity.getEnd() < existingEntity.getStart()) {
|
||||
result.add(foundEntity);
|
||||
}
|
||||
for (Entity foundEntity : foundEntities) {
|
||||
|
||||
if (!overlaps(existingEntities, foundEntity)) {
|
||||
result.add(foundEntity);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
private boolean overlaps(Set<Entity> existingEntities, Entity found) {
|
||||
|
||||
for (Entity existing : existingEntities) {
|
||||
|
||||
if(existing.getStart().equals(found.getStart())){
|
||||
continue;
|
||||
}
|
||||
|
||||
for (int i = existing.getStart(); i <= existing.getEnd(); i++) {
|
||||
for (int j = found.getStart(); j <= found.getEnd(); j++) {
|
||||
if (i == j) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -652,11 +652,28 @@ public class RedactionIntegrationTest {
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void redactionExpansionOverlap() throws IOException {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/ExpansionTest.pdf");
|
||||
AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream());
|
||||
|
||||
|
||||
AnalyzeResult result = reanalyzeService.analyze(request);
|
||||
|
||||
var redactionLog = redactionStorageService.getRedactionLog(TEST_DOSSIER_ID, TEST_FILE_ID);
|
||||
|
||||
var values = redactionLog.getRedactionLogEntry().stream().map(RedactionLogEntry::getValue).collect(Collectors.toList());
|
||||
|
||||
assertThat(values).containsExactlyInAnyOrder("Lastname M.", "Doe", "Doe J.", "M. Mustermann", "Mustermann M.", "F. Lastname");
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void redactionTest() throws IOException {
|
||||
|
||||
long start = System.currentTimeMillis();
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/new/VV-919901.pdf");
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Metolachlor/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf");
|
||||
AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream());
|
||||
request.setExcludedPages(Set.of(1));
|
||||
|
||||
@ -1121,10 +1138,10 @@ public class RedactionIntegrationTest {
|
||||
|
||||
private static String getTemporaryDirectory() {
|
||||
|
||||
String tmpdir = System.getProperty("java.io.tmpdir");
|
||||
if (StringUtils.isNotBlank(tmpdir)) {
|
||||
return tmpdir;
|
||||
}
|
||||
// String tmpdir = System.getProperty("java.io.tmpdir");
|
||||
// if (StringUtils.isNotBlank(tmpdir)) {
|
||||
// return tmpdir;
|
||||
// }
|
||||
return "/tmp";
|
||||
}
|
||||
|
||||
|
||||
@ -134,8 +134,7 @@ public class EntitySearchUtilsTest {
|
||||
Set<Entity> result = EntitySearchUtils.findNonOverlappingMatchEntities(existingEntities, foundEntities);
|
||||
|
||||
// Assert
|
||||
assertThat(result.size()).isEqualTo(1);
|
||||
assertThat(result).contains(foundEntities2);
|
||||
assertThat(result.size()).isEqualTo(0);
|
||||
|
||||
}
|
||||
|
||||
@ -165,8 +164,7 @@ public class EntitySearchUtilsTest {
|
||||
Set<Entity> result = EntitySearchUtils.findNonOverlappingMatchEntities(existingEntities, foundEntities);
|
||||
|
||||
// Assert
|
||||
assertThat(result.size()).isEqualTo(1);
|
||||
assertThat(result).contains(foundEntities2);
|
||||
assertThat(result.size()).isEqualTo(0);
|
||||
|
||||
}
|
||||
|
||||
|
||||
@ -8585,4 +8585,9 @@ Zoriki Hosomi R.
|
||||
Zoriki Hosomi Rosana
|
||||
Zuberer D
|
||||
Zubrod J
|
||||
Zwicker R.E.
|
||||
Zwicker R.E.
|
||||
Doe
|
||||
M. Mustermann
|
||||
F. Lastname
|
||||
Mustermann
|
||||
Lastname
|
||||
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user