Pull request #120: RED-1039: Fixed finding text positions, RED-1042: Fixed getting rectangles per line

Merge in RED/redaction-service from RED-1039 to master

* commit '00b0cb160342f1857ac0e523f994918057d5fc6b':
  RED-1039: Fixed finding text positions, RED-1042: Fixed getting rectangles per line
This commit is contained in:
Dominique Eiflaender 2021-02-08 14:50:02 +01:00
commit 6d9ed080ce
3 changed files with 39 additions and 14 deletions

View File

@ -32,6 +32,7 @@ public class SearchableText {
}
@SuppressWarnings("checkstyle:ModifiedControlVariable")
public List<EntityPositionSequence> getSequences(String searchString, boolean caseInsensitive,
List<TextPositionSequence> sequencesSubList) {
@ -66,9 +67,12 @@ public class SearchableText {
for (int j = 0; j < searchSpace.get(i).length(); j++) {
if (i > 0 && j == 0 && searchSpace.get(i).charAt(0, caseInsensitive) == ' ' && searchSpace.get(i - 1)
.charAt(searchSpace.get(i - 1).length() - 1, caseInsensitive) == ' ' || j > 0 && searchSpace.get(i)
.charAt(j, caseInsensitive) == ' ' && searchSpace.get(i).charAt(j - 1, caseInsensitive) == ' ') {
if (j == searchSpace.get(i).length() - 1 && counter != 0 && !partMatch.getTextPositions().isEmpty()) {
.charAt(searchSpace.get(i - 1)
.length() - 1, caseInsensitive) == ' ' || j > 0 && searchSpace.get(i)
.charAt(j, caseInsensitive) == ' ' && searchSpace.get(i)
.charAt(j - 1, caseInsensitive) == ' ') {
if (j == searchSpace.get(i).length() - 1 && counter != 0 && !partMatch.getTextPositions()
.isEmpty()) {
crossSequenceParts.add(partMatch);
}
continue;
@ -80,8 +84,8 @@ public class SearchableText {
counter++;
}
if (searchSpace.get(i)
.charAt(j, caseInsensitive) == searchChars[counter] || counter != 0 && searchSpace.get(i)
if (searchSpace.get(i).charAt(j, caseInsensitive) == searchChars[counter] || counter != 0 && searchSpace
.get(i)
.charAt(j, caseInsensitive) == '-') {
if (counter != 0 || i == 0 && j == 0 || j != 0 && isSeparator(searchSpace.get(i)
@ -100,14 +104,15 @@ public class SearchableText {
if (counter == searchString.length()) {
crossSequenceParts.add(partMatch);
if (i == searchSpace.size() - 1 && j == searchSpace.get(i).length() - 1 || j != searchSpace.get(i)
.length() - 1 && isSeparator(searchSpace.get(i)
if (i == searchSpace.size() - 1 && j == searchSpace.get(i)
.length() - 1 || j != searchSpace.get(i).length() - 1 && isSeparator(searchSpace.get(i)
.charAt(j + 1, caseInsensitive)) || j == searchSpace.get(i)
.length() - 1 && isSeparator(searchSpace.get(i + 1)
.charAt(0, caseInsensitive)) || j == searchSpace.get(i).length() - 1 && searchSpace.get(i)
.charAt(0, caseInsensitive)) || j == searchSpace.get(i)
.length() - 1 && searchSpace.get(i)
.charAt(j, caseInsensitive) != ' ' && searchSpace.get(i + 1)
.charAt(0, caseInsensitive) != ' ') {
finalMatches.addAll(buildEntityPositionSequence(crossSequenceParts));
finalMatches.addAll(buildEntityPositionSequence(crossSequenceParts, normalizedSearchString));
}
counter = 0;
@ -130,15 +135,21 @@ public class SearchableText {
}
return finalMatches;
}
private List<EntityPositionSequence> buildEntityPositionSequence(List<TextPositionSequence> crossSequenceParts) {
private List<EntityPositionSequence> buildEntityPositionSequence(List<TextPositionSequence> crossSequenceParts,
String searchString) {
List<EntityPositionSequence> result = new ArrayList<>();
String asString = buildString(crossSequenceParts);
if (!asString.equalsIgnoreCase(searchString)) {
return result;
}
String plainId = IdBuilder.buildId(crossSequenceParts);
String id = plainId;
List<EntityPositionSequence> result = new ArrayList<>();
int currentPage = -1;
int idDiffentPageSuffix = 1;
EntityPositionSequence entityPositionSequence = new EntityPositionSequence(id);
@ -173,6 +184,12 @@ public class SearchableText {
/**
 * Returns the string form of this object by delegating to
 * {@link #buildString(List)} with {@code sequences}.
 *
 * @return the result of {@code buildString(sequences)}
 */
@Override
public String toString() {
return buildString(sequences);
}
public String buildString(List<TextPositionSequence> sequences) {
StringBuilder sb = new StringBuilder();
TextPositionSequence previous = null;

View File

@ -194,7 +194,7 @@ public class RedactionLogCreatorService {
startIndex = i;
}
}
if (startIndex != textPositions.size() - 1) {
if (startIndex != textPositions.size()) {
rectangles.add(new TextPositionSequence(textPositions.subList(startIndex, textPositions.size()), page).getRectangle());
}
}

View File

@ -418,6 +418,7 @@ public class RedactionIntegrationTest {
}
private List<File> getPathsRecursively(File path) {
List<File> result = new ArrayList<>();
@ -439,9 +440,16 @@ public class RedactionIntegrationTest {
@Test
public void redactionTest() throws IOException {
// 49 Cyprodinil - EU AIR3 - MCA Section 8 Supplement - Ecotoxicological studies on the active substance.pdf
// 182 Fludioxonil - EU AIR3 - MCA Section 8 Supplement - Ecotoxicological studies on the active substance.pdf
// 38 A14325E - EU AIR3 - MCP Section 10 - Ecotoxicological studies on the plant protection product.pdf
// 91 Trinexapac-ethyl_RAR_01_Volume_1_2018-02-23.pdf
// 95 Trinexapac-ethyl_RAR_08_Volume_3CA_B-6_2018-01-10.pdf
System.out.println("redactionTest");
long start = System.currentTimeMillis();
ClassPathResource pdfFileResource = new ClassPathResource("files/Metolachlor/S-Metolachlor_RAR_02_Volume_2_2018-09-06.pdf");
ClassPathResource pdfFileResource = new ClassPathResource("files/Cyprodinil/49 Cyprodinil - EU AIR3 - MCA Section 8 Supplement - Ecotoxicological studies on the active substance.pdf");
AnalyzeRequest request = AnalyzeRequest.builder()
.ruleSetId(TEST_RULESET_ID)