Pull request #50: Make sure section and table char indices match

Merge in RED/redaction-service from RED-381 to master

* commit '9a965dd68a72285af6bf6a28ec6aa67358c35dba':
  Make sure section and table char indices match
This commit is contained in:
Thierry Goeckel 2020-10-05 11:02:44 +02:00
commit d91612a69a
10 changed files with 115 additions and 77 deletions

View File

@ -131,7 +131,7 @@ public class TextBlock extends AbstractTextContainer {
TextPositionSequence previous = null;
for (TextPositionSequence word : sequences) {
if (previous != null) {
if (Math.abs(previous.getY1() - word.getY1()) > word.getTextHeight()) {
if (Math.abs(previous.getRotationAdjustedY() - word.getRotationAdjustedY()) > word.getTextHeight()) {
sb.append('\n');
} else {
sb.append(' ');

View File

@ -101,6 +101,9 @@ public class TextPositionSequence implements CharSequence {
}
}
/**
 * Returns the Y coordinate reported by the first text position of this sequence.
 *
 * NOTE(review): the method name implies that getY() on the underlying text position is already
 * adjusted for page rotation; confirm against the text-position type stored in textPositions.
 *
 * @return the value of getY() of the first element of textPositions
 */
public float getRotationAdjustedY() {
// Only the first position is consulted. NOTE(review): presumably textPositions is never empty
// here; get(0) on an empty list would fail — verify against the constructors/callers.
return textPositions.get(0).getY();
}
public float getY1() {

View File

@ -4,36 +4,33 @@ import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
import lombok.RequiredArgsConstructor;
import lombok.Value;
@Value
@RequiredArgsConstructor
public class CellValue {
TextBlock textBlock;
int rowSpanStart;
@Override
public String toString() {
StringBuilder sb = new StringBuilder();
TextPositionSequence previous = null;
for (TextPositionSequence word : textBlock.getSequences()) {
if (previous != null) {
if (Math.abs(previous.getY1() - word.getY1()) > word.getTextHeight()) {
sb.append('\n');
} else {
sb.append(' ');
}
TextPositionSequence previous = null;
for (TextPositionSequence word : textBlock.getSequences()) {
if (previous != null) {
if (Math.abs(previous.getRotationAdjustedY() - word.getRotationAdjustedY()) > word.getTextHeight()) {
sb.append('\n');
} else {
sb.append(' ');
}
sb.append(word.toString());
previous = word;
}
sb.append(word.toString());
previous = word;
}
return TextNormalizationUtilities.removeHyphenLineBreaks(sb.toString())
.replaceAll("\n", " ")

View File

@ -179,7 +179,7 @@ public class SearchableText {
for (TextPositionSequence word : sequences) {
if (previous != null) {
if (Math.abs(previous.getY1() - word.getY1()) > word.getTextHeight()) {
if (Math.abs(previous.getRotationAdjustedY() - word.getRotationAdjustedY()) > word.getTextHeight()) {
sb.append('\n');
} else {
sb.append(' ');
@ -203,7 +203,7 @@ public class SearchableText {
for (TextPositionSequence word : sequences) {
if (previous != null) {
if (Math.abs(previous.getY1() - word.getY1()) > word.getTextHeight()) {
if (Math.abs(previous.getRotationAdjustedY() - word.getRotationAdjustedY()) > word.getTextHeight()) {
sb.append('\n');
} else {
sb.append(' ');

View File

@ -210,9 +210,13 @@ public class Section {
if (value == null) {
log.warn("Could not find any data for {}.", cellHeader);
} else {
Entity entity = new Entity(value.toString(), type, value.getRowSpanStart(),
value.getRowSpanStart() + value.toString()
.length(), headline, sectionNumber);
String word = value.toString();
Entity entity = new Entity(word,
type,
value.getRowSpanStart(),
value.getRowSpanStart() + word.length(),
headline,
sectionNumber);
entity.setRedaction(redact);
entity.setMatchedRule(ruleNumber);
entity.setRedactionReason(reason);
@ -220,9 +224,7 @@ public class Section {
.getSequences()); // Make sure no other cells with same content are highlighted
// HashSet keeps the older value, but we want the new only.
if (entities.contains(entity)) {
entities.remove(entity);
}
entities.remove(entity);
entities.add(entity);
entities = removeEntitiesContainedInLarger(entities);

View File

@ -70,7 +70,7 @@ public class EntityRedactionService {
.replaceAll("-", "");
tabularData.put(headerName, new CellValue(cell.getTextBlocks().get(0), cellStart));
});
start = start + cell.toString().length();
start = start + cell.toString().length() + 1; // include automatically appended white space
for (TextBlock textBlock : cell.getTextBlocks()) {
searchableRow.addAll(textBlock.getSequences());
}
@ -143,17 +143,17 @@ public class EntityRedactionService {
private Set<Entity> findEntities(SearchableText searchableText, String headline, int sectionNumber) {
Set<Entity> found = new HashSet<>();
if (StringUtils.isEmpty(searchableText.toString())) {
String searchableString = searchableText.toString();
if (StringUtils.isEmpty(searchableString)) {
return found;
}
String inputString = searchableText.toString();
String lowercaseInputString = inputString.toLowerCase();
String lowercaseInputString = searchableString.toLowerCase();
for (Map.Entry<String, Set<String>> entry : dictionaryService.getDictionary().entrySet()) {
if (dictionaryService.getCaseInsensitiveTypes().contains(entry.getKey())) {
found.addAll(find(lowercaseInputString, entry.getValue(), entry.getKey(), headline, sectionNumber));
} else {
found.addAll(find(inputString, entry.getValue(), entry.getKey(), headline, sectionNumber));
found.addAll(find(searchableString, entry.getValue(), entry.getKey(), headline, sectionNumber));
}
}

View File

@ -41,14 +41,11 @@ public class Cell extends Rectangle {
StringBuilder sb = new StringBuilder();
TextPositionSequence previous = null;
for (TextBlock textBlock : textBlocks) {
TextPositionSequence previous = null;
for (TextPositionSequence word : textBlock.getSequences()) {
if (previous != null) {
if (Math.abs(previous.getY1() - word.getY1()) > word.getTextHeight()) {
if (Math.abs(previous.getRotationAdjustedY() - word.getRotationAdjustedY()) > word.getTextHeight()) {
sb.append('\n');
} else {
sb.append(' ');
@ -64,5 +61,4 @@ public class Cell extends Rectangle {
.replaceAll(" {2}", " ");
}
}
}

View File

@ -93,8 +93,8 @@ public class RedactionIntegrationTest {
KieFileSystem kieFileSystem = kieServices.newKieFileSystem();
InputStream input = new ByteArrayInputStream(RULES.getBytes(StandardCharsets.UTF_8));
kieFileSystem.write("src/test/resources/drools/rules.drl", kieServices.getResources()
.newInputStreamResource(input));
kieFileSystem.write("src/test/resources/drools/rules.drl",
kieServices.getResources().newInputStreamResource(input));
KieBuilder kieBuilder = kieServices.newKieBuilder(kieFileSystem);
kieBuilder.buildAll();
KieModule kieModule = kieBuilder.getKieModule();
@ -119,8 +119,10 @@ public class RedactionIntegrationTest {
when(dictionaryClient.getDictionaryForType(ADDRESS_CODE)).thenReturn(getDictionaryResponse(ADDRESS_CODE));
when(dictionaryClient.getDictionaryForType(NAME_CODE)).thenReturn(getDictionaryResponse(NAME_CODE));
when(dictionaryClient.getDictionaryForType(SPONSOR)).thenReturn(getDictionaryResponse(SPONSOR));
when(dictionaryClient.getDictionaryForType(NO_REDACTION_INDICATOR)).thenReturn(getDictionaryResponse(NO_REDACTION_INDICATOR));
when(dictionaryClient.getDictionaryForType(REDACTION_INDICATOR)).thenReturn(getDictionaryResponse(REDACTION_INDICATOR));
when(dictionaryClient.getDictionaryForType(NO_REDACTION_INDICATOR)).thenReturn(getDictionaryResponse(
NO_REDACTION_INDICATOR));
when(dictionaryClient.getDictionaryForType(REDACTION_INDICATOR)).thenReturn(getDictionaryResponse(
REDACTION_INDICATOR));
when(dictionaryClient.getDictionaryForType(HINT_ONLY)).thenReturn(getDictionaryResponse(HINT_ONLY));
when(dictionaryClient.getDictionaryForType(MUST_REDACT)).thenReturn(getDictionaryResponse(MUST_REDACT));
when(dictionaryClient.getDefaultColor()).thenReturn(new DefaultColor(new float[]{1f, 0.502f, 0f}));
@ -131,44 +133,44 @@ public class RedactionIntegrationTest {
dictionary.computeIfAbsent(NAME_CODE, v -> new ArrayList<>())
.addAll(ResourceLoader.load("dictionaries/names.txt")
.stream()
.map(this::cleanDictionaryEntry)
.collect(Collectors.toSet()));
.stream()
.map(this::cleanDictionaryEntry)
.collect(Collectors.toSet()));
dictionary.computeIfAbsent(SPONSOR, v -> new ArrayList<>())
.addAll(ResourceLoader.load("dictionaries/sponsor_companies.txt")
.stream()
.map(this::cleanDictionaryEntry)
.collect(Collectors.toSet()));
.stream()
.map(this::cleanDictionaryEntry)
.collect(Collectors.toSet()));
dictionary.computeIfAbsent(VERTEBRATES_CODE, v -> new ArrayList<>())
.addAll(ResourceLoader.load("dictionaries/vertebrates.txt")
.stream()
.map(this::cleanDictionaryEntry)
.collect(Collectors.toSet()));
.stream()
.map(this::cleanDictionaryEntry)
.collect(Collectors.toSet()));
dictionary.computeIfAbsent(ADDRESS_CODE, v -> new ArrayList<>())
.addAll(ResourceLoader.load("dictionaries/addresses.txt")
.stream()
.map(this::cleanDictionaryEntry)
.collect(Collectors.toSet()));
.stream()
.map(this::cleanDictionaryEntry)
.collect(Collectors.toSet()));
dictionary.computeIfAbsent(NO_REDACTION_INDICATOR, v -> new ArrayList<>())
.addAll(ResourceLoader.load("dictionaries/no_redaction_indicator.txt")
.stream()
.map(this::cleanDictionaryEntry)
.collect(Collectors.toSet()));
.stream()
.map(this::cleanDictionaryEntry)
.collect(Collectors.toSet()));
dictionary.computeIfAbsent(REDACTION_INDICATOR, v -> new ArrayList<>())
.addAll(ResourceLoader.load("dictionaries/redaction_indicator.txt")
.stream()
.map(this::cleanDictionaryEntry)
.collect(Collectors.toSet()));
.stream()
.map(this::cleanDictionaryEntry)
.collect(Collectors.toSet()));
dictionary.computeIfAbsent(HINT_ONLY, v -> new ArrayList<>())
.addAll(ResourceLoader.load("dictionaries/hint_only.txt")
.stream()
.map(this::cleanDictionaryEntry)
.collect(Collectors.toSet()));
.stream()
.map(this::cleanDictionaryEntry)
.collect(Collectors.toSet()));
dictionary.computeIfAbsent(MUST_REDACT, v -> new ArrayList<>())
.addAll(ResourceLoader.load("dictionaries/must_redact.txt")
.stream()
.map(this::cleanDictionaryEntry)
.collect(Collectors.toSet()));
.stream()
.map(this::cleanDictionaryEntry)
.collect(Collectors.toSet()));
}
@ -280,7 +282,8 @@ public class RedactionIntegrationTest {
System.out.println("redactionTest");
long start = System.currentTimeMillis();
ClassPathResource pdfFileResource = new ClassPathResource("files/Metolachlor/S-Metolachlor_RAR_02_Volume_2_2018-09-06.pdf");
ClassPathResource pdfFileResource = new ClassPathResource(
"files/Metolachlor/S-Metolachlor_RAR_02_Volume_2_2018-09-06.pdf");
RedactionRequest request = RedactionRequest.builder()
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
@ -304,7 +307,8 @@ public class RedactionIntegrationTest {
System.out.println("testTableRedaction");
long start = System.currentTimeMillis();
ClassPathResource pdfFileResource = new ClassPathResource("files/Metolachlor/S-Metolachlor_RAR_02_Volume_2_2018-09-06.pdf");
ClassPathResource pdfFileResource = new ClassPathResource(
"files/Metolachlor/S-Metolachlor_RAR_02_Volume_2_2018-09-06.pdf");
RedactionRequest request = RedactionRequest.builder()
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
@ -336,7 +340,14 @@ public class RedactionIntegrationTest {
manualRedactionEntry.setType("name");
manualRedactionEntry.setValue("O'Loughlin C.K.");
manualRedactionEntry.setReason("Manual Redaction");
manualRedactionEntry.setPositions(List.of(new Rectangle(new Point(375.61096f, 241.282f), 7.648041f, 43.72262f, 1), new Rectangle(new Point(384.83517f, 241.282f), 7.648041f, 17.043358f, 1)));
manualRedactionEntry.setPositions(List.of(new Rectangle(new Point(375.61096f, 241.282f),
7.648041f,
43.72262f,
1),
new Rectangle(new Point(384.83517f, 241.282f),
7.648041f,
17.043358f,
1)));
manualRedactions.getEntriesToAdd().add(manualRedactionEntry);
@ -361,8 +372,8 @@ public class RedactionIntegrationTest {
public void classificationTest() throws IOException {
System.out.println("classificationTest");
ClassPathResource pdfFileResource = new ClassPathResource("files/Fludioxonil/51 " +
"Fludioxonil_RAR_02_Volume_2_2018-02-21.pdf");
ClassPathResource pdfFileResource = new ClassPathResource(
"files/Fludioxonil/51 Fludioxonil_RAR_02_Volume_2_2018-02-21.pdf");
RedactionRequest request = RedactionRequest.builder()
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
@ -381,7 +392,7 @@ public class RedactionIntegrationTest {
System.out.println("sectionsTest");
ClassPathResource pdfFileResource = new ClassPathResource("files/Fludioxonil/51 " +
"Fludioxonil_RAR_02_Volume_2_2018-02-21.pdf");
"Fludioxonil_RAR_02_Volume_2_2018-02-21.pdf");
RedactionRequest request = RedactionRequest.builder()
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
@ -399,7 +410,8 @@ public class RedactionIntegrationTest {
public void htmlTablesTest() throws IOException {
System.out.println("htmlTablesTest");
ClassPathResource pdfFileResource = new ClassPathResource("files/Metolachlor/S-Metolachlor_RAR_02_Volume_2_2018-09-06.pdf");
ClassPathResource pdfFileResource = new ClassPathResource(
"files/Metolachlor/S-Metolachlor_RAR_02_Volume_2_2018-09-06.pdf");
RedactionRequest request = RedactionRequest.builder()
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
@ -417,8 +429,8 @@ public class RedactionIntegrationTest {
public void htmlTableRotationTest() throws IOException {
System.out.println("htmlTableRotationTest");
ClassPathResource pdfFileResource = new ClassPathResource("files/Metolachlor/S" +
"-Metolachlor_RAR_02_Volume_2_2018-09-06.pdf");
ClassPathResource pdfFileResource = new ClassPathResource(
"files/Metolachlor/S-Metolachlor_RAR_02_Volume_2_2018-09-06.pdf");
RedactionRequest request = RedactionRequest.builder()
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
@ -482,7 +494,7 @@ public class RedactionIntegrationTest {
throw new IllegalArgumentException("could not load classpath resource: drools/rules.drl");
}
try (BufferedReader br = new BufferedReader(new InputStreamReader(resource.openStream(),
StandardCharsets.UTF_8))) {
StandardCharsets.UTF_8))) {
StringBuilder sb = new StringBuilder();
String str;
while ((str = br.readLine()) != null) {

View File

@ -134,7 +134,35 @@ public class EntityRedactionServiceTest {
Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
entityRedactionService.processDocument(classifiedDoc, null);
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
assertThat(classifiedDoc.getEntities().get(1)).hasSize(7);// 3 names, 1 address, 1 Y and 2 N entities
assertThat(classifiedDoc.getEntities().get(1)).hasSize(7);// 3 author cells, 1 address, 1 Y and 2 N entities
}
}
/**
 * Parses "files/Minimal Examples/nested_redaction.pdf", runs entity redaction with stubbed
 * name and address dictionaries, and checks the number of entities found.
 *
 * Expected outcome asserted below: the entity map has exactly one entry, and the entry stored
 * under key 1 contains 7 entities.
 *
 * @throws IOException if the PDF resource cannot be read or loaded
 */
@Test
public void testNestedRedaction() throws IOException {
// Load the test PDF from the classpath and wrap its bytes in a request object.
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/nested_redaction.pdf");
RedactionRequest redactionRequest = RedactionRequest.builder()
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
.build();
// Stub the name dictionary with four author names.
DictionaryResponse dictionaryResponse = DictionaryResponse.builder()
.entries(Arrays.asList("Casey, H.W.", "OLoughlin, C.K.", "Salamon, C.M.", "Smith, S.H."))
.build();
// Return a fresh version number on each run.
// NOTE(review): presumably this forces the service to reload the stubbed dictionaries — confirm.
when(dictionaryClient.getVersion()).thenReturn(DICTIONARY_VERSION.incrementAndGet());
when(dictionaryClient.getDictionaryForType(NAME_CODE)).thenReturn(dictionaryResponse);
// Stub the address dictionary with a single full address entry.
DictionaryResponse addressResponse = DictionaryResponse.builder()
.entries(Collections.singletonList("Toxigenics, Inc., Decatur, IL 62526, USA"))
.build();
when(dictionaryClient.getDictionaryForType(ADDRESS_CODE)).thenReturn(addressResponse);
// try-with-resources closes the PDDocument once parsing and redaction have finished.
try (PDDocument pdDocument = PDDocument.load(new ByteArrayInputStream(redactionRequest.getDocument()))) {
Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
// The second argument is passed as null, as in the other processDocument calls visible in this test class.
entityRedactionService.processDocument(classifiedDoc, null);
// NOTE(review): the entity map appears to be keyed by 1-based page number — confirm.
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
assertThat(classifiedDoc.getEntities().get(1)).hasSize(7);// 3 author cells, 1 address, 1 Y and 2 N entities
}
}
@ -273,7 +301,7 @@ public class EntityRedactionServiceTest {
entityRedactionService.processDocument(classifiedDoc, null);
assertThat(classifiedDoc.getEntities()).hasSize(2); // two pages
assertThat(classifiedDoc.getEntities().get(1).stream().filter(entity -> entity.getMatchedRule() == 9).count()).isEqualTo(8);
assertThat(classifiedDoc.getEntities().get(2).stream().filter(entity -> entity.getMatchedRule() == 9).count()).isEqualTo(4);
assertThat(classifiedDoc.getEntities().get(2).stream().filter(entity -> entity.getMatchedRule() == 9).count()).isEqualTo(5); // 2 names, 1 address, 2 Y
}
pdfFileResource = new ClassPathResource("files/Minimal Examples/Header Propagation2.pdf");
@ -335,7 +363,7 @@ public class EntityRedactionServiceTest {
" when\n" +
" Section(rowEquals(\"Vertebrate study Y/N\", \"N\") || rowEquals(\"Vertebrate study Y/N\", \"No\"))\n" +
" then\n" +
" section.redactNot(\"name\", 8, \"Not redacted because row is not a vertebrate study\");\n" +
" section.redactNotCell(\"Author(s)\", 8, \"name\", \"Not redacted because row is not a vertebrate study\");\n" +
" section.redactNot(\"address\", 8, \"Not redacted because row is not a vertebrate study\");\n" +
" section.highlightCell(\"Vertebrate study Y/N\", 8, \"hint_only\");\n" +
" end\n" +
@ -344,7 +372,7 @@ public class EntityRedactionServiceTest {
" Section(rowEquals(\"Vertebrate study Y/N\", \"Y\") || rowEquals(\"Vertebrate study Y/N\", " +
"\"Yes\"))\n" +
" then\n" +
" section.redact(\"name\", 9, \"Redacted because row is a vertebrate study\");\n" +
" section.redactCell(\"Author(s)\", 9, \"name\", \"Redacted because row is a vertebrate study\");\n" +
" section.redact(\"address\", 9, \"Redacted because row is a vertebrate study\");\n" +
" section.highlightCell(\"Vertebrate study Y/N\", 9, \"must_redact\");\n" +
" end";