Fixed duplicated redaction/RedactionLog entries
This commit is contained in:
parent
e2895a1c7a
commit
d2d7f8c50c
@ -4,7 +4,6 @@ import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.model.RedactionLogEntry;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
|
||||
@ -18,7 +17,7 @@ public class Document {
|
||||
|
||||
private List<Page> pages = new ArrayList<>();
|
||||
private List<Paragraph> paragraphs = new ArrayList<>();
|
||||
private Map<Integer, Set<Entity>> entities = new HashMap<>();
|
||||
private Map<Integer, List<Entity>> entities = new HashMap<>();
|
||||
private FloatFrequencyCounter textHeightCounter = new FloatFrequencyCounter();
|
||||
private FloatFrequencyCounter fontSizeCounter= new FloatFrequencyCounter();
|
||||
private StringFrequencyCounter fontCounter= new StringFrequencyCounter();
|
||||
|
||||
@ -3,6 +3,7 @@ package com.iqser.red.service.redaction.v1.server.redaction.model;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@ -18,8 +19,9 @@ public class Entity {
|
||||
private Integer end;
|
||||
private String headline;
|
||||
private int matchedRule;
|
||||
private int sectionNumber;
|
||||
|
||||
public Entity(String word, String type, boolean redaction, String redactionReason, List<EntityPositionSequence> positionSequences, String headline, int matchedRule) {
|
||||
public Entity(String word, String type, boolean redaction, String redactionReason, List<EntityPositionSequence> positionSequences, String headline, int matchedRule, int sectionNumber) {
|
||||
this.word = word;
|
||||
this.type = type;
|
||||
this.redaction = redaction;
|
||||
@ -27,13 +29,38 @@ public class Entity {
|
||||
this.positionSequences = positionSequences;
|
||||
this.headline = headline;
|
||||
this.matchedRule = matchedRule;
|
||||
this.sectionNumber = sectionNumber;
|
||||
}
|
||||
|
||||
public Entity(String word, String type, Integer start, Integer end, String headline) {
|
||||
public Entity(String word, String type, Integer start, Integer end, String headline, int sectionNumber) {
|
||||
this.word = word;
|
||||
this.type = type;
|
||||
this.start = start;
|
||||
this.end = end;
|
||||
this.headline = headline;
|
||||
this.sectionNumber = sectionNumber;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public boolean equals(Object o) {
|
||||
|
||||
if (this == o) {
|
||||
return true;
|
||||
}
|
||||
if (o == null || getClass() != o.getClass()) {
|
||||
return false;
|
||||
}
|
||||
Entity entity = (Entity) o;
|
||||
return sectionNumber == entity.sectionNumber && Objects.equals(word, entity.word) && Objects.equals(type, entity.type) && Objects
|
||||
.equals(headline, entity.headline);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
|
||||
return Objects.hash(word, type, headline, sectionNumber);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -2,19 +2,44 @@ package com.iqser.red.service.redaction.v1.server.redaction.model;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
import java.util.UUID;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
|
||||
|
||||
@Data
|
||||
@RequiredArgsConstructor
|
||||
@AllArgsConstructor
|
||||
public class EntityPositionSequence {
|
||||
|
||||
private List<TextPositionSequence> sequences = new ArrayList<>();
|
||||
private int pageNumber;
|
||||
private final UUID id;
|
||||
|
||||
|
||||
@Override
|
||||
public boolean equals(Object o) {
|
||||
|
||||
if (this == o) {
|
||||
return true;
|
||||
}
|
||||
if (o == null || getClass() != o.getClass()) {
|
||||
return false;
|
||||
}
|
||||
EntityPositionSequence that = (EntityPositionSequence) o;
|
||||
return pageNumber == that.pageNumber && Objects.equals(id, that.id);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
|
||||
return Objects.hash(pageNumber, id);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -25,6 +25,8 @@ public class Section {
|
||||
|
||||
private String headline;
|
||||
|
||||
private int sectionNumber;
|
||||
|
||||
|
||||
public boolean contains(String type) {
|
||||
|
||||
@ -32,6 +34,11 @@ public class Section {
|
||||
}
|
||||
|
||||
|
||||
public boolean headlineContainsWord(String word){
|
||||
return StringUtils.containsIgnoreCase(headline, word);
|
||||
}
|
||||
|
||||
|
||||
public void redact(String type, int ruleNumber, String reason) {
|
||||
|
||||
entities.forEach(entity -> {
|
||||
@ -109,13 +116,11 @@ public class Section {
|
||||
|
||||
if (startIndex > -1 && (startIndex == 0 || Character.isWhitespace(searchText.charAt(startIndex - 1)) || isSeparator(searchText
|
||||
.charAt(startIndex - 1))) && (stopIndex == searchText.length() || isSeparator(searchText.charAt(stopIndex)))) {
|
||||
found.add(new Entity(searchText.substring(startIndex, stopIndex), asType, startIndex, stopIndex, headline));
|
||||
found.add(new Entity(searchText.substring(startIndex, stopIndex), asType, startIndex, stopIndex, headline, sectionNumber));
|
||||
}
|
||||
} while (startIndex > -1);
|
||||
|
||||
removeEntitiesContainedInLarger(found);
|
||||
|
||||
return found;
|
||||
return removeEntitiesContainedInLarger(found);
|
||||
}
|
||||
|
||||
|
||||
@ -125,7 +130,7 @@ public class Section {
|
||||
}
|
||||
|
||||
|
||||
public void removeEntitiesContainedInLarger(Set<Entity> entities) {
|
||||
public Set<Entity> removeEntitiesContainedInLarger(Set<Entity> entities) {
|
||||
|
||||
List<Entity> wordsToRemove = new ArrayList<>();
|
||||
for (Entity word : entities) {
|
||||
@ -137,6 +142,7 @@ public class Section {
|
||||
}
|
||||
}
|
||||
entities.removeAll(wordsToRemove);
|
||||
return entities;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.service;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
@ -13,6 +14,7 @@ import com.iqser.red.service.redaction.v1.server.classification.model.Document;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Paragraph;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionSequence;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Section;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
|
||||
@ -34,6 +36,7 @@ public class EntityRedactionService {
|
||||
droolsExecutionService.updateRules();
|
||||
|
||||
Set<Entity> documentEntities = new HashSet<>();
|
||||
int sectionNumber = 1;
|
||||
for (Paragraph paragraph : classifiedDoc.getParagraphs()) {
|
||||
|
||||
SearchableText searchableText = paragraph.getSearchableText();
|
||||
@ -51,57 +54,70 @@ public class EntityRedactionService {
|
||||
searchableRow.addAll(textBlock.getSequences());
|
||||
}
|
||||
}
|
||||
Set<Entity> rowEntities = findEntities(searchableRow, table.getHeadline());
|
||||
Set<Entity> rowEntities = findEntities(searchableRow, table.getHeadline(), sectionNumber);
|
||||
|
||||
Section analysedRowSection = droolsExecutionService.executeRules(Section.builder()
|
||||
.entities(rowEntities)
|
||||
.text(searchableRow.getAsStringWithLinebreaks())
|
||||
.searchText(searchableRow.toString())
|
||||
.headline(table.getHeadline())
|
||||
.sectionNumber(sectionNumber)
|
||||
.build());
|
||||
|
||||
for (Entity entity : analysedRowSection.getEntities()) {
|
||||
if (dictionaryService.getCaseInsensitiveTypes().contains(entity.getType())) {
|
||||
entity.setPositionSequences(searchableRow.getSequences(entity.getWord(), true));
|
||||
} else {
|
||||
entity.setPositionSequences(searchableRow.getSequences(entity.getWord(), false));
|
||||
}
|
||||
}
|
||||
documentEntities.addAll(analysedRowSection.getEntities());
|
||||
documentEntities.addAll(clearAndFindPositions(analysedRowSection.getEntities(), searchableRow));
|
||||
sectionNumber++;
|
||||
}
|
||||
sectionNumber++;
|
||||
}
|
||||
|
||||
Set<Entity> entities = findEntities(searchableText, paragraph.getHeadline());
|
||||
Set<Entity> entities = findEntities(searchableText, paragraph.getHeadline(), sectionNumber);
|
||||
Section analysedSection = droolsExecutionService.executeRules(Section.builder()
|
||||
.entities(entities)
|
||||
.text(searchableText.getAsStringWithLinebreaks())
|
||||
.searchText(searchableText.toString())
|
||||
.headline(paragraph.getHeadline())
|
||||
.sectionNumber(sectionNumber)
|
||||
.build());
|
||||
|
||||
for (Entity entity : analysedSection.getEntities()) {
|
||||
if (dictionaryService.getCaseInsensitiveTypes().contains(entity.getType())) {
|
||||
entity.setPositionSequences(searchableText.getSequences(entity.getWord(), true));
|
||||
} else {
|
||||
entity.setPositionSequences(searchableText.getSequences(entity.getWord(), false));
|
||||
}
|
||||
}
|
||||
|
||||
documentEntities.addAll(analysedSection.getEntities());
|
||||
documentEntities.addAll(clearAndFindPositions(analysedSection.getEntities(), searchableText));
|
||||
sectionNumber++;
|
||||
}
|
||||
|
||||
documentEntities.forEach(entity -> {
|
||||
entity.getPositionSequences().forEach(sequence -> {
|
||||
for (Entity entity : documentEntities) {
|
||||
Map<Integer, List<EntityPositionSequence>> sequenceOnPage = new HashMap<>();
|
||||
for (EntityPositionSequence entityPositionSequence : entity.getPositionSequences()) {
|
||||
sequenceOnPage.computeIfAbsent(entityPositionSequence.getPageNumber(), (x) -> new ArrayList<>())
|
||||
.add(entityPositionSequence);
|
||||
}
|
||||
|
||||
for (Map.Entry<Integer, List<EntityPositionSequence>> entry : sequenceOnPage.entrySet()) {
|
||||
classifiedDoc.getEntities()
|
||||
.computeIfAbsent(sequence.getPageNumber(), (x) -> new HashSet<>())
|
||||
.add(new Entity(entity.getWord(), entity.getType(), entity.isRedaction(), entity.getRedactionReason(), List
|
||||
.of(sequence), entity.getHeadline(), entity.getMatchedRule()));
|
||||
});
|
||||
});
|
||||
.computeIfAbsent(entry.getKey(), (x) -> new ArrayList<>())
|
||||
.add(new Entity(entity.getWord(), entity.getType(), entity.isRedaction(), entity.getRedactionReason(), entry
|
||||
.getValue(), entity.getHeadline(), entity.getMatchedRule(), entity.getSectionNumber()));
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
private Set<Entity> findEntities(SearchableText searchableText, String headline) {
|
||||
private Set<Entity> clearAndFindPositions(Set<Entity> entities, SearchableText text) {
|
||||
|
||||
Set<Entity> cleanEntities = removeEntitiesContainedInLarger(entities);
|
||||
|
||||
for (Entity entity : cleanEntities) {
|
||||
if (dictionaryService.getCaseInsensitiveTypes().contains(entity.getType())) {
|
||||
entity.setPositionSequences(text.getSequences(entity.getWord(), true));
|
||||
} else {
|
||||
entity.setPositionSequences(text.getSequences(entity.getWord(), false));
|
||||
}
|
||||
}
|
||||
|
||||
return cleanEntities;
|
||||
}
|
||||
|
||||
|
||||
private Set<Entity> findEntities(SearchableText searchableText, String headline, int sectionNumber) {
|
||||
|
||||
String inputString = searchableText.toString();
|
||||
String lowercaseInputString = inputString.toLowerCase();
|
||||
@ -110,19 +126,17 @@ public class EntityRedactionService {
|
||||
for (Map.Entry<String, Set<String>> entry : dictionaryService.getDictionary().entrySet()) {
|
||||
|
||||
if (dictionaryService.getCaseInsensitiveTypes().contains(entry.getKey())) {
|
||||
found.addAll(find(lowercaseInputString, entry.getValue(), entry.getKey(), headline));
|
||||
found.addAll(find(lowercaseInputString, entry.getValue(), entry.getKey(), headline, sectionNumber));
|
||||
} else {
|
||||
found.addAll(find(inputString, entry.getValue(), entry.getKey(), headline));
|
||||
found.addAll(find(inputString, entry.getValue(), entry.getKey(), headline, sectionNumber));
|
||||
}
|
||||
}
|
||||
|
||||
removeEntitiesContainedInLarger(found);
|
||||
|
||||
return found;
|
||||
return removeEntitiesContainedInLarger(found);
|
||||
}
|
||||
|
||||
|
||||
private Set<Entity> find(String inputString, Set<String> values, String type, String headline) {
|
||||
private Set<Entity> find(String inputString, Set<String> values, String type, String headline, int sectionNumber) {
|
||||
|
||||
Set<Entity> found = new HashSet<>();
|
||||
for (String value : values) {
|
||||
@ -134,7 +148,7 @@ public class EntityRedactionService {
|
||||
|
||||
if (startIndex > -1 && (startIndex == 0 || Character.isWhitespace(inputString.charAt(startIndex - 1)) || isSeparator(inputString
|
||||
.charAt(startIndex - 1))) && (stopIndex == inputString.length() || isSeparator(inputString.charAt(stopIndex)))) {
|
||||
found.add(new Entity(inputString.substring(startIndex, stopIndex), type, startIndex, stopIndex, headline));
|
||||
found.add(new Entity(inputString.substring(startIndex, stopIndex), type, startIndex, stopIndex, headline, sectionNumber));
|
||||
}
|
||||
} while (startIndex > -1);
|
||||
}
|
||||
@ -148,7 +162,7 @@ public class EntityRedactionService {
|
||||
}
|
||||
|
||||
|
||||
public void removeEntitiesContainedInLarger(Set<Entity> entities) {
|
||||
public Set<Entity> removeEntitiesContainedInLarger(Set<Entity> entities) {
|
||||
|
||||
List<Entity> wordsToRemove = new ArrayList<>();
|
||||
for (Entity word : entities) {
|
||||
@ -160,6 +174,7 @@ public class EntityRedactionService {
|
||||
}
|
||||
}
|
||||
entities.removeAll(wordsToRemove);
|
||||
return entities;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -19,6 +19,7 @@ import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.junit.Before;
|
||||
import org.junit.Ignore;
|
||||
import org.junit.Test;
|
||||
import org.junit.runner.RunWith;
|
||||
import org.kie.api.KieServices;
|
||||
@ -47,6 +48,7 @@ import com.iqser.red.service.redaction.v1.server.controller.RedactionController;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.ResourceLoader;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
|
||||
|
||||
@Ignore
|
||||
@RunWith(SpringRunner.class)
|
||||
@SpringBootTest(webEnvironment = DEFINED_PORT)
|
||||
public class RedactionIntegrationTest {
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user